Notifications: You must be signed in to change notification settings
Fork32
Star279

Commit aae585f

committed

Mark interlinear annotation chars and Egyptian hieroglyph format controls as non-zero width

1 parent 436b0db · commit aae585f · Copy full SHA for aae585f

File tree

3 files changed

+30

-33

lines changed

scripts
- unicode.py
src
- tables.rs
- tests.rs

3 files changed

+30

-33

lines changed

`‎scripts/unicode.py‎`

Lines changed: 5 additions & 30 deletions

Original file line number	Diff line number	Diff line change
`@@ -150,15 +150,14 @@ def load_zero_widths() -> "list[bool]":`
`150`	`150`	"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
`151`	`151`	character. `c` is considered a zero-width character if
`152`	`152`
`153`		- - it is in general categories `Cc`, `Cf`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
`154`		- and is not a `Prepended_Concatenation_Mark` (determined from `PropList.txt`),
	`153`	+ - it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
`155`	`154`	- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
`156`	`155`	- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
`157`	`156`	`"""`
`158`	`157`
`159`	`158`	`zw_map= []`
`160`	`159`
`161`		-# Characters with general category `Cc`, `Cf`, `Mn`, or `Me` have 0 width...
	`160`	+# Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
`162`	`161`	`withfetch_open("UnicodeData.txt")ascategories:`
`163`	`162`	`current=0`
`164`	`163`	`forlineincategories.readlines():`
`@@ -169,7 +168,7 @@ def load_zero_widths() -> "list[bool]":`
`169`	`168`	`raw_data[1],`
`170`	`169`	`raw_data[2],`
`171`	`170`	`]`
`172`		`-zero_width=cat_codein ["Cc","Cf","Mn","Me"]`
	`171`	`+zero_width=cat_codein ["Cc","Mn","Me"]`
`173`	`172`
`174`	`173`	`assertcurrent<=codepoint`
`175`	`174`	`whilecurrent<=codepoint:`
`@@ -186,32 +185,9 @@ def load_zero_widths() -> "list[bool]":`
`186`	`185`	`# Catch any leftover codepoints. They must be unassigned (so nonzero width)`
`187`	`186`	`zw_map.append(False)`
`188`	`187`
`189`		-# ...unless they are a `Prepended_Concatenation_Mark`.
`190`		`-# https://www.unicode.org/reports/tr44/:`
`191`		`-# "A small class of visible format controls,`
`192`		`-# which precede and then span a sequence of other characters, usually digits.`
`193`		`-# These have also been known as "subtending marks",`
`194`		`-# because most of them take a form which visually extends underneath the sequence of following digits."`
`195`		`-withfetch_open("PropList.txt")asproperties:`
`196`		`-single=re.compile(r"^([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+")`
`197`		`-multiple=re.compile(`
`198`		`-r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+"`
`199`		`- )`
`200`		`-forlineinproperties.readlines():`
`201`		`-raw_data=None# (low, high)`
`202`		`-ifmatch:=single.match(line):`
`203`		`-raw_data= (match.group(1),match.group(1))`
`204`		`-elifmatch:=multiple.match(line):`
`205`		`-raw_data= (match.group(1),match.group(2))`
`206`		`-else:`
`207`		`-continue`
`208`		`-low=int(raw_data[0],16)`
`209`		`-high=int(raw_data[1],16)`
`210`		`-forcpinrange(low,high+1):`
`211`		`-zw_map[cp]=False`
`212`		`-`
`213`	`188`	# `Default_Ignorable_Code_Point`s also have 0 width:
`214`	`189`	`# https://www.unicode.org/faq/unsup_char.html#3`
	`190`	`+# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095`
`215`	`191`	`withfetch_open("DerivedCoreProperties.txt")asproperties:`
`216`	`192`	`single=re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")`
`217`	`193`	`multiple=re.compile(`
`@@ -552,8 +528,7 @@ def main(module_filename: str):`
`552`	`528`	- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
`553`	`529`	`- Hangul jamo medial vowels & final consonants are zero-width.`
`554`	`530`	- All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
`555`		- - All codepoints in general categories `Cc`, `Cf`, `Mn`, or `Me` are zero-width,
`556`		- except for `Prepended_Concatenation_Mark`s.
	`531`	+ - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
`557`	`532`	- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
`558`	`533`	- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
`559`	`534`	`- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width`

`‎src/tables.rs‎`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -182,7 +182,7 @@ pub mod charwidth {`
`182`	`182`	`0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,`
`183`	`183`	`0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,`
`184`	`184`	`0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,`
`185`		`-0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x47,0xBD,0x06,0x06,0x06,0x06,`
	`185`	`+0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0xBD,0x06,0x06,0x06,0x06,`
`186`	`186`	`0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,`
`187`	`187`	`0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,`
`188`	`188`	`0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,`
`@@ -430,7 +430,7 @@ pub mod charwidth {`
`430`	`430`	`0x6A,0xAA,0x55,0x55,0x55,0x55,0x55,0xAA,0xAA,0xAA,0xAA,0xAA,0xAA,0xAA,0xAA,`
`431`	`431`	`0x56,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,`
`432`	`432`	`0x55,0x54,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,`
`433`		`-0x55,0x55,0xAA,0x6A,0x55,0x55,0x00,0x00,0x00,0x5D,0x55,0x55,0x55,0x55,0x55,`
	`433`	`+0x55,0x55,0xAA,0x6A,0x55,0x55,0x00,0x00,0x54,0x5D,0x55,0x55,0x55,0x55,0x55,`
`434`	`434`	`0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x51,0x55,0x55,0x55,0x55,`
`435`	`435`	`0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x05,0x40,0x55,0x01,0x41,0x55,`
`436`	`436`	`0x00,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x40,0x15,0x55,0x55,`

`‎src/tests.rs‎`

Lines changed: 23 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -223,7 +223,7 @@ fn test_jamo() {`
`223`	`223`	`}`
`224`	`224`
`225`	`225`	`#[test]`
`226`		`-fntest_prepended_concatenation_mark(){`
	`226`	`+fntest_prepended_concatenation_marks(){`
`227`	`227`	`usesuper::UnicodeWidthChar;`
`228`	`228`	`#[cfg(feature ="no_std")]`
`229`	`229`	`use core::option::Option::{None,Some};`
`@@ -233,3 +233,25 @@ fn test_prepended_concatenation_mark() {`
`233`	`233`	`assert_eq!(UnicodeWidthChar::width('\u{08E2}'),Some(1));`
`234`	`234`	`assert_eq!(UnicodeWidthChar::width('\u{110BD}'),Some(1));`
`235`	`235`	`}`
	`236`	`+`
	`237`	`+#[test]`
	`238`	`+fntest_interlinear_annotation_chars(){`
	`239`	`+usesuper::UnicodeWidthChar;`
	`240`	`+#[cfg(feature ="no_std")]`
	`241`	`+use core::option::Option::{None,Some};`
	`242`	`+`
	`243`	`+assert_eq!(UnicodeWidthChar::width('\u{FFF9}'),Some(1));`
	`244`	`+assert_eq!(UnicodeWidthChar::width('\u{FFFA}'),Some(1));`
	`245`	`+assert_eq!(UnicodeWidthChar::width('\u{FFFB}'),Some(1));`
	`246`	`+}`
	`247`	`+`
	`248`	`+#[test]`
	`249`	`+fntest_hieroglyph_format_controls(){`
	`250`	`+usesuper::UnicodeWidthChar;`
	`251`	`+#[cfg(feature ="no_std")]`
	`252`	`+use core::option::Option::{None,Some};`
	`253`	`+`
	`254`	`+assert_eq!(UnicodeWidthChar::width('\u{13430}'),Some(1));`
	`255`	`+assert_eq!(UnicodeWidthChar::width('\u{13436}'),Some(1));`
	`256`	`+assert_eq!(UnicodeWidthChar::width('\u{1343C}'),Some(1));`
	`257`	`+}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit aae585f

File tree

3 files changed

3 files changed

`‎scripts/unicode.py‎`

`‎src/tables.rs‎`

`‎src/tests.rs‎`

0 commit comments