NotificationsYou must be signed in to change notification settings
Fork32
Star279

Commit787fed3

committed

unicode.py: Don't use UnicodeData.txt anymore

1 parent00ee4b0 commit787fed3Copy full SHA for 787fed3

File tree

1 file changed

+28

-49

lines changed

scripts
- unicode.py

1 file changed

+28

-49

lines changed

`‎scripts/unicode.py‎`

Lines changed: 28 additions & 49 deletions

Original file line number	Diff line number	Diff line change
`@@ -11,9 +11,12 @@`
`11`	`11`	`# except according to those terms.`
`12`	`12`
`13`	`13`	`# This script uses the following Unicode tables:`
	`14`	`+#`
	`15`	`+# - DerivedCoreProperties.txt`
`14`	`16`	`# - EastAsianWidth.txt`
	`17`	`+# - HangulSyllableType.txt`
	`18`	`+# - PropList.txt`
`15`	`19`	`# - ReadMe.txt`
`16`		`-# - UnicodeData.txt`
`17`	`20`	`#`
`18`	`21`	`# Since this should not require frequent updates, we just store this`
`19`	`22`	`# out-of-line and check the generated module into git.`
`@@ -150,41 +153,20 @@ def load_zero_widths() -> "list[bool]":`
`150`	`153`	"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
`151`	`154`	character. `c` is considered a zero-width character if
`152`	`155`
`153`		- - it is in general category `Cc`,
`154`		- - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
	`156`	`+ - it is a control character,`
`155`	`157`	- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
	`158`	+ - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
	`159`	+ - or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
`156`	`160`	- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
`157`	`161`	`"""`
`158`	`162`
`159`		`-zw_map= []`
	`163`	`+zw_map= [False]*NUM_CODEPOINTS`
`160`	`164`
`161`		-# Characters with general category `Cc` have 0 width
`162`		`-withfetch_open("UnicodeData.txt")ascategories:`
`163`		`-current=0`
`164`		`-forlineincategories.readlines():`
`165`		`-iflen(raw_data:=line.split(";"))!=15:`
`166`		`-continue`
`167`		`- [codepoint,name,cat_code]= [`
`168`		`-int(raw_data[0],16),`
`169`		`-raw_data[1],`
`170`		`-raw_data[2],`
`171`		`- ]`
`172`		`-zero_width=cat_code=="Cc"`
`173`		`-`
`174`		`-assertcurrent<=codepoint`
`175`		`-whilecurrent<=codepoint:`
`176`		`-ifname.endswith(", Last>")orcurrent==codepoint:`
`177`		`-# if name ends with Last, we backfill the width value to all codepoints since`
`178`		`-# the previous codepoint (aka the start of the range)`
`179`		`-zw_map.append(zero_width)`
`180`		`-else:`
`181`		`-# unassigned characters are implicitly given Neutral width, which is nonzero`
`182`		`-zw_map.append(False)`
`183`		`-current+=1`
`184`		`-`
`185`		`-whilelen(zw_map)<NUM_CODEPOINTS:`
`186`		`-# Catch any leftover codepoints. They must be unassigned (so nonzero width)`
`187`		`-zw_map.append(False)`
	`165`	`+# Control characters have width 0`
	`166`	`+forcinrange(0x00,0x20):`
	`167`	`+zw_map[c]=True`
	`168`	`+forcinrange(0x7F,0xA0):`
	`169`	`+zw_map[c]=True`
`188`	`170`
`189`	`171`	# `Default_Ignorable_Code_Point`s also have 0 width:
`190`	`172`	`# https://www.unicode.org/faq/unsup_char.html#3`
`@@ -214,6 +196,12 @@ def load_zero_widths() -> "list[bool]":`
`214`	`196`	`forcpinrange(low,high+1):`
`215`	`197`	`zw_map[cp]=True`
`216`	`198`
	`199`	+# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
	`200`	`+# as they canonically decompose to two characters with this property,`
	`201`	`+# but they aren't.`
	`202`	`+forcin [0x0CC0,0x0CC7,0x0CC8,0x0CCA,0x0CCB,0x1B3B,0x1B3D,0x1B43]:`
	`203`	`+zw_map[c]=True`
	`204`	`+`
`217`	`205`	# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
`218`	`206`	# as zero-width. This matches the behavior of glibc `wcwidth`.
`219`	`207`	`#`
`@@ -248,18 +236,6 @@ def load_zero_widths() -> "list[bool]":`
`248`	`236`	`# width 2. Therefore, we treat it as having width 2.`
`249`	`237`	`zw_map[0x115F]=False`
`250`	`238`
`251`		-# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
`252`		`-# as they canonically decompose to two characters with this property,`
`253`		`-# but they aren't.`
`254`		`-zw_map[0x0CC0]=True`
`255`		`-zw_map[0x0CC7]=True`
`256`		`-zw_map[0x0CC8]=True`
`257`		`-zw_map[0x0CCA]=True`
`258`		`-zw_map[0x0CCB]=True`
`259`		`-zw_map[0x1B3B]=True`
`260`		`-zw_map[0x1B3D]=True`
`261`		`-zw_map[0x1B43]=True`
`262`		`-`
`263`	`239`	`returnzw_map`
`264`	`240`
`265`	`241`
`@@ -297,7 +273,7 @@ def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":`
`297`	`273`	`result.sort()`
`298`	`274`	`returnresult`
`299`	`275`
`300`		`-defwidth(self)->"EffectiveWidth":`
	`276`	`+defwidth(self)->"EffectiveWidth \| None":`
`301`	`277`	`"""If all codepoints in this bucket have the same width, return that width; otherwise,`
`302`	`278`	return `None`."""
`303`	`279`	`iflen(self.widths)==0:`
`@@ -542,13 +518,16 @@ def main(module_filename: str):`
`542`	`518`	`lookup table for character width, and write a Rust module utilizing that table to`
`543`	`519`	`module_filename`.
`544`	`520`
`545`		`- We obey the following rules in decreasing order of importance:`
	`521`	`+ We obey the following rules, in decreasing order of importance:`
	`522`	`+`
`546`	`523`	- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
`547`	`524`	`- Hangul jamo medial vowels & final consonants are zero-width.`
`548`		- - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
`549`		- - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
`550`		- - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
`551`		- - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
	`525`	+ - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
	`526`	`+ - Control characters are zero-width.`
	`527`	+ - `Grapheme_Extend` characters, as well as eight characters that NFD decompose to `Grapheme_Extend` characters,
	`528`	`+ are zero-width.`
	`529`	+ - Codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
	`530`	+ - Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
`552`	`531`	`- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width`
`553`	`532`	of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
`554`	`533`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit787fed3

File tree

1 file changed

1 file changed

`‎scripts/unicode.py‎`

0 commit comments