 
 import enum
 import math
+import operator
 import os
 import re
 import sys
 import urllib.request
 from collections import defaultdict
 from itertools import batched
+from typing import Callable
 
 UNICODE_VERSION = "15.1.0"
 """The version of the Unicode data files to download."""
@@ -90,13 +92,32 @@ def fetch_open(filename: str, local_prefix: str = ""):
         sys.exit(1)
 
 
-def load_unicode_version() -> "tuple[int, int, int]":
+def load_unicode_version() -> tuple[int, int, int]:
     """Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
     with fetch_open("ReadMe.txt") as readme:
         pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
         return tuple(map(int, re.search(pattern, readme.read()).groups()))
 
 
+def load_property(filename: str, pattern: str, action: Callable[[int], None]):
+    with fetch_open(filename) as properties:
+        single = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+")
+        multiple = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+")
+
+        for line in properties.readlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                action(cp)
+
+
 class EffectiveWidth(enum.IntEnum):
     """Represents the width of a Unicode character. All East Asian Width classes resolve into
     either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
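The new `load_property` helper factors out the single-codepoint / codepoint-range parsing that several call sites below previously duplicated. A standalone sketch of the contract it implements (this toy parser reads from a list of strings rather than from `fetch_open`, and the sample lines are illustrative):

```python
import re

def parse_property_lines(lines, pattern, action):
    # Same shape as load_property: "XXXX ; Prop ..." or "XXXX..YYYY ; Prop ...",
    # with `action` invoked once per codepoint in the (inclusive) range.
    single = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+")
    multiple = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+")
    for line in lines:
        if match := single.match(line):
            low, high = match.group(1), match.group(1)
        elif match := multiple.match(line):
            low, high = match.group(1), match.group(2)
        else:
            continue
        for cp in range(int(low, 16), int(high, 16) + 1):
            action(cp)

sample = [
    "00AD          ; Default_Ignorable_Code_Point # ...",
    "180B..180D    ; Default_Ignorable_Code_Point # ...",
]
found = set()
parse_property_lines(sample, "Default_Ignorable_Code_Point", found.add)
assert found == {0x00AD, 0x180B, 0x180C, 0x180D}
```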
@@ -112,15 +133,15 @@ class EffectiveWidth(enum.IntEnum):
     """ Two columns wide in a CJK context. One column wide in all other contexts. """
 
 
-def load_east_asian_widths() -> "list[EffectiveWidth]":
+def load_east_asian_widths() -> list[EffectiveWidth]:
     """Return a list of effective widths, indexed by codepoint.
     Widths are determined by fetching and parsing `EastAsianWidth.txt`.
 
     `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.
 
     `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
 
-    `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`."""
+    `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
     with fetch_open("EastAsianWidth.txt") as eaw:
         # matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
         single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
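For reference, the class-to-width mapping the docstring describes, keyed by the abbreviations used in `EastAsianWidth.txt` (illustrative sketch, not part of the diff):

```python
# UAX #11 abbreviations -> effective width, as described in the docstring above.
EAW_TO_EFFECTIVE = {
    "N": "NARROW",     # Neutral
    "Na": "NARROW",    # Narrow
    "H": "NARROW",     # Halfwidth
    "W": "WIDE",       # Wide
    "F": "WIDE",       # Fullwidth
    "A": "AMBIGUOUS",  # Ambiguous
}
```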
@@ -161,7 +182,7 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
     return width_map
 
 
-def load_zero_widths() -> "list[bool]":
+def load_zero_widths() -> list[bool]:
     """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
     character. `c` is considered a zero-width character if
 
@@ -180,26 +201,11 @@ def load_zero_widths() -> "list[bool]":
     # `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
     # as well as a few `Mc` characters that need to be included so that
     # canonically equivalent sequences have the same width.
-    with fetch_open("DerivedCoreProperties.txt") as properties:
-        single = re.compile(
-            r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
-        )
-        multiple = re.compile(
-            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
-        )
-
-        for line in properties.readlines():
-            raw_data = None  # (low, high)
-            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1))
-            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2))
-            else:
-                continue
-            low = int(raw_data[0], 16)
-            high = int(raw_data[1], 16)
-            for cp in range(low, high + 1):
-                zw_map[cp] = True
+    load_property(
+        "DerivedCoreProperties.txt",
+        r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)",
+        lambda cp: operator.setitem(zw_map, cp, True),
+    )
 
     # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
     # as they canonically decompose to two characters with this property,
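The callback here uses `operator.setitem` because a lambda body must be a single expression, so the subscript assignment `zw_map[cp] = True` cannot appear in it directly. A minimal sketch of the equivalence (standalone, not the diff's code):

```python
import operator

zw_map = [False] * 0x110000

mark_zero_width = lambda cp: operator.setitem(zw_map, cp, True)

def mark_zero_width_def(cp):
    zw_map[cp] = True  # fine in a def, not allowed in a lambda body

mark_zero_width(0x0300)
mark_zero_width_def(0x0301)
assert zw_map[0x0300] and zw_map[0x0301]
```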
@@ -217,29 +223,11 @@ def load_zero_widths() -> "list[bool]":
     # and the resulting grapheme has width 2.
     #
     # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
-    with fetch_open("HangulSyllableType.txt") as categories:
-        single = re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+")
-        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+")
-
-        for line in categories.readlines():
-            raw_data = None  # (low, high)
-            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1))
-            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2))
-            else:
-                continue
-            low = int(raw_data[0], 16)
-            high = int(raw_data[1], 16)
-            for cp in range(low, high + 1):
-                zw_map[cp] = True
-
-    # Special case: U+115F HANGUL CHOSEONG FILLER.
-    # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
-    # zero width. However, the expected usage is to combine it with vowel or trailing jamo
-    # (which are considered 0-width on their own) to form a composed Hangul syllable with
-    # width 2. Therefore, we treat it as having width 2.
-    zw_map[0x115F] = False
+    load_property(
+        "HangulSyllableType.txt",
+        r"(?:V|T)",
+        lambda cp: operator.setitem(zw_map, cp, True),
+    )
 
     # Syriac abbreviation mark:
     # Zero-width `Prepended_Concatenation_Mark`
@@ -252,7 +240,14 @@ def load_zero_widths() -> "list[bool]":
     zw_map[0x0891] = True
     zw_map[0x08E2] = True
 
-    # U+A8FA DEVANAGARI CARET
+    # HANGUL CHOSEONG FILLER
+    # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
+    # zero width. However, the expected usage is to combine it with vowel or trailing jamo
+    # (which are considered 0-width on their own) to form a composed Hangul syllable with
+    # width 2. Therefore, we treat it as having width 2.
+    zw_map[0x115F] = False
+
+    # DEVANAGARI CARET
     # https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447
     zw_map[0xA8FA] = True
 
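Moving the U+115F override after the `load_property` calls preserves an ordering requirement: U+115F is marked zero-width by the `Default_Ignorable_Code_Point` pass above, so the explicit `zw_map[0x115F] = False` only sticks if it runs afterwards. A minimal sketch of that ordering (the range used here is for illustration only):

```python
zw_map = [False] * 0x110000

# Property-driven pass first (U+115F..U+1160 are Default_Ignorable_Code_Point).
for cp in range(0x115F, 0x1160 + 1):
    zw_map[cp] = True

# Hand-written override afterwards, so it wins.
zw_map[0x115F] = False

assert zw_map[0x1160] and not zw_map[0x115F]
```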
@@ -287,13 +282,13 @@ def try_extend(self, attempt: "Bucket") -> bool:
         self.widths = more
         return True
 
-    def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
+    def entries(self) -> list[tuple[Codepoint, EffectiveWidth]]:
         """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
         result = list(self.entry_set)
         result.sort()
         return result
 
-    def width(self) -> "EffectiveWidth | None":
+    def width(self) -> EffectiveWidth | None:
         """If all codepoints in this bucket have the same width, return that width; otherwise,
         return `None`."""
         if len(self.widths) == 0:
@@ -305,7 +300,7 @@ def width(self) -> "EffectiveWidth | None":
         return potential_width
 
 
-def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
+def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> list[Bucket]:
     """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
     codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
     same bucket. Returns a list of the buckets in increasing order of those bits."""
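The docstring's grouping rule can be read as a bit-slice key: codepoints whose bits in the half-open range [`low_bit`, `cap_bit`) agree land in the same bucket. A sketch of that key (hypothetical helper, not the body of `make_buckets`):

```python
def bucket_key(codepoint: int, low_bit: int, cap_bit: int) -> int:
    # keep bits [low_bit, cap_bit) of the codepoint
    mask = (1 << (cap_bit - low_bit)) - 1
    return (codepoint >> low_bit) & mask

# Codepoints differing only below `low_bit` share a bucket.
assert bucket_key(0x1F336, 2, 6) == bucket_key(0x1F337, 2, 6)
```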
@@ -373,7 +368,7 @@ def buckets(self):
         """Returns an iterator over this table's buckets."""
         return self.indexed
 
-    def to_bytes(self) -> "list[int]":
+    def to_bytes(self) -> list[int]:
         """Returns this table's entries as a list of bytes. The bytes are formatted according to
         the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
         to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
@@ -389,8 +384,8 @@ def to_bytes(self) -> "list[int]":
 
 
 def make_tables(
-    table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries
-) -> "list[Table]":
+    table_cfgs: list[tuple[BitPos, BitPos, OffsetType]], entries
+) -> list[Table]:
     """Creates a table for each configuration in `table_cfgs`, with the first config corresponding
     to the top-level lookup table, the second config corresponding to the second-level lookup
     table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
@@ -404,7 +399,7 @@ def make_tables(
     return tables
 
 
-def load_emoji_presentation_sequences() -> "list[int]":
+def load_emoji_presentation_sequences() -> list[int]:
     """Outputs a list of character ranages, corresponding to all the valid characters for starting
     an emoji presentation sequence."""
 
@@ -420,7 +415,7 @@ def load_emoji_presentation_sequences() -> "list[int]":
     return codepoints
 
 
-def load_text_presentation_sequences() -> "list[int]":
+def load_text_presentation_sequences() -> list[int]:
     """Outputs a list of character ranages, corresponding to all the valid characters
     whose widths change with a text presentation sequence."""
 
@@ -435,24 +430,12 @@ def load_text_presentation_sequences() -> "list[int]":
                 text_presentation_seq_codepoints.add(cp)
 
     default_emoji_codepoints = set()
-    with fetch_open("emoji/emoji-data.txt") as emoji_data:
-        single = re.compile(r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+")
-        multiple = re.compile(
-            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+"
-        )
 
-        for line in emoji_data.readlines():
-            raw_data = None  # (low, high)
-            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1))
-            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2))
-            else:
-                continue
-            low = int(raw_data[0], 16)
-            high = int(raw_data[1], 16)
-            for cp in range(low, high + 1):
-                default_emoji_codepoints.add(cp)
+    load_property(
+        "emoji/emoji-data.txt",
+        "Emoji_Presentation",
+        lambda cp: default_emoji_codepoints.add(cp),
+    )
 
     codepoints = []
     for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints):
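Here the `action` callback just accumulates codepoints into a set; `lambda cp: default_emoji_codepoints.add(cp)` and passing the bound method `default_emoji_codepoints.add` directly behave the same. A standalone sketch of the collect-then-intersect pattern (sample codepoints are illustrative):

```python
def for_each_codepoint(codepoints, action):
    # stands in for load_property's per-codepoint callback invocation
    for cp in codepoints:
        action(cp)

default_emoji = set()
for_each_codepoint([0x231A, 0x231B], default_emoji.add)  # e.g. WATCH, HOURGLASS

text_presentation_candidates = {0x231A, 0x2764}
# Only codepoints in both sets are emitted, mirroring the intersection above.
assert text_presentation_candidates & default_emoji == {0x231A}
```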
@@ -466,11 +449,11 @@ def load_text_presentation_sequences() -> "list[int]":
 
 
 def make_presentation_sequence_table(
-    seqs: "list[int]",
-    width_map: "list[EffectiveWidth]",
-    spurious_false: "set[EffectiveWidth]",
-    spurious_true: "set[EffectiveWidth]",
-) -> "tuple[list[tuple[int, int]], list[list[int]]]":
+    seqs: list[Codepoint],
+    width_map: list[EffectiveWidth],
+    spurious_false: set[EffectiveWidth],
+    spurious_true: set[EffectiveWidth],
+) -> tuple[list[tuple[int, int]], list[list[int]]]:
     """Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.
     The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
     """
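The layout described in this docstring implies a lookup of roughly the following shape (hedged sketch with hypothetical names; it assumes the first level is a list of `(codepoint >> 10, leaf_index)` pairs and each leaf is a 128-byte, i.e. 1024-bit, bitmap over the 10 LSB):

```python
def might_start_sequence(cp: int, index: list[tuple[int, int]], leaves: list[list[int]]) -> bool:
    prefix = cp >> 10                 # first level: everything but the 10 LSB
    for key, leaf_idx in index:
        if key == prefix:
            bit = cp & 0x3FF          # second level: 1024-bit bitmap
            leaf = leaves[leaf_idx]
            return (leaf[bit >> 3] >> (bit & 7)) & 1 == 1
    return False
```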
@@ -488,13 +471,13 @@ def make_presentation_sequence_table(
         ):
             del prefixes_dict[k]
 
-    msbs: "list[int]" = list(prefixes_dict.keys())
+    msbs: list[int] = list(prefixes_dict.keys())
 
     for cp, width in enumerate(width_map):
         if width in spurious_true and (cp >> 10) in msbs:
             prefixes_dict[cp >> 10].add(cp & 0x3FF)
 
-    leaves: "list[list[int]]" = []
+    leaves: list[list[int]] = []
     for cps in prefixes_dict.values():
         leaf = [0] * 128
         for cp in cps:
@@ -524,10 +507,10 @@ def make_presentation_sequence_table(
 
 
 def emit_module(
     out_name: str,
-    unicode_version: "tuple[int, int, int]",
-    tables: "list[Table]",
-    emoji_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
-    text_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
+    unicode_version: tuple[int, int, int],
+    tables: list[Table],
+    emoji_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
+    text_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
 ):
     """Outputs a Rust module to `out_name` using table data from `tables`.
     If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -574,18 +557,18 @@ def emit_module(
     pub fn lookup_width(c: char, is_cjk: bool) -> usize {
         let cp = c as usize;
 
-        let t1_offset = TABLES_0[cp >> 13 & 0xFF];
+        let t1_offset = TABLES_0.0[cp >> 13 & 0xFF];
 
         // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
         // so each sub-table is 128 bytes in size.
         // (Sub-tables are selected using the computed offset from the previous table.)
-        let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
+        let t2_offset = TABLES_1.0[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
 
         // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
         // This is accomplished by packing four stored entries into one byte.
         // So each sub-table is 2**(6-2) == 16 bytes in size.
         // Since this is the last table, each entry represents an encoded width.
-        let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
+        let packed_widths = TABLES_2.0[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
 
         // Extract the packed width
         let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
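As a cross-check on the comments above, the same three-level arithmetic written out in Python (a sketch; `tables_0/1/2` stand in for the generated byte arrays):

```python
def lookup_width_py(cp: int, tables_0: list[int], tables_1: list[int], tables_2: list[int]) -> int:
    t1_offset = tables_0[(cp >> 13) & 0xFF]
    # 128-byte sub-tables, one byte per entry
    t2_offset = tables_1[128 * t1_offset + ((cp >> 6) & 0x7F)]
    # 16-byte sub-tables, four 2-bit entries packed per byte
    packed_widths = tables_2[16 * t2_offset + ((cp >> 2) & 0xF)]
    return (packed_widths >> (2 * (cp & 0b11))) & 0b11
```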
@@ -669,6 +652,12 @@ def emit_module(
         // Use the 3 LSB of `cp` to index into `leaf_byte`.
         ((leaf_byte >> (cp & 7)) & 1) == 1
     }
+
+    #[repr(align(128))]
+    struct Align128<T>(T);
+
+    #[repr(align(16))]
+    struct Align16<T>(T);
 """
         )
 
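The `Align128`/`Align16` newtypes let the generated statics carry an explicit alignment, presumably so each sub-table sits inside one aligned block: per the comments in `lookup_width`, the byte-per-entry tables use 128-byte sub-tables while the final 2-bit-packed table uses 16-byte sub-tables. A sketch of that size arithmetic (illustrative helper, not code from the diff):

```python
def subtable_size_bytes(index_bits: int, entries_per_byte: int) -> int:
    # 2**index_bits entries, packed `entries_per_byte` to a byte
    return 2**index_bits // entries_per_byte

assert subtable_size_bytes(7, 1) == 128  # TABLES_1: 7-bit sub-tables, one byte per entry
assert subtable_size_bytes(6, 4) == 16   # TABLES_2: 6-bit sub-tables, 2-bit entries
```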
@@ -677,26 +666,27 @@ def emit_module(
             new_subtable_count = len(table.buckets())
             if i == len(tables) - 1:
                 table.indices_to_widths()  # for the last table, indices == widths
+                align = 16
+            else:
+                align = 128
             byte_array = table.to_bytes()
             module.write(
                 f"""
     /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
-    static TABLES_{i}: [u8; {len(byte_array)}] = ["""
+    static TABLES_{i}: Align{align}<[u8; {len(byte_array)}]> = Align{align}(["""
             )
             for j, byte in enumerate(byte_array):
                 # Add line breaks for every 15th entry (chosen to match what rustfmt does)
                 if j % 15 == 0:
                     module.write("\n       ")
                 module.write(f" 0x{byte:02X},")
-            module.write("\n    ];\n")
+            module.write("\n    ]);\n")
             subtable_count = new_subtable_count
 
     # emoji table
 
     module.write(
         f"""
-    #[repr(align(128))]
-    struct Align128<T>(T);
     /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
     /// to get whether it can start an emoji presentation sequence.
     static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([