Notifications: You must be signed in to change notification settings
Fork32
Star278

Commit3aa94a5

authored

Merge pull request #43 from Jules-Bertholet/text-presentation

Support text presentation sequences

2 parents 74c8394 + 2e2d3bb, commit 3aa94a5 (Copy full SHA for 3aa94a5)

File tree

4 files changed

+320

-81

lines changed

4 files changed

+320

-81

lines changed

`‎scripts/unicode.py‎`

Lines changed: 169 additions & 53 deletions

Original file line number	Diff line number	Diff line change
`@@ -123,9 +123,9 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":`
`123`	`123`	`Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
`124`	`124`	`withfetch_open("EastAsianWidth.txt")aseaw:`
`125`	`125`	`# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."`
`126`		`-single=re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")`
	`126`	`+single=re.compile(r"^([0-9A-F]+)\s;\s(\w+) +# (\w+)")`
`127`	`127`	`# matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..."`
`128`		`-multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")`
	`128`	`+multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s;\s(\w+) +# (\w+)")`
`129`	`129`	`# map between width category code and condensed width`
`130`	`130`	`width_codes= {`
`131`	`131`	`**{c:EffectiveWidth.NARROWforcin ["N","Na","H"]},`
`@@ -189,10 +189,10 @@ def load_zero_widths() -> "list[bool]":`
`189`	`189`	`# canonically equivalent sequences have the same width.`
`190`	`190`	`withfetch_open("DerivedCoreProperties.txt")asproperties:`
`191`	`191`	`single=re.compile(`
`192`		`-r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point\|Grapheme_Extend)\s+"`
	`192`	`+r"^([0-9A-F]+)\s;\s(?:Default_Ignorable_Code_Point\|Grapheme_Extend)\s+"`
`193`	`193`	`)`
`194`	`194`	`multiple=re.compile(`
`195`		`-r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point\|Grapheme_Extend)\s+"`
	`195`	`+r"^([0-9A-F]+)\.\.([0-9A-F]+)\s;\s(?:Default_Ignorable_Code_Point\|Grapheme_Extend)\s+"`
`196`	`196`	`)`
`197`	`197`
`198`	`198`	`forlineinproperties.readlines():`
`@@ -225,8 +225,8 @@ def load_zero_widths() -> "list[bool]":`
`225`	`225`	`#`
`226`	`226`	`# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)`
`227`	`227`	`withfetch_open("HangulSyllableType.txt")ascategories:`
`228`		`-single=re.compile(r"^([0-9A-F]+)\s+;\s+(V\|T)\s+")`
`229`		`-multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V\|T)\s+")`
	`228`	`+single=re.compile(r"^([0-9A-F]+)\s;\s(V\|T)\s+")`
	`229`	`+multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s;\s(V\|T)\s+")`
`230`	`230`
`231`	`231`	`forlineincategories.readlines():`
`232`	`232`	`raw_data=None# (low, high)`
`@@ -396,14 +396,14 @@ def make_tables(`
`396`	`396`	`returntables`
`397`	`397`
`398`	`398`
`399`		`-defload_variation_sequences()->"list[int]":`
	`399`	`+defload_emoji_presentation_sequences()->"list[int]":`
`400`	`400`	`"""Outputs a list of character ranges, corresponding to all the valid characters for starting`
`401`	`401`	`an emoji presentation sequence."""`
`402`	`402`
`403`	`403`	`withfetch_open("emoji/emoji-variation-sequences.txt")assequences:`
`404`	`404`	`# Match all emoji presentation sequences`
`405`	`405`	`# (one codepoint followed by U+FE0F, and labeled "emoji style")`
`406`		`-sequence=re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")`
	`406`	`+sequence=re.compile(r"^([0-9A-F]+)\s+FE0F\s;\semoji style")`
`407`	`407`	`codepoints= []`
`408`	`408`	`forlineinsequences.readlines():`
`409`	`409`	`ifmatch:=sequence.match(line):`
`@@ -412,55 +412,114 @@ def load_variation_sequences() -> "list[int]":`
`412`	`412`	`returncodepoints`
`413`	`413`
`414`	`414`
`415`		`-defmake_variation_sequence_table(`
	`415`	`+defload_text_presentation_sequences()->"list[int]":`
	`416`	`+"""Outputs a list of character ranges, corresponding to all the valid characters`
	`417`	`+ whose widths change with a text presentation sequence."""`
	`418`	`+`
	`419`	`+text_presentation_seq_codepoints=set()`
	`420`	`+withfetch_open("emoji/emoji-variation-sequences.txt")assequences:`
	`421`	`+# Match all text presentation sequences`
	`422`	`+# (one codepoint followed by U+FE0E, and labeled "text style")`
	`423`	`+sequence=re.compile(r"^([0-9A-F]+)\s+FE0E\s;\stext style")`
	`424`	`+forlineinsequences.readlines():`
	`425`	`+ifmatch:=sequence.match(line):`
	`426`	`+cp=int(match.group(1),16)`
	`427`	`+text_presentation_seq_codepoints.add(cp)`
	`428`	`+`
	`429`	`+default_emoji_codepoints=set()`
	`430`	`+withfetch_open("emoji/emoji-data.txt")asemoji_data:`
	`431`	`+single=re.compile(r"^([0-9A-F]+)\s;\sEmoji_Presentation\s+")`
	`432`	`+multiple=re.compile(`
	`433`	`+r"^([0-9A-F]+)\.\.([0-9A-F]+)\s;\sEmoji_Presentation\s+"`
	`434`	`+ )`
	`435`	`+`
	`436`	`+forlineinemoji_data.readlines():`
	`437`	`+raw_data=None# (low, high)`
	`438`	`+ifmatch:=single.match(line):`
	`439`	`+raw_data= (match.group(1),match.group(1))`
	`440`	`+elifmatch:=multiple.match(line):`
	`441`	`+raw_data= (match.group(1),match.group(2))`
	`442`	`+else:`
	`443`	`+continue`
	`444`	`+low=int(raw_data[0],16)`
	`445`	`+high=int(raw_data[1],16)`
	`446`	`+forcpinrange(low,high+1):`
	`447`	`+default_emoji_codepoints.add(cp)`
	`448`	`+`
	`449`	`+codepoints= []`
	`450`	`+forcpintext_presentation_seq_codepoints.intersection(default_emoji_codepoints):`
	`451`	`+# "Enclosed Ideographic Supplement" block;`
	`452`	`+# wide even in text presentation`
	`453`	`+ifnotcpinrange(0x1F200,0x1F300):`
	`454`	`+codepoints.append(cp)`
	`455`	`+`
	`456`	`+codepoints.sort()`
	`457`	`+returncodepoints`
	`458`	`+`
	`459`	`+`
	`460`	`+defmake_presentation_sequence_table(`
`416`	`461`	`seqs:"list[int]",`
`417`	`462`	`width_map:"list[EffectiveWidth]",`
`418`		`-)->"tuple[list[int], list[list[int]]]":`
`419`		`-"""Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence.`
`420`		`- (Characters that are always wide may be excluded.)`
	`463`	`+spurious_false:"set[EffectiveWidth]",`
	`464`	`+spurious_true:"set[EffectiveWidth]",`
	`465`	`+)->"tuple[list[tuple[int, int]], list[list[int]]]":`
	`466`	`+"""Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.`
`421`	`467`	`The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.`
`422`	`468`	`"""`
`423`	`469`
`424`	`470`	`prefixes_dict=defaultdict(set)`
`425`	`471`	`forcpinseqs:`
`426`	`472`	`prefixes_dict[cp>>10].add(cp&0x3FF)`
`427`	`473`
`428`		`-# We don't strictly need to keep track of characters that are always wide,`
`429`		`-# because being in an emoji variation seq won't affect their width.`
`430`		`-# So store their info only when it wouldn't inflate the size of the tables.`
`431`	`474`	`forkinlist(prefixes_dict.keys()):`
`432`	`475`	`ifall(`
`433`	`476`	`map(`
`434`		`-lambdacp:width_map[(k<<10)\|cp]==EffectiveWidth.WIDE,`
	`477`	`+lambdacp:width_map[(k<<10)\|cp]inspurious_false,`
`435`	`478`	`prefixes_dict[k],`
`436`	`479`	`)`
`437`	`480`	`):`
`438`	`481`	`delprefixes_dict[k]`
`439`	`482`
`440`		`-indexes=list(prefixes_dict.keys())`
	`483`	`+msbs:"list[int]"=list(prefixes_dict.keys())`
`441`	`484`
`442`		-# Similarly, we can spuriously return `true` for always-wide characters
`443`		`-# even if not part of a presentation seq; this saves an additional lookup,`
`444`		`-# so we should do it where there is no size cost.`
`445`	`485`	`forcp,widthinenumerate(width_map):`
`446`		`-ifwidth==EffectiveWidth.WIDEand (cp>>10)inindexes:`
	`486`	`+ifwidthinspurious_trueand (cp>>10)inmsbs:`
`447`	`487`	`prefixes_dict[cp>>10].add(cp&0x3FF)`
`448`	`488`
`449`		`-leaves= []`
	`489`	`+leaves:"list[list[int]]"= []`
`450`	`490`	`forcpsinprefixes_dict.values():`
`451`	`491`	`leaf= [0]*128`
`452`	`492`	`forcpincps:`
`453`	`493`	`idx_in_leaf,bit_shift=divmod(cp,8)`
`454`	`494`	`leaf[idx_in_leaf]\|=1<<bit_shift`
`455`	`495`	`leaves.append(leaf)`
	`496`	`+`
	`497`	`+indexes= [(msb,index)for (index,msb)inenumerate(msbs)]`
	`498`	`+`
	`499`	`+# Cull duplicate leaves`
	`500`	`+i=0`
	`501`	`+whilei<len(leaves):`
	`502`	`+first_idx=leaves.index(leaves[i])`
	`503`	`+iffirst_idx==i:`
	`504`	`+i+=1`
	`505`	`+else:`
	`506`	`+forjinrange(0,len(indexes)):`
	`507`	`+ifindexes[j][1]==i:`
	`508`	`+indexes[j]= (indexes[j][0],first_idx)`
	`509`	`+elifindexes[j][1]>i:`
	`510`	`+indexes[j]= (indexes[j][0],indexes[j][1]-1)`
	`511`	`+`
	`512`	`+leaves.pop(i)`
	`513`	`+`
`456`	`514`	`return (indexes,leaves)`
`457`	`515`
`458`	`516`
`459`	`517`	`defemit_module(`
`460`	`518`	`out_name:str,`
`461`	`519`	`unicode_version:"tuple[int, int, int]",`
`462`	`520`	`tables:"list[Table]",`
`463`		`-variation_table:"tuple[list[int], list[list[int]]]",`
	`521`	`+emoji_presentation_table:"tuple[list[tuple[int, int]], list[list[int]]]",`
	`522`	`+text_presentation_table:"tuple[list[tuple[int, int]], list[list[int]]]",`
`464`	`523`	`):`
`465`	`524`	"""Outputs a Rust module to `out_name` using table data from `tables`.
`466`	`525`	If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
`@@ -537,7 +596,8 @@ def emit_module(`
`537`	`596`	`"""`
`538`	`597`	`)`
`539`	`598`
`540`		`-variation_idx,variation_leaves=variation_table`
	`599`	`+emoji_presentation_idx,emoji_presentation_leaves=emoji_presentation_table`
	`600`	`+text_presentation_idx,text_presentation_leaves=text_presentation_table`
`541`	`601`
`542`	`602`	`module.write(`
`543`	`603`	`"""`
`@@ -555,7 +615,7 @@ def emit_module(`
`555`	`615`	`"""`
`556`	`616`	`)`
`557`	`617`
`558`		`-fori,msbsinenumerate(variation_idx):`
	`618`	`+formsbs,iinemoji_presentation_idx:`
`559`	`619`	`module.write(f"{msbs} =>{i},\n")`
`560`	`620`
`561`	`621`	`module.write(`
`@@ -571,6 +631,39 @@ def emit_module(`
`571`	`631`	`"""`
`572`	`632`	`)`
`573`	`633`
	`634`	`+module.write(`
	`635`	`+"""`
	`636`	+ /// Returns `true` iff `c` has default emoji presentation, but forms a [text presentation sequence]
	`637`	`+ /// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence)`
	`638`	+ /// when followed by `'\\u{FE0E}'`, and is not ideographic.
	`639`	`+ /// Such sequences are considered to have width 1.`
	`640`	`+ ///`
	`641`	+ /// This may spuriously return `true` for characters of narrow or ambiguous width.
	`642`	`+ #[inline]`
	`643`	`+ pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {`
	`644`	`+ let cp: u32 = c.into();`
	`645`	`+ // First level of lookup uses all but 10 LSB`
	`646`	`+ let top_bits = cp >> 10;`
	`647`	`+ let idx_of_leaf: usize = match top_bits {`
	`648`	`+"""`
	`649`	`+ )`
	`650`	`+`
	`651`	`+formsbs,iintext_presentation_idx:`
	`652`	`+module.write(f"{msbs} =>{i},\n")`
	`653`	`+`
	`654`	`+module.write(`
	`655`	`+""" _ => return false,`
	`656`	`+ };`
	`657`	+ // Extract the 3-9th (0-indexed) least significant bits of `cp`,
	`658`	+ // and use them to index into `leaf_row`.
	`659`	`+ let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();`
	`660`	`+ let leaf_byte = TEXT_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];`
	`661`	+ // Use the 3 LSB of `cp` to index into `leaf_byte`.
	`662`	`+ ((leaf_byte >> (cp & 7)) & 1) == 1`
	`663`	`+ }`
	`664`	`+"""`
	`665`	`+ )`
	`666`	`+`
`574`	`667`	`module.write(`
`575`	`668`	`"""`
`576`	`669`	/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
`@@ -626,12 +719,32 @@ def emit_module(`
`626`	`719`	`f"""`
`627`	`720`	`#[repr(align(128))]`
`628`	`721`	`struct Align128<T>(T);`
`629`		- /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
`630`		`- /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.`
`631`		`- static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{len(variation_leaves)}]> = Align128([`
	`722`	`+ /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint`
	`723`	`+ /// to get whether it can start an emoji presentation sequence.`
	`724`	`+ static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{len(emoji_presentation_leaves)}]> = Align128([`
	`725`	`+"""`
	`726`	`+ )`
	`727`	`+forleafinemoji_presentation_leaves:`
	`728`	`+module.write(" [\n")`
	`729`	`+forrowinbatched(leaf,14):`
	`730`	`+module.write(" ")`
	`731`	`+forentryinrow:`
	`732`	`+module.write(f" 0x{entry:02X},")`
	`733`	`+module.write("\n")`
	`734`	`+module.write(" ],\n")`
	`735`	`+`
	`736`	`+module.write(" ]);\n")`
	`737`	`+`
	`738`	`+# text table`
	`739`	`+`
	`740`	`+module.write(`
	`741`	`+f"""`
	`742`	`+ /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint`
	`743`	`+ /// to get whether it can start a text presentation sequence.`
	`744`	`+ static TEXT_PRESENTATION_LEAVES: Align128<[[u8; 128];{len(text_presentation_leaves)}]> = Align128([`
`632`	`745`	`"""`
`633`	`746`	`)`
`634`		`-forleafinvariation_leaves:`
	`747`	`+forleafintext_presentation_leaves:`
`635`	`748`	`module.write(" [\n")`
`636`	`749`	`forrowinbatched(leaf,14):`
`637`	`750`	`module.write(" ")`
`@@ -650,21 +763,7 @@ def main(module_path: str):`
`650`	`763`	`lookup table for character width, and write a Rust module utilizing that table to`
`651`	`764`	`module_filename`.
`652`	`765`
`653`		`- We obey the following rules, in decreasing order of importance:`
`654`		`-`
`655`		`- - Emoji presentation sequences are double-width.`
`656`		- - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
`657`		`- - Hangul jamo medial vowels & final consonants are zero-width.`
`658`		- - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
`659`		`- - Control characters are zero-width.`
`660`		- - `Grapheme_Extend` chracters, as well as eight characters that NFD decompose to `Grapheme_Extend` chracters,
`661`		`- are zero-width.`
`662`		- - Codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
`663`		- - Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
`664`		`- - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width`
`665`		- of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
`666`		`-`
`667`		- These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
	`766`	+ See `lib.rs` for documentation of the exact width rules.
`668`	`767`	`"""`
`669`	`768`	`version=load_unicode_version()`
`670`	`769`	`print(f"Generating module for Unicode{version[0]}.{version[1]}.{version[2]}")`
`@@ -682,8 +781,18 @@ def main(module_path: str):`
`682`	`781`
`683`	`782`	`tables=make_tables(TABLE_CFGS,enumerate(width_map))`
`684`	`783`
`685`		`-emoji_variations=load_variation_sequences()`
`686`		`-variation_table=make_variation_sequence_table(emoji_variations,width_map)`
	`784`	`+emoji_presentations=load_emoji_presentation_sequences()`
	`785`	`+emoji_presentation_table=make_presentation_sequence_table(`
	`786`	`+emoji_presentations,width_map, {EffectiveWidth.WIDE}, {EffectiveWidth.WIDE}`
	`787`	`+ )`
	`788`	`+`
	`789`	`+text_presentations=load_text_presentation_sequences()`
	`790`	`+text_presentation_table=make_presentation_sequence_table(`
	`791`	`+text_presentations,`
	`792`	`+width_map,`
	`793`	`+set(),`
	`794`	`+ {EffectiveWidth.NARROW,EffectiveWidth.AMBIGUOUS},`
	`795`	`+ )`
`687`	`796`
`688`	`797`	`# Download normalization test file for use by tests`
`689`	`798`	`fetch_open("NormalizationTest.txt","../tests/")`
`@@ -694,16 +803,23 @@ def main(module_path: str):`
`694`	`803`	`size_bytes=len(table.to_bytes())`
`695`	`804`	`print(f"Table{i} size:{size_bytes} bytes")`
`696`	`805`	`total_size+=size_bytes`
`697`		`-emoji_index_size=len(variation_table[0])*4`
`698`		`-print(f"Emoji presentation index size:{emoji_index_size} bytes")`
`699`		`-total_size+=emoji_index_size`
`700`		`-emoji_leaves_size=len(variation_table[1])*len(variation_table[1][0])`
`701`		`-print(f"Emoji presentation leaves size:{emoji_leaves_size} bytes")`
`702`		`-total_size+=emoji_leaves_size`
	`806`	`+`
	`807`	`+fors,tablein [`
	`808`	`+ ("Emoji",emoji_presentation_table),`
	`809`	`+ ("Text",text_presentation_table),`
	`810`	`+ ]:`
	`811`	`+index_size=len(table[0])*4`
	`812`	`+print(f"{s} presentation index size:{index_size} bytes")`
	`813`	`+total_size+=index_size`
	`814`	`+leaves_size=len(table[1])*len(table[1][0])`
	`815`	`+print(f"{s} presentation leaves size:{leaves_size} bytes")`
	`816`	`+total_size+=leaves_size`
`703`	`817`	`print("------------------------")`
`704`	`818`	`print(f" Total size:{total_size} bytes")`
`705`	`819`
`706`		`-emit_module(module_path,version,tables,variation_table)`
	`820`	`+emit_module(`
	`821`	`+module_path,version,tables,emoji_presentation_table,text_presentation_table`
	`822`	`+ )`
`707`	`823`	`print(f'Wrote to "{module_path}"')`
`708`	`824`
`709`	`825`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit3aa94a5

File tree

4 files changed

4 files changed

`‎scripts/unicode.py‎`

0 commit comments