@@ -123,9 +123,9 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
123123 `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
124124with fetch_open ("EastAsianWidth.txt" )as eaw :
125125# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
126- single = re .compile (r"^([0-9A-F]+)\s+ ;\s+ (\w+) +# (\w+)" )
126+ single = re .compile (r"^([0-9A-F]+)\s* ;\s* (\w+) +# (\w+)" )
127127# matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..."
128- multiple = re .compile (r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+ ;\s+ (\w+) +# (\w+)" )
128+ multiple = re .compile (r"^([0-9A-F]+)\.\.([0-9A-F]+)\s* ;\s* (\w+) +# (\w+)" )
129129# map between width category code and condensed width
130130width_codes = {
131131** {c :EffectiveWidth .NARROW for c in ["N" ,"Na" ,"H" ]},
@@ -189,10 +189,10 @@ def load_zero_widths() -> "list[bool]":
189189# canonically equivalent sequences have the same width.
190190with fetch_open ("DerivedCoreProperties.txt" )as properties :
191191single = re .compile (
192- r"^([0-9A-F]+)\s+ ;\s+ (?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
192+ r"^([0-9A-F]+)\s* ;\s* (?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
193193 )
194194multiple = re .compile (
195- r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+ ;\s+ (?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
195+ r"^([0-9A-F]+)\.\.([0-9A-F]+)\s* ;\s* (?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
196196 )
197197
198198for line in properties .readlines ():
@@ -225,8 +225,8 @@ def load_zero_widths() -> "list[bool]":
225225#
226226# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
227227with fetch_open ("HangulSyllableType.txt" )as categories :
228- single = re .compile (r"^([0-9A-F]+)\s+ ;\s+ (V|T)\s+" )
229- multiple = re .compile (r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+ ;\s+ (V|T)\s+" )
228+ single = re .compile (r"^([0-9A-F]+)\s* ;\s* (V|T)\s+" )
229+ multiple = re .compile (r"^([0-9A-F]+)\.\.([0-9A-F]+)\s* ;\s* (V|T)\s+" )
230230
231231for line in categories .readlines ():
232232raw_data = None # (low, high)
@@ -396,14 +396,14 @@ def make_tables(
396396return tables
397397
398398
399- def load_variation_sequences ()-> "list[int]" :
399+ def load_emoji_presentation_sequences ()-> "list[int]" :
400400"""Outputs a list of character ranges, corresponding to all the valid characters for starting
401401 an emoji presentation sequence."""
402402
403403with fetch_open ("emoji/emoji-variation-sequences.txt" )as sequences :
404404# Match all emoji presentation sequences
405405# (one codepoint followed by U+FE0F, and labeled "emoji style")
406- sequence = re .compile (r"^([0-9A-F]+)\s+FE0F\s*;\s+ emoji style" )
406+ sequence = re .compile (r"^([0-9A-F]+)\s+FE0F\s*;\s* emoji style" )
407407codepoints = []
408408for line in sequences .readlines ():
409409if match := sequence .match (line ):
@@ -412,55 +412,114 @@ def load_variation_sequences() -> "list[int]":
412412return codepoints
413413
414414
415- def make_variation_sequence_table (
def load_text_presentation_sequences() -> "list[int]":
    """Returns the sorted list of codepoints whose width changes when they start
    a text presentation sequence.

    A codepoint is included when all of the following hold:

    - `emoji/emoji-variation-sequences.txt` lists it followed by U+FE0E and
      labeled "text style";
    - `emoji/emoji-data.txt` assigns it the `Emoji_Presentation` property;
    - it lies outside `0x1F200..=0x1F2FF` ("Enclosed Ideographic Supplement"),
      whose characters stay wide even in text presentation.
    """

    # Step 1: every base codepoint that has a text presentation sequence
    # (one codepoint followed by U+FE0E, and labeled "text style").
    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        text_style = re.compile(r"^([0-9A-F]+)\s+FE0E\s*;\s*text style")
        has_text_sequence = {
            int(hit.group(1), 16)
            for hit in map(text_style.match, sequences.readlines())
            if hit
        }

    # Step 2: every codepoint with default emoji presentation. Entries are
    # either a lone codepoint or an inclusive `low..high` range.
    emoji_by_default = set()
    with fetch_open("emoji/emoji-data.txt") as emoji_data:
        lone = re.compile(r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+")
        span = re.compile(
            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+"
        )

        for entry in emoji_data.readlines():
            hit = lone.match(entry)
            if hit:
                first = last = hit.group(1)
            else:
                hit = span.match(entry)
                if hit is None:
                    continue
                first, last = hit.groups()
            emoji_by_default.update(range(int(first, 16), int(last, 16) + 1))

    # Step 3: keep codepoints present in both sets, minus the
    # "Enclosed Ideographic Supplement" block (wide even in text presentation).
    return sorted(
        cp
        for cp in has_text_sequence & emoji_by_default
        if not 0x1F200 <= cp < 0x1F300
    )
458+
459+
def make_presentation_sequence_table(
    seqs: "list[int]",
    width_map: "list[EffectiveWidth]",
    spurious_false: "set[EffectiveWidth]",
    spurious_true: "set[EffectiveWidth]",
) -> "tuple[list[tuple[int, int]], list[list[int]]]":
    """Generates 2-level lookup table for whether a codepoint might start a presentation sequence.

    The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap
    (stored as 128 bytes, least significant bit first) for those 10 LSB.

    Parameters:
    - `seqs`: codepoints that can start the kind of sequence being tabulated.
    - `width_map`: width of every codepoint, indexed by codepoint.
    - `spurious_false`: widths for which the table is allowed to answer `false` even though
      the codepoint is in `seqs`. A 1024-codepoint block is dropped entirely when every
      member of `seqs` inside it has such a width.
    - `spurious_true`: widths for which the table is allowed to answer `true` even though
      the codepoint is not in `seqs`. Within the blocks that are kept, every codepoint
      with such a width has its bit set.

    Returns `(indexes, leaves)`: `indexes` is a list of `(msb, leaf_index)` pairs, one per
    kept block in first-seen order, and `leaves` holds the distinct bitmaps. Blocks whose
    bitmaps are identical share a single leaf.
    """

    # Bucket the 10 LSB of every sequence starter under its upper bits.
    low_bits_by_msb = defaultdict(set)
    for cp in seqs:
        low_bits_by_msb[cp >> 10].add(cp & 0x3FF)

    # A block consisting solely of `spurious_false` characters carries no
    # information the caller needs, so omitting it shrinks the table for free.
    for msb in list(low_bits_by_msb):
        if all(
            width_map[(msb << 10) | low] in spurious_false
            for low in low_bits_by_msb[msb]
        ):
            del low_bits_by_msb[msb]

    # Setting extra bits inside a block that is stored anyway costs no space,
    # so mark every `spurious_true` character there. Only the kept blocks are
    # scanned; slicing clamps automatically if `width_map` ends mid-block.
    for msb, low_bits in low_bits_by_msb.items():
        block_start = msb << 10
        for low, width in enumerate(width_map[block_start : block_start + 0x400]):
            if width in spurious_true:
                low_bits.add(low)

    # Render each block as a 128-byte bitmap, storing each distinct bitmap once
    # (at the position of its first occurrence) and pointing duplicates at it.
    indexes: "list[tuple[int, int]]" = []
    leaves: "list[list[int]]" = []
    leaf_positions: "dict[tuple[int, ...], int]" = {}
    for msb, low_bits in low_bits_by_msb.items():
        leaf = [0] * 128
        for low in low_bits:
            byte_idx, bit_shift = divmod(low, 8)
            leaf[byte_idx] |= 1 << bit_shift

        key = tuple(leaf)
        position = leaf_positions.get(key)
        if position is None:
            position = leaf_positions[key] = len(leaves)
            leaves.append(leaf)
        indexes.append((msb, position))

    return (indexes, leaves)
457515
458516
459517def emit_module (
460518out_name :str ,
461519unicode_version :"tuple[int, int, int]" ,
462520tables :"list[Table]" ,
463- variation_table :"tuple[list[int], list[list[int]]]" ,
521+ emoji_presentation_table :"tuple[list[tuple[int, int]], list[list[int]]]" ,
522+ text_presentation_table :"tuple[list[tuple[int, int]], list[list[int]]]" ,
464523):
465524"""Outputs a Rust module to `out_name` using table data from `tables`.
466525 If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -537,7 +596,8 @@ def emit_module(
537596"""
538597 )
539598
540- variation_idx ,variation_leaves = variation_table
599+ emoji_presentation_idx ,emoji_presentation_leaves = emoji_presentation_table
600+ text_presentation_idx ,text_presentation_leaves = text_presentation_table
541601
542602module .write (
543603"""
@@ -555,7 +615,7 @@ def emit_module(
555615"""
556616 )
557617
558- for i , msbs in enumerate ( variation_idx ) :
618+ for msbs , i in emoji_presentation_idx :
559619module .write (f"{ msbs } =>{ i } ,\n " )
560620
561621module .write (
@@ -571,6 +631,39 @@ def emit_module(
571631"""
572632 )
573633
634+ module .write (
635+ """
636+ /// Returns `true` iff `c` has default emoji presentation, but forms a [text presentation sequence]
637+ /// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence)
638+ /// when followed by `'\\ u{FEOE}'`, and is not ideographic.
639+ /// Such sequences are considered to have width 1.
640+ ///
641+ /// This may spuriously return `true` for characters of narrow or ambiguous width.
642+ #[inline]
643+ pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {
644+ let cp: u32 = c.into();
645+ // First level of lookup uses all but 10 LSB
646+ let top_bits = cp >> 10;
647+ let idx_of_leaf: usize = match top_bits {
648+ """
649+ )
650+
651+ for msbs ,i in text_presentation_idx :
652+ module .write (f"{ msbs } =>{ i } ,\n " )
653+
654+ module .write (
655+ """ _ => return false,
656+ };
657+ // Extract the 3-9th (0-indexed) least significant bits of `cp`,
658+ // and use them to index into `leaf_row`.
659+ let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
660+ let leaf_byte = TEXT_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
661+ // Use the 3 LSB of `cp` to index into `leaf_byte`.
662+ ((leaf_byte >> (cp & 7)) & 1) == 1
663+ }
664+ """
665+ )
666+
574667module .write (
575668"""
576669 /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
@@ -626,12 +719,32 @@ def emit_module(
626719f"""
627720 #[repr(align(128))]
628721 struct Align128<T>(T);
629- /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
630- /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
631- static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{ len (variation_leaves )} ]> = Align128([
722+ /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
723+ /// to get whether it can start an emoji presentation sequence.
724+ static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{ len (emoji_presentation_leaves )} ]> = Align128([
725+ """
726+ )
727+ for leaf in emoji_presentation_leaves :
728+ module .write (" [\n " )
729+ for row in batched (leaf ,14 ):
730+ module .write (" " )
731+ for entry in row :
732+ module .write (f" 0x{ entry :02X} ," )
733+ module .write ("\n " )
734+ module .write (" ],\n " )
735+
736+ module .write (" ]);\n " )
737+
738+ # text table
739+
740+ module .write (
741+ f"""
742+ /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
743+ /// to get whether it can start a text presentation sequence.
744+ static TEXT_PRESENTATION_LEAVES: Align128<[[u8; 128];{ len (text_presentation_leaves )} ]> = Align128([
632745"""
633746 )
634- for leaf in variation_leaves :
747+ for leaf in text_presentation_leaves :
635748module .write (" [\n " )
636749for row in batched (leaf ,14 ):
637750module .write (" " )
@@ -650,21 +763,7 @@ def main(module_path: str):
650763 lookup table for character width, and write a Rust module utilizing that table to
651764 `module_filename`.
652765
653- We obey the following rules, in decreasing order of importance:
654-
655- - Emoji presentation sequences are double-width.
656- - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
657- - Hangul jamo medial vowels & final consonants are zero-width.
658- - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
659- - Control characters are zero-width.
660- - `Grapheme_Extend` chracters, as well as eight characters that NFD decompose to `Grapheme_Extend` chracters,
661- are zero-width.
662- - Codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
663- - Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
664- - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
665- of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
666-
667- These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
766+ See `lib.rs` for documentation of the exact width rules.
668767 """
669768version = load_unicode_version ()
670769print (f"Generating module for Unicode{ version [0 ]} .{ version [1 ]} .{ version [2 ]} " )
@@ -682,8 +781,18 @@ def main(module_path: str):
682781
683782tables = make_tables (TABLE_CFGS ,enumerate (width_map ))
684783
685- emoji_variations = load_variation_sequences ()
686- variation_table = make_variation_sequence_table (emoji_variations ,width_map )
784+ emoji_presentations = load_emoji_presentation_sequences ()
785+ emoji_presentation_table = make_presentation_sequence_table (
786+ emoji_presentations ,width_map , {EffectiveWidth .WIDE }, {EffectiveWidth .WIDE }
787+ )
788+
789+ text_presentations = load_text_presentation_sequences ()
790+ text_presentation_table = make_presentation_sequence_table (
791+ text_presentations ,
792+ width_map ,
793+ set (),
794+ {EffectiveWidth .NARROW ,EffectiveWidth .AMBIGUOUS },
795+ )
687796
688797# Download normalization test file for use by tests
689798fetch_open ("NormalizationTest.txt" ,"../tests/" )
@@ -694,16 +803,23 @@ def main(module_path: str):
694803size_bytes = len (table .to_bytes ())
695804print (f"Table{ i } size:{ size_bytes } bytes" )
696805total_size += size_bytes
697- emoji_index_size = len (variation_table [0 ])* 4
698- print (f"Emoji presentation index size:{ emoji_index_size } bytes" )
699- total_size += emoji_index_size
700- emoji_leaves_size = len (variation_table [1 ])* len (variation_table [1 ][0 ])
701- print (f"Emoji presentation leaves size:{ emoji_leaves_size } bytes" )
702- total_size += emoji_leaves_size
806+
807+ for s ,table in [
808+ ("Emoji" ,emoji_presentation_table ),
809+ ("Text" ,text_presentation_table ),
810+ ]:
811+ index_size = len (table [0 ])* 4
812+ print (f"{ s } presentation index size:{ index_size } bytes" )
813+ total_size += index_size
814+ leaves_size = len (table [1 ])* len (table [1 ][0 ])
815+ print (f"{ s } presentation leaves size:{ leaves_size } bytes" )
816+ total_size += leaves_size
703817print ("------------------------" )
704818print (f" Total size:{ total_size } bytes" )
705819
706- emit_module (module_path ,version ,tables ,variation_table )
820+ emit_module (
821+ module_path ,version ,tables ,emoji_presentation_table ,text_presentation_table
822+ )
707823print (f'Wrote to "{ module_path } "' )
708824
709825