1717# - HangulSyllableType.txt
1818# - PropList.txt
1919# - ReadMe.txt
20+ # - emoji/emoji-variation-sequences.txt
2021#
2122# Since this should not require frequent updates, we just store this
2223# out-of-line and check the generated module into git.
2627import os
2728import re
2829import sys
30+ from collections import defaultdict
31+ from itertools import batched
2932
3033NUM_CODEPOINTS = 0x110000
3134"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
@@ -69,12 +72,13 @@ def fetch_open(filename: str):
6972"""Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
7073 fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
7174 """
72- if not os .path .exists (os .path .basename (filename )):
75+ basename = os .path .basename (filename )
76+ if not os .path .exists (basename ):
7377os .system (f"curl -O http://www.unicode.org/Public/UNIDATA/{ filename } " )
7478try :
75- return open (filename ,encoding = "utf-8" )
79+ return open (basename ,encoding = "utf-8" )
7680except OSError :
77- sys .stderr .write (f"cannot load{ filename } " )
81+ sys .stderr .write (f"cannot load{ basename } " )
7882sys .exit (1 )
7983
8084
@@ -384,8 +388,71 @@ def make_tables(
384388return tables
385389
386390
def load_variation_sequences() -> "list[int]":
    """Return the base codepoints of all emoji presentation sequences.

    Reads `emoji/emoji-variation-sequences.txt` through `fetch_open` and
    collects, in file order, every codepoint that the file lists as being
    followed by U+FE0F and labeled "emoji style". The result is a flat list
    of individual codepoints (not ranges).
    """
    # Match all emoji presentation sequences
    # (one codepoint followed by U+FE0F, and labeled "emoji style")
    sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")

    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        # Iterate the file object directly so lines are streamed instead of
        # being materialized into a throwaway list by `readlines()`.
        return [
            int(match.group(1), 16)
            for line in sequences
            if (match := sequence.match(line))
        ]
405+
406+
def make_variation_sequence_table(
    seqs: "list[int]",
    width_map: "list[EffectiveWidth]",
) -> "tuple[list[int], list[list[int]]]":
    """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence.
    (Characters that are always wide may be excluded.)
    The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.

    `seqs` holds the codepoints that can start an emoji presentation sequence;
    `width_map` is indexed by codepoint and gives each codepoint's width.

    Returns `(indexes, leaves)`: `indexes[i]` is the value of `cp >> 10` served
    by `leaves[i]`, and each leaf is a list of 128 byte values in which bit
    `low & 7` of byte `low >> 3` is set for every included `low = cp & 0x3FF`.
    """

    # Group the 10 low bits of every codepoint under its high-bits prefix.
    by_prefix = defaultdict(set)
    for cp in seqs:
        by_prefix[cp >> 10].add(cp & 0x3FF)

    # We don't strictly need to keep track of characters that are always wide,
    # because being in an emoji variation seq won't affect their width.
    # So store their info only when it wouldn't inflate the size of the tables:
    # a prefix whose members are all wide is dropped entirely.
    # (A dict comprehension keeps the original insertion order of the prefixes.)
    kept = {
        prefix: low_bits
        for prefix, low_bits in by_prefix.items()
        if not all(
            width_map[(prefix << 10) | low] == EffectiveWidth.WIDE
            for low in low_bits
        )
    }

    # Similarly, we can spuriously return `true` for always-wide characters
    # even if not part of a presentation seq; this saves an additional lookup,
    # so we should do it where there is no size cost.
    # Only the 1024-codepoint window of each retained prefix can contribute,
    # so scan just those slices instead of the whole of `width_map`.
    for prefix, low_bits in kept.items():
        window_start = prefix << 10
        window = width_map[window_start : window_start + 0x400]
        for low, width in enumerate(window):
            if width == EffectiveWidth.WIDE:
                low_bits.add(low)

    # Pack each set of low bits into a 128-byte (1024-bit) bitmap.
    leaves = []
    for low_bits in kept.values():
        leaf = [0] * 128
        for low in low_bits:
            byte_index, bit_shift = divmod(low, 8)
            leaf[byte_index] |= 1 << bit_shift
        leaves.append(leaf)

    return (list(kept), leaves)
449+
450+
387451def emit_module (
388- out_name :str ,unicode_version :"tuple[int, int, int]" ,tables :"list[Table]"
452+ out_name :str ,
453+ unicode_version :"tuple[int, int, int]" ,
454+ tables :"list[Table]" ,
455+ variation_table :"tuple[list[int], list[list[int]]]" ,
389456):
390457"""Outputs a Rust module to `out_name` using table data from `tables`.
391458 If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -462,6 +529,40 @@ def emit_module(
462529"""
463530 )
464531
532+ variation_idx ,variation_leaves = variation_table
533+
534+ module .write (
535+ """
536+ /// Whether this character forms an [emoji presentation sequence]
537+ /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
538+ /// when followed by `'\\ u{FEOF}'`.
539+ /// Emoji presentation sequences are considered to have width 2.
540+ /// This may spuriously return `true` or `false` for characters that are always wide.
541+ #[inline]
542+ pub fn starts_emoji_presentation_seq(c: char) -> bool {
543+ let cp: u32 = c.into();
544+ // First level of lookup uses all but 10 LSB
545+ let top_bits = cp >> 10;
546+ let idx_of_leaf: usize = match top_bits {
547+ """
548+ )
549+
550+ for i ,msbs in enumerate (variation_idx ):
551+ module .write (f"{ msbs } =>{ i } ,\n " )
552+
553+ module .write (
554+ """ _ => return false,
555+ };
556+ // Extract the 3-9th (0-indexed) least significant bits of `cp`,
557+ // and use them to index into `leaf_row`.
558+ let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
559+ let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
560+ // Use the 3 LSB of `cp` to index into `leaf_byte`.
561+ ((leaf_byte >> (cp & 7)) & 1) == 1
562+ }
563+ """
564+ )
565+
465566module .write (
466567"""
467568 /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
@@ -510,6 +611,29 @@ def emit_module(
510611module .write (f" 0x{ byte :02X} ," )
511612module .write ("\n ];\n " )
512613subtable_count = new_subtable_count
614+
615+ # emoji table
616+
617+ module .write (
618+ f"""
619+ #[repr(align(128))]
620+ struct Align128<T>(T);
621+ /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
622+ /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
623+ static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{ len (variation_leaves )} ]> = Align128([
624+ """
625+ )
626+ for leaf in variation_leaves :
627+ module .write (" [\n " )
628+ for row in batched (leaf ,14 ):
629+ module .write (" " )
630+ for entry in row :
631+ module .write (f" 0x{ entry :02X} ," )
632+ module .write ("\n " )
633+ module .write (" ],\n " )
634+
635+ module .write (" ]);\n " )
636+
513637module .write ("}\n " )
514638
515639
@@ -520,6 +644,7 @@ def main(module_filename: str):
520644
521645 We obey the following rules, in decreasing order of importance:
522646
647+ - Emoji presentation sequences are double-width.
523648 - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
524649 - Hangul jamo medial vowels & final consonants are zero-width.
525650 - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
@@ -549,16 +674,25 @@ def main(module_filename: str):
549674
550675tables = make_tables (TABLE_CFGS ,enumerate (width_map ))
551676
677+ emoji_variations = load_variation_sequences ()
678+ variation_table = make_variation_sequence_table (emoji_variations ,width_map )
679+
552680print ("------------------------" )
553681total_size = 0
554682for i ,table in enumerate (tables ):
555683size_bytes = len (table .to_bytes ())
556- print (f"Table{ i } Size :{ size_bytes } bytes" )
684+ print (f"Table{ i } size :{ size_bytes } bytes" )
557685total_size += size_bytes
686+ emoji_index_size = len (variation_table [0 ])* 4
687+ print (f"Emoji presentation index size:{ emoji_index_size } bytes" )
688+ total_size += emoji_index_size
689+ emoji_leaves_size = len (variation_table [1 ])* len (variation_table [1 ][0 ])
690+ print (f"Emoji presentation leaves size:{ emoji_leaves_size } bytes" )
691+ total_size += emoji_leaves_size
558692print ("------------------------" )
559- print (f" TotalSize :{ total_size } bytes" )
693+ print (f" Totalsize :{ total_size } bytes" )
560694
561- emit_module (module_filename ,version ,tables )
695+ emit_module (module_filename ,version ,tables , variation_table )
562696print (f'Wrote to "{ module_filename } "' )
563697
564698