 
 import enum
 import math
+import operator
 import os
 import re
 import sys
 import urllib.request
 from collections import defaultdict
 from itertools import batched
+from typing import Callable
 
 UNICODE_VERSION = "15.1.0"
 """The version of the Unicode data files to download."""
@@ -90,13 +92,32 @@ def fetch_open(filename: str, local_prefix: str = ""):
         sys.exit(1)
 
 
-def load_unicode_version() -> "tuple[int, int, int]":
+def load_unicode_version() -> tuple[int, int, int]:
     """Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
     with fetch_open("ReadMe.txt") as readme:
         pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
         return tuple(map(int, re.search(pattern, readme.read()).groups()))
 
 
+def load_property(filename: str, pattern: str, action: Callable[[int], None]):
+    with fetch_open(filename) as properties:
+        single = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+")
+        multiple = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+")
+
+        for line in properties.readlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                action(cp)
+
+
 class EffectiveWidth(enum.IntEnum):
     """Represents the width of a Unicode character. All East Asian Width classes resolve into
     either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
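The new `load_property` helper factors out the single-codepoint / codepoint-range parsing that several call sites below previously duplicated. A standalone sketch of the contract it implements (this toy parser reads from a list of strings rather than from `fetch_open`, and the sample lines are illustrative):

```python
import re

def parse_property_lines(lines, pattern, action):
    # Same shape as load_property: "XXXX ; Prop ..." or "XXXX..YYYY ; Prop ...",
    # with `action` invoked once per codepoint in the (inclusive) range.
    single = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+")
    multiple = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+")
    for line in lines:
        if match := single.match(line):
            low, high = match.group(1), match.group(1)
        elif match := multiple.match(line):
            low, high = match.group(1), match.group(2)
        else:
            continue
        for cp in range(int(low, 16), int(high, 16) + 1):
            action(cp)

sample = [
    "00AD          ; Default_Ignorable_Code_Point # ...",
    "180B..180D    ; Default_Ignorable_Code_Point # ...",
]
found = set()
parse_property_lines(sample, "Default_Ignorable_Code_Point", found.add)
assert found == {0x00AD, 0x180B, 0x180C, 0x180D}
```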
@@ -112,15 +133,15 @@ class EffectiveWidth(enum.IntEnum):
     """ Two columns wide in a CJK context. One column wide in all other contexts. """
 
 
-def load_east_asian_widths() -> "list[EffectiveWidth]":
+def load_east_asian_widths() -> list[EffectiveWidth]:
     """Return a list of effective widths, indexed by codepoint.
     Widths are determined by fetching and parsing `EastAsianWidth.txt`.
 
     `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.
 
     `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
 
-    `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`."""
+    `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
     with fetch_open("EastAsianWidth.txt") as eaw:
         # matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
         single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
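For reference, the class-to-width mapping the docstring describes, keyed by the abbreviations used in `EastAsianWidth.txt` (illustrative sketch, not part of the diff):

```python
# UAX #11 abbreviations -> effective width, as described in the docstring above.
EAW_TO_EFFECTIVE = {
    "N": "NARROW",     # Neutral
    "Na": "NARROW",    # Narrow
    "H": "NARROW",     # Halfwidth
    "W": "WIDE",       # Wide
    "F": "WIDE",       # Fullwidth
    "A": "AMBIGUOUS",  # Ambiguous
}
```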
@@ -161,7 +182,7 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
     return width_map
 
 
-def load_zero_widths() -> "list[bool]":
+def load_zero_widths() -> list[bool]:
     """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
     character. `c` is considered a zero-width character if
 
@@ -180,26 +201,11 @@ def load_zero_widths() -> "list[bool]":
     # `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
     # as well as a few `Mc` characters that need to be included so that
     # canonically equivalent sequences have the same width.
-    with fetch_open("DerivedCoreProperties.txt") as properties:
-        single = re.compile(
-            r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
-        )
-        multiple = re.compile(
-            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
-        )
-
-        for line in properties.readlines():
-            raw_data = None  # (low, high)
-            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1))
-            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2))
-            else:
-                continue
-            low = int(raw_data[0], 16)
-            high = int(raw_data[1], 16)
-            for cp in range(low, high + 1):
-                zw_map[cp] = True
+    load_property(
+        "DerivedCoreProperties.txt",
+        r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)",
+        lambda cp: operator.setitem(zw_map, cp, True),
+    )
 
     # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
     # as they canonically decompose to two characters with this property,
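The callback here uses `operator.setitem` because a lambda body must be a single expression, so the subscript assignment `zw_map[cp] = True` cannot appear in it directly. A minimal sketch of the equivalence (standalone, not the diff's code):

```python
import operator

zw_map = [False] * 0x110000

mark_zero_width = lambda cp: operator.setitem(zw_map, cp, True)

def mark_zero_width_def(cp):
    zw_map[cp] = True  # fine in a def, not allowed in a lambda body

mark_zero_width(0x0300)
mark_zero_width_def(0x0301)
assert zw_map[0x0300] and zw_map[0x0301]
```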
@@ -217,29 +223,11 @@ def load_zero_widths() -> "list[bool]":
     # and the resulting grapheme has width 2.
     #
     # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
-    with fetch_open("HangulSyllableType.txt") as categories:
-        single = re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+")
-        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+")
-
-        for line in categories.readlines():
-            raw_data = None  # (low, high)
-            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1))
-            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2))
-            else:
-                continue
-            low = int(raw_data[0], 16)
-            high = int(raw_data[1], 16)
-            for cp in range(low, high + 1):
-                zw_map[cp] = True
-
-    # Special case: U+115F HANGUL CHOSEONG FILLER.
-    # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
-    # zero width. However, the expected usage is to combine it with vowel or trailing jamo
-    # (which are considered 0-width on their own) to form a composed Hangul syllable with
-    # width 2. Therefore, we treat it as having width 2.
-    zw_map[0x115F] = False
+    load_property(
+        "HangulSyllableType.txt",
+        r"(?:V|T)",
+        lambda cp: operator.setitem(zw_map, cp, True),
+    )
 
     # Syriac abbreviation mark:
     # Zero-width `Prepended_Concatenation_Mark`
@@ -252,7 +240,14 @@ def load_zero_widths() -> "list[bool]":
     zw_map[0x0891] = True
     zw_map[0x08E2] = True
 
-    # U+A8FA DEVANAGARI CARET
+    # HANGUL CHOSEONG FILLER
+    # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
+    # zero width. However, the expected usage is to combine it with vowel or trailing jamo
+    # (which are considered 0-width on their own) to form a composed Hangul syllable with
+    # width 2. Therefore, we treat it as having width 2.
+    zw_map[0x115F] = False
+
+    # DEVANAGARI CARET
     # https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447
     zw_map[0xA8FA] = True
 
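Moving the U+115F override after the `load_property` calls preserves an ordering requirement: U+115F is marked zero-width by the `Default_Ignorable_Code_Point` pass above, so the explicit `zw_map[0x115F] = False` only sticks if it runs afterwards. A minimal sketch of that ordering (the range used here is for illustration only):

```python
zw_map = [False] * 0x110000

# Property-driven pass first (U+115F..U+1160 are Default_Ignorable_Code_Point).
for cp in range(0x115F, 0x1160 + 1):
    zw_map[cp] = True

# Hand-written override afterwards, so it wins.
zw_map[0x115F] = False

assert zw_map[0x1160] and not zw_map[0x115F]
```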
@@ -287,13 +282,13 @@ def try_extend(self, attempt: "Bucket") -> bool:
         self.widths = more
         return True
 
-    def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
+    def entries(self) -> list[tuple[Codepoint, EffectiveWidth]]:
         """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
         result = list(self.entry_set)
         result.sort()
         return result
 
-    def width(self) -> "EffectiveWidth | None":
+    def width(self) -> EffectiveWidth | None:
         """If all codepoints in this bucket have the same width, return that width; otherwise,
         return `None`."""
         if len(self.widths) == 0:
@@ -305,7 +300,7 @@ def width(self) -> "EffectiveWidth | None":
         return potential_width
 
 
-def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
+def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> list[Bucket]:
     """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
     codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
     same bucket. Returns a list of the buckets in increasing order of those bits."""
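The docstring's grouping rule can be read as a bit-slice key: codepoints whose bits in the half-open range [`low_bit`, `cap_bit`) agree land in the same bucket. A sketch of that key (hypothetical helper, not the body of `make_buckets`):

```python
def bucket_key(codepoint: int, low_bit: int, cap_bit: int) -> int:
    # keep bits [low_bit, cap_bit) of the codepoint
    mask = (1 << (cap_bit - low_bit)) - 1
    return (codepoint >> low_bit) & mask

# Codepoints differing only below `low_bit` share a bucket.
assert bucket_key(0x1F336, 2, 6) == bucket_key(0x1F337, 2, 6)
```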
@@ -373,7 +368,7 @@ def buckets(self):
         """Returns an iterator over this table's buckets."""
         return self.indexed
 
-    def to_bytes(self) -> "list[int]":
+    def to_bytes(self) -> list[int]:
         """Returns this table's entries as a list of bytes. The bytes are formatted according to
         the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
         to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
@@ -389,8 +384,8 @@ def to_bytes(self) -> "list[int]":
 
 
 def make_tables(
-    table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries
-) -> "list[Table]":
+    table_cfgs: list[tuple[BitPos, BitPos, OffsetType]], entries
+) -> list[Table]:
     """Creates a table for each configuration in `table_cfgs`, with the first config corresponding
     to the top-level lookup table, the second config corresponding to the second-level lookup
     table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
@@ -404,7 +399,7 @@ def make_tables(
     return tables
 
 
-def load_emoji_presentation_sequences() -> "list[int]":
+def load_emoji_presentation_sequences() -> list[int]:
     """Outputs a list of character ranages, corresponding to all the valid characters for starting
     an emoji presentation sequence."""
 
@@ -420,7 +415,7 @@ def load_emoji_presentation_sequences() -> "list[int]":
     return codepoints
 
 
-def load_text_presentation_sequences() -> "list[int]":
+def load_text_presentation_sequences() -> list[int]:
     """Outputs a list of character ranages, corresponding to all the valid characters
     whose widths change with a text presentation sequence."""
 
@@ -435,24 +430,12 @@ def load_text_presentation_sequences() -> "list[int]":
                 text_presentation_seq_codepoints.add(cp)
 
     default_emoji_codepoints = set()
-    with fetch_open("emoji/emoji-data.txt") as emoji_data:
-        single = re.compile(r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+")
-        multiple = re.compile(
-            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+"
-        )
 
-        for line in emoji_data.readlines():
-            raw_data = None  # (low, high)
-            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1))
-            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2))
-            else:
-                continue
-            low = int(raw_data[0], 16)
-            high = int(raw_data[1], 16)
-            for cp in range(low, high + 1):
-                default_emoji_codepoints.add(cp)
+    load_property(
+        "emoji/emoji-data.txt",
+        "Emoji_Presentation",
+        lambda cp: default_emoji_codepoints.add(cp),
+    )
 
     codepoints = []
     for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints):
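Here the `action` callback just accumulates codepoints into a set; `lambda cp: default_emoji_codepoints.add(cp)` and passing the bound method `default_emoji_codepoints.add` directly behave the same. A standalone sketch of the collect-then-intersect pattern (sample codepoints are illustrative):

```python
def for_each_codepoint(codepoints, action):
    # stands in for load_property's per-codepoint callback invocation
    for cp in codepoints:
        action(cp)

default_emoji = set()
for_each_codepoint([0x231A, 0x231B], default_emoji.add)  # e.g. WATCH, HOURGLASS

text_presentation_candidates = {0x231A, 0x2764}
# Only codepoints in both sets are emitted, mirroring the intersection above.
assert text_presentation_candidates & default_emoji == {0x231A}
```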
@@ -466,11 +449,11 @@ def load_text_presentation_sequences() -> "list[int]":
 
 
 def make_presentation_sequence_table(
-    seqs: "list[int]",
-    width_map: "list[EffectiveWidth]",
-    spurious_false: "set[EffectiveWidth]",
-    spurious_true: "set[EffectiveWidth]",
-) -> "tuple[list[tuple[int, int]], list[list[int]]]":
+    seqs: list[Codepoint],
+    width_map: list[EffectiveWidth],
+    spurious_false: set[EffectiveWidth],
+    spurious_true: set[EffectiveWidth],
+) -> tuple[list[tuple[int, int]], list[list[int]]]:
     """Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.
     The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
     """
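The layout described in this docstring implies a lookup of roughly the following shape (hedged sketch with hypothetical names; it assumes the first level is a list of `(codepoint >> 10, leaf_index)` pairs and each leaf is a 128-byte, i.e. 1024-bit, bitmap over the 10 LSB):

```python
def might_start_sequence(cp: int, index: list[tuple[int, int]], leaves: list[list[int]]) -> bool:
    prefix = cp >> 10                 # first level: everything but the 10 LSB
    for key, leaf_idx in index:
        if key == prefix:
            bit = cp & 0x3FF          # second level: 1024-bit bitmap
            leaf = leaves[leaf_idx]
            return (leaf[bit >> 3] >> (bit & 7)) & 1 == 1
    return False
```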
@@ -488,13 +471,13 @@ def make_presentation_sequence_table(
         ):
             del prefixes_dict[k]
 
-    msbs: "list[int]" = list(prefixes_dict.keys())
+    msbs: list[int] = list(prefixes_dict.keys())
 
     for cp, width in enumerate(width_map):
         if width in spurious_true and (cp >> 10) in msbs:
             prefixes_dict[cp >> 10].add(cp & 0x3FF)
 
-    leaves: "list[list[int]]" = []
+    leaves: list[list[int]] = []
     for cps in prefixes_dict.values():
         leaf = [0] * 128
         for cp in cps:
@@ -524,10 +507,10 @@ def make_presentation_sequence_table(
 
 
 def emit_module(
     out_name: str,
-    unicode_version: "tuple[int, int, int]",
-    tables: "list[Table]",
-    emoji_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
-    text_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
+    unicode_version: tuple[int, int, int],
+    tables: list[Table],
+    emoji_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
+    text_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
 ):
     """Outputs a Rust module to `out_name` using table data from `tables`.
     If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -574,18 +557,18 @@ def emit_module(
     pub fn lookup_width(c: char, is_cjk: bool) -> usize {
         let cp = c as usize;
 
-        let t1_offset = TABLES_0[cp >> 13 & 0xFF];
+        let t1_offset = TABLES_0.0[cp >> 13 & 0xFF];
 
         // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
         // so each sub-table is 128 bytes in size.
         // (Sub-tables are selected using the computed offset from the previous table.)
-        let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
+        let t2_offset = TABLES_1.0[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
 
         // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
         // This is accomplished by packing four stored entries into one byte.
         // So each sub-table is 2**(6-2) == 16 bytes in size.
         // Since this is the last table, each entry represents an encoded width.
-        let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
+        let packed_widths = TABLES_2.0[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
 
         // Extract the packed width
         let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
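As a cross-check on the comments above, the same three-level arithmetic written out in Python (a sketch; `tables_0/1/2` stand in for the generated byte arrays):

```python
def lookup_width_py(cp: int, tables_0: list[int], tables_1: list[int], tables_2: list[int]) -> int:
    t1_offset = tables_0[(cp >> 13) & 0xFF]
    # 128-byte sub-tables, one byte per entry
    t2_offset = tables_1[128 * t1_offset + ((cp >> 6) & 0x7F)]
    # 16-byte sub-tables, four 2-bit entries packed per byte
    packed_widths = tables_2[16 * t2_offset + ((cp >> 2) & 0xF)]
    return (packed_widths >> (2 * (cp & 0b11))) & 0b11
```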
@@ -669,6 +652,12 @@ def emit_module(
         // Use the 3 LSB of `cp` to index into `leaf_byte`.
         ((leaf_byte >> (cp & 7)) & 1) == 1
     }
+
+    #[repr(align(128))]
+    struct Align128<T>(T);
+
+    #[repr(align(16))]
+    struct Align16<T>(T);
 """
         )
 
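The `Align128`/`Align16` newtypes let the generated statics carry an explicit alignment, presumably so each sub-table sits inside one aligned block: per the comments in `lookup_width`, the byte-per-entry tables use 128-byte sub-tables while the final 2-bit-packed table uses 16-byte sub-tables. A sketch of that size arithmetic (illustrative helper, not code from the diff):

```python
def subtable_size_bytes(index_bits: int, entries_per_byte: int) -> int:
    # 2**index_bits entries, packed `entries_per_byte` to a byte
    return 2**index_bits // entries_per_byte

assert subtable_size_bytes(7, 1) == 128  # TABLES_1: 7-bit sub-tables, one byte per entry
assert subtable_size_bytes(6, 4) == 16   # TABLES_2: 6-bit sub-tables, 2-bit entries
```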
@@ -677,26 +666,27 @@ def emit_module(
             new_subtable_count = len(table.buckets())
             if i == len(tables) - 1:
                 table.indices_to_widths()  # for the last table, indices == widths
+                align = 16
+            else:
+                align = 128
             byte_array = table.to_bytes()
             module.write(
                 f"""
     /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
-    static TABLES_{i}: [u8; {len(byte_array)}] = ["""
+    static TABLES_{i}: Align{align}<[u8; {len(byte_array)}]> = Align{align}(["""
             )
             for j, byte in enumerate(byte_array):
                 # Add line breaks for every 15th entry (chosen to match what rustfmt does)
                 if j % 15 == 0:
                     module.write("\n       ")
                 module.write(f" 0x{byte:02X},")
-            module.write("\n    ];\n")
+            module.write("\n    ]);\n")
             subtable_count = new_subtable_count
 
     # emoji table
 
     module.write(
         f"""
-    #[repr(align(128))]
-    struct Align128<T>(T);
     /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
     /// to get whether it can start an emoji presentation sequence.
     static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([