@@ -64,7 +64,8 @@ class OffsetType(enum.IntEnum):
6464
6565def fetch_open (filename :str ):
6666"""Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
67- fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
67+ fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
68+ """
6869if not os .path .exists (os .path .basename (filename )):
6970os .system (f"curl -O http://www.unicode.org/Public/UNIDATA/{ filename } " )
7071try :
@@ -83,7 +84,8 @@ def load_unicode_version() -> "tuple[int, int, int]":
8384
8485class EffectiveWidth (enum .IntEnum ):
8586"""Represents the width of a Unicode character. All East Asian Width classes resolve into
86- either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
87+ either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
88+ """
8789
8890ZERO = 0
8991""" Zero columns wide. """
@@ -146,10 +148,17 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
146148
147149def load_zero_widths ()-> "list[bool]" :
148150"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
149- character. `c` is considered a zero-width character if `c` is in general categories
150- `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
151+ character. `c` is considered a zero-width character if
152+
153+ - it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
154+ - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
155+ - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
156+ """
157+
158+ zw_map = []
159+
160+ # Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
151161with fetch_open ("UnicodeData.txt" )as categories :
152- zw_map = []
153162current = 0
154163for line in categories .readlines ():
155164if len (raw_data := line .split (";" ))!= 15 :
@@ -159,7 +168,7 @@ def load_zero_widths() -> "list[bool]":
159168raw_data [1 ],
160169raw_data [2 ],
161170 ]
162- zero_width = cat_code in ["Cc" ,"Cf" , " Mn" ,"Me" ]
171+ zero_width = cat_code in ["Cc" ,"Mn" ,"Me" ]
163172
164173assert current <= codepoint
165174while current <= codepoint :
@@ -176,12 +185,68 @@ def load_zero_widths() -> "list[bool]":
176185# Catch any leftover codepoints. They must be unassigned (so nonzero width)
177186zw_map .append (False )
178187
179- return zw_map
188+ # `Default_Ignorable_Code_Point`s also have 0 width:
189+ # https://www.unicode.org/faq/unsup_char.html#3
190+ # https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
191+ with fetch_open ("DerivedCoreProperties.txt" )as properties :
192+ single = re .compile (r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+" )
193+ multiple = re .compile (
194+ r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
195+ )
196+
197+ for line in properties .readlines ():
198+ raw_data = None # (low, high)
199+ if match := single .match (line ):
200+ raw_data = (match .group (1 ),match .group (1 ))
201+ elif match := multiple .match (line ):
202+ raw_data = (match .group (1 ),match .group (2 ))
203+ else :
204+ continue
205+ low = int (raw_data [0 ],16 )
206+ high = int (raw_data [1 ],16 )
207+ for cp in range (low ,high + 1 ):
208+ zw_map [cp ]= True
209+
210+ # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
211+ # as zero-width. This matches the behavior of glibc `wcwidth`.
212+ #
213+ # Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
214+ # a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
215+ # into a single wide grapheme. So we treat vowel and trailing jamo as
216+ # 0-width, such that only the width of the leading jamo is counted
217+ # and the resulting grapheme has width 2.
218+ #
219+ # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
220+ with fetch_open ("HangulSyllableType.txt" )as categories :
221+ single = re .compile (r"^([0-9A-F]+)\s+;\s+(V|T)\s+" )
222+ multiple = re .compile (r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+" )
223+
224+ for line in categories .readlines ():
225+ raw_data = None # (low, high)
226+ if match := single .match (line ):
227+ raw_data = (match .group (1 ),match .group (1 ))
228+ elif match := multiple .match (line ):
229+ raw_data = (match .group (1 ),match .group (2 ))
230+ else :
231+ continue
232+ low = int (raw_data [0 ],16 )
233+ high = int (raw_data [1 ],16 )
234+ for cp in range (low ,high + 1 ):
235+ zw_map [cp ]= True
236+
237+ # Special case: U+115F HANGUL CHOSEONG FILLER.
238+ # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
239+ # zero width. However, the expected usage is to combine it with vowel or trailing jamo
240+ # (which are considered 0-width on their own) to form a composed Hangul syllable with
241+ # width 2. Therefore, we treat it as having width 2.
242+ zw_map [0x115F ]= False
243+ return zw_map
180244
181245
182246class Bucket :
183247"""A bucket contains a group of codepoints and an ordered width list. If one bucket's width
184- list overlaps with another's width list, those buckets can be merged via `try_extend`."""
248+ list overlaps with another's width list, those buckets can be merged via `try_extend`.
249+ """
185250
186251def __init__ (self ):
187252"""Creates an empty bucket."""
@@ -230,9 +295,9 @@ def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
230295 same bucket. Returns a list of the buckets in increasing order of those bits."""
231296num_bits = cap_bit - low_bit
232297assert num_bits > 0
233- buckets = [Bucket ()for _ in range (0 ,2 ** num_bits )]
298+ buckets = [Bucket ()for _ in range (0 ,2 ** num_bits )]
234299mask = (1 << num_bits )- 1
235- for ( codepoint ,width ) in entries :
300+ for codepoint ,width in entries :
236301buckets [(codepoint >> low_bit )& mask ].append (codepoint ,width )
237302return buckets
238303
@@ -269,7 +334,7 @@ def __init__(
269334buckets .extend (make_buckets (entries ,self .low_bit ,self .cap_bit ))
270335
271336for bucket in buckets :
272- for ( i ,existing ) in enumerate (self .indexed ):
337+ for i ,existing in enumerate (self .indexed ):
273338if existing .try_extend (bucket ):
274339self .entries .append (i )
275340break
@@ -283,7 +348,8 @@ def __init__(
283348
284349def indices_to_widths (self ):
285350"""Destructively converts the indices in this table to the `EffectiveWidth` values of
286- their buckets. Assumes that no bucket contains codepoints with different widths."""
351+ their buckets. Assumes that no bucket contains codepoints with different widths.
352+ """
287353self .entries = list (map (lambda i :int (self .indexed [i ].width ()),self .entries ))
288354del self .indexed
289355
@@ -315,7 +381,7 @@ def make_tables(
315381 to include in the top-level table."""
316382tables = []
317383entry_groups = [entries ]
318- for ( low_bit ,cap_bit ,offset_type ) in table_cfgs :
384+ for low_bit ,cap_bit ,offset_type in table_cfgs :
319385table = Table (entry_groups ,low_bit ,cap_bit ,offset_type )
320386entry_groups = map (lambda bucket :bucket .entries (),table .buckets ())
321387tables .append (table )
@@ -326,7 +392,8 @@ def emit_module(
326392out_name :str ,unicode_version :"tuple[int, int, int]" ,tables :"list[Table]"
327393):
328394"""Outputs a Rust module to `out_name` using table data from `tables`.
329- If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
395+ If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
396+ """
330397if os .path .exists (out_name ):
331398os .remove (out_name )
332399with open (out_name ,"w" ,newline = "\n " ,encoding = "utf-8" )as module :
@@ -432,7 +499,7 @@ def emit_module(
432499 )
433500
434501subtable_count = 1
435- for ( i ,table ) in enumerate (tables ):
502+ for i ,table in enumerate (tables ):
436503new_subtable_count = len (table .buckets ())
437504if i == len (tables )- 1 :
438505table .indices_to_widths ()# for the last table, indices == widths
@@ -442,7 +509,7 @@ def emit_module(
442509 /// Autogenerated.{ subtable_count } sub-table(s). Consult [`lookup_width`] for layout info.
443510 static TABLES_{ i } : [u8;{ len (byte_array )} ] = ["""
444511 )
445- for ( j ,byte ) in enumerate (byte_array ):
512+ for j ,byte in enumerate (byte_array ):
446513# Add line breaks for every 15th entry (chosen to match what rustfmt does)
447514if j % 15 == 0 :
448515module .write ("\n " )
@@ -458,16 +525,17 @@ def main(module_filename: str):
458525 `module_filename`.
459526
460527 We obey the following rules in decreasing order of importance:
461- - The soft hyphen (`U+00AD`) is single-width.
462- - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
463- - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
528+ - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
529+ - Hangul jamo medial vowels & final consonants are zero-width.
530+ - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
531+ - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
464532 - All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
465533 - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
466534 - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
467- of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
535+ of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
468536
469- These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
470- http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c """
537+ These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
538+ """
471539version = load_unicode_version ()
472540print (f"Generating module for Unicode{ version [0 ]} .{ version [1 ]} .{ version [2 ]} " )
473541
@@ -482,15 +550,11 @@ def main(module_filename: str):
482550# Override for soft hyphen
483551width_map [0x00AD ]= EffectiveWidth .NARROW
484552
485- # Override for Hangul Jamo medial vowels & final consonants
486- for i in range (0x1160 ,0x11FF + 1 ):
487- width_map [i ]= EffectiveWidth .ZERO
488-
489553tables = make_tables (TABLE_CFGS ,enumerate (width_map ))
490554
491555print ("------------------------" )
492556total_size = 0
493- for ( i ,table ) in enumerate (tables ):
557+ for i ,table in enumerate (tables ):
494558size_bytes = len (table .to_bytes ())
495559print (f"Table{ i } Size:{ size_bytes } bytes" )
496560total_size += size_bytes