@@ -150,14 +150,15 @@ def load_zero_widths() -> "list[bool]":
150150"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
151151 character. `c` is considered a zero-width character if
152152
153- - it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
153+ - it is in general category `Cc`,
154+ - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
154155 - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
155156 - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
156157 """
157158
158159zw_map = []
159160
160- # Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
161+ # Characters with general category `Cc` have 0 width
161162with fetch_open ("UnicodeData.txt" )as categories :
162163current = 0
163164for line in categories .readlines ():
@@ -168,7 +169,7 @@ def load_zero_widths() -> "list[bool]":
168169raw_data [1 ],
169170raw_data [2 ],
170171 ]
171- zero_width = cat_code in [ "Cc" , "Mn" , "Me" ]
172+ zero_width = cat_code == "Cc"
172173
173174assert current <= codepoint
174175while current <= codepoint :
@@ -188,10 +189,16 @@ def load_zero_widths() -> "list[bool]":
188189# `Default_Ignorable_Code_Point`s also have 0 width:
189190# https://www.unicode.org/faq/unsup_char.html#3
190191# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
192+ #
193+ # `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
194+ # as well as a few `Mc` characters that need to be included so that
195+ # canonically equivalent sequences have the same width.
191196with fetch_open ("DerivedCoreProperties.txt" )as properties :
192- single = re .compile (r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+" )
197+ single = re .compile (
198+ r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
199+ )
193200multiple = re .compile (
194- r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
201+ r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
195202 )
196203
197204for line in properties .readlines ():
@@ -240,6 +247,19 @@ def load_zero_widths() -> "list[bool]":
240247# (which are considered 0-width on their own) to form a composed Hangul syllable with
241248# width 2. Therefore, we treat it as having width 2.
242249zw_map [0x115F ]= False
250+
251+ # Unicode spec bug: these characters should have `Grapheme_Cluster_Break=Extend`,
252+ # since each canonically decomposes to two characters with that property,
253+ # but they do not, so they are marked zero-width by hand here.
254+ zw_map [0x0CC0 ]= True
255+ zw_map [0x0CC7 ]= True
256+ zw_map [0x0CC8 ]= True
257+ zw_map [0x0CCA ]= True
258+ zw_map [0x0CCB ]= True
259+ zw_map [0x1B3B ]= True
260+ zw_map [0x1B3D ]= True
261+ zw_map [0x1B43 ]= True
262+
243263return zw_map
244264
245265