|
11 | 11 | # except according to those terms. |
12 | 12 |
|
13 | 13 | # This script uses the following Unicode tables: |
| 14 | +# |
| 15 | +# - DerivedCoreProperties.txt |
14 | 16 | # - EastAsianWidth.txt |
| 17 | +# - HangulSyllableType.txt |
| 18 | +# - PropList.txt |
15 | 19 | # - ReadMe.txt |
16 | | -# - UnicodeData.txt |
17 | 20 | # |
18 | 21 | # Since this should not require frequent updates, we just store this |
19 | 22 | # out-of-line and check the generated module into git. |
@@ -150,41 +153,20 @@ def load_zero_widths() -> "list[bool]": |
150 | 153 | """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width |
151 | 154 | character. `c` is considered a zero-width character if |
152 | 155 |
|
153 | | - - it is in general category `Cc`, |
154 | | - - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`), |
| 156 | + - it is a control character, |
155 | 157 | - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`), |
| 158 | + - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`), |
| 159 | + - or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug, |
156 | 160 | - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`). |
157 | 161 | """ |
158 | 162 |
|
159 | | -zw_map= [] |
| 163 | +zw_map= [False]*NUM_CODEPOINTS |
160 | 164 |
|
161 | | -# Characters with general category `Cc` have 0 width |
162 | | -withfetch_open("UnicodeData.txt")ascategories: |
163 | | -current=0 |
164 | | -forlineincategories.readlines(): |
165 | | -iflen(raw_data:=line.split(";"))!=15: |
166 | | -continue |
167 | | - [codepoint,name,cat_code]= [ |
168 | | -int(raw_data[0],16), |
169 | | -raw_data[1], |
170 | | -raw_data[2], |
171 | | - ] |
172 | | -zero_width=cat_code=="Cc" |
173 | | - |
174 | | -assertcurrent<=codepoint |
175 | | -whilecurrent<=codepoint: |
176 | | -ifname.endswith(", Last>")orcurrent==codepoint: |
177 | | -# if name ends with Last, we backfill the width value to all codepoints since |
178 | | -# the previous codepoint (aka the start of the range) |
179 | | -zw_map.append(zero_width) |
180 | | -else: |
181 | | -# unassigned characters are implicitly given Neutral width, which is nonzero |
182 | | -zw_map.append(False) |
183 | | -current+=1 |
184 | | - |
185 | | -whilelen(zw_map)<NUM_CODEPOINTS: |
186 | | -# Catch any leftover codepoints. They must be unassigned (so nonzero width) |
187 | | -zw_map.append(False) |
| 165 | +# Control characters have width 0 |
| 166 | +forcinrange(0x00,0x20): |
| 167 | +zw_map[c]=True |
| 168 | +forcinrange(0x7F,0xA0): |
| 169 | +zw_map[c]=True |
188 | 170 |
|
189 | 171 | # `Default_Ignorable_Code_Point`s also have 0 width: |
190 | 172 | # https://www.unicode.org/faq/unsup_char.html#3 |
@@ -214,6 +196,12 @@ def load_zero_widths() -> "list[bool]": |
214 | 196 | forcpinrange(low,high+1): |
215 | 197 | zw_map[cp]=True |
216 | 198 |
|
| 199 | +# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`, |
| 200 | +# as they canonically decompose to two characters with this property, |
| 201 | +# but they aren't. |
| 202 | +forcin [0x0CC0,0x0CC7,0x0CC8,0x0CCA,0x0CCB,0x1B3B,0x1B3D,0x1B43]: |
| 203 | +zw_map[c]=True |
| 204 | + |
217 | 205 | # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo` |
218 | 206 | # as zero-width. This matches the behavior of glibc `wcwidth`. |
219 | 207 | # |
@@ -248,18 +236,6 @@ def load_zero_widths() -> "list[bool]": |
248 | 236 | # width 2. Therefore, we treat it as having width 2. |
249 | 237 | zw_map[0x115F]=False |
250 | 238 |
|
251 | | -# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`, |
252 | | -# as they canonically decompose to two characters with this property, |
253 | | -# but they aren't. |
254 | | -zw_map[0x0CC0]=True |
255 | | -zw_map[0x0CC7]=True |
256 | | -zw_map[0x0CC8]=True |
257 | | -zw_map[0x0CCA]=True |
258 | | -zw_map[0x0CCB]=True |
259 | | -zw_map[0x1B3B]=True |
260 | | -zw_map[0x1B3D]=True |
261 | | -zw_map[0x1B43]=True |
262 | | - |
263 | 239 | returnzw_map |
264 | 240 |
|
265 | 241 |
|
@@ -297,7 +273,7 @@ def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]": |
297 | 273 | result.sort() |
298 | 274 | returnresult |
299 | 275 |
|
300 | | -defwidth(self)->"EffectiveWidth": |
| 276 | +defwidth(self)->"EffectiveWidth | None": |
301 | 277 | """If all codepoints in this bucket have the same width, return that width; otherwise, |
302 | 278 | return `None`.""" |
303 | 279 | iflen(self.widths)==0: |
@@ -542,13 +518,16 @@ def main(module_filename: str): |
542 | 518 | lookup table for character width, and write a Rust module utilizing that table to |
543 | 519 | `module_filename`. |
544 | 520 |
|
545 | | - We obey the following rules in decreasing order of importance: |
| 521 | + We obey the following rules, in decreasing order of importance: |
| 522 | +
|
546 | 523 | - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c) |
547 | 524 | - Hangul jamo medial vowels & final consonants are zero-width. |
548 | | - - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER. |
549 | | - - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width. |
550 | | - - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width. |
551 | | - - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. |
| 525 | + - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER. |
| 526 | + - Control characters are zero-width. |
| 527 | + - `Grapheme_Extend` characters, as well as eight characters that NFD decompose to `Grapheme_Extend` characters, |
| 528 | + are zero-width. |
| 529 | + - Codepoints with an East Asian Width of `Ambiguous` are ambiguous-width. |
| 530 | + - Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. |
552 | 531 | - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width |
553 | 532 | of `Neutral`, `Narrow`, or `Halfwidth`) are single-width. |
554 | 533 |
|
|