Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 9c4477c

Browse files
authored
Merge pull request #37 from Jules-Bertholet/canonical-equivalence
Ensure that canonically equivalent strings have the same width
2 parents 7c489c3 + fdf5eb7, commit 9c4477c

File tree

4 files changed

+382
-323
lines changed

4 files changed

+382
-323
lines changed

‎Cargo.toml‎

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,9 @@ std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
2222
core = {version ="1.0",package ="rustc-std-workspace-core",optional =true }
2323
compiler_builtins = {version ="0.1",optional =true }
2424

25+
[dev-dependencies]
26+
unicode-normalization ="0.1.23"
27+
2528
[features]
2629
default = []
2730
bench = []

‎scripts/unicode.py‎

Lines changed: 25 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -150,14 +150,15 @@ def load_zero_widths() -> "list[bool]":
150150
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
151151
character. `c` is considered a zero-width character if
152152
153-
- it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
153+
- it is in general category `Cc`,
154+
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
154155
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
155156
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
156157
"""
157158

158159
zw_map= []
159160

160-
# Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
161+
# Characters with general category `Cc` have 0 width
161162
withfetch_open("UnicodeData.txt")ascategories:
162163
current=0
163164
forlineincategories.readlines():
@@ -168,7 +169,7 @@ def load_zero_widths() -> "list[bool]":
168169
raw_data[1],
169170
raw_data[2],
170171
]
171-
zero_width=cat_codein ["Cc","Mn","Me"]
172+
zero_width=cat_code=="Cc"
172173

173174
assertcurrent<=codepoint
174175
whilecurrent<=codepoint:
@@ -188,10 +189,16 @@ def load_zero_widths() -> "list[bool]":
188189
# `Default_Ignorable_Code_Point`s also have 0 width:
189190
# https://www.unicode.org/faq/unsup_char.html#3
190191
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
192+
#
193+
# `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
194+
# as well as a few `Mc` characters that need to be included so that
195+
# canonically equivalent sequences have the same width.
191196
withfetch_open("DerivedCoreProperties.txt")asproperties:
192-
single=re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
197+
single=re.compile(
198+
r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
199+
)
193200
multiple=re.compile(
194-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
201+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
195202
)
196203

197204
forlineinproperties.readlines():
@@ -240,6 +247,19 @@ def load_zero_widths() -> "list[bool]":
240247
# (which are considered 0-width on their own) to form a composed Hangul syllable with
241248
# width 2. Therefore, we treat it as having width 2.
242249
zw_map[0x115F]=False
250+
251+
# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
252+
# as they canonically decompose to two characters with this property,
253+
# but they aren't.
254+
zw_map[0x0CC0]=True
255+
zw_map[0x0CC7]=True
256+
zw_map[0x0CC8]=True
257+
zw_map[0x0CCA]=True
258+
zw_map[0x0CCB]=True
259+
zw_map[0x1B3B]=True
260+
zw_map[0x1B3D]=True
261+
zw_map[0x1B43]=True
262+
243263
returnzw_map
244264

245265

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp