Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit aae585f

Browse files
Mark interlinear annotation chars and Egyptian hieroglyph format controls as non-zero width
1 parent 436b0db, commit aae585f

File tree

3 files changed

+30
-33
lines changed

3 files changed

+30
-33
lines changed

‎scripts/unicode.py‎

Lines changed: 5 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -150,15 +150,14 @@ def load_zero_widths() -> "list[bool]":
150150
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
151151
character. `c` is considered a zero-width character if
152152
153-
- it is in general categories `Cc`, `Cf`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
154-
and is not a `Prepended_Concatenation_Mark` (determined from `PropList.txt`),
153+
- it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
155154
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
156155
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
157156
"""
158157

159158
zw_map= []
160159

161-
# Characters with general category `Cc`, `Cf`, `Mn`, or `Me` have 0 width...
160+
# Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
162161
with fetch_open("UnicodeData.txt") as categories:
163162
current=0
164163
for line in categories.readlines():
@@ -169,7 +168,7 @@ def load_zero_widths() -> "list[bool]":
169168
raw_data[1],
170169
raw_data[2],
171170
]
172-
zero_width=cat_code in ["Cc","Cf","Mn","Me"]
171+
zero_width=cat_code in ["Cc","Mn","Me"]
173172

174173
assert current<=codepoint
175174
while current<=codepoint:
@@ -186,32 +185,9 @@ def load_zero_widths() -> "list[bool]":
186185
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
187186
zw_map.append(False)
188187

189-
# ...unless they are a `Prepended_Concatenation_Mark`.
190-
# https://www.unicode.org/reports/tr44/:
191-
# "A small class of visible format controls,
192-
# which precede and then span a sequence of other characters, usually digits.
193-
# These have also been known as "subtending marks",
194-
# because most of them take a form which visually extends underneath the sequence of following digits."
195-
with fetch_open("PropList.txt") as properties:
196-
single=re.compile(r"^([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+")
197-
multiple=re.compile(
198-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+"
199-
)
200-
for line in properties.readlines():
201-
raw_data=None# (low, high)
202-
if match:=single.match(line):
203-
raw_data= (match.group(1),match.group(1))
204-
elif match:=multiple.match(line):
205-
raw_data= (match.group(1),match.group(2))
206-
else:
207-
continue
208-
low=int(raw_data[0],16)
209-
high=int(raw_data[1],16)
210-
for cp in range(low,high+1):
211-
zw_map[cp]=False
212-
213188
# `Default_Ignorable_Code_Point`s also have 0 width:
214189
# https://www.unicode.org/faq/unsup_char.html#3
190+
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
215191
with fetch_open("DerivedCoreProperties.txt") as properties:
216192
single=re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
217193
multiple=re.compile(
@@ -552,8 +528,7 @@ def main(module_filename: str):
552528
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
553529
- Hangul jamo medial vowels & final consonants are zero-width.
554530
- All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
555-
- All codepoints in general categories `Cc`, `Cf`, `Mn`, or `Me` are zero-width,
556-
except for `Prepended_Concatenation_Mark`s.
531+
- All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
557532
- All codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
558533
- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
559534
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width

‎src/tables.rs‎

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -182,7 +182,7 @@ pub mod charwidth {
182182
0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,
183183
0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,
184184
0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,
185-
0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x47,0xBD,0x06,0x06,0x06,0x06,
185+
0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0xBD,0x06,0x06,0x06,0x06,
186186
0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,
187187
0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,
188188
0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,0x06,
@@ -430,7 +430,7 @@ pub mod charwidth {
430430
0x6A,0xAA,0x55,0x55,0x55,0x55,0x55,0xAA,0xAA,0xAA,0xAA,0xAA,0xAA,0xAA,0xAA,
431431
0x56,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,
432432
0x55,0x54,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,
433-
0x55,0x55,0xAA,0x6A,0x55,0x55,0x00,0x00,0x00,0x5D,0x55,0x55,0x55,0x55,0x55,
433+
0x55,0x55,0xAA,0x6A,0x55,0x55,0x00,0x00,0x54,0x5D,0x55,0x55,0x55,0x55,0x55,
434434
0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x51,0x55,0x55,0x55,0x55,
435435
0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x05,0x40,0x55,0x01,0x41,0x55,
436436
0x00,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x40,0x15,0x55,0x55,

‎src/tests.rs‎

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -223,7 +223,7 @@ fn test_jamo() {
223223
}
224224

225225
#[test]
226-
fn test_prepended_concatenation_mark(){
226+
fn test_prepended_concatenation_marks(){
227227
use super::UnicodeWidthChar;
228228
#[cfg(feature ="no_std")]
229229
use core::option::Option::{None,Some};
@@ -233,3 +233,25 @@ fn test_prepended_concatenation_mark() {
233233
assert_eq!(UnicodeWidthChar::width('\u{08E2}'),Some(1));
234234
assert_eq!(UnicodeWidthChar::width('\u{110BD}'),Some(1));
235235
}
236+
237+
#[test]
238+
fn test_interlinear_annotation_chars(){
239+
use super::UnicodeWidthChar;
240+
#[cfg(feature ="no_std")]
241+
use core::option::Option::{None,Some};
242+
243+
assert_eq!(UnicodeWidthChar::width('\u{FFF9}'),Some(1));
244+
assert_eq!(UnicodeWidthChar::width('\u{FFFA}'),Some(1));
245+
assert_eq!(UnicodeWidthChar::width('\u{FFFB}'),Some(1));
246+
}
247+
248+
#[test]
249+
fn test_hieroglyph_format_controls(){
250+
use super::UnicodeWidthChar;
251+
#[cfg(feature ="no_std")]
252+
use core::option::Option::{None,Some};
253+
254+
assert_eq!(UnicodeWidthChar::width('\u{13430}'),Some(1));
255+
assert_eq!(UnicodeWidthChar::width('\u{13436}'),Some(1));
256+
assert_eq!(UnicodeWidthChar::width('\u{1343C}'),Some(1));
257+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp