Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitfda272b

Browse files
authored
Merge pull request#34 from Jules-Bertholet/default-ignorable-code-point
Fixes to characters considered zero-width
2 parents8942487 +aae585f commitfda272b

File tree

5 files changed

+368
-230
lines changed

5 files changed

+368
-230
lines changed

‎README.md‎

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ fn main() {
2626

2727
**NOTE:** The computed width values may not match the actual rendered column
2828
width. For example, the woman scientist emoji comprises a woman emoji, a
29-
zero-width joiner and a microscope emoji.
29+
zero-width joiner and a microscope emoji. Such [emoji ZWJ sequences](https://www.unicode.org/reports/tr51/#Emoji_ZWJ_Sequences)
30+
are considered to have the sum of the widths of their constituent parts:
3031

3132
```rust
3233
externcrate unicode_width;
@@ -39,8 +40,10 @@ fn main() {
3940
}
4041
```
4142

42-
See[Unicode Standard Annex#11][UAX11] for precise details on what is and isn't
43-
covered by this crate.
43+
Additionally, [defective combining character sequences](https://unicode.org/glossary/#defective_combining_character_sequence)
44+
and nonstandard [Korean jamo](https://unicode.org/glossary/#jamo) sequences may
45+
be rendered with a different width than what this crate says. (This is not an
46+
exhaustive list.)
4447

4548
## features
4649

‎scripts/unicode.py‎

Lines changed: 91 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ class OffsetType(enum.IntEnum):
6464

6565
deffetch_open(filename:str):
6666
"""Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
67-
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
67+
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
68+
"""
6869
ifnotos.path.exists(os.path.basename(filename)):
6970
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
7071
try:
@@ -83,7 +84,8 @@ def load_unicode_version() -> "tuple[int, int, int]":
8384

8485
classEffectiveWidth(enum.IntEnum):
8586
"""Represents the width of a Unicode character. All East Asian Width classes resolve into
86-
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
87+
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
88+
"""
8789

8890
ZERO=0
8991
""" Zero columns wide. """
@@ -146,10 +148,17 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
146148

147149
defload_zero_widths()->"list[bool]":
148150
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
149-
character. `c` is considered a zero-width character if `c` is in general categories
150-
`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
151+
character. `c` is considered a zero-width character if
152+
153+
- it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
154+
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
155+
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
156+
"""
157+
158+
zw_map= []
159+
160+
# Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
151161
withfetch_open("UnicodeData.txt")ascategories:
152-
zw_map= []
153162
current=0
154163
forlineincategories.readlines():
155164
iflen(raw_data:=line.split(";"))!=15:
@@ -159,7 +168,7 @@ def load_zero_widths() -> "list[bool]":
159168
raw_data[1],
160169
raw_data[2],
161170
]
162-
zero_width=cat_codein ["Cc","Cf","Mn","Me"]
171+
zero_width=cat_codein ["Cc","Mn","Me"]
163172

164173
assertcurrent<=codepoint
165174
whilecurrent<=codepoint:
@@ -176,12 +185,68 @@ def load_zero_widths() -> "list[bool]":
176185
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
177186
zw_map.append(False)
178187

179-
returnzw_map
188+
# `Default_Ignorable_Code_Point`s also have 0 width:
189+
# https://www.unicode.org/faq/unsup_char.html#3
190+
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
191+
withfetch_open("DerivedCoreProperties.txt")asproperties:
192+
single=re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
193+
multiple=re.compile(
194+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
195+
)
196+
197+
forlineinproperties.readlines():
198+
raw_data=None# (low, high)
199+
ifmatch:=single.match(line):
200+
raw_data= (match.group(1),match.group(1))
201+
elifmatch:=multiple.match(line):
202+
raw_data= (match.group(1),match.group(2))
203+
else:
204+
continue
205+
low=int(raw_data[0],16)
206+
high=int(raw_data[1],16)
207+
forcpinrange(low,high+1):
208+
zw_map[cp]=True
209+
210+
# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
211+
# as zero-width. This matches the behavior of glibc `wcwidth`.
212+
#
213+
# Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
214+
# a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
215+
# into a single wide grapheme. So we treat vowel and trailing jamo as
216+
# 0-width, such that only the width of the leading jamo is counted
217+
# and the resulting grapheme has width 2.
218+
#
219+
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
220+
withfetch_open("HangulSyllableType.txt")ascategories:
221+
single=re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
222+
multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
223+
224+
forlineincategories.readlines():
225+
raw_data=None# (low, high)
226+
ifmatch:=single.match(line):
227+
raw_data= (match.group(1),match.group(1))
228+
elifmatch:=multiple.match(line):
229+
raw_data= (match.group(1),match.group(2))
230+
else:
231+
continue
232+
low=int(raw_data[0],16)
233+
high=int(raw_data[1],16)
234+
forcpinrange(low,high+1):
235+
zw_map[cp]=True
236+
237+
# Special case: U+115F HANGUL CHOSEONG FILLER.
238+
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
239+
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
240+
# (which are considered 0-width on their own) to form a composed Hangul syllable with
241+
# width 2. Therefore, we treat it as having width 2.
242+
zw_map[0x115F]=False
243+
returnzw_map
180244

181245

182246
classBucket:
183247
"""A bucket contains a group of codepoints and an ordered width list. If one bucket's width
184-
list overlaps with another's width list, those buckets can be merged via `try_extend`."""
248+
list overlaps with another's width list, those buckets can be merged via `try_extend`.
249+
"""
185250

186251
def__init__(self):
187252
"""Creates an empty bucket."""
@@ -230,9 +295,9 @@ def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
230295
same bucket. Returns a list of the buckets in increasing order of those bits."""
231296
num_bits=cap_bit-low_bit
232297
assertnum_bits>0
233-
buckets= [Bucket()for_inrange(0,2**num_bits)]
298+
buckets= [Bucket()for_inrange(0,2**num_bits)]
234299
mask= (1<<num_bits)-1
235-
for(codepoint,width)inentries:
300+
forcodepoint,widthinentries:
236301
buckets[(codepoint>>low_bit)&mask].append(codepoint,width)
237302
returnbuckets
238303

@@ -269,7 +334,7 @@ def __init__(
269334
buckets.extend(make_buckets(entries,self.low_bit,self.cap_bit))
270335

271336
forbucketinbuckets:
272-
for(i,existing)inenumerate(self.indexed):
337+
fori,existinginenumerate(self.indexed):
273338
ifexisting.try_extend(bucket):
274339
self.entries.append(i)
275340
break
@@ -283,7 +348,8 @@ def __init__(
283348

284349
defindices_to_widths(self):
285350
"""Destructively converts the indices in this table to the `EffectiveWidth` values of
286-
their buckets. Assumes that no bucket contains codepoints with different widths."""
351+
their buckets. Assumes that no bucket contains codepoints with different widths.
352+
"""
287353
self.entries=list(map(lambdai:int(self.indexed[i].width()),self.entries))
288354
delself.indexed
289355

@@ -315,7 +381,7 @@ def make_tables(
315381
to include in the top-level table."""
316382
tables= []
317383
entry_groups= [entries]
318-
for(low_bit,cap_bit,offset_type)intable_cfgs:
384+
forlow_bit,cap_bit,offset_typeintable_cfgs:
319385
table=Table(entry_groups,low_bit,cap_bit,offset_type)
320386
entry_groups=map(lambdabucket:bucket.entries(),table.buckets())
321387
tables.append(table)
@@ -326,7 +392,8 @@ def emit_module(
326392
out_name:str,unicode_version:"tuple[int, int, int]",tables:"list[Table]"
327393
):
328394
"""Outputs a Rust module to `out_name` using table data from `tables`.
329-
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
395+
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
396+
"""
330397
ifos.path.exists(out_name):
331398
os.remove(out_name)
332399
withopen(out_name,"w",newline="\n",encoding="utf-8")asmodule:
@@ -432,7 +499,7 @@ def emit_module(
432499
)
433500

434501
subtable_count=1
435-
for(i,table)inenumerate(tables):
502+
fori,tableinenumerate(tables):
436503
new_subtable_count=len(table.buckets())
437504
ifi==len(tables)-1:
438505
table.indices_to_widths()# for the last table, indices == widths
@@ -442,7 +509,7 @@ def emit_module(
442509
/// Autogenerated.{subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
443510
static TABLES_{i}: [u8;{len(byte_array)}] = ["""
444511
)
445-
for(j,byte)inenumerate(byte_array):
512+
forj,byteinenumerate(byte_array):
446513
# Add line breaks for every 15th entry (chosen to match what rustfmt does)
447514
ifj%15==0:
448515
module.write("\n ")
@@ -458,16 +525,17 @@ def main(module_filename: str):
458525
`module_filename`.
459526
460527
We obey the following rules in decreasing order of importance:
461-
- The soft hyphen (`U+00AD`) is single-width.
462-
- Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
463-
- All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
528+
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
529+
- Hangul jamo medial vowels & final consonants are zero-width.
530+
- All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
531+
- All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
464532
- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
465533
- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
466534
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
467-
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
535+
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
468536
469-
These rules are based off ofMarkus Kuhn's free`wcwidth()`implementation:
470-
http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
537+
These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
538+
"""
471539
version=load_unicode_version()
472540
print(f"Generating module for Unicode{version[0]}.{version[1]}.{version[2]}")
473541

@@ -482,15 +550,11 @@ def main(module_filename: str):
482550
# Override for soft hyphen
483551
width_map[0x00AD]=EffectiveWidth.NARROW
484552

485-
# Override for Hangul Jamo medial vowels & final consonants
486-
foriinrange(0x1160,0x11FF+1):
487-
width_map[i]=EffectiveWidth.ZERO
488-
489553
tables=make_tables(TABLE_CFGS,enumerate(width_map))
490554

491555
print("------------------------")
492556
total_size=0
493-
for(i,table)inenumerate(tables):
557+
fori,tableinenumerate(tables):
494558
size_bytes=len(table.to_bytes())
495559
print(f"Table{i} Size:{size_bytes} bytes")
496560
total_size+=size_bytes

‎src/lib.rs‎

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,10 @@
4343
//! ```
4444
4545
#![deny(missing_docs, unsafe_code)]
46-
#![doc(html_logo_url ="https://unicode-rs.github.io/unicode-rs_sm.png",
47-
html_favicon_url ="https://unicode-rs.github.io/unicode-rs_sm.png")]
48-
46+
#![doc(
47+
html_logo_url ="https://unicode-rs.github.io/unicode-rs_sm.png",
48+
html_favicon_url ="https://unicode-rs.github.io/unicode-rs_sm.png"
49+
)]
4950
#![cfg_attr(feature ="bench", feature(test))]
5051
#![no_std]
5152

@@ -87,10 +88,14 @@ pub trait UnicodeWidthChar {
8788

8889
implUnicodeWidthCharforchar{
8990
#[inline]
90-
fnwidth(self) ->Option<usize>{ cw::width(self,false)}
91+
fnwidth(self) ->Option<usize>{
92+
cw::width(self,false)
93+
}
9194

9295
#[inline]
93-
fnwidth_cjk(self) ->Option<usize>{ cw::width(self,true)}
96+
fnwidth_cjk(self) ->Option<usize>{
97+
cw::width(self,true)
98+
}
9499
}
95100

96101
/// Methods for determining displayed width of Unicode strings.
@@ -103,7 +108,7 @@ pub trait UnicodeWidthStr {
103108
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
104109
/// as 1 column wide. This is consistent with the recommendations for
105110
/// non-CJK contexts, or when the context cannot be reliably determined.
106-
fnwidth<'a>(&'aself) ->usize;
111+
fnwidth(&self) ->usize;
107112

108113
/// Returns the string's displayed width in columns.
109114
///
@@ -113,7 +118,7 @@ pub trait UnicodeWidthStr {
113118
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
114119
/// as 2 columns wide. This is consistent with the recommendations for
115120
/// CJK contexts.
116-
fnwidth_cjk<'a>(&'aself) ->usize;
121+
fnwidth_cjk(&self) ->usize;
117122
}
118123

119124
implUnicodeWidthStrforstr{

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp