Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 3063422

Browse files
authored
Merge pull request #45 from Jules-Bertholet/control
Assign width 1 to control characters
2 parents 86970a1 + 4efb180 commit 3063422

File tree

5 files changed

+547
-565
lines changed

5 files changed

+547
-565
lines changed

‎README.md‎

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ use unicode_width::UnicodeWidthStr;
1616

1717
fnmain() {
1818
letteststr="Hello, world!";
19-
letwidth=UnicodeWidthStr::width(teststr);
19+
letwidth=teststr.width();
2020
println!("{}",teststr);
2121
println!("The above string is {} columns wide.",width);
2222
letwidth=teststr.width_cjk();
@@ -34,9 +34,9 @@ extern crate unicode_width;
3434
useunicode_width::UnicodeWidthStr;
3535

3636
fnmain() {
37-
assert_eq!(UnicodeWidthStr::width("👩"),2);// Woman
38-
assert_eq!(UnicodeWidthStr::width("🔬"),2);// Microscope
39-
assert_eq!(UnicodeWidthStr::width("👩‍🔬"),4);// Woman scientist
37+
assert_eq!("👩".width(),2);// Woman
38+
assert_eq!("🔬".width(),2);// Microscope
39+
assert_eq!("👩‍🔬".width(),4);// Woman scientist
4040
}
4141
```
4242

‎scripts/unicode.py‎

Lines changed: 2 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -165,21 +165,14 @@ def load_zero_widths() -> "list[bool]":
165165
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
166166
character. `c` is considered a zero-width character if
167167
168-
- it is a control character,
169-
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
168+
- it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
170169
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
171170
- or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
172171
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
173172
"""
174173

175174
zw_map= [False]*NUM_CODEPOINTS
176175

177-
# Control characters have width 0
178-
forcinrange(0x00,0x20):
179-
zw_map[c]=True
180-
forcinrange(0x7F,0xA0):
181-
zw_map[c]=True
182-
183176
# `Default_Ignorable_Code_Point`s also have 0 width:
184177
# https://www.unicode.org/faq/unsup_char.html#3
185178
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
@@ -563,7 +556,7 @@ def emit_module(
563556
/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
564557
/// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
565558
#[inline]
566-
fn lookup_width(c: char, is_cjk: bool) -> usize {
559+
pubfn lookup_width(c: char, is_cjk: bool) -> usize {
567560
let cp = c as usize;
568561
569562
let t1_offset = TABLES_0[cp >> 13 & 0xFF];
@@ -664,36 +657,6 @@ def emit_module(
664657
"""
665658
)
666659

667-
module.write(
668-
"""
669-
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
670-
/// `None` if `c` is a control character other than `'\\x00'`.
671-
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
672-
/// they're treated as single width.
673-
#[inline]
674-
pub fn width(c: char, is_cjk: bool) -> Option<usize> {
675-
if c < '\\u{7F}' {
676-
if c >= '\\u{20}' {
677-
// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
678-
Some(1)
679-
} else if c == '\\0' {
680-
// U+0000 *is* a control code, but it's special-cased
681-
Some(0)
682-
} else {
683-
// U+0001 to U+0020 (exclusive) are control codes
684-
None
685-
}
686-
} else if c >= '\\u{A0}' {
687-
// No characters >= U+00A0 are control codes, so we can consult the lookup tables
688-
Some(lookup_width(c, is_cjk))
689-
} else {
690-
// U+007F to U+00A0 (exclusive) are control codes
691-
None
692-
}
693-
}
694-
"""
695-
)
696-
697660
subtable_count=1
698661
fori,tableinenumerate(tables):
699662
new_subtable_count=len(table.buckets())

‎src/lib.rs‎

Lines changed: 68 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -34,12 +34,13 @@
3434
//!
3535
//! 1. [Emoji presentation sequences] have width 2.
3636
//! (The width of a string may therefore differ from the sum of the widths of its characters.)
37-
//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
38-
//! iff their base character fulfills all the following requirements:
37+
//! 2. Outside of an East Asian context, [text presentation sequences] fulfilling all the following requirements
38+
//! have width 1:
3939
//! - Has the [`Emoji_Presentation`] property, and
4040
//! - Not in the [Enclosed Ideographic Supplement] block.
41-
//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
42-
//! 4. The following have width 0:
41+
//! 3. The sequence `"\r\n"` has width 1.
42+
//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
43+
//! 5. The following have width 0:
4344
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4445
//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
4546
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -55,9 +56,6 @@
5556
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
5657
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
5758
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
58-
//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
59-
//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
60-
//! have no defined width, and are ignored when determining the width of a string.
6159
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
6260
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
6361
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
@@ -99,7 +97,7 @@ mod tables;
9997
/// Methods for determining displayed width of Unicode characters.
10098
pubtraitUnicodeWidthChar{
10199
/// Returns the character's displayed width in columns, or `None` if the
102-
/// character is a control character other than `'\x00'`.
100+
/// character is a control character.
103101
///
104102
/// This function treats characters in the Ambiguous category according
105103
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -108,7 +106,7 @@ pub trait UnicodeWidthChar {
108106
fnwidth(self) ->Option<usize>;
109107

110108
/// Returns the character's displayed width in columns, or `None` if the
111-
/// character is a control character other than `'\x00'`.
109+
/// character is a control character.
112110
///
113111
/// This function treats characters in the Ambiguous category according
114112
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -120,23 +118,42 @@ pub trait UnicodeWidthChar {
120118
implUnicodeWidthCharforchar{
121119
#[inline]
122120
fnwidth(self) ->Option<usize>{
123-
cw::width(self,false)
121+
single_char_width(self,false)
124122
}
125123

126124
#[inline]
127125
fnwidth_cjk(self) ->Option<usize>{
128-
cw::width(self,true)
126+
single_char_width(self,true)
127+
}
128+
}
129+
130+
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
131+
/// `None` if `c` is a control character.
132+
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
133+
/// they're treated as single width.
134+
#[inline]
135+
fnsingle_char_width(c:char,is_cjk:bool) ->Option<usize>{
136+
if c <'\u{7F}'{
137+
if c >='\u{20}'{
138+
// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
139+
Some(1)
140+
}else{
141+
// U+0000 to U+0020 (exclusive) are control codes
142+
None
143+
}
144+
}elseif c >='\u{A0}'{
145+
// No characters >= U+00A0 are control codes, so we can consult the lookup tables
146+
Some(cw::lookup_width(c, is_cjk))
147+
}else{
148+
// U+007F to U+00A0 (exclusive) are control codes
149+
None
129150
}
130151
}
131152

132153
/// Methods for determining displayed width of Unicode strings.
133154
pubtraitUnicodeWidthStr{
134155
/// Returns the string's displayed width in columns.
135156
///
136-
/// Control characters are treated as having zero width,
137-
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
138-
/// are assigned width 2.
139-
///
140157
/// This function treats characters in the Ambiguous category according
141158
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
142159
/// as 1 column wide. This is consistent with the recommendations for
@@ -145,10 +162,6 @@ pub trait UnicodeWidthStr {
145162

146163
/// Returns the string's displayed width in columns.
147164
///
148-
/// Control characters are treated as having zero width,
149-
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
150-
/// are assigned width 2.
151-
///
152165
/// This function treats characters in the Ambiguous category according
153166
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
154167
/// as 2 column wide. This is consistent with the recommendations for
@@ -168,30 +181,48 @@ impl UnicodeWidthStr for str {
168181
}
169182
}
170183

171-
#[derive(Clone,Copy,Debug,PartialEq,Eq)]
172-
enumVariationSelector{
184+
#[derive(Clone,Copy,Debug,Default,PartialEq,Eq)]
185+
enumNextCharInfo{
186+
#[default]
187+
Default,
188+
LineFeed =0x0A,
173189
Vs15 =0x0E,
174190
Vs16 =0x0F,
175191
}
176192

177193
fnstr_width(s:&str,is_cjk:bool) ->usize{
178194
s.chars()
179-
.rfold((0,None), |(sum, vsel), c|match c{
180-
'\u{FE0E}' =>(sum,Some(VariationSelector::Vs15)),
181-
'\u{FE0F}' =>(sum,Some(VariationSelector::Vs16)),
182-
_ =>{
183-
let add =match vsel{
184-
Some(VariationSelector::Vs15)
185-
if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
186-
{
187-
1
188-
}
189-
190-
Some(VariationSelector::Vs16)if cw::starts_emoji_presentation_seq(c) =>2,
191-
_ => cw::width(c, is_cjk).unwrap_or(0),
192-
};
193-
(sum + add,None)
194-
}
195+
.rfold((0,NextCharInfo::Default), |(sum, next_info), c|{
196+
let(add, info) =width_in_str(c, is_cjk, next_info);
197+
(sum + add, info)
195198
})
196199
.0
197200
}
201+
202+
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`.
203+
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
204+
/// they're treated as single width.
205+
#[inline]
206+
fnwidth_in_str(c:char,is_cjk:bool,next_info:NextCharInfo) ->(usize,NextCharInfo){
207+
match next_info{
208+
NextCharInfo::Vs15if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>{
209+
(1,NextCharInfo::Default)
210+
}
211+
NextCharInfo::Vs16if cw::starts_emoji_presentation_seq(c) =>(2,NextCharInfo::Default),
212+
_ =>{
213+
if c <='\u{A0}'{
214+
match c{
215+
'\n' =>(1,NextCharInfo::LineFeed),
216+
'\r'if next_info ==NextCharInfo::LineFeed =>(0,NextCharInfo::Default),
217+
_ =>(1,NextCharInfo::Default),
218+
}
219+
}else{
220+
match c{
221+
'\u{FE0E}' =>(0,NextCharInfo::Vs15),
222+
'\u{FE0F}' =>(0,NextCharInfo::Vs16),
223+
_ =>(cw::lookup_width(c, is_cjk),NextCharInfo::Default),
224+
}
225+
}
226+
}
227+
}
228+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp