Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitdc86c74

Browse files
Assign the same CJK width to canonically equivalent strings
1 parenta2db56b commitdc86c74

File tree

4 files changed

+462
-394
lines changed

4 files changed

+462
-394
lines changed

‎scripts/unicode.py‎

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,14 @@
1515
# - DerivedCoreProperties.txt
1616
# - EastAsianWidth.txt
1717
# - HangulSyllableType.txt
18+
# - NormalizationTest.txt (for tests only)
1819
# - PropList.txt
1920
# - ReadMe.txt
21+
# - Scripts.txt
22+
# - UnicodeData.txt
23+
# - emoji/emoji-data.txt
2024
# - emoji/emoji-variation-sequences.txt
25+
# - extracted/DerivedGeneralCategory.txt
2126
#
2227
# Since this should not require frequent updates, we just store this
2328
# out-of-line and check the generated module into git.
@@ -142,6 +147,7 @@ def load_east_asian_widths() -> list[EffectiveWidth]:
142147
`Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
143148
144149
`Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
150+
145151
withfetch_open("EastAsianWidth.txt")aseaw:
146152
# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
147153
single=re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
@@ -179,7 +185,43 @@ def load_east_asian_widths() -> list[EffectiveWidth]:
179185
# Catch any leftover codepoints and assign them implicit Neutral/narrow width.
180186
width_map.append(EffectiveWidth.NARROW)
181187

182-
returnwidth_map
188+
# Characters from alphabetic scripts are narrow
189+
load_property(
190+
"Scripts.txt",
191+
r"(?:Latin|Greek|Cyrillic)",
192+
lambdacp: (
193+
operator.setitem(width_map,cp,EffectiveWidth.NARROW)
194+
ifwidth_map[cp]==EffectiveWidth.AMBIGUOUS
195+
andnot (0x2160<=cp<=0x217F)# Roman numerals remain ambiguous
196+
elseNone
197+
),
198+
)
199+
200+
# Ambiguous `Modifier_Symbol`s are narrow
201+
load_property(
202+
"extracted/DerivedGeneralCategory.txt",
203+
"Sk",
204+
lambdacp: (
205+
operator.setitem(width_map,cp,EffectiveWidth.NARROW)
206+
ifwidth_map[cp]==EffectiveWidth.AMBIGUOUS
207+
elseNone
208+
),
209+
)
210+
211+
# GREEK ANO TELEIA: canonically decomposes (singleton) to U+00B7 MIDDLE DOT, so it must share that character's width
212+
width_map[0x0387]=EffectiveWidth.AMBIGUOUS
213+
214+
# Canonical equivalence for symbols with stroke
215+
withfetch_open("UnicodeData.txt")asudata:
216+
single=re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-Z]+) 0338;")
217+
forlineinudata.readlines():
218+
ifmatch:=single.match(line):
219+
composed=int(match.group(1),16)
220+
decomposed=int(match.group(2),16)
221+
ifwidth_map[decomposed]==EffectiveWidth.AMBIGUOUS:
222+
width_map[composed]=EffectiveWidth.AMBIGUOUS
223+
224+
returnwidth_map
183225

184226

185227
defload_zero_widths()->list[bool]:

‎src/lib.rs‎

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,9 @@
4040
//! 3. The sequence `"\r\n"` has width 1.
4141
//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
4242
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
43-
//! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
44-
//! 6. The following have width 0:
43+
//! 5. In an East Asian context only, `<`, `=`, or `>` have width 2 when followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY].
44+
//! 6. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
45+
//! 7. The following have width 0:
4546
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4647
//! with the [`Default_Ignorable_Code_Point`] property.
4748
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -64,18 +65,26 @@
6465
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
6566
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
6667
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
67-
//!7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
68+
//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
6869
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
69-
//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
70-
//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
71-
//! 9. All other characters have width 1.
70+
//! 9. A character that fulfills all of the following conditions has width 2 in an East Asian context, and width 1 otherwise:
71+
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
72+
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
73+
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
74+
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
75+
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
76+
//! 10. All other characters have width 1.
77+
//!
78+
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
7279
//!
7380
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
7481
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
7582
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
83+
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
7684
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
7785
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
7886
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
87+
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
7988
//!
8089
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
8190
//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
@@ -84,14 +93,13 @@
8493
//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
8594
//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
8695
//!
87-
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
96+
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
8897
//!
8998
//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
9099
//!
91100
//! ## Canonical equivalence
92101
//!
93-
//! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
94-
//! However, this guarantee does not currently hold for the CJK width variants.
102+
//! Canonically equivalent strings are assigned the same width (CJK and non-CJK).
95103
96104
#![forbid(unsafe_code)]
97105
#![deny(missing_docs)]
@@ -198,14 +206,17 @@ enum NextCharInfo {
198206
#[default]
199207
Default,
200208
/// `'\n'`
201-
LineFeed =0x0A,
209+
LineFeed,
210+
/// `'\u{0338}'`
211+
/// Tracked to preserve canonical equivalence in the CJK width variants
212+
CombiningLongSolidusOverlay,
202213
/// `'\u{A4FC}'..='\u{A4FD}'`
203214
/// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
204215
TrailingLisuToneLetter,
205216
/// `'\u{FE0E}'`
206-
Vs15 =0x0E,
217+
Vs15,
207218
/// `'\u{FE0F}'`
208-
Vs16 =0x0F,
219+
Vs16,
209220
}
210221

211222
fnstr_width(s:&str,is_cjk:bool) ->usize{
@@ -222,7 +233,11 @@ fn str_width(s: &str, is_cjk: bool) -> usize {
222233
/// they're treated as single width.
223234
#[inline]
224235
fnwidth_in_str(c:char,is_cjk:bool,next_info:NextCharInfo) ->(usize,NextCharInfo){
225-
if next_info ==NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c){
236+
if(is_cjk
237+
&& next_info ==NextCharInfo::CombiningLongSolidusOverlay
238+
&&matches!(c,'<' |'=' |'>'))
239+
||(next_info ==NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c))
240+
{
226241
(2,NextCharInfo::Default)
227242
}elseif c <='\u{A0}'{
228243
match c{
@@ -235,6 +250,7 @@ fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextC
235250
('\u{A4F8}'..='\u{A4FB}',NextCharInfo::TrailingLisuToneLetter) =>{
236251
(0,NextCharInfo::Default)
237252
}
253+
('\u{0338}', _) =>(0,NextCharInfo::CombiningLongSolidusOverlay),
238254
('\u{A4FC}'..='\u{A4FD}', _) =>(1,NextCharInfo::TrailingLisuToneLetter),
239255
('\u{FE0E}', _) =>(0,NextCharInfo::Vs15),
240256
('\u{FE0F}', _) =>(0,NextCharInfo::Vs16),

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp