Notifications: You must be signed in to change notification settings
Fork 32
Star 278

Commit dc86c74

committed

Assign the same CJK width to canonically equivalent strings

1 parent a2db56b, commit dc86c74 (Copy full SHA for dc86c74)

File tree

4 files changed

+462

-394

lines changed

4 files changed

+462

-394

lines changed

`‎scripts/unicode.py‎`

Lines changed: 43 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -15,9 +15,14 @@`
`15`	`15`	`# - DerivedCoreProperties.txt`
`16`	`16`	`# - EastAsianWidth.txt`
`17`	`17`	`# - HangulSyllableType.txt`
	`18`	`+# - NormalizationTest.txt (for tests only)`
`18`	`19`	`# - PropList.txt`
`19`	`20`	`# - ReadMe.txt`
	`21`	`+# - Scripts.txt`
	`22`	`+# - UnicodeData.txt`
	`23`	`+# - emoji/emoji-data.txt`
`20`	`24`	`# - emoji/emoji-variation-sequences.txt`
	`25`	`+# - extracted/DerivedGeneralCategory.txt`
`21`	`26`	`#`
`22`	`27`	`# Since this should not require frequent updates, we just store this`
`23`	`28`	`# out-of-line and check the generated module into git.`
`@@ -142,6 +147,7 @@ def load_east_asian_widths() -> list[EffectiveWidth]:`
`142`	`147`	`Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
`143`	`148`
`144`	`149`	`Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
	`150`	`+`
`145`	`151`	`withfetch_open("EastAsianWidth.txt")aseaw:`
`146`	`152`	`# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."`
`147`	`153`	`single=re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")`
`@@ -179,7 +185,43 @@ def load_east_asian_widths() -> list[EffectiveWidth]:`
`179`	`185`	`# Catch any leftover codepoints and assign them implicit Neutral/narrow width.`
`180`	`186`	`width_map.append(EffectiveWidth.NARROW)`
`181`	`187`
`182`		`-returnwidth_map`
	`188`	`+# Characters from alphabetic scripts are narrow`
	`189`	`+load_property(`
	`190`	`+"Scripts.txt",`
	`191`	`+r"(?:Latin\|Greek\|Cyrillic)",`
	`192`	`+lambdacp: (`
	`193`	`+operator.setitem(width_map,cp,EffectiveWidth.NARROW)`
	`194`	`+ifwidth_map[cp]==EffectiveWidth.AMBIGUOUS`
	`195`	`+andnot (0x2160<=cp<=0x217F)# Roman numerals remain ambiguous`
	`196`	`+elseNone`
	`197`	`+ ),`
	`198`	`+ )`
	`199`	`+`
	`200`	+# Ambiguous `Modifier_Symbol`s are narrow
	`201`	`+load_property(`
	`202`	`+"extracted/DerivedGeneralCategory.txt",`
	`203`	`+"Sk",`
	`204`	`+lambdacp: (`
	`205`	`+operator.setitem(width_map,cp,EffectiveWidth.NARROW)`
	`206`	`+ifwidth_map[cp]==EffectiveWidth.AMBIGUOUS`
	`207`	`+elseNone`
	`208`	`+ ),`
	`209`	`+ )`
	`210`	`+`
	`211`	`+# GREEK ANO TELEIA: NFC decomposes to U+00B7 MIDDLE DOT`
	`212`	`+width_map[0x0387]=EffectiveWidth.AMBIGUOUS`
	`213`	`+`
	`214`	`+# Canonical equivalence for symbols with stroke`
	`215`	`+withfetch_open("UnicodeData.txt")asudata:`
	`216`	`+single=re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-Z]+) 0338;")`
	`217`	`+forlineinudata.readlines():`
	`218`	`+ifmatch:=single.match(line):`
	`219`	`+composed=int(match.group(1),16)`
	`220`	`+decomposed=int(match.group(2),16)`
	`221`	`+ifwidth_map[decomposed]==EffectiveWidth.AMBIGUOUS:`
	`222`	`+width_map[composed]=EffectiveWidth.AMBIGUOUS`
	`223`	`+`
	`224`	`+returnwidth_map`
`183`	`225`
`184`	`226`
`185`	`227`	`defload_zero_widths()->list[bool]:`

`‎src/lib.rs‎`

Lines changed: 29 additions & 13 deletions

Original file line number	Diff line number	Diff line change
`@@ -40,8 +40,9 @@`
`40`	`40`	//! 3. The sequence `"\r\n"` has width 1.
`41`	`41`	//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
`42`	`42`	//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
`43`		-//! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
`44`		`-//! 6. The following have width 0:`
	`43`	+//! 5. In an East Asian context only, `<`, `=`, or `>` have width 2 when followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY].
	`44`	+//! 6. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
	`45`	`+//! 7. The following have width 0:`
`45`	`46`	`//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)`
`46`	`47`	//! with the [`Default_Ignorable_Code_Point`] property.
`47`	`48`	`//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)`
`@@ -64,18 +65,26 @@`
`64`	`65`	//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
`65`	`66`	//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
`66`	`67`	//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
`67`		`-//!7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)`
	`68`	`+//!8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)`
`68`	`69`	//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
`69`		`-//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)`
`70`		-//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
`71`		`-//! 9. All other characters have width 1.`
	`70`	`+//! 9. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:`
	`71`	+//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
	`72`	+//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
	`73`	+//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
	`74`	+//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
	`75`	+//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
	`76`	`+//! 10. All other characters have width 1.`
	`77`	`+//!`
	`78`	+//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
`72`	`79`	`//!`
`73`	`80`	//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
`74`	`81`	//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
`75`	`82`	//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
	`83`	+//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
`76`	`84`	//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
`77`	`85`	//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
`78`	`86`	//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
	`87`	+//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
`79`	`88`	`//!`
`80`	`89`	//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
`81`	`90`	//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
`@@ -84,14 +93,13 @@`
`84`	`93`	`//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence`
`85`	`94`	`//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence`
`86`	`95`	`//!`
`87`		`-//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf`
	`96`	`+//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html`
`88`	`97`	`//!`
`89`	`98`	`//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078`
`90`	`99`	`//!`
`91`	`100`	`//! ## Canonical equivalence`
`92`	`101`	`//!`
`93`		`-//! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.`
`94`		`-//! However, this guarantee does not currently hold for the CJK width variants.`
	`102`	`+//! Canonically equivalent strings are assigned the same width (CJK and non-CJK).`
`95`	`103`
`96`	`104`	`#![forbid(unsafe_code)]`
`97`	`105`	`#![deny(missing_docs)]`
`@@ -198,14 +206,17 @@ enum NextCharInfo {`
`198`	`206`	`#[default]`
`199`	`207`	`Default,`
`200`	`208`	/// `'\n'`
`201`		`-LineFeed =0x0A,`
	`209`	`+LineFeed,`
	`210`	+/// `'\u{0338}'`
	`211`	`+/// For preserving canonical equivalence with CJK`
	`212`	`+CombiningLongSolidusOverlay,`
`202`	`213`	/// `'\u{A4FC}'..='\u{A4FD}'`
`203`	`214`	`/// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>`
`204`	`215`	`TrailingLisuToneLetter,`
`205`	`216`	/// `'\u{FE0E}'`
`206`		`-Vs15 =0x0E,`
	`217`	`+Vs15,`
`207`	`218`	/// `'\u{FE0F}'`
`208`		`-Vs16 =0x0F,`
	`219`	`+Vs16,`
`209`	`220`	`}`
`210`	`221`
`211`	`222`	`fnstr_width(s:&str,is_cjk:bool) ->usize{`
`@@ -222,7 +233,11 @@ fn str_width(s: &str, is_cjk: bool) -> usize {`
`222`	`233`	`/// they're treated as single width.`
`223`	`234`	`#[inline]`
`224`	`235`	`fnwidth_in_str(c:char,is_cjk:bool,next_info:NextCharInfo) ->(usize,NextCharInfo){`
`225`		`-if next_info ==NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c){`
	`236`	`+if(is_cjk`
	`237`	`+ && next_info ==NextCharInfo::CombiningLongSolidusOverlay`
	`238`	`+ &&matches!(c,'<' \|'=' \|'>'))`
	`239`	`+ \|\|(next_info ==NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c))`
	`240`	`+{`
`226`	`241`	`(2,NextCharInfo::Default)`
`227`	`242`	`}elseif c <='\u{A0}'{`
`228`	`243`	`match c{`
`@@ -235,6 +250,7 @@ fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextC`
`235`	`250`	`('\u{A4F8}'..='\u{A4FB}',NextCharInfo::TrailingLisuToneLetter) =>{`
`236`	`251`	`(0,NextCharInfo::Default)`
`237`	`252`	`}`
	`253`	`+('\u{0338}', _) =>(0,NextCharInfo::CombiningLongSolidusOverlay),`
`238`	`254`	`('\u{A4FC}'..='\u{A4FD}', _) =>(1,NextCharInfo::TrailingLisuToneLetter),`
`239`	`255`	`('\u{FE0E}', _) =>(0,NextCharInfo::Vs15),`
`240`	`256`	`('\u{FE0F}', _) =>(0,NextCharInfo::Vs16),`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit dc86c74

File tree

4 files changed

4 files changed

`‎scripts/unicode.py‎`

`‎src/lib.rs‎`

0 commit comments