3434//!
3535//! 1. [Emoji presentation sequences] have width 2.
3636//! (The width of a string may therefore differ from the sum of the widths of its characters.)
37- //! 2. Outside of an East Asian context, [text presentation sequences]have width 1
38- //!iff their base character fulfills all the following requirements :
37+ //! 2. Outside of an East Asian context, [text presentation sequences] fulfilling all the following requirements
38+ //!    have width 1:
3939//! - Has the [`Emoji_Presentation`] property, and
4040//! - Not in the [Enclosed Ideographic Supplement] block.
41- //! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
42- //! 4. The following have width 0:
41+ //! 3. The sequence `"\r\n"` has width 1.
42+ //! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
43+ //! 5. The following have width 0:
4344//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4445//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
4546//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
5556//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
5657//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
5758//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
58- //! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
59- //! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
60- //! have no defined width, and are ignored when determining the width of a string.
6159//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
6260//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
6361//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
@@ -99,7 +97,7 @@ mod tables;
9997/// Methods for determining displayed width of Unicode characters.
10098pub trait UnicodeWidthChar {
10199/// Returns the character's displayed width in columns, or `None` if the
102- /// character is a control character other than `'\x00'` .
100+ /// character is a control character.
103101///
104102/// This function treats characters in the Ambiguous category according
105103/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -108,7 +106,7 @@ pub trait UnicodeWidthChar {
108106fn width ( self ) ->Option < usize > ;
109107
110108/// Returns the character's displayed width in columns, or `None` if the
111- /// character is a control character other than `'\x00'` .
109+ /// character is a control character.
112110///
113111/// This function treats characters in the Ambiguous category according
114112/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -120,23 +118,42 @@ pub trait UnicodeWidthChar {
120118impl UnicodeWidthChar for char {
121119#[ inline]
122120fn width ( self ) ->Option < usize > {
123- cw :: width ( self , false )
121+ single_char_width ( self , false )
124122}
125123
126124#[ inline]
127125fn width_cjk ( self ) ->Option < usize > {
128- cw:: width ( self , true )
126+ single_char_width ( self , true )
127+ }
128+ }
129+
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
/// `None` if `c` is a control character.
///
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
///
/// Characters below U+00A0 are classified with plain comparisons; only characters at or
/// above U+00A0 are passed to the lookup tables.
#[inline]
fn single_char_width(c: char, is_cjk: bool) -> Option<usize> {
    if c < '\u{7F}' {
        if c >= '\u{20}' {
            // U+0020 to U+007F (exclusive) are single-width ASCII codepoints
            Some(1)
        } else {
            // U+0000 to U+0020 (exclusive) are control codes
            None
        }
    } else if c >= '\u{A0}' {
        // No characters >= U+00A0 are control codes, so we can consult the lookup tables
        Some(cw::lookup_width(c, is_cjk))
    } else {
        // U+007F to U+00A0 (exclusive) are control codes
        None
    }
}
131152
132153/// Methods for determining displayed width of Unicode strings.
133154pub trait UnicodeWidthStr {
134155/// Returns the string's displayed width in columns.
135156///
136- /// Control characters are treated as having zero width,
137- /// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
138- /// are assigned width 2.
139- ///
140157/// This function treats characters in the Ambiguous category according
141158/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
142159/// as 1 column wide. This is consistent with the recommendations for
@@ -145,10 +162,6 @@ pub trait UnicodeWidthStr {
145162
146163/// Returns the string's displayed width in columns.
147164///
148- /// Control characters are treated as having zero width,
149- /// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
150- /// are assigned width 2.
151- ///
152165/// This function treats characters in the Ambiguous category according
153166/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
154167/// as 2 column wide. This is consistent with the recommendations for
@@ -168,30 +181,48 @@ impl UnicodeWidthStr for str {
168181}
169182}
170183
/// State carried between characters while measuring a string: what is known about the
/// character that comes *after* the one currently being measured.
///
/// NOTE(review): the explicit discriminants coincide with the low byte of the characters
/// they stand for (U+000A, U+FE0E, U+FE0F) — presumably deliberate; confirm before
/// renumbering.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum NextCharInfo {
    /// Nothing noteworthy follows (also the state at the end of the string).
    #[default]
    Default,
    /// The following character is `'\n'`.
    LineFeed = 0x0A,
    /// The following character is U+FE0E (variation selector 15, text presentation).
    Vs15 = 0x0E,
    /// The following character is U+FE0F (variation selector 16, emoji presentation).
    Vs16 = 0x0F,
}
176192
177193fn str_width ( s : & str , is_cjk : bool ) ->usize {
178194 s. chars ( )
179- . rfold ( ( 0 , None ) , |( sum, vsel) , c|match c{
180- '\u{FE0E}' =>( sum, Some ( VariationSelector :: Vs15 ) ) ,
181- '\u{FE0F}' =>( sum, Some ( VariationSelector :: Vs16 ) ) ,
182- _ =>{
183- let add =match vsel{
184- Some ( VariationSelector :: Vs15 )
185- if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) =>
186- {
187- 1
188- }
189-
190- Some ( VariationSelector :: Vs16 ) if cw:: starts_emoji_presentation_seq ( c) =>2 ,
191- _ => cw:: width ( c, is_cjk) . unwrap_or ( 0 ) ,
192- } ;
193- ( sum + add, None )
194- }
195+ . rfold ( ( 0 , NextCharInfo :: Default ) , |( sum, next_info) , c|{
196+ let ( add, info) =width_in_str ( c, is_cjk, next_info) ;
197+ ( sum + add, info)
195198} )
196199. 0
197200}
201+
202+ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`.
203+ /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
204+ /// they're treated as single width.
205+ #[ inline]
206+ fn width_in_str ( c : char , is_cjk : bool , next_info : NextCharInfo ) ->( usize , NextCharInfo ) {
207+ match next_info{
208+ NextCharInfo :: Vs15 if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) =>{
209+ ( 1 , NextCharInfo :: Default )
210+ }
211+ NextCharInfo :: Vs16 if cw:: starts_emoji_presentation_seq ( c) =>( 2 , NextCharInfo :: Default ) ,
212+ _ =>{
213+ if c <='\u{A0}' {
214+ match c{
215+ '\n' =>( 1 , NextCharInfo :: LineFeed ) ,
216+ '\r' if next_info ==NextCharInfo :: LineFeed =>( 0 , NextCharInfo :: Default ) ,
217+ _ =>( 1 , NextCharInfo :: Default ) ,
218+ }
219+ } else {
220+ match c{
221+ '\u{FE0E}' =>( 0 , NextCharInfo :: Vs15 ) ,
222+ '\u{FE0F}' =>( 0 , NextCharInfo :: Vs16 ) ,
223+ _ =>( cw:: lookup_width ( c, is_cjk) , NextCharInfo :: Default ) ,
224+ }
225+ }
226+ }
227+ }
228+ }