May 21, 2024 · May 13, 2024 · May 13, 2024
diff --git a/scripts/unicode.py b/scripts/unicode.py
        {EffectiveWidth.NARROW, EffectiveWidth.AMBIGUOUS},
    )

    # Download files for use by tests
    # Download normalization test file for use by tests
    fetch_open("NormalizationTest.txt", "../tests/")
    fetch_open("auxiliary/GraphemeBreakTest.txt", "../tests/")

    print("------------------------")
    total_size = 0
diff --git a/src/lib.rs b/src/lib.rs
 //! # Rules for determining width
 //!
 //! This crate currently uses the following rules to determine the width of a
 //! character or string, in order of decreasing precedence. These may be tweaked in the future;
 //! however see [guarantees](#guarantees) below.
 //! character or string, in order of decreasing precedence. These may be tweaked in the future.
 //!
 //! 1. [Emoji presentation sequences] have width 2.
 //! 2. Outside of an East Asian context, [text presentation sequences] have width 1
 //!    if their base character:
 //!    - Has the [`Emoji_Presentation`] property, and
 //!    - Is not in the [Enclosed Ideographic Supplement] block.
 //! 3. The sequence `"\r\n"` has width 1.
 //! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
 //! 5. The following have width 0:
 //! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
 //!    followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
 //! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
 //! 6. The following have width 0:
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
 //!       with the [`Default_Ignorable_Code_Point`] property.
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
 //!      - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
 //!       with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
 //! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
 //! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
 //!    with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
 //! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
 //! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
 //!    with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
 //! 8. All other characters have width 1.
 //! 9. All other characters have width 1.
 //!
 //! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
 //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
 //!
 //! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
 //!
 //! ## Guarantees
 //! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
 //!
 //! - Any two canonically equivalent strings have the same non-CJK width.
 //!   This will not change in any future semver-compatible version.
 //!   (This guarantee does not currently hold for the CJK width variants.)
 //! - The width of any string equals the sum of the widths of its [extended grapheme clusters].
 //!   This is unlikely to change in any future semver-compatible version.
 //!   (This guarantee holds for both CJK and non-CJK width.)
 //! ## Canonical equivalence
 //!
 //! [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
 //! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
 //! However, this guarantee does not currently hold for the CJK width variants.

 #![forbid(unsafe_code)]
 #![deny(missing_docs)]
 mod tables;

 /// Methods for determining displayed width of Unicode characters.
 ///
 /// **NB:** the width of a string may differ from the sum of the widths of its characters;
 /// see the [crate-level documentation](crate#rules-for-determining-width) for more.
 /// Instead of working with individual characters, consider using [extended grapheme clusters],
 /// perhaps with the [`unicode-segmentation`] crate.
 ///
 /// [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
 /// [`unicode-segmentation`]: https://docs.rs/unicode-segmentation/latest/unicode_segmentation/trait.UnicodeSegmentation.html#tymethod.graphemes
 pub trait UnicodeWidthChar {
    /// Returns the character's displayed width in columns, or `None` if the
    /// character is a control character.
 // NOTE(review): describes the character that follows the one currently being
 // measured. `width_in_str` takes one of these as `next_info` and returns one
 // alongside the computed width; the attributes on this enum are outside this view.
 enum NextCharInfo {
    /// Returned by `width_in_str` when the current character is none of the
    /// special characters described by the other variants.
    #[default]
    Default,
    /// `'\n'`
    ///
    /// A `'\r'` seen with this as its `next_info` is given width 0, so that
    /// `"\r\n"` has width 1 in total.
    LineFeed = 0x0A,
    /// `'\u{A4FC}'..='\u{A4FD}'`
    /// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
    ///
    /// A character in `'\u{A4F8}'..='\u{A4FB}'` seen with this as its
    /// `next_info` is given width 0.
    TrailingLisuToneLetter,
    /// `'\u{FE0E}'`
    ///
    /// Outside of a CJK context, a character that starts a non-ideographic
    /// text presentation sequence and is seen with this as its `next_info`
    /// is given width 1.
    Vs15 = 0x0E,
    /// `'\u{FE0F}'`
    ///
    /// A character that starts an emoji presentation sequence and is seen
    /// with this as its `next_info` is given width 2.
    Vs16 = 0x0F,
 }

 /// they're treated as single width.
 #[inline]
 fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
    match next_info {
        NextCharInfo::Vs15 if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => {
            (1, NextCharInfo::Default)
    if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) {
        (2, NextCharInfo::Default)
    } else if c <= '\u{A0}' {
        match c {
            '\n' => (1, NextCharInfo::LineFeed),
            '\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
            _ => (1, NextCharInfo::Default),
        }
        NextCharInfo::Vs16 if cw::starts_emoji_presentation_seq(c) => (2, NextCharInfo::Default),
        _ => {
            if c <= '\u{A0}' {
                match c {
                    '\n' => (1, NextCharInfo::LineFeed),
                    '\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
                    _ => (1, NextCharInfo::Default),
                }
            } else {
                match c {
                    '\u{FE0E}' => (0, NextCharInfo::Vs15),
                    '\u{FE0F}' => (0, NextCharInfo::Vs16),
                    _ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
                }
    } else {
        match (c, next_info) {
            ('\u{A4F8}'..='\u{A4FB}', NextCharInfo::TrailingLisuToneLetter) => {
                (0, NextCharInfo::Default)
            }
            ('\u{A4FC}'..='\u{A4FD}', _) => (1, NextCharInfo::TrailingLisuToneLetter),
            ('\u{FE0E}', _) => (0, NextCharInfo::Vs15),
            ('\u{FE0F}', _) => (0, NextCharInfo::Vs16),
            (_, NextCharInfo::Vs15)
                if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
            {
                (1, NextCharInfo::Default)
            }
            _ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
        }
    }
 }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -754,9 +754,8 @@ def main(module_path: str):
		{EffectiveWidth.NARROW, EffectiveWidth.AMBIGUOUS},
		)

		# Download files for use by tests
		# Download normalization test file for use by tests
		fetch_open("NormalizationTest.txt", "../tests/")
		fetch_open("auxiliary/GraphemeBreakTest.txt", "../tests/")

		print("------------------------")
		total_size = 0
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -30,17 +30,18 @@
		//! # Rules for determining width
		//!
		//! This crate currently uses the following rules to determine the width of a
		//! character or string, in order of decreasing precedence. These may be tweaked in the future;
		//! however see [guarantees](#guarantees) below.
		//! character or string, in order of decreasing precedence. These may be tweaked in the future.
		//!
		//! 1. [Emoji presentation sequences] have width 2.
		//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
		//! if their base character:
		//! - Has the [`Emoji_Presentation`] property, and
		//! - Is not in the [Enclosed Ideographic Supplement] block.
		//! 3. The sequence `"\r\n"` has width 1.
		//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
		//! 5. The following have width 0:
		//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
		//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
		//! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
		//! 6. The following have width 0:
		//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
		//! with the [`Default_Ignorable_Code_Point`] property.
		//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
Expand All		@@ -56,11 +57,11 @@
		//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
		//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
		//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
		//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
		//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
		//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
		//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
		//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
		//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
		//! 8. All other characters have width 1.
		//! 9. All other characters have width 1.
		//!
		//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
		//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
Expand All		@@ -77,16 +78,12 @@
		//!
		//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
		//!
		//! ## Guarantees
		//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
		//!
		//! - Any two canonically equivalent strings have the same non-CJK width.
		//! This will not change in any future semver-compatible version.
		//! (This guarantee does not currently hold for the CJK width variants.)
		//! - The width of any string equals the sum of the widths of its [extended grapheme clusters].
		//! This is unlikely to change in any future semver-compatible version.
		//! (This guarantee holds for both CJK and non-CJK width.)
		//! ## Canonical equivalence
		//!
		//! [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
		//! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
		//! However, this guarantee does not currently hold for the CJK width variants.

		#![forbid(unsafe_code)]
		#![deny(missing_docs)]
Expand All		@@ -102,14 +99,6 @@ pub use tables::UNICODE_VERSION;
		mod tables;

		/// Methods for determining displayed width of Unicode characters.
		///
		/// NB: the width of a string may differ from the sum of the widths of its characters;
		/// see the [crate-level documentation](crate#rules-for-determining-width) for more.
		/// Instead of working with individual characters, consider using [extended grapheme clusters],
		/// perhaps with the [`unicode-segmentation`] crate.
		///
		/// [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
		/// [`unicode-segmentation`]: https://docs.rs/unicode-segmentation/latest/unicode_segmentation/trait.UnicodeSegmentation.html#tymethod.graphemes
		pub trait UnicodeWidthChar {
		/// Returns the character's displayed width in columns, or `None` if the
		/// character is a control character.
Expand DownExpand Up		@@ -200,8 +189,14 @@ impl UnicodeWidthStr for str {
		enum NextCharInfo {
		#[default]
		Default,
		/// `'\n'`
		LineFeed = 0x0A,
		/// `'\u{A4FC}'..='\u{A4FD}'`
		/// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
		TrailingLisuToneLetter,
		/// `'\u{FE0E}'`
		Vs15 = 0x0E,
		/// `'\u{FE0F}'`
		Vs16 = 0x0F,
		}

Expand All		@@ -219,25 +214,28 @@ fn str_width(s: &str, is_cjk: bool) -> usize {
		/// they're treated as single width.
		#[inline]
		fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
		match next_info {
		NextCharInfo::Vs15 if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => {
		(1, NextCharInfo::Default)
		if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) {
		(2, NextCharInfo::Default)
		} else if c <= '\u{A0}' {
		match c {
		'\n' => (1, NextCharInfo::LineFeed),
		'\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
		_ => (1, NextCharInfo::Default),
		}
		NextCharInfo::Vs16 if cw::starts_emoji_presentation_seq(c) => (2, NextCharInfo::Default),
		_ => {
		if c <= '\u{A0}' {
		match c {
		'\n' => (1, NextCharInfo::LineFeed),
		'\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
		_ => (1, NextCharInfo::Default),
		}
		} else {
		match c {
		'\u{FE0E}' => (0, NextCharInfo::Vs15),
		'\u{FE0F}' => (0, NextCharInfo::Vs16),
		_ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
		}
		} else {
		match (c, next_info) {
		('\u{A4F8}'..='\u{A4FB}', NextCharInfo::TrailingLisuToneLetter) => {
		(0, NextCharInfo::Default)
		}
		('\u{A4FC}'..='\u{A4FD}', _) => (1, NextCharInfo::TrailingLisuToneLetter),
		('\u{FE0E}', _) => (0, NextCharInfo::Vs15),
		('\u{FE0F}', _) => (0, NextCharInfo::Vs16),
		(_, NextCharInfo::Vs15)
		if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
		{
		(1, NextCharInfo::Default)
		}
		_ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
		}
		}
		}