3030//! # Rules for determining width
3131//!
3232//! This crate currently uses the following rules to determine the width of a
33- //! character or string, in order of decreasing precedence. These may be tweaked in the future.
33+ //! character or string, in order of decreasing precedence. These may be tweaked in the future;
34+ //! however see [guarantees](#guarantees) below.
3435//!
3536//! 1. [Emoji presentation sequences] have width 2.
36- //! (The width of a string may therefore differ from the sum of the widths of its characters.)
37- //! 2. Outside of an East Asian context, [text presentation sequences] fulfilling all the following requirements
38- //! have width 1:
37+ //! 2. Outside of an East Asian context, [text presentation sequences] have width 1
38+ //! if their base character:
3939//! - Has the [`Emoji_Presentation`] property, and
40- //! -Not in the [Enclosed Ideographic Supplement] block.
40+ //! - Is not in the [Enclosed Ideographic Supplement] block.
4141//! 3. The sequence `"\r\n"` has width 1.
4242//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
4343//! 5. The following have width 0:
4444//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
45- //! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
45+ //! with the [`Default_Ignorable_Code_Point`] property.
4646//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
4747//! with the [`Grapheme_Extend`] property.
4848//! - The following 8 characters, all of which have NFD decompositions consisting of two [`Grapheme_Extend`] characters:
6262//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
6363//! 8. All other characters have width 1.
6464//!
65+ //! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
6566//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
6667//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
6768//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
7172//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
7273//! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6
7374//!
74- //! [Emoji presentation sequences]:( https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
75- //! [text presentation sequences]:( https://unicode.org/reports/tr51/#def_text_presentation_sequence)
75+ //! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
76+ //! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
7677//!
7778//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
7879//!
79- //! ##Canonical equivalence
80+ //! ## Guarantees
8081//!
81- //! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
82- //! However, this guarantee does not currently hold for the CJK width variants.
82+ //! - Any two canonically equivalent strings have the same non-CJK width.
83+ //! This will not change in any future semver-compatible version.
84+ //! (This guarantee does not currently hold for the CJK width variants.)
85+ //! - The width of any string equals the sum of the widths of its [extended grapheme clusters].
86+ //! This is unlikely to change in any future semver-compatible version.
87+ //! (This guarantee holds for both CJK and non-CJK width.)
88+ //!
89+ //! [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
8390
8491#![ forbid( unsafe_code) ]
8592#![ deny( missing_docs) ]
@@ -95,6 +102,14 @@ pub use tables::UNICODE_VERSION;
95102mod tables;
96103
97104/// Methods for determining displayed width of Unicode characters.
105+ ///
106+ /// **NB:** the width of a string may differ from the sum of the widths of its characters;
107+ /// see the [crate-level documentation](crate#rules-for-determining-width) for more.
108+ /// Instead of working with individual characters, consider using [extended grapheme clusters],
109+ /// perhaps with the [`unicode-segmentation`] crate.
110+ ///
111+ /// [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
112+ /// [`unicode-segmentation`]: https://docs.rs/unicode-segmentation/latest/unicode_segmentation/trait.UnicodeSegmentation.html#tymethod.graphemes
98113pub trait UnicodeWidthChar {
99114/// Returns the character's displayed width in columns, or `None` if the
100115/// character is a control character.