Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Assign width 1 to control characters#45

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Merged
Manishearth merged 1 commit into unicode-rs:master from Jules-Bertholet:control
May 9, 2024
Merged
Show file tree
Hide file tree
Changes fromall commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletionsREADME.md
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -16,7 +16,7 @@ use unicode_width::UnicodeWidthStr;

fn main() {
let teststr = "Hello, world!";
let width =UnicodeWidthStr::width(teststr);
let width =teststr.width();
println!("{}", teststr);
println!("The above string is {} columns wide.", width);
let width = teststr.width_cjk();
Expand All@@ -34,9 +34,9 @@ extern crate unicode_width;
use unicode_width::UnicodeWidthStr;

fn main() {
assert_eq!(UnicodeWidthStr::width("👩"), 2); // Woman
assert_eq!(UnicodeWidthStr::width("🔬"), 2); // Microscope
assert_eq!(UnicodeWidthStr::width("👩‍🔬"), 4); // Woman scientist
assert_eq!("👩".width(), 2); // Woman
assert_eq!("🔬".width(), 2); // Microscope
assert_eq!("👩‍🔬".width(), 4); // Woman scientist
}
```

Expand Down
41 changes: 2 additions & 39 deletionsscripts/unicode.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -165,21 +165,14 @@ def load_zero_widths() -> "list[bool]":
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
character. `c` is considered a zero-width character if

- it is a control character,
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
- or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
"""

zw_map = [False] * NUM_CODEPOINTS

# Control characters have width 0
for c in range(0x00, 0x20):
zw_map[c] = True
for c in range(0x7F, 0xA0):
zw_map[c] = True

# `Default_Ignorable_Code_Point`s also have 0 width:
# https://www.unicode.org/faq/unsup_char.html#3
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
Expand DownExpand Up@@ -563,7 +556,7 @@ def emit_module(
/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
/// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
#[inline]
fn lookup_width(c: char, is_cjk: bool) -> usize {
pubfn lookup_width(c: char, is_cjk: bool) -> usize {
let cp = c as usize;

let t1_offset = TABLES_0[cp >> 13 & 0xFF];
Expand DownExpand Up@@ -664,36 +657,6 @@ def emit_module(
"""
)

module.write(
"""
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
/// `None` if `c` is a control character other than `'\\x00'`.
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
#[inline]
pub fn width(c: char, is_cjk: bool) -> Option<usize> {
if c < '\\u{7F}' {
if c >= '\\u{20}' {
// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
Some(1)
} else if c == '\\0' {
// U+0000 *is* a control code, but it's special-cased
Some(0)
} else {
// U+0001 to U+0020 (exclusive) are control codes
None
}
} else if c >= '\\u{A0}' {
// No characters >= U+00A0 are control codes, so we can consult the lookup tables
Some(lookup_width(c, is_cjk))
} else {
// U+007F to U+00A0 (exclusive) are control codes
None
}
}
"""
)

subtable_count = 1
for i, table in enumerate(tables):
new_subtable_count = len(table.buckets())
Expand Down
105 changes: 68 additions & 37 deletionssrc/lib.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -34,12 +34,13 @@
//!
//! 1. [Emoji presentation sequences] have width 2.
//! (The width of a string may therefore differ from the sum of the widths of its characters.)
//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
//! iff their base character fulfills all the following requirements:
//! 2. Outside of an East Asian context, [text presentation sequences] fulfilling all the following requirements
//! have width 1:
//! - Has the [`Emoji_Presentation`] property, and
//! - Not in the [Enclosed Ideographic Supplement] block.
//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 4. The following have width 0:
//! 3. The sequence `"\r\n"` has width 1.
//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 5. The following have width 0:
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
Expand All@@ -55,9 +56,6 @@
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
//! have no defined width, and are ignored when determining the width of a string.
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
Expand DownExpand Up@@ -99,7 +97,7 @@ mod tables;
/// Methods for determining displayed width of Unicode characters.
pub trait UnicodeWidthChar {
/// Returns the character's displayed width in columns, or `None` if the
/// character is a control character other than `'\x00'`.
/// character is a control character.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
Expand All@@ -108,7 +106,7 @@ pub trait UnicodeWidthChar {
fn width(self) -> Option<usize>;

/// Returns the character's displayed width in columns, or `None` if the
/// character is a control character other than `'\x00'`.
/// character is a control character.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
Expand All@@ -120,23 +118,42 @@ pub trait UnicodeWidthChar {
impl UnicodeWidthChar for char {
#[inline]
fn width(self) -> Option<usize> {
cw::width(self, false)
single_char_width(self, false)
}

#[inline]
fn width_cjk(self) -> Option<usize> {
cw::width(self, true)
single_char_width(self, true)
}
}

/// Computes the display width of a lone character, following
/// [UAX #11](https://www.unicode.org/reports/tr11/).
///
/// Yields `None` for control characters (U+0000..=U+001F and U+007F..=U+009F).
/// Printable ASCII always occupies one column; everything from U+00A0 upwards
/// is resolved through the generated lookup tables. When `is_cjk` is `true`,
/// ambiguous-width characters count as two columns, otherwise as one.
#[inline]
fn single_char_width(c: char, is_cjk: bool) -> Option<usize> {
    match c {
        // C0 controls (NUL included), DEL, and the C1 controls have no width of their own.
        '\0'..='\u{1F}' | '\u{7F}'..='\u{9F}' => None,
        // Printable ASCII: U+0020 through U+007E.
        '\u{20}'..='\u{7E}' => Some(1),
        // Nothing at or above U+00A0 is a control code, so the tables are authoritative.
        _ => Some(cw::lookup_width(c, is_cjk)),
    }
}

/// Methods for determining displayed width of Unicode strings.
pub trait UnicodeWidthStr {
/// Returns the string's displayed width in columns.
///
/// Control characters are treated as having zero width,
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// are assigned width 2.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 1 column wide. This is consistent with the recommendations for
Expand All@@ -145,10 +162,6 @@ pub trait UnicodeWidthStr {

/// Returns the string's displayed width in columns.
///
/// Control characters are treated as having zero width,
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// are assigned width 2.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 2 column wide. This is consistent with the recommendations for
Expand All@@ -168,30 +181,48 @@ impl UnicodeWidthStr for str {
}
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum VariationSelector {
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum NextCharInfo {
#[default]
Default,
LineFeed = 0x0A,
Vs15 = 0x0E,
Vs16 = 0x0F,
}

fn str_width(s: &str, is_cjk: bool) -> usize {
s.chars()
.rfold((0, None), |(sum, vsel), c| match c {
'\u{FE0E}' => (sum, Some(VariationSelector::Vs15)),
'\u{FE0F}' => (sum, Some(VariationSelector::Vs16)),
_ => {
let add = match vsel {
Some(VariationSelector::Vs15)
if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
{
1
}

Some(VariationSelector::Vs16) if cw::starts_emoji_presentation_seq(c) => 2,
_ => cw::width(c, is_cjk).unwrap_or(0),
};
(sum + add, None)
}
.rfold((0, NextCharInfo::Default), |(sum, next_info), c| {
let (add, info) = width_in_str(c, is_cjk, next_info);
(sum + add, info)
})
.0
}

/// Computes the [UAX #11](https://www.unicode.org/reports/tr11/) based width that `c`
/// contributes inside a string, given `next_info`, the state produced for the
/// character that follows it (the caller folds over the string back to front).
///
/// Returns the number of columns to add together with the state to hand to the
/// preceding character:
///
/// - A variation selector (U+FE0E / U+FE0F) adds nothing by itself and is recorded
///   in the returned state so the preceding character can react to it.
/// - `'\n'` adds 1 and is recorded, so that a directly preceding `'\r'` adds 0.
/// - Any other character at or below U+00A0 adds 1.
/// - Everything else is resolved through the lookup tables.
///
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
#[inline]
fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
    // A following VS15 narrows a qualifying base character to 1 column,
    // but only outside of a CJK context.
    if next_info == NextCharInfo::Vs15
        && !is_cjk
        && cw::starts_non_ideographic_text_presentation_seq(c)
    {
        return (1, NextCharInfo::Default);
    }

    // A following VS16 widens a qualifying base character to 2 columns.
    if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) {
        return (2, NextCharInfo::Default);
    }

    match c {
        '\n' => (1, NextCharInfo::LineFeed),
        // The CR of a CRLF pair is free: the LF has already been counted.
        '\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
        '\u{FE0E}' => (0, NextCharInfo::Vs15),
        '\u{FE0F}' => (0, NextCharInfo::Vs16),
        // Remaining low code points, control characters included, take one column.
        _ if c <= '\u{A0}' => (1, NextCharInfo::Default),
        _ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
    }
}
Loading

[8]ページ先頭

©2009-2025 Movatter.jp