Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit6b503fa

Browse files
Support emoji presentation sequences
1 parent3885393 commit6b503fa

File tree

6 files changed

+309
-18
lines changed

6 files changed

+309
-18
lines changed

‎.github/workflows/rust.yml‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ jobs:
3232
runs-on:ubuntu-latest
3333
steps:
3434
-uses:actions/checkout@v3
35+
-uses:actions/setup-python@v5
36+
with:
37+
python-version:'3.12'
3538
-name:Regen
3639
run:cd scripts && python3 unicode.py
3740
-name:Diff

‎benches/benches.rs‎

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,11 @@ fn jawiki(b: &mut Bencher) {
104104
let string = std::fs::read_to_string(data_path).unwrap_or_default();
105105
b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str())));
106106
}
107+
108+
#[bench]
109+
fnemoji(b:&mutBencher){
110+
// To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt
111+
let data_path ="bench_data/emoji-style.txt";
112+
let string = std::fs::read_to_string(data_path).unwrap_or_default();
113+
b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str())));
114+
}

‎scripts/unicode.py‎

Lines changed: 141 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# - HangulSyllableType.txt
1818
# - PropList.txt
1919
# - ReadMe.txt
20+
# - emoji/emoji-variation-sequences.txt
2021
#
2122
# Since this should not require frequent updates, we just store this
2223
# out-of-line and check the generated module into git.
@@ -26,6 +27,8 @@
2627
importos
2728
importre
2829
importsys
30+
fromcollectionsimportdefaultdict
31+
fromitertoolsimportbatched
2932

3033
NUM_CODEPOINTS=0x110000
3134
"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
@@ -69,12 +72,13 @@ def fetch_open(filename: str):
6972
"""Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
7073
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
7174
"""
72-
ifnotos.path.exists(os.path.basename(filename)):
75+
basename=os.path.basename(filename)
76+
ifnotos.path.exists(basename):
7377
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
7478
try:
75-
returnopen(filename,encoding="utf-8")
79+
returnopen(basename,encoding="utf-8")
7680
exceptOSError:
77-
sys.stderr.write(f"cannot load{filename}")
81+
sys.stderr.write(f"cannot load{basename}")
7882
sys.exit(1)
7983

8084

@@ -384,8 +388,71 @@ def make_tables(
384388
returntables
385389

386390

391+
defload_variation_sequences()->"list[int]":
392+
"""Outputs a list of codepoints, corresponding to all the valid characters for starting
393+
an emoji presentation sequence."""
394+
395+
withfetch_open("emoji/emoji-variation-sequences.txt")assequences:
396+
# Match all emoji presentation sequences
397+
# (one codepoint followed by U+FE0F, and labeled "emoji style")
398+
sequence=re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")
399+
codepoints= []
400+
forlineinsequences.readlines():
401+
ifmatch:=sequence.match(line):
402+
cp=int(match.group(1),16)
403+
codepoints.append(cp)
404+
returncodepoints
405+
406+
407+
defmake_variation_sequence_table(
408+
seqs:"list[int]",
409+
width_map:"list[EffectiveWidth]",
410+
)->"tuple[list[int], list[list[int]]]":
411+
"""Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence.
412+
(Characters that are always wide may be excluded.)
413+
The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
414+
"""
415+
416+
prefixes_dict=defaultdict(set)
417+
forcpinseqs:
418+
prefixes_dict[cp>>10].add(cp&0x3FF)
419+
420+
# We don't strictly need to keep track of characters that are always wide,
421+
# because being in an emoji variation seq won't affect their width.
422+
# So store their info only when it wouldn't inflate the size of the tables.
423+
forkinlist(prefixes_dict.keys()):
424+
ifall(
425+
map(
426+
lambdacp:width_map[(k<<10)|cp]==EffectiveWidth.WIDE,
427+
prefixes_dict[k],
428+
)
429+
):
430+
delprefixes_dict[k]
431+
432+
indexes=list(prefixes_dict.keys())
433+
434+
# Similarly, we can spuriously return `true` for always-wide characters
435+
# even if not part of a presentation seq; this saves an additional lookup,
436+
# so we should do it where there is no size cost.
437+
forcp,widthinenumerate(width_map):
438+
ifwidth==EffectiveWidth.WIDEand (cp>>10)inindexes:
439+
prefixes_dict[cp>>10].add(cp&0x3FF)
440+
441+
leaves= []
442+
forcpsinprefixes_dict.values():
443+
leaf= [0]*128
444+
forcpincps:
445+
idx_in_leaf,bit_shift=divmod(cp,8)
446+
leaf[idx_in_leaf]|=1<<bit_shift
447+
leaves.append(leaf)
448+
return (indexes,leaves)
449+
450+
387451
defemit_module(
388-
out_name:str,unicode_version:"tuple[int, int, int]",tables:"list[Table]"
452+
out_name:str,
453+
unicode_version:"tuple[int, int, int]",
454+
tables:"list[Table]",
455+
variation_table:"tuple[list[int], list[list[int]]]",
389456
):
390457
"""Outputs a Rust module to `out_name` using table data from `tables`.
391458
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -462,6 +529,40 @@ def emit_module(
462529
"""
463530
)
464531

532+
variation_idx,variation_leaves=variation_table
533+
534+
module.write(
535+
"""
536+
/// Whether this character forms an [emoji presentation sequence]
537+
/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
538+
/// when followed by `'\\u{FE0F}'`.
539+
/// Emoji presentation sequences are considered to have width 2.
540+
/// This may spuriously return `true` or `false` for characters that are always wide.
541+
#[inline]
542+
pub fn starts_emoji_presentation_seq(c: char) -> bool {
543+
let cp: u32 = c.into();
544+
// First level of lookup uses all but 10 LSB
545+
let top_bits = cp >> 10;
546+
let idx_of_leaf: usize = match top_bits {
547+
"""
548+
)
549+
550+
fori,msbsinenumerate(variation_idx):
551+
module.write(f"{msbs} =>{i},\n")
552+
553+
module.write(
554+
""" _ => return false,
555+
};
556+
// Extract the 3-9th (0-indexed) least significant bits of `cp`,
557+
// and use them to index into `leaf_row`.
558+
let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
559+
let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
560+
// Use the 3 LSB of `cp` to index into `leaf_byte`.
561+
((leaf_byte >> (cp & 7)) & 1) == 1
562+
}
563+
"""
564+
)
565+
465566
module.write(
466567
"""
467568
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
@@ -510,6 +611,29 @@ def emit_module(
510611
module.write(f" 0x{byte:02X},")
511612
module.write("\n ];\n")
512613
subtable_count=new_subtable_count
614+
615+
# emoji table
616+
617+
module.write(
618+
f"""
619+
#[repr(align(128))]
620+
struct Align128<T>(T);
621+
/// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
622+
/// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
623+
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{len(variation_leaves)}]> = Align128([
624+
"""
625+
)
626+
forleafinvariation_leaves:
627+
module.write(" [\n")
628+
forrowinbatched(leaf,14):
629+
module.write(" ")
630+
forentryinrow:
631+
module.write(f" 0x{entry:02X},")
632+
module.write("\n")
633+
module.write(" ],\n")
634+
635+
module.write(" ]);\n")
636+
513637
module.write("}\n")
514638

515639

@@ -520,6 +644,7 @@ def main(module_filename: str):
520644
521645
We obey the following rules, in decreasing order of importance:
522646
647+
- Emoji presentation sequences are double-width.
523648
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
524649
- Hangul jamo medial vowels & final consonants are zero-width.
525650
- `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
@@ -549,16 +674,25 @@ def main(module_filename: str):
549674

550675
tables=make_tables(TABLE_CFGS,enumerate(width_map))
551676

677+
emoji_variations=load_variation_sequences()
678+
variation_table=make_variation_sequence_table(emoji_variations,width_map)
679+
552680
print("------------------------")
553681
total_size=0
554682
fori,tableinenumerate(tables):
555683
size_bytes=len(table.to_bytes())
556-
print(f"Table{i}Size:{size_bytes} bytes")
684+
print(f"Table{i}size:{size_bytes} bytes")
557685
total_size+=size_bytes
686+
emoji_index_size=len(variation_table[0])*4
687+
print(f"Emoji presentation index size:{emoji_index_size} bytes")
688+
total_size+=emoji_index_size
689+
emoji_leaves_size=len(variation_table[1])*len(variation_table[1][0])
690+
print(f"Emoji presentation leaves size:{emoji_leaves_size} bytes")
691+
total_size+=emoji_leaves_size
558692
print("------------------------")
559-
print(f" TotalSize:{total_size} bytes")
693+
print(f" Totalsize:{total_size} bytes")
560694

561-
emit_module(module_filename,version,tables)
695+
emit_module(module_filename,version,tables,variation_table)
562696
print(f'Wrote to "{module_filename}"')
563697

564698

‎src/lib.rs‎

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,11 @@
3333
//! This crate currently uses the following rules to determine the width of a
3434
//! character or string, in order of decreasing precedence. These may be tweaked in the future.
3535
//!
36-
//! 1. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
37-
//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
38-
//! 3. The following have width 0:
36+
//! 1. [Emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
37+
//! have width 2. (The width of a string may therefore differ from the sum of the widths of its characters.)
38+
//! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
39+
//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
40+
//! 4. The following have width 0:
3941
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4042
//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
4143
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -53,15 +55,15 @@
5355
//! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593)
5456
//! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
5557
//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
56-
//!4. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
58+
//!5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
5759
//! have no defined width, and are ignored when determining the width of a string.
58-
//!5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
60+
//!6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
5961
//! with an [`East_Asian_Width`] of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2)
6062
//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2.
61-
//!6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
63+
//!7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
6264
//! with an [`East_Asian_Width`] of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6)
6365
//! have width 2 in an East Asian context, and width 1 otherwise.
64-
//!7. All other characters have width 1.
66+
//!8. All other characters have width 1.
6567
//!
6668
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
6769
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
@@ -122,7 +124,9 @@ impl UnicodeWidthChar for char {
122124
pubtraitUnicodeWidthStr{
123125
/// Returns the string's displayed width in columns.
124126
///
125-
/// Control characters are treated as having zero width.
127+
/// Control characters are treated as having zero width,
128+
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
129+
/// are assigned width 2.
126130
///
127131
/// This function treats characters in the Ambiguous category according
128132
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -132,7 +136,9 @@ pub trait UnicodeWidthStr {
132136

133137
/// Returns the string's displayed width in columns.
134138
///
135-
/// Control characters are treated as having zero width.
139+
/// Control characters are treated as having zero width,
140+
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
141+
/// are assigned width 2.
136142
///
137143
/// This function treats characters in the Ambiguous category according
138144
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -144,11 +150,28 @@ pub trait UnicodeWidthStr {
144150
implUnicodeWidthStrforstr{
145151
#[inline]
146152
fnwidth(&self) ->usize{
147-
self.chars().map(|c| cw::width(c,false).unwrap_or(0)).sum()
153+
str_width(self,false)
148154
}
149155

150156
#[inline]
151157
fnwidth_cjk(&self) ->usize{
152-
self.chars().map(|c| cw::width(c,true).unwrap_or(0)).sum()
158+
str_width(self,true)
153159
}
154160
}
161+
162+
fnstr_width(s:&str,is_cjk:bool) ->usize{
163+
s.chars()
164+
.rfold((0,false), |(sum, was_fe0f), c|{
165+
if c =='\u{FE0F}'{
166+
(sum,true)
167+
}else{
168+
let add =if was_fe0f && cw::starts_emoji_presentation_seq(c){
169+
2
170+
}else{
171+
cw::width(c, is_cjk).unwrap_or(0)
172+
};
173+
(sum + add,false)
174+
}
175+
})
176+
.0
177+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp