NotificationsYou must be signed in to change notification settings
Fork32
Star278

Commit6b503fa

committed

Support emoji presentation sequences

1 parent3885393 commit6b503faCopy full SHA for 6b503fa

File tree

6 files changed

+309

-18

lines changed

.github/workflows
- rust.yml
benches
- benches.rs
scripts
- unicode.py
src
- lib.rs
- tables.rs
tests
- tests.rs

6 files changed

+309

-18

lines changed

`‎.github/workflows/rust.yml‎`

Lines changed: 3 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -32,6 +32,9 @@ jobs:`
`32`	`32`	`runs-on:ubuntu-latest`
`33`	`33`	`steps:`
`34`	`34`	`-uses:actions/checkout@v3`
	`35`	`+ -uses:actions/setup-python@v5`
	`36`	`+with:`
	`37`	`+python-version:'3.12'`
`35`	`38`	`-name:Regen`
`36`	`39`	`run:cd scripts && python3 unicode.py`
`37`	`40`	`-name:Diff`

`‎benches/benches.rs‎`

Lines changed: 8 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -104,3 +104,11 @@ fn jawiki(b: &mut Bencher) {`
`104`	`104`	`let string = std::fs::read_to_string(data_path).unwrap_or_default();`
`105`	`105`	`b.iter(\|\| test::black_box(UnicodeWidthStr::width(string.as_str())));`
`106`	`106`	`}`
	`107`	`+`
	`108`	`+#[bench]`
	`109`	`+fnemoji(b:&mutBencher){`
	`110`	`+// To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt`
	`111`	`+let data_path ="bench_data/emoji-style.txt";`
	`112`	`+let string = std::fs::read_to_string(data_path).unwrap_or_default();`
	`113`	`+ b.iter(\|\| test::black_box(UnicodeWidthStr::width(string.as_str())));`
	`114`	`+}`

`‎scripts/unicode.py‎`

Lines changed: 141 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@`
`17`	`17`	`# - HangulSyllableType.txt`
`18`	`18`	`# - PropList.txt`
`19`	`19`	`# - ReadMe.txt`
	`20`	`+# - emoji/emoji-variation-sequences.txt`
`20`	`21`	`#`
`21`	`22`	`# Since this should not require frequent updates, we just store this`
`22`	`23`	`# out-of-line and check the generated module into git.`
`@@ -26,6 +27,8 @@`
`26`	`27`	`importos`
`27`	`28`	`importre`
`28`	`29`	`importsys`
	`30`	`+fromcollectionsimportdefaultdict`
	`31`	`+fromitertoolsimportbatched`
`29`	`32`
`30`	`33`	`NUM_CODEPOINTS=0x110000`
`31`	`34`	"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
`@@ -69,12 +72,13 @@ def fetch_open(filename: str):`
`69`	`72`	"""Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
`70`	`73`	fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
`71`	`74`	`"""`
`72`		`-ifnotos.path.exists(os.path.basename(filename)):`
	`75`	`+basename=os.path.basename(filename)`
	`76`	`+ifnotos.path.exists(basename):`
`73`	`77`	`os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")`
`74`	`78`	`try:`
`75`		`-returnopen(filename,encoding="utf-8")`
	`79`	`+returnopen(basename,encoding="utf-8")`
`76`	`80`	`exceptOSError:`
`77`		`-sys.stderr.write(f"cannot load{filename}")`
	`81`	`+sys.stderr.write(f"cannot load{basename}")`
`78`	`82`	`sys.exit(1)`
`79`	`83`
`80`	`84`
`@@ -384,8 +388,71 @@ def make_tables(`
`384`	`388`	`returntables`
`385`	`389`
`386`	`390`
	`391`	`+defload_variation_sequences()->"list[int]":`
	`392`	`+"""Outputs a list of character ranges, corresponding to all the valid characters for starting`
	`393`	`+ an emoji presentation sequence."""`
	`394`	`+`
	`395`	`+withfetch_open("emoji/emoji-variation-sequences.txt")assequences:`
	`396`	`+# Match all emoji presentation sequences`
	`397`	`+# (one codepoint followed by U+FE0F, and labeled "emoji style")`
	`398`	`+sequence=re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")`
	`399`	`+codepoints= []`
	`400`	`+forlineinsequences.readlines():`
	`401`	`+ifmatch:=sequence.match(line):`
	`402`	`+cp=int(match.group(1),16)`
	`403`	`+codepoints.append(cp)`
	`404`	`+returncodepoints`
	`405`	`+`
	`406`	`+`
	`407`	`+defmake_variation_sequence_table(`
	`408`	`+seqs:"list[int]",`
	`409`	`+width_map:"list[EffectiveWidth]",`
	`410`	`+)->"tuple[list[int], list[list[int]]]":`
	`411`	`+"""Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence.`
	`412`	`+ (Characters that are always wide may be excluded.)`
	`413`	`+ The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.`
	`414`	`+ """`
	`415`	`+`
	`416`	`+prefixes_dict=defaultdict(set)`
	`417`	`+forcpinseqs:`
	`418`	`+prefixes_dict[cp>>10].add(cp&0x3FF)`
	`419`	`+`
	`420`	`+# We don't strictly need to keep track of characters that are always wide,`
	`421`	`+# because being in an emoji variation seq won't affect their width.`
	`422`	`+# So store their info only when it wouldn't inflate the size of the tables.`
	`423`	`+forkinlist(prefixes_dict.keys()):`
	`424`	`+ifall(`
	`425`	`+map(`
	`426`	`+lambdacp:width_map[(k<<10)\|cp]==EffectiveWidth.WIDE,`
	`427`	`+prefixes_dict[k],`
	`428`	`+ )`
	`429`	`+ ):`
	`430`	`+delprefixes_dict[k]`
	`431`	`+`
	`432`	`+indexes=list(prefixes_dict.keys())`
	`433`	`+`
	`434`	+# Similarly, we can spuriously return `true` for always-wide characters
	`435`	`+# even if not part of a presentation seq; this saves an additional lookup,`
	`436`	`+# so we should do it where there is no size cost.`
	`437`	`+forcp,widthinenumerate(width_map):`
	`438`	`+ifwidth==EffectiveWidth.WIDEand (cp>>10)inindexes:`
	`439`	`+prefixes_dict[cp>>10].add(cp&0x3FF)`
	`440`	`+`
	`441`	`+leaves= []`
	`442`	`+forcpsinprefixes_dict.values():`
	`443`	`+leaf= [0]*128`
	`444`	`+forcpincps:`
	`445`	`+idx_in_leaf,bit_shift=divmod(cp,8)`
	`446`	`+leaf[idx_in_leaf]\|=1<<bit_shift`
	`447`	`+leaves.append(leaf)`
	`448`	`+return (indexes,leaves)`
	`449`	`+`
	`450`	`+`
`387`	`451`	`defemit_module(`
`388`		`-out_name:str,unicode_version:"tuple[int, int, int]",tables:"list[Table]"`
	`452`	`+out_name:str,`
	`453`	`+unicode_version:"tuple[int, int, int]",`
	`454`	`+tables:"list[Table]",`
	`455`	`+variation_table:"tuple[list[int], list[list[int]]]",`
`389`	`456`	`):`
`390`	`457`	"""Outputs a Rust module to `out_name` using table data from `tables`.
`391`	`458`	If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
`@@ -462,6 +529,40 @@ def emit_module(`
`462`	`529`	`"""`
`463`	`530`	`)`
`464`	`531`
	`532`	`+variation_idx,variation_leaves=variation_table`
	`533`	`+`
	`534`	`+module.write(`
	`535`	`+"""`
	`536`	`+ /// Whether this character forms an [emoji presentation sequence]`
	`537`	`+ /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)`
	`538`	+ /// when followed by `'\\u{FE0F}'`.
	`539`	`+ /// Emoji presentation sequences are considered to have width 2.`
	`540`	+ /// This may spuriously return `true` or `false` for characters that are always wide.
	`541`	`+ #[inline]`
	`542`	`+ pub fn starts_emoji_presentation_seq(c: char) -> bool {`
	`543`	`+ let cp: u32 = c.into();`
	`544`	`+ // First level of lookup uses all but 10 LSB`
	`545`	`+ let top_bits = cp >> 10;`
	`546`	`+ let idx_of_leaf: usize = match top_bits {`
	`547`	`+"""`
	`548`	`+ )`
	`549`	`+`
	`550`	`+fori,msbsinenumerate(variation_idx):`
	`551`	`+module.write(f"{msbs} =>{i},\n")`
	`552`	`+`
	`553`	`+module.write(`
	`554`	`+""" _ => return false,`
	`555`	`+ };`
	`556`	+ // Extract the 3-9th (0-indexed) least significant bits of `cp`,
	`557`	+ // and use them to index into `leaf_row`.
	`558`	`+ let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();`
	`559`	`+ let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];`
	`560`	+ // Use the 3 LSB of `cp` to index into `leaf_byte`.
	`561`	`+ ((leaf_byte >> (cp & 7)) & 1) == 1`
	`562`	`+ }`
	`563`	`+"""`
	`564`	`+ )`
	`565`	`+`
`465`	`566`	`module.write(`
`466`	`567`	`"""`
`467`	`568`	/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
`@@ -510,6 +611,29 @@ def emit_module(`
`510`	`611`	`module.write(f" 0x{byte:02X},")`
`511`	`612`	`module.write("\n ];\n")`
`512`	`613`	`subtable_count=new_subtable_count`
	`614`	`+`
	`615`	`+# emoji table`
	`616`	`+`
	`617`	`+module.write(`
	`618`	`+f"""`
	`619`	`+ #[repr(align(128))]`
	`620`	`+ struct Align128<T>(T);`
	`621`	+ /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
	`622`	`+ /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.`
	`623`	`+ static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{len(variation_leaves)}]> = Align128([`
	`624`	`+"""`
	`625`	`+ )`
	`626`	`+forleafinvariation_leaves:`
	`627`	`+module.write(" [\n")`
	`628`	`+forrowinbatched(leaf,14):`
	`629`	`+module.write(" ")`
	`630`	`+forentryinrow:`
	`631`	`+module.write(f" 0x{entry:02X},")`
	`632`	`+module.write("\n")`
	`633`	`+module.write(" ],\n")`
	`634`	`+`
	`635`	`+module.write(" ]);\n")`
	`636`	`+`
`513`	`637`	`module.write("}\n")`
`514`	`638`
`515`	`639`
`@@ -520,6 +644,7 @@ def main(module_filename: str):`
`520`	`644`
`521`	`645`	`We obey the following rules, in decreasing order of importance:`
`522`	`646`
	`647`	`+ - Emoji presentation sequences are double-width.`
`523`	`648`	- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
`524`	`649`	`- Hangul jamo medial vowels & final consonants are zero-width.`
`525`	`650`	- `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
`@@ -549,16 +674,25 @@ def main(module_filename: str):`
`549`	`674`
`550`	`675`	`tables=make_tables(TABLE_CFGS,enumerate(width_map))`
`551`	`676`
	`677`	`+emoji_variations=load_variation_sequences()`
	`678`	`+variation_table=make_variation_sequence_table(emoji_variations,width_map)`
	`679`	`+`
`552`	`680`	`print("------------------------")`
`553`	`681`	`total_size=0`
`554`	`682`	`fori,tableinenumerate(tables):`
`555`	`683`	`size_bytes=len(table.to_bytes())`
`556`		`-print(f"Table{i}Size:{size_bytes} bytes")`
	`684`	`+print(f"Table{i}size:{size_bytes} bytes")`
`557`	`685`	`total_size+=size_bytes`
	`686`	`+emoji_index_size=len(variation_table[0])*4`
	`687`	`+print(f"Emoji presentation index size:{emoji_index_size} bytes")`
	`688`	`+total_size+=emoji_index_size`
	`689`	`+emoji_leaves_size=len(variation_table[1])*len(variation_table[1][0])`
	`690`	`+print(f"Emoji presentation leaves size:{emoji_leaves_size} bytes")`
	`691`	`+total_size+=emoji_leaves_size`
`558`	`692`	`print("------------------------")`
`559`		`-print(f" TotalSize:{total_size} bytes")`
	`693`	`+print(f" Totalsize:{total_size} bytes")`
`560`	`694`
`561`		`-emit_module(module_filename,version,tables)`
	`695`	`+emit_module(module_filename,version,tables,variation_table)`
`562`	`696`	`print(f'Wrote to "{module_filename}"')`
`563`	`697`
`564`	`698`

`‎src/lib.rs‎`

Lines changed: 34 additions & 11 deletions

Original file line number	Diff line number	Diff line change
`@@ -33,9 +33,11 @@`
`33`	`33`	`//! This crate currently uses the following rules to determine the width of a`
`34`	`34`	`//! character or string, in order of decreasing precedence. These may be tweaked in the future.`
`35`	`35`	`//!`
`36`		-//! 1. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
`37`		-//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
`38`		`-//! 3. The following have width 0:`
	`36`	`+//! 1. [Emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)`
	`37`	`+//! have width 2. (The width of a string may therefore differ from the sum of the widths of its characters.)`
	`38`	+//! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
	`39`	+//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
	`40`	`+//! 4. The following have width 0:`
`39`	`41`	`//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)`
`40`	`42`	//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
`41`	`43`	`//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)`
`@@ -53,15 +55,15 @@`
`53`	`55`	//! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593)
`54`	`56`	//! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
`55`	`57`	//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
`56`		`-//!4. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)`
	`58`	`+//!5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)`
`57`	`59`	`//! have no defined width, and are ignored when determining the width of a string.`
`58`		`-//!5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)`
	`60`	`+//!6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)`
`59`	`61`	//! with an [`East_Asian_Width`] of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2)
`60`	`62`	//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2.
`61`		`-//!6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)`
	`63`	`+//!7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)`
`62`	`64`	//! with an [`East_Asian_Width`] of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6)
`63`	`65`	`//! have width 2 in an East Asian context, and width 1 otherwise.`
`64`		`-//!7. All other characters have width 1.`
	`66`	`+//!8. All other characters have width 1.`
`65`	`67`	`//!`
`66`	`68`	//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
`67`	`69`	//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
`@@ -122,7 +124,9 @@ impl UnicodeWidthChar for char {`
`122`	`124`	`pubtraitUnicodeWidthStr{`
`123`	`125`	`/// Returns the string's displayed width in columns.`
`124`	`126`	`///`
`125`		`-/// Control characters are treated as having zero width.`
	`127`	`+/// Control characters are treated as having zero width,`
	`128`	`+/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)`
	`129`	`+/// are assigned width 2.`
`126`	`130`	`///`
`127`	`131`	`/// This function treats characters in the Ambiguous category according`
`128`	`132`	`/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)`
`@@ -132,7 +136,9 @@ pub trait UnicodeWidthStr {`
`132`	`136`
`133`	`137`	`/// Returns the string's displayed width in columns.`
`134`	`138`	`///`
`135`		`-/// Control characters are treated as having zero width.`
	`139`	`+/// Control characters are treated as having zero width,`
	`140`	`+/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)`
	`141`	`+/// are assigned width 2.`
`136`	`142`	`///`
`137`	`143`	`/// This function treats characters in the Ambiguous category according`
`138`	`144`	`/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)`
`@@ -144,11 +150,28 @@ pub trait UnicodeWidthStr {`
`144`	`150`	`implUnicodeWidthStrforstr{`
`145`	`151`	`#[inline]`
`146`	`152`	`fnwidth(&self) ->usize{`
`147`		`-self.chars().map(\|c\| cw::width(c,false).unwrap_or(0)).sum()`
	`153`	`+str_width(self,false)`
`148`	`154`	`}`
`149`	`155`
`150`	`156`	`#[inline]`
`151`	`157`	`fnwidth_cjk(&self) ->usize{`
`152`		`-self.chars().map(\|c\| cw::width(c,true).unwrap_or(0)).sum()`
	`158`	`+str_width(self,true)`
`153`	`159`	`}`
`154`	`160`	`}`
	`161`	`+`
	`162`	`+fnstr_width(s:&str,is_cjk:bool) ->usize{`
	`163`	`+ s.chars()`
	`164`	`+.rfold((0,false), \|(sum, was_fe0f), c\|{`
	`165`	`+if c =='\u{FE0F}'{`
	`166`	`+(sum,true)`
	`167`	`+}else{`
	`168`	`+let add =if was_fe0f && cw::starts_emoji_presentation_seq(c){`
	`169`	`+2`
	`170`	`+}else{`
	`171`	`+ cw::width(c, is_cjk).unwrap_or(0)`
	`172`	`+};`
	`173`	`+(sum + add,false)`
	`174`	`+}`
	`175`	`+})`
	`176`	`+.0`
	`177`	`+}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit6b503fa

File tree

6 files changed

6 files changed

`‎.github/workflows/rust.yml‎`

`‎benches/benches.rs‎`

`‎scripts/unicode.py‎`

`‎src/lib.rs‎`

0 commit comments