NotificationsYou must be signed in to change notification settings
Fork32
Star279

Commitfda272b

authored

Merge pull request #34 from Jules-Bertholet/default-ignorable-code-point

Fixes to characters considered zero-width

2 parents 8942487 + aae585f, commit fda272b (Copy full SHA for fda272b)

File tree

5 files changed

+368

-230

lines changed

5 files changed

+368

-230

lines changed

`‎README.md‎`

Lines changed: 6 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,8 @@ fn main() {`
`26`	`26`
`27`	`27`	`NOTE: The computed width values may not match the actual rendered column`
`28`	`28`	`width. For example, the woman scientist emoji comprises a woman emoji, a`
`29`		`-zero-width joiner and a microscope emoji.`
	`29`	`+zero-width joiner and a microscope emoji. Such [emoji ZWJ sequences](https://www.unicode.org/reports/tr51/#Emoji_ZWJ_Sequences)`
	`30`	`+are considered to have the sum of the widths of their constituent parts:`
`30`	`31`
`31`	`32`	```rust
`32`	`33`	`extern crate unicode_width;`
`@@ -39,8 +40,10 @@ fn main() {`
`39`	`40`	`}`
`40`	`41`	```
`41`	`42`
`42`		`-See [Unicode Standard Annex #11][UAX11] for precise details on what is and isn't`
`43`		`-covered by this crate.`
	`43`	`+Additionally, [defective combining character sequences](https://unicode.org/glossary/#defective_combining_character_sequence)`
	`44`	`+and nonstandard [Korean jamo](https://unicode.org/glossary/#jamo) sequences may`
	`45`	`+be rendered with a different width than what this crate says. (This is not an`
	`46`	`+exhaustive list.)`
`44`	`47`
`45`	`48`	`## features`
`46`	`49`

`‎scripts/unicode.py‎`

Lines changed: 91 additions & 27 deletions

Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,8 @@ class OffsetType(enum.IntEnum):`
`64`	`64`
`65`	`65`	`deffetch_open(filename:str):`
`66`	`66`	"""Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
`67`		- fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
	`67`	+ fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
	`68`	`+ """`
`68`	`69`	`ifnotos.path.exists(os.path.basename(filename)):`
`69`	`70`	`os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")`
`70`	`71`	`try:`
`@@ -83,7 +84,8 @@ def load_unicode_version() -> "tuple[int, int, int]":`
`83`	`84`
`84`	`85`	`classEffectiveWidth(enum.IntEnum):`
`85`	`86`	`"""Represents the width of a Unicode character. All East Asian Width classes resolve into`
`86`		- either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
	`87`	+ either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
	`88`	`+ """`
`87`	`89`
`88`	`90`	`ZERO=0`
`89`	`91`	`""" Zero columns wide. """`
`@@ -146,10 +148,17 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":`
`146`	`148`
`147`	`149`	`defload_zero_widths()->"list[bool]":`
`148`	`150`	"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
`149`		- character. `c` is considered a zero-width character if `c` is in general categories
`150`		- `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
	`151`	+ character. `c` is considered a zero-width character if
	`152`	`+`
	`153`	+ - it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
	`154`	+ - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
	`155`	+ - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
	`156`	`+ """`
	`157`	`+`
	`158`	`+zw_map= []`
	`159`	`+`
	`160`	+# Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
`151`	`161`	`withfetch_open("UnicodeData.txt")ascategories:`
`152`		`-zw_map= []`
`153`	`162`	`current=0`
`154`	`163`	`forlineincategories.readlines():`
`155`	`164`	`iflen(raw_data:=line.split(";"))!=15:`
`@@ -159,7 +168,7 @@ def load_zero_widths() -> "list[bool]":`
`159`	`168`	`raw_data[1],`
`160`	`169`	`raw_data[2],`
`161`	`170`	`]`
`162`		`-zero_width=cat_codein ["Cc","Cf","Mn","Me"]`
	`171`	`+zero_width=cat_codein ["Cc","Mn","Me"]`
`163`	`172`
`164`	`173`	`assertcurrent<=codepoint`
`165`	`174`	`whilecurrent<=codepoint:`
`@@ -176,12 +185,68 @@ def load_zero_widths() -> "list[bool]":`
`176`	`185`	`# Catch any leftover codepoints. They must be unassigned (so nonzero width)`
`177`	`186`	`zw_map.append(False)`
`178`	`187`
`179`		`-returnzw_map`
	`188`	+# `Default_Ignorable_Code_Point`s also have 0 width:
	`189`	`+# https://www.unicode.org/faq/unsup_char.html#3`
	`190`	`+# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095`
	`191`	`+withfetch_open("DerivedCoreProperties.txt")asproperties:`
	`192`	`+single=re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")`
	`193`	`+multiple=re.compile(`
	`194`	`+r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"`
	`195`	`+ )`
	`196`	`+`
	`197`	`+forlineinproperties.readlines():`
	`198`	`+raw_data=None# (low, high)`
	`199`	`+ifmatch:=single.match(line):`
	`200`	`+raw_data= (match.group(1),match.group(1))`
	`201`	`+elifmatch:=multiple.match(line):`
	`202`	`+raw_data= (match.group(1),match.group(2))`
	`203`	`+else:`
	`204`	`+continue`
	`205`	`+low=int(raw_data[0],16)`
	`206`	`+high=int(raw_data[1],16)`
	`207`	`+forcpinrange(low,high+1):`
	`208`	`+zw_map[cp]=True`
	`209`	`+`
	`210`	+# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
	`211`	+# as zero-width. This matches the behavior of glibc `wcwidth`.
	`212`	`+#`
	`213`	+# Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
	`214`	+# a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
	`215`	`+# into a single wide grapheme. So we treat vowel and trailing jamo as`
	`216`	`+# 0-width, such that only the width of the leading jamo is counted`
	`217`	`+# and the resulting grapheme has width 2.`
	`218`	`+#`
	`219`	`+# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)`
	`220`	`+withfetch_open("HangulSyllableType.txt")ascategories:`
	`221`	`+single=re.compile(r"^([0-9A-F]+)\s+;\s+(V\|T)\s+")`
	`222`	`+multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V\|T)\s+")`
	`223`	`+`
	`224`	`+forlineincategories.readlines():`
	`225`	`+raw_data=None# (low, high)`
	`226`	`+ifmatch:=single.match(line):`
	`227`	`+raw_data= (match.group(1),match.group(1))`
	`228`	`+elifmatch:=multiple.match(line):`
	`229`	`+raw_data= (match.group(1),match.group(2))`
	`230`	`+else:`
	`231`	`+continue`
	`232`	`+low=int(raw_data[0],16)`
	`233`	`+high=int(raw_data[1],16)`
	`234`	`+forcpinrange(low,high+1):`
	`235`	`+zw_map[cp]=True`
	`236`	`+`
	`237`	`+# Special case: U+115F HANGUL CHOSEONG FILLER.`
	`238`	+# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
	`239`	`+# zero width. However, the expected usage is to combine it with vowel or trailing jamo`
	`240`	`+# (which are considered 0-width on their own) to form a composed Hangul syllable with`
	`241`	`+# width 2. Therefore, we treat it as having width 2.`
	`242`	`+zw_map[0x115F]=False`
	`243`	`+returnzw_map`
`180`	`244`
`181`	`245`
`182`	`246`	`classBucket:`
`183`	`247`	`"""A bucket contains a group of codepoints and an ordered width list. If one bucket's width`
`184`		- list overlaps with another's width list, those buckets can be merged via `try_extend`."""
	`248`	+ list overlaps with another's width list, those buckets can be merged via `try_extend`.
	`249`	`+ """`
`185`	`250`
`186`	`251`	`def__init__(self):`
`187`	`252`	`"""Creates an empty bucket."""`
`@@ -230,9 +295,9 @@ def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":`
`230`	`295`	`same bucket. Returns a list of the buckets in increasing order of those bits."""`
`231`	`296`	`num_bits=cap_bit-low_bit`
`232`	`297`	`assertnum_bits>0`
`233`		`-buckets= [Bucket()for_inrange(0,2**num_bits)]`
	`298`	`+buckets= [Bucket()for_inrange(0,2**num_bits)]`
`234`	`299`	`mask= (1<<num_bits)-1`
`235`		`-for(codepoint,width)inentries:`
	`300`	`+forcodepoint,widthinentries:`
`236`	`301`	`buckets[(codepoint>>low_bit)&mask].append(codepoint,width)`
`237`	`302`	`returnbuckets`
`238`	`303`
`@@ -269,7 +334,7 @@ def __init__(`
`269`	`334`	`buckets.extend(make_buckets(entries,self.low_bit,self.cap_bit))`
`270`	`335`
`271`	`336`	`forbucketinbuckets:`
`272`		`-for(i,existing)inenumerate(self.indexed):`
	`337`	`+fori,existinginenumerate(self.indexed):`
`273`	`338`	`ifexisting.try_extend(bucket):`
`274`	`339`	`self.entries.append(i)`
`275`	`340`	`break`
`@@ -283,7 +348,8 @@ def __init__(`
`283`	`348`
`284`	`349`	`defindices_to_widths(self):`
`285`	`350`	"""Destructively converts the indices in this table to the `EffectiveWidth` values of
`286`		`- their buckets. Assumes that no bucket contains codepoints with different widths."""`
	`351`	`+ their buckets. Assumes that no bucket contains codepoints with different widths.`
	`352`	`+ """`
`287`	`353`	`self.entries=list(map(lambdai:int(self.indexed[i].width()),self.entries))`
`288`	`354`	`delself.indexed`
`289`	`355`
`@@ -315,7 +381,7 @@ def make_tables(`
`315`	`381`	`to include in the top-level table."""`
`316`	`382`	`tables= []`
`317`	`383`	`entry_groups= [entries]`
`318`		`-for(low_bit,cap_bit,offset_type)intable_cfgs:`
	`384`	`+forlow_bit,cap_bit,offset_typeintable_cfgs:`
`319`	`385`	`table=Table(entry_groups,low_bit,cap_bit,offset_type)`
`320`	`386`	`entry_groups=map(lambdabucket:bucket.entries(),table.buckets())`
`321`	`387`	`tables.append(table)`
`@@ -326,7 +392,8 @@ def emit_module(`
`326`	`392`	`out_name:str,unicode_version:"tuple[int, int, int]",tables:"list[Table]"`
`327`	`393`	`):`
`328`	`394`	"""Outputs a Rust module to `out_name` using table data from `tables`.
`329`		- If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
	`395`	+ If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
	`396`	`+ """`
`330`	`397`	`ifos.path.exists(out_name):`
`331`	`398`	`os.remove(out_name)`
`332`	`399`	`withopen(out_name,"w",newline="\n",encoding="utf-8")asmodule:`
`@@ -432,7 +499,7 @@ def emit_module(`
`432`	`499`	`)`
`433`	`500`
`434`	`501`	`subtable_count=1`
`435`		`-for(i,table)inenumerate(tables):`
	`502`	`+fori,tableinenumerate(tables):`
`436`	`503`	`new_subtable_count=len(table.buckets())`
`437`	`504`	`ifi==len(tables)-1:`
`438`	`505`	`table.indices_to_widths()# for the last table, indices == widths`
`@@ -442,7 +509,7 @@ def emit_module(`
`442`	`509`	/// Autogenerated.{subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
`443`	`510`	`static TABLES_{i}: [u8;{len(byte_array)}] = ["""`
`444`	`511`	`)`
`445`		`-for(j,byte)inenumerate(byte_array):`
	`512`	`+forj,byteinenumerate(byte_array):`
`446`	`513`	`# Add line breaks for every 15th entry (chosen to match what rustfmt does)`
`447`	`514`	`ifj%15==0:`
`448`	`515`	`module.write("\n ")`
`@@ -458,16 +525,17 @@ def main(module_filename: str):`
`458`	`525`	`module_filename`.
`459`	`526`
`460`	`527`	`We obey the following rules in decreasing order of importance:`
`461`		- - The soft hyphen (`U+00AD`) is single-width.
`462`		- - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
`463`		- - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
	`528`	+ - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
	`529`	`+ - Hangul jamo medial vowels & final consonants are zero-width.`
	`530`	+ - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
	`531`	+ - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
`464`	`532`	- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
`465`	`533`	- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
`466`	`534`	`- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width`
`467`		- of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
	`535`	+of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
`468`	`536`
`469`		- These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
`470`		`-http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""`
	`537`	+ These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
	`538`	`+ """`
`471`	`539`	`version=load_unicode_version()`
`472`	`540`	`print(f"Generating module for Unicode{version[0]}.{version[1]}.{version[2]}")`
`473`	`541`
`@@ -482,15 +550,11 @@ def main(module_filename: str):`
`482`	`550`	`# Override for soft hyphen`
`483`	`551`	`width_map[0x00AD]=EffectiveWidth.NARROW`
`484`	`552`
`485`		`-# Override for Hangul Jamo medial vowels & final consonants`
`486`		`-foriinrange(0x1160,0x11FF+1):`
`487`		`-width_map[i]=EffectiveWidth.ZERO`
`488`		`-`
`489`	`553`	`tables=make_tables(TABLE_CFGS,enumerate(width_map))`
`490`	`554`
`491`	`555`	`print("------------------------")`
`492`	`556`	`total_size=0`
`493`		`-for(i,table)inenumerate(tables):`
	`557`	`+fori,tableinenumerate(tables):`
`494`	`558`	`size_bytes=len(table.to_bytes())`
`495`	`559`	`print(f"Table{i} Size:{size_bytes} bytes")`
`496`	`560`	`total_size+=size_bytes`

`‎src/lib.rs‎`

Lines changed: 12 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -43,9 +43,10 @@`
`43`	`43`	//! ```
`44`	`44`
`45`	`45`	`#![deny(missing_docs, unsafe_code)]`
`46`		`-#![doc(html_logo_url ="https://unicode-rs.github.io/unicode-rs_sm.png",`
`47`		`- html_favicon_url ="https://unicode-rs.github.io/unicode-rs_sm.png")]`
`48`		`-`
	`46`	`+#![doc(`
	`47`	`+ html_logo_url ="https://unicode-rs.github.io/unicode-rs_sm.png",`
	`48`	`+ html_favicon_url ="https://unicode-rs.github.io/unicode-rs_sm.png"`
	`49`	`+)]`
`49`	`50`	`#![cfg_attr(feature ="bench", feature(test))]`
`50`	`51`	`#![no_std]`
`51`	`52`
`@@ -87,10 +88,14 @@ pub trait UnicodeWidthChar {`
`87`	`88`
`88`	`89`	`implUnicodeWidthCharforchar{`
`89`	`90`	`#[inline]`
`90`		`-fnwidth(self) ->Option<usize>{ cw::width(self,false)}`
	`91`	`+fnwidth(self) ->Option<usize>{`
	`92`	`+ cw::width(self,false)`
	`93`	`+}`
`91`	`94`
`92`	`95`	`#[inline]`
`93`		`-fnwidth_cjk(self) ->Option<usize>{ cw::width(self,true)}`
	`96`	`+fnwidth_cjk(self) ->Option<usize>{`
	`97`	`+ cw::width(self,true)`
	`98`	`+}`
`94`	`99`	`}`
`95`	`100`
`96`	`101`	`/// Methods for determining displayed width of Unicode strings.`
`@@ -103,7 +108,7 @@ pub trait UnicodeWidthStr {`
`103`	`108`	`/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)`
`104`	`109`	`/// as 1 column wide. This is consistent with the recommendations for`
`105`	`110`	`/// non-CJK contexts, or when the context cannot be reliably determined.`
`106`		`-fnwidth<'a>(&'aself) ->usize;`
	`111`	`+fnwidth(&self) ->usize;`
`107`	`112`
`108`	`113`	`/// Returns the string's displayed width in columns.`
`109`	`114`	`///`
`@@ -113,7 +118,7 @@ pub trait UnicodeWidthStr {`
`113`	`118`	`/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)`
`114`	`119`	`/// as 2 column wide. This is consistent with the recommendations for`
`115`	`120`	`/// CJK contexts.`
`116`		`-fnwidth_cjk<'a>(&'aself) ->usize;`
	`121`	`+fnwidth_cjk(&self) ->usize;`
`117`	`122`	`}`
`118`	`123`
`119`	`124`	`implUnicodeWidthStrforstr{`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commitfda272b

File tree

5 files changed

5 files changed

`‎README.md‎`

`‎scripts/unicode.py‎`

`‎src/lib.rs‎`

0 commit comments