Notifications: You must be signed in to change notification settings
Fork32
Star279

Commit9c4477c

authored

Merge pull request #37 from Jules-Bertholet/canonical-equivalence

Ensure that canonically equivalent strings have the same width

2 parents: 7c489c3 + fdf5eb7; commit 9c4477c. Copy full SHA for 9c4477c

File tree

4 files changed

+382

-323

lines changed

4 files changed

+382

-323

lines changed

`‎Cargo.toml‎`

Lines changed: 3 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,9 @@ std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }`
`22`	`22`	`core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }`
`23`	`23`	`compiler_builtins = { version = "0.1", optional = true }`
`24`	`24`
	`25`	`+[dev-dependencies]`
	`26`	`+unicode-normalization = "0.1.23"`
	`27`	`+`
`25`	`28`	`[features]`
`26`	`29`	`default = []`
`27`	`30`	`bench = []`

`‎scripts/unicode.py‎`

Lines changed: 25 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -150,14 +150,15 @@ def load_zero_widths() -> "list[bool]":`
`150`	`150`	"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
`151`	`151`	character. `c` is considered a zero-width character if
`152`	`152`
`153`		- - it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
	`153`	+ - it is in general category `Cc`,
	`154`	+ - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
`154`	`155`	- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
`155`	`156`	- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
`156`	`157`	`"""`
`157`	`158`
`158`	`159`	`zw_map = []`
`159`	`160`
`160`		-# Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
	`161`	+# Characters with general category `Cc` have 0 width
`161`	`162`	`with fetch_open("UnicodeData.txt") as categories:`
`162`	`163`	`current = 0`
`163`	`164`	`for line in categories.readlines():`
`@@ -168,7 +169,7 @@ def load_zero_widths() -> "list[bool]":`
`168`	`169`	`raw_data[1],`
`169`	`170`	`raw_data[2],`
`170`	`171`	`]`
`171`		`-zero_width = cat_code in ["Cc", "Mn", "Me"]`
	`172`	`+zero_width = cat_code == "Cc"`
`172`	`173`
`173`	`174`	`assert current <= codepoint`
`174`	`175`	`while current <= codepoint:`
`@@ -188,10 +189,16 @@ def load_zero_widths() -> "list[bool]":`
`188`	`189`	# `Default_Ignorable_Code_Point`s also have 0 width:
`189`	`190`	`# https://www.unicode.org/faq/unsup_char.html#3`
`190`	`191`	`# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095`
	`192`	`+#`
	`193`	+# `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
	`194`	+# as well as a few `Mc` characters that need to be included so that
	`195`	`+# canonically equivalent sequences have the same width.`
`191`	`196`	`with fetch_open("DerivedCoreProperties.txt") as properties:`
`192`		`-single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")`
	`197`	`+single = re.compile(`
	`198`	`+r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"`
	`199`	`+ )`
`193`	`200`	`multiple = re.compile(`
`194`		`-r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"`
	`201`	`+r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"`
`195`	`202`	`)`
`196`	`203`
`197`	`204`	`for line in properties.readlines():`
`@@ -240,6 +247,19 @@ def load_zero_widths() -> "list[bool]":`
`240`	`247`	`# (which are considered 0-width on their own) to form a composed Hangul syllable with`
`241`	`248`	`# width 2. Therefore, we treat it as having width 2.`
`242`	`249`	`zw_map[0x115F] = False`
	`250`	`+`
	`251`	+# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
	`252`	`+# as they canonically decompose to two characters with this property,`
	`253`	`+# but they aren't.`
	`254`	`+zw_map[0x0CC0] = True`
	`255`	`+zw_map[0x0CC7] = True`
	`256`	`+zw_map[0x0CC8] = True`
	`257`	`+zw_map[0x0CCA] = True`
	`258`	`+zw_map[0x0CCB] = True`
	`259`	`+zw_map[0x1B3B] = True`
	`260`	`+zw_map[0x1B3D] = True`
	`261`	`+zw_map[0x1B43] = True`
	`262`	`+`
`243`	`263`	`return zw_map`
`244`	`264`
`245`	`265`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit9c4477c

File tree

4 files changed

4 files changed

`‎Cargo.toml‎`

`‎scripts/unicode.py‎`

0 commit comments