Commit eb9d304

authored

Merge pull request #31 from ohhithere/fix-internal-skeleton

Fix internalSkeleton

2 parents 22d684a + 78707a7, commit eb9d304 (Copy full SHA for eb9d304)

File tree

4 files changed

+744

-902

lines changed

scripts
- unicode.py
src

4 files changed

+744

-902

lines changed

`‎scripts/unicode.py‎`

Lines changed: 35 additions & 46 deletions

Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@`
`17`	`17`	`# - confusables.txt`
`18`	`18`	`# - ReadMe.txt`
`19`	`19`	`# This script also uses the following Unicode UCD data:`
	`20`	`+# - DerivedCoreProperties.txt`
`20`	`21`	`# - Scripts.txt`
`21`	`22`	`#`
`22`	`23`	`# Since this should not require frequent updates, we just store this`
`@@ -53,6 +54,8 @@ def fetch(f):`
`53`	`54`	`sys.stderr.write("cannot load %s\n"%f)`
`54`	`55`	`exit(1)`
`55`	`56`
	`57`	`+returnf`
	`58`	`+`
`56`	`59`	`# Download a UCD table file`
`57`	`60`	`deffetch_unidata(f):`
`58`	`61`	`ifnotos.path.exists(os.path.basename(f)):`
`@@ -63,14 +66,14 @@ def fetch_unidata(f):`
`63`	`66`	`sys.stderr.write("cannot load %s"%f)`
`64`	`67`	`exit(1)`
`65`	`68`
`66`		`-# Loads code point data from IdentifierStatus.txt and`
`67`		`-# IdentifierType.txt`
`68`		`-# Implementation from unicode-segmentation`
	`69`	`+returnf`
	`70`	`+`
	`71`	`+# Loads code point data from provided filename f`
	`72`	`+# Implementation adapted from unicode-segmentation`
`69`	`73`	`defload_properties(f,interestingprops=None):`
`70`		`-fetch(f)`
`71`	`74`	`props= {}`
`72`		`-re1=re.compile(r"^ ([0-9A-F]+) ; *(\w+)")`
`73`		`-re2=re.compile(r"^ ([0-9A-F]+)\.\.([0-9A-F]+) ; *(\w+)")`
	`75`	`+re1=re.compile(r"^ ([0-9A-F]+) ; ([^#\s]+) #")`
	`76`	`+re2=re.compile(r"^ ([0-9A-F]+)\.\.([0-9A-F]+) ; ([^#\s]+) #")`
`74`	`77`
`75`	`78`	`forlineinfileinput.input(os.path.basename(f),openhook=fileinput.hook_encoded("utf-8")):`
`76`	`79`	`prop=None`
`@@ -99,42 +102,6 @@ def load_properties(f, interestingprops = None):`
`99`	`102`
`100`	`103`	`returnprops`
`101`	`104`
`102`		`-# Loads script data from Scripts.txt`
`103`		`-defload_script_properties(f,interestingprops):`
`104`		`-fetch_unidata(f)`
`105`		`-props= {}`
`106`		`-# Note: these regexes are different from those in unicode-segmentation,`
`107`		`-# becase we need to handle spaces here`
`108`		`-re1=re.compile(r"^ ([0-9A-F]+) ; ([^#]+) #")`
`109`		`-re2=re.compile(r"^ ([0-9A-F]+)\.\.([0-9A-F]+) ; ([^#]+) #")`
`110`		`-`
`111`		`-forlineinfileinput.input(os.path.basename(f)):`
`112`		`-prop=None`
`113`		`-d_lo=0`
`114`		`-d_hi=0`
`115`		`-m=re1.match(line)`
`116`		`-ifm:`
`117`		`-d_lo=m.group(1)`
`118`		`-d_hi=m.group(1)`
`119`		`-prop=m.group(2).strip()`
`120`		`-else:`
`121`		`-m=re2.match(line)`
`122`		`-ifm:`
`123`		`-d_lo=m.group(1)`
`124`		`-d_hi=m.group(2)`
`125`		`-prop=m.group(3).strip()`
`126`		`-else:`
`127`		`-continue`
`128`		`-ifinterestingpropsandpropnotininterestingprops:`
`129`		`-continue`
`130`		`-d_lo=int(d_lo,16)`
`131`		`-d_hi=int(d_hi,16)`
`132`		`-ifpropnotinprops:`
`133`		`-props[prop]= []`
`134`		`-props[prop].append((d_lo,d_hi))`
`135`		`-`
`136`		`-returnprops`
`137`		`-`
`138`	`105`	`# Loads confusables data from confusables.txt`
`139`	`106`	`defload_confusables(f):`
`140`	`107`	`fetch(f)`
`@@ -189,7 +156,7 @@ def load_scripts(f):`
`189`	`156`	`# changes are introduced, update accordingly.`
`190`	`157`
`191`	`158`	`(longforms,shortforms)=aliases()`
`192`		`-scripts=load_script_properties(f, [])`
	`159`	`+scripts=load_properties(fetch_unidata(f), [])`
`193`	`160`
`194`	`161`	`script_table= []`
`195`	`162`	`script_list= []`
`@@ -546,10 +513,10 @@ def emit_identifier_module(f):`
`546`	`513`	`""")`
`547`	`514`
`548`	`515`	`f.write(" // Identifier status table:\n")`
`549`		`-identifier_status_table=load_properties("IdentifierStatus.txt")`
	`516`	`+identifier_status_table=load_properties(fetch("IdentifierStatus.txt"))`
`550`	`517`	`emit_table(f,"IDENTIFIER_STATUS",identifier_status_table['Allowed'],"&'static [(char, char)]",is_pub=False,`
`551`	`518`	`pfun=lambdax:"(%s,%s)"% (escape_char(x[0]),escape_char(x[1])))`
`552`		`-identifier_type=load_properties("IdentifierType.txt")`
	`519`	`+identifier_type=load_properties(fetch("IdentifierType.txt"))`
`553`	`520`	`type_table= []`
`554`	`521`	`fortyinidentifier_type:`
`555`	`522`	`type_table.extend([(x,y,ty)for (x,y)inidentifier_type[ty]])`
`@@ -560,6 +527,26 @@ def emit_identifier_module(f):`
`560`	`527`	`pfun=lambdax:"(%s,%s, IdentifierType::%s)"% (escape_char(x[0]),escape_char(x[1]),x[2]))`
`561`	`528`	`f.write("}\n\n")`
`562`	`529`
	`530`	`+defemit_default_ignorable_detection_module(f):`
	`531`	`+f.write("pub mod default_ignorable_code_point {")`
	`532`	`+f.write("""`
	`533`	`+`
	`534`	`+ #[inline]`
	`535`	`+ pub fn default_ignorable_code_point(c: char) -> bool {`
	`536`	`+ match c as usize {`
	`537`	`+ _ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE)`
	`538`	`+ }`
	`539`	`+ }`
	`540`	`+`
	`541`	`+""")`
	`542`	`+`
	`543`	`+f.write(" // Default ignorable code point table:\n")`
	`544`	`+default_ignorable_table=load_properties(fetch_unidata("DerivedCoreProperties.txt"), ["Default_Ignorable_Code_Point"])`
	`545`	`+emit_table(f,"DEFAULT_IGNORABLE",default_ignorable_table["Default_Ignorable_Code_Point"],"&'static [(char, char)]",is_pub=False,`
	`546`	`+pfun=lambdax:"(%s,%s)"% (escape_char(x[0]),escape_char(x[1])))`
	`547`	`+`
	`548`	`+f.write("}\n\n")`
	`549`	`+`
`563`	`550`	`defemit_confusable_detection_module(f):`
`564`	`551`	`f.write("pub mod confusable_detection {")`
`565`	`552`	`f.write("""`
`@@ -601,7 +588,7 @@ def emit_potiential_mixed_script_confusable(f):`
`601`	`588`	`}`
`602`	`589`	`}`
`603`	`590`	`""")`
`604`		`-identifier_status_table=load_properties("IdentifierStatus.txt")`
	`591`	`+identifier_status_table=load_properties(fetch("IdentifierStatus.txt"))`
`605`	`592`	`_,scripts=load_scripts("Scripts.txt")`
`606`	`593`	`identifier_allowed=identifier_status_table['Allowed']`
`607`	`594`	`(mixedscript_confusable,mixedscript_confusable_unresolved)=load_potential_mixedscript_confusables("confusables.txt",identifier_allowed,scripts)`
`@@ -688,6 +675,8 @@ def emit_util_mod(f):`
`688`	`675`	`emit_util_mod(rf)`
`689`	`676`	`### identifier module`
`690`	`677`	`emit_identifier_module(rf)`
	`678`	`+### default_ignorable_detection module`
	`679`	`+emit_default_ignorable_detection_module(rf)`
`691`	`680`	`### confusable_detection module`
`692`	`681`	`emit_confusable_detection_module(rf)`
`693`	`682`	`### mixed_script_confusable_detection module`

`‎src/confusable_detection.rs‎`

Lines changed: 7 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -34,6 +34,12 @@ fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {`
`34`	`34`
`35`	`35`	`/// Calculate skeleton for string, as defined by UTS 39`
`36`	`36`	`pubfnskeleton(s:&str) ->implIterator<Item =char> +'_{`
	`37`	`+usecrate::tables::default_ignorable_code_point::default_ignorable_code_point;`
`37`	`38`	`use unicode_normalization::UnicodeNormalization;`
`38`		`- s.chars().nfd().flat_map(char_prototype).nfd()`
	`39`	`+`
	`40`	`+ s.chars()`
	`41`	`+.nfd()`
	`42`	`+.filter(\|c\| !default_ignorable_code_point(*c))`
	`43`	`+.flat_map(char_prototype)`
	`44`	`+.nfd()`
`39`	`45`	`}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit eb9d304

File tree

4 files changed

4 files changed

`‎scripts/unicode.py‎`

`‎src/confusable_detection.rs‎`

0 commit comments