Commit 592ce00

authored

Merge pull request #134 from Jules-Bertholet/fix

2 parents: 3ff9de6 + dce3a34 · commit 592ce00 · Copy full SHA for 592ce00

File tree

11 files changed

+1271

-2069

lines changed

.github/workflows
- rust.yml
benches
- chars.rs
scripts
- unicode.py
- unicode_gen_breaktests.py
src
tests
- test.rs
- testdata
  - mod.rs

11 files changed

+1271

-2069

lines changed

`‎.github/workflows/rust.yml‎`

Lines changed: 9 additions & 6 deletions

Original file line number	Diff line number	Diff line change
`@@ -7,28 +7,31 @@ on:`
`7`	`7`	`branches:[ master ]`
`8`	`8`
`9`	`9`	`env:`
	`10`	`+CARGO_INCREMENTAL:0`
`10`	`11`	`CARGO_TERM_COLOR:always`
	`12`	`+RUST_BACKTRACE:1`
	`13`	`+RUSTFLAGS:-D warnings`
	`14`	`+RUSTDOCFLAGS:-D warnings`
`11`	`15`
`12`	`16`	`jobs:`
`13`	`17`	`build:`
`14`		`-`
`15`	`18`	`runs-on:ubuntu-latest`
`16`		`-`
`17`	`19`	`steps:`
`18`	`20`	`-uses:actions/checkout@v2`
`19`	`21`	`-name:Build`
`20`	`22`	`run:cargo build --verbose`
`21`	`23`	`-name:Run tests`
`22`	`24`	`run:cargo test --verbose`
`23`		`-fmt:`
	`25`	`+ -name:Run clippy`
	`26`	`+run:cargo clippy --all-targets --all --verbose`
`24`	`27`
	`28`	`+fmt:`
`25`	`29`	`runs-on:ubuntu-latest`
`26`		`-`
`27`	`30`	`steps:`
`28`	`31`	`-uses:actions/checkout@v2`
`29`	`32`	`-name:Rustfmt`
`30`		`-run:cargo fmt --check`
	`33`	`+run:cargo fmt --all --check`
`31`	`34`	`-name:Verify regenerated files`
`32`	`35`	`run:./scripts/unicode.py && diff tables.rs src/tables.rs`
`33`	`36`	`-name:Verify regenerated tests`
`34`		`-run:./scripts/unicode_gen_breaktests.py &&rustfmt testdata.rs &&diff testdata.rssrc/testdata.rs`
	`37`	`+run:./scripts/unicode_gen_breaktests.py && diff testdata.rstests/testdata/mod.rs`

`‎benches/chars.rs‎`

Lines changed: 2 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,6 @@`
`6`	`6`	`//! is how much slower full unicode handling is.`
`7`	`7`
`8`	`8`	`use criterion::{black_box, criterion_group, criterion_main,BenchmarkId,Criterion};`
`9`		`-use unicode_segmentation;`
`10`	`9`
`11`	`10`	`use std::fs;`
`12`	`11`	`use unicode_segmentation::UnicodeSegmentation;`
`@@ -24,14 +23,14 @@ const FILES: &[&str] = &[`
`24`	`23`
`25`	`24`	`#[inline(always)]`
`26`	`25`	`fngrapheme(text:&str){`
`27`		`-for cinUnicodeSegmentation::graphemes(black_box(&*text),true){`
	`26`	`+for cinUnicodeSegmentation::graphemes(black_box(text),true){`
`28`	`27`	`black_box(c);`
`29`	`28`	`}`
`30`	`29`	`}`
`31`	`30`
`32`	`31`	`#[inline(always)]`
`33`	`32`	`fnscalar(text:&str){`
`34`		`-for cinblack_box(&*text).chars(){`
	`33`	`+for cinblack_box(text).chars(){`
`35`	`34`	`black_box(c);`
`36`	`35`	`}`
`37`	`36`	`}`

`‎scripts/unicode.py‎`

Lines changed: 48 additions & 26 deletions

Original file line number	Diff line number	Diff line change
`@@ -155,11 +155,11 @@ def format_table_content(f, content, indent):`
`155`	`155`	`line=" "*indent+chunk`
`156`	`156`	`f.write(line)`
`157`	`157`
`158`		`-defload_properties(f,interestingprops):`
	`158`	`+defload_properties(f,interestingprops:"list[str \| tuple[str, str]] \| None"=None):`
`159`	`159`	`fetch(f)`
`160`	`160`	`props= {}`
`161`		`-re1=re.compile(r"^([0-9A-F]+) ;*(\w+)")`
`162`		`-re2=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+) ;*(\w+)")`
	`161`	`+re1=re.compile(r"^\s([0-9A-F]+)\s;\s(\w+)(?:\s;\s*(\w+))?")`
	`162`	`+re2=re.compile(r"^\s([0-9A-F]+)\.\.([0-9A-F]+)\s;\s(\w+)(?:\s;\s*(\w+))?")`
`163`	`163`
`164`	`164`	`forlineinfileinput.input(os.path.basename(f)):`
`165`	`165`	`prop=None`
`@@ -168,17 +168,21 @@ def load_properties(f, interestingprops):`
`168`	`168`	`m=re1.match(line)`
`169`	`169`	`ifm:`
`170`	`170`	`d_lo=m.group(1)`
`171`		`-d_hi=m.group(1)`
	`171`	`+d_hi=d_lo`
`172`	`172`	`prop=m.group(2)`
	`173`	`+value=m.group(3)`
`173`	`174`	`else:`
`174`	`175`	`m=re2.match(line)`
`175`	`176`	`ifm:`
`176`	`177`	`d_lo=m.group(1)`
`177`	`178`	`d_hi=m.group(2)`
`178`	`179`	`prop=m.group(3)`
	`180`	`+value=m.group(4)`
`179`	`181`	`else:`
`180`	`182`	`continue`
`181`		`-ifinterestingpropsandpropnotininterestingprops:`
	`183`	`+ifvalueisnotNone:`
	`184`	`+prop= (prop,value)`
	`185`	`+ifinterestingpropsisnotNoneandpropnotininterestingprops:`
`182`	`186`	`continue`
`183`	`187`	`d_lo=int(d_lo,16)`
`184`	`188`	`d_hi=int(d_hi,16)`
`@@ -195,7 +199,7 @@ def load_properties(f, interestingprops):`
`195`	`199`	`defescape_char(c):`
`196`	`200`	`return"'\\u{%x}'"%c`
`197`	`201`
`198`		`-defemit_table(f,name,t_data,t_type="&'static[(char, char)]",is_pub=True,`
	`202`	`+defemit_table(f,name,t_data,t_type="&[(char, char)]",is_pub=True,`
`199`	`203`	`pfun=lambdax:"(%s,%s)"% (escape_char(x[0]),escape_char(x[1])),is_const=True):`
`200`	`204`	`pub_string="const"`
`201`	`205`	`ifnotis_const:`
`@@ -217,7 +221,7 @@ def emit_util_mod(f):`
`217`	`221`	`f.write("""`
`218`	`222`	`pub mod util {`
`219`	`223`	`#[inline]`
`220`		`- pub fn bsearch_range_table(c: char, r: &'static[(char,char)]) -> bool {`
	`224`	`+ pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {`
`221`	`225`	`use core::cmp::Ordering::{Equal, Less, Greater};`
`222`	`226`	`r.binary_search_by(\|&(lo,hi)\| {`
`223`	`227`	`if lo <= c && c <= hi { Equal }`
`@@ -252,13 +256,22 @@ def emit_util_mod(f):`
`252`	`256`
`253`	`257`	`""")`
`254`	`258`
`255`		`-defemit_property_module(f,mod,tbl,emit):`
`256`		`-f.write("mod %s {\n"%mod)`
`257`		`-forcatinsorted(emit):`
`258`		`-emit_table(f,"%s_table"%cat,tbl[cat],is_pub=False)`
	`259`	`+defemit_property_module(f,mod,tbl,emit:"list[str \| tuple[str, str]]"):`
	`260`	`+f.write("pub mod %s {\n"%mod)`
	`261`	`+`
	`262`	`+cats= []`
	`263`	`+forcatinemit:`
	`264`	`+iftype(cat)istuple:`
	`265`	`+cats.append((f"{cat[0]}_{cat[1]}",cat))`
	`266`	`+else:`
	`267`	`+cats.append((cat,cat))`
	`268`	`+cats.sort(key=lambdax:x[0])`
	`269`	`+`
	`270`	`+forcat_str,catincats:`
	`271`	`+emit_table(f,"%s_table"%cat_str,tbl[cat],is_pub=False)`
`259`	`272`	`f.write(" #[inline]\n")`
`260`		`-f.write(" pub fn %s(c: char) -> bool {\n"%cat)`
`261`		`-f.write(" super::util::bsearch_range_table(c, %s_table)\n"%cat)`
	`273`	`+f.write(" pub fn %s(c: char) -> bool {\n"%cat_str)`
	`274`	`+f.write(" super::util::bsearch_range_table(c, %s_table)\n"%cat_str)`
`262`	`275`	`f.write(" }\n\n")`
`263`	`276`	`f.write("}\n\n")`
`264`	`277`
`@@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):`
`303`	`316`	`f.write((" %sC_"%Name[0])+cat+",\n")`
`304`	`317`	`f.write(""" }`
`305`	`318`
`306`		`- fn bsearch_range_value_table(c: char, r: &'static[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {`
	`319`	`+ fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {`
`307`	`320`	`use core::cmp::Ordering::{Equal, Less, Greater};`
`308`	`321`	`match r.binary_search_by(\|&(lo, hi, _)\| {`
`309`	`322`	`if lo <= c && c <= hi { Equal }`
`@@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):`
`355`	`368`	`else:`
`356`	`369`	`lookup_type="u32"`
`357`	`370`
`358`		`-emit_table(f,"%s_cat_lookup"%name,lookup_table,"&'static[%s]"%lookup_type,`
	`371`	`+emit_table(f,"%s_cat_lookup"%name,lookup_table,"&[%s]"%lookup_type,`
`359`	`372`	`pfun=lambdax:"%d"%x,`
`360`	`373`	`is_pub=False,is_const=True)`
`361`	`374`
`362`		`-emit_table(f,"%s_cat_table"%name,break_table,"&'static[(char, char, %sCat)]"%Name,`
	`375`	`+emit_table(f,"%s_cat_table"%name,break_table,"&[(char, char, %sCat)]"%Name,`
`363`	`376`	`pfun=lambdax:"(%s,%s,%sC_%s)"% (escape_char(x[0]),escape_char(x[1]),Name[0],x[2]),`
`364`	`377`	`is_pub=False,is_const=True)`
`365`	`378`	`f.write("}\n")`
`@@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):`
`379`	`392`
`380`	`393`	`# download and parse all the data`
`381`	`394`	`gencats=load_gencats("UnicodeData.txt")`
`382`		`-derived=load_properties("DerivedCoreProperties.txt", ["Alphabetic"])`
	`395`	`+derived=load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB","Consonant"), ("InCB","Extend"), ("InCB","Linker")])`
`383`	`396`
`384`	`397`	`emit_util_mod(rf)`
`385`	`398`	`for (name,cat,pfuns)in ("general_category",gencats, ["N"]), \`
`386`		`- ("derived_property",derived, ["Alphabetic"]):`
	`399`	`+ ("derived_property",derived, ["Alphabetic", ("InCB","Extend")]):`
`387`	`400`	`emit_property_module(rf,name,cat,pfuns)`
`388`	`401`
	`402`	`+rf.write("""pub fn is_incb_linker(c: char) -> bool {`
	`403`	`+ matches!(c,""")`
	`404`	`+`
	`405`	`+for (lo,hi)inderived[("InCB","Linker")]:`
	`406`	`+rf.write(f" \| '\\u{{{lo:X}}}'")`
	`407`	`+iflo!=hi:`
	`408`	`+rf.write(f"..'\\u{{{lo:X}}}'")`
	`409`	`+`
	`410`	`+rf.write(")\n}\n\n")`
	`411`	`+`
`389`	`412`	`### grapheme cluster module`
`390`	`413`	`# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values`
`391`		`-grapheme_cats=load_properties("auxiliary/GraphemeBreakProperty.txt", [])`
`392`		`-`
	`414`	`+grapheme_cats=load_properties("auxiliary/GraphemeBreakProperty.txt")`
`393`	`415`	`# Control`
`394`	`416`	`# Note:`
`395`	`417`	# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
`@@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):`
`398`	`420`	`grapheme_cats["Control"]=group_cat(list(`
`399`	`421`	`set(ungroup_cat(grapheme_cats["Control"]))`
`400`	`422`	`-set(ungroup_cat([surrogate_codepoints]))))`
`401`		`-`
	`423`	`+grapheme_cats["InCB_Consonant"]=derived[("InCB","Consonant")]`
	`424`	`+emoji_props=load_properties("emoji-data.txt", ["Extended_Pictographic"])`
	`425`	`+grapheme_cats["Extended_Pictographic"]=emoji_props["Extended_Pictographic"]`
`402`	`426`	`grapheme_table= []`
`403`	`427`	`forcatingrapheme_cats:`
`404`	`428`	`grapheme_table.extend([(x,y,cat)for (x,y)ingrapheme_cats[cat]])`
`405`		`-emoji_props=load_properties("emoji-data.txt", ["Extended_Pictographic"])`
`406`		`-grapheme_table.extend([(x,y,"Extended_Pictographic")for (x,y)inemoji_props["Extended_Pictographic"]])`
`407`	`429`	`grapheme_table.sort(key=lambdaw:w[0])`
`408`	`430`	`last=-1`
`409`	`431`	`forcharsingrapheme_table:`
`410`	`432`	`ifchars[0]<=last:`
`411`	`433`	`raise"Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"`
`412`	`434`	`last=chars[1]`
`413`		`-emit_break_module(rf,grapheme_table,list(grapheme_cats.keys())+ ["Extended_Pictographic"],"grapheme")`
	`435`	`+emit_break_module(rf,grapheme_table,list(grapheme_cats.keys()),"grapheme")`
`414`	`436`	`rf.write("\n")`
`415`	`437`
`416`		`-word_cats=load_properties("auxiliary/WordBreakProperty.txt", [])`
	`438`	`+word_cats=load_properties("auxiliary/WordBreakProperty.txt")`
`417`	`439`	`word_table= []`
`418`	`440`	`forcatinword_cats:`
`419`	`441`	`word_table.extend([(x,y,cat)for (x,y)inword_cats[cat]])`
`@@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):`
`425`	`447`	`emoji_table= [(x,y,"Extended_Pictographic")for (x,y)inemoji_props["Extended_Pictographic"]]`
`426`	`448`	`emit_break_module(rf,emoji_table, ["Extended_Pictographic"],"emoji")`
`427`	`449`
`428`		`-sentence_cats=load_properties("auxiliary/SentenceBreakProperty.txt", [])`
	`450`	`+sentence_cats=load_properties("auxiliary/SentenceBreakProperty.txt")`
`429`	`451`	`sentence_table= []`
`430`	`452`	`forcatinsentence_cats:`
`431`	`453`	`sentence_table.extend([(x,y,cat)for (x,y)insentence_cats[cat]])`

`‎scripts/unicode_gen_breaktests.py‎`

Lines changed: 6 additions & 6 deletions

Original file line number	Diff line number	Diff line change
`@@ -140,8 +140,8 @@ def showfun(x):`
`140`	`140`	`returnoutstr`
`141`	`141`
`142`	`142`	`defcreate_grapheme_data(f):`
`143`		`-# rules 9.1and 9.2 are for extended graphemes only`
`144`		`-optsplits= ['9.1','9.2']`
	`143`	`+# rules 9.1, 9.2,and 9.3 are for extended graphemes only`
	`144`	`+optsplits= ['9.1','9.2','9.3']`
`145`	`145`	`d=load_test_data("auxiliary/GraphemeBreakTest.txt",optsplits)`
`146`	`146`
`147`	`147`	`test_same= []`
`@@ -169,8 +169,8 @@ def create_grapheme_data(f):`
`169`	`169`	`else:`
`170`	`170`	`test_diff.append((allchars,extgraphs,c))`
`171`	`171`
`172`		`-stype="&'static[(&'staticstr, &'static [&'staticstr])]"`
`173`		`-dtype="&'static[(&'staticstr, &'static [&'staticstr], &'static [&'staticstr])]"`
	`172`	`+stype="&[(&str, &[&str])]"`
	`173`	`+dtype="&[(&str, &[&str], &[&str])]"`
`174`	`174`	`f.write(" // official Unicode test data\n")`
`175`	`175`	`f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)`
`176`	`176`	`unicode.emit_table(f,"TEST_SAME",test_same,stype,True,showfun,True)`
`@@ -185,7 +185,7 @@ def create_words_data(f):`
`185`	`185`	`allchars= [cnforsincforcnins]`
`186`	`186`	`test.append((allchars,c))`
`187`	`187`
`188`		`-wtype="&'static[(&'staticstr, &'static [&'staticstr])]"`
	`188`	`+wtype="&[(&str, &[&str])]"`
`189`	`189`	`f.write(" // official Unicode test data\n")`
`190`	`190`	`f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)`
`191`	`191`	`unicode.emit_table(f,"TEST_WORD",test,wtype,True,showfun,True)`
`@@ -199,7 +199,7 @@ def create_sentence_data(f):`
`199`	`199`	`allchars= [cnforsincforcnins]`
`200`	`200`	`test.append((allchars,c))`
`201`	`201`
`202`		`-wtype="&'static[(&'staticstr, &'static [&'staticstr])]"`
	`202`	`+wtype="&[(&str, &[&str])]"`
`203`	`203`	`f.write(" // official Unicode test data\n")`
`204`	`204`	`f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)`
`205`	`205`	`unicode.emit_table(f,"TEST_SENTENCE",test,wtype,True,showfun,True)`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 592ce00

File tree

11 files changed

11 files changed

`‎.github/workflows/rust.yml‎`

`‎benches/chars.rs‎`

`‎scripts/unicode.py‎`

`‎scripts/unicode_gen_breaktests.py‎`

0 commit comments