Commit b159d9e

authored

Merge pull request #68 from unicode-rs/unicode-11

Update to Unicode 11

2 parents 7be58ca + df71866, commit b159d9e (Copy full SHA for b159d9e)

File tree

8 files changed

+2284

-2394

lines changed

scripts
- unicode.py
- unicode_gen_breaktests.py
src

8 files changed

+2284

-2394

lines changed

`‎scripts/unicode.py‎`

Lines changed: 26 additions & 11 deletions

Original file line number	Diff line number	Diff line change
`@@ -54,13 +54,21 @@`
`54`	`54`	`# these are the surrogate codepoints, which are not valid rust characters`
`55`	`55`	`surrogate_codepoints= (0xd800,0xdfff)`
`56`	`56`
	`57`	`+UNICODE_VERSION= (11,0,0)`
	`58`	`+`
	`59`	`+UNICODE_VERSION_NUMBER="%s.%s.%s"%UNICODE_VERSION`
	`60`	`+`
`57`	`61`	`defis_surrogate(n):`
`58`	`62`	`returnsurrogate_codepoints[0]<=n<=surrogate_codepoints[1]`
`59`	`63`
`60`	`64`	`deffetch(f):`
`61`	`65`	`ifnotos.path.exists(os.path.basename(f)):`
`62`		`-os.system("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"`
`63`		`-%f)`
	`66`	`+if"emoji"inf:`
	`67`	`+os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"`
	`68`	`+% (UNICODE_VERSION[0],UNICODE_VERSION[1],f))`
	`69`	`+else:`
	`70`	`+os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"`
	`71`	`+% (UNICODE_VERSION_NUMBER,f))`
`64`	`72`
`65`	`73`	`ifnotos.path.exists(os.path.basename(f)):`
`66`	`74`	`sys.stderr.write("cannot load %s"%f)`
`@@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):`
`262`	`270`	`pub use self::%sCat::*;`
`263`	`271`
`264`	`272`	`#[allow(non_camel_case_types)]`
`265`		`- #[derive(Clone, Copy, PartialEq, Eq)]`
	`273`	`+ #[derive(Clone, Copy, PartialEq, Eq, Debug)]`
`266`	`274`	`pub enum %sCat {`
`267`	`275`	`"""% (name,Name,Name))`
`268`	`276`
`@@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):`
`305`	`313`	`withopen(r,"w")asrf:`
`306`	`314`	`# write the file's preamble`
`307`	`315`	`rf.write(preamble)`
`308`		`-`
`309`		`-# download and parse all the data`
`310`		`-fetch("ReadMe.txt")`
`311`		`-withopen("ReadMe.txt")asreadme:`
`312`		`-pattern=r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"`
`313`		`-unicode_version=re.search(pattern,readme.read()).groups()`
`314`	`316`	`rf.write("""`
`315`	`317`	`/// The version of [Unicode](http://www.unicode.org/)`
`316`	`318`	`/// that this version of unicode-segmentation is based on.`
`317`	`319`	`pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);`
`318`		`-"""%unicode_version)`
	`320`	`+"""%UNICODE_VERSION)`
`319`	`321`
	`322`	`+# download and parse all the data`
`320`	`323`	`gencats=load_gencats("UnicodeData.txt")`
`321`	`324`	`derived=load_properties("DerivedCoreProperties.txt", ["Alphabetic"])`
`322`	`325`
`@@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):`
`341`	`344`	`grapheme_table= []`
`342`	`345`	`forcatingrapheme_cats:`
`343`	`346`	`grapheme_table.extend([(x,y,cat)for (x,y)ingrapheme_cats[cat]])`
	`347`	`+emoji_props=load_properties("emoji-data.txt", ["Extended_Pictographic"])`
	`348`	`+grapheme_table.extend([(x,y,"Extended_Pictographic")for (x,y)inemoji_props["Extended_Pictographic"]])`
`344`	`349`	`grapheme_table.sort(key=lambdaw:w[0])`
`345`		`-emit_break_module(rf,grapheme_table,list(grapheme_cats.keys()),"grapheme")`
	`350`	`+last=-1`
	`351`	`+forcharsingrapheme_table:`
	`352`	`+ifchars[0]<=last:`
	`353`	`+raise"Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"`
	`354`	`+last=chars[1]`
	`355`	`+emit_break_module(rf,grapheme_table,list(grapheme_cats.keys())+ ["Extended_Pictographic"],"grapheme")`
`346`	`356`	`rf.write("\n")`
`347`	`357`
`348`	`358`	`word_cats=load_properties("auxiliary/WordBreakProperty.txt", [])`
`@@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):`
`352`	`362`	`word_table.sort(key=lambdaw:w[0])`
`353`	`363`	`emit_break_module(rf,word_table,list(word_cats.keys()),"word")`
`354`	`364`
	`365`	`+# There are some emoji which are also ALetter, so this needs to be stored separately`
	`366`	`+# For efficiency, we could still merge the two tables and produce an ALetterEP state`
	`367`	`+emoji_table= [(x,y,"Extended_Pictographic")for (x,y)inemoji_props["Extended_Pictographic"]]`
	`368`	`+emit_break_module(rf,emoji_table, ["Extended_Pictographic"],"emoji")`
	`369`	`+`
`355`	`370`	`sentence_cats=load_properties("auxiliary/SentenceBreakProperty.txt", [])`
`356`	`371`	`sentence_table= []`
`357`	`372`	`forcatinsentence_cats:`

`‎scripts/unicode_gen_breaktests.py‎`

Lines changed: 3 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -172,7 +172,7 @@ def create_grapheme_data(f):`
`172`	`172`	`stype="&'static [(&'static str, &'static [&'static str])]"`
`173`	`173`	`dtype="&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"`
`174`	`174`	`f.write(" // official Unicode test data\n")`
`175`		`-f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")`
	`175`	`+f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)`
`176`	`176`	`unicode.emit_table(f,"TEST_SAME",test_same,stype,True,showfun,True)`
`177`	`177`	`unicode.emit_table(f,"TEST_DIFF",test_diff,dtype,True,showfun,True)`
`178`	`178`
`@@ -187,7 +187,7 @@ def create_words_data(f):`
`187`	`187`
`188`	`188`	`wtype="&'static [(&'static str, &'static [&'static str])]"`
`189`	`189`	`f.write(" // official Unicode test data\n")`
`190`		`-f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")`
	`190`	`+f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)`
`191`	`191`	`unicode.emit_table(f,"TEST_WORD",test,wtype,True,showfun,True)`
`192`	`192`
`193`	`193`	`defcreate_sentence_data(f):`
`@@ -201,7 +201,7 @@ def create_sentence_data(f):`
`201`	`201`
`202`	`202`	`wtype="&'static [(&'static str, &'static [&'static str])]"`
`203`	`203`	`f.write(" // official Unicode test data\n")`
`204`		`-f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")`
	`204`	`+f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)`
`205`	`205`	`unicode.emit_table(f,"TEST_SENTENCE",test,wtype,True,showfun,True)`
`206`	`206`
`207`	`207`	`if__name__=="__main__":`

`‎src/grapheme.rs‎`

Lines changed: 13 additions & 10 deletions

Original file line number	Diff line number	Diff line change
`@@ -147,8 +147,8 @@ enum GraphemeState {`
`147`	`147`	`// The codepoint after is a Regional Indicator Symbol, so a boundary iff`
`148`	`148`	`// it is preceded by an even number of RIS codepoints. (GB12, GB13)`
`149`	`149`	`Regional,`
`150`		`-// The codepoint after is in the E_Modifier category, so whether it's a boundary`
`151`		`-// depends on pre-context according to GB10.`
	`150`	`+// The codepoint after is Extended_Pictographic,`
	`151`	`+// so whether it's a boundary depends on pre-context according to GB11.`
`152`	`152`	`Emoji,`
`153`	`153`	`}`
`154`	`154`
`@@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {`
`239`	`239`	`(_,GC_ZWJ) =>NotBreak,// GB9`
`240`	`240`	`(_,GC_SpacingMark) =>Extended,// GB9a`
`241`	`241`	`(GC_Prepend, _) =>Extended,// GB9b`
`242`		`-(GC_E_Base,GC_E_Modifier) =>NotBreak,// GB10`
`243`		`-(GC_E_Base_GAZ,GC_E_Modifier) =>NotBreak,// GB10`
`244`		`-(GC_Extend,GC_E_Modifier) =>Emoji,// GB10`
`245`		`-(GC_ZWJ,GC_Glue_After_Zwj) =>NotBreak,// GB11`
`246`		`-(GC_ZWJ,GC_E_Base_GAZ) =>NotBreak,// GB11`
	`242`	`+(GC_ZWJ,GC_Extended_Pictographic) =>Emoji,// GB11`
`247`	`243`	`(GC_Regional_Indicator,GC_Regional_Indicator) =>Regional,// GB12, GB13`
`248`	`244`	`(_, _) =>Break,// GB999`
`249`	`245`	`}`
`@@ -415,10 +411,17 @@ impl GraphemeCursor {`
`415`	`411`
`416`	`412`	`fnhandle_emoji(&mutself,chunk:&str,chunk_start:usize){`
`417`	`413`	`use tables::graphemeas gr;`
`418`		`-for chin chunk.chars().rev(){`
	`414`	`+letmut iter = chunk.chars().rev();`
	`415`	`+ifletSome(ch) = iter.next(){`
	`416`	`+if gr::grapheme_category(ch) != gr::GC_ZWJ{`
	`417`	`+self.decide(true);`
	`418`	`+return;`
	`419`	`+}`
	`420`	`+}`
	`421`	`+for chin iter{`
`419`	`422`	`match gr::grapheme_category(ch){`
`420`	`423`	`gr::GC_Extend =>(),`
`421`		`- gr::GC_E_Base \| gr::GC_E_Base_GAZ =>{`
	`424`	`+ gr::GC_Extended_Pictographic =>{`
`422`	`425`	`self.decide(false);`
`423`	`426`	`return;`
`424`	`427`	`}`
`@@ -484,7 +487,7 @@ impl GraphemeCursor {`
`484`	`487`	`letmut need_pre_context =true;`
`485`	`488`	`matchself.cat_after.unwrap(){`
`486`	`489`	`gr::GC_Regional_Indicator =>self.state =GraphemeState::Regional,`
`487`		`- gr::GC_E_Modifier =>self.state =GraphemeState::Emoji,`
	`490`	`+ gr::GC_Extended_Pictographic =>self.state =GraphemeState::Emoji,`
`488`	`491`	`_ => need_pre_context =self.cat_before.is_none(),`
`489`	`492`	`}`
`490`	`493`	`if need_pre_context{`

`‎src/lib.rs‎`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@`
`29`	`29`	`//!`
`30`	`30`	`//! let s = "The quick (\"brown\") fox";`
`31`	`31`	`//! let w = s.split_word_bounds().collect::<Vec<&str>>();`
`32`		`-//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];`
	`32`	`+//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];`
`33`	`33`	`//! assert_eq!(w, b);`
`34`	`34`	`//! }`
`35`	`35`	//! ```
`@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {`
`156`	`156`	/// ```
`157`	`157`	`/// # use self::unicode_segmentation::UnicodeSegmentation;`
`158`	`158`	`/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();`
`159`		`-/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];`
	`159`	`+/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];`
`160`	`160`	`///`
`161`	`161`	`/// assert_eq!(&swu1[..], b);`
`162`	`162`	/// ```

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit b159d9e

File tree

8 files changed

8 files changed

`‎scripts/unicode.py‎`

`‎scripts/unicode_gen_breaktests.py‎`

`‎src/grapheme.rs‎`

`‎src/lib.rs‎`

0 commit comments