Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b159d9e

Browse files
authored
Merge pull request #68 from unicode-rs/unicode-11
Update to Unicode 11
2 parents 7be58ca + df71866, commit b159d9e

File tree

8 files changed

+2284
-2394
lines changed

8 files changed

+2284
-2394
lines changed

‎scripts/unicode.py

Lines changed: 26 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -54,13 +54,21 @@
5454
# these are the surrogate codepoints, which are not valid rust characters
5555
surrogate_codepoints= (0xd800,0xdfff)
5656

57+
UNICODE_VERSION= (11,0,0)
58+
59+
UNICODE_VERSION_NUMBER="%s.%s.%s"%UNICODE_VERSION
60+
5761
defis_surrogate(n):
5862
returnsurrogate_codepoints[0]<=n<=surrogate_codepoints[1]
5963

6064
deffetch(f):
6165
ifnotos.path.exists(os.path.basename(f)):
62-
os.system("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"
63-
%f)
66+
if"emoji"inf:
67+
os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
68+
% (UNICODE_VERSION[0],UNICODE_VERSION[1],f))
69+
else:
70+
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
71+
% (UNICODE_VERSION_NUMBER,f))
6472

6573
ifnotos.path.exists(os.path.basename(f)):
6674
sys.stderr.write("cannot load %s"%f)
@@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
262270
pub use self::%sCat::*;
263271
264272
#[allow(non_camel_case_types)]
265-
#[derive(Clone, Copy, PartialEq, Eq)]
273+
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
266274
pub enum %sCat {
267275
"""% (name,Name,Name))
268276

@@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
305313
withopen(r,"w")asrf:
306314
# write the file's preamble
307315
rf.write(preamble)
308-
309-
# download and parse all the data
310-
fetch("ReadMe.txt")
311-
withopen("ReadMe.txt")asreadme:
312-
pattern=r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
313-
unicode_version=re.search(pattern,readme.read()).groups()
314316
rf.write("""
315317
/// The version of [Unicode](http://www.unicode.org/)
316318
/// that this version of unicode-segmentation is based on.
317319
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
318-
"""%unicode_version)
320+
"""%UNICODE_VERSION)
319321

322+
# download and parse all the data
320323
gencats=load_gencats("UnicodeData.txt")
321324
derived=load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
322325

@@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
341344
grapheme_table= []
342345
forcatingrapheme_cats:
343346
grapheme_table.extend([(x,y,cat)for (x,y)ingrapheme_cats[cat]])
347+
emoji_props=load_properties("emoji-data.txt", ["Extended_Pictographic"])
348+
grapheme_table.extend([(x,y,"Extended_Pictographic")for (x,y)inemoji_props["Extended_Pictographic"]])
344349
grapheme_table.sort(key=lambdaw:w[0])
345-
emit_break_module(rf,grapheme_table,list(grapheme_cats.keys()),"grapheme")
350+
last=-1
351+
forcharsingrapheme_table:
352+
ifchars[0]<=last:
353+
raise"Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
354+
last=chars[1]
355+
emit_break_module(rf,grapheme_table,list(grapheme_cats.keys())+ ["Extended_Pictographic"],"grapheme")
346356
rf.write("\n")
347357

348358
word_cats=load_properties("auxiliary/WordBreakProperty.txt", [])
@@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
352362
word_table.sort(key=lambdaw:w[0])
353363
emit_break_module(rf,word_table,list(word_cats.keys()),"word")
354364

365+
# There are some emoji which are also ALetter, so this needs to be stored separately
366+
# For efficiency, we could still merge the two tables and produce an ALetterEP state
367+
emoji_table= [(x,y,"Extended_Pictographic")for (x,y)inemoji_props["Extended_Pictographic"]]
368+
emit_break_module(rf,emoji_table, ["Extended_Pictographic"],"emoji")
369+
355370
sentence_cats=load_properties("auxiliary/SentenceBreakProperty.txt", [])
356371
sentence_table= []
357372
forcatinsentence_cats:

‎scripts/unicode_gen_breaktests.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -172,7 +172,7 @@ def create_grapheme_data(f):
172172
stype="&'static [(&'static str, &'static [&'static str])]"
173173
dtype="&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
174174
f.write(" // official Unicode test data\n")
175-
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
175+
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)
176176
unicode.emit_table(f,"TEST_SAME",test_same,stype,True,showfun,True)
177177
unicode.emit_table(f,"TEST_DIFF",test_diff,dtype,True,showfun,True)
178178

@@ -187,7 +187,7 @@ def create_words_data(f):
187187

188188
wtype="&'static [(&'static str, &'static [&'static str])]"
189189
f.write(" // official Unicode test data\n")
190-
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")
190+
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)
191191
unicode.emit_table(f,"TEST_WORD",test,wtype,True,showfun,True)
192192

193193
defcreate_sentence_data(f):
@@ -201,7 +201,7 @@ def create_sentence_data(f):
201201

202202
wtype="&'static [(&'static str, &'static [&'static str])]"
203203
f.write(" // official Unicode test data\n")
204-
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")
204+
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)
205205
unicode.emit_table(f,"TEST_SENTENCE",test,wtype,True,showfun,True)
206206

207207
if__name__=="__main__":

‎src/grapheme.rs

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -147,8 +147,8 @@ enum GraphemeState {
147147
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
148148
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
149149
Regional,
150-
// The codepoint after is in the E_Modifier category, so whether it's a boundary
151-
// depends on pre-context according to GB10.
150+
// The codepoint after is Extended_Pictographic,
151+
// so whether it's a boundary depends on pre-context according to GB11.
152152
Emoji,
153153
}
154154

@@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
239239
(_,GC_ZWJ) =>NotBreak,// GB9
240240
(_,GC_SpacingMark) =>Extended,// GB9a
241241
(GC_Prepend, _) =>Extended,// GB9b
242-
(GC_E_Base,GC_E_Modifier) =>NotBreak,// GB10
243-
(GC_E_Base_GAZ,GC_E_Modifier) =>NotBreak,// GB10
244-
(GC_Extend,GC_E_Modifier) =>Emoji,// GB10
245-
(GC_ZWJ,GC_Glue_After_Zwj) =>NotBreak,// GB11
246-
(GC_ZWJ,GC_E_Base_GAZ) =>NotBreak,// GB11
242+
(GC_ZWJ,GC_Extended_Pictographic) =>Emoji,// GB11
247243
(GC_Regional_Indicator,GC_Regional_Indicator) =>Regional,// GB12, GB13
248244
(_, _) =>Break,// GB999
249245
}
@@ -415,10 +411,17 @@ impl GraphemeCursor {
415411

416412
fnhandle_emoji(&mutself,chunk:&str,chunk_start:usize){
417413
use tables::graphemeas gr;
418-
for chin chunk.chars().rev(){
414+
letmut iter = chunk.chars().rev();
415+
ifletSome(ch) = iter.next(){
416+
if gr::grapheme_category(ch) != gr::GC_ZWJ{
417+
self.decide(true);
418+
return;
419+
}
420+
}
421+
for chin iter{
419422
match gr::grapheme_category(ch){
420423
gr::GC_Extend =>(),
421-
gr::GC_E_Base | gr::GC_E_Base_GAZ =>{
424+
gr::GC_Extended_Pictographic =>{
422425
self.decide(false);
423426
return;
424427
}
@@ -484,7 +487,7 @@ impl GraphemeCursor {
484487
letmut need_pre_context =true;
485488
matchself.cat_after.unwrap(){
486489
gr::GC_Regional_Indicator =>self.state =GraphemeState::Regional,
487-
gr::GC_E_Modifier =>self.state =GraphemeState::Emoji,
490+
gr::GC_Extended_Pictographic =>self.state =GraphemeState::Emoji,
488491
_ => need_pre_context =self.cat_before.is_none(),
489492
}
490493
if need_pre_context{

‎src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@
2929
//!
3030
//! let s = "The quick (\"brown\") fox";
3131
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
32-
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
32+
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
3333
//! assert_eq!(w, b);
3434
//! }
3535
//! ```
@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
156156
/// ```
157157
/// # use self::unicode_segmentation::UnicodeSegmentation;
158158
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
159-
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
159+
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
160160
///
161161
/// assert_eq!(&swu1[..], b);
162162
/// ```

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp