# Inclusive (first, last) bounds of the surrogate code point range; these
# values are not valid Rust characters.
surrogate_codepoints = (0xD800, 0xDFFF)

# Unicode release the tables are generated for, as a three-part tuple.
UNICODE_VERSION = (11, 0, 0)

# The same release rendered as a dotted string ("11.0.0").
UNICODE_VERSION_NUMBER = ".".join(str(part) for part in UNICODE_VERSION)
60+
def is_surrogate(n):
    """Return True when code point ``n`` falls inside ``surrogate_codepoints``.

    Both ends of the range are inclusive.
    """
    first = surrogate_codepoints[0]
    final = surrogate_codepoints[1]
    return first <= n and n <= final
5963
6064def fetch (f ):
6165if not os .path .exists (os .path .basename (f )):
62- os .system ("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"
63- % f )
66+ if "emoji" in f :
67+ os .system ("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
68+ % (UNICODE_VERSION [0 ],UNICODE_VERSION [1 ],f ))
69+ else :
70+ os .system ("curl -O http://www.unicode.org/Public/%s/ucd/%s"
71+ % (UNICODE_VERSION_NUMBER ,f ))
6472
6573if not os .path .exists (os .path .basename (f )):
6674sys .stderr .write ("cannot load %s" % f )
@@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
262270 pub use self::%sCat::*;
263271
264272 #[allow(non_camel_case_types)]
265- #[derive(Clone, Copy, PartialEq, Eq)]
273+ #[derive(Clone, Copy, PartialEq, Eq, Debug )]
266274 pub enum %sCat {
267275""" % (name ,Name ,Name ))
268276
@@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
305313with open (r ,"w" )as rf :
306314# write the file's preamble
307315rf .write (preamble )
308-
309- # download and parse all the data
310- fetch ("ReadMe.txt" )
311- with open ("ReadMe.txt" )as readme :
312- pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
313- unicode_version = re .search (pattern ,readme .read ()).groups ()
314316rf .write ("""
315317/// The version of [Unicode](http://www.unicode.org/)
316318/// that this version of unicode-segmentation is based on.
317319pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
318- """ % unicode_version )
320+ """ % UNICODE_VERSION )
319321
322+ # download and parse all the data
320323gencats = load_gencats ("UnicodeData.txt" )
321324derived = load_properties ("DerivedCoreProperties.txt" , ["Alphabetic" ])
322325
@@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
341344grapheme_table = []
342345for cat in grapheme_cats :
343346grapheme_table .extend ([(x ,y ,cat )for (x ,y )in grapheme_cats [cat ]])
347+ emoji_props = load_properties ("emoji-data.txt" , ["Extended_Pictographic" ])
348+ grapheme_table .extend ([(x ,y ,"Extended_Pictographic" )for (x ,y )in emoji_props ["Extended_Pictographic" ]])
344349grapheme_table .sort (key = lambda w :w [0 ])
345- emit_break_module (rf ,grapheme_table ,list (grapheme_cats .keys ()),"grapheme" )
# Sanity check on the merged table: it has just been sorted by range start,
# so a range that begins at or before the end of the previous one overlaps it.
last = -1
for chars in grapheme_table:
    if chars[0] <= last:
        # A bare string cannot be raised (it triggers a TypeError that hides
        # this message), so raise a real exception carrying the diagnostic.
        raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
    last = chars[1]
355+ emit_break_module (rf ,grapheme_table ,list (grapheme_cats .keys ())+ ["Extended_Pictographic" ],"grapheme" )
346356rf .write ("\n " )
347357
348358word_cats = load_properties ("auxiliary/WordBreakProperty.txt" , [])
@@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
352362word_table .sort (key = lambda w :w [0 ])
353363emit_break_module (rf ,word_table ,list (word_cats .keys ()),"word" )
354364
365+ # There are some emoji which are also ALetter, so this needs to be stored separately
366+ # For efficiency, we could still merge the two tables and produce an ALetterEP state
367+ emoji_table = [(x ,y ,"Extended_Pictographic" )for (x ,y )in emoji_props ["Extended_Pictographic" ]]
368+ emit_break_module (rf ,emoji_table , ["Extended_Pictographic" ],"emoji" )
369+
355370sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" , [])
356371sentence_table = []
357372for cat in sentence_cats :