Oct 30, 2019 · Oct 22, 2019 · Oct 22, 2019 · Oct 22, 2019 · Oct 22, 2019 · Oct 22, 2019
diff --git a/scripts/unicode.py b/scripts/unicode.py
 # these are the surrogate codepoints, which are not valid rust characters
 surrogate_codepoints = (0xd800, 0xdfff)

 UNICODE_VERSION = (11, 0, 0)

 UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION

 def is_surrogate(n):
    return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]

 def fetch(f):
    if not os.path.exists(os.path.basename(f)):
        os.system("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"
                  % f)
        if "emoji" in f:
            os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
                      % (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
        else:
            os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
                      % (UNICODE_VERSION_NUMBER, f))

    if not os.path.exists(os.path.basename(f)):
        sys.stderr.write("cannot load %s" % f)
    pub use self::%sCat::*;

    #[allow(non_camel_case_types)]
    #[derive(Clone, Copy, PartialEq, Eq)]
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    pub enum %sCat {
 """ % (name, Name, Name))

    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # download and parse all the data
        fetch("ReadMe.txt")
        with open("ReadMe.txt") as readme:
            pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
            unicode_version = re.search(pattern, readme.read()).groups()
        rf.write("""
 /// The version of [Unicode](http://www.unicode.org/)
 /// that this version of unicode-segmentation is based on.
 pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
 """ %unicode_version)
 """ %UNICODE_VERSION)

        # download and parse all the data
        gencats = load_gencats("UnicodeData.txt")
        derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])

        grapheme_table = []
        for cat in grapheme_cats:
            grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
        emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
        grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
        grapheme_table.sort(key=lambda w: w[0])
        emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
        # Sanity check on the merged table: walk the (start, end, category)
        # ranges in order and make sure no range starts at or before the end
        # of the range that precedes it. An overlap would mean the grapheme
        # categories and Extended_Pictographic cannot share a single table.
        last = -1
        for chars in grapheme_table:
            if chars[0] <= last:
                # A bare string is not a valid exception object; raising one
                # fails with an unrelated TypeError and hides this message.
                raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
            last = chars[1]
        emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
        rf.write("\n")

        word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
        word_table.sort(key=lambda w: w[0])
        emit_break_module(rf, word_table, list(word_cats.keys()), "word")

        # There are some emoji which are also ALetter, so this needs to be stored separately
        # For efficiency, we could still merge the two tables and produce an ALetterEP state
        emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
        emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")

        sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
        sentence_table = []
        for cat in sentence_cats:
diff --git a/scripts/unicode_gen_breaktests.py b/scripts/unicode_gen_breaktests.py
    stype = "&'static [(&'static str, &'static [&'static str])]"
    dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
    f.write("    // official Unicode test data\n")
    f.write("    // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
    f.write("    // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
    unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
    unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)


    wtype = "&'static [(&'static str, &'static [&'static str])]"
    f.write("    // official Unicode test data\n")
    f.write("    // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")
    f.write("    // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
    unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

 def create_sentence_data(f):

    wtype = "&'static [(&'static str, &'static [&'static str])]"
    f.write("    // official Unicode test data\n")
    f.write("    // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")
    f.write("    // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
    unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

 if __name__ == "__main__":
diff --git a/src/grapheme.rs b/src/grapheme.rs
    // The codepoint after is a Regional Indicator Symbol, so a boundary iff
    // it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    // The codepoint after is in the E_Modifier category, so whether it's a boundary
    // depends on pre-context according to GB10.
    // The codepoint after is Extended_Pictographic,
    // so whether it's a boundary depends on pre-context according to GB11.
    Emoji,
 }

        (_, GC_ZWJ) => NotBreak,  // GB9
        (_, GC_SpacingMark) => Extended,  // GB9a
        (GC_Prepend, _) => Extended,  // GB9b
        (GC_E_Base, GC_E_Modifier) => NotBreak,  // GB10
        (GC_E_Base_GAZ, GC_E_Modifier) => NotBreak,  // GB10
        (GC_Extend, GC_E_Modifier) => Emoji,  // GB10
        (GC_ZWJ, GC_Glue_After_Zwj) => NotBreak,  // GB11
        (GC_ZWJ, GC_E_Base_GAZ) => NotBreak,  // GB11
        (GC_ZWJ, GC_Extended_Pictographic) => Emoji,  // GB11
        (GC_Regional_Indicator, GC_Regional_Indicator) => Regional,  // GB12, GB13
        (_, _) => Break,  // GB999
    }

    fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
        use tables::grapheme as gr;
        for ch in chunk.chars().rev() {
        let mut iter = chunk.chars().rev();
        if let Some(ch) = iter.next() {
            if gr::grapheme_category(ch) != gr::GC_ZWJ {
                self.decide(true);
                return;
            }
        }
        for ch in iter {
            match gr::grapheme_category(ch) {
                gr::GC_Extend => (),
                gr::GC_E_Base | gr::GC_E_Base_GAZ => {
                gr::GC_Extended_Pictographic => {
                    self.decide(false);
                    return;
                }
            let mut need_pre_context = true;
            match self.cat_after.unwrap() {
                gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
                gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
                gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
                _ => need_pre_context = self.cat_before.is_none(),
            }
            if need_pre_context {
diff --git a/src/lib.rs b/src/lib.rs
 //!
 //!     let s = "The quick (\"brown\")  fox";
 //!     let w = s.split_word_bounds().collect::<Vec<&str>>();
 //!     let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
 //!     let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
 //!     assert_eq!(w, b);
 //! }
 //! ```
    /// ```
    /// # use self::unicode_segmentation::UnicodeSegmentation;
    /// let swu1 = "The quick (\"brown\")  fox".split_word_bounds().collect::<Vec<&str>>();
    /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
    /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
    ///
    /// assert_eq!(&swu1[..], b);
    /// ```
Original file line number	Diff line number	Diff line change
Expand Up		@@ -54,13 +54,21 @@
		# these are the surrogate codepoints, which are not valid rust characters
		surrogate_codepoints = (0xd800, 0xdfff)

		UNICODE_VERSION = (11, 0, 0)

		UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION

		def is_surrogate(n):
		return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]

		def fetch(f):
		if not os.path.exists(os.path.basename(f)):
		os.system("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"
		% f)
		if "emoji" in f:
		os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
		% (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
		else:
		os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
		% (UNICODE_VERSION_NUMBER, f))

		if not os.path.exists(os.path.basename(f)):
		sys.stderr.write("cannot load %s" % f)
Expand DownExpand Up		@@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
		pub use self::%sCat::*;

		#[allow(non_camel_case_types)]
		#[derive(Clone, Copy, PartialEq, Eq)]
		#[derive(Clone, Copy, PartialEq, Eq, Debug)]
		pub enum %sCat {
		""" % (name, Name, Name))

Expand DownExpand Up		@@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
		with open(r, "w") as rf:
		# write the file's preamble
		rf.write(preamble)

		# download and parse all the data
		fetch("ReadMe.txt")
		with open("ReadMe.txt") as readme:
		pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
		unicode_version = re.search(pattern, readme.read()).groups()
		rf.write("""
		/// The version of [Unicode](http://www.unicode.org/)
		/// that this version of unicode-segmentation is based on.
		pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
		""" %unicode_version)
		""" %UNICODE_VERSION)

		# download and parse all the data
		gencats = load_gencats("UnicodeData.txt")
		derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])

Expand All		@@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
		grapheme_table = []
		for cat in grapheme_cats:
		grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
		emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
		grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
		grapheme_table.sort(key=lambda w: w[0])
		emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
		last = -1
		for chars in grapheme_table:
		if chars[0] <= last:
		raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
		last = chars[1]
		emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
		rf.write("\n")

		word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
Expand All		@@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
		word_table.sort(key=lambda w: w[0])
		emit_break_module(rf, word_table, list(word_cats.keys()), "word")

		# There are some emoji which are also ALetter, so this needs to be stored separately
		# For efficiency, we could still merge the two tables and produce an ALetterEP state
		emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
		emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")

		sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
		sentence_table = []
		for cat in sentence_cats:
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -172,7 +172,7 @@ def create_grapheme_data(f):
		stype = "&'static [(&'static str, &'static [&'static str])]"
		dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
		f.write(" // official Unicode test data\n")
		f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
		f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
		unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
		unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)

Expand All		@@ -187,7 +187,7 @@ def create_words_data(f):

		wtype = "&'static [(&'static str, &'static [&'static str])]"
		f.write(" // official Unicode test data\n")
		f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")
		f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
		unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

		def create_sentence_data(f):
Expand All		@@ -201,7 +201,7 @@ def create_sentence_data(f):

		wtype = "&'static [(&'static str, &'static [&'static str])]"
		f.write(" // official Unicode test data\n")
		f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")
		f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
		unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

		if __name__ == "__main__":
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -147,8 +147,8 @@ enum GraphemeState {
		// The codepoint after is a Regional Indicator Symbol, so a boundary iff
		// it is preceded by an even number of RIS codepoints. (GB12, GB13)
		Regional,
		// The codepoint after is in the E_Modifier category, so whether it's a boundary
		// depends on pre-context according to GB10.
		// The codepoint after is Extended_Pictographic,
		// so whether it's a boundary depends on pre-context according to GB11.
		Emoji,
		}

Expand DownExpand Up		@@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
		(_, GC_ZWJ) => NotBreak, // GB9
		(_, GC_SpacingMark) => Extended, // GB9a
		(GC_Prepend, _) => Extended, // GB9b
		(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
		(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
		(GC_Extend, GC_E_Modifier) => Emoji, // GB10
		(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
		(GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
		(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
		(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
		(_, _) => Break, // GB999
		}
Expand DownExpand Up		@@ -415,10 +411,17 @@ impl GraphemeCursor {

		fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
		use tables::grapheme as gr;
		for ch in chunk.chars().rev() {
		let mut iter = chunk.chars().rev();
		if let Some(ch) = iter.next() {
		if gr::grapheme_category(ch) != gr::GC_ZWJ {
		self.decide(true);
		return;
		}
		}
		for ch in iter {
		match gr::grapheme_category(ch) {
		gr::GC_Extend => (),
		gr::GC_E_Base \| gr::GC_E_Base_GAZ => {
		gr::GC_Extended_Pictographic => {
		self.decide(false);
		return;
		}
Expand DownExpand Up		@@ -484,7 +487,7 @@ impl GraphemeCursor {
		let mut need_pre_context = true;
		match self.cat_after.unwrap() {
		gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
		gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
		gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
		_ => need_pre_context = self.cat_before.is_none(),
		}
		if need_pre_context {
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -29,7 +29,7 @@
		//!
		//! let s = "The quick (\"brown\")  fox";
		//! let w = s.split_word_bounds().collect::<Vec<&str>>();
		//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
		//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
		//! assert_eq!(w, b);
		//! }
		//! ```
Expand DownExpand Up		@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
		/// ```
		/// # use self::unicode_segmentation::UnicodeSegmentation;
		/// let swu1 = "The quick (\"brown\")  fox".split_word_bounds().collect::<Vec<&str>>();
		/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
		/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
		///
		/// assert_eq!(&swu1[..], b);
		/// ```
Expand Down