May 15, 2019 · May 2, 2017 · May 4, 2017 · May 13, 2019 · May 13, 2019
diff --git a/Cargo.toml b/Cargo.toml
 keywords = ["text", "unicode", "grapheme", "word", "boundary"]
 readme = "README.md"
 description = """
 This crate provides Grapheme Cluster and Word boundaries
 This crate provides Grapheme Cluster, Word and Sentence boundaries
 according to Unicode Standard Annex #29 rules.
 """

diff --git a/scripts/unicode.py b/scripts/unicode.py
            word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
        word_table.sort(key=lambda w: w[0])
        emit_break_module(rf, word_table, word_cats.keys(), "word")

        # Load the categories from SentenceBreakProperty.txt, flatten them into
        # (x, y, category) triples, order the triples by their first element and
        # emit the result as the "sentence" break module.
        # NOTE(review): (x, y) are presumably code point range bounds — confirm
        # against load_properties().
        sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
        sentence_table = []
        for cat in sentence_cats:
            sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
        sentence_table.sort(key=lambda w: w[0])
        emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
diff --git a/scripts/unicode_gen_breaktests.py b/scripts/unicode_gen_breaktests.py
    f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
    unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

 def create_sentence_data(f):
    """Write the TEST_SENTENCE table to `f`.

    Every case loaded from SentenceBreakTest.txt becomes one table entry that
    pairs the flattened list of all its characters with the original pieces.
    """
    cases = load_test_data("auxiliary/SentenceBreakTest.txt")

    # Flatten each case's pieces into a single character list; the second
    # element of every loaded pair is not needed here.
    table = [([ch for piece in pieces for ch in piece], pieces)
             for (pieces, _) in cases]

    table_type = "&'static [(&'static str, &'static [&'static str])]"
    for header in (
        "    // official Unicode test data\n",
        "    // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n",
    ):
        f.write(header)
    unicode.emit_table(f, "TEST_SENTENCE", table, table_type, True, showfun, True)

 # Script entry point: (re)write testdata.rs, starting with the shared preamble
 # and followed by the output of the grapheme, word and sentence data generators.
 if __name__ == "__main__":
    with open("testdata.rs", "w") as rf:
        rf.write(unicode.preamble)
        create_grapheme_data(rf)
        create_words_data(rf)
        create_sentence_data(rf)
diff --git a/src/lib.rs b/src/lib.rs
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 //! Iterators which split strings on Grapheme Cluster or Word boundaries, according
 //! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
 //! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
 //!
 //! ```rust
 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use tables::UNICODE_VERSION;
 pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
 pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};

 mod grapheme;
 mod tables;
 mod word;
 mod sentence;

 #[cfg(test)]
 mod test;
    /// assert_eq!(&swi1[..], b);
    /// ```
    fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;

    /// Returns an iterator over substrings of `self` separated on
    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
    ///
    /// The concatenation of the substrings returned by this function is just the original string.
    ///
    /// The returned `UnicodeSentences` iterator borrows `self` for the lifetime `'a`.
    fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;

    /// Returns an iterator over substrings of `self` separated on
    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
    ///
    /// Here, "sentences" are just those substrings which, after splitting on
    /// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
    /// substring must contain at least one character with the
    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
    /// property, or with
    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
    // NOTE(review): this wording describes a filtering iterator, while the
    // "concatenation ... is just the original string" guarantee is documented on
    // `unicode_sentences`. Confirm against the `sentence` module that the two
    // descriptions are attached to the right methods and are not swapped.
    fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;

    /// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
    /// and their offsets. See `split_sentence_bounds()` for more information.
    // NOTE(review): the offsets are presumably byte offsets into `self` — confirm
    // against `USentenceBoundIndices` and state the unit in the doc comment.
    fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
 }

 impl UnicodeSegmentation for str {
    fn split_word_bound_indices(&self) -> UWordBoundIndices {
        word::new_word_bound_indices(self)
    }

    // Thin wrapper: the iterator is constructed by the `sentence` module.
    #[inline]
    fn unicode_sentences(&self) -> UnicodeSentences {
        sentence::new_unicode_sentences(self)
    }

    // Thin wrapper: the iterator is constructed by the `sentence` module.
    #[inline]
    fn split_sentence_bounds(&self) -> USentenceBounds {
        sentence::new_sentence_bounds(self)
    }

    // Thin wrapper: the iterator is constructed by the `sentence` module.
    #[inline]
    fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
        sentence::new_sentence_bound_indices(self)
    }
 }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
		keywords = ["text", "unicode", "grapheme", "word", "boundary"]
		readme = "README.md"
		description = """
		This crate provides Grapheme Cluster and Word boundaries
		This crate provides Grapheme Cluster, Word and Sentence boundaries
		according to Unicode Standard Annex #29 rules.
		"""

Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -351,3 +351,10 @@ def emit_break_module(f, break_table, break_cats, name):
		word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
		word_table.sort(key=lambda w: w[0])
		emit_break_module(rf, word_table, word_cats.keys(), "word")

		sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
		sentence_table = []
		for cat in sentence_cats:
		sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
		sentence_table.sort(key=lambda w: w[0])
		emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -190,8 +190,23 @@ def create_words_data(f):
		f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
		unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

		def create_sentence_data(f):
		d = load_test_data("auxiliary/SentenceBreakTest.txt")

		test = []

		for (c, i) in d:
		allchars = [cn for s in c for cn in s]
		test.append((allchars, c))

		wtype = "&'static [(&'static str, &'static [&'static str])]"
		f.write(" // official Unicode test data\n")
		f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")
		unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

		if __name__ == "__main__":
		with open("testdata.rs", "w") as rf:
		rf.write(unicode.preamble)
		create_grapheme_data(rf)
		create_words_data(rf)
		create_sentence_data(rf)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,7 +8,7 @@
		// option. This file may not be copied, modified, or distributed
		// except according to those terms.

		//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
		//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
		//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
		//!
		//! ```rust
Expand DownExpand Up		@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
		pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
		pub use tables::UNICODE_VERSION;
		pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
		pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};

		mod grapheme;
		mod tables;
		mod word;
		mod sentence;

		#[cfg(test)]
		mod test;
Expand DownExpand Up		@@ -174,6 +176,27 @@ pub trait UnicodeSegmentation {
		/// assert_eq!(&swi1[..], b);
		/// ```
		fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;

		/// Returns an iterator over substrings of `self` separated on
		/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
		///
		/// The concatenation of the substrings returned by this function is just the original string.
		fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;

		/// Returns an iterator over substrings of `self` separated on
		/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
		///
		/// Here, "sentences" are just those substrings which, after splitting on
tomcumming marked this conversation as resolved. Show resolvedHide resolved
		/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
		/// substring must contain at least one character with the
		/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
		/// property, or with
		/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
		fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
tomcumming marked this conversation as resolved. Show resolvedHide resolved

		/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
		/// and their offsets. See `split_sentence_bounds()` for more information.
		fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
		}

		impl UnicodeSegmentation for str {
Expand DownExpand Up		@@ -201,4 +224,19 @@ impl UnicodeSegmentation for str {
		fn split_word_bound_indices(&self) -> UWordBoundIndices {
		word::new_word_bound_indices(self)
		}

		#[inline]
		fn unicode_sentences(&self) -> UnicodeSentences {
		sentence::new_unicode_sentences(self)
		}

		#[inline]
		fn split_sentence_bounds(&self) -> USentenceBounds {
		sentence::new_sentence_bounds(self)
		}

		#[inline]
		fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
		sentence::new_sentence_bound_indices(self)
		}
		}