Mar 9, 2021 · Mar 7, 2021
diff --git a/src/lib.rs b/src/lib.rs
 pub use grapheme::{Graphemes, GraphemeIndices};
 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use tables::UNICODE_VERSION;
 pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
 pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
 pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};

 mod grapheme;
    /// ```
    fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;

    /// Returns an iterator over the words of `self`, separated on
    /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
    /// byte offsets.
    ///
    /// Each item is a `(usize, &str)` pair: the offset at which the word starts in `self`,
    /// followed by the word itself.
    ///
    /// Here, "words" are just those substrings which, after splitting on
    /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
    /// substring must contain at least one character with the
    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
    /// property, or with
    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
    ///
    /// # Example
    ///
    /// ```
    /// # use self::unicode_segmentation::UnicodeSegmentation;
    /// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
    /// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
    /// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
    ///                 (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
    ///
    /// assert_eq!(&uwi1[..], b);
    /// ```
    fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;

    /// Returns an iterator over substrings of `self` separated on
    /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
    ///
        word::new_unicode_words(self)
    }

    #[inline]
    fn unicode_word_indices(&self) -> UnicodeWordIndices {
        // Thin forwarder: the iterator itself is built by `word::new_unicode_word_indices`.
        word::new_unicode_word_indices(self)
    }

    #[inline]
    fn split_word_bounds(&self) -> UWordBounds {
        word::new_word_bounds(self)
diff --git a/src/word.rs b/src/word.rs
    fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
 }

 /// An iterator over the substrings of a string which, after splitting the string on
 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
 /// contain any characters with the
 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
 /// property, or with
 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
 /// This iterator also provides the byte offsets for each substring, yielding
 /// `(usize, &str)` items.
 ///
 /// This struct is created by the [`unicode_word_indices`] method on the
 /// [`UnicodeSegmentation`] trait. See its documentation for more.
 ///
 /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 pub struct UnicodeWordIndices<'a> {
    // The predicate is a plain `fn` pointer rather than a closure type so that the
    // full `Filter<..>` type can be spelled out in this field.
    inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
 }

 impl<'a> Iterator for UnicodeWordIndices<'a> {
    type Item = (usize, &'a str);

    /// Returns the next `(offset, word)` pair from the inner filtered iterator,
    /// or `None` once it is exhausted.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() }

    /// Forwards the bounds reported by the inner filtered iterator.
    ///
    /// Without this override the trait's default of `(0, None)` would be
    /// reported, discarding any upper bound the inner iterator knows about.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) { self.inner.size_hint() }
 }
 impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
    /// Delegates to the inner filtered iterator's `next_back`, returning whatever
    /// `(offset, word)` pair it produces.
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        self.inner.next_back()
    }
 }

 /// External iterator for a string's
 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
 ///
 }

 #[inline]
 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
    use super::UnicodeSegmentation;
 fn has_alphanumeric(s: &&str) -> bool {
    use tables::util::is_alphanumeric;

    fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
    s.chars().any(|c| is_alphanumeric(c))
 }

 /// Builds a `UnicodeWords` iterator over `s`: the pieces produced by
 /// `split_word_bounds`, keeping only those accepted by `has_alphanumeric`.
 #[inline]
 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
    use super::UnicodeSegmentation;

    // Name the predicate as a plain `fn` pointer before handing it to `filter`.
    let keep: fn(&&str) -> bool = has_alphanumeric;
    let pieces = s.split_word_bounds();

    UnicodeWords { inner: pieces.filter(keep) }
 }

 /// Builds a `UnicodeWordIndices` iterator over `s`: the `(offset, piece)` pairs
 /// produced by `split_word_bound_indices`, keeping only those whose piece is
 /// accepted by `has_alphanumeric`.
 #[inline]
 pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
    use super::UnicodeSegmentation;

    // Applies the shared predicate to the string half of an `(offset, piece)` pair.
    fn piece_has_alphanumeric(pair: &(usize, &str)) -> bool {
        has_alphanumeric(&pair.1)
    }

    let keep: fn(&(usize, &str)) -> bool = piece_has_alphanumeric;
    let pieces = s.split_word_bound_indices();

    UnicodeWordIndices { inner: pieces.filter(keep) }
 }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -66,7 +66,7 @@ extern crate quickcheck;
		pub use grapheme::{Graphemes, GraphemeIndices};
		pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
		pub use tables::UNICODE_VERSION;
		pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
		pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
		pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};

		mod grapheme;
Expand DownExpand Up		@@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {
		/// ```
		fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;

		/// Returns an iterator over the words of `self`, separated on
		/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
		/// offsets.
		///
		/// Here, "words" are just those substrings which, after splitting on
		/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
		/// substring must contain at least one character with the
		/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
		/// property, or with
		/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
		///
		/// # Example
		///
		/// ```
		/// # use self::unicode_segmentation::UnicodeSegmentation;
		/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
		/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
		/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
		/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
		///
		/// assert_eq!(&uwi1[..], b);
		/// ```
		fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;

		/// Returns an iterator over substrings of `self` separated on
		/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
		///
Expand DownExpand Up		@@ -249,6 +273,11 @@ impl UnicodeSegmentation for str {
		word::new_unicode_words(self)
		}

		#[inline]
		fn unicode_word_indices(&self) -> UnicodeWordIndices {
		word::new_unicode_word_indices(self)
		}

		#[inline]
		fn split_word_bounds(&self) -> UWordBounds {
		word::new_word_bounds(self)
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
		fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
		}

		/// An iterator over the substrings of a string which, after splitting the string on
		/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
		/// contain any characters with the
		/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
		/// property, or with
		/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
		/// This iterator also provides the byte offsets for each substring.
		///
		/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
		/// its documentation for more.
		///
		/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
		/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
		pub struct UnicodeWordIndices<'a> {
		inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
		}

		impl<'a> Iterator for UnicodeWordIndices<'a> {
		type Item = (usize, &'a str);

		#[inline]
		fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() }
		}
		impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
		#[inline]
		fn next_back(&mut self) -> Option<(usize, &'a str)> { self.inner.next_back() }
		}

		/// External iterator for a string's
		/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
		///
Expand DownExpand Up		@@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
		}

		#[inline]
		pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
		use super::UnicodeSegmentation;
		fn has_alphanumeric(s: &&str) -> bool {
		use tables::util::is_alphanumeric;

		fn has_alphanumeric(s: &&str) -> bool { s.chars().any(\|c\| is_alphanumeric(c)) }
		let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
		s.chars().any(\|c\| is_alphanumeric(c))
		}

		#[inline]
		pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
		use super::UnicodeSegmentation;

		UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
		}

		#[inline]
		pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
		use super::UnicodeSegmentation;

		UnicodeWordIndices { inner: s.split_word_bound_indices().filter(\|(_, c)\| has_alphanumeric(c)) }
		}