Commit cea3ce6

authored

Merge pull request #91 from basile-henry/basile/unicode-word-indices

Add unicode_word_indices

2 parents 247c0b1 + 8bd6e3a, commit cea3ce6 (Copy full SHA for cea3ce6)

File tree

2 files changed

+72

-5

lines changed

src
- lib.rs
- word.rs

2 files changed

+72

-5

lines changed

`‎src/lib.rs‎`

Lines changed: 30 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -66,7 +66,7 @@ extern crate quickcheck;`
`66`	`66`	`pubuse grapheme::{Graphemes,GraphemeIndices};`
`67`	`67`	`pubuse grapheme::{GraphemeCursor,GraphemeIncomplete};`
`68`	`68`	`pubuse tables::UNICODE_VERSION;`
`69`		`-pubuse word::{UWordBounds,UWordBoundIndices,UnicodeWords};`
	`69`	`+pubuse word::{UWordBounds,UWordBoundIndices,UnicodeWords,UnicodeWordIndices};`
`70`	`70`	`pubuse sentence::{USentenceBounds,USentenceBoundIndices,UnicodeSentences};`
`71`	`71`
`72`	`72`	`mod grapheme;`
`@@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {`
`146`	`146`	/// ```
`147`	`147`	`fnunicode_words<'a>(&'aself) ->UnicodeWords<'a>;`
`148`	`148`
	`149`	+/// Returns an iterator over the words of `self`, separated on
	`150`	`+/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their`
	`151`	`+/// offsets.`
	`152`	`+///`
	`153`	`+/// Here, "words" are just those substrings which, after splitting on`
	`154`	`+/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the`
	`155`	`+/// substring must contain at least one character with the`
	`156`	`+/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)`
	`157`	`+/// property, or with`
	`158`	`+/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).`
	`159`	`+///`
	`160`	`+/// # Example`
	`161`	`+///`
	`162`	+/// ```
	`163`	`+/// # use self::unicode_segmentation::UnicodeSegmentation;`
	`164`	`+/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";`
	`165`	`+/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();`
	`166`	`+/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),`
	`167`	`+/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];`
	`168`	`+///`
	`169`	`+/// assert_eq!(&uwi1[..], b);`
	`170`	+/// ```
	`171`	`+fnunicode_word_indices<'a>(&'aself) ->UnicodeWordIndices<'a>;`
	`172`	`+`
`149`	`173`	/// Returns an iterator over substrings of `self` separated on
`150`	`174`	`/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).`
`151`	`175`	`///`
`@@ -249,6 +273,11 @@ impl UnicodeSegmentation for str {`
`249`	`273`	`word::new_unicode_words(self)`
`250`	`274`	`}`
`251`	`275`
	`276`	`+#[inline]`
	`277`	`+fnunicode_word_indices(&self) ->UnicodeWordIndices{`
	`278`	`+ word::new_unicode_word_indices(self)`
	`279`	`+}`
	`280`	`+`
`252`	`281`	`#[inline]`
`253`	`282`	`fnsplit_word_bounds(&self) ->UWordBounds{`
`254`	`283`	`word::new_word_bounds(self)`

`‎src/word.rs‎`

Lines changed: 42 additions & 4 deletions

Original file line number	Diff line number	Diff line change
`@@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {`
`40`	`40`	`fnnext_back(&mutself) ->Option<&'astr>{self.inner.next_back()}`
`41`	`41`	`}`
`42`	`42`
	`43`	`+/// An iterator over the substrings of a string which, after splitting the string on`
	`44`	`+/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),`
	`45`	`+/// contain any characters with the`
	`46`	`+/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)`
	`47`	`+/// property, or with`
	`48`	`+/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).`
	`49`	`+/// This iterator also provides the byte offsets for each substring.`
	`50`	`+///`
	`51`	+/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
	`52`	`+/// its documentation for more.`
	`53`	`+///`
	`54`	+/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
	`55`	+/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
	`56`	`+pubstructUnicodeWordIndices<'a>{`
	`57`	`+inner:Filter<UWordBoundIndices<'a>,fn(&(usize,&str)) ->bool>,`
	`58`	`+}`
	`59`	`+`
	`60`	`+impl<'a>IteratorforUnicodeWordIndices<'a>{`
	`61`	`+typeItem =(usize,&'astr);`
	`62`	`+`
	`63`	`+#[inline]`
	`64`	`+fnnext(&mutself) ->Option<(usize,&'astr)>{self.inner.next()}`
	`65`	`+}`
	`66`	`+impl<'a>DoubleEndedIteratorforUnicodeWordIndices<'a>{`
	`67`	`+#[inline]`
	`68`	`+fnnext_back(&mutself) ->Option<(usize,&'astr)>{self.inner.next_back()}`
	`69`	`+}`
	`70`	`+`
`43`	`71`	`/// External iterator for a string's`
`44`	`72`	`/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).`
`45`	`73`	`///`
`@@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {`
`671`	`699`	`}`
`672`	`700`
`673`	`701`	`#[inline]`
`674`		`-pubfnnew_unicode_words<'b>(s:&'bstr) ->UnicodeWords<'b>{`
`675`		`-usesuper::UnicodeSegmentation;`
	`702`	`+fnhas_alphanumeric(s:&&str) ->bool{`
`676`	`703`	`use tables::util::is_alphanumeric;`
`677`	`704`
`678`		`-fnhas_alphanumeric(s:&&str) ->bool{ s.chars().any(\|c\|is_alphanumeric(c))}`
`679`		`-let has_alphanumeric:fn(&&str) ->bool = has_alphanumeric;// coerce to fn pointer`
	`705`	`+ s.chars().any(\|c\|is_alphanumeric(c))`
	`706`	`+}`
	`707`	`+`
	`708`	`+#[inline]`
	`709`	`+pubfnnew_unicode_words<'b>(s:&'bstr) ->UnicodeWords<'b>{`
	`710`	`+usesuper::UnicodeSegmentation;`
`680`	`711`
`681`	`712`	`UnicodeWords{inner: s.split_word_bounds().filter(has_alphanumeric)}`
`682`	`713`	`}`
	`714`	`+`
	`715`	`+#[inline]`
	`716`	`+pubfnnew_unicode_word_indices<'b>(s:&'bstr) ->UnicodeWordIndices<'b>{`
	`717`	`+usesuper::UnicodeSegmentation;`
	`718`	`+`
	`719`	`+UnicodeWordIndices{inner: s.split_word_bound_indices().filter(\|(_, c)\|has_alphanumeric(c))}`
	`720`	`+}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit cea3ce6

File tree

2 files changed

2 files changed

`‎src/lib.rs‎`

`‎src/word.rs‎`

0 commit comments