Commit 7beb8a6

committed

add backwards iterator

1 parent 6f96a23, commit 7beb8a6 (Copy full SHA for 7beb8a6)

File tree

2 files changed

+113

-31

lines changed

src
- lib.rs
- word.rs

2 files changed

+113

-31

lines changed

`‎src/lib.rs`

Lines changed: 12 additions & 12 deletions

Original file line number	Diff line number	Diff line change
`@@ -138,7 +138,7 @@ pub trait UnicodeSegmentation {`
`138`	`138`	`///`
`139`	`139`	`/// assert_eq!(&uw1[..], b);`
`140`	`140`	/// ```
`141`		`-fnunicode_words(&self) ->UnicodeWords;`
	`141`	`+fnunicode_words(&self) ->UnicodeWords<'_>;`
`142`	`142`
`143`	`143`	/// Returns an iterator over the words of `self`, separated on
`144`	`144`	`/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their`
`@@ -162,7 +162,7 @@ pub trait UnicodeSegmentation {`
`162`	`162`	`///`
`163`	`163`	`/// assert_eq!(&uwi1[..], b);`
`164`	`164`	/// ```
`165`		`-fnunicode_word_indices(&self) ->UnicodeWordIndices;`
	`165`	`+fnunicode_word_indices(&self) ->UnicodeWordIndices<'_>;`
`166`	`166`
`167`	`167`	/// Returns an iterator over substrings of `self` separated on
`168`	`168`	`/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).`
`@@ -178,7 +178,7 @@ pub trait UnicodeSegmentation {`
`178`	`178`	`///`
`179`	`179`	`/// assert_eq!(&swu1[..], b);`
`180`	`180`	/// ```
`181`		`-fnsplit_word_bounds(&self) ->UWordBounds;`
	`181`	`+fnsplit_word_bounds(&self) ->UWordBounds<'_>;`
`182`	`182`
`183`	`183`	/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
`184`	`184`	/// and their offsets. See `split_word_bounds()` for more information.
`@@ -193,7 +193,7 @@ pub trait UnicodeSegmentation {`
`193`	`193`	`///`
`194`	`194`	`/// assert_eq!(&swi1[..], b);`
`195`	`195`	/// ```
`196`		`-fnsplit_word_bound_indices(&self) ->UWordBoundIndices;`
	`196`	`+fnsplit_word_bound_indices(&self) ->UWordBoundIndices<'_>;`
`197`	`197`
`198`	`198`	/// Returns an iterator over substrings of `self` separated on
`199`	`199`	`/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).`
`@@ -215,7 +215,7 @@ pub trait UnicodeSegmentation {`
`215`	`215`	`///`
`216`	`216`	`/// assert_eq!(&us1[..], b);`
`217`	`217`	/// ```
`218`		`-fnunicode_sentences(&self) ->implIterator<Item =&'_str>;`
	`218`	`+fnunicode_sentences(&self) ->UnicodeSentences<'_>;`
`219`	`219`
`220`	`220`	/// Returns an iterator over substrings of `self` separated on
`221`	`221`	`/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).`
`@@ -253,7 +253,7 @@ pub trait UnicodeSegmentation {`
`253`	`253`
`254`	`254`	`implUnicodeSegmentationforstr{`
`255`	`255`	`#[inline]`
`256`		`-fngraphemes(&self,is_extended:bool) ->Graphemes{`
	`256`	`+fngraphemes(&self,is_extended:bool) ->Graphemes<'_>{`
`257`	`257`	`grapheme::new_graphemes(self, is_extended)`
`258`	`258`	`}`
`259`	`259`
`@@ -263,32 +263,32 @@ impl UnicodeSegmentation for str {`
`263`	`263`	`}`
`264`	`264`
`265`	`265`	`#[inline]`
`266`		`-fnunicode_words(&self) ->UnicodeWords{`
	`266`	`+fnunicode_words(&self) ->UnicodeWords<'_>{`
`267`	`267`	`word::new_unicode_words(self)`
`268`	`268`	`}`
`269`	`269`
`270`	`270`	`#[inline]`
`271`		`-fnunicode_word_indices(&self) ->UnicodeWordIndices{`
	`271`	`+fnunicode_word_indices(&self) ->UnicodeWordIndices<'_>{`
`272`	`272`	`word::new_unicode_word_indices(self)`
`273`	`273`	`}`
`274`	`274`
`275`	`275`	`#[inline]`
`276`		`-fnsplit_word_bounds(&self) ->UWordBounds{`
	`276`	`+fnsplit_word_bounds(&self) ->UWordBounds<'_>{`
`277`	`277`	`word::new_word_bounds(self)`
`278`	`278`	`}`
`279`	`279`
`280`	`280`	`#[inline]`
`281`		`-fnsplit_word_bound_indices(&self) ->UWordBoundIndices{`
	`281`	`+fnsplit_word_bound_indices(&self) ->UWordBoundIndices<'_>{`
`282`	`282`	`word::new_word_bound_indices(self)`
`283`	`283`	`}`
`284`	`284`
`285`	`285`	`#[inline]`
`286`		`-fnunicode_sentences(&self) ->implIterator<Item =&'_str>{`
	`286`	`+fnunicode_sentences(&self) ->UnicodeSentences<'_>{`
`287`	`287`	`sentence::new_unicode_sentences(self)`
`288`	`288`	`}`
`289`	`289`
`290`	`290`	`#[inline]`
`291`		`-fnsplit_sentence_bounds(&self) ->USentenceBounds{`
	`291`	`+fnsplit_sentence_bounds(&self) ->USentenceBounds<'_>{`
`292`	`292`	`sentence::new_sentence_bounds(self)`
`293`	`293`	`}`
`294`	`294`

`‎src/word.rs`

Lines changed: 101 additions & 19 deletions

Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,6 @@`
`11`	`11`	`externcrate alloc;`
`12`	`12`	`use alloc::boxed::Box;`
`13`	`13`	`use core::cmp;`
`14`		`-use core::iter::Filter;`
`15`	`14`
`16`	`15`	`usecrate::tables::word::WordCat;`
`17`	`16`
`@@ -28,7 +27,7 @@ use crate::tables::word::WordCat;`
`28`	`27`	/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
`29`	`28`	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
`30`	`29`	`pubstructUnicodeWords<'a>{`
`31`		`-inner:Box<dynIterator<Item =&'astr> +'a>,`
	`30`	`+inner:Box<dynDoubleEndedIterator<Item =&'astr> +'a>,`
`32`	`31`	`}`
`33`	`32`
`34`	`33`	`impl<'a>IteratorforUnicodeWords<'a>{`
`@@ -45,6 +44,13 @@ impl<'a> Iterator for UnicodeWords<'a> {`
`45`	`44`	`}`
`46`	`45`	`}`
`47`	`46`
	`47`	`+impl<'a>DoubleEndedIteratorforUnicodeWords<'a>{`
	`48`	`+#[inline]`
	`49`	`+fnnext_back(&mutself) ->Option<&'astr>{`
	`50`	`+self.inner.next_back()`
	`51`	`+}`
	`52`	`+}`
	`53`	`+`
`48`	`54`	`/// An iterator over the substrings of a string which, after splitting the string on`
`49`	`55`	`/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),`
`50`	`56`	`/// contain any characters with the`
`@@ -58,16 +64,15 @@ impl<'a> Iterator for UnicodeWords<'a> {`
`58`	`64`	`///`
`59`	`65`	/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
`60`	`66`	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
`61`		`-#[derive(Debug)]`
`62`	`67`	`pubstructUnicodeWordIndices<'a>{`
`63`	`68`	`#[allow(clippy::type_complexity)]`
`64`		`-inner:Filter<UWordBoundIndices<'a>,fn(&(usize,&str)) ->bool>,`
	`69`	`+inner:Box<dynDoubleEndedIterator<Item =(usize,&'astr)> +'a>,`
`65`	`70`	`}`
`66`	`71`
`67`	`72`	`impl<'a>IteratorforUnicodeWordIndices<'a>{`
`68`	`73`	`typeItem =(usize,&'astr);`
`69`	`74`
`70`		`-#[inline]`
	`75`	`+#[inline(always)]`
`71`	`76`	`fnnext(&mutself) ->Option<(usize,&'astr)>{`
`72`	`77`	`self.inner.next()`
`73`	`78`	`}`
`@@ -722,12 +727,12 @@ impl<'a> AsciiWordBoundIter<'a> {`
`722`	`727`	`AsciiWordBoundIter{rest: s,offset:0}`
`723`	`728`	`}`
`724`	`729`
`725`		`-#[inline(always)]`
	`730`	`+#[inline]`
`726`	`731`	`fnis_core(b:u8) ->bool{`
`727`	`732`	`b.is_ascii_alphanumeric() \|\| b ==b'_'`
`728`	`733`	`}`
`729`	`734`
`730`		`-#[inline(always)]`
	`735`	`+#[inline]`
`731`	`736`	`fnis_infix(b:u8,prev:u8,next:u8) ->bool{`
`732`	`737`	`match b{`
`733`	`738`	`// numeric separators`
`@@ -744,6 +749,7 @@ impl<'a> AsciiWordBoundIter<'a> {`
`744`	`749`	`impl<'a>IteratorforAsciiWordBoundIter<'a>{`
`745`	`750`	`typeItem =(usize,&'astr);`
`746`	`751`
	`752`	`+#[inline]`
`747`	`753`	`fnnext(&mutself) ->Option<Self::Item>{`
`748`	`754`	`ifself.rest.is_empty(){`
`749`	`755`	`returnNone;`
`@@ -802,6 +808,66 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {`
`802`	`808`	`}`
`803`	`809`	`}`
`804`	`810`
	`811`	`+impl<'a>DoubleEndedIteratorforAsciiWordBoundIter<'a>{`
	`812`	`+fnnext_back(&mutself) ->Option<(usize,&'astr)>{`
	`813`	`+let rest =self.rest;`
	`814`	`+if rest.is_empty(){`
	`815`	`+returnNone;`
	`816`	`+}`
	`817`	`+let bytes = rest.as_bytes();`
	`818`	`+let len = bytes.len();`
	`819`	`+`
	`820`	`+// 1) Trailing spaces`
	`821`	`+if bytes[len -1] ==b' '{`
	`822`	`+// find start of this last run of spaces`
	`823`	`+letmut start = len -1;`
	`824`	`+while start >0 && bytes[start -1] ==b' '{`
	`825`	`+ start -=1;`
	`826`	`+}`
	`827`	`+let word =&rest[start..];`
	`828`	`+let pos =self.offset + start;`
	`829`	`+self.rest =&rest[..start];`
	`830`	`+returnSome((pos, word));`
	`831`	`+}`
	`832`	`+`
	`833`	`+// 2) Trailing core-run (letters/digits/underscore + infix)`
	`834`	`+ifSelf::is_core(bytes[len -1]){`
	`835`	+// scan backwards as long as we see `is_core` or an `is_infix`
	`836`	`+letmut start = len -1;`
	`837`	`+while start >0{`
	`838`	`+let b = bytes[start -1];`
	`839`	`+let prev =if start >=2{ bytes[start -2]}else{ b};`
	`840`	`+let next = bytes[start];// the byte we just included`
	`841`	`+ifSelf::is_core(b) \|\|Self::is_infix(b, prev, next){`
	`842`	`+ start -=1;`
	`843`	`+}else{`
	`844`	`+break;`
	`845`	`+}`
	`846`	`+}`
	`847`	`+let word =&rest[start..];`
	`848`	`+let pos =self.offset + start;`
	`849`	`+self.rest =&rest[..start];`
	`850`	`+returnSome((pos, word));`
	`851`	`+}`
	`852`	`+`
	`853`	`+// 3) CR+LF at end`
	`854`	`+if len >=2 && bytes[len -2] ==b'\r' && bytes[len -1] ==b'\n'{`
	`855`	`+let start = len -2;`
	`856`	`+let word =&rest[start..];`
	`857`	`+let pos =self.offset + start;`
	`858`	`+self.rest =&rest[..start];`
	`859`	`+returnSome((pos, word));`
	`860`	`+}`
	`861`	`+`
	`862`	`+// 4) Single non-core byte`
	`863`	`+let start = len -1;`
	`864`	`+let word =&rest[start..];`
	`865`	`+let pos =self.offset + start;`
	`866`	`+self.rest =&rest[..start];`
	`867`	`+Some((pos, word))`
	`868`	`+}`
	`869`	`+}`
	`870`	`+`
`805`	`871`	`#[inline]`
`806`	`872`	`pubfnnew_word_bounds(s:&str) ->UWordBounds<'_>{`
`807`	`873`	`UWordBounds{`
`@@ -832,20 +898,25 @@ fn has_alphanumeric(s: &&str) -> bool {`
`832`	`898`	`}`
`833`	`899`
`834`	`900`	`#[inline]`
`835`		`-fnnew_unicode_words_ascii<'a>(s:&'astr) ->implIterator<Item =&'astr> +'a{`
	`901`	`+fnhas_ascii_alphanumeric(s:&&str) ->bool{`
	`902`	`+ s.chars().any(\|c\| c.is_ascii_alphanumeric())`
	`903`	`+}`
	`904`	`+`
	`905`	`+#[inline]`
	`906`	`+fnnew_unicode_words_ascii<'a>(s:&'astr) ->implDoubleEndedIterator<Item =&'astr> +'a{`
`836`	`907`	`new_ascii_word_bound_indices(s)`
`837`	`908`	`.map(\|(_, w)\| w)`
`838`		`-.filter(\|w\| w.chars().any(\|c\| c.is_ascii_alphanumeric()))`
	`909`	`+.filter(has_ascii_alphanumeric)`
`839`	`910`	`}`
`840`	`911`
`841`	`912`	`#[inline]`
`842`		`-fnnew_unicode_words_general<'a>(s:&'astr) ->implIterator<Item =&'astr> +'a{`
	`913`	`+fnnew_unicode_words_general<'a>(s:&'astr) ->implDoubleEndedIterator<Item =&'astr> +'a{`
`843`	`914`	`new_word_bounds(s).filter(has_alphanumeric)`
`844`	`915`	`}`
`845`	`916`
`846`	`917`	`#[inline]`
`847`	`918`	`pubfnnew_unicode_words(s:&str) ->UnicodeWords<'_>{`
`848`		`-let iter:Box<dynIterator<Item =&str>> =if s.is_ascii(){`
	`919`	`+let iter:Box<dynDoubleEndedIterator<Item =&str>> =if s.is_ascii(){`
`849`	`920`	`Box::new(new_unicode_words_ascii(s))`
`850`	`921`	`}else{`
`851`	`922`	`Box::new(new_unicode_words_general(s))`
`@@ -855,14 +926,13 @@ pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {`
`855`	`926`	`}`
`856`	`927`
`857`	`928`	`#[inline]`
`858`		`-pubfnnew_unicode_word_indices(s:&str) ->UnicodeWordIndices<'_>{`
`859`		`-usesuper::UnicodeSegmentation;`
`860`		`-`
`861`		`-UnicodeWordIndices{`
`862`		`-inner: s`
`863`		`-.split_word_bound_indices()`
`864`		`-.filter(\|(_, c)\|has_alphanumeric(c)),`
`865`		`-}`
	`929`	`+pubfnnew_unicode_word_indices<'a>(s:&'astr) ->UnicodeWordIndices<'a>{`
	`930`	`+let iter:Box<dynDoubleEndedIterator<Item =(usize,&str)>> =if s.is_ascii(){`
	`931`	`+Box::new(new_ascii_word_bound_indices(s).filter(\|(_, w)\|has_ascii_alphanumeric(w)))`
	`932`	`+}else{`
	`933`	`+Box::new(new_word_bound_indices(s).filter(\|(_, w)\|has_alphanumeric(w)))`
	`934`	`+};`
	`935`	`+UnicodeWordIndices{inner: iter}`
`866`	`936`	`}`
`867`	`937`
`868`	`938`	`#[cfg(test)]`
`@@ -921,5 +991,17 @@ mod tests {`
`921`	`991`
`922`	`992`	`prop_assert_eq!(fast, uni);`
`923`	`993`	`}`
	`994`	`+`
	`995`	`+/// Fast path must equal general path for any ASCII input, forwards and backwards.`
	`996`	`+ #[test]`
	`997`	`+fn proptest_ascii_matches_unicode_word_indices_rev(`
	`998`	`+// Vec<char> → String, length 0‒99`
	`999`	`+ s in proptest::collection::vec(ascii_char(),0..100)`
	`1000`	`+.prop_map(\|v\| v.into_iter().collect::<String>())`
	`1001`	`+){`
	`1002`	`+let fast_rev:Vec<(usize,&str)> = new_ascii_word_bound_indices(&s).rev().collect();`
	`1003`	`+let uni_rev:Vec<(usize,&str)> = new_word_bound_indices(&s).rev().collect();`
	`1004`	`+ prop_assert_eq!(fast_rev, uni_rev);`
	`1005`	`+}`
`924`	`1006`	`}`
`925`	`1007`	`}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 7beb8a6

File tree

2 files changed

2 files changed

`‎src/lib.rs`

`‎src/word.rs`

0 commit comments