Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 7beb8a6

Browse files
committed
add backwards iterator
1 parent 6f96a23 commit 7beb8a6

File tree

2 files changed

+113
-31
lines changed

2 files changed

+113
-31
lines changed

‎src/lib.rs

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -138,7 +138,7 @@ pub trait UnicodeSegmentation {
138138
///
139139
/// assert_eq!(&uw1[..], b);
140140
/// ```
141-
fn unicode_words(&self) -> UnicodeWords;
141+
fn unicode_words(&self) -> UnicodeWords<'_>;
142142

143143
/// Returns an iterator over the words of `self`, separated on
144144
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
@@ -162,7 +162,7 @@ pub trait UnicodeSegmentation {
162162
///
163163
/// assert_eq!(&uwi1[..], b);
164164
/// ```
165-
fn unicode_word_indices(&self) -> UnicodeWordIndices;
165+
fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>;
166166

167167
/// Returns an iterator over substrings of `self` separated on
168168
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
@@ -178,7 +178,7 @@ pub trait UnicodeSegmentation {
178178
///
179179
/// assert_eq!(&swu1[..], b);
180180
/// ```
181-
fn split_word_bounds(&self) -> UWordBounds;
181+
fn split_word_bounds(&self) -> UWordBounds<'_>;
182182

183183
/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
184184
/// and their offsets. See `split_word_bounds()` for more information.
@@ -193,7 +193,7 @@ pub trait UnicodeSegmentation {
193193
///
194194
/// assert_eq!(&swi1[..], b);
195195
/// ```
196-
fn split_word_bound_indices(&self) -> UWordBoundIndices;
196+
fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>;
197197

198198
/// Returns an iterator over substrings of `self` separated on
199199
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -215,7 +215,7 @@ pub trait UnicodeSegmentation {
215215
///
216216
/// assert_eq!(&us1[..], b);
217217
/// ```
218-
fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str>;
218+
fn unicode_sentences(&self) -> UnicodeSentences<'_>;
219219

220220
/// Returns an iterator over substrings of `self` separated on
221221
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -253,7 +253,7 @@ pub trait UnicodeSegmentation {
253253

254254
impl UnicodeSegmentation for str {
255255
#[inline]
256-
fn graphemes(&self, is_extended: bool) -> Graphemes {
256+
fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
257257
grapheme::new_graphemes(self, is_extended)
258258
}
259259

@@ -263,32 +263,32 @@ impl UnicodeSegmentation for str {
263263
}
264264

265265
#[inline]
266-
fn unicode_words(&self) -> UnicodeWords {
266+
fn unicode_words(&self) -> UnicodeWords<'_> {
267267
word::new_unicode_words(self)
268268
}
269269

270270
#[inline]
271-
fn unicode_word_indices(&self) -> UnicodeWordIndices {
271+
fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
272272
word::new_unicode_word_indices(self)
273273
}
274274

275275
#[inline]
276-
fn split_word_bounds(&self) -> UWordBounds {
276+
fn split_word_bounds(&self) -> UWordBounds<'_> {
277277
word::new_word_bounds(self)
278278
}
279279

280280
#[inline]
281-
fn split_word_bound_indices(&self) -> UWordBoundIndices {
281+
fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
282282
word::new_word_bound_indices(self)
283283
}
284284

285285
#[inline]
286-
fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str> {
286+
fn unicode_sentences(&self) -> UnicodeSentences<'_> {
287287
sentence::new_unicode_sentences(self)
288288
}
289289

290290
#[inline]
291-
fn split_sentence_bounds(&self) -> USentenceBounds {
291+
fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
292292
sentence::new_sentence_bounds(self)
293293
}
294294

‎src/word.rs

Lines changed: 101 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,6 @@
1111
extern crate alloc;
1212
use alloc::boxed::Box;
1313
use core::cmp;
14-
use core::iter::Filter;
1514

1615
use crate::tables::word::WordCat;
1716

@@ -28,7 +27,7 @@ use crate::tables::word::WordCat;
2827
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
2928
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
3029
pub struct UnicodeWords<'a> {
31-
inner: Box<dyn Iterator<Item = &'a str> + 'a>,
30+
inner: Box<dyn DoubleEndedIterator<Item = &'a str> + 'a>,
3231
}
3332

3433
impl<'a> Iterator for UnicodeWords<'a> {
@@ -45,6 +44,13 @@ impl<'a> Iterator for UnicodeWords<'a> {
4544
}
4645
}
4746

47+
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
48+
#[inline]
49+
fn next_back(&mut self) -> Option<&'a str> {
50+
self.inner.next_back()
51+
}
52+
}
53+
4854
/// An iterator over the substrings of a string which, after splitting the string on
4955
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
5056
/// contain any characters with the
@@ -58,16 +64,15 @@ impl<'a> Iterator for UnicodeWords<'a> {
5864
///
5965
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
6066
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
61-
#[derive(Debug)]
6267
pub struct UnicodeWordIndices<'a> {
6368
#[allow(clippy::type_complexity)]
64-
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
69+
inner: Box<dyn DoubleEndedIterator<Item = (usize, &'a str)> + 'a>,
6570
}
6671

6772
impl<'a> Iterator for UnicodeWordIndices<'a> {
6873
type Item = (usize, &'a str);
6974

70-
#[inline]
75+
#[inline(always)]
7176
fn next(&mut self) -> Option<(usize, &'a str)> {
7277
self.inner.next()
7378
}
@@ -722,12 +727,12 @@ impl<'a> AsciiWordBoundIter<'a> {
722727
AsciiWordBoundIter { rest: s, offset: 0 }
723728
}
724729

725-
#[inline(always)]
730+
#[inline]
726731
fn is_core(b: u8) -> bool {
727732
b.is_ascii_alphanumeric() || b == b'_'
728733
}
729734

730-
#[inline(always)]
735+
#[inline]
731736
fn is_infix(b: u8, prev: u8, next: u8) -> bool {
732737
match b {
733738
// numeric separators
@@ -744,6 +749,7 @@ impl<'a> AsciiWordBoundIter<'a> {
744749
impl<'a> Iterator for AsciiWordBoundIter<'a> {
745750
type Item = (usize, &'a str);
746751

752+
#[inline]
747753
fn next(&mut self) -> Option<Self::Item> {
748754
if self.rest.is_empty() {
749755
return None;
@@ -802,6 +808,66 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
802808
}
803809
}
804810

811+
impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
812+
fn next_back(&mut self) -> Option<(usize, &'a str)> {
813+
let rest = self.rest;
814+
if rest.is_empty() {
815+
return None;
816+
}
817+
let bytes = rest.as_bytes();
818+
let len = bytes.len();
819+
820+
// 1) Trailing spaces
821+
if bytes[len - 1] == b' ' {
822+
// find start of this last run of spaces
823+
let mut start = len - 1;
824+
while start > 0 && bytes[start - 1] == b' ' {
825+
start -= 1;
826+
}
827+
let word = &rest[start..];
828+
let pos = self.offset + start;
829+
self.rest = &rest[..start];
830+
return Some((pos, word));
831+
}
832+
833+
// 2) Trailing core-run (letters/digits/underscore + infix)
834+
if Self::is_core(bytes[len - 1]) {
835+
// scan backwards as long as we see `is_core` or an `is_infix`
836+
let mut start = len - 1;
837+
while start > 0 {
838+
let b = bytes[start - 1];
839+
let prev = if start >= 2 { bytes[start - 2] } else { b };
840+
let next = bytes[start]; // the byte we just included
841+
if Self::is_core(b) || Self::is_infix(b, prev, next) {
842+
start -= 1;
843+
} else {
844+
break;
845+
}
846+
}
847+
let word = &rest[start..];
848+
let pos = self.offset + start;
849+
self.rest = &rest[..start];
850+
return Some((pos, word));
851+
}
852+
853+
// 3) CR+LF at end
854+
if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' {
855+
let start = len - 2;
856+
let word = &rest[start..];
857+
let pos = self.offset + start;
858+
self.rest = &rest[..start];
859+
return Some((pos, word));
860+
}
861+
862+
// 4) Single non-core byte
863+
let start = len - 1;
864+
let word = &rest[start..];
865+
let pos = self.offset + start;
866+
self.rest = &rest[..start];
867+
Some((pos, word))
868+
}
869+
}
870+
805871
#[inline]
806872
pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
807873
UWordBounds {
@@ -832,20 +898,25 @@ fn has_alphanumeric(s: &&str) -> bool {
832898
}
833899

834900
#[inline]
835-
fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator<Item = &'a str> + 'a {
901+
fn has_ascii_alphanumeric(s: &&str) -> bool {
902+
s.chars().any(|c| c.is_ascii_alphanumeric())
903+
}
904+
905+
#[inline]
906+
fn new_unicode_words_ascii<'a>(s: &'a str) -> impl DoubleEndedIterator<Item = &'a str> + 'a {
836907
new_ascii_word_bound_indices(s)
837908
.map(|(_, w)| w)
838-
.filter(|w| w.chars().any(|c| c.is_ascii_alphanumeric()))
909+
.filter(has_ascii_alphanumeric)
839910
}
840911

841912
#[inline]
842-
fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator<Item = &'a str> + 'a {
913+
fn new_unicode_words_general<'a>(s: &'a str) -> impl DoubleEndedIterator<Item = &'a str> + 'a {
843914
new_word_bounds(s).filter(has_alphanumeric)
844915
}
845916

846917
#[inline]
847918
pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
848-
let iter: Box<dyn Iterator<Item = &str>> = if s.is_ascii() {
919+
let iter: Box<dyn DoubleEndedIterator<Item = &str>> = if s.is_ascii() {
849920
Box::new(new_unicode_words_ascii(s))
850921
} else {
851922
Box::new(new_unicode_words_general(s))
@@ -855,14 +926,13 @@ pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
855926
}
856927

857928
#[inline]
858-
pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
859-
use super::UnicodeSegmentation;
860-
861-
UnicodeWordIndices {
862-
inner: s
863-
.split_word_bound_indices()
864-
.filter(|(_, c)| has_alphanumeric(c)),
865-
}
929+
pub fn new_unicode_word_indices<'a>(s: &'a str) -> UnicodeWordIndices<'a> {
930+
let iter: Box<dyn DoubleEndedIterator<Item = (usize, &str)>> = if s.is_ascii() {
931+
Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w)))
932+
} else {
933+
Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w)))
934+
};
935+
UnicodeWordIndices { inner: iter }
866936
}
867937

868938
#[cfg(test)]
@@ -921,5 +991,17 @@ mod tests {
921991

922992
prop_assert_eq!(fast, uni);
923993
}
994+
995+
/// Fast path must equal general path for any ASCII input when iterated in reverse.
996+
#[test]
997+
fn proptest_ascii_matches_unicode_word_indices_rev(
998+
// Vec<char> → String, length 0‒99
999+
s in proptest::collection::vec(ascii_char(), 0..100)
1000+
.prop_map(|v| v.into_iter().collect::<String>())
1001+
) {
1002+
let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
1003+
let uni_rev: Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
1004+
prop_assert_eq!(fast_rev, uni_rev);
1005+
}
9241006
}
9251007
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp