11
11
extern crate alloc;
12
12
use alloc:: boxed:: Box ;
13
13
use core:: cmp;
14
- use core:: iter:: Filter ;
15
14
16
15
use crate :: tables:: word:: WordCat ;
17
16
@@ -28,7 +27,7 @@ use crate::tables::word::WordCat;
28
27
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
29
28
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
30
29
pub struct UnicodeWords<'a> {
    // Type-erased word iterator. It must be double-ended so that
    // `UnicodeWords` itself can be walked from the back as well.
    inner: Box<dyn DoubleEndedIterator<Item = &'a str> + 'a>,
}
33
32
34
33
impl < ' a > Iterator for UnicodeWords < ' a > {
@@ -45,6 +44,13 @@ impl<'a> Iterator for UnicodeWords<'a> {
45
44
}
46
45
}
47
46
47
+ impl < ' a > DoubleEndedIterator for UnicodeWords < ' a > {
48
+ #[ inline]
49
+ fn next_back ( & mut self ) ->Option < & ' a str > {
50
+ self . inner . next_back ( )
51
+ }
52
+ }
53
+
48
54
/// An iterator over the substrings of a string which, after splitting the string on
49
55
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
50
56
/// contain any characters with the
@@ -58,16 +64,15 @@ impl<'a> Iterator for UnicodeWords<'a> {
58
64
///
59
65
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
60
66
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
61
- #[ derive( Debug ) ]
62
67
pub struct UnicodeWordIndices<'a> {
    // Type-erased source of `(offset, word)` pairs. It must be double-ended so
    // that the wrapper can also be iterated from the back.
    #[allow(clippy::type_complexity)]
    inner: Box<dyn DoubleEndedIterator<Item = (usize, &'a str)> + 'a>,
}

// A boxed trait object has no `Debug` impl, so `#[derive(Debug)]` cannot be
// used on this struct. Implement it by hand so that code which formats or
// derives `Debug` over this public type keeps compiling.
impl<'a> core::fmt::Debug for UnicodeWordIndices<'a> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The inner iterator is opaque; print only the type name.
        f.write_str("UnicodeWordIndices { .. }")
    }
}
66
71
67
72
impl < ' a > Iterator for UnicodeWordIndices < ' a > {
68
73
type Item =( usize , & ' a str ) ;
69
74
70
- #[ inline]
75
+ #[ inline( always ) ]
71
76
fn next ( & mut self ) ->Option < ( usize , & ' a str ) > {
72
77
self . inner . next ( )
73
78
}
@@ -722,12 +727,12 @@ impl<'a> AsciiWordBoundIter<'a> {
722
727
AsciiWordBoundIter { rest : s, offset : 0 }
723
728
}
724
729
725
- #[ inline( always ) ]
730
+ #[ inline]
726
731
/// Returns `true` when `b` is an ASCII letter, an ASCII digit, or `_`.
fn is_core(b: u8) -> bool {
    match b {
        b'_' => true,
        other => other.is_ascii_alphanumeric(),
    }
}
729
734
730
- #[ inline( always ) ]
735
+ #[ inline]
731
736
fn is_infix ( b : u8 , prev : u8 , next : u8 ) ->bool {
732
737
match b{
733
738
// numeric separators
@@ -744,6 +749,7 @@ impl<'a> AsciiWordBoundIter<'a> {
744
749
impl < ' a > Iterator for AsciiWordBoundIter < ' a > {
745
750
type Item =( usize , & ' a str ) ;
746
751
752
+ #[ inline]
747
753
fn next ( & mut self ) ->Option < Self :: Item > {
748
754
if self . rest . is_empty ( ) {
749
755
return None ;
@@ -802,6 +808,66 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
802
808
}
803
809
}
804
810
811
+ impl < ' a > DoubleEndedIterator for AsciiWordBoundIter < ' a > {
812
+ fn next_back ( & mut self ) ->Option < ( usize , & ' a str ) > {
813
+ let rest =self . rest ;
814
+ if rest. is_empty ( ) {
815
+ return None ;
816
+ }
817
+ let bytes = rest. as_bytes ( ) ;
818
+ let len = bytes. len ( ) ;
819
+
820
+ // 1) Trailing spaces
821
+ if bytes[ len -1 ] ==b' ' {
822
+ // find start of this last run of spaces
823
+ let mut start = len -1 ;
824
+ while start >0 && bytes[ start -1 ] ==b' ' {
825
+ start -=1 ;
826
+ }
827
+ let word =& rest[ start..] ;
828
+ let pos =self . offset + start;
829
+ self . rest =& rest[ ..start] ;
830
+ return Some ( ( pos, word) ) ;
831
+ }
832
+
833
+ // 2) Trailing core-run (letters/digits/underscore + infix)
834
+ if Self :: is_core ( bytes[ len -1 ] ) {
835
+ // scan backwards as long as we see `is_core` or an `is_infix`
836
+ let mut start = len -1 ;
837
+ while start >0 {
838
+ let b = bytes[ start -1 ] ;
839
+ let prev =if start >=2 { bytes[ start -2 ] } else { b} ;
840
+ let next = bytes[ start] ; // the byte we just included
841
+ if Self :: is_core ( b) ||Self :: is_infix ( b, prev, next) {
842
+ start -=1 ;
843
+ } else {
844
+ break ;
845
+ }
846
+ }
847
+ let word =& rest[ start..] ;
848
+ let pos =self . offset + start;
849
+ self . rest =& rest[ ..start] ;
850
+ return Some ( ( pos, word) ) ;
851
+ }
852
+
853
+ // 3) CR+LF at end
854
+ if len >=2 && bytes[ len -2 ] ==b'\r' && bytes[ len -1 ] ==b'\n' {
855
+ let start = len -2 ;
856
+ let word =& rest[ start..] ;
857
+ let pos =self . offset + start;
858
+ self . rest =& rest[ ..start] ;
859
+ return Some ( ( pos, word) ) ;
860
+ }
861
+
862
+ // 4) Single non-core byte
863
+ let start = len -1 ;
864
+ let word =& rest[ start..] ;
865
+ let pos =self . offset + start;
866
+ self . rest =& rest[ ..start] ;
867
+ Some ( ( pos, word) )
868
+ }
869
+ }
870
+
805
871
#[ inline]
806
872
pub fn new_word_bounds ( s : & str ) ->UWordBounds < ' _ > {
807
873
UWordBounds {
@@ -832,20 +898,25 @@ fn has_alphanumeric(s: &&str) -> bool {
832
898
}
833
899
834
900
#[ inline]
835
- fn new_unicode_words_ascii < ' a > ( s : & ' a str ) ->impl Iterator < Item =& ' a str > +' a {
901
/// Returns `true` if `s` contains at least one ASCII letter or ASCII digit.
fn has_ascii_alphanumeric(s: &&str) -> bool {
    // Bytes of multi-byte characters are never ASCII alphanumerics, so a
    // byte scan gives the same answer as a char scan.
    s.bytes().any(|byte| byte.is_ascii_alphanumeric())
}
904
+
905
+ #[ inline]
906
+ fn new_unicode_words_ascii < ' a > ( s : & ' a str ) ->impl DoubleEndedIterator < Item =& ' a str > +' a {
836
907
new_ascii_word_bound_indices ( s)
837
908
. map ( |( _, w) | w)
838
- . filter ( |w| w . chars ( ) . any ( |c| c . is_ascii_alphanumeric ( ) ) )
909
+ . filter ( has_ascii_alphanumeric )
839
910
}
840
911
841
912
#[ inline]
842
- fn new_unicode_words_general < ' a > ( s : & ' a str ) ->impl Iterator < Item =& ' a str > +' a {
913
+ fn new_unicode_words_general < ' a > ( s : & ' a str ) ->impl DoubleEndedIterator < Item =& ' a str > +' a {
843
914
new_word_bounds ( s) . filter ( has_alphanumeric)
844
915
}
845
916
846
917
#[ inline]
847
918
pub fn new_unicode_words ( s : & str ) ->UnicodeWords < ' _ > {
848
- let iter: Box < dyn Iterator < Item =& str > > =if s. is_ascii ( ) {
919
+ let iter: Box < dyn DoubleEndedIterator < Item =& str > > =if s. is_ascii ( ) {
849
920
Box :: new ( new_unicode_words_ascii ( s) )
850
921
} else {
851
922
Box :: new ( new_unicode_words_general ( s) )
@@ -855,14 +926,13 @@ pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
855
926
}
856
927
857
928
#[ inline]
858
- pub fn new_unicode_word_indices ( s : & str ) ->UnicodeWordIndices < ' _ > {
859
- use super :: UnicodeSegmentation ;
860
-
861
- UnicodeWordIndices {
862
- inner : s
863
- . split_word_bound_indices ( )
864
- . filter ( |( _, c) |has_alphanumeric ( c) ) ,
865
- }
929
+ pub fn new_unicode_word_indices < ' a > ( s : & ' a str ) ->UnicodeWordIndices < ' a > {
930
+ let iter: Box < dyn DoubleEndedIterator < Item =( usize , & str ) > > =if s. is_ascii ( ) {
931
+ Box :: new ( new_ascii_word_bound_indices ( s) . filter ( |( _, w) |has_ascii_alphanumeric ( w) ) )
932
+ } else {
933
+ Box :: new ( new_word_bound_indices ( s) . filter ( |( _, w) |has_alphanumeric ( w) ) )
934
+ } ;
935
+ UnicodeWordIndices { inner : iter}
866
936
}
867
937
868
938
#[ cfg( test) ]
@@ -921,5 +991,17 @@ mod tests {
921
991
922
992
prop_assert_eq!( fast, uni) ;
923
993
}
994
+
995
+ /// Fast path must equal general path for any ASCII input, forwards and backwards.
996
+ #[ test]
997
+ fn proptest_ascii_matches_unicode_word_indices_rev(
998
+ // Vec<char> → String, length 0‒99
999
+ s in proptest:: collection:: vec( ascii_char( ) , 0 ..100 )
1000
+ . prop_map( |v| v. into_iter( ) . collect:: <String >( ) )
1001
+ ) {
1002
+ let fast_rev: Vec <( usize , & str ) > = new_ascii_word_bound_indices( & s) . rev( ) . collect( ) ;
1003
+ let uni_rev: Vec <( usize , & str ) > = new_word_bound_indices( & s) . rev( ) . collect( ) ;
1004
+ prop_assert_eq!( fast_rev, uni_rev) ;
1005
+ }
924
1006
}
925
1007
}