@@ -16,6 +16,8 @@ mod fwd {
1616use tables:: sentence:: SentenceCat ;
1717use core:: cmp;
1818
19+ // Describe a parsed part of source string as described in this table:
20+ // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
1921#[ derive( Clone , Copy , PartialEq , Eq ) ]
2022enum StatePart {
2123Sot ,
@@ -49,6 +51,8 @@ mod fwd {
4951}
5052
5153impl SentenceBreaksState {
54+ // Attempt to advance the internal state by one part
55+ // Whitespace and some punctuation will be collapsed
5256fn next ( & self , cat : SentenceCat ) ->SentenceBreaksState {
5357let & SentenceBreaksState ( parts) =self ;
5458let parts =match ( parts[ 3 ] , cat) {
@@ -85,27 +89,28 @@ mod fwd {
8589] )
8690}
8791
92+ // Helper function to check if state head matches a single `StatePart`
8893fn match1 ( & self , part : StatePart ) ->bool {
8994let & SentenceBreaksState ( parts) =self ;
9095 part == parts[ 3 ]
9196}
9297
98+ // Helper function to check if first two `StateParts` in state match
99+ // the given two
93100fn match2 ( & self , part1 : StatePart , part2 : StatePart ) ->bool {
94101let & SentenceBreaksState ( parts) =self ;
95102 part1 == parts[ 2 ] && part2 == parts[ 3 ]
96103}
97104}
98105
106+ // https://unicode.org/reports/tr29/#SB8
107+ // TODO cache this, it is currently quadratic
99108fn match_sb8 ( state : & SentenceBreaksState , ahead : & str ) ->bool {
100- let aterm_part ={
101- // ATerm Close* Sp*
102- let & SentenceBreaksState ( parts) = state;
103- let mut idx =if parts[ 3 ] ==StatePart :: SpPlus { 2 } else { 3 } ;
104- if parts[ idx] ==StatePart :: ClosePlus { idx -=1 }
105- parts[ idx]
106- } ;
109+ let & SentenceBreaksState ( parts) = state;
110+ let mut idx =if parts[ 3 ] ==StatePart :: SpPlus { 2 } else { 3 } ;
111+ if parts[ idx] ==StatePart :: ClosePlus { idx -=1 }
107112
108- if aterm_part ==StatePart :: ATerm {
113+ if parts [ idx ] ==StatePart :: ATerm {
109114use tables:: sentenceas se;
110115
111116for next_charin ahead. chars ( ) {
@@ -124,6 +129,7 @@ mod fwd {
124129false
125130}
126131
132+ // https://unicode.org/reports/tr29/#SB8a
127133fn match_sb8a ( state : & SentenceBreaksState ) ->bool {
128134// SATerm Close* Sp*
129135let & SentenceBreaksState ( parts) = state;
@@ -132,13 +138,15 @@ mod fwd {
132138 parts[ idx] ==StatePart :: STerm || parts[ idx] ==StatePart :: ATerm
133139}
134140
141+ // https://unicode.org/reports/tr29/#SB9
135142fn match_sb9 ( state : & SentenceBreaksState ) ->bool {
136143// SATerm Close*
137144let & SentenceBreaksState ( parts) = state;
138145let idx =if parts[ 3 ] ==StatePart :: ClosePlus { 2 } else { 3 } ;
139146 parts[ idx] ==StatePart :: STerm || parts[ idx] ==StatePart :: ATerm
140147}
141148
149+ // https://unicode.org/reports/tr29/#SB11
142150fn match_sb11 ( state : & SentenceBreaksState ) ->bool {
143151// SATerm Close* Sp* ParaSep?
144152let & SentenceBreaksState ( parts) = state;
@@ -180,67 +188,69 @@ mod fwd {
180188self . state =self . state . next ( next_cat) ;
181189
182190match next_cat{
183- // SB1
191+ // SB1 https://unicode.org/reports/tr29/#SB1
184192 _if state_before. match1 ( StatePart :: Sot ) =>
185193return Some ( position_before) ,
186194
187- // SB3
195+ // SB2 is handled when inner iterator (chars) is finished
196+
197+ // SB3 https://unicode.org/reports/tr29/#SB3
188198SentenceCat :: SC_LF if state_before. match1 ( StatePart :: CR ) =>
189199continue ,
190200
191- // SB4
201+ // SB4 https://unicode.org/reports/tr29/#SB4
192202 _if state_before. match1 ( StatePart :: Sep )
193203 || state_before. match1 ( StatePart :: CR )
194204 || state_before. match1 ( StatePart :: LF )
195205 =>return Some ( position_before) ,
196206
197- // SB5
207+ // SB5 https://unicode.org/reports/tr29/#SB5
198208SentenceCat :: SC_Extend |
199209SentenceCat :: SC_Format =>self . state = state_before,
200210
201- // SB6
211+ // SB6 https://unicode.org/reports/tr29/#SB6
202212SentenceCat :: SC_Numeric if state_before. match1 ( StatePart :: ATerm ) =>
203213continue ,
204214
205- // SB7
215+ // SB7 https://unicode.org/reports/tr29/#SB7
206216SentenceCat :: SC_Upper if state_before. match2 ( StatePart :: UpperLower , StatePart :: ATerm ) =>
207217continue ,
208218
209- // SB8
219+ // SB8 https://unicode.org/reports/tr29/#SB8
210220 _if match_sb8 ( & state_before, & self . string [ position_before..] ) =>
211221continue ,
212222
213- // SB8a
223+ // SB8a https://unicode.org/reports/tr29/#SB8a
214224SentenceCat :: SC_SContinue |
215225SentenceCat :: SC_STerm |
216226SentenceCat :: SC_ATerm if match_sb8a ( & state_before) =>
217227continue ,
218228
219- // SB9
229+ // SB9 https://unicode.org/reports/tr29/#SB9
220230SentenceCat :: SC_Close |
221231SentenceCat :: SC_Sp |
222232SentenceCat :: SC_Sep |
223233SentenceCat :: SC_CR |
224234SentenceCat :: SC_LF if match_sb9 ( & state_before) =>
225235continue ,
226236
227- // SB10
237+ // SB10 https://unicode.org/reports/tr29/#SB10
228238SentenceCat :: SC_Sp |
229239SentenceCat :: SC_Sep |
230240SentenceCat :: SC_CR |
231241SentenceCat :: SC_LF if match_sb8a ( & state_before) =>
232242continue ,
233243
234- // SB11
244+ // SB11 https://unicode.org/reports/tr29/#SB11
235245 _if match_sb11 ( & state_before) =>
236246return Some ( position_before) ,
237247
238- // SB998
248+ // SB998 https://unicode.org/reports/tr29/#SB998
239249 _ =>continue
240250}
241251}
242252
243- // SB2
253+ // SB2 https://unicode.org/reports/tr29/#SB2
244254if self . state . match1 ( StatePart :: Sot ) {
245255None
246256} else if self . state . match1 ( StatePart :: Eot ) {