Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit9c7abf2

Browse files
committed
Documentation and code reorg
1 parent50058a5 commit9c7abf2

File tree

1 file changed

+31
-21
lines changed

1 file changed

+31
-21
lines changed

‎src/sentence.rs‎

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ mod fwd {
1616
use tables::sentence::SentenceCat;
1717
use core::cmp;
1818

19+
// Describes one parsed part of the source string, following the rules in this table:
20+
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
1921
#[derive(Clone,Copy,PartialEq,Eq)]
2022
enumStatePart{
2123
Sot,
@@ -49,6 +51,8 @@ mod fwd {
4951
}
5052

5153
implSentenceBreaksState{
54+
// Attempt to advance the internal state by one part
55+
// Whitespace and some punctuation will be collapsed
5256
fnnext(&self,cat:SentenceCat) ->SentenceBreaksState{
5357
let&SentenceBreaksState(parts) =self;
5458
let parts =match(parts[3], cat){
@@ -85,27 +89,28 @@ mod fwd {
8589
])
8690
}
8791

92+
// Helper function to check if state head matches a single `StatePart`
8893
fnmatch1(&self,part:StatePart) ->bool{
8994
let&SentenceBreaksState(parts) =self;
9095
part == parts[3]
9196
}
9297

98+
// Helper function to check if the two `StatePart`s at the state head (`parts[2]`, `parts[3]`) match
99+
// the given two
93100
fnmatch2(&self,part1:StatePart,part2:StatePart) ->bool{
94101
let&SentenceBreaksState(parts) =self;
95102
part1 == parts[2] && part2 == parts[3]
96103
}
97104
}
98105

106+
// https://unicode.org/reports/tr29/#SB8
107+
// TODO cache this, it is currently quadratic
99108
fnmatch_sb8(state:&SentenceBreaksState,ahead:&str) ->bool{
100-
let aterm_part ={
101-
// ATerm Close* Sp*
102-
let&SentenceBreaksState(parts) = state;
103-
letmut idx =if parts[3] ==StatePart::SpPlus{2}else{3};
104-
if parts[idx] ==StatePart::ClosePlus{ idx -=1}
105-
parts[idx]
106-
};
109+
let&SentenceBreaksState(parts) = state;
110+
letmut idx =if parts[3] ==StatePart::SpPlus{2}else{3};
111+
if parts[idx] ==StatePart::ClosePlus{ idx -=1}
107112

108-
ifaterm_part ==StatePart::ATerm{
113+
ifparts[idx] ==StatePart::ATerm{
109114
use tables::sentenceas se;
110115

111116
for next_charin ahead.chars(){
@@ -124,6 +129,7 @@ mod fwd {
124129
false
125130
}
126131

132+
// https://unicode.org/reports/tr29/#SB8a
127133
fnmatch_sb8a(state:&SentenceBreaksState) ->bool{
128134
// SATerm Close* Sp*
129135
let&SentenceBreaksState(parts) = state;
@@ -132,13 +138,15 @@ mod fwd {
132138
parts[idx] ==StatePart::STerm || parts[idx] ==StatePart::ATerm
133139
}
134140

141+
// https://unicode.org/reports/tr29/#SB9
135142
fnmatch_sb9(state:&SentenceBreaksState) ->bool{
136143
// SATerm Close*
137144
let&SentenceBreaksState(parts) = state;
138145
let idx =if parts[3] ==StatePart::ClosePlus{2}else{3};
139146
parts[idx] ==StatePart::STerm || parts[idx] ==StatePart::ATerm
140147
}
141148

149+
// https://unicode.org/reports/tr29/#SB11
142150
fnmatch_sb11(state:&SentenceBreaksState) ->bool{
143151
// SATerm Close* Sp* ParaSep?
144152
let&SentenceBreaksState(parts) = state;
@@ -180,67 +188,69 @@ mod fwd {
180188
self.state =self.state.next(next_cat);
181189

182190
match next_cat{
183-
// SB1
191+
// SB1 https://unicode.org/reports/tr29/#SB1
184192
_if state_before.match1(StatePart::Sot) =>
185193
returnSome(position_before),
186194

187-
// SB3
195+
// SB2 is handled once the inner iterator (chars) is finished
196+
197+
// SB3 https://unicode.org/reports/tr29/#SB3
188198
SentenceCat::SC_LFif state_before.match1(StatePart::CR) =>
189199
continue,
190200

191-
// SB4
201+
// SB4 https://unicode.org/reports/tr29/#SB4
192202
_if state_before.match1(StatePart::Sep)
193203
|| state_before.match1(StatePart::CR)
194204
|| state_before.match1(StatePart::LF)
195205
=>returnSome(position_before),
196206

197-
// SB5
207+
// SB5 https://unicode.org/reports/tr29/#SB5
198208
SentenceCat::SC_Extend |
199209
SentenceCat::SC_Format =>self.state = state_before,
200210

201-
// SB6
211+
// SB6 https://unicode.org/reports/tr29/#SB6
202212
SentenceCat::SC_Numericif state_before.match1(StatePart::ATerm) =>
203213
continue,
204214

205-
// SB7
215+
// SB7 https://unicode.org/reports/tr29/#SB7
206216
SentenceCat::SC_Upperif state_before.match2(StatePart::UpperLower,StatePart::ATerm) =>
207217
continue,
208218

209-
// SB8
219+
// SB8 https://unicode.org/reports/tr29/#SB8
210220
_ifmatch_sb8(&state_before,&self.string[position_before..]) =>
211221
continue,
212222

213-
// SB8a
223+
// SB8a https://unicode.org/reports/tr29/#SB8a
214224
SentenceCat::SC_SContinue |
215225
SentenceCat::SC_STerm |
216226
SentenceCat::SC_ATermifmatch_sb8a(&state_before) =>
217227
continue,
218228

219-
// SB9
229+
// SB9 https://unicode.org/reports/tr29/#SB9
220230
SentenceCat::SC_Close |
221231
SentenceCat::SC_Sp |
222232
SentenceCat::SC_Sep |
223233
SentenceCat::SC_CR |
224234
SentenceCat::SC_LFifmatch_sb9(&state_before) =>
225235
continue,
226236

227-
// SB10
237+
// SB10 https://unicode.org/reports/tr29/#SB10
228238
SentenceCat::SC_Sp |
229239
SentenceCat::SC_Sep |
230240
SentenceCat::SC_CR |
231241
SentenceCat::SC_LFifmatch_sb8a(&state_before) =>
232242
continue,
233243

234-
// SB11
244+
// SB11 https://unicode.org/reports/tr29/#SB11
235245
_ifmatch_sb11(&state_before) =>
236246
returnSome(position_before),
237247

238-
// SB998
248+
// SB998 https://unicode.org/reports/tr29/#SB998
239249
_ =>continue
240250
}
241251
}
242252

243-
// SB2
253+
// SB2 https://unicode.org/reports/tr29/#SB2
244254
ifself.state.match1(StatePart::Sot){
245255
None
246256
}elseifself.state.match1(StatePart::Eot){

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp