Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit7ac6f29

Browse files
committed
Added forward iterator for unicode sentences
Passes all tests in the examples provided here: http://www.unicode.org/Public/9.0.0/ucd/auxiliary/SentenceBreakTest.txt
1 parent fa10dd3 commit 7ac6f29

File tree

4 files changed

+338
-2
lines changed

4 files changed

+338
-2
lines changed

‎Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
1212
keywords = ["text","unicode","grapheme","word","boundary"]
1313
readme ="README.md"
1414
description ="""
15-
This crate provides Grapheme Cluster and Word boundaries
15+
This crate provides Grapheme Cluster, Word and Sentence boundaries
1616
according to Unicode Standard Annex #29 rules.
1717
"""
1818

‎src/lib.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11-
//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
11+
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
1212
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
1313
//!
1414
//! ```rust
@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
6767
pubuse grapheme::{GraphemeCursor,GraphemeIncomplete};
6868
pubuse tables::UNICODE_VERSION;
6969
pubuse word::{UWordBounds,UWordBoundIndices,UnicodeWords};
70+
pubuse sentence::{USentenceBounds};
7071

7172
mod grapheme;
7273
mod tables;
7374
mod word;
75+
mod sentence;
7476

7577
#[cfg(test)]
7678
mod test;
@@ -174,6 +176,12 @@ pub trait UnicodeSegmentation {
174176
/// assert_eq!(&swi1[..], b);
175177
/// ```
176178
fnsplit_word_bound_indices<'a>(&'aself) ->UWordBoundIndices<'a>;
179+
180+
/// Returns an iterator over substrings of `self` separated on
181+
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
182+
///
183+
/// The concatenation of the substrings returned by this function is just the original string.
184+
fnsplit_sentence_bounds<'a>(&'aself) ->USentenceBounds<'a>;
177185
}
178186

179187
implUnicodeSegmentationforstr{
@@ -201,4 +209,9 @@ impl UnicodeSegmentation for str {
201209
fnsplit_word_bound_indices(&self) ->UWordBoundIndices{
202210
word::new_word_bound_indices(self)
203211
}
212+
213+
#[inline]
214+
fnsplit_sentence_bounds(&self) ->USentenceBounds{
215+
sentence::new_sentence_bounds(self)
216+
}
204217
}

‎src/sentence.rs

Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use core::cmp;
12+
13+
// All of the logic for forward iteration over sentences
14+
mod fwd{
15+
use tables::sentence::SentenceCat;
16+
use core::cmp;
17+
18+
/// One slot of the sentence-break state window.
///
/// `Sot` and `Eot` are sentinels for the start and end of the text; the other
/// variants summarise the sentence category of a character that was consumed.
#[derive(Clone, Copy, PartialEq, Eq)]
enum StatePart {
    /// Start of text: nothing has been consumed for this slot yet.
    Sot,
    /// End of text: pushed once the input is exhausted.
    Eot,
    /// Any category that has no dedicated variant.
    Other,
    CR,
    LF,
    Sep,
    ATerm,
    /// Either an `Upper` or a `Lower` character.
    UpperLower,
    /// A run of one or more `Close` characters, stored in a single slot.
    ClosePlus,
    /// A run of one or more `Sp` characters, stored in a single slot.
    SpPlus,
    STerm,
}
32+
33+
#[derive(Clone,PartialEq,Eq)]
34+
structSentenceBreaksState(pub[StatePart;4]);
35+
36+
constINITIAL_STATE:SentenceBreaksState =SentenceBreaksState([
37+
StatePart::Sot,
38+
StatePart::Sot,
39+
StatePart::Sot,
40+
StatePart::Sot
41+
]);
42+
43+
pubstructSentenceBreaks<'a>{
44+
pubstring:&'astr,
45+
pos:usize,
46+
state:SentenceBreaksState
47+
}
48+
49+
implSentenceBreaksState{
50+
fnnext(&self,cat:SentenceCat) ->SentenceBreaksState{
51+
let&SentenceBreaksState(parts) =self;
52+
let parts =match(parts[3], cat){
53+
(StatePart::ClosePlus,SentenceCat::SC_Close) => parts,
54+
(StatePart::SpPlus,SentenceCat::SC_Sp) => parts,
55+
_ =>[
56+
parts[1],
57+
parts[2],
58+
parts[3],
59+
match cat{
60+
SentenceCat::SC_CR =>StatePart::CR,
61+
SentenceCat::SC_LF =>StatePart::LF,
62+
SentenceCat::SC_Sep =>StatePart::Sep,
63+
SentenceCat::SC_ATerm =>StatePart::ATerm,
64+
SentenceCat::SC_Upper |
65+
SentenceCat::SC_Lower =>StatePart::UpperLower,
66+
SentenceCat::SC_Close =>StatePart::ClosePlus,
67+
SentenceCat::SC_Sp =>StatePart::SpPlus,
68+
SentenceCat::SC_STerm =>StatePart::STerm,
69+
_ =>StatePart::Other
70+
}
71+
]
72+
};
73+
SentenceBreaksState(parts)
74+
}
75+
76+
fnend(&self) ->SentenceBreaksState{
77+
let&SentenceBreaksState(parts) =self;
78+
SentenceBreaksState([
79+
parts[1],
80+
parts[2],
81+
parts[3],
82+
StatePart::Eot
83+
])
84+
}
85+
86+
fnmatch1(&self,part:StatePart) ->bool{
87+
let&SentenceBreaksState(parts) =self;
88+
part == parts[3]
89+
}
90+
91+
fnmatch2(&self,part1:StatePart,part2:StatePart) ->bool{
92+
let&SentenceBreaksState(parts) =self;
93+
part1 == parts[2] && part2 == parts[3]
94+
}
95+
}
96+
97+
fnmatch_sb8(state:&SentenceBreaksState,ahead:&str) ->bool{
98+
let aterm_part ={
99+
// ATerm Close* Sp*
100+
let&SentenceBreaksState(parts) = state;
101+
letmut idx =if parts[3] ==StatePart::SpPlus{2}else{3};
102+
if parts[idx] ==StatePart::ClosePlus{ idx -=1}
103+
parts[idx]
104+
};
105+
106+
if aterm_part ==StatePart::ATerm{
107+
use tables::sentenceas se;
108+
109+
for next_charin ahead.chars(){
110+
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
111+
match se::sentence_category(next_char){
112+
se::SC_Lower =>returntrue,
113+
se::SC_OLetter |
114+
se::SC_Upper |
115+
se::SC_Sep | se::SC_CR | se::SC_LF |
116+
se::SC_STerm | se::SC_ATerm =>returnfalse,
117+
_ =>continue
118+
}
119+
}
120+
}
121+
122+
false
123+
}
124+
125+
fnmatch_sb8a(state:&SentenceBreaksState) ->bool{
126+
// SATerm Close* Sp*
127+
let&SentenceBreaksState(parts) = state;
128+
letmut idx =if parts[3] ==StatePart::SpPlus{2}else{3};
129+
if parts[idx] ==StatePart::ClosePlus{ idx -=1}
130+
parts[idx] ==StatePart::STerm || parts[idx] ==StatePart::ATerm
131+
}
132+
133+
fnmatch_sb9(state:&SentenceBreaksState) ->bool{
134+
// SATerm Close*
135+
let&SentenceBreaksState(parts) = state;
136+
let idx =if parts[3] ==StatePart::ClosePlus{2}else{3};
137+
parts[idx] ==StatePart::STerm || parts[idx] ==StatePart::ATerm
138+
}
139+
140+
fnmatch_sb11(state:&SentenceBreaksState) ->bool{
141+
// SATerm Close* Sp* ParaSep?
142+
let&SentenceBreaksState(parts) = state;
143+
letmut idx =match parts[3]{
144+
StatePart::Sep |
145+
StatePart::CR |
146+
StatePart::LF =>2,
147+
_ =>3
148+
};
149+
150+
if parts[idx] ==StatePart::SpPlus{ idx -=1}
151+
if parts[idx] ==StatePart::ClosePlus{ idx -=1}
152+
153+
parts[idx] ==StatePart::STerm || parts[idx] ==StatePart::ATerm
154+
}
155+
156+
impl<'a>IteratorforSentenceBreaks<'a>{
157+
// Returns the index of the character which follows a break
158+
typeItem =usize;
159+
160+
#[inline]
161+
fnsize_hint(&self) ->(usize,Option<usize>){
162+
let slen =self.string.len();
163+
// A sentence could be one character
164+
(cmp::min(slen,2),Some(slen +1))
165+
}
166+
167+
#[inline]
168+
fnnext(&mutself) ->Option<usize>{
169+
use tables::sentenceas se;
170+
171+
for next_charinself.string[self.pos..].chars(){
172+
let position_before =self.pos;
173+
let state_before =self.state.clone();
174+
175+
let next_cat = se::sentence_category(next_char);
176+
177+
self.pos += next_char.len_utf8();
178+
self.state =self.state.next(next_cat);
179+
180+
match next_cat{
181+
// SB1
182+
_if state_before.match1(StatePart::Sot) =>
183+
returnSome(position_before),
184+
185+
// SB3
186+
SentenceCat::SC_LFif state_before.match1(StatePart::CR) =>
187+
continue,
188+
189+
// SB4
190+
_if state_before.match1(StatePart::Sep)
191+
|| state_before.match1(StatePart::CR)
192+
|| state_before.match1(StatePart::LF)
193+
=>returnSome(position_before),
194+
195+
// SB5
196+
SentenceCat::SC_Extend |
197+
SentenceCat::SC_Format =>self.state = state_before,
198+
199+
// SB6
200+
SentenceCat::SC_Numericif state_before.match1(StatePart::ATerm) =>
201+
continue,
202+
203+
// SB7
204+
SentenceCat::SC_Upperif state_before.match2(StatePart::UpperLower,StatePart::ATerm) =>
205+
continue,
206+
207+
// SB8
208+
_ifmatch_sb8(&state_before,&self.string[position_before..]) =>
209+
continue,
210+
211+
// SB8a
212+
SentenceCat::SC_SContinue |
213+
SentenceCat::SC_STerm |
214+
SentenceCat::SC_ATermifmatch_sb8a(&state_before) =>
215+
continue,
216+
217+
// SB9
218+
SentenceCat::SC_Close |
219+
SentenceCat::SC_Sp |
220+
SentenceCat::SC_Sep |
221+
SentenceCat::SC_CR |
222+
SentenceCat::SC_LFifmatch_sb9(&state_before) =>
223+
continue,
224+
225+
// SB10
226+
SentenceCat::SC_Sp |
227+
SentenceCat::SC_Sep |
228+
SentenceCat::SC_CR |
229+
SentenceCat::SC_LFifmatch_sb8a(&state_before) =>
230+
continue,
231+
232+
// SB11
233+
_ifmatch_sb11(&state_before) =>
234+
returnSome(position_before),
235+
236+
// SB998
237+
_ =>continue
238+
}
239+
}
240+
241+
// SB2
242+
ifself.state.match1(StatePart::Sot){
243+
None
244+
}elseifself.state.match1(StatePart::Eot){
245+
None
246+
}else{
247+
self.state =self.state.end();
248+
Some(self.pos)
249+
}
250+
}
251+
}
252+
253+
pubfnnew_sentence_breaks<'a>(source:&'astr) ->SentenceBreaks<'a>{
254+
SentenceBreaks{string: source,pos:0,state:INITIAL_STATE}
255+
}
256+
257+
}
258+
259+
/// External iterator for a string's
260+
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
261+
pubstructUSentenceBounds<'a>{
262+
iter: fwd::SentenceBreaks<'a>,
263+
sentence_start:Option<usize>
264+
}
265+
266+
#[inline]
267+
pubfnnew_sentence_bounds<'a>(source:&'astr) ->USentenceBounds<'a>{
268+
USentenceBounds{
269+
iter: fwd::new_sentence_breaks(source),
270+
sentence_start:None
271+
}
272+
}
273+
274+
impl<'a>IteratorforUSentenceBounds<'a>{
275+
typeItem =&'astr;
276+
277+
#[inline]
278+
fnsize_hint(&self) ->(usize,Option<usize>){
279+
let(lower, upper) =self.iter.size_hint();
280+
(cmp::max(0, lower -1), upper.map(|u| cmp::max(0, u -1)))
281+
}
282+
283+
#[inline]
284+
fnnext(&mutself) ->Option<&'astr>{
285+
ifself.sentence_start ==None{
286+
ifletSome(start_pos) =self.iter.next(){
287+
self.sentence_start =Some(start_pos)
288+
}else{
289+
returnNone
290+
}
291+
}
292+
293+
ifletSome(break_pos) =self.iter.next(){
294+
let start_pos =self.sentence_start.unwrap();
295+
let sentence =&self.iter.string[start_pos..break_pos];
296+
self.sentence_start =Some(break_pos);
297+
Some(sentence)
298+
}else{
299+
None
300+
}
301+
}
302+
}

‎src/test.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,27 @@ fn test_words() {
141141
}
142142
}
143143

144+
145+
// Runs every (input, expected sentences) pair in `TEST_SENTENCE` through
// `split_sentence_bounds` and compares the collected output.
#[test]
fn test_sentences() {
    use testdata::TEST_SENTENCE;

    for &(s, w) in TEST_SENTENCE.iter() {
        macro_rules! assert_ {
            ($test:expr, $exp:expr, $name:expr) => {
                // collect into vector for better diagnostics in failure case
                let testing = $test.collect::<Vec<_>>();
                let expected = $exp.collect::<Vec<_>>();
                assert_eq!(testing, expected, "{} test for testcase ({:?}, {:?}) failed.", $name, s, w)
            }
        }

        assert_!(s.split_sentence_bounds(),
                w.iter().cloned(),
                "Forward sentence boundaries");
    }
}
164+
144165
quickcheck!{
145166
fn quickcheck_forward_reverse_graphemes_extended(s:String) ->bool{
146167
let a = s.graphemes(true).collect::<Vec<_>>();

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp