Commit7ac6f29

committed

Added forward iterator for unicode sentences

Passes all tests in the examples provided here: http://www.unicode.org/Public/9.0.0/ucd/auxiliary/SentenceBreakTest.txt

1 parentfa10dd3 commit7ac6f29Copy full SHA for 7ac6f29

File tree

4 files changed

+338

-2

lines changed

Cargo.toml
src

4 files changed

+338

-2

lines changed

`‎Cargo.toml‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"`
`12`	`12`	`keywords = ["text","unicode","grapheme","word","boundary"]`
`13`	`13`	`readme ="README.md"`
`14`	`14`	`description ="""`
`15`		`-This crate provides Grapheme Cluster and Word boundaries`
	`15`	`+This crate provides Grapheme Cluster, Word and Sentence boundaries`
`16`	`16`	`according to Unicode Standard Annex #29 rules.`
`17`	`17`	`"""`
`18`	`18`

`‎src/lib.rs‎`

Lines changed: 14 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`// option. This file may not be copied, modified, or distributed`
`9`	`9`	`// except according to those terms.`
`10`	`10`
`11`		`-//! Iterators which split strings on Grapheme Cluster or Word boundaries, according`
	`11`	`+//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according`
`12`	`12`	`//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.`
`13`	`13`	`//!`
`14`	`14`	//! ```rust
`@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};`
`67`	`67`	`pubuse grapheme::{GraphemeCursor,GraphemeIncomplete};`
`68`	`68`	`pubuse tables::UNICODE_VERSION;`
`69`	`69`	`pubuse word::{UWordBounds,UWordBoundIndices,UnicodeWords};`
	`70`	`+pubuse sentence::{USentenceBounds};`
`70`	`71`
`71`	`72`	`mod grapheme;`
`72`	`73`	`mod tables;`
`73`	`74`	`mod word;`
	`75`	`+mod sentence;`
`74`	`76`
`75`	`77`	`#[cfg(test)]`
`76`	`78`	`mod test;`
`@@ -174,6 +176,12 @@ pub trait UnicodeSegmentation {`
`174`	`176`	`/// assert_eq!(&swi1[..], b);`
`175`	`177`	/// ```
`176`	`178`	`fnsplit_word_bound_indices<'a>(&'aself) ->UWordBoundIndices<'a>;`
	`179`	`+`
	`180`	+/// Returns an iterator over substrings of `self` separated on
	`181`	`+/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).`
	`182`	`+///`
	`183`	`+/// The concatenation of the substrings returned by this function is just the original string.`
	`184`	`+fnsplit_sentence_bounds<'a>(&'aself) ->USentenceBounds<'a>;`
`177`	`185`	`}`
`178`	`186`
`179`	`187`	`implUnicodeSegmentationforstr{`
`@@ -201,4 +209,9 @@ impl UnicodeSegmentation for str {`
`201`	`209`	`fnsplit_word_bound_indices(&self) ->UWordBoundIndices{`
`202`	`210`	`word::new_word_bound_indices(self)`
`203`	`211`	`}`
	`212`	`+`
	`213`	`+#[inline]`
	`214`	`+fnsplit_sentence_bounds(&self) ->USentenceBounds{`
	`215`	`+ sentence::new_sentence_bounds(self)`
	`216`	`+}`
`204`	`217`	`}`

`‎src/sentence.rs‎`

Lines changed: 302 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,302 @@`
	`1`	`+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT`
	`2`	`+// file at the top-level directory of this distribution and at`
	`3`	`+// http://rust-lang.org/COPYRIGHT.`
	`4`	`+//`
	`5`	`+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or`
	`6`	`+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license`
	`7`	`+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your`
	`8`	`+// option. This file may not be copied, modified, or distributed`
	`9`	`+// except according to those terms.`
	`10`	`+`
	`11`	`+use core::cmp;`
	`12`	`+`
	`13`	`+// All of the logic for forward iteration over sentences`
	`14`	`+mod fwd{`
	`15`	`+use tables::sentence::SentenceCat;`
	`16`	`+use core::cmp;`
	`17`	`+`
	`18`	`+#[derive(Clone,Copy,PartialEq,Eq)]`
	`19`	`+enumStatePart{`
	`20`	`+Sot,`
	`21`	`+Eot,`
	`22`	`+Other,`
	`23`	`+CR,`
	`24`	`+LF,`
	`25`	`+Sep,`
	`26`	`+ATerm,`
	`27`	`+UpperLower,`
	`28`	`+ClosePlus,`
	`29`	`+SpPlus,`
	`30`	`+STerm`
	`31`	`+}`
	`32`	`+`
	`33`	`+#[derive(Clone,PartialEq,Eq)]`
	`34`	`+structSentenceBreaksState(pub[StatePart;4]);`
	`35`	`+`
	`36`	`+constINITIAL_STATE:SentenceBreaksState =SentenceBreaksState([`
	`37`	`+StatePart::Sot,`
	`38`	`+StatePart::Sot,`
	`39`	`+StatePart::Sot,`
	`40`	`+StatePart::Sot`
	`41`	`+]);`
	`42`	`+`
	`43`	`+pubstructSentenceBreaks<'a>{`
	`44`	`+pubstring:&'astr,`
	`45`	`+pos:usize,`
	`46`	`+state:SentenceBreaksState`
	`47`	`+}`
	`48`	`+`
	`49`	`+implSentenceBreaksState{`
	`50`	`+fnnext(&self,cat:SentenceCat) ->SentenceBreaksState{`
	`51`	`+let&SentenceBreaksState(parts) =self;`
	`52`	`+let parts =match(parts[3], cat){`
	`53`	`+(StatePart::ClosePlus,SentenceCat::SC_Close) => parts,`
	`54`	`+(StatePart::SpPlus,SentenceCat::SC_Sp) => parts,`
	`55`	`+ _ =>[`
	`56`	`+ parts[1],`
	`57`	`+ parts[2],`
	`58`	`+ parts[3],`
	`59`	`+match cat{`
	`60`	`+SentenceCat::SC_CR =>StatePart::CR,`
	`61`	`+SentenceCat::SC_LF =>StatePart::LF,`
	`62`	`+SentenceCat::SC_Sep =>StatePart::Sep,`
	`63`	`+SentenceCat::SC_ATerm =>StatePart::ATerm,`
	`64`	`+SentenceCat::SC_Upper \|`
	`65`	`+SentenceCat::SC_Lower =>StatePart::UpperLower,`
	`66`	`+SentenceCat::SC_Close =>StatePart::ClosePlus,`
	`67`	`+SentenceCat::SC_Sp =>StatePart::SpPlus,`
	`68`	`+SentenceCat::SC_STerm =>StatePart::STerm,`
	`69`	`+ _ =>StatePart::Other`
	`70`	`+}`
	`71`	`+]`
	`72`	`+};`
	`73`	`+SentenceBreaksState(parts)`
	`74`	`+}`
	`75`	`+`
	`76`	`+fnend(&self) ->SentenceBreaksState{`
	`77`	`+let&SentenceBreaksState(parts) =self;`
	`78`	`+SentenceBreaksState([`
	`79`	`+ parts[1],`
	`80`	`+ parts[2],`
	`81`	`+ parts[3],`
	`82`	`+StatePart::Eot`
	`83`	`+])`
	`84`	`+}`
	`85`	`+`
	`86`	`+fnmatch1(&self,part:StatePart) ->bool{`
	`87`	`+let&SentenceBreaksState(parts) =self;`
	`88`	`+ part == parts[3]`
	`89`	`+}`
	`90`	`+`
	`91`	`+fnmatch2(&self,part1:StatePart,part2:StatePart) ->bool{`
	`92`	`+let&SentenceBreaksState(parts) =self;`
	`93`	`+ part1 == parts[2] && part2 == parts[3]`
	`94`	`+}`
	`95`	`+}`
	`96`	`+`
	`97`	`+fnmatch_sb8(state:&SentenceBreaksState,ahead:&str) ->bool{`
	`98`	`+let aterm_part ={`
	`99`	`+// ATerm Close* Sp*`
	`100`	`+let&SentenceBreaksState(parts) = state;`
	`101`	`+letmut idx =if parts[3] ==StatePart::SpPlus{2}else{3};`
	`102`	`+if parts[idx] ==StatePart::ClosePlus{ idx -=1}`
	`103`	`+ parts[idx]`
	`104`	`+};`
	`105`	`+`
	`106`	`+if aterm_part ==StatePart::ATerm{`
	`107`	`+use tables::sentenceas se;`
	`108`	`+`
	`109`	`+for next_charin ahead.chars(){`
	`110`	`+//( ¬(OLetter \| Upper \| Lower \| ParaSep \| SATerm) )* Lower`
	`111`	`+match se::sentence_category(next_char){`
	`112`	`+ se::SC_Lower =>returntrue,`
	`113`	`+ se::SC_OLetter \|`
	`114`	`+ se::SC_Upper \|`
	`115`	`+ se::SC_Sep \| se::SC_CR \| se::SC_LF \|`
	`116`	`+ se::SC_STerm \| se::SC_ATerm =>returnfalse,`
	`117`	`+ _ =>continue`
	`118`	`+}`
	`119`	`+}`
	`120`	`+}`
	`121`	`+`
	`122`	`+false`
	`123`	`+}`
	`124`	`+`
	`125`	`+fnmatch_sb8a(state:&SentenceBreaksState) ->bool{`
	`126`	`+// SATerm Close* Sp*`
	`127`	`+let&SentenceBreaksState(parts) = state;`
	`128`	`+letmut idx =if parts[3] ==StatePart::SpPlus{2}else{3};`
	`129`	`+if parts[idx] ==StatePart::ClosePlus{ idx -=1}`
	`130`	`+ parts[idx] ==StatePart::STerm \|\| parts[idx] ==StatePart::ATerm`
	`131`	`+}`
	`132`	`+`
	`133`	`+fnmatch_sb9(state:&SentenceBreaksState) ->bool{`
	`134`	`+// SATerm Close*`
	`135`	`+let&SentenceBreaksState(parts) = state;`
	`136`	`+let idx =if parts[3] ==StatePart::ClosePlus{2}else{3};`
	`137`	`+ parts[idx] ==StatePart::STerm \|\| parts[idx] ==StatePart::ATerm`
	`138`	`+}`
	`139`	`+`
	`140`	`+fnmatch_sb11(state:&SentenceBreaksState) ->bool{`
	`141`	`+// SATerm Close* Sp* ParaSep?`
	`142`	`+let&SentenceBreaksState(parts) = state;`
	`143`	`+letmut idx =match parts[3]{`
	`144`	`+StatePart::Sep \|`
	`145`	`+StatePart::CR \|`
	`146`	`+StatePart::LF =>2,`
	`147`	`+ _ =>3`
	`148`	`+};`
	`149`	`+`
	`150`	`+if parts[idx] ==StatePart::SpPlus{ idx -=1}`
	`151`	`+if parts[idx] ==StatePart::ClosePlus{ idx -=1}`
	`152`	`+`
	`153`	`+ parts[idx] ==StatePart::STerm \|\| parts[idx] ==StatePart::ATerm`
	`154`	`+}`
	`155`	`+`
	`156`	`+impl<'a>IteratorforSentenceBreaks<'a>{`
	`157`	`+// Returns the index of the character which follows a break`
	`158`	`+typeItem =usize;`
	`159`	`+`
	`160`	`+#[inline]`
	`161`	`+fnsize_hint(&self) ->(usize,Option<usize>){`
	`162`	`+let slen =self.string.len();`
	`163`	`+// A sentence could be one character`
	`164`	`+(cmp::min(slen,2),Some(slen +1))`
	`165`	`+}`
	`166`	`+`
	`167`	`+#[inline]`
	`168`	`+fnnext(&mutself) ->Option<usize>{`
	`169`	`+use tables::sentenceas se;`
	`170`	`+`
	`171`	`+for next_charinself.string[self.pos..].chars(){`
	`172`	`+let position_before =self.pos;`
	`173`	`+let state_before =self.state.clone();`
	`174`	`+`
	`175`	`+let next_cat = se::sentence_category(next_char);`
	`176`	`+`
	`177`	`+self.pos += next_char.len_utf8();`
	`178`	`+self.state =self.state.next(next_cat);`
	`179`	`+`
	`180`	`+match next_cat{`
	`181`	`+// SB1`
	`182`	`+ _if state_before.match1(StatePart::Sot) =>`
	`183`	`+returnSome(position_before),`
	`184`	`+`
	`185`	`+// SB3`
	`186`	`+SentenceCat::SC_LFif state_before.match1(StatePart::CR) =>`
	`187`	`+continue,`
	`188`	`+`
	`189`	`+// SB4`
	`190`	`+ _if state_before.match1(StatePart::Sep)`
	`191`	`+ \|\| state_before.match1(StatePart::CR)`
	`192`	`+ \|\| state_before.match1(StatePart::LF)`
	`193`	`+ =>returnSome(position_before),`
	`194`	`+`
	`195`	`+// SB5`
	`196`	`+SentenceCat::SC_Extend \|`
	`197`	`+SentenceCat::SC_Format =>self.state = state_before,`
	`198`	`+`
	`199`	`+// SB6`
	`200`	`+SentenceCat::SC_Numericif state_before.match1(StatePart::ATerm) =>`
	`201`	`+continue,`
	`202`	`+`
	`203`	`+// SB7`
	`204`	`+SentenceCat::SC_Upperif state_before.match2(StatePart::UpperLower,StatePart::ATerm) =>`
	`205`	`+continue,`
	`206`	`+`
	`207`	`+// SB8`
	`208`	`+ _ifmatch_sb8(&state_before,&self.string[position_before..]) =>`
	`209`	`+continue,`
	`210`	`+`
	`211`	`+// SB8a`
	`212`	`+SentenceCat::SC_SContinue \|`
	`213`	`+SentenceCat::SC_STerm \|`
	`214`	`+SentenceCat::SC_ATermifmatch_sb8a(&state_before) =>`
	`215`	`+continue,`
	`216`	`+`
	`217`	`+// SB9`
	`218`	`+SentenceCat::SC_Close \|`
	`219`	`+SentenceCat::SC_Sp \|`
	`220`	`+SentenceCat::SC_Sep \|`
	`221`	`+SentenceCat::SC_CR \|`
	`222`	`+SentenceCat::SC_LFifmatch_sb9(&state_before) =>`
	`223`	`+continue,`
	`224`	`+`
	`225`	`+// SB10`
	`226`	`+SentenceCat::SC_Sp \|`
	`227`	`+SentenceCat::SC_Sep \|`
	`228`	`+SentenceCat::SC_CR \|`
	`229`	`+SentenceCat::SC_LFifmatch_sb8a(&state_before) =>`
	`230`	`+continue,`
	`231`	`+`
	`232`	`+// SB11`
	`233`	`+ _ifmatch_sb11(&state_before) =>`
	`234`	`+returnSome(position_before),`
	`235`	`+`
	`236`	`+// SB998`
	`237`	`+ _ =>continue`
	`238`	`+}`
	`239`	`+}`
	`240`	`+`
	`241`	`+// SB2`
	`242`	`+ifself.state.match1(StatePart::Sot){`
	`243`	`+None`
	`244`	`+}elseifself.state.match1(StatePart::Eot){`
	`245`	`+None`
	`246`	`+}else{`
	`247`	`+self.state =self.state.end();`
	`248`	`+Some(self.pos)`
	`249`	`+}`
	`250`	`+}`
	`251`	`+}`
	`252`	`+`
	`253`	`+pubfnnew_sentence_breaks<'a>(source:&'astr) ->SentenceBreaks<'a>{`
	`254`	`+SentenceBreaks{string: source,pos:0,state:INITIAL_STATE}`
	`255`	`+}`
	`256`	`+`
	`257`	`+}`
	`258`	`+`
	`259`	`+/// External iterator for a string's`
	`260`	`+/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).`
	`261`	`+pubstructUSentenceBounds<'a>{`
	`262`	`+iter: fwd::SentenceBreaks<'a>,`
	`263`	`+sentence_start:Option<usize>`
	`264`	`+}`
	`265`	`+`
	`266`	`+#[inline]`
	`267`	`+pubfnnew_sentence_bounds<'a>(source:&'astr) ->USentenceBounds<'a>{`
	`268`	`+USentenceBounds{`
	`269`	`+iter: fwd::new_sentence_breaks(source),`
	`270`	`+sentence_start:None`
	`271`	`+}`
	`272`	`+}`
	`273`	`+`
	`274`	`+impl<'a>IteratorforUSentenceBounds<'a>{`
	`275`	`+typeItem =&'astr;`
	`276`	`+`
	`277`	`+#[inline]`
	`278`	`+fnsize_hint(&self) ->(usize,Option<usize>){`
	`279`	`+let(lower, upper) =self.iter.size_hint();`
	`280`	`+(cmp::max(0, lower -1), upper.map(\|u\| cmp::max(0, u -1)))`
	`281`	`+}`
	`282`	`+`
	`283`	`+#[inline]`
	`284`	`+fnnext(&mutself) ->Option<&'astr>{`
	`285`	`+ifself.sentence_start ==None{`
	`286`	`+ifletSome(start_pos) =self.iter.next(){`
	`287`	`+self.sentence_start =Some(start_pos)`
	`288`	`+}else{`
	`289`	`+returnNone`
	`290`	`+}`
	`291`	`+}`
	`292`	`+`
	`293`	`+ifletSome(break_pos) =self.iter.next(){`
	`294`	`+let start_pos =self.sentence_start.unwrap();`
	`295`	`+let sentence =&self.iter.string[start_pos..break_pos];`
	`296`	`+self.sentence_start =Some(break_pos);`
	`297`	`+Some(sentence)`
	`298`	`+}else{`
	`299`	`+None`
	`300`	`+}`
	`301`	`+}`
	`302`	`+}`

`‎src/test.rs‎`

Lines changed: 21 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -141,6 +141,27 @@ fn test_words() {`
`141`	`141`	`}`
`142`	`142`	`}`
`143`	`143`
	`144`	`+`
	`145`	`+#[test]`
	`146`	`+fntest_sentences(){`
	`147`	`+use testdata::TEST_SENTENCE;`
	`148`	`+`
	`149`	`+for&(s, w)inTEST_SENTENCE.iter(){`
	`150`	`+macro_rules! assert_{`
	`151`	`+($test:expr, $exp:expr, $name:expr) =>{`
	`152`	`+// collect into vector for better diagnostics in failure case`
	`153`	`+let testing = $test.collect::<Vec<_>>();`
	`154`	`+let expected = $exp.collect::<Vec<_>>();`
	`155`	`+ assert_eq!(testing, expected,"{} test for testcase ({:?}, {:?}) failed.", $name, s, w)`
	`156`	`+}`
	`157`	`+}`
	`158`	`+`
	`159`	`+assert_!(s.split_sentence_bounds(),`
	`160`	`+ w.iter().cloned(),`
	`161`	`+"Forward sentence boundaries");`
	`162`	`+}`
	`163`	`+}`
	`164`	`+`
`144`	`165`	`quickcheck!{`
`145`	`166`	`fn quickcheck_forward_reverse_graphemes_extended(s:String) ->bool{`
`146`	`167`	`let a = s.graphemes(true).collect::<Vec<_>>();`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit7ac6f29

File tree

4 files changed

4 files changed

`‎Cargo.toml‎`

`‎src/lib.rs‎`

`‎src/sentence.rs‎`

`‎src/test.rs‎`

0 commit comments