Commit 50058a5

committed

Adds unicode_sentences and split_sentence_bound_indices

1 parent 7ac6f29 commit 50058a5 (Copy full SHA for 50058a5)

File tree

2 files changed

+87

-1

lines changed

src
- lib.rs
- sentence.rs

2 files changed

+87

-1

lines changed

`‎src/lib.rs‎`

Lines changed: 26 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,7 @@ pub use grapheme::{Graphemes, GraphemeIndices};`
`67`	`67`	`pub use grapheme::{GraphemeCursor, GraphemeIncomplete};`
`68`	`68`	`pub use tables::UNICODE_VERSION;`
`69`	`69`	`pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};`
`70`		`-pub use sentence::{USentenceBounds};`
	`70`	`+pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};`
`71`	`71`
`72`	`72`	`mod grapheme;`
`73`	`73`	`mod tables;`
`@@ -181,7 +181,22 @@ pub trait UnicodeSegmentation {`
`181`	`181`	`    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).`
`182`	`182`	`    ///`
`183`	`183`	`    /// The concatenation of the substrings returned by this function is just the original string.`
	`184`	`+    fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;`
	`185`	`+`
	`186`	``+    /// Returns an iterator over substrings of `self` separated on``
	`187`	`+    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).`
	`188`	`+    ///`
	`189`	`+    /// Here, "sentences" are just those substrings which, after splitting on`
	`190`	`+    /// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the`
	`191`	`+    /// substring must contain at least one character with the`
	`192`	`+    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)`
	`193`	`+    /// property, or with`
	`194`	`+    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).`
`184`	`195`	`    fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;`
	`196`	`+`
	`197`	``+    /// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,``
	`198`	``+    /// and their offsets. See `split_sentence_bounds()` for more information.``
	`199`	`+    fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;`
`185`	`200`	`}`
`186`	`201`
`187`	`202`	`impl UnicodeSegmentation for str {`
`@@ -210,8 +225,18 @@ impl UnicodeSegmentation for str {`
`210`	`225`	`        word::new_word_bound_indices(self)`
`211`	`226`	`    }`
`212`	`227`
	`228`	`+    #[inline]`
	`229`	`+    fn unicode_sentences(&self) -> UnicodeSentences {`
	`230`	`+        sentence::new_unicode_sentences(self)`
	`231`	`+    }`
	`232`	`+`
`213`	`233`	`    #[inline]`
`214`	`234`	`    fn split_sentence_bounds(&self) -> USentenceBounds {`
`215`	`235`	`        sentence::new_sentence_bounds(self)`
`216`	`236`	`    }`
	`237`	`+`
	`238`	`+    #[inline]`
	`239`	`+    fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {`
	`240`	`+        sentence::new_sentence_bound_indices(self)`
	`241`	`+    }`
`217`	`242`	`}`

`‎src/sentence.rs‎`

Lines changed: 61 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@`
`9`	`9`	`// except according to those terms.`
`10`	`10`
`11`	`11`	`use core::cmp;`
	`12`	`+use core::iter::Filter;`
`12`	`13`
`13`	`14`	`// All of the logic for forward iteration over sentences`
`14`	`15`	`mod fwd {`
`@@ -40,6 +41,7 @@ mod fwd {`
`40`	`41`	`        StatePart::Sot`
`41`	`42`	`    ]);`
`42`	`43`
	`44`	`+    #[derive(Clone)]`
`43`	`45`	`    pub struct SentenceBreaks<'a> {`
`44`	`46`	`        pub string: &'a str,`
`45`	`47`	`        pos: usize,`
`@@ -256,13 +258,32 @@ mod fwd {`
`256`	`258`
`257`	`259`	`}`
`258`	`260`
	`261`	`+/// An iterator over the substrings of a string which, after splitting the string on`
	`262`	`+/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),`
	`263`	`+/// contain any characters with the`
	`264`	`+/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)`
	`265`	`+/// property, or with`
	`266`	`+/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).`
	`267`	`+#[derive(Clone)]`
	`268`	`+pub struct UnicodeSentences<'a> {`
	`269`	`+    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,`
	`270`	`+}`
	`271`	`+`
`259`	`272`	`/// External iterator for a string's`
`260`	`273`	`/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).`
	`274`	`+#[derive(Clone)]`
`261`	`275`	`pub struct USentenceBounds<'a> {`
`262`	`276`	`    iter: fwd::SentenceBreaks<'a>,`
`263`	`277`	`    sentence_start: Option<usize>`
`264`	`278`	`}`
`265`	`279`
	`280`	`+/// External iterator for sentence boundaries and byte offsets.`
	`281`	`+#[derive(Clone)]`
	`282`	`+pub struct USentenceBoundIndices<'a> {`
	`283`	`+    start_offset: usize,`
	`284`	`+    iter: USentenceBounds<'a>,`
	`285`	`+}`
	`286`	`+`
`266`	`287`	`#[inline]`
`267`	`288`	`pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {`
`268`	`289`	`    USentenceBounds {`
`@@ -271,6 +292,32 @@ pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {`
`271`	`292`	`}`
`272`	`293`	`}`
`273`	`294`
	`295`	`+#[inline]`
	`296`	`+pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {`
	`297`	`+    USentenceBoundIndices {`
	`298`	`+        start_offset: source.as_ptr() as usize,`
	`299`	`+        iter: new_sentence_bounds(source)`
	`300`	`+    }`
	`301`	`+}`
	`302`	`+`
	`303`	`+#[inline]`
	`304`	`+pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {`
	`305`	`+    use super::UnicodeSegmentation;`
	`306`	`+    use tables::util::is_alphanumeric;`
	`307`	`+`
	`308`	`+    fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }`
	`309`	`+    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer`
	`310`	`+`
	`311`	`+    UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) }`
	`312`	`+}`
	`313`	`+`
	`314`	`+impl<'a> Iterator for UnicodeSentences<'a> {`
	`315`	`+    type Item = &'a str;`
	`316`	`+`
	`317`	`+    #[inline]`
	`318`	`+    fn next(&mut self) -> Option<&'a str> { self.inner.next() }`
	`319`	`+}`
	`320`	`+`
`274`	`321`	`impl<'a> Iterator for USentenceBounds<'a> {`
`275`	`322`	`    type Item = &'a str;`
`276`	`323`
`@@ -300,3 +347,17 @@ impl<'a> Iterator for USentenceBounds<'a> {`
`300`	`347`	`}`
`301`	`348`	`}`
`302`	`349`	`}`
	`350`	`+`
	`351`	`+impl<'a> Iterator for USentenceBoundIndices<'a> {`
	`352`	`+    type Item = (usize, &'a str);`
	`353`	`+`
	`354`	`+    #[inline]`
	`355`	`+    fn next(&mut self) -> Option<(usize, &'a str)> {`
	`356`	`+        self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))`
	`357`	`+    }`
	`358`	`+`
	`359`	`+    #[inline]`
	`360`	`+    fn size_hint(&self) -> (usize, Option<usize>) {`
	`361`	`+        self.iter.size_hint()`
	`362`	`+    }`
	`363`	`+}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 50058a5

File tree

2 files changed

2 files changed

`‎src/lib.rs‎`

`‎src/sentence.rs‎`

0 commit comments