|
8 | 8 | // option. This file may not be copied, modified, or distributed |
9 | 9 | // except according to those terms. |
10 | 10 |
|
11 | | -//! Iterators which split strings on Grapheme Cluster or Word boundaries, according |
| 11 | +//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according |
12 | 12 | //! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules. |
13 | 13 | //! |
14 | 14 | //! ```rust |
@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices}; |
67 | 67 | pubuse grapheme::{GraphemeCursor,GraphemeIncomplete}; |
68 | 68 | pubuse tables::UNICODE_VERSION; |
69 | 69 | pubuse word::{UWordBounds,UWordBoundIndices,UnicodeWords}; |
| 70 | +pubuse sentence::{USentenceBounds,USentenceBoundIndices,UnicodeSentences}; |
70 | 71 |
|
71 | 72 | mod grapheme; |
72 | 73 | mod tables; |
73 | 74 | mod word; |
| 75 | +mod sentence; |
74 | 76 |
|
75 | 77 | #[cfg(test)] |
76 | 78 | mod test; |
@@ -174,6 +176,27 @@ pub trait UnicodeSegmentation { |
174 | 176 | /// assert_eq!(&swi1[..], b); |
175 | 177 | /// ``` |
176 | 178 | fnsplit_word_bound_indices<'a>(&'aself) ->UWordBoundIndices<'a>; |
| 179 | + |
| 180 | +/// Returns an iterator over substrings of `self` separated on |
| 181 | +/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). |
| 182 | +/// |
| 183 | +/// Here, "sentences" are just those substrings which, after splitting on |
| 184 | +/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the |
| 185 | +/// substring must contain at least one character with the |
| 186 | +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
| 187 | +/// property, or with |
| 188 | +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
| 189 | +fnunicode_sentences<'a>(&'aself) ->UnicodeSentences<'a>; |
| 190 | + |
| 191 | +/// Returns an iterator over substrings of `self` separated on |
| 192 | +/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). |
| 193 | +/// |
| 194 | +/// The concatenation of the substrings returned by this function is just the original string. |
| 195 | +fnsplit_sentence_bounds<'a>(&'aself) ->USentenceBounds<'a>; |
| 196 | + |
| 197 | +/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries, |
| 198 | +/// and their offsets. See `split_sentence_bounds()` for more information. |
| 199 | +fnsplit_sentence_bound_indices<'a>(&'aself) ->USentenceBoundIndices<'a>; |
177 | 200 | } |
178 | 201 |
|
179 | 202 | implUnicodeSegmentationforstr{ |
@@ -201,4 +224,19 @@ impl UnicodeSegmentation for str { |
201 | 224 | fnsplit_word_bound_indices(&self) ->UWordBoundIndices{ |
202 | 225 | word::new_word_bound_indices(self) |
203 | 226 | } |
| 227 | + |
| 228 | +#[inline] |
| 229 | +fnunicode_sentences(&self) ->UnicodeSentences{ |
| 230 | + sentence::new_unicode_sentences(self) |
| 231 | +} |
| 232 | + |
| 233 | +#[inline] |
| 234 | +fnsplit_sentence_bounds(&self) ->USentenceBounds{ |
| 235 | + sentence::new_sentence_bounds(self) |
| 236 | +} |
| 237 | + |
| 238 | +#[inline] |
| 239 | +fnsplit_sentence_bound_indices(&self) ->USentenceBoundIndices{ |
| 240 | + sentence::new_sentence_bound_indices(self) |
| 241 | +} |
204 | 242 | } |