Commit c7a6b6f

authored

Merge pull request #24 from tomcumming/master

Unicode sentence boundaries

2 parents 8ca8e23 + 9c7abf2, commit c7a6b6f (Copy full SHA for c7a6b6f)

File tree

8 files changed

+1757

-2

lines changed

Cargo.toml
scripts
- unicode.py
- unicode_gen_breaktests.py
src

8 files changed

+1757

-2

lines changed

`‎Cargo.toml‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"`
`12`	`12`	`keywords = ["text","unicode","grapheme","word","boundary"]`
`13`	`13`	`readme ="README.md"`
`14`	`14`	`description ="""`
`15`		`-This crate provides Grapheme Cluster and Word boundaries`
	`15`	`+This crate provides Grapheme Cluster, Word and Sentence boundaries`
`16`	`16`	`according to Unicode Standard Annex #29 rules.`
`17`	`17`	`"""`
`18`	`18`

`‎scripts/unicode.py‎`

Lines changed: 7 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -351,3 +351,10 @@ def emit_break_module(f, break_table, break_cats, name):`
`351`	`351`	`word_table.extend([(x,y,cat)for (x,y)inword_cats[cat]])`
`352`	`352`	`word_table.sort(key=lambdaw:w[0])`
`353`	`353`	`emit_break_module(rf,word_table,word_cats.keys(),"word")`
	`354`	`+`
	`355`	`+sentence_cats=load_properties("auxiliary/SentenceBreakProperty.txt", [])`
	`356`	`+sentence_table= []`
	`357`	`+forcatinsentence_cats:`
	`358`	`+sentence_table.extend([(x,y,cat)for (x,y)insentence_cats[cat]])`
	`359`	`+sentence_table.sort(key=lambdaw:w[0])`
	`360`	`+emit_break_module(rf,sentence_table,sentence_cats.keys(),"sentence")`

`‎scripts/unicode_gen_breaktests.py‎`

Lines changed: 15 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -190,8 +190,23 @@ def create_words_data(f):`
`190`	`190`	`f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")`
`191`	`191`	`unicode.emit_table(f,"TEST_WORD",test,wtype,True,showfun,True)`
`192`	`192`
	`193`	`+defcreate_sentence_data(f):`
	`194`	`+d=load_test_data("auxiliary/SentenceBreakTest.txt")`
	`195`	`+`
	`196`	`+test= []`
	`197`	`+`
	`198`	`+for (c,i)ind:`
	`199`	`+allchars= [cnforsincforcnins]`
	`200`	`+test.append((allchars,c))`
	`201`	`+`
	`202`	`+wtype="&'static [(&'static str, &'static [&'static str])]"`
	`203`	`+f.write(" // official Unicode test data\n")`
	`204`	`+f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")`
	`205`	`+unicode.emit_table(f,"TEST_SENTENCE",test,wtype,True,showfun,True)`
	`206`	`+`
`193`	`207`	`if__name__=="__main__":`
`194`	`208`	`withopen("testdata.rs","w")asrf:`
`195`	`209`	`rf.write(unicode.preamble)`
`196`	`210`	`create_grapheme_data(rf)`
`197`	`211`	`create_words_data(rf)`
	`212`	`+create_sentence_data(rf)`

`‎src/lib.rs‎`

Lines changed: 39 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`// option. This file may not be copied, modified, or distributed`
`9`	`9`	`// except according to those terms.`
`10`	`10`
`11`		`-//! Iterators which split strings on Grapheme Cluster or Word boundaries, according`
	`11`	`+//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according`
`12`	`12`	`//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.`
`13`	`13`	`//!`
`14`	`14`	//! ```rust
`@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};`
`67`	`67`	`pubuse grapheme::{GraphemeCursor,GraphemeIncomplete};`
`68`	`68`	`pubuse tables::UNICODE_VERSION;`
`69`	`69`	`pubuse word::{UWordBounds,UWordBoundIndices,UnicodeWords};`
	`70`	`+pubuse sentence::{USentenceBounds,USentenceBoundIndices,UnicodeSentences};`
`70`	`71`
`71`	`72`	`mod grapheme;`
`72`	`73`	`mod tables;`
`73`	`74`	`mod word;`
	`75`	`+mod sentence;`
`74`	`76`
`75`	`77`	`#[cfg(test)]`
`76`	`78`	`mod test;`
`@@ -174,6 +176,27 @@ pub trait UnicodeSegmentation {`
`174`	`176`	`/// assert_eq!(&swi1[..], b);`
`175`	`177`	/// ```
`176`	`178`	`fnsplit_word_bound_indices<'a>(&'aself) ->UWordBoundIndices<'a>;`
	`179`	`+`
	`180`	+/// Returns an iterator over substrings of `self` separated on
	`181`	`+/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).`
	`182`	`+///`
	`183`	`+/// The concatenation of the substrings returned by this function is just the original string.`
	`184`	`+fnunicode_sentences<'a>(&'aself) ->UnicodeSentences<'a>;`
	`185`	`+`
	`186`	+/// Returns an iterator over substrings of `self` separated on
	`187`	`+/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).`
	`188`	`+///`
	`189`	`+/// Here, "sentences" are just those substrings which, after splitting on`
	`190`	`+/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the`
	`191`	`+/// substring must contain at least one character with the`
	`192`	`+/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)`
	`193`	`+/// property, or with`
	`194`	`+/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).`
	`195`	`+fnsplit_sentence_bounds<'a>(&'aself) ->USentenceBounds<'a>;`
	`196`	`+`
	`197`	+/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
	`198`	+/// and their offsets. See `split_sentence_bounds()` for more information.
	`199`	`+fnsplit_sentence_bound_indices<'a>(&'aself) ->USentenceBoundIndices<'a>;`
`177`	`200`	`}`
`178`	`201`
`179`	`202`	`implUnicodeSegmentationforstr{`
`@@ -201,4 +224,19 @@ impl UnicodeSegmentation for str {`
`201`	`224`	`fnsplit_word_bound_indices(&self) ->UWordBoundIndices{`
`202`	`225`	`word::new_word_bound_indices(self)`
`203`	`226`	`}`
	`227`	`+`
	`228`	`+#[inline]`
	`229`	`+fnunicode_sentences(&self) ->UnicodeSentences{`
	`230`	`+ sentence::new_unicode_sentences(self)`
	`231`	`+}`
	`232`	`+`
	`233`	`+#[inline]`
	`234`	`+fnsplit_sentence_bounds(&self) ->USentenceBounds{`
	`235`	`+ sentence::new_sentence_bounds(self)`
	`236`	`+}`
	`237`	`+`
	`238`	`+#[inline]`
	`239`	`+fnsplit_sentence_bound_indices(&self) ->USentenceBoundIndices{`
	`240`	`+ sentence::new_sentence_bound_indices(self)`
	`241`	`+}`
`204`	`242`	`}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit c7a6b6f

File tree

8 files changed

8 files changed

`‎Cargo.toml‎`

`‎scripts/unicode.py‎`

`‎scripts/unicode_gen_breaktests.py‎`

`‎src/lib.rs‎`

0 commit comments