Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit cea3ce6

Browse files
authored
Merge pull request #91 from basile-henry/basile/unicode-word-indices
Add unicode_word_indices
2 parents 247c0b1 + 8bd6e3a, commit cea3ce6

File tree

2 files changed

+72
-5
lines changed

2 files changed

+72
-5
lines changed

‎src/lib.rs

Lines changed: 30 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ extern crate quickcheck;
6666
pub use grapheme::{Graphemes, GraphemeIndices};
6767
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
6868
pub use tables::UNICODE_VERSION;
69-
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
69+
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
7070
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
7171

7272
mod grapheme;
@@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {
146146
/// ```
147147
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
148148

149+
/// Returns an iterator over the words of `self`, separated on
150+
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
151+
/// offsets.
152+
///
153+
/// Here, "words" are just those substrings which, after splitting on
154+
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
155+
/// substring must contain at least one character with the
156+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
157+
/// property, or with
158+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
159+
///
160+
/// # Example
161+
///
162+
/// ```
163+
/// # use self::unicode_segmentation::UnicodeSegmentation;
164+
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
165+
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
166+
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
167+
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
168+
///
169+
/// assert_eq!(&uwi1[..], b);
170+
/// ```
171+
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
172+
149173
/// Returns an iterator over substrings of `self` separated on
150174
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
151175
///
@@ -249,6 +273,11 @@ impl UnicodeSegmentation for str {
249273
word::new_unicode_words(self)
250274
}
251275

276+
#[inline]
277+
fn unicode_word_indices(&self) -> UnicodeWordIndices {
278+
word::new_unicode_word_indices(self)
279+
}
280+
252281
#[inline]
253282
fn split_word_bounds(&self) -> UWordBounds {
254283
word::new_word_bounds(self)

‎src/word.rs

Lines changed: 42 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
4040
fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
4141
}
4242

43+
/// An iterator over the substrings of a string which, after splitting the string on
44+
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
45+
/// contain any characters with the
46+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
47+
/// property, or with
48+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
49+
/// This iterator also provides the byte offsets for each substring.
50+
///
51+
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
52+
/// its documentation for more.
53+
///
54+
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
55+
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
56+
pub struct UnicodeWordIndices<'a> {
57+
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
58+
}
59+
60+
impl<'a> Iterator for UnicodeWordIndices<'a> {
61+
type Item = (usize, &'a str);
62+
63+
#[inline]
64+
fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() }
65+
}
66+
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
67+
#[inline]
68+
fn next_back(&mut self) -> Option<(usize, &'a str)> { self.inner.next_back() }
69+
}
70+
4371
/// External iterator for a string's
4472
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
4573
///
@@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
671699
}
672700

673701
#[inline]
674-
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
675-
use super::UnicodeSegmentation;
702+
fn has_alphanumeric(s: &&str) -> bool {
676703
use tables::util::is_alphanumeric;
677704

678-
fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
679-
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
705+
s.chars().any(|c| is_alphanumeric(c))
706+
}
707+
708+
#[inline]
709+
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
710+
use super::UnicodeSegmentation;
680711

681712
UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
682713
}
714+
715+
#[inline]
716+
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
717+
use super::UnicodeSegmentation;
718+
719+
UnicodeWordIndices { inner: s.split_word_bound_indices().filter(|(_, c)| has_alphanumeric(c)) }
720+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp