Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Add unicode_word_indices#91

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion src/lib.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -66,7 +66,7 @@ extern crate quickcheck;
pub use grapheme::{Graphemes, GraphemeIndices};
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};

mod grapheme;
Expand DownExpand Up@@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {
/// ```
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;

/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
/// byte offsets.
///
/// Each item is a `(usize, &str)` pair: the byte offset at which the word starts
/// in `self`, followed by the word itself.
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
///
/// assert_eq!(&uwi1[..], b);
/// ```
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
Expand DownExpand Up@@ -249,6 +273,11 @@ impl UnicodeSegmentation for str {
word::new_unicode_words(self)
}

// Thin forwarding impl: the `word` module's constructor builds the iterator over
// `self`; this method only exposes it through the `UnicodeSegmentation` trait.
#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
word::new_unicode_word_indices(self)
}

#[inline]
fn split_word_bounds(&self) -> UWordBounds {
word::new_word_bounds(self)
Expand Down
46 changes: 42 additions & 4 deletions src/word.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
}

/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring.
///
/// Each item is a `(usize, &str)` pair: the byte offset of the substring, followed by
/// the substring itself.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWordIndices<'a> {
// The predicate is stored as a plain `fn` pointer (not a closure type) so that the
// full `Filter<..>` type can be written out in this field declaration.
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}

impl<'a> Iterator for UnicodeWordIndices<'a> {
    /// An `(offset, substring)` pair, as produced by the wrapped filtered iterator.
    type Item = (usize, &'a str);

    /// Yields the next item from the front by delegating to the wrapped iterator.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next()
    }
}
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
    /// Yields the next item from the back by delegating to the wrapped iterator.
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        self.inner.next_back()
    }
}

/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
Expand DownExpand Up@@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
}

#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
use super::UnicodeSegmentation;
fn has_alphanumeric(s: &&str) -> bool {
use tables::util::is_alphanumeric;

fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
s.chars().any(|c| is_alphanumeric(c))
}

/// Builds a [`UnicodeWords`] iterator over `s`.
///
/// The string is split on word boundaries via `split_word_bounds`, and only the
/// segments accepted by `has_alphanumeric` are kept.
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
    use super::UnicodeSegmentation;

    // Name the predicate as a plain fn pointer so the resulting `Filter` has a
    // spellable type for the `inner` field.
    let keep_segment: fn(&&str) -> bool = has_alphanumeric;
    let inner = s.split_word_bounds().filter(keep_segment);

    UnicodeWords { inner }
}

/// Builds a [`UnicodeWordIndices`] iterator over `s`.
///
/// The string is split via `split_word_bound_indices`, and only the
/// `(offset, segment)` pairs whose segment is accepted by `has_alphanumeric`
/// are kept.
#[inline]
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
    use super::UnicodeSegmentation;

    // Adapter from the `(offset, segment)` item shape to the `&&str` predicate;
    // only the segment half of the pair is inspected.
    fn segment_has_alphanumeric(entry: &(usize, &str)) -> bool {
        has_alphanumeric(&entry.1)
    }
    let keep_entry: fn(&(usize, &str)) -> bool = segment_has_alphanumeric;

    UnicodeWordIndices { inner: s.split_word_bound_indices().filter(keep_entry) }
}

[8]ページ先頭

©2009-2025 Movatter.jp