Jul 28, 2025 · Jul 11, 2025 · Jul 11, 2025 · Jul 11, 2025 · Jul 13, 2025 · Jul 15, 2025
diff --git a/Cargo.toml b/Cargo.toml
 [dev-dependencies]
 quickcheck = "0.7"
 criterion = "0.5"
 proptest = "1.7.0"

 [[bench]]
 name = "chars"
 [[bench]]
 name = "word_bounds"
 harness = false

 [[bench]]
 name = "unicode_word_indices"
 harness = false

diff --git a/benches/chars.rs b/benches/chars.rs
    for file in FILES {
        group.bench_with_input(
            BenchmarkId::new("grapheme", file),
            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
            |b, content| b.iter(|| grapheme(content)),
        );
    }

    for file in FILES {
        group.bench_with_input(
            BenchmarkId::new("scalar", file),
            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
            |b, content| b.iter(|| scalar(content)),
        );
    }
diff --git a/benches/texts/log.txt b/benches/texts/log.txt
 2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later
diff --git a/benches/unicode_word_indices.rs b/benches/unicode_word_indices.rs
 use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

 use std::fs;
 use unicode_segmentation::UnicodeSegmentation;

 /// Base names of the text fixtures benchmarked by `bench_all`; each one is
 /// loaded from `benches/texts/<name>.txt`. Commented-out entries are
 /// currently disabled and are not benchmarked.
 const FILES: &[&str] = &[
    "log", //"arabic",
    "english",
    //"hindi",
    "japanese",
    //"korean",
    //"mandarin",
    //"russian",
    //"source_code",
 ];

 /// Drains the iterator returned by `unicode_word_indices` for `text`,
 /// passing every yielded item through `black_box` so the optimizer cannot
 /// discard the iteration work being measured.
 ///
 /// NOTE(review): despite its name, this helper exercises
 /// `unicode_word_indices`, not grapheme iteration.
 #[inline(always)]
 fn grapheme(text: &str) {
    text.unicode_word_indices().for_each(|item| {
        black_box(item);
    });
 }

 /// Registers one benchmark per entry of `FILES` in the
 /// `unicode_word_indices` benchmark group.
 ///
 /// Each fixture is read once, outside the timed closure, and its byte length
 /// is passed to Criterion as the group's throughput before the benchmark for
 /// that fixture is registered.
 ///
 /// # Panics
 ///
 /// Panics if a fixture file cannot be read; the message names the path that
 /// failed.
 fn bench_all(c: &mut Criterion) {
    let mut group = c.benchmark_group("unicode_word_indices");

    for file in FILES {
        let path = format!("benches/texts/{file}.txt");
        // A bare `unwrap()` would hide which fixture is missing; include the path.
        let input = fs::read_to_string(&path)
            .unwrap_or_else(|err| panic!("failed to read benchmark fixture {path}: {err}"));
        // Tell Criterion how many bytes a single iteration processes.
        group.throughput(criterion::Throughput::Bytes(input.len() as u64));
        group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| {
            b.iter(|| grapheme(content))
        });
    }

    // Finish the group explicitly instead of relying on the implicit drop.
    group.finish();
 }

 criterion_group!(benches, bench_all);
 criterion_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
    for file in FILES {
        group.bench_with_input(
            BenchmarkId::new("grapheme", file),
            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
            &fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(),
            |b, content| b.iter(|| grapheme(content)),
        );
    }
diff --git a/benches/words.rs b/benches/words.rs
    for file in FILES {
        group.bench_with_input(
            BenchmarkId::new("grapheme", file),
            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
            |b, content| b.iter(|| grapheme(content)),
        );
    }

    for file in FILES {
        group.bench_with_input(
            BenchmarkId::new("scalar", file),
            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
            |b, content| b.iter(|| scalar(content)),
        );
    }
diff --git a/src/lib.rs b/src/lib.rs
 )]
 #![no_std]

 #[cfg(test)]
 extern crate std;

 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use grapheme::{GraphemeIndices, Graphemes};
 pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
 pub use tables::UNICODE_VERSION;
 pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
 pub use word::{UWordBoundIndices, UWordBounds};

 use crate::word::{UnicodeWordIndices, UnicodeWords};

 mod grapheme;
 mod sentence;

 impl UnicodeSegmentation for str {
    #[inline]
    fn graphemes(&self, is_extended: bool) -> Graphemes {
    fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
        grapheme::new_graphemes(self, is_extended)
    }

    }

    #[inline]
    fn unicode_words(&self) -> UnicodeWords {
    fn unicode_words(&self) -> UnicodeWords<'_> {
        word::new_unicode_words(self)
    }

    #[inline]
    fn unicode_word_indices(&self) -> UnicodeWordIndices {
    fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
        word::new_unicode_word_indices(self)
    }

    #[inline]
    fn split_word_bounds(&self) -> UWordBounds {
    fn split_word_bounds(&self) -> UWordBounds<'_> {
        word::new_word_bounds(self)
    }

    #[inline]
    fn split_word_bound_indices(&self) -> UWordBoundIndices {
    fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
        word::new_word_bound_indices(self)
    }

    #[inline]
    fn unicode_sentences(&self) -> UnicodeSentences {
    fn unicode_sentences(&self) -> UnicodeSentences<'_> {
        sentence::new_unicode_sentences(self)
    }

    #[inline]
    fn split_sentence_bounds(&self) -> USentenceBounds {
    fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
        sentence::new_sentence_bounds(self)
    }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
		[dev-dependencies]
		quickcheck = "0.7"
		criterion = "0.5"
		proptest = "1.7.0"

		[[bench]]
		name = "chars"
Expand All		@@ -36,3 +37,8 @@ harness = false
		[[bench]]
		name = "word_bounds"
		harness = false

		[[bench]]
		name = "unicode_word_indices"
		harness = false
Original file line number	Diff line number	Diff line change
Expand Up		@@ -41,15 +41,15 @@ fn bench_all(c: &mut Criterion) {
		for file in FILES {
		group.bench_with_input(
		BenchmarkId::new("grapheme", file),
		&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
		&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
		\|b, content\| b.iter(\|\| grapheme(content)),
		);
		}

		for file in FILES {
		group.bench_with_input(
		BenchmarkId::new("scalar", file),
		&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
		&fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
		\|b, content\| b.iter(\|\| scalar(content)),
		);
		}
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		2018-07-12 13:59:01 UTC \| ERROR \| (worker.go:131 in process) \| Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,37 @@
		use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

		use std::fs;
		use unicode_segmentation::UnicodeSegmentation;

		const FILES: &[&str] = &[
		"log", //"arabic",
		"english",
		//"hindi",
		"japanese",
		//"korean",
		//"mandarin",
		//"russian",
		//"source_code",
		];

		#[inline(always)]
		fn grapheme(text: &str) {
		for w in text.unicode_word_indices() {
		black_box(w);
		}
		}

		fn bench_all(c: &mut Criterion) {
		let mut group = c.benchmark_group("unicode_word_indices");

		for file in FILES {
		let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap();
		group.throughput(criterion::Throughput::Bytes(input.len() as u64));
		group.bench_with_input(BenchmarkId::from_parameter(file), &input, \|b, content\| {
		b.iter(\|\| grapheme(content))
		});
		}
		}

		criterion_group!(benches, bench_all);
		criterion_main!(benches);
Original file line number	Diff line number	Diff line change
Expand Up		@@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {
		for file in FILES {
		group.bench_with_input(
		BenchmarkId::new("grapheme", file),
		&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
		&fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(),
		\|b, content\| b.iter(\|\| grapheme(content)),
		);
		}
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -56,11 +56,16 @@
		)]
		#![no_std]

		#[cfg(test)]
		extern crate std;

		pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
		pub use grapheme::{GraphemeIndices, Graphemes};
		pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
		pub use tables::UNICODE_VERSION;
		pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
		pub use word::{UWordBoundIndices, UWordBounds};

		use crate::word::{UnicodeWordIndices, UnicodeWords};

		mod grapheme;
		mod sentence;
Expand DownExpand Up		@@ -248,7 +253,7 @@ pub trait UnicodeSegmentation {

		impl UnicodeSegmentation for str {
		#[inline]
		fn graphemes(&self, is_extended: bool) -> Graphemes {
		fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
		grapheme::new_graphemes(self, is_extended)
		}

Expand All		@@ -258,32 +263,32 @@ impl UnicodeSegmentation for str {
		}

		#[inline]
		fn unicode_words(&self) -> UnicodeWords {
		fn unicode_words(&self) -> UnicodeWords<'_> {
		word::new_unicode_words(self)
		}

		#[inline]
		fn unicode_word_indices(&self) -> UnicodeWordIndices {
		fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
		word::new_unicode_word_indices(self)
		}

		#[inline]
		fn split_word_bounds(&self) -> UWordBounds {
		fn split_word_bounds(&self) -> UWordBounds<'_> {
		word::new_word_bounds(self)
		}

		#[inline]
		fn split_word_bound_indices(&self) -> UWordBoundIndices {
		fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
		word::new_word_bound_indices(self)
		}

		#[inline]
		fn unicode_sentences(&self) -> UnicodeSentences {
		fn unicode_sentences(&self) -> UnicodeSentences<'_> {
		sentence::new_unicode_sentences(self)
		}

		#[inline]
		fn split_sentence_bounds(&self) -> USentenceBounds {
		fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
		sentence::new_sentence_bounds(self)
		}

Expand Down