Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Optimization for grapheme iteration. #77

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Manishearth merged 2 commits into unicode-rs:master from cessen:faster_graphemes
Feb 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions scripts/unicode.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -280,22 +280,29 @@ def emit_break_module(f, break_table, break_cats, name):
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }

fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, cat) = r[idx];
cat
let (lower, upper, cat) = r[idx];
(lower as u32, upper as u32, cat)
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
%sC_Any,
)
}
Err(_) => %sC_Any
}
}

pub fn %s_category(c: char) -> %sCat {
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
bsearch_range_value_table(c, %s_cat_table)
}

Expand Down
41 changes: 26 additions & 15 deletions src/grapheme.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -178,6 +178,8 @@ pub struct GraphemeCursor {
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
// to needing more input.
resuming: bool,
// Cached grapheme category and associated scalar value range.
grapheme_cat_cache: (u32, u32, GraphemeCat),
}

/// An error return indicating that not enough content was available in the
Expand DownExpand Up@@ -276,9 +278,20 @@ impl GraphemeCursor {
pre_context_offset: None,
ris_count: None,
resuming: false,
grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
}
}

fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
use tables::grapheme as gr;
// If this char isn't within the cached range, update the cache to the
// range that includes it.
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
self.grapheme_cat_cache = gr::grapheme_category(ch);
}
self.grapheme_cat_cache.2
}

// Not sure I'm gonna keep this, the advantage over new() seems thin.

/// Set the cursor to a new location in the same string.
Expand DownExpand Up@@ -349,7 +362,7 @@ impl GraphemeCursor {
self.pre_context_offset = None;
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().rev().next().unwrap();
if gr::grapheme_category(ch) == gr::GC_Prepend {
if self.grapheme_category(ch) == gr::GC_Prepend {
self.decide(false); // GB9b
return;
}
Expand All@@ -359,7 +372,7 @@ impl GraphemeCursor {
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
_ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
let ch = chunk.chars().rev().next().unwrap();
self.cat_before = Some(gr::grapheme_category(ch));
self.cat_before = Some(self.grapheme_category(ch));
},
}
}
Expand DownExpand Up@@ -393,7 +406,7 @@ impl GraphemeCursor {
use tables::grapheme as gr;
let mut ris_count = self.ris_count.unwrap_or(0);
for ch in chunk.chars().rev() {
if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
self.ris_count = Some(ris_count);
self.decide((ris_count % 2) == 0);
return;
Expand All@@ -413,13 +426,13 @@ impl GraphemeCursor {
use tables::grapheme as gr;
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
if gr::grapheme_category(ch) != gr::GC_ZWJ {
if self.grapheme_category(ch) != gr::GC_ZWJ {
self.decide(true);
return;
}
}
for ch in iter {
match gr::grapheme_category(ch) {
match self.grapheme_category(ch) {
gr::GC_Extend => (),
gr::GC_Extended_Pictographic => {
self.decide(false);
Expand DownExpand Up@@ -481,7 +494,7 @@ impl GraphemeCursor {
let offset_in_chunk = self.offset - chunk_start;
if self.cat_after.is_none() {
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
self.cat_after = Some(gr::grapheme_category(ch));
self.cat_after = Some(self.grapheme_category(ch));
}
if self.offset == chunk_start {
let mut need_pre_context = true;
Expand All@@ -497,7 +510,7 @@ impl GraphemeCursor {
}
if self.cat_before.is_none() {
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
self.cat_before = Some(gr::grapheme_category(ch));
self.cat_before = Some(self.grapheme_category(ch));
}
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
PairResult::NotBreak => return self.decision(false),
Expand DownExpand Up@@ -553,7 +566,6 @@ impl GraphemeCursor {
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
use tables::grapheme as gr;
if self.offset == self.len {
return Ok(None);
}
Expand All@@ -562,14 +574,14 @@ impl GraphemeCursor {
loop {
if self.resuming {
if self.cat_after.is_none() {
self.cat_after = Some(gr::grapheme_category(ch));
self.cat_after = Some(self.grapheme_category(ch));
}
} else {
self.offset += ch.len_utf8();
self.state = GraphemeState::Unknown;
self.cat_before = self.cat_after.take();
if self.cat_before.is_none() {
self.cat_before = Some(gr::grapheme_category(ch));
self.cat_before = Some(self.grapheme_category(ch));
}
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
self.ris_count = self.ris_count.map(|c| c + 1);
Expand All@@ -578,7 +590,7 @@ impl GraphemeCursor {
}
if let Some(next_ch) = iter.next() {
ch = next_ch;
self.cat_after = Some(gr::grapheme_category(ch));
self.cat_after = Some(self.grapheme_category(ch));
} else if self.offset == self.len {
self.decide(true);
} else {
Expand DownExpand Up@@ -629,7 +641,6 @@ impl GraphemeCursor {
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
use tables::grapheme as gr;
if self.offset == 0 {
return Ok(None);
}
Expand All@@ -644,7 +655,7 @@ impl GraphemeCursor {
return Err(GraphemeIncomplete::PrevChunk);
}
if self.resuming {
self.cat_before = Some(gr::grapheme_category(ch));
self.cat_before = Some(self.grapheme_category(ch));
} else {
self.offset -= ch.len_utf8();
self.cat_after = self.cat_before.take();
Expand All@@ -654,12 +665,12 @@ impl GraphemeCursor {
}
if let Some(prev_ch) = iter.next() {
ch = prev_ch;
self.cat_before = Some(gr::grapheme_category(ch));
self.cat_before = Some(self.grapheme_category(ch));
} else if self.offset == 0 {
self.decide(true);
} else {
self.resuming = true;
self.cat_after = Some(gr::grapheme_category(ch));
self.cat_after = Some(self.grapheme_category(ch));
return Err(GraphemeIncomplete::PrevChunk);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/sentence.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -115,7 +115,7 @@ mod fwd {

for next_char in ahead.chars() {
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
match se::sentence_category(next_char) {
match se::sentence_category(next_char).2 {
se::SC_Lower => return true,
se::SC_OLetter |
se::SC_Upper |
Expand DownExpand Up@@ -182,7 +182,7 @@ mod fwd {
let position_before = self.pos;
let state_before = self.state.clone();

let next_cat = se::sentence_category(next_char);
let next_cat = se::sentence_category(next_char).2;

self.pos += next_char.len_utf8();
self.state = self.state.next(next_cat);
Expand Down
68 changes: 48 additions & 20 deletions src/tables.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -345,22 +345,29 @@ pub mod grapheme {
GC_ZWJ,
}

fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) {
use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, cat) = r[idx];
cat
let (lower, upper, cat) = r[idx];
(lower as u32, upper as u32, cat)
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
GC_Any,
)
}
Err(_) => GC_Any
}
}

pub fn grapheme_category(c: char) -> GraphemeCat {
pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) {
bsearch_range_value_table(c, grapheme_cat_table)
}

Expand DownExpand Up@@ -980,22 +987,29 @@ pub mod word {
WC_ZWJ,
}

fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> WordCat {
fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) {
use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, cat) = r[idx];
cat
let (lower, upper, cat) = r[idx];
(lower as u32, upper as u32, cat)
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
WC_Any,
)
}
Err(_) => WC_Any
}
}

pub fn word_category(c: char) -> WordCat {
pub fn word_category(c: char) -> (u32, u32, WordCat) {
bsearch_range_value_table(c, word_cat_table)
}

Expand DownExpand Up@@ -1439,22 +1453,29 @@ pub mod emoji {
EC_Extended_Pictographic,
}

fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> EmojiCat {
fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) {
use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, cat) = r[idx];
cat
let (lower, upper, cat) = r[idx];
(lower as u32, upper as u32, cat)
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
EC_Any,
)
}
Err(_) => EC_Any
}
}

pub fn emoji_category(c: char) -> EmojiCat {
pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) {
bsearch_range_value_table(c, emoji_cat_table)
}

Expand DownExpand Up@@ -1535,22 +1556,29 @@ pub mod sentence {
SC_Upper,
}

fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> SentenceCat {
fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) {
use core;
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, cat) = r[idx];
cat
let (lower, upper, cat) = r[idx];
(lower as u32, upper as u32, cat)
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
SC_Any,
)
}
Err(_) => SC_Any
}
}

pub fn sentence_category(c: char) -> SentenceCat {
pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) {
bsearch_range_value_table(c, sentence_cat_table)
}

Expand Down
Loading

[8]ページ先頭

©2009-2025 Movatter.jp