Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit98fd158

Browse files
committed
Optimization for grapheme iteration.
Typical text involves a lot of chars one-after-another thatare in a similar scalar value range and in the same category. Thiscommit takes advantage of that to speed up grapheme iteration bycaching the range and category of the last char, and always checkingagainst that cache first before doing a binary search on the wholegrapheme category table.This results in significant speed ups on many texts, in some casesup to 2x faster.
1 parentac54e2d commit98fd158

File tree

5 files changed

+89
-46
lines changed

5 files changed

+89
-46
lines changed

‎scripts/unicode.py‎

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -280,22 +280,28 @@ def emit_break_module(f, break_table, break_cats, name):
280280
f.write((" %sC_"%Name[0])+cat+",\n")
281281
f.write(""" }
282282
283-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
283+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) ->(u32, u32,%sCat) {
284284
use core::cmp::Ordering::{Equal, Less, Greater};
285285
match r.binary_search_by(|&(lo, hi, _)| {
286286
if lo <= c && c <= hi { Equal }
287287
else if hi < c { Less }
288288
else { Greater }
289289
}) {
290290
Ok(idx) => {
291-
let (_, _, cat) = r[idx];
292-
cat
291+
let (lower, upper, cat) = r[idx];
292+
(lower as u32, upper as u32, cat)
293+
}
294+
Err(idx) => {
295+
(
296+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
297+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
298+
%sC_Any,
299+
)
293300
}
294-
Err(_) => %sC_Any
295301
}
296302
}
297303
298-
pub fn %s_category(c: char) -> %sCat {
304+
pub fn %s_category(c: char) ->(u32, u32,%sCat) {
299305
bsearch_range_value_table(c, %s_cat_table)
300306
}
301307

‎src/grapheme.rs‎

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ pub struct GraphemeCursor {
178178
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
179179
// to needing more input.
180180
resuming:bool,
181+
// Cached grapheme category and associated scalar value range.
182+
grapheme_cat_cache:(u32,u32,GraphemeCat),
181183
}
182184

183185
/// An error return indicating that not enough content was available in the
@@ -276,9 +278,20 @@ impl GraphemeCursor {
276278
pre_context_offset:None,
277279
ris_count:None,
278280
resuming:false,
281+
grapheme_cat_cache:(0,0,GraphemeCat::GC_Control),
279282
}
280283
}
281284

285+
fngrapheme_category(&mutself,ch:char) ->GraphemeCat{
286+
use tables::graphemeas gr;
287+
// If this char isn't within the cached range, update the cache to the
288+
// range that includes it.
289+
if(chasu32) <self.grapheme_cat_cache.0 ||(chasu32) >self.grapheme_cat_cache.1{
290+
self.grapheme_cat_cache = gr::grapheme_category(ch);
291+
}
292+
self.grapheme_cat_cache.2
293+
}
294+
282295
// Not sure I'm gonna keep this, the advantage over new() seems thin.
283296

284297
/// Set the cursor to a new location in the same string.
@@ -349,7 +362,7 @@ impl GraphemeCursor {
349362
self.pre_context_offset =None;
350363
ifself.is_extended && chunk_start + chunk.len() ==self.offset{
351364
let ch = chunk.chars().rev().next().unwrap();
352-
ifgr::grapheme_category(ch) == gr::GC_Prepend{
365+
ifself.grapheme_category(ch) == gr::GC_Prepend{
353366
self.decide(false);// GB9b
354367
return;
355368
}
@@ -359,7 +372,7 @@ impl GraphemeCursor {
359372
GraphemeState::Emoji =>self.handle_emoji(chunk, chunk_start),
360373
_ =>ifself.cat_before.is_none() &&self.offset == chunk.len() + chunk_start{
361374
let ch = chunk.chars().rev().next().unwrap();
362-
self.cat_before =Some(gr::grapheme_category(ch));
375+
self.cat_before =Some(self.grapheme_category(ch));
363376
},
364377
}
365378
}
@@ -393,7 +406,7 @@ impl GraphemeCursor {
393406
use tables::graphemeas gr;
394407
letmut ris_count =self.ris_count.unwrap_or(0);
395408
for chin chunk.chars().rev(){
396-
ifgr::grapheme_category(ch) != gr::GC_Regional_Indicator{
409+
ifself.grapheme_category(ch) != gr::GC_Regional_Indicator{
397410
self.ris_count =Some(ris_count);
398411
self.decide((ris_count %2) ==0);
399412
return;
@@ -413,13 +426,13 @@ impl GraphemeCursor {
413426
use tables::graphemeas gr;
414427
letmut iter = chunk.chars().rev();
415428
ifletSome(ch) = iter.next(){
416-
ifgr::grapheme_category(ch) != gr::GC_ZWJ{
429+
ifself.grapheme_category(ch) != gr::GC_ZWJ{
417430
self.decide(true);
418431
return;
419432
}
420433
}
421434
for chin iter{
422-
matchgr::grapheme_category(ch){
435+
matchself.grapheme_category(ch){
423436
gr::GC_Extend =>(),
424437
gr::GC_Extended_Pictographic =>{
425438
self.decide(false);
@@ -481,7 +494,7 @@ impl GraphemeCursor {
481494
let offset_in_chunk =self.offset - chunk_start;
482495
ifself.cat_after.is_none(){
483496
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
484-
self.cat_after =Some(gr::grapheme_category(ch));
497+
self.cat_after =Some(self.grapheme_category(ch));
485498
}
486499
ifself.offset == chunk_start{
487500
letmut need_pre_context =true;
@@ -497,7 +510,7 @@ impl GraphemeCursor {
497510
}
498511
ifself.cat_before.is_none(){
499512
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
500-
self.cat_before =Some(gr::grapheme_category(ch));
513+
self.cat_before =Some(self.grapheme_category(ch));
501514
}
502515
matchcheck_pair(self.cat_before.unwrap(),self.cat_after.unwrap()){
503516
PairResult::NotBreak =>returnself.decision(false),
@@ -562,14 +575,14 @@ impl GraphemeCursor {
562575
loop{
563576
ifself.resuming{
564577
ifself.cat_after.is_none(){
565-
self.cat_after =Some(gr::grapheme_category(ch));
578+
self.cat_after =Some(self.grapheme_category(ch));
566579
}
567580
}else{
568581
self.offset += ch.len_utf8();
569582
self.state =GraphemeState::Unknown;
570583
self.cat_before =self.cat_after.take();
571584
ifself.cat_before.is_none(){
572-
self.cat_before =Some(gr::grapheme_category(ch));
585+
self.cat_before =Some(self.grapheme_category(ch));
573586
}
574587
ifself.cat_before.unwrap() ==GraphemeCat::GC_Regional_Indicator{
575588
self.ris_count =self.ris_count.map(|c| c +1);
@@ -578,7 +591,7 @@ impl GraphemeCursor {
578591
}
579592
ifletSome(next_ch) = iter.next(){
580593
ch = next_ch;
581-
self.cat_after =Some(gr::grapheme_category(ch));
594+
self.cat_after =Some(self.grapheme_category(ch));
582595
}elseifself.offset ==self.len{
583596
self.decide(true);
584597
}else{
@@ -644,7 +657,7 @@ impl GraphemeCursor {
644657
returnErr(GraphemeIncomplete::PrevChunk);
645658
}
646659
ifself.resuming{
647-
self.cat_before =Some(gr::grapheme_category(ch));
660+
self.cat_before =Some(self.grapheme_category(ch));
648661
}else{
649662
self.offset -= ch.len_utf8();
650663
self.cat_after =self.cat_before.take();
@@ -654,12 +667,12 @@ impl GraphemeCursor {
654667
}
655668
ifletSome(prev_ch) = iter.next(){
656669
ch = prev_ch;
657-
self.cat_before =Some(gr::grapheme_category(ch));
670+
self.cat_before =Some(self.grapheme_category(ch));
658671
}elseifself.offset ==0{
659672
self.decide(true);
660673
}else{
661674
self.resuming =true;
662-
self.cat_after =Some(gr::grapheme_category(ch));
675+
self.cat_after =Some(self.grapheme_category(ch));
663676
returnErr(GraphemeIncomplete::PrevChunk);
664677
}
665678
}

‎src/sentence.rs‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ mod fwd {
115115

116116
for next_charin ahead.chars(){
117117
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
118-
match se::sentence_category(next_char){
118+
match se::sentence_category(next_char).2{
119119
se::SC_Lower =>returntrue,
120120
se::SC_OLetter |
121121
se::SC_Upper |
@@ -182,7 +182,7 @@ mod fwd {
182182
let position_before =self.pos;
183183
let state_before =self.state.clone();
184184

185-
let next_cat = se::sentence_category(next_char);
185+
let next_cat = se::sentence_category(next_char).2;
186186

187187
self.pos += next_char.len_utf8();
188188
self.state =self.state.next(next_cat);

‎src/tables.rs‎

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -345,22 +345,28 @@ pub mod grapheme {
345345
GC_ZWJ,
346346
}
347347

348-
fnbsearch_range_value_table(c:char,r:&'static[(char,char,GraphemeCat)]) ->GraphemeCat{
348+
fnbsearch_range_value_table(c:char,r:&'static[(char,char,GraphemeCat)]) ->(u32,u32,GraphemeCat){
349349
use core::cmp::Ordering::{Equal,Less,Greater};
350350
match r.binary_search_by(|&(lo, hi, _)|{
351351
if lo <= c && c <= hi{Equal}
352352
elseif hi < c{Less}
353353
else{Greater}
354354
}){
355355
Ok(idx) =>{
356-
let(_, _, cat) = r[idx];
357-
cat
356+
let(lower, upper, cat) = r[idx];
357+
(lowerasu32, upperasu32, cat)
358+
}
359+
Err(idx) =>{
360+
(
361+
if idx >0{ r[idx-1].1asu32 +1}else{0},
362+
r.get(idx).map(|c|c.0asu32 -1).unwrap_or(core::u32::MAX),
363+
GC_Any,
364+
)
358365
}
359-
Err(_) =>GC_Any
360366
}
361367
}
362368

363-
pubfngrapheme_category(c:char) ->GraphemeCat{
369+
pubfngrapheme_category(c:char) ->(u32,u32,GraphemeCat){
364370
bsearch_range_value_table(c, grapheme_cat_table)
365371
}
366372

@@ -980,22 +986,28 @@ pub mod word {
980986
WC_ZWJ,
981987
}
982988

983-
fnbsearch_range_value_table(c:char,r:&'static[(char,char,WordCat)]) ->WordCat{
989+
fnbsearch_range_value_table(c:char,r:&'static[(char,char,WordCat)]) ->(u32,u32,WordCat){
984990
use core::cmp::Ordering::{Equal,Less,Greater};
985991
match r.binary_search_by(|&(lo, hi, _)|{
986992
if lo <= c && c <= hi{Equal}
987993
elseif hi < c{Less}
988994
else{Greater}
989995
}){
990996
Ok(idx) =>{
991-
let(_, _, cat) = r[idx];
992-
cat
997+
let(lower, upper, cat) = r[idx];
998+
(lowerasu32, upperasu32, cat)
999+
}
1000+
Err(idx) =>{
1001+
(
1002+
if idx >0{ r[idx-1].1asu32 +1}else{0},
1003+
r.get(idx).map(|c|c.0asu32 -1).unwrap_or(core::u32::MAX),
1004+
WC_Any,
1005+
)
9931006
}
994-
Err(_) =>WC_Any
9951007
}
9961008
}
9971009

998-
pubfnword_category(c:char) ->WordCat{
1010+
pubfnword_category(c:char) ->(u32,u32,WordCat){
9991011
bsearch_range_value_table(c, word_cat_table)
10001012
}
10011013

@@ -1439,22 +1451,28 @@ pub mod emoji {
14391451
EC_Extended_Pictographic,
14401452
}
14411453

1442-
fnbsearch_range_value_table(c:char,r:&'static[(char,char,EmojiCat)]) ->EmojiCat{
1454+
fnbsearch_range_value_table(c:char,r:&'static[(char,char,EmojiCat)]) ->(u32,u32,EmojiCat){
14431455
use core::cmp::Ordering::{Equal,Less,Greater};
14441456
match r.binary_search_by(|&(lo, hi, _)|{
14451457
if lo <= c && c <= hi{Equal}
14461458
elseif hi < c{Less}
14471459
else{Greater}
14481460
}){
14491461
Ok(idx) =>{
1450-
let(_, _, cat) = r[idx];
1451-
cat
1462+
let(lower, upper, cat) = r[idx];
1463+
(lowerasu32, upperasu32, cat)
1464+
}
1465+
Err(idx) =>{
1466+
(
1467+
if idx >0{ r[idx-1].1asu32 +1}else{0},
1468+
r.get(idx).map(|c|c.0asu32 -1).unwrap_or(core::u32::MAX),
1469+
EC_Any,
1470+
)
14521471
}
1453-
Err(_) =>EC_Any
14541472
}
14551473
}
14561474

1457-
pubfnemoji_category(c:char) ->EmojiCat{
1475+
pubfnemoji_category(c:char) ->(u32,u32,EmojiCat){
14581476
bsearch_range_value_table(c, emoji_cat_table)
14591477
}
14601478

@@ -1535,22 +1553,28 @@ pub mod sentence {
15351553
SC_Upper,
15361554
}
15371555

1538-
fnbsearch_range_value_table(c:char,r:&'static[(char,char,SentenceCat)]) ->SentenceCat{
1556+
fnbsearch_range_value_table(c:char,r:&'static[(char,char,SentenceCat)]) ->(u32,u32,SentenceCat){
15391557
use core::cmp::Ordering::{Equal,Less,Greater};
15401558
match r.binary_search_by(|&(lo, hi, _)|{
15411559
if lo <= c && c <= hi{Equal}
15421560
elseif hi < c{Less}
15431561
else{Greater}
15441562
}){
15451563
Ok(idx) =>{
1546-
let(_, _, cat) = r[idx];
1547-
cat
1564+
let(lower, upper, cat) = r[idx];
1565+
(lowerasu32, upperasu32, cat)
1566+
}
1567+
Err(idx) =>{
1568+
(
1569+
if idx >0{ r[idx-1].1asu32 +1}else{0},
1570+
r.get(idx).map(|c|c.0asu32 -1).unwrap_or(core::u32::MAX),
1571+
SC_Any,
1572+
)
15481573
}
1549-
Err(_) =>SC_Any
15501574
}
15511575
}
15521576

1553-
pubfnsentence_category(c:char) ->SentenceCat{
1577+
pubfnsentence_category(c:char) ->(u32,u32,SentenceCat){
15541578
bsearch_range_value_table(c, sentence_cat_table)
15551579
}
15561580

‎src/word.rs‎

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ enum RegionalState {
125125

126126
fnis_emoji(ch:char) ->bool{
127127
use tables::emoji;
128-
emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
128+
emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
129129
}
130130

131131
impl<'a>IteratorforUWordBounds<'a>{
@@ -164,7 +164,7 @@ impl<'a> Iterator for UWordBounds<'a> {
164164
prev_zwj = cat == wd::WC_ZWJ;
165165
// if there's a category cached, grab it
166166
cat =matchself.cat{
167-
None => wd::word_category(ch),
167+
None => wd::word_category(ch).2,
168168
_ =>self.cat.take().unwrap()
169169
};
170170
take_cat =true;
@@ -391,7 +391,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
391391

392392
// if there's a category cached, grab it
393393
cat =matchself.catb{
394-
None => wd::word_category(ch),
394+
None => wd::word_category(ch).2,
395395
_ =>self.catb.take().unwrap()
396396
};
397397
take_cat =true;
@@ -533,7 +533,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
533533
if regional_state ==RegionalState::Unknown{
534534
let count =self.string[..previdx]
535535
.chars().rev()
536-
.map(|c| wd::word_category(c))
536+
.map(|c| wd::word_category(c).2)
537537
.filter(|&c| !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
538538
.take_while(|&c| c == wd::WC_Regional_Indicator)
539539
.count();
@@ -624,7 +624,7 @@ impl<'a> UWordBounds<'a> {
624624
let nidx = idx +self.string[idx..].chars().next().unwrap().len_utf8();
625625
if nidx <self.string.len(){
626626
let nch =self.string[nidx..].chars().next().unwrap();
627-
Some(wd::word_category(nch))
627+
Some(wd::word_category(nch).2)
628628
}else{
629629
None
630630
}
@@ -635,7 +635,7 @@ impl<'a> UWordBounds<'a> {
635635
use tables::wordas wd;
636636
if idx >0{
637637
let nch =self.string[..idx].chars().next_back().unwrap();
638-
Some(wd::word_category(nch))
638+
Some(wd::word_category(nch).2)
639639
}else{
640640
None
641641
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp