Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitb82ed4d

Browse files
authored
Merge pull request#77 from cessen/faster_graphemes
Optimization for grapheme iteration.
2 parentsac54e2d +29ecd77 commitb82ed4d

File tree

5 files changed

+94
-48
lines changed

5 files changed

+94
-48
lines changed

‎scripts/unicode.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -280,22 +280,29 @@ def emit_break_module(f, break_table, break_cats, name):
280280
f.write((" %sC_"%Name[0])+cat+",\n")
281281
f.write(""" }
282282
283-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
283+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
284+
use core;
284285
use core::cmp::Ordering::{Equal, Less, Greater};
285286
match r.binary_search_by(|&(lo, hi, _)| {
286287
if lo <= c && c <= hi { Equal }
287288
else if hi < c { Less }
288289
else { Greater }
289290
}) {
290291
Ok(idx) => {
291-
let (_, _, cat) = r[idx];
292-
cat
292+
let (lower, upper, cat) = r[idx];
293+
(lower as u32, upper as u32, cat)
294+
}
295+
Err(idx) => {
296+
(
297+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
298+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
299+
%sC_Any,
300+
)
293301
}
294-
Err(_) => %sC_Any
295302
}
296303
}
297304
298-
pub fn %s_category(c: char) -> %sCat {
305+
pub fn %s_category(c: char) ->(u32, u32,%sCat) {
299306
bsearch_range_value_table(c, %s_cat_table)
300307
}
301308

‎src/grapheme.rs

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ pub struct GraphemeCursor {
178178
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
179179
// to needing more input.
180180
resuming:bool,
181+
// Cached grapheme category and associated scalar value range.
182+
grapheme_cat_cache:(u32,u32,GraphemeCat),
181183
}
182184

183185
/// An error return indicating that not enough content was available in the
@@ -276,9 +278,20 @@ impl GraphemeCursor {
276278
pre_context_offset:None,
277279
ris_count:None,
278280
resuming:false,
281+
grapheme_cat_cache:(0,0,GraphemeCat::GC_Control),
279282
}
280283
}
281284

285+
fngrapheme_category(&mutself,ch:char) ->GraphemeCat{
286+
use tables::graphemeas gr;
287+
// If this char isn't within the cached range, update the cache to the
288+
// range that includes it.
289+
if(chasu32) <self.grapheme_cat_cache.0 ||(chasu32) >self.grapheme_cat_cache.1{
290+
self.grapheme_cat_cache = gr::grapheme_category(ch);
291+
}
292+
self.grapheme_cat_cache.2
293+
}
294+
282295
// Not sure I'm gonna keep this, the advantage over new() seems thin.
283296

284297
/// Set the cursor to a new location in the same string.
@@ -349,7 +362,7 @@ impl GraphemeCursor {
349362
self.pre_context_offset =None;
350363
ifself.is_extended && chunk_start + chunk.len() ==self.offset{
351364
let ch = chunk.chars().rev().next().unwrap();
352-
ifgr::grapheme_category(ch) == gr::GC_Prepend{
365+
ifself.grapheme_category(ch) == gr::GC_Prepend{
353366
self.decide(false);// GB9b
354367
return;
355368
}
@@ -359,7 +372,7 @@ impl GraphemeCursor {
359372
GraphemeState::Emoji =>self.handle_emoji(chunk, chunk_start),
360373
_ =>ifself.cat_before.is_none() &&self.offset == chunk.len() + chunk_start{
361374
let ch = chunk.chars().rev().next().unwrap();
362-
self.cat_before =Some(gr::grapheme_category(ch));
375+
self.cat_before =Some(self.grapheme_category(ch));
363376
},
364377
}
365378
}
@@ -393,7 +406,7 @@ impl GraphemeCursor {
393406
use tables::graphemeas gr;
394407
letmut ris_count =self.ris_count.unwrap_or(0);
395408
for chin chunk.chars().rev(){
396-
ifgr::grapheme_category(ch) != gr::GC_Regional_Indicator{
409+
ifself.grapheme_category(ch) != gr::GC_Regional_Indicator{
397410
self.ris_count =Some(ris_count);
398411
self.decide((ris_count %2) ==0);
399412
return;
@@ -413,13 +426,13 @@ impl GraphemeCursor {
413426
use tables::graphemeas gr;
414427
letmut iter = chunk.chars().rev();
415428
ifletSome(ch) = iter.next(){
416-
ifgr::grapheme_category(ch) != gr::GC_ZWJ{
429+
ifself.grapheme_category(ch) != gr::GC_ZWJ{
417430
self.decide(true);
418431
return;
419432
}
420433
}
421434
for chin iter{
422-
matchgr::grapheme_category(ch){
435+
matchself.grapheme_category(ch){
423436
gr::GC_Extend =>(),
424437
gr::GC_Extended_Pictographic =>{
425438
self.decide(false);
@@ -481,7 +494,7 @@ impl GraphemeCursor {
481494
let offset_in_chunk =self.offset - chunk_start;
482495
ifself.cat_after.is_none(){
483496
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
484-
self.cat_after =Some(gr::grapheme_category(ch));
497+
self.cat_after =Some(self.grapheme_category(ch));
485498
}
486499
ifself.offset == chunk_start{
487500
letmut need_pre_context =true;
@@ -497,7 +510,7 @@ impl GraphemeCursor {
497510
}
498511
ifself.cat_before.is_none(){
499512
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
500-
self.cat_before =Some(gr::grapheme_category(ch));
513+
self.cat_before =Some(self.grapheme_category(ch));
501514
}
502515
matchcheck_pair(self.cat_before.unwrap(),self.cat_after.unwrap()){
503516
PairResult::NotBreak =>returnself.decision(false),
@@ -553,7 +566,6 @@ impl GraphemeCursor {
553566
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
554567
/// ```
555568
pubfnnext_boundary(&mutself,chunk:&str,chunk_start:usize) ->Result<Option<usize>,GraphemeIncomplete>{
556-
use tables::graphemeas gr;
557569
ifself.offset ==self.len{
558570
returnOk(None);
559571
}
@@ -562,14 +574,14 @@ impl GraphemeCursor {
562574
loop{
563575
ifself.resuming{
564576
ifself.cat_after.is_none(){
565-
self.cat_after =Some(gr::grapheme_category(ch));
577+
self.cat_after =Some(self.grapheme_category(ch));
566578
}
567579
}else{
568580
self.offset += ch.len_utf8();
569581
self.state =GraphemeState::Unknown;
570582
self.cat_before =self.cat_after.take();
571583
ifself.cat_before.is_none(){
572-
self.cat_before =Some(gr::grapheme_category(ch));
584+
self.cat_before =Some(self.grapheme_category(ch));
573585
}
574586
ifself.cat_before.unwrap() ==GraphemeCat::GC_Regional_Indicator{
575587
self.ris_count =self.ris_count.map(|c| c +1);
@@ -578,7 +590,7 @@ impl GraphemeCursor {
578590
}
579591
ifletSome(next_ch) = iter.next(){
580592
ch = next_ch;
581-
self.cat_after =Some(gr::grapheme_category(ch));
593+
self.cat_after =Some(self.grapheme_category(ch));
582594
}elseifself.offset ==self.len{
583595
self.decide(true);
584596
}else{
@@ -629,7 +641,6 @@ impl GraphemeCursor {
629641
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
630642
/// ```
631643
pubfnprev_boundary(&mutself,chunk:&str,chunk_start:usize) ->Result<Option<usize>,GraphemeIncomplete>{
632-
use tables::graphemeas gr;
633644
ifself.offset ==0{
634645
returnOk(None);
635646
}
@@ -644,7 +655,7 @@ impl GraphemeCursor {
644655
returnErr(GraphemeIncomplete::PrevChunk);
645656
}
646657
ifself.resuming{
647-
self.cat_before =Some(gr::grapheme_category(ch));
658+
self.cat_before =Some(self.grapheme_category(ch));
648659
}else{
649660
self.offset -= ch.len_utf8();
650661
self.cat_after =self.cat_before.take();
@@ -654,12 +665,12 @@ impl GraphemeCursor {
654665
}
655666
ifletSome(prev_ch) = iter.next(){
656667
ch = prev_ch;
657-
self.cat_before =Some(gr::grapheme_category(ch));
668+
self.cat_before =Some(self.grapheme_category(ch));
658669
}elseifself.offset ==0{
659670
self.decide(true);
660671
}else{
661672
self.resuming =true;
662-
self.cat_after =Some(gr::grapheme_category(ch));
673+
self.cat_after =Some(self.grapheme_category(ch));
663674
returnErr(GraphemeIncomplete::PrevChunk);
664675
}
665676
}

‎src/sentence.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ mod fwd {
115115

116116
for next_charin ahead.chars(){
117117
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
118-
match se::sentence_category(next_char){
118+
match se::sentence_category(next_char).2{
119119
se::SC_Lower =>returntrue,
120120
se::SC_OLetter |
121121
se::SC_Upper |
@@ -182,7 +182,7 @@ mod fwd {
182182
let position_before =self.pos;
183183
let state_before =self.state.clone();
184184

185-
let next_cat = se::sentence_category(next_char);
185+
let next_cat = se::sentence_category(next_char).2;
186186

187187
self.pos += next_char.len_utf8();
188188
self.state =self.state.next(next_cat);

‎src/tables.rs

Lines changed: 48 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -345,22 +345,29 @@ pub mod grapheme {
345345
GC_ZWJ,
346346
}
347347

348-
fnbsearch_range_value_table(c:char,r:&'static[(char,char,GraphemeCat)]) ->GraphemeCat{
348+
fnbsearch_range_value_table(c:char,r:&'static[(char,char,GraphemeCat)]) ->(u32,u32,GraphemeCat){
349+
use core;
349350
use core::cmp::Ordering::{Equal,Less,Greater};
350351
match r.binary_search_by(|&(lo, hi, _)|{
351352
if lo <= c && c <= hi{Equal}
352353
elseif hi < c{Less}
353354
else{Greater}
354355
}){
355356
Ok(idx) =>{
356-
let(_, _, cat) = r[idx];
357-
cat
357+
let(lower, upper, cat) = r[idx];
358+
(lowerasu32, upperasu32, cat)
359+
}
360+
Err(idx) =>{
361+
(
362+
if idx >0{ r[idx-1].1asu32 +1}else{0},
363+
r.get(idx).map(|c|c.0asu32 -1).unwrap_or(core::u32::MAX),
364+
GC_Any,
365+
)
358366
}
359-
Err(_) =>GC_Any
360367
}
361368
}
362369

363-
pubfngrapheme_category(c:char) ->GraphemeCat{
370+
pubfngrapheme_category(c:char) ->(u32,u32,GraphemeCat){
364371
bsearch_range_value_table(c, grapheme_cat_table)
365372
}
366373

@@ -980,22 +987,29 @@ pub mod word {
980987
WC_ZWJ,
981988
}
982989

983-
fnbsearch_range_value_table(c:char,r:&'static[(char,char,WordCat)]) ->WordCat{
990+
fnbsearch_range_value_table(c:char,r:&'static[(char,char,WordCat)]) ->(u32,u32,WordCat){
991+
use core;
984992
use core::cmp::Ordering::{Equal,Less,Greater};
985993
match r.binary_search_by(|&(lo, hi, _)|{
986994
if lo <= c && c <= hi{Equal}
987995
elseif hi < c{Less}
988996
else{Greater}
989997
}){
990998
Ok(idx) =>{
991-
let(_, _, cat) = r[idx];
992-
cat
999+
let(lower, upper, cat) = r[idx];
1000+
(lowerasu32, upperasu32, cat)
1001+
}
1002+
Err(idx) =>{
1003+
(
1004+
if idx >0{ r[idx-1].1asu32 +1}else{0},
1005+
r.get(idx).map(|c|c.0asu32 -1).unwrap_or(core::u32::MAX),
1006+
WC_Any,
1007+
)
9931008
}
994-
Err(_) =>WC_Any
9951009
}
9961010
}
9971011

998-
pubfnword_category(c:char) ->WordCat{
1012+
pubfnword_category(c:char) ->(u32,u32,WordCat){
9991013
bsearch_range_value_table(c, word_cat_table)
10001014
}
10011015

@@ -1439,22 +1453,29 @@ pub mod emoji {
14391453
EC_Extended_Pictographic,
14401454
}
14411455

1442-
fnbsearch_range_value_table(c:char,r:&'static[(char,char,EmojiCat)]) ->EmojiCat{
1456+
fnbsearch_range_value_table(c:char,r:&'static[(char,char,EmojiCat)]) ->(u32,u32,EmojiCat){
1457+
use core;
14431458
use core::cmp::Ordering::{Equal,Less,Greater};
14441459
match r.binary_search_by(|&(lo, hi, _)|{
14451460
if lo <= c && c <= hi{Equal}
14461461
elseif hi < c{Less}
14471462
else{Greater}
14481463
}){
14491464
Ok(idx) =>{
1450-
let(_, _, cat) = r[idx];
1451-
cat
1465+
let(lower, upper, cat) = r[idx];
1466+
(lowerasu32, upperasu32, cat)
1467+
}
1468+
Err(idx) =>{
1469+
(
1470+
if idx >0{ r[idx-1].1asu32 +1}else{0},
1471+
r.get(idx).map(|c|c.0asu32 -1).unwrap_or(core::u32::MAX),
1472+
EC_Any,
1473+
)
14521474
}
1453-
Err(_) =>EC_Any
14541475
}
14551476
}
14561477

1457-
pubfnemoji_category(c:char) ->EmojiCat{
1478+
pubfnemoji_category(c:char) ->(u32,u32,EmojiCat){
14581479
bsearch_range_value_table(c, emoji_cat_table)
14591480
}
14601481

@@ -1535,22 +1556,29 @@ pub mod sentence {
15351556
SC_Upper,
15361557
}
15371558

1538-
fnbsearch_range_value_table(c:char,r:&'static[(char,char,SentenceCat)]) ->SentenceCat{
1559+
fnbsearch_range_value_table(c:char,r:&'static[(char,char,SentenceCat)]) ->(u32,u32,SentenceCat){
1560+
use core;
15391561
use core::cmp::Ordering::{Equal,Less,Greater};
15401562
match r.binary_search_by(|&(lo, hi, _)|{
15411563
if lo <= c && c <= hi{Equal}
15421564
elseif hi < c{Less}
15431565
else{Greater}
15441566
}){
15451567
Ok(idx) =>{
1546-
let(_, _, cat) = r[idx];
1547-
cat
1568+
let(lower, upper, cat) = r[idx];
1569+
(lowerasu32, upperasu32, cat)
1570+
}
1571+
Err(idx) =>{
1572+
(
1573+
if idx >0{ r[idx-1].1asu32 +1}else{0},
1574+
r.get(idx).map(|c|c.0asu32 -1).unwrap_or(core::u32::MAX),
1575+
SC_Any,
1576+
)
15481577
}
1549-
Err(_) =>SC_Any
15501578
}
15511579
}
15521580

1553-
pubfnsentence_category(c:char) ->SentenceCat{
1581+
pubfnsentence_category(c:char) ->(u32,u32,SentenceCat){
15541582
bsearch_range_value_table(c, sentence_cat_table)
15551583
}
15561584

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp