@@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
274274 pub enum %sCat {
275275""" % (name ,Name ,Name ))
276276
277+ # We don't want the lookup table to be too large so choose a reasonable
278+ # cutoff. 0x20000 is selected because most of the range table entries are
279+ # within the interval of [0x0, 0x20000]
280+ lookup_value_cutoff = 0x20000
281+
282+ # Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
283+ lookup_table_len = 0x400
284+
285+ lookup_interval = round (lookup_value_cutoff / lookup_table_len )
286+
287+ # Lookup table is a mapping from `character code / lookup_interval` to
288+ # the index in the range table that covers the `character code`.
289+ lookup_table = [0 ]* lookup_table_len
290+ j = 0
291+ for i in range (0 ,lookup_table_len ):
292+ lookup_from = i * lookup_interval
293+ while j < len (break_table ):
294+ (_ ,entry_to ,_ )= break_table [j ]
295+ if entry_to >= lookup_from :
296+ break
297+ j += 1
298+ lookup_table [i ]= j
299+
277300break_cats .append ("Any" )
278301break_cats .sort ()
279302for cat in break_cats :
280303f .write ((" %sC_" % Name [0 ])+ cat + ",\n " )
281304f .write (""" }
282305
283- fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
306+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (Option< u32>, Option< u32> , %sCat) {
284307 use core::cmp::Ordering::{Equal, Less, Greater};
285308 match r.binary_search_by(|&(lo, hi, _)| {
286309 if lo <= c && c <= hi { Equal }
@@ -289,23 +312,70 @@ def emit_break_module(f, break_table, break_cats, name):
289312 }) {
290313 Ok(idx) => {
291314 let (lower, upper, cat) = r[idx];
292- (lower as u32, upper as u32, cat)
315+ (Some( lower as u32), Some( upper as u32) , cat)
293316 }
294317 Err(idx) => {
295318 (
296- if idx > 0 { r[idx-1].1 as u32 + 1 } else {0 },
297- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX) ,
319+ if idx > 0 {Some( r[idx-1].1 as u32 + 1) } else {None },
320+ r.get(idx).map(|c|c.0 as u32 - 1),
298321 %sC_Any,
299322 )
300323 }
301324 }
302325 }
303326
304327 pub fn %s_category(c: char) -> (u32, u32, %sCat) {
305- bsearch_range_value_table(c, %s_cat_table)
328+ // Perform a quick O(1) lookup in a precomputed table to determine
329+ // the slice of the range table to search in.
330+ let lookup_interval = 0x%x;
331+ let idx = (c as u32 / lookup_interval) as usize;
332+ let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
333+ // If the `idx` is outside of the precomputed table - use the slice
334+ // starting from the last covered index in the precomputed table and
335+ // ending with the length of the range table.
336+ %d..%d,
337+ |r| (r[0] as usize)..((r[1] + 1) as usize)
338+ );
339+ let (lower, upper, cat) = bsearch_range_value_table(c, &%s_cat_table[range]);
340+
341+ (
342+ lower.unwrap_or_else(|| {
343+ if idx == 0 {
344+ 0
345+ } else {
346+ // Use an entry just before the lookup index to find the lower
347+ // bound for Any category.
348+ let i = *%s_cat_lookup.get(idx - 1).unwrap_or(&%d) as usize;
349+ %s_cat_table[i].1 as u32 + 1
350+ }
351+ }),
352+ upper.unwrap_or_else(|| {
353+ %s_cat_lookup.get(idx + 1).map_or_else(|| {
354+ // If idx was outside of the lookup table - upper bound is either
355+ // already found in the ranges table, or has to be a u32::MAX.
356+ core::u32::MAX
357+ }, |&i| {
358+ // Otherwise use an entry just after the lookup index to find the
359+ // lower bound for Any category.
360+ %s_cat_table[i as usize].0 as u32 - 1
361+ })
362+ }),
363+ cat
364+ )
306365 }
307366
308- """ % (Name ,Name ,Name [0 ],name ,Name ,name ))
367+ """ % (Name ,Name ,Name [0 ],name ,Name ,lookup_interval ,name ,j ,len (break_table ),name ,name ,j ,name ,name ,name ))
368+
369+ if len (break_table )<= 0xff :
370+ lookup_type = "u8"
371+ elif len (break_table )<= 0xffff :
372+ lookup_type = "u16"
373+ else :
374+ lookup_type = "u32"
375+
376+ emit_table (f ,"%s_cat_lookup" % name ,lookup_table ,"&'static [%s]" % lookup_type ,
377+ pfun = lambda x :"%d" % x ,
378+ is_pub = False ,is_const = True )
309379
310380emit_table (f ,"%s_cat_table" % name ,break_table ,"&'static [(char, char, %sCat)]" % Name ,
311381pfun = lambda x :"(%s,%s,%sC_%s)" % (escape_char (x [0 ]),escape_char (x [1 ]),Name [0 ],x [2 ]),