20
20
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
21
21
import collections
22
22
import urllib .request
23
+ from itertools import batched
23
24
24
25
UNICODE_VERSION = "15.1.0"
25
26
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
@@ -354,20 +355,26 @@ def is_first_and_last(first, last):
354
355
return False
355
356
return first [1 :- 8 ]== last [1 :- 7 ]
356
357
357
- def gen_mph_data (name ,d ,kv_type ,kv_callback ):
358
+ def gen_mph_data (name ,d ,kv_type ,kv_callback , kv_row_width ):
358
359
(salt ,keys )= minimal_perfect_hash (d )
359
- out .write ("pub(crate) const %s_SALT: &[u16] = &[\n " % name .upper ())
360
- for s in salt :
361
- out .write (" 0x{:x},\n " .format (s ))
360
+ out .write (f"\n pub(crate) const{ name .upper ()} _SALT: &[u16] = &[\n " )
361
+ for s_row in batched (salt ,13 ):
362
+ out .write (" " )
363
+ for s in s_row :
364
+ out .write (f" 0x{ s :03X} ," )
365
+ out .write ("\n " )
366
+ out .write ("];\n " )
367
+ out .write (f"pub(crate) const{ name .upper ()} _KV: &[{ kv_type } ] = &[\n " )
368
+ for k_row in batched (keys ,kv_row_width ):
369
+ out .write (" " )
370
+ for k in k_row :
371
+ out .write (f"{ kv_callback (k )} ," )
372
+ out .write ("\n " )
362
373
out .write ("];\n " )
363
- out .write ("pub(crate) const {}_KV: &[{}] = &[\n " .format (name .upper (),kv_type ))
364
- for k in keys :
365
- out .write (" {},\n " .format (kv_callback (k )))
366
- out .write ("];\n \n " )
367
374
368
375
def gen_combining_class (combining_classes ,out ):
369
376
gen_mph_data ('canonical_combining_class' ,combining_classes ,'u32' ,
370
- lambda k :"0x{:X}" . format ( int (combining_classes [k ])| (k << 8 )) )
377
+ lambda k :f "0x{ int (combining_classes [k ])| (k << 8 ):07X } " , 8 )
371
378
372
379
def gen_composition_table (canon_comp ,out ):
373
380
table = {}
@@ -376,7 +383,7 @@ def gen_composition_table(canon_comp, out):
376
383
table [(c1 << 16 )| c2 ]= c3
377
384
(salt ,keys )= minimal_perfect_hash (table )
378
385
gen_mph_data ('COMPOSITION_TABLE' ,table ,'(u32, char)' ,
379
- lambda k :"(0x%s , '\\ u{%s} ')" % ( hexify ( k ), hexify ( table [ k ])) )
386
+ lambda k :f "(0x{ k :08X } , '\\ u{{ { table [ k ]:06X } }} ')", 1 )
380
387
381
388
out .write ("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n " )
382
389
out .write (" match (c1, c2) {\n " )
@@ -403,7 +410,7 @@ def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_de
403
410
assert offset < 65536
404
411
out .write ("];\n " )
405
412
gen_mph_data (name + '_decomposed' ,table ,"(u32, (u16, u16))" ,
406
- lambda k :"(0x{:x }, ({}, {}))" . format ( k , offsets [k ], len (table [k ])) )
413
+ lambda k :f "(0x{ k :05X } , (0x { offsets [k ]:03X } , 0x { len (table [k ]):X } ))" , 1 )
407
414
408
415
def gen_qc_match (prop_table ,out ):
409
416
out .write (" match c {\n " )
@@ -421,7 +428,7 @@ def gen_qc_match(prop_table, out):
421
428
out .write (" }\n " )
422
429
423
430
def gen_nfc_qc (prop_tables ,out ):
424
- out .write ("#[inline]\n " )
431
+ out .write ("\n #[inline]\n " )
425
432
out .write ("#[allow(ellipsis_inclusive_range_patterns)]\n " )
426
433
out .write ("pub fn qc_nfc(c: char) -> IsNormalized {\n " )
427
434
gen_qc_match (prop_tables ['NFC_QC' ],out )
@@ -450,7 +457,7 @@ def gen_nfkd_qc(prop_tables, out):
450
457
451
458
def gen_combining_mark (general_category_mark ,out ):
452
459
gen_mph_data ('combining_mark' ,general_category_mark ,'u32' ,
453
- lambda k :'0x{:04x }' .format (k ))
460
+ lambda k :'0x{:05X }' .format (k ), 10 )
454
461
455
462
def gen_public_assigned (general_category_public_assigned ,out ):
456
463
# This could be done as a hash but the table is somewhat small.
@@ -464,17 +471,16 @@ def gen_public_assigned(general_category_public_assigned, out):
464
471
out .write (" " )
465
472
start = False
466
473
else :
467
- out .write (" | " )
474
+ out .write ("\n | " )
468
475
if first == last :
469
- out .write ("'\\ u{%s}'\n " % hexify (first ))
476
+ out .write ("'\\ u{%s}'" % hexify (first ))
470
477
else :
471
- out .write ("'\\ u{%s}'..='\\ u{%s}'\n " % (hexify (first ),hexify (last )))
472
- out .write (" => true,\n " )
478
+ out .write ("'\\ u{%s}'..='\\ u{%s}'" % (hexify (first ),hexify (last )))
479
+ out .write (" => true,\n " )
473
480
474
481
out .write (" _ => false,\n " )
475
482
out .write (" }\n " )
476
483
out .write ("}\n " )
477
- out .write ("\n " )
478
484
479
485
def gen_stream_safe (leading ,trailing ,out ):
480
486
# This could be done as a hash but the table is very small.
@@ -488,10 +494,9 @@ def gen_stream_safe(leading, trailing, out):
488
494
out .write (" _ => 0,\n " )
489
495
out .write (" }\n " )
490
496
out .write ("}\n " )
491
- out .write ("\n " )
492
497
493
498
gen_mph_data ('trailing_nonstarters' ,trailing ,'u32' ,
494
- lambda k :"0x{:X}" . format ( int (trailing [k ])| (k << 8 )) )
499
+ lambda k :f "0x{ int (trailing [k ])| (k << 8 ):07X } " , 8 )
495
500
496
501
def gen_tests (tests ,out ):
497
502
out .write ("""#[derive(Debug)]
@@ -579,43 +584,33 @@ def minimal_perfect_hash(d):
579
584
data = UnicodeData ()
580
585
with open ("tables.rs" ,"w" ,newline = "\n " )as out :
581
586
out .write (PREAMBLE )
582
- out .write ("#![cfg_attr(rustfmt, rustfmt::skip)]\n " )
583
587
out .write ("use crate::quick_check::IsNormalized;\n " )
584
588
out .write ("use crate::quick_check::IsNormalized::*;\n " )
585
589
out .write ("\n " )
586
590
587
591
version = "(%s, %s, %s)" % tuple (UNICODE_VERSION .split ("." ))
588
592
out .write ("#[allow(unused)]\n " )
589
- out .write ("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n \n " % version )
593
+ out .write ("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n " % version )
590
594
591
595
gen_combining_class (data .combining_classes ,out )
592
- out .write ("\n " )
593
596
594
597
gen_composition_table (data .canon_comp ,out )
595
- out .write ("\n " )
596
598
597
599
gen_decomposition_tables (data .canon_fully_decomp ,data .compat_fully_decomp ,data .cjk_compat_variants_fully_decomp ,out )
598
600
599
601
gen_combining_mark (data .general_category_mark ,out )
600
- out .write ("\n " )
601
602
602
603
gen_public_assigned (data .general_category_public_assigned ,out )
603
- out .write ("\n " )
604
604
605
605
gen_nfc_qc (data .norm_props ,out )
606
- out .write ("\n " )
607
606
608
607
gen_nfkc_qc (data .norm_props ,out )
609
- out .write ("\n " )
610
608
611
609
gen_nfd_qc (data .norm_props ,out )
612
- out .write ("\n " )
613
610
614
611
gen_nfkd_qc (data .norm_props ,out )
615
- out .write ("\n " )
616
612
617
613
gen_stream_safe (data .ss_leading ,data .ss_trailing ,out )
618
- out .write ("\n " )
619
614
620
615
with open ("normalization_tests.rs" ,"w" ,newline = "\n " )as out :
621
616
out .write (PREAMBLE )