2020# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2121import collections
2222import urllib .request
23+ from itertools import batched
2324
2425UNICODE_VERSION = "15.1.0"
2526UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
@@ -100,9 +101,9 @@ def _load_unicode_data(self):
100101self .general_category_mark = []
101102self .general_category_public_assigned = []
102103
103- assigned_start = 0 ;
104- prev_char_int = - 1 ;
105- prev_name = "" ;
104+ assigned_start = 0
105+ prev_char_int = - 1
106+ prev_name = ""
106107
107108for line in self ._fetch ("UnicodeData.txt" ).splitlines ():
108109# See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
@@ -131,7 +132,7 @@ def _load_unicode_data(self):
131132self .general_category_public_assigned .append ((assigned_start ,prev_char_int ))
132133assigned_start = char_int
133134prev_char_int = char_int
134- prev_name = name ;
135+ prev_name = name
135136
136137self .general_category_public_assigned .append ((assigned_start ,prev_char_int ))
137138
@@ -158,16 +159,16 @@ def _load_cjk_compat_ideograph_variants(self):
158159
159160char_int = self .name_to_char_int [description ]
160161
161- assert not char_int in self .combining_classes ,"Unexpected: CJK compat variant with a combining class"
162- assert not char_int in self .compat_decomp ,"Unexpected: CJK compat variant and compatibility decomposition"
162+ assert char_int not in self .combining_classes ,"Unexpected: CJK compat variant with a combining class"
163+ assert char_int not in self .compat_decomp ,"Unexpected: CJK compat variant and compatibility decomposition"
163164assert len (self .canon_decomp [char_int ])== 1 ,"Unexpected: CJK compat variant and non-singleton canonical decomposition"
164165# If we ever need to handle Hangul here, we'll need to handle it separately.
165166assert not (S_BASE <= char_int < S_BASE + S_COUNT )
166167
167168cjk_compat_variant_parts = [int (c ,16 )for c in variation_sequence .split ()]
168169for c in cjk_compat_variant_parts :
169- assert not c in self .canon_decomp ,"Unexpected: CJK compat variant is unnormalized (canon)"
170- assert not c in self .compat_decomp ,"Unexpected: CJK compat variant is unnormalized (compat)"
170+ assert c not in self .canon_decomp ,"Unexpected: CJK compat variant is unnormalized (canon)"
171+ assert c not in self .compat_decomp ,"Unexpected: CJK compat variant is unnormalized (compat)"
171172self .cjk_compat_variants_fully_decomp [char_int ]= cjk_compat_variant_parts
172173
173174def _load_norm_props (self ):
@@ -354,20 +355,26 @@ def is_first_and_last(first, last):
354355return False
355356return first [1 :- 8 ]== last [1 :- 7 ]
356357
# gen_mph_data: emit two Rust `pub(crate) const` tables for a minimal perfect
# hash built over `d`.
#
# Parameters (as of the updated signature on the `+` line):
#   name         -- table name; upper-cased to form `<NAME>_SALT` / `<NAME>_KV`.
#   d            -- collection handed to minimal_perfect_hash(), which returns
#                   the (salt, keys) pair written out below.
#   kv_type      -- Rust element type written into the `_KV` slice declaration.
#   kv_callback  -- callable turning one key into the text of one `_KV` entry.
#   kv_row_width -- number of `_KV` entries written per output line.
#
# The updated body writes the salt values 13 per line as zero-padded,
# upper-case hex of at least three digits, then the keys `kv_row_width` per
# line; the removed body wrote one entry per line instead.
#
# NOTE: `out` is not a parameter and is not assigned in this function, so it
# has to come from a scope outside this block (not visible here).
# NOTE: `batched` comes from `itertools`, which provides it from Python 3.12.
357- def gen_mph_data (name ,d ,kv_type ,kv_callback ):
358+ def gen_mph_data (name ,d ,kv_type ,kv_callback , kv_row_width ):
358359 (salt ,keys )= minimal_perfect_hash (d )
359- out .write ("pub(crate) const %s_SALT: &[u16] = &[\n " % name .upper ())
360- for s in salt :
361- out .write (" 0x{:x},\n " .format (s ))
360+ out .write (f"\n pub(crate) const{ name .upper ()} _SALT: &[u16] = &[\n " )
361+ for s_row in batched (salt ,13 ):
362+ out .write (" " )
363+ for s in s_row :
364+ out .write (f" 0x{ s :03X} ," )
365+ out .write ("\n " )
366+ out .write ("];\n " )
367+ out .write (f"pub(crate) const{ name .upper ()} _KV: &[{ kv_type } ] = &[\n " )
368+ for k_row in batched (keys ,kv_row_width ):
369+ out .write (" " )
370+ for k in k_row :
371+ out .write (f"{ kv_callback (k )} ," )
372+ out .write ("\n " )
362373out .write ("];\n " )
363- out .write ("pub(crate) const {}_KV: &[{}] = &[\n " .format (name .upper (),kv_type ))
364- for k in keys :
365- out .write (" {},\n " .format (kv_callback (k )))
366- out .write ("];\n \n " )
367374
# gen_combining_class: emit the `canonical_combining_class` tables through
# gen_mph_data, with `u32` entries.
#
# Each entry is the key `k` shifted left by 8 bits, OR'd with
# int(combining_classes[k]); the updated line formats it as zero-padded
# upper-case hex of at least seven digits and requests 8 entries per row.
# Presumably `k` is a code point and the value fits in the low 8 bits --
# TODO confirm against the loader that fills `combining_classes`.
#
# NOTE: the `out` argument is accepted here but is not forwarded in the
# gen_mph_data call.
368375def gen_combining_class (combining_classes ,out ):
369376gen_mph_data ('canonical_combining_class' ,combining_classes ,'u32' ,
370- lambda k :"0x{:X}" . format ( int (combining_classes [k ])| (k << 8 )) )
377+ lambda k :f "0x{ int (combining_classes [k ])| (k << 8 ):07X } " , 8 )
371378
372379def gen_composition_table (canon_comp ,out ):
373380table = {}
@@ -376,7 +383,7 @@ def gen_composition_table(canon_comp, out):
376383table [(c1 << 16 )| c2 ]= c3
377384 (salt ,keys )= minimal_perfect_hash (table )
378385gen_mph_data ('COMPOSITION_TABLE' ,table ,'(u32, char)' ,
379- lambda k :"(0x%s , '\\ u{%s} ')" % ( hexify ( k ), hexify ( table [ k ])) )
386+ lambda k :f "(0x{ k :08X } , '\\ u{{ { table [ k ]:06X } }} ')", 1 )
380387
381388out .write ("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n " )
382389out .write (" match (c1, c2) {\n " )
@@ -403,7 +410,7 @@ def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_de
403410assert offset < 65536
404411out .write ("];\n " )
405412gen_mph_data (name + '_decomposed' ,table ,"(u32, (u16, u16))" ,
406- lambda k :"(0x{:x }, ({}, {}))" . format ( k , offsets [k ], len (table [k ])) )
413+ lambda k :f "(0x{ k :05X } , (0x { offsets [k ]:03X } , 0x { len (table [k ]):X } ))" , 1 )
407414
408415def gen_qc_match (prop_table ,out ):
409416out .write (" match c {\n " )
@@ -421,7 +428,7 @@ def gen_qc_match(prop_table, out):
421428out .write (" }\n " )
422429
423430def gen_nfc_qc (prop_tables ,out ):
424- out .write ("#[inline]\n " )
431+ out .write ("\n #[inline]\n " )
425432out .write ("#[allow(ellipsis_inclusive_range_patterns)]\n " )
426433out .write ("pub fn qc_nfc(c: char) -> IsNormalized {\n " )
427434gen_qc_match (prop_tables ['NFC_QC' ],out )
@@ -450,13 +457,13 @@ def gen_nfkd_qc(prop_tables, out):
450457
# gen_combining_mark: emit the `combining_mark` tables through gen_mph_data,
# with `u32` entries.
#
# Each entry is the key itself rendered as hex. The updated line switches from
# four-digit lower-case to five-digit upper-case zero-padded hex and requests
# 10 entries per row.
#
# NOTE: the `out` argument is accepted here but is not forwarded in the
# gen_mph_data call.
451458def gen_combining_mark (general_category_mark ,out ):
452459gen_mph_data ('combining_mark' ,general_category_mark ,'u32' ,
453- lambda k :'0x{:04x }' .format (k ))
460+ lambda k :'0x{:05X }' .format (k ), 10 )
454461
455462def gen_public_assigned (general_category_public_assigned ,out ):
456463# This could be done as a hash but the table is somewhat small.
457464out .write ("#[inline]\n " )
458465out .write ("pub fn is_public_assigned(c: char) -> bool {\n " )
459- out .write ("match c { \n " )
466+ out .write ("matches!(c, \n " )
460467
461468start = True
462469for first ,last in general_category_public_assigned :
@@ -469,12 +476,9 @@ def gen_public_assigned(general_category_public_assigned, out):
469476out .write ("'\\ u{%s}'\n " % hexify (first ))
470477else :
471478out .write ("'\\ u{%s}'..='\\ u{%s}'\n " % (hexify (first ),hexify (last )))
472- out .write (" => true,\n " )
473479
474- out .write (" _ => false,\n " )
475- out .write (" }\n " )
480+ out .write (" )\n " )
476481out .write ("}\n " )
477- out .write ("\n " )
478482
479483def gen_stream_safe (leading ,trailing ,out ):
480484# This could be done as a hash but the table is very small.
@@ -488,10 +492,9 @@ def gen_stream_safe(leading, trailing, out):
488492out .write (" _ => 0,\n " )
489493out .write (" }\n " )
490494out .write ("}\n " )
491- out .write ("\n " )
492495
493496gen_mph_data ('trailing_nonstarters' ,trailing ,'u32' ,
494- lambda k :"0x{:X}" . format ( int (trailing [k ])| (k << 8 )) )
497+ lambda k :f "0x{ int (trailing [k ])| (k << 8 ):07X } " , 8 )
495498
496499def gen_tests (tests ,out ):
497500out .write ("""#[derive(Debug)]
@@ -585,36 +588,27 @@ def minimal_perfect_hash(d):
585588
586589version = "(%s, %s, %s)" % tuple (UNICODE_VERSION .split ("." ))
587590out .write ("#[allow(unused)]\n " )
588- out .write ("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n \n " % version )
591+ out .write ("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n " % version )
589592
590593gen_combining_class (data .combining_classes ,out )
591- out .write ("\n " )
592594
593595gen_composition_table (data .canon_comp ,out )
594- out .write ("\n " )
595596
596597gen_decomposition_tables (data .canon_fully_decomp ,data .compat_fully_decomp ,data .cjk_compat_variants_fully_decomp ,out )
597598
598599gen_combining_mark (data .general_category_mark ,out )
599- out .write ("\n " )
600600
601601gen_public_assigned (data .general_category_public_assigned ,out )
602- out .write ("\n " )
603602
604603gen_nfc_qc (data .norm_props ,out )
605- out .write ("\n " )
606604
607605gen_nfkc_qc (data .norm_props ,out )
608- out .write ("\n " )
609606
610607gen_nfd_qc (data .norm_props ,out )
611- out .write ("\n " )
612608
613609gen_nfkd_qc (data .norm_props ,out )
614- out .write ("\n " )
615610
616611gen_stream_safe (data .ss_leading ,data .ss_trailing ,out )
617- out .write ("\n " )
618612
619613with open ("normalization_tests.rs" ,"w" ,newline = "\n " )as out :
620614out .write (PREAMBLE )