2020# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2121import collections
2222import urllib .request
23+ from itertools import batched
2324
2425UNICODE_VERSION = "15.1.0"
2526UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
@@ -354,20 +355,26 @@ def is_first_and_last(first, last):
354355return False
355356return first [1 :- 8 ]== last [1 :- 7 ]
356357
357- def gen_mph_data (name ,d ,kv_type ,kv_callback ):
358+ def gen_mph_data (name ,d ,kv_type ,kv_callback , kv_row_width ):
358359 (salt ,keys )= minimal_perfect_hash (d )
359- out .write ("pub(crate) const %s_SALT: &[u16] = &[\n " % name .upper ())
360- for s in salt :
361- out .write (" 0x{:x},\n " .format (s ))
360+ out .write (f"\n pub(crate) const{ name .upper ()} _SALT: &[u16] = &[\n " )
361+ for s_row in batched (salt ,13 ):
362+ out .write (" " )
363+ for s in s_row :
364+ out .write (f" 0x{ s :03X} ," )
365+ out .write ("\n " )
366+ out .write ("];\n " )
367+ out .write (f"pub(crate) const{ name .upper ()} _KV: &[{ kv_type } ] = &[\n " )
368+ for k_row in batched (keys ,kv_row_width ):
369+ out .write (" " )
370+ for k in k_row :
371+ out .write (f"{ kv_callback (k )} ," )
372+ out .write ("\n " )
362373out .write ("];\n " )
363- out .write ("pub(crate) const {}_KV: &[{}] = &[\n " .format (name .upper (),kv_type ))
364- for k in keys :
365- out .write (" {},\n " .format (kv_callback (k )))
366- out .write ("];\n \n " )
367374
368375def gen_combining_class (combining_classes ,out ):
369376gen_mph_data ('canonical_combining_class' ,combining_classes ,'u32' ,
370- lambda k :"0x{:X}" . format ( int (combining_classes [k ])| (k << 8 )) )
377+ lambda k :f "0x{ int (combining_classes [k ])| (k << 8 ):07X } " , 8 )
371378
372379def gen_composition_table (canon_comp ,out ):
373380table = {}
@@ -376,7 +383,7 @@ def gen_composition_table(canon_comp, out):
376383table [(c1 << 16 )| c2 ]= c3
377384 (salt ,keys )= minimal_perfect_hash (table )
378385gen_mph_data ('COMPOSITION_TABLE' ,table ,'(u32, char)' ,
379- lambda k :"(0x%s , '\\ u{%s} ')" % ( hexify ( k ), hexify ( table [ k ])) )
386+ lambda k :f "(0x{ k :08X } , '\\ u{{ { table [ k ]:06X } }} ')", 1 )
380387
381388out .write ("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n " )
382389out .write (" match (c1, c2) {\n " )
@@ -403,7 +410,7 @@ def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_de
403410assert offset < 65536
404411out .write ("];\n " )
405412gen_mph_data (name + '_decomposed' ,table ,"(u32, (u16, u16))" ,
406- lambda k :"(0x{:x }, ({}, {}))" . format ( k , offsets [k ], len (table [k ])) )
413+ lambda k :f "(0x{ k :05X } , (0x { offsets [k ]:03X } , 0x { len (table [k ]):X } ))" , 1 )
407414
408415def gen_qc_match (prop_table ,out ):
409416out .write (" match c {\n " )
@@ -421,7 +428,7 @@ def gen_qc_match(prop_table, out):
421428out .write (" }\n " )
422429
423430def gen_nfc_qc (prop_tables ,out ):
424- out .write ("#[inline]\n " )
431+ out .write ("\n #[inline]\n " )
425432out .write ("#[allow(ellipsis_inclusive_range_patterns)]\n " )
426433out .write ("pub fn qc_nfc(c: char) -> IsNormalized {\n " )
427434gen_qc_match (prop_tables ['NFC_QC' ],out )
@@ -450,7 +457,7 @@ def gen_nfkd_qc(prop_tables, out):
450457
451458def gen_combining_mark (general_category_mark ,out ):
452459gen_mph_data ('combining_mark' ,general_category_mark ,'u32' ,
453- lambda k :'0x{:04x }' .format (k ))
460+ lambda k :'0x{:05X }' .format (k ), 10 )
454461
455462def gen_public_assigned (general_category_public_assigned ,out ):
456463# This could be done as a hash but the table is somewhat small.
@@ -464,17 +471,16 @@ def gen_public_assigned(general_category_public_assigned, out):
464471out .write (" " )
465472start = False
466473else :
467- out .write (" | " )
474+ out .write ("\n | " )
468475if first == last :
469- out .write ("'\\ u{%s}'\n " % hexify (first ))
476+ out .write ("'\\ u{%s}'" % hexify (first ))
470477else :
471- out .write ("'\\ u{%s}'..='\\ u{%s}'\n " % (hexify (first ),hexify (last )))
472- out .write (" => true,\n " )
478+ out .write ("'\\ u{%s}'..='\\ u{%s}'" % (hexify (first ),hexify (last )))
479+ out .write (" => true,\n " )
473480
474481out .write (" _ => false,\n " )
475482out .write (" }\n " )
476483out .write ("}\n " )
477- out .write ("\n " )
478484
479485def gen_stream_safe (leading ,trailing ,out ):
480486# This could be done as a hash but the table is very small.
@@ -488,10 +494,9 @@ def gen_stream_safe(leading, trailing, out):
488494out .write (" _ => 0,\n " )
489495out .write (" }\n " )
490496out .write ("}\n " )
491- out .write ("\n " )
492497
493498gen_mph_data ('trailing_nonstarters' ,trailing ,'u32' ,
494- lambda k :"0x{:X}" . format ( int (trailing [k ])| (k << 8 )) )
499+ lambda k :f "0x{ int (trailing [k ])| (k << 8 ):07X } " , 8 )
495500
496501def gen_tests (tests ,out ):
497502out .write ("""#[derive(Debug)]
@@ -579,43 +584,33 @@ def minimal_perfect_hash(d):
579584data = UnicodeData ()
580585with open ("tables.rs" ,"w" ,newline = "\n " )as out :
581586out .write (PREAMBLE )
582- out .write ("#![cfg_attr(rustfmt, rustfmt::skip)]\n " )
583587out .write ("use crate::quick_check::IsNormalized;\n " )
584588out .write ("use crate::quick_check::IsNormalized::*;\n " )
585589out .write ("\n " )
586590
587591version = "(%s, %s, %s)" % tuple (UNICODE_VERSION .split ("." ))
588592out .write ("#[allow(unused)]\n " )
589- out .write ("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n \n " % version )
593+ out .write ("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n " % version )
590594
591595gen_combining_class (data .combining_classes ,out )
592- out .write ("\n " )
593596
594597gen_composition_table (data .canon_comp ,out )
595- out .write ("\n " )
596598
597599gen_decomposition_tables (data .canon_fully_decomp ,data .compat_fully_decomp ,data .cjk_compat_variants_fully_decomp ,out )
598600
599601gen_combining_mark (data .general_category_mark ,out )
600- out .write ("\n " )
601602
602603gen_public_assigned (data .general_category_public_assigned ,out )
603- out .write ("\n " )
604604
605605gen_nfc_qc (data .norm_props ,out )
606- out .write ("\n " )
607606
608607gen_nfkc_qc (data .norm_props ,out )
609- out .write ("\n " )
610608
611609gen_nfd_qc (data .norm_props ,out )
612- out .write ("\n " )
613610
614611gen_nfkd_qc (data .norm_props ,out )
615- out .write ("\n " )
616612
617613gen_stream_safe (data .ss_leading ,data .ss_trailing ,out )
618- out .write ("\n " )
619614
620615with open ("normalization_tests.rs" ,"w" ,newline = "\n " )as out :
621616out .write (PREAMBLE )