2727import os
2828import re
2929import sys
30+ import urllib .request
3031from collections import defaultdict
3132from itertools import batched
3233
34+ UNICODE_VERSION = "15.1.0"
35+ """The version of the Unicode data files to download."""
36+
3337NUM_CODEPOINTS = 0x110000
3438"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
3539
@@ -61,24 +65,28 @@ class OffsetType(enum.IntEnum):
6165
6266If this is edited, you must ensure that `emit_module` reflects your changes."""
6367
64- MODULE_FILENAME = "tables.rs"
65- """Thefilename of the emitted Rust module (will be created in the working directory)"""
68+ MODULE_PATH = "../src/ tables.rs"
69+ """The path of the emitted Rust module (relative to the working directory)"""
6670
6771Codepoint = int
6872BitPos = int
6973
7074
def fetch_open(filename: str, local_prefix: str = ""):
    """Open `filename` and return its corresponding file object.

    The file is looked up on disk at `local_prefix` joined with the basename of
    `filename`. If it isn't there, it is first downloaded from
    `https://www.unicode.org/Public/<UNICODE_VERSION>/ucd/<filename>` to that
    location.

    The file is opened as UTF-8 text; closing the returned file object is the
    caller's responsibility.

    Exits with code 1 if the file can be neither downloaded nor opened.
    """
    basename = os.path.basename(filename)
    localname = os.path.join(local_prefix, basename)
    try:
        # The download lives inside the `try` so that network failures
        # (`urllib.error.URLError` and its subclasses are `OSError`s) take the
        # documented "exit with code 1" path instead of raising a traceback.
        if not os.path.exists(localname):
            urllib.request.urlretrieve(
                f"https://www.unicode.org/Public/{UNICODE_VERSION}/ucd/{filename}",
                localname,
            )
        return open(localname, encoding="utf-8")
    except OSError as err:
        sys.stderr.write(f"cannot load {localname}: {err}\n")
        sys.exit(1)
8391
8492
@@ -637,7 +645,7 @@ def emit_module(
637645module .write ("}\n " )
638646
639647
640- def main (module_filename :str ):
648+ def main (module_path :str ):
641649"""Obtain character data from the latest version of Unicode, transform it into a multi-level
642650 lookup table for character width, and write a Rust module utilizing that table to
643651 `module_path`.
@@ -677,6 +685,9 @@ def main(module_filename: str):
677685emoji_variations = load_variation_sequences ()
678686variation_table = make_variation_sequence_table (emoji_variations ,width_map )
679687
688+ # Download normalization test file for use by tests
689+ fetch_open ("NormalizationTest.txt" ,"../tests/" )
690+
680691print ("------------------------" )
681692total_size = 0
682693for i ,table in enumerate (tables ):
@@ -692,9 +703,9 @@ def main(module_filename: str):
692703print ("------------------------" )
693704print (f" Total size:{ total_size } bytes" )
694705
695- emit_module (module_filename ,version ,tables ,variation_table )
696- print (f'Wrote to "{ module_filename } "' )
706+ emit_module (module_path ,version ,tables ,variation_table )
707+ print (f'Wrote to "{ module_path } "' )
697708
698709
699710if __name__ == "__main__" :
700- main (MODULE_FILENAME )
711+ main (MODULE_PATH )