@@ -53,6 +53,8 @@ def fetch(f):
5353sys .stderr .write ("cannot load %s\n " % f )
5454exit (1 )
5555
56+ return f
57+
5658# Download a UCD table file
5759def fetch_unidata (f ):
5860if not os .path .exists (os .path .basename (f )):
@@ -63,14 +65,14 @@ def fetch_unidata(f):
6365sys .stderr .write ("cannot load %s" % f )
6466exit (1 )
6567
66- # Loads code point data from IdentifierStatus.txt and
67- # IdentifierType.txt
68- # Implementation from unicode-segmentation
68+ return f
69+
70+ # Loads code point data from the already-fetched file f (callers fetch it first)
71+ # Implementation adapted from unicode-segmentation
6972def load_properties (f ,interestingprops = None ):
70- fetch (f )
7173props = {}
72- re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+) " )
73- re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+) " )
74+ re1 = re .compile (r"^ *([0-9A-F]+) *; *([^#\s]+) *# " )
75+ re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *# " )
7476
7577for line in fileinput .input (os .path .basename (f ),openhook = fileinput .hook_encoded ("utf-8" )):
7678prop = None
@@ -99,42 +101,6 @@ def load_properties(f, interestingprops = None):
99101
100102return props
101103
102- # Loads script data from Scripts.txt
103- def load_script_properties (f ,interestingprops ):
104- fetch_unidata (f )
105- props = {}
106- # Note: these regexes are different from those in unicode-segmentation,
107- # becase we need to handle spaces here
108- re1 = re .compile (r"^ *([0-9A-F]+) *; *([^#]+) *#" )
109- re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#" )
110-
111- for line in fileinput .input (os .path .basename (f )):
112- prop = None
113- d_lo = 0
114- d_hi = 0
115- m = re1 .match (line )
116- if m :
117- d_lo = m .group (1 )
118- d_hi = m .group (1 )
119- prop = m .group (2 ).strip ()
120- else :
121- m = re2 .match (line )
122- if m :
123- d_lo = m .group (1 )
124- d_hi = m .group (2 )
125- prop = m .group (3 ).strip ()
126- else :
127- continue
128- if interestingprops and prop not in interestingprops :
129- continue
130- d_lo = int (d_lo ,16 )
131- d_hi = int (d_hi ,16 )
132- if prop not in props :
133- props [prop ]= []
134- props [prop ].append ((d_lo ,d_hi ))
135-
136- return props
137-
138104# Loads confusables data from confusables.txt
139105def load_confusables (f ):
140106fetch (f )
@@ -189,7 +155,7 @@ def load_scripts(f):
189155# changes are introduced, update accordingly.
190156
191157 (longforms ,shortforms )= aliases ()
192- scripts = load_script_properties ( f , [])
158+ scripts = load_properties ( fetch_unidata ( f ) , [])
193159
194160script_table = []
195161script_list = []
@@ -546,10 +512,10 @@ def emit_identifier_module(f):
546512""" )
547513
548514f .write (" // Identifier status table:\n " )
549- identifier_status_table = load_properties ("IdentifierStatus.txt" )
515+ identifier_status_table = load_properties (fetch ( "IdentifierStatus.txt" ) )
550516emit_table (f ,"IDENTIFIER_STATUS" ,identifier_status_table ['Allowed' ],"&'static [(char, char)]" ,is_pub = False ,
551517pfun = lambda x :"(%s,%s)" % (escape_char (x [0 ]),escape_char (x [1 ])))
552- identifier_type = load_properties ("IdentifierType.txt" )
518+ identifier_type = load_properties (fetch ( "IdentifierType.txt" ) )
553519type_table = []
554520for ty in identifier_type :
555521type_table .extend ([(x ,y ,ty )for (x ,y )in identifier_type [ty ]])
@@ -601,7 +567,7 @@ def emit_potiential_mixed_script_confusable(f):
601567 }
602568 }
603569""" )
604- identifier_status_table = load_properties ("IdentifierStatus.txt" )
570+ identifier_status_table = load_properties (fetch ( "IdentifierStatus.txt" ) )
605571_ ,scripts = load_scripts ("Scripts.txt" )
606572identifier_allowed = identifier_status_table ['Allowed' ]
607573 (mixedscript_confusable ,mixedscript_confusable_unresolved )= load_potential_mixedscript_confusables ("confusables.txt" ,identifier_allowed ,scripts )