@@ -47,6 +47,15 @@ def fetch(f):
4747sys .stderr .write ("cannot load %s\n " % f )
4848exit (1 )
4949
def fetch_unidata(f):
    """Make sure the UCD data file ``f`` exists in the current directory.

    When ``os.path.basename(f)`` is not present locally, ``curl -O`` is used
    to download ``f`` from the ``ucd`` directory of ``UNICODE_VERSION_NUMBER``
    on unicode.org.  If the file is still missing afterwards, an error line
    is written to stderr and the process exits with status 1.

    Args:
        f: path of the data file relative to the ``ucd`` directory.
    """
    import subprocess  # local import: only this function needs it

    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        url = ("http://www.unicode.org/Public/%s/ucd/%s"
               % (UNICODE_VERSION_NUMBER, f))
        try:
            # Pass an argument list so the URL is never parsed by a shell.
            subprocess.call(["curl", "-O", url])
        except OSError:
            # curl could not be started; the existence check below reports
            # the failure the same way as a failed download.
            pass

    if not os.path.exists(local_name):
        # Newline-terminated so the shell prompt does not end up glued to
        # the message.
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)
58+
5059# Implementation from unicode-segmentation
5160def load_properties (f ,interestingprops = None ):
5261fetch (f )
@@ -81,6 +90,41 @@ def load_properties(f, interestingprops = None):
8190
8291return props
8392
def load_script_properties(f, interestingprops):
    """Parse a UCD property file into ``{property: [(lo, hi), ...]}``.

    The file is fetched with ``fetch_unidata`` if needed and then read from
    the current directory.  Lines of the form ``XXXX ; Name #`` produce a
    one-code-point range and lines of the form ``XXXX..YYYY ; Name #``
    produce an inclusive range; every other line is ignored.

    Args:
        f: name of the data file.
        interestingprops: collection of property names to keep.  When it is
            empty (or otherwise falsy) every property is kept.

    Returns:
        A dict mapping each property name to a list of ``(lo, hi)`` integer
        tuples, in file order.
    """
    fetch_unidata(f)
    # Note: these regexes are different from those in unicode-segmentation,
    # because we need to handle spaces here
    single_re = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")

    props = {}
    # A context-managed handle is closed even if parsing raises, and does
    # not depend on fileinput's module-wide "active input" state.
    with open(os.path.basename(f)) as data:
        for line in data:
            m = single_re.match(line)
            if m:
                lo_hex = hi_hex = m.group(1)
                prop = m.group(2).strip()
            else:
                m = range_re.match(line)
                if not m:
                    continue
                lo_hex, hi_hex = m.group(1), m.group(2)
                prop = m.group(3).strip()
            if interestingprops and prop not in interestingprops:
                continue
            props.setdefault(prop, []).append(
                (int(lo_hex, 16), int(hi_hex, 16)))

    return props
127+
84128def load_confusables (f ):
85129fetch (f )
86130confusables = []
@@ -97,12 +141,244 @@ def load_confusables(f):
97141raise Exception ('More than one code point in first column' )
98142d_input = int (d_inputs [0 ].strip (),16 )
99143for d_output in m .group (2 ).split ():
100- d_outputitem = int (d_output ,16 );
101- d_outputs .append (d_outputitem );
144+ d_outputitem = int (d_output ,16 )
145+ d_outputs .append (d_outputitem )
102146confusables .append ((d_input ,d_outputs ))
103147
104148return confusables
105149
def aliases():
    """
    Read the Script (``sc``) rows of PropertyValueAliases.txt.

    Returns a ``(longforms, shortforms)`` pair of dicts: ``longforms`` maps
    the first name captured on each ``sc`` row to the second one, and
    ``shortforms`` is the reverse mapping.  Duplicate names on either side
    trip an assertion.
    """
    data_file = "PropertyValueAliases.txt"
    fetch_unidata(data_file)
    sc_row = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")
    longforms = {}
    shortforms = {}
    for line in fileinput.input(os.path.basename(data_file)):
        found = sc_row.match(line)
        if not found:
            # not a Script alias row
            continue
        short_name = found.group(1).strip()
        long_name = found.group(2).strip()
        assert (short_name not in longforms)
        assert (long_name not in shortforms)
        longforms[short_name] = long_name
        shortforms[long_name] = short_name

    return (longforms, shortforms)
171+
def load_scripts(f):
    """Load the script ranges from the data file ``f``.

    Every range read by ``load_script_properties`` is tagged with the short
    alias of its script (looked up in the ``shortforms`` dict returned by
    ``aliases``).

    Args:
        f: name of the scripts data file.

    Returns:
        ``(longforms, script_table)`` where ``longforms`` is the first dict
        returned by ``aliases`` and ``script_table`` is a list of
        ``(lo, hi, short_script_name)`` tuples sorted by ``lo``.

    Raises:
        KeyError: if a script name in ``f`` has no entry in ``shortforms``.
    """
    (longforms, shortforms) = aliases()
    scripts = load_script_properties(f, [])

    script_table = []
    for script, ranges in scripts.items():
        short_name = shortforms[script]
        script_table.extend((lo, hi, short_name) for (lo, hi) in ranges)

    # Only the range start is used as the key, as before; the sort is stable.
    script_table.sort(key=lambda w: w[0])
    return (longforms, script_table)
186+
def is_script_ignored_in_mixedscript(source):
    """Return True when ``source`` is one of 'Zinh', 'Zyyy' or 'Zzzz'."""
    return source in ('Zinh', 'Zyyy', 'Zzzz')
189+
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
    """Classify a source character against its multi-code-point prototype.

    ``script_i`` is the script of the source character and ``proto_lst`` the
    prototype code points; their distinct scripts are obtained through
    ``script_list``.  The rules below were written for the Unicode data
    current when the script was authored (13.0).

    Returns:
        ``(processed, should_add)``.  ``processed`` is False when no rule
        matches; ``should_add`` is True when the pair counts as a
        mixed-script confusable.
    """
    proto_scripts = sorted(script_list(proto_lst, scripts))
    count = len(proto_scripts)
    assert (count > 0)
    ignored = is_script_ignored_in_mixedscript

    # Rules "A - A" and "A ... - A": the source script is one of the
    # prototype scripts -> Processed, DontAdd
    if script_i in proto_scripts:
        return True, False

    # Every remaining rule needs a source script outside (Zinh, Zyyy, Zzzz)
    # -> NotProcessed, DontAdd
    if ignored(script_i):
        return False, False

    # Rules "A - B" and "(Zinh | Zyyy | Zzzz) - A": a single prototype
    # script that differs from the source script -> Processed, Add
    if count == 1:
        return True, True

    # Rules "(Z*) A - B" (either order) and "(Z*) (Z*) - A": two prototype
    # scripts of which at least one is ignored -> Processed, Add
    if count == 2 and (ignored(proto_scripts[0]) or ignored(proto_scripts[1])):
        return True, True

    # NotProcessed, DontAdd
    return False, False
230+
def is_codepoint_identifier_allowed(c, identifier_allowed):
    """Return True when ``c`` lies inside any inclusive range of the table.

    Each entry of ``identifier_allowed`` is indexed as ``(lo, hi, ...)``.
    """
    return any(bounds[0] <= c <= bounds[1] for bounds in identifier_allowed)
236+
def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
    """Build the mixed-script confusable tables from the confusables file.

    Args:
        f: confusables data file, read through ``load_confusables``.
        identifier_allowed: inclusive code point ranges accepted by
            ``is_codepoint_identifier_allowed``.
        scripts: ``(lo, hi, script)`` table used by ``codepoint_script``.

    Returns:
        ``(mixedscript_confusable, mixedscript_confusable_unresolved)``.
        The first maps script -> escaped source text -> ``(source, targets)``
        where ``targets`` holds confusable code points and/or the marker
        ``'multi'``.  The second maps an escaped prototype list to
        ``(prototype_list, sources)`` for the sources that no rule of
        ``process_mixedscript_single_to_multi`` handled.
    """
    confusables = load_confusables(f)

    # Escaped sources whose prototype is a single code point.
    seekup_map = {}
    for d_source, d_proto_list in confusables:
        assert (len(d_proto_list) > 0)
        if len(d_proto_list) == 1:
            seekup_map[escape_char(d_source)] = d_proto_list

    # collect prototypes: group identifier-allowed sources by prototype
    codepoint_map = {}
    multicodepoint_map = {}
    for d_source, d_proto_list in confusables:
        if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
            continue
        if len(d_proto_list) != 1:
            proto_key = escape_char_list(d_proto_list)
            if proto_key not in multicodepoint_map:
                multicodepoint_map[proto_key] = (d_proto_list, [])
            multicodepoint_map[proto_key][1].append(d_source)
            continue
        proto = d_proto_list[0]
        proto_key = escape_char(proto)
        if proto_key not in codepoint_map:
            codepoint_map[proto_key] = []
            # The prototype itself joins its group only when it is not also
            # a single-prototype source and is allowed in identifiers.
            if (proto_key not in seekup_map
                    and is_codepoint_identifier_allowed(proto, identifier_allowed)):
                codepoint_map[proto_key].append(proto)
        codepoint_map[proto_key].append(d_source)

    mixedscript_confusable = {}

    def targets_of(script, item):
        # Target list of `item` under `script`, created on first use.
        per_script = mixedscript_confusable.setdefault(script, {})
        return per_script.setdefault(escape_char(item), (item, []))[1]

    def link_cross_script_pairs(members):
        # Record every pair of members with different scripts, in both
        # directions, skipping the side whose script is ignored.
        for i, item_i in enumerate(members):
            for item_j in members[i + 1:]:
                script_i = codepoint_script(item_i, scripts)
                script_j = codepoint_script(item_j, scripts)
                if script_i == script_j:
                    continue
                if not is_script_ignored_in_mixedscript(script_i):
                    targets_of(script_i, item_i).append(item_j)
                if not is_script_ignored_in_mixedscript(script_j):
                    targets_of(script_j, item_j).append(item_i)

    # between single charpoint that has single charpoint prototype
    for members in codepoint_map.values():
        link_cross_script_pairs(members)

    # between single charpoint that has multi charpoint prototype
    for _, members in multicodepoint_map.values():
        link_cross_script_pairs(members)

    mixedscript_confusable_unresolved = {}
    # single charpoint that has multi charpoint prototype and its prototype
    for proto_lst, members in multicodepoint_map.values():
        if not all(is_codepoint_identifier_allowed(c, identifier_allowed)
                   for c in proto_lst):
            # the prototype sequence can not be part of an identifier
            continue
        for item in members:
            script = codepoint_script(item, scripts)
            if is_script_ignored_in_mixedscript(script):
                continue
            processed, should_add = process_mixedscript_single_to_multi(
                item, script, proto_lst, scripts)
            if should_add:
                assert (processed)
                targets_of(script, item).append('multi')
            if processed:
                continue
            proto_key = escape_char_list(proto_lst)
            if proto_key not in mixedscript_confusable_unresolved:
                mixedscript_confusable_unresolved[proto_key] = (proto_lst, [])
            mixedscript_confusable_unresolved[proto_key][1].append(item)
    return (mixedscript_confusable, mixedscript_confusable_unresolved)
335+
def codepoint_script(c, scripts):
    """Return the script of the first ``(lo, hi, script)`` row containing ``c``.

    Raises:
        Exception: when no row of ``scripts`` contains ``c``.
    """
    for lo, hi, name in scripts:
        if lo <= c <= hi:
            return name
    raise Exception("Not in scripts: " + escape_char(c))
341+
def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts):
    """Write the mixed-script confusable table to ``f`` inside a ``/* */`` block.

    For every script a ``/// Script - <name>`` header is written, followed by
    one ``source => targets // target scripts`` line per source, with the
    sources in sorted order.
    """
    f.write("/* " + text + "\n ")
    for script, entries in mixedscript_confusable.items():
        f.write("/// Script - " + script + "\n ")
        for source in sorted(pair[0] for pair in entries.values()):
            source_text = escape_char(source)
            # entries are keyed by the escaped source text
            targets = entries[source_text][1]
            row = source_text + " => " + escape_char_list(targets)
            row += " // " + escape_script_list(targets, scripts) + "\n "
            f.write(row)
    f.write("*/\n ")
354+
355+
def script_list(char_lst, scripts):
    """Return the distinct scripts of ``char_lst`` in first-seen order.

    The marker ``'multi'`` is reported as the pseudo script ``'Z~multi'``;
    every other entry is resolved with ``codepoint_script``.
    """
    distinct = []
    for c in char_lst:
        name = 'Z~multi' if c == 'multi' else codepoint_script(c, scripts)
        if name not in distinct:
            distinct.append(name)
    return distinct
366+
def escape_script_list(char_lst, scripts):
    """Return ``str()`` of the sorted distinct scripts of ``char_lst``."""
    return str(sorted(script_list(char_lst, scripts)))
371+
def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts):
    """Report unresolved prototypes on stdout and abort.

    Does nothing when ``map`` is empty.  Otherwise one line per entry is
    printed (prototype, its sources, and the scripts of both) and an
    exception is raised to signal that new rules are needed.  ``f`` is not
    used.

    Raises:
        Exception: whenever ``map`` is non-empty.
    """
    if not map:
        return
    print("// " + text + "\n ")
    for prototype_text, (prototype, source) in map.items():
        report = prototype_text + " => " + escape_char_list(source)
        report += " // " + escape_script_list(prototype, scripts)
        report += " => " + escape_script_list(source, scripts) + "\n "
        print(report)
    raise Exception("update the python script to add new rules for new data")
381+
106382def format_table_content (f ,content ,indent ):
107383line = " " * indent
108384first = True
@@ -119,18 +395,20 @@ def format_table_content(f, content, indent):
119395f .write (line )
120396
def escape_char(c):
    r"""Format ``c`` as a Rust ``char`` literal such as ``'\u{41}'``.

    Args:
        c: an integer code point, or the marker string ``'multi'``.

    Returns:
        The literal text with the code point in lowercase hex, or a quoted
        placeholder string for ``'multi'``.
    """
    if c == 'multi':
        return "\" <multiple code points>\" "
    # No space between the backslash and `u`: `'\ u{..}'` is not a valid
    # Rust character escape.
    return "'\\u{%x}'" % c
123401
def escape_char_list(l):
    """Format the items of ``l`` as a bracketed, comma-separated list.

    Each item is rendered with ``escape_char``; an empty ``l`` gives ``[]``.
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135413
136414def emit_table (f ,name ,t_data ,t_type = "&'static [(char, char)]" ,is_pub = True ,
@@ -226,7 +504,7 @@ def emit_confusable_detection_module(f):
226504confusable_table .sort (key = lambda w :w [0 ])
227505
228506last_key = None
229- for (k ,v )in confusable_table :
507+ for (k ,_ )in confusable_table :
230508if k == last_key :
231509raise Exception ("duplicate keys in confusables table: %s" % k )
232510last_key = k
@@ -235,6 +513,40 @@ def emit_confusable_detection_module(f):
235513pfun = lambda x :"(%s, &%s)" % (escape_char (x [0 ]),escape_char_list (x [1 ])))
236514f .write ("}\n \n " )
237515
def escape_script_constant(name, longforms):
    """Return ``Script::<X>`` where ``<X>`` is ``longforms[name]`` stripped.

    Raises:
        KeyError: when ``name`` is not a key of ``longforms``.
    """
    variant = longforms[name].strip()
    return "Script::" + variant
518+
# Write the Rust module `rustc_mixed_script_confusable_detection` to `f`.
# The module consists of a fixed header (the function
# `is_rustc_mixed_script_confusable`, which looks `c` up in CONFUSABLES via
# `super::util::bsearch_value_table`) followed by the generated, non-public
# CONFUSABLES table of (char, Script) rows.
519+ def emit_rustc_mixed_script_confusable_detection (f ):
520+ f .write ("pub mod rustc_mixed_script_confusable_detection {" )
521+ f .write ("""
522+ use unicode_script::Script;
523+
524+ #[inline]
525+ pub fn is_rustc_mixed_script_confusable(c: char) -> Option<Script> {
526+ match c as usize {
527+ _ => super::util::bsearch_value_table(c, CONFUSABLES)
528+ }
529+ }
530+
531+ """ )
# Inputs: the 'Allowed' ranges of IdentifierStatus.txt, the script table from
# Scripts.txt, and the confusable data from confusables.txt.
532+ identifier_status_table = load_properties ("IdentifierStatus.txt" )
533+ longforms ,scripts = load_scripts ("Scripts.txt" )
534+ identifier_allowed = identifier_status_table ['Allowed' ]
535+ (mixedscript_confusable ,mixedscript_confusable_unresolved )= load_rustc_mixedscript_confusables ("confusables.txt" ,identifier_allowed ,scripts )
# `debug` is hard-coded to False, so the debug dump guarded by it is skipped.
# NOTE(review): indentation is not visible here, so it is unclear whether the
# `..._unresolved` call (which raises on a non-empty map) is also under the
# `if debug` guard -- confirm against the real file.
536+ debug = False
537+ if debug == True :
538+ debug_emit_mixedscript_confusable (f ,mixedscript_confusable ,"mixedscript_confusable" ,scripts )
539+ debug_emit_mixedscript_confusable_unresolved (f ,mixedscript_confusable_unresolved ,"mixedscript_confusable_unresolved" ,scripts )
# Flatten script -> {escaped text: (source, targets)} into (source, script)
# rows and sort them by source code point before emitting the table.
540+ confusable_table = []
541+ for script ,lst in mixedscript_confusable .items ():
542+ for _ ,pair in lst .items ():
543+ source = pair [0 ]
544+ confusable_table .append ((source ,script ))
545+ confusable_table .sort (key = lambda w :w [0 ])
546+ emit_table (f ,"CONFUSABLES" ,confusable_table ,"&'static [(char, Script)]" ,is_pub = False ,
547+ pfun = lambda x :"(%s,%s)" % (escape_char (x [0 ]),escape_script_constant (x [1 ],longforms )))
# Close the `pub mod` block opened by the first write.
548+ f .write ("}\n \n " )
549+
238550
239551def emit_util_mod (f ):
240552f .write ("""
@@ -301,3 +613,5 @@ def emit_util_mod(f):
301613emit_identifier_module (rf )
302614### confusable_detection module
303615emit_confusable_detection_module (rf )
616+ ### mixed_script_confusable_detection module
617+ emit_rustc_mixed_script_confusable_detection (rf )