Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 1fb9c04

Browse files
committed
Implement rustc_mixed_script_confusable_detection.
1 parent 8195ca8, commit 1fb9c04

File tree

4 files changed

+497
-9
lines changed

4 files changed

+497
-9
lines changed

‎scripts/unicode.py‎

Lines changed: 323 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,15 @@ def fetch(f):
4747
sys.stderr.write("cannot load %s\n"%f)
4848
exit(1)
4949

50+
def fetch_unidata(f):
    """Download a Unicode Character Database file unless it is already present.

    The file is looked up in the current working directory under its base
    name. When it is missing, ``curl`` is invoked to fetch it from the
    ``ucd/`` directory of the release named by ``UNICODE_VERSION_NUMBER``.

    Args:
        f: Name of the data file below ``ucd/``, e.g. ``"Scripts.txt"``.

    Raises:
        SystemExit: With status 1 when the file is still missing after the
            download attempt.
    """
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
                  % (UNICODE_VERSION_NUMBER, f))

    if not os.path.exists(local_name):
        # Terminate the message with a newline, as fetch() does, so the shell
        # prompt does not end up glued to the error text.
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)
58+
5059
# Implementation from unicode-segmentation
5160
defload_properties(f,interestingprops=None):
5261
fetch(f)
@@ -81,6 +90,41 @@ def load_properties(f, interestingprops = None):
8190

8291
returnprops
8392

93+
def load_script_properties(f, interestingprops):
    """Parse a UCD property file into ``{property: [(lo, hi), ...]}``.

    Each data line has the form ``CODE ; Value # comment`` or
    ``LO..HI ; Value # comment``. Lines that match neither form are ignored.

    Args:
        f: Name of the UCD file; it is fetched with ``fetch_unidata`` first.
        interestingprops: Property values to keep. A falsy value (``None`` or
            an empty list) keeps every property.

    Returns:
        A dict mapping each property value to the list of inclusive
        ``(lo, hi)`` code point ranges, in file order.
    """
    fetch_unidata(f)
    props = {}
    # Note: this pattern differs from the ones in unicode-segmentation
    # because we need to handle spaces here. The optional second group is the
    # upper bound of a "LO..HI" range.
    entry_re = re.compile(
        r"^ *([0-9A-F]+)(?:\.\.([0-9A-F]+))? *; *([^#]+) *#")

    for line in fileinput.input(os.path.basename(f)):
        m = entry_re.match(line)
        if not m:
            continue
        prop = m.group(3).strip()
        if interestingprops and prop not in interestingprops:
            continue
        d_lo = int(m.group(1), 16)
        # A single code point is stored as the degenerate range (lo, lo).
        d_hi = d_lo if m.group(2) is None else int(m.group(2), 16)
        props.setdefault(prop, []).append((d_lo, d_hi))

    return props
127+
84128
defload_confusables(f):
85129
fetch(f)
86130
confusables= []
@@ -97,12 +141,244 @@ def load_confusables(f):
97141
raiseException('More than one code point in first column')
98142
d_input=int(d_inputs[0].strip(),16)
99143
ford_outputinm.group(2).split():
100-
d_outputitem=int(d_output,16);
101-
d_outputs.append(d_outputitem);
144+
d_outputitem=int(d_output,16)
145+
d_outputs.append(d_outputitem)
102146
confusables.append((d_input,d_outputs))
103147

104148
returnconfusables
105149

150+
def aliases():
    """Fetch the shorthand aliases for each longhand Script name.

    Reads the ``sc`` (Script) rows of ``PropertyValueAliases.txt``.

    Returns:
        A ``(longforms, shortforms)`` pair: ``longforms`` maps a short alias
        to its long name, ``shortforms`` maps a long name to its short alias.

    Raises:
        Exception: If a short alias or a long name occurs more than once.
    """
    fetch_unidata("PropertyValueAliases.txt")
    longforms = {}
    shortforms = {}
    alias_re = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")
    for line in fileinput.input(os.path.basename("PropertyValueAliases.txt")):
        m = alias_re.match(line)
        if not m:
            continue
        short_name = m.group(1).strip()
        long_name = m.group(2).strip()
        # Raise explicitly instead of asserting: these checks validate input
        # data and must survive `python -O`.
        if short_name in longforms:
            raise Exception("duplicate script alias: %s" % short_name)
        if long_name in shortforms:
            raise Exception("duplicate script name: %s" % long_name)
        longforms[short_name] = long_name
        shortforms[long_name] = short_name

    return (longforms, shortforms)
171+
172+
def load_scripts(f):
    """Load the Script property table keyed by short script aliases.

    Args:
        f: Name of the UCD scripts file passed to ``load_script_properties``.

    Returns:
        A ``(longforms, script_table)`` pair. ``longforms`` is the
        short-alias to long-name mapping from ``aliases()``; ``script_table``
        is a list of ``(lo, hi, short_alias)`` tuples sorted by ``lo``.

    Raises:
        KeyError: If a script in ``f`` has no alias in ``aliases()``.
    """
    (longforms, shortforms) = aliases()
    scripts = load_script_properties(f, [])

    # The previous version also collected a sorted list of aliases into a
    # local named `script_list`; it was never read and shadowed the
    # module-level script_list() function, so it is gone.
    script_table = []
    for script in scripts:
        short_name = shortforms[script]
        script_table.extend((lo, hi, short_name) for (lo, hi) in scripts[script])
    script_table.sort(key=lambda w: w[0])
    return (longforms, script_table)
186+
187+
def is_script_ignored_in_mixedscript(source):
    """Return True for the script aliases 'Zinh', 'Zyyy' and 'Zzzz'.

    These aliases are the ones the mixed-script rules treat as ignorable.
    """
    return source in ('Zinh', 'Zyyy', 'Zzzz')
189+
190+
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
    """Classify a source code point against its multi-code-point prototype.

    Here are a few rules to process the current version of Unicode data
    (13.0 at this time). "Ignored" means
    ``is_script_ignored_in_mixedscript`` is true for the script; ``A`` and
    ``B`` are distinct scripts that are not ignored. The left side is the set
    of scripts in the prototype, the right side is ``script_i``:

    * ``A - A`` and ``A ... - A``              -> processed, don't add
    * ``A - B``                                -> processed, add
    * ``ignored - A``                          -> processed, add
    * ``ignored A - B``                        -> processed, add
    * ``ignored ignored - A``                  -> processed, add
    * anything else                            -> not processed, don't add

    Args:
        item_i: The source code point (unused; kept for the call signature).
        script_i: Short script alias of the source code point.
        proto_lst: Code points making up the prototype.
        scripts: Script table accepted by ``script_list``.

    Returns:
        A ``(processed, should_add)`` pair of booleans.
    """
    script_lst = script_list(proto_lst, scripts)
    script_lst.sort()
    script_lst_len = len(script_lst)
    assert script_lst_len > 0

    # The prototype already contains the source's script: nothing is mixed.
    if script_i in script_lst:
        return True, False

    # Every remaining "add" rule requires a source script that is not
    # ignored; with an ignored source no rule applies.
    if is_script_ignored_in_mixedscript(script_i):
        return False, False

    # One prototype script that differs from the source, ignored or not.
    if script_lst_len == 1:
        return True, True

    # Two prototype scripts, at least one of them ignored.
    if script_lst_len == 2 and (
            is_script_ignored_in_mixedscript(script_lst[0])
            or is_script_ignored_in_mixedscript(script_lst[1])):
        return True, True

    # NotProcessed, DontAdd
    return False, False
230+
231+
def is_codepoint_identifier_allowed(c, identifier_allowed):
    """Return True if ``c`` lies inside any range of ``identifier_allowed``.

    Args:
        c: Code point to test.
        identifier_allowed: Iterable of ranges whose first two elements are
            the inclusive lower and upper bounds.
    """
    return any(bounds[0] <= c <= bounds[1] for bounds in identifier_allowed)
236+
237+
def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
    """Compute the mixed-script confusable sets from a confusables file.

    Args:
        f: Name of the confusables file passed to ``load_confusables``.
        identifier_allowed: Ranges accepted by
            ``is_codepoint_identifier_allowed``.
        scripts: Script table accepted by ``codepoint_script``.

    Returns:
        A ``(mixedscript_confusable, mixedscript_confusable_unresolved)``
        pair. The first maps ``script -> escaped source -> (source,
        [targets])``, where a target is a code point or the string
        ``'multi'``. The second maps ``escaped prototype list ->
        (prototype list, [sources])`` for sources that no rule in
        ``process_mixedscript_single_to_multi`` handled.
    """
    confusables = load_confusables(f)

    # Escaped text of every source whose prototype is a single code point;
    # used below to tell whether a prototype is itself listed as a source.
    seekup_map = {}
    for d_source, d_proto_list in confusables:
        assert len(d_proto_list) > 0
        if len(d_proto_list) == 1:
            seekup_map[escape_char(d_source)] = d_proto_list

    # collect prototypes
    codepoint_map = {}       # escaped prototype -> [code points]
    multicodepoint_map = {}  # escaped prototypes -> (prototypes, [sources])
    for d_source, d_proto_list in confusables:
        if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
            continue
        if len(d_proto_list) == 1:
            d_proto = escape_char(d_proto_list[0])
            if d_proto not in codepoint_map:
                codepoint_map[d_proto] = []
                # NOTE(review): the original indentation was lost; the
                # prototype is assumed to be added once, when its bucket is
                # created. Confirm against the upstream file.
                if (d_proto not in seekup_map
                        and is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed)):
                    codepoint_map[d_proto].append(d_proto_list[0])
            codepoint_map[d_proto].append(d_source)
        else:
            d_protos = escape_char_list(d_proto_list)
            if d_protos not in multicodepoint_map:
                multicodepoint_map[d_protos] = (d_proto_list, [])
            multicodepoint_map[d_protos][1].append(d_source)

    mixedscript_confusable = {}

    def confusable_entry_item(confusable, script, item_text, item):
        # Return the target list for `item` under `script`, creating the
        # intermediate entries on first use.
        script_entry = confusable.setdefault(script, {})
        return script_entry.setdefault(item_text, (item, []))[1]

    def add_cross_script_pairs(source):
        # Record every unordered pair of code points in `source` whose
        # scripts differ, in both directions, skipping ignored scripts.
        for i, item_i in enumerate(source):
            for item_j in source[i + 1:]:
                script_i = codepoint_script(item_i, scripts)
                script_j = codepoint_script(item_j, scripts)
                if script_i == script_j:
                    continue
                if not is_script_ignored_in_mixedscript(script_i):
                    confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append(item_j)
                if not is_script_ignored_in_mixedscript(script_j):
                    confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i)

    # between single charpoint that has single charpoint prototype
    for source in codepoint_map.values():
        add_cross_script_pairs(source)

    # between single charpoint that has multi charpoint prototype
    for _, source in multicodepoint_map.values():
        add_cross_script_pairs(source)

    mixedscript_confusable_unresolved = {}
    # single charpoint that has multi charpoint prototype and its prototype
    for proto_lst, source in multicodepoint_map.values():
        # A prototype containing a non-identifier code point can never occur
        # in an identifier, so it cannot be confused with one.
        if not all(is_codepoint_identifier_allowed(c, identifier_allowed) for c in proto_lst):
            continue
        for item_i in source:
            script_i = codepoint_script(item_i, scripts)
            if is_script_ignored_in_mixedscript(script_i):
                continue
            processed, should_add = process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts)
            if should_add:
                assert processed
                confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append('multi')
            if processed:
                continue
            proto_lst_text = escape_char_list(proto_lst)
            if proto_lst_text not in mixedscript_confusable_unresolved:
                mixedscript_confusable_unresolved[proto_lst_text] = (proto_lst, [])
            mixedscript_confusable_unresolved[proto_lst_text][1].append(item_i)
    return (mixedscript_confusable, mixedscript_confusable_unresolved)
335+
336+
def codepoint_script(c, scripts):
    """Return the script of the first range in ``scripts`` that contains ``c``.

    Args:
        c: Code point to look up.
        scripts: Iterable of ``(lo, hi, script)`` tuples with inclusive bounds.

    Raises:
        Exception: If no range contains ``c``.
    """
    for lo, hi, script in scripts:
        if lo <= c <= hi:
            return script
    raise Exception("Not in scripts: " + escape_char(c))
341+
342+
def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts):
    """Write the mixed-script confusable map to ``f`` inside a ``/* */`` comment.

    Args:
        f: Writable text stream.
        mixedscript_confusable: Mapping ``script -> escaped source ->
            (source, [targets])``.
        text: Heading written on the opening comment line.
        scripts: Script table accepted by ``escape_script_list``.
    """
    f.write("/* " + text + "\n")
    for script, lst in mixedscript_confusable.items():
        f.write("/// Script - " + script + "\n")
        # Emit sources in ascending code point order; the entries are looked
        # up again by their escaped text, which is how the map is keyed.
        for source in sorted(entry[0] for entry in lst.values()):
            source_text = escape_char(source)
            target_lst = lst[source_text][1]
            f.write(source_text + " => " + escape_char_list(target_lst) + " // " + escape_script_list(target_lst, scripts) + "\n")
    f.write("*/\n")
354+
355+
356+
def script_list(char_lst, scripts):
    """Return the distinct scripts of ``char_lst`` in first-seen order.

    Args:
        char_lst: Code points; the string ``'multi'`` is accepted as a
            placeholder and is reported as the pseudo-script ``'Z~multi'``.
        scripts: Script table accepted by ``codepoint_script``.
    """
    script_lst = []
    for c in char_lst:
        script = 'Z~multi' if c == 'multi' else codepoint_script(c, scripts)
        if script not in script_lst:
            script_lst.append(script)
    return script_lst
366+
367+
def escape_script_list(char_lst, scripts):
    """Return ``str()`` of the sorted distinct scripts of ``char_lst``."""
    return str(sorted(script_list(char_lst, scripts)))
371+
372+
def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts):
    """Print unresolved confusables and abort if there are any.

    Args:
        f: Unused; the report goes to standard output via ``print``.
        map: Mapping ``escaped prototype list -> (prototype list, [sources])``.
        text: Heading printed before the entries.
        scripts: Script table accepted by ``escape_script_list``.

    Raises:
        Exception: Whenever ``map`` is non-empty, after printing its entries.
    """
    if not map:
        return
    print("// " + text + "\n")
    for prototype_text, pair in map.items():
        prototype, source = pair[0], pair[1]
        print(prototype_text + " => " + escape_char_list(source) + " // " + escape_script_list(prototype, scripts) + " => " + escape_script_list(source, scripts) + "\n")
    raise Exception("update the python script to add new rules for new data")
381+
106382
defformat_table_content(f,content,indent):
107383
line=" "*indent
108384
first=True
@@ -119,18 +395,20 @@ def format_table_content(f, content, indent):
119395
f.write(line)
120396

121397
def escape_char(c):
    """Format a code point as a Rust char literal such as ``'\\u{41}'``.

    The string ``'multi'`` is a placeholder rather than a code point and is
    rendered as the quoted text ``"<multiple code points>"``.
    """
    if c == 'multi':
        return "\"<multiple code points>\""
    return "'\\u{%x}'" % c
123401

124402
def escape_char_list(l):
    """Format code points as a bracketed list of ``escape_char`` results.

    Items are separated by ``", "``; an empty input gives ``"[]"``.
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135413

136414
defemit_table(f,name,t_data,t_type="&'static [(char, char)]",is_pub=True,
@@ -226,7 +504,7 @@ def emit_confusable_detection_module(f):
226504
confusable_table.sort(key=lambdaw:w[0])
227505

228506
last_key=None
229-
for (k,v)inconfusable_table:
507+
for (k,_)inconfusable_table:
230508
ifk==last_key:
231509
raiseException("duplicate keys in confusables table: %s"%k)
232510
last_key=k
@@ -235,6 +513,40 @@ def emit_confusable_detection_module(f):
235513
pfun=lambdax:"(%s, &%s)"% (escape_char(x[0]),escape_char_list(x[1])))
236514
f.write("}\n\n")
237515

516+
def escape_script_constant(name, longforms):
    """Return ``"Script::"`` followed by the stripped ``longforms[name]``.

    Raises:
        KeyError: If ``name`` is not a key of ``longforms``.
    """
    return "Script::" + longforms[name].strip()
518+
519+
def emit_rustc_mixed_script_confusable_detection(f):
    """Write the ``rustc_mixed_script_confusable_detection`` Rust module to ``f``.

    The module holds a private ``CONFUSABLES`` table of ``(char, Script)``
    pairs sorted by code point, plus the lookup function
    ``is_rustc_mixed_script_confusable``.

    Args:
        f: Writable text stream for the generated Rust source.
    """
    f.write("pub mod rustc_mixed_script_confusable_detection {")
    # NOTE(review): leading whitespace inside this literal was lost in the
    # copy this was restored from; four-space Rust indentation is assumed.
    # It only affects the formatting of the generated file.
    f.write("""
    use unicode_script::Script;

    #[inline]
    pub fn is_rustc_mixed_script_confusable(c: char) -> Option<Script> {
        match c as usize {
            _ => super::util::bsearch_value_table(c, CONFUSABLES)
        }
    }

""")
    identifier_status_table = load_properties("IdentifierStatus.txt")
    longforms, scripts = load_scripts("Scripts.txt")
    identifier_allowed = identifier_status_table['Allowed']
    (mixedscript_confusable, mixedscript_confusable_unresolved) = load_rustc_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
    debug = False
    if debug:
        debug_emit_mixedscript_confusable(f, mixedscript_confusable, "mixedscript_confusable", scripts)
        # NOTE(review): assumed to sit under `if debug` as well; the original
        # indentation was lost. This call raises when unresolved entries
        # exist, so confirm the intended placement.
        debug_emit_mixedscript_confusable_unresolved(f, mixedscript_confusable_unresolved, "mixedscript_confusable_unresolved", scripts)
    # One (source, script) row per recorded source; the target lists are only
    # needed for the debug dump above.
    confusable_table = [
        (pair[0], script)
        for script, lst in mixedscript_confusable.items()
        for pair in lst.values()
    ]
    confusable_table.sort(key=lambda w: w[0])
    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, Script)]", is_pub=False,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_script_constant(x[1], longforms)))
    f.write("}\n\n")
549+
238550

239551
defemit_util_mod(f):
240552
f.write("""
@@ -301,3 +613,5 @@ def emit_util_mod(f):
301613
emit_identifier_module(rf)
302614
### confusable_detection module
303615
emit_confusable_detection_module(rf)
616+
### mixed_script_confusable_detection module
617+
emit_rustc_mixed_script_confusable_detection(rf)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp