3434#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
3535'''
3636
37- UNICODE_VERSION = (12 , 1 ,0 )
37+ UNICODE_VERSION = (13 , 0 ,0 )
3838
3939UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION
4040
@@ -54,7 +54,7 @@ def load_properties(f, interestingprops = None):
5454re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+)" )
5555re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)" )
5656
57- for line in fileinput .input (os .path .basename (f )):
57+ for line in fileinput .input (os .path .basename (f ), openhook = fileinput . hook_encoded ( "utf-8" ) ):
5858prop = None
5959d_lo = 0
6060d_hi = 0
@@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):
8181
8282return props
8383
def load_confusables(f):
    """Parse a confusables data file into (source, prototype) pairs.

    Calls ``fetch(f)`` first, then reads the file named ``os.path.basename(f)``
    as UTF-8. A line is used only if it matches
    ``<hex> ;<TAB><hex> [<hex> ...] ;<TAB><word>``; every other line
    (comments, blank lines) is skipped.

    Returns a list of ``(code_point, [code_point, ...])`` tuples, in file
    order, with every code point converted from hexadecimal to ``int``.

    Raises ``Exception`` if the first column of a matching line contains more
    than one code point.
    """
    fetch(f)
    confusables = []
    re1 = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")

    try:
        for line in fileinput.input(os.path.basename(f),
                                    openhook=fileinput.hook_encoded("utf-8")):
            m = re1.match(line)
            if not m:
                continue
            d_inputs = m.group(1).split()
            if len(d_inputs) != 1:
                raise Exception('More than one code point in first column')
            d_input = int(d_inputs[0], 16)
            d_outputs = [int(d_output, 16) for d_output in m.group(2).split()]
            confusables.append((d_input, d_outputs))
    finally:
        # fileinput.input() uses module-global state. Close it even when
        # parsing aborts, so the file handle is released and a later
        # fileinput.input() call does not fail with "input() already active".
        fileinput.close()

    return confusables
105+
84106def format_table_content (f ,content ,indent ):
85107line = " " * indent
86108first = True
@@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
def escape_char(c):
    r"""Format the integer code point ``c`` as a Rust ``char`` literal.

    The result uses Rust's Unicode escape with lowercase hex digits, e.g.
    ``0x41`` becomes ``'\u{41}'``. No whitespace may appear between the
    backslash and ``u``, otherwise the generated Rust does not compile.
    """
    return "'\\u{%x}'" % c
101123
def escape_char_list(l):
    """Format an iterable of code points as a Rust array literal of chars.

    Each element is rendered with ``escape_char`` and the results are joined
    with ``", "`` inside square brackets. An empty input yields ``"[]"``.
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135+
102136def emit_table (f ,name ,t_data ,t_type = "&'static [(char, char)]" ,is_pub = True ,
103137pfun = lambda x :"(%s,%s)" % (escape_char (x [0 ]),escape_char (x [1 ])),is_const = True ):
104138pub_string = "const"
@@ -173,10 +207,51 @@ def emit_identifier_module(f):
173207pfun = lambda x :"(%s,%s, IdentifierType::%s)" % (escape_char (x [0 ]),escape_char (x [1 ]),x [2 ]))
174208f .write ("}\n \n " )
175209
def emit_confusable_detection_module(f):
    """Write the Rust ``confusable_detection`` module to the file object ``f``.

    The module consists of a ``char_confusable_prototype`` lookup function and
    a private ``CONFUSABLES`` table built from ``confusables.txt``. The table
    is sorted by source code point, because the generated lookup does a binary
    search over it.

    Raises ``Exception`` if two table entries share the same source code point.
    """
    # Module opener followed by the lookup function.
    f.write("pub mod confusable_detection {")
    f.write("""

#[inline]
pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
// FIXME: do we want to special case ASCII here?
match c as usize {
_ => super::util::bsearch_value_table(c, CONFUSABLES)
}
}

""")

    f.write(" // Confusable table:\n ")
    entries = sorted(load_confusables("confusables.txt"), key=lambda pair: pair[0])

    # After sorting, equal keys are adjacent, so comparing neighbours is
    # enough to detect any duplicate.
    for previous, current in zip(entries, entries[1:]):
        if previous[0] == current[0]:
            raise Exception("duplicate keys in confusables table: %s" % current[0])

    def format_entry(entry):
        # One table row: (source char, &[prototype chars]).
        return "(%s, &%s)" % (escape_char(entry[0]), escape_char_list(entry[1]))

    emit_table(f, "CONFUSABLES", entries, "&'static [(char, &'static [char])]",
               is_pub=False, pfun=format_entry)
    f.write("}\n \n ")
237+
238+
176239def emit_util_mod (f ):
177240f .write ("""
178241pub mod util {
179242 use core::result::Result::{Ok, Err};
243+
244+ #[inline]
245+ pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
246+ match r.binary_search_by_key(&c, |&(k, _)| k) {
247+ Ok(idx) => {
248+ let (_, v) = r[idx];
249+ Some(v)
250+ }
251+ Err(_) => None
252+ }
253+ }
254+
180255 #[inline]
181256 pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182257 use core::cmp::Ordering::{Equal, Less, Greater};
@@ -224,3 +299,5 @@ def emit_util_mod(f):
224299emit_util_mod (rf )
225300### identifier module
226301emit_identifier_module (rf )
302+ ### confusable_detection module
303+ emit_confusable_detection_module (rf )