@@ -47,37 +47,39 @@ def fetch(f):
4747sys .stderr .write ("cannot load %s\n " % f )
4848exit (1 )
4949
50- # load identifier status data
51- def load_identifier_status ():
52- f = "IdentifierStatus.txt"
50+ # Implementation from unicode-segmentation
51+ def load_properties (f ,interestingprops = None ):
5352fetch (f )
54- statuses = []
55- re1 = re .compile ("^ ([0-9A-F]+)+; + (\w+)" )
56- re2 = re .compile ("^ ([0-9A-F]+)\.\.([0-9A-F]+)+; + (\w+)" )
53+ props = {}
54+ re1 = re .compile (r"^ * ([0-9A-F]+)*; * (\w+)" )
55+ re2 = re .compile (r"^ * ([0-9A-F]+)\.\.([0-9A-F]+)*; * (\w+)" )
5756
58- for line in fileinput .input (f ):
57+ for line in fileinput .input (os .path .basename (f )):
58+ prop = None
5959d_lo = 0
6060d_hi = 0
61- cat = None
6261m = re1 .match (line )
6362if m :
6463d_lo = m .group (1 )
6564d_hi = m .group (1 )
66- cat = m .group (2 )
65+ prop = m .group (2 ). strip ( )
6766else :
6867m = re2 .match (line )
6968if m :
7069d_lo = m .group (1 )
7170d_hi = m .group (2 )
72- cat = m .group (3 )
71+ prop = m .group (3 ). strip ( )
7372else :
7473continue
75- if cat != "Allowed" :
74+ if interestingprops and prop not in interestingprops :
7675continue
7776d_lo = int (d_lo ,16 )
7877d_hi = int (d_hi ,16 )
79- statuses .append ((d_lo ,d_hi ))
80- return statuses
78+ if prop not in props :
79+ props [prop ]= []
80+ props [prop ].append ((d_lo ,d_hi ))
81+
82+ return props
8183
8284def format_table_content (f ,content ,indent ):
8385line = " " * indent
@@ -115,41 +117,95 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
115117format_table_content (f ,data ,8 )
116118f .write ("\n ];\n \n " )
117119
118- def emit_identifier_status_module ( f , statuses_table ):
119- f .write ("pub modidentifier_status {" )
120+ def emit_identifier_module ( f ):
121+ f .write ("pub modidentifier {" )
120122f .write ("""
121- use core::result::Result::{Ok, Err};
122123
124+ #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
125+ #[allow(non_camel_case_types)]
126+ /// https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type
127+ pub enum IdentifierType {
128+ // Restricted
129+ Not_Character,
130+ Deprecated,
131+ Default_Ignorable,
132+ Not_NFKC,
133+ Not_XID,
134+ Exclusion,
135+ Obsolete,
136+ Technical,
137+ Uncommon_Use,
138+ Limited_Use,
139+
140+ // Allowed
141+ Inclusion,
142+ Recommended
143+ }
123144 #[inline]
124- fn bsearch_range_value_table(c: char, r: &'static [(char, char)]) -> bool {
125- use core::cmp::Ordering::{Equal, Less, Greater};
126- match r.binary_search_by(|&(lo, hi)| {
127- if lo <= c && c <= hi { Equal }
128- else if hi < c { Less }
129- else { Greater }
130- }) {
131- Ok(_) => true,
132- Err(_) => false
145+ pub fn identifier_status_allowed(c: char) -> bool {
146+ // FIXME: do we want to special case ASCII here?
147+ match c as usize {
148+ _ => super::util::bsearch_range_table(c, IDENTIFIER_STATUS)
133149 }
134150 }
135- """ )
136151
137- f .write ("""
138152 #[inline]
139- pub fnidentifier_status_allowed (c: char) ->bool {
153+ pub fnidentifier_type (c: char) ->Option<IdentifierType> {
140154 // FIXME: do we want to special case ASCII here?
141155 match c as usize {
142- _ => bsearch_range_value_table(c,identifier_status_table )
156+ _ =>super::util:: bsearch_range_value_table(c,IDENTIFIER_TYPE )
143157 }
144158 }
145-
146159""" )
147160
148- f .write (" // identifier status table.\n " )
149- emit_table (f ,"identifier_status_table" ,statuses_table ,"&'static [(char, char)]" ,is_pub = False ,
161+ f .write (" // Identifier status table:\n " )
162+ identifier_status_table = load_properties ("IdentifierStatus.txt" )
163+ emit_table (f ,"IDENTIFIER_STATUS" ,identifier_status_table ['Allowed' ],"&'static [(char, char)]" ,is_pub = False ,
150164pfun = lambda x :"(%s,%s)" % (escape_char (x [0 ]),escape_char (x [1 ])))
165+ identifier_type = load_properties ("IdentifierType.txt" )
166+ type_table = []
167+ for ty in identifier_type :
168+ type_table .extend ([(x ,y ,ty )for (x ,y )in identifier_type [ty ]])
169+
170+ type_table .sort (key = lambda w :w [0 ])
171+
172+ emit_table (f ,"IDENTIFIER_TYPE" ,type_table ,"&'static [(char, char, IdentifierType)]" ,is_pub = False ,
173+ pfun = lambda x :"(%s,%s, IdentifierType::%s)" % (escape_char (x [0 ]),escape_char (x [1 ]),x [2 ]))
151174f .write ("}\n \n " )
152175
176+ def emit_util_mod (f ):
177+ f .write ("""
178+ pub mod util {
179+ use core::result::Result::{Ok, Err};
180+ #[inline]
181+ pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182+ use core::cmp::Ordering::{Equal, Less, Greater};
183+ r.binary_search_by(|&(lo,hi)| {
184+ if lo <= c && c <= hi { Equal }
185+ else if hi < c { Less }
186+ else { Greater }
187+ }).is_ok()
188+ }
189+
190+ pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
191+ use core::cmp::Ordering::{Equal, Less, Greater};
192+ match r.binary_search_by(|&(lo, hi, _)| {
193+ if lo <= c && c <= hi { Equal }
194+ else if hi < c { Less }
195+ else { Greater }
196+ }) {
197+ Ok(idx) => {
198+ let (_, _, cat) = r[idx];
199+ Some(cat)
200+ }
201+ Err(_) => None
202+ }
203+ }
204+
205+ }
206+
207+ """ )
208+
153209if __name__ == "__main__" :
154210r = "tables.rs"
155211if os .path .exists (r ):
@@ -164,6 +220,7 @@ def emit_identifier_status_module(f, statuses_table):
164220pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
165221
166222""" % UNICODE_VERSION )
167- ### identifier status module
168- identifier_status_table = load_identifier_status ()
169- emit_identifier_status_module (rf ,identifier_status_table )
223+
224+ emit_util_mod (rf )
225+ ### identifier module
226+ emit_identifier_module (rf )