@@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
155
155
line = " " * indent + chunk
156
156
f .write (line )
157
157
158
- def load_properties (f ,interestingprops ):
158
+ def load_properties (f ,interestingprops : "list[str | tuple[str, str]] | None" = None ):
159
159
fetch (f )
160
160
props = {}
161
- re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+)" )
162
- re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)" )
161
+ re1 = re .compile (r"^\s *([0-9A-F]+)\s*;\s *(\w+)(?:\s*;\s*(\w+))? " )
162
+ re2 = re .compile (r"^\s *([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s *(\w+)(?:\s*;\s*(\w+))? " )
163
163
164
164
for line in fileinput .input (os .path .basename (f )):
165
165
prop = None
@@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
168
168
m = re1 .match (line )
169
169
if m :
170
170
d_lo = m .group (1 )
171
- d_hi = m . group ( 1 )
171
+ d_hi = d_lo
172
172
prop = m .group (2 )
173
+ value = m .group (3 )
173
174
else :
174
175
m = re2 .match (line )
175
176
if m :
176
177
d_lo = m .group (1 )
177
178
d_hi = m .group (2 )
178
179
prop = m .group (3 )
180
+ value = m .group (4 )
179
181
else :
180
182
continue
181
- if interestingprops and prop not in interestingprops :
183
+ if value is not None :
184
+ prop = (prop ,value )
185
+ if interestingprops is not None and prop not in interestingprops :
182
186
continue
183
187
d_lo = int (d_lo ,16 )
184
188
d_hi = int (d_hi ,16 )
@@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
195
199
def escape_char(c):
    """Format the integer code point `c` as a Rust char literal.

    The result is a single-quoted Rust Unicode escape whose braces hold the
    lowercase hexadecimal value of `c`, with no padding.
    """
    # The backslash must be immediately followed by `u`; any gap yields an
    # escape sequence that Rust rejects.
    return "'\\u{%x}'" % c
197
201
198
- def emit_table (f ,name ,t_data ,t_type = "&'static [(char, char)]" ,is_pub = True ,
202
+ def emit_table (f ,name ,t_data ,t_type = "&[(char, char)]" ,is_pub = True ,
199
203
pfun = lambda x :"(%s,%s)" % (escape_char (x [0 ]),escape_char (x [1 ])),is_const = True ):
200
204
pub_string = "const"
201
205
if not is_const :
@@ -217,7 +221,7 @@ def emit_util_mod(f):
217
221
f .write ("""
218
222
pub mod util {
219
223
#[inline]
220
- pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
224
+ pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
221
225
use core::cmp::Ordering::{Equal, Less, Greater};
222
226
r.binary_search_by(|&(lo,hi)| {
223
227
if lo <= c && c <= hi { Equal }
@@ -252,13 +256,22 @@ def emit_util_mod(f):
252
256
253
257
""" )
254
258
255
def emit_property_module(f, mod, tbl, emit: "list[str | tuple[str, str]]"):
    """Write a public Rust module with one range table and lookup per property.

    Args:
        f: Writable text stream that receives the generated Rust source.
        mod: Name of the Rust module to emit.
        tbl: Mapping indexed by the entries of `emit`; each value is passed
            to `emit_table` as the table data.
        emit: Property keys to emit. A plain string is used directly as the
            Rust identifier; a `(property, value)` tuple is rendered as
            `property_value`.
    """
    f.write("pub mod %s {\n" % mod)

    # Pair every key with its Rust identifier and order by identifier so the
    # generated module does not depend on the order of `emit`.
    cats = sorted(
        (
            (f"{cat[0]}_{cat[1]}" if isinstance(cat, tuple) else cat, cat)
            for cat in emit
        ),
        key=lambda pair: pair[0],
    )

    for cat_str, cat in cats:
        emit_table(f, "%s_table" % cat_str, tbl[cat], is_pub=False)
        f.write("    #[inline]\n")
        f.write("    pub fn %s(c: char) -> bool {\n" % cat_str)
        f.write("        super::util::bsearch_range_table(c, %s_table)\n" % cat_str)
        f.write("    }\n\n")
    f.write("}\n\n")
264
277
@@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
303
316
f .write ((" %sC_" % Name [0 ])+ cat + ",\n " )
304
317
f .write (""" }
305
318
306
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
319
+ fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
307
320
use core::cmp::Ordering::{Equal, Less, Greater};
308
321
match r.binary_search_by(|&(lo, hi, _)| {
309
322
if lo <= c && c <= hi { Equal }
@@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
355
368
else :
356
369
lookup_type = "u32"
357
370
358
- emit_table (f ,"%s_cat_lookup" % name ,lookup_table ,"&'static [%s]" % lookup_type ,
371
+ emit_table (f ,"%s_cat_lookup" % name ,lookup_table ,"&[%s]" % lookup_type ,
359
372
pfun = lambda x :"%d" % x ,
360
373
is_pub = False ,is_const = True )
361
374
362
- emit_table (f ,"%s_cat_table" % name ,break_table ,"&'static [(char, char, %sCat)]" % Name ,
375
+ emit_table (f ,"%s_cat_table" % name ,break_table ,"&[(char, char, %sCat)]" % Name ,
363
376
pfun = lambda x :"(%s,%s,%sC_%s)" % (escape_char (x [0 ]),escape_char (x [1 ]),Name [0 ],x [2 ]),
364
377
is_pub = False ,is_const = True )
365
378
f .write ("}\n " )
@@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):
379
392
380
393
# download and parse all the data
381
394
gencats = load_gencats ("UnicodeData.txt" )
382
- derived = load_properties ("DerivedCoreProperties.txt" , ["Alphabetic" ])
395
+ derived = load_properties ("DerivedCoreProperties.txt" , ["Alphabetic" , ( "InCB" , "Consonant" ), ( "InCB" , "Extend" ), ( "InCB" , "Linker" ) ])
383
396
384
397
emit_util_mod (rf )
385
398
for (name ,cat ,pfuns )in ("general_category" ,gencats , ["N" ]), \
386
- ("derived_property" ,derived , ["Alphabetic" ]):
399
+ ("derived_property" ,derived , ["Alphabetic" , ( "InCB" , "Extend" ) ]):
387
400
emit_property_module (rf ,name ,cat ,pfuns )
388
401
402
# Emit `is_incb_linker` as one `matches!` whose alternatives are the
# InCB=Linker code point ranges.
rf.write("""pub fn is_incb_linker(c: char) -> bool {
    matches!(c,""")

for (lo, hi) in derived[("InCB", "Linker")]:
    rf.write(f" | '\\u{{{lo:X}}}'")
    if lo != hi:
        # Table ranges include both ends, so close the pattern with `..=`
        # and the range's upper bound rather than repeating `lo`.
        rf.write(f"..='\\u{{{hi:X}}}'")

rf.write(")\n}\n\n")
411
+
389
412
### grapheme cluster module
390
413
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
391
- grapheme_cats = load_properties ("auxiliary/GraphemeBreakProperty.txt" , [])
392
-
414
+ grapheme_cats = load_properties ("auxiliary/GraphemeBreakProperty.txt" )
393
415
# Control
394
416
# Note:
395
417
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
@@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
398
420
grapheme_cats ["Control" ]= group_cat (list (
399
421
set (ungroup_cat (grapheme_cats ["Control" ]))
400
422
- set (ungroup_cat ([surrogate_codepoints ]))))
401
-
423
+ grapheme_cats ["InCB_Consonant" ]= derived [("InCB" ,"Consonant" )]
424
+ emoji_props = load_properties ("emoji-data.txt" , ["Extended_Pictographic" ])
425
+ grapheme_cats ["Extended_Pictographic" ]= emoji_props ["Extended_Pictographic" ]
402
426
grapheme_table = []
403
427
for cat in grapheme_cats :
404
428
grapheme_table .extend ([(x ,y ,cat )for (x ,y )in grapheme_cats [cat ]])
405
- emoji_props = load_properties ("emoji-data.txt" , ["Extended_Pictographic" ])
406
- grapheme_table .extend ([(x ,y ,"Extended_Pictographic" )for (x ,y )in emoji_props ["Extended_Pictographic" ]])
407
429
grapheme_table .sort (key = lambda w :w [0 ])
408
430
# The table was just sorted by range start, so any range that begins at or
# before the previous range's end overlaps it.
last = -1
for chars in grapheme_table:
    if chars[0] <= last:
        # Python 3 cannot raise a bare string; use a real exception so the
        # message below is what the user actually sees.
        raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
    last = chars[1]
413
- emit_break_module (rf ,grapheme_table ,list (grapheme_cats .keys ())+ [ "Extended_Pictographic" ] ,"grapheme" )
435
+ emit_break_module (rf ,grapheme_table ,list (grapheme_cats .keys ()),"grapheme" )
414
436
rf .write ("\n " )
415
437
416
- word_cats = load_properties ("auxiliary/WordBreakProperty.txt" , [] )
438
+ word_cats = load_properties ("auxiliary/WordBreakProperty.txt" )
417
439
word_table = []
418
440
for cat in word_cats :
419
441
word_table .extend ([(x ,y ,cat )for (x ,y )in word_cats [cat ]])
@@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):
425
447
emoji_table = [(x ,y ,"Extended_Pictographic" )for (x ,y )in emoji_props ["Extended_Pictographic" ]]
426
448
emit_break_module (rf ,emoji_table , ["Extended_Pictographic" ],"emoji" )
427
449
428
- sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" , [] )
450
+ sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" )
429
451
sentence_table = []
430
452
for cat in sentence_cats :
431
453
sentence_table .extend ([(x ,y ,cat )for (x ,y )in sentence_cats [cat ]])