@@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
155155line = " " * indent + chunk
156156f .write (line )
157157
158- def load_properties (f ,interestingprops ):
158+ def load_properties (f ,interestingprops : "list[str | tuple[str, str]] | None" = None ):
159159fetch (f )
160160props = {}
161- re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+)" )
162- re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)" )
161+ re1 = re .compile (r"^\s *([0-9A-F]+)\s*;\s *(\w+)(?:\s*;\s*(\w+))? " )
162+ re2 = re .compile (r"^\s *([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s *(\w+)(?:\s*;\s*(\w+))? " )
163163
164164for line in fileinput .input (os .path .basename (f )):
165165prop = None
@@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
168168m = re1 .match (line )
169169if m :
170170d_lo = m .group (1 )
171- d_hi = m . group ( 1 )
171+ d_hi = d_lo
172172prop = m .group (2 )
173+ value = m .group (3 )
173174else :
174175m = re2 .match (line )
175176if m :
176177d_lo = m .group (1 )
177178d_hi = m .group (2 )
178179prop = m .group (3 )
180+ value = m .group (4 )
179181else :
180182continue
181- if interestingprops and prop not in interestingprops :
183+ if value is not None :
184+ prop = (prop ,value )
185+ if interestingprops is not None and prop not in interestingprops :
182186continue
183187d_lo = int (d_lo ,16 )
184188d_hi = int (d_hi ,16 )
@@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
def escape_char(c):
    r"""Format the integer code point *c* as a Rust ``char`` literal.

    The result has the form ``'\u{41}'``: lowercase hexadecimal digits with
    no padding, wrapped in single quotes.

    Args:
        c: integer code point to format.

    Returns:
        The Rust source text of the corresponding ``char`` literal.
    """
    # No whitespace may appear between the backslash and `u`; otherwise the
    # emitted text is not a valid Rust unicode escape.
    return "'\\u{%x}'" % c
197201
198- def emit_table (f ,name ,t_data ,t_type = "&'static [(char, char)]" ,is_pub = True ,
202+ def emit_table (f ,name ,t_data ,t_type = "&[(char, char)]" ,is_pub = True ,
199203pfun = lambda x :"(%s,%s)" % (escape_char (x [0 ]),escape_char (x [1 ])),is_const = True ):
200204pub_string = "const"
201205if not is_const :
@@ -217,7 +221,7 @@ def emit_util_mod(f):
217221f .write ("""
218222pub mod util {
219223 #[inline]
220- pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
224+ pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
221225 use core::cmp::Ordering::{Equal, Less, Greater};
222226 r.binary_search_by(|&(lo,hi)| {
223227 if lo <= c && c <= hi { Equal }
@@ -252,13 +256,22 @@ def emit_util_mod(f):
252256
253257""" )
254258
def emit_property_module(f, mod, tbl, emit: "list[str | tuple[str, str]]"):
    """Write a public Rust module of range-table membership predicates.

    For each category key in *emit*, a private ``<name>_table`` constant is
    written through ``emit_table``, followed by an ``#[inline]``
    ``pub fn <name>(c: char) -> bool`` that looks the character up with
    ``super::util::bsearch_range_table``. Categories are emitted in order of
    their Rust name.

    Args:
        f: writable text stream that receives the generated Rust source.
        mod: name of the Rust module to open.
        tbl: mapping from category key to the table data passed to
            ``emit_table``.
        emit: category keys to emit. A plain string is used as the Rust name
            directly; a ``(property, value)`` tuple is named
            ``property_value``.
    """
    f.write("pub mod %s {\n " % mod)

    # Pair every key with the identifier used in the Rust output. The
    # identifier must not contain whitespace, or the generated `fn`/const
    # names would not be valid Rust.
    named_cats = sorted(
        (
            (f"{cat[0]}_{cat[1]}" if isinstance(cat, tuple) else cat, cat)
            for cat in emit
        ),
        key=lambda pair: pair[0],
    )

    for cat_str, cat in named_cats:
        emit_table(f, "%s_table" % cat_str, tbl[cat], is_pub=False)
        f.write(" #[inline]\n ")
        f.write(" pub fn %s(c: char) -> bool {\n " % cat_str)
        f.write(" super::util::bsearch_range_table(c, %s_table)\n " % cat_str)
        f.write(" }\n \n ")
    f.write("}\n \n ")
264277
@@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
303316f .write ((" %sC_" % Name [0 ])+ cat + ",\n " )
304317f .write (""" }
305318
306- fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
319+ fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
307320 use core::cmp::Ordering::{Equal, Less, Greater};
308321 match r.binary_search_by(|&(lo, hi, _)| {
309322 if lo <= c && c <= hi { Equal }
@@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
355368else :
356369lookup_type = "u32"
357370
358- emit_table (f ,"%s_cat_lookup" % name ,lookup_table ,"&'static [%s]" % lookup_type ,
371+ emit_table (f ,"%s_cat_lookup" % name ,lookup_table ,"&[%s]" % lookup_type ,
359372pfun = lambda x :"%d" % x ,
360373is_pub = False ,is_const = True )
361374
362- emit_table (f ,"%s_cat_table" % name ,break_table ,"&'static [(char, char, %sCat)]" % Name ,
375+ emit_table (f ,"%s_cat_table" % name ,break_table ,"&[(char, char, %sCat)]" % Name ,
363376pfun = lambda x :"(%s,%s,%sC_%s)" % (escape_char (x [0 ]),escape_char (x [1 ]),Name [0 ],x [2 ]),
364377is_pub = False ,is_const = True )
365378f .write ("}\n " )
@@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):
379392
# Fetch and parse the source data files.
gencats = load_gencats("UnicodeData.txt")
derived = load_properties(
    "DerivedCoreProperties.txt",
    ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")],
)

# Shared Rust helpers first, then one module per property family.
emit_util_mod(rf)
for name, cat, pfuns in (
    ("general_category", gencats, ["N"]),
    ("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]),
):
    emit_property_module(rf, name, cat, pfuns)
388401
# Emit `is_incb_linker` as a single `matches!` over every InCB=Linker range.
rf.write("pub fn is_incb_linker(c: char) -> bool {\n    matches!(c,")

for (lo, hi) in derived[("InCB", "Linker")]:
    rf.write(f" | '\\u{{{lo:X}}}'")
    if lo != hi:
        # The (lo, hi) ranges are inclusive on both ends (the generated
        # bsearch helpers test `lo <= c && c <= hi`), so the pattern needs
        # `..=` and the upper bound `hi`, not `lo` repeated.
        rf.write(f"..='\\u{{{hi:X}}}'")

rf.write(")\n }\n \n ")
411+
389412### grapheme cluster module
390413# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
391- grapheme_cats = load_properties ("auxiliary/GraphemeBreakProperty.txt" , [])
392-
414+ grapheme_cats = load_properties ("auxiliary/GraphemeBreakProperty.txt" )
393415# Control
394416# Note:
395417# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
@@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
398420grapheme_cats ["Control" ]= group_cat (list (
399421set (ungroup_cat (grapheme_cats ["Control" ]))
400422- set (ungroup_cat ([surrogate_codepoints ]))))
401-
423+ grapheme_cats ["InCB_Consonant" ]= derived [("InCB" ,"Consonant" )]
424+ emoji_props = load_properties ("emoji-data.txt" , ["Extended_Pictographic" ])
425+ grapheme_cats ["Extended_Pictographic" ]= emoji_props ["Extended_Pictographic" ]
402426grapheme_table = []
403427for cat in grapheme_cats :
404428grapheme_table .extend ([(x ,y ,cat )for (x ,y )in grapheme_cats [cat ]])
405- emoji_props = load_properties ("emoji-data.txt" , ["Extended_Pictographic" ])
406- grapheme_table .extend ([(x ,y ,"Extended_Pictographic" )for (x ,y )in emoji_props ["Extended_Pictographic" ]])
407429grapheme_table .sort (key = lambda w :w [0 ])
# Sanity check on the table sorted by range start: no range may begin at or
# before the end of the range that precedes it.
last = -1
for chars in grapheme_table:
    if chars[0] <= last:
        # Raising a bare string is itself a TypeError on Python 3, which hides
        # this message; raise a real exception so the cause is reported.
        raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
    last = chars[1]
413- emit_break_module (rf ,grapheme_table ,list (grapheme_cats .keys ())+ [ "Extended_Pictographic" ] ,"grapheme" )
435+ emit_break_module (rf ,grapheme_table ,list (grapheme_cats .keys ()),"grapheme" )
414436rf .write ("\n " )
415437
416- word_cats = load_properties ("auxiliary/WordBreakProperty.txt" , [] )
438+ word_cats = load_properties ("auxiliary/WordBreakProperty.txt" )
417439word_table = []
418440for cat in word_cats :
419441word_table .extend ([(x ,y ,cat )for (x ,y )in word_cats [cat ]])
@@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):
425447emoji_table = [(x ,y ,"Extended_Pictographic" )for (x ,y )in emoji_props ["Extended_Pictographic" ]]
426448emit_break_module (rf ,emoji_table , ["Extended_Pictographic" ],"emoji" )
427449
428- sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" , [] )
450+ sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" )
429451sentence_table = []
430452for cat in sentence_cats :
431453sentence_table .extend ([(x ,y ,cat )for (x ,y )in sentence_cats [cat ]])