Expand Up @@ -35,6 +35,8 @@ // NOTE: The following code was generated by "scripts/unicode.py", do not edit directly #![allow(missing_docs, non_upper_case_globals, non_snake_case)] use super::ScriptExtension; ''' UNICODE_VERSION = (12, 0, 0) Expand Down Expand Up @@ -183,182 +185,102 @@ def emit_search(f): } """) def emit_enums(f, script_list, extension_list, longforms, intersections ): def emit_enums(f, script_list, extension_list, longforms): """ Emit the Script and ScriptExtension enums as well as any related utility functions """ f.write(""" use core::convert::TryFrom; #[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)] #[non_exhaustive] #[allow(non_camel_case_types)] /// A value of the Script property #[repr(u8)] /// A value of the `Script` property pub enum Script { /// Unknown script Unknown, Unknown = 0xFF, /// Zyyy Common = 0xFE, /// Zinh, Inherited = 0xFD, """) for script in script_list: f.write(" /// %s\n %s,\n" % (script, longforms[script])) f.write("""} #[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)] #[non_exhaustive] /// A value for the Script_Extension property /// /// Script_Extension is one or more Script /// /// This is essentially an optimized version of Vec<Script>, /// optimized by script sets and intersections actually present in Unicode. pub enum ScriptExtension { /// A single script Single(Script), for (i, script) in enumerate(script_list): f.write(" /// %s\n %s = %s,\n" % (script, longforms[script], i)) f.write("}\n") f.write("pub const NEXT_SCRIPT: u8 = %s;" % len(script_list)) f.write(""" pub mod script_extensions { use crate::ScriptExtension; pub const COMMON: ScriptExtension = ScriptExtension::new_common(); pub const INHERITED: ScriptExtension = ScriptExtension::new_inherited(); pub const UNKNOWN: ScriptExtension = ScriptExtension::new_unknown(); """) for (i, script) in enumerate(script_list): first = 0 second = 0 third = 0 # need to replace L because `hex()` will spit out an L suffix for larger numbers if i < 64: first = hex(1 << i).replace("L", "") elif i < 128: second = hex(1 << (i - 64)).replace("L", "") else: third = hex(1 << (i - 128)).replace("L", "") f.write(" /// %s\n pub const %s: ScriptExtension = ScriptExtension::new(%s, %s, %s);\n" % (longforms[script], longforms[script].upper(), first, second, third)) if script != longforms[script]: f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" % (longforms[script], script.upper(), longforms[script].upper())) for ext in extension_list: longform = ", ".join([longforms[s] for s in ext]) f.write(" /// %s\n %s,\n" % (longform, "".join(ext))) name = "_".join([s.upper() for s in ext]) expr = ext[0].upper() for e in ext[1:]: expr = "%s.union(%s)" % (expr, e.upper()) f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" % (longform, name, expr)) f.write("""} impl From<Script> for ScriptExtension { fn from(script: Script) -> Self { ScriptExtension::Single(script) } } impl TryFrom<ScriptExtension> for Script { type Error = (); fn try_from(ext: ScriptExtension) -> Result<Self, ()> { match ext { ScriptExtension::Single(s) => Ok(s), _ => Err(()) } } } impl Script { #[inline] pub(crate) fn inner_full_name(self) -> &'static str { match self { Script::Unknown => "Unknown", Script::Common => "Common", Script::Inherited => "Inherited", """) for script in script_list: f.write(" Script::%s => \"%s\",\n" % (longforms[script], longforms[script])) f.write(""" } } #[inline] pub(crate) fn inner_short_name(self) -> &'static str { match self { Script::Unknown => "", Script::Common => "Zyyy", Script::Inherited => "Zinh", """) for script in script_list: f.write(" Script::%s => \"%s\",\n" % (longforms[script], script)) f.write(""" } } } impl ScriptExtension { #[inline] #[cfg(feature = "with_std")] pub(crate) fn inner_scripts(self) -> Vec<Script> { match self { ScriptExtension::Single(s) => vec![s], pub(crate) fn for_integer(value: u8) -> Self { match value { """) for ext in extension_list: scripts = ", ".join(["Script::%s" % longforms[s] for s in ext]) f.write(" %s => vec![%s],\n" % (extension_name(ext), scripts)) f.write(""" _ => unreachable!() } } #[inline] pub(crate) fn inner_contains_script(self, other: Script) -> bool { match self { ScriptExtension::Single(s) => s == other, """) for ext in extension_list: scripts = " || ".join(["other == Script::%s" % longforms[s] for s in ext]) f.write(" %s => %s,\n" % (extension_name(ext), scripts)) f.write(""" } } #[inline] pub(crate) fn inner_intersect(self, other: Self) -> Self { match (self, other) { (ScriptExtension::Single(Script::Unknown), _) | (_, ScriptExtension::Single(Script::Unknown)) => ScriptExtension::Single(Script::Unknown), (a, b) if a == b => a, (ScriptExtension::Single(Script::Common), a) | (ScriptExtension::Single(Script::Inherited), a) | (a, ScriptExtension::Single(Script::Common)) | (a, ScriptExtension::Single(Script::Inherited)) => a, (ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) if o.inner_contains_script(s) => ScriptExtension::Single(s), """) for (e1, e2, i) in intersections: f.write(" (%s, %s) => %s,\n" % (extension_name(e1), extension_name(e2), extension_name(i, longforms))) f.write(""" _ => ScriptExtension::Single(Script::Unknown), for (i, script) in enumerate(script_list): f.write(" %s => Script::%s,\n" % (i, longforms[script])) f.write(""" _ => unreachable!(), } } } """) def compute_intersections_elements(extension_list): """ Compute all intersections between the script extensions. This will add new elements to extension_list, be sure to call it first! """ # This is the only third-level intersection # It's easier to hardcode things here rather than # do the below calculation in a loop extension_list.append(['Deva', 'Knda', 'Tirh']) intersections = [] # Some intersections will not exist in extension_list and we'll need to add them new_elements = [] sets = [(e, set(e)) for e in extension_list] for (e1, s1) in sets: for (e2, s2) in sets: if e1 == e2: continue intersection = s1.intersection(s2) if len(intersection) > 0: intersection = [i for i in intersection] intersection.sort() if len(intersection) > 1 and intersection not in extension_list and intersection not in new_elements: new_elements.append(intersection) if (e1, e2, intersection) not in intersections: intersections.append((e1, e2, intersection)) extension_list.extend(new_elements) # We now go through the newly added second-level extension values and calculate their intersections # with the original set and each other new_sets = [(e, set(e)) for e in new_elements] sets = [(e, set(e)) for e in extension_list] for (e1, s1) in new_sets: for (e2, s2) in sets: if e1 == e2: continue intersection = s1.intersection(s2) if len(intersection) > 0: intersection = [i for i in intersection] intersection.sort() if len(intersection) > 1 and intersection not in extension_list: raise "Found new third-level intersection, please hardcode it" # The previous routine would automatically get both versions # of an intersection because it would iterate each pair in both orders, # but here we're working on an asymmetric pair, so we insert both in order to not # miss anything if (e1, e2, intersection) not in intersections: intersections.append((e1, e2, intersection)) if (e2, e1, intersection) not in intersections: intersections.append((e2, e1, intersection)) intersections.sort() return intersections def extension_name(ext, longforms={}): def extension_name(ext): """Get the rust source for a given ScriptExtension""" if len(ext) == 1: return "ScriptExtension::Single(Script::%s)" % longforms[ext[0]] else: return "ScriptExtension::%s" % "".join(ext) return "script_extensions::%s" % "_".join([e.upper() for e in ext]) Expand All @@ -385,8 +307,10 @@ def extension_name(ext, longforms={}): script_list = [] for script in scripts: script_list.append(shortforms[script]) if script not in ["Common", "Unknown", "Inherited"]: script_list.append(shortforms[script]) script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]]) script_list.sort() script_table.sort(key=lambda w: w[0]) Expand All @@ -404,14 +328,13 @@ def extension_name(ext, longforms={}): extension_table.extend([(x, y, output_ext) for (x, y) in extensions[ext]]) extension_table.sort(key=lambda w: w[0]) intersections = compute_intersections_elements(extension_list) emit_enums(rf, script_list, extension_list, longforms, intersections ) emit_enums(rf, script_list, extension_list, longforms) emit_search(rf) emit_table(rf, "SCRIPTS", script_table, t_type = "&'static [(char, char, Script)]", is_pub=False , pfun=lambda x: "(%s,%s, Script::%s)" % (escape_char(x[0]), escape_char(x[1]), longforms[x[2]])) emit_table(rf, "SCRIPT_EXTENSIONS", extension_table, t_type = "&'static [(char, char, ScriptExtension)]", is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2], longforms ))) is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2]))) # emit_table(rf, "FOObar", properties)