Feb 24, 2025 · Feb 23, 2025 · Feb 23, 2025 · Feb 23, 2025 · Feb 23, 2025 · Feb 23, 2025
diff --git a/scripts/unicode.py b/scripts/unicode.py
 # - confusables.txt
 # - ReadMe.txt
 # This script also uses the following Unicode UCD data:
 # - DerivedCoreProperties.txt
 # - Scripts.txt
 #
 # Since this should not require frequent updates, we just store this
        sys.stderr.write("cannot load %s\n" % f)
        exit(1)

    return f

 # Download a UCD table file
 def fetch_unidata(f):
    if not os.path.exists(os.path.basename(f)):
        sys.stderr.write("cannot load %s" % f)
        exit(1)

 # Loads code point data from IdentifierStatus.txt and
 # IdentifierType.txt
 # Implementation from unicode-segmentation
    return f

 # Loads code point data from provided filename f
 # Implementation adapted from unicode-segmentation
 def load_properties(f, interestingprops = None):
    fetch(f)
    props = {}
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
    re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#\s]+) *#")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *#")

    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        prop = None

    return props

 # Loads script data from Scripts.txt
 def load_script_properties(f, interestingprops):
    fetch_unidata(f)
    props = {}
    # Note: these regexes are different from those in unicode-segmentation,
    # becase we need to handle spaces here
    re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")

    for line in fileinput.input(os.path.basename(f)):
        prop = None
        d_lo = 0
        d_hi = 0
        m = re1.match(line)
        if m:
            d_lo = m.group(1)
            d_hi = m.group(1)
            prop = m.group(2).strip()
        else:
            m = re2.match(line)
            if m:
                d_lo = m.group(1)
                d_hi = m.group(2)
                prop = m.group(3).strip()
            else:
                continue
        if interestingprops and prop not in interestingprops:
            continue
        d_lo = int(d_lo, 16)
        d_hi = int(d_hi, 16)
        if prop not in props:
            props[prop] = []
        props[prop].append((d_lo, d_hi))

    return props

 # Loads confusables data from confusables.txt
 def load_confusables(f):
    fetch(f)
    # changes are introduced, update accordingly.

    (longforms, shortforms) = aliases()
    scripts =load_script_properties(f, [])
    scripts =load_properties(fetch_unidata(f), [])

    script_table = []
    script_list = []
 """)

    f.write("    // Identifier status table:\n")
    identifier_status_table = load_properties("IdentifierStatus.txt")
    identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
    emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
            pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
    identifier_type = load_properties("IdentifierType.txt")
    identifier_type = load_properties(fetch("IdentifierType.txt"))
    type_table = []
    for ty in identifier_type:
        type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]])
            pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
    f.write("}\n\n")

 def emit_default_ignorable_detection_module(f):
    f.write("pub mod default_ignorable_code_point {")
    f.write("""

    #[inline]
    pub fn default_ignorable_code_point(c: char) -> bool {
        match c as usize {
            _ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE)
        }
    }

 """)

    f.write("    // Default ignorable code point table:\n")
    default_ignorable_table = load_properties(fetch_unidata("DerivedCoreProperties.txt"), ["Default_Ignorable_Code_Point"])
    emit_table(f, "DEFAULT_IGNORABLE", default_ignorable_table["Default_Ignorable_Code_Point"], "&'static [(char, char)]", is_pub=False,
            pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))

    f.write("}\n\n")

 def emit_confusable_detection_module(f):
    f.write("pub mod confusable_detection {")
    f.write("""
        }
    }
 """)
    identifier_status_table = load_properties("IdentifierStatus.txt")
    identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
    _, scripts = load_scripts("Scripts.txt")
    identifier_allowed = identifier_status_table['Allowed']
    (mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
        emit_util_mod(rf)
        ### identifier module
        emit_identifier_module(rf)
        ### default_ignorable_detection module
        emit_default_ignorable_detection_module(rf)
        ### confusable_detection module
        emit_confusable_detection_module(rf)
        ### mixed_script_confusable_detection module
diff --git a/src/confusable_detection.rs b/src/confusable_detection.rs

 /// Calculate skeleton for string, as defined by UTS 39
 pub fn skeleton(s: &str) -> impl Iterator<Item = char> + '_ {
    use crate::tables::default_ignorable_code_point::default_ignorable_code_point;
    use unicode_normalization::UnicodeNormalization;
    s.chars().nfd().flat_map(char_prototype).nfd()

    s.chars()
        .nfd()
        .filter(|c| !default_ignorable_code_point(*c))
        .flat_map(char_prototype)
        .nfd()
 }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -17,6 +17,7 @@
		# - confusables.txt
		# - ReadMe.txt
		# This script also uses the following Unicode UCD data:
		# - DerivedCoreProperties.txt
		# - Scripts.txt
		#
		# Since this should not require frequent updates, we just store this
Expand DownExpand Up		@@ -53,6 +54,8 @@ def fetch(f):
		sys.stderr.write("cannot load %s\n" % f)
		exit(1)

		return f

		# Download a UCD table file
		def fetch_unidata(f):
		if not os.path.exists(os.path.basename(f)):
Expand All		@@ -63,14 +66,14 @@ def fetch_unidata(f):
		sys.stderr.write("cannot load %s" % f)
		exit(1)

		# Loads code point data from IdentifierStatus.txt and
		# IdentifierType.txt
		# Implementation from unicode-segmentation
		return f

		# Loads code point data from provided filename f
		# Implementation adapted from unicode-segmentation
		def load_properties(f, interestingprops = None):
		fetch(f)
		props = {}
		re1 = re.compile(r"^ ([0-9A-F]+) ; *(\w+)")
		re2 = re.compile(r"^ ([0-9A-F]+)\.\.([0-9A-F]+) ; *(\w+)")
		re1 = re.compile(r"^ ([0-9A-F]+) ; ([^#\s]+) #")
		re2 = re.compile(r"^ ([0-9A-F]+)\.\.([0-9A-F]+) ; ([^#\s]+) #")

		for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
		prop = None
Expand DownExpand Up		@@ -99,42 +102,6 @@ def load_properties(f, interestingprops = None):

		return props

		# Loads script data from Scripts.txt
		def load_script_properties(f, interestingprops):
		fetch_unidata(f)
		props = {}
		# Note: these regexes are different from those in unicode-segmentation,
		# becase we need to handle spaces here
		re1 = re.compile(r"^ ([0-9A-F]+) ; ([^#]+) #")
		re2 = re.compile(r"^ ([0-9A-F]+)\.\.([0-9A-F]+) ; ([^#]+) #")

		for line in fileinput.input(os.path.basename(f)):
		prop = None
		d_lo = 0
		d_hi = 0
		m = re1.match(line)
		if m:
		d_lo = m.group(1)
		d_hi = m.group(1)
		prop = m.group(2).strip()
		else:
		m = re2.match(line)
		if m:
		d_lo = m.group(1)
		d_hi = m.group(2)
		prop = m.group(3).strip()
		else:
		continue
		if interestingprops and prop not in interestingprops:
		continue
		d_lo = int(d_lo, 16)
		d_hi = int(d_hi, 16)
		if prop not in props:
		props[prop] = []
		props[prop].append((d_lo, d_hi))

		return props

		# Loads confusables data from confusables.txt
		def load_confusables(f):
		fetch(f)
Expand DownExpand Up		@@ -189,7 +156,7 @@ def load_scripts(f):
		# changes are introduced, update accordingly.

		(longforms, shortforms) = aliases()
		scripts =load_script_properties(f, [])
		scripts =load_properties(fetch_unidata(f), [])

		script_table = []
		script_list = []
Expand DownExpand Up		@@ -546,10 +513,10 @@ def emit_identifier_module(f):
		""")

		f.write(" // Identifier status table:\n")
		identifier_status_table = load_properties("IdentifierStatus.txt")
		identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
		emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
		pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
		identifier_type = load_properties("IdentifierType.txt")
		identifier_type = load_properties(fetch("IdentifierType.txt"))
		type_table = []
		for ty in identifier_type:
		type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]])
Expand All		@@ -560,6 +527,26 @@ def emit_identifier_module(f):
		pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
		f.write("}\n\n")

		def emit_default_ignorable_detection_module(f):
		f.write("pub mod default_ignorable_code_point {")
		f.write("""

		#[inline]
		pub fn default_ignorable_code_point(c: char) -> bool {
		match c as usize {
		_ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE)
		}
		}

		""")

		f.write(" // Default ignorable code point table:\n")
		default_ignorable_table = load_properties(fetch_unidata("DerivedCoreProperties.txt"), ["Default_Ignorable_Code_Point"])
		emit_table(f, "DEFAULT_IGNORABLE", default_ignorable_table["Default_Ignorable_Code_Point"], "&'static [(char, char)]", is_pub=False,
		pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))

		f.write("}\n\n")

		def emit_confusable_detection_module(f):
		f.write("pub mod confusable_detection {")
		f.write("""
Expand DownExpand Up		@@ -601,7 +588,7 @@ def emit_potiential_mixed_script_confusable(f):
		}
		}
		""")
		identifier_status_table = load_properties("IdentifierStatus.txt")
		identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
		_, scripts = load_scripts("Scripts.txt")
		identifier_allowed = identifier_status_table['Allowed']
		(mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
Expand DownExpand Up		@@ -688,6 +675,8 @@ def emit_util_mod(f):
		emit_util_mod(rf)
		### identifier module
		emit_identifier_module(rf)
		### default_ignorable_detection module
		emit_default_ignorable_detection_module(rf)
		### confusable_detection module
		emit_confusable_detection_module(rf)
		### mixed_script_confusable_detection module
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -34,6 +34,12 @@ fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {

		/// Calculate skeleton for string, as defined by UTS 39
		pub fn skeleton(s: &str) -> impl Iterator<Item = char> + '_ {
		use crate::tables::default_ignorable_code_point::default_ignorable_code_point;
		use unicode_normalization::UnicodeNormalization;
		s.chars().nfd().flat_map(char_prototype).nfd()

		s.chars()
		.nfd()
		.filter(\|c\| !default_ignorable_code_point(*c))
		.flat_map(char_prototype)
		.nfd()
		}