Jul 14, 2025
diff --git a/scripts/unicode.py b/scripts/unicode.py
        sys.stderr.write("cannot load %s" % f)
        exit(1)

 def load_gencats(f):
    """Build the general-category table from the UnicodeData-style file `f`.

    Calls `fetch(f)` first, then reads `f` line by line.  Only lines that
    split into exactly 15 `;`-separated fields are used; field 0 is parsed
    as a hexadecimal code point and field 2 is the general category.
    Code points for which `is_surrogate` is true are skipped.

    A record whose name (field 1) ends with ", First>" opens a range and is
    not stored itself: the fields of the next accepted record are assigned to
    every code point from the range start up to, but not including, that
    record's own code point, after which that record is handled as usual.

    Each stored code point is appended to the list for its general category,
    to "Assigned", and to every extra category that `expanded_categories`
    lists for that general category.  The resulting dict is passed through
    `group_cats` and that result is returned.
    """
    fetch(f)

    # Pass 1: map every code point (ranges expanded) to its split record.
    records = {}
    pending_first = -1  # start of an open "<..., First>" range, or -1 if none
    for raw in fileinput.input(f):
        fields = raw.split(';')
        if len(fields) != 15:
            continue
        codepoint = int(fields[0], 16)
        if is_surrogate(codepoint):
            continue
        if pending_first >= 0:
            # Close the open range: everything before this record's code
            # point shares this record's fields.
            records.update((filled, fields)
                           for filled in range(pending_first, codepoint))
            pending_first = -1
        if fields[1].endswith(", First>"):
            pending_first = codepoint
        else:
            records[codepoint] = fields

    # Pass 2: bucket code points by category name.
    by_category = {}
    for codepoint, fields in records.items():
        gencat = fields[2]
        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
            by_category.setdefault(cat, []).append(codepoint)

    return group_cats(by_category)

 def group_cats(cats):
    cats_out = {}
    for cat in cats:
        }).is_ok()
    }

    #[inline]
    fn is_alphabetic(c: char) -> bool {
        if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION {
            c.is_alphabetic()
        } else {
            match c {
                'a' ..= 'z' | 'A' ..= 'Z' => true,
                c if c > '\\x7f' => super::derived_property::Alphabetic(c),
                _ => false,
            }
        }
    }

    #[inline]
    fn is_numeric(c: char) -> bool {
        if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION {
            c.is_numeric()
        } else {
            match c {
                '0' ..= '9' => true,
                c if c > '\\x7f' => super::general_category::N(c),
                _ => false,
            }
        }
    }

    #[inline]
    pub fn is_alphanumeric(c: char) -> bool {
        is_alphabetic(c) || is_numeric(c)
    }
 }

 """)
 /// The version of [Unicode](http://www.unicode.org/)
 /// that this version of unicode-segmentation is based on.
 pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
 """ % UNICODE_VERSION)

        rf.write("""
 const UNICODE_VERSION_U8: (u8, u8, u8) = (%s, %s, %s);
 """ % UNICODE_VERSION)

        # download and parse all the data
        gencats = load_gencats("UnicodeData.txt")
        derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])
        derived = load_properties("DerivedCoreProperties.txt", [("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])

        emit_util_mod(rf)
        for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
                                  ("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]):
            emit_property_module(rf, name, cat, pfuns)
        emit_property_module(rf, "derived_property", derived, [("InCB", "Extend")])

        rf.write("""pub fn is_incb_linker(c: char) -> bool {
    matches!(c,""")
diff --git a/src/sentence.rs b/src/sentence.rs
 #[inline]
 pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
    use super::UnicodeSegmentation;
    use crate::tables::util::is_alphanumeric;

    fn has_alphanumeric(s: &&str) -> bool {
        s.chars().any(is_alphanumeric)
        s.chars().any(|c| c.is_alphanumeric())
    }
    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
Original file line number	Diff line number	Diff line change
Expand Up		@@ -74,42 +74,6 @@ def fetch(f):
		sys.stderr.write("cannot load %s" % f)
		exit(1)

		def load_gencats(f):
		    """Build the general-category table from the UnicodeData-style file `f`.

		    Calls `fetch(f)` first, then reads `f` line by line.  Only lines that
		    split into exactly 15 `;`-separated fields are used; field 0 is parsed
		    as a hexadecimal code point and field 2 is the general category.
		    Code points for which `is_surrogate` is true are skipped.

		    A record whose name (field 1) ends with ", First>" opens a range and is
		    not stored itself: the fields of the next accepted record are assigned
		    to every code point from the range start up to, but not including, that
		    record's own code point, after which that record is handled as usual.

		    Each stored code point is appended to the list for its general category,
		    to "Assigned", and to every extra category that `expanded_categories`
		    lists for that general category.  The resulting dict is passed through
		    `group_cats` and that result is returned.
		    """
		    fetch(f)

		    # Pass 1: map every code point (ranges expanded) to its split record.
		    records = {}
		    pending_first = -1  # start of an open "<..., First>" range, or -1 if none
		    for raw in fileinput.input(f):
		        fields = raw.split(';')
		        if len(fields) != 15:
		            continue
		        codepoint = int(fields[0], 16)
		        if is_surrogate(codepoint):
		            continue
		        if pending_first >= 0:
		            # Close the open range: everything before this record's code
		            # point shares this record's fields.
		            records.update((filled, fields)
		                           for filled in range(pending_first, codepoint))
		            pending_first = -1
		        if fields[1].endswith(", First>"):
		            pending_first = codepoint
		        else:
		            records[codepoint] = fields

		    # Pass 2: bucket code points by category name.
		    by_category = {}
		    for codepoint, fields in records.items():
		        gencat = fields[2]
		        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
		            by_category.setdefault(cat, []).append(codepoint)

		    return group_cats(by_category)

		def group_cats(cats):
		cats_out = {}
		for cat in cats:
Expand DownExpand Up		@@ -230,36 +194,6 @@ def emit_util_mod(f):
		}).is_ok()
		}

		#[inline]
		fn is_alphabetic(c: char) -> bool {
		if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION {
		c.is_alphabetic()
		} else {
		match c {
		'a' ..= 'z' \| 'A' ..= 'Z' => true,
		c if c > '\\x7f' => super::derived_property::Alphabetic(c),
		_ => false,
		}
		}
		}

		#[inline]
		fn is_numeric(c: char) -> bool {
		if super::UNICODE_VERSION_U8 == char::UNICODE_VERSION {
		c.is_numeric()
		} else {
		match c {
		'0' ..= '9' => true,
		c if c > '\\x7f' => super::general_category::N(c),
		_ => false,
		}
		}
		}

		#[inline]
		pub fn is_alphanumeric(c: char) -> bool {
		is_alphabetic(c) \|\| is_numeric(c)
		}
		}

		""")
Expand DownExpand Up		@@ -396,20 +330,13 @@ def emit_break_module(f, break_table, break_cats, name):
		/// The version of [Unicode](http://www.unicode.org/)
		/// that this version of unicode-segmentation is based on.
		pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
		""" % UNICODE_VERSION)

		rf.write("""
		const UNICODE_VERSION_U8: (u8, u8, u8) = (%s, %s, %s);
		""" % UNICODE_VERSION)

		# download and parse all the data
		gencats = load_gencats("UnicodeData.txt")
		derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])
		derived = load_properties("DerivedCoreProperties.txt", [("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])

		emit_util_mod(rf)
		for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
		("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]):
		emit_property_module(rf, name, cat, pfuns)
		emit_property_module(rf, "derived_property", derived, [("InCB", "Extend")])

		rf.write("""pub fn is_incb_linker(c: char) -> bool {
		matches!(c,""")
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -345,10 +345,9 @@ pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> {
		#[inline]
		pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
		use super::UnicodeSegmentation;
		use crate::tables::util::is_alphanumeric;

		fn has_alphanumeric(s: &&str) -> bool {
		s.chars().any(is_alphanumeric)
		s.chars().any(\|c\| c.is_alphanumeric())
		}
		let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer

Expand Down