@@ -123,9 +123,9 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
     `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`."""
     with fetch_open("EastAsianWidth.txt") as eaw:
         # matches a width assignment for a single codepoint, i.e. "1F336;N  # ..."
-        single = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
+        single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
         # matches a width assignment for a range of codepoints, i.e. "3001..3003;W  # ..."
-        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
+        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
         # map between width category code and condensed width
         width_codes = {
             **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]},

@@ -189,10 +189,10 @@ def load_zero_widths() -> "list[bool]":
     # canonically equivalent sequences have the same width.
     with fetch_open("DerivedCoreProperties.txt") as properties:
         single = re.compile(
-            r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
+            r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
         )
         multiple = re.compile(
-            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
+            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
         )

         for line in properties.readlines():

@@ -225,8 +225,8 @@ def load_zero_widths() -> "list[bool]":
     #
     # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
     with fetch_open("HangulSyllableType.txt") as categories:
-        single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
-        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
+        single = re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+")
+        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+")

         for line in categories.readlines():
             raw_data = None  # (low, high)

@@ -396,14 +396,14 @@ def make_tables(
     return tables


-def load_variation_sequences() -> "list[int]":
+def load_emoji_presentation_sequences() -> "list[int]":
     """Outputs a list of character ranages, corresponding to all the valid characters
     for starting an emoji presentation sequence."""
     with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
         # Match all emoji presentation sequences
         # (one codepoint followed by U+FE0F, and labeled "emoji style")
-        sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")
+        sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style")
         codepoints = []
         for line in sequences.readlines():
             if match := sequence.match(line):
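The point of the `\s+` to `\s*` relaxation in the hunks above is that the parsers now accept the `;` field separator with or without surrounding whitespace. A minimal standalone sketch, not part of the script; the sample lines are invented for illustration rather than quoted from a specific UCD file:

import re

# Strict pattern (before) requires whitespace on both sides of ";";
# relaxed pattern (after) accepts either layout.
single_strict = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
single_relaxed = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")

samples = [
    "1F336 ; N # So HOT PEPPER",  # whitespace around the semicolon
    "1F336;N   # So HOT PEPPER",  # no whitespace around the semicolon
]

for line in samples:
    print(bool(single_strict.match(line)), bool(single_relaxed.match(line)))
# The strict pattern matches only the first sample; the relaxed one matches both.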
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+" ) for line in emoji_data.readlines(): raw_data = None # (low, high) if match := single.match(line): raw_data = (match.group(1), match.group(1)) elif match := multiple.match(line): raw_data = (match.group(1), match.group(2)) else: continue low = int(raw_data[0], 16) high = int(raw_data[1], 16) for cp in range(low, high + 1): default_emoji_codepoints.add(cp) codepoints = [] for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints): # "Enclosed Ideographic Supplement" block; # wide even in text presentation if not cp in range(0x1F200, 0x1F300): codepoints.append(cp) codepoints.sort() return codepoints def make_presentation_sequence_table( seqs: "list[int]", width_map: "list[EffectiveWidth]", ) -> "tuple[list[int], list[list[int]]]": """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence. (Characters that are always wide may be excluded.) spurious_false: "set[EffectiveWidth]", spurious_true: "set[EffectiveWidth]", ) -> "tuple[list[tuple[int, int]], list[list[int]]]": """Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence. The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB. """ prefixes_dict = defaultdict(set) for cp in seqs: prefixes_dict[cp >> 10].add(cp & 0x3FF) # We don't strictly need to keep track of characters that are always wide, # because being in an emoji variation seq won't affect their width. # So store their info only when it wouldn't inflate the size of the tables. for k in list(prefixes_dict.keys()): if all( map( lambda cp: width_map[(k << 10) | cp]== EffectiveWidth.WIDE , lambda cp: width_map[(k << 10) | cp]in spurious_false , prefixes_dict[k], ) ): del prefixes_dict[k] indexes = list(prefixes_dict.keys())msbs: "list[int]" = list(prefixes_dict.keys()) # Similarly, we can spuriously return `true` for always-wide characters # even if not part of a presentation seq; this saves an additional lookup, # so we should do it where there is no size cost. for cp, width in enumerate(width_map): if width== EffectiveWidth.WIDE and (cp >> 10) inindexes : if widthin spurious_true and (cp >> 10) inmsbs : prefixes_dict[cp >> 10].add(cp & 0x3FF) leaves = [] leaves: "list[list[int]]" = [] for cps in prefixes_dict.values(): leaf = [0] * 128 for cp in cps: idx_in_leaf, bit_shift = divmod(cp, 8) leaf[idx_in_leaf] |= 1 << bit_shift leaves.append(leaf) indexes = [(msb, index) for (index, msb) in enumerate(msbs)] # Cull duplicate leaves i = 0 while i < len(leaves): first_idx = leaves.index(leaves[i]) if first_idx == i: i += 1 else: for j in range(0, len(indexes)): if indexes[j][1] == i: indexes[j] = (indexes[j][0], first_idx) elif indexes[j][1] > i: indexes[j] = (indexes[j][0], indexes[j][1] - 1) leaves.pop(i) return (indexes, leaves) def emit_module( out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]", variation_table: "tuple[list[int], list[list[int]]]", emoji_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]", text_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. 
@@ -537,7 +596,8 @@ def emit_module(
         """
     )

-    variation_idx, variation_leaves = variation_table
+    emoji_presentation_idx, emoji_presentation_leaves = emoji_presentation_table
+    text_presentation_idx, text_presentation_leaves = text_presentation_table

     module.write(
         """

@@ -555,7 +615,7 @@ def emit_module(
         """
     )

-    for i, msbs in enumerate(variation_idx):
+    for msbs, i in emoji_presentation_idx:
         module.write(f"            {msbs} => {i},\n")

     module.write(

@@ -571,6 +631,39 @@ def emit_module(
         """
     )

+    module.write(
+        """
+    /// Returns `true` iff `c` has default emoji presentation, but forms a [text presentation sequence]
+    /// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence)
+    /// when followed by `'\\u{FE0E}'`, and is not ideographic.
+    /// Such sequences are considered to have width 1.
+    ///
+    /// This may spuriously return `true` for characters of narrow or ambiguous width.
+    #[inline]
+    pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {
+        let cp: u32 = c.into();
+        // First level of lookup uses all but 10 LSB
+        let top_bits = cp >> 10;
+        let idx_of_leaf: usize = match top_bits {
+"""
+    )
+
+    for msbs, i in text_presentation_idx:
+        module.write(f"            {msbs} => {i},\n")
+
+    module.write(
+        """            _ => return false,
+        };
+        // Extract the 3-9th (0-indexed) least significant bits of `cp`,
+        // and use them to index into `leaf_row`.
+        let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
+        let leaf_byte = TEXT_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
+        // Use the 3 LSB of `cp` to index into `leaf_byte`.
+        ((leaf_byte >> (cp & 7)) & 1) == 1
+    }
+"""
+    )
+
     module.write(
         """
     /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or

@@ -626,12 +719,32 @@ def emit_module(
         f"""
    #[repr(align(128))]
    struct Align128<T>(T);
-    /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
-    /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
-    static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([
+    /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
+    /// to get whether it can start an emoji presentation sequence.
+    static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([
 """
     )
+    for leaf in emoji_presentation_leaves:
+        module.write("        [\n")
+        for row in batched(leaf, 14):
+            module.write("           ")
+            for entry in row:
+                module.write(f" 0x{entry:02X},")
+            module.write("\n")
+        module.write("        ],\n")
+
+    module.write("    ]);\n")
+
+    # text table
+    module.write(
+        f"""
+    /// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
+    /// to get whether it can start a text presentation sequence.
+    static TEXT_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(text_presentation_leaves)}]> = Align128([
+"""
+    )
-    for leaf in variation_leaves:
+    for leaf in text_presentation_leaves:
         module.write("        [\n")
         for row in batched(leaf, 14):
             module.write("           ")
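The generated Rust above indexes a leaf with `(cp >> 3) & 0x7F` for the byte and `cp & 7` for the bit, while the Python side builds leaves with `divmod(cp, 8)` over the 10 LSB. A quick standalone check, not part of the script, that the two decompositions agree for every codepoint:

# Standalone sanity check (illustration only): the decode-side bit/byte split
# used in the generated Rust matches the encode-side split used above.
for cp in range(0x110000):
    byte_idx, bit = divmod(cp & 0x3FF, 8)  # encode side (Python)
    assert byte_idx == (cp >> 3) & 0x7F    # decode side (Rust): byte within leaf
    assert bit == cp & 7                   # decode side (Rust): bit within byte
print("bit/byte indexing agrees for all codepoints")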
@@ -650,21 +763,7 @@ def main(module_path: str):
     lookup table for character width, and write a Rust module utilizing that table to
     `module_filename`.

-    We obey the following rules, in decreasing order of importance:
-
-    - Emoji presentation sequences are double-width.
-    - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
-    - Hangul jamo medial vowels & final consonants are zero-width.
-    - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
-    - Control characters are zero-width.
-    - `Grapheme_Extend` chracters, as well as eight characters that NFD decompose to
-      `Grapheme_Extend` chracters, are zero-width.
-    - Codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
-    - Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
-    - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
-      of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
-
-    These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
+    See `lib.rs` for documentation of the exact width rules.
     """
     version = load_unicode_version()
     print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")

@@ -682,8 +781,18 @@ def main(module_path: str):
     tables = make_tables(TABLE_CFGS, enumerate(width_map))

-    emoji_variations = load_variation_sequences()
-    variation_table = make_variation_sequence_table(emoji_variations, width_map)
+    emoji_presentations = load_emoji_presentation_sequences()
+    emoji_presentation_table = make_presentation_sequence_table(
+        emoji_presentations, width_map, {EffectiveWidth.WIDE}, {EffectiveWidth.WIDE}
+    )
+
+    text_presentations = load_text_presentation_sequences()
+    text_presentation_table = make_presentation_sequence_table(
+        text_presentations,
+        width_map,
+        set(),
+        {EffectiveWidth.NARROW, EffectiveWidth.AMBIGUOUS},
+    )

     # Download normalization test file for use by tests
     fetch_open("NormalizationTest.txt", "../tests/")

@@ -694,16 +803,23 @@ def main(module_path: str):
         size_bytes = len(table.to_bytes())
         print(f"Table {i} size: {size_bytes} bytes")
         total_size += size_bytes
-    emoji_index_size = len(variation_table[0]) * 4
-    print(f"Emoji presentation index size: {emoji_index_size} bytes")
-    total_size += emoji_index_size
-    emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0])
-    print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes")
-    total_size += emoji_leaves_size
+    for s, table in [
+        ("Emoji", emoji_presentation_table),
+        ("Text", text_presentation_table),
+    ]:
+        index_size = len(table[0]) * 4
+        print(f"{s} presentation index size: {index_size} bytes")
+        total_size += index_size
+        leaves_size = len(table[1]) * len(table[1][0])
+        print(f"{s} presentation leaves size: {leaves_size} bytes")
+        total_size += leaves_size
     print("------------------------")
     print(f"  Total size: {total_size} bytes")
-    emit_module(module_path, version, tables, variation_table)
+    emit_module(
+        module_path, version, tables, emoji_presentation_table, text_presentation_table
+    )
     print(f'Wrote to "{module_path}"')
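The `spurious_false` / `spurious_true` arguments in the two calls above define the contract of each table: a codepoint in the sequence list may only be reported `false` if its width is in `spurious_false`, and a codepoint outside the list may only be reported `true` if its width is in `spurious_true`. A standalone sketch of a consistency check along those lines; `check_table` is an invented helper, and the commented-out calls assume the local variables from `main` above:

def check_table(table, seqs, width_map, spurious_false, spurious_true):
    """Hypothetical verification helper (illustration only)."""
    indexes, leaves = table
    leaf_index_of = dict(indexes)  # msb -> leaf index
    seqs = set(seqs)
    for cp, width in enumerate(width_map):
        leaf_idx = leaf_index_of.get(cp >> 10)
        if leaf_idx is None:
            reported = False
        else:
            byte_idx, bit = divmod(cp & 0x3FF, 8)
            reported = bool((leaves[leaf_idx][byte_idx] >> bit) & 1)
        if cp in seqs and not reported:
            assert width in spurious_false, hex(cp)
        if cp not in seqs and reported:
            assert width in spurious_true, hex(cp)

# Example invocations, mirroring the arguments used in `main`:
# check_table(emoji_presentation_table, emoji_presentations, width_map,
#             {EffectiveWidth.WIDE}, {EffectiveWidth.WIDE})
# check_table(text_presentation_table, text_presentations, width_map,
#             set(), {EffectiveWidth.NARROW, EffectiveWidth.AMBIGUOUS})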