Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit3aa94a5

Browse files
authored
Merge pull request#43 from Jules-Bertholet/text-presentation
Support text presentation sequences
2 parents74c8394 +2e2d3bb commit3aa94a5

File tree

4 files changed

+320
-81
lines changed

4 files changed

+320
-81
lines changed

‎scripts/unicode.py‎

Lines changed: 169 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,9 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
123123
`Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
124124
withfetch_open("EastAsianWidth.txt")aseaw:
125125
# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
126-
single=re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
126+
single=re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
127127
# matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..."
128-
multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
128+
multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
129129
# map between width category code and condensed width
130130
width_codes= {
131131
**{c:EffectiveWidth.NARROWforcin ["N","Na","H"]},
@@ -189,10 +189,10 @@ def load_zero_widths() -> "list[bool]":
189189
# canonically equivalent sequences have the same width.
190190
withfetch_open("DerivedCoreProperties.txt")asproperties:
191191
single=re.compile(
192-
r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
192+
r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
193193
)
194194
multiple=re.compile(
195-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
195+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
196196
)
197197

198198
forlineinproperties.readlines():
@@ -225,8 +225,8 @@ def load_zero_widths() -> "list[bool]":
225225
#
226226
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
227227
withfetch_open("HangulSyllableType.txt")ascategories:
228-
single=re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
229-
multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
228+
single=re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+")
229+
multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+")
230230

231231
forlineincategories.readlines():
232232
raw_data=None# (low, high)
@@ -396,14 +396,14 @@ def make_tables(
396396
returntables
397397

398398

399-
defload_variation_sequences()->"list[int]":
399+
defload_emoji_presentation_sequences()->"list[int]":
400400
"""Outputs a list of character ranages, corresponding to all the valid characters for starting
401401
an emoji presentation sequence."""
402402

403403
withfetch_open("emoji/emoji-variation-sequences.txt")assequences:
404404
# Match all emoji presentation sequences
405405
# (one codepoint followed by U+FE0F, and labeled "emoji style")
406-
sequence=re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")
406+
sequence=re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style")
407407
codepoints= []
408408
forlineinsequences.readlines():
409409
ifmatch:=sequence.match(line):
@@ -412,55 +412,114 @@ def load_variation_sequences() -> "list[int]":
412412
returncodepoints
413413

414414

415-
defmake_variation_sequence_table(
415+
def load_text_presentation_sequences() -> "list[int]":
    """Return the sorted codepoints whose width changes with a text presentation sequence.

    A codepoint is included when all of the following hold:

    - `emoji/emoji-variation-sequences.txt` lists it as the base of a text
      presentation sequence (one codepoint followed by U+FE0E, labeled "text style");
    - `emoji/emoji-data.txt` gives it the `Emoji_Presentation` property;
    - it lies outside U+1F200..U+1F2FF (the "Enclosed Ideographic Supplement"
      block), whose characters stay wide even in text presentation.
    """
    # Match all text presentation sequences
    # (one codepoint followed by U+FE0E, and labeled "text style")
    text_style = re.compile(r"^([0-9A-F]+)\s+FE0E\s*;\s*text style")
    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        text_bases = {
            int(hit.group(1), 16)
            for hit in map(text_style.match, sequences.readlines())
            if hit
        }

    # `Emoji_Presentation` is assigned either to one codepoint or to a `low..high` range
    one = re.compile(r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+")
    span = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+")
    emoji_default = set()
    with fetch_open("emoji/emoji-data.txt") as emoji_data:
        for line in emoji_data.readlines():
            hit = one.match(line)
            if hit:
                first = last = int(hit.group(1), 16)
            else:
                hit = span.match(line)
                if hit is None:
                    continue
                first = int(hit.group(1), 16)
                last = int(hit.group(2), 16)
            emoji_default.update(range(first, last + 1))

    # "Enclosed Ideographic Supplement" block; wide even in text presentation
    always_wide = range(0x1F200, 0x1F300)
    return sorted(
        cp for cp in text_bases & emoji_default if cp not in always_wide
    )
458+
459+
460+
def make_presentation_sequence_table(
    seqs: "list[int]",
    width_map: "list[EffectiveWidth]",
    spurious_false: "set[EffectiveWidth]",
    spurious_true: "set[EffectiveWidth]",
) -> "tuple[list[tuple[int, int]], list[list[int]]]":
    """Generates 2-level lookup table for whether a codepoint might start a presentation sequence.

    The first level is a match on all but the 10 LSB, the second level is a
    1024-bit bitmap (stored as 128 bytes) for those 10 LSB.

    Args:
        seqs: Codepoints that start a presentation sequence.
        width_map: Width of every codepoint, indexed by codepoint.
        spurious_false: Widths for which the table is allowed to answer `false`
            even though the codepoint is in `seqs`. A 1024-codepoint prefix is
            dropped entirely when every one of its `seqs` members has such a width.
        spurious_true: Widths for which the table is allowed to answer `true`
            even though the codepoint is not in `seqs`. Every codepoint with such
            a width that falls inside a retained prefix gets its bit set.

    Returns:
        `(indexes, leaves)`. `indexes` holds one `(msb, leaf_index)` pair per
        retained prefix, in first-seen order; `leaves` holds the distinct
        128-byte bitmaps. Prefixes with identical bitmaps share one leaf.
    """

    prefixes_dict = defaultdict(set)
    for cp in seqs:
        prefixes_dict[cp >> 10].add(cp & 0x3FF)

    # Drop prefixes for which answering `false` is harmless for every member.
    # Iterate over a snapshot of the keys because entries are deleted in the loop.
    for k in list(prefixes_dict):
        if all(width_map[(k << 10) | cp] in spurious_false for cp in prefixes_dict[k]):
            del prefixes_dict[k]

    # Where answering `true` is harmless, set the bit as well. No key is ever
    # added here, so the dict itself serves as an O(1) membership test
    # (`in` does not trigger the defaultdict factory).
    for cp, width in enumerate(width_map):
        if width in spurious_true and (cp >> 10) in prefixes_dict:
            prefixes_dict[cp >> 10].add(cp & 0x3FF)

    indexes: "list[tuple[int, int]]" = []
    leaves: "list[list[int]]" = []
    # Maps bitmap contents to its position in `leaves`, so duplicate leaves
    # are emitted once and shared between prefixes.
    leaf_positions: "dict[tuple[int, ...], int]" = {}
    for msb, low_bits in prefixes_dict.items():
        leaf = [0] * 128
        for low in low_bits:
            idx_in_leaf, bit_shift = divmod(low, 8)
            leaf[idx_in_leaf] |= 1 << bit_shift

        key = tuple(leaf)
        position = leaf_positions.get(key)
        if position is None:
            position = len(leaves)
            leaf_positions[key] = position
            leaves.append(leaf)
        indexes.append((msb, position))

    return (indexes, leaves)
457515

458516

459517
defemit_module(
460518
out_name:str,
461519
unicode_version:"tuple[int, int, int]",
462520
tables:"list[Table]",
463-
variation_table:"tuple[list[int], list[list[int]]]",
521+
emoji_presentation_table:"tuple[list[tuple[int, int]], list[list[int]]]",
522+
text_presentation_table:"tuple[list[tuple[int, int]], list[list[int]]]",
464523
):
465524
"""Outputs a Rust module to `out_name` using table data from `tables`.
466525
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -537,7 +596,8 @@ def emit_module(
537596
"""
538597
)
539598

540-
variation_idx,variation_leaves=variation_table
599+
emoji_presentation_idx,emoji_presentation_leaves=emoji_presentation_table
600+
text_presentation_idx,text_presentation_leaves=text_presentation_table
541601

542602
module.write(
543603
"""
@@ -555,7 +615,7 @@ def emit_module(
555615
"""
556616
)
557617

558-
fori,msbsinenumerate(variation_idx):
618+
formsbs,iinemoji_presentation_idx:
559619
module.write(f"{msbs} =>{i},\n")
560620

561621
module.write(
@@ -571,6 +631,39 @@ def emit_module(
571631
"""
572632
)
573633

634+
module.write(
635+
"""
636+
/// Returns `true` iff `c` has default emoji presentation, but forms a [text presentation sequence]
637+
/// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence)
638+
/// when followed by `'\\u{FE0E}'`, and is not ideographic.
639+
/// Such sequences are considered to have width 1.
640+
///
641+
/// This may spuriously return `true` for characters of narrow or ambiguous width.
642+
#[inline]
643+
pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {
644+
let cp: u32 = c.into();
645+
// First level of lookup uses all but 10 LSB
646+
let top_bits = cp >> 10;
647+
let idx_of_leaf: usize = match top_bits {
648+
"""
649+
)
650+
651+
formsbs,iintext_presentation_idx:
652+
module.write(f"{msbs} =>{i},\n")
653+
654+
module.write(
655+
""" _ => return false,
656+
};
657+
// Extract the 3-9th (0-indexed) least significant bits of `cp`,
658+
// and use them to index into `leaf_row`.
659+
let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
660+
let leaf_byte = TEXT_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
661+
// Use the 3 LSB of `cp` to index into `leaf_byte`.
662+
((leaf_byte >> (cp & 7)) & 1) == 1
663+
}
664+
"""
665+
)
666+
574667
module.write(
575668
"""
576669
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
@@ -626,12 +719,32 @@ def emit_module(
626719
f"""
627720
#[repr(align(128))]
628721
struct Align128<T>(T);
629-
/// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
630-
/// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
631-
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{len(variation_leaves)}]> = Align128([
722+
/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
723+
/// to get whether it can start an emoji presentation sequence.
724+
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{len(emoji_presentation_leaves)}]> = Align128([
725+
"""
726+
)
727+
forleafinemoji_presentation_leaves:
728+
module.write(" [\n")
729+
forrowinbatched(leaf,14):
730+
module.write(" ")
731+
forentryinrow:
732+
module.write(f" 0x{entry:02X},")
733+
module.write("\n")
734+
module.write(" ],\n")
735+
736+
module.write(" ]);\n")
737+
738+
# text table
739+
740+
module.write(
741+
f"""
742+
/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
743+
/// to get whether it can start a text presentation sequence.
744+
static TEXT_PRESENTATION_LEAVES: Align128<[[u8; 128];{len(text_presentation_leaves)}]> = Align128([
632745
"""
633746
)
634-
forleafinvariation_leaves:
747+
forleafintext_presentation_leaves:
635748
module.write(" [\n")
636749
forrowinbatched(leaf,14):
637750
module.write(" ")
@@ -650,21 +763,7 @@ def main(module_path: str):
650763
lookup table for character width, and write a Rust module utilizing that table to
651764
`module_filename`.
652765
653-
We obey the following rules, in decreasing order of importance:
654-
655-
- Emoji presentation sequences are double-width.
656-
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
657-
- Hangul jamo medial vowels & final consonants are zero-width.
658-
- `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
659-
- Control characters are zero-width.
660-
- `Grapheme_Extend` chracters, as well as eight characters that NFD decompose to `Grapheme_Extend` chracters,
661-
are zero-width.
662-
- Codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
663-
- Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
664-
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
665-
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
666-
667-
These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
766+
See `lib.rs` for documentation of the exact width rules.
668767
"""
669768
version=load_unicode_version()
670769
print(f"Generating module for Unicode{version[0]}.{version[1]}.{version[2]}")
@@ -682,8 +781,18 @@ def main(module_path: str):
682781

683782
tables=make_tables(TABLE_CFGS,enumerate(width_map))
684783

685-
emoji_variations=load_variation_sequences()
686-
variation_table=make_variation_sequence_table(emoji_variations,width_map)
784+
emoji_presentations=load_emoji_presentation_sequences()
785+
emoji_presentation_table=make_presentation_sequence_table(
786+
emoji_presentations,width_map, {EffectiveWidth.WIDE}, {EffectiveWidth.WIDE}
787+
)
788+
789+
text_presentations=load_text_presentation_sequences()
790+
text_presentation_table=make_presentation_sequence_table(
791+
text_presentations,
792+
width_map,
793+
set(),
794+
{EffectiveWidth.NARROW,EffectiveWidth.AMBIGUOUS},
795+
)
687796

688797
# Download normalization test file for use by tests
689798
fetch_open("NormalizationTest.txt","../tests/")
@@ -694,16 +803,23 @@ def main(module_path: str):
694803
size_bytes=len(table.to_bytes())
695804
print(f"Table{i} size:{size_bytes} bytes")
696805
total_size+=size_bytes
697-
emoji_index_size=len(variation_table[0])*4
698-
print(f"Emoji presentation index size:{emoji_index_size} bytes")
699-
total_size+=emoji_index_size
700-
emoji_leaves_size=len(variation_table[1])*len(variation_table[1][0])
701-
print(f"Emoji presentation leaves size:{emoji_leaves_size} bytes")
702-
total_size+=emoji_leaves_size
806+
807+
fors,tablein [
808+
("Emoji",emoji_presentation_table),
809+
("Text",text_presentation_table),
810+
]:
811+
index_size=len(table[0])*4
812+
print(f"{s} presentation index size:{index_size} bytes")
813+
total_size+=index_size
814+
leaves_size=len(table[1])*len(table[1][0])
815+
print(f"{s} presentation leaves size:{leaves_size} bytes")
816+
total_size+=leaves_size
703817
print("------------------------")
704818
print(f" Total size:{total_size} bytes")
705819

706-
emit_module(module_path,version,tables,variation_table)
820+
emit_module(
821+
module_path,version,tables,emoji_presentation_table,text_presentation_table
822+
)
707823
print(f'Wrote to "{module_path}"')
708824

709825

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp