Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a2db56b

Browse files
Refactor `unicode.py`
- Align tables
- Use helper function to parse properties
1 parent da626ef, commit a2db56b

File tree

2 files changed

+96
-102
lines changed

2 files changed

+96
-102
lines changed

‎scripts/unicode.py‎

Lines changed: 81 additions & 91 deletions
Original file line number | Diff line number | Diff line change
@@ -24,12 +24,14 @@
2424

2525
importenum
2626
importmath
27+
importoperator
2728
importos
2829
importre
2930
importsys
3031
importurllib.request
3132
fromcollectionsimportdefaultdict
3233
fromitertoolsimportbatched
34+
fromtypingimportCallable
3335

3436
UNICODE_VERSION="15.1.0"
3537
"""The version of the Unicode data files to download."""
@@ -90,13 +92,32 @@ def fetch_open(filename: str, local_prefix: str = ""):
9092
sys.exit(1)
9193

9294

93-
defload_unicode_version()->"tuple[int, int, int]":
95+
defload_unicode_version()->tuple[int,int,int]:
9496
"""Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
9597
withfetch_open("ReadMe.txt")asreadme:
9698
pattern=r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
9799
returntuple(map(int,re.search(pattern,readme.read()).groups()))
98100

99101

102+
defload_property(filename:str,pattern:str,action:Callable[[int],None]):
103+
withfetch_open(filename)asproperties:
104+
single=re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+")
105+
multiple=re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+")
106+
107+
forlineinproperties.readlines():
108+
raw_data=None# (low, high)
109+
ifmatch:=single.match(line):
110+
raw_data= (match.group(1),match.group(1))
111+
elifmatch:=multiple.match(line):
112+
raw_data= (match.group(1),match.group(2))
113+
else:
114+
continue
115+
low=int(raw_data[0],16)
116+
high=int(raw_data[1],16)
117+
forcpinrange(low,high+1):
118+
action(cp)
119+
120+
100121
classEffectiveWidth(enum.IntEnum):
101122
"""Represents the width of a Unicode character. All East Asian Width classes resolve into
102123
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
@@ -112,15 +133,15 @@ class EffectiveWidth(enum.IntEnum):
112133
""" Two columns wide in a CJK context. One column wide in all other contexts. """
113134

114135

115-
defload_east_asian_widths()->"list[EffectiveWidth]":
136+
defload_east_asian_widths()->list[EffectiveWidth]:
116137
"""Return a list of effective widths, indexed by codepoint.
117138
Widths are determined by fetching and parsing `EastAsianWidth.txt`.
118139
119140
`Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.
120141
121142
`Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
122143
123-
`Ambiguous`chracters are assigned `EffectiveWidth.AMBIGUOUS`."""
144+
`Ambiguous`characters are assigned `EffectiveWidth.AMBIGUOUS`."""
124145
withfetch_open("EastAsianWidth.txt")aseaw:
125146
# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
126147
single=re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
@@ -161,7 +182,7 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
161182
returnwidth_map
162183

163184

164-
defload_zero_widths()->"list[bool]":
185+
defload_zero_widths()->list[bool]:
165186
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
166187
character. `c` is considered a zero-width character if
167188
@@ -180,26 +201,11 @@ def load_zero_widths() -> "list[bool]":
180201
# `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
181202
# as well as a few `Mc` characters that need to be included so that
182203
# canonically equivalent sequences have the same width.
183-
withfetch_open("DerivedCoreProperties.txt")asproperties:
184-
single=re.compile(
185-
r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
186-
)
187-
multiple=re.compile(
188-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
189-
)
190-
191-
forlineinproperties.readlines():
192-
raw_data=None# (low, high)
193-
ifmatch:=single.match(line):
194-
raw_data= (match.group(1),match.group(1))
195-
elifmatch:=multiple.match(line):
196-
raw_data= (match.group(1),match.group(2))
197-
else:
198-
continue
199-
low=int(raw_data[0],16)
200-
high=int(raw_data[1],16)
201-
forcpinrange(low,high+1):
202-
zw_map[cp]=True
204+
load_property(
205+
"DerivedCoreProperties.txt",
206+
r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)",
207+
lambdacp:operator.setitem(zw_map,cp,True),
208+
)
203209

204210
# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
205211
# as they canonically decompose to two characters with this property,
@@ -217,29 +223,11 @@ def load_zero_widths() -> "list[bool]":
217223
# and the resulting grapheme has width 2.
218224
#
219225
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
220-
withfetch_open("HangulSyllableType.txt")ascategories:
221-
single=re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+")
222-
multiple=re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+")
223-
224-
forlineincategories.readlines():
225-
raw_data=None# (low, high)
226-
ifmatch:=single.match(line):
227-
raw_data= (match.group(1),match.group(1))
228-
elifmatch:=multiple.match(line):
229-
raw_data= (match.group(1),match.group(2))
230-
else:
231-
continue
232-
low=int(raw_data[0],16)
233-
high=int(raw_data[1],16)
234-
forcpinrange(low,high+1):
235-
zw_map[cp]=True
236-
237-
# Special case: U+115F HANGUL CHOSEONG FILLER.
238-
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
239-
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
240-
# (which are considered 0-width on their own) to form a composed Hangul syllable with
241-
# width 2. Therefore, we treat it as having width 2.
242-
zw_map[0x115F]=False
226+
load_property(
227+
"HangulSyllableType.txt",
228+
r"(?:V|T)",
229+
lambdacp:operator.setitem(zw_map,cp,True),
230+
)
243231

244232
# Syriac abbreviation mark:
245233
# Zero-width `Prepended_Concatenation_Mark`
@@ -252,7 +240,14 @@ def load_zero_widths() -> "list[bool]":
252240
zw_map[0x0891]=True
253241
zw_map[0x08E2]=True
254242

255-
# U+A8FA DEVANAGARI CARET
243+
# HANGUL CHOSEONG FILLER
244+
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
245+
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
246+
# (which are considered 0-width on their own) to form a composed Hangul syllable with
247+
# width 2. Therefore, we treat it as having width 2.
248+
zw_map[0x115F]=False
249+
250+
# DEVANAGARI CARET
256251
# https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447
257252
zw_map[0xA8FA]=True
258253

@@ -287,13 +282,13 @@ def try_extend(self, attempt: "Bucket") -> bool:
287282
self.widths=more
288283
returnTrue
289284

290-
defentries(self)->"list[tuple[Codepoint, EffectiveWidth]]":
285+
defentries(self)->list[tuple[Codepoint,EffectiveWidth]]:
291286
"""Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
292287
result=list(self.entry_set)
293288
result.sort()
294289
returnresult
295290

296-
defwidth(self)->"EffectiveWidth | None":
291+
defwidth(self)->EffectiveWidth|None:
297292
"""If all codepoints in this bucket have the same width, return that width; otherwise,
298293
return `None`."""
299294
iflen(self.widths)==0:
@@ -305,7 +300,7 @@ def width(self) -> "EffectiveWidth | None":
305300
returnpotential_width
306301

307302

308-
defmake_buckets(entries,low_bit:BitPos,cap_bit:BitPos)->"list[Bucket]":
303+
defmake_buckets(entries,low_bit:BitPos,cap_bit:BitPos)->list[Bucket]:
309304
"""Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
310305
codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
311306
same bucket. Returns a list of the buckets in increasing order of those bits."""
@@ -373,7 +368,7 @@ def buckets(self):
373368
"""Returns an iterator over this table's buckets."""
374369
returnself.indexed
375370

376-
defto_bytes(self)->"list[int]":
371+
defto_bytes(self)->list[int]:
377372
"""Returns this table's entries as a list of bytes. The bytes are formatted according to
378373
the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
379374
to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
@@ -389,8 +384,8 @@ def to_bytes(self) -> "list[int]":
389384

390385

391386
defmake_tables(
392-
table_cfgs:"list[tuple[BitPos, BitPos, OffsetType]]",entries
393-
)->"list[Table]":
387+
table_cfgs:list[tuple[BitPos,BitPos,OffsetType]],entries
388+
)->list[Table]:
394389
"""Creates a table for each configuration in `table_cfgs`, with the first config corresponding
395390
to the top-level lookup table, the second config corresponding to the second-level lookup
396391
table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
@@ -404,7 +399,7 @@ def make_tables(
404399
returntables
405400

406401

407-
defload_emoji_presentation_sequences()->"list[int]":
402+
defload_emoji_presentation_sequences()->list[int]:
408403
"""Outputs a list of character ranges, corresponding to all the valid characters for starting
409404
an emoji presentation sequence."""
410405

@@ -420,7 +415,7 @@ def load_emoji_presentation_sequences() -> "list[int]":
420415
returncodepoints
421416

422417

423-
defload_text_presentation_sequences()->"list[int]":
418+
defload_text_presentation_sequences()->list[int]:
424419
"""Outputs a list of character ranges, corresponding to all the valid characters
425420
whose widths change with a text presentation sequence."""
426421

@@ -435,24 +430,12 @@ def load_text_presentation_sequences() -> "list[int]":
435430
text_presentation_seq_codepoints.add(cp)
436431

437432
default_emoji_codepoints=set()
438-
withfetch_open("emoji/emoji-data.txt")asemoji_data:
439-
single=re.compile(r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+")
440-
multiple=re.compile(
441-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+"
442-
)
443433

444-
forlineinemoji_data.readlines():
445-
raw_data=None# (low, high)
446-
ifmatch:=single.match(line):
447-
raw_data= (match.group(1),match.group(1))
448-
elifmatch:=multiple.match(line):
449-
raw_data= (match.group(1),match.group(2))
450-
else:
451-
continue
452-
low=int(raw_data[0],16)
453-
high=int(raw_data[1],16)
454-
forcpinrange(low,high+1):
455-
default_emoji_codepoints.add(cp)
434+
load_property(
435+
"emoji/emoji-data.txt",
436+
"Emoji_Presentation",
437+
lambdacp:default_emoji_codepoints.add(cp),
438+
)
456439

457440
codepoints= []
458441
forcpintext_presentation_seq_codepoints.intersection(default_emoji_codepoints):
@@ -466,11 +449,11 @@ def load_text_presentation_sequences() -> "list[int]":
466449

467450

468451
defmake_presentation_sequence_table(
469-
seqs:"list[int]",
470-
width_map:"list[EffectiveWidth]",
471-
spurious_false:"set[EffectiveWidth]",
472-
spurious_true:"set[EffectiveWidth]",
473-
)->"tuple[list[tuple[int, int]], list[list[int]]]":
452+
seqs:list[Codepoint],
453+
width_map:list[EffectiveWidth],
454+
spurious_false:set[EffectiveWidth],
455+
spurious_true:set[EffectiveWidth],
456+
)->tuple[list[tuple[int,int]],list[list[int]]]:
474457
"""Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.
475458
The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
476459
"""
@@ -488,13 +471,13 @@ def make_presentation_sequence_table(
488471
):
489472
delprefixes_dict[k]
490473

491-
msbs:"list[int]"=list(prefixes_dict.keys())
474+
msbs:list[int]=list(prefixes_dict.keys())
492475

493476
forcp,widthinenumerate(width_map):
494477
ifwidthinspurious_trueand (cp>>10)inmsbs:
495478
prefixes_dict[cp>>10].add(cp&0x3FF)
496479

497-
leaves:"list[list[int]]"= []
480+
leaves:list[list[int]]= []
498481
forcpsinprefixes_dict.values():
499482
leaf= [0]*128
500483
forcpincps:
@@ -524,10 +507,10 @@ def make_presentation_sequence_table(
524507

525508
defemit_module(
526509
out_name:str,
527-
unicode_version:"tuple[int, int, int]",
528-
tables:"list[Table]",
529-
emoji_presentation_table:"tuple[list[tuple[int, int]], list[list[int]]]",
530-
text_presentation_table:"tuple[list[tuple[int, int]], list[list[int]]]",
510+
unicode_version:tuple[int,int,int],
511+
tables:list[Table],
512+
emoji_presentation_table:tuple[list[tuple[int,int]],list[list[int]]],
513+
text_presentation_table:tuple[list[tuple[int,int]],list[list[int]]],
531514
):
532515
"""Outputs a Rust module to `out_name` using table data from `tables`.
533516
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -574,18 +557,18 @@ def emit_module(
574557
pub fn lookup_width(c: char, is_cjk: bool) -> usize {
575558
let cp = c as usize;
576559
577-
let t1_offset = TABLES_0[cp >> 13 & 0xFF];
560+
let t1_offset = TABLES_0.0[cp >> 13 & 0xFF];
578561
579562
// Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
580563
// so each sub-table is 128 bytes in size.
581564
// (Sub-tables are selected using the computed offset from the previous table.)
582-
let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
565+
let t2_offset = TABLES_1.0[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
583566
584567
// Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
585568
// This is accomplished by packing four stored entries into one byte.
586569
// So each sub-table is 2**(6-2) == 16 bytes in size.
587570
// Since this is the last table, each entry represents an encoded width.
588-
let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
571+
let packed_widths = TABLES_2.0[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
589572
590573
// Extract the packed width
591574
let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
@@ -669,6 +652,12 @@ def emit_module(
669652
// Use the 3 LSB of `cp` to index into `leaf_byte`.
670653
((leaf_byte >> (cp & 7)) & 1) == 1
671654
}
655+
656+
#[repr(align(128))]
657+
struct Align128<T>(T);
658+
659+
#[repr(align(16))]
660+
struct Align16<T>(T);
672661
"""
673662
)
674663

@@ -677,26 +666,27 @@ def emit_module(
677666
new_subtable_count=len(table.buckets())
678667
ifi==len(tables)-1:
679668
table.indices_to_widths()# for the last table, indices == widths
669+
align=16
670+
else:
671+
align=128
680672
byte_array=table.to_bytes()
681673
module.write(
682674
f"""
683675
/// Autogenerated.{subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
684-
static TABLES_{i}: [u8;{len(byte_array)}] = ["""
676+
static TABLES_{i}:Align{align}<[u8;{len(byte_array)}]> =Align{align}(["""
685677
)
686678
forj,byteinenumerate(byte_array):
687679
# Add line breaks for every 15th entry (chosen to match what rustfmt does)
688680
ifj%15==0:
689681
module.write("\n ")
690682
module.write(f" 0x{byte:02X},")
691-
module.write("\n ];\n")
683+
module.write("\n ]);\n")
692684
subtable_count=new_subtable_count
693685

694686
# emoji table
695687

696688
module.write(
697689
f"""
698-
#[repr(align(128))]
699-
struct Align128<T>(T);
700690
/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
701691
/// to get whether it can start an emoji presentation sequence.
702692
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128];{len(emoji_presentation_leaves)}]> = Align128([

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp