Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 91a8e06

Browse files
committed
Move to using bit sets for ScriptExtension
1 parent 1057462, commit 91a8e06

File tree

5 files changed

+1954
-2433
lines changed

5 files changed

+1954
-2433
lines changed

‎.github/workflows/tests.yml‎

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,18 +4,24 @@ on: [push]
44

55
jobs:
66
build:
7-
87
runs-on:ubuntu-latest
9-
8+
strategy:
9+
matrix:
10+
rust:
11+
-beta
12+
-nightly
1013
steps:
1114
-uses:actions/checkout@v1
1215
-uses:actions-rs/toolchain@v1
1316
with:
1417
profile:minimal
15-
toolchain:beta
18+
toolchain:${{ matrix.rust }}
1619
override:true
1720
components:rustfmt
1821
-name:Build
1922
run:cargo build --verbose
2023
-name:Run tests
2124
run:cargo test
25+
-name:Run benchmarks
26+
run:cargo bench --features bench
27+
if:startsWith(matrix.rust, 'nightly')

‎Cargo.toml‎

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[package]
22
name ="unicode-script"
3-
version ="0.4.0"
3+
version ="0.5.0"
44
authors = ["Manish Goregaokar <manishsmail@gmail.com>"]
55
edition ="2018"
66

@@ -20,9 +20,8 @@ exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt" ]
2020
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2121

2222
[features]
23-
with_std = []
24-
default_features = ["with_std"]
2523
rustc-dep-of-std = ['std','core','compiler_builtins']
24+
bench = []
2625

2726
[dependencies]
2827
std = {version ="1.0",package ="rustc-std-workspace-std",optional =true }

‎scripts/unicode.py‎

Lines changed: 58 additions & 120 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,8 @@
3535
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
3636
3737
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
38+
39+
use super::ScriptExtension;
3840
'''
3941

4042
UNICODE_VERSION= (12,0,0)
@@ -183,44 +185,69 @@ def emit_search(f):
183185
}
184186
""")
185187

186-
defemit_enums(f,script_list,extension_list,longforms,intersections):
188+
defemit_enums(f,script_list,extension_list,longforms):
187189
"""
188190
Emit the Script and ScriptExtension enums as well as any related utility functions
189191
"""
192+
190193
f.write("""
191194
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
192195
#[non_exhaustive]
193196
#[allow(non_camel_case_types)]
197+
#[repr(u8)]
194198
/// A value of the `Script` property
195199
pub enum Script {
196200
/// Unknown script
197-
Unknown,
201+
Unknown = 0xFF,
202+
/// Zyyy
203+
Common = 0xFE,
204+
/// Zinh,
205+
Inherited = 0xFD,
198206
""")
199-
forscriptinscript_list:
200-
f.write(" /// %s\n %s,\n"% (script,longforms[script]))
201-
f.write("""}
202-
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
203-
#[non_exhaustive]
204-
/// A value for the `Script_Extension` property
205-
///
206-
/// [`ScriptExtension`] is one or more [`Script`]
207-
///
208-
/// This is essentially an optimized version of `Vec<Script>`,
209-
/// optimized by script sets and intersections actually present in Unicode.
210-
pub enum ScriptExtension {
211-
/// A single script
212-
Single(Script),
207+
for (i,script)inenumerate(script_list):
208+
f.write(" /// %s\n %s = %s,\n"% (script,longforms[script],i))
209+
f.write("}\n")
210+
f.write("pub const NEXT_SCRIPT: u8 = %s;"%len(script_list))
211+
f.write("""
212+
213+
pub mod script_extensions {
214+
use crate::ScriptExtension;
215+
pub const COMMON: ScriptExtension = ScriptExtension::new_common();
216+
pub const INHERITED: ScriptExtension = ScriptExtension::new_inherited();
217+
pub const UNKNOWN: ScriptExtension = ScriptExtension::new_unknown();
213218
""")
219+
for (i,script)inenumerate(script_list):
220+
first=0
221+
second=0
222+
third=0
223+
# need to replace L because `hex()` will spit out an L suffix for larger numbers
224+
ifi<64:
225+
first=hex(1<<i).replace("L","")
226+
elifi<128:
227+
second=hex(1<< (i-64)).replace("L","")
228+
else:
229+
third=hex(1<< (i-128)).replace("L","")
230+
f.write(" /// %s\n pub const %s: ScriptExtension = ScriptExtension::new(%s, %s, %s);\n"%
231+
(longforms[script],longforms[script].upper(),first,second,third))
232+
ifscript!=longforms[script]:
233+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n"%
234+
(longforms[script],script.upper(),longforms[script].upper()))
214235
forextinextension_list:
215236
longform=", ".join([longforms[s]forsinext])
216-
f.write(" /// %s\n %s,\n"% (longform,"".join(ext)))
237+
name="_".join([s.upper()forsinext])
238+
expr=ext[0].upper()
239+
foreinext[1:]:
240+
expr="%s.union(%s)"% (expr,e.upper())
241+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n"% (longform,name,expr))
217242
f.write("""}
218243
219244
impl Script {
220245
#[inline]
221246
pub(crate) fn inner_full_name(self) -> &'static str {
222247
match self {
223248
Script::Unknown => "Unknown",
249+
Script::Common => "Common",
250+
Script::Inherited => "Inherited",
224251
""")
225252
forscriptinscript_list:
226253
f.write(" Script::%s =>\"%s\",\n"% (longforms[script],longforms[script]))
@@ -231,119 +258,29 @@ def emit_enums(f, script_list, extension_list, longforms, intersections):
231258
pub(crate) fn inner_short_name(self) -> &'static str {
232259
match self {
233260
Script::Unknown => "",
261+
Script::Common => "Zyyy",
262+
Script::Inherited => "Zinh",
234263
""")
235264
forscriptinscript_list:
236265
f.write(" Script::%s =>\"%s\",\n"% (longforms[script],script))
237266
f.write(""" }
238267
}
239-
}
240-
241-
impl ScriptExtension {
242-
#[inline]
243-
#[cfg(feature = "with_std")]
244-
pub(crate) fn inner_scripts(self) -> Vec<Script> {
245-
match self {
246-
ScriptExtension::Single(s) => vec![s],
247-
""")
248-
forextinextension_list:
249-
scripts=", ".join(["Script::%s"%longforms[s]forsinext])
250-
f.write(" %s => vec![%s],\n"% (extension_name(ext),scripts))
251-
f.write(""" _ => unreachable!()
252-
}
253-
}
254-
255-
#[inline]
256-
pub(crate) fn inner_contains_script(self, other: Script) -> bool {
257-
match self {
258-
ScriptExtension::Single(s) => s == other,
259-
""")
260-
forextinextension_list:
261-
scripts=" || ".join(["other == Script::%s"%longforms[s]forsinext])
262-
f.write(" %s => %s,\n"% (extension_name(ext),scripts))
263-
f.write(""" }
264-
}
265268
266269
#[inline]
267-
pub(crate) fn inner_intersect(self, other: Self) -> Self {
268-
match (self, other) {
269-
(ScriptExtension::Single(Script::Unknown), _) |
270-
(_, ScriptExtension::Single(Script::Unknown)) => ScriptExtension::Single(Script::Unknown),
271-
(a, b) if a == b => a,
272-
(ScriptExtension::Single(Script::Common), a) |
273-
(ScriptExtension::Single(Script::Inherited), a) |
274-
(a, ScriptExtension::Single(Script::Common)) |
275-
(a, ScriptExtension::Single(Script::Inherited)) => a,
276-
(ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) if o.inner_contains_script(s) => ScriptExtension::Single(s),
270+
pub(crate) fn for_integer(value: u8) -> Self {
271+
match value {
277272
""")
278-
for (e1,e2,i)inintersections:
279-
f.write("(%s, %s)=> %s,\n"% (extension_name(e1),extension_name(e2),extension_name(i,longforms)))
280-
f.write(""" _ =>ScriptExtension::Single(Script::Unknown),
273+
for (i,script)inenumerate(script_list):
274+
f.write("%s=>Script::%s,\n"% (i,longforms[script]))
275+
f.write(""" _ =>unreachable!(),
281276
}
282277
}
283278
}
284279
""")
285280

286-
287-
defcompute_intersections_elements(extension_list):
288-
"""
289-
Compute all intersections between the script extensions.
290-
This will add new elements to extension_list, be sure to call it first!
291-
"""
292-
293-
# This is the only third-level intersection
294-
# It's easier to hardcode things here rather than
295-
# do the below calculation in a loop
296-
extension_list.append(['Deva','Knda','Tirh'])
297-
intersections= []
298-
# Some intersections will not exist in extension_list and we'll need to add them
299-
new_elements= []
300-
sets= [(e,set(e))foreinextension_list]
301-
for (e1,s1)insets:
302-
for (e2,s2)insets:
303-
ife1==e2:
304-
continue
305-
intersection=s1.intersection(s2)
306-
iflen(intersection)>0:
307-
intersection= [iforiinintersection]
308-
intersection.sort()
309-
iflen(intersection)>1andintersectionnotinextension_listandintersectionnotinnew_elements:
310-
new_elements.append(intersection)
311-
if (e1,e2,intersection)notinintersections:
312-
intersections.append((e1,e2,intersection))
313-
extension_list.extend(new_elements)
314-
315-
# We now go through the newly added second-level extension values and calculate their intersections
316-
# with the original set and each other
317-
new_sets= [(e,set(e))foreinnew_elements]
318-
sets= [(e,set(e))foreinextension_list]
319-
for (e1,s1)innew_sets:
320-
for (e2,s2)insets:
321-
ife1==e2:
322-
continue
323-
intersection=s1.intersection(s2)
324-
iflen(intersection)>0:
325-
intersection= [iforiinintersection]
326-
intersection.sort()
327-
iflen(intersection)>1andintersectionnotinextension_list:
328-
raise"Found new third-level intersection, please hardcode it"
329-
# The previous routine would automatically get both versions
330-
# of an intersection because it would iterate each pair in both orders,
331-
# but here we're working on an asymmetric pair, so we insert both in order to not
332-
# miss anything
333-
if (e1,e2,intersection)notinintersections:
334-
intersections.append((e1,e2,intersection))
335-
if (e2,e1,intersection)notinintersections:
336-
intersections.append((e2,e1,intersection))
337-
338-
intersections.sort()
339-
returnintersections
340-
341-
defextension_name(ext,longforms={}):
281+
defextension_name(ext):
342282
"""Get the rust source for a given ScriptExtension"""
343-
iflen(ext)==1:
344-
return"ScriptExtension::Single(Script::%s)"%longforms[ext[0]]
345-
else:
346-
return"ScriptExtension::%s"%"".join(ext)
283+
return"script_extensions::%s"%"_".join([e.upper()foreinext])
347284

348285

349286

@@ -370,8 +307,10 @@ def extension_name(ext, longforms={}):
370307
script_list= []
371308

372309
forscriptinscripts:
373-
script_list.append(shortforms[script])
310+
ifscriptnotin ["Common","Unknown","Inherited"]:
311+
script_list.append(shortforms[script])
374312
script_table.extend([(x,y,shortforms[script])for (x,y)inscripts[script]])
313+
script_list.sort()
375314
script_table.sort(key=lambdaw:w[0])
376315

377316

@@ -389,14 +328,13 @@ def extension_name(ext, longforms={}):
389328
extension_table.extend([(x,y,output_ext)for (x,y)inextensions[ext]])
390329
extension_table.sort(key=lambdaw:w[0])
391330

392-
intersections=compute_intersections_elements(extension_list)
393331

394-
emit_enums(rf,script_list,extension_list,longforms,intersections)
332+
emit_enums(rf,script_list,extension_list,longforms)
395333
emit_search(rf)
396334

397335
emit_table(rf,"SCRIPTS",script_table,t_type="&'static [(char, char, Script)]",
398336
is_pub=False ,pfun=lambdax:"(%s,%s, Script::%s)"% (escape_char(x[0]),escape_char(x[1]),longforms[x[2]]))
399337
emit_table(rf,"SCRIPT_EXTENSIONS",extension_table,t_type="&'static [(char, char, ScriptExtension)]",
400-
is_pub=False ,pfun=lambdax:"(%s,%s,%s)"% (escape_char(x[0]),escape_char(x[1]),extension_name(x[2],longforms)))
338+
is_pub=False ,pfun=lambdax:"(%s,%s,%s)"% (escape_char(x[0]),escape_char(x[1]),extension_name(x[2])))
401339

402340
# emit_table(rf, "FOObar", properties)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp