Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 592ce00

Browse files
authored
Merge pull request #134 from Jules-Bertholet/fix
Fix #125
2 parents 3ff9de6 + dce3a34 commit 592ce00

File tree

11 files changed

+1271
-2069
lines changed

11 files changed

+1271
-2069
lines changed

‎.github/workflows/rust.yml

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -7,28 +7,31 @@ on:
77
branches:[ master ]
88

99
env:
10+
CARGO_INCREMENTAL:0
1011
CARGO_TERM_COLOR:always
12+
RUST_BACKTRACE:1
13+
RUSTFLAGS:-D warnings
14+
RUSTDOCFLAGS:-D warnings
1115

1216
jobs:
1317
build:
14-
1518
runs-on:ubuntu-latest
16-
1719
steps:
1820
-uses:actions/checkout@v2
1921
-name:Build
2022
run:cargo build --verbose
2123
-name:Run tests
2224
run:cargo test --verbose
23-
fmt:
25+
-name:Run clippy
26+
run:cargo clippy --all-targets --all --verbose
2427

28+
fmt:
2529
runs-on:ubuntu-latest
26-
2730
steps:
2831
-uses:actions/checkout@v2
2932
-name:Rustfmt
30-
run:cargo fmt --check
33+
run:cargo fmt --all --check
3134
-name:Verify regenerated files
3235
run:./scripts/unicode.py && diff tables.rs src/tables.rs
3336
-name:Verify regenerated tests
34-
run:./scripts/unicode_gen_breaktests.py &&rustfmt testdata.rs &&diff testdata.rssrc/testdata.rs
37+
run:./scripts/unicode_gen_breaktests.py && diff testdata.rstests/testdata/mod.rs

‎benches/chars.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
//! is how much slower full unicode handling is.
77
88
use criterion::{black_box, criterion_group, criterion_main,BenchmarkId,Criterion};
9-
use unicode_segmentation;
109

1110
use std::fs;
1211
use unicode_segmentation::UnicodeSegmentation;
@@ -24,14 +23,14 @@ const FILES: &[&str] = &[
2423

2524
#[inline(always)]
2625
fngrapheme(text:&str){
27-
for cinUnicodeSegmentation::graphemes(black_box(&*text),true){
26+
for cinUnicodeSegmentation::graphemes(black_box(text),true){
2827
black_box(c);
2928
}
3029
}
3130

3231
#[inline(always)]
3332
fnscalar(text:&str){
34-
for cinblack_box(&*text).chars(){
33+
for cinblack_box(text).chars(){
3534
black_box(c);
3635
}
3736
}

‎scripts/unicode.py

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
155155
line=" "*indent+chunk
156156
f.write(line)
157157

158-
defload_properties(f,interestingprops):
158+
defload_properties(f,interestingprops:"list[str | tuple[str, str]] | None"=None):
159159
fetch(f)
160160
props= {}
161-
re1=re.compile(r"^*([0-9A-F]+) *;*(\w+)")
162-
re2=re.compile(r"^*([0-9A-F]+)\.\.([0-9A-F]+) *;*(\w+)")
161+
re1=re.compile(r"^\s*([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
162+
re2=re.compile(r"^\s*([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
163163

164164
forlineinfileinput.input(os.path.basename(f)):
165165
prop=None
@@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
168168
m=re1.match(line)
169169
ifm:
170170
d_lo=m.group(1)
171-
d_hi=m.group(1)
171+
d_hi=d_lo
172172
prop=m.group(2)
173+
value=m.group(3)
173174
else:
174175
m=re2.match(line)
175176
ifm:
176177
d_lo=m.group(1)
177178
d_hi=m.group(2)
178179
prop=m.group(3)
180+
value=m.group(4)
179181
else:
180182
continue
181-
ifinterestingpropsandpropnotininterestingprops:
183+
ifvalueisnotNone:
184+
prop= (prop,value)
185+
ifinterestingpropsisnotNoneandpropnotininterestingprops:
182186
continue
183187
d_lo=int(d_lo,16)
184188
d_hi=int(d_hi,16)
@@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
195199
defescape_char(c):
196200
return"'\\u{%x}'"%c
197201

198-
defemit_table(f,name,t_data,t_type="&'static[(char, char)]",is_pub=True,
202+
defemit_table(f,name,t_data,t_type="&[(char, char)]",is_pub=True,
199203
pfun=lambdax:"(%s,%s)"% (escape_char(x[0]),escape_char(x[1])),is_const=True):
200204
pub_string="const"
201205
ifnotis_const:
@@ -217,7 +221,7 @@ def emit_util_mod(f):
217221
f.write("""
218222
pub mod util {
219223
#[inline]
220-
pub fn bsearch_range_table(c: char, r: &'static[(char,char)]) -> bool {
224+
pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
221225
use core::cmp::Ordering::{Equal, Less, Greater};
222226
r.binary_search_by(|&(lo,hi)| {
223227
if lo <= c && c <= hi { Equal }
@@ -252,13 +256,22 @@ def emit_util_mod(f):
252256
253257
""")
254258

255-
defemit_property_module(f,mod,tbl,emit):
256-
f.write("mod %s {\n"%mod)
257-
forcatinsorted(emit):
258-
emit_table(f,"%s_table"%cat,tbl[cat],is_pub=False)
259+
defemit_property_module(f,mod,tbl,emit:"list[str | tuple[str, str]]"):
260+
f.write("pub mod %s {\n"%mod)
261+
262+
cats= []
263+
forcatinemit:
264+
iftype(cat)istuple:
265+
cats.append((f"{cat[0]}_{cat[1]}",cat))
266+
else:
267+
cats.append((cat,cat))
268+
cats.sort(key=lambdax:x[0])
269+
270+
forcat_str,catincats:
271+
emit_table(f,"%s_table"%cat_str,tbl[cat],is_pub=False)
259272
f.write(" #[inline]\n")
260-
f.write(" pub fn %s(c: char) -> bool {\n"%cat)
261-
f.write(" super::util::bsearch_range_table(c, %s_table)\n"%cat)
273+
f.write(" pub fn %s(c: char) -> bool {\n"%cat_str)
274+
f.write(" super::util::bsearch_range_table(c, %s_table)\n"%cat_str)
262275
f.write(" }\n\n")
263276
f.write("}\n\n")
264277

@@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
303316
f.write((" %sC_"%Name[0])+cat+",\n")
304317
f.write(""" }
305318
306-
fn bsearch_range_value_table(c: char, r: &'static[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
319+
fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
307320
use core::cmp::Ordering::{Equal, Less, Greater};
308321
match r.binary_search_by(|&(lo, hi, _)| {
309322
if lo <= c && c <= hi { Equal }
@@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
355368
else:
356369
lookup_type="u32"
357370

358-
emit_table(f,"%s_cat_lookup"%name,lookup_table,"&'static[%s]"%lookup_type,
371+
emit_table(f,"%s_cat_lookup"%name,lookup_table,"&[%s]"%lookup_type,
359372
pfun=lambdax:"%d"%x,
360373
is_pub=False,is_const=True)
361374

362-
emit_table(f,"%s_cat_table"%name,break_table,"&'static[(char, char, %sCat)]"%Name,
375+
emit_table(f,"%s_cat_table"%name,break_table,"&[(char, char, %sCat)]"%Name,
363376
pfun=lambdax:"(%s,%s,%sC_%s)"% (escape_char(x[0]),escape_char(x[1]),Name[0],x[2]),
364377
is_pub=False,is_const=True)
365378
f.write("}\n")
@@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):
379392

380393
# download and parse all the data
381394
gencats=load_gencats("UnicodeData.txt")
382-
derived=load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
395+
derived=load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB","Consonant"), ("InCB","Extend"), ("InCB","Linker")])
383396

384397
emit_util_mod(rf)
385398
for (name,cat,pfuns)in ("general_category",gencats, ["N"]), \
386-
("derived_property",derived, ["Alphabetic"]):
399+
("derived_property",derived, ["Alphabetic", ("InCB","Extend")]):
387400
emit_property_module(rf,name,cat,pfuns)
388401

402+
rf.write("""pub fn is_incb_linker(c: char) -> bool {
403+
matches!(c,""")
404+
405+
for (lo,hi)inderived[("InCB","Linker")]:
406+
rf.write(f" | '\\u{{{lo:X}}}'")
407+
iflo!=hi:
408+
rf.write(f"..'\\u{{{lo:X}}}'")
409+
410+
rf.write(")\n}\n\n")
411+
389412
### grapheme cluster module
390413
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
391-
grapheme_cats=load_properties("auxiliary/GraphemeBreakProperty.txt", [])
392-
414+
grapheme_cats=load_properties("auxiliary/GraphemeBreakProperty.txt")
393415
# Control
394416
# Note:
395417
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
@@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
398420
grapheme_cats["Control"]=group_cat(list(
399421
set(ungroup_cat(grapheme_cats["Control"]))
400422
-set(ungroup_cat([surrogate_codepoints]))))
401-
423+
grapheme_cats["InCB_Consonant"]=derived[("InCB","Consonant")]
424+
emoji_props=load_properties("emoji-data.txt", ["Extended_Pictographic"])
425+
grapheme_cats["Extended_Pictographic"]=emoji_props["Extended_Pictographic"]
402426
grapheme_table= []
403427
forcatingrapheme_cats:
404428
grapheme_table.extend([(x,y,cat)for (x,y)ingrapheme_cats[cat]])
405-
emoji_props=load_properties("emoji-data.txt", ["Extended_Pictographic"])
406-
grapheme_table.extend([(x,y,"Extended_Pictographic")for (x,y)inemoji_props["Extended_Pictographic"]])
407429
grapheme_table.sort(key=lambdaw:w[0])
408430
last=-1
409431
forcharsingrapheme_table:
410432
ifchars[0]<=last:
411433
raise"Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
412434
last=chars[1]
413-
emit_break_module(rf,grapheme_table,list(grapheme_cats.keys())+ ["Extended_Pictographic"],"grapheme")
435+
emit_break_module(rf,grapheme_table,list(grapheme_cats.keys()),"grapheme")
414436
rf.write("\n")
415437

416-
word_cats=load_properties("auxiliary/WordBreakProperty.txt", [])
438+
word_cats=load_properties("auxiliary/WordBreakProperty.txt")
417439
word_table= []
418440
forcatinword_cats:
419441
word_table.extend([(x,y,cat)for (x,y)inword_cats[cat]])
@@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):
425447
emoji_table= [(x,y,"Extended_Pictographic")for (x,y)inemoji_props["Extended_Pictographic"]]
426448
emit_break_module(rf,emoji_table, ["Extended_Pictographic"],"emoji")
427449

428-
sentence_cats=load_properties("auxiliary/SentenceBreakProperty.txt", [])
450+
sentence_cats=load_properties("auxiliary/SentenceBreakProperty.txt")
429451
sentence_table= []
430452
forcatinsentence_cats:
431453
sentence_table.extend([(x,y,cat)for (x,y)insentence_cats[cat]])

‎scripts/unicode_gen_breaktests.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,8 @@ def showfun(x):
140140
returnoutstr
141141

142142
defcreate_grapheme_data(f):
143-
# rules 9.1 and 9.2 are for extended graphemes only
144-
optsplits= ['9.1','9.2']
143+
# rules 9.1, 9.2, and 9.3 are for extended graphemes only
144+
optsplits= ['9.1','9.2','9.3']
145145
d=load_test_data("auxiliary/GraphemeBreakTest.txt",optsplits)
146146

147147
test_same= []
@@ -169,8 +169,8 @@ def create_grapheme_data(f):
169169
else:
170170
test_diff.append((allchars,extgraphs,c))
171171

172-
stype="&'static[(&'staticstr, &'static [&'staticstr])]"
173-
dtype="&'static[(&'staticstr, &'static [&'staticstr], &'static [&'staticstr])]"
172+
stype="&[(&str, &[&str])]"
173+
dtype="&[(&str, &[&str], &[&str])]"
174174
f.write(" // official Unicode test data\n")
175175
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)
176176
unicode.emit_table(f,"TEST_SAME",test_same,stype,True,showfun,True)
@@ -185,7 +185,7 @@ def create_words_data(f):
185185
allchars= [cnforsincforcnins]
186186
test.append((allchars,c))
187187

188-
wtype="&'static[(&'staticstr, &'static [&'staticstr])]"
188+
wtype="&[(&str, &[&str])]"
189189
f.write(" // official Unicode test data\n")
190190
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)
191191
unicode.emit_table(f,"TEST_WORD",test,wtype,True,showfun,True)
@@ -199,7 +199,7 @@ def create_sentence_data(f):
199199
allchars= [cnforsincforcnins]
200200
test.append((allchars,c))
201201

202-
wtype="&'static[(&'staticstr, &'static [&'staticstr])]"
202+
wtype="&[(&str, &[&str])]"
203203
f.write(" // official Unicode test data\n")
204204
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n"%unicode.UNICODE_VERSION_NUMBER)
205205
unicode.emit_table(f,"TEST_SENTENCE",test,wtype,True,showfun,True)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp