Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit8f56a97

Browse files
authored
Merge pull request #11 from crlf0710/master
Update to Unicode 13.0 and implement confusable detection.
2 parents 916eec5 + 7786cb6, commit 8f56a97

File tree

6 files changed

+3837
-1186
lines changed

6 files changed

+3837
-1186
lines changed

‎Cargo.toml‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ exclude = [ "target/*", "Cargo.lock" ]
1717

1818
[dependencies]
1919
unicode-script = {version ="0.4.0",default-features =false }
20+
unicode-normalization = {version ="0.1.12",default-features =false }
2021
std = {version ="1.0",package ="rustc-std-workspace-std",optional =true }
2122
core = {version ="1.0",package ="rustc-std-workspace-core",optional =true }
2223
compiler_builtins = {version ="0.1",optional =true }

‎scripts/unicode.py‎

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
3535
'''
3636

37-
UNICODE_VERSION= (12,1,0)
37+
UNICODE_VERSION= (13,0,0)
3838

3939
UNICODE_VERSION_NUMBER="%s.%s.%s"%UNICODE_VERSION
4040

@@ -54,7 +54,7 @@ def load_properties(f, interestingprops = None):
5454
re1=re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
5555
re2=re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
5656

57-
forlineinfileinput.input(os.path.basename(f)):
57+
forlineinfileinput.input(os.path.basename(f),openhook=fileinput.hook_encoded("utf-8")):
5858
prop=None
5959
d_lo=0
6060
d_hi=0
@@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):
8181

8282
returnprops
8383

84+
def load_confusables(f):
    """Fetch and parse a confusables data file.

    `f` is handed to `fetch` and then read, as UTF-8, by its base name.
    Every line that matches the `source ;<TAB>target ;<TAB>type` pattern
    contributes one entry; lines that do not match are skipped.

    Returns a list of `(source, targets)` tuples in file order, where
    `source` is an int code point and `targets` is the list of int code
    points in the second column.

    Raises `Exception` if the first column holds more than one code point.
    """
    fetch(f)
    confusables = []
    re1 = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")

    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        m = re1.match(line)
        if not m:
            continue

        # Both columns are space-separated lists of hexadecimal code points.
        d_inputs = m.group(1).split()
        if len(d_inputs) != 1:
            raise Exception('More than one code point in first column')

        d_input = int(d_inputs[0], 16)
        d_outputs = [int(d_output, 16) for d_output in m.group(2).split()]
        confusables.append((d_input, d_outputs))

    return confusables
105+
84106
defformat_table_content(f,content,indent):
85107
line=" "*indent
86108
first=True
@@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
99121
def escape_char(c):
    """Format the int code point `c` as a Rust char literal, e.g. '\\u{61}'."""
    return "'\\u{%x}'" % c


def escape_char_list(l):
    """Format the code points in `l` as a Rust array literal of chars.

    Each element is rendered with `escape_char`; elements are separated by
    ", " and wrapped in square brackets. An empty input yields "[]".
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135+
102136
defemit_table(f,name,t_data,t_type="&'static [(char, char)]",is_pub=True,
103137
pfun=lambdax:"(%s,%s)"% (escape_char(x[0]),escape_char(x[1])),is_const=True):
104138
pub_string="const"
@@ -173,10 +207,51 @@ def emit_identifier_module(f):
173207
pfun=lambdax:"(%s,%s, IdentifierType::%s)"% (escape_char(x[0]),escape_char(x[1]),x[2]))
174208
f.write("}\n\n")
175209

210+
def emit_confusable_detection_module(f):
    """Write the `confusable_detection` Rust module to the open file `f`.

    The module contains the `char_confusable_prototype` lookup function and
    the private `CONFUSABLES` table, built from "confusables.txt" via
    `load_confusables` and emitted with `emit_table`.

    Raises `Exception` if two table entries share the same source code point.
    """
    f.write("pub mod confusable_detection {")
    f.write("""

    #[inline]
    pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_value_table(c, CONFUSABLES)
        }
    }

""")

    f.write("    // Confusable table:\n")
    confusable_table = load_confusables("confusables.txt")
    confusable_table.sort(key=lambda w: w[0])

    # The generated lookup is a binary search over the keys, so after sorting
    # any duplicate keys are adjacent; reject them here.
    last_key = None
    for (k, _) in confusable_table:
        if k == last_key:
            # Report the code point in the same hex notation as the data file.
            raise Exception("duplicate keys in confusables table: U+%04X" % k)
        last_key = k

    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, &'static [char])]", is_pub=False,
               pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1])))
    f.write("}\n\n")
237+
238+
176239
defemit_util_mod(f):
177240
f.write("""
178241
pub mod util {
179242
use core::result::Result::{Ok, Err};
243+
244+
/// Looks up `c` in `r`, a table of `(key, value)` pairs, and returns a
/// copy of the value stored under that key.
///
/// Returns `None` when no entry has `c` as its key. The lookup is a binary
/// search, so `r` must be sorted by key in ascending order.
#[inline]
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
    r.binary_search_by_key(&c, |&(k, _)| k)
        .ok()
        .map(|idx| r[idx].1)
}
254+
180255
#[inline]
181256
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182257
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -224,3 +299,5 @@ def emit_util_mod(f):
224299
emit_util_mod(rf)
225300
### identifier module
226301
emit_identifier_module(rf)
302+
### confusable_detection module
303+
emit_confusable_detection_module(rf)

‎src/confusable_detection.rs‎

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
//! [Confusable detection](https://www.unicode.org/reports/tr39/#Confusable_Detection)
2+
3+
use core::iter;
4+
5+
/// An iterator that yields either a single item or every item of a wrapped
/// iterator, so both cases can be returned through one concrete type.
enum OnceOrMore<T, I> {
    /// Yields exactly one item.
    Once(iter::Once<T>),
    /// Yields whatever the wrapped iterator yields.
    More(I),
}

impl<T, I> Iterator for OnceOrMore<T, I>
where
    I: Iterator<Item = T>,
{
    type Item = T;

    fn next(&mut self) -> Option<T> {
        match self {
            OnceOrMore::Once(single) => single.next(),
            OnceOrMore::More(inner) => inner.next(),
        }
    }

    // Forward the hint of the active variant; the trait's default is
    // `(0, None)`, which hides the known length from consumers.
    fn size_hint(&self) -> (usize, Option<usize>) {
        match self {
            OnceOrMore::Once(single) => single.size_hint(),
            OnceOrMore::More(inner) => inner.size_hint(),
        }
    }
}
24+
25+
typeStaticSliceIterCloned = core::iter::Cloned<core::slice::Iter<'static,char>>;
26+
27+
fnchar_prototype(c:char) ->OnceOrMore<char,StaticSliceIterCloned>{
28+
usecrate::tables::confusable_detection::char_confusable_prototype;
29+
matchchar_confusable_prototype(c){
30+
None =>OnceOrMore::Once(iter::once(c)),
31+
Some(l) =>OnceOrMore::More(l.iter().cloned()),
32+
}
33+
}
34+
35+
/// Calculate skeleton for string, as defined by UTS 39
36+
pubfnskeleton(s:&str) ->implIterator<Item =char> +'_{
37+
use unicode_normalization::UnicodeNormalization;
38+
s.chars().nfd().flat_map(char_prototype).nfd()
39+
}

‎src/lib.rs‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,12 @@ extern crate test;
5858

5959
pubuse tables::UNICODE_VERSION;
6060

61+
pubmod confusable_detection;
6162
pubmod general_security_profile;
6263
pubmod mixed_script;
6364
pubmod restriction_level;
6465

66+
pubuse confusable_detection::skeleton;
6567
pubuse general_security_profile::GeneralSecurityProfile;
6668
pubuse mixed_script::MixedScript;
6769
pubuse restriction_level::{RestrictionLevel,RestrictionLevelDetection};

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp