Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit eb9d304

Browse files
authored
Merge pull request #31 from ohhithere/fix-internal-skeleton
Fix internalSkeleton
2 parents 22d684a + 78707a7 commit eb9d304

File tree

4 files changed

+744
-902
lines changed

4 files changed

+744
-902
lines changed

‎scripts/unicode.py‎

Lines changed: 35 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
# - confusables.txt
1818
# - ReadMe.txt
1919
# This script also uses the following Unicode UCD data:
20+
# - DerivedCoreProperties.txt
2021
# - Scripts.txt
2122
#
2223
# Since this should not require frequent updates, we just store this
@@ -53,6 +54,8 @@ def fetch(f):
5354
sys.stderr.write("cannot load %s\n"%f)
5455
exit(1)
5556

57+
returnf
58+
5659
# Download a UCD table file
5760
deffetch_unidata(f):
5861
ifnotos.path.exists(os.path.basename(f)):
@@ -63,14 +66,14 @@ def fetch_unidata(f):
6366
sys.stderr.write("cannot load %s"%f)
6467
exit(1)
6568

66-
# Loads code point data from IdentifierStatus.txt and
67-
# IdentifierType.txt
68-
# Implementation from unicode-segmentation
69+
returnf
70+
71+
# Loads code point data from provided filename f
72+
# Implementation adapted from unicode-segmentation
6973
defload_properties(f,interestingprops=None):
70-
fetch(f)
7174
props= {}
72-
re1=re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
73-
re2=re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
75+
re1=re.compile(r"^ *([0-9A-F]+) *; *([^#\s]+) *#")
76+
re2=re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *#")
7477

7578
forlineinfileinput.input(os.path.basename(f),openhook=fileinput.hook_encoded("utf-8")):
7679
prop=None
@@ -99,42 +102,6 @@ def load_properties(f, interestingprops = None):
99102

100103
returnprops
101104

102-
# Loads script data from Scripts.txt
103-
defload_script_properties(f,interestingprops):
104-
fetch_unidata(f)
105-
props= {}
106-
# Note: these regexes are different from those in unicode-segmentation,
107-
# becase we need to handle spaces here
108-
re1=re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
109-
re2=re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")
110-
111-
forlineinfileinput.input(os.path.basename(f)):
112-
prop=None
113-
d_lo=0
114-
d_hi=0
115-
m=re1.match(line)
116-
ifm:
117-
d_lo=m.group(1)
118-
d_hi=m.group(1)
119-
prop=m.group(2).strip()
120-
else:
121-
m=re2.match(line)
122-
ifm:
123-
d_lo=m.group(1)
124-
d_hi=m.group(2)
125-
prop=m.group(3).strip()
126-
else:
127-
continue
128-
ifinterestingpropsandpropnotininterestingprops:
129-
continue
130-
d_lo=int(d_lo,16)
131-
d_hi=int(d_hi,16)
132-
ifpropnotinprops:
133-
props[prop]= []
134-
props[prop].append((d_lo,d_hi))
135-
136-
returnprops
137-
138105
# Loads confusables data from confusables.txt
139106
defload_confusables(f):
140107
fetch(f)
@@ -189,7 +156,7 @@ def load_scripts(f):
189156
# changes are introduced, update accordingly.
190157

191158
(longforms,shortforms)=aliases()
192-
scripts=load_script_properties(f, [])
159+
scripts=load_properties(fetch_unidata(f), [])
193160

194161
script_table= []
195162
script_list= []
@@ -546,10 +513,10 @@ def emit_identifier_module(f):
546513
""")
547514

548515
f.write(" // Identifier status table:\n")
549-
identifier_status_table=load_properties("IdentifierStatus.txt")
516+
identifier_status_table=load_properties(fetch("IdentifierStatus.txt"))
550517
emit_table(f,"IDENTIFIER_STATUS",identifier_status_table['Allowed'],"&'static [(char, char)]",is_pub=False,
551518
pfun=lambdax:"(%s,%s)"% (escape_char(x[0]),escape_char(x[1])))
552-
identifier_type=load_properties("IdentifierType.txt")
519+
identifier_type=load_properties(fetch("IdentifierType.txt"))
553520
type_table= []
554521
fortyinidentifier_type:
555522
type_table.extend([(x,y,ty)for (x,y)inidentifier_type[ty]])
@@ -560,6 +527,26 @@ def emit_identifier_module(f):
560527
pfun=lambdax:"(%s,%s, IdentifierType::%s)"% (escape_char(x[0]),escape_char(x[1]),x[2]))
561528
f.write("}\n\n")
562529

530+
defemit_default_ignorable_detection_module(f):
531+
f.write("pub mod default_ignorable_code_point {")
532+
f.write("""
533+
534+
#[inline]
535+
pub fn default_ignorable_code_point(c: char) -> bool {
536+
match c as usize {
537+
_ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE)
538+
}
539+
}
540+
541+
""")
542+
543+
f.write(" // Default ignorable code point table:\n")
544+
default_ignorable_table=load_properties(fetch_unidata("DerivedCoreProperties.txt"), ["Default_Ignorable_Code_Point"])
545+
emit_table(f,"DEFAULT_IGNORABLE",default_ignorable_table["Default_Ignorable_Code_Point"],"&'static [(char, char)]",is_pub=False,
546+
pfun=lambdax:"(%s,%s)"% (escape_char(x[0]),escape_char(x[1])))
547+
548+
f.write("}\n\n")
549+
563550
defemit_confusable_detection_module(f):
564551
f.write("pub mod confusable_detection {")
565552
f.write("""
@@ -601,7 +588,7 @@ def emit_potiential_mixed_script_confusable(f):
601588
}
602589
}
603590
""")
604-
identifier_status_table=load_properties("IdentifierStatus.txt")
591+
identifier_status_table=load_properties(fetch("IdentifierStatus.txt"))
605592
_,scripts=load_scripts("Scripts.txt")
606593
identifier_allowed=identifier_status_table['Allowed']
607594
(mixedscript_confusable,mixedscript_confusable_unresolved)=load_potential_mixedscript_confusables("confusables.txt",identifier_allowed,scripts)
@@ -688,6 +675,8 @@ def emit_util_mod(f):
688675
emit_util_mod(rf)
689676
### identifier module
690677
emit_identifier_module(rf)
678+
### default_ignorable_detection module
679+
emit_default_ignorable_detection_module(rf)
691680
### confusable_detection module
692681
emit_confusable_detection_module(rf)
693682
### mixed_script_confusable_detection module

‎src/confusable_detection.rs‎

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {
3434

3535
/// Calculate skeleton for string, as defined by UTS 39
3636
pubfnskeleton(s:&str) ->implIterator<Item =char> +'_{
37+
usecrate::tables::default_ignorable_code_point::default_ignorable_code_point;
3738
use unicode_normalization::UnicodeNormalization;
38-
s.chars().nfd().flat_map(char_prototype).nfd()
39+
40+
s.chars()
41+
.nfd()
42+
.filter(|c| !default_ignorable_code_point(*c))
43+
.flat_map(char_prototype)
44+
.nfd()
3945
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp