Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Update to Unicode 13.0 and implement confusable detection. #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Manishearth merged 2 commits into unicode-rs:master from crlf0710:master
Apr 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions — Cargo.toml
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -17,6 +17,7 @@ exclude = [ "target/*", "Cargo.lock" ]

[dependencies]
unicode-script = { version = "0.4.0", default-features = false }
unicode-normalization = { version = "0.1.12", default-features = false }
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
compiler_builtins = { version = "0.1", optional = true }
Expand Down
81 changes: 79 additions & 2 deletions — scripts/unicode.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -34,7 +34,7 @@
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''

UNICODE_VERSION = (12, 1, 0)
UNICODE_VERSION = (13, 0, 0)

UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION

Expand All@@ -54,7 +54,7 @@ def load_properties(f, interestingprops = None):
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

for line in fileinput.input(os.path.basename(f)):
for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
prop = None
d_lo = 0
d_hi = 0
Expand All@@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):

return props

def load_confusables(f):
    """Parse a confusables data file into a list of (input, outputs) pairs.

    Calls fetch(f) (defined elsewhere in this script) and then reads the file
    named by the basename of f as UTF-8. Lines that do not match the expected
    "<hex code points> ;<TAB><hex code points> ;<TAB><word>" layout are skipped.

    Returns a list of tuples (code_point, [code_point, ...]) where every code
    point is an int parsed from hexadecimal.

    Raises Exception if the first column of a matching line holds more than
    one code point.
    """
    fetch(f)
    line_re = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")
    table = []

    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        matched = line_re.match(line)
        if matched is None:
            # Comment, blank, or otherwise non-data line.
            continue

        sources = matched.group(1).split()
        if len(sources) != 1:
            raise Exception('More than one code point in first column')

        source = int(sources[0].strip(), 16)
        targets = [int(code_point, 16) for code_point in matched.group(2).split()]
        table.append((source, targets))

    return table

def format_table_content(f, content, indent):
line = " "*indent
first = True
Expand All@@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
def escape_char(c):
    """Format the integer code point c as a Rust char literal, e.g. '\\u{41}'."""
    return "'\\u{%x}'" % c

def escape_char_list(l):
    """Format an iterable of integer code points as a Rust array literal.

    Each element is rendered with escape_char and the results are joined with
    ", " inside square brackets; an empty input yields "[]".
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"

def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
pub_string = "const"
Expand DownExpand Up@@ -173,10 +207,51 @@ def emit_identifier_module(f):
pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
f.write("}\n\n")

def emit_confusable_detection_module(f):
    """Write the Rust `confusable_detection` module to the open file f.

    The module consists of the `char_confusable_prototype` lookup function
    followed by the `CONFUSABLES` table, which is built from the entries that
    load_confusables returns for "confusables.txt", sorted by input code point.

    Raises Exception if two table entries share the same input code point.
    """
    f.write("pub mod confusable_detection {")
    f.write("""

#[inline]
pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
// FIXME: do we want to special case ASCII here?
match c as usize {
_ => super::util::bsearch_value_table(c, CONFUSABLES)
}
}

""")

    f.write(" // Confusable table:\n")
    entries = sorted(load_confusables("confusables.txt"), key=lambda entry: entry[0])

    # The table is sorted by key, so any duplicate keys must be adjacent.
    for previous, current in zip(entries, entries[1:]):
        if current[0] == previous[0]:
            raise Exception("duplicate keys in confusables table: %s" % current[0])

    def format_entry(x):
        # One `(char, &[char, ...])` element of the emitted Rust table.
        return "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1]))

    emit_table(f, "CONFUSABLES", entries, "&'static [(char, &'static [char])]", is_pub=False,
               pfun=format_entry)
    f.write("}\n\n")


def emit_util_mod(f):
f.write("""
pub mod util {
use core::result::Result::{Ok, Err};

#[inline]
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
match r.binary_search_by_key(&c, |&(k, _)| k) {
Ok(idx) => {
let (_, v) = r[idx];
Some(v)
}
Err(_) => None
}
}

#[inline]
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use core::cmp::Ordering::{Equal, Less, Greater};
Expand DownExpand Up@@ -224,3 +299,5 @@ def emit_util_mod(f):
emit_util_mod(rf)
### identifier module
emit_identifier_module(rf)
### confusable_detection module
emit_confusable_detection_module(rf)
39 changes: 39 additions & 0 deletions — src/confusable_detection.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
//! [Confusable detection](https://www.unicode.org/reports/tr39/#Confusable_Detection)

use core::iter;

/// Iterator that yields either exactly one item or everything produced by
/// another iterator.
///
/// This lets a function return a single concrete iterator type whether it has
/// one item of its own or delegates to an inner iterator `I`.
enum OnceOrMore<T, I> {
    /// A single item, wrapped in [`iter::Once`].
    Once(iter::Once<T>),
    /// All items of the wrapped iterator.
    More(I),
}

impl<T, I> Iterator for OnceOrMore<T, I>
where
    I: Iterator<Item = T>,
{
    type Item = T;

    /// Advances whichever iterator the active variant wraps.
    fn next(&mut self) -> Option<T> {
        match self {
            OnceOrMore::Once(single) => single.next(),
            OnceOrMore::More(inner) => inner.next(),
        }
    }

    /// Forwards the bounds of the wrapped iterator.
    ///
    /// Without this override the default `(0, None)` would be reported, which
    /// throws away information both variants already have.
    fn size_hint(&self) -> (usize, Option<usize>) {
        match self {
            OnceOrMore::Once(single) => single.size_hint(),
            OnceOrMore::More(inner) => inner.size_hint(),
        }
    }
}

/// Iterator over a `'static` slice of `char`s that yields each `char` by value.
type StaticSliceIterCloned = core::iter::Cloned<core::slice::Iter<'static, char>>;

/// Returns the prototype sequence for `c`.
///
/// When `char_confusable_prototype` has an entry for `c`, the characters of
/// that entry are yielded; otherwise `c` itself is yielded unchanged.
fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {
    use crate::tables::confusable_detection::char_confusable_prototype;
    if let Some(prototype) = char_confusable_prototype(c) {
        OnceOrMore::More(prototype.iter().cloned())
    } else {
        OnceOrMore::Once(iter::once(c))
    }
}

/// Computes the skeleton of `s` as defined by UTS 39.
///
/// The input is NFD-normalized, every resulting character is replaced by its
/// prototype sequence (see `char_prototype`), and the result is NFD-normalized
/// again. The returned iterator borrows from `s` and produces characters
/// lazily.
pub fn skeleton(s: &str) -> impl Iterator<Item = char> + '_ {
    use unicode_normalization::UnicodeNormalization;
    let decomposed = s.chars().nfd();
    let prototypes = decomposed.flat_map(char_prototype);
    prototypes.nfd()
}
2 changes: 2 additions & 0 deletions — src/lib.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -58,10 +58,12 @@ extern crate test;

pub use tables::UNICODE_VERSION;

pub mod confusable_detection;
pub mod general_security_profile;
pub mod mixed_script;
pub mod restriction_level;

pub use confusable_detection::skeleton;
pub use general_security_profile::GeneralSecurityProfile;
pub use mixed_script::MixedScript;
pub use restriction_level::{RestrictionLevel, RestrictionLevelDetection};
Expand Down
Loading

[8]ページ先頭

©2009-2025 Movatter.jp