Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Update to Unicode 11#68

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Manishearth merged 6 commits into master from unicode-11
Oct 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 26 additions & 11 deletions scripts/unicode.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -54,13 +54,21 @@
# these are the surrogate codepoints, which are not valid rust characters
surrogate_codepoints = (0xd800, 0xdfff)

UNICODE_VERSION = (11, 0, 0)

UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION

def is_surrogate(n):
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]

def fetch(f):
if not os.path.exists(os.path.basename(f)):
os.system("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"
% f)
if "emoji" in f:
os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
% (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
else:
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
% (UNICODE_VERSION_NUMBER, f))

if not os.path.exists(os.path.basename(f)):
sys.stderr.write("cannot load %s" % f)
Expand DownExpand Up@@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
pub use self::%sCat::*;

#[allow(non_camel_case_types)]
#[derive(Clone, Copy, PartialEq, Eq)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum %sCat {
""" % (name, Name, Name))

Expand DownExpand Up@@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
with open(r, "w") as rf:
# write the file's preamble
rf.write(preamble)

# download and parse all the data
fetch("ReadMe.txt")
with open("ReadMe.txt") as readme:
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
unicode_version = re.search(pattern, readme.read()).groups()
rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-segmentation is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" %unicode_version)
""" %UNICODE_VERSION)

# download and parse all the data
gencats = load_gencats("UnicodeData.txt")
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])

Expand All@@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
grapheme_table = []
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
grapheme_table.sort(key=lambda w: w[0])
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
last = -1
for chars in grapheme_table:
if chars[0] <= last:
raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
last = chars[1]
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
rf.write("\n")

word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
Expand All@@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, list(word_cats.keys()), "word")

# There are some emoji which are also ALetter, so this needs to be stored separately
# For efficiency, we could still merge the two tables and produce an ALetterEP state
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
Expand Down
6 changes: 3 additions & 3 deletions scripts/unicode_gen_breaktests.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -172,7 +172,7 @@ def create_grapheme_data(f):
stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)

Expand All@@ -187,7 +187,7 @@ def create_words_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
Expand All@@ -201,7 +201,7 @@ def create_sentence_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

if __name__ == "__main__":
Expand Down
23 changes: 13 additions & 10 deletions src/grapheme.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -147,8 +147,8 @@ enum GraphemeState {
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
Regional,
// The codepoint after is in the E_Modifier category, so whether it's a boundary
// depends on pre-context according to GB10.
// The codepoint after is Extended_Pictographic,
// so whether it's a boundary depends on pre-context according to GB11.
Emoji,
}

Expand DownExpand Up@@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
(GC_Extend, GC_E_Modifier) => Emoji, // GB10
(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
(GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
}
Expand DownExpand Up@@ -415,10 +411,17 @@ impl GraphemeCursor {

fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
use tables::grapheme as gr;
for ch in chunk.chars().rev() {
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
if gr::grapheme_category(ch) != gr::GC_ZWJ {
self.decide(true);
return;
}
}
for ch in iter {
match gr::grapheme_category(ch) {
gr::GC_Extend => (),
gr::GC_E_Base | gr::GC_E_Base_GAZ => {
gr::GC_Extended_Pictographic => {
self.decide(false);
return;
}
Expand DownExpand Up@@ -484,7 +487,7 @@ impl GraphemeCursor {
let mut need_pre_context = true;
match self.cat_after.unwrap() {
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -29,7 +29,7 @@
//!
//! let s = "The quick (\"brown\") fox";
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//! assert_eq!(w, b);
//! }
//! ```
Expand DownExpand Up@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
Expand Down
Loading

[8]ページ先頭

©2009-2025 Movatter.jp