Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 6591535

Browse files
committed
Add WSegSpace support in word boundaries from Unicode 11
1 parent 0b168d5 commit 6591535

File tree

2 files changed

+27
-2
lines changed

2 files changed

+27
-2
lines changed

‎src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@
2929
//!
3030
//! let s = "The quick (\"brown\") fox";
3131
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
32-
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
32+
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
3333
//! assert_eq!(w, b);
3434
//! }
3535
//! ```
@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
156156
/// ```
157157
/// # use self::unicode_segmentation::UnicodeSegmentation;
158158
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
159-
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "", " ", "fox"];
159+
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
160160
///
161161
/// assert_eq!(&swu1[..], b);
162162
/// ```

‎src/word.rs

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,7 @@ enum UWordBoundsState {
102102
FormatExtend(FormatExtendType),
103103
Zwj,
104104
Emoji,
105+
WSegSpace,
105106
}
106107

107108
// subtypes for FormatExtend state in UWordBoundsState
@@ -156,6 +157,8 @@ impl<'a> Iterator for UWordBounds<'a> {
156157
// Whether or not the previous category was ZWJ
157158
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
158159
letmut prev_zwj;
160+
// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
161+
letmut skipped_format_extend =false;
159162
for(curr, ch)inself.string.char_indices(){
160163
idx = curr;
161164
prev_zwj = cat == wd::WC_ZWJ;
@@ -177,6 +180,7 @@ impl<'a> Iterator for UWordBounds<'a> {
177180
if state !=Start{
178181
match cat{
179182
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ =>{
183+
skipped_format_extend =true;
180184
continue
181185
}
182186
_ =>{}
@@ -219,6 +223,7 @@ impl<'a> Iterator for UWordBounds<'a> {
219223
wd::WC_Regional_Indicator =>Regional(RegionalState::Half),// rule WB13c
220224
wd::WC_LF | wd::WC_Newline =>break,// rule WB3a
221225
wd::WC_ZWJ =>Zwj,// rule WB3c
226+
wd::WC_WSegSpace =>WSegSpace,// rule WB3d
222227
_ =>{
223228
ifletSome(ncat) =self.get_next_cat(idx){// rule WB4
224229
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ{
@@ -230,6 +235,13 @@ impl<'a> Iterator for UWordBounds<'a> {
230235
break;// rule WB999
231236
}
232237
},
238+
WSegSpace =>match cat{
239+
wd::WC_WSegSpaceif !skipped_format_extend =>WSegSpace,
240+
_ =>{
241+
take_curr =false;
242+
break;
243+
}
244+
},
233245
Zwj =>{
234246
// We already handle WB3c above.
235247
take_curr =false;
@@ -371,6 +383,8 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
371383
letmut savestate =Start;
372384
letmut cat = wd::WC_Any;
373385

386+
letmut skipped_format_extend =false;
387+
374388
for(curr, ch)inself.string.char_indices().rev(){
375389
previdx = idx;
376390
idx = curr;
@@ -409,6 +423,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
409423
state = savestate;
410424
previdx = saveidx;
411425
take_cat =false;
426+
skipped_format_extend =true;
412427
}
413428

414429
// Don't use `continue` in this match without updating `catb`
@@ -427,6 +442,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
427442
saveidx = idx;
428443
FormatExtend(AcceptQLetter)// rule WB7a
429444
},
445+
wd::WC_WSegSpace =>WSegSpace,
430446
wd::WC_CR | wd::WC_LF | wd::WC_Newline =>{
431447
if state ==Start{
432448
if cat == wd::WC_LF{
@@ -451,6 +467,15 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
451467
break;
452468
}
453469
},
470+
WSegSpace =>match cat{// rule WB3d
471+
wd::WC_WSegSpaceif !skipped_format_extend =>{
472+
WSegSpace
473+
}
474+
_ =>{
475+
take_curr =false;
476+
break;
477+
}
478+
},
454479
Letter |HLetter =>match cat{
455480
wd::WC_ALetter =>Letter,// rule WB5
456481
wd::WC_Hebrew_Letter =>HLetter,// rule WB5

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp