@@ -122,6 +122,11 @@ enum RegionalState {
122122Unknown ,
123123}
124124
125+ fn is_emoji ( ch : char ) ->bool { // Returns true when `ch` is categorized as Extended_Pictographic (the "EP" class used by the WB3c handling).
126+ use tables:: emoji; // function-local import of the `emoji` submodule of `tables`
127+ emoji:: emoji_category ( ch) == emoji:: EmojiCat :: EC_Extended_Pictographic // only this one category counts as "emoji" here
128+ }
129+
125130impl < ' a > Iterator for UWordBounds < ' a > {
126131type Item =& ' a str ;
127132
@@ -182,26 +187,18 @@ impl<'a> Iterator for UWordBounds<'a> {
182187// WB4 makes all ZWJs collapse into the previous state
183188// but you can still be in a Zwj state if you started with Zwj
184189//
185- // This means that Zwj + Extend will collapse into Zwj, which is wrong,
186- // since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
187- // and that rule (WB3c) has higher priority
188- //
189- // Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
190- // which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
190+ // This means that an EP + Zwj will collapse into EP, which is wrong,
191+ // since EP+EP is a boundary but EP+ZWJ+EP is not (rule WB3c)
191192//
192193// Thus, we separately keep track of whether or not the last character
193194// was a ZWJ. This is an additional bit of state tracked outside of the
194195// state enum; the state enum represents the last non-zwj state encountered.
195196// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
196197// however we are in the previous state for the purposes of all other rules.
197198if prev_zwj{
198- match cat{
199- wd:: WC_Glue_After_Zwj =>continue ,
200- wd:: WC_E_Base_GAZ =>{
201- state =Emoji ;
202- continue ;
203- } ,
204- _ =>( )
199+ if is_emoji ( ch) {
200+ state =Emoji ;
201+ continue ;
205202}
206203}
207204// Don't use `continue` in this match without updating `cat`
@@ -222,7 +219,6 @@ impl<'a> Iterator for UWordBounds<'a> {
222219 wd:: WC_Regional_Indicator =>Regional ( RegionalState :: Half ) , // rule WB13c
223220 wd:: WC_LF | wd:: WC_Newline =>break , // rule WB3a
224221 wd:: WC_ZWJ =>Zwj , // rule WB3c
225- wd:: WC_E_Base | wd:: WC_E_Base_GAZ =>Emoji , // rule WB14
226222 _ =>{
227223if let Some ( ncat) =self . get_next_cat ( idx) { // rule WB4
228224if ncat == wd:: WC_Format || ncat == wd:: WC_Extend || ncat == wd:: WC_ZWJ {
@@ -235,9 +231,7 @@ impl<'a> Iterator for UWordBounds<'a> {
235231}
236232} ,
237233Zwj =>{
238- // We already handle WB3c above. At this point,
239- // the current category is not GAZ or EBG,
240- // or the previous character was not actually a ZWJ
234+ // We already handle WB3c above.
241235 take_curr =false ;
242236break ;
243237}
@@ -313,12 +307,10 @@ impl<'a> Iterator for UWordBounds<'a> {
313307}
314308} ,
315309Regional ( _) =>unreachable ! ( "RegionalState::Unknown should not occur on forward iteration" ) ,
316- Emoji =>match cat{ // rule WB14
317- wd:: WC_E_Modifier => state,
318- _ =>{
319- take_curr =false ;
320- break ;
321- }
310+ Emoji =>{
311+ // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
312+ take_curr =false ;
313+ break ;
322314} ,
323315FormatExtend ( t) =>match t{ // handle FormatExtends depending on what type
324316RequireNumeric if cat == wd:: WC_Numeric =>Numeric , // rule WB11
@@ -422,20 +414,19 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
422414// Don't use `continue` in this match without updating `catb`
423415 state =match state{
424416Start |FormatExtend ( AcceptAny ) =>match cat{
417+ _if is_emoji ( ch) =>Zwj ,
425418 wd:: WC_ALetter =>Letter , // rule WB5, WB7, WB10, WB13b
426419 wd:: WC_Hebrew_Letter =>HLetter , // rule WB5, WB7, WB7c, WB10, WB13b
427420 wd:: WC_Numeric =>Numeric , // rule WB8, WB9, WB11, WB13b
428421 wd:: WC_Katakana =>Katakana , // rule WB13, WB13b
429422 wd:: WC_ExtendNumLet =>ExtendNumLet , // rule WB13a
430423 wd:: WC_Regional_Indicator =>Regional ( RegionalState :: Unknown ) , // rule WB13c
431- wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ =>Zwj , // rule WB3c
432424// rule WB4:
433425 wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ =>FormatExtend ( AcceptAny ) ,
434426 wd:: WC_Single_Quote =>{
435427 saveidx = idx;
436428FormatExtend ( AcceptQLetter ) // rule WB7a
437429} ,
438- wd:: WC_E_Modifier =>Emoji , // rule WB14
439430 wd:: WC_CR | wd:: WC_LF | wd:: WC_Newline =>{
440431if state ==Start {
441432if cat == wd:: WC_LF {
@@ -539,11 +530,10 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
539530break ;
540531}
541532} ,
542- Emoji =>match cat { // rule WB14
543- wd :: WC_E_Base | wd :: WC_E_Base_GAZ => {
533+ Emoji =>{
534+ if is_emoji ( ch ) { // rule WB3c
544535Zwj
545- } ,
546- _ =>{
536+ } else {
547537 take_curr =false ;
548538break ;
549539}