@@ -83,12 +83,10 @@ impl From<Script> for ScriptExtension {
8383impl TryFrom < ScriptExtension > for Script {
8484type Error =( ) ;
8585fn try_from ( ext : ScriptExtension ) ->Result < Self , ( ) > {
86- if ext. is_common_or_inherited ( ) {
87- if ext. common {
88- Ok ( Script :: Common )
89- } else {
90- Ok ( Script :: Inherited )
91- }
86+ if ext. is_common ( ) {
87+ Ok ( Script :: Common )
88+ } else if ext. is_inherited ( ) {
89+ Ok ( Script :: Inherited )
9290} else if ext. is_empty ( ) {
9391Ok ( Script :: Unknown )
9492} else {
@@ -131,94 +129,88 @@ impl fmt::Display for Script {
131129}
132130}
133131
134- #[ derive( Clone , Copy , PartialEq , Eq , Hash ) ]
132+ #[ derive( Clone , Copy , PartialEq , Eq , Hash , Debug ) ]
135133#[ non_exhaustive]
136134/// A value for the `Script_Extension` property
137135///
138136/// [`ScriptExtension`] is one or more [`Script`]
139137///
140138/// This is essentially an optimized version of `Vec<Script>` that uses bitfields
141139pub struct ScriptExtension {
142- // A bitset for the first64 scripts
140+ // A bitset for the first 64 scripts [0..64]
143141first : u64 ,
144- // A bitset for the scripts65- 128
142+ // A bitset for the scripts [64..128]
145143second : u64 ,
146- // A bitset for scripts after 128
144+ // A bitset for the scripts [128..NEXT_SCRIPT]
145+ // The top 2 bits record whether Common and Inherited are included
146+ // * Bit 62 (`1 << 62`, the 63rd bit) indicates whether it includes Common
147+ // * Bit 63 (`1 << 63`, the 64th bit) indicates whether it includes Inherited
147148third : u64 ,
148- // Both Common and Inherited are represented by all used bits being set,
149- // this flag lets us distinguish the two.
150- common : bool ,
151149}
152150
153151impl ScriptExtension {
154152// We don't use the complete u64 of `third`, so the "all" value is not just u32::MAX
155153// Instead, we take the number of the next (unused) script bit, subtract 128 to bring
156154// it in the range of `third`, create a u64 with just that bit set, and subtract 1
157155// to create one with all the lower bits set.
158- const THIRD_MAX : u64 =( ( 1 <<( NEXT_SCRIPT -128 ) ) -1 ) ;
156+ const _CHECK: ( ) =assert ! ( NEXT_SCRIPT -128 <63 ) ;
157+ const COMMON_MASK : u64 =( 1 <<62 ) ; // 63rd bit
158+ const INHERITED_MASK : u64 =( 1 <<63 ) ; // 64th bit
159159
160160pub ( crate ) const fn new ( first : u64 , second : u64 , third : u64 ) ->Self {
161161ScriptExtension {
162162 first,
163163 second,
164164 third,
165- common : false ,
166165}
167166}
168167
168+ /// Returns a ScriptExtension containing only Common.
169169pub ( crate ) const fn new_common ( ) ->Self {
170170ScriptExtension {
171- first : u64:: MAX ,
172- second : u64:: MAX ,
173- third : Self :: THIRD_MAX ,
174- common : true ,
171+ first : 0 ,
172+ second : 0 ,
173+ third : Self :: COMMON_MASK ,
175174}
176175}
177176
177+ /// Returns a ScriptExtension containing only Inherited.
178178pub ( crate ) const fn new_inherited ( ) ->Self {
179179ScriptExtension {
180- first : u64:: MAX ,
181- second : u64:: MAX ,
182- third : Self :: THIRD_MAX ,
183- common : false ,
180+ first : 0 ,
181+ second : 0 ,
182+ third : Self :: INHERITED_MASK ,
184183}
185184}
186185
186+ /// Returns an empty ScriptExtension
187187pub ( crate ) const fn new_unknown ( ) ->Self {
188188ScriptExtension {
189189first : 0 ,
190190second : 0 ,
191191third : 0 ,
192- common : false ,
193192}
194193}
195194
196- const fn is_common_or_inherited ( self ) ->bool {
197- ( self . first == u64:: MAX ) & ( self . second == u64:: MAX ) & ( self . third ==Self :: THIRD_MAX )
198- }
199-
200195/// Checks if the script extension is Common
201196pub const fn is_common ( self ) ->bool {
202- self . is_common_or_inherited ( ) & self . common
197+ ( self . third & Self :: COMMON_MASK ) != 0
203198}
204199
205200/// Checks if the script extension is Inherited
206201pub const fn is_inherited ( self ) ->bool {
207- self . is_common_or_inherited ( ) & ! self . common
202+ ( self . third & Self :: INHERITED_MASK ) != 0
208203}
209204
210205/// Checks if the script extension is empty (unknown)
211206pub const fn is_empty ( self ) ->bool {
212207( self . first ==0 ) & ( self . second ==0 ) & ( self . third ==0 )
213208}
214209
215- /// Returns the number of scripts in the script extension
210+ /// Returns the number of scripts in the script extension. Common and
211+ /// Inherited, if present, are included and counted independently in the return value.
216212pub fn len ( self ) ->usize {
217- if self . is_common_or_inherited ( ) {
218- 1
219- } else {
220- ( self . first . count_ones ( ) +self . second . count_ones ( ) +self . third . count_ones ( ) ) as usize
221- }
213+ ( self . first . count_ones ( ) +self . second . count_ones ( ) +self . third . count_ones ( ) ) as usize
222214}
223215
224216/// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
@@ -233,54 +225,47 @@ impl ScriptExtension {
233225
234226/// Find the intersection between two ScriptExtensions. Returns Unknown if things
235227/// do not intersect.
236- ///
237- /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
238- /// everything, the intersection of `Common` and `Inherited` is `Inherited`
239228pub const fn intersection ( self , other : Self ) ->Self {
240229let first =self . first & other. first ;
241230let second =self . second & other. second ;
242231let third =self . third & other. third ;
243- let common =self . common & other. common ;
244232ScriptExtension {
245233 first,
246234 second,
247235 third,
248- common,
249236}
250237}
251238
252239/// Find the union between two ScriptExtensions.
253- ///
254- /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
255- /// everything, the union of `Common` and `Inherited` is `Common`
256240pub const fn union ( self , other : Self ) ->Self {
257241let first =self . first | other. first ;
258242let second =self . second | other. second ;
259243let third =self . third | other. third ;
260- let common =self . common | other. common ;
261244ScriptExtension {
262245 first,
263246 second,
264247 third,
265- common,
266248}
267249}
268250
251+ /// Returns true if and only if all members of `self` are present in `other`.
252+ pub fn is_subset_or_equal ( self , other : Self ) ->bool {
253+ self . intersection ( other) ==self &&self . union ( other) == other
254+ }
255+
269256/// Check if this ScriptExtension contains the given script
270- ///
271- /// Should be used with specific scripts only, this will
272- /// return `true` if `self` is not `Unknown` and `script` is
273- /// `Common` or `Inherited`
274257pub fn contains_script ( self , script : Script ) ->bool {
275258 !self . intersection ( script. into ( ) ) . is_empty ( )
276259}
277260
278- /// Get the intersection of script extensions of all characters
279- /// in a string.
261+ /// Get the script extension representing the union of all scripts for
262+ /// the characters in a string.
263+ ///
264+ /// Starts from an empty (Unknown) extension and unions in each character's extension, so an empty string yields Unknown.
280265pub fn for_str ( x : & str ) ->Self {
281- let mut ext =ScriptExtension :: default ( ) ;
266+ let mut ext =ScriptExtension :: new_unknown ( ) ;
282267for chin x. chars ( ) {
283- ext. intersect_with ( ch. into ( ) ) ;
268+ ext = ext . union ( ch. into ( ) ) ;
284269}
285270 ext
286271}
@@ -311,33 +296,23 @@ impl From<&'_ str> for ScriptExtension {
311296}
312297}
313298
314- impl fmt:: Debug for ScriptExtension {
315- fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
316- write ! ( f, "ScriptExtension(" ) ?;
317- fmt:: Display :: fmt ( self , f) ?;
318- write ! ( f, ")" )
319- }
320- }
321-
322299impl fmt:: Display for ScriptExtension {
323300fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
324- if self . is_common ( ) {
325- write ! ( f, "Common" ) ?;
326- } else if self . is_inherited ( ) {
327- write ! ( f, "Inherited" ) ?;
328- } else if self . is_empty ( ) {
301+ write ! ( f, "ScriptExtension(" ) ?;
302+ if self . is_empty ( ) {
329303write ! ( f, "Unknown" ) ?;
330304} else {
331305let mut first =true ;
332306for scriptin self . iter ( ) {
333- if !first{
334- write ! ( f, " + " ) ?;
307+ if first{
335308 first =false ;
309+ } else {
310+ write ! ( f, " + " ) ?;
336311}
337312 script. full_name ( ) . fmt ( f) ?;
338313}
339314}
340- Ok ( ( ) )
315+ write ! ( f , ")" )
341316}
342317}
343318
@@ -361,7 +336,7 @@ impl UnicodeScript for char {
361336
362337/// Iterator over scripts in a [ScriptExtension].
363338///
364- /// Can be obtainedia [ScriptExtension::iter()]
339+ /// Can be obtained via [ScriptExtension::iter()]
365340pub struct ScriptIterator {
366341ext : ScriptExtension ,
367342}
@@ -370,26 +345,31 @@ impl Iterator for ScriptIterator {
370345type Item =Script ;
371346
372347fn next ( & mut self ) ->Option < Script > {
373- if self . ext . is_common_or_inherited ( ) {
374- let common =self . ext . common ;
375- self . ext =ScriptExtension :: new_unknown ( ) ;
376- if common{
377- Some ( Script :: Common )
378- } else {
379- Some ( Script :: Inherited )
380- }
348+ if self . ext . is_inherited ( ) {
349+ // If `self.ext` is both Inherited and Common, this
350+ // temporarily constructs an invalid ScriptExtension. We don't
351+ // use `self.ext` for anything other than iterating over bits,
352+ // so this is okay.
353+ self . ext . third &= !ScriptExtension :: INHERITED_MASK ;
354+ Some ( Script :: Inherited )
355+ } else if self . ext . is_common ( ) {
356+ self . ext . third &= !ScriptExtension :: COMMON_MASK ;
357+ Some ( Script :: Common )
358+
381359// Are there bits left in the first chunk?
382360} else if self . ext . first !=0 {
383361// Find the next bit
384362let bit =self . ext . first . trailing_zeros ( ) ;
385363// unset just that bit
386364self . ext . first &= !( 1 << bit) ;
387365Some ( Script :: for_integer ( bitas u8 ) )
366+
388367// Are there bits left in the second chunk?
389368} else if self . ext . second !=0 {
390369let bit =self . ext . second . trailing_zeros ( ) ;
391370self . ext . second &= !( 1 << bit) ;
392371Some ( Script :: for_integer ( 64 + bitas u8 ) )
372+
393373// Are there bits left in the third chunk?
394374} else if self . ext . third !=0 {
395375let bit =self . ext . third . trailing_zeros ( ) ;
@@ -429,8 +409,8 @@ mod tests {
429409 seen_scripts. insert ( script) ;
430410 seen_exts. insert ( ext) ;
431411assert_eq ! ( scriptas u8 , bit) ;
432- assert ! ( ! ScriptExtension :: new_common( ) . intersection( ext) . is_empty( ) ) ;
433- assert ! ( ! ScriptExtension :: new_inherited( )
412+ assert ! ( ScriptExtension :: new_common( ) . intersection( ext) . is_empty( ) ) ;
413+ assert ! ( ScriptExtension :: new_inherited( )
434414. intersection( ext)
435415. is_empty( ) ) ;
436416assert ! ( ScriptExtension :: new_unknown( ) . intersection( ext) . is_empty( ) ) ;
@@ -443,13 +423,13 @@ mod tests {
443423fn test_specific ( ) {
444424let s ="सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे." ;
445425let ext =ScriptExtension :: for_str ( s) ;
446- assert_eq ! ( ext , script_extensions:: DEVA ) ;
426+ assert ! ( script_extensions:: DEVA . is_subset_or_equal ( ext ) ) ;
447427println ! (
448- "{:? }" ,
428+ "{}" ,
449429 script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
450430) ;
451431println ! (
452- "{:? }" ,
432+ "{}" ,
453433 ext. intersection(
454434 script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
455435)
@@ -461,7 +441,9 @@ mod tests {
461441let u = ext. union ( Script :: Dogra . into ( ) ) ;
462442assert_eq ! (
463443 u. intersection(
464- script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
444+ script_extensions:: COMMON . union (
445+ script_extensions:: DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
446+ )
465447) ,
466448 u
467449) ;
@@ -499,6 +481,68 @@ mod tests {
499481assert ! ( scr. is_err( ) ) ;
500482}
501483
484+ #[ test]
485+ fn test_subsets_and_iter ( ) {
486+ let cases: & [ ( ScriptExtension , & [ Script ] ) ] =& [
487+ ( ScriptExtension :: new_inherited ( ) , & [ Script :: Inherited ] ) ,
488+ ( ScriptExtension :: new_common ( ) , & [ Script :: Common ] ) ,
489+ (
490+ ScriptExtension :: new_inherited ( ) . union ( script_extensions:: COMMON ) ,
491+ & [ Script :: Inherited , Script :: Common ] ,
492+ ) ,
493+ (
494+ ScriptExtension :: new_inherited ( )
495+ . union ( script_extensions:: COMMON )
496+ . union ( script_extensions:: LATIN ) ,
497+ & [ Script :: Inherited , Script :: Common , Script :: Latin ] ,
498+ ) ,
499+ (
500+ ScriptExtension :: new_inherited ( )
501+ . union ( script_extensions:: COMMON )
502+ . union ( script_extensions:: LATIN )
503+ . union ( script_extensions:: CYRILLIC ) ,
504+ & [
505+ Script :: Inherited ,
506+ Script :: Common ,
507+ Script :: Cyrillic ,
508+ Script :: Latin ,
509+ ] ,
510+ ) ,
511+ ] ;
512+ for & ( full_extension, component_scripts) in cases{
513+ for & scriptin component_scripts. iter ( ) {
514+ assert ! ( full_extension. contains_script( script) ) ;
515+ let cur = script. into ( ) ;
516+ let intersect = full_extension. intersection ( cur) ;
517+ let union = full_extension. union ( cur) ;
518+ assert_eq ! ( intersect, cur) ;
519+ assert_eq ! ( union , full_extension) ;
520+
521+ assert ! ( cur. is_subset_or_equal( cur) ) ;
522+ assert ! ( cur. is_subset_or_equal( intersect) ) ;
523+ assert ! ( cur. is_subset_or_equal( full_extension) ) ;
524+ assert ! ( cur. is_subset_or_equal( union ) ) ;
525+ if component_scripts. len ( ) >1 {
526+ assert ! ( !full_extension. is_subset_or_equal( cur) ) ;
527+ assert ! ( !union . is_subset_or_equal( cur) ) ;
528+ }
529+
530+ assert ! ( intersect. is_subset_or_equal( intersect) ) ;
531+ assert ! ( intersect. is_subset_or_equal( full_extension) ) ;
532+ assert ! ( intersect. is_subset_or_equal( union ) ) ;
533+ if component_scripts. len ( ) >1 {
534+ assert ! ( !full_extension. is_subset_or_equal( intersect) ) ;
535+ assert ! ( !union . is_subset_or_equal( intersect) ) ;
536+ }
537+
538+ assert ! ( union . is_subset_or_equal( union ) ) ;
539+ }
540+ let scripts = component_scripts. iter ( ) . cloned ( ) . collect :: < Vec < _ > > ( ) ;
541+ let scripts_iterated = full_extension. iter ( ) . collect :: < Vec < _ > > ( ) ;
542+ assert_eq ! ( scripts, scripts_iterated) ;
543+ }
544+ }
545+
502546#[ cfg( feature ="bench" ) ]
503547#[ bench]
504548fn bench_script_intersection ( b : & mut Bencher ) {