Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit80526e0

Browse files
committed
ScriptExtension now can represent combinations of scripts and Inherited/Common
After finding that the iterator, after a union with Common, only yielded a single element, I overhauled the representation and semantics of `ScriptExtension`. This is a breaking change for most APIs.

Summary of improvements to `ScriptExtension`:
* Improved representation to be able to track multiple scripts as well as Inherited/Common
* "Inherited" and "Common" no longer intersect with everything and have no subset/superset relationship between them.
* `for_str` is a union, not intersection, of all chars
* Added `is_subset_or_equal()` for easier comparison of unions and intersections
* Changed `Debug` impl to a vanilla derive to allow comparing hex bits
* Fixed `Display` impl to properly show each script, separated by pluses
* New test for iterator
1 parent1f84c2e commit80526e0

File tree

1 file changed

+129
-85
lines changed

1 file changed

+129
-85
lines changed

‎src/lib.rs‎

Lines changed: 129 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,10 @@ impl From<Script> for ScriptExtension {
8383
implTryFrom<ScriptExtension>forScript{
8484
typeError =();
8585
fntry_from(ext:ScriptExtension) ->Result<Self,()>{
86-
if ext.is_common_or_inherited(){
87-
if ext.common{
88-
Ok(Script::Common)
89-
}else{
90-
Ok(Script::Inherited)
91-
}
86+
if ext.is_common(){
87+
Ok(Script::Common)
88+
}elseif ext.is_inherited(){
89+
Ok(Script::Inherited)
9290
}elseif ext.is_empty(){
9391
Ok(Script::Unknown)
9492
}else{
@@ -131,94 +129,88 @@ impl fmt::Display for Script {
131129
}
132130
}
133131

134-
#[derive(Clone,Copy,PartialEq,Eq,Hash)]
132+
#[derive(Clone,Copy,PartialEq,Eq,Hash,Debug)]
135133
#[non_exhaustive]
136134
/// A value for the `Script_Extension` property
137135
///
138136
/// [`ScriptExtension`] is one or more [`Script`]
139137
///
140138
/// This is essentially an optimized version of `Vec<Script>` that uses bitfields
141139
pubstructScriptExtension{
142-
// A bitset for the first64scripts
140+
// A bitset for the first scripts [0..64]
143141
first:u64,
144-
// A bitset for the scripts65-128
142+
// A bitset for the scripts [64..128]
145143
second:u64,
146-
// A bitset for scripts after 128
144+
// A bitset for the scripts [128..NEXT_SCRIPT]
145+
// The last 2 bits represent whether Common and Inherited is included
146+
// * Bit 62 (the 63rd bit, mask `1 << 62`) indicates whether it includes Common
147+
// * Bit 63 (the 64th bit, mask `1 << 63`) indicates whether it includes Inherited
147148
third:u64,
148-
// Both Common and Inherited are represented by all used bits being set,
149-
// this flag lets us distinguish the two.
150-
common:bool,
151149
}
152150

153151
implScriptExtension{
154152
// We don't use the complete u64 of `third`, so the "all" value is not just u64::MAX
155153
// Instead, we take the number of the next (unused) script bit, subtract 128 to bring
156154
// it in the range of `third`, create a u64 with just that bit set, and subtract 1
157155
// to create one with all the lower bits set.
158-
constTHIRD_MAX:u64 =((1 <<(NEXT_SCRIPT -128)) -1);
156+
const _CHECK:() =assert!(NEXT_SCRIPT -128 <63);
157+
constCOMMON_MASK:u64 =(1 <<62);// 63rd bit
158+
constINHERITED_MASK:u64 =(1 <<63);// 64th bit
159159

160160
pub(crate)constfnnew(first:u64,second:u64,third:u64) ->Self{
161161
ScriptExtension{
162162
first,
163163
second,
164164
third,
165-
common:false,
166165
}
167166
}
168167

168+
/// Returns a ScriptExtension containing only Common.
169169
pub(crate)constfnnew_common() ->Self{
170170
ScriptExtension{
171-
first: u64::MAX,
172-
second: u64::MAX,
173-
third:Self::THIRD_MAX,
174-
common:true,
171+
first:0,
172+
second:0,
173+
third:Self::COMMON_MASK,
175174
}
176175
}
177176

177+
/// Returns a ScriptExtension containing only Inherited.
178178
pub(crate)constfnnew_inherited() ->Self{
179179
ScriptExtension{
180-
first: u64::MAX,
181-
second: u64::MAX,
182-
third:Self::THIRD_MAX,
183-
common:false,
180+
first:0,
181+
second:0,
182+
third:Self::INHERITED_MASK,
184183
}
185184
}
186185

186+
/// Returns an empty ScriptExtension
187187
pub(crate)constfnnew_unknown() ->Self{
188188
ScriptExtension{
189189
first:0,
190190
second:0,
191191
third:0,
192-
common:false,
193192
}
194193
}
195194

196-
constfnis_common_or_inherited(self) ->bool{
197-
(self.first == u64::MAX)&(self.second == u64::MAX)&(self.third ==Self::THIRD_MAX)
198-
}
199-
200195
/// Checks if the script extension is Common
201196
pubconstfnis_common(self) ->bool{
202-
self.is_common_or_inherited()&self.common
197+
(self.third&Self::COMMON_MASK) !=0
203198
}
204199

205200
/// Checks if the script extension is Inherited
206201
pubconstfnis_inherited(self) ->bool{
207-
self.is_common_or_inherited()&!self.common
202+
(self.third&Self::INHERITED_MASK) !=0
208203
}
209204

210205
/// Checks if the script extension is empty (unknown)
211206
pubconstfnis_empty(self) ->bool{
212207
(self.first ==0)&(self.second ==0)&(self.third ==0)
213208
}
214209

215-
/// Returns the number of scripts in the script extension
210+
/// Returns the number of scripts in the script extension. Common and
211+
/// Inherited, if present, are included and counted independently in the return value.
216212
pubfnlen(self) ->usize{
217-
ifself.is_common_or_inherited(){
218-
1
219-
}else{
220-
(self.first.count_ones() +self.second.count_ones() +self.third.count_ones())asusize
221-
}
213+
(self.first.count_ones() +self.second.count_ones() +self.third.count_ones())asusize
222214
}
223215

224216
/// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
@@ -233,54 +225,47 @@ impl ScriptExtension {
233225

234226
/// Find the intersection between two ScriptExtensions. Returns Unknown if things
235227
/// do not intersect.
236-
///
237-
/// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
238-
/// everything, the intersection of `Common` and `Inherited` is `Inherited`
239228
pubconstfnintersection(self,other:Self) ->Self{
240229
let first =self.first& other.first;
241230
let second =self.second& other.second;
242231
let third =self.third& other.third;
243-
let common =self.common& other.common;
244232
ScriptExtension{
245233
first,
246234
second,
247235
third,
248-
common,
249236
}
250237
}
251238

252239
/// Find the union between two ScriptExtensions.
253-
///
254-
/// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
255-
/// everything, the union of `Common` and `Inherited` is `Common`
256240
pubconstfnunion(self,other:Self) ->Self{
257241
let first =self.first | other.first;
258242
let second =self.second | other.second;
259243
let third =self.third | other.third;
260-
let common =self.common | other.common;
261244
ScriptExtension{
262245
first,
263246
second,
264247
third,
265-
common,
266248
}
267249
}
268250

251+
/// Returns true if and only if all members of `self` are present in `other`.
252+
pubfnis_subset_or_equal(self,other:Self) ->bool{
253+
self.intersection(other) ==self &&self.union(other) == other
254+
}
255+
269256
/// Check if this ScriptExtension contains the given script
270-
///
271-
/// Should be used with specific scripts only, this will
272-
/// return `true` if `self` is not `Unknown` and `script` is
273-
/// `Common` or `Inherited`
274257
pubfncontains_script(self,script:Script) ->bool{
275258
!self.intersection(script.into()).is_empty()
276259
}
277260

278-
/// Get the intersection of script extensions of all characters
279-
/// in a string.
261+
/// Get the script extension representing the union of all scripts for
262+
/// the characters in a string.
263+
///
264+
/// This is likely to decay to Unknown. You probably want to use `for_str_union()` instead.
280265
pubfnfor_str(x:&str) ->Self{
281-
letmut ext =ScriptExtension::default();
266+
letmut ext =ScriptExtension::new_unknown();
282267
for chin x.chars(){
283-
ext.intersect_with(ch.into());
268+
ext = ext.union(ch.into());
284269
}
285270
ext
286271
}
@@ -311,33 +296,23 @@ impl From<&'_ str> for ScriptExtension {
311296
}
312297
}
313298

314-
impl fmt::DebugforScriptExtension{
315-
fnfmt(&self,f:&mut fmt::Formatter) -> fmt::Result{
316-
write!(f,"ScriptExtension(")?;
317-
fmt::Display::fmt(self, f)?;
318-
write!(f,")")
319-
}
320-
}
321-
322299
impl fmt::DisplayforScriptExtension{
323300
fnfmt(&self,f:&mut fmt::Formatter) -> fmt::Result{
324-
ifself.is_common(){
325-
write!(f,"Common")?;
326-
}elseifself.is_inherited(){
327-
write!(f,"Inherited")?;
328-
}elseifself.is_empty(){
301+
write!(f,"ScriptExtension(")?;
302+
ifself.is_empty(){
329303
write!(f,"Unknown")?;
330304
}else{
331305
letmut first =true;
332306
for scriptinself.iter(){
333-
if !first{
334-
write!(f," + ")?;
307+
if first{
335308
first =false;
309+
}else{
310+
write!(f," + ")?;
336311
}
337312
script.full_name().fmt(f)?;
338313
}
339314
}
340-
Ok(())
315+
write!(f,")")
341316
}
342317
}
343318

@@ -361,7 +336,7 @@ impl UnicodeScript for char {
361336

362337
/// Iterator over scripts in a [ScriptExtension].
363338
///
364-
/// Can be obtainedia [ScriptExtension::iter()]
339+
/// Can be obtained via [ScriptExtension::iter()]
365340
pubstructScriptIterator{
366341
ext:ScriptExtension,
367342
}
@@ -370,26 +345,31 @@ impl Iterator for ScriptIterator {
370345
typeItem =Script;
371346

372347
fnnext(&mutself) ->Option<Script>{
373-
ifself.ext.is_common_or_inherited(){
374-
let common =self.ext.common;
375-
self.ext =ScriptExtension::new_unknown();
376-
if common{
377-
Some(Script::Common)
378-
}else{
379-
Some(Script::Inherited)
380-
}
348+
ifself.ext.is_inherited(){
349+
// If `self.ext` is both Inherited and Common, this
350+
// temporarily constructs an invalid ScriptExtension. We don't
351+
// use `self.ext` for anything other than iterating over bits,
352+
// so this is okay.
353+
self.ext.third &= !ScriptExtension::INHERITED_MASK;
354+
Some(Script::Inherited)
355+
}elseifself.ext.is_common(){
356+
self.ext.third &= !ScriptExtension::COMMON_MASK;
357+
Some(Script::Common)
358+
381359
// Are there bits left in the first chunk?
382360
}elseifself.ext.first !=0{
383361
// Find the next bit
384362
let bit =self.ext.first.trailing_zeros();
385363
// unset just that bit
386364
self.ext.first &= !(1 << bit);
387365
Some(Script::for_integer(bitasu8))
366+
388367
// Are there bits left in the second chunk?
389368
}elseifself.ext.second !=0{
390369
let bit =self.ext.second.trailing_zeros();
391370
self.ext.second &= !(1 << bit);
392371
Some(Script::for_integer(64 + bitasu8))
372+
393373
// Are there bits left in the third chunk?
394374
}elseifself.ext.third !=0{
395375
let bit =self.ext.third.trailing_zeros();
@@ -429,8 +409,8 @@ mod tests {
429409
seen_scripts.insert(script);
430410
seen_exts.insert(ext);
431411
assert_eq!(scriptasu8, bit);
432-
assert!(!ScriptExtension::new_common().intersection(ext).is_empty());
433-
assert!(!ScriptExtension::new_inherited()
412+
assert!(ScriptExtension::new_common().intersection(ext).is_empty());
413+
assert!(ScriptExtension::new_inherited()
434414
.intersection(ext)
435415
.is_empty());
436416
assert!(ScriptExtension::new_unknown().intersection(ext).is_empty());
@@ -443,13 +423,13 @@ mod tests {
443423
fntest_specific(){
444424
let s ="सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";
445425
let ext =ScriptExtension::for_str(s);
446-
assert_eq!(ext,script_extensions::DEVA);
426+
assert!(script_extensions::DEVA.is_subset_or_equal(ext));
447427
println!(
448-
"{:?}",
428+
"{}",
449429
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
450430
);
451431
println!(
452-
"{:?}",
432+
"{}",
453433
ext.intersection(
454434
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
455435
)
@@ -461,7 +441,9 @@ mod tests {
461441
let u = ext.union(Script::Dogra.into());
462442
assert_eq!(
463443
u.intersection(
464-
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
444+
script_extensions::COMMON.union(
445+
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
446+
)
465447
),
466448
u
467449
);
@@ -499,6 +481,68 @@ mod tests {
499481
assert!(scr.is_err());
500482
}
501483

484+
#[test]
485+
fntest_subsets_and_iter(){
486+
let cases:&[(ScriptExtension,&[Script])] =&[
487+
(ScriptExtension::new_inherited(),&[Script::Inherited]),
488+
(ScriptExtension::new_common(),&[Script::Common]),
489+
(
490+
ScriptExtension::new_inherited().union(script_extensions::COMMON),
491+
&[Script::Inherited,Script::Common],
492+
),
493+
(
494+
ScriptExtension::new_inherited()
495+
.union(script_extensions::COMMON)
496+
.union(script_extensions::LATIN),
497+
&[Script::Inherited,Script::Common,Script::Latin],
498+
),
499+
(
500+
ScriptExtension::new_inherited()
501+
.union(script_extensions::COMMON)
502+
.union(script_extensions::LATIN)
503+
.union(script_extensions::CYRILLIC),
504+
&[
505+
Script::Inherited,
506+
Script::Common,
507+
Script::Cyrillic,
508+
Script::Latin,
509+
],
510+
),
511+
];
512+
for&(full_extension, component_scripts)in cases{
513+
for&scriptin component_scripts.iter(){
514+
assert!(full_extension.contains_script(script));
515+
let cur = script.into();
516+
let intersect = full_extension.intersection(cur);
517+
let union = full_extension.union(cur);
518+
assert_eq!(intersect, cur);
519+
assert_eq!(union, full_extension);
520+
521+
assert!(cur.is_subset_or_equal(cur));
522+
assert!(cur.is_subset_or_equal(intersect));
523+
assert!(cur.is_subset_or_equal(full_extension));
524+
assert!(cur.is_subset_or_equal(union));
525+
if component_scripts.len() >1{
526+
assert!(!full_extension.is_subset_or_equal(cur));
527+
assert!(!union.is_subset_or_equal(cur));
528+
}
529+
530+
assert!(intersect.is_subset_or_equal(intersect));
531+
assert!(intersect.is_subset_or_equal(full_extension));
532+
assert!(intersect.is_subset_or_equal(union));
533+
if component_scripts.len() >1{
534+
assert!(!full_extension.is_subset_or_equal(intersect));
535+
assert!(!union.is_subset_or_equal(intersect));
536+
}
537+
538+
assert!(union.is_subset_or_equal(union));
539+
}
540+
let scripts = component_scripts.iter().cloned().collect::<Vec<_>>();
541+
let scripts_iterated = full_extension.iter().collect::<Vec<_>>();
542+
assert_eq!(scripts, scripts_iterated);
543+
}
544+
}
545+
502546
#[cfg(feature ="bench")]
503547
#[bench]
504548
fnbench_script_intersection(b:&mutBencher){

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp