|
| 1 | +//! <http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf> Annex B |
| 2 | +
|
| 3 | +use core::{ |
| 4 | + convert::{TryFrom,TryInto}, |
| 5 | + iter::FusedIterator, |
| 6 | +}; |
| 7 | + |
| 8 | +use tinyvec::ArrayVec; |
| 9 | + |
| 10 | +// § B.1.1 |
| 11 | + |
| 12 | +usecrate::normalize::hangul_constants::{ |
| 13 | +L_BASE,L_LAST,N_COUNT,S_BASE,S_COUNT,T_BASE,T_COUNT,T_LAST,V_BASE,V_LAST, |
| 14 | +}; |
| 15 | + |
| 16 | +// § B.1.2 |
| 17 | + |
/// Reports whether `t` falls in one of the two code point ranges this module
/// treats as old Hangul jongseong: U+11C3..=U+11FF or U+D7CB..=U+D7FB.
fn is_old_jongseong(t: char) -> bool {
    let cp = u32::from(t);
    let in_low_range = (0x11C3..=0x11FF).contains(&cp);
    let in_high_range = (0xD7CB..=0xD7FB).contains(&cp);
    in_low_range || in_high_range
}
| 24 | + |
/// Iterator that decomposes modern Hangul LV syllables immediately followed by old Hangul T jamo
/// into 3-character L V T sequences, as specified in KS X 1026-1 annex B.1.5.
///
/// All other characters are passed through unchanged.
#[derive(Clone, Debug)]
pub struct RecomposeHangul<I> {
    /// Medial vowel of a decomposed LV syllable; when set, it is yielded
    /// (and cleared) by the next call to `next()`.
    v: Option<char>,
    /// Character yielded by inner iterator in last call to its `next()`;
    /// acts as a one-character lookahead that has not been emitted yet.
    last: Option<char>,
    /// Underlying source of characters.
    inner: I,
}
| 35 | + |
| 36 | +impl<I:Iterator<Item =char>>IteratorforRecomposeHangul<I>{ |
| 37 | +typeItem =char; |
| 38 | + |
| 39 | +fnnext(&mutself) ->Option<Self::Item>{ |
| 40 | +ifletSome(v) =self.v{ |
| 41 | +// If an LV syllable was decomposed in the last call to `next`, |
| 42 | +// yield its medial vowel. |
| 43 | +self.v =None; |
| 44 | +Some(v) |
| 45 | +}else{ |
| 46 | +let prev =self.last; |
| 47 | +self.last =self.inner.next(); |
| 48 | + |
| 49 | +iflet(Some(prev),Some(next)) =(prev,self.last){ |
| 50 | +let s_index = u32::from(prev).wrapping_sub(S_BASE); |
| 51 | +if s_index <S_COUNT && s_index %T_COUNT ==0 &&is_old_jongseong(next){ |
| 52 | +// We have an LV syllable followed by an old jongseong, decompose into L V |
| 53 | +let l:char =(L_BASE + s_index /N_COUNT).try_into().unwrap(); |
| 54 | +self.v =Some((V_BASE +(s_index %N_COUNT) /T_COUNT).try_into().unwrap()); |
| 55 | +returnSome(l); |
| 56 | +} |
| 57 | +} |
| 58 | + |
| 59 | + prev |
| 60 | +} |
| 61 | +} |
| 62 | + |
| 63 | +#[inline] |
| 64 | +fnsize_hint(&self) ->(usize,Option<usize>){ |
| 65 | +let(inner_lo, inner_hi) =self.inner.size_hint(); |
| 66 | +let add_factor:usize =self.v.map_or(0, |_|1) +self.last.map_or(0, |_|1); |
| 67 | +( |
| 68 | + inner_lo.saturating_add(add_factor), |
| 69 | + inner_hi |
| 70 | +.and_then(|h| h.checked_mul(2)) |
| 71 | +.and_then(|h| h.checked_add(add_factor)), |
| 72 | +) |
| 73 | +} |
| 74 | +} |
| 75 | + |
| 76 | +impl<I:Iterator<Item =char> +FusedIterator>FusedIteratorforRecomposeHangul<I>{} |
| 77 | + |
| 78 | +impl<I:Iterator<Item =char>>RecomposeHangul<I>{ |
| 79 | +#[inline] |
| 80 | +pub(crate)fnnew(mutiter:I) ->Self{ |
| 81 | +RecomposeHangul{ |
| 82 | +v:None, |
| 83 | +last: iter.next(), |
| 84 | +inner: iter, |
| 85 | +} |
| 86 | +} |
| 87 | +} |
| 88 | + |
| 89 | +// § B.2.1 |
| 90 | + |
/// Conjoining jamo for each Hangul compatibility letter.
///
/// Indexed by `code_point - 0x3131` for code points in `0x3131..=0x318E`
/// (94 entries).
static CP_JAMO: [char; 94] = [
    '\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}', '\u{1104}',
    '\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}', '\u{111A}',
    '\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}', '\u{110C}',
    '\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{1161}', '\u{1162}',
    '\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}',
    '\u{116B}', '\u{116C}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}',
    '\u{1173}', '\u{1174}', '\u{1175}', '\u{1160}', '\u{1114}', '\u{1115}', '\u{11C7}', '\u{11C8}',
    '\u{11CC}', '\u{11CE}', '\u{11D3}', '\u{11D7}', '\u{11D9}', '\u{111C}', '\u{11DD}', '\u{11DF}',
    '\u{111D}', '\u{111E}', '\u{1120}', '\u{1122}', '\u{1123}', '\u{1127}', '\u{1129}', '\u{112B}',
    '\u{112C}', '\u{112D}', '\u{112E}', '\u{112F}', '\u{1132}', '\u{1136}', '\u{1140}', '\u{1147}',
    '\u{114C}', '\u{11F1}', '\u{11F2}', '\u{1157}', '\u{1158}', '\u{1159}', '\u{1184}', '\u{1185}',
    '\u{1188}', '\u{1191}', '\u{1192}', '\u{1194}', '\u{119E}', '\u{11A1}',
];
| 105 | + |
| 106 | +// § B.2.2 |
| 107 | + |
/// Conjoining jamo for each halfwidth Hangul letter.
///
/// Indexed by `code_point - 0xFFA0` for code points in `0xFFA0..=0xFFDF`
/// (64 entries).
///
/// NOTE(review): twelve slots (U+FFBF..=U+FFC1, U+FFC8..=U+FFC9, U+FFD0..=U+FFD1,
/// U+FFD8..=U+FFD9, U+FFDD..=U+FFDF) hold their own source code point rather than a
/// conjoining jamo, presumably because no halfwidth letter is assigned there.
/// Consumers must not assume every entry is a jamo.
static HW_JAMO: [char; 64] = [
    '\u{1160}', '\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}',
    '\u{1104}', '\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}',
    '\u{111A}', '\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}',
    '\u{110C}', '\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{FFBF}',
    '\u{FFC0}', '\u{FFC1}', '\u{1161}', '\u{1162}', '\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}',
    '\u{FFC8}', '\u{FFC9}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}', '\u{116B}', '\u{116C}',
    '\u{FFD0}', '\u{FFD1}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}',
    '\u{FFD8}', '\u{FFD9}', '\u{1173}', '\u{1174}', '\u{1175}', '\u{FFDD}', '\u{FFDE}', '\u{FFDF}',
];
| 118 | + |
| 119 | +// § B.2.3 |
| 120 | + |
/// Conjoining jamo for each parenthesized or circled Hangul letter.
///
/// Shared by two ranges of 14 code points each: indexed by `code_point - 0x3200`
/// for `0x3200..=0x320D` (parenthesized) and by `code_point - 0x3260`
/// for `0x3260..=0x326D` (circled).
static PC_JAMO: [char; 14] = [
    '\u{1100}', '\u{1102}', '\u{1103}', '\u{1105}', '\u{1106}', '\u{1107}', '\u{1109}', '\u{110B}',
    '\u{110C}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}',
];
| 125 | + |
| 126 | +// § B.2.4 |
| 127 | + |
/// Iterator that decomposes compatibility characters containing Hangul jamo
/// in a manner that avoids introducing new nonstandard jamo sequences,
/// as specified in KS X 1026-1 annex B.2.4.
#[derive(Clone, Debug)]
pub struct NormalizeJamoKdkc<I> {
    // Underlying source of characters.
    inner: I,
    // Buffer for when a character normalizes into multiple.
    // Characters are pushed to and popped from the end,
    // so they are pushed in reverse of the order they must be yielded.
    // Capacity 3 is sufficient: the longest possible expansion
    // is for a parenthesized choseong like U+3200,
    // which expands into ['(', <choseong>, '\u{1160}', ')'] (length 4),
    // and the first of those is yielded immediately rather than buffered.
    // (There are no parenthesized jungseong or jongseong.)
    buf: ArrayVec<[char; 3]>,
}
| 142 | + |
| 143 | +impl<I:Iterator<Item =char>>IteratorforNormalizeJamoKdkc<I>{ |
| 144 | +typeItem =char; |
| 145 | + |
| 146 | +fnnext(&mutself) ->Option<Self::Item>{ |
| 147 | +ifletSome(c) =self.buf.pop(){ |
| 148 | +// Empty buffer before yielding from underlying iterator. |
| 149 | +Some(c) |
| 150 | +}else{ |
| 151 | +let ch =self.inner.next()?; |
| 152 | +// Whether ch is a parenthesized Hangul letter |
| 153 | +letmut pf =false; |
| 154 | + |
| 155 | +let uch:u32 = ch.into(); |
| 156 | +let base_jamo:char =match uch{ |
| 157 | +// Hangul compatibility letter |
| 158 | +0x3131..=0x318E =>CP_JAMO[usize::try_from(uch -0x3131).unwrap()], |
| 159 | + |
| 160 | +// Parenthesized Hangul letter |
| 161 | +0x3200..=0x320D =>{ |
| 162 | + pf =true; |
| 163 | +self.buf.push(')'); |
| 164 | +PC_JAMO[usize::try_from(uch -0x3200).unwrap()] |
| 165 | +} |
| 166 | + |
| 167 | +// Circled Hangul letter |
| 168 | +0x3260..=0x326D =>PC_JAMO[usize::try_from(uch -0x3260).unwrap()], |
| 169 | + |
| 170 | +// Halfwidth Hangul letter |
| 171 | +0xFFA0..=0xFFDF =>HW_JAMO[usize::try_from(uch -0xFFA0).unwrap()], |
| 172 | + |
| 173 | + _ =>returnSome(ch), |
| 174 | +}; |
| 175 | + |
| 176 | +// Insert fillers |
| 177 | +let first_ret:char =match base_jamo.into(){ |
| 178 | +// `base_jamo` is choseong, yield a jungseong filler after |
| 179 | +L_BASE..=L_LAST =>{ |
| 180 | +self.buf.push('\u{1160}'); |
| 181 | + base_jamo |
| 182 | +} |
| 183 | + |
| 184 | +// `base_jamo` is jungseong, yield a choseong filler before |
| 185 | +V_BASE..=V_LAST =>{ |
| 186 | +self.buf.push(base_jamo); |
| 187 | +'\u{115F}' |
| 188 | +} |
| 189 | + |
| 190 | +// `base_jamo` is jongseong, yield a choseong and a jungseong filler before |
| 191 | +T_BASE..=T_LAST =>{ |
| 192 | +self.buf.push(base_jamo); |
| 193 | +self.buf.push('\u{1160}'); |
| 194 | +'\u{115F}' |
| 195 | +} |
| 196 | + |
| 197 | + _ =>unreachable!("`base_jamo` shluld be a jamo, but is not"), |
| 198 | +}; |
| 199 | + |
| 200 | +if pf{ |
| 201 | +// Parenthesized Hangul letter, yield open paren before |
| 202 | +self.buf.push(first_ret); |
| 203 | +Some('(') |
| 204 | +}else{ |
| 205 | +Some(first_ret) |
| 206 | +} |
| 207 | +} |
| 208 | +} |
| 209 | + |
| 210 | +#[inline] |
| 211 | +fnsize_hint(&self) ->(usize,Option<usize>){ |
| 212 | +let(inner_lo, inner_hi) =self.inner.size_hint(); |
| 213 | +let add_factor:usize =self.buf.len(); |
| 214 | +( |
| 215 | + inner_lo.saturating_add(add_factor), |
| 216 | + inner_hi |
| 217 | +.and_then(|h| h.checked_mul(4))// Why 4? See comment on `buf` field |
| 218 | +.and_then(|h| h.checked_add(add_factor)), |
| 219 | +) |
| 220 | +} |
| 221 | +} |
| 222 | + |
| 223 | +impl<I:Iterator<Item =char> +FusedIterator>FusedIteratorforNormalizeJamoKdkc<I>{} |
| 224 | + |
| 225 | +impl<I:Iterator<Item =char>>NormalizeJamoKdkc<I>{ |
| 226 | +#[inline] |
| 227 | +pub(crate)fnnew(iter:I) ->Self{ |
| 228 | +NormalizeJamoKdkc{ |
| 229 | +inner: iter, |
| 230 | +buf:ArrayVec::new(), |
| 231 | +} |
| 232 | +} |
| 233 | +} |