Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitefd900b

Browse files
Support KS X 1026-1
1 parentac8fa20 commitefd900b

File tree

7 files changed

+503
-29
lines changed

7 files changed

+503
-29
lines changed

‎.github/workflows/rust.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ env:
1111
CARGO_TERM_COLOR:always
1212
RUST_BACKTRACE:1
1313
RUSTFLAGS:-D warnings
14-
RUSTDOCFLAGS:-D warnings --cfg docsrs
14+
RUSTDOCFLAGS:-D warnings
1515

1616
jobs:
1717
build:
@@ -44,6 +44,8 @@ jobs:
4444
run:cd $(find target/package/ -maxdepth 1 -mindepth 1 -type d) && cargo test --no-default-features
4545
-name:Build docs
4646
if:matrix.rust == 'nightly'
47+
env:
48+
RUSTDOCFLAGS:-D warnings --cfg docsrs
4749
run:cargo doc --all-features --verbose
4850
-name:Check formatting
4951
if:matrix.rust == 'stable'

‎Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,8 @@ features = ["alloc"]
4040

4141
[features]
4242
default = ["std"]
43+
ks_x_1026-1 = []
4344
std = []
45+
46+
[package.metadata.docs.rs]
47+
rustc-args = ["--cfg","feature=\"ks_x_1026-1\""]

‎README.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ fn main() {
2626

2727
##crates.io
2828

29-
You can use this package in your project by adding the following
30-
to your`Cargo.toml`:
29+
You can use this package in your project by adding the following to your
30+
`Cargo.toml`:
3131

3232
```toml
3333
[dependencies]
@@ -36,4 +36,15 @@ unicode-normalization = "0.1.23"
3636

3737
##`no_std` +`alloc` support
3838

39-
This crate is completely`no_std` +`alloc` compatible. This can be enabled by disabling the`std` feature, i.e. specifying`default-features = false` for this crate on your`Cargo.toml`.
39+
This crate is completely`no_std` +`alloc` compatible. This can be enabled by
40+
disabling the`std` feature, i.e. specifying`default-features = false` for this
41+
crate on your`Cargo.toml`.
42+
43+
##KS X 1026-1
44+
45+
Korean Standard KS X 1026-1 ([Korean](https://standard.go.kr/KSCI/standardIntro/getStandardSearchView.do?ksNo=KSX1026-1),
46+
[English](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf)) is an ROK government
47+
standard that corrects some defects and makes some changes to the Unicode NFC,
48+
NFKC, and NFKD normalization forms for certain Korean characters. The
49+
`ks_x_1026-1` crate feature (disabled by default) adds methods to support these
50+
alternate normalizations.

‎src/ks_x_1026_1.rs

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
//! <http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf> Annex B
2+
3+
use core::{
4+
convert::{TryFrom,TryInto},
5+
iter::FusedIterator,
6+
};
7+
8+
use tinyvec::ArrayVec;
9+
10+
// § B.1.1
11+
12+
usecrate::normalize::hangul_constants::{
13+
L_BASE,L_LAST,N_COUNT,S_BASE,S_COUNT,T_BASE,T_COUNT,T_LAST,V_BASE,V_LAST,
14+
};
15+
16+
// § B.1.2
17+
18+
/// Returns `true` when `t` is one of the code points this module treats as an
/// old Hangul jongseong (trailing consonant): `U+11C3..=U+11FF` or
/// `U+D7CB..=U+D7FB`.
fn is_old_jongseong(t: char) -> bool {
    match t {
        '\u{11C3}'..='\u{11FF}' | '\u{D7CB}'..='\u{D7FB}' => true,
        _ => false,
    }
}
24+
25+
/// Iterator that decomposes modern Hangul LV syllables immediately followed by old Hangul T jamo
26+
/// into a 3-character L V T sequences, as specified in KS X 1026-1 annex B.1.5.
27+
#[derive(Clone,Debug)]
28+
pubstructRecomposeHangul<I>{
29+
/// Medial vowel of a decomposed LV syllable
30+
v:Option<char>,
31+
/// Character yielded by inner iterator in last call to its `next()`
32+
last:Option<char>,
33+
inner:I,
34+
}
35+
36+
impl<I:Iterator<Item =char>>IteratorforRecomposeHangul<I>{
37+
typeItem =char;
38+
39+
fnnext(&mutself) ->Option<Self::Item>{
40+
ifletSome(v) =self.v{
41+
// If an LV syllable was decomposed in the last call to `next`,
42+
// yield its medial vowel.
43+
self.v =None;
44+
Some(v)
45+
}else{
46+
let prev =self.last;
47+
self.last =self.inner.next();
48+
49+
iflet(Some(prev),Some(next)) =(prev,self.last){
50+
let s_index = u32::from(prev).wrapping_sub(S_BASE);
51+
if s_index <S_COUNT && s_index %T_COUNT ==0 &&is_old_jongseong(next){
52+
// We have an LV syllable followed by an old jongseong, decompose into L V
53+
let l:char =(L_BASE + s_index /N_COUNT).try_into().unwrap();
54+
self.v =Some((V_BASE +(s_index %N_COUNT) /T_COUNT).try_into().unwrap());
55+
returnSome(l);
56+
}
57+
}
58+
59+
prev
60+
}
61+
}
62+
63+
#[inline]
64+
fnsize_hint(&self) ->(usize,Option<usize>){
65+
let(inner_lo, inner_hi) =self.inner.size_hint();
66+
let add_factor:usize =self.v.map_or(0, |_|1) +self.last.map_or(0, |_|1);
67+
(
68+
inner_lo.saturating_add(add_factor),
69+
inner_hi
70+
.and_then(|h| h.checked_mul(2))
71+
.and_then(|h| h.checked_add(add_factor)),
72+
)
73+
}
74+
}
75+
76+
impl<I:Iterator<Item =char> +FusedIterator>FusedIteratorforRecomposeHangul<I>{}
77+
78+
impl<I:Iterator<Item =char>>RecomposeHangul<I>{
79+
#[inline]
80+
pub(crate)fnnew(mutiter:I) ->Self{
81+
RecomposeHangul{
82+
v:None,
83+
last: iter.next(),
84+
inner: iter,
85+
}
86+
}
87+
}
88+
89+
// § B.2.1

/// Replacement jamo for the Hangul compatibility letters `U+3131..=U+318E`,
/// indexed by `code point - 0x3131`.
static CP_JAMO: [char; 94] = [
    '\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}', '\u{1104}',
    '\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}', '\u{111A}',
    '\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}', '\u{110C}',
    '\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{1161}', '\u{1162}',
    '\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}',
    '\u{116B}', '\u{116C}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}',
    '\u{1173}', '\u{1174}', '\u{1175}', '\u{1160}', '\u{1114}', '\u{1115}', '\u{11C7}', '\u{11C8}',
    '\u{11CC}', '\u{11CE}', '\u{11D3}', '\u{11D7}', '\u{11D9}', '\u{111C}', '\u{11DD}', '\u{11DF}',
    '\u{111D}', '\u{111E}', '\u{1120}', '\u{1122}', '\u{1123}', '\u{1127}', '\u{1129}', '\u{112B}',
    '\u{112C}', '\u{112D}', '\u{112E}', '\u{112F}', '\u{1132}', '\u{1136}', '\u{1140}', '\u{1147}',
    '\u{114C}', '\u{11F1}', '\u{11F2}', '\u{1157}', '\u{1158}', '\u{1159}', '\u{1184}', '\u{1185}',
    '\u{1188}', '\u{1191}', '\u{1192}', '\u{1194}', '\u{119E}', '\u{11A1}',
];
105+
106+
// § B.2.2
107+
108+
/// Replacement characters for the halfwidth Hangul block `U+FFA0..=U+FFDF`,
/// indexed by `code point - 0xFFA0`.
///
/// Some entries (for example index 31, `'\u{FFBF}'`) are the halfwidth code
/// point itself rather than a conjoining jamo.
static HW_JAMO: [char; 64] = [
    '\u{1160}', '\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}',
    '\u{1104}', '\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}',
    '\u{111A}', '\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}',
    '\u{110C}', '\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{FFBF}',
    '\u{FFC0}', '\u{FFC1}', '\u{1161}', '\u{1162}', '\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}',
    '\u{FFC8}', '\u{FFC9}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}', '\u{116B}', '\u{116C}',
    '\u{FFD0}', '\u{FFD1}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}',
    '\u{FFD8}', '\u{FFD9}', '\u{1173}', '\u{1174}', '\u{1175}', '\u{FFDD}', '\u{FFDE}', '\u{FFDF}',
];
118+
119+
// § B.2.3
120+
121+
/// Choseong for the parenthesized (`U+3200..=U+320D`) and circled
/// (`U+3260..=U+326D`) Hangul letters, indexed by the offset from the start
/// of the respective range.
static PC_JAMO: [char; 14] = [
    '\u{1100}', '\u{1102}', '\u{1103}', '\u{1105}', '\u{1106}', '\u{1107}', '\u{1109}', '\u{110B}',
    '\u{110C}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}',
];
125+
126+
// § B.2.4
127+
128+
/// Iterator that decomposes compatibility characters containing Hangul jamo
129+
/// in a manner that avoids introducing new nonstandard jamo sequences,
130+
/// as specified in KS X 1026-1 annex B.2.4.
131+
#[derive(Clone,Debug)]
132+
pubstructNormalizeJamoKdkc<I>{
133+
inner:I,
134+
// Buffer for when a character normalizes into multiple.
135+
// Characters are pushed to and popped from the end.
136+
// Length 3 is sufficient, as the longest possible expansion
137+
// is for a parenthesized choseong like U+3200,
138+
// which expands into ['(', <choseong>, '\u{1160}', ')'] (length 4).
139+
// (There are no parenthesized jungseong or jongseong.)
140+
buf:ArrayVec<[char;3]>,
141+
}
142+
143+
impl<I:Iterator<Item =char>>IteratorforNormalizeJamoKdkc<I>{
144+
typeItem =char;
145+
146+
fnnext(&mutself) ->Option<Self::Item>{
147+
ifletSome(c) =self.buf.pop(){
148+
// Empty buffer before yielding from underlying iterator.
149+
Some(c)
150+
}else{
151+
let ch =self.inner.next()?;
152+
// Whether ch is a parenthesized Hangul letter
153+
letmut pf =false;
154+
155+
let uch:u32 = ch.into();
156+
let base_jamo:char =match uch{
157+
// Hangul compatibility letter
158+
0x3131..=0x318E =>CP_JAMO[usize::try_from(uch -0x3131).unwrap()],
159+
160+
// Parenthesized Hangul letter
161+
0x3200..=0x320D =>{
162+
pf =true;
163+
self.buf.push(')');
164+
PC_JAMO[usize::try_from(uch -0x3200).unwrap()]
165+
}
166+
167+
// Circled Hangul letter
168+
0x3260..=0x326D =>PC_JAMO[usize::try_from(uch -0x3260).unwrap()],
169+
170+
// Halfwidth Hangul letter
171+
0xFFA0..=0xFFDF =>HW_JAMO[usize::try_from(uch -0xFFA0).unwrap()],
172+
173+
_ =>returnSome(ch),
174+
};
175+
176+
// Insert fillers
177+
let first_ret:char =match base_jamo.into(){
178+
// `base_jamo` is choseong, yield a jungseong filler after
179+
L_BASE..=L_LAST =>{
180+
self.buf.push('\u{1160}');
181+
base_jamo
182+
}
183+
184+
// `base_jamo` is jungseong, yield a choseong filler before
185+
V_BASE..=V_LAST =>{
186+
self.buf.push(base_jamo);
187+
'\u{115F}'
188+
}
189+
190+
// `base_jamo` is jongseong, yield a choseong and a jungseong filler before
191+
T_BASE..=T_LAST =>{
192+
self.buf.push(base_jamo);
193+
self.buf.push('\u{1160}');
194+
'\u{115F}'
195+
}
196+
197+
_ =>unreachable!("`base_jamo` shluld be a jamo, but is not"),
198+
};
199+
200+
if pf{
201+
// Parenthesized Hangul letter, yield open paren before
202+
self.buf.push(first_ret);
203+
Some('(')
204+
}else{
205+
Some(first_ret)
206+
}
207+
}
208+
}
209+
210+
#[inline]
211+
fnsize_hint(&self) ->(usize,Option<usize>){
212+
let(inner_lo, inner_hi) =self.inner.size_hint();
213+
let add_factor:usize =self.buf.len();
214+
(
215+
inner_lo.saturating_add(add_factor),
216+
inner_hi
217+
.and_then(|h| h.checked_mul(4))// Why 4? See comment on `buf` field
218+
.and_then(|h| h.checked_add(add_factor)),
219+
)
220+
}
221+
}
222+
223+
impl<I:Iterator<Item =char> +FusedIterator>FusedIteratorforNormalizeJamoKdkc<I>{}
224+
225+
impl<I:Iterator<Item =char>>NormalizeJamoKdkc<I>{
226+
#[inline]
227+
pub(crate)fnnew(iter:I) ->Self{
228+
NormalizeJamoKdkc{
229+
inner: iter,
230+
buf:ArrayVec::new(),
231+
}
232+
}
233+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp