Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitc24ac7f

Browse files
Add API to transform into standard Korean syllables
See https://www.unicode.org/reports/tr29/#Transforming_Into_SKS
1 parent 6b86cc2 commit c24ac7f

File tree

3 files changed

+189
-0
lines changed

3 files changed

+189
-0
lines changed

‎src/lib.rs‎

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ pub use crate::quick_check::{
7373
};
7474
pubusecrate::recompose::Recompositions;
7575
pubusecrate::replace::Replacements;
76+
pubusecrate::standardize_korean_syllables::StandardKoreanSyllables;
7677
pubusecrate::stream_safe::StreamSafe;
7778
pubusecrate::tables::UNICODE_VERSION;
7879
use core::{option, str::Chars};
@@ -86,6 +87,7 @@ mod perfect_hash;
8687
mod quick_check;
8788
mod recompose;
8889
mod replace;
90+
mod standardize_korean_syllables;
8991
mod stream_safe;
9092

9193
#[rustfmt::skip]
@@ -146,6 +148,10 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
146148
/// inserted according to the Stream-Safe Text Process ([UAX15-D4](https://unicode.org/reports/tr15/#UAX15-D4))
147149
fnstream_safe(self) ->StreamSafe<I>;
148150

151+
/// An iterator over the string with Hangul choseong and jugseong filler characters inserted
152+
/// to ensure that all Korean syllable blocks are in standard form according to [UAX29](https://www.unicode.org/reports/tr29/#Transforming_Into_SKS).
153+
fnstandard_korean_syllables(self) ->StandardKoreanSyllables<I>;
154+
149155
/// An iterator over the string in the variant of Unicode Normalization Form KD
150156
/// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode
151157
/// in that it will not produce nonstandard Korean jamo sequences if none were present in the input.
@@ -210,6 +216,11 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
210216
StreamSafe::new(self.chars())
211217
}
212218

219+
#[inline]
220+
fnstandard_korean_syllables(self) ->StandardKoreanSyllables<Chars<'a>>{
221+
StandardKoreanSyllables::new(self.chars())
222+
}
223+
213224
#[cfg(feature ="ks_x_1026-1")]
214225
#[cfg_attr(docsrs, doc(cfg(feature ="ks_x_1026-1")))]
215226
#[inline]
@@ -265,6 +276,11 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
265276
StreamSafe::new(Some(self).into_iter())
266277
}
267278

279+
#[inline]
280+
fnstandard_korean_syllables(self) ->StandardKoreanSyllables<option::IntoIter<char>>{
281+
StandardKoreanSyllables::new(Some(self).into_iter())
282+
}
283+
268284
#[cfg(feature ="ks_x_1026-1")]
269285
#[cfg_attr(docsrs, doc(cfg(feature ="ks_x_1026-1")))]
270286
#[inline]
@@ -322,6 +338,11 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
322338
StreamSafe::new(self)
323339
}
324340

341+
#[inline]
342+
fnstandard_korean_syllables(self) ->StandardKoreanSyllables<I>{
343+
StandardKoreanSyllables::new(self)
344+
}
345+
325346
#[cfg(feature ="ks_x_1026-1")]
326347
#[cfg_attr(docsrs, doc(cfg(feature ="ks_x_1026-1")))]
327348
#[inline]
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
use core::iter::FusedIterator;
2+
3+
use tinyvec::ArrayVec;
4+
5+
usecrate::normalize::hangul_constants::{N_COUNT,S_BASE,T_COUNT};
6+
7+
#[derive(Clone,Copy,Debug,PartialEq,Eq)]
8+
enumJamoKind{
9+
L,
10+
V,
11+
T,
12+
}
13+
14+
implJamoKind{
15+
fnof(c:char) ->(Option<Self>,Option<Self>){
16+
match c{
17+
// L
18+
'\u{1100}'..='\u{115F}' |'\u{A960}'..='\u{A97C}' =>{
19+
(Some(JamoKind::L),Some(JamoKind::L))
20+
}
21+
// V
22+
'\u{1160}'..='\u{11A7}' |'\u{D7B0}'..='\u{D7C6}' =>{
23+
(Some(JamoKind::V),Some(JamoKind::V))
24+
}
25+
// T
26+
'\u{11A8}'..='\u{11FF}' |'\u{D7CB}'..='\u{D7FB}' =>{
27+
(Some(JamoKind::T),Some(JamoKind::T))
28+
}
29+
// LV or LVT
30+
'\u{AC00}'..='\u{D7A3}' =>(
31+
Some(JamoKind::L),
32+
Some(if((u32::from(c) -S_BASE) %N_COUNT) %T_COUNT ==0{
33+
// LV
34+
JamoKind::V
35+
}else{
36+
// LVT
37+
JamoKind::T
38+
}),
39+
),
40+
_ =>(None,None),
41+
}
42+
}
43+
}
44+
45+
/// Iterator over a string's characters, with '\u{115F}' and '\u{1160}' inserted
46+
/// where needed to ensure all Korean syllable blocks are in standard form
47+
/// by [UAX29 rules](https://www.unicode.org/reports/tr29/#Standard_Korean_Syllables).
48+
#[derive(Clone,Debug)]
49+
pubstructStandardKoreanSyllables<I>{
50+
prev_end_jamo_kind:Option<JamoKind>,
51+
buf:ArrayVec<[Option<char>;3]>,
52+
inner:I,
53+
}
54+
55+
impl<I:Iterator<Item =char>>IteratorforStandardKoreanSyllables<I>{
56+
typeItem =char;
57+
58+
fnnext(&mutself) ->Option<Self::Item>{
59+
ifletSome(c) =self.buf.pop(){
60+
c
61+
}else{
62+
let next_c =self.inner.next();
63+
let prev_end_jamo_kind =self.prev_end_jamo_kind;
64+
let(next_start_jamo_kind, next_end_jamo_kind) =
65+
next_c.map_or((None,None),JamoKind::of);
66+
self.prev_end_jamo_kind = next_end_jamo_kind;
67+
68+
insert_fillers(
69+
next_c,
70+
prev_end_jamo_kind,
71+
next_start_jamo_kind,
72+
&mutself.buf,
73+
)
74+
}
75+
}
76+
77+
#[inline]
78+
fnsize_hint(&self) ->(usize,Option<usize>){
79+
let(inner_lo, inner_hi) =self.inner.size_hint();
80+
let add_factor:usize =self.buf.len();
81+
(
82+
inner_lo.saturating_add(add_factor),
83+
inner_hi
84+
.and_then(|h| h.checked_mul(3))// T → Lf Vf T
85+
.and_then(|h| h.checked_add(add_factor)),
86+
)
87+
}
88+
}
89+
90+
impl<I:Iterator<Item =char> +FusedIterator>FusedIteratorforStandardKoreanSyllables<I>{}
91+
92+
#[inline]
93+
fninsert_fillers(
94+
next_c:Option<char>,
95+
prev_end_jamo_kind:Option<JamoKind>,
96+
next_start_jamo_kind:Option<JamoKind>,
97+
buf:&mutArrayVec<[Option<char>;3]>,
98+
) ->Option<char>{
99+
match(prev_end_jamo_kind, next_start_jamo_kind){
100+
// Insert choseong filler before V not preceded by L or V
101+
(None,Some(JamoKind::V)) |(Some(JamoKind::T),Some(JamoKind::V)) =>{
102+
buf.push(next_c);
103+
Some('\u{115F}')
104+
}
105+
// Insert choseong and jungseong fillers before T preceded non-jamo
106+
(None,Some(JamoKind::T)) =>{
107+
buf.push(next_c);
108+
buf.push(Some('\u{1160}'));
109+
Some('\u{115F}')
110+
}
111+
// Insert V filler between L and non-jamo
112+
(Some(JamoKind::L),None) =>{
113+
buf.push(next_c);
114+
Some('\u{1160}')
115+
}
116+
// For L followed by T, insert V filler, L filler, then another V filler
117+
(Some(JamoKind::L),Some(JamoKind::T)) =>{
118+
buf.push(next_c);
119+
buf.push(Some('\u{1160}'));
120+
buf.push(Some('\u{115F}'));
121+
Some('\u{1160}')
122+
}
123+
_ => next_c,
124+
}
125+
}
126+
127+
impl<I>StandardKoreanSyllables<I>{
128+
#[inline]
129+
pub(crate)fnnew(iter:I) ->Self{
130+
Self{
131+
prev_end_jamo_kind:None,
132+
buf:ArrayVec::new(),
133+
inner: iter,
134+
}
135+
}
136+
}

‎tests/standard_korean_syllables.rs‎

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
use unicode_normalization::UnicodeNormalization;
2+
3+
// Runs `$input` (anything iterable over `char`) through
// `standard_korean_syllables` and collects the result into a `Vec<char>`.
macro_rules! standardize {
    ($input: expr) => {
        IntoIterator::into_iter($input)
            .standard_korean_syllables()
            .collect::<Vec<char>>()
    };
}
10+
11+
/// <https://www.unicode.org/reports/tr29/#Korean_Syllable_Break_Examples>
#[test]
fn korean_syllable_break_examples() {
    const L: char = '\u{1100}';
    const L_F: char = '\u{115F}';
    const V: char = '\u{1161}';
    const V_F: char = '\u{1160}';
    const T: char = '\u{11AE}';
    const LV: char = '\u{AC00}';
    const LVT: char = '\u{AC01}';

    // Already standard, so the output must equal the input:
    // LVT LV LV LVf LfV LfVfT
    let orig = [LVT, L, V, LV, L, V_F, L_F, V, L_F, V_F, T];
    assert_eq!(standardize!(orig), orig);

    // Nonstandard sequences that need fillers:
    // LL TT VV TT VV LLVV
    let orig = [L, L, T, T, V, V, T, T, V, V, L, LV, V];
    assert_eq!(
        standardize!(orig),
        [L, L, V_F, L_F, V_F, T, T, L_F, V, V, T, T, L_F, V, V, L, LV, V]
    );
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp