Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 4e7f62b

Browse files
committed
Add support for Unicode case folding.
Expand case mapping tables to include entries for case folding, which are parsed from CaseFolding.txt. Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
1 parent 7921927 commit 4e7f62b

File tree

7 files changed

+3280
-3125
lines changed

7 files changed

+3280
-3125
lines changed

‎src/common/unicode/Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,13 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
3030
# These files are part of the Unicode Character Database. Download
3131
# them on demand. The dependency on Makefile.global is for
3232
# UNICODE_VERSION.
33-
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
33+
CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
3434
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3535

3636
unicode_version.h: generate-unicode_version.pl
3737
$(PERL)$< --version$(UNICODE_VERSION)
3838

39-
unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt
39+
unicode_case_table.h: generate-unicode_case_table.pl CaseFolding.txt UnicodeData.txt
4040
$(PERL)$<
4141

4242
unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
@@ -91,4 +91,4 @@ clean:
9191
rm -f$(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
9292

9393
distclean: clean
94-
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
94+
rm -f CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h

‎src/common/unicode/case_test.c

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,20 @@ icu_test_simple(pg_wchar code)
8181
pg_wcharlower=unicode_lowercase_simple(code);
8282
pg_wchartitle=unicode_titlecase_simple(code);
8383
pg_wcharupper=unicode_uppercase_simple(code);
84+
pg_wcharfold=unicode_casefold_simple(code);
8485
pg_wchariculower=u_tolower(code);
8586
pg_wcharicutitle=u_totitle(code);
8687
pg_wcharicuupper=u_toupper(code);
88+
pg_wcharicufold=u_foldCase(code,U_FOLD_CASE_DEFAULT);
8789

88-
if (lower!=iculower||title!=icutitle||upper!=icuupper)
90+
if (lower!=iculower||title!=icutitle||upper!=icuupper||
91+
fold!=icufold)
8992
{
9093
printf("case_test: FAILURE for codepoint 0x%06x\n",code);
91-
printf("case_test: Postgres lower/title/upper:0x%06x/0x%06x/0x%06x\n",
92-
lower,title,upper);
93-
printf("case_test: ICU lower/title/upper:0x%06x/0x%06x/0x%06x\n",
94-
iculower,icutitle,icuupper);
94+
printf("case_test: Postgres lower/title/upper/fold:0x%06x/0x%06x/0x%06x/0x%06x\n",
95+
lower,title,upper,fold);
96+
printf("case_test: ICU lower/title/upper/fold:0x%06x/0x%06x/0x%06x/0x%06x\n",
97+
iculower,icutitle,icuupper,icufold);
9598
printf("\n");
9699
exit(1);
97100
}
@@ -103,9 +106,11 @@ icu_test_full(char *str)
103106
charlower[BUFSZ];
104107
chartitle[BUFSZ];
105108
charupper[BUFSZ];
109+
charfold[BUFSZ];
106110
charicu_lower[BUFSZ];
107111
charicu_title[BUFSZ];
108112
charicu_upper[BUFSZ];
113+
charicu_fold[BUFSZ];
109114
UErrorCodestatus;
110115
structWordBoundaryStatewbstate= {
111116
.str=str,
@@ -118,12 +123,15 @@ icu_test_full(char *str)
118123
unicode_strlower(lower,BUFSZ,str,-1, true);
119124
unicode_strtitle(title,BUFSZ,str,-1, true,initcap_wbnext,&wbstate);
120125
unicode_strupper(upper,BUFSZ,str,-1, true);
126+
unicode_strfold(fold,BUFSZ,str,-1, true);
121127
status=U_ZERO_ERROR;
122128
ucasemap_utf8ToLower(casemap,icu_lower,BUFSZ,str,-1,&status);
123129
status=U_ZERO_ERROR;
124130
ucasemap_utf8ToTitle(casemap,icu_title,BUFSZ,str,-1,&status);
125131
status=U_ZERO_ERROR;
126132
ucasemap_utf8ToUpper(casemap,icu_upper,BUFSZ,str,-1,&status);
133+
status=U_ZERO_ERROR;
134+
ucasemap_utf8FoldCase(casemap,icu_fold,BUFSZ,str,-1,&status);
127135

128136
if (strcmp(lower,icu_lower)!=0)
129137
{
@@ -143,6 +151,12 @@ icu_test_full(char *str)
143151
icu_upper);
144152
exit(1);
145153
}
154+
if (strcmp(fold,icu_fold)!=0)
155+
{
156+
printf("case_test: str='%s' fold='%s' icu_fold='%s'\n",str,fold,
157+
icu_fold);
158+
exit(1);
159+
}
146160
}
147161

148162
/*
@@ -302,6 +316,12 @@ tfunc_upper(char *dst, size_t dstsize, const char *src,
302316
returnunicode_strupper(dst,dstsize,src,srclen, true);
303317
}
304318

319+
staticsize_t
320+
tfunc_fold(char*dst,size_tdstsize,constchar*src,
321+
ssize_tsrclen)
322+
{
323+
returnunicode_strfold(dst,dstsize,src,srclen, true);
324+
}
305325

306326
staticvoid
307327
test_convert_case()
@@ -318,10 +338,12 @@ test_convert_case()
318338
test_convert(tfunc_upper,"ß","SS");
319339
test_convert(tfunc_lower,"ıiIİ","ıiii\u0307");
320340
test_convert(tfunc_upper,"ıiIİ","IIIİ");
341+
test_convert(tfunc_fold,"ıiIİ","ıiii\u0307");
321342
/* test final sigma */
322343
test_convert(tfunc_lower,"σςΣ ΣΣΣ","σςς σσς");
323344
test_convert(tfunc_lower,"σς'Σ' ΣΣ'Σ'","σς'ς' σσ'ς'");
324345
test_convert(tfunc_title,"σςΣ ΣΣΣ","Σςς Σσς");
346+
test_convert(tfunc_fold,"σςΣ ΣΣΣ","σσσ σσσ");
325347

326348
#ifdefUSE_ICU
327349
icu_test_full("");

‎src/common/unicode/generate-unicode_case_table.pl

Lines changed: 103 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@
4949
$simple{$code} = {
5050
Simple_Lowercase=> ($simple_lowercase ||$code),
5151
Simple_Titlecase=> ($simple_titlecase ||$code),
52-
Simple_Uppercase=> ($simple_uppercase ||$code)
52+
Simple_Uppercase=> ($simple_uppercase ||$code),
53+
Simple_Foldcase=>$code,
5354
};
5455
}
5556
}
@@ -87,6 +88,7 @@
8788
my@lower =map {hex$_ } (grep /^[0-9A-F]+$/, (split /\s+/,$elts[1]));
8889
my@title =map {hex$_ } (grep /^[0-9A-F]+$/, (split /\s+/,$elts[2]));
8990
my@upper =map {hex$_ } (grep /^[0-9A-F]+$/, (split /\s+/,$elts[3]));
91+
my@fold = ();
9092
my@conditions =map {
9193
# supporting negated conditions may require storing a
9294
# mask of relevant conditions for a given rule to differentiate
@@ -101,6 +103,7 @@
101103
push@lower,$codeif (scalar@lower == 0);
102104
push@title,$codeif (scalar@title == 0);
103105
push@upper,$codeif (scalar@upper == 0);
106+
push@fold,$code;
104107

105108
# none should map to more than 3 codepoints
106109
die"lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
@@ -114,13 +117,15 @@
114117
while (scalar@upper <$MAX_CASE_EXPANSION) {push@upper, 0x000000 }
115118
while (scalar@lower <$MAX_CASE_EXPANSION) {push@lower, 0x000000 }
116119
while (scalar@title <$MAX_CASE_EXPANSION) {push@title, 0x000000 }
120+
while (scalar@fold <$MAX_CASE_EXPANSION) {push@fold, 0x000000 }
117121

118122
# Characters with special mappings may not have simple mappings;
119123
# ensure that an entry exists.
120124
$simple{$code} ||= {
121125
Simple_Lowercase=>$code,
122126
Simple_Titlecase=>$code,
123-
Simple_Uppercase=>$code
127+
Simple_Uppercase=>$code,
128+
Simple_Foldcase=>$code
124129
};
125130

126131
# Multiple special case rules for a single codepoint could be
@@ -135,11 +140,96 @@
135140
Lowercase=> \@lower,
136141
Titlecase=> \@title,
137142
Uppercase=> \@upper,
143+
Foldcase=> \@fold,
138144
Conditions=>$cond_str
139145
};
140146
}
141147
close$FH;
142148

149+
open($FH,'<',"$output_path/CaseFolding.txt")
150+
ordie"Could not open$output_path/CaseFolding.txt:$!.";
151+
while (my$line = <$FH>)
152+
{
153+
# remove comments
154+
$line =~s/^(.*?)#.*$/$1/s;
155+
156+
# ignore empty lines
157+
nextunless$line =~/;/;
158+
159+
my@elts =split(';',$line);
160+
my$code =hex($elts[0]);
161+
my$status =$elts[1] =~s/^\s+|\s+$//rg;
162+
163+
# Codepoint may map to multiple characters when folding. Split
164+
# each mapping on whitespace and extract the hexadecimal into an
165+
# array of codepoints.
166+
my@fold =map {hex$_ } (grep /[0-9A-F]+/, (split /\s+/,$elts[2]));
167+
168+
die"codepoint$code out of range"if$code > 0x10FFFF;
169+
170+
# status 'T' unsupported; skip
171+
nextif$statuseq'T';
172+
173+
# encountered unrecognized status type
174+
die"unsupported status type '$status'"
175+
if$statusne'S' &&$statusne'C' &&$statusne'F';
176+
177+
# initialize simple case mappings if they don't exist
178+
$simple{$code} ||= {
179+
Simple_Lowercase=>$code,
180+
Simple_Titlecase=>$code,
181+
Simple_Uppercase=>$code,
182+
Simple_Foldcase=>$code
183+
};
184+
185+
if ($statuseq'S' ||$statuseq'C')
186+
{
187+
die
188+
"Simple case folding for$code has multiple codepoints: '$line' '$elts[2]'"
189+
ifscalar@fold != 1;
190+
my$simple_foldcase =$fold[0];
191+
192+
die"Simple_Foldcase$code out of range"
193+
if$simple_foldcase > 0x10FFFF;
194+
195+
$simple{$code}{Simple_Foldcase} =$simple_foldcase;
196+
}
197+
198+
if ($statuseq'F' || ($statuseq'C' &&defined$special{$code}))
199+
{
200+
while (scalar@fold <$MAX_CASE_EXPANSION) {push@fold, 0x000000 }
201+
202+
#initialize special case mappings if they don't exist
203+
if (!defined$special{$code})
204+
{
205+
my@lower = ($simple{$code}{Simple_Lowercase});
206+
my@title = ($simple{$code}{Simple_Titlecase});
207+
my@upper = ($simple{$code}{Simple_Uppercase});
208+
while (scalar@lower <$MAX_CASE_EXPANSION)
209+
{
210+
push@lower, 0x000000;
211+
}
212+
while (scalar@title <$MAX_CASE_EXPANSION)
213+
{
214+
push@title, 0x000000;
215+
}
216+
while (scalar@upper <$MAX_CASE_EXPANSION)
217+
{
218+
push@upper, 0x000000;
219+
}
220+
$special{$code} = {
221+
Lowercase=> \@lower,
222+
Titlecase=> \@title,
223+
Uppercase=> \@upper,
224+
Conditions=>'0'
225+
};
226+
}
227+
228+
$special{$code}{Foldcase} = \@fold;
229+
}
230+
}
231+
close$FH;
232+
143233
# assign sequential array indexes to the special mappings
144234
my$special_idx = 0;
145235
foreachmy$code (sort {$a<=>$b } (keys%special))
@@ -202,6 +292,7 @@
202292
CaseLower = 0,
203293
CaseTitle = 1,
204294
CaseUpper = 2,
295+
CaseFold = 3,
205296
NCaseKind
206297
} CaseKind;
207298
@@ -232,14 +323,17 @@
232323
dieifscalar @{$special{$code}{Lowercase} } !=$MAX_CASE_EXPANSION;
233324
dieifscalar @{$special{$code}{Titlecase} } !=$MAX_CASE_EXPANSION;
234325
dieifscalar @{$special{$code}{Uppercase} } !=$MAX_CASE_EXPANSION;
326+
dieifscalar @{$special{$code}{Foldcase} } !=$MAX_CASE_EXPANSION;
235327
my$lower =join",",
236328
(map {sprintf"0x%06x",$_ } @{$special{$code}{Lowercase} });
237329
my$title =join",",
238330
(map {sprintf"0x%06x",$_ } @{$special{$code}{Titlecase} });
239331
my$upper =join",",
240332
(map {sprintf"0x%06x",$_ } @{$special{$code}{Uppercase} });
333+
my$fold =join",",
334+
(map {sprintf"0x%06x",$_ } @{$special{$code}{Foldcase} });
241335
printf$OT"\t{0x%06x,%s,",$code,$special{$code}{Conditions};
242-
printf$OT"{{%s}, {%s}, {%s}}},\n",$lower,$title,$upper;
336+
printf$OT"{{%s}, {%s}, {%s}, {%s}}},\n",$lower,$title,$upper,$fold;
243337
}
244338

245339
print$OT"\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
@@ -260,11 +354,13 @@
260354
my$lc = ($simple{$code}{Simple_Lowercase} ||$code);
261355
my$tc = ($simple{$code}{Simple_Titlecase} ||$code);
262356
my$uc = ($simple{$code}{Simple_Uppercase} ||$code);
357+
my$fc = ($simple{$code}{Simple_Foldcase} ||$code);
358+
263359
die"unexpected special case for code$code"
264360
ifdefined$special{$code};
265361
printf$OT
266-
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
267-
$code,$lc,$tc,$uc;
362+
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n",
363+
$code,$lc,$tc,$uc,$fc;
268364
}
269365
printf$OT"\n";
270366

@@ -280,8 +376,8 @@
280376
$special_case =sprintf"&special_case[%d]",$special{$code}{Index};
281377
}
282378
printf$OT
283-
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x},%s},\n",
379+
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x},%s},\n",
284380
$code,$map->{Simple_Lowercase},$map->{Simple_Titlecase},
285-
$map->{Simple_Uppercase},$special_case;
381+
$map->{Simple_Uppercase},$map->{Simple_Foldcase},$special_case;
286382
}
287383
print$OT"};\n";

‎src/common/unicode/meson.build

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ endif
1111

1212
# These files are part of the Unicode Character Database. Download them on
1313
# demand.
14-
foreachf: ['CompositionExclusions.txt','DerivedCoreProperties.txt','DerivedNormalizationProps.txt','EastAsianWidth.txt','NormalizationTest.txt','PropList.txt','SpecialCasing.txt','UnicodeData.txt']
14+
foreachf: ['CompositionExclusions.txt','CaseFolding.txt','DerivedCoreProperties.txt','DerivedNormalizationProps.txt','EastAsianWidth.txt','NormalizationTest.txt','PropList.txt','SpecialCasing.txt','UnicodeData.txt']
1515
url= unicode_baseurl.format(UNICODE_VERSION, f)
1616
target=custom_target(f,
1717
output: f,
@@ -26,7 +26,7 @@ update_unicode_targets = []
2626

2727
update_unicode_targets+= \
2828
custom_target('unicode_case_table.h',
29-
input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
29+
input: [unicode_data['CaseFolding.txt'], unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
3030
output: ['unicode_case_table.h'],
3131
command: [
3232
perl,files('generate-unicode_case_table.pl'),

‎src/common/unicode_case.c

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ unicode_uppercase_simple(pg_wchar code)
5151
returnmap ?map->simplemap[CaseUpper] :code;
5252
}
5353

54+
pg_wchar
55+
unicode_casefold_simple(pg_wcharcode)
56+
{
57+
constpg_case_map*map=find_case_map(code);
58+
59+
returnmap ?map->simplemap[CaseFold] :code;
60+
}
61+
5462
/*
5563
* unicode_strlower()
5664
*
@@ -142,6 +150,30 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
142150
NULL);
143151
}
144152

153+
/*
154+
* unicode_strfold()
155+
*
156+
* Case fold src, and return the result length (not including terminating
157+
* NUL).
158+
*
159+
* String src must be encoded in UTF-8. If srclen < 0, src must be
160+
* NUL-terminated.
161+
*
162+
* Result string is stored in dst, truncating if larger than dstsize. If
163+
* dstsize is greater than the result length, dst will be NUL-terminated;
164+
* otherwise not.
165+
*
166+
* If dstsize is zero, dst may be NULL. This is useful for calculating the
167+
* required buffer size before allocating.
168+
*/
169+
size_t
170+
unicode_strfold(char*dst,size_tdstsize,constchar*src,ssize_tsrclen,
171+
boolfull)
172+
{
173+
returnconvert_case(dst,dstsize,src,srclen,CaseFold,full,NULL,
174+
NULL);
175+
}
176+
145177
/*
146178
* Implement Unicode Default Case Conversion algorithm.
147179
*

‎src/include/common/unicode_case.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,15 @@ typedef size_t (*WordBoundaryNext) (void *wbstate);
2121
pg_wcharunicode_lowercase_simple(pg_wcharcode);
2222
pg_wcharunicode_titlecase_simple(pg_wcharcode);
2323
pg_wcharunicode_uppercase_simple(pg_wcharcode);
24+
pg_wcharunicode_casefold_simple(pg_wcharcode);
2425
size_tunicode_strlower(char*dst,size_tdstsize,constchar*src,
2526
ssize_tsrclen,boolfull);
2627
size_tunicode_strtitle(char*dst,size_tdstsize,constchar*src,
2728
ssize_tsrclen,boolfull,
2829
WordBoundaryNextwbnext,void*wbstate);
2930
size_tunicode_strupper(char*dst,size_tdstsize,constchar*src,
3031
ssize_tsrclen,boolfull);
32+
size_tunicode_strfold(char*dst,size_tdstsize,constchar*src,
33+
ssize_tsrclen,boolfull);
3134

3235
#endif/* UNICODE_CASE_H */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp