Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 286a365

Browse files
committed
Support Unicode full case mapping and conversion.
Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

 * support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
 * support conditional case mappings, such as the "final sigma"
 * support titlecase variants, such as "dž" uppercasing to "DŽ" but titlecasing to "Dž"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
1 parent 6a9b2a6, commit 286a365

File tree

9 files changed

+3645
-2993
lines changed

9 files changed

+3645
-2993
lines changed

‎src/backend/utils/adt/pg_locale_builtin.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ size_t
7878
strlower_builtin(char*dest,size_tdestsize,constchar*src,ssize_tsrclen,
7979
pg_locale_tlocale)
8080
{
81-
returnunicode_strlower(dest,destsize,src,srclen);
81+
returnunicode_strlower(dest,destsize,src,srclen, false);
8282
}
8383

8484
size_t
@@ -93,15 +93,15 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
9393
.prev_alnum= false,
9494
};
9595

96-
returnunicode_strtitle(dest,destsize,src,srclen,
96+
returnunicode_strtitle(dest,destsize,src,srclen, false,
9797
initcap_wbnext,&wbstate);
9898
}
9999

100100
size_t
101101
strupper_builtin(char*dest,size_tdestsize,constchar*src,ssize_tsrclen,
102102
pg_locale_tlocale)
103103
{
104-
returnunicode_strupper(dest,destsize,src,srclen);
104+
returnunicode_strupper(dest,destsize,src,srclen, false);
105105
}
106106

107107
pg_locale_t

‎src/common/unicode/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
3030
# These files are part of the Unicode Character Database. Download
3131
# them on demand. The dependency on Makefile.global is for
3232
# UNICODE_VERSION.
33-
CompositionExclusions.txtDerivedCoreProperties.txtDerivedNormalizationProps.txtEastAsianWidth.txtNormalizationTest.txtPropList.txtUnicodeData.txt:$(top_builddir)/src/Makefile.global
33+
CompositionExclusions.txtDerivedCoreProperties.txtDerivedNormalizationProps.txtEastAsianWidth.txtNormalizationTest.txtPropList.txtSpecialCasing.txtUnicodeData.txt:$(top_builddir)/src/Makefile.global
3434
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3535

3636
unicode_version.h: generate-unicode_version.pl
@@ -91,4 +91,4 @@ clean:
9191
rm -f$(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
9292

9393
distclean: clean
94-
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
94+
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txtSpecialCasing.txtUnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h

‎src/common/unicode/case_test.c

Lines changed: 191 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,61 @@
1818
#include<wctype.h>
1919

2020
#ifdefUSE_ICU
21+
#include<unicode/ucasemap.h>
2122
#include<unicode/uchar.h>
2223
#endif
2324
#include"common/unicode_case.h"
2425
#include"common/unicode_category.h"
2526
#include"common/unicode_version.h"
2627

28+
/* enough to hold largest source or result string, including NUL */
29+
#defineBUFSZ 256
30+
31+
#ifdefUSE_ICU
32+
staticUCaseMap*casemap=NULL;
33+
#endif
34+
35+
typedefsize_t (*TestFunc) (char*dst,size_tdstsize,constchar*src,
36+
ssize_tsrclen);
37+
38+
/* simple boundary iterator copied from pg_locale_builtin.c */
39+
structWordBoundaryState
40+
{
41+
constchar*str;
42+
size_tlen;
43+
size_toffset;
44+
boolinit;
45+
boolprev_alnum;
46+
};
47+
48+
staticsize_t
49+
initcap_wbnext(void*state)
50+
{
51+
structWordBoundaryState*wbstate= (structWordBoundaryState*)state;
52+
53+
while (wbstate->offset<wbstate->len&&
54+
wbstate->str[wbstate->offset]!='\0')
55+
{
56+
pg_wcharu=utf8_to_unicode((unsignedchar*)wbstate->str+
57+
wbstate->offset);
58+
boolcurr_alnum=pg_u_isalnum(u, true);
59+
60+
if (!wbstate->init||curr_alnum!=wbstate->prev_alnum)
61+
{
62+
size_tprev_offset=wbstate->offset;
63+
64+
wbstate->init= true;
65+
wbstate->offset+=unicode_utf8len(u);
66+
wbstate->prev_alnum=curr_alnum;
67+
returnprev_offset;
68+
}
69+
70+
wbstate->offset+=unicode_utf8len(u);
71+
}
72+
73+
returnwbstate->len;
74+
}
75+
2776
#ifdefUSE_ICU
2877

2978
staticvoid
@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)
4897
}
4998
}
5099

100+
staticvoid
101+
icu_test_full(char*str)
102+
{
103+
charlower[BUFSZ];
104+
chartitle[BUFSZ];
105+
charupper[BUFSZ];
106+
charicu_lower[BUFSZ];
107+
charicu_title[BUFSZ];
108+
charicu_upper[BUFSZ];
109+
UErrorCodestatus;
110+
structWordBoundaryStatewbstate= {
111+
.str=str,
112+
.len=strlen(str),
113+
.offset=0,
114+
.init= false,
115+
.prev_alnum= false,
116+
};
117+
118+
unicode_strlower(lower,BUFSZ,str,-1, true);
119+
unicode_strtitle(title,BUFSZ,str,-1, true,initcap_wbnext,&wbstate);
120+
unicode_strupper(upper,BUFSZ,str,-1, true);
121+
status=U_ZERO_ERROR;
122+
ucasemap_utf8ToLower(casemap,icu_lower,BUFSZ,str,-1,&status);
123+
status=U_ZERO_ERROR;
124+
ucasemap_utf8ToTitle(casemap,icu_title,BUFSZ,str,-1,&status);
125+
status=U_ZERO_ERROR;
126+
ucasemap_utf8ToUpper(casemap,icu_upper,BUFSZ,str,-1,&status);
127+
128+
if (strcmp(lower,icu_lower)!=0)
129+
{
130+
printf("case_test: str='%s' lower='%s' icu_lower='%s'\n",str,lower,
131+
icu_lower);
132+
exit(1);
133+
}
134+
if (strcmp(title,icu_title)!=0)
135+
{
136+
printf("case_test: str='%s' title='%s' icu_title='%s'\n",str,title,
137+
icu_title);
138+
exit(1);
139+
}
140+
if (strcmp(upper,icu_upper)!=0)
141+
{
142+
printf("case_test: str='%s' upper='%s' icu_upper='%s'\n",str,upper,
143+
icu_upper);
144+
exit(1);
145+
}
146+
}
147+
51148
/*
52149
* Exhaustively compare case mappings with the results from ICU.
53150
*/
@@ -64,6 +161,7 @@ test_icu(void)
64161
if (category!=PG_U_UNASSIGNED)
65162
{
66163
uint8_ticu_category=u_charType(code);
164+
charcode_str[5]= {0};
67165

68166
if (icu_category==PG_U_UNASSIGNED)
69167
{
@@ -72,6 +170,9 @@ test_icu(void)
72170
}
73171

74172
icu_test_simple(code);
173+
unicode_to_utf8(code, (unsignedchar*)code_str);
174+
icu_test_full(code_str);
175+
75176
successful++;
76177
}
77178
}
@@ -86,7 +187,7 @@ test_icu(void)
86187
#endif
87188

88189
staticvoid
89-
test_strlower(constchar*test_string,constchar*expected)
190+
test_convert(TestFunctfunc,constchar*test_string,constchar*expected)
90191
{
91192
size_tsrc1len=strlen(test_string);
92193
size_tsrc2len=-1;/* NUL-terminated */
@@ -102,10 +203,11 @@ test_strlower(const char *test_string, const char *expected)
102203

103204
/* neither source nor destination are NUL-terminated */
104205
memset(dst1,0x7F,dst1len);
105-
needed=unicode_strlower(dst1,dst1len,src1,src1len);
206+
needed=tfunc(dst1,dst1len,src1,src1len);
106207
if (needed!=strlen(expected))
107208
{
108-
printf("case_test: convert_case test1 FAILURE: needed %zu\n",needed);
209+
printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
210+
test_string,needed,strlen(expected));
109211
exit(1);
110212
}
111213
if (memcmp(dst1,expected,dst1len)!=0)
@@ -117,10 +219,11 @@ test_strlower(const char *test_string, const char *expected)
117219

118220
/* destination is NUL-terminated and source is not */
119221
memset(dst2,0x7F,dst2len);
120-
needed=unicode_strlower(dst2,dst2len,src1,src1len);
222+
needed=tfunc(dst2,dst2len,src1,src1len);
121223
if (needed!=strlen(expected))
122224
{
123-
printf("case_test: convert_case test2 FAILURE: needed %zu\n",needed);
225+
printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
226+
test_string,needed,strlen(expected));
124227
exit(1);
125228
}
126229
if (strcmp(dst2,expected)!=0)
@@ -132,9 +235,11 @@ test_strlower(const char *test_string, const char *expected)
132235

133236
/* source is NUL-terminated and destination is not */
134237
memset(dst1,0x7F,dst1len);
135-
needed=unicode_strlower(dst1,dst1len,src2,src2len);
238+
needed=tfunc(dst1,dst1len,src2,src2len);
136239
if (needed!=strlen(expected))
137240
{
241+
printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
242+
test_string,needed,strlen(expected));
138243
printf("case_test: convert_case test3 FAILURE: needed %zu\n",needed);
139244
exit(1);
140245
}
@@ -147,10 +252,11 @@ test_strlower(const char *test_string, const char *expected)
147252

148253
/* both source and destination are NUL-terminated */
149254
memset(dst2,0x7F,dst2len);
150-
needed=unicode_strlower(dst2,dst2len,src2,src2len);
255+
needed=tfunc(dst2,dst2len,src2,src2len);
151256
if (needed!=strlen(expected))
152257
{
153-
printf("case_test: convert_case test4 FAILURE: needed %zu\n",needed);
258+
printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
259+
test_string,needed,strlen(expected));
154260
exit(1);
155261
}
156262
if (strcmp(dst2,expected)!=0)
@@ -166,22 +272,92 @@ test_strlower(const char *test_string, const char *expected)
166272
free(dst2);
167273
}
168274

275+
staticsize_t
276+
tfunc_lower(char*dst,size_tdstsize,constchar*src,
277+
ssize_tsrclen)
278+
{
279+
returnunicode_strlower(dst,dstsize,src,srclen, true);
280+
}
281+
282+
staticsize_t
283+
tfunc_title(char*dst,size_tdstsize,constchar*src,
284+
ssize_tsrclen)
285+
{
286+
structWordBoundaryStatewbstate= {
287+
.str=src,
288+
.len=srclen,
289+
.offset=0,
290+
.init= false,
291+
.prev_alnum= false,
292+
};
293+
294+
returnunicode_strtitle(dst,dstsize,src,srclen, true,initcap_wbnext,
295+
&wbstate);
296+
}
297+
298+
staticsize_t
299+
tfunc_upper(char*dst,size_tdstsize,constchar*src,
300+
ssize_tsrclen)
301+
{
302+
returnunicode_strupper(dst,dstsize,src,srclen, true);
303+
}
304+
305+
169306
staticvoid
170307
test_convert_case()
171308
{
172309
/* test string with no case changes */
173-
test_strlower("√∞","√∞");
310+
test_convert(tfunc_lower,"√∞","√∞");
311+
/* test adjust-to-cased behavior */
312+
test_convert(tfunc_title,"abc 123xyz","Abc 123xyz");
174313
/* test string with case changes */
175-
test_strlower("ABC","abc");
314+
test_convert(tfunc_upper,"abc","ABC");
176315
/* test string with case changes and byte length changes */
177-
test_strlower("ȺȺȺ","ⱥⱥⱥ");
316+
test_convert(tfunc_lower,"ȺȺȺ","ⱥⱥⱥ");
317+
/* test special case conversions */
318+
test_convert(tfunc_upper,"ß","SS");
319+
test_convert(tfunc_lower,"ıiIİ","ıiii\u0307");
320+
test_convert(tfunc_upper,"ıiIİ","IIIİ");
321+
/* test final sigma */
322+
test_convert(tfunc_lower,"σςΣ ΣΣΣ","σςς σσς");
323+
test_convert(tfunc_lower,"σς'Σ' ΣΣ'Σ'","σς'ς' σσ'ς'");
324+
test_convert(tfunc_title,"σςΣ ΣΣΣ","Σςς Σσς");
325+
326+
#ifdefUSE_ICU
327+
icu_test_full("");
328+
icu_test_full("ȺȺȺ");
329+
icu_test_full("ßßß");
330+
icu_test_full("√∞");
331+
icu_test_full("a b");
332+
icu_test_full("abc 123xyz");
333+
icu_test_full("σςΣ ΣΣΣ");
334+
icu_test_full("ıiIİ");
335+
/* test <alpha><iota_subscript><acute> */
336+
icu_test_full("\u0391\u0345\u0301");
337+
#endif
178338

179339
printf("case_test: convert_case: success\n");
180340
}
181341

182342
int
183343
main(intargc,char**argv)
184344
{
345+
#ifdefUSE_ICU
346+
UErrorCodestatus=U_ZERO_ERROR;
347+
348+
/*
349+
* Disable ICU's word break adjustment for titlecase to match the expected
350+
* behavior of unicode_strtitle().
351+
*/
352+
casemap=ucasemap_open("und",U_TITLECASE_NO_BREAK_ADJUSTMENT,&status);
353+
if (U_FAILURE(status))
354+
{
355+
printf("case_test: failure opening UCaseMap: %s\n",
356+
u_errorName(status));
357+
exit(1);
358+
}
359+
#endif
360+
185361
printf("case_test: Postgres Unicode version:\t%s\n",PG_UNICODE_VERSION);
186362
#ifdefUSE_ICU
187363
printf("case_test: ICU Unicode version:\t\t%s\n",U_UNICODE_VERSION);
@@ -191,5 +367,9 @@ main(int argc, char **argv)
191367
#endif
192368

193369
test_convert_case();
370+
371+
#ifdefUSE_ICU
372+
ucasemap_close(casemap);
373+
#endif
194374
exit(0);
195375
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp