Notifications: You must be signed in to change notification settings
Fork6
Star31

Commit286a365

committed

Support Unicode full case mapping and conversion.

Generate tables from Unicode SpecialCasing.txt to support more sophisticated case mapping behavior:

 * support case mappings to multiple codepoints, such as "ß" uppercasing to "SS"
 * support conditional case mappings, such as the "final sigma"
 * support titlecase variants, such as "ǆ" uppercasing to "Ǆ" but titlecasing to "ǅ"

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite

1 parent 6a9b2a6, commit 286a365 (Copy full SHA for 286a365)

File tree

9 files changed

+3645

-2993

lines changed

src
- backend/utils/adt
  - pg_locale_builtin.c
- common
  - unicode
  - unicode_case.c
- include/common
  - unicode_case.h
  - unicode_case_table.h
- tools/pgindent
  - typedefs.list

9 files changed

+3645

-2993

lines changed

`‎src/backend/utils/adt/pg_locale_builtin.c`

Lines changed: 3 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -78,7 +78,7 @@ size_t`
`78`	`78`	`strlower_builtin(chardest,size_tdestsize,constcharsrc,ssize_tsrclen,`
`79`	`79`	`pg_locale_tlocale)`
`80`	`80`	`{`
`81`		`-returnunicode_strlower(dest,destsize,src,srclen);`
	`81`	`+returnunicode_strlower(dest,destsize,src,srclen, false);`
`82`	`82`	`}`
`83`	`83`
`84`	`84`	`size_t`
`@@ -93,15 +93,15 @@ strtitle_builtin(char dest, size_t destsize, const char src, ssize_t srclen,`
`93`	`93`	`.prev_alnum= false,`
`94`	`94`	`};`
`95`	`95`
`96`		`-returnunicode_strtitle(dest,destsize,src,srclen,`
	`96`	`+returnunicode_strtitle(dest,destsize,src,srclen, false,`
`97`	`97`	`initcap_wbnext,&wbstate);`
`98`	`98`	`}`
`99`	`99`
`100`	`100`	`size_t`
`101`	`101`	`strupper_builtin(chardest,size_tdestsize,constcharsrc,ssize_tsrclen,`
`102`	`102`	`pg_locale_tlocale)`
`103`	`103`	`{`
`104`		`-returnunicode_strupper(dest,destsize,src,srclen);`
	`104`	`+returnunicode_strupper(dest,destsize,src,srclen, false);`
`105`	`105`	`}`
`106`	`106`
`107`	`107`	`pg_locale_t`

`‎src/common/unicode/Makefile`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian`
`30`	`30`	`# These files are part of the Unicode Character Database. Download`
`31`	`31`	`# them on demand. The dependency on Makefile.global is for`
`32`	`32`	`# UNICODE_VERSION.`
`33`		`-CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global`
	`33`	`+CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global`
`34`	`34`	`$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)`
`35`	`35`
`36`	`36`	`unicode_version.h: generate-unicode_version.pl`
`@@ -91,4 +91,4 @@ clean:`
`91`	`91`	`rm -f$(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o`
`92`	`92`
`93`	`93`	`distclean: clean`
`94`		`-rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h`
	`94`	`+rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h`

`‎src/common/unicode/case_test.c`

Lines changed: 191 additions & 11 deletions

Original file line number	Diff line number	Diff line change
`@@ -18,12 +18,61 @@`
`18`	`18`	`#include<wctype.h>`
`19`	`19`
`20`	`20`	`#ifdefUSE_ICU`
	`21`	`+#include<unicode/ucasemap.h>`
`21`	`22`	`#include<unicode/uchar.h>`
`22`	`23`	`#endif`
`23`	`24`	`#include"common/unicode_case.h"`
`24`	`25`	`#include"common/unicode_category.h"`
`25`	`26`	`#include"common/unicode_version.h"`
`26`	`27`
	`28`	`+/* enough to hold largest source or result string, including NUL */`
	`29`	`+#defineBUFSZ 256`
	`30`	`+`
	`31`	`+#ifdefUSE_ICU`
	`32`	`+staticUCaseMap*casemap=NULL;`
	`33`	`+#endif`
	`34`	`+`
	`35`	`+typedefsize_t (TestFunc) (chardst,size_tdstsize,constchar*src,`
	`36`	`+ssize_tsrclen);`
	`37`	`+`
	`38`	`+/* simple boundary iterator copied from pg_locale_builtin.c */`
	`39`	`+structWordBoundaryState`
	`40`	`+{`
	`41`	`+constchar*str;`
	`42`	`+size_tlen;`
	`43`	`+size_toffset;`
	`44`	`+boolinit;`
	`45`	`+boolprev_alnum;`
	`46`	`+};`
	`47`	`+`
	`48`	`+staticsize_t`
	`49`	`+initcap_wbnext(void*state)`
	`50`	`+{`
	`51`	`+structWordBoundaryStatewbstate= (structWordBoundaryState)state;`
	`52`	`+`
	`53`	`+while (wbstate->offset<wbstate->len&&`
	`54`	`+wbstate->str[wbstate->offset]!='\0')`
	`55`	`+{`
	`56`	`+pg_wcharu=utf8_to_unicode((unsignedchar*)wbstate->str+`
	`57`	`+wbstate->offset);`
	`58`	`+boolcurr_alnum=pg_u_isalnum(u, true);`
	`59`	`+`
	`60`	`+if (!wbstate->init\|\|curr_alnum!=wbstate->prev_alnum)`
	`61`	`+{`
	`62`	`+size_tprev_offset=wbstate->offset;`
	`63`	`+`
	`64`	`+wbstate->init= true;`
	`65`	`+wbstate->offset+=unicode_utf8len(u);`
	`66`	`+wbstate->prev_alnum=curr_alnum;`
	`67`	`+returnprev_offset;`
	`68`	`+}`
	`69`	`+`
	`70`	`+wbstate->offset+=unicode_utf8len(u);`
	`71`	`+}`
	`72`	`+`
	`73`	`+returnwbstate->len;`
	`74`	`+}`
	`75`	`+`
`27`	`76`	`#ifdefUSE_ICU`
`28`	`77`
`29`	`78`	`staticvoid`
`@@ -48,6 +97,54 @@ icu_test_simple(pg_wchar code)`
`48`	`97`	`}`
`49`	`98`	`}`
`50`	`99`
	`100`	`+staticvoid`
	`101`	`+icu_test_full(char*str)`
	`102`	`+{`
	`103`	`+charlower[BUFSZ];`
	`104`	`+chartitle[BUFSZ];`
	`105`	`+charupper[BUFSZ];`
	`106`	`+charicu_lower[BUFSZ];`
	`107`	`+charicu_title[BUFSZ];`
	`108`	`+charicu_upper[BUFSZ];`
	`109`	`+UErrorCodestatus;`
	`110`	`+structWordBoundaryStatewbstate= {`
	`111`	`+.str=str,`
	`112`	`+.len=strlen(str),`
	`113`	`+.offset=0,`
	`114`	`+.init= false,`
	`115`	`+.prev_alnum= false,`
	`116`	`+};`
	`117`	`+`
	`118`	`+unicode_strlower(lower,BUFSZ,str,-1, true);`
	`119`	`+unicode_strtitle(title,BUFSZ,str,-1, true,initcap_wbnext,&wbstate);`
	`120`	`+unicode_strupper(upper,BUFSZ,str,-1, true);`
	`121`	`+status=U_ZERO_ERROR;`
	`122`	`+ucasemap_utf8ToLower(casemap,icu_lower,BUFSZ,str,-1,&status);`
	`123`	`+status=U_ZERO_ERROR;`
	`124`	`+ucasemap_utf8ToTitle(casemap,icu_title,BUFSZ,str,-1,&status);`
	`125`	`+status=U_ZERO_ERROR;`
	`126`	`+ucasemap_utf8ToUpper(casemap,icu_upper,BUFSZ,str,-1,&status);`
	`127`	`+`
	`128`	`+if (strcmp(lower,icu_lower)!=0)`
	`129`	`+{`
	`130`	`+printf("case_test: str='%s' lower='%s' icu_lower='%s'\n",str,lower,`
	`131`	`+icu_lower);`
	`132`	`+exit(1);`
	`133`	`+}`
	`134`	`+if (strcmp(title,icu_title)!=0)`
	`135`	`+{`
	`136`	`+printf("case_test: str='%s' title='%s' icu_title='%s'\n",str,title,`
	`137`	`+icu_title);`
	`138`	`+exit(1);`
	`139`	`+}`
	`140`	`+if (strcmp(upper,icu_upper)!=0)`
	`141`	`+{`
	`142`	`+printf("case_test: str='%s' upper='%s' icu_upper='%s'\n",str,upper,`
	`143`	`+icu_upper);`
	`144`	`+exit(1);`
	`145`	`+}`
	`146`	`+}`
	`147`	`+`
`51`	`148`	`/*`
`52`	`149`	`* Exhaustively compare case mappings with the results from ICU.`
`53`	`150`	`*/`
`@@ -64,6 +161,7 @@ test_icu(void)`
`64`	`161`	`if (category!=PG_U_UNASSIGNED)`
`65`	`162`	`{`
`66`	`163`	`uint8_ticu_category=u_charType(code);`
	`164`	`+charcode_str[5]= {0};`
`67`	`165`
`68`	`166`	`if (icu_category==PG_U_UNASSIGNED)`
`69`	`167`	`{`
`@@ -72,6 +170,9 @@ test_icu(void)`
`72`	`170`	`}`
`73`	`171`
`74`	`172`	`icu_test_simple(code);`
	`173`	`+unicode_to_utf8(code, (unsignedchar*)code_str);`
	`174`	`+icu_test_full(code_str);`
	`175`	`+`
`75`	`176`	`successful++;`
`76`	`177`	`}`
`77`	`178`	`}`
`@@ -86,7 +187,7 @@ test_icu(void)`
`86`	`187`	`#endif`
`87`	`188`
`88`	`189`	`staticvoid`
`89`		`-test_strlower(constchartest_string,constcharexpected)`
	`190`	`+test_convert(TestFunctfunc,constchartest_string,constcharexpected)`
`90`	`191`	`{`
`91`	`192`	`size_tsrc1len=strlen(test_string);`
`92`	`193`	`size_tsrc2len=-1;/* NUL-terminated */`
`@@ -102,10 +203,11 @@ test_strlower(const char test_string, const char expected)`
`102`	`203`
`103`	`204`	`/* neither source nor destination are NUL-terminated */`
`104`	`205`	`memset(dst1,0x7F,dst1len);`
`105`		`-needed=unicode_strlower(dst1,dst1len,src1,src1len);`
	`206`	`+needed=tfunc(dst1,dst1len,src1,src1len);`
`106`	`207`	`if (needed!=strlen(expected))`
`107`	`208`	`{`
`108`		`-printf("case_test: convert_case test1 FAILURE: needed %zu\n",needed);`
	`209`	`+printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",`
	`210`	`+test_string,needed,strlen(expected));`
`109`	`211`	`exit(1);`
`110`	`212`	`}`
`111`	`213`	`if (memcmp(dst1,expected,dst1len)!=0)`
`@@ -117,10 +219,11 @@ test_strlower(const char test_string, const char expected)`
`117`	`219`
`118`	`220`	`/* destination is NUL-terminated and source is not */`
`119`	`221`	`memset(dst2,0x7F,dst2len);`
`120`		`-needed=unicode_strlower(dst2,dst2len,src1,src1len);`
	`222`	`+needed=tfunc(dst2,dst2len,src1,src1len);`
`121`	`223`	`if (needed!=strlen(expected))`
`122`	`224`	`{`
`123`		`-printf("case_test: convert_case test2 FAILURE: needed %zu\n",needed);`
	`225`	`+printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",`
	`226`	`+test_string,needed,strlen(expected));`
`124`	`227`	`exit(1);`
`125`	`228`	`}`
`126`	`229`	`if (strcmp(dst2,expected)!=0)`
`@@ -132,9 +235,11 @@ test_strlower(const char test_string, const char expected)`
`132`	`235`
`133`	`236`	`/* source is NUL-terminated and destination is not */`
`134`	`237`	`memset(dst1,0x7F,dst1len);`
`135`		`-needed=unicode_strlower(dst1,dst1len,src2,src2len);`
	`238`	`+needed=tfunc(dst1,dst1len,src2,src2len);`
`136`	`239`	`if (needed!=strlen(expected))`
`137`	`240`	`{`
	`241`	`+printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",`
	`242`	`+test_string,needed,strlen(expected));`
`138`	`243`	`printf("case_test: convert_case test3 FAILURE: needed %zu\n",needed);`
`139`	`244`	`exit(1);`
`140`	`245`	`}`
`@@ -147,10 +252,11 @@ test_strlower(const char test_string, const char expected)`
`147`	`252`
`148`	`253`	`/* both source and destination are NUL-terminated */`
`149`	`254`	`memset(dst2,0x7F,dst2len);`
`150`		`-needed=unicode_strlower(dst2,dst2len,src2,src2len);`
	`255`	`+needed=tfunc(dst2,dst2len,src2,src2len);`
`151`	`256`	`if (needed!=strlen(expected))`
`152`	`257`	`{`
`153`		`-printf("case_test: convert_case test4 FAILURE: needed %zu\n",needed);`
	`258`	`+printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",`
	`259`	`+test_string,needed,strlen(expected));`
`154`	`260`	`exit(1);`
`155`	`261`	`}`
`156`	`262`	`if (strcmp(dst2,expected)!=0)`
`@@ -166,22 +272,92 @@ test_strlower(const char test_string, const char expected)`
`166`	`272`	`free(dst2);`
`167`	`273`	`}`
`168`	`274`
	`275`	`+staticsize_t`
	`276`	`+tfunc_lower(chardst,size_tdstsize,constcharsrc,`
	`277`	`+ssize_tsrclen)`
	`278`	`+{`
	`279`	`+returnunicode_strlower(dst,dstsize,src,srclen, true);`
	`280`	`+}`
	`281`	`+`
	`282`	`+staticsize_t`
	`283`	`+tfunc_title(chardst,size_tdstsize,constcharsrc,`
	`284`	`+ssize_tsrclen)`
	`285`	`+{`
	`286`	`+structWordBoundaryStatewbstate= {`
	`287`	`+.str=src,`
	`288`	`+.len=srclen,`
	`289`	`+.offset=0,`
	`290`	`+.init= false,`
	`291`	`+.prev_alnum= false,`
	`292`	`+};`
	`293`	`+`
	`294`	`+returnunicode_strtitle(dst,dstsize,src,srclen, true,initcap_wbnext,`
	`295`	`+&wbstate);`
	`296`	`+}`
	`297`	`+`
	`298`	`+staticsize_t`
	`299`	`+tfunc_upper(chardst,size_tdstsize,constcharsrc,`
	`300`	`+ssize_tsrclen)`
	`301`	`+{`
	`302`	`+returnunicode_strupper(dst,dstsize,src,srclen, true);`
	`303`	`+}`
	`304`	`+`
	`305`	`+`
`169`	`306`	`staticvoid`
`170`	`307`	`test_convert_case()`
`171`	`308`	`{`
`172`	`309`	`/* test string with no case changes */`
`173`		`-test_strlower("√∞","√∞");`
	`310`	`+test_convert(tfunc_lower,"√∞","√∞");`
	`311`	`+/* test adjust-to-cased behavior */`
	`312`	`+test_convert(tfunc_title,"abc 123xyz","Abc 123xyz");`
`174`	`313`	`/* test string with case changes */`
`175`		`-test_strlower("ABC","abc");`
	`314`	`+test_convert(tfunc_upper,"abc","ABC");`
`176`	`315`	`/* test string with case changes and byte length changes */`
`177`		`-test_strlower("ȺȺȺ","ⱥⱥⱥ");`
	`316`	`+test_convert(tfunc_lower,"ȺȺȺ","ⱥⱥⱥ");`
	`317`	`+/* test special case conversions */`
	`318`	`+test_convert(tfunc_upper,"ß","SS");`
	`319`	`+test_convert(tfunc_lower,"ıiIİ","ıiii\u0307");`
	`320`	`+test_convert(tfunc_upper,"ıiIİ","IIIİ");`
	`321`	`+/* test final sigma */`
	`322`	`+test_convert(tfunc_lower,"σςΣ ΣΣΣ","σςς σσς");`
	`323`	`+test_convert(tfunc_lower,"σς'Σ' ΣΣ'Σ'","σς'ς' σσ'ς'");`
	`324`	`+test_convert(tfunc_title,"σςΣ ΣΣΣ","Σςς Σσς");`
	`325`	`+`
	`326`	`+#ifdefUSE_ICU`
	`327`	`+icu_test_full("");`
	`328`	`+icu_test_full("ȺȺȺ");`
	`329`	`+icu_test_full("ßßß");`
	`330`	`+icu_test_full("√∞");`
	`331`	`+icu_test_full("a b");`
	`332`	`+icu_test_full("abc 123xyz");`
	`333`	`+icu_test_full("σςΣ ΣΣΣ");`
	`334`	`+icu_test_full("ıiIİ");`
	`335`	`+/* test <alpha><iota_subscript><acute> */`
	`336`	`+icu_test_full("\u0391\u0345\u0301");`
	`337`	`+#endif`
`178`	`338`
`179`	`339`	`printf("case_test: convert_case: success\n");`
`180`	`340`	`}`
`181`	`341`
`182`	`342`	`int`
`183`	`343`	`main(intargc,char**argv)`
`184`	`344`	`{`
	`345`	`+#ifdefUSE_ICU`
	`346`	`+UErrorCodestatus=U_ZERO_ERROR;`
	`347`	`+`
	`348`	`+/*`
	`349`	`+ * Disable ICU's word break adjustment for titlecase to match the expected`
	`350`	`+ * behavior of unicode_strtitle().`
	`351`	`+ */`
	`352`	`+casemap=ucasemap_open("und",U_TITLECASE_NO_BREAK_ADJUSTMENT,&status);`
	`353`	`+if (U_FAILURE(status))`
	`354`	`+{`
	`355`	`+printf("case_test: failure opening UCaseMap: %s\n",`
	`356`	`+u_errorName(status));`
	`357`	`+exit(1);`
	`358`	`+}`
	`359`	`+#endif`
	`360`	`+`
`185`	`361`	`printf("case_test: Postgres Unicode version:\t%s\n",PG_UNICODE_VERSION);`
`186`	`362`	`#ifdefUSE_ICU`
`187`	`363`	`printf("case_test: ICU Unicode version:\t\t%s\n",U_UNICODE_VERSION);`
`@@ -191,5 +367,9 @@ main(int argc, char **argv)`
`191`	`367`	`#endif`
`192`	`368`
`193`	`369`	`test_convert_case();`
	`370`	`+`
	`371`	`+#ifdefUSE_ICU`
	`372`	`+ucasemap_close(casemap);`
	`373`	`+#endif`
`194`	`374`	`exit(0);`
`195`	`375`	`}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit286a365

File tree

9 files changed

9 files changed

`‎src/backend/utils/adt/pg_locale_builtin.c`

`‎src/common/unicode/Makefile`

`‎src/common/unicode/case_test.c`

0 commit comments