Notifications: You must be signed in to change notification settings
Fork 6
Star 31

Commit 46e5441

committed

Add unicode_strtitle() for Unicode Default Case Conversion.

This brings the titlecasing implementation for the builtin provider out of formatting.c and into unicode_case.c, along with unicode_strlower() and unicode_strupper(). Accepts an arbitrary word boundary callback.

Simple for now, but can be extended to support the Unicode Default Case Conversion algorithm with full case mapping.

Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com

Reviewed-by: Peter Eisentraut

1 parent a96a8b1 · commit 46e5441 · Copy full SHA for 46e5441

File tree

3 files changed

+140

-48

lines changed

src
- backend/utils/adt
  - formatting.c
- common
  - unicode_case.c
- include/common
  - unicode_case.h

3 files changed

+140

-48

lines changed

`‎src/backend/utils/adt/formatting.c‎`

Lines changed: 67 additions & 40 deletions

Original file line number	Diff line number	Diff line change
`@@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)`
`1922`	`1922`	`returnresult;`
`1923`	`1923`	`}`
`1924`	`1924`
	`1925`	`+structWordBoundaryState`
	`1926`	`+{`
	`1927`	`+constchar*str;`
	`1928`	`+size_tlen;`
	`1929`	`+size_toffset;`
	`1930`	`+boolinit;`
	`1931`	`+boolprev_alnum;`
	`1932`	`+};`
	`1933`	`+`
	`1934`	`+/*`
	`1935`	`+ * Simple word boundary iterator that draws boundaries each time the result of`
	`1936`	`+ * pg_u_isalnum() changes.`
	`1937`	`+ */`
	`1938`	`+staticsize_t`
	`1939`	`+initcap_wbnext(void*state)`
	`1940`	`+{`
	`1941`	`+structWordBoundaryStatewbstate= (structWordBoundaryState)state;`
	`1942`	`+`
	`1943`	`+while (wbstate->offset<wbstate->len&&`
	`1944`	`+wbstate->str[wbstate->offset]!='\0')`
	`1945`	`+{`
	`1946`	`+pg_wcharu=utf8_to_unicode((unsignedchar*)wbstate->str+`
	`1947`	`+wbstate->offset);`
	`1948`	`+boolcurr_alnum=pg_u_isalnum(u, true);`
	`1949`	`+`
	`1950`	`+if (!wbstate->init\|\|curr_alnum!=wbstate->prev_alnum)`
	`1951`	`+{`
	`1952`	`+size_tprev_offset=wbstate->offset;`
	`1953`	`+`
	`1954`	`+wbstate->init= true;`
	`1955`	`+wbstate->offset+=unicode_utf8len(u);`
	`1956`	`+wbstate->prev_alnum=curr_alnum;`
	`1957`	`+returnprev_offset;`
	`1958`	`+}`
	`1959`	`+`
	`1960`	`+wbstate->offset+=unicode_utf8len(u);`
	`1961`	`+}`
	`1962`	`+`
	`1963`	`+returnwbstate->len;`
	`1964`	`+}`
	`1965`	`+`
`1925`	`1966`	`/*`
`1926`	`1967`	`* collation-aware, wide-character-aware initcap function`
`1927`	`1968`	`*`
`@@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)`
`1980`	`2021`	`#endif`
`1981`	`2022`	`if (mylocale&&mylocale->provider==COLLPROVIDER_BUILTIN)`
`1982`	`2023`	`{`
`1983`		`-constunsignedcharsrc= (unsignedchar)buff;`
	`2024`	`+constchar*src=buff;`
`1984`	`2025`	`size_tsrclen=nbytes;`
`1985`		`-unsignedchar*dst;`
`1986`	`2026`	`size_tdstsize;`
`1987`		`-intsrcoff=0;`
`1988`		`-intdstoff=0;`
	`2027`	`+char*dst;`
	`2028`	`+size_tneeded;`
	`2029`	`+structWordBoundaryStatewbstate= {`
	`2030`	`+.str=src,`
	`2031`	`+.len=srclen,`
	`2032`	`+.offset=0,`
	`2033`	`+.init= false,`
	`2034`	`+.prev_alnum= false,`
	`2035`	`+};`
`1989`	`2036`
`1990`	`2037`	`Assert(GetDatabaseEncoding()==PG_UTF8);`
`1991`	`2038`
`1992`		`-/* overflow paranoia */`
`1993`		`-if ((srclen+1)> (INT_MAX /MAX_MULTIBYTE_CHAR_LEN))`
`1994`		`-ereport(ERROR,`
`1995`		`-(errcode(ERRCODE_OUT_OF_MEMORY),`
`1996`		`-errmsg("out of memory")));`
`1997`		`-`
`1998`		`-/* result is at most srclen codepoints plus terminating NUL */`
`1999`		`-dstsize=srclen*MAX_MULTIBYTE_CHAR_LEN+1;`
`2000`		`-dst= (unsignedchar*)palloc(dstsize);`
	`2039`	`+/* first try buffer of equal size plus terminating NUL */`
	`2040`	`+dstsize=srclen+1;`
	`2041`	`+dst=palloc(dstsize);`
`2001`	`2042`
`2002`		`-while (srcoff<nbytes)`
	`2043`	`+needed=unicode_strtitle(dst,dstsize,src,srclen,`
	`2044`	`+initcap_wbnext,&wbstate);`
	`2045`	`+if (needed+1>dstsize)`
`2003`	`2046`	`{`
`2004`		`-pg_wcharu1=utf8_to_unicode(src+srcoff);`
`2005`		`-pg_wcharu2;`
`2006`		`-intu1len=unicode_utf8len(u1);`
`2007`		`-intu2len;`
`2008`		`-`
`2009`		`-if (wasalnum)`
`2010`		`-u2=unicode_lowercase_simple(u1);`
`2011`		`-else`
`2012`		`-u2=unicode_uppercase_simple(u1);`
	`2047`	`+/* reset iterator */`
	`2048`	`+wbstate.offset=0;`
	`2049`	`+wbstate.init= false;`
`2013`	`2050`
`2014`		`-u2len=unicode_utf8len(u2);`
`2015`		`-`
`2016`		`-Assert(dstoff+u2len+1 <=dstsize);`
`2017`		`-`
`2018`		`-wasalnum=pg_u_isalnum(u2, true);`
`2019`		`-`
`2020`		`-unicode_to_utf8(u2,dst+dstoff);`
`2021`		`-srcoff+=u1len;`
`2022`		`-dstoff+=u2len;`
	`2051`	`+/* grow buffer if needed and retry */`
	`2052`	`+dstsize=needed+1;`
	`2053`	`+dst=repalloc(dst,dstsize);`
	`2054`	`+needed=unicode_strtitle(dst,dstsize,src,srclen,`
	`2055`	`+initcap_wbnext,&wbstate);`
	`2056`	`+Assert(needed+1==dstsize);`
`2023`	`2057`	`}`
`2024`	`2058`
`2025`		`-Assert(dstoff+1 <=dstsize);`
`2026`		`-*(dst+dstoff)='\0';`
`2027`		`-dstoff++;`
`2028`		`-`
`2029`		`-/* allocate result buffer of the right size and free workspace */`
`2030`		`-result=palloc(dstoff);`
`2031`		`-memcpy(result,dst,dstoff);`
`2032`		`-pfree(dst);`
	`2059`	`+result=dst;`
`2033`	`2060`	`}`
`2034`	`2061`	`else`
`2035`	`2062`	`{`

`‎src/common/unicode_case.c‎`

Lines changed: 68 additions & 8 deletions

Original file line number	Diff line number	Diff line change
`@@ -21,8 +21,9 @@`
`21`	`21`	`#include"mb/pg_wchar.h"`
`22`	`22`
`23`	`23`	`staticconstpg_case_map*find_case_map(pg_wcharucs);`
`24`		`-staticsize_tconvert_case(chardst,size_tdstsize,constcharsrc,`
`25`		`-ssize_tsrclen,CaseKindcasekind);`
	`24`	`+staticsize_tconvert_case(chardst,size_tdstsize,constcharsrc,ssize_tsrclen,`
	`25`	`+CaseKindstr_casekind,WordBoundaryNextwbnext,`
	`26`	`+void*wbstate);`
`26`	`27`
`27`	`28`	`pg_wchar`
`28`	`29`	`unicode_lowercase_simple(pg_wcharcode)`
`@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)`
`67`	`68`	`size_t`
`68`	`69`	`unicode_strlower(chardst,size_tdstsize,constcharsrc,ssize_tsrclen)`
`69`	`70`	`{`
`70`		`-returnconvert_case(dst,dstsize,src,srclen,CaseLower);`
	`71`	`+returnconvert_case(dst,dstsize,src,srclen,CaseLower,NULL,NULL);`
	`72`	`+}`
	`73`	`+`
	`74`	`+/*`
	`75`	`+ * unicode_strtitle()`
	`76`	`+ *`
	`77`	`+ * Convert src to titlecase, and return the result length (not including`
	`78`	`+ * terminating NUL).`
	`79`	`+ *`
	`80`	`+ * String src must be encoded in UTF-8. If srclen < 0, src must be`
	`81`	`+ * NUL-terminated.`
	`82`	`+ *`
	`83`	`+ * Result string is stored in dst, truncating if larger than dstsize. If`
	`84`	`+ * dstsize is greater than the result length, dst will be NUL-terminated;`
	`85`	`+ * otherwise not.`
	`86`	`+ *`
	`87`	`+ * If dstsize is zero, dst may be NULL. This is useful for calculating the`
	`88`	`+ * required buffer size before allocating.`
	`89`	`+ *`
	`90`	`+ * Titlecasing requires knowledge about word boundaries, which is provided by`
	`91`	`+ * the callback wbnext. A word boundary is the offset of the start of a word`
	`92`	`+ * or the offset of the character immediately following a word.`
	`93`	`+ *`
	`94`	`+ * The caller is expected to initialize and free the callback state`
	`95`	`+ * wbstate. The callback should first return offset 0 for the first boundary;`
	`96`	`+ * then the offset of each subsequent word boundary; then the total length of`
	`97`	`+ * the string to indicate the final boundary.`
	`98`	`+ */`
	`99`	`+size_t`
	`100`	`+unicode_strtitle(chardst,size_tdstsize,constcharsrc,ssize_tsrclen,`
	`101`	`+WordBoundaryNextwbnext,void*wbstate)`
	`102`	`+{`
	`103`	`+returnconvert_case(dst,dstsize,src,srclen,CaseTitle,wbnext,`
	`104`	`+wbstate);`
`71`	`105`	`}`
`72`	`106`
`73`	`107`	`/*`
`@@ -89,30 +123,56 @@ unicode_strlower(char dst, size_t dstsize, const char src, ssize_t srclen)`
`89`	`123`	`size_t`
`90`	`124`	`unicode_strupper(chardst,size_tdstsize,constcharsrc,ssize_tsrclen)`
`91`	`125`	`{`
`92`		`-returnconvert_case(dst,dstsize,src,srclen,CaseUpper);`
	`126`	`+returnconvert_case(dst,dstsize,src,srclen,CaseUpper,NULL,NULL);`
`93`	`127`	`}`
`94`	`128`
`95`	`129`	`/*`
`96`		`- * Implement Unicode Default Case Conversion algorithm.`
	`130`	`+ * If str_casekind is CaseLower or CaseUpper, map each character in the string`
	`131`	`+ * for which a mapping is available.`
`97`	`132`	`*`
`98`		`- * Map each character in the string for which a mapping is available.`
	`133`	`+ * If str_casekind is CaseTitle, maps characters found on a word boundary to`
	`134`	`+ * uppercase and other characters to lowercase.`
`99`	`135`	`*/`
`100`	`136`	`staticsize_t`
`101`	`137`	`convert_case(chardst,size_tdstsize,constcharsrc,ssize_tsrclen,`
`102`		`-CaseKindcasekind)`
	`138`	`+CaseKindstr_casekind,WordBoundaryNextwbnext,void*wbstate)`
`103`	`139`	`{`
	`140`	`+/* character CaseKind varies while titlecasing */`
	`141`	`+CaseKindchr_casekind=str_casekind;`
`104`	`142`	`size_tsrcoff=0;`
`105`	`143`	`size_tresult_len=0;`
	`144`	`+size_tboundary=0;`
	`145`	`+`
	`146`	`+Assert((str_casekind==CaseTitle&&wbnext&&wbstate)\|\|`
	`147`	`+ (str_casekind!=CaseTitle&& !wbnext&& !wbstate));`
	`148`	`+`
	`149`	`+if (str_casekind==CaseTitle)`
	`150`	`+{`
	`151`	`+boundary=wbnext(wbstate);`
	`152`	`+Assert(boundary==0);/* start of text is always a boundary */`
	`153`	`+}`
`106`	`154`
`107`	`155`	`while ((srclen<0\|\|srcoff<srclen)&&src[srcoff]!='\0')`
`108`	`156`	`{`
`109`	`157`	`pg_wcharu1=utf8_to_unicode((unsignedchar*)src+srcoff);`
`110`	`158`	`intu1len=unicode_utf8len(u1);`
`111`	`159`	`constpg_case_map*casemap=find_case_map(u1);`
`112`	`160`
	`161`	`+if (str_casekind==CaseTitle)`
	`162`	`+{`
	`163`	`+if (srcoff==boundary)`
	`164`	`+{`
	`165`	`+chr_casekind=CaseUpper;`
	`166`	`+boundary=wbnext(wbstate);`
	`167`	`+}`
	`168`	`+else`
	`169`	`+chr_casekind=CaseLower;`
	`170`	`+}`
	`171`	`+`
	`172`	`+/* perform mapping, update result_len, and write to dst */`
`113`	`173`	`if (casemap)`
`114`	`174`	`{`
`115`		`-pg_wcharu2=casemap->simplemap[casekind];`
	`175`	`+pg_wcharu2=casemap->simplemap[chr_casekind];`
`116`	`176`	`pg_wcharu2len=unicode_utf8len(u2);`
`117`	`177`
`118`	`178`	`if (result_len+u2len <=dstsize)`

`‎src/include/common/unicode_case.h‎`

Lines changed: 5 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -16,11 +16,16 @@`
`16`	`16`
`17`	`17`	`#include"mb/pg_wchar.h"`
`18`	`18`
	`19`	`+typedefsize_t (WordBoundaryNext) (voidwbstate);`
	`20`	`+`
`19`	`21`	`pg_wcharunicode_lowercase_simple(pg_wcharucs);`
`20`	`22`	`pg_wcharunicode_titlecase_simple(pg_wcharucs);`
`21`	`23`	`pg_wcharunicode_uppercase_simple(pg_wcharucs);`
`22`	`24`	`size_tunicode_strlower(chardst,size_tdstsize,constcharsrc,`
`23`	`25`	`ssize_tsrclen);`
	`26`	`+size_tunicode_strtitle(chardst,size_tdstsize,constcharsrc,`
	`27`	`+ssize_tsrclen,WordBoundaryNextwbnext,`
	`28`	`+void*wbstate);`
`24`	`29`	`size_tunicode_strupper(chardst,size_tdstsize,constcharsrc,`
`25`	`30`	`ssize_tsrclen);`
`26`	`31`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 46e5441

File tree

3 files changed

3 files changed

`‎src/backend/utils/adt/formatting.c‎`

`‎src/common/unicode_case.c‎`

`‎src/include/common/unicode_case.h‎`

0 commit comments