Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 46e5441

Browse files
committed
Add unicode_strtitle() for Unicode Default Case Conversion.
This brings the titlecasing implementation for the builtin provider out of formatting.c and into unicode_case.c, along with unicode_strlower() and unicode_strupper(). Accepts an arbitrary word boundary callback. Simple for now, but can be extended to support the Unicode Default Case Conversion algorithm with full case mapping. Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com Reviewed-by: Peter Eisentraut
1 parent a96a8b1, commit 46e5441

File tree

3 files changed

+140
-48
lines changed

3 files changed

+140
-48
lines changed

‎src/backend/utils/adt/formatting.c‎

Lines changed: 67 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
19221922
returnresult;
19231923
}
19241924

1925+
structWordBoundaryState
1926+
{
1927+
constchar*str;
1928+
size_tlen;
1929+
size_toffset;
1930+
boolinit;
1931+
boolprev_alnum;
1932+
};
1933+
1934+
/*
1935+
* Simple word boundary iterator that draws boundaries each time the result of
1936+
* pg_u_isalnum() changes.
1937+
*/
1938+
staticsize_t
1939+
initcap_wbnext(void*state)
1940+
{
1941+
structWordBoundaryState*wbstate= (structWordBoundaryState*)state;
1942+
1943+
while (wbstate->offset<wbstate->len&&
1944+
wbstate->str[wbstate->offset]!='\0')
1945+
{
1946+
pg_wcharu=utf8_to_unicode((unsignedchar*)wbstate->str+
1947+
wbstate->offset);
1948+
boolcurr_alnum=pg_u_isalnum(u, true);
1949+
1950+
if (!wbstate->init||curr_alnum!=wbstate->prev_alnum)
1951+
{
1952+
size_tprev_offset=wbstate->offset;
1953+
1954+
wbstate->init= true;
1955+
wbstate->offset+=unicode_utf8len(u);
1956+
wbstate->prev_alnum=curr_alnum;
1957+
returnprev_offset;
1958+
}
1959+
1960+
wbstate->offset+=unicode_utf8len(u);
1961+
}
1962+
1963+
returnwbstate->len;
1964+
}
1965+
19251966
/*
19261967
* collation-aware, wide-character-aware initcap function
19271968
*
@@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
19802021
#endif
19812022
if (mylocale&&mylocale->provider==COLLPROVIDER_BUILTIN)
19822023
{
1983-
constunsignedchar*src= (unsignedchar*)buff;
2024+
constchar*src=buff;
19842025
size_tsrclen=nbytes;
1985-
unsignedchar*dst;
19862026
size_tdstsize;
1987-
intsrcoff=0;
1988-
intdstoff=0;
2027+
char*dst;
2028+
size_tneeded;
2029+
structWordBoundaryStatewbstate= {
2030+
.str=src,
2031+
.len=srclen,
2032+
.offset=0,
2033+
.init= false,
2034+
.prev_alnum= false,
2035+
};
19892036

19902037
Assert(GetDatabaseEncoding()==PG_UTF8);
19912038

1992-
/* overflow paranoia */
1993-
if ((srclen+1)> (INT_MAX /MAX_MULTIBYTE_CHAR_LEN))
1994-
ereport(ERROR,
1995-
(errcode(ERRCODE_OUT_OF_MEMORY),
1996-
errmsg("out of memory")));
1997-
1998-
/* result is at most srclen codepoints plus terminating NUL */
1999-
dstsize=srclen*MAX_MULTIBYTE_CHAR_LEN+1;
2000-
dst= (unsignedchar*)palloc(dstsize);
2039+
/* first try buffer of equal size plus terminating NUL */
2040+
dstsize=srclen+1;
2041+
dst=palloc(dstsize);
20012042

2002-
while (srcoff<nbytes)
2043+
needed=unicode_strtitle(dst,dstsize,src,srclen,
2044+
initcap_wbnext,&wbstate);
2045+
if (needed+1>dstsize)
20032046
{
2004-
pg_wcharu1=utf8_to_unicode(src+srcoff);
2005-
pg_wcharu2;
2006-
intu1len=unicode_utf8len(u1);
2007-
intu2len;
2008-
2009-
if (wasalnum)
2010-
u2=unicode_lowercase_simple(u1);
2011-
else
2012-
u2=unicode_uppercase_simple(u1);
2047+
/* reset iterator */
2048+
wbstate.offset=0;
2049+
wbstate.init= false;
20132050

2014-
u2len=unicode_utf8len(u2);
2015-
2016-
Assert(dstoff+u2len+1 <=dstsize);
2017-
2018-
wasalnum=pg_u_isalnum(u2, true);
2019-
2020-
unicode_to_utf8(u2,dst+dstoff);
2021-
srcoff+=u1len;
2022-
dstoff+=u2len;
2051+
/* grow buffer if needed and retry */
2052+
dstsize=needed+1;
2053+
dst=repalloc(dst,dstsize);
2054+
needed=unicode_strtitle(dst,dstsize,src,srclen,
2055+
initcap_wbnext,&wbstate);
2056+
Assert(needed+1==dstsize);
20232057
}
20242058

2025-
Assert(dstoff+1 <=dstsize);
2026-
*(dst+dstoff)='\0';
2027-
dstoff++;
2028-
2029-
/* allocate result buffer of the right size and free workspace */
2030-
result=palloc(dstoff);
2031-
memcpy(result,dst,dstoff);
2032-
pfree(dst);
2059+
result=dst;
20332060
}
20342061
else
20352062
{

‎src/common/unicode_case.c‎

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@
2121
#include"mb/pg_wchar.h"
2222

2323
staticconstpg_case_map*find_case_map(pg_wcharucs);
24-
staticsize_tconvert_case(char*dst,size_tdstsize,constchar*src,
25-
ssize_tsrclen,CaseKindcasekind);
24+
staticsize_tconvert_case(char*dst,size_tdstsize,constchar*src,ssize_tsrclen,
25+
CaseKindstr_casekind,WordBoundaryNextwbnext,
26+
void*wbstate);
2627

2728
pg_wchar
2829
unicode_lowercase_simple(pg_wcharcode)
@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
6768
size_t
6869
unicode_strlower(char*dst,size_tdstsize,constchar*src,ssize_tsrclen)
6970
{
70-
returnconvert_case(dst,dstsize,src,srclen,CaseLower);
71+
returnconvert_case(dst,dstsize,src,srclen,CaseLower,NULL,NULL);
72+
}
73+
74+
/*
75+
* unicode_strtitle()
76+
*
77+
* Convert src to titlecase, and return the result length (not including
78+
* terminating NUL).
79+
*
80+
* String src must be encoded in UTF-8. If srclen < 0, src must be
81+
* NUL-terminated.
82+
*
83+
* Result string is stored in dst, truncating if larger than dstsize. If
84+
* dstsize is greater than the result length, dst will be NUL-terminated;
85+
* otherwise not.
86+
*
87+
* If dstsize is zero, dst may be NULL. This is useful for calculating the
88+
* required buffer size before allocating.
89+
*
90+
* Titlecasing requires knowledge about word boundaries, which is provided by
91+
* the callback wbnext. A word boundary is the offset of the start of a word
92+
* or the offset of the character immediately following a word.
93+
*
94+
* The caller is expected to initialize and free the callback state
95+
* wbstate. The callback should first return offset 0 for the first boundary;
96+
* then the offset of each subsequent word boundary; then the total length of
97+
* the string to indicate the final boundary.
98+
*/
99+
size_t
100+
unicode_strtitle(char*dst,size_tdstsize,constchar*src,ssize_tsrclen,
101+
WordBoundaryNextwbnext,void*wbstate)
102+
{
103+
returnconvert_case(dst,dstsize,src,srclen,CaseTitle,wbnext,
104+
wbstate);
71105
}
72106

73107
/*
@@ -89,30 +123,56 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
89123
size_t
90124
unicode_strupper(char*dst,size_tdstsize,constchar*src,ssize_tsrclen)
91125
{
92-
returnconvert_case(dst,dstsize,src,srclen,CaseUpper);
126+
returnconvert_case(dst,dstsize,src,srclen,CaseUpper,NULL,NULL);
93127
}
94128

95129
/*
96-
* Implement Unicode Default Case Conversion algorithm.
130+
* If str_casekind is CaseLower or CaseUpper, map each character in the string
131+
* for which a mapping is available.
97132
*
98-
* Map each character in the string for which a mapping is available.
133+
* If str_casekind is CaseTitle, maps characters found on a word boundary to
134+
* uppercase and other characters to lowercase.
99135
*/
100136
staticsize_t
101137
convert_case(char*dst,size_tdstsize,constchar*src,ssize_tsrclen,
102-
CaseKindcasekind)
138+
CaseKindstr_casekind,WordBoundaryNextwbnext,void*wbstate)
103139
{
140+
/* character CaseKind varies while titlecasing */
141+
CaseKindchr_casekind=str_casekind;
104142
size_tsrcoff=0;
105143
size_tresult_len=0;
144+
size_tboundary=0;
145+
146+
Assert((str_casekind==CaseTitle&&wbnext&&wbstate)||
147+
(str_casekind!=CaseTitle&& !wbnext&& !wbstate));
148+
149+
if (str_casekind==CaseTitle)
150+
{
151+
boundary=wbnext(wbstate);
152+
Assert(boundary==0);/* start of text is always a boundary */
153+
}
106154

107155
while ((srclen<0||srcoff<srclen)&&src[srcoff]!='\0')
108156
{
109157
pg_wcharu1=utf8_to_unicode((unsignedchar*)src+srcoff);
110158
intu1len=unicode_utf8len(u1);
111159
constpg_case_map*casemap=find_case_map(u1);
112160

161+
if (str_casekind==CaseTitle)
162+
{
163+
if (srcoff==boundary)
164+
{
165+
chr_casekind=CaseUpper;
166+
boundary=wbnext(wbstate);
167+
}
168+
else
169+
chr_casekind=CaseLower;
170+
}
171+
172+
/* perform mapping, update result_len, and write to dst */
113173
if (casemap)
114174
{
115-
pg_wcharu2=casemap->simplemap[casekind];
175+
pg_wcharu2=casemap->simplemap[chr_casekind];
116176
pg_wcharu2len=unicode_utf8len(u2);
117177

118178
if (result_len+u2len <=dstsize)

‎src/include/common/unicode_case.h‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,16 @@
1616

1717
#include"mb/pg_wchar.h"
1818

19+
typedefsize_t (*WordBoundaryNext) (void*wbstate);
20+
1921
pg_wcharunicode_lowercase_simple(pg_wcharucs);
2022
pg_wcharunicode_titlecase_simple(pg_wcharucs);
2123
pg_wcharunicode_uppercase_simple(pg_wcharucs);
2224
size_tunicode_strlower(char*dst,size_tdstsize,constchar*src,
2325
ssize_tsrclen);
26+
size_tunicode_strtitle(char*dst,size_tdstsize,constchar*src,
27+
ssize_tsrclen,WordBoundaryNextwbnext,
28+
void*wbstate);
2429
size_tunicode_strupper(char*dst,size_tdstsize,constchar*src,
2530
ssize_tsrclen);
2631

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp