Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit bfc5992

Browse files
committed
Add SQL function CASEFOLD().
Useful for caseless matching. Similar to LOWER(), but avoids edge-case problems with using LOWER() for caseless matching. For collations that support it, CASEFOLD() handles characters with more than two case variations or multi-character case variations. Some characters may fold to uppercase. The results of case folding are also more stable across Unicode versions than LOWER() or UPPER(). Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com Reviewed-by: Ian Lawrence Barwick
1 parent f15538c commit bfc5992

File tree

14 files changed

+278
-3
lines changed

14 files changed

+278
-3
lines changed

‎doc/src/sgml/func.sgml

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2596,7 +2596,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
25962596

25972597
<row>
25982598
<entry role="func_table_entry"><para role="func_signature">
2599-
<indexterm>
2599+
<indexterm id="function-lower">
26002600
<primary>lower</primary>
26012601
</indexterm>
26022602
<function>lower</function> ( <type>text</type> )
@@ -2657,7 +2657,7 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
26572657

26582658
<row>
26592659
<entry role="func_table_entry"><para role="func_signature">
2660-
<indexterm>
2660+
<indexterm id="function-normalize">
26612661
<primary>normalize</primary>
26622662
</indexterm>
26632663
<indexterm>
@@ -3109,6 +3109,48 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
31093109
</para></entry>
31103110
</row>
31113111

3112+
<row>
3113+
<entry role="func_table_entry"><para role="func_signature">
3114+
<indexterm>
3115+
<primary>casefold</primary>
3116+
</indexterm>
3117+
<function>casefold</function> ( <type>text</type> )
3118+
<returnvalue>text</returnvalue>
3119+
</para>
3120+
<para>
3121+
Performs case folding of the input string according to the collation.
3122+
Case folding is similar to case conversion, but the purpose of case
3123+
folding is to facilitate case-insensitive comparison of strings,
3124+
whereas the purpose of case conversion is to convert to a particular
3125+
cased form. This function can only be used when the server encoding
3126+
is <literal>UTF8</literal>.
3127+
</para>
3128+
<para>
3129+
Ordinarily, case folding simply converts to lowercase, but there are a
3130+
few notable exceptions depending on the collation. For instance, the
3131+
character <literal>Σ</literal> (U+03A3) has two lowercase forms:
3132+
<literal>σ</literal> (U+03C3) and <literal>ς</literal> (U+03C2); case
3133+
folding in the <literal>PG_C_UTF8</literal> collation maps all three
3134+
forms to <literal>σ</literal>. Additionally, the result is not
3135+
necessarily lowercase; some characters may be folded to uppercase.
3136+
</para>
3137+
<para>
3138+
Case folding may change the length of the string. For instance, in
3139+
the <literal>PG_UNICODE_FAST</literal> collation, <literal>ß</literal>
3140+
(U+00DF) folds to <literal>ss</literal>.
3141+
</para>
3142+
<para>
3143+
<function>casefold</function> can be used for Unicode Default Caseless
3144+
Matching. It does not always preserve the normalized form of the
3145+
input string (see <xref linkend="function-normalize"/>).
3146+
</para>
3147+
<para>
3148+
The <literal>libc</literal> provider doesn't support case folding, so
3149+
<function>casefold</function> is identical to <xref
3150+
linkend="function-lower"/>.
3151+
</para></entry>
3152+
</row>
3153+
31123154
<row>
31133155
<entry role="func_table_entry"><para role="func_signature">
31143156
<indexterm>

‎src/backend/utils/adt/formatting.c

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1819,6 +1819,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
18191819
returnresult;
18201820
}
18211821

1822+
/*
1823+
* collation-aware, wide-character-aware case folding
1824+
*
1825+
* We pass the number of bytes so we can pass varlena and char*
1826+
* to this function. The result is a palloc'd, null-terminated string.
1827+
*/
1828+
char*
1829+
str_casefold(constchar*buff,size_tnbytes,Oidcollid)
1830+
{
1831+
char*result;
1832+
pg_locale_tmylocale;
1833+
1834+
if (!buff)
1835+
returnNULL;
1836+
1837+
if (!OidIsValid(collid))
1838+
{
1839+
/*
1840+
* This typically means that the parser could not resolve a conflict
1841+
* of implicit collations, so report it that way.
1842+
*/
1843+
ereport(ERROR,
1844+
(errcode(ERRCODE_INDETERMINATE_COLLATION),
1845+
errmsg("could not determine which collation to use for %s function",
1846+
"lower()"),
1847+
errhint("Use the COLLATE clause to set the collation explicitly.")));
1848+
}
1849+
1850+
if (GetDatabaseEncoding()!=PG_UTF8)
1851+
ereport(ERROR,
1852+
(errcode(ERRCODE_SYNTAX_ERROR),
1853+
errmsg("Unicode case folding can only be performed if server encoding is UTF8")));
1854+
1855+
mylocale=pg_newlocale_from_collation(collid);
1856+
1857+
/* C/POSIX collations use this path regardless of database encoding */
1858+
if (mylocale->ctype_is_c)
1859+
{
1860+
result=asc_tolower(buff,nbytes);
1861+
}
1862+
else
1863+
{
1864+
constchar*src=buff;
1865+
size_tsrclen=nbytes;
1866+
size_tdstsize;
1867+
char*dst;
1868+
size_tneeded;
1869+
1870+
/* first try buffer of equal size plus terminating NUL */
1871+
dstsize=srclen+1;
1872+
dst=palloc(dstsize);
1873+
1874+
needed=pg_strfold(dst,dstsize,src,srclen,mylocale);
1875+
if (needed+1>dstsize)
1876+
{
1877+
/* grow buffer if needed and retry */
1878+
dstsize=needed+1;
1879+
dst=repalloc(dst,dstsize);
1880+
needed=pg_strfold(dst,dstsize,src,srclen,mylocale);
1881+
Assert(needed+1 <=dstsize);
1882+
}
1883+
1884+
Assert(dst[needed]=='\0');
1885+
result=dst;
1886+
}
1887+
1888+
returnresult;
1889+
}
1890+
18221891
/*
18231892
* ASCII-only lower function
18241893
*

‎src/backend/utils/adt/oracle_compat.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS)
126126
PG_RETURN_TEXT_P(result);
127127
}
128128

129+
Datum
130+
casefold(PG_FUNCTION_ARGS)
131+
{
132+
text*in_string=PG_GETARG_TEXT_PP(0);
133+
char*out_string;
134+
text*result;
135+
136+
out_string=str_casefold(VARDATA_ANY(in_string),
137+
VARSIZE_ANY_EXHDR(in_string),
138+
PG_GET_COLLATION());
139+
result=cstring_to_text(out_string);
140+
pfree(out_string);
141+
142+
PG_RETURN_TEXT_P(result);
143+
}
144+
129145

130146
/********************************************************************
131147
*

‎src/backend/utils/adt/pg_locale.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,17 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
106106
ssize_tsrclen,pg_locale_tlocale);
107107
externsize_tstrupper_builtin(char*dst,size_tdstsize,constchar*src,
108108
ssize_tsrclen,pg_locale_tlocale);
109+
externsize_tstrfold_builtin(char*dst,size_tdstsize,constchar*src,
110+
ssize_tsrclen,pg_locale_tlocale);
109111

110112
externsize_tstrlower_icu(char*dst,size_tdstsize,constchar*src,
111113
ssize_tsrclen,pg_locale_tlocale);
112114
externsize_tstrtitle_icu(char*dst,size_tdstsize,constchar*src,
113115
ssize_tsrclen,pg_locale_tlocale);
114116
externsize_tstrupper_icu(char*dst,size_tdstsize,constchar*src,
115117
ssize_tsrclen,pg_locale_tlocale);
118+
externsize_tstrfold_icu(char*dst,size_tdstsize,constchar*src,
119+
ssize_tsrclen,pg_locale_tlocale);
116120

117121
externsize_tstrlower_libc(char*dst,size_tdstsize,constchar*src,
118122
ssize_tsrclen,pg_locale_tlocale);
@@ -1447,6 +1451,26 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
14471451
return0;/* keep compiler quiet */
14481452
}
14491453

1454+
size_t
1455+
pg_strfold(char*dst,size_tdstsize,constchar*src,ssize_tsrclen,
1456+
pg_locale_tlocale)
1457+
{
1458+
if (locale->provider==COLLPROVIDER_BUILTIN)
1459+
returnstrfold_builtin(dst,dstsize,src,srclen,locale);
1460+
#ifdefUSE_ICU
1461+
elseif (locale->provider==COLLPROVIDER_ICU)
1462+
returnstrfold_icu(dst,dstsize,src,srclen,locale);
1463+
#endif
1464+
/* for libc, just use strlower */
1465+
elseif (locale->provider==COLLPROVIDER_LIBC)
1466+
returnstrlower_libc(dst,dstsize,src,srclen,locale);
1467+
else
1468+
/* shouldn't happen */
1469+
PGLOCALE_SUPPORT_ERROR(locale->provider);
1470+
1471+
return0;/* keep compiler quiet */
1472+
}
1473+
14501474
/*
14511475
* pg_strcoll
14521476
*

‎src/backend/utils/adt/pg_locale_builtin.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
3131
ssize_tsrclen,pg_locale_tlocale);
3232
externsize_tstrupper_builtin(char*dst,size_tdstsize,constchar*src,
3333
ssize_tsrclen,pg_locale_tlocale);
34+
externsize_tstrfold_builtin(char*dst,size_tdstsize,constchar*src,
35+
ssize_tsrclen,pg_locale_tlocale);
3436

3537

3638
structWordBoundaryState
@@ -107,6 +109,14 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
107109
locale->info.builtin.casemap_full);
108110
}
109111

112+
size_t
113+
strfold_builtin(char*dest,size_tdestsize,constchar*src,ssize_tsrclen,
114+
pg_locale_tlocale)
115+
{
116+
returnunicode_strfold(dest,destsize,src,srclen,
117+
locale->info.builtin.casemap_full);
118+
}
119+
110120
pg_locale_t
111121
create_pg_locale_builtin(Oidcollid,MemoryContextcontext)
112122
{

‎src/backend/utils/adt/pg_locale_icu.c

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
5454
ssize_tsrclen,pg_locale_tlocale);
5555
externsize_tstrupper_icu(char*dst,size_tdstsize,constchar*src,
5656
ssize_tsrclen,pg_locale_tlocale);
57+
externsize_tstrfold_icu(char*dst,size_tdstsize,constchar*src,
58+
ssize_tsrclen,pg_locale_tlocale);
5759

5860
#ifdefUSE_ICU
5961

@@ -117,6 +119,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
117119
constUChar*src,int32_tsrcLength,
118120
constchar*locale,
119121
UErrorCode*pErrorCode);
122+
staticint32_tu_strFoldCase_default(UChar*dest,int32_tdestCapacity,
123+
constUChar*src,int32_tsrcLength,
124+
constchar*locale,
125+
UErrorCode*pErrorCode);
120126

121127
staticconststructcollate_methodscollate_methods_icu= {
122128
.strncoll=strncoll_icu,
@@ -439,6 +445,26 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
439445
returnresult_len;
440446
}
441447

448+
size_t
449+
strfold_icu(char*dest,size_tdestsize,constchar*src,ssize_tsrclen,
450+
pg_locale_tlocale)
451+
{
452+
int32_tlen_uchar;
453+
int32_tlen_conv;
454+
UChar*buff_uchar;
455+
UChar*buff_conv;
456+
size_tresult_len;
457+
458+
len_uchar=icu_to_uchar(&buff_uchar,src,srclen);
459+
len_conv=icu_convert_case(u_strFoldCase_default,locale,
460+
&buff_conv,buff_uchar,len_uchar);
461+
result_len=icu_from_uchar(dest,destsize,buff_conv,len_conv);
462+
pfree(buff_uchar);
463+
pfree(buff_conv);
464+
465+
returnresult_len;
466+
}
467+
442468
/*
443469
* strncoll_icu_utf8
444470
*
@@ -673,6 +699,38 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
673699
NULL,locale,pErrorCode);
674700
}
675701

702+
staticint32_t
703+
u_strFoldCase_default(UChar*dest,int32_tdestCapacity,
704+
constUChar*src,int32_tsrcLength,
705+
constchar*locale,
706+
UErrorCode*pErrorCode)
707+
{
708+
uint32options=U_FOLD_CASE_DEFAULT;
709+
charlang[3];
710+
UErrorCodestatus;
711+
712+
/*
713+
* Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
714+
* folding does not accept a locale. Instead it just supports a single
715+
* option relevant to Turkic languages 'az' and 'tr'; check for those
716+
* languages to enable the option.
717+
*/
718+
status=U_ZERO_ERROR;
719+
uloc_getLanguage(locale,lang,3,&status);
720+
if (U_SUCCESS(status))
721+
{
722+
/*
723+
* The option name is confusing, but it causes u_strFoldCase to use
724+
* the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
725+
*/
726+
if (strcmp(lang,"tr")==0||strcmp(lang,"az")==0)
727+
options=U_FOLD_CASE_EXCLUDE_SPECIAL_I;
728+
}
729+
730+
returnu_strFoldCase(dest,destCapacity,src,srcLength,
731+
options,pErrorCode);
732+
}
733+
676734
/*
677735
* strncoll_icu
678736
*

‎src/include/catalog/catversion.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,6 @@
5757
*/
5858

5959
/*yyyymmddN */
60-
#defineCATALOG_VERSION_NO202501231
60+
#defineCATALOG_VERSION_NO202501232
6161

6262
#endif

‎src/include/catalog/pg_proc.dat

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3623,6 +3623,9 @@
36233623
{ oid => '872', descr => 'capitalize each word',
36243624
proname => 'initcap', prorettype => 'text', proargtypes => 'text',
36253625
prosrc => 'initcap' },
3626+
{ oid => '9569', descr => 'fold case',
3627+
proname => 'casefold', prorettype => 'text', proargtypes => 'text',
3628+
prosrc => 'casefold' },
36263629
{ oid => '873', descr => 'left-pad string to length',
36273630
proname => 'lpad', prorettype => 'text', proargtypes => 'text int4 text',
36283631
prosrc => 'lpad' },

‎src/include/utils/formatting.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
externchar*str_tolower(constchar*buff,size_tnbytes,Oidcollid);
2222
externchar*str_toupper(constchar*buff,size_tnbytes,Oidcollid);
2323
externchar*str_initcap(constchar*buff,size_tnbytes,Oidcollid);
24+
externchar*str_casefold(constchar*buff,size_tnbytes,Oidcollid);
2425

2526
externchar*asc_tolower(constchar*buff,size_tnbytes);
2627
externchar*asc_toupper(constchar*buff,size_tnbytes);

‎src/include/utils/pg_locale.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ extern size_t pg_strtitle(char *dest, size_t destsize,
134134
externsize_tpg_strupper(char*dest,size_tdestsize,
135135
constchar*src,ssize_tsrclen,
136136
pg_locale_tlocale);
137+
externsize_tpg_strfold(char*dest,size_tdestsize,
138+
constchar*src,ssize_tsrclen,
139+
pg_locale_tlocale);
137140
externintpg_strcoll(constchar*arg1,constchar*arg2,pg_locale_tlocale);
138141
externintpg_strncoll(constchar*arg1,ssize_tlen1,
139142
constchar*arg2,ssize_tlen2,pg_locale_tlocale);

‎src/test/regress/expected/collate.icu.utf8.out

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,30 @@ SELECT a, x, y FROM collate_test10 ORDER BY lower(y), a;
255255
1 | hij | hij
256256
(2 rows)
257257

258+
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
259+
lower
260+
-------------------------------
261+
abcd 123 #$% ıiii̇ ß ß dždždž σσς
262+
(1 row)
263+
264+
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "en-x-icu");
265+
casefold
266+
---------------------------------
267+
abcd 123 #$% ıiii̇ ss ss dždždž σσσ
268+
(1 row)
269+
270+
SELECT lower('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
271+
lower
272+
-------------------------------
273+
abcd 123 #$% ıiıi ß ß dždždž σσς
274+
(1 row)
275+
276+
SELECT casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' COLLATE "tr-x-icu");
277+
casefold
278+
---------------------------------
279+
abcd 123 #$% ıiıi ss ss dždždž σσσ
280+
(1 row)
281+
258282
-- LIKE/ILIKE
259283
SELECT * FROM collate_test1 WHERE b LIKE 'abc';
260284
a | b

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp