Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitea1db8a

Browse files
committed
Canonicalize ICU locale names to language tags.
Convert to BCP47 language tags before storing in the catalog, except during binary upgrade or when the locale comes from an existing collation or template database.

The resulting language tags can vary slightly between ICU versions. For instance, "@colBackwards=yes" is converted to "und-u-kb-true" in older versions of ICU, and to the simpler (but equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more input locale string formats than ucol_open(). For instance, "fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is ignored; effectively treating it the same as the locale "fr" and opening the wrong collator. Canonicalization properly interprets the language and region, resulting in the language tag "fr-CA", which can then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open() misinterpreting locale strings as described above. For instance, creating an ICU collation with locale "fr_CA.UTF-8" would store that string directly in the catalog, which would later be passed to (and misinterpreted by) ucol_open(). After this commit, the locale string will be canonicalized to language tag "fr-CA" in the catalog, which will be properly understood by ucol_open(). Because this fix affects the resulting collator, we cannot change the locale string stored in the catalog for existing databases or collations; otherwise we'd risk corrupting indexes. Therefore, only canonicalize locales for newly-created (not upgraded) collations/databases. For similar reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut
1 parentd3d53f9 commitea1db8a

File tree

10 files changed

+258
-27
lines changed

10 files changed

+258
-27
lines changed

‎doc/src/sgml/charset.sgml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -893,7 +893,7 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
893893
The first example selects the ICU locale using a <quote>language
894894
tag</quote> per BCP 47. The second example uses the traditional
895895
ICU-specific locale syntax. The first style is preferred going
896-
forward, but it is not supported by older ICU versions.
896+
forward, and is used internally to store locales.
897897
</para>
898898
<para>
899899
Note that you can name the collation objects in the SQL environment

‎src/backend/commands/collationcmds.c

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,11 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
165165
else
166166
colliculocale=NULL;
167167

168+
/*
169+
* When the ICU locale comes from an existing collation, do not
170+
* canonicalize to a language tag.
171+
*/
172+
168173
datum=SysCacheGetAttr(COLLOID,tp,Anum_pg_collation_collicurules,&isnull);
169174
if (!isnull)
170175
collicurules=TextDatumGetCString(datum);
@@ -259,6 +264,25 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
259264
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
260265
errmsg("parameter \"locale\" must be specified")));
261266

267+
/*
268+
* During binary upgrade, preserve the locale string. Otherwise,
269+
* canonicalize to a language tag.
270+
*/
271+
if (!IsBinaryUpgrade)
272+
{
273+
char*langtag=icu_language_tag(colliculocale,
274+
icu_validation_level);
275+
276+
if (langtag&&strcmp(colliculocale,langtag)!=0)
277+
{
278+
ereport(NOTICE,
279+
(errmsg("using standard form \"%s\" for locale \"%s\"",
280+
langtag,colliculocale)));
281+
282+
colliculocale=langtag;
283+
}
284+
}
285+
262286
icu_validate_locale(colliculocale);
263287
}
264288

@@ -569,26 +593,6 @@ cmpaliases(const void *a, const void *b)
569593

570594

571595
#ifdefUSE_ICU
572-
/*
573-
* Get the ICU language tag for a locale name.
574-
* The result is a palloc'd string.
575-
*/
576-
staticchar*
577-
get_icu_language_tag(constchar*localename)
578-
{
579-
charbuf[ULOC_FULLNAME_CAPACITY];
580-
UErrorCodestatus;
581-
582-
status=U_ZERO_ERROR;
583-
uloc_toLanguageTag(localename,buf,sizeof(buf), true,&status);
584-
if (U_FAILURE(status))
585-
ereport(ERROR,
586-
(errmsg("could not convert locale name \"%s\" to language tag: %s",
587-
localename,u_errorName(status))));
588-
589-
returnpstrdup(buf);
590-
}
591-
592596
/*
593597
* Get a comment (specifically, the display name) for an ICU locale.
594598
* The result is a palloc'd string, or NULL if we can't get a comment
@@ -950,7 +954,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
950954
else
951955
name=uloc_getAvailable(i);
952956

953-
langtag=get_icu_language_tag(name);
957+
langtag=icu_language_tag(name,ERROR);
954958

955959
/*
956960
* Be paranoid about not allowing any non-ASCII strings into

‎src/backend/commands/dbcommands.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,6 +1058,26 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
10581058
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10591059
errmsg("ICU locale must be specified")));
10601060

1061+
/*
1062+
* During binary upgrade, or when the locale came from the template
1063+
* database, preserve locale string. Otherwise, canonicalize to a
1064+
* language tag.
1065+
*/
1066+
if (!IsBinaryUpgrade&&dbiculocale!=src_iculocale)
1067+
{
1068+
char*langtag=icu_language_tag(dbiculocale,
1069+
icu_validation_level);
1070+
1071+
if (langtag&&strcmp(dbiculocale,langtag)!=0)
1072+
{
1073+
ereport(NOTICE,
1074+
(errmsg("using standard form \"%s\" for locale \"%s\"",
1075+
langtag,dbiculocale)));
1076+
1077+
dbiculocale=langtag;
1078+
}
1079+
}
1080+
10611081
icu_validate_locale(dbiculocale);
10621082
}
10631083
else

‎src/backend/utils/adt/pg_locale.c

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2826,6 +2826,91 @@ icu_set_collation_attributes(UCollator *collator, const char *loc,
28262826

28272827
#endif
28282828

2829+
/*
2830+
* Return the BCP47 language tag representation of the requested locale.
2831+
*
2832+
* This function should be called before passing the string to ucol_open(),
2833+
* because conversion to a language tag also performs "level 2
2834+
* canonicalization". In addition to producing a consistent format, level 2
2835+
* canonicalization is able to more accurately interpret different input
2836+
* locale string formats, such as POSIX and .NET IDs.
2837+
*/
2838+
char*
2839+
icu_language_tag(constchar*loc_str,intelevel)
2840+
{
2841+
#ifdefUSE_ICU
2842+
UErrorCodestatus;
2843+
charlang[ULOC_LANG_CAPACITY];
2844+
char*langtag;
2845+
size_tbuflen=32;/* arbitrary starting buffer size */
2846+
constboolstrict= true;
2847+
2848+
status=U_ZERO_ERROR;
2849+
uloc_getLanguage(loc_str,lang,ULOC_LANG_CAPACITY,&status);
2850+
if (U_FAILURE(status))
2851+
{
2852+
if (elevel>0)
2853+
ereport(elevel,
2854+
(errmsg("could not get language from locale \"%s\": %s",
2855+
loc_str,u_errorName(status))));
2856+
returnNULL;
2857+
}
2858+
2859+
/* C/POSIX locales aren't handled by uloc_toLanguageTag() */
2860+
if (strcmp(lang,"c")==0||strcmp(lang,"posix")==0)
2861+
returnpstrdup("en-US-u-va-posix");
2862+
2863+
/*
2864+
* A BCP47 language tag doesn't have a clearly-defined upper limit
2865+
* (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
2866+
* uloc_toLanguageTag() doesn't always return the ultimate length on the
2867+
* first call, necessitating a loop.
2868+
*/
2869+
langtag=palloc(buflen);
2870+
while (true)
2871+
{
2872+
int32_tlen;
2873+
2874+
status=U_ZERO_ERROR;
2875+
len=uloc_toLanguageTag(loc_str,langtag,buflen,strict,&status);
2876+
2877+
/*
2878+
* If the result fits in the buffer exactly (len == buflen),
2879+
* uloc_toLanguageTag() will return success without nul-terminating
2880+
* the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
2881+
* buflen and try again.
2882+
*/
2883+
if ((status==U_BUFFER_OVERFLOW_ERROR||
2884+
(U_SUCCESS(status)&&len >=buflen))&&
2885+
buflen<MaxAllocSize)
2886+
{
2887+
buflen=Min(buflen*2,MaxAllocSize);
2888+
langtag=repalloc(langtag,buflen);
2889+
continue;
2890+
}
2891+
2892+
break;
2893+
}
2894+
2895+
if (U_FAILURE(status))
2896+
{
2897+
pfree(langtag);
2898+
2899+
if (elevel>0)
2900+
ereport(elevel,
2901+
(errmsg("could not convert locale name \"%s\" to language tag: %s",
2902+
loc_str,u_errorName(status))));
2903+
returnNULL;
2904+
}
2905+
2906+
returnlangtag;
2907+
#else/* not USE_ICU */
2908+
ereport(ERROR,
2909+
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2910+
errmsg("ICU is not supported in this build")));
2911+
#endif/* not USE_ICU */
2912+
}
2913+
28292914
/*
28302915
* Perform best-effort check that the locale is a valid one.
28312916
*/

‎src/bin/initdb/initdb.c

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2229,6 +2229,78 @@ check_icu_locale_encoding(int user_enc)
22292229
return true;
22302230
}
22312231

2232+
/*
2233+
* Convert to canonical BCP47 language tag. Must be consistent with
2234+
* icu_language_tag().
2235+
*/
2236+
staticchar*
2237+
icu_language_tag(constchar*loc_str)
2238+
{
2239+
#ifdefUSE_ICU
2240+
UErrorCodestatus;
2241+
charlang[ULOC_LANG_CAPACITY];
2242+
char*langtag;
2243+
size_tbuflen=32;/* arbitrary starting buffer size */
2244+
constboolstrict= true;
2245+
2246+
status=U_ZERO_ERROR;
2247+
uloc_getLanguage(loc_str,lang,ULOC_LANG_CAPACITY,&status);
2248+
if (U_FAILURE(status))
2249+
{
2250+
pg_fatal("could not get language from locale \"%s\": %s",
2251+
loc_str,u_errorName(status));
2252+
returnNULL;
2253+
}
2254+
2255+
/* C/POSIX locales aren't handled by uloc_toLanguageTag() */
2256+
if (strcmp(lang,"c")==0||strcmp(lang,"posix")==0)
2257+
returnpstrdup("en-US-u-va-posix");
2258+
2259+
/*
2260+
* A BCP47 language tag doesn't have a clearly-defined upper limit
2261+
* (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
2262+
* uloc_toLanguageTag() doesn't always return the ultimate length on the
2263+
* first call, necessitating a loop.
2264+
*/
2265+
langtag=pg_malloc(buflen);
2266+
while (true)
2267+
{
2268+
int32_tlen;
2269+
2270+
status=U_ZERO_ERROR;
2271+
len=uloc_toLanguageTag(loc_str,langtag,buflen,strict,&status);
2272+
2273+
/*
2274+
* If the result fits in the buffer exactly (len == buflen),
2275+
* uloc_toLanguageTag() will return success without nul-terminating
2276+
* the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
2277+
* buflen and try again.
2278+
*/
2279+
if (status==U_BUFFER_OVERFLOW_ERROR||
2280+
(U_SUCCESS(status)&&len >=buflen))
2281+
{
2282+
buflen=buflen*2;
2283+
langtag=pg_realloc(langtag,buflen);
2284+
continue;
2285+
}
2286+
2287+
break;
2288+
}
2289+
2290+
if (U_FAILURE(status))
2291+
{
2292+
pg_free(langtag);
2293+
2294+
pg_fatal("could not convert locale name \"%s\" to language tag: %s",
2295+
loc_str,u_errorName(status));
2296+
}
2297+
2298+
returnlangtag;
2299+
#else
2300+
pg_fatal("ICU is not supported in this build");
2301+
#endif
2302+
}
2303+
22322304
/*
22332305
* Perform best-effort check that the locale is a valid one. Should be
22342306
* consistent with pg_locale.c, except that it doesn't need to open the
@@ -2376,13 +2448,22 @@ setlocales(void)
23762448

23772449
if (locale_provider==COLLPROVIDER_ICU)
23782450
{
2451+
char*langtag;
2452+
23792453
/* acquire default locale from the environment, if not specified */
23802454
if (icu_locale==NULL)
23812455
{
23822456
icu_locale=default_icu_locale();
23832457
printf(_("Using default ICU locale \"%s\".\n"),icu_locale);
23842458
}
23852459

2460+
/* canonicalize to a language tag */
2461+
langtag=icu_language_tag(icu_locale);
2462+
printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"),
2463+
langtag,icu_locale);
2464+
pg_free(icu_locale);
2465+
icu_locale=langtag;
2466+
23862467
icu_validate_locale(icu_locale);
23872468

23882469
/*

‎src/bin/initdb/t/001_initdb.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@
144144
'--locale-provider=icu',
145145
'--icu-locale=@colNumeric=lower',"$tempdir/dataX"
146146
],
147-
qr/could not open collator for locale "\@colNumeric=lower": U_ILLEGAL_ARGUMENT_ERROR/,
147+
qr/could not open collator for locale "und-u-kn-lower": U_ILLEGAL_ARGUMENT_ERROR/,
148148
'fails for invalid collation argument');
149149
}
150150
else

‎src/bin/pg_dump/t/002_pg_dump.pl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1860,9 +1860,9 @@
18601860

18611861
'CREATE COLLATION icu_collation'=> {
18621862
create_order=> 76,
1863-
create_sql=>"CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'C');",
1863+
create_sql=>"CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'en-US-u-va-posix');",
18641864
regexp=>
1865-
qr/CREATE COLLATION public.icu_collation\(provider = icu, locale = 'C'(, version = '[^']*')?\);/m,
1865+
qr/CREATE COLLATION public.icu_collation\(provider = icu, locale = 'en-US-u-va-posix'(, version = '[^']*')?\);/m,
18661866
icu=> 1,
18671867
like=> {%full_runs,section_pre_data=> 1, },
18681868
},

‎src/include/utils/pg_locale.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
120120
size_tsrclen,pg_locale_tlocale);
121121

122122
externvoidicu_validate_locale(constchar*loc_str);
123+
externchar*icu_language_tag(constchar*loc_str,intelevel);
123124

124125
#ifdefUSE_ICU
125126
externint32_ticu_to_uchar(UChar**buff_uchar,constchar*buff,size_tnbytes);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp