Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d40d564

Browse files
committed
Add support for other normal forms to Unicode normalization API
It previously only supported NFKC, for use by SASLprep. This expands the API to offer the choice of all four normalization forms. Right now, there are no internal users of the forms other than NFKC.

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
1 parent cedffbd, commit d40d564

File tree

7 files changed

+3727
-3702
lines changed

7 files changed

+3727
-3702
lines changed

‎src/common/saslprep.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1156,7 +1156,7 @@ pg_saslprep(const char *input, char **output)
11561156
* 2) Normalize -- Normalize the result of step 1 using Unicode
11571157
* normalization.
11581158
*/
1159-
output_chars=unicode_normalize_kc(input_chars);
1159+
output_chars=unicode_normalize(UNICODE_NFKC,input_chars);
11601160
if (!output_chars)
11611161
gotooom;
11621162

‎src/common/unicode/generate-norm_test_table.pl

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
{
4949
intlinenum;
5050
pg_wcharinput[50];
51-
pg_wcharoutput[50];
51+
pg_wcharoutput[4][50];
5252
} pg_unicode_test;
5353
5454
/* test table */
@@ -89,13 +89,16 @@ sub codepoint_string_to_hex
8989
my ($source,$nfc,$nfd,$nfkc,$nfkd) =split(';',$line);
9090

9191
my$source_utf8 = codepoint_string_to_hex($source);
92+
my$nfc_utf8 = codepoint_string_to_hex($nfc);
93+
my$nfd_utf8 = codepoint_string_to_hex($nfd);
9294
my$nfkc_utf8 = codepoint_string_to_hex($nfkc);
95+
my$nfkd_utf8 = codepoint_string_to_hex($nfkd);
9396

94-
print$OUTPUT"\t{$linenum, {$source_utf8 }, {$nfkc_utf8 } },\n";
97+
print$OUTPUT"\t{$linenum, {$source_utf8 }, {{$nfc_utf8 }, {$nfd_utf8 }, {$nfkc_utf8 }, {$nfkd_utf8 } } },\n";
9598
}
9699

97100
# Output terminator entry
98-
print$OUTPUT"\t{ 0, { 0 }, {0 } }";
101+
print$OUTPUT"\t{ 0, { 0 }, {{ 0 }, { 0 }, { 0 }, { 0 } } }";
99102
print$OUTPUT"\n};\n";
100103

101104
close$OUTPUT;

‎src/common/unicode/generate-unicode_norm_table.pl

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,12 @@
9999
#define DECOMP_NO_COMPOSE0x80/* don't use for re-composition */
100100
#define DECOMP_INLINE0x40/* decomposition is stored inline in
101101
* dec_index */
102+
#define DECOMP_COMPAT0x20/* compatibility mapping */
102103
103-
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags &0x3F)
104-
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
104+
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags &0x1F)
105+
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags &(DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
105106
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
107+
#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
106108
107109
/* Table of Unicode codepoints and their decompositions */
108110
static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
@@ -136,22 +138,22 @@
136138
# Decomposition size
137139
# Print size of decomposition
138140
my$decomp_size =scalar(@decomp_elts);
141+
dieif$decomp_size > 0x1F;# to not overrun bitmask
139142

140143
my$first_decomp =shift@decomp_elts;
141144

142145
my$flags ="";
143146
my$comment ="";
144147

145-
if ($decomp_size == 2)
148+
if ($compat)
146149
{
150+
$flags .=" | DECOMP_COMPAT";
151+
}
147152

153+
if ($decomp_size == 2)
154+
{
148155
# Should this be used for recomposition?
149-
if ($compat)
150-
{
151-
$flags .=" | DECOMP_NO_COMPOSE";
152-
$comment ="compatibility mapping";
153-
}
154-
elsif ($character_hash{$first_decomp}
156+
if ($character_hash{$first_decomp}
155157
&&$character_hash{$first_decomp}->{class} != 0)
156158
{
157159
$flags .=" | DECOMP_NO_COMPOSE";

‎src/common/unicode/norm_test.c

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,18 +63,21 @@ main(int argc, char **argv)
6363

6464
for (test=UnicodeNormalizationTests;test->input[0]!=0;test++)
6565
{
66-
pg_wchar*result;
66+
for (intform=0;form<4;form++)
67+
{
68+
pg_wchar*result;
6769

68-
result=unicode_normalize_kc(test->input);
70+
result=unicode_normalize(form,test->input);
6971

70-
if (pg_wcscmp(test->output,result)!=0)
71-
{
72-
printf("FAILURE (NormalizationTest.txt line %d):\n",test->linenum);
73-
printf("input: %s\n",print_wchar_str(test->input));
74-
printf("expected: %s\n",print_wchar_str(test->output));
75-
printf("got: %s\n",print_wchar_str(result));
76-
printf("\n");
77-
exit(1);
72+
if (pg_wcscmp(test->output[form],result)!=0)
73+
{
74+
printf("FAILURE (NormalizationTest.txt line %d form %d):\n",test->linenum,form);
75+
printf("input: %s\n",print_wchar_str(test->input));
76+
printf("expected: %s\n",print_wchar_str(test->output[form]));
77+
printf("got: %s\n",print_wchar_str(result));
78+
printf("\n");
79+
exit(1);
80+
}
7881
}
7982
}
8083

‎src/common/unicode_norm.c

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*-------------------------------------------------------------------------
22
* unicode_norm.c
3-
*Normalize a Unicode string to NFKC form
3+
*Normalize a Unicode string
44
*
55
* This implements Unicode normalization, per the documentation at
66
* https://www.unicode.org/reports/tr15/.
@@ -98,7 +98,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
9898
* are, in turn, decomposable.
9999
*/
100100
staticint
101-
get_decomposed_size(pg_wcharcode)
101+
get_decomposed_size(pg_wcharcode,boolcompat)
102102
{
103103
pg_unicode_decomposition*entry;
104104
intsize=0;
@@ -131,7 +131,8 @@ get_decomposed_size(pg_wchar code)
131131
* Just count current code if no other decompositions. A NULL entry is
132132
* equivalent to a character with class 0 and no decompositions.
133133
*/
134-
if (entry==NULL||DECOMPOSITION_SIZE(entry)==0)
134+
if (entry==NULL||DECOMPOSITION_SIZE(entry)==0||
135+
(!compat&&DECOMPOSITION_IS_COMPAT(entry)))
135136
return1;
136137

137138
/*
@@ -143,7 +144,7 @@ get_decomposed_size(pg_wchar code)
143144
{
144145
uint32lcode=decomp[i];
145146

146-
size+=get_decomposed_size(lcode);
147+
size+=get_decomposed_size(lcode,compat);
147148
}
148149

149150
returnsize;
@@ -224,7 +225,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
224225
* in the array result.
225226
*/
226227
staticvoid
227-
decompose_code(pg_wcharcode,pg_wchar**result,int*current)
228+
decompose_code(pg_wcharcode,boolcompat,pg_wchar**result,int*current)
228229
{
229230
pg_unicode_decomposition*entry;
230231
inti;
@@ -272,7 +273,8 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
272273
* character with class 0 and no decompositions, so just leave also in
273274
* this case.
274275
*/
275-
if (entry==NULL||DECOMPOSITION_SIZE(entry)==0)
276+
if (entry==NULL||DECOMPOSITION_SIZE(entry)==0||
277+
(!compat&&DECOMPOSITION_IS_COMPAT(entry)))
276278
{
277279
pg_wchar*res=*result;
278280

@@ -290,12 +292,12 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
290292
pg_wcharlcode= (pg_wchar)decomp[i];
291293

292294
/* Leave if no more decompositions */
293-
decompose_code(lcode,result,current);
295+
decompose_code(lcode,compat,result,current);
294296
}
295297
}
296298

297299
/*
298-
*unicode_normalize_kc - Normalize a Unicode string toNFKC form.
300+
* unicode_normalize - Normalize a Unicode string to the specified form.
299301
*
300302
* The input is a 0-terminated array of codepoints.
301303
*
@@ -304,8 +306,10 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
304306
* string is palloc'd instead, and OOM is reported with ereport().
305307
*/
306308
pg_wchar*
307-
unicode_normalize_kc(constpg_wchar*input)
309+
unicode_normalize(UnicodeNormalizationFormform,constpg_wchar*input)
308310
{
311+
boolcompat= (form==UNICODE_NFKC||form==UNICODE_NFKD);
312+
boolrecompose= (form==UNICODE_NFC||form==UNICODE_NFKC);
309313
pg_wchar*decomp_chars;
310314
pg_wchar*recomp_chars;
311315
intdecomp_size,
@@ -326,7 +330,7 @@ unicode_normalize_kc(const pg_wchar *input)
326330
*/
327331
decomp_size=0;
328332
for (p=input;*p;p++)
329-
decomp_size+=get_decomposed_size(*p);
333+
decomp_size+=get_decomposed_size(*p,compat);
330334

331335
decomp_chars= (pg_wchar*)ALLOC((decomp_size+1)*sizeof(pg_wchar));
332336
if (decomp_chars==NULL)
@@ -338,7 +342,7 @@ unicode_normalize_kc(const pg_wchar *input)
338342
*/
339343
current_size=0;
340344
for (p=input;*p;p++)
341-
decompose_code(*p,&decomp_chars,&current_size);
345+
decompose_code(*p,compat,&decomp_chars,&current_size);
342346
decomp_chars[decomp_size]='\0';
343347
Assert(decomp_size==current_size);
344348

@@ -385,8 +389,11 @@ unicode_normalize_kc(const pg_wchar *input)
385389
count-=2;
386390
}
387391

392+
if (!recompose)
393+
returndecomp_chars;
394+
388395
/*
389-
* The last phase of NFKC is the recomposition of the reordered Unicode
396+
* The last phase of NFC and NFKC is the recomposition of the reordered Unicode
390397
* string using combining classes. The recomposed string cannot be longer
391398
* than the decomposed one, so make the allocation of the output string
392399
* based on that assumption.

‎src/include/common/unicode_norm.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@
1616

1717
#include"mb/pg_wchar.h"
1818

19-
externpg_wchar*unicode_normalize_kc(constpg_wchar*input);
19+
typedefenum
20+
{
21+
UNICODE_NFC=0,
22+
UNICODE_NFD=1,
23+
UNICODE_NFKC=2,
24+
UNICODE_NFKD=3,
25+
}UnicodeNormalizationForm;
26+
27+
externpg_wchar*unicode_normalize(UnicodeNormalizationFormform,constpg_wchar*input);
2028

2129
#endif/* UNICODE_NORM_H */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp