Notifications: You must be signed in to change notification settings
Fork: 6
Star: 31

Commit d40d564

committed

Add support for other normal forms to Unicode normalization API

It previously only supported NFKC, for use by SASLprep. This expands the API to offer the choice of all four normalization forms. Right now, there are no internal users of the forms other than NFKC.

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com

1 parent cedffbd · commit d40d564 · Copy full SHA for d40d564

File tree

7 files changed

+3727

-3702

lines changed

src
- common
- include/common
  - unicode_norm.h
  - unicode_norm_table.h

7 files changed

+3727

-3702

lines changed

`‎src/common/saslprep.c`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -1156,7 +1156,7 @@ pg_saslprep(const char input, char *output)`
`1156`	`1156`	`* 2) Normalize -- Normalize the result of step 1 using Unicode`
`1157`	`1157`	`* normalization.`
`1158`	`1158`	`*/`
`1159`		`-output_chars=unicode_normalize_kc(input_chars);`
	`1159`	`+output_chars=unicode_normalize(UNICODE_NFKC,input_chars);`
`1160`	`1160`	`if (!output_chars)`
`1161`	`1161`	`gotooom;`
`1162`	`1162`

`‎src/common/unicode/generate-norm_test_table.pl`

Lines changed: 6 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,7 @@`
`48`	`48`	`{`
`49`	`49`	`intlinenum;`
`50`	`50`	`pg_wcharinput[50];`
`51`		`-pg_wcharoutput[50];`
	`51`	`+pg_wcharoutput[4][50];`
`52`	`52`	`} pg_unicode_test;`
`53`	`53`
`54`	`54`	`/* test table */`
`@@ -89,13 +89,16 @@ sub codepoint_string_to_hex`
`89`	`89`	`my ($source,$nfc,$nfd,$nfkc,$nfkd) =split(';',$line);`
`90`	`90`
`91`	`91`	`my$source_utf8 = codepoint_string_to_hex($source);`
	`92`	`+my$nfc_utf8 = codepoint_string_to_hex($nfc);`
	`93`	`+my$nfd_utf8 = codepoint_string_to_hex($nfd);`
`92`	`94`	`my$nfkc_utf8 = codepoint_string_to_hex($nfkc);`
	`95`	`+my$nfkd_utf8 = codepoint_string_to_hex($nfkd);`
`93`	`96`
`94`		`-print$OUTPUT"\t{$linenum, {$source_utf8 }, {$nfkc_utf8 } },\n";`
	`97`	`+print$OUTPUT"\t{$linenum, {$source_utf8 }, {{$nfc_utf8 }, {$nfd_utf8 }, {$nfkc_utf8 }, {$nfkd_utf8 } } },\n";`
`95`	`98`	`}`
`96`	`99`
`97`	`100`	`# Output terminator entry`
`98`		`-print$OUTPUT"\t{ 0, { 0 }, {0 } }";`
	`101`	`+print$OUTPUT"\t{ 0, { 0 }, {{ 0 }, { 0 }, { 0 }, { 0 } } }";`
`99`	`102`	`print$OUTPUT"\n};\n";`
`100`	`103`
`101`	`104`	`close$OUTPUT;`

`‎src/common/unicode/generate-unicode_norm_table.pl`

Lines changed: 11 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -99,10 +99,12 @@`
`99`	`99`	`#define DECOMP_NO_COMPOSE0x80/* don't use for re-composition */`
`100`	`100`	`#define DECOMP_INLINE0x40/* decomposition is stored inline in`
`101`	`101`	`* dec_index */`
	`102`	`+#define DECOMP_COMPAT0x20/* compatibility mapping */`
`102`	`103`
`103`		`-#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags &0x3F)`
`104`		`-#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)`
	`104`	`+#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags &0x1F)`
	`105`	`+#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags &(DECOMP_NO_COMPOSE \| DECOMP_COMPAT)) != 0)`
`105`	`106`	`#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)`
	`107`	`+#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)`
`106`	`108`
`107`	`109`	`/* Table of Unicode codepoints and their decompositions */`
`108`	`110`	`static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =`
`@@ -136,22 +138,22 @@`
`136`	`138`	`# Decomposition size`
`137`	`139`	`# Print size of decomposition`
`138`	`140`	`my$decomp_size =scalar(@decomp_elts);`
	`141`	`+dieif$decomp_size > 0x1F;# to not overrun bitmask`
`139`	`142`
`140`	`143`	`my$first_decomp =shift@decomp_elts;`
`141`	`144`
`142`	`145`	`my$flags ="";`
`143`	`146`	`my$comment ="";`
`144`	`147`
`145`		`-if ($decomp_size == 2)`
	`148`	`+if ($compat)`
`146`	`149`	`{`
	`150`	`+$flags .=" \| DECOMP_COMPAT";`
	`151`	`+}`
`147`	`152`
	`153`	`+if ($decomp_size == 2)`
	`154`	`+{`
`148`	`155`	`# Should this be used for recomposition?`
`149`		`-if ($compat)`
`150`		`-{`
`151`		`-$flags .=" \| DECOMP_NO_COMPOSE";`
`152`		`-$comment ="compatibility mapping";`
`153`		`-}`
`154`		`-elsif ($character_hash{$first_decomp}`
	`156`	`+if ($character_hash{$first_decomp}`
`155`	`157`	`&&$character_hash{$first_decomp}->{class} != 0)`
`156`	`158`	`{`
`157`	`159`	`$flags .=" \| DECOMP_NO_COMPOSE";`

`‎src/common/unicode/norm_test.c`

Lines changed: 13 additions & 10 deletions

Original file line number	Diff line number	Diff line change
`@@ -63,18 +63,21 @@ main(int argc, char **argv)`
`63`	`63`
`64`	`64`	`for (test=UnicodeNormalizationTests;test->input[0]!=0;test++)`
`65`	`65`	`{`
`66`		`-pg_wchar*result;`
	`66`	`+for (intform=0;form<4;form++)`
	`67`	`+{`
	`68`	`+pg_wchar*result;`
`67`	`69`
`68`		`-result=unicode_normalize_kc(test->input);`
	`70`	`+result=unicode_normalize(form,test->input);`
`69`	`71`
`70`		`-if (pg_wcscmp(test->output,result)!=0)`
`71`		`-{`
`72`		`-printf("FAILURE (NormalizationTest.txt line %d):\n",test->linenum);`
`73`		`-printf("input: %s\n",print_wchar_str(test->input));`
`74`		`-printf("expected: %s\n",print_wchar_str(test->output));`
`75`		`-printf("got: %s\n",print_wchar_str(result));`
`76`		`-printf("\n");`
`77`		`-exit(1);`
	`72`	`+if (pg_wcscmp(test->output[form],result)!=0)`
	`73`	`+{`
	`74`	`+printf("FAILURE (NormalizationTest.txt line %d form %d):\n",test->linenum,form);`
	`75`	`+printf("input: %s\n",print_wchar_str(test->input));`
	`76`	`+printf("expected: %s\n",print_wchar_str(test->output[form]));`
	`77`	`+printf("got: %s\n",print_wchar_str(result));`
	`78`	`+printf("\n");`
	`79`	`+exit(1);`
	`80`	`+}`
`78`	`81`	`}`
`79`	`82`	`}`
`80`	`83`

`‎src/common/unicode_norm.c`

Lines changed: 19 additions & 12 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`/*-------------------------------------------------------------------------`
`2`	`2`	`* unicode_norm.c`
`3`		`- *Normalize a Unicode string to NFKC form`
	`3`	`+ *Normalize a Unicode string`
`4`	`4`	`*`
`5`	`5`	`* This implements Unicode normalization, per the documentation at`
`6`	`6`	`* https://www.unicode.org/reports/tr15/.`
`@@ -98,7 +98,7 @@ get_code_decomposition(pg_unicode_decomposition entry, int dec_size)`
`98`	`98`	`* are, in turn, decomposable.`
`99`	`99`	`*/`
`100`	`100`	`staticint`
`101`		`-get_decomposed_size(pg_wcharcode)`
	`101`	`+get_decomposed_size(pg_wcharcode,boolcompat)`
`102`	`102`	`{`
`103`	`103`	`pg_unicode_decomposition*entry;`
`104`	`104`	`intsize=0;`
`@@ -131,7 +131,8 @@ get_decomposed_size(pg_wchar code)`
`131`	`131`	`* Just count current code if no other decompositions. A NULL entry is`
`132`	`132`	`* equivalent to a character with class 0 and no decompositions.`
`133`	`133`	`*/`
`134`		`-if (entry==NULL\|\|DECOMPOSITION_SIZE(entry)==0)`
	`134`	`+if (entry==NULL\|\|DECOMPOSITION_SIZE(entry)==0\|\|`
	`135`	`+(!compat&&DECOMPOSITION_IS_COMPAT(entry)))`
`135`	`136`	`return1;`
`136`	`137`
`137`	`138`	`/*`
`@@ -143,7 +144,7 @@ get_decomposed_size(pg_wchar code)`
`143`	`144`	`{`
`144`	`145`	`uint32lcode=decomp[i];`
`145`	`146`
`146`		`-size+=get_decomposed_size(lcode);`
	`147`	`+size+=get_decomposed_size(lcode,compat);`
`147`	`148`	`}`
`148`	`149`
`149`	`150`	`returnsize;`
`@@ -224,7 +225,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)`
`224`	`225`	`* in the array result.`
`225`	`226`	`*/`
`226`	`227`	`staticvoid`
`227`		`-decompose_code(pg_wcharcode,pg_wchar*result,intcurrent)`
	`228`	`+decompose_code(pg_wcharcode,boolcompat,pg_wchar*result,intcurrent)`
`228`	`229`	`{`
`229`	`230`	`pg_unicode_decomposition*entry;`
`230`	`231`	`inti;`
`@@ -272,7 +273,8 @@ decompose_code(pg_wchar code, pg_wchar *result, int current)`
`272`	`273`	`* character with class 0 and no decompositions, so just leave also in`
`273`	`274`	`* this case.`
`274`	`275`	`*/`
`275`		`-if (entry==NULL\|\|DECOMPOSITION_SIZE(entry)==0)`
	`276`	`+if (entry==NULL\|\|DECOMPOSITION_SIZE(entry)==0\|\|`
	`277`	`+(!compat&&DECOMPOSITION_IS_COMPAT(entry)))`
`276`	`278`	`{`
`277`	`279`	`pg_wcharres=result;`
`278`	`280`
`@@ -290,12 +292,12 @@ decompose_code(pg_wchar code, pg_wchar *result, int current)`
`290`	`292`	`pg_wcharlcode= (pg_wchar)decomp[i];`
`291`	`293`
`292`	`294`	`/* Leave if no more decompositions */`
`293`		`-decompose_code(lcode,result,current);`
	`295`	`+decompose_code(lcode,compat,result,current);`
`294`	`296`	`}`
`295`	`297`	`}`
`296`	`298`
`297`	`299`	`/*`
`298`		`- *unicode_normalize_kc - Normalize a Unicode string toNFKC form.`
	`300`	`+ *unicode_normalize - Normalize a Unicode string tothe specified form.`
`299`	`301`	`*`
`300`	`302`	`* The input is a 0-terminated array of codepoints.`
`301`	`303`	`*`
`@@ -304,8 +306,10 @@ decompose_code(pg_wchar code, pg_wchar *result, int current)`
`304`	`306`	`* string is palloc'd instead, and OOM is reported with ereport().`
`305`	`307`	`*/`
`306`	`308`	`pg_wchar*`
`307`		`-unicode_normalize_kc(constpg_wchar*input)`
	`309`	`+unicode_normalize(UnicodeNormalizationFormform,constpg_wchar*input)`
`308`	`310`	`{`
	`311`	`+boolcompat= (form==UNICODE_NFKC\|\|form==UNICODE_NFKD);`
	`312`	`+boolrecompose= (form==UNICODE_NFC\|\|form==UNICODE_NFKC);`
`309`	`313`	`pg_wchar*decomp_chars;`
`310`	`314`	`pg_wchar*recomp_chars;`
`311`	`315`	`intdecomp_size,`
`@@ -326,7 +330,7 @@ unicode_normalize_kc(const pg_wchar *input)`
`326`	`330`	`*/`
`327`	`331`	`decomp_size=0;`
`328`	`332`	`for (p=input;*p;p++)`
`329`		`-decomp_size+=get_decomposed_size(*p);`
	`333`	`+decomp_size+=get_decomposed_size(*p,compat);`
`330`	`334`
`331`	`335`	`decomp_chars= (pg_wchar)ALLOC((decomp_size+1)sizeof(pg_wchar));`
`332`	`336`	`if (decomp_chars==NULL)`
`@@ -338,7 +342,7 @@ unicode_normalize_kc(const pg_wchar *input)`
`338`	`342`	`*/`
`339`	`343`	`current_size=0;`
`340`	`344`	`for (p=input;*p;p++)`
`341`		`-decompose_code(*p,&decomp_chars,&current_size);`
	`345`	`+decompose_code(*p,compat,&decomp_chars,&current_size);`
`342`	`346`	`decomp_chars[decomp_size]='\0';`
`343`	`347`	`Assert(decomp_size==current_size);`
`344`	`348`
`@@ -385,8 +389,11 @@ unicode_normalize_kc(const pg_wchar *input)`
`385`	`389`	`count-=2;`
`386`	`390`	`}`
`387`	`391`
	`392`	`+if (!recompose)`
	`393`	`+returndecomp_chars;`
	`394`	`+`
`388`	`395`	`/*`
`389`		`- * The last phase of NFKC is the recomposition of the reordered Unicode`
	`396`	`+ * The last phase ofNFC andNFKC is the recomposition of the reordered Unicode`
`390`	`397`	`* string using combining classes. The recomposed string cannot be longer`
`391`	`398`	`* than the decomposed one, so make the allocation of the output string`
`392`	`399`	`* based on that assumption.`

`‎src/include/common/unicode_norm.h`

Lines changed: 9 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,14 @@`
`16`	`16`
`17`	`17`	`#include"mb/pg_wchar.h"`
`18`	`18`
`19`		`-externpg_wcharunicode_normalize_kc(constpg_wcharinput);`
	`19`	`+typedefenum`
	`20`	`+{`
	`21`	`+UNICODE_NFC=0,`
	`22`	`+UNICODE_NFD=1,`
	`23`	`+UNICODE_NFKC=2,`
	`24`	`+UNICODE_NFKD=3,`
	`25`	`+}UnicodeNormalizationForm;`
	`26`	`+`
	`27`	`+externpg_wcharunicode_normalize(UnicodeNormalizationFormform,constpg_wcharinput);`
`20`	`28`
`21`	`29`	`#endif/* UNICODE_NORM_H */`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit d40d564

File tree

7 files changed

7 files changed

`‎src/common/saslprep.c`

`‎src/common/unicode/generate-norm_test_table.pl`

`‎src/common/unicode/generate-unicode_norm_table.pl`

`‎src/common/unicode/norm_test.c`

`‎src/common/unicode_norm.c`

`‎src/include/common/unicode_norm.h`

0 commit comments