NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit f37fec8

committed

Add unistr function

This allows decoding a string with Unicode escape sequences. It is similar to Unicode escape strings, but offers some more flexibility.

Author: Pavel Stehule <pavel.stehule@gmail.com>
Reviewed-by: Asif Rehman <asifr.rehman@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com

1 parent ebedd0c, commit f37fec8 (Copy full SHA for f37fec8)

File tree

6 files changed

+310

-1

lines changed

doc/src/sgml
- func.sgml
src
- backend/utils/adt
  - varlena.c
- include/catalog
  - catversion.h
  - pg_proc.dat
- test/regress
  - expected
    - strings.out
  - sql
    - strings.sql

6 files changed

+310

-1

lines changed

`‎doc/src/sgml/func.sgml`

Lines changed: 46 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -3551,6 +3551,52 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>`
`3551`	`3551`	`</para></entry>`
`3552`	`3552`	`</row>`
`3553`	`3553`
	`3554`	`+ <row>`
	`3555`	`+ <entry role="func_table_entry"><para role="func_signature">`
	`3556`	`+ <indexterm>`
	`3557`	`+ <primary>unistr</primary>`
	`3558`	`+ </indexterm>`
	`3559`	`+ <function>unistr</function> ( <type>text</type> )`
	`3560`	`+ <returnvalue>text</returnvalue>`
	`3561`	`+ </para>`
	`3562`	`+ <para>`
	`3563`	`+ Evaluate escaped Unicode characters in argument. Unicode characters`
	`3564`	`+ can be specified as`
	`3565`	`+ <literal>\<replaceable>XXXX</replaceable></literal> (4 hexadecimal`
	`3566`	`+ digits), <literal>\+<replaceable>XXXXXX</replaceable></literal> (6`
	`3567`	`+ hexadecimal digits),`
	`3568`	`+ <literal>\u<replaceable>XXXX</replaceable></literal> (4 hexadecimal`
	`3569`	`+ digits), or <literal>\U<replaceable>XXXXXXXX</replaceable></literal>`
	`3570`	`+ (8 hexadecimal digits). To specify a backslash, write two`
	`3571`	`+ backslashes. All other characters are taken literally.`
	`3572`	`+ </para>`
	`3573`	`+`
	`3574`	`+ <para>`
	`3575`	`+ If the server encoding is not UTF-8, the Unicode code point identified`
	`3576`	`+ by one of these escape sequences is converted to the actual server`
	`3577`	`+ encoding; an error is reported if that's not possible.`
	`3578`	`+ </para>`
	`3579`	`+`
	`3580`	`+ <para>`
	`3581`	`+ This function provides a (non-standard) alternative to string`
	`3582`	`+ constants with Unicode escapes (see <xref`
	`3583`	`+ linkend="sql-syntax-strings-uescape"/>).`
	`3584`	`+ </para>`
	`3585`	`+`
	`3586`	`+ <para>`
	`3587`	`+ <literal>unistr('\0441\043B\043E\043D')</literal>`
	`3588`	`+ <returnvalue>слон</returnvalue>`
	`3589`	`+ </para>`
	`3590`	`+ <para>`
	`3591`	`+ <literal>unistr('d\0061t\+000061')</literal>`
	`3592`	`+ <returnvalue>data</returnvalue>`
	`3593`	`+ </para>`
	`3594`	`+ <para>`
	`3595`	`+ <literal>unistr('d\u0061t\U00000061')</literal>`
	`3596`	`+ <returnvalue>data</returnvalue>`
	`3597`	`+ </para></entry>`
	`3598`	`+ </row>`
	`3599`	`+`
`3554`	`3600`	`</tbody>`
`3555`	`3601`	`</tgroup>`
`3556`	`3602`	`</table>`

`‎src/backend/utils/adt/varlena.c`

Lines changed: 210 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)`
`6380`	`6380`
`6381`	`6381`	`PG_RETURN_BOOL(result);`
`6382`	`6382`	`}`
	`6383`	`+`
	`6384`	`+/*`
	`6385`	`+ * Check if first n chars are hexadecimal digits`
	`6386`	`+ */`
	`6387`	`+static bool`
	`6388`	`+isxdigits_n(const char *instr, size_t n)`
	`6389`	`+{`
	`6390`	`+    for (size_t i = 0; i < n; i++)`
	`6391`	`+        if (!isxdigit((unsigned char) instr[i]))`
	`6392`	`+            return false;`
	`6393`	`+`
	`6394`	`+    return true;`
	`6395`	`+}`
	`6396`	`+`
	`6397`	`+static unsigned int`
	`6398`	`+hexval(unsigned char c)`
	`6399`	`+{`
	`6400`	`+    if (c >= '0' && c <= '9')`
	`6401`	`+        return c - '0';`
	`6402`	`+    if (c >= 'a' && c <= 'f')`
	`6403`	`+        return c - 'a' + 0xA;`
	`6404`	`+    if (c >= 'A' && c <= 'F')`
	`6405`	`+        return c - 'A' + 0xA;`
	`6406`	`+    elog(ERROR, "invalid hexadecimal digit");`
	`6407`	`+    return 0;   /* not reached */`
	`6408`	`+}`
	`6409`	`+`
	`6410`	`+/*`
	`6411`	`+ * Translate string with hexadecimal digits to number`
	`6412`	`+ */`
	`6413`	`+static unsigned int`
	`6414`	`+hexval_n(const char *instr, size_t n)`
	`6415`	`+{`
	`6416`	`+    unsigned int result = 0;`
	`6417`	`+`
	`6418`	`+    for (size_t i = 0; i < n; i++)`
	`6419`	`+        result += hexval(instr[i]) << (4 * (n - i - 1));`
	`6420`	`+`
	`6421`	`+    return result;`
	`6422`	`+}`
	`6423`	`+`
	`6424`	`+/*`
	`6425`	`+ * Replaces Unicode escape sequences by Unicode characters`
	`6426`	`+ */`
	`6427`	`+Datum`
	`6428`	`+unistr(PG_FUNCTION_ARGS)`
	`6429`	`+{`
	`6430`	`+    text *input_text = PG_GETARG_TEXT_PP(0);`
	`6431`	`+    char *instr;`
	`6432`	`+    int len;`
	`6433`	`+    StringInfoData str;`
	`6434`	`+    text *result;`
	`6435`	`+    pg_wchar pair_first = 0;`
	`6436`	`+    char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];`
	`6437`	`+`
	`6438`	`+    instr = VARDATA_ANY(input_text);`
	`6439`	`+    len = VARSIZE_ANY_EXHDR(input_text);`
	`6440`	`+`
	`6441`	`+    initStringInfo(&str);`
	`6442`	`+`
	`6443`	`+    while (len > 0)`
	`6444`	`+    {`
	`6445`	`+        if (instr[0] == '\\')`
	`6446`	`+        {`
	`6447`	`+            if (len >= 2 &&`
	`6448`	`+                instr[1] == '\\')`
	`6449`	`+            {`
	`6450`	`+                if (pair_first)`
	`6451`	`+                    goto invalid_pair;`
	`6452`	`+                appendStringInfoChar(&str, '\\');`
	`6453`	`+                instr += 2;`
	`6454`	`+                len -= 2;`
	`6455`	`+            }`
	`6456`	`+            else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||`
	`6457`	`+                     (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))`
	`6458`	`+            {`
	`6459`	`+                pg_wchar unicode;`
	`6460`	`+                int offset = instr[1] == 'u' ? 2 : 1;`
	`6461`	`+`
	`6462`	`+                unicode = hexval_n(instr + offset, 4);`
	`6463`	`+`
	`6464`	`+                if (!is_valid_unicode_codepoint(unicode))`
	`6465`	`+                    ereport(ERROR,`
	`6466`	`+                            errcode(ERRCODE_INVALID_PARAMETER_VALUE),`
	`6467`	`+                            errmsg("invalid Unicode code point: %04X", unicode));`
	`6468`	`+`
	`6469`	`+                if (pair_first)`
	`6470`	`+                {`
	`6471`	`+                    if (is_utf16_surrogate_second(unicode))`
	`6472`	`+                    {`
	`6473`	`+                        unicode = surrogate_pair_to_codepoint(pair_first, unicode);`
	`6474`	`+                        pair_first = 0;`
	`6475`	`+                    }`
	`6476`	`+                    else`
	`6477`	`+                        goto invalid_pair;`
	`6478`	`+                }`
	`6479`	`+                else if (is_utf16_surrogate_second(unicode))`
	`6480`	`+                    goto invalid_pair;`
	`6481`	`+`
	`6482`	`+                if (is_utf16_surrogate_first(unicode))`
	`6483`	`+                    pair_first = unicode;`
	`6484`	`+                else`
	`6485`	`+                {`
	`6486`	`+                    pg_unicode_to_server(unicode, (unsigned char *) cbuf);`
	`6487`	`+                    appendStringInfoString(&str, cbuf);`
	`6488`	`+                }`
	`6489`	`+`
	`6490`	`+                instr += 4 + offset;`
	`6491`	`+                len -= 4 + offset;`
	`6492`	`+            }`
	`6493`	`+            else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))`
	`6494`	`+            {`
	`6495`	`+                pg_wchar unicode;`
	`6496`	`+`
	`6497`	`+                unicode = hexval_n(instr + 2, 6);`
	`6498`	`+`
	`6499`	`+                if (!is_valid_unicode_codepoint(unicode))`
	`6500`	`+                    ereport(ERROR,`
	`6501`	`+                            errcode(ERRCODE_INVALID_PARAMETER_VALUE),`
	`6502`	`+                            errmsg("invalid Unicode code point: %04X", unicode));`
	`6503`	`+`
	`6504`	`+                if (pair_first)`
	`6505`	`+                {`
	`6506`	`+                    if (is_utf16_surrogate_second(unicode))`
	`6507`	`+                    {`
	`6508`	`+                        unicode = surrogate_pair_to_codepoint(pair_first, unicode);`
	`6509`	`+                        pair_first = 0;`
	`6510`	`+                    }`
	`6511`	`+                    else`
	`6512`	`+                        goto invalid_pair;`
	`6513`	`+                }`
	`6514`	`+                else if (is_utf16_surrogate_second(unicode))`
	`6515`	`+                    goto invalid_pair;`
	`6516`	`+`
	`6517`	`+                if (is_utf16_surrogate_first(unicode))`
	`6518`	`+                    pair_first = unicode;`
	`6519`	`+                else`
	`6520`	`+                {`
	`6521`	`+                    pg_unicode_to_server(unicode, (unsigned char *) cbuf);`
	`6522`	`+                    appendStringInfoString(&str, cbuf);`
	`6523`	`+                }`
	`6524`	`+`
	`6525`	`+                instr += 8;`
	`6526`	`+                len -= 8;`
	`6527`	`+            }`
	`6528`	`+            else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))`
	`6529`	`+            {`
	`6530`	`+                pg_wchar unicode;`
	`6531`	`+`
	`6532`	`+                unicode = hexval_n(instr + 2, 8);`
	`6533`	`+`
	`6534`	`+                if (!is_valid_unicode_codepoint(unicode))`
	`6535`	`+                    ereport(ERROR,`
	`6536`	`+                            errcode(ERRCODE_INVALID_PARAMETER_VALUE),`
	`6537`	`+                            errmsg("invalid Unicode code point: %04X", unicode));`
	`6538`	`+`
	`6539`	`+                if (pair_first)`
	`6540`	`+                {`
	`6541`	`+                    if (is_utf16_surrogate_second(unicode))`
	`6542`	`+                    {`
	`6543`	`+                        unicode = surrogate_pair_to_codepoint(pair_first, unicode);`
	`6544`	`+                        pair_first = 0;`
	`6545`	`+                    }`
	`6546`	`+                    else`
	`6547`	`+                        goto invalid_pair;`
	`6548`	`+                }`
	`6549`	`+                else if (is_utf16_surrogate_second(unicode))`
	`6550`	`+                    goto invalid_pair;`
	`6551`	`+`
	`6552`	`+                if (is_utf16_surrogate_first(unicode))`
	`6553`	`+                    pair_first = unicode;`
	`6554`	`+                else`
	`6555`	`+                {`
	`6556`	`+                    pg_unicode_to_server(unicode, (unsigned char *) cbuf);`
	`6557`	`+                    appendStringInfoString(&str, cbuf);`
	`6558`	`+                }`
	`6559`	`+`
	`6560`	`+                instr += 10;`
	`6561`	`+                len -= 10;`
	`6562`	`+            }`
	`6563`	`+            else`
	`6564`	`+                ereport(ERROR,`
	`6565`	`+                        (errcode(ERRCODE_SYNTAX_ERROR),`
	`6566`	`+                         errmsg("invalid Unicode escape"),`
	`6567`	`+                         errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));`
	`6568`	`+        }`
	`6569`	`+        else`
	`6570`	`+        {`
	`6571`	`+            if (pair_first)`
	`6572`	`+                goto invalid_pair;`
	`6573`	`+`
	`6574`	`+            appendStringInfoChar(&str, *instr++);`
	`6575`	`+            len--;`
	`6576`	`+        }`
	`6577`	`+    }`
	`6578`	`+`
	`6579`	`+    /* unfinished surrogate pair? */`
	`6580`	`+    if (pair_first)`
	`6581`	`+        goto invalid_pair;`
	`6582`	`+`
	`6583`	`+    result = cstring_to_text_with_len(str.data, str.len);`
	`6584`	`+    pfree(str.data);`
	`6585`	`+`
	`6586`	`+    PG_RETURN_TEXT_P(result);`
	`6587`	`+`
	`6588`	`+invalid_pair:`
	`6589`	`+    ereport(ERROR,`
	`6590`	`+            (errcode(ERRCODE_SYNTAX_ERROR),`
	`6591`	`+             errmsg("invalid Unicode surrogate pair")));`
	`6592`	`+}`

`‎src/include/catalog/catversion.h`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,6 @@`
`53`	`53`	`*/`
`54`	`54`
`55`	`55`	`/* yyyymmddN */`
`56`		`-#define CATALOG_VERSION_NO 202103266`
	`56`	`+#define CATALOG_VERSION_NO 202103291`
`57`	`57`
`58`	`58`	`#endif`

`‎src/include/catalog/pg_proc.dat`

Lines changed: 4 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -11527,6 +11527,10 @@`
`11527`	`11527`	`proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',`
`11528`	`11528`	`prosrc => 'unicode_is_normalized' },`
`11529`	`11529`
	`11530`	`+{ oid => '9822', descr => 'unescape Unicode characters',`
	`11531`	`+ proname => 'unistr', prorettype => 'text', proargtypes => 'text',`
	`11532`	`+ prosrc => 'unistr' },`
	`11533`	`+`
`11530`	`11534`	`{ oid => '4596', descr => 'I/O',`
`11531`	`11535`	`proname => 'brin_bloom_summary_in', prorettype => 'pg_brin_bloom_summary',`
`11532`	`11536`	`proargtypes => 'cstring', prosrc => 'brin_bloom_summary_in' },`

`‎src/test/regress/expected/strings.out`

Lines changed: 36 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -2234,3 +2234,39 @@ SELECT bit_count('\x1234567890'::bytea);`
`2234`	`2234`	`15`
`2235`	`2235`	`(1 row)`
`2236`	`2236`
	`2237`	`+SELECT unistr('\0064at\+0000610');`
	`2238`	`+ unistr`
	`2239`	`+--------`
	`2240`	`+ data0`
	`2241`	`+(1 row)`
	`2242`	`+`
	`2243`	`+SELECT unistr('d\u0061t\U000000610');`
	`2244`	`+ unistr`
	`2245`	`+--------`
	`2246`	`+ data0`
	`2247`	`+(1 row)`
	`2248`	`+`
	`2249`	`+SELECT unistr('a\\b');`
	`2250`	`+ unistr`
	`2251`	`+--------`
	`2252`	`+ a\b`
	`2253`	`+(1 row)`
	`2254`	`+`
	`2255`	`+-- errors:`
	`2256`	`+SELECT unistr('wrong: \db99');`
	`2257`	`+ERROR: invalid Unicode surrogate pair`
	`2258`	`+SELECT unistr('wrong: \db99\0061');`
	`2259`	`+ERROR: invalid Unicode surrogate pair`
	`2260`	`+SELECT unistr('wrong: \+00db99\+000061');`
	`2261`	`+ERROR: invalid Unicode surrogate pair`
	`2262`	`+SELECT unistr('wrong: \+2FFFFF');`
	`2263`	`+ERROR: invalid Unicode code point: 2FFFFF`
	`2264`	`+SELECT unistr('wrong: \udb99\u0061');`
	`2265`	`+ERROR: invalid Unicode surrogate pair`
	`2266`	`+SELECT unistr('wrong: \U0000db99\U00000061');`
	`2267`	`+ERROR: invalid Unicode surrogate pair`
	`2268`	`+SELECT unistr('wrong: \U002FFFFF');`
	`2269`	`+ERROR: invalid Unicode code point: 2FFFFF`
	`2270`	`+SELECT unistr('wrong: \xyz');`
	`2271`	`+ERROR: invalid Unicode escape`
	`2272`	`+HINT: Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX, or \UXXXXXXXX.`

`‎src/test/regress/sql/strings.sql`

Lines changed: 13 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -746,3 +746,16 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8)`
`746`	`746`	`SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');`
`747`	`747`
`748`	`748`	`SELECT bit_count('\x1234567890'::bytea);`
	`749`	`+`
	`750`	`+SELECT unistr('\0064at\+0000610');`
	`751`	`+SELECT unistr('d\u0061t\U000000610');`
	`752`	`+SELECT unistr('a\\b');`
	`753`	`+-- errors:`
	`754`	`+SELECT unistr('wrong: \db99');`
	`755`	`+SELECT unistr('wrong: \db99\0061');`
	`756`	`+SELECT unistr('wrong: \+00db99\+000061');`
	`757`	`+SELECT unistr('wrong: \+2FFFFF');`
	`758`	`+SELECT unistr('wrong: \udb99\u0061');`
	`759`	`+SELECT unistr('wrong: \U0000db99\U00000061');`
	`760`	`+SELECT unistr('wrong: \U002FFFFF');`
	`761`	`+SELECT unistr('wrong: \xyz');`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commitf37fec8

File tree

6 files changed

6 files changed

`‎doc/src/sgml/func.sgml`

`‎src/backend/utils/adt/varlena.c`

`‎src/include/catalog/catversion.h`

`‎src/include/catalog/pg_proc.dat`

`‎src/test/regress/expected/strings.out`

`‎src/test/regress/sql/strings.sql`

0 commit comments