Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitf37fec8

Browse files
committed
Add unistr function
This allows decoding a string with Unicode escape sequences. It is similar to Unicode escape strings, but offers some more flexibility.
Author: Pavel Stehule <pavel.stehule@gmail.com>
Reviewed-by: Asif Rehman <asifr.rehman@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com
1 parentebedd0c commitf37fec8

File tree

6 files changed

+310
-1
lines changed

6 files changed

+310
-1
lines changed

‎doc/src/sgml/func.sgml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3551,6 +3551,52 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
35513551
</para></entry>
35523552
</row>
35533553

3554+
<row>
3555+
<entry role="func_table_entry"><para role="func_signature">
3556+
<indexterm>
3557+
<primary>unistr</primary>
3558+
</indexterm>
3559+
<function>unistr</function> ( <type>text</type> )
3560+
<returnvalue>text</returnvalue>
3561+
</para>
3562+
<para>
3563+
Evaluate escaped Unicode characters in the argument. Unicode characters
3564+
can be specified as
3565+
<literal>\<replaceable>XXXX</replaceable></literal> (4 hexadecimal
3566+
digits), <literal>\+<replaceable>XXXXXX</replaceable></literal> (6
3567+
hexadecimal digits),
3568+
<literal>\u<replaceable>XXXX</replaceable></literal> (4 hexadecimal
3569+
digits), or <literal>\U<replaceable>XXXXXXXX</replaceable></literal>
3570+
(8 hexadecimal digits). To specify a backslash, write two
3571+
backslashes. All other characters are taken literally.
3572+
</para>
3573+
3574+
<para>
3575+
If the server encoding is not UTF-8, the Unicode code point identified
3576+
by one of these escape sequences is converted to the actual server
3577+
encoding; an error is reported if that's not possible.
3578+
</para>
3579+
3580+
<para>
3581+
This function provides a (non-standard) alternative to string
3582+
constants with Unicode escapes (see <xref
3583+
linkend="sql-syntax-strings-uescape"/>).
3584+
</para>
3585+
3586+
<para>
3587+
<literal>unistr('\0441\043B\043E\043D')</literal>
3588+
<returnvalue>слон</returnvalue>
3589+
</para>
3590+
<para>
3591+
<literal>unistr('d\0061t\+000061')</literal>
3592+
<returnvalue>data</returnvalue>
3593+
</para>
3594+
<para>
3595+
<literal>unistr('d\u0061t\U00000061')</literal>
3596+
<returnvalue>data</returnvalue>
3597+
</para></entry>
3598+
</row>
3599+
35543600
</tbody>
35553601
</tgroup>
35563602
</table>

‎src/backend/utils/adt/varlena.c

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
63806380

63816381
PG_RETURN_BOOL(result);
63826382
}
6383+
6384+
/*
 * Report whether the first n bytes at instr are all hexadecimal digits.
 *
 * Exactly n bytes are examined, so the caller must make sure that many
 * bytes are readable.  With n == 0 nothing is examined and the answer
 * is true.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;
	const char *p;

	for (p = instr; p < stop; p++)
	{
		/* cast first: isxdigit() must not be handed a negative char */
		if (!isxdigit((unsigned char) *p))
			return false;
	}

	return true;
}
6396+
6397+
staticunsignedint
6398+
hexval(unsignedcharc)
6399+
{
6400+
if (c >='0'&&c <='9')
6401+
returnc-'0';
6402+
if (c >='a'&&c <='f')
6403+
returnc-'a'+0xA;
6404+
if (c >='A'&&c <='F')
6405+
returnc-'A'+0xA;
6406+
elog(ERROR,"invalid hexadecimal digit");
6407+
return0;/* not reached */
6408+
}
6409+
6410+
/*
 * Translate string with hexadecimal digits to number
 *
 * Reads exactly n characters starting at instr, most significant digit
 * first, and returns the accumulated value.  Each character is converted
 * by hexval(), which raises an error for anything that is not a
 * hexadecimal digit.  With n == 0 the result is 0.
 *
 * The value is built by shifting the accumulator four bits per digit
 * rather than shifting each digit by its position, so the shift count
 * never grows with n.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int result = 0;

	for (size_t i = 0; i < n; i++)
		result = (result << 4) | hexval((unsigned char) instr[i]);

	return result;
}
6423+
6424+
/*
6425+
* Replaces Unicode escape sequences by Unicode characters
6426+
*/
6427+
Datum
6428+
unistr(PG_FUNCTION_ARGS)
6429+
{
6430+
text*input_text=PG_GETARG_TEXT_PP(0);
6431+
char*instr;
6432+
intlen;
6433+
StringInfoDatastr;
6434+
text*result;
6435+
pg_wcharpair_first=0;
6436+
charcbuf[MAX_UNICODE_EQUIVALENT_STRING+1];
6437+
6438+
instr=VARDATA_ANY(input_text);
6439+
len=VARSIZE_ANY_EXHDR(input_text);
6440+
6441+
initStringInfo(&str);
6442+
6443+
while (len>0)
6444+
{
6445+
if (instr[0]=='\\')
6446+
{
6447+
if (len >=2&&
6448+
instr[1]=='\\')
6449+
{
6450+
if (pair_first)
6451+
gotoinvalid_pair;
6452+
appendStringInfoChar(&str,'\\');
6453+
instr+=2;
6454+
len-=2;
6455+
}
6456+
elseif ((len >=5&&isxdigits_n(instr+1,4))||
6457+
(len >=6&&instr[1]=='u'&&isxdigits_n(instr+2,4)))
6458+
{
6459+
pg_wcharunicode;
6460+
intoffset=instr[1]=='u' ?2 :1;
6461+
6462+
unicode=hexval_n(instr+offset,4);
6463+
6464+
if (!is_valid_unicode_codepoint(unicode))
6465+
ereport(ERROR,
6466+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6467+
errmsg("invalid Unicode code point: %04X",unicode));
6468+
6469+
if (pair_first)
6470+
{
6471+
if (is_utf16_surrogate_second(unicode))
6472+
{
6473+
unicode=surrogate_pair_to_codepoint(pair_first,unicode);
6474+
pair_first=0;
6475+
}
6476+
else
6477+
gotoinvalid_pair;
6478+
}
6479+
elseif (is_utf16_surrogate_second(unicode))
6480+
gotoinvalid_pair;
6481+
6482+
if (is_utf16_surrogate_first(unicode))
6483+
pair_first=unicode;
6484+
else
6485+
{
6486+
pg_unicode_to_server(unicode, (unsignedchar*)cbuf);
6487+
appendStringInfoString(&str,cbuf);
6488+
}
6489+
6490+
instr+=4+offset;
6491+
len-=4+offset;
6492+
}
6493+
elseif (len >=8&&instr[1]=='+'&&isxdigits_n(instr+2,6))
6494+
{
6495+
pg_wcharunicode;
6496+
6497+
unicode=hexval_n(instr+2,6);
6498+
6499+
if (!is_valid_unicode_codepoint(unicode))
6500+
ereport(ERROR,
6501+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6502+
errmsg("invalid Unicode code point: %04X",unicode));
6503+
6504+
if (pair_first)
6505+
{
6506+
if (is_utf16_surrogate_second(unicode))
6507+
{
6508+
unicode=surrogate_pair_to_codepoint(pair_first,unicode);
6509+
pair_first=0;
6510+
}
6511+
else
6512+
gotoinvalid_pair;
6513+
}
6514+
elseif (is_utf16_surrogate_second(unicode))
6515+
gotoinvalid_pair;
6516+
6517+
if (is_utf16_surrogate_first(unicode))
6518+
pair_first=unicode;
6519+
else
6520+
{
6521+
pg_unicode_to_server(unicode, (unsignedchar*)cbuf);
6522+
appendStringInfoString(&str,cbuf);
6523+
}
6524+
6525+
instr+=8;
6526+
len-=8;
6527+
}
6528+
elseif (len >=10&&instr[1]=='U'&&isxdigits_n(instr+2,8))
6529+
{
6530+
pg_wcharunicode;
6531+
6532+
unicode=hexval_n(instr+2,8);
6533+
6534+
if (!is_valid_unicode_codepoint(unicode))
6535+
ereport(ERROR,
6536+
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6537+
errmsg("invalid Unicode code point: %04X",unicode));
6538+
6539+
if (pair_first)
6540+
{
6541+
if (is_utf16_surrogate_second(unicode))
6542+
{
6543+
unicode=surrogate_pair_to_codepoint(pair_first,unicode);
6544+
pair_first=0;
6545+
}
6546+
else
6547+
gotoinvalid_pair;
6548+
}
6549+
elseif (is_utf16_surrogate_second(unicode))
6550+
gotoinvalid_pair;
6551+
6552+
if (is_utf16_surrogate_first(unicode))
6553+
pair_first=unicode;
6554+
else
6555+
{
6556+
pg_unicode_to_server(unicode, (unsignedchar*)cbuf);
6557+
appendStringInfoString(&str,cbuf);
6558+
}
6559+
6560+
instr+=10;
6561+
len-=10;
6562+
}
6563+
else
6564+
ereport(ERROR,
6565+
(errcode(ERRCODE_SYNTAX_ERROR),
6566+
errmsg("invalid Unicode escape"),
6567+
errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6568+
}
6569+
else
6570+
{
6571+
if (pair_first)
6572+
gotoinvalid_pair;
6573+
6574+
appendStringInfoChar(&str,*instr++);
6575+
len--;
6576+
}
6577+
}
6578+
6579+
/* unfinished surrogate pair? */
6580+
if (pair_first)
6581+
gotoinvalid_pair;
6582+
6583+
result=cstring_to_text_with_len(str.data,str.len);
6584+
pfree(str.data);
6585+
6586+
PG_RETURN_TEXT_P(result);
6587+
6588+
invalid_pair:
6589+
ereport(ERROR,
6590+
(errcode(ERRCODE_SYNTAX_ERROR),
6591+
errmsg("invalid Unicode surrogate pair")));
6592+
}

‎src/include/catalog/catversion.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,6 @@
5353
*/
5454

5555
/*yyyymmddN */
56-
#defineCATALOG_VERSION_NO202103266
56+
#defineCATALOG_VERSION_NO202103291
5757

5858
#endif

‎src/include/catalog/pg_proc.dat

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11527,6 +11527,10 @@
1152711527
proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
1152811528
prosrc => 'unicode_is_normalized' },
1152911529

11530+
{ oid => '9822', descr => 'unescape Unicode characters',
11531+
proname => 'unistr', prorettype => 'text', proargtypes => 'text',
11532+
prosrc => 'unistr' },
11533+
1153011534
{ oid => '4596', descr => 'I/O',
1153111535
proname => 'brin_bloom_summary_in', prorettype => 'pg_brin_bloom_summary',
1153211536
proargtypes => 'cstring', prosrc => 'brin_bloom_summary_in' },

‎src/test/regress/expected/strings.out

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2234,3 +2234,39 @@ SELECT bit_count('\x1234567890'::bytea);
22342234
15
22352235
(1 row)
22362236

2237+
SELECT unistr('\0064at\+0000610');
2238+
unistr
2239+
--------
2240+
data0
2241+
(1 row)
2242+
2243+
SELECT unistr('d\u0061t\U000000610');
2244+
unistr
2245+
--------
2246+
data0
2247+
(1 row)
2248+
2249+
SELECT unistr('a\\b');
2250+
unistr
2251+
--------
2252+
a\b
2253+
(1 row)
2254+
2255+
-- errors:
2256+
SELECT unistr('wrong: \db99');
2257+
ERROR: invalid Unicode surrogate pair
2258+
SELECT unistr('wrong: \db99\0061');
2259+
ERROR: invalid Unicode surrogate pair
2260+
SELECT unistr('wrong: \+00db99\+000061');
2261+
ERROR: invalid Unicode surrogate pair
2262+
SELECT unistr('wrong: \+2FFFFF');
2263+
ERROR: invalid Unicode code point: 2FFFFF
2264+
SELECT unistr('wrong: \udb99\u0061');
2265+
ERROR: invalid Unicode surrogate pair
2266+
SELECT unistr('wrong: \U0000db99\U00000061');
2267+
ERROR: invalid Unicode surrogate pair
2268+
SELECT unistr('wrong: \U002FFFFF');
2269+
ERROR: invalid Unicode code point: 2FFFFF
2270+
SELECT unistr('wrong: \xyz');
2271+
ERROR: invalid Unicode escape
2272+
HINT: Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX, or \UXXXXXXXX.

‎src/test/regress/sql/strings.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,3 +746,16 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8)
746746
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');

SELECT bit_count('\x1234567890'::bytea);

SELECT unistr('\0064at\+0000610');
SELECT unistr('d\u0061t\U000000610');
SELECT unistr('a\\b');
-- errors:
SELECT unistr('wrong: \db99');
SELECT unistr('wrong: \db99\0061');
SELECT unistr('wrong: \+00db99\+000061');
SELECT unistr('wrong: \+2FFFFF');
SELECT unistr('wrong: \udb99\u0061');
SELECT unistr('wrong: \U0000db99\U00000061');
SELECT unistr('wrong: \U002FFFFF');
SELECT unistr('wrong: \xyz');

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp