Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 02faeb4

Browse files
committed
Surrogate pair support for U& string and identifier syntax
This is mainly to make the functionality consistent with the proposed \uescape syntax.
1 parent c6bc0fe, commit 02faeb4

File tree

2 files changed

+81
-6
lines changed

2 files changed

+81
-6
lines changed

‎doc/src/sgml/syntax.sgml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.134 2009/08/27 20:08:02 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
22

33
<chapter id="sql-syntax">
44
<title>SQL Syntax</title>
@@ -238,6 +238,10 @@ U&amp;"d!0061t!+000061" UESCAPE '!'
238238
The Unicode escape syntax works only when the server encoding is
239239
UTF8. When other server encodings are used, only code points in
240240
the ASCII range (up to <literal>\007F</literal>) can be specified.
241+
Both the 4-digit and the 6-digit form can be used to specify
242+
UTF-16 surrogate pairs to compose characters with code points
243+
larger than <literal>\FFFF</literal> (although the availability of
244+
the 6-digit form technically makes this unnecessary).
241245
</para>
242246

243247
<para>
@@ -497,6 +501,10 @@ U&amp;'d!0061t!+000061' UESCAPE '!'
497501
UTF8. When other server encodings are used, only code points in
498502
the ASCII range (up to <literal>\007F</literal>) can be
499503
specified.
504+
Both the 4-digit and the 6-digit form can be used to specify
505+
UTF-16 surrogate pairs to compose characters with code points
506+
larger than <literal>\FFFF</literal> (although the availability
507+
of the 6-digit form technically makes this unnecessary).
500508
</para>
501509

502510
<para>

‎src/backend/parser/scan.l

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* Portions Copyright (c) 1994, Regents of the University of California
2525
*
2626
* IDENTIFICATION
27-
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
27+
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
2828
*
2929
*-------------------------------------------------------------------------
3030
*/
@@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
10971097
}
10981098
}
10991099

1100+
staticbool
1101+
is_utf16_surrogate_first(pg_wchar c)
1102+
{
1103+
return (c >=0xD800 && c <=0xDBFF);
1104+
}
1105+
1106+
staticbool
1107+
is_utf16_surrogate_second(pg_wchar c)
1108+
{
1109+
return (c >=0xDC00 && c <=0xDFFF);
1110+
}
1111+
1112+
static pg_wchar
1113+
surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1114+
{
1115+
return ((first &0x3FF) <<10) +0x10000 + (second &0x3FF);
1116+
}
1117+
11001118
staticchar *
11011119
litbuf_udeescape(unsignedchar escape,base_yyscan_t yyscanner)
11021120
{
11031121
char *new;
11041122
char *litbuf, *in, *out;
1123+
pg_wchar pair_first =0;
11051124

11061125
if (isxdigit(escape)
11071126
|| escape =='+'
@@ -1131,16 +1150,39 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11311150
{
11321151
if (in[1] == escape)
11331152
{
1153+
if (pair_first)
1154+
{
1155+
ADVANCE_YYLLOC(in - litbuf +3);/* 3 for U&" */
1156+
yyerror("invalid Unicode surrogate pair");
1157+
}
11341158
*out++ = escape;
11351159
in +=2;
11361160
}
11371161
elseif (isxdigit(in[1]) &&isxdigit(in[2]) &&isxdigit(in[3]) &&isxdigit(in[4]))
11381162
{
11391163
pg_wchar unicode =hexval(in[1]) *16*16*16 +hexval(in[2]) *16*16 +hexval(in[3]) *16 +hexval(in[4]);
11401164
check_unicode_value(unicode, in, yyscanner);
1141-
unicode_to_utf8(unicode, (unsignedchar *) out);
1165+
if (pair_first)
1166+
{
1167+
if (is_utf16_surrogate_second(unicode))
1168+
{
1169+
unicode =surrogate_pair_to_codepoint(pair_first, unicode);
1170+
pair_first =0;
1171+
}
1172+
else
1173+
{
1174+
ADVANCE_YYLLOC(in - litbuf +3);/* 3 for U&" */
1175+
yyerror("invalid Unicode surrogate pair");
1176+
}
1177+
}
1178+
if (is_utf16_surrogate_first(unicode))
1179+
pair_first = unicode;
1180+
else
1181+
{
1182+
unicode_to_utf8(unicode, (unsignedchar *) out);
1183+
out +=pg_mblen(out);
1184+
}
11421185
in +=5;
1143-
out +=pg_mblen(out);
11441186
}
11451187
elseif (in[1] =='+'
11461188
&&isxdigit(in[2]) &&isxdigit(in[3])
@@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11501192
pg_wchar unicode =hexval(in[2]) *16*16*16*16*16 +hexval(in[3]) *16*16*16*16 +hexval(in[4]) *16*16*16
11511193
+hexval(in[5]) *16*16 +hexval(in[6]) *16 +hexval(in[7]);
11521194
check_unicode_value(unicode, in, yyscanner);
1153-
unicode_to_utf8(unicode, (unsignedchar *) out);
1195+
if (pair_first)
1196+
{
1197+
if (is_utf16_surrogate_second(unicode))
1198+
{
1199+
unicode =surrogate_pair_to_codepoint(pair_first, unicode);
1200+
pair_first =0;
1201+
}
1202+
else
1203+
{
1204+
ADVANCE_YYLLOC(in - litbuf +3);/* 3 for U&" */
1205+
yyerror("invalid Unicode surrogate pair");
1206+
}
1207+
}
1208+
if (is_utf16_surrogate_first(unicode))
1209+
pair_first = unicode;
1210+
else
1211+
{
1212+
unicode_to_utf8(unicode, (unsignedchar *) out);
1213+
out +=pg_mblen(out);
1214+
}
11541215
in +=8;
1155-
out +=pg_mblen(out);
11561216
}
11571217
else
11581218
{
@@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11611221
}
11621222
}
11631223
else
1224+
{
1225+
if (pair_first)
1226+
{
1227+
ADVANCE_YYLLOC(in - litbuf +3);/* 3 for U&" */
1228+
yyerror("invalid Unicode surrogate pair");
1229+
}
11641230
*out++ = *in++;
1231+
}
11651232
}
11661233

11671234
*out ='\0';

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp