Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c2bb037

Browse files
committed
Unicode escapes in E'...' strings
Author: Marko Kreen <markokr@gmail.com>
1 parent 9048b73, commit c2bb037

File tree

3 files changed

+98
-9
lines changed

3 files changed

+98
-9
lines changed

‎doc/src/sgml/syntax.sgml

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.136 2009/09/22 23:52:53 petere Exp $ -->
22

33
<chapter id="sql-syntax">
44
<title>SQL Syntax</title>
@@ -398,6 +398,14 @@ SELECT 'foo' 'bar';
398398
</entry>
399399
<entry>hexadecimal byte value</entry>
400400
</row>
401+
<row>
402+
<entry>
403+
<literal>\u<replaceable>xxxx</replaceable></literal>,
404+
<literal>\U<replaceable>xxxxxxxx</replaceable></literal>
405+
(<replaceable>x</replaceable> = 0 - 9, A - F)
406+
</entry>
407+
<entry>16- or 32-bit hexadecimal Unicode character value</entry>
408+
</row>
401409
</tbody>
402410
</tgroup>
403411
</table>
@@ -411,13 +419,25 @@ SELECT 'foo' 'bar';
411419
</para>
412420

413421
<para>
414-
It is your responsibility that the byte sequences you create are
422+
It is your responsibility that the byte sequences you create,
423+
especially when using the octal or hexadecimal escapes, compose
415424
valid characters in the server character set encoding. When the
416-
server encoding is UTF-8, then the alternative Unicode escape
417-
syntax, explained in <xref linkend="sql-syntax-strings-uescape">,
418-
should be used instead. (The alternative would be doing the
419-
UTF-8 encoding by hand and writing out the bytes, which would be
420-
very cumbersome.)
425+
server encoding is UTF-8, then the Unicode escapes or the
426+
alternative Unicode escape syntax, explained
427+
in <xref linkend="sql-syntax-strings-uescape">, should be used
428+
instead. (The alternative would be doing the UTF-8 encoding by
429+
hand and writing out the bytes, which would be very cumbersome.)
430+
</para>
431+
432+
<para>
433+
The Unicode escape syntax works fully only when the server
434+
encoding is UTF-8. When other server encodings are used, only
435+
code points in the ASCII range (up to <literal>\u007F</>) can be
436+
specified. Both the 4-digit and the 8-digit form can be used to
437+
specify UTF-16 surrogate pairs to compose characters with code
438+
points larger than <literal>\uFFFF</literal> (although the
439+
availability of the 8-digit form technically makes this
440+
unnecessary).
421441
</para>
422442

423443
<caution>

‎src/backend/parser/scan.l

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* Portions Copyright (c) 1994, Regents of the University of California
2525
*
2626
* IDENTIFICATION
27-
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
27+
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
2828
*
2929
*-------------------------------------------------------------------------
3030
*/
@@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner);
8080
staticchar *litbufdup(base_yyscan_t yyscanner);
8181
staticchar *litbuf_udeescape(unsignedchar escape,base_yyscan_t yyscanner);
8282
staticunsignedcharunescape_single_char(unsignedchar c,base_yyscan_t yyscanner);
83+
staticboolis_utf16_surrogate_first(pg_wchar c);
84+
staticboolis_utf16_surrogate_second(pg_wchar c);
85+
static pg_wcharsurrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
8386

8487
#defineyyerror(msg) scanner_yyerror(msg, yyscanner)
8588

@@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
97100
externintbase_yyget_column(yyscan_t yyscanner);
98101
externvoidbase_yyset_column(int column_no,yyscan_t yyscanner);
99102

103+
staticvoidaddunicode(pg_wchar c,yyscan_t yyscanner);
104+
100105
%}
101106

102107
%optionreentrant
@@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
134139
* <xdolq> $foo$ quoted strings
135140
* <xui> quoted identifier with Unicode escapes
136141
* <xus> quoted string with Unicode escapes
142+
* <xeu> Unicode surrogate pair in extended quoted string
137143
*/
138144

139145
%xxb
@@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
145151
%xxdolq
146152
%xxui
147153
%xxus
154+
%xxeu
148155

149156
/*
150157
* In order to make the world safe for Windows and Mac clients as well as
@@ -223,6 +230,8 @@ xeinside[^\\']+
223230
xeescape[\\][^0-7]
224231
xeoctesc[\\][0-7]{1,3}
225232
xehexesc[\\]x[0-9A-Fa-f]{1,2}
233+
xeunicode[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
234+
xeunicodebad[\\]([uU])
226235

227236
/* Extended quote
228237
* xqdouble implements embedded quote, ''''
@@ -535,6 +544,45 @@ other.
535544
<xe>{xeinside} {
536545
addlit(yytext, yyleng, yyscanner);
537546
}
547+
<xe>{xeunicode} {
548+
pg_wchar c =strtoul(yytext+2,NULL,16);
549+
550+
check_escape_warning(yyscanner);
551+
552+
if (is_utf16_surrogate_first(c))
553+
{
554+
yyextra->utf16_first_part = c;
555+
BEGIN(xeu);
556+
}
557+
elseif (is_utf16_surrogate_second(c))
558+
yyerror("invalid Unicode surrogate pair");
559+
else
560+
addunicode(c, yyscanner);
561+
}
562+
<xeu>{xeunicode} {
563+
pg_wchar c =strtoul(yytext+2,NULL,16);
564+
565+
if (!is_utf16_surrogate_second(c))
566+
yyerror("invalid Unicode surrogate pair");
567+
568+
c =surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
569+
570+
addunicode(c, yyscanner);
571+
572+
BEGIN(xe);
573+
}
574+
<xeu>.|
575+
<xeu>\n|
576+
<xeu><<EOF>>{yyerror("invalid Unicode surrogate pair"); }
577+
578+
<xe>{xeunicodebad}{
579+
ereport(ERROR,
580+
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
581+
errmsg("invalid Unicode escape"),
582+
errhint("Unicode escapes must be\\uXXXX or\\UXXXXXXXX."),
583+
lexer_errposition()));
584+
}
585+
538586
<xe>{xeescape} {
539587
if (yytext[1] =='\'')
540588
{
@@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
13301378
if (ptr)
13311379
pfree(ptr);
13321380
}
1381+
1382+
staticvoid
1383+
addunicode(pg_wchar c,base_yyscan_t yyscanner)
1384+
{
1385+
char buf[8];
1386+
1387+
if (c ==0 || c >0x10FFFF)
1388+
yyerror("invalid Unicode escape value");
1389+
if (c >0x7F)
1390+
{
1391+
if (GetDatabaseEncoding() != PG_UTF8)
1392+
yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1393+
yyextra->saw_non_ascii =true;
1394+
}
1395+
unicode_to_utf8(c, (unsignedchar *)buf);
1396+
addlit(buf,pg_mblen(buf), yyscanner);
1397+
}
1398+

‎src/include/parser/gramparse.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
1212
* Portions Copyright (c) 1994, Regents of the University of California
1313
*
14-
* $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.47 2009/07/14 20:24:10 tgl Exp $
14+
* $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.48 2009/09/22 23:52:53 petere Exp $
1515
*
1616
*-------------------------------------------------------------------------
1717
*/
@@ -71,6 +71,9 @@ typedef struct base_yy_extra_type
7171
intxcdepth;/* depth of nesting in slash-star comments */
7272
char*dolqstart;/* current $foo$ quote start string */
7373

74+
/* first part of UTF16 surrogate pair for Unicode escapes */
75+
int32utf16_first_part;
76+
7477
/* state variables for literal-lexing warnings */
7578
boolwarn_on_first_escape;
7679
boolsaw_non_ascii;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp