3333 * Portions Copyright (c) 1994, Regents of the University of California
3434 *
3535 * IDENTIFICATION
36- * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.28 2009/01/01 17:23:55 momjian Exp $
36+ * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.29 2009/09/27 03:27:24 tgl Exp $
3737 *
3838 *-------------------------------------------------------------------------
3939*/
@@ -117,6 +117,7 @@ static void push_new_buffer(const char *newstr);
117117static YY_BUFFER_STATEprepare_buffer (const char *txt,int len,
118118char **txtcopy);
119119static void emit (const char *txt,int len);
120+ static bool is_utf16_surrogate_first (uint32 c);
120121
121122#define ECHO emit (yytext, yyleng)
122123
@@ -158,6 +159,7 @@ static void emit(const char *txt, int len);
158159 * <xdolq> $foo$ quoted strings
159160 * <xui> quoted identifier with Unicode escapes
160161 * <xus> quoted string with Unicode escapes
162+ * <xeu> Unicode surrogate pair in extended quoted string
161163*/
162164
163165%x xb
@@ -169,6 +171,7 @@ static void emit(const char *txt, int len);
%x xdolq
%x xui
%x xus
/* xeu: Unicode surrogate pair in extended quoted string */
%x xeu
/* Additional exclusive states for psql only: lex backslash commands */
%x xslashcmd
%x xslasharg
@@ -192,6 +195,9 @@ static void emit(const char *txt, int len);
192195 * did not end with a newline.
193196 *
194197 * XXX perhaps \f (formfeed) should be treated as a newline as well?
198+ *
199+ * XXX if you change the set of whitespace characters, fix scanner_isspace()
200+ * to agree, and see also the plpgsql lexer.
195201*/
196202
197203space[ \t\n\r\f]
@@ -253,6 +259,8 @@ xeinside[^\\']+
/*
 * Backslash escapes inside extended quoted strings.  Every pattern starts
 * with a single backslash:
 *   xeescape       backslash plus one character that is not an octal digit
 *   xeoctesc       backslash plus one to three octal digits
 *   xehexesc       backslash, 'x', then one or two hex digits
 *   xeunicode      backslash, then 'u' with exactly four hex digits or
 *                  'U' with exactly eight
 *   xeunicodefail  same prefixes as xeunicode but with too few hex digits
 *                  (0-3 after 'u', 0-7 after 'U')
 */
xeescape		[\\][^0-7]
xeoctesc		[\\][0-7]{1,3}
xehexesc		[\\]x[0-9A-Fa-f]{1,2}
xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodefail	[\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
256264
257265/* Extended quote
258266 * xqdouble implements embedded quote,' ' ' '
@@ -334,6 +342,10 @@ identifier{ident_start}{ident_cont}*
334342
/* The two-character cast operator. */
typecast		"::"

/* these two token types are used by PL/pgsql, though not in core SQL */
dot_dot			\.\.
colon_equals	":="
348+
337349/*
338350 * "self" is the set of chars that should be returned as single-character
339351 * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
@@ -511,6 +523,22 @@ other.
511523<xe>{xeinside} {
512524ECHO;
513525}
<xe>{xeunicode} {
					/*
					 * yytext holds the backslash, the 'u' or 'U', and the
					 * hex digits; skip the two-character prefix and parse
					 * the digits as a base-16 number.
					 */
					uint32		c = strtoul(yytext + 2, NULL, 16);

					/*
					 * After the first half of a surrogate pair, switch to
					 * <xeu> to look for the second half.
					 */
					if (is_utf16_surrogate_first(c))
						BEGIN(xeu);
					ECHO;
				}
<xeu>{xeunicode} {
					/* Second escape seen: go back to normal <xe> scanning. */
					BEGIN(xe);
					ECHO;
				}
	/*
	 * Any other single character or newline in <xeu> is echoed; these
	 * rules do not change the start condition.
	 */
<xeu>.			{ ECHO; }
<xeu>\n			{ ECHO; }
	/* A truncated Unicode escape is passed through unchanged. */
<xe,xeu>{xeunicodefail}	{
					ECHO;
				}
514542<xe>{xeescape} {
515543ECHO;
516544}
@@ -605,6 +633,14 @@ other.
605633ECHO;
606634}
607635
636+ {dot_dot}{
637+ ECHO;
638+ }
639+
640+ {colon_equals}{
641+ ECHO;
642+ }
643+
608644/*
609645 * These rules are specific to psql --- they implement parenthesis
610646 * counting and detection of command-ending semicolon. These must
@@ -1690,3 +1726,9 @@ emit(const char *txt, int len)
16901726}
16911727}
16921728}
1729+
1730+ static bool
1731+ is_utf16_surrogate_first (uint32 c)
1732+ {
1733+ return (c >=0xD800 && c <=0xDBFF );
1734+ }