NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commitd809fd0

committed

Improve parser's one-extra-token lookahead mechanism.

There are a couple of places in our grammar that fail to be strict LALR(1), by requiring more than a single token of lookahead to decide what to do. Up to now we've dealt with that by using a filter between the lexer and parser that merges adjacent tokens into one in the places where two tokens of lookahead are necessary. But that creates a number of user-visible anomalies, for instance that you can't name a CTE "ordinality" because "WITH ordinality AS ..." triggers folding of WITH and ORDINALITY into one token. I realized that there's a better way.

In this patch, we still do the lookahead basically as before, but we never merge the second token into the first; we replace just the first token by a special lookahead symbol when one of the lookahead pairs is seen. This requires a couple extra productions in the grammar, but it involves fewer special tokens, so that the grammar tables come out a bit smaller than before. The filter logic is no slower than before, perhaps a bit faster.

I also fixed the filter logic so that when backing up after a lookahead, the current token's terminator is correctly restored; this eliminates some weird behavior in error message issuance, as is shown by the one change in existing regression test outputs.

I believe that this patch entirely eliminates odd behaviors caused by lookahead for WITH. It doesn't really improve the situation for NULLS followed by FIRST/LAST unfortunately: those sequences still act like a reserved word, even though there are cases where they should be seen as two ordinary identifiers, eg "SELECT nulls first FROM ...". I experimented with additional grammar hacks but couldn't find any simple solution for that. Still, this is better than before, and it seems much more likely that we *could* somehow solve the NULLS case on the basis of this filter behavior than the previous one.

1 parent 23a7835, commit d809fd0 (Copy full SHA for d809fd0)

File tree

8 files changed

+173

-113

lines changed

src
- backend/parser
  - gram.y
  - parser.c
- include/parser
  - gramparse.h
- interfaces/ecpg/preproc
  - parse.pl
  - parser.c
- test/regress
  - expected
    - foreign_data.out
    - with.out
  - sql
    - with.sql

8 files changed

+173

-113

lines changed

`‎src/backend/parser/gram.y`

Lines changed: 25 additions & 10 deletions

Original file line number	Diff line number	Diff line change
`@@ -633,9 +633,9 @@ static Node makeRecursiveViewSelect(char relname, List aliases, Node query);`
`633`	`633`	`/*`
`634`	`634`	`* The grammar thinks these are keywords, but they are not in the kwlist.h`
`635`	`635`	`* list and so can never be entered directly. The filter in parser.c`
`636`		`- * creates these tokens when required.`
	`636`	`+ * creates these tokens when required (based on looking one token ahead).`
`637`	`637`	`*/`
`638`		`-%tokenNULLS_FIRSTNULLS_LASTWITH_ORDINALITYWITH_TIME`
	`638`	`+%tokenNULLS_LAWITH_LA`
`639`	`639`
`640`	`640`
`641`	`641`	`/* Precedence: lowest to highest*/`
`@@ -873,6 +873,7 @@ CreateRoleStmt:`
`873`	`873`
`874`	`874`
`875`	`875`	`opt_with:WITH{}`
	`876`	`+\|WITH_LA{}`
`876`	`877`	`\|/EMPTY/{}`
`877`	`878`	`;`
`878`	`879`
`@@ -6673,8 +6674,8 @@ opt_asc_desc: ASC{ $$ = SORTBY_ASC; }`
`6673`	`6674`	`\|/EMPTY/{$$ = SORTBY_DEFAULT; }`
`6674`	`6675`	`;`
`6675`	`6676`
`6676`		`-opt_nulls_order:NULLS_FIRST{$$ = SORTBY_NULLS_FIRST; }`
`6677`		`-\|NULLS_LAST{$$ = SORTBY_NULLS_LAST; }`
	`6677`	`+opt_nulls_order:NULLS_LAFIRST_P{$$ = SORTBY_NULLS_FIRST; }`
	`6678`	`+\|NULLS_LALAST_P{$$ = SORTBY_NULLS_LAST; }`
`6678`	`6679`	`\|/EMPTY/{$$ = SORTBY_NULLS_DEFAULT; }`
`6679`	`6680`	`;`
`6680`	`6681`
`@@ -8923,7 +8924,7 @@ AlterTSDictionaryStmt:`
`8923`	`8924`	`;`
`8924`	`8925`
`8925`	`8926`	`AlterTSConfigurationStmt:`
`8926`		`-ALTERTEXT_PSEARCHCONFIGURATIONany_nameADD_PMAPPINGFORname_listWITHany_name_list`
	`8927`	`+ALTERTEXT_PSEARCHCONFIGURATIONany_nameADD_PMAPPINGFORname_listany_withany_name_list`
`8927`	`8928`	`{`
`8928`	`8929`	`AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);`
`8929`	`8930`	`n->cfgname =$5;`
`@@ -8933,7 +8934,7 @@ AlterTSConfigurationStmt:`
`8933`	`8934`	`n->replace =false;`
`8934`	`8935`	`$$ = (Node*)n;`
`8935`	`8936`	`}`
`8936`		`-\|ALTERTEXT_PSEARCHCONFIGURATIONany_nameALTERMAPPINGFORname_listWITHany_name_list`
	`8937`	`+\|ALTERTEXT_PSEARCHCONFIGURATIONany_nameALTERMAPPINGFORname_listany_withany_name_list`
`8937`	`8938`	`{`
`8938`	`8939`	`AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);`
`8939`	`8940`	`n->cfgname =$5;`
`@@ -8943,7 +8944,7 @@ AlterTSConfigurationStmt:`
`8943`	`8944`	`n->replace =false;`
`8944`	`8945`	`$$ = (Node*)n;`
`8945`	`8946`	`}`
`8946`		`-\|ALTERTEXT_PSEARCHCONFIGURATIONany_nameALTERMAPPINGREPLACEany_nameWITHany_name`
	`8947`	`+\|ALTERTEXT_PSEARCHCONFIGURATIONany_nameALTERMAPPINGREPLACEany_nameany_withany_name`
`8947`	`8948`	`{`
`8948`	`8949`	`AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);`
`8949`	`8950`	`n->cfgname =$5;`
`@@ -8953,7 +8954,7 @@ AlterTSConfigurationStmt:`
`8953`	`8954`	`n->replace =true;`
`8954`	`8955`	`$$ = (Node*)n;`
`8955`	`8956`	`}`
`8956`		`-\|ALTERTEXT_PSEARCHCONFIGURATIONany_nameALTERMAPPINGFORname_listREPLACEany_nameWITHany_name`
	`8957`	`+\|ALTERTEXT_PSEARCHCONFIGURATIONany_nameALTERMAPPINGFORname_listREPLACEany_nameany_withany_name`
`8957`	`8958`	`{`
`8958`	`8959`	`AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);`
`8959`	`8960`	`n->cfgname =$5;`
`@@ -8981,6 +8982,11 @@ AlterTSConfigurationStmt:`
`8981`	`8982`	`}`
`8982`	`8983`	`;`
`8983`	`8984`
	`8985`	`+/* Use this if TIME or ORDINALITY after WITH should be taken as an identifier*/`
	`8986`	`+any_with:WITH{}`
	`8987`	`+\|WITH_LA{}`
	`8988`	`+;`
	`8989`	`+`
`8984`	`8990`
`8985`	`8991`	`/*****************************************************************************`
`8986`	`8992`	`*`
`@@ -9891,6 +9897,8 @@ simple_select:`
`9891`	`9897`	`*AS (query) [ SEARCH or CYCLE clause ]`
`9892`	`9898`	`*`
`9893`	`9899`	`* We don't currently support the SEARCH or CYCLE clause.`
	`9900`	`+ *`
	`9901`	`+ * Recognizing WITH_LA here allows a CTE to be named TIME or ORDINALITY.`
`9894`	`9902`	`*/`
`9895`	`9903`	`with_clause:`
`9896`	`9904`	`WITHcte_list`
`@@ -9900,6 +9908,13 @@ with_clause:`
`9900`	`9908`	`$$->recursive =false;`
`9901`	`9909`	`$$->location =@1;`
`9902`	`9910`	`}`
	`9911`	`+\|WITH_LActe_list`
	`9912`	`+{`
	`9913`	`+$$ = makeNode(WithClause);`
	`9914`	`+$$->ctes =$2;`
	`9915`	`+$$->recursive =false;`
	`9916`	`+$$->location =@1;`
	`9917`	`+}`
`9903`	`9918`	`\|WITHRECURSIVEcte_list`
`9904`	`9919`	`{`
`9905`	`9920`	`$$ = makeNode(WithClause);`
`@@ -10601,7 +10616,7 @@ opt_col_def_list: AS '(' TableFuncElementList ')'{ $$ = $3; }`
`10601`	`10616`	`\|/EMPTY/{$$ = NIL; }`
`10602`	`10617`	`;`
`10603`	`10618`
`10604`		`-opt_ordinality:WITH_ORDINALITY{$$ =true; }`
	`10619`	`+opt_ordinality:WITH_LAORDINALITY{$$ =true; }`
`10605`	`10620`	`\|/EMPTY/{$$ =false; }`
`10606`	`10621`	`;`
`10607`	`10622`
`@@ -11057,7 +11072,7 @@ ConstInterval:`
`11057`	`11072`	`;`
`11058`	`11073`
`11059`	`11074`	`opt_timezone:`
`11060`		`-WITH_TIMEZONE{$$ =TRUE; }`
	`11075`	`+WITH_LATIMEZONE{$$ =TRUE; }`
`11061`	`11076`	`\|WITHOUTTIMEZONE{$$ =FALSE; }`
`11062`	`11077`	`\|/EMPTY/{$$ =FALSE; }`
`11063`	`11078`	`;`

`‎src/backend/parser/parser.c`

Lines changed: 58 additions & 47 deletions

Original file line number	Diff line number	Diff line change
`@@ -64,13 +64,13 @@ raw_parser(const char *str)`
`64`	`64`	`/*`
`65`	`65`	`* Intermediate filter between parser and core lexer (core_yylex in scan.l).`
`66`	`66`	`*`
`67`		`- *The filter is needed because in some cases the standard SQL grammar`
	`67`	`+ *This filter is needed because in some cases the standard SQL grammar`
`68`	`68`	`* requires more than one token lookahead. We reduce these cases to one-token`
`69`		`- * lookahead bycombining tokens here, in order to keep the grammar LALR(1).`
	`69`	`+ * lookahead byreplacing tokens here, in order to keep the grammar LALR(1).`
`70`	`70`	`*`
`71`	`71`	`* Using a filter is simpler than trying to recognize multiword tokens`
`72`	`72`	`* directly in scan.l, because we'd have to allow for comments between the`
`73`		`- * words. Furthermore it's not clear how to doit without re-introducing`
	`73`	`+ * words. Furthermore it's not clear how to dothat without re-introducing`
`74`	`74`	`* scanner backtrack, which would cost more performance than this filter`
`75`	`75`	`* layer does.`
`76`	`76`	`*`
`@@ -84,7 +84,7 @@ base_yylex(YYSTYPE lvalp, YYLTYPE llocp, core_yyscan_t yyscanner)`
`84`	`84`	`base_yy_extra_type*yyextra=pg_yyget_extra(yyscanner);`
`85`	`85`	`intcur_token;`
`86`	`86`	`intnext_token;`
`87`		`-core_YYSTYPEcur_yylval;`
	`87`	`+intcur_token_length;`
`88`	`88`	`YYLTYPEcur_yylloc;`
`89`	`89`
`90`	`90`	`/* Get next token --- we might already have it */`
`@@ -93,74 +93,85 @@ base_yylex(YYSTYPE lvalp, YYLTYPE llocp, core_yyscan_t yyscanner)`
`93`	`93`	`cur_token=yyextra->lookahead_token;`
`94`	`94`	`lvalp->core_yystype=yyextra->lookahead_yylval;`
`95`	`95`	`*llocp=yyextra->lookahead_yylloc;`
	`96`	`+*(yyextra->lookahead_end)=yyextra->lookahead_hold_char;`
`96`	`97`	`yyextra->have_lookahead= false;`
`97`	`98`	`}`
`98`	`99`	`else`
`99`	`100`	`cur_token=core_yylex(&(lvalp->core_yystype),llocp,yyscanner);`
`100`	`101`
`101`		`-/* Do we need to look ahead for a possible multiword token? */`
	`102`	`+/*`
	`103`	`+ * If this token isn't one that requires lookahead, just return it. If it`
	`104`	`+ * does, determine the token length. (We could get that via strlen(), but`
	`105`	`+ * since we have such a small set of possibilities, hardwiring seems`
	`106`	`+ * feasible and more efficient.)`
	`107`	`+ */`
`102`	`108`	`switch (cur_token)`
`103`	`109`	`{`
`104`	`110`	`caseNULLS_P:`
	`111`	`+cur_token_length=5;`
	`112`	`+break;`
	`113`	`+caseWITH:`
	`114`	`+cur_token_length=4;`
	`115`	`+break;`
	`116`	`+default:`
	`117`	`+returncur_token;`
	`118`	`+}`
`105`	`119`
`106`		`-/*`
`107`		`- * NULLS FIRST and NULLS LAST must be reduced to one token`
`108`		`- */`
`109`		`-cur_yylval=lvalp->core_yystype;`
`110`		`-cur_yylloc=*llocp;`
`111`		`-next_token=core_yylex(&(lvalp->core_yystype),llocp,yyscanner);`
	`120`	`+/*`
	`121`	`+ * Identify end+1 of current token. core_yylex() has temporarily stored a`
	`122`	`+ * '\0' here, and will undo that when we call it again. We need to redo`
	`123`	`+ * it to fully revert the lookahead call for error reporting purposes.`
	`124`	`+ */`
	`125`	`+yyextra->lookahead_end=yyextra->core_yy_extra.scanbuf+`
	`126`	`+*llocp+cur_token_length;`
	`127`	`+Assert(*(yyextra->lookahead_end)=='\0');`
	`128`	`+`
	`129`	`+/*`
	`130`	`+ * Save and restore *llocp around the call. It might look like we could`
	`131`	`+ * avoid this by just passing &lookahead_yylloc to core_yylex(), but that`
	`132`	`+ * does not work because flex actually holds onto the last-passed pointer`
	`133`	`+ * internally, and will use that for error reporting. We need any error`
	`134`	`+ * reports to point to the current token, not the next one.`
	`135`	`+ */`
	`136`	`+cur_yylloc=*llocp;`
	`137`	`+`
	`138`	`+/* Get next token, saving outputs into lookahead variables */`
	`139`	`+next_token=core_yylex(&(yyextra->lookahead_yylval),llocp,yyscanner);`
	`140`	`+yyextra->lookahead_token=next_token;`
	`141`	`+yyextra->lookahead_yylloc=*llocp;`
	`142`	`+`
	`143`	`+*llocp=cur_yylloc;`
	`144`	`+`
	`145`	`+/* Now revert the un-truncation of the current token */`
	`146`	`+yyextra->lookahead_hold_char=*(yyextra->lookahead_end);`
	`147`	`+*(yyextra->lookahead_end)='\0';`
	`148`	`+`
	`149`	`+yyextra->have_lookahead= true;`
	`150`	`+`
	`151`	`+/* Replace cur_token if needed, based on lookahead */`
	`152`	`+switch (cur_token)`
	`153`	`+{`
	`154`	`+caseNULLS_P:`
	`155`	`+/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */`
`112`	`156`	`switch (next_token)`
`113`	`157`	`{`
`114`	`158`	`caseFIRST_P:`
`115`		`-cur_token=NULLS_FIRST;`
`116`		`-break;`
`117`	`159`	`caseLAST_P:`
`118`		`-cur_token=NULLS_LAST;`
`119`		`-break;`
`120`		`-default:`
`121`		`-/* save the lookahead token for next time */`
`122`		`-yyextra->lookahead_token=next_token;`
`123`		`-yyextra->lookahead_yylval=lvalp->core_yystype;`
`124`		`-yyextra->lookahead_yylloc=*llocp;`
`125`		`-yyextra->have_lookahead= true;`
`126`		`-/* and back up the output info to cur_token */`
`127`		`-lvalp->core_yystype=cur_yylval;`
`128`		`-*llocp=cur_yylloc;`
	`160`	`+cur_token=NULLS_LA;`
`129`	`161`	`break;`
`130`	`162`	`}`
`131`	`163`	`break;`
`132`	`164`
`133`	`165`	`caseWITH:`
`134`		`-`
`135`		`-/*`
`136`		`- * WITH TIME and WITH ORDINALITY must each be reduced to one token`
`137`		`- */`
`138`		`-cur_yylval=lvalp->core_yystype;`
`139`		`-cur_yylloc=*llocp;`
`140`		`-next_token=core_yylex(&(lvalp->core_yystype),llocp,yyscanner);`
	`166`	`+/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */`
`141`	`167`	`switch (next_token)`
`142`	`168`	`{`
`143`	`169`	`caseTIME:`
`144`		`-cur_token=WITH_TIME;`
`145`		`-break;`
`146`	`170`	`caseORDINALITY:`
`147`		`-cur_token=WITH_ORDINALITY;`
`148`		`-break;`
`149`		`-default:`
`150`		`-/* save the lookahead token for next time */`
`151`		`-yyextra->lookahead_token=next_token;`
`152`		`-yyextra->lookahead_yylval=lvalp->core_yystype;`
`153`		`-yyextra->lookahead_yylloc=*llocp;`
`154`		`-yyextra->have_lookahead= true;`
`155`		`-/* and back up the output info to cur_token */`
`156`		`-lvalp->core_yystype=cur_yylval;`
`157`		`-*llocp=cur_yylloc;`
	`171`	`+cur_token=WITH_LA;`
`158`	`172`	`break;`
`159`	`173`	`}`
`160`	`174`	`break;`
`161`		`-`
`162`		`-default:`
`163`		`-break;`
`164`	`175`	`}`
`165`	`176`
`166`	`177`	`returncur_token;`

`‎src/include/parser/gramparse.h`

Lines changed: 2 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,8 @@ typedef struct base_yy_extra_type`
`46`	`46`	`intlookahead_token;/* one-token lookahead */`
`47`	`47`	`core_YYSTYPElookahead_yylval;/* yylval for lookahead token */`
`48`	`48`	`YYLTYPElookahead_yylloc;/* yylloc for lookahead token */`
	`49`	`+charlookahead_end;/ end of current token */`
	`50`	`+charlookahead_hold_char;/* to be put back at lookahead_end /`
`49`	`51`
`50`	`52`	`/*`
`51`	`53`	`* State variables that belong to the grammar.`

`‎src/interfaces/ecpg/preproc/parse.pl`

Lines changed: 2 additions & 4 deletions

Original file line number	Diff line number	Diff line change
`@@ -42,10 +42,8 @@`
`42`	`42`
`43`	`43`	`# or in the block`
`44`	`44`	`my%replace_string = (`
`45`		`-'WITH_TIME'=>'with time',`
`46`		`-'WITH_ORDINALITY'=>'with ordinality',`
`47`		`-'NULLS_FIRST'=>'nulls first',`
`48`		`-'NULLS_LAST'=>'nulls last',`
	`45`	`+'NULLS_LA'=>'nulls',`
	`46`	`+'WITH_LA'=>'with',`
`49`	`47`	`'TYPECAST'=>'::',`
`50`	`48`	`'DOT_DOT'=>'..',`
`51`	`49`	`'COLON_EQUALS'=>':=',);`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commitd809fd0

File tree

8 files changed

8 files changed

`‎src/backend/parser/gram.y`

`‎src/backend/parser/parser.c`

`‎src/include/parser/gramparse.h`

`‎src/interfaces/ecpg/preproc/parse.pl`

0 commit comments