NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commitcb76fbd

committed

Make regexp engine's backref-related compilation state more bulletproof.

Up to now, we remembered the definition of a capturing parenthesissubexpression by storing a pointer to the associated subRE node.That was okay before, because that subRE didn't get modified anymorewhile parsing the rest of the regexp. However, in the wake ofcommitea1268f, that's no longer true: the outer invocation ofparseqatom() feels free to scribble on that subRE. This seems towork anyway, because the states we jam into the child atom in the"prepare a general-purpose state skeleton" stanza aren't reallysemantically different from the original endpoints of the child atom.But that would be mighty easy to break, and it's definitely not howthings worked before.Between this and the issue fixed in the prior commit, it seems bestto get rid of this dependence on subRE nodes entirely. We don't needthe whole child subRE for future backrefs, only its starting and endingNFA states; so let's just store pointers to those.Also, in the corner case where we make an extra subRE to handleimmediately-nested capturing parentheses, it seems like it'd be smartto have the extra subRE have the same begin/end states as the originalchild subRE does (s/s2 not lp/rp). I think that linking it from lp torp might actually be semantically wrong, though since Spencer's originalcode did it that way, I'm not totally certain. Using s/s2 is certainlynot wrong, in any case.Per report from Mark Dilger. Back-patch to v14 where the problematicpatches came in.Discussion:https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com

1 parentcc18687 commitcb76fbdCopy full SHA for cb76fbd

File tree

1 file changed

+35

-22

lines changed

src/backend/regex
- regcomp.c

1 file changed

+35

-22

lines changed

`‎src/backend/regex/regcomp.c`

Lines changed: 35 additions & 22 deletions

Original file line number	Diff line number	Diff line change
`@@ -233,6 +233,13 @@ static intcmp(const chr , const chr , size_t);`
`233`	`233`	`staticintcasecmp(constchr,constchr,size_t);`
`234`	`234`
`235`	`235`
	`236`	`+/* info we need during compilation about a known capturing subexpression */`
	`237`	`+structsubinfo`
	`238`	`+{`
	`239`	`+structstateleft;/ left end of its sub-NFA */`
	`240`	`+structstateright;/ right end of its sub-NFA */`
	`241`	`+};`
	`242`	`+`
`236`	`243`	`/* internal variables, bundled for easy passing around */`
`237`	`244`	`structvars`
`238`	`245`	`{`
`@@ -245,10 +252,10 @@ struct vars`
`245`	`252`	`intnexttype;/* type of next token */`
`246`	`253`	`chrnextvalue;/* value (if any) of next token */`
`247`	`254`	`intlexcon;/* lexical context type (see regc_lex.c) */`
`248`		`-intnsubexp;/subexpression count /`
`249`		`-structsubre*subs;/subRE pointer vector */`
`250`		`-size_tnsubs;/* length of vector */`
`251`		`-structsubresub10[10];/ initial vector, enough for most */`
	`255`	`+intnsubexp;/number of known capturing subexpressions /`
	`256`	`+structsubinfosubs;/info about known capturing subexpressions */`
	`257`	`+size_tnsubs;/allocatedlength of subs[] vector /`
	`258`	`+structsubinfosub10[10];/* initial vector, enough for most */`
`252`	`259`	`structnfanfa;/ the NFA */`
`253`	`260`	`structcolormapcm;/ character color map */`
`254`	`261`	`colornlcolor;/* color of newline */`
`@@ -368,7 +375,7 @@ pg_regcomp(regex_t *re,`
`368`	`375`	`v->subs=v->sub10;`
`369`	`376`	`v->nsubs=10;`
`370`	`377`	`for (j=0;j<v->nsubs;j++)`
`371`		`-v->subs[j]=NULL;`
	`378`	`+v->subs[j].left=v->subs[j].right=NULL;`
`372`	`379`	`v->nfa=NULL;`
`373`	`380`	`v->cm=NULL;`
`374`	`381`	`v->nlcolor=COLORLESS;`
`@@ -504,35 +511,35 @@ pg_regcomp(regex_t *re,`
`504`	`511`	`}`
`505`	`512`
`506`	`513`	`/*`
`507`		`- * moresubs - enlargesubRE vector`
	`514`	`+ * moresubs - enlargecapturing-subexpressions vector`
`508`	`515`	`*/`
`509`	`516`	`staticvoid`
`510`	`517`	`moresubs(structvars*v,`
`511`	`518`	`intwanted)/* want enough room for this one */`
`512`	`519`	`{`
`513`		`-structsubre**p;`
	`520`	`+structsubinfo*p;`
`514`	`521`	`size_tn;`
`515`	`522`
`516`	`523`	`assert(wanted>0&& (size_t)wanted >=v->nsubs);`
`517`	`524`	`n= (size_t)wanted*3 /2+1;`
`518`	`525`
`519`	`526`	`if (v->subs==v->sub10)`
`520`	`527`	`{`
`521`		`-p= (structsubre*)MALLOC(nsizeof(structsubre*));`
	`528`	`+p= (structsubinfo)MALLOC(nsizeof(structsubinfo));`
`522`	`529`	`if (p!=NULL)`
`523`	`530`	`memcpy(VS(p),VS(v->subs),`
`524`		`-v->nsubssizeof(structsubre));`
	`531`	`+v->nsubs*sizeof(structsubinfo));`
`525`	`532`	`}`
`526`	`533`	`else`
`527`		`-p= (structsubre*)REALLOC(v->subs,nsizeof(structsubre*));`
	`534`	`+p= (structsubinfo)REALLOC(v->subs,nsizeof(structsubinfo));`
`528`	`535`	`if (p==NULL)`
`529`	`536`	`{`
`530`	`537`	`ERR(REG_ESPACE);`
`531`	`538`	`return;`
`532`	`539`	`}`
`533`	`540`	`v->subs=p;`
`534`	`541`	`for (p=&v->subs[v->nsubs];v->nsubs<n;p++,v->nsubs++)`
`535`		`-*p=NULL;`
	`542`	`+p->left=p->right=NULL;`
`536`	`543`	`assert(v->nsubs==n);`
`537`	`544`	`assert((size_t)wanted<v->nsubs);`
`538`	`545`	`}`
`@@ -969,10 +976,14 @@ parseqatom(struct vars *v,`
`969`	`976`	`NEXT();`
`970`	`977`
`971`	`978`	`/*`
`972`		`- * Make separate endpoints to ensure we keep this sub-NFA cleanly`
`973`		`- * separate from what surrounds it. We need to be sure that when`
`974`		`- * we duplicate the sub-NFA for a backref, we get the right states`
`975`		`- * and no others.`
	`979`	`+ * Make separate endpoint states to keep this sub-NFA distinct`
	`980`	`+ * from what surrounds it. We need to be sure that when we`
	`981`	`+ * duplicate the sub-NFA for a backref, we get the right`
	`982`	`+ * states/arcs and no others. In particular, letting a backref`
	`983`	`+ * duplicate the sub-NFA from lp to rp would be quite wrong,`
	`984`	`+ * because we may add quantification superstructure around this`
	`985`	`+ * atom below. (Perhaps we could skip the extra states for`
	`986`	`+ * non-capturing parens, but it seems not worth the trouble.)`
`976`	`987`	`*/`
`977`	`988`	`s=newstate(v->nfa);`
`978`	`989`	`s2=newstate(v->nfa);`
`@@ -986,8 +997,10 @@ parseqatom(struct vars *v,`
`986`	`997`	`NOERRN();`
`987`	`998`	`if (cap)`
`988`	`999`	`{`
`989`		`-assert(v->subs[subno]==NULL);`
`990`		`-v->subs[subno]=atom;`
	`1000`	`+/* save the sub-NFA's endpoints for future backrefs to use */`
	`1001`	`+assert(v->subs[subno].left==NULL);`
	`1002`	`+v->subs[subno].left=s;`
	`1003`	`+v->subs[subno].right=s2;`
`991`	`1004`	`if (atom->capno==0)`
`992`	`1005`	`{`
`993`	`1006`	`/* normal case: just mark the atom as capturing */`
`@@ -997,7 +1010,7 @@ parseqatom(struct vars *v,`
`997`	`1010`	`else`
`998`	`1011`	`{`
`999`	`1012`	`/* generate no-op wrapper node to handle "((x))" */`
`1000`		`-t=subre(v,'(',atom->flags \|CAP,lp,rp);`
	`1013`	`+t=subre(v,'(',atom->flags \|CAP,s,s2);`
`1001`	`1014`	`NOERRN();`
`1002`	`1015`	`t->capno=subno;`
`1003`	`1016`	`t->child=atom;`
`@@ -1009,7 +1022,7 @@ parseqatom(struct vars *v,`
`1009`	`1022`	`caseBACKREF:/* the Feature From The Black Lagoon */`
`1010`	`1023`	`INSIST(type!=LACON,REG_ESUBREG);`
`1011`	`1024`	`INSIST(v->nextvalue<v->nsubs,REG_ESUBREG);`
`1012`		`-INSIST(v->subs[v->nextvalue]!=NULL,REG_ESUBREG);`
	`1025`	`+INSIST(v->subs[v->nextvalue].left!=NULL,REG_ESUBREG);`
`1013`	`1026`	`NOERRN();`
`1014`	`1027`	`assert(v->nextvalue>0);`
`1015`	`1028`	`atom=subre(v,'b',BACKR,lp,rp);`
`@@ -1084,7 +1097,7 @@ parseqatom(struct vars *v,`
`1084`	`1097`	`if (atom!=NULL)`
`1085`	`1098`	`freesubre(v,atom);`
`1086`	`1099`	`if (atomtype=='(')`
`1087`		`-v->subs[subno]=NULL;`
	`1100`	`+v->subs[subno].left=v->subs[subno].right=NULL;`
`1088`	`1101`	`delsub(v->nfa,lp,rp);`
`1089`	`1102`	`EMPTYARC(lp,rp);`
`1090`	`1103`	`returntop;`
`@@ -1177,14 +1190,14 @@ parseqatom(struct vars *v,`
`1177`	`1190`	`{`
`1178`	`1191`	`assert(atom->begin->nouts==1);/* just the EMPTY */`
`1179`	`1192`	`delsub(v->nfa,atom->begin,atom->end);`
`1180`		`-assert(v->subs[subno]!=NULL);`
	`1193`	`+assert(v->subs[subno].left!=NULL);`
`1181`	`1194`
`1182`	`1195`	`/*`
`1183`	`1196`	`* And here's why the recursion got postponed: it must wait until the`
`1184`	`1197`	`* skeleton is filled in, because it may hit a backref that wants to`
`1185`	`1198`	`* copy the filled-in skeleton.`
`1186`	`1199`	`*/`
`1187`		`-dupnfa(v->nfa,v->subs[subno]->begin,v->subs[subno]->end,`
	`1200`	`+dupnfa(v->nfa,v->subs[subno].left,v->subs[subno].right,`
`1188`	`1201`	`atom->begin,atom->end);`
`1189`	`1202`	`NOERRN();`
`1190`	`1203`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commitcb76fbd

File tree

1 file changed

1 file changed

`‎src/backend/regex/regcomp.c`

0 commit comments