Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitcb76fbd

Browse files
committed
Make regexp engine's backref-related compilation state more bulletproof.
Up to now, we remembered the definition of a capturing parenthesissubexpression by storing a pointer to the associated subRE node.That was okay before, because that subRE didn't get modified anymorewhile parsing the rest of the regexp. However, in the wake ofcommitea1268f, that's no longer true: the outer invocation ofparseqatom() feels free to scribble on that subRE. This seems towork anyway, because the states we jam into the child atom in the"prepare a general-purpose state skeleton" stanza aren't reallysemantically different from the original endpoints of the child atom.But that would be mighty easy to break, and it's definitely not howthings worked before.Between this and the issue fixed in the prior commit, it seems bestto get rid of this dependence on subRE nodes entirely. We don't needthe whole child subRE for future backrefs, only its starting and endingNFA states; so let's just store pointers to those.Also, in the corner case where we make an extra subRE to handleimmediately-nested capturing parentheses, it seems like it'd be smartto have the extra subRE have the same begin/end states as the originalchild subRE does (s/s2 not lp/rp). I think that linking it from lp torp might actually be semantically wrong, though since Spencer's originalcode did it that way, I'm not totally certain. Using s/s2 is certainlynot wrong, in any case.Per report from Mark Dilger. Back-patch to v14 where the problematicpatches came in.Discussion:https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com
1 parentcc18687 commitcb76fbd

File tree

1 file changed

+35
-22
lines changed

1 file changed

+35
-22
lines changed

‎src/backend/regex/regcomp.c

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,13 @@ static intcmp(const chr *, const chr *, size_t);
233233
staticintcasecmp(constchr*,constchr*,size_t);
234234

235235

236+
/* info we need during compilation about a known capturing subexpression */
237+
structsubinfo
238+
{
239+
structstate*left;/* left end of its sub-NFA */
240+
structstate*right;/* right end of its sub-NFA */
241+
};
242+
236243
/* internal variables, bundled for easy passing around */
237244
structvars
238245
{
@@ -245,10 +252,10 @@ struct vars
245252
intnexttype;/* type of next token */
246253
chrnextvalue;/* value (if any) of next token */
247254
intlexcon;/* lexical context type (see regc_lex.c) */
248-
intnsubexp;/*subexpression count */
249-
structsubre**subs;/*subRE pointer vector */
250-
size_tnsubs;/* length of vector */
251-
structsubre*sub10[10];/* initial vector, enough for most */
255+
intnsubexp;/*number of known capturing subexpressions */
256+
structsubinfo*subs;/*info about known capturing subexpressions */
257+
size_tnsubs;/*allocatedlength of subs[] vector */
258+
structsubinfosub10[10];/* initial vector, enough for most */
252259
structnfa*nfa;/* the NFA */
253260
structcolormap*cm;/* character color map */
254261
colornlcolor;/* color of newline */
@@ -368,7 +375,7 @@ pg_regcomp(regex_t *re,
368375
v->subs=v->sub10;
369376
v->nsubs=10;
370377
for (j=0;j<v->nsubs;j++)
371-
v->subs[j]=NULL;
378+
v->subs[j].left=v->subs[j].right=NULL;
372379
v->nfa=NULL;
373380
v->cm=NULL;
374381
v->nlcolor=COLORLESS;
@@ -504,35 +511,35 @@ pg_regcomp(regex_t *re,
504511
}
505512

506513
/*
507-
* moresubs - enlargesubRE vector
514+
* moresubs - enlargecapturing-subexpressions vector
508515
*/
509516
staticvoid
510517
moresubs(structvars*v,
511518
intwanted)/* want enough room for this one */
512519
{
513-
structsubre**p;
520+
structsubinfo*p;
514521
size_tn;
515522

516523
assert(wanted>0&& (size_t)wanted >=v->nsubs);
517524
n= (size_t)wanted*3 /2+1;
518525

519526
if (v->subs==v->sub10)
520527
{
521-
p= (structsubre**)MALLOC(n*sizeof(structsubre*));
528+
p= (structsubinfo*)MALLOC(n*sizeof(structsubinfo));
522529
if (p!=NULL)
523530
memcpy(VS(p),VS(v->subs),
524-
v->nsubs*sizeof(structsubre*));
531+
v->nsubs*sizeof(structsubinfo));
525532
}
526533
else
527-
p= (structsubre**)REALLOC(v->subs,n*sizeof(structsubre*));
534+
p= (structsubinfo*)REALLOC(v->subs,n*sizeof(structsubinfo));
528535
if (p==NULL)
529536
{
530537
ERR(REG_ESPACE);
531538
return;
532539
}
533540
v->subs=p;
534541
for (p=&v->subs[v->nsubs];v->nsubs<n;p++,v->nsubs++)
535-
*p=NULL;
542+
p->left=p->right=NULL;
536543
assert(v->nsubs==n);
537544
assert((size_t)wanted<v->nsubs);
538545
}
@@ -969,10 +976,14 @@ parseqatom(struct vars *v,
969976
NEXT();
970977

971978
/*
972-
* Make separate endpoints to ensure we keep this sub-NFA cleanly
973-
* separate from what surrounds it. We need to be sure that when
974-
* we duplicate the sub-NFA for a backref, we get the right states
975-
* and no others.
979+
* Make separate endpoint states to keep this sub-NFA distinct
980+
* from what surrounds it. We need to be sure that when we
981+
* duplicate the sub-NFA for a backref, we get the right
982+
* states/arcs and no others. In particular, letting a backref
983+
* duplicate the sub-NFA from lp to rp would be quite wrong,
984+
* because we may add quantification superstructure around this
985+
* atom below. (Perhaps we could skip the extra states for
986+
* non-capturing parens, but it seems not worth the trouble.)
976987
*/
977988
s=newstate(v->nfa);
978989
s2=newstate(v->nfa);
@@ -986,8 +997,10 @@ parseqatom(struct vars *v,
986997
NOERRN();
987998
if (cap)
988999
{
989-
assert(v->subs[subno]==NULL);
990-
v->subs[subno]=atom;
1000+
/* save the sub-NFA's endpoints for future backrefs to use */
1001+
assert(v->subs[subno].left==NULL);
1002+
v->subs[subno].left=s;
1003+
v->subs[subno].right=s2;
9911004
if (atom->capno==0)
9921005
{
9931006
/* normal case: just mark the atom as capturing */
@@ -997,7 +1010,7 @@ parseqatom(struct vars *v,
9971010
else
9981011
{
9991012
/* generate no-op wrapper node to handle "((x))" */
1000-
t=subre(v,'(',atom->flags |CAP,lp,rp);
1013+
t=subre(v,'(',atom->flags |CAP,s,s2);
10011014
NOERRN();
10021015
t->capno=subno;
10031016
t->child=atom;
@@ -1009,7 +1022,7 @@ parseqatom(struct vars *v,
10091022
caseBACKREF:/* the Feature From The Black Lagoon */
10101023
INSIST(type!=LACON,REG_ESUBREG);
10111024
INSIST(v->nextvalue<v->nsubs,REG_ESUBREG);
1012-
INSIST(v->subs[v->nextvalue]!=NULL,REG_ESUBREG);
1025+
INSIST(v->subs[v->nextvalue].left!=NULL,REG_ESUBREG);
10131026
NOERRN();
10141027
assert(v->nextvalue>0);
10151028
atom=subre(v,'b',BACKR,lp,rp);
@@ -1084,7 +1097,7 @@ parseqatom(struct vars *v,
10841097
if (atom!=NULL)
10851098
freesubre(v,atom);
10861099
if (atomtype=='(')
1087-
v->subs[subno]=NULL;
1100+
v->subs[subno].left=v->subs[subno].right=NULL;
10881101
delsub(v->nfa,lp,rp);
10891102
EMPTYARC(lp,rp);
10901103
returntop;
@@ -1177,14 +1190,14 @@ parseqatom(struct vars *v,
11771190
{
11781191
assert(atom->begin->nouts==1);/* just the EMPTY */
11791192
delsub(v->nfa,atom->begin,atom->end);
1180-
assert(v->subs[subno]!=NULL);
1193+
assert(v->subs[subno].left!=NULL);
11811194

11821195
/*
11831196
* And here's why the recursion got postponed: it must wait until the
11841197
* skeleton is filled in, because it may hit a backref that wants to
11851198
* copy the filled-in skeleton.
11861199
*/
1187-
dupnfa(v->nfa,v->subs[subno]->begin,v->subs[subno]->end,
1200+
dupnfa(v->nfa,v->subs[subno].left,v->subs[subno].right,
11881201
atom->begin,atom->end);
11891202
NOERRN();
11901203

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp