Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitd4f6488

Browse files
committed
Add some more query-cancel checks to regular expression matching.
Commit9662143 added infrastructure toallow regular-expression operations to be terminated early in the eventof SIGINT etc. However, fuzz testing by Greg Stark disclosed that thereare still cases where regex compilation could run for a long time withoutnoticing a cancel request. Specifically, the fixempties() phase neveradds new states, only new arcs, so it doesn't hit the cancel check I'd putin newstate(). Add one to newarc() as well to cover that.Some experimentation of my own found that regex execution could also runfor a long time despite a pending cancel. We'd put a high-level cancelcheck into cdissect(), but there was none inside the core text-matchingroutines longest() and shortest(). Ordinarily those inner loops are veryvery fast ... but in the presence of lookahead constraints, not so much.As a compromise, stick a cancel check into the stateset cache-missfunction, which is enough to guarantee a cancel check at least once perlookahead constraint test.Making this work required more attention to error handling throughout theregex executor. Henry Spencer had apparently originally intended longest()and shortest() to be incapable of incurring errors while running, soneither they nor their subroutines had well-defined error reportingbehaviors. However, that was already broken by the lookahead constraintfeature, since lacon() can surely suffer an out-of-memory failure ---which, in the code as it stood, might never be reported to the user at all,but just silently be treated as a non-match of the lookahead constraint.Normalize all that by inserting explicit error tests as needed. I took theopportunity to add some more comments to the code, too.Back-patch to all supported branches, like the previous patch.
1 parent3fe8a6c commitd4f6488

File tree

3 files changed

+158
-37
lines changed

3 files changed

+158
-37
lines changed

‎src/backend/regex/regc_nfa.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ newstate(struct nfa * nfa)
180180
/*
181181
* This is a handy place to check for operation cancel during regex
182182
* compilation, since no code path will go very long without making a new
183-
* state.
183+
* state or arc.
184184
*/
185185
if (CANCEL_REQUESTED(nfa->v->re))
186186
{
@@ -333,6 +333,17 @@ newarc(struct nfa * nfa,
333333

334334
assert(from!=NULL&&to!=NULL);
335335

336+
/*
337+
* This is a handy place to check for operation cancel during regex
338+
* compilation, since no code path will go very long without making a new
339+
* state or arc.
340+
*/
341+
if (CANCEL_REQUESTED(nfa->v->re))
342+
{
343+
NERR(REG_CANCEL);
344+
return;
345+
}
346+
336347
/* check for duplicates */
337348
for (a=from->outs;a!=NULL;a=a->outchain)
338349
if (a->to==to&&a->co==co&&a->type==t)

‎src/backend/regex/rege_dfa.c

Lines changed: 115 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,12 @@
3434

3535
/*
3636
* longest - longest-preferred matching engine
37+
*
38+
* On success, returns match endpoint address. Returns NULL on no match.
39+
* Internal errors also return NULL, with v->err set.
3740
*/
38-
staticchr*/* endpoint, or NULL */
39-
longest(structvars*v,/* used only for debug and exec flags */
41+
staticchr*
42+
longest(structvars*v,
4043
structdfa*d,
4144
chr*start,/* where the match should start */
4245
chr*stop,/* match must end at or before here */
@@ -51,11 +54,15 @@ longest(struct vars * v,/* used only for debug and exec flags */
5154
inti;
5255
structcolormap*cm=d->cm;
5356

57+
/* prevent "uninitialized variable" warnings */
58+
if (hitstopp!=NULL)
59+
*hitstopp=0;
60+
5461
/* initialize */
5562
css=initialize(v,d,start);
63+
if (css==NULL)
64+
returnNULL;
5665
cp=start;
57-
if (hitstopp!=NULL)
58-
*hitstopp=0;
5966

6067
/* startup */
6168
FDEBUG(("+++ startup +++\n"));
@@ -74,8 +81,14 @@ longest(struct vars * v,/* used only for debug and exec flags */
7481
returnNULL;
7582
css->lastseen=cp;
7683

77-
/* main loop */
84+
/*
85+
* This is the main text-scanning loop. It seems worth having two copies
86+
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87+
* builds, when you're not actively tracing.
88+
*/
89+
#ifdefREG_DEBUG
7890
if (v->eflags&REG_FTRACE)
91+
{
7992
while (cp<realstop)
8093
{
8194
FDEBUG(("+++ at c%d +++\n", (int) (css-d->ssets)));
@@ -92,7 +105,10 @@ longest(struct vars * v,/* used only for debug and exec flags */
92105
ss->lastseen=cp;
93106
css=ss;
94107
}
108+
}
95109
else
110+
#endif
111+
{
96112
while (cp<realstop)
97113
{
98114
co=GETCOLOR(cm,*cp);
@@ -107,6 +123,10 @@ longest(struct vars * v,/* used only for debug and exec flags */
107123
ss->lastseen=cp;
108124
css=ss;
109125
}
126+
}
127+
128+
if (ISERR())
129+
returnNULL;
110130

111131
/* shutdown */
112132
FDEBUG(("+++ shutdown at c%d +++\n", (int) (css-d->ssets)));
@@ -117,6 +137,8 @@ longest(struct vars * v,/* used only for debug and exec flags */
117137
co=d->cnfa->eos[(v->eflags&REG_NOTEOL) ?0 :1];
118138
FDEBUG(("color %ld\n", (long)co));
119139
ss=miss(v,d,css,co,cp,start);
140+
if (ISERR())
141+
returnNULL;
120142
/* special case: match ended at eol? */
121143
if (ss!=NULL&& (ss->flags&POSTSTATE))
122144
returncp;
@@ -138,14 +160,17 @@ longest(struct vars * v,/* used only for debug and exec flags */
138160

139161
/*
140162
* shortest - shortest-preferred matching engine
163+
*
164+
* On success, returns match endpoint address. Returns NULL on no match.
165+
* Internal errors also return NULL, with v->err set.
141166
*/
142-
staticchr*/* endpoint, or NULL */
167+
staticchr*
143168
shortest(structvars*v,
144169
structdfa*d,
145170
chr*start,/* where the match should start */
146171
chr*min,/* match must end at or after here */
147172
chr*max,/* match must end at or before here */
148-
chr**coldp,/* store coldstart pointer here, ifnonNULL */
173+
chr**coldp,/* store coldstart pointer here, ifnon-NULL */
149174
int*hitstopp)/* record whether hit v->stop, if non-NULL */
150175
{
151176
chr*cp;
@@ -156,11 +181,17 @@ shortest(struct vars * v,
156181
structsset*ss;
157182
structcolormap*cm=d->cm;
158183

184+
/* prevent "uninitialized variable" warnings */
185+
if (coldp!=NULL)
186+
*coldp=NULL;
187+
if (hitstopp!=NULL)
188+
*hitstopp=0;
189+
159190
/* initialize */
160191
css=initialize(v,d,start);
192+
if (css==NULL)
193+
returnNULL;
161194
cp=start;
162-
if (hitstopp!=NULL)
163-
*hitstopp=0;
164195

165196
/* startup */
166197
FDEBUG(("--- startup ---\n"));
@@ -180,8 +211,14 @@ shortest(struct vars * v,
180211
css->lastseen=cp;
181212
ss=css;
182213

183-
/* main loop */
214+
/*
215+
* This is the main text-scanning loop. It seems worth having two copies
216+
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217+
* builds, when you're not actively tracing.
218+
*/
219+
#ifdefREG_DEBUG
184220
if (v->eflags&REG_FTRACE)
221+
{
185222
while (cp<realmax)
186223
{
187224
FDEBUG(("--- at c%d ---\n", (int) (css-d->ssets)));
@@ -200,7 +237,10 @@ shortest(struct vars * v,
200237
if ((ss->flags&POSTSTATE)&&cp >=realmin)
201238
break;/* NOTE BREAK OUT */
202239
}
240+
}
203241
else
242+
#endif
243+
{
204244
while (cp<realmax)
205245
{
206246
co=GETCOLOR(cm,*cp);
@@ -217,6 +257,7 @@ shortest(struct vars * v,
217257
if ((ss->flags&POSTSTATE)&&cp >=realmin)
218258
break;/* NOTE BREAK OUT */
219259
}
260+
}
220261

221262
if (ss==NULL)
222263
returnNULL;
@@ -390,7 +431,7 @@ hash(unsigned *uv,
390431
* initialize - hand-craft a cache entry for startup, otherwise get ready
391432
*/
392433
staticstructsset*
393-
initialize(structvars*v,/* used only for debug flags */
434+
initialize(structvars*v,
394435
structdfa*d,
395436
chr*start)
396437
{
@@ -403,6 +444,8 @@ initialize(struct vars * v,/* used only for debug flags */
403444
else
404445
{/* no, must (re)build it */
405446
ss=getvacant(v,d,start,start);
447+
if (ss==NULL)
448+
returnNULL;
406449
for (i=0;i<d->wordsper;i++)
407450
ss->states[i]=0;
408451
BSET(ss->states,d->cnfa->pre);
@@ -421,10 +464,20 @@ initialize(struct vars * v,/* used only for debug flags */
421464
}
422465

423466
/*
424-
* miss - handle a cache miss
467+
* miss - handle a stateset cache miss
468+
*
469+
* css is the current stateset, co is the color of the current input character,
470+
* cp points to the character after that (which is where we may need to test
471+
* LACONs). start does not affect matching behavior but is needed for pickss'
472+
* heuristics about which stateset cache entry to replace.
473+
*
474+
* Ordinarily, returns the address of the next stateset (the one that is
475+
* valid after consuming the input character). Returns NULL if no valid
476+
* NFA states remain, ie we have a certain match failure.
477+
* Internal errors also return NULL, with v->err set.
425478
*/
426-
staticstructsset*/* NULL if goes to empty set */
427-
miss(structvars*v,/* used only for debug flags */
479+
staticstructsset*
480+
miss(structvars*v,
428481
structdfa*d,
429482
structsset*css,
430483
pcolorco,
@@ -450,9 +503,23 @@ miss(struct vars * v,/* used only for debug flags */
450503
}
451504
FDEBUG(("miss\n"));
452505

453-
/* first, what set of states would we end up in? */
506+
/*
507+
* Checking for operation cancel in the inner text search loop seems
508+
* unduly expensive. As a compromise, check during cache misses.
509+
*/
510+
if (CANCEL_REQUESTED(v->re))
511+
{
512+
ERR(REG_CANCEL);
513+
returnNULL;
514+
}
515+
516+
/*
517+
* What set of states would we end up in after consuming the co character?
518+
* We first consider PLAIN arcs that consume the character, and then look
519+
* to see what LACON arcs could be traversed after consuming it.
520+
*/
454521
for (i=0;i<d->wordsper;i++)
455-
d->work[i]=0;
522+
d->work[i]=0;/* build new stateset bitmap in d->work */
456523
ispost=0;
457524
noprogress=1;
458525
gotstate=0;
@@ -469,22 +536,31 @@ miss(struct vars * v,/* used only for debug flags */
469536
noprogress=0;
470537
FDEBUG(("%d -> %d\n",i,ca->to));
471538
}
472-
dolacons= (gotstate) ? (cnfa->flags&HASLACONS) :0;
539+
if (!gotstate)
540+
returnNULL;/* character cannot reach any new state */
541+
dolacons= (cnfa->flags&HASLACONS);
473542
sawlacons=0;
543+
/* outer loop handles transitive closure of reachable-by-LACON states */
474544
while (dolacons)
475-
{/* transitive closure */
545+
{
476546
dolacons=0;
477547
for (i=0;i<d->nstates;i++)
478548
if (ISBSET(d->work,i))
479549
for (ca=cnfa->states[i];ca->co!=COLORLESS;ca++)
480550
{
481551
if (ca->co<cnfa->ncolors)
482-
continue;/* NOTE CONTINUE */
483-
sawlacons=1;
552+
continue;/* not a LACON arc */
484553
if (ISBSET(d->work,ca->to))
485-
continue;/* NOTE CONTINUE */
554+
continue;/* arc would be a no-op anyway */
555+
sawlacons=1;/* this LACON affects our result */
486556
if (!lacon(v,cnfa,cp,ca->co))
487-
continue;/* NOTE CONTINUE */
557+
{
558+
if (ISERR())
559+
returnNULL;
560+
continue;/* LACON arc cannot be traversed */
561+
}
562+
if (ISERR())
563+
returnNULL;
488564
BSET(d->work,ca->to);
489565
dolacons=1;
490566
if (ca->to==cnfa->post)
@@ -494,11 +570,9 @@ miss(struct vars * v,/* used only for debug flags */
494570
FDEBUG(("%d :> %d\n",i,ca->to));
495571
}
496572
}
497-
if (!gotstate)
498-
returnNULL;
499573
h=HASH(d->work,d->wordsper);
500574

501-
/*next, is that in the cache? */
575+
/*Is this stateset already in the cache? */
502576
for (p=d->ssets,i=d->nssused;i>0;p++,i--)
503577
if (HIT(h,d->work,p,d->wordsper))
504578
{
@@ -508,6 +582,8 @@ miss(struct vars * v,/* used only for debug flags */
508582
if (i==0)
509583
{/* nope, need a new cache entry */
510584
p=getvacant(v,d,cp,start);
585+
if (p==NULL)
586+
returnNULL;
511587
assert(p!=css);
512588
for (i=0;i<d->wordsper;i++)
513589
p->states[i]=d->work[i];
@@ -518,8 +594,15 @@ miss(struct vars * v,/* used only for debug flags */
518594
/* lastseen to be dealt with by caller */
519595
}
520596

597+
/*
598+
* Link new stateset to old, unless a LACON affected the result, in which
599+
* case we don't create the link. That forces future transitions across
600+
* this same arc (same prior stateset and character color) to come through
601+
* miss() again, so that we can recheck the LACON(s), which might or might
602+
* not pass since context will be different.
603+
*/
521604
if (!sawlacons)
522-
{/* lookahead conds. always cache miss */
605+
{
523606
FDEBUG(("c%d[%d]->c%d\n",
524607
(int) (css-d->ssets),co, (int) (p-d->ssets)));
525608
css->outs[co]=p;
@@ -563,11 +646,12 @@ lacon(struct vars * v,
563646

564647
/*
565648
* getvacant - get a vacant state set
649+
*
566650
* This routine clears out the inarcs and outarcs, but does not otherwise
567651
* clear the innards of the state set -- that's up to the caller.
568652
*/
569653
staticstructsset*
570-
getvacant(structvars*v,/* used only for debug flags */
654+
getvacant(structvars*v,
571655
structdfa*d,
572656
chr*cp,
573657
chr*start)
@@ -579,6 +663,8 @@ getvacant(struct vars * v,/* used only for debug flags */
579663
colorco;
580664

581665
ss=pickss(v,d,cp,start);
666+
if (ss==NULL)
667+
returnNULL;
582668
assert(!(ss->flags&LOCKED));
583669

584670
/* clear out its inarcs, including self-referential ones */
@@ -636,7 +722,7 @@ getvacant(struct vars * v,/* used only for debug flags */
636722
* pickss - pick the next stateset to be used
637723
*/
638724
staticstructsset*
639-
pickss(structvars*v,/* used only for debug flags */
725+
pickss(structvars*v,
640726
structdfa*d,
641727
chr*cp,
642728
chr*start)
@@ -692,7 +778,6 @@ pickss(struct vars * v,/* used only for debug flags */
692778

693779
/* nobody's old enough?!? -- something's really wrong */
694780
FDEBUG(("cannot find victim to replace!\n"));
695-
assert(NOTREACHED);
696781
ERR(REG_ASSERT);
697-
returnd->ssets;
782+
returnNULL;
698783
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp