Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit9fe8fe9

Browse files
committed
Add some more query-cancel checks to regular expression matching.
Commit9662143 added infrastructure toallow regular-expression operations to be terminated early in the eventof SIGINT etc. However, fuzz testing by Greg Stark disclosed that thereare still cases where regex compilation could run for a long time withoutnoticing a cancel request. Specifically, the fixempties() phase neveradds new states, only new arcs, so it doesn't hit the cancel check I'd putin newstate(). Add one to newarc() as well to cover that.Some experimentation of my own found that regex execution could also runfor a long time despite a pending cancel. We'd put a high-level cancelcheck into cdissect(), but there was none inside the core text-matchingroutines longest() and shortest(). Ordinarily those inner loops are veryvery fast ... but in the presence of lookahead constraints, not so much.As a compromise, stick a cancel check into the stateset cache-missfunction, which is enough to guarantee a cancel check at least once perlookahead constraint test.Making this work required more attention to error handling throughout theregex executor. Henry Spencer had apparently originally intended longest()and shortest() to be incapable of incurring errors while running, soneither they nor their subroutines had well-defined error reportingbehaviors. However, that was already broken by the lookahead constraintfeature, since lacon() can surely suffer an out-of-memory failure ---which, in the code as it stood, might never be reported to the user at all,but just silently be treated as a non-match of the lookahead constraint.Normalize all that by inserting explicit error tests as needed. I took theopportunity to add some more comments to the code, too.Back-patch to all supported branches, like the previous patch.
1 parent558d4ad commit9fe8fe9

File tree

3 files changed

+154
-31
lines changed

3 files changed

+154
-31
lines changed

‎src/backend/regex/regc_nfa.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ newstate(struct nfa * nfa)
180180
/*
181181
* This is a handy place to check for operation cancel during regex
182182
* compilation, since no code path will go very long without making a new
183-
* state.
183+
* state or arc.
184184
*/
185185
if (CANCEL_REQUESTED(nfa->v->re))
186186
{
@@ -333,6 +333,17 @@ newarc(struct nfa * nfa,
333333

334334
assert(from!=NULL&&to!=NULL);
335335

336+
/*
337+
* This is a handy place to check for operation cancel during regex
338+
* compilation, since no code path will go very long without making a new
339+
* state or arc.
340+
*/
341+
if (CANCEL_REQUESTED(nfa->v->re))
342+
{
343+
NERR(REG_CANCEL);
344+
return;
345+
}
346+
336347
/* check for duplicates */
337348
for (a=from->outs;a!=NULL;a=a->outchain)
338349
if (a->to==to&&a->co==co&&a->type==t)

‎src/backend/regex/rege_dfa.c

Lines changed: 115 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,12 @@
3434

3535
/*
3636
* longest - longest-preferred matching engine
37+
*
38+
* On success, returns match endpoint address. Returns NULL on no match.
39+
* Internal errors also return NULL, with v->err set.
3740
*/
38-
staticchr*/* endpoint, or NULL */
39-
longest(structvars*v,/* used only for debug and exec flags */
41+
staticchr*
42+
longest(structvars*v,
4043
structdfa*d,
4144
chr*start,/* where the match should start */
4245
chr*stop,/* match must end at or before here */
@@ -51,11 +54,15 @@ longest(struct vars * v,/* used only for debug and exec flags */
5154
inti;
5255
structcolormap*cm=d->cm;
5356

57+
/* prevent "uninitialized variable" warnings */
58+
if (hitstopp!=NULL)
59+
*hitstopp=0;
60+
5461
/* initialize */
5562
css=initialize(v,d,start);
63+
if (css==NULL)
64+
returnNULL;
5665
cp=start;
57-
if (hitstopp!=NULL)
58-
*hitstopp=0;
5966

6067
/* startup */
6168
FDEBUG(("+++ startup +++\n"));
@@ -74,8 +81,14 @@ longest(struct vars * v,/* used only for debug and exec flags */
7481
returnNULL;
7582
css->lastseen=cp;
7683

77-
/* main loop */
84+
/*
85+
* This is the main text-scanning loop. It seems worth having two copies
86+
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87+
* builds, when you're not actively tracing.
88+
*/
89+
#ifdefREG_DEBUG
7890
if (v->eflags&REG_FTRACE)
91+
{
7992
while (cp<realstop)
8093
{
8194
FDEBUG(("+++ at c%d +++\n", (int) (css-d->ssets)));
@@ -92,7 +105,10 @@ longest(struct vars * v,/* used only for debug and exec flags */
92105
ss->lastseen=cp;
93106
css=ss;
94107
}
108+
}
95109
else
110+
#endif
111+
{
96112
while (cp<realstop)
97113
{
98114
co=GETCOLOR(cm,*cp);
@@ -107,6 +123,10 @@ longest(struct vars * v,/* used only for debug and exec flags */
107123
ss->lastseen=cp;
108124
css=ss;
109125
}
126+
}
127+
128+
if (ISERR())
129+
returnNULL;
110130

111131
/* shutdown */
112132
FDEBUG(("+++ shutdown at c%d +++\n", (int) (css-d->ssets)));
@@ -117,6 +137,8 @@ longest(struct vars * v,/* used only for debug and exec flags */
117137
co=d->cnfa->eos[(v->eflags&REG_NOTEOL) ?0 :1];
118138
FDEBUG(("color %ld\n", (long)co));
119139
ss=miss(v,d,css,co,cp,start);
140+
if (ISERR())
141+
returnNULL;
120142
/* special case: match ended at eol? */
121143
if (ss!=NULL&& (ss->flags&POSTSTATE))
122144
returncp;
@@ -138,14 +160,17 @@ longest(struct vars * v,/* used only for debug and exec flags */
138160

139161
/*
140162
* shortest - shortest-preferred matching engine
163+
*
164+
* On success, returns match endpoint address. Returns NULL on no match.
165+
* Internal errors also return NULL, with v->err set.
141166
*/
142-
staticchr*/* endpoint, or NULL */
167+
staticchr*
143168
shortest(structvars*v,
144169
structdfa*d,
145170
chr*start,/* where the match should start */
146171
chr*min,/* match must end at or after here */
147172
chr*max,/* match must end at or before here */
148-
chr**coldp,/* store coldstart pointer here, ifnonNULL */
173+
chr**coldp,/* store coldstart pointer here, ifnon-NULL */
149174
int*hitstopp)/* record whether hit v->stop, if non-NULL */
150175
{
151176
chr*cp;
@@ -156,11 +181,17 @@ shortest(struct vars * v,
156181
structsset*ss;
157182
structcolormap*cm=d->cm;
158183

184+
/* prevent "uninitialized variable" warnings */
185+
if (coldp!=NULL)
186+
*coldp=NULL;
187+
if (hitstopp!=NULL)
188+
*hitstopp=0;
189+
159190
/* initialize */
160191
css=initialize(v,d,start);
192+
if (css==NULL)
193+
returnNULL;
161194
cp=start;
162-
if (hitstopp!=NULL)
163-
*hitstopp=0;
164195

165196
/* startup */
166197
FDEBUG(("--- startup ---\n"));
@@ -180,8 +211,14 @@ shortest(struct vars * v,
180211
css->lastseen=cp;
181212
ss=css;
182213

183-
/* main loop */
214+
/*
215+
* This is the main text-scanning loop. It seems worth having two copies
216+
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217+
* builds, when you're not actively tracing.
218+
*/
219+
#ifdefREG_DEBUG
184220
if (v->eflags&REG_FTRACE)
221+
{
185222
while (cp<realmax)
186223
{
187224
FDEBUG(("--- at c%d ---\n", (int) (css-d->ssets)));
@@ -200,7 +237,10 @@ shortest(struct vars * v,
200237
if ((ss->flags&POSTSTATE)&&cp >=realmin)
201238
break;/* NOTE BREAK OUT */
202239
}
240+
}
203241
else
242+
#endif
243+
{
204244
while (cp<realmax)
205245
{
206246
co=GETCOLOR(cm,*cp);
@@ -217,6 +257,7 @@ shortest(struct vars * v,
217257
if ((ss->flags&POSTSTATE)&&cp >=realmin)
218258
break;/* NOTE BREAK OUT */
219259
}
260+
}
220261

221262
if (ss==NULL)
222263
returnNULL;
@@ -389,7 +430,7 @@ hash(unsigned *uv,
389430
* initialize - hand-craft a cache entry for startup, otherwise get ready
390431
*/
391432
staticstructsset*
392-
initialize(structvars*v,/* used only for debug flags */
433+
initialize(structvars*v,
393434
structdfa*d,
394435
chr*start)
395436
{
@@ -402,6 +443,8 @@ initialize(struct vars * v,/* used only for debug flags */
402443
else
403444
{/* no, must (re)build it */
404445
ss=getvacant(v,d,start,start);
446+
if (ss==NULL)
447+
returnNULL;
405448
for (i=0;i<d->wordsper;i++)
406449
ss->states[i]=0;
407450
BSET(ss->states,d->cnfa->pre);
@@ -420,10 +463,20 @@ initialize(struct vars * v,/* used only for debug flags */
420463
}
421464

422465
/*
423-
* miss - handle a cache miss
466+
* miss - handle a stateset cache miss
467+
*
468+
* css is the current stateset, co is the color of the current input character,
469+
* cp points to the character after that (which is where we may need to test
470+
* LACONs). start does not affect matching behavior but is needed for pickss'
471+
* heuristics about which stateset cache entry to replace.
472+
*
473+
* Ordinarily, returns the address of the next stateset (the one that is
474+
* valid after consuming the input character). Returns NULL if no valid
475+
* NFA states remain, ie we have a certain match failure.
476+
* Internal errors also return NULL, with v->err set.
424477
*/
425-
staticstructsset*/* NULL if goes to empty set */
426-
miss(structvars*v,/* used only for debug flags */
478+
staticstructsset*
479+
miss(structvars*v,
427480
structdfa*d,
428481
structsset*css,
429482
pcolorco,
@@ -449,9 +502,23 @@ miss(struct vars * v,/* used only for debug flags */
449502
}
450503
FDEBUG(("miss\n"));
451504

452-
/* first, what set of states would we end up in? */
505+
/*
506+
* Checking for operation cancel in the inner text search loop seems
507+
* unduly expensive. As a compromise, check during cache misses.
508+
*/
509+
if (CANCEL_REQUESTED(v->re))
510+
{
511+
ERR(REG_CANCEL);
512+
returnNULL;
513+
}
514+
515+
/*
516+
* What set of states would we end up in after consuming the co character?
517+
* We first consider PLAIN arcs that consume the character, and then look
518+
* to see what LACON arcs could be traversed after consuming it.
519+
*/
453520
for (i=0;i<d->wordsper;i++)
454-
d->work[i]=0;
521+
d->work[i]=0;/* build new stateset bitmap in d->work */
455522
ispost=0;
456523
noprogress=1;
457524
gotstate=0;
@@ -468,22 +535,31 @@ miss(struct vars * v,/* used only for debug flags */
468535
noprogress=0;
469536
FDEBUG(("%d -> %d\n",i,ca->to));
470537
}
471-
dolacons= (gotstate) ? (cnfa->flags&HASLACONS) :0;
538+
if (!gotstate)
539+
returnNULL;/* character cannot reach any new state */
540+
dolacons= (cnfa->flags&HASLACONS);
472541
sawlacons=0;
542+
/* outer loop handles transitive closure of reachable-by-LACON states */
473543
while (dolacons)
474-
{/* transitive closure */
544+
{
475545
dolacons=0;
476546
for (i=0;i<d->nstates;i++)
477547
if (ISBSET(d->work,i))
478548
for (ca=cnfa->states[i];ca->co!=COLORLESS;ca++)
479549
{
480550
if (ca->co<cnfa->ncolors)
481-
continue;/* NOTE CONTINUE */
482-
sawlacons=1;
551+
continue;/* not a LACON arc */
483552
if (ISBSET(d->work,ca->to))
484-
continue;/* NOTE CONTINUE */
553+
continue;/* arc would be a no-op anyway */
554+
sawlacons=1;/* this LACON affects our result */
485555
if (!lacon(v,cnfa,cp,ca->co))
486-
continue;/* NOTE CONTINUE */
556+
{
557+
if (ISERR())
558+
returnNULL;
559+
continue;/* LACON arc cannot be traversed */
560+
}
561+
if (ISERR())
562+
returnNULL;
487563
BSET(d->work,ca->to);
488564
dolacons=1;
489565
if (ca->to==cnfa->post)
@@ -493,11 +569,9 @@ miss(struct vars * v,/* used only for debug flags */
493569
FDEBUG(("%d :> %d\n",i,ca->to));
494570
}
495571
}
496-
if (!gotstate)
497-
returnNULL;
498572
h=HASH(d->work,d->wordsper);
499573

500-
/*next, is that in the cache? */
574+
/*Is this stateset already in the cache? */
501575
for (p=d->ssets,i=d->nssused;i>0;p++,i--)
502576
if (HIT(h,d->work,p,d->wordsper))
503577
{
@@ -507,6 +581,8 @@ miss(struct vars * v,/* used only for debug flags */
507581
if (i==0)
508582
{/* nope, need a new cache entry */
509583
p=getvacant(v,d,cp,start);
584+
if (p==NULL)
585+
returnNULL;
510586
assert(p!=css);
511587
for (i=0;i<d->wordsper;i++)
512588
p->states[i]=d->work[i];
@@ -517,8 +593,15 @@ miss(struct vars * v,/* used only for debug flags */
517593
/* lastseen to be dealt with by caller */
518594
}
519595

596+
/*
597+
* Link new stateset to old, unless a LACON affected the result, in which
598+
* case we don't create the link. That forces future transitions across
599+
* this same arc (same prior stateset and character color) to come through
600+
* miss() again, so that we can recheck the LACON(s), which might or might
601+
* not pass since context will be different.
602+
*/
520603
if (!sawlacons)
521-
{/* lookahead conds. always cache miss */
604+
{
522605
FDEBUG(("c%d[%d]->c%d\n",
523606
(int) (css-d->ssets),co, (int) (p-d->ssets)));
524607
css->outs[co]=p;
@@ -562,11 +645,12 @@ lacon(struct vars * v,
562645

563646
/*
564647
* getvacant - get a vacant state set
648+
*
565649
* This routine clears out the inarcs and outarcs, but does not otherwise
566650
* clear the innards of the state set -- that's up to the caller.
567651
*/
568652
staticstructsset*
569-
getvacant(structvars*v,/* used only for debug flags */
653+
getvacant(structvars*v,
570654
structdfa*d,
571655
chr*cp,
572656
chr*start)
@@ -578,6 +662,8 @@ getvacant(struct vars * v,/* used only for debug flags */
578662
colorco;
579663

580664
ss=pickss(v,d,cp,start);
665+
if (ss==NULL)
666+
returnNULL;
581667
assert(!(ss->flags&LOCKED));
582668

583669
/* clear out its inarcs, including self-referential ones */
@@ -635,7 +721,7 @@ getvacant(struct vars * v,/* used only for debug flags */
635721
* pickss - pick the next stateset to be used
636722
*/
637723
staticstructsset*
638-
pickss(structvars*v,/* used only for debug flags */
724+
pickss(structvars*v,
639725
structdfa*d,
640726
chr*cp,
641727
chr*start)
@@ -691,7 +777,6 @@ pickss(struct vars * v,/* used only for debug flags */
691777

692778
/* nobody's old enough?!? -- something's really wrong */
693779
FDEBUG(("cannot find victim to replace!\n"));
694-
assert(NOTREACHED);
695780
ERR(REG_ASSERT);
696-
returnd->ssets;
781+
returnNULL;
697782
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp