Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit0c3405c

Browse files
committed
Improve performance of regular expression back-references.
In some cases, at the time that we're doing an NFA-based precheckof whether a backref subexpression can match at a particular placein the string, we already know which substring the referencedsubexpression matched. If so, we might as well forget about the NFAand just compare the substring; this is faster and it gives an exactrather than approximate answer.In general, this optimization can help while we are prechecking withinthe second child expression of a concat node, while the capture waswithin the first child expression; then the substring was saved duringcdissect() of the first child and will be available to NFA checks donewhile cdissect() recurses into the second child. It can help quite alot if the tree looks like concat / \ capture concat / \ expensive stuff backrefas we will be able to avoid recursively dissecting the "expensivestuff" before discovering that the backref isn't satisfied with aparticular midpoint that the lower concat node is testing. Thisdoesn't help if the concat tree is left-deep, as the capture nodewon't get set soon enough (and it's hard to fix that without changingthe engine's match behavior). Fortunately, right-deep concat treesare the common case.Patch by me, reviewed by Joel JacobsonDiscussion:https://postgr.es/m/661609.1614560029@sss.pgh.pa.us
1 parent4aea704 commit0c3405c

File tree

2 files changed

+138
-3
lines changed

2 files changed

+138
-3
lines changed

‎src/backend/regex/rege_dfa.c

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,19 @@ longest(struct vars *v,
5858
if (hitstopp!=NULL)
5959
*hitstopp=0;
6060

61+
/* if this is a backref to a known string, just match against that */
62+
if (d->backno >=0)
63+
{
64+
assert((size_t)d->backno<v->nmatch);
65+
if (v->pmatch[d->backno].rm_so >=0)
66+
{
67+
cp=dfa_backref(v,d,start,start,stop, false);
68+
if (cp==v->stop&&stop==v->stop&&hitstopp!=NULL)
69+
*hitstopp=1;
70+
returncp;
71+
}
72+
}
73+
6174
/* fast path for matchall NFAs */
6275
if (d->cnfa->flags&MATCHALL)
6376
{
@@ -210,6 +223,20 @@ shortest(struct vars *v,
210223
if (hitstopp!=NULL)
211224
*hitstopp=0;
212225

226+
/* if this is a backref to a known string, just match against that */
227+
if (d->backno >=0)
228+
{
229+
assert((size_t)d->backno<v->nmatch);
230+
if (v->pmatch[d->backno].rm_so >=0)
231+
{
232+
cp=dfa_backref(v,d,start,min,max, true);
233+
if (cp!=NULL&&coldp!=NULL)
234+
*coldp=start;
235+
/* there is no case where we should set *hitstopp */
236+
returncp;
237+
}
238+
}
239+
213240
/* fast path for matchall NFAs */
214241
if (d->cnfa->flags&MATCHALL)
215242
{
@@ -467,6 +494,94 @@ matchuntil(struct vars *v,
467494
return1;
468495
}
469496

497+
/*
498+
* dfa_backref - find best match length for a known backref string
499+
*
500+
* When the backref's referent is already available, we can deliver an exact
501+
* answer with considerably less work than running the backref node's NFA.
502+
*
503+
* Return match endpoint for longest or shortest valid repeated match,
504+
* or NULL if there is no valid match.
505+
*
506+
* Should be in sync with cbrdissect(), although that has the different task
507+
* of checking a match to a predetermined section of the string.
508+
*/
509+
staticchr*
510+
dfa_backref(structvars*v,
511+
structdfa*d,
512+
chr*start,/* where the match should start */
513+
chr*min,/* match must end at or after here */
514+
chr*max,/* match must end at or before here */
515+
boolshortest)
516+
{
517+
intn=d->backno;
518+
intbackmin=d->backmin;
519+
intbackmax=d->backmax;
520+
size_tnumreps;
521+
size_tminreps;
522+
size_tmaxreps;
523+
size_tbrlen;
524+
chr*brstring;
525+
chr*p;
526+
527+
/* get the backreferenced string (caller should have checked this) */
528+
if (v->pmatch[n].rm_so==-1)
529+
returnNULL;
530+
brstring=v->start+v->pmatch[n].rm_so;
531+
brlen=v->pmatch[n].rm_eo-v->pmatch[n].rm_so;
532+
533+
/* special-case zero-length backreference to avoid divide by zero */
534+
if (brlen==0)
535+
{
536+
/*
537+
* matches only a zero-length string, but any number of repetitions
538+
* can be considered to be present
539+
*/
540+
if (min==start&&backmin <=backmax)
541+
returnstart;
542+
returnNULL;
543+
}
544+
545+
/*
546+
* convert min and max into numbers of possible repetitions of the backref
547+
* string, rounding appropriately
548+
*/
549+
if (min <=start)
550+
minreps=0;
551+
else
552+
minreps= (min-start-1) /brlen+1;
553+
maxreps= (max-start) /brlen;
554+
555+
/* apply bounds, then see if there is any allowed match length */
556+
if (minreps<backmin)
557+
minreps=backmin;
558+
if (backmax!=DUPINF&&maxreps>backmax)
559+
maxreps=backmax;
560+
if (maxreps<minreps)
561+
returnNULL;
562+
563+
/* quick exit if zero-repetitions match is valid and preferred */
564+
if (shortest&&minreps==0)
565+
returnstart;
566+
567+
/* okay, compare the actual string contents */
568+
p=start;
569+
numreps=0;
570+
while (numreps<maxreps)
571+
{
572+
if ((*v->g->compare) (brstring,p,brlen)!=0)
573+
break;
574+
p+=brlen;
575+
numreps++;
576+
if (shortest&&numreps >=minreps)
577+
break;
578+
}
579+
580+
if (numreps >=minreps)
581+
returnp;
582+
returnNULL;
583+
}
584+
470585
/*
471586
* lastcold - determine last point at which no progress had been made
472587
*/
@@ -563,6 +678,8 @@ newdfa(struct vars *v,
563678
d->lastpost=NULL;
564679
d->lastnopr=NULL;
565680
d->search=d->ssets;
681+
d->backno=-1;/* may be set by caller */
682+
d->backmin=d->backmax=0;
566683

567684
/* initialization of sset fields is done as needed */
568685

‎src/backend/regex/regexec.c

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ struct dfa
7777
chr*lastpost;/* location of last cache-flushed success */
7878
chr*lastnopr;/* location of last cache-flushed NOPROGRESS */
7979
structsset*search;/* replacement-search-pointer memory */
80+
intbackno;/* if DFA for a backref, subno it refers to */
81+
shortbackmin;/* min repetitions for backref */
82+
shortbackmax;/* max repetitions for backref */
8083
boolismalloced;/* should this struct dfa be freed? */
8184
boolarraysmalloced;/* should its subsidiary arrays be freed? */
8285
};
@@ -154,6 +157,7 @@ static intcreviterdissect(struct vars *, struct subre *, chr *, chr *);
154157
staticchr*longest(structvars*,structdfa*,chr*,chr*,int*);
155158
staticchr*shortest(structvars*,structdfa*,chr*,chr*,chr*,chr**,int*);
156159
staticintmatchuntil(structvars*,structdfa*,chr*,structsset**,chr**);
160+
staticchr*dfa_backref(structvars*,structdfa*,chr*,chr*,chr*,bool);
157161
staticchr*lastcold(structvars*,structdfa*);
158162
staticstructdfa*newdfa(structvars*,structcnfa*,structcolormap*,structsmalldfa*);
159163
staticvoidfreedfa(structdfa*);
@@ -342,13 +346,23 @@ static struct dfa *
342346
getsubdfa(structvars*v,
343347
structsubre*t)
344348
{
345-
if (v->subdfas[t->id]==NULL)
349+
structdfa*d=v->subdfas[t->id];
350+
351+
if (d==NULL)
346352
{
347-
v->subdfas[t->id]=newdfa(v,&t->cnfa,&v->g->cmap,DOMALLOC);
353+
d=newdfa(v,&t->cnfa,&v->g->cmap,DOMALLOC);
348354
if (ISERR())
349355
returnNULL;
356+
/* set up additional info if this is a backref node */
357+
if (t->op=='b')
358+
{
359+
d->backno=t->backno;
360+
d->backmin=t->min;
361+
d->backmax=t->max;
362+
}
363+
v->subdfas[t->id]=d;
350364
}
351-
returnv->subdfas[t->id];
365+
returnd;
352366
}
353367

354368
/*
@@ -369,6 +383,7 @@ getladfa(struct vars *v,
369383
v->ladfas[n]=newdfa(v,&sub->cnfa,&v->g->cmap,DOMALLOC);
370384
if (ISERR())
371385
returnNULL;
386+
/* a LACON can't contain a backref, so nothing else to do */
372387
}
373388
returnv->ladfas[n];
374389
}
@@ -927,6 +942,9 @@ crevcondissect(struct vars *v,
927942

928943
/*
929944
* cbrdissect - dissect match for backref node
945+
*
946+
* The backref match might already have been verified by dfa_backref(),
947+
* but we don't know that for sure so must check it here.
930948
*/
931949
staticint/* regexec return code */
932950
cbrdissect(structvars*v,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp