Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitea1268f

Browse files
committed
Avoid generating extra subre tree nodes for capturing parentheses.
Previously, each pair of capturing parentheses gave rise to a separatesubre tree node, whose only function was to identify that we ought tocapture the match details for this particular sub-expression. Inmost cases we don't really need that, since we can perfectly wellput a "capture this" annotation on the child node that does the realmatching work. As with the two preceding commits, the main valueof this is to avoid generating and optimizing an NFA for a tree nodethat's not really pulling its weight.The chosen data representation only allows one capture annotationper subre node. In the legal-per-spec, but seemingly not very useful,case where there are multiple capturing parens around the exact samebit of the regex (i.e. "((xyz))"), wrap the child node in N-1 capturenodes that act the same as before. We could work harder at that butI'll refrain, pending some evidence that such cases are worth troublingover.In passing, improve the comments in regex.h to say what all thedifferent re_info bits mean. Some of them were pretty obviousbut others not so much, so reverse-engineer some documentation.This is part of a patch series that in total reduces the regex engine'sruntime by about a factor of four on a large corpus of real-world regexes.Patch by me, reviewed by Joel JacobsonDiscussion:https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
1 parent5810430 commitea1268f

File tree

6 files changed

+91
-51
lines changed

6 files changed

+91
-51
lines changed

‎src/backend/regex/README‎

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,8 @@ possibilities.
130130

131131
As an example, consider the regex "(a[bc]+)\1". The compiled
132132
representation will have a top-level concatenation subre node. Its first
133-
child is acapturenode, and the child of thatisa plain DFA node for
134-
"a[bc]+". The concatenation's second child is a backref node for \1.
133+
child is aplain DFAnode for "a[bc]+" (whichismarked as being a capture
134+
node). The concatenation's second child is a backref node for \1.
135135
The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
136136
where the backref has been replaced by a copy of the DFA for its referent
137137
expression. When executed, the concatenation node will have to search for
@@ -147,6 +147,17 @@ run much faster than a pure NFA engine could do. It is this behavior that
147147
justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's
148148
library.
149149

150+
It's perhaps worth noting that separate capture subre nodes are a rarity:
151+
normally, we just mark a subre as capturing and that's it. However, it's
152+
legal to write a regex like "((x))" in which the same substring has to be
153+
captured by multiple sets of parentheses. Since a subre has room for only
154+
one "capno" field, a single subre can't handle that. We handle such cases
155+
by wrapping the base subre (which captures the innermost parens) in a
156+
no-op capture node, or even more than one for "(((x)))" etc. This is a
157+
little bit inefficient because we end up with multiple identical NFAs,
158+
but since the case is pointless and infrequent, it's not worth working
159+
harder.
160+
150161

151162
Colors and colormapping
152163
-----------------------

‎src/backend/regex/regcomp.c‎

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ pg_regcomp(regex_t *re,
452452
#endif
453453

454454
/* Prepend .* to pattern if it's a lookbehind LACON */
455-
nfanode(v,lasub, !LATYPE_IS_AHEAD(lasub->subno),debug);
455+
nfanode(v,lasub, !LATYPE_IS_AHEAD(lasub->latype),debug);
456456
}
457457
CNOERR();
458458
if (v->tree->flags&SHORTER)
@@ -944,7 +944,13 @@ parseqatom(struct vars *v,
944944
else
945945
atomtype=PLAIN;/* something that's not '(' */
946946
NEXT();
947-
/* need new endpoints because tree will contain pointers */
947+
948+
/*
949+
* Make separate endpoints to ensure we keep this sub-NFA cleanly
950+
* separate from what surrounds it. We need to be sure that when
951+
* we duplicate the sub-NFA for a backref, we get the right states
952+
* and no others.
953+
*/
948954
s=newstate(v->nfa);
949955
s2=newstate(v->nfa);
950956
NOERR();
@@ -959,11 +965,21 @@ parseqatom(struct vars *v,
959965
{
960966
assert(v->subs[subno]==NULL);
961967
v->subs[subno]=atom;
962-
t=subre(v,'(',atom->flags |CAP,lp,rp);
963-
NOERR();
964-
t->subno=subno;
965-
t->child=atom;
966-
atom=t;
968+
if (atom->capno==0)
969+
{
970+
/* normal case: just mark the atom as capturing */
971+
atom->flags |=CAP;
972+
atom->capno=subno;
973+
}
974+
else
975+
{
976+
/* generate no-op wrapper node to handle "((x))" */
977+
t=subre(v,'(',atom->flags |CAP,lp,rp);
978+
NOERR();
979+
t->capno=subno;
980+
t->child=atom;
981+
atom=t;
982+
}
967983
}
968984
/* postpone everything else pending possible {0} */
969985
break;
@@ -976,7 +992,7 @@ parseqatom(struct vars *v,
976992
atom=subre(v,'b',BACKR,lp,rp);
977993
NOERR();
978994
subno=v->nextvalue;
979-
atom->subno=subno;
995+
atom->backno=subno;
980996
EMPTYARC(lp,rp);/* temporarily, so there's something */
981997
NEXT();
982998
break;
@@ -1276,8 +1292,10 @@ parseqatom(struct vars *v,
12761292
freesubre(v,top->child);
12771293
top->op=t->op;
12781294
top->flags=t->flags;
1295+
top->latype=t->latype;
12791296
top->id=t->id;
1280-
top->subno=t->subno;
1297+
top->capno=t->capno;
1298+
top->backno=t->backno;
12811299
top->min=t->min;
12821300
top->max=t->max;
12831301
top->child=t->child;
@@ -1790,8 +1808,10 @@ subre(struct vars *v,
17901808

17911809
ret->op=op;
17921810
ret->flags=flags;
1811+
ret->latype= (char)-1;
17931812
ret->id=0;/* will be assigned later */
1794-
ret->subno=0;
1813+
ret->capno=0;
1814+
ret->backno=0;
17951815
ret->min=ret->max=1;
17961816
ret->child=NULL;
17971817
ret->sibling=NULL;
@@ -1893,7 +1913,7 @@ numst(struct subre *t,
18931913
assert(t!=NULL);
18941914

18951915
i=start;
1896-
t->id=(short)i++;
1916+
t->id=i++;
18971917
for (t2=t->child;t2!=NULL;t2=t2->sibling)
18981918
i=numst(t2,i);
18991919
returni;
@@ -2040,7 +2060,7 @@ newlacon(struct vars *v,
20402060
sub=&v->lacons[n];
20412061
sub->begin=begin;
20422062
sub->end=end;
2043-
sub->subno=latype;
2063+
sub->latype=latype;
20442064
ZAPCNFA(sub->cnfa);
20452065
returnn;
20462066
}
@@ -2163,7 +2183,7 @@ dump(regex_t *re,
21632183
structsubre*lasub=&g->lacons[i];
21642184
constchar*latype;
21652185

2166-
switch (lasub->subno)
2186+
switch (lasub->latype)
21672187
{
21682188
caseLATYPE_AHEAD_POS:
21692189
latype="positive lookahead";
@@ -2227,8 +2247,12 @@ stdump(struct subre *t,
22272247
fprintf(f," hasbackref");
22282248
if (!(t->flags&INUSE))
22292249
fprintf(f," UNUSED");
2230-
if (t->subno!=0)
2231-
fprintf(f," (#%d)",t->subno);
2250+
if (t->latype!= (char)-1)
2251+
fprintf(f," latype(%d)",t->latype);
2252+
if (t->capno!=0)
2253+
fprintf(f," capture(%d)",t->capno);
2254+
if (t->backno!=0)
2255+
fprintf(f," backref(%d)",t->backno);
22322256
if (t->min!=1||t->max!=1)
22332257
{
22342258
fprintf(f," {%d,",t->min);

‎src/backend/regex/rege_dfa.c‎

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -825,12 +825,12 @@ lacon(struct vars *v,
825825
d=getladfa(v,n);
826826
if (d==NULL)
827827
return0;
828-
if (LATYPE_IS_AHEAD(sub->subno))
828+
if (LATYPE_IS_AHEAD(sub->latype))
829829
{
830830
/* used to use longest() here, but shortest() could be much cheaper */
831831
end=shortest(v,d,cp,cp,v->stop,
832832
(chr**)NULL, (int*)NULL);
833-
satisfied=LATYPE_IS_POS(sub->subno) ? (end!=NULL) : (end==NULL);
833+
satisfied=LATYPE_IS_POS(sub->latype) ? (end!=NULL) : (end==NULL);
834834
}
835835
else
836836
{
@@ -843,7 +843,7 @@ lacon(struct vars *v,
843843
* nominal match.
844844
*/
845845
satisfied=matchuntil(v,d,cp,&v->lblastcss[n],&v->lblastcp[n]);
846-
if (!LATYPE_IS_POS(sub->subno))
846+
if (!LATYPE_IS_POS(sub->latype))
847847
satisfied= !satisfied;
848848
}
849849
FDEBUG(("=== lacon %d satisfied %d\n",n,satisfied));

‎src/backend/regex/regexec.c‎

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -640,13 +640,11 @@ static void
640640
zaptreesubs(structvars*v,
641641
structsubre*t)
642642
{
643+
intn=t->capno;
643644
structsubre*t2;
644645

645-
if (t->op=='(')
646+
if (n>0)
646647
{
647-
intn=t->subno;
648-
649-
assert(n>0);
650648
if ((size_t)n<v->nmatch)
651649
{
652650
v->pmatch[n].rm_so=-1;
@@ -667,7 +665,7 @@ subset(struct vars *v,
667665
chr*begin,
668666
chr*end)
669667
{
670-
intn=sub->subno;
668+
intn=sub->capno;
671669

672670
assert(n>0);
673671
if ((size_t)n >=v->nmatch)
@@ -739,12 +737,10 @@ cdissect(struct vars *v,
739737
else
740738
er=citerdissect(v,t,begin,end);
741739
break;
742-
case'(':/*capturing */
740+
case'(':/*no-op capture node */
743741
assert(t->child!=NULL);
744-
assert(t->subno>0);
742+
assert(t->capno>0);
745743
er=cdissect(v,t->child,begin,end);
746-
if (er==REG_OKAY)
747-
subset(v,t,begin,end);
748744
break;
749745
default:
750746
er=REG_ASSERT;
@@ -758,6 +754,12 @@ cdissect(struct vars *v,
758754
*/
759755
assert(er!=REG_NOMATCH|| (t->flags&BACKR));
760756

757+
/*
758+
* If this node is marked as capturing, save successful match's location.
759+
*/
760+
if (t->capno>0&&er==REG_OKAY)
761+
subset(v,t,begin,end);
762+
761763
returner;
762764
}
763765

@@ -932,7 +934,7 @@ cbrdissect(struct vars *v,
932934
chr*begin,/* beginning of relevant substring */
933935
chr*end)/* end of same */
934936
{
935-
intn=t->subno;
937+
intn=t->backno;
936938
size_tnumreps;
937939
size_ttlen;
938940
size_tbrlen;

‎src/include/regex/regex.h‎

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -56,21 +56,23 @@ typedef struct
5656
{
5757
intre_magic;/* magic number */
5858
size_tre_nsub;/* number of subexpressions */
59-
longre_info;/* information about RE */
60-
#defineREG_UBACKREF 000001
61-
#defineREG_ULOOKAROUND 000002
62-
#defineREG_UBOUNDS 000004
63-
#defineREG_UBRACES 000010
64-
#defineREG_UBSALNUM 000020
65-
#defineREG_UPBOTCH 000040
66-
#defineREG_UBBS 000100
67-
#defineREG_UNONPOSIX 000200
68-
#defineREG_UUNSPEC 000400
69-
#defineREG_UUNPORT 001000
70-
#defineREG_ULOCALE 002000
71-
#defineREG_UEMPTYMATCH 004000
72-
#defineREG_UIMPOSSIBLE 010000
73-
#defineREG_USHORTEST 020000
59+
longre_info;/* bitmask of the following flags: */
60+
#defineREG_UBACKREF000001/* has back-reference (\n) */
61+
#defineREG_ULOOKAROUND000002/* has lookahead/lookbehind constraint */
62+
#defineREG_UBOUNDS000004/* has bounded quantifier ({m,n}) */
63+
#defineREG_UBRACES000010/* has { that doesn't begin a quantifier */
64+
#defineREG_UBSALNUM000020/* has backslash-alphanumeric in non-ARE */
65+
#defineREG_UPBOTCH000040/* has unmatched right paren in ERE (legal
66+
* per spec, but that was a mistake) */
67+
#defineREG_UBBS000100/* has backslash within bracket expr */
68+
#defineREG_UNONPOSIX000200/* has any construct that extends POSIX */
69+
#defineREG_UUNSPEC000400/* has any case disallowed by POSIX, e.g.
70+
* an empty branch */
71+
#defineREG_UUNPORT001000/* has numeric character code dependency */
72+
#defineREG_ULOCALE002000/* has locale dependency */
73+
#defineREG_UEMPTYMATCH004000/* can match a zero-length string */
74+
#defineREG_UIMPOSSIBLE010000/* provably cannot match anything */
75+
#defineREG_USHORTEST020000/* has non-greedy quantifier */
7476
intre_csize;/* sizeof(character) */
7577
char*re_endp;/* backward compatibility kludge */
7678
Oidre_collation;/* Collation that defines LC_CTYPE behavior */

‎src/include/regex/regguts.h‎

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ struct cnfa
422422
* "op" is one of:
423423
*'=' plain regex without interesting substructure (implemented as DFA)
424424
*'b' back-reference (has no substructure either)
425-
*'(' capture node: captures the match of its single child
425+
*'('no-opcapture node: captures the match of its single child
426426
*'.' concatenation: matches a match for first child, then second child
427427
*'|' alternation: matches a match for any of its children
428428
*'*' iteration: matches some number of matches of its single child
@@ -446,8 +446,8 @@ struct subre
446446
#defineLONGER 01/* prefers longer match */
447447
#defineSHORTER 02/* prefers shorter match */
448448
#defineMIXED 04/* mixed preference below */
449-
#defineCAP 010/* capturing parens below */
450-
#defineBACKR 020/* back reference below */
449+
#defineCAP 010/* capturing parenshere orbelow */
450+
#defineBACKR 020/* back referencehere orbelow */
451451
#defineINUSE 0100/* in use in final tree */
452452
#defineNOPROP 03/* bits which may not propagate up */
453453
#defineLMIX(f) ((f)<<2)/* LONGER -> MIXED */
@@ -457,9 +457,10 @@ struct subre
457457
#definePREF(f) ((f)&NOPROP)
458458
#definePREF2(f1,f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
459459
#defineCOMBINE(f1,f2) (UP((f1)|(f2)) | PREF2(f1, f2))
460-
shortid;/* ID of subre (1..ntree-1) */
461-
intsubno;/* subexpression number for 'b' and '(', or
462-
* LATYPE code for lookaround constraint */
460+
charlatype;/* LATYPE code, if lookaround constraint */
461+
intid;/* ID of subre (1..ntree-1) */
462+
intcapno;/* if capture node, subno to capture into */
463+
intbackno;/* if backref node, subno it refers to */
463464
shortmin;/* min repetitions for iteration or backref */
464465
shortmax;/* max repetitions for iteration or backref */
465466
structsubre*child;/* first child, if any (also freelist chain) */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp