Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit173e29a

Browse files
committed
Fix the general case of quantified regex back-references.
Cases where a back-reference is part of a larger subexpression thatis quantified have never worked in Spencer's regex engine, becausehe used a compile-time transformation that neglected the need tocheck the back-reference match in iterations before the last one.(That was okay for capturing parens, and we still do it if theregex has *only* capturing parens ... but it's not okay for backrefs.)To make this work properly, we have to add an "iteration" node typeto the regex engine's vocabulary of sub-regex nodes. Since this is amoderately large change with a fair risk of introducing new bugs of itsown, apply to HEAD only, even though it's a fix for a longstanding bug.
1 parent0c9e5d5 commit173e29a

File tree

6 files changed

+884
-55
lines changed

6 files changed

+884
-55
lines changed

‎src/backend/regex/README

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are
102102
either plain regular expressions (which are executed as DFAs in the manner
103103
described above) or back-references (which try to match the input to some
104104
previous substring). Non-leaf nodes are capture nodes (which save the
105-
location of the substring currently matching their child node) or
106-
concatenationoralternation nodes. At execution time, the executor
107-
recursively scans the tree. At concatenation oralternation nodes,
108-
it considers each possible alternative way of matching the input string,
109-
ieeach place where the string could be split for a concatenation, or each
110-
child node for an alternation. It tries the next alternative if the match
111-
fails according to the child nodes. This is exactly thesort of
112-
backtracking search done by a traditional NFA regex engine. If there are
113-
many tree levels it can get very slow.
105+
location of the substring currently matching their child node),
106+
concatenation, alternation,oriteration nodes. At execution time, the
107+
executorrecursively scans the tree. At concatenation,alternation, or
108+
iteration nodes,it considers each possible alternative way of matching the
109+
input string, that iseach place where the string could be split for a
110+
concatenation or iteration, or eachchild node for an alternation. It
111+
tries the next alternative if the match fails according to thechild nodes.
112+
This is exactly the sort ofbacktracking search done by a traditional NFA
113+
regex engine. If there aremany tree levels it can get very slow.
114114

115115
But all is not lost: we can still be smarter than the average pure NFA
116116
engine. To do this, each subre node has an associated DFA, which

‎src/backend/regex/regcomp.c

Lines changed: 52 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
10361036
/*----------
10371037
* Prepare a general-purpose state skeleton.
10381038
*
1039-
* ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1040-
* / /
1041-
* [lp] ----> [s2] ----bypass---------------------
1039+
* In the no-backrefs case, we want this:
10421040
*
1043-
* where bypass is an empty, and prefix is some repetitions of atom
1041+
* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1042+
*
1043+
* where prefix is some repetitions of atom. In the general case we need
1044+
*
1045+
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1046+
*
1047+
* where the iterator wraps around [begin] ---atom---> [end]
1048+
*
1049+
* We make the s state here for both cases; s2 is made below if needed
10441050
*----------
10451051
*/
10461052
s=newstate(v->nfa);/* first, new endpoints for the atom */
@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
10511057
NOERR();
10521058
atom->begin=s;
10531059
atom->end=s2;
1054-
s=newstate(v->nfa);/* and spots for prefix and bypass */
1055-
s2=newstate(v->nfa);
1060+
s=newstate(v->nfa);/* set up starting state */
10561061
NOERR();
10571062
EMPTYARC(lp,s);
1058-
EMPTYARC(lp,s2);
10591063
NOERR();
10601064

10611065
/* break remaining subRE into x{...} and what follows */
@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
10891093
}
10901094

10911095
/*
1092-
* It's quantifier time. If the atom is just a BACKREF, we'll let it deal
1093-
* with quantifiers internally. Otherwise, the first step is to turn
1094-
* x{0,...} into x{1,...}|empty
1096+
* It's quantifier time. If the atom is just a backref, we'll let it deal
1097+
* with quantifiers internally.
10951098
*/
1096-
if (m==0&&atomtype!=BACKREF)
1097-
{
1098-
EMPTYARC(s2,atom->end);/* the bypass */
1099-
assert(PREF(qprefer)!=0);
1100-
f=COMBINE(qprefer,atom->flags);
1101-
t=subre(v,'|',f,lp,atom->end);
1102-
NOERR();
1103-
t->left=atom;
1104-
t->right=subre(v,'|',PREF(f),s2,atom->end);
1105-
NOERR();
1106-
t->right->left=subre(v,'=',0,s2,atom->end);
1107-
NOERR();
1108-
*atomp=t;
1109-
atomp=&t->left;
1110-
m=1;
1111-
}
1112-
1113-
/* deal with the rest of the quantifier */
11141099
if (atomtype==BACKREF)
11151100
{
11161101
/* special case: backrefs have internal quantifiers */
@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
11201105
atom->min= (short)m;
11211106
atom->max= (short)n;
11221107
atom->flags |=COMBINE(qprefer,atom->flags);
1108+
/* rest of branch can be strung starting from atom->end */
1109+
s2=atom->end;
11231110
}
11241111
elseif (m==1&&n==1)
11251112
{
11261113
/* no/vacuous quantifier: done */
11271114
EMPTYARC(s,atom->begin);/* empty prefix */
1115+
/* rest of branch can be strung starting from atom->end */
1116+
s2=atom->end;
11281117
}
1129-
else
1118+
elseif (m>0&& !(atom->flags&BACKR))
11301119
{
11311120
/*
1132-
* Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
1133-
* second x
1121+
* If there's no backrefs involved, we can turn x{m,n} into
1122+
* x{m-1,n-1}x, with capturing parens in only the second x. This
1123+
* is valid because we only care about capturing matches from the
1124+
* final iteration of the quantifier. It's a win because we can
1125+
* implement the backref-free left side as a plain DFA node, since
1126+
* we don't really care where its submatches are.
11341127
*/
11351128
dupnfa(v->nfa,atom->begin,atom->end,s,atom->begin);
11361129
assert(m >=1&&m!=INFINITY&&n >=1);
@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
11421135
NOERR();
11431136
t->right=atom;
11441137
*atomp=t;
1138+
/* rest of branch can be strung starting from atom->end */
1139+
s2=atom->end;
1140+
}
1141+
else
1142+
{
1143+
/* general case: need an iteration node */
1144+
s2=newstate(v->nfa);
1145+
NOERR();
1146+
moveouts(v->nfa,atom->end,s2);
1147+
NOERR();
1148+
dupnfa(v->nfa,atom->begin,atom->end,s,s2);
1149+
repeat(v,s,s2,m,n);
1150+
f=COMBINE(qprefer,atom->flags);
1151+
t=subre(v,'*',f,s,s2);
1152+
NOERR();
1153+
t->min= (short)m;
1154+
t->max= (short)n;
1155+
t->left=atom;
1156+
*atomp=t;
1157+
/* rest of branch is to be strung from iteration's end state */
11451158
}
11461159

11471160
/* and finally, look after that postponed recursion */
11481161
t=top->right;
11491162
if (!(SEE('|')||SEE(stopper)||SEE(EOS)))
1150-
t->right=parsebranch(v,stopper,type,atom->end,rp,1);
1163+
t->right=parsebranch(v,stopper,type,s2,rp,1);
11511164
else
11521165
{
1153-
EMPTYARC(atom->end,rp);
1154-
t->right=subre(v,'=',0,atom->end,rp);
1166+
EMPTYARC(s2,rp);
1167+
t->right=subre(v,'=',0,s2,rp);
11551168
}
11561169
assert(SEE('|')||SEE(stopper)||SEE(EOS));
11571170
t->flags |=COMBINE(t->flags,t->right->flags);
@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
12141227
/*
12151228
* repeat - replicate subNFA for quantifiers
12161229
*
1230+
* The sub-NFA strung from lp to rp is modified to represent m to n
1231+
* repetitions of its initial contents.
1232+
*
12171233
* The duplication sequences used here are chosen carefully so that any
12181234
* pointers starting out pointing into the subexpression end up pointing into
12191235
* the last occurrence. (Note that it may not be strung between the same
@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
12291245
intn)
12301246
{
12311247
#defineSOME 2
1232-
#defineINF 3
1248+
#defineINF 3
12331249
#definePAIR(x,y) ((x)*4 + (y))
12341250
#defineREDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
12351251
constintrm=REDUCE(m);
@@ -1603,7 +1619,7 @@ subre(struct vars * v,
16031619
v->treechain=ret;
16041620
}
16051621

1606-
assert(strchr("|.b(=",op)!=NULL);
1622+
assert(strchr("=b|.*(",op)!=NULL);
16071623

16081624
ret->op=op;
16091625
ret->flags=flags;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp