NotificationsYou must be signed in to change notification settings
Fork5
Star27

Commit173e29a

committed

Fix the general case of quantified regex back-references.

Cases where a back-reference is part of a larger subexpression thatis quantified have never worked in Spencer's regex engine, becausehe used a compile-time transformation that neglected the need tocheck the back-reference match in iterations before the last one.(That was okay for capturing parens, and we still do it if theregex has *only* capturing parens ... but it's not okay for backrefs.)To make this work properly, we have to add an "iteration" node typeto the regex engine's vocabulary of sub-regex nodes. Since this is amoderately large change with a fair risk of introducing new bugs of itsown, apply to HEAD only, even though it's a fix for a longstanding bug.

1 parent0c9e5d5 commit173e29aCopy full SHA for 173e29a

File tree

6 files changed

+884

-55

lines changed

src
- backend/regex
- include/regex
  - regguts.h
- test/regress
  - expected
    - regex.out
  - sql
    - regex.sql

6 files changed

+884

-55

lines changed

`‎src/backend/regex/README‎`

Lines changed: 9 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are`
`102`	`102`	`either plain regular expressions (which are executed as DFAs in the manner`
`103`	`103`	`described above) or back-references (which try to match the input to some`
`104`	`104`	`previous substring). Non-leaf nodes are capture nodes (which save the`
`105`		`-location of the substring currently matching their child node) or`
`106`		`-concatenationoralternation nodes. At execution time, the executor`
`107`		`-recursively scans the tree. At concatenation oralternation nodes,`
`108`		`-it considers each possible alternative way of matching the input string,`
`109`		`-ieeach place where the string could be split for a concatenation, or each`
`110`		`-child node for an alternation. It tries the next alternative if the match`
`111`		`-fails according to the child nodes. This is exactly thesort of`
`112`		`-backtracking search done by a traditional NFA regex engine. If there are`
`113`		`-many tree levels it can get very slow.`
	`105`	`+location of the substring currently matching their child node),`
	`106`	`+concatenation, alternation,oriteration nodes. At execution time, the`
	`107`	`+executorrecursively scans the tree. At concatenation,alternation, or`
	`108`	`+iteration nodes,it considers each possible alternative way of matching the`
	`109`	`+input string, that iseach place where the string could be split for a`
	`110`	`+concatenation or iteration, or eachchild node for an alternation. It`
	`111`	`+tries the next alternative if the match fails according to thechild nodes.`
	`112`	`+This is exactly the sort ofbacktracking search done by a traditional NFA`
	`113`	`+regex engine. If there aremany tree levels it can get very slow.`
`114`	`114`
`115`	`115`	`But all is not lost: we can still be smarter than the average pure NFA`
`116`	`116`	`engine. To do this, each subre node has an associated DFA, which`

`‎src/backend/regex/regcomp.c‎`

Lines changed: 52 additions & 36 deletions

Original file line number	Diff line number	Diff line change
`@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,`
`1036`	`1036`	`/*----------`
`1037`	`1037`	`* Prepare a general-purpose state skeleton.`
`1038`	`1038`	`*`
`1039`		`- * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]`
`1040`		`- * / /`
`1041`		`- * [lp] ----> [s2] ----bypass---------------------`
	`1039`	`+ * In the no-backrefs case, we want this:`
`1042`	`1040`	`*`
`1043`		`- * where bypass is an empty, and prefix is some repetitions of atom`
	`1041`	`+ * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]`
	`1042`	`+ *`
	`1043`	`+ * where prefix is some repetitions of atom. In the general case we need`
	`1044`	`+ *`
	`1045`	`+ * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]`
	`1046`	`+ *`
	`1047`	`+ * where the iterator wraps around [begin] ---atom---> [end]`
	`1048`	`+ *`
	`1049`	`+ * We make the s state here for both cases; s2 is made below if needed`
`1044`	`1050`	`*----------`
`1045`	`1051`	`*/`
`1046`	`1052`	`s=newstate(v->nfa);/* first, new endpoints for the atom */`
`@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,`
`1051`	`1057`	`NOERR();`
`1052`	`1058`	`atom->begin=s;`
`1053`	`1059`	`atom->end=s2;`
`1054`		`-s=newstate(v->nfa);/* and spots for prefix and bypass */`
`1055`		`-s2=newstate(v->nfa);`
	`1060`	`+s=newstate(v->nfa);/* set up starting state */`
`1056`	`1061`	`NOERR();`
`1057`	`1062`	`EMPTYARC(lp,s);`
`1058`		`-EMPTYARC(lp,s2);`
`1059`	`1063`	`NOERR();`
`1060`	`1064`
`1061`	`1065`	`/* break remaining subRE into x{...} and what follows */`
`@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,`
`1089`	`1093`	`}`
`1090`	`1094`
`1091`	`1095`	`/*`
`1092`		`- * It's quantifier time. If the atom is just a BACKREF, we'll let it deal`
`1093`		`- * with quantifiers internally. Otherwise, the first step is to turn`
`1094`		`- * x{0,...} into x{1,...}\|empty`
	`1096`	`+ * It's quantifier time. If the atom is just a backref, we'll let it deal`
	`1097`	`+ * with quantifiers internally.`
`1095`	`1098`	`*/`
`1096`		`-if (m==0&&atomtype!=BACKREF)`
`1097`		`-{`
`1098`		`-EMPTYARC(s2,atom->end);/* the bypass */`
`1099`		`-assert(PREF(qprefer)!=0);`
`1100`		`-f=COMBINE(qprefer,atom->flags);`
`1101`		`-t=subre(v,'\|',f,lp,atom->end);`
`1102`		`-NOERR();`
`1103`		`-t->left=atom;`
`1104`		`-t->right=subre(v,'\|',PREF(f),s2,atom->end);`
`1105`		`-NOERR();`
`1106`		`-t->right->left=subre(v,'=',0,s2,atom->end);`
`1107`		`-NOERR();`
`1108`		`-*atomp=t;`
`1109`		`-atomp=&t->left;`
`1110`		`-m=1;`
`1111`		`-}`
`1112`		`-`
`1113`		`-/* deal with the rest of the quantifier */`
`1114`	`1099`	`if (atomtype==BACKREF)`
`1115`	`1100`	`{`
`1116`	`1101`	`/* special case: backrefs have internal quantifiers */`
`@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,`
`1120`	`1105`	`atom->min= (short)m;`
`1121`	`1106`	`atom->max= (short)n;`
`1122`	`1107`	`atom->flags \|=COMBINE(qprefer,atom->flags);`
	`1108`	`+/* rest of branch can be strung starting from atom->end */`
	`1109`	`+s2=atom->end;`
`1123`	`1110`	`}`
`1124`	`1111`	`elseif (m==1&&n==1)`
`1125`	`1112`	`{`
`1126`	`1113`	`/* no/vacuous quantifier: done */`
`1127`	`1114`	`EMPTYARC(s,atom->begin);/* empty prefix */`
	`1115`	`+/* rest of branch can be strung starting from atom->end */`
	`1116`	`+s2=atom->end;`
`1128`	`1117`	`}`
`1129`		`-else`
	`1118`	`+elseif (m>0&& !(atom->flags&BACKR))`
`1130`	`1119`	`{`
`1131`	`1120`	`/*`
`1132`		`- * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the`
`1133`		`- * second x`
	`1121`	`+ * If there's no backrefs involved, we can turn x{m,n} into`
	`1122`	`+ * x{m-1,n-1}x, with capturing parens in only the second x. This`
	`1123`	`+ * is valid because we only care about capturing matches from the`
	`1124`	`+ * final iteration of the quantifier. It's a win because we can`
	`1125`	`+ * implement the backref-free left side as a plain DFA node, since`
	`1126`	`+ * we don't really care where its submatches are.`
`1134`	`1127`	`*/`
`1135`	`1128`	`dupnfa(v->nfa,atom->begin,atom->end,s,atom->begin);`
`1136`	`1129`	`assert(m >=1&&m!=INFINITY&&n >=1);`
`@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,`
`1142`	`1135`	`NOERR();`
`1143`	`1136`	`t->right=atom;`
`1144`	`1137`	`*atomp=t;`
	`1138`	`+/* rest of branch can be strung starting from atom->end */`
	`1139`	`+s2=atom->end;`
	`1140`	`+}`
	`1141`	`+else`
	`1142`	`+{`
	`1143`	`+/* general case: need an iteration node */`
	`1144`	`+s2=newstate(v->nfa);`
	`1145`	`+NOERR();`
	`1146`	`+moveouts(v->nfa,atom->end,s2);`
	`1147`	`+NOERR();`
	`1148`	`+dupnfa(v->nfa,atom->begin,atom->end,s,s2);`
	`1149`	`+repeat(v,s,s2,m,n);`
	`1150`	`+f=COMBINE(qprefer,atom->flags);`
	`1151`	`+t=subre(v,'*',f,s,s2);`
	`1152`	`+NOERR();`
	`1153`	`+t->min= (short)m;`
	`1154`	`+t->max= (short)n;`
	`1155`	`+t->left=atom;`
	`1156`	`+*atomp=t;`
	`1157`	`+/* rest of branch is to be strung from iteration's end state */`
`1145`	`1158`	`}`
`1146`	`1159`
`1147`	`1160`	`/* and finally, look after that postponed recursion */`
`1148`	`1161`	`t=top->right;`
`1149`	`1162`	`if (!(SEE('\|')\|\|SEE(stopper)\|\|SEE(EOS)))`
`1150`		`-t->right=parsebranch(v,stopper,type,atom->end,rp,1);`
	`1163`	`+t->right=parsebranch(v,stopper,type,s2,rp,1);`
`1151`	`1164`	`else`
`1152`	`1165`	`{`
`1153`		`-EMPTYARC(atom->end,rp);`
`1154`		`-t->right=subre(v,'=',0,atom->end,rp);`
	`1166`	`+EMPTYARC(s2,rp);`
	`1167`	`+t->right=subre(v,'=',0,s2,rp);`
`1155`	`1168`	`}`
`1156`	`1169`	`assert(SEE('\|')\|\|SEE(stopper)\|\|SEE(EOS));`
`1157`	`1170`	`t->flags \|=COMBINE(t->flags,t->right->flags);`
`@@ -1214,6 +1227,9 @@ scannum(struct vars * v)`
`1214`	`1227`	`/*`
`1215`	`1228`	`* repeat - replicate subNFA for quantifiers`
`1216`	`1229`	`*`
	`1230`	`+ * The sub-NFA strung from lp to rp is modified to represent m to n`
	`1231`	`+ * repetitions of its initial contents.`
	`1232`	`+ *`
`1217`	`1233`	`* The duplication sequences used here are chosen carefully so that any`
`1218`	`1234`	`* pointers starting out pointing into the subexpression end up pointing into`
`1219`	`1235`	`* the last occurrence. (Note that it may not be strung between the same`
`@@ -1229,7 +1245,7 @@ repeat(struct vars * v,`
`1229`	`1245`	`intn)`
`1230`	`1246`	`{`
`1231`	`1247`	`#defineSOME 2`
`1232`		`-#defineINF 3`
	`1248`	`+#defineINF 3`
`1233`	`1249`	`#definePAIR(x,y) ((x)*4 + (y))`
`1234`	`1250`	`#defineREDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )`
`1235`	`1251`	`constintrm=REDUCE(m);`
`@@ -1603,7 +1619,7 @@ subre(struct vars * v,`
`1603`	`1619`	`v->treechain=ret;`
`1604`	`1620`	`}`
`1605`	`1621`
`1606`		`-assert(strchr("\|.b(=",op)!=NULL);`
	`1622`	`+assert(strchr("=b\|.*(",op)!=NULL);`
`1607`	`1623`
`1608`	`1624`	`ret->op=op;`
`1609`	`1625`	`ret->flags=flags;`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit173e29a

File tree

6 files changed

6 files changed

`‎src/backend/regex/README‎`

`‎src/backend/regex/regcomp.c‎`

0 commit comments