@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
10361036/*----------
10371037 * Prepare a general-purpose state skeleton.
10381038 *
1039- * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1040- * / /
1041- * [lp] ----> [s2] ----bypass---------------------
1039+ * In the no-backrefs case, we want this:
10421040 *
1043- * where bypass is an empty, and prefix is some repetitions of atom
1041+ * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1042+ *
1043+ * where prefix is some repetitions of atom. In the general case we need
1044+ *
1045+ * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1046+ *
1047+ * where the iterator wraps around [begin] ---atom---> [end]
1048+ *
1049+ * We make the s state here for both cases; s2 is made below if needed
10441050 *----------
10451051 */
10461052s = newstate (v -> nfa );/* first, new endpoints for the atom */
@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
10511057NOERR ();
10521058atom -> begin = s ;
10531059atom -> end = s2 ;
1054- s = newstate (v -> nfa );/* and spots for prefix and bypass */
1055- s2 = newstate (v -> nfa );
1060+ s = newstate (v -> nfa );/* set up starting state */
10561061NOERR ();
10571062EMPTYARC (lp ,s );
1058- EMPTYARC (lp ,s2 );
10591063NOERR ();
10601064
10611065/* break remaining subRE into x{...} and what follows */
@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
10891093}
10901094
10911095/*
1092- * It's quantifier time. If the atom is just a BACKREF, we'll let it deal
1093- * with quantifiers internally. Otherwise, the first step is to turn
1094- * x{0,...} into x{1,...}|empty
1096+ * It's quantifier time. If the atom is just a backref, we'll let it deal
1097+ * with quantifiers internally.
10951098 */
1096- if (m == 0 && atomtype != BACKREF )
1097- {
1098- EMPTYARC (s2 ,atom -> end );/* the bypass */
1099- assert (PREF (qprefer )!= 0 );
1100- f = COMBINE (qprefer ,atom -> flags );
1101- t = subre (v ,'|' ,f ,lp ,atom -> end );
1102- NOERR ();
1103- t -> left = atom ;
1104- t -> right = subre (v ,'|' ,PREF (f ),s2 ,atom -> end );
1105- NOERR ();
1106- t -> right -> left = subre (v ,'=' ,0 ,s2 ,atom -> end );
1107- NOERR ();
1108- * atomp = t ;
1109- atomp = & t -> left ;
1110- m = 1 ;
1111- }
1112-
1113- /* deal with the rest of the quantifier */
11141099if (atomtype == BACKREF )
11151100{
11161101/* special case: backrefs have internal quantifiers */
@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
11201105atom -> min = (short )m ;
11211106atom -> max = (short )n ;
11221107atom -> flags |=COMBINE (qprefer ,atom -> flags );
1108+ /* rest of branch can be strung starting from atom->end */
1109+ s2 = atom -> end ;
11231110}
11241111else if (m == 1 && n == 1 )
11251112{
11261113/* no/vacuous quantifier: done */
11271114EMPTYARC (s ,atom -> begin );/* empty prefix */
1115+ /* rest of branch can be strung starting from atom->end */
1116+ s2 = atom -> end ;
11281117}
1129- else
1118+ else if ( m > 0 && !( atom -> flags & BACKR ))
11301119{
11311120/*
1132- * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
1133- * second x
1121+ * If there's no backrefs involved, we can turn x{m,n} into
1122+ * x{m-1,n-1}x, with capturing parens in only the second x. This
1123+ * is valid because we only care about capturing matches from the
1124+ * final iteration of the quantifier. It's a win because we can
1125+ * implement the backref-free left side as a plain DFA node, since
1126+ * we don't really care where its submatches are.
11341127 */
11351128dupnfa (v -> nfa ,atom -> begin ,atom -> end ,s ,atom -> begin );
11361129assert (m >=1 && m != INFINITY && n >=1 );
@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
11421135NOERR ();
11431136t -> right = atom ;
11441137* atomp = t ;
1138+ /* rest of branch can be strung starting from atom->end */
1139+ s2 = atom -> end ;
1140+ }
1141+ else
1142+ {
1143+ /* general case: need an iteration node */
1144+ s2 = newstate (v -> nfa );
1145+ NOERR ();
1146+ moveouts (v -> nfa ,atom -> end ,s2 );
1147+ NOERR ();
1148+ dupnfa (v -> nfa ,atom -> begin ,atom -> end ,s ,s2 );
1149+ repeat (v ,s ,s2 ,m ,n );
1150+ f = COMBINE (qprefer ,atom -> flags );
1151+ t = subre (v ,'*' ,f ,s ,s2 );
1152+ NOERR ();
1153+ t -> min = (short )m ;
1154+ t -> max = (short )n ;
1155+ t -> left = atom ;
1156+ * atomp = t ;
1157+ /* rest of branch is to be strung from iteration's end state */
11451158}
11461159
11471160/* and finally, look after that postponed recursion */
11481161t = top -> right ;
11491162if (!(SEE ('|' )|| SEE (stopper )|| SEE (EOS )))
1150- t -> right = parsebranch (v ,stopper ,type ,atom -> end ,rp ,1 );
1163+ t -> right = parsebranch (v ,stopper ,type ,s2 ,rp ,1 );
11511164else
11521165{
1153- EMPTYARC (atom -> end ,rp );
1154- t -> right = subre (v ,'=' ,0 ,atom -> end ,rp );
1166+ EMPTYARC (s2 ,rp );
1167+ t -> right = subre (v ,'=' ,0 ,s2 ,rp );
11551168}
11561169assert (SEE ('|' )|| SEE (stopper )|| SEE (EOS ));
11571170t -> flags |=COMBINE (t -> flags ,t -> right -> flags );
@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
12141227/*
12151228 * repeat - replicate subNFA for quantifiers
12161229 *
1230+ * The sub-NFA strung from lp to rp is modified to represent m to n
1231+ * repetitions of its initial contents.
1232+ *
12171233 * The duplication sequences used here are chosen carefully so that any
12181234 * pointers starting out pointing into the subexpression end up pointing into
12191235 * the last occurrence. (Note that it may not be strung between the same
@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
12291245int n )
12301246{
12311247#define SOME 2
1232- #define INF 3
1248+ #define INF 3
12331249#define PAIR (x ,y ) ((x)*4 + (y))
12341250#define REDUCE (x ) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
12351251const int rm = REDUCE (m );
@@ -1603,7 +1619,7 @@ subre(struct vars * v,
16031619v -> treechain = ret ;
16041620}
16051621
1606- assert (strchr ("|.b(= " ,op )!= NULL );
1622+ assert (strchr ("=b|.*( " ,op )!= NULL );
16071623
16081624ret -> op = op ;
16091625ret -> flags = flags ;