3434
3535/*
3636 * longest - longest-preferred matching engine
37+ *
38+ * On success, returns match endpoint address. Returns NULL on no match.
39+ * Internal errors also return NULL, with v->err set.
3740 */
38- static chr * /* endpoint, or NULL */
39- longest (struct vars * v ,/* used only for debug and exec flags */
41+ static chr *
42+ longest (struct vars * v ,
4043struct dfa * d ,
4144chr * start ,/* where the match should start */
4245chr * stop ,/* match must end at or before here */
@@ -51,11 +54,15 @@ longest(struct vars * v,/* used only for debug and exec flags */
5154int i ;
5255struct colormap * cm = d -> cm ;
5356
57+ /* prevent "uninitialized variable" warnings */
58+ if (hitstopp != NULL )
59+ * hitstopp = 0 ;
60+
5461/* initialize */
5562css = initialize (v ,d ,start );
63+ if (css == NULL )
64+ return NULL ;
5665cp = start ;
57- if (hitstopp != NULL )
58- * hitstopp = 0 ;
5966
6067/* startup */
6168FDEBUG (("+++ startup +++\n" ));
@@ -74,8 +81,14 @@ longest(struct vars * v,/* used only for debug and exec flags */
7481return NULL ;
7582css -> lastseen = cp ;
7683
77- /* main loop */
84+ /*
85+ * This is the main text-scanning loop. It seems worth having two copies
86+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87+ * builds, when you're not actively tracing.
88+ */
89+ #ifdef REG_DEBUG
7890if (v -> eflags & REG_FTRACE )
91+ {
7992while (cp < realstop )
8093{
8194FDEBUG (("+++ at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -92,7 +105,10 @@ longest(struct vars * v,/* used only for debug and exec flags */
92105ss -> lastseen = cp ;
93106css = ss ;
94107}
108+ }
95109else
110+ #endif
111+ {
96112while (cp < realstop )
97113{
98114co = GETCOLOR (cm ,* cp );
@@ -107,6 +123,10 @@ longest(struct vars * v,/* used only for debug and exec flags */
107123ss -> lastseen = cp ;
108124css = ss ;
109125}
126+ }
127+
128+ if (ISERR ())
129+ return NULL ;
110130
111131/* shutdown */
112132FDEBUG (("+++ shutdown at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -117,6 +137,8 @@ longest(struct vars * v,/* used only for debug and exec flags */
117137co = d -> cnfa -> eos [(v -> eflags & REG_NOTEOL ) ?0 :1 ];
118138FDEBUG (("color %ld\n" , (long )co ));
119139ss = miss (v ,d ,css ,co ,cp ,start );
140+ if (ISERR ())
141+ return NULL ;
120142/* special case: match ended at eol? */
121143if (ss != NULL && (ss -> flags & POSTSTATE ))
122144return cp ;
@@ -138,14 +160,17 @@ longest(struct vars * v,/* used only for debug and exec flags */
138160
139161/*
140162 * shortest - shortest-preferred matching engine
163+ *
164+ * On success, returns match endpoint address. Returns NULL on no match.
165+ * Internal errors also return NULL, with v->err set.
141166 */
142- static chr * /* endpoint, or NULL */
167+ static chr *
143168shortest (struct vars * v ,
144169struct dfa * d ,
145170chr * start ,/* where the match should start */
146171chr * min ,/* match must end at or after here */
147172chr * max ,/* match must end at or before here */
148- chr * * coldp ,/* store coldstart pointer here, ifnonNULL */
173+ chr * * coldp ,/* store coldstart pointer here, ifnon-NULL */
149174int * hitstopp )/* record whether hit v->stop, if non-NULL */
150175{
151176chr * cp ;
@@ -156,11 +181,17 @@ shortest(struct vars * v,
156181struct sset * ss ;
157182struct colormap * cm = d -> cm ;
158183
184+ /* prevent "uninitialized variable" warnings */
185+ if (coldp != NULL )
186+ * coldp = NULL ;
187+ if (hitstopp != NULL )
188+ * hitstopp = 0 ;
189+
159190/* initialize */
160191css = initialize (v ,d ,start );
192+ if (css == NULL )
193+ return NULL ;
161194cp = start ;
162- if (hitstopp != NULL )
163- * hitstopp = 0 ;
164195
165196/* startup */
166197FDEBUG (("--- startup ---\n" ));
@@ -180,8 +211,14 @@ shortest(struct vars * v,
180211css -> lastseen = cp ;
181212ss = css ;
182213
183- /* main loop */
214+ /*
215+ * This is the main text-scanning loop. It seems worth having two copies
216+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217+ * builds, when you're not actively tracing.
218+ */
219+ #ifdef REG_DEBUG
184220if (v -> eflags & REG_FTRACE )
221+ {
185222while (cp < realmax )
186223{
187224FDEBUG (("--- at c%d ---\n" , (int ) (css - d -> ssets )));
@@ -200,7 +237,10 @@ shortest(struct vars * v,
200237if ((ss -> flags & POSTSTATE )&& cp >=realmin )
201238break ;/* NOTE BREAK OUT */
202239}
240+ }
203241else
242+ #endif
243+ {
204244while (cp < realmax )
205245{
206246co = GETCOLOR (cm ,* cp );
@@ -217,6 +257,7 @@ shortest(struct vars * v,
217257if ((ss -> flags & POSTSTATE )&& cp >=realmin )
218258break ;/* NOTE BREAK OUT */
219259}
260+ }
220261
221262if (ss == NULL )
222263return NULL ;
@@ -389,7 +430,7 @@ hash(unsigned *uv,
389430 * initialize - hand-craft a cache entry for startup, otherwise get ready
390431 */
391432static struct sset *
392- initialize (struct vars * v ,/* used only for debug flags */
433+ initialize (struct vars * v ,
393434struct dfa * d ,
394435chr * start )
395436{
@@ -402,6 +443,8 @@ initialize(struct vars * v,/* used only for debug flags */
402443else
403444{/* no, must (re)build it */
404445ss = getvacant (v ,d ,start ,start );
446+ if (ss == NULL )
447+ return NULL ;
405448for (i = 0 ;i < d -> wordsper ;i ++ )
406449ss -> states [i ]= 0 ;
407450BSET (ss -> states ,d -> cnfa -> pre );
@@ -420,10 +463,20 @@ initialize(struct vars * v,/* used only for debug flags */
420463}
421464
422465/*
423- * miss - handle a cache miss
466+ * miss - handle a stateset cache miss
467+ *
468+ * css is the current stateset, co is the color of the current input character,
469+ * cp points to the character after that (which is where we may need to test
470+ * LACONs). start does not affect matching behavior but is needed for pickss'
471+ * heuristics about which stateset cache entry to replace.
472+ *
473+ * Ordinarily, returns the address of the next stateset (the one that is
474+ * valid after consuming the input character). Returns NULL if no valid
475+ * NFA states remain, ie we have a certain match failure.
476+ * Internal errors also return NULL, with v->err set.
424477 */
425- static struct sset * /* NULL if goes to empty set */
426- miss (struct vars * v ,/* used only for debug flags */
478+ static struct sset *
479+ miss (struct vars * v ,
427480struct dfa * d ,
428481struct sset * css ,
429482pcolor co ,
@@ -449,9 +502,23 @@ miss(struct vars * v,/* used only for debug flags */
449502}
450503FDEBUG (("miss\n" ));
451504
452- /* first, what set of states would we end up in? */
505+ /*
506+ * Checking for operation cancel in the inner text search loop seems
507+ * unduly expensive. As a compromise, check during cache misses.
508+ */
509+ if (CANCEL_REQUESTED (v -> re ))
510+ {
511+ ERR (REG_CANCEL );
512+ return NULL ;
513+ }
514+
515+ /*
516+ * What set of states would we end up in after consuming the co character?
517+ * We first consider PLAIN arcs that consume the character, and then look
518+ * to see what LACON arcs could be traversed after consuming it.
519+ */
453520for (i = 0 ;i < d -> wordsper ;i ++ )
454- d -> work [i ]= 0 ;
521+ d -> work [i ]= 0 ;/* build new stateset bitmap in d->work */
455522ispost = 0 ;
456523noprogress = 1 ;
457524gotstate = 0 ;
@@ -468,22 +535,31 @@ miss(struct vars * v,/* used only for debug flags */
468535noprogress = 0 ;
469536FDEBUG (("%d -> %d\n" ,i ,ca -> to ));
470537}
471- dolacons = (gotstate ) ? (cnfa -> flags & HASLACONS ) :0 ;
538+ if (!gotstate )
539+ return NULL ;/* character cannot reach any new state */
540+ dolacons = (cnfa -> flags & HASLACONS );
472541sawlacons = 0 ;
542+ /* outer loop handles transitive closure of reachable-by-LACON states */
473543while (dolacons )
474- {/* transitive closure */
544+ {
475545dolacons = 0 ;
476546for (i = 0 ;i < d -> nstates ;i ++ )
477547if (ISBSET (d -> work ,i ))
478548for (ca = cnfa -> states [i ];ca -> co != COLORLESS ;ca ++ )
479549{
480550if (ca -> co < cnfa -> ncolors )
481- continue ;/* NOTE CONTINUE */
482- sawlacons = 1 ;
551+ continue ;/* not a LACON arc */
483552if (ISBSET (d -> work ,ca -> to ))
484- continue ;/* NOTE CONTINUE */
553+ continue ;/* arc would be a no-op anyway */
554+ sawlacons = 1 ;/* this LACON affects our result */
485555if (!lacon (v ,cnfa ,cp ,ca -> co ))
486- continue ;/* NOTE CONTINUE */
556+ {
557+ if (ISERR ())
558+ return NULL ;
559+ continue ;/* LACON arc cannot be traversed */
560+ }
561+ if (ISERR ())
562+ return NULL ;
487563BSET (d -> work ,ca -> to );
488564dolacons = 1 ;
489565if (ca -> to == cnfa -> post )
@@ -493,11 +569,9 @@ miss(struct vars * v,/* used only for debug flags */
493569FDEBUG (("%d :> %d\n" ,i ,ca -> to ));
494570}
495571}
496- if (!gotstate )
497- return NULL ;
498572h = HASH (d -> work ,d -> wordsper );
499573
500- /*next, is that in the cache? */
574+ /*Is this stateset already in the cache? */
501575for (p = d -> ssets ,i = d -> nssused ;i > 0 ;p ++ ,i -- )
502576if (HIT (h ,d -> work ,p ,d -> wordsper ))
503577{
@@ -507,6 +581,8 @@ miss(struct vars * v,/* used only for debug flags */
507581if (i == 0 )
508582{/* nope, need a new cache entry */
509583p = getvacant (v ,d ,cp ,start );
584+ if (p == NULL )
585+ return NULL ;
510586assert (p != css );
511587for (i = 0 ;i < d -> wordsper ;i ++ )
512588p -> states [i ]= d -> work [i ];
@@ -517,8 +593,15 @@ miss(struct vars * v,/* used only for debug flags */
517593/* lastseen to be dealt with by caller */
518594}
519595
596+ /*
597+ * Link new stateset to old, unless a LACON affected the result, in which
598+ * case we don't create the link. That forces future transitions across
599+ * this same arc (same prior stateset and character color) to come through
600+ * miss() again, so that we can recheck the LACON(s), which might or might
601+ * not pass since context will be different.
602+ */
520603if (!sawlacons )
521- {/* lookahead conds. always cache miss */
604+ {
522605FDEBUG (("c%d[%d]->c%d\n" ,
523606(int ) (css - d -> ssets ),co , (int ) (p - d -> ssets )));
524607css -> outs [co ]= p ;
@@ -562,11 +645,12 @@ lacon(struct vars * v,
562645
563646/*
564647 * getvacant - get a vacant state set
648+ *
565649 * This routine clears out the inarcs and outarcs, but does not otherwise
566650 * clear the innards of the state set -- that's up to the caller.
567651 */
568652static struct sset *
569- getvacant (struct vars * v ,/* used only for debug flags */
653+ getvacant (struct vars * v ,
570654struct dfa * d ,
571655chr * cp ,
572656chr * start )
@@ -578,6 +662,8 @@ getvacant(struct vars * v,/* used only for debug flags */
578662color co ;
579663
580664ss = pickss (v ,d ,cp ,start );
665+ if (ss == NULL )
666+ return NULL ;
581667assert (!(ss -> flags & LOCKED ));
582668
583669/* clear out its inarcs, including self-referential ones */
@@ -635,7 +721,7 @@ getvacant(struct vars * v,/* used only for debug flags */
635721 * pickss - pick the next stateset to be used
636722 */
637723static struct sset *
638- pickss (struct vars * v ,/* used only for debug flags */
724+ pickss (struct vars * v ,
639725struct dfa * d ,
640726chr * cp ,
641727chr * start )
@@ -691,7 +777,6 @@ pickss(struct vars * v,/* used only for debug flags */
691777
692778/* nobody's old enough?!? -- something's really wrong */
693779FDEBUG (("cannot find victim to replace!\n" ));
694- assert (NOTREACHED );
695780ERR (REG_ASSERT );
696- return d -> ssets ;
781+ return NULL ;
697782}