34
34
35
35
/*
36
36
* longest - longest-preferred matching engine
37
+ *
38
+ * On success, returns match endpoint address. Returns NULL on no match.
39
+ * Internal errors also return NULL, with v->err set.
37
40
*/
38
- static chr * /* endpoint, or NULL */
39
- longest (struct vars * v ,/* used only for debug and exec flags */
41
+ static chr *
42
+ longest (struct vars * v ,
40
43
struct dfa * d ,
41
44
chr * start ,/* where the match should start */
42
45
chr * stop ,/* match must end at or before here */
@@ -51,11 +54,15 @@ longest(struct vars * v,/* used only for debug and exec flags */
51
54
int i ;
52
55
struct colormap * cm = d -> cm ;
53
56
57
+ /* prevent "uninitialized variable" warnings */
58
+ if (hitstopp != NULL )
59
+ * hitstopp = 0 ;
60
+
54
61
/* initialize */
55
62
css = initialize (v ,d ,start );
63
+ if (css == NULL )
64
+ return NULL ;
56
65
cp = start ;
57
- if (hitstopp != NULL )
58
- * hitstopp = 0 ;
59
66
60
67
/* startup */
61
68
FDEBUG (("+++ startup +++\n" ));
@@ -74,8 +81,14 @@ longest(struct vars * v,/* used only for debug and exec flags */
74
81
return NULL ;
75
82
css -> lastseen = cp ;
76
83
77
- /* main loop */
84
+ /*
85
+ * This is the main text-scanning loop. It seems worth having two copies
86
+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87
+ * builds, when you're not actively tracing.
88
+ */
89
+ #ifdef REG_DEBUG
78
90
if (v -> eflags & REG_FTRACE )
91
+ {
79
92
while (cp < realstop )
80
93
{
81
94
FDEBUG (("+++ at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -92,7 +105,10 @@ longest(struct vars * v,/* used only for debug and exec flags */
92
105
ss -> lastseen = cp ;
93
106
css = ss ;
94
107
}
108
+ }
95
109
else
110
+ #endif
111
+ {
96
112
while (cp < realstop )
97
113
{
98
114
co = GETCOLOR (cm ,* cp );
@@ -107,6 +123,10 @@ longest(struct vars * v,/* used only for debug and exec flags */
107
123
ss -> lastseen = cp ;
108
124
css = ss ;
109
125
}
126
+ }
127
+
128
+ if (ISERR ())
129
+ return NULL ;
110
130
111
131
/* shutdown */
112
132
FDEBUG (("+++ shutdown at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -117,6 +137,8 @@ longest(struct vars * v,/* used only for debug and exec flags */
117
137
co = d -> cnfa -> eos [(v -> eflags & REG_NOTEOL ) ?0 :1 ];
118
138
FDEBUG (("color %ld\n" , (long )co ));
119
139
ss = miss (v ,d ,css ,co ,cp ,start );
140
+ if (ISERR ())
141
+ return NULL ;
120
142
/* special case: match ended at eol? */
121
143
if (ss != NULL && (ss -> flags & POSTSTATE ))
122
144
return cp ;
@@ -138,14 +160,17 @@ longest(struct vars * v,/* used only for debug and exec flags */
138
160
139
161
/*
140
162
* shortest - shortest-preferred matching engine
163
+ *
164
+ * On success, returns match endpoint address. Returns NULL on no match.
165
+ * Internal errors also return NULL, with v->err set.
141
166
*/
142
- static chr * /* endpoint, or NULL */
167
+ static chr *
143
168
shortest (struct vars * v ,
144
169
struct dfa * d ,
145
170
chr * start ,/* where the match should start */
146
171
chr * min ,/* match must end at or after here */
147
172
chr * max ,/* match must end at or before here */
148
- chr * * coldp ,/* store coldstart pointer here, ifnonNULL */
173
+ chr * * coldp ,/* store coldstart pointer here, ifnon-NULL */
149
174
int * hitstopp )/* record whether hit v->stop, if non-NULL */
150
175
{
151
176
chr * cp ;
@@ -156,11 +181,17 @@ shortest(struct vars * v,
156
181
struct sset * ss ;
157
182
struct colormap * cm = d -> cm ;
158
183
184
+ /* prevent "uninitialized variable" warnings */
185
+ if (coldp != NULL )
186
+ * coldp = NULL ;
187
+ if (hitstopp != NULL )
188
+ * hitstopp = 0 ;
189
+
159
190
/* initialize */
160
191
css = initialize (v ,d ,start );
192
+ if (css == NULL )
193
+ return NULL ;
161
194
cp = start ;
162
- if (hitstopp != NULL )
163
- * hitstopp = 0 ;
164
195
165
196
/* startup */
166
197
FDEBUG (("--- startup ---\n" ));
@@ -180,8 +211,14 @@ shortest(struct vars * v,
180
211
css -> lastseen = cp ;
181
212
ss = css ;
182
213
183
- /* main loop */
214
+ /*
215
+ * This is the main text-scanning loop. It seems worth having two copies
216
+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217
+ * builds, when you're not actively tracing.
218
+ */
219
+ #ifdef REG_DEBUG
184
220
if (v -> eflags & REG_FTRACE )
221
+ {
185
222
while (cp < realmax )
186
223
{
187
224
FDEBUG (("--- at c%d ---\n" , (int ) (css - d -> ssets )));
@@ -200,7 +237,10 @@ shortest(struct vars * v,
200
237
if ((ss -> flags & POSTSTATE )&& cp >=realmin )
201
238
break ;/* NOTE BREAK OUT */
202
239
}
240
+ }
203
241
else
242
+ #endif
243
+ {
204
244
while (cp < realmax )
205
245
{
206
246
co = GETCOLOR (cm ,* cp );
@@ -217,6 +257,7 @@ shortest(struct vars * v,
217
257
if ((ss -> flags & POSTSTATE )&& cp >=realmin )
218
258
break ;/* NOTE BREAK OUT */
219
259
}
260
+ }
220
261
221
262
if (ss == NULL )
222
263
return NULL ;
@@ -390,7 +431,7 @@ hash(unsigned *uv,
390
431
* initialize - hand-craft a cache entry for startup, otherwise get ready
391
432
*/
392
433
static struct sset *
393
- initialize (struct vars * v ,/* used only for debug flags */
434
+ initialize (struct vars * v ,
394
435
struct dfa * d ,
395
436
chr * start )
396
437
{
@@ -403,6 +444,8 @@ initialize(struct vars * v,/* used only for debug flags */
403
444
else
404
445
{/* no, must (re)build it */
405
446
ss = getvacant (v ,d ,start ,start );
447
+ if (ss == NULL )
448
+ return NULL ;
406
449
for (i = 0 ;i < d -> wordsper ;i ++ )
407
450
ss -> states [i ]= 0 ;
408
451
BSET (ss -> states ,d -> cnfa -> pre );
@@ -421,10 +464,20 @@ initialize(struct vars * v,/* used only for debug flags */
421
464
}
422
465
423
466
/*
424
- * miss - handle a cache miss
467
+ * miss - handle a stateset cache miss
468
+ *
469
+ * css is the current stateset, co is the color of the current input character,
470
+ * cp points to the character after that (which is where we may need to test
471
+ * LACONs). start does not affect matching behavior but is needed for pickss'
472
+ * heuristics about which stateset cache entry to replace.
473
+ *
474
+ * Ordinarily, returns the address of the next stateset (the one that is
475
+ * valid after consuming the input character). Returns NULL if no valid
476
+ * NFA states remain, ie we have a certain match failure.
477
+ * Internal errors also return NULL, with v->err set.
425
478
*/
426
- static struct sset * /* NULL if goes to empty set */
427
- miss (struct vars * v ,/* used only for debug flags */
479
+ static struct sset *
480
+ miss (struct vars * v ,
428
481
struct dfa * d ,
429
482
struct sset * css ,
430
483
pcolor co ,
@@ -450,9 +503,23 @@ miss(struct vars * v,/* used only for debug flags */
450
503
}
451
504
FDEBUG (("miss\n" ));
452
505
453
- /* first, what set of states would we end up in? */
506
+ /*
507
+ * Checking for operation cancel in the inner text search loop seems
508
+ * unduly expensive. As a compromise, check during cache misses.
509
+ */
510
+ if (CANCEL_REQUESTED (v -> re ))
511
+ {
512
+ ERR (REG_CANCEL );
513
+ return NULL ;
514
+ }
515
+
516
+ /*
517
+ * What set of states would we end up in after consuming the co character?
518
+ * We first consider PLAIN arcs that consume the character, and then look
519
+ * to see what LACON arcs could be traversed after consuming it.
520
+ */
454
521
for (i = 0 ;i < d -> wordsper ;i ++ )
455
- d -> work [i ]= 0 ;
522
+ d -> work [i ]= 0 ;/* build new stateset bitmap in d->work */
456
523
ispost = 0 ;
457
524
noprogress = 1 ;
458
525
gotstate = 0 ;
@@ -469,22 +536,31 @@ miss(struct vars * v,/* used only for debug flags */
469
536
noprogress = 0 ;
470
537
FDEBUG (("%d -> %d\n" ,i ,ca -> to ));
471
538
}
472
- dolacons = (gotstate ) ? (cnfa -> flags & HASLACONS ) :0 ;
539
+ if (!gotstate )
540
+ return NULL ;/* character cannot reach any new state */
541
+ dolacons = (cnfa -> flags & HASLACONS );
473
542
sawlacons = 0 ;
543
+ /* outer loop handles transitive closure of reachable-by-LACON states */
474
544
while (dolacons )
475
- {/* transitive closure */
545
+ {
476
546
dolacons = 0 ;
477
547
for (i = 0 ;i < d -> nstates ;i ++ )
478
548
if (ISBSET (d -> work ,i ))
479
549
for (ca = cnfa -> states [i ];ca -> co != COLORLESS ;ca ++ )
480
550
{
481
551
if (ca -> co < cnfa -> ncolors )
482
- continue ;/* NOTE CONTINUE */
483
- sawlacons = 1 ;
552
+ continue ;/* not a LACON arc */
484
553
if (ISBSET (d -> work ,ca -> to ))
485
- continue ;/* NOTE CONTINUE */
554
+ continue ;/* arc would be a no-op anyway */
555
+ sawlacons = 1 ;/* this LACON affects our result */
486
556
if (!lacon (v ,cnfa ,cp ,ca -> co ))
487
- continue ;/* NOTE CONTINUE */
557
+ {
558
+ if (ISERR ())
559
+ return NULL ;
560
+ continue ;/* LACON arc cannot be traversed */
561
+ }
562
+ if (ISERR ())
563
+ return NULL ;
488
564
BSET (d -> work ,ca -> to );
489
565
dolacons = 1 ;
490
566
if (ca -> to == cnfa -> post )
@@ -494,11 +570,9 @@ miss(struct vars * v,/* used only for debug flags */
494
570
FDEBUG (("%d :> %d\n" ,i ,ca -> to ));
495
571
}
496
572
}
497
- if (!gotstate )
498
- return NULL ;
499
573
h = HASH (d -> work ,d -> wordsper );
500
574
501
- /*next, is that in the cache? */
575
+ /*Is this stateset already in the cache? */
502
576
for (p = d -> ssets ,i = d -> nssused ;i > 0 ;p ++ ,i -- )
503
577
if (HIT (h ,d -> work ,p ,d -> wordsper ))
504
578
{
@@ -508,6 +582,8 @@ miss(struct vars * v,/* used only for debug flags */
508
582
if (i == 0 )
509
583
{/* nope, need a new cache entry */
510
584
p = getvacant (v ,d ,cp ,start );
585
+ if (p == NULL )
586
+ return NULL ;
511
587
assert (p != css );
512
588
for (i = 0 ;i < d -> wordsper ;i ++ )
513
589
p -> states [i ]= d -> work [i ];
@@ -518,8 +594,15 @@ miss(struct vars * v,/* used only for debug flags */
518
594
/* lastseen to be dealt with by caller */
519
595
}
520
596
597
+ /*
598
+ * Link new stateset to old, unless a LACON affected the result, in which
599
+ * case we don't create the link. That forces future transitions across
600
+ * this same arc (same prior stateset and character color) to come through
601
+ * miss() again, so that we can recheck the LACON(s), which might or might
602
+ * not pass since context will be different.
603
+ */
521
604
if (!sawlacons )
522
- {/* lookahead conds. always cache miss */
605
+ {
523
606
FDEBUG (("c%d[%d]->c%d\n" ,
524
607
(int ) (css - d -> ssets ),co , (int ) (p - d -> ssets )));
525
608
css -> outs [co ]= p ;
@@ -563,11 +646,12 @@ lacon(struct vars * v,
563
646
564
647
/*
565
648
* getvacant - get a vacant state set
649
+ *
566
650
* This routine clears out the inarcs and outarcs, but does not otherwise
567
651
* clear the innards of the state set -- that's up to the caller.
568
652
*/
569
653
static struct sset *
570
- getvacant (struct vars * v ,/* used only for debug flags */
654
+ getvacant (struct vars * v ,
571
655
struct dfa * d ,
572
656
chr * cp ,
573
657
chr * start )
@@ -579,6 +663,8 @@ getvacant(struct vars * v,/* used only for debug flags */
579
663
color co ;
580
664
581
665
ss = pickss (v ,d ,cp ,start );
666
+ if (ss == NULL )
667
+ return NULL ;
582
668
assert (!(ss -> flags & LOCKED ));
583
669
584
670
/* clear out its inarcs, including self-referential ones */
@@ -636,7 +722,7 @@ getvacant(struct vars * v,/* used only for debug flags */
636
722
* pickss - pick the next stateset to be used
637
723
*/
638
724
static struct sset *
639
- pickss (struct vars * v ,/* used only for debug flags */
725
+ pickss (struct vars * v ,
640
726
struct dfa * d ,
641
727
chr * cp ,
642
728
chr * start )
@@ -692,7 +778,6 @@ pickss(struct vars * v,/* used only for debug flags */
692
778
693
779
/* nobody's old enough?!? -- something's really wrong */
694
780
FDEBUG (("cannot find victim to replace!\n" ));
695
- assert (NOTREACHED );
696
781
ERR (REG_ASSERT );
697
- return d -> ssets ;
782
+ return NULL ;
698
783
}