@@ -65,6 +65,8 @@ newnfa(struct vars *v,
65
65
nfa -> v = v ;
66
66
nfa -> bos [0 ]= nfa -> bos [1 ]= COLORLESS ;
67
67
nfa -> eos [0 ]= nfa -> eos [1 ]= COLORLESS ;
68
+ nfa -> flags = 0 ;
69
+ nfa -> minmatchall = nfa -> maxmatchall = -1 ;
68
70
nfa -> parent = parent ;/* Precedes newfstate so parent is valid. */
69
71
nfa -> post = newfstate (nfa ,'@' );/* number 0 */
70
72
nfa -> pre = newfstate (nfa ,'>' );/* number 1 */
@@ -2875,15 +2877,297 @@ analyze(struct nfa *nfa)
2875
2877
if (NISERR ())
2876
2878
return 0 ;
2877
2879
2880
+ /* Detect whether NFA can't match anything */
2878
2881
if (nfa -> pre -> outs == NULL )
2879
2882
return REG_UIMPOSSIBLE ;
2883
+
2884
+ /* Detect whether NFA matches all strings (possibly with length bounds) */
2885
+ checkmatchall (nfa );
2886
+
2887
+ /* Detect whether NFA can possibly match a zero-length string */
2880
2888
for (a = nfa -> pre -> outs ;a != NULL ;a = a -> outchain )
2881
2889
for (aa = a -> to -> outs ;aa != NULL ;aa = aa -> outchain )
2882
2890
if (aa -> to == nfa -> post )
2883
2891
return REG_UEMPTYMATCH ;
2884
2892
return 0 ;
2885
2893
}
2886
2894
2895
+ /*
2896
+ * checkmatchall - does the NFA represent no more than a string length test?
2897
+ *
2898
+ * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
2899
+ * to begin with) and set the MATCHALL bit in nfa->flags.
2900
+ *
2901
+ * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
2902
+ * for pseudocolors (i.e., BOS/BOL/EOS/EOL). We must be able to reach the
2903
+ * post state via RAINBOW arcs, and if there are any loops in the graph, they
2904
+ * must be loop-to-self arcs, ensuring that each loop iteration consumes
2905
+ * exactly one character. (Longer loops are problematic because they create
2906
+ * non-consecutive possible match lengths; we have no good way to represent
2907
+ * that situation for lengths beyond the DUPINF limit.)
2908
+ *
2909
+ * Pseudocolor arcs complicate things a little. We know that they can only
2910
+ * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
2911
+ * EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs,
2912
+ * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
2913
+ * and BOL outarcs to state 2, and no others. Missing or extra pseudocolor
2914
+ * arcs can occur, meaning that the NFA involves some constraint on the
2915
+ * adjacent characters, which makes it not a matchall NFA.
2916
+ */
2917
+ static void
2918
+ checkmatchall (struct nfa * nfa )
2919
+ {
2920
+ bool hasmatch [DUPINF + 1 ];
2921
+ int minmatch ,
2922
+ maxmatch ,
2923
+ morematch ;
2924
+
2925
+ /*
2926
+ * hasmatch[i] will be set true if a match of length i is feasible, for i
2927
+ * from 0 to DUPINF-1. hasmatch[DUPINF] will be set true if every match
2928
+ * length of DUPINF or more is feasible.
2929
+ */
2930
+ memset (hasmatch ,0 ,sizeof (hasmatch ));
2931
+
2932
+ /*
2933
+ * Recursively search the graph for all-RAINBOW paths to the "post" state,
2934
+ * starting at the "pre" state. The -1 initial depth accounts for the
2935
+ * fact that transitions out of the "pre" state are not part of the
2936
+ * matched string. We likewise don't count the final transition to the
2937
+ * "post" state as part of the match length. (But we still insist that
2938
+ * those transitions have RAINBOW arcs, otherwise there are lookbehind or
2939
+ * lookahead constraints at the start/end of the pattern.)
2940
+ */
2941
+ if (!checkmatchall_recurse (nfa ,nfa -> pre , false,-1 ,hasmatch ))
2942
+ return ;
2943
+
2944
+ /*
2945
+ * We found some all-RAINBOW paths, and not anything that we couldn't
2946
+ * handle. Now verify that pseudocolor arcs adjacent to the pre and post
2947
+ * states match the RAINBOW arcs there. (We could do this while
2948
+ * recursing, but it's expensive and unlikely to fail, so do it last.)
2949
+ */
2950
+ if (!check_out_colors_match (nfa -> pre ,RAINBOW ,nfa -> bos [0 ])||
2951
+ !check_out_colors_match (nfa -> pre ,nfa -> bos [0 ],RAINBOW )||
2952
+ !check_out_colors_match (nfa -> pre ,RAINBOW ,nfa -> bos [1 ])||
2953
+ !check_out_colors_match (nfa -> pre ,nfa -> bos [1 ],RAINBOW ))
2954
+ return ;
2955
+ if (!check_in_colors_match (nfa -> post ,RAINBOW ,nfa -> eos [0 ])||
2956
+ !check_in_colors_match (nfa -> post ,nfa -> eos [0 ],RAINBOW )||
2957
+ !check_in_colors_match (nfa -> post ,RAINBOW ,nfa -> eos [1 ])||
2958
+ !check_in_colors_match (nfa -> post ,nfa -> eos [1 ],RAINBOW ))
2959
+ return ;
2960
+
2961
+ /*
2962
+ * hasmatch[] now represents the set of possible match lengths; but we
2963
+ * want to reduce that to a min and max value, because it doesn't seem
2964
+ * worth complicating regexec.c to deal with nonconsecutive possible match
2965
+ * lengths. Find min and max of first run of lengths, then verify there
2966
+ * are no nonconsecutive lengths.
2967
+ */
2968
+ for (minmatch = 0 ;minmatch <=DUPINF ;minmatch ++ )
2969
+ {
2970
+ if (hasmatch [minmatch ])
2971
+ break ;
2972
+ }
2973
+ assert (minmatch <=DUPINF );/* else checkmatchall_recurse lied */
2974
+ for (maxmatch = minmatch ;maxmatch < DUPINF ;maxmatch ++ )
2975
+ {
2976
+ if (!hasmatch [maxmatch + 1 ])
2977
+ break ;
2978
+ }
2979
+ for (morematch = maxmatch + 1 ;morematch <=DUPINF ;morematch ++ )
2980
+ {
2981
+ if (hasmatch [morematch ])
2982
+ return ;/* fail, there are nonconsecutive lengths */
2983
+ }
2984
+
2985
+ /* Success, so record the info */
2986
+ nfa -> minmatchall = minmatch ;
2987
+ nfa -> maxmatchall = maxmatch ;
2988
+ nfa -> flags |=MATCHALL ;
2989
+ }
2990
+
2991
+ /*
2992
+ * checkmatchall_recurse - recursive search for checkmatchall
2993
+ *
2994
+ * s is the current state
2995
+ * foundloop is true if any predecessor state has a loop-to-self
2996
+ * depth is the current recursion depth (starting at -1)
2997
+ * hasmatch[] is the output area for recording feasible match lengths
2998
+ *
2999
+ * We return true if there is at least one all-RAINBOW path to the "post"
3000
+ * state and no non-matchall paths; otherwise false. Note we assume that
3001
+ * any dead-end paths have already been removed, else we might return
3002
+ * false unnecessarily.
3003
+ */
3004
+ static bool
3005
+ checkmatchall_recurse (struct nfa * nfa ,struct state * s ,
3006
+ bool foundloop ,int depth ,
3007
+ bool * hasmatch )
3008
+ {
3009
+ bool result = false;
3010
+ struct arc * a ;
3011
+
3012
+ /*
3013
+ * Since this is recursive, it could be driven to stack overflow. But we
3014
+ * need not treat that as a hard failure; just deem the NFA non-matchall.
3015
+ */
3016
+ if (STACK_TOO_DEEP (nfa -> v -> re ))
3017
+ return false;
3018
+
3019
+ /*
3020
+ * Likewise, if we get to a depth too large to represent correctly in
3021
+ * maxmatchall, fail quietly.
3022
+ */
3023
+ if (depth >=DUPINF )
3024
+ return false;
3025
+
3026
+ /*
3027
+ * Scan the outarcs to detect cases we can't handle, and to see if there
3028
+ * is a loop-to-self here. We need to know about any such loop before we
3029
+ * recurse, so it's hard to avoid making two passes over the outarcs. In
3030
+ * any case, checking for showstoppers before we recurse is probably best.
3031
+ */
3032
+ for (a = s -> outs ;a != NULL ;a = a -> outchain )
3033
+ {
3034
+ if (a -> type != PLAIN )
3035
+ return false;/* any LACONs make it non-matchall */
3036
+ if (a -> co != RAINBOW )
3037
+ {
3038
+ if (nfa -> cm -> cd [a -> co ].flags & PSEUDO )
3039
+ {
3040
+ /*
3041
+ * Pseudocolor arc: verify it's in a valid place (this seems
3042
+ * quite unlikely to fail, but let's be sure).
3043
+ */
3044
+ if (s == nfa -> pre &&
3045
+ (a -> co == nfa -> bos [0 ]|| a -> co == nfa -> bos [1 ]))
3046
+ /* okay BOS/BOL arc */ ;
3047
+ else if (a -> to == nfa -> post &&
3048
+ (a -> co == nfa -> eos [0 ]|| a -> co == nfa -> eos [1 ]))
3049
+ /* okay EOS/EOL arc */ ;
3050
+ else
3051
+ return false;/* unexpected pseudocolor arc */
3052
+ /* We'll finish checking these arcs after the recursion */
3053
+ continue ;
3054
+ }
3055
+ return false;/* any other color makes it non-matchall */
3056
+ }
3057
+ if (a -> to == s )
3058
+ {
3059
+ /*
3060
+ * We found a cycle of length 1, so remember that to pass down to
3061
+ * successor states. (It doesn't matter if there was also such a
3062
+ * loop at a predecessor state.)
3063
+ */
3064
+ foundloop = true;
3065
+ }
3066
+ else if (a -> to -> tmp )
3067
+ {
3068
+ /* We found a cycle of length > 1, so fail. */
3069
+ return false;
3070
+ }
3071
+ }
3072
+
3073
+ /* We need to recurse, so mark state as under consideration */
3074
+ assert (s -> tmp == NULL );
3075
+ s -> tmp = s ;
3076
+
3077
+ for (a = s -> outs ;a != NULL ;a = a -> outchain )
3078
+ {
3079
+ if (a -> co != RAINBOW )
3080
+ continue ;/* ignore pseudocolor arcs */
3081
+ if (a -> to == nfa -> post )
3082
+ {
3083
+ /* We found an all-RAINBOW path to the post state */
3084
+ result = true;
3085
+ /* Record potential match lengths */
3086
+ assert (depth >=0 );
3087
+ hasmatch [depth ]= true;
3088
+ if (foundloop )
3089
+ {
3090
+ /* A predecessor loop makes all larger lengths match, too */
3091
+ int i ;
3092
+
3093
+ for (i = depth + 1 ;i <=DUPINF ;i ++ )
3094
+ hasmatch [i ]= true;
3095
+ }
3096
+ }
3097
+ else if (a -> to != s )
3098
+ {
3099
+ /* This is a new path forward; recurse to investigate */
3100
+ result = checkmatchall_recurse (nfa ,a -> to ,
3101
+ foundloop ,depth + 1 ,
3102
+ hasmatch );
3103
+ /* Fail if any recursive path fails */
3104
+ if (!result )
3105
+ break ;
3106
+ }
3107
+ }
3108
+
3109
+ s -> tmp = NULL ;
3110
+ return result ;
3111
+ }
3112
+
3113
+ /*
3114
+ * check_out_colors_match - subroutine for checkmatchall
3115
+ *
3116
+ * Check if every s outarc of color co1 has a matching outarc of color co2.
3117
+ * (checkmatchall_recurse already verified that all of the outarcs are PLAIN,
3118
+ * so we need not examine arc types here. Also, since it verified that there
3119
+ * are only RAINBOW and pseudocolor arcs, there shouldn't be enough arcs for
3120
+ * this brute-force O(N^2) implementation to cause problems.)
3121
+ */
3122
+ static bool
3123
+ check_out_colors_match (struct state * s ,color co1 ,color co2 )
3124
+ {
3125
+ struct arc * a1 ;
3126
+ struct arc * a2 ;
3127
+
3128
+ for (a1 = s -> outs ;a1 != NULL ;a1 = a1 -> outchain )
3129
+ {
3130
+ if (a1 -> co != co1 )
3131
+ continue ;
3132
+ for (a2 = s -> outs ;a2 != NULL ;a2 = a2 -> outchain )
3133
+ {
3134
+ if (a2 -> co == co2 && a2 -> to == a1 -> to )
3135
+ break ;
3136
+ }
3137
+ if (a2 == NULL )
3138
+ return false;
3139
+ }
3140
+ return true;
3141
+ }
3142
+
3143
+ /*
3144
+ * check_in_colors_match - subroutine for checkmatchall
3145
+ *
3146
+ * Check if every s inarc of color co1 has a matching inarc of color co2.
3147
+ * (For paranoia's sake, ignore any non-PLAIN arcs here. But we're still
3148
+ * not expecting very many arcs.)
3149
+ */
3150
+ static bool
3151
+ check_in_colors_match (struct state * s ,color co1 ,color co2 )
3152
+ {
3153
+ struct arc * a1 ;
3154
+ struct arc * a2 ;
3155
+
3156
+ for (a1 = s -> ins ;a1 != NULL ;a1 = a1 -> inchain )
3157
+ {
3158
+ if (a1 -> type != PLAIN || a1 -> co != co1 )
3159
+ continue ;
3160
+ for (a2 = s -> ins ;a2 != NULL ;a2 = a2 -> inchain )
3161
+ {
3162
+ if (a2 -> type == PLAIN && a2 -> co == co2 && a2 -> from == a1 -> from )
3163
+ break ;
3164
+ }
3165
+ if (a2 == NULL )
3166
+ return false;
3167
+ }
3168
+ return true;
3169
+ }
3170
+
2887
3171
/*
2888
3172
* compact - construct the compact representation of an NFA
2889
3173
*/
@@ -2930,7 +3214,9 @@ compact(struct nfa *nfa,
2930
3214
cnfa -> eos [0 ]= nfa -> eos [0 ];
2931
3215
cnfa -> eos [1 ]= nfa -> eos [1 ];
2932
3216
cnfa -> ncolors = maxcolor (nfa -> cm )+ 1 ;
2933
- cnfa -> flags = 0 ;
3217
+ cnfa -> flags = nfa -> flags ;
3218
+ cnfa -> minmatchall = nfa -> minmatchall ;
3219
+ cnfa -> maxmatchall = nfa -> maxmatchall ;
2934
3220
2935
3221
ca = cnfa -> arcs ;
2936
3222
for (s = nfa -> states ;s != NULL ;s = s -> next )
@@ -3034,6 +3320,11 @@ dumpnfa(struct nfa *nfa,
3034
3320
fprintf (f ,", eos [%ld]" , (long )nfa -> eos [0 ]);
3035
3321
if (nfa -> eos [1 ]!= COLORLESS )
3036
3322
fprintf (f ,", eol [%ld]" , (long )nfa -> eos [1 ]);
3323
+ if (nfa -> flags & HASLACONS )
3324
+ fprintf (f ,", haslacons" );
3325
+ if (nfa -> flags & MATCHALL )
3326
+ fprintf (f ,", minmatchall %d, maxmatchall %d" ,
3327
+ nfa -> minmatchall ,nfa -> maxmatchall );
3037
3328
fprintf (f ,"\n" );
3038
3329
for (s = nfa -> states ;s != NULL ;s = s -> next )
3039
3330
{
@@ -3201,6 +3492,9 @@ dumpcnfa(struct cnfa *cnfa,
3201
3492
fprintf (f ,", eol [%ld]" , (long )cnfa -> eos [1 ]);
3202
3493
if (cnfa -> flags & HASLACONS )
3203
3494
fprintf (f ,", haslacons" );
3495
+ if (cnfa -> flags & MATCHALL )
3496
+ fprintf (f ,", minmatchall %d, maxmatchall %d" ,
3497
+ cnfa -> minmatchall ,cnfa -> maxmatchall );
3204
3498
fprintf (f ,"\n" );
3205
3499
for (st = 0 ;st < cnfa -> nstates ;st ++ )
3206
3500
dumpcstate (st ,cnfa ,f );