8383RSPLIT ,
8484};
8585
86- /* Return codes for re_sizecode() and re_comp() */
87- enum {
88- RE_SUCCESS = 0 ,
89- RE_SYNTAX_ERROR = -2 ,
90- RE_UNSUPPORTED_SYNTAX = -3 ,
91- };
92-
9386typedef struct rsub rsub ;
9487struct rsub
9588{
@@ -184,25 +177,22 @@ void re_dumpcode(rcode *prog)
184177prog -> unilen ,prog -> len ,prog -> splits ,i );
185178}
186179
187- /* next todo: crack and factor out this recursion,
188- no recursion will allow to make a meta macro out
189- of this, such that re_sizecode() becomes efficient
190- difficulty: very high, probably not any time soon */
191- static int _compilecode (const char * * re_loc ,rcode * prog ,int sizecode )
180+ static int _compilecode (const char * re_loc ,rcode * prog ,int sizecode )
192181{
193- const char * re = * re_loc ;
182+ const char * re = re_loc ;
194183int * code = sizecode ?NULL :prog -> insts ;
195184int start = PC ,term = PC ;
196185int alt_label = 0 ,c ;
197- int alt_stack [5000 ],altc = 0 ;
186+ int alt_stack [4096 ],altc = 0 ;
187+ int cap_stack [4096 * 5 ],capc = 0 ;
198188
199- for (; * re && * re != ')' ; ) {
189+ while ( * re ) {
200190switch (* re ) {
201191case '\\' :
202192re ++ ;
203- if (!* re )goto syntax_error ;/* Trailing backslash */
193+ if (!* re )return -1 ;/* Trailing backslash */
204194if (* re == '<' || * re == '>' ) {
205- if (re - * re_loc > 2 && re [-2 ]== '\\' )
195+ if (re - re_loc > 2 && re [-2 ]== '\\' )
206196break ;
207197EMIT (PC ++ ,* re == '<' ?WBEG :WEND );
208198term = PC ;
@@ -230,7 +220,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
230220PC ++ ;/* Skip "# of pairs" byte */
231221for (cnt = 0 ;* re != ']' ;cnt ++ ) {
232222if (* re == '\\' )re ++ ;
233- if (!* re )goto syntax_error ;
223+ if (!* re )return -1 ;
234224uc_code (c ,re )EMIT (PC ++ ,c );
235225uc_len (c ,re )
236226if (re [c ]== '-' && re [c + 1 ]!= ']' )
@@ -244,29 +234,42 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
244234term = PC ;
245235int sub ;
246236int capture = 1 ;
247- re ++ ;
248- if (* re == '?' ) {
249- re ++ ;
250- if (* re == ':' ) {
237+ if (* (re + 1 )== '?' ) {
238+ re += 2 ;
239+ if (* re == ':' )
251240capture = 0 ;
252- re ++ ;
253- }else {
254- * re_loc = re ;
255- return RE_UNSUPPORTED_SYNTAX ;
256- }
241+ else
242+ return -1 ;
257243}
258244if (capture ) {
259245sub = ++ prog -> sub ;
260246EMIT (PC ++ ,SAVE );
261247EMIT (PC ++ ,sub );
262248}
263- int res = _compilecode (& re ,prog ,sizecode );
264- * re_loc = re ;
265- if (res < 0 )return res ;
266- if (* re != ')' )return RE_SYNTAX_ERROR ;
267- if (capture ) {
249+ cap_stack [capc ++ ]= capture ;
250+ cap_stack [capc ++ ]= term ;
251+ cap_stack [capc ++ ]= alt_label ;
252+ cap_stack [capc ++ ]= start ;
253+ cap_stack [capc ++ ]= altc ;
254+ alt_label = 0 ;
255+ start = PC ;
256+ break ;
257+ case ')' :
258+ if (-- capc - 4 < 0 )return -1 ;
259+ if (code && alt_label ) {
260+ EMIT (alt_label ,REL (alt_label ,PC )+ 1 );
261+ int _altc = cap_stack [capc ];
262+ for (int alts = altc ;altc > _altc ;altc -- ) {
263+ int at = alt_stack [_altc + alts - altc ]+ (altc - _altc )* 2 ;
264+ EMIT (at ,REL (at ,PC )+ 1 );
265+ }
266+ }
267+ start = cap_stack [-- capc ];
268+ alt_label = cap_stack [-- capc ];
269+ term = cap_stack [-- capc ];
270+ if (cap_stack [-- capc ]) {
268271EMIT (PC ++ ,SAVE );
269- EMIT (PC ++ ,sub + prog -> presub + 1 );
272+ EMIT (PC ++ ,code [ term + 1 ] + prog -> presub + 1 );
270273}
271274break ;
272275case '{' :;
@@ -300,7 +303,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
300303}
301304break ;
302305case '?' :
303- if (PC == term )goto syntax_error ;
306+ if (PC == term )return -1 ;
304307INSERT_CODE (term ,2 ,PC );
305308if (re [1 ]== '?' ) {
306309EMIT (term ,RSPLIT );
@@ -311,7 +314,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
311314term = PC ;
312315break ;
313316case '*' :
314- if (PC == term )goto syntax_error ;
317+ if (PC == term )return -1 ;
315318INSERT_CODE (term ,2 ,PC );
316319EMIT (PC ,JMP );
317320EMIT (PC + 1 ,REL (PC ,term ));
@@ -325,7 +328,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
325328term = PC ;
326329break ;
327330case '+' :
328- if (PC == term )goto syntax_error ;
331+ if (PC == term )return -1 ;
329332if (re [1 ]== '?' ) {
330333EMIT (PC ,SPLIT );
331334re ++ ;
@@ -363,11 +366,7 @@ static int _compilecode(const char **re_loc, rcode *prog, int sizecode)
363366EMIT (at ,REL (at ,PC )+ 1 );
364367}
365368}
366- * re_loc = re ;
367- return RE_SUCCESS ;
368- syntax_error :
369- * re_loc = re ;
370- return RE_SYNTAX_ERROR ;
369+ return capc ?-1 :0 ;
371370}
372371
373372int re_sizecode (const char * re ,int * nsub )
@@ -376,9 +375,8 @@ int re_sizecode(const char *re, int *nsub)
376375dummyprog .unilen = 3 ;
377376dummyprog .sub = 0 ;
378377
379- int res = _compilecode (& re ,& dummyprog ,1 );
378+ int res = _compilecode (re ,& dummyprog ,1 );
380379if (res < 0 )return res ;
381- if (* re )return RE_SYNTAX_ERROR ;
382380* nsub = dummyprog .sub ;
383381return dummyprog .unilen ;
384382}
@@ -391,9 +389,8 @@ int re_comp(rcode *prog, const char *re, int nsubs)
391389prog -> presub = nsubs ;
392390prog -> splits = 0 ;
393391
394- int res = _compilecode (& re ,prog ,0 );
392+ int res = _compilecode (re ,prog ,0 );
395393if (res < 0 )return res ;
396- if (* re )return RE_SYNTAX_ERROR ;
397394int icnt = 0 ,scnt = SPLIT ;
398395for (int i = 0 ;i < prog -> unilen ;i ++ )
399396switch (prog -> insts [i ]) {
@@ -424,7 +421,7 @@ int re_comp(rcode *prog, const char *re, int nsubs)
424421prog -> presub = sizeof (rsub )+ (sizeof (char * )* (nsubs + 1 )* 2 );
425422prog -> sub = prog -> presub * (prog -> len - prog -> splits + 3 );
426423prog -> sparsesz = scnt ;
427- return RE_SUCCESS ;
424+ return 0 ;
428425}
429426
430427#define newsub (init ,copy ) \
@@ -636,10 +633,14 @@ int main(int argc, char *argv[])
636633int sub_els ;
637634int sz = re_sizecode (argv [1 ],& sub_els )* sizeof (int );
638635printf ("Precalculated size: %d\n" ,sz );
636+ if (sz < 0 ) {
637+ printf ("Error in re_sizecode\n" );
638+ return 1 ;
639+ }
639640char code [sizeof (rcode )+ sz ];
640641rcode * _code = (rcode * )code ;
641642if (re_comp (_code ,argv [1 ],sub_els )) {
642- printf ("Error in re_comp" );
643+ printf ("Error in re_comp\n " );
643644return 1 ;
644645}
645646re_dumpcode (_code );