1212
1313
1414#define CHAR_EOF -1
15+ #define CHAR_NULL 0
1516#define CHAR_BANG 33
1617#define CHAR_DOUBLE 34
1718#define CHAR_PERCENT 37
2324#define CHAR_GT 62
2425#define CHAR_QUESTION 63
2526#define CHAR_RIGHTB 93
27+ #define CHAR_TICK 96
2628
2729/* prototypes */
2830
@@ -41,6 +43,7 @@ static int h5_state_before_attribute_name(h5_state_t* hs);
4143static int h5_state_before_attribute_value (h5_state_t * hs );
4244static int h5_state_attribute_value_double_quote (h5_state_t * hs );
4345static int h5_state_attribute_value_single_quote (h5_state_t * hs );
46+ static int h5_state_attribute_value_back_quote (h5_state_t * hs );
4447static int h5_state_attribute_value_no_quote (h5_state_t * hs );
4548static int h5_state_after_attribute_value_quoted_state (h5_state_t * hs );
4649static int h5_state_comment (h5_state_t * hs );
@@ -60,16 +63,28 @@ static int h5_state_doctype(h5_state_t* hs);
6063/**
6164 * public function
6265 */
63- void libinjection_h5_init (h5_state_t * hs ,const char * s ,size_t len ,int flags )
66+ void libinjection_h5_init (h5_state_t * hs ,const char * s ,size_t len ,enum html5_flags flags )
6467{
6568memset (hs ,0 ,sizeof (h5_state_t ));
6669hs -> s = s ;
6770hs -> len = len ;
68- hs -> state = h5_state_data ;
69- if (flags == 0 ) {
71+
72+ switch (flags ) {
73+ case DATA_STATE :
7074hs -> state = h5_state_data ;
71- }else {
72- assert (0 );
75+ break ;
76+ case VALUE_NO_QUOTE :
77+ hs -> state = h5_state_before_attribute_name ;
78+ break ;
79+ case VALUE_SINGLE_QUOTE :
80+ hs -> state = h5_state_attribute_value_single_quote ;
81+ break ;
82+ case VALUE_DOUBLE_QUOTE :
83+ hs -> state = h5_state_attribute_value_double_quote ;
84+ break ;
85+ case VALUE_BACK_QUOTE :
86+ hs -> state = h5_state_attribute_value_back_quote ;
87+ break ;
7388 }
7489}
7590
@@ -85,10 +100,18 @@ int libinjection_h5_next(h5_state_t* hs)
85100/**
86101 * Everything below here is private
87102 *
88- */
103+ */
104+
89105
/**
 * Tell whether a byte counts as whitespace for the tokenizer.
 *
 * Recognized bytes:
 *   0x20  space
 *   0x09  \t  horizontal tab
 *   0x0A  \n  newline
 *   0x0B  \v  vertical tab
 *   0x0C  \f  form feed
 *   0x0D  \r  carriage return
 *   0x00  NUL (a strchr()-based lookup also matches the string
 *         terminator, so NUL has always been reported as whitespace)
 *
 * @param ch byte to classify
 * @return 1 if ch is one of the bytes above, 0 otherwise
 */
static int h5_is_white(char ch)
{
    switch (ch) {
    case '\0':
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
        return 1;
    default:
        return 0;
    }
}
94117
@@ -97,9 +120,17 @@ static int h5_skip_white(h5_state_t* hs)
97120char ch ;
98121while (hs -> pos < hs -> len ) {
99122ch = hs -> s [hs -> pos ];
100- if (ch == ' ' ) {
123+ switch (ch ) {
124+ case 0x00 :/* IE only */
125+ case 0x20 :
126+ case 0x09 :
127+ case 0x0A :
128+ case 0x0B :/* IE only */
129+ case 0x0C :
130+ case 0x0D :/* IE only */
101131hs -> pos += 1 ;
102- }else {
132+ break ;
133+ default :
103134return ch ;
104135 }
105136 }
@@ -149,6 +180,9 @@ static int h5_state_tag_open(h5_state_t* hs)
149180char ch ;
150181
151182TRACE ();
183+ if (hs -> pos >=hs -> len ) {
184+ return 0 ;
185+ }
152186ch = hs -> s [hs -> pos ];
153187if (ch == CHAR_BANG ) {
154188hs -> pos += 1 ;
@@ -167,6 +201,9 @@ static int h5_state_tag_open(h5_state_t* hs)
167201return h5_state_bogus_comment2 (hs );
168202 }else if ((ch >='a' && ch <='z' )|| (ch >='A' && ch <='Z' )) {
169203return h5_state_tag_name (hs );
204+ }else if (ch == CHAR_NULL ) {
205+ /* IE-ism NULL characters are ignored */
206+ return h5_state_tag_name (hs );
170207 }else {
171208/* user input mistake in configuring state */
172209if (hs -> pos == 0 ) {
@@ -197,7 +234,9 @@ static int h5_state_end_tag_open(h5_state_t* hs)
197234 }else if ((ch >='a' && ch <='z' )|| (ch >='A' && ch <='Z' )) {
198235return h5_state_tag_name (hs );
199236 }
200- return h5_state_data (hs );
237+
238+ hs -> is_close = 0 ;
239+ return h5_state_bogus_comment (hs );
201240}
202241/*
203242 *
@@ -231,7 +270,12 @@ static int h5_state_tag_name(h5_state_t* hs)
231270pos = hs -> pos ;
232271while (pos < hs -> len ) {
233272ch = hs -> s [pos ];
234- if (h5_is_white (ch )) {
273+ if (ch == 0 ) {
274+ /* special non-standard case */
275+ /* allow nulls in tag name */
276+ /* some old browsers apparently allow and ignore them */
277+ pos += 1 ;
278+ }else if (h5_is_white (ch )) {
235279hs -> token_start = hs -> s + hs -> pos ;
236280hs -> token_len = pos - hs -> pos ;
237281hs -> token_type = TAG_NAME_OPEN ;
@@ -299,7 +343,7 @@ static int h5_state_before_attribute_name(h5_state_t* hs)
299343default : {
300344return h5_state_attribute_name (hs );
301345 }
302- }
346+ }
303347}
304348
305349static int h5_state_attribute_name (h5_state_t * hs )
@@ -308,7 +352,7 @@ static int h5_state_attribute_name(h5_state_t* hs)
308352size_t pos ;
309353
310354TRACE ();
311- pos = hs -> pos ;
355+ pos = hs -> pos + 1 ;
312356while (pos < hs -> len ) {
313357ch = hs -> s [pos ];
314358if (h5_is_white (ch )) {
@@ -358,21 +402,19 @@ static int h5_state_attribute_name(h5_state_t* hs)
358402static int h5_state_after_attribute_name (h5_state_t * hs )
359403{
360404int c ;
361- size_t pos ;
362405
363406TRACE ();
364- pos = hs -> pos ;
365407c = h5_skip_white (hs );
366408switch (c ) {
367409case CHAR_EOF : {
368410return 0 ;
369411 }
370412case CHAR_SLASH : {
371- hs -> pos = pos + 1 ;
413+ hs -> pos += 1 ;
372414return h5_state_self_closing_start_tag (hs );
373415 }
374416case CHAR_EQUALS : {
375- hs -> pos = pos + 1 ;
417+ hs -> pos += 1 ;
376418return h5_state_before_attribute_value (hs );
377419 }
378420case CHAR_GT : {
@@ -403,6 +445,9 @@ static int h5_state_before_attribute_value(h5_state_t* hs)
403445return h5_state_attribute_value_double_quote (hs );
404446 }else if (c == CHAR_SINGLE ) {
405447return h5_state_attribute_value_single_quote (hs );
448+ }else if (c == CHAR_TICK ) {
449+ /* NON STANDARD IE */
450+ return h5_state_attribute_value_back_quote (hs );
406451 }else {
407452return h5_state_attribute_value_no_quote (hs );
408453 }
@@ -415,8 +460,16 @@ static int h5_state_attribute_value_quote(h5_state_t* hs, char qchar)
415460
416461TRACE ();
417462
418- /* skip quote */
419- hs -> pos += 1 ;
463+ /* skip initial quote in normal case.
464+ * don't do this "if (pos == 0)" since it means we have started
465+ * in a non-data state. given an input of '><foo
466+ * we want to make 0-length attribute name
467+ */
468+ if (hs -> pos > 0 ) {
469+ hs -> pos += 1 ;
470+ }
471+
472+
420473idx = (const char * )memchr (hs -> s + hs -> pos ,qchar ,hs -> len - hs -> pos );
421474if (idx == NULL ) {
422475hs -> token_start = hs -> s + hs -> pos ;
@@ -447,6 +500,13 @@ int h5_state_attribute_value_single_quote(h5_state_t* hs)
447500return h5_state_attribute_value_quote (hs ,CHAR_SINGLE );
448501}
449502
503+ static
504+ int h5_state_attribute_value_back_quote (h5_state_t * hs )
505+ {
506+ TRACE ();
507+ return h5_state_attribute_value_quote (hs ,CHAR_TICK );
508+ }
509+
450510static int h5_state_attribute_value_no_quote (h5_state_t * hs )
451511{
452512char ch ;
@@ -656,10 +716,13 @@ static int h5_state_comment(h5_state_t* hs)
656716char ch ;
657717const char * idx ;
658718size_t pos ;
719+ size_t offset ;
720+ const char * end = hs -> s + hs -> len ;
659721
660722TRACE ();
661723pos = hs -> pos ;
662724while (1 ) {
725+
663726idx = (const char * )memchr (hs -> s + pos ,CHAR_DASH ,hs -> len - pos );
664727
665728/* did not find anything or has less than 3 chars left */
@@ -670,21 +733,62 @@ static int h5_state_comment(h5_state_t* hs)
670733hs -> token_type = TAG_COMMENT ;
671734return 1 ;
672735 }
673- ch = * (idx + 1 );
736+ offset = 1 ;
737+
738+ /* skip all nulls */
739+ while (idx + offset < end && * (idx + offset )== 0 ) {
740+ offset += 1 ;
741+ }
742+ if (idx + offset == end ) {
743+ hs -> state = h5_state_eof ;
744+ hs -> token_start = hs -> s + hs -> pos ;
745+ hs -> token_len = hs -> len - hs -> pos ;
746+ hs -> token_type = TAG_COMMENT ;
747+ return 1 ;
748+ }
749+
750+ ch = * (idx + offset );
674751if (ch != CHAR_DASH && ch != CHAR_BANG ) {
675752pos = (size_t )(idx - hs -> s )+ 1 ;
676753continue ;
677754 }
678- ch = * (idx + 2 );
755+
756+ /* need to test */
757+ #if 0
758+ /* skip all nulls */
759+ while (idx + offset < end && * (idx + offset )== 0 ) {
760+ offset += 1 ;
761+ }
762+ if (idx + offset == end ) {
763+ hs -> state = h5_state_eof ;
764+ hs -> token_start = hs -> s + hs -> pos ;
765+ hs -> token_len = hs -> len - hs -> pos ;
766+ hs -> token_type = TAG_COMMENT ;
767+ return 1 ;
768+ }
769+ #endif
770+
771+ offset += 1 ;
772+ if (idx + offset == end ) {
773+ hs -> state = h5_state_eof ;
774+ hs -> token_start = hs -> s + hs -> pos ;
775+ hs -> token_len = hs -> len - hs -> pos ;
776+ hs -> token_type = TAG_COMMENT ;
777+ return 1 ;
778+ }
779+
780+
781+ ch = * (idx + offset );
679782if (ch != CHAR_GT ) {
680783pos = (size_t )(idx - hs -> s )+ 1 ;
681784continue ;
682785 }
786+ offset += 1 ;
683787
684788/* ends in --> or -!> */
685789hs -> token_start = hs -> s + hs -> pos ;
686790hs -> token_len = (size_t )(idx - hs -> s )- hs -> pos ;
687- hs -> pos = (size_t )(idx - hs -> s )+ 3 ;
791+ hs -> pos = (size_t )(idx + offset - hs -> s );
688792hs -> state = h5_state_data ;
689793hs -> token_type = TAG_COMMENT ;
690794return 1 ;