@@ -566,52 +566,99 @@ def test_EOF_in_charref(self):
566
566
for html ,expected in data :
567
567
self ._run_check (html ,expected )
568
568
569
- def test_broken_comments (self ):
569
def test_EOF_in_comments_or_decls(self):
    """Check inputs that end right after a comment/declaration opener.

    None of the inputs contains a closing '>'. For each one the check
    expects a single 'data' event carrying the input text unchanged.
    """
    truncated_inputs = (
        '<!',
        '<!-',
        '<!--',
        '<![',
        '<![CDATA[',
        '<![CDATA[x',
        '<!DOCTYPE',
        '<!DOCTYPE HTML',
    )
    for source in truncated_inputs:
        # The expected event list is always the input echoed back as data.
        self._run_check(source, [('data', source)])
582
def test_bogus_comments(self):
    """Check '<!...>' constructs that are not well-formed comments.

    Every construct is expected to be reported as a 'comment' event whose
    text is whatever sits between the leading '<!' and the next '>'.
    """
    fragments = (
        '<! not really a comment >',
        '<! not a comment either -->',
        '<! -- close enough -->',
        # two constructs in one fragment: an empty one and a '<--' one
        '<!><!<-- this was an empty comment>',
        '<!!! another bogus comment !!!>',
        # see #32876
        '<![with square brackets]!>',
        '<![\n multiline\n bogusness\n ]!>',
        '<![more brackets]-[and a hyphen]!>',
        '<![cdata[should be uppercase]]>',
        '<![CDATA [whitespaces are not ignored]]>',
        '<![CDATA]]>',  # required '[' after CDATA
    )
    comment_bodies = (
        ' not really a comment ',
        ' not a comment either --',
        ' -- close enough --',
        '',
        '<-- this was an empty comment',
        '!! another bogus comment !!!',
        '[with square brackets]!',
        '[\n multiline\n bogusness\n ]!',
        '[more brackets]-[and a hyphen]!',
        '[cdata[should be uppercase]]',
        '[CDATA [whitespaces are not ignored]]',
        '[CDATA]]',
    )
    html = ''.join(fragments)
    expected = [('comment', body) for body in comment_bodies]
    self._run_check(html, expected)
584
611
585
612
def test_broken_condcoms(self):
    """Check conditional comments written without the '--' delimiters.

    Each '<![...]>' construct in the markup is expected to be reported as
    a 'comment' event whose text is exactly what appears between '<!' and
    '>', interleaved with the ordinary tag and data events of the content
    placed between the constructs.
    """
    # these condcoms are missing the '--' after '<!' and before the '>'
    # and they are considered bogus comments according to
    # "8.2.4.42. Markup declaration open state"
    html = ('<![if !(IE)]>broken condcom<![endif]>'
            '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
            '<![if !IE 6]><img src="firefox.png" /><![endif]>'
            '<![if !ie 6]><b>foo</b><![endif]>'
            '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
    # Event names and comment text must be exact: no padding around the
    # event name or inside the brackets, so that every entry follows the
    # same convention as the closing ('comment', '[endif]') entry.
    expected = [
        ('comment', '[if !(IE)]'),
        ('data', 'broken condcom'),
        ('comment', '[endif]'),
        ('comment', '[if ! IE]'),
        ('startendtag', 'link', [('href', 'favicon.tiff')]),
        ('comment', '[endif]'),
        ('comment', '[if !IE 6]'),
        ('startendtag', 'img', [('src', 'firefox.png')]),
        ('comment', '[endif]'),
        ('comment', '[if !ie 6]'),
        ('starttag', 'b', []),
        ('data', 'foo'),
        ('endtag', 'b'),
        ('comment', '[endif]'),
        ('comment', '[if (!IE)|(lt IE 9)]'),
        ('startendtag', 'img', [('src', 'mammoth.bmp')]),
        ('comment', '[endif]'),
    ]
    self._run_check(html, expected)
641
+
642
def test_cdata_declarations(self):
    """Check a single-line CDATA section.

    The section is expected as one 'unknown decl' event whose text starts
    with 'CDATA[' and omits the closing ']]'.
    """
    # More tests should be added. See also "8.2.4.42. Markup
    # declaration open state", "8.2.4.69. CDATA section state",
    # and issue 32876
    cdata_markup = '<![CDATA[just some plain text]]>'
    self._run_check(
        cdata_markup,
        [('unknown decl', 'CDATA[just some plain text')],
    )
649
+
650
def test_cdata_declarations_multiline(self):
    """Check a CDATA section, assembled from several literals, in <code>.

    The section body contains '<', '>', '&&' and tag-like text. All of it
    is expected inside one 'unknown decl' event, between the start and end
    tag events of the surrounding 'code' element.
    """
    pieces = (
        '<code><![CDATA[',
        ' if (a < b && a > b) {',
        ' printf("[<marquee>How?</marquee>]");',
        ' }',
        ']]></code>',
    )
    # Text expected for the declaration: everything after '<![' up to,
    # but not including, the closing ']]>'.
    decl_text = ('CDATA[ if (a < b && a > b) { '
                 'printf("[<marquee>How?</marquee>]"); }')
    events = [
        ('starttag', 'code', []),
        ('unknown decl', decl_text),
        ('endtag', 'code'),
    ]
    self._run_check(''.join(pieces), events)
617
664