@@ -25,7 +25,7 @@ def any(iterable):
2525from treebuilders import simpletree
2626
2727import utils
28- from constants import contentModelFlags , spaceCharacters ,asciiUpper2Lower
28+ from constants import spaceCharacters ,asciiUpper2Lower
2929from constants import scopingElements ,formattingElements ,specialElements
3030from constants import headingElements ,tableInsertModeElements
3131from constants import cdataElements ,rcdataElements ,voidElements
@@ -77,7 +77,7 @@ def __init__(self, tree = simpletree.TreeBuilder,
7777# XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
7878"afterHead" :AfterHeadPhase (self ,self .tree ),
7979"inBody" :InBodyPhase (self ,self .tree ),
80- "inRCDataRawtext " :InRCDataRawtextPhase (self ,self .tree ),
80+ "text" :TextPhase (self ,self .tree ),
8181"inTable" :InTablePhase (self ,self .tree ),
8282"inTableText" :InTableTextPhase (self ,self .tree ),
8383"inCaption" :InCaptionPhase (self ,self .tree ),
@@ -124,14 +124,14 @@ def reset(self):
124124self .innerHTML = self .container .lower ()
125125
126126if self .innerHTML in cdataElements :
127- self .tokenizer .contentModelFlag = tokenizer .contentModelFlags [ "RCDATA" ]
127+ self .tokenizer .state = self . tokenizer .rcdataState
128128elif self .innerHTML in rcdataElements :
129- self .tokenizer .contentModelFlag = tokenizer .contentModelFlags [ "RAWTEXT" ]
129+ self .tokenizer .state = self . tokenizer .rawtextState
130130elif self .innerHTML == 'plaintext' :
131- self .tokenizer .contentModelFlag = tokenizer .contentModelFlags [ "PLAINTEXT" ]
131+ self .tokenizer .state = self . tokenizer .plaintextState
132132else :
133- #contentModelFlag already isPCDATA
134- #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
133+ #state already is data state
134+ # self.tokenizer.state = self.tokenizer.dataState
135135pass
136136self .phase = self .phases ["beforeHtml" ]
137137self .phase .insertHtmlElement ()
@@ -406,7 +406,7 @@ def parseRCDataRawtext(self, token, contentType):
406406
407407self .originalPhase = self .phase
408408
409- self .phase = self .phases ["inRCDataRawtext " ]
409+ self .phase = self .phases ["text" ]
410410
411411class Phase (object ):
412412"""Base class for helper object that implements each phase of processing
@@ -636,8 +636,12 @@ def processStartTag(self, token):
636636self .parser .phase .processStartTag (token )
637637
638638def processEndTag (self ,token ):
639- self .insertHtmlElement ()
640- self .parser .phase .processEndTag (token )
639+ if token ["name" ]not in ("head" ,"body" ,"html" ,"br" ):
640+ self .parser .parseError ("unexpected-end-tag-before-html" ,
641+ {"name" :token ["name" ]})
642+ else :
643+ self .insertHtmlElement ()
644+ self .parser .phase .processEndTag (token )
641645
642646
643647class BeforeHeadPhase (Phase ):
@@ -651,7 +655,7 @@ def __init__(self, parser, tree):
651655self .startTagHandler .default = self .startTagOther
652656
653657self .endTagHandler = utils .MethodDispatcher ([
654- (("head" ,"br" ),self .endTagImplyHead )
658+ (("head" ,"body" ,"html" ,"br" ),self .endTagImplyHead )
655659 ])
656660self .endTagHandler .default = self .endTagOther
657661
@@ -666,6 +670,9 @@ def processCharacters(self, token):
666670self .startTagHead (impliedTagToken ("head" ,"StartTag" ))
667671self .parser .phase .processCharacters (token )
668672
673+ def startTagHtml (self ,token ):
674+ self .parser .phases ["inBody" ].processStartTag (token )
675+
669676def startTagHead (self ,token ):
670677self .tree .insertElement (token )
671678self .tree .headPointer = self .tree .openElements [- 1 ]
@@ -692,8 +699,8 @@ def __init__(self, parser, tree):
692699 ("title" ,self .startTagTitle ),
693700 (("noscript" ,"noframes" ,"style" ),self .startTagNoScriptNoFramesStyle ),
694701 ("script" ,self .startTagScript ),
695- (("base" ,"link" ,"command" , "eventsource" ),
696- self .startTagBaseLinkCommandEventsource ),
702+ (("base" ,"link" ,"command" ),
703+ self .startTagBaseLinkCommand ),
697704 ("meta" ,self .startTagMeta ),
698705 ("head" ,self .startTagHead )
699706 ])
@@ -728,7 +735,7 @@ def startTagHtml(self, token):
728735def startTagHead (self ,token ):
729736self .parser .parseError ("two-heads-are-not-better-than-one" )
730737
731- def startTagBaseLinkCommandEventsource (self ,token ):
738+ def startTagBaseLinkCommand (self ,token ):
732739self .tree .insertElement (token )
733740self .tree .openElements .pop ()
734741token ["selfClosingAcknowledged" ]= True
@@ -757,9 +764,10 @@ def startTagNoScriptNoFramesStyle(self, token):
757764self .parser .parseRCDataRawtext (token ,"RAWTEXT" )
758765
759766def startTagScript (self ,token ):
760- #I think this is equivalent to the RAWTEXT stuff since we don't execute script
761- #self.tree.insertElement(token)
762- self .parser .parseRCDataRawtext (token ,"RAWTEXT" )
767+ self .tree .insertElement (token )
768+ self .parser .tokenizer .state = self .parser .tokenizer .scriptDataState
769+ self .parser .originalPhase = self .parser .phase
770+ self .parser .phase = self .parser .phases ["text" ]
763771
764772def startTagOther (self ,token ):
765773self .anythingElse ()
@@ -838,7 +846,6 @@ def startTagOther(self, token):
838846self .parser .phase .processStartTag (token )
839847
840848def endTagHtmlBodyBr (self ,token ):
841- #This is not currently in the spec
842849self .anythingElse ()
843850self .parser .phase .processEndTag (token )
844851
@@ -852,8 +859,8 @@ def anythingElse(self):
852859
853860
854861class InBodyPhase (Phase ):
855- # http://www.whatwg.org/specs/web-apps/current-work/#in-body
856- # the crazy mode
862+ # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
863+ # the really-really-really-very crazy mode
857864def __init__ (self ,parser ,tree ):
858865Phase .__init__ (self ,parser ,tree )
859866
@@ -862,15 +869,16 @@ def __init__(self, parser, tree):
862869
863870self .startTagHandler = utils .MethodDispatcher ([
864871 ("html" ,self .startTagHtml ),
865- (("base" ,"link" ,"meta" ,"script " ,"style " ,"title" ),
866- self .startTagProcessInHead ),
872+ (("base" ,"command" ,"link" ,"meta" ,"noframes" ,"script" ,"style" ,
873+ "title" ), self .startTagProcessInHead ),
867874 ("body" ,self .startTagBody ),
868875 ("frameset" ,self .startTagFrameset ),
869876 (("address" ,"article" ,"aside" ,"blockquote" ,"center" ,"datagrid" ,
870- "details" ,"dialog" , " dir" ,"div" ,"dl" ,"fieldset" ,"figure" ,
871- "footer" ,"h1 " ,"h2 " ,"h3 " ,"h4 " ,"h5 " ,"h6" , "header" , "listing " ,
872- "menu" , "nav" , "ol" , "p" , "pre" , " section" ,"ul" ),
877+ "details" ,"dir" ,"div" ,"dl" ,"fieldset" ,"figure" ,
878+ "footer" ,"header" ,"hgroup" ,"menu" ,"nav" ,"ol" ,"p" ,
879+ "section" ,"ul" ),
873880self .startTagCloseP ),
881+ (("pre" ,"listing" ),self .startTagPreListing ),
874882 ("form" ,self .startTagForm ),
875883 (("li" ,"dd" ,"dt" ),self .startTagListItem ),
876884 ("plaintext" ,self .startTagPlaintext ),
@@ -899,18 +907,17 @@ def __init__(self, parser, tree):
899907 (("svg" ),self .startTagSvg ),
900908 (("caption" ,"col" ,"colgroup" ,"frame" ,"head" ,
901909"tbody" ,"td" ,"tfoot" ,"th" ,"thead" ,
902- "tr" ),self .startTagMisplaced ),
903- (("event-source" ,"command" ),self .startTagNew )
910+ "tr" ),self .startTagMisplaced )
904911 ])
905912self .startTagHandler .default = self .startTagOther
906913
907914self .endTagHandler = utils .MethodDispatcher ([
908915 ("body" ,self .endTagBody ),
909916 ("html" ,self .endTagHtml ),
910917 (("address" ,"article" ,"aside" ,"blockquote" ,"center" ,"datagrid" ,
911- "details" ,"dialog" , " dir" ,"div" ,"dl" ,"fieldset" ,"figure" ,
912- "footer" ,"header" ,"listing" ,"menu" ,"nav" ,"ol" ,"pre" ,"section" ,
913- "ul" ),self .endTagBlock ),
918+ "details" ,"dir" ,"div" ,"dl" ,"fieldset" ,"figure" ,
919+ "footer" ,"header" ,"hgroup" ,"listing" ,"menu" ,"nav" ,"ol" ,"pre" ,
920+ "section" ,"ul" ),self .endTagBlock ),
914921 ("form" ,self .endTagForm ),
915922 ("p" ,self .endTagP ),
916923 (("dd" ,"dt" ,"li" ),self .endTagListItem ),
@@ -953,14 +960,10 @@ def processSpaceCharactersDropNewline(self, token):
953960self .tree .insertText (data )
954961
955962def processCharacters (self ,token ):
956- # XXX The specification says to do this for every character at the
957- # moment, but apparently that doesn't match the real world so we don't
958- # do it for space characters.
959963self .tree .reconstructActiveFormattingElements ()
960964self .tree .insertText (token ["data" ])
961965self .parser .framesetOK = False
962966
963- #This matches the current spec but may not match the real world
964967def processSpaceCharacters (self ,token ):
965968self .tree .reconstructActiveFormattingElements ()
966969self .tree .insertText (token ["data" ])
@@ -996,9 +999,13 @@ def startTagCloseP(self, token):
996999if self .tree .elementInScope ("p" ):
9971000self .endTagP (impliedTagToken ("p" ))
9981001self .tree .insertElement (token )
999- if token ["name" ]in ("pre" ,"listing" ):
1000- self .parser .framesetOK = False
1001- self .processSpaceCharacters = self .processSpaceCharactersDropNewline
1002+
1003+ def startTagPreListing (self ,token ):
1004+ if self .tree .elementInScope ("p" ):
1005+ self .endTagP (impliedTagToken ("p" ))
1006+ self .tree .insertElement (token )
1007+ self .parser .framesetOK = False
1008+ self .processSpaceCharacters = self .processSpaceCharactersDropNewline
10021009
10031010def startTagForm (self ,token ):
10041011if self .tree .formPointer :
@@ -1035,23 +1042,14 @@ def startTagPlaintext(self, token):
10351042if self .tree .elementInScope ("p" ):
10361043self .endTagP (impliedTagToken ("p" ))
10371044self .tree .insertElement (token )
1038- self .parser .tokenizer .contentModelFlag = contentModelFlags [ "PLAINTEXT" ]
1045+ self .parser .tokenizer .state = self . parser . tokenizer . plaintextState
10391046
10401047def startTagHeading (self ,token ):
10411048if self .tree .elementInScope ("p" ):
10421049self .endTagP (impliedTagToken ("p" ))
10431050if self .tree .openElements [- 1 ].name in headingElements :
10441051self .parser .parseError ("unexpected-start-tag" , {"name" :token ["name" ]})
10451052self .tree .openElements .pop ()
1046- # Uncomment the following for IE7 behavior:
1047- #
1048- #for item in headingElements:
1049- # if self.tree.elementInScope(item):
1050- # self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1051- # item = self.tree.openElements.pop()
1052- # while item.name not in headingElements:
1053- # item = self.tree.openElements.pop()
1054- # break
10551053self .tree .insertElement (token )
10561054
10571055def startTagA (self ,token ):
@@ -1175,9 +1173,8 @@ def startTagIsIndex(self, token):
11751173self .processEndTag (impliedTagToken ("form" ))
11761174
11771175def startTagTextarea (self ,token ):
1178- # XXX Form element pointer checking here as well...
11791176self .tree .insertElement (token )
1180- self .parser .tokenizer .contentModelFlag = contentModelFlags [ "RCDATA" ]
1177+ self .parser .tokenizer .state = self . parser . tokenizer . rcdataState
11811178self .processSpaceCharacters = self .processSpaceCharactersDropNewline
11821179self .parser .framesetOK = False
11831180
@@ -1257,16 +1254,6 @@ def startTagMisplaced(self, token):
12571254 """
12581255self .parser .parseError ("unexpected-start-tag-ignored" , {"name" :token ["name" ]})
12591256
1260- def startTagNew (self ,token ):
1261- """New HTML5 elements, "event-source", "section", "nav",
1262- "article", "aside", "header", "footer", "datagrid", "command"
1263- """
1264- #2007-08-30 - MAP - commenting out this write to sys.stderr because
1265- # it's really annoying me when I run the validator tests
1266- #sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
1267- self .startTagOther (token )
1268- #raise NotImplementedError
1269-
12701257def startTagOther (self ,token ):
12711258self .tree .reconstructActiveFormattingElements ()
12721259self .tree .insertElement (token )
@@ -1285,19 +1272,16 @@ def endTagP(self, token):
12851272node = self .tree .openElements .pop ()
12861273
12871274def endTagBody (self ,token ):
1288- # XXX Need to take open <p> tags into account here. We shouldn't imply
1289- # </p> but we should not throw a parse error either. Specification is
1290- # likely to be updated.
1291- if (len (self .tree .openElements )== 1 or
1292- self .tree .openElements [1 ].name != "body" ):
1293- # innerHTML case
1275+ if not self .tree .elementInScope ("body" ):
12941276self .parser .parseError ()
12951277return
12961278elif self .tree .openElements [- 1 ].name != "body" :
12971279for node in self .tree .openElements [2 :]:
1298- if node .name not in frozenset (("dd" ,"dt" ,"li" ,"p" ,
1280+ if node .name not in frozenset (("dd" ,"dt" ,"li" ,"optgroup" ,
1281+ "option" ,"p" ,"rp" ,"rt" ,
12991282"tbody" ,"td" ,"tfoot" ,
1300- "th" ,"thead" ,"tr" )):
1283+ "th" ,"thead" ,"tr" ,"body" ,
1284+ "html" )):
13011285#Not sure this is the correct name for the parse error
13021286self .parser .parseError (
13031287"expected-one-end-tag-but-got-another" ,
@@ -1524,7 +1508,7 @@ def endTagOther(self, token):
15241508self .parser .parseError ("unexpected-end-tag" , {"name" :token ["name" ]})
15251509break
15261510
1527- class InRCDataRawtextPhase (Phase ):
1511+ class TextPhase (Phase ):
15281512def __init__ (self ,parser ,tree ):
15291513Phase .__init__ (self ,parser ,tree )
15301514self .startTagHandler = utils .MethodDispatcher ([])