@@ -75,7 +75,7 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
7575"afterBody" :AfterBodyPhase (self ,self .tree ),
7676"inFrameset" :InFramesetPhase (self ,self .tree ),
7777"afterFrameset" :AfterFramesetPhase (self ,self .tree ),
78- "trailingEnd" :TrailingEndPhase (self ,self .tree )
78+ "trailingEnd" :TrailingEndPhase (self ,self .tree ),
7979# XXX after after body
8080# XXX after after frameset
8181# XXX trailingEnd is gone
@@ -117,10 +117,11 @@ def _parse(self, stream, innerHTML=False, container="div",
117117# relevant ... need others too
118118self .lastPhase = None
119119
120+ self .beforeRCDataPhase = None
121+
120122# XXX This is temporary for the moment so there isn't any other
121123# changes needed for the parser to work with the iterable tokenizer
122- for token in self .tokenizer :
123- token = self .normalizeToken (token )
124+ for token in self .normalizedTokens ():
124125type = token ["type" ]
125126method = getattr (self .phase ,"process%s" % type ,None )
126127if type in ("Characters" ,"SpaceCharacters" ,"Comment" ):
@@ -137,6 +138,10 @@ def _parse(self, stream, innerHTML=False, container="div",
137138# When the loop finishes it's EOF
138139self .phase .processEOF ()
139140
def normalizedTokens(self):
    """Lazily yield every token produced by the tokenizer, normalized.

    Each raw token from ``self.tokenizer`` is passed through
    ``self.normalizeToken`` before being handed to the consumer.
    """
    for rawToken in self.tokenizer:
        normalized = self.normalizeToken(rawToken)
        yield normalized
144+
140145def parse (self ,stream ,encoding = None ,parseMeta = True ,useChardet = True ):
141146"""Parse a HTML document into a well-formed tree
142147
@@ -238,6 +243,29 @@ def resetInsertionMode(self):
238243self .phase = self .phases ["inBody" ]
239244break
240245
def parseRCDataCData(self, name, attributes, contentType):
    """Generic (R)CDATA parsing algorithm.

    Inserts an element for the start tag, switches the tokenizer's content
    model flag to ``contentType`` and then consumes tokens directly from
    ``self.normalizedTokens()``: character tokens become text in the tree,
    parse errors are reported, and the first other token must be the
    matching end tag. The inserted element is removed from the stack of
    open elements when the end tag or EOF is reached.

    name - tag name of the element to insert
    attributes - attributes passed through to ``self.tree.insertElement``
    contentType - RCDATA or CDATA
    """
    assert contentType in ("CDATA", "RCDATA")

    element = self.tree.insertElement(name, attributes)
    self.tokenizer.contentModelFlag = contentModelFlags[contentType]

    for token in self.normalizedTokens():
        if token["type"] in ("Characters", "SpaceCharacters"):
            self.tree.insertText(token["data"])
        elif token["type"] == "ParseError":
            self.parseError(token["data"], token.get("datavars", {}))
        else:
            assert self.tokenizer.contentModelFlag == contentModelFlags["PCDATA"]
            assert token["type"] == "EndTag" and token["name"] == name, repr(token)
            # The pop is a required side effect, so it must not live inside
            # an assert: asserts are stripped when Python runs with -O.
            popped = self.tree.openElements.pop()
            assert popped == element
            return
    # Otherwise we hit EOF
    popped = self.tree.openElements.pop()
    assert popped == element
    self.parseError("expected-closing-tag-but-got-eof")
268+
241269class Phase (object ):
242270"""Base class for helper object that implements each phase of processing
243271 """
@@ -298,29 +326,6 @@ def startTagHtml(self, name, attributes):
def processEndTag(self, name):
    """Look up the end-tag handler registered for ``name`` and invoke it."""
    handler = self.endTagHandler[name]
    handler(name)
300328
def parseRCDataCData(self, name, attributes, contentType):
    """Generic (R)CDATA parsing algorithm (phase-level variant).

    Inserts an element for the start tag, switches the parser's tokenizer
    to ``contentType`` and consumes tokens straight from that tokenizer,
    normalizing each one with ``self.parser.normalizeToken``. Character
    tokens become text in the tree, parse errors are reported through the
    parser, and the first other token must be the matching end tag. The
    inserted element is removed from the stack of open elements when the
    end tag or EOF is reached.

    name - tag name of the element to insert
    attributes - attributes passed through to ``self.tree.insertElement``
    contentType - RCDATA or CDATA
    """
    assert contentType in ("CDATA", "RCDATA")
    element = self.tree.insertElement(name, attributes)
    self.parser.tokenizer.contentModelFlag = contentModelFlags[contentType]
    for token in self.parser.tokenizer:
        token = self.parser.normalizeToken(token)
        if token["type"] in ("Characters", "SpaceCharacters"):
            self.tree.insertText(token["data"])
        elif token["type"] == "ParseError":
            self.parser.parseError(token["data"], token.get("datavars", {}))
        else:
            assert self.parser.tokenizer.contentModelFlag == contentModelFlags["PCDATA"]
            assert token["type"] == "EndTag" and token["name"] == name, repr(token)
            # The pop is a required side effect, so it must not live inside
            # an assert: asserts are stripped when Python runs with -O.
            popped = self.tree.openElements.pop()
            assert popped == element
            return
    # Otherwise we hit EOF
    popped = self.tree.openElements.pop()
    assert popped == element
    self.parser.parseError("expected-closing-tag-but-got-eof")
322-
323-
324329class InitialPhase (Phase ):
325330# This phase deals with error handling as well which is currently not
326331# covered in the specification. The error handling is typically known as
@@ -586,18 +591,18 @@ def startTagHead(self, name, attributes):
586591self .parser .parseError ("two-heads-are-not-better-than-one" )
587592
def startTagTitle(self, name, attributes):
    """Hand the start tag to the parser's generic (R)CDATA routine as RCDATA."""
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "RCDATA")
590595
def startTagStyle(self, name, attributes):
    """Hand the start tag to the parser's generic (R)CDATA routine as CDATA."""
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
593598
def startTagNoScript(self, name, attributes):
    """Hand the start tag to the parser's generic (R)CDATA routine as CDATA."""
    # Need to decide whether to implement the scripting-disabled case
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
597602
def startTagScript(self, name, attributes):
    """Hand the start tag to the parser's generic (R)CDATA routine as CDATA."""
    # I think this is equivalent to the CDATA stuff since we don't execute
    # script
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
601606
602607def startTagBaseLinkMeta (self ,name ,attributes ):
603608if (self .tree .headPointer is not None and self .parser .phase == self .parser .phases ["inHead" ]):
@@ -612,7 +617,7 @@ def startTagOther(self, name, attributes):
612617self .parser .phase .processStartTag (name ,attributes )
613618
def endTagHead(self, name):
    """Close the head element and move the parser to the afterHead phase.

    The current node is expected to be the head element; it is popped off
    the stack of open elements before the phase is switched.
    """
    openElements = self.tree.openElements
    assert openElements[-1].name == "head", (
        "Expected head got %s" % openElements[-1].name)
    openElements.pop()
    parser = self.parser
    parser.phase = parser.phases["afterHead"]
618623
@@ -922,7 +927,7 @@ def startTagAppletMarqueeObject(self, name, attributes):
922927
def startTagXmp(self, name, attributes):
    """Reconstruct the active formatting elements, then parse the element
    through the parser's generic (R)CDATA routine as CDATA.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
926931
927932def startTagTable (self ,name ,attributes ):
928933if self .tree .elementInScope ("p" ):
@@ -982,7 +987,7 @@ def startTagTextarea(self, name, attributes):
982987
def startTagCdata(self, name, attributes):
    """Handle iframe, noembed, noframes and noscript (if scripting enabled).

    The start tag is handed to the parser's generic (R)CDATA routine as
    CDATA.
    """
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
986991
987992def startTagSelect (self ,name ,attributes ):
988993self .tree .reconstructActiveFormattingElements ()