@@ -75,7 +75,7 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
75
75
"afterBody" :AfterBodyPhase (self ,self .tree ),
76
76
"inFrameset" :InFramesetPhase (self ,self .tree ),
77
77
"afterFrameset" :AfterFramesetPhase (self ,self .tree ),
78
- "trailingEnd" :TrailingEndPhase (self ,self .tree )
78
+ "trailingEnd" :TrailingEndPhase (self ,self .tree ),
79
79
# XXX after after body
80
80
# XXX after after frameset
81
81
# XXX trailingEnd is gone
@@ -117,10 +117,11 @@ def _parse(self, stream, innerHTML=False, container="div",
117
117
# relevant ... need others too
118
118
self .lastPhase = None
119
119
120
+ self .beforeRCDataPhase = None
121
+
120
122
# XXX This is temporary for the moment so there aren't any other
121
123
# changes needed for the parser to work with the iterable tokenizer
122
- for token in self .tokenizer :
123
- token = self .normalizeToken (token )
124
+ for token in self .normalizedTokens ():
124
125
type = token ["type" ]
125
126
method = getattr (self .phase ,"process%s" % type ,None )
126
127
if type in ("Characters" ,"SpaceCharacters" ,"Comment" ):
@@ -137,6 +138,10 @@ def _parse(self, stream, innerHTML=False, container="div",
137
138
# When the loop finishes it's EOF
138
139
self .phase .processEOF ()
139
140
141
def normalizedTokens(self):
    """Lazily yield every token from ``self.tokenizer`` after passing it
    through ``self.normalizeToken``.

    Tokens are produced one at a time, in tokenizer order.
    """
    for rawToken in self.tokenizer:
        normalized = self.normalizeToken(rawToken)
        yield normalized
144
+
140
145
def parse (self ,stream ,encoding = None ,parseMeta = True ,useChardet = True ):
141
146
"""Parse a HTML document into a well-formed tree
142
147
@@ -238,6 +243,29 @@ def resetInsertionMode(self):
238
243
self .phase = self .phases ["inBody" ]
239
244
break
240
245
246
def parseRCDataCData(self, name, attributes, contentType):
    """Generic (R)CDATA parsing algorithm.

    Inserts an element for the start tag, switches the tokenizer's
    content model flag to ``contentType`` and then consumes tokens
    directly from ``self.normalizedTokens()``:

    * ``Characters`` / ``SpaceCharacters`` tokens are inserted as text.
    * ``ParseError`` tokens are forwarded to ``self.parseError``.
    * Any other token is expected to be the matching end tag; the element
      is popped from the stack of open elements and the method returns.

    If the token stream runs out first, the element is still popped and an
    ``expected-closing-tag-but-got-eof`` parse error is reported.

    name - tag name of the element being parsed
    attributes - attributes of the start tag
    contentType - RCDATA or CDATA
    """
    assert contentType in ("CDATA", "RCDATA")

    element = self.tree.insertElement(name, attributes)
    self.tokenizer.contentModelFlag = contentModelFlags[contentType]

    for token in self.normalizedTokens():
        if token["type"] in ("Characters", "SpaceCharacters"):
            self.tree.insertText(token["data"])
        elif token["type"] == "ParseError":
            self.parseError(token["data"], token.get("datavars", {}))
        else:
            assert self.tokenizer.contentModelFlag == contentModelFlags["PCDATA"]
            assert token["type"] == "EndTag" and token["name"] == name, repr(token)
            # Pop outside the assert: assert statements are stripped under
            # ``python -O``, which would otherwise leave the element on the
            # stack of open elements.
            popped = self.tree.openElements.pop()
            assert popped == element
            return

    # Otherwise we hit EOF
    popped = self.tree.openElements.pop()
    assert popped == element
    self.parseError("expected-closing-tag-but-got-eof")
268
+
241
269
class Phase (object ):
242
270
"""Base class for helper object that implements each phase of processing
243
271
"""
@@ -298,29 +326,6 @@ def startTagHtml(self, name, attributes):
298
326
def processEndTag(self, name):
    """Look up the end-tag handler registered for ``name`` in
    ``self.endTagHandler`` and invoke it with that name.
    """
    handler = self.endTagHandler[name]
    handler(name)
300
328
301
- def parseRCDataCData (self ,name ,attributes ,contentType ):
302
- """Generic (R)CDATA Parsing algorithm
303
- contentType - RCDATA or CDATA
304
- """
305
- assert contentType in ("CDATA" ,"RCDATA" )
306
- element = self .tree .insertElement (name ,attributes )
307
- self .parser .tokenizer .contentModelFlag = contentModelFlags [contentType ]
308
- for token in self .parser .tokenizer :
309
- token = self .parser .normalizeToken (token )
310
- if token ["type" ]in ("Characters" ,"SpaceCharacters" ):
311
- self .tree .insertText (token ["data" ])
312
- elif token ["type" ]== "ParseError" :
313
- self .parser .parseError (token ["data" ],token .get ("datavars" , {}))
314
- else :
315
- assert self .parser .tokenizer .contentModelFlag == contentModelFlags ["PCDATA" ]
316
- assert token ["type" ]== "EndTag" and token ["name" ]== name ,repr (token )
317
- assert self .tree .openElements .pop ()== element
318
- return
319
- #Otherwise we hit EOF
320
- assert self .tree .openElements .pop ()== element
321
- self .parser .parseError ("expected-closing-tag-but-got-eof" )
322
-
323
-
324
329
class InitialPhase (Phase ):
325
330
# This phase deals with error handling as well which is currently not
326
331
# covered in the specification. The error handling is typically known as
@@ -586,18 +591,18 @@ def startTagHead(self, name, attributes):
586
591
self .parser .parseError ("two-heads-are-not-better-than-one" )
587
592
588
593
def startTagTitle(self, name, attributes):
    """Handle a title start tag by delegating to the parser's generic
    (R)CDATA routine with the RCDATA content type.
    """
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "RCDATA")
590
595
591
596
def startTagStyle(self, name, attributes):
    """Handle a style start tag by delegating to the parser's generic
    (R)CDATA routine with the CDATA content type.
    """
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
593
598
594
599
def startTagNoScript(self, name, attributes):
    """Handle a noscript start tag by delegating to the parser's generic
    (R)CDATA routine with the CDATA content type.
    """
    # Still undecided: whether the scripting-disabled case should be
    # implemented as well.
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
597
602
598
603
def startTagScript(self, name, attributes):
    """Handle a script start tag by delegating to the parser's generic
    (R)CDATA routine with the CDATA content type.
    """
    # Scripts are not executed here, so this is believed to be equivalent
    # to plain CDATA handling.
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
601
606
602
607
def startTagBaseLinkMeta (self ,name ,attributes ):
603
608
if (self .tree .headPointer is not None and self .parser .phase == self .parser .phases ["inHead" ]):
@@ -612,7 +617,7 @@ def startTagOther(self, name, attributes):
612
617
self .parser .phase .processStartTag (name ,attributes )
613
618
614
619
def endTagHead(self, name):
    """Close the head element: pop the last entry from the stack of open
    elements and switch the parser to the ``afterHead`` phase.

    Asserts that the element being popped is named "head".
    """
    openElements = self.tree.openElements
    assert openElements[-1].name == "head", "Expected head got %s" % openElements[-1].name
    openElements.pop()
    parser = self.parser
    parser.phase = parser.phases["afterHead"]
618
623
@@ -922,7 +927,7 @@ def startTagAppletMarqueeObject(self, name, attributes):
922
927
923
928
def startTagXmp(self, name, attributes):
    """Handle an xmp start tag.

    First reconstructs the active formatting elements on the tree, then
    delegates to the parser's generic (R)CDATA routine with the CDATA
    content type.
    """
    tree, parser = self.tree, self.parser
    tree.reconstructActiveFormattingElements()
    parser.parseRCDataCData(name, attributes, "CDATA")
926
931
927
932
def startTagTable (self ,name ,attributes ):
928
933
if self .tree .elementInScope ("p" ):
@@ -982,7 +987,7 @@ def startTagTextarea(self, name, attributes):
982
987
983
988
def startTagCdata(self, name, attributes):
    """Start-tag handler for iframe, noembed, noframes and noscript (the
    latter if scripting is enabled).

    Delegates to the parser's generic (R)CDATA routine with the CDATA
    content type.
    """
    parser = self.parser
    parser.parseRCDataCData(name, attributes, "CDATA")
986
991
987
992
def startTagSelect (self ,name ,attributes ):
988
993
self .tree .reconstructActiveFormattingElements ()