@@ -28,6 +28,7 @@ def startswithany(str, prefixes):
2828return False
2929
3030import sys
31+ import types
3132
3233import inputstream
3334import tokenizer
@@ -37,14 +38,18 @@ def startswithany(str, prefixes):
3738from treebuilders import simpletree
3839
3940import utils
41+ import constants
4042from constants import spaceCharacters ,asciiUpper2Lower
4143from constants import scopingElements ,formattingElements ,specialElements
4244from constants import headingElements ,tableInsertModeElements
4345from constants import cdataElements ,rcdataElements ,voidElements
4446from constants import tokenTypes ,ReparseException ,namespaces
4547
# Debug switch: when true, the Phase class body installs a metaclass built by
# method_decorator_metaclass(log), so its methods are wrapped by `log`.
48+ debug_log = True
49+
def parse(doc, treebuilder="simpletree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree.

    doc -- the input handed to HTMLParser.parse
    treebuilder -- name passed to treebuilders.getTreeBuilder to select the
        tree builder the parser is constructed with
    encoding -- forwarded unchanged to HTMLParser.parse
    namespaceHTMLElements -- forwarded unchanged to the HTMLParser constructor

    Returns whatever HTMLParser.parse returns for doc.
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, encoding=encoding)
@@ -55,6 +60,17 @@ def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
5560p = HTMLParser (tb ,namespaceHTMLElements = namespaceHTMLElements )
5661return p .parseFragment (doc ,container = container ,encoding = encoding )
5762
def method_decorator_metaclass(function):
    """Build a metaclass that decorates every plain function in a class body.

    function -- a decorator: called with each function found in the class
        dictionary, its return value replaces that function in the new class.

    Returns a metaclass (a subclass of type). Attributes that are not plain
    functions are passed through untouched.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Build a fresh namespace instead of assigning into classDict
            # while iterating it, so the caller's dictionary is left alone.
            decorated = {}
            for attributeName, attribute in classDict.items():
                if isinstance(attribute, types.FunctionType):
                    attribute = function(attribute)
                decorated[attributeName] = attribute
            return type.__new__(meta, classname, bases, decorated)
    return Decorated
73+
5874class HTMLParser (object ):
5975"""HTML parser. Generates a tree structure from a stream of (possibly
6076 malformed) HTML"""
@@ -129,6 +145,7 @@ def reset(self):
129145self .tree .reset ()
130146self .firstStartTag = False
131147self .errors = []
148+ self .log = []#only used with debug mode
132149# "quirks" / "limited quirks" / "no quirks"
133150self .compatMode = "no quirks"
134151
@@ -420,6 +437,31 @@ def parseRCDataRawtext(self, token, contentType):
420437
421438self .phase = self .phases ["text" ]
422439
def log(function):
    """Logger that records which phase processes each token.

    Decorator for phase methods. For every call other than __init__ that has
    at least one positional argument, the first argument is treated as a
    token (it is indexed with 'type', and with 'name' for tag tokens) and a
    tuple is appended to self.parser.log before the wrapped method runs:

        (tokenizer state name, current phase class name,
         class name of self, wrapped method name, info)

    where info is a dict holding the token type name and, for token types in
    constants.tagTokenTypes, the tag name.

    If the token type cannot be looked up, the offending argument is printed
    and the exception is re-raised.
    """
    # Reverse mapping: token type value -> token type name.
    type_names = dict((value, key) for key, value in
                      constants.tokenTypes.items())

    def wrapped(self, *args, **kwargs):
        if function.__name__ != "__init__" and len(args) > 0:
            token = args[0]
            try:
                info = {"type": type_names[token['type']]}
            except Exception:
                # Show what was passed in place of a token, then propagate.
                print(token)
                raise
            if token['type'] in constants.tagTokenTypes:
                info["name"] = token['name']

            self.parser.log.append((self.parser.tokenizer.state.__name__,
                                    self.parser.phase.__class__.__name__,
                                    self.__class__.__name__,
                                    function.__name__,
                                    info))
        return function(self, *args, **kwargs)

    # Keep the wrapped method's identity visible in tracebacks and help().
    wrapped.__name__ = function.__name__
    wrapped.__doc__ = function.__doc__
    return wrapped
464+
423465class Phase (object ):
424466"""Base class for helper object that implements each phase of processing
425467 """
@@ -434,6 +476,9 @@ class Phase(object):
434476# * EndTag
435477# - endTag* methods
436478
479+ if debug_log :
480+ __metaclass__ = method_decorator_metaclass (log )
481+
437482def __init__ (self ,parser ,tree ):
438483self .parser = parser
439484self .tree = tree
@@ -1008,7 +1053,7 @@ def startTagForm(self, token):
10081053self .parser .parseError (u"unexpected-start-tag" , {"name" :"form" })
10091054else :
10101055if self .tree .elementInScope ("p" ):
1011- self .endTagP ("p" )
1056+ self .endTagP (impliedTagToken ( "p" ) )
10121057self .tree .insertElement (token )
10131058self .tree .formPointer = self .tree .openElements [- 1 ]
10141059
@@ -1831,7 +1876,7 @@ def processEOF(self):
18311876return
18321877else :
18331878ignoreEndTag = self .ignoreEndTagColgroup ()
1834- self .endTagColgroup ("colgroup" )
1879+ self .endTagColgroup (impliedTagToken ( "colgroup" ) )
18351880if not ignoreEndTag :
18361881self .parser .phase .processEOF ()
18371882
@@ -1847,7 +1892,7 @@ def startTagCol(self, token):
18471892
def startTagOther(self, token):
    """Process an implied colgroup end tag, then hand the start tag on.

    The token is passed to the parser's current phase only when
    ignoreEndTagColgroup() reported false before the implied end tag
    was processed.
    """
    ignoreEndTag = self.ignoreEndTagColgroup()
    # End-tag handlers are given a token, not a bare tag name.
    self.endTagColgroup(impliedTagToken("colgroup"))
    if not ignoreEndTag:
        self.parser.phase.processStartTag(token)
18531898
@@ -1865,7 +1910,7 @@ def endTagCol(self, token):
18651910
def endTagOther(self, token):
    """Process an implied colgroup end tag, then hand the end tag on.

    The token is passed to the parser's current phase only when
    ignoreEndTagColgroup() reported false before the implied end tag
    was processed.
    """
    ignoreEndTag = self.ignoreEndTagColgroup()
    # End-tag handlers are given a token, not a bare tag name.
    self.endTagColgroup(impliedTagToken("colgroup"))
    if not ignoreEndTag:
        self.parser.phase.processEndTag(token)
18711916
@@ -2016,7 +2061,7 @@ def startTagTableCell(self, token):
20162061
20172062def startTagTableOther (self ,token ):
20182063ignoreEndTag = self .ignoreEndTagTr ()
2019- self .endTagTr ("tr" )
2064+ self .endTagTr (impliedTagToken ( "tr" ) )
20202065# XXX how are we sure it's always ignored in the innerHTML case?
20212066if not ignoreEndTag :
20222067self .parser .phase .processStartTag (token )
@@ -2036,15 +2081,15 @@ def endTagTr(self, token):
20362081
def endTagTable(self, token):
    """Process an implied tr end tag, then reprocess the table end tag."""
    ignoreEndTag = self.ignoreEndTagTr()
    # End-tag handlers are given a token, not a bare tag name.
    self.endTagTr(impliedTagToken("tr"))
    # Reprocess the current tag if the tr end tag was not ignored
    # XXX how are we sure it's always ignored in the innerHTML case?
    if not ignoreEndTag:
        self.parser.phase.processEndTag(token)
20442089
20452090def endTagTableRowGroup (self ,token ):
20462091if self .tree .elementInScope (token ["name" ],variant = "table" ):
2047- self .endTagTr ("tr" )
2092+ self .endTagTr (impliedTagToken ( "tr" ) )
20482093self .parser .phase .processEndTag (token )
20492094else :
20502095# innerHTML case
@@ -2187,12 +2232,12 @@ def startTagOptgroup(self, token):
21872232
def startTagSelect(self, token):
    """Report a parse error, then process an implied select end tag."""
    self.parser.parseError("unexpected-select-in-select")
    # End-tag handlers are given a token, not a bare tag name.
    self.endTagSelect(impliedTagToken("select"))
21912236
def startTagInput(self, token):
    """Report a parse error; if a select is in table scope, close it and
    pass the start tag to the parser's current phase."""
    self.parser.parseError("unexpected-input-in-select")
    if self.tree.elementInScope("select", variant="table"):
        # End-tag handlers are given a token, not a bare tag name.
        self.endTagSelect(impliedTagToken("select"))
        self.parser.phase.processStartTag(token)
21972242
21982243def startTagOther (self ,token ):