@@ -28,6 +28,7 @@ def startswithany(str, prefixes):
28
28
return False
29
29
30
30
import sys
31
+ import types
31
32
32
33
import inputstream
33
34
import tokenizer
@@ -37,14 +38,18 @@ def startswithany(str, prefixes):
37
38
from treebuilders import simpletree
38
39
39
40
import utils
41
+ import constants
40
42
from constants import spaceCharacters ,asciiUpper2Lower
41
43
from constants import scopingElements ,formattingElements ,specialElements
42
44
from constants import headingElements ,tableInsertModeElements
43
45
from constants import cdataElements ,rcdataElements ,voidElements
44
46
from constants import tokenTypes ,ReparseException ,namespaces
45
47
48
# Module-level switch for the token-processing debug log.  It is read once,
# while the Phase class body executes: when true, Phase is given
# method_decorator_metaclass(log) as its metaclass, so the functions defined
# in a class created through that metaclass are wrapped by ``log`` and append
# an entry to the parser's ``log`` list for each token they receive.
debug_log = True
49
+
46
50
def parse(doc, treebuilder="simpletree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree.

    doc -- the document to parse (a string or a file-like object).
    treebuilder -- name handed to treebuilders.getTreeBuilder() to select
        the tree builder used by the parser.
    encoding -- forwarded unchanged to HTMLParser.parse().
    namespaceHTMLElements -- forwarded unchanged to the HTMLParser
        constructor.

    Returns whatever HTMLParser.parse() returns for the document.
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, encoding=encoding)
@@ -55,6 +60,17 @@ def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
55
60
p = HTMLParser (tb ,namespaceHTMLElements = namespaceHTMLElements )
56
61
return p .parseFragment (doc ,container = container ,encoding = encoding )
57
62
63
def method_decorator_metaclass(function):
    """Return a metaclass that applies ``function`` to a class's methods.

    Every plain function found directly in the namespace of a class created
    through the returned metaclass is replaced by ``function(original)``.
    Anything that is not a plain function (data attributes, staticmethod /
    classmethod objects, ...) is left untouched, and attributes inherited
    from base classes are not looked at because only the new class's own
    namespace is scanned.

    function -- a decorator: takes a function and returns its replacement.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Walk a snapshot of the namespace so that rebinding entries
            # below can never interfere with the iteration itself.
            for attributeName, attribute in list(classDict.items()):
                if isinstance(attribute, types.FunctionType):
                    classDict[attributeName] = function(attribute)
            return type.__new__(meta, classname, bases, classDict)
    return Decorated
73
+
58
74
class HTMLParser (object ):
59
75
"""HTML parser. Generates a tree structure from a stream of (possibly
60
76
malformed) HTML"""
@@ -129,6 +145,7 @@ def reset(self):
129
145
self .tree .reset ()
130
146
self .firstStartTag = False
131
147
self .errors = []
148
+ self .log = []#only used with debug mode
132
149
# "quirks" / "limited quirks" / "no quirks"
133
150
self .compatMode = "no quirks"
134
151
@@ -420,6 +437,31 @@ def parseRCDataRawtext(self, token, contentType):
420
437
421
438
self .phase = self .phases ["text" ]
422
439
440
def log(function):
    """Logger that records which phase processes each token.

    Decorator for phase methods.  The returned wrapper calls ``function``
    with the same arguments and returns its result.  Before doing so, unless
    ``function`` is ``__init__`` or no positional argument was passed, it
    treats the first positional argument as the token and appends this tuple
    to ``self.parser.log``:

        (tokenizer state name, current phase class name,
         class name of ``self``, name of ``function``, info)

    ``info`` is a dict holding the token type's name under "type" (looked up
    through the inverse of constants.tokenTypes) and, when the token type is
    in constants.tagTokenTypes, the token's name under "name".

    If the token's type cannot be looked up, the token is printed and the
    original exception is re-raised.
    """
    # Imported here so the decorator does not depend on a module-level import.
    import functools

    # Inverse of constants.tokenTypes: numeric/opaque type value -> name.
    type_names = dict((value, key) for key, value in
                      constants.tokenTypes.items())

    @functools.wraps(function)  # keep the wrapped method's name and docstring
    def wrapped(self, *args, **kwargs):
        if function.__name__ != "__init__" and args:
            token = args[0]
            try:
                info = {"type": type_names[token['type']]}
            except Exception:
                # Show the offending token (e.g. a bare string passed where
                # a token was expected) before propagating the error.
                print(token)
                raise
            if token['type'] in constants.tagTokenTypes:
                info["name"] = token['name']

            self.parser.log.append((self.parser.tokenizer.state.__name__,
                                    self.parser.phase.__class__.__name__,
                                    self.__class__.__name__,
                                    function.__name__,
                                    info))
        return function(self, *args, **kwargs)
    return wrapped
464
+
423
465
class Phase (object ):
424
466
"""Base class for helper object that implements each phase of processing
425
467
"""
@@ -434,6 +476,9 @@ class Phase(object):
434
476
# * EndTag
435
477
# - endTag* methods
436
478
479
+ if debug_log :
480
+ __metaclass__ = method_decorator_metaclass (log )
481
+
437
482
def __init__ (self ,parser ,tree ):
438
483
self .parser = parser
439
484
self .tree = tree
@@ -1008,7 +1053,7 @@ def startTagForm(self, token):
1008
1053
self .parser .parseError (u"unexpected-start-tag" , {"name" :"form" })
1009
1054
else :
1010
1055
if self .tree .elementInScope ("p" ):
1011
- self .endTagP ("p" )
1056
+ self .endTagP (impliedTagToken ( "p" ) )
1012
1057
self .tree .insertElement (token )
1013
1058
self .tree .formPointer = self .tree .openElements [- 1 ]
1014
1059
@@ -1831,7 +1876,7 @@ def processEOF(self):
1831
1876
return
1832
1877
else :
1833
1878
ignoreEndTag = self .ignoreEndTagColgroup ()
1834
- self .endTagColgroup ("colgroup" )
1879
+ self .endTagColgroup (impliedTagToken ( "colgroup" ) )
1835
1880
if not ignoreEndTag :
1836
1881
self .parser .phase .processEOF ()
1837
1882
@@ -1847,7 +1892,7 @@ def startTagCol(self, token):
1847
1892
1848
1893
def startTagOther(self, token):
    """Handle a start tag with no dedicated handler in this phase.

    Processes an implied "colgroup" end tag and then, unless that end tag
    is one this phase ignores, passes ``token`` to the parser's current
    phase for reprocessing.
    """
    # Evaluated before the end tag is processed; the answer decides below
    # whether the token is reprocessed.
    ignoreEndTag = self.ignoreEndTagColgroup()
    # End tag handlers are given a token, not a bare tag name, so an implied
    # token is built here (impliedTagToken is defined elsewhere in the file).
    self.endTagColgroup(impliedTagToken("colgroup"))
    if not ignoreEndTag:
        self.parser.phase.processStartTag(token)
1853
1898
@@ -1865,7 +1910,7 @@ def endTagCol(self, token):
1865
1910
1866
1911
def endTagOther(self, token):
    """Handle an end tag with no dedicated handler in this phase.

    Processes an implied "colgroup" end tag and then, unless that end tag
    is one this phase ignores, passes ``token`` to the parser's current
    phase for reprocessing.
    """
    # Evaluated before the end tag is processed; the answer decides below
    # whether the token is reprocessed.
    ignoreEndTag = self.ignoreEndTagColgroup()
    # End tag handlers are given a token, not a bare tag name, so an implied
    # token is built here (impliedTagToken is defined elsewhere in the file).
    self.endTagColgroup(impliedTagToken("colgroup"))
    if not ignoreEndTag:
        self.parser.phase.processEndTag(token)
1871
1916
@@ -2016,7 +2061,7 @@ def startTagTableCell(self, token):
2016
2061
2017
2062
def startTagTableOther (self ,token ):
2018
2063
ignoreEndTag = self .ignoreEndTagTr ()
2019
- self .endTagTr ("tr" )
2064
+ self .endTagTr (impliedTagToken ( "tr" ) )
2020
2065
# XXX how are we sure it's always ignored in the innerHTML case?
2021
2066
if not ignoreEndTag :
2022
2067
self .parser .phase .processStartTag (token )
@@ -2036,15 +2081,15 @@ def endTagTr(self, token):
2036
2081
2037
2082
def endTagTable(self, token):
    """Close the current row, then let the active phase handle ``token``.

    An implied "tr" end tag is processed first; ``token`` is reprocessed by
    the parser's current phase only when that implied end tag was not one
    this phase ignores.
    """
    ignoreEndTag = self.ignoreEndTagTr()
    # endTagTr is given a token, not a bare tag name, so an implied token is
    # built here (impliedTagToken is defined elsewhere in the file).
    self.endTagTr(impliedTagToken("tr"))
    # Reprocess the current tag if the tr end tag was not ignored
    # XXX how are we sure it's always ignored in the innerHTML case?
    if not ignoreEndTag:
        self.parser.phase.processEndTag(token)
2044
2089
2045
2090
def endTagTableRowGroup (self ,token ):
2046
2091
if self .tree .elementInScope (token ["name" ],variant = "table" ):
2047
- self .endTagTr ("tr" )
2092
+ self .endTagTr (impliedTagToken ( "tr" ) )
2048
2093
self .parser .phase .processEndTag (token )
2049
2094
else :
2050
2095
# innerHTML case
@@ -2187,12 +2232,12 @@ def startTagOptgroup(self, token):
2187
2232
2188
2233
def startTagSelect(self, token):
    """Handle a select start tag seen while already in this phase.

    Records the "unexpected-select-in-select" parse error and processes an
    implied "select" end tag.  ``token`` itself is neither inserted nor
    reprocessed.
    """
    self.parser.parseError("unexpected-select-in-select")
    # endTagSelect is given a token, not a bare tag name, so an implied
    # token is built here (impliedTagToken is defined elsewhere in the file).
    self.endTagSelect(impliedTagToken("select"))
2191
2236
2192
2237
def startTagInput(self, token):
    """Handle an input start tag seen while in this phase.

    Always records the "unexpected-input-in-select" parse error.  When a
    "select" element is in table scope, an implied "select" end tag is
    processed and ``token`` is then reprocessed by the parser's current
    phase; otherwise the token is dropped.
    """
    self.parser.parseError("unexpected-input-in-select")
    if self.tree.elementInScope("select", variant="table"):
        # endTagSelect is given a token, not a bare tag name, so an implied
        # token is built here (impliedTagToken is defined elsewhere).
        self.endTagSelect(impliedTagToken("select"))
        # NOTE(review): reprocessing is kept inside this branch; done
        # unconditionally it would hand the token straight back to this
        # handler whenever no select is in scope -- confirm against the
        # un-mangled file.
        self.parser.phase.processStartTag(token)
2197
2242
2198
2243
def startTagOther (self ,token ):