Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5078e07

Browse files
author
James Graham
committed
Add phase transition logging support
1 parent 205aced, commit 5078e07

File tree

2 files changed

+67
-15
lines changed

2 files changed

+67
-15
lines changed

‎html5lib/html5parser.py

Lines changed: 54 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ def startswithany(str, prefixes):
2828
returnFalse
2929

3030
importsys
31+
importtypes
3132

3233
importinputstream
3334
importtokenizer
@@ -37,14 +38,18 @@ def startswithany(str, prefixes):
3738
fromtreebuildersimportsimpletree
3839

3940
importutils
41+
importconstants
4042
fromconstantsimportspaceCharacters,asciiUpper2Lower
4143
fromconstantsimportscopingElements,formattingElements,specialElements
4244
fromconstantsimportheadingElements,tableInsertModeElements
4345
fromconstantsimportcdataElements,rcdataElements,voidElements
4446
fromconstantsimporttokenTypes,ReparseException,namespaces
4547

48+
debug_log=True
49+
4650
def parse(doc, treebuilder="simpletree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree.

    Looks up the tree builder registered under ``treebuilder``, creates an
    ``HTMLParser`` around it (forwarding ``namespaceHTMLElements``) and
    returns whatever that parser's ``parse`` produces for ``doc`` with the
    given ``encoding``.
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, encoding=encoding)
@@ -55,6 +60,17 @@ def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
5560
p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)
5661
returnp.parseFragment(doc,container=container,encoding=encoding)
5762

63+
def method_decorator_metaclass(function):
    """Build a metaclass that decorates every plain function of a class.

    ``function`` is a decorator: it takes a function object and returns the
    object that should replace it.  The returned metaclass applies it to
    each class-body attribute that is a plain Python function; every other
    attribute (constants, nested classes, ...) is passed through unchanged.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Collect into a fresh namespace rather than assigning into
            # classDict while iterating over it, and use items() instead of
            # the Python-2-only iteritems() so this runs on Python 2 and 3.
            decorated = {}
            for attributeName, attribute in classDict.items():
                if isinstance(attribute, types.FunctionType):
                    attribute = function(attribute)
                decorated[attributeName] = attribute
            return type.__new__(meta, classname, bases, decorated)
    return Decorated
73+
5874
classHTMLParser(object):
5975
"""HTML parser. Generates a tree structure from a stream of (possibly
6076
malformed) HTML"""
@@ -129,6 +145,7 @@ def reset(self):
129145
self.tree.reset()
130146
self.firstStartTag=False
131147
self.errors= []
148+
self.log= []#only used with debug mode
132149
# "quirks" / "limited quirks" / "no quirks"
133150
self.compatMode="no quirks"
134151

@@ -420,6 +437,31 @@ def parseRCDataRawtext(self, token, contentType):
420437

421438
self.phase=self.phases["text"]
422439

440+
def log(function):
    """Logger that records which phase processes each token.

    Returns a wrapper around ``function`` (a phase method).  When the
    wrapper is called with at least one positional argument -- treated as
    the token -- and the method is not ``__init__``, it appends to
    ``self.parser.log`` a tuple of:

        (tokenizer state name, current parser phase class name,
         class name of ``self``, method name, token info dict)

    where the info dict holds the token's type name and, for tag tokens,
    its ``name``.  The wrapped method is then called with the original
    arguments and its result returned.  Other calls pass straight through.
    """
    import functools  # local import keeps this block self-contained

    # Reverse mapping: token type value -> symbolic type name.
    # items() (not iteritems()) works on both Python 2 and Python 3.
    type_names = dict((value, key) for key, value in
                      constants.tokenTypes.items())

    @functools.wraps(function)  # keep the method's __name__ / __doc__
    def wrapped(self, *args, **kwargs):
        if args and function.__name__ != "__init__":
            token = args[0]
            try:
                info = {"type": type_names[token['type']]}
            except Exception:
                # Debug aid: show the offending argument, then let the
                # original error propagate unchanged.
                print(token)
                raise
            if token['type'] in constants.tagTokenTypes:
                info["name"] = token['name']

            self.parser.log.append((self.parser.tokenizer.state.__name__,
                                    self.parser.phase.__class__.__name__,
                                    self.__class__.__name__,
                                    function.__name__,
                                    info))
        return function(self, *args, **kwargs)
    return wrapped
464+
423465
classPhase(object):
424466
"""Base class for helper object that implements each phase of processing
425467
"""
@@ -434,6 +476,9 @@ class Phase(object):
434476
# * EndTag
435477
# - endTag* methods
436478

479+
ifdebug_log:
480+
__metaclass__=method_decorator_metaclass(log)
481+
437482
def__init__(self,parser,tree):
438483
self.parser=parser
439484
self.tree=tree
@@ -1008,7 +1053,7 @@ def startTagForm(self, token):
10081053
self.parser.parseError(u"unexpected-start-tag", {"name":"form"})
10091054
else:
10101055
ifself.tree.elementInScope("p"):
1011-
self.endTagP("p")
1056+
self.endTagP(impliedTagToken("p"))
10121057
self.tree.insertElement(token)
10131058
self.tree.formPointer=self.tree.openElements[-1]
10141059

@@ -1831,7 +1876,7 @@ def processEOF(self):
18311876
return
18321877
else:
18331878
ignoreEndTag=self.ignoreEndTagColgroup()
1834-
self.endTagColgroup("colgroup")
1879+
self.endTagColgroup(impliedTagToken("colgroup"))
18351880
ifnotignoreEndTag:
18361881
self.parser.phase.processEOF()
18371882

@@ -1847,7 +1892,7 @@ def startTagCol(self, token):
18471892

18481893
defstartTagOther(self,token):
18491894
ignoreEndTag=self.ignoreEndTagColgroup()
1850-
self.endTagColgroup("colgroup")
1895+
self.endTagColgroup(impliedTagToken("colgroup"))
18511896
ifnotignoreEndTag:
18521897
self.parser.phase.processStartTag(token)
18531898

@@ -1865,7 +1910,7 @@ def endTagCol(self, token):
18651910

18661911
defendTagOther(self,token):
18671912
ignoreEndTag=self.ignoreEndTagColgroup()
1868-
self.endTagColgroup("colgroup")
1913+
self.endTagColgroup(impliedTagToken("colgroup"))
18691914
ifnotignoreEndTag:
18701915
self.parser.phase.processEndTag(token)
18711916

@@ -2016,7 +2061,7 @@ def startTagTableCell(self, token):
20162061

20172062
defstartTagTableOther(self,token):
20182063
ignoreEndTag=self.ignoreEndTagTr()
2019-
self.endTagTr("tr")
2064+
self.endTagTr(impliedTagToken("tr"))
20202065
# XXX how are we sure it's always ignored in the innerHTML case?
20212066
ifnotignoreEndTag:
20222067
self.parser.phase.processStartTag(token)
@@ -2036,15 +2081,15 @@ def endTagTr(self, token):
20362081

20372082
defendTagTable(self,token):
20382083
ignoreEndTag=self.ignoreEndTagTr()
2039-
self.endTagTr("tr")
2084+
self.endTagTr(impliedTagToken("tr"))
20402085
# Reprocess the current tag if the tr end tag was not ignored
20412086
# XXX how are we sure it's always ignored in the innerHTML case?
20422087
ifnotignoreEndTag:
20432088
self.parser.phase.processEndTag(token)
20442089

20452090
defendTagTableRowGroup(self,token):
20462091
ifself.tree.elementInScope(token["name"],variant="table"):
2047-
self.endTagTr("tr")
2092+
self.endTagTr(impliedTagToken("tr"))
20482093
self.parser.phase.processEndTag(token)
20492094
else:
20502095
# innerHTML case
@@ -2187,12 +2232,12 @@ def startTagOptgroup(self, token):
21872232

21882233
defstartTagSelect(self,token):
21892234
self.parser.parseError("unexpected-select-in-select")
2190-
self.endTagSelect("select")
2235+
self.endTagSelect(impliedTagToken("select"))
21912236

21922237
defstartTagInput(self,token):
21932238
self.parser.parseError("unexpected-input-in-select")
21942239
ifself.tree.elementInScope("select",variant="table"):
2195-
self.endTagSelect("select")
2240+
self.endTagSelect(impliedTagToken("select"))
21962241
self.parser.phase.processStartTag(token)
21972242

21982243
defstartTagOther(self,token):

‎parse.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
11
#!/usr/bin/env python
22
"""usage: %prog [options] filename
33
4-
Parse a document to asimpletreetree, with optional profiling
4+
Parse a document to a tree, with optional profiling
55
"""
6-
#RELEASE move ./examples/
76

87
importsys
98
importos
109
fromoptparseimportOptionParser
1110

12-
#RELEASE remove
13-
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
14-
#END RELEASE
1511
fromhtml5libimporthtml5parser,sanitizer
1612
fromhtml5lib.tokenizerimportHTMLTokenizer
1713
fromhtml5libimporttreebuilders,serializer,treewalkers
@@ -52,6 +48,8 @@ def parse():
5248
else:
5349
tokenizer=HTMLTokenizer
5450

51+
ifopts.log:
52+
html5parser.debug_log=True
5553

5654
p=html5parser.HTMLParser(tree=treebuilder,tokenizer=tokenizer)
5755

@@ -87,10 +85,16 @@ def parse():
8785
defprintOutput(parser,document,opts):
8886
ifopts.encoding:
8987
print"Encoding:",parser.tokenizer.stream.charEncoding
88+
89+
ifopts.log:
90+
foriteminparser.log:
91+
printitem
92+
9093
ifopts.xml:
9194
sys.stdout.write(document.toxml("utf-8"))
9295
elifopts.tree:
93-
ifnothasattr(document,'__getitem__'):document= [document]
96+
ifnothasattr(document,'__getitem__'):
97+
document= [document]
9498
forfragmentindocument:
9599
printparser.tree.testSerializer(fragment).encode("utf-8")
96100
elifopts.hilite:
@@ -199,6 +203,9 @@ def getOptParser():
199203
parser.add_option("","--sanitize",action="store_true",default=False,
200204
dest="sanitize",help="sanitize")
201205

206+
parser.add_option("-l","--log",action="store_true",default=False,
207+
dest="log",help="log state transitions")
208+
202209
returnparser
203210

204211
if__name__=="__main__":

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp