Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c5de2df

Browse files
committed
Update parser (and tests) to latest spec. Fix a couple of bugs in the tokenizer that only turned up in the tree construction tests.
1 parent 981cbd1 commit c5de2df

File tree

3 files changed

+91
-93
lines changed

3 files changed

+91
-93
lines changed

‎src/html5lib/constants.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -256,21 +256,18 @@
256256
_(u"Unexpected end of file. Expected select content."),
257257
"eof-in-frameset":
258258
_(u"Unexpected end of file. Expected frameset content."),
259+
"eof-in-script-in-script":
260+
_(u"Unexpected end of file. Expected script content."),
259261
"non-void-element-with-trailing-solidus":
260262
_(u"Trailing solidus not allowed on element %(name)s"),
261263
"unexpected-html-element-in-foreign-content":
262264
_(u"Element %(name)s not allowed in a non-html context"),
265+
"unexpected-end-tag-before-html":
266+
_(u"Unexpected end tag (%(name)s) before html."),
263267
"XXX-undefined-error":
264268
(u"Undefined error (this sucks and should be fixed)"),
265269
}
266270

267-
contentModelFlags= {
268-
"PCDATA":0,
269-
"RCDATA":1,
270-
"RAWTEXT":2,
271-
"PLAINTEXT":3
272-
}
273-
274271
namespaces= {
275272
"html":"http://www.w3.org/1999/xhtml",
276273
"mathml":"http://www.w3.org/1998/Math/MathML",

‎src/html5lib/html5parser.py

Lines changed: 52 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ def any(iterable):
2525
fromtreebuildersimportsimpletree
2626

2727
importutils
28-
fromconstantsimportcontentModelFlags,spaceCharacters,asciiUpper2Lower
28+
fromconstantsimportspaceCharacters,asciiUpper2Lower
2929
fromconstantsimportscopingElements,formattingElements,specialElements
3030
fromconstantsimportheadingElements,tableInsertModeElements
3131
fromconstantsimportcdataElements,rcdataElements,voidElements
@@ -77,7 +77,7 @@ def __init__(self, tree = simpletree.TreeBuilder,
7777
# XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
7878
"afterHead":AfterHeadPhase(self,self.tree),
7979
"inBody":InBodyPhase(self,self.tree),
80-
"inRCDataRawtext":InRCDataRawtextPhase(self,self.tree),
80+
"text":TextPhase(self,self.tree),
8181
"inTable":InTablePhase(self,self.tree),
8282
"inTableText":InTableTextPhase(self,self.tree),
8383
"inCaption":InCaptionPhase(self,self.tree),
@@ -124,14 +124,14 @@ def reset(self):
124124
self.innerHTML=self.container.lower()
125125

126126
ifself.innerHTMLincdataElements:
127-
self.tokenizer.contentModelFlag=tokenizer.contentModelFlags["RCDATA"]
127+
self.tokenizer.state=self.tokenizer.rcdataState
128128
elifself.innerHTMLinrcdataElements:
129-
self.tokenizer.contentModelFlag=tokenizer.contentModelFlags["RAWTEXT"]
129+
self.tokenizer.state=self.tokenizer.rawtextState
130130
elifself.innerHTML=='plaintext':
131-
self.tokenizer.contentModelFlag=tokenizer.contentModelFlags["PLAINTEXT"]
131+
self.tokenizer.state=self.tokenizer.plaintextState
132132
else:
133-
#contentModelFlag already isPCDATA
134-
#self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
133+
#state already is data state
134+
#self.tokenizer.state =self.tokenizer.dataState
135135
pass
136136
self.phase=self.phases["beforeHtml"]
137137
self.phase.insertHtmlElement()
@@ -406,7 +406,7 @@ def parseRCDataRawtext(self, token, contentType):
406406

407407
self.originalPhase=self.phase
408408

409-
self.phase=self.phases["inRCDataRawtext"]
409+
self.phase=self.phases["text"]
410410

411411
classPhase(object):
412412
"""Base class for helper object that implements each phase of processing
@@ -636,8 +636,12 @@ def processStartTag(self, token):
636636
self.parser.phase.processStartTag(token)
637637

638638
defprocessEndTag(self,token):
639-
self.insertHtmlElement()
640-
self.parser.phase.processEndTag(token)
639+
iftoken["name"]notin ("head","body","html","br"):
640+
self.parser.parseError("unexpected-end-tag-before-html",
641+
{"name":token["name"]})
642+
else:
643+
self.insertHtmlElement()
644+
self.parser.phase.processEndTag(token)
641645

642646

643647
classBeforeHeadPhase(Phase):
@@ -651,7 +655,7 @@ def __init__(self, parser, tree):
651655
self.startTagHandler.default=self.startTagOther
652656

653657
self.endTagHandler=utils.MethodDispatcher([
654-
(("head","br"),self.endTagImplyHead)
658+
(("head","body","html","br"),self.endTagImplyHead)
655659
])
656660
self.endTagHandler.default=self.endTagOther
657661

@@ -666,6 +670,9 @@ def processCharacters(self, token):
666670
self.startTagHead(impliedTagToken("head","StartTag"))
667671
self.parser.phase.processCharacters(token)
668672

673+
defstartTagHtml(self,token):
674+
self.parser.phases["inBody"].processStartTag(token)
675+
669676
defstartTagHead(self,token):
670677
self.tree.insertElement(token)
671678
self.tree.headPointer=self.tree.openElements[-1]
@@ -692,8 +699,8 @@ def __init__(self, parser, tree):
692699
("title",self.startTagTitle),
693700
(("noscript","noframes","style"),self.startTagNoScriptNoFramesStyle),
694701
("script",self.startTagScript),
695-
(("base","link","command","eventsource"),
696-
self.startTagBaseLinkCommandEventsource),
702+
(("base","link","command"),
703+
self.startTagBaseLinkCommand),
697704
("meta",self.startTagMeta),
698705
("head",self.startTagHead)
699706
])
@@ -728,7 +735,7 @@ def startTagHtml(self, token):
728735
defstartTagHead(self,token):
729736
self.parser.parseError("two-heads-are-not-better-than-one")
730737

731-
defstartTagBaseLinkCommandEventsource(self,token):
738+
defstartTagBaseLinkCommand(self,token):
732739
self.tree.insertElement(token)
733740
self.tree.openElements.pop()
734741
token["selfClosingAcknowledged"]=True
@@ -757,9 +764,10 @@ def startTagNoScriptNoFramesStyle(self, token):
757764
self.parser.parseRCDataRawtext(token,"RAWTEXT")
758765

759766
defstartTagScript(self,token):
760-
#I think this is equivalent to the RAWTEXT stuff since we don't execute script
761-
#self.tree.insertElement(token)
762-
self.parser.parseRCDataRawtext(token,"RAWTEXT")
767+
self.tree.insertElement(token)
768+
self.parser.tokenizer.state=self.parser.tokenizer.scriptDataState
769+
self.parser.originalPhase=self.parser.phase
770+
self.parser.phase=self.parser.phases["text"]
763771

764772
defstartTagOther(self,token):
765773
self.anythingElse()
@@ -838,7 +846,6 @@ def startTagOther(self, token):
838846
self.parser.phase.processStartTag(token)
839847

840848
defendTagHtmlBodyBr(self,token):
841-
#This is not currently in the spec
842849
self.anythingElse()
843850
self.parser.phase.processEndTag(token)
844851

@@ -852,8 +859,8 @@ def anythingElse(self):
852859

853860

854861
classInBodyPhase(Phase):
855-
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
856-
# the crazy mode
862+
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
863+
# the really-really-really-very crazy mode
857864
def__init__(self,parser,tree):
858865
Phase.__init__(self,parser,tree)
859866

@@ -862,15 +869,16 @@ def __init__(self, parser, tree):
862869

863870
self.startTagHandler=utils.MethodDispatcher([
864871
("html",self.startTagHtml),
865-
(("base","link","meta","script","style","title"),
866-
self.startTagProcessInHead),
872+
(("base","command","link","meta","noframes","script","style",
873+
"title"),self.startTagProcessInHead),
867874
("body",self.startTagBody),
868875
("frameset",self.startTagFrameset),
869876
(("address","article","aside","blockquote","center","datagrid",
870-
"details","dialog","dir","div","dl","fieldset","figure",
871-
"footer","h1","h2","h3","h4","h5","h6","header","listing",
872-
"menu","nav","ol","p","pre","section","ul"),
877+
"details","dir","div","dl","fieldset","figure",
878+
"footer","header","hgroup","menu","nav","ol","p",
879+
"section","ul"),
873880
self.startTagCloseP),
881+
(("pre","listing"),self.startTagPreListing),
874882
("form",self.startTagForm),
875883
(("li","dd","dt"),self.startTagListItem),
876884
("plaintext",self.startTagPlaintext),
@@ -899,18 +907,17 @@ def __init__(self, parser, tree):
899907
(("svg"),self.startTagSvg),
900908
(("caption","col","colgroup","frame","head",
901909
"tbody","td","tfoot","th","thead",
902-
"tr"),self.startTagMisplaced),
903-
(("event-source","command"),self.startTagNew)
910+
"tr"),self.startTagMisplaced)
904911
])
905912
self.startTagHandler.default=self.startTagOther
906913

907914
self.endTagHandler=utils.MethodDispatcher([
908915
("body",self.endTagBody),
909916
("html",self.endTagHtml),
910917
(("address","article","aside","blockquote","center","datagrid",
911-
"details","dialog","dir","div","dl","fieldset","figure",
912-
"footer","header","listing","menu","nav","ol","pre","section",
913-
"ul"),self.endTagBlock),
918+
"details","dir","div","dl","fieldset","figure",
919+
"footer","header","hgroup","listing","menu","nav","ol","pre",
920+
"section","ul"),self.endTagBlock),
914921
("form",self.endTagForm),
915922
("p",self.endTagP),
916923
(("dd","dt","li"),self.endTagListItem),
@@ -953,14 +960,10 @@ def processSpaceCharactersDropNewline(self, token):
953960
self.tree.insertText(data)
954961

955962
defprocessCharacters(self,token):
956-
# XXX The specification says to do this for every character at the
957-
# moment, but apparently that doesn't match the real world so we don't
958-
# do it for space characters.
959963
self.tree.reconstructActiveFormattingElements()
960964
self.tree.insertText(token["data"])
961965
self.parser.framesetOK=False
962966

963-
#This matches the current spec but may not match the real world
964967
defprocessSpaceCharacters(self,token):
965968
self.tree.reconstructActiveFormattingElements()
966969
self.tree.insertText(token["data"])
@@ -996,9 +999,13 @@ def startTagCloseP(self, token):
996999
ifself.tree.elementInScope("p"):
9971000
self.endTagP(impliedTagToken("p"))
9981001
self.tree.insertElement(token)
999-
iftoken["name"]in ("pre","listing"):
1000-
self.parser.framesetOK=False
1001-
self.processSpaceCharacters=self.processSpaceCharactersDropNewline
1002+
1003+
defstartTagPreListing(self,token):
1004+
ifself.tree.elementInScope("p"):
1005+
self.endTagP(impliedTagToken("p"))
1006+
self.tree.insertElement(token)
1007+
self.parser.framesetOK=False
1008+
self.processSpaceCharacters=self.processSpaceCharactersDropNewline
10021009

10031010
defstartTagForm(self,token):
10041011
ifself.tree.formPointer:
@@ -1035,23 +1042,14 @@ def startTagPlaintext(self, token):
10351042
ifself.tree.elementInScope("p"):
10361043
self.endTagP(impliedTagToken("p"))
10371044
self.tree.insertElement(token)
1038-
self.parser.tokenizer.contentModelFlag=contentModelFlags["PLAINTEXT"]
1045+
self.parser.tokenizer.state=self.parser.tokenizer.plaintextState
10391046

10401047
defstartTagHeading(self,token):
10411048
ifself.tree.elementInScope("p"):
10421049
self.endTagP(impliedTagToken("p"))
10431050
ifself.tree.openElements[-1].nameinheadingElements:
10441051
self.parser.parseError("unexpected-start-tag", {"name":token["name"]})
10451052
self.tree.openElements.pop()
1046-
# Uncomment the following for IE7 behavior:
1047-
#
1048-
#for item in headingElements:
1049-
# if self.tree.elementInScope(item):
1050-
# self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1051-
# item = self.tree.openElements.pop()
1052-
# while item.name not in headingElements:
1053-
# item = self.tree.openElements.pop()
1054-
# break
10551053
self.tree.insertElement(token)
10561054

10571055
defstartTagA(self,token):
@@ -1175,9 +1173,8 @@ def startTagIsIndex(self, token):
11751173
self.processEndTag(impliedTagToken("form"))
11761174

11771175
defstartTagTextarea(self,token):
1178-
# XXX Form element pointer checking here as well...
11791176
self.tree.insertElement(token)
1180-
self.parser.tokenizer.contentModelFlag=contentModelFlags["RCDATA"]
1177+
self.parser.tokenizer.state=self.parser.tokenizer.rcdataState
11811178
self.processSpaceCharacters=self.processSpaceCharactersDropNewline
11821179
self.parser.framesetOK=False
11831180

@@ -1257,16 +1254,6 @@ def startTagMisplaced(self, token):
12571254
"""
12581255
self.parser.parseError("unexpected-start-tag-ignored", {"name":token["name"]})
12591256

1260-
defstartTagNew(self,token):
1261-
"""New HTML5 elements, "event-source", "section", "nav",
1262-
"article", "aside", "header", "footer", "datagrid", "command"
1263-
"""
1264-
#2007-08-30 - MAP - commenting out this write to sys.stderr because
1265-
# it's really annoying me when I run the validator tests
1266-
#sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
1267-
self.startTagOther(token)
1268-
#raise NotImplementedError
1269-
12701257
defstartTagOther(self,token):
12711258
self.tree.reconstructActiveFormattingElements()
12721259
self.tree.insertElement(token)
@@ -1285,19 +1272,16 @@ def endTagP(self, token):
12851272
node=self.tree.openElements.pop()
12861273

12871274
defendTagBody(self,token):
1288-
# XXX Need to take open <p> tags into account here. We shouldn't imply
1289-
# </p> but we should not throw a parse error either. Specification is
1290-
# likely to be updated.
1291-
if (len(self.tree.openElements)==1or
1292-
self.tree.openElements[1].name!="body"):
1293-
# innerHTML case
1275+
ifnotself.tree.elementInScope("body"):
12941276
self.parser.parseError()
12951277
return
12961278
elifself.tree.openElements[-1].name!="body":
12971279
fornodeinself.tree.openElements[2:]:
1298-
ifnode.namenotinfrozenset(("dd","dt","li","p",
1280+
ifnode.namenotinfrozenset(("dd","dt","li","optgroup",
1281+
"option","p","rp","rt",
12991282
"tbody","td","tfoot",
1300-
"th","thead","tr")):
1283+
"th","thead","tr","body",
1284+
"html")):
13011285
#Not sure this is the correct name for the parse error
13021286
self.parser.parseError(
13031287
"expected-one-end-tag-but-got-another",
@@ -1524,7 +1508,7 @@ def endTagOther(self, token):
15241508
self.parser.parseError("unexpected-end-tag", {"name":token["name"]})
15251509
break
15261510

1527-
classInRCDataRawtextPhase(Phase):
1511+
classTextPhase(Phase):
15281512
def__init__(self,parser,tree):
15291513
Phase.__init__(self,parser,tree)
15301514
self.startTagHandler=utils.MethodDispatcher([])

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp