
Commit 38ec086

Move token normalisation to the tokenizer

1 parent b2e4802 commit 38ec086

6 files changed: +151 −68 lines changed

html5lib/_tokenizer.py

Lines changed: 15 additions & 1 deletion

@@ -2,7 +2,8 @@
 
 from six import unichr as chr
 
-from collections import deque
+from collections import deque, OrderedDict
+from sys import version_info
 
 from .constants import spaceCharacters
 from .constants import entities
@@ -17,6 +18,11 @@
 
 entitiesTrie = Trie(entities)
 
+if version_info >= (3, 7):
+    attributeMap = dict
+else:
+    attributeMap = OrderedDict
+
 
 class HTMLTokenizer(object):
     """ This class takes care of tokenizing HTML.
@@ -228,6 +234,14 @@ def emitCurrentToken(self):
         # Add token to the queue to be yielded
         if (token["type"] in tagTokenTypes):
             token["name"] = token["name"].translate(asciiUpper2Lower)
+            if token["type"] == tokenTypes["StartTag"]:
+                raw = token["data"]
+                data = attributeMap(raw)
+                if len(raw) > len(data):
+                    # we had some duplicated attribute, fix so first wins
+                    data.update(raw[::-1])
+                token["data"] = data
+
         if token["type"] == tokenTypes["EndTag"]:
             if token["data"]:
                 self.tokenQueue.append({"type": tokenTypes["ParseError"],
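Note: the duplicate-attribute handling added above leans on two facts: plain dict preserves insertion order from Python 3.7 onward (hence the attributeMap alias, falling back to OrderedDict on older interpreters), and rebuilding the mapping from the reversed pair list makes the first occurrence of each attribute win while keeping first-seen key order. A minimal sketch of the trick in isolation (the pair list is illustrative, not from the commit):

    import sys
    from collections import OrderedDict

    # Same selection as the diff: dict only guarantees insertion order on 3.7+.
    attributeMap = dict if sys.version_info >= (3, 7) else OrderedDict

    raw = [('a', '1'), ('b', '2'), ('a', '3')]  # hypothetical duplicated 'a'
    data = attributeMap(raw)                    # last occurrence wins: a == '3'
    if len(raw) > len(data):                    # a length mismatch signals duplicates
        # Re-apply the pairs in reverse so the first occurrence lands last;
        # update() overwrites values but keeps the existing key order.
        data.update(raw[::-1])

    assert list(data.items()) == [('a', '1'), ('b', '2')]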

html5lib/html5parser.py

Lines changed: 3 additions & 28 deletions

@@ -3,9 +3,6 @@
 
 import types
 
-from collections import OrderedDict
-from sys import version_info
-
 from . import _inputstream
 from . import _tokenizer
 
@@ -26,12 +23,6 @@
 )
 
 
-if version_info >= (3, 7):
-    attributeMap = dict
-else:
-    attributeMap = OrderedDict
-
-
 def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
     """Parse an HTML document as a string or file-like object into a tree
 
@@ -210,7 +201,7 @@ def mainLoop(self):
         DoctypeToken = tokenTypes["Doctype"]
         ParseErrorToken = tokenTypes["ParseError"]
 
-        for token in self.normalizedTokens():
+        for token in self.tokenizer:
            prev_token = None
            new_token = token
            while new_token is not None:
@@ -268,10 +259,6 @@ def mainLoop(self):
            if reprocess:
                assert self.phase not in phases
 
-    def normalizedTokens(self):
-        for token in self.tokenizer:
-            yield self.normalizeToken(token)
-
    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree
 
@@ -333,18 +320,6 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        if self.strict:
            raise ParseError(E[errorcode] % datavars)
 
-    def normalizeToken(self, token):
-        # HTML5 specific normalizations to the token stream
-        if token["type"] == tokenTypes["StartTag"]:
-            raw = token["data"]
-            data = attributeMap(raw)
-            if len(raw) > len(data):
-                # we had some duplicated attribute, fix so first wins
-                data.update(raw[::-1])
-            token["data"] = data
-
-        return token
-
    def adjustMathMLAttributes(self, token):
        adjust_attributes(token, adjustMathMLAttributes)
 
@@ -2803,8 +2778,8 @@ def processEndTag(self, token):
 def adjust_attributes(token, replacements):
     needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
     if needs_adjustment:
-        token['data'] = attributeMap((replacements.get(k, k), v)
-                                     for k, v in token['data'].items())
+        token['data'] = type(token['data'])((replacements.get(k, k), v)
+                                            for k, v in token['data'].items())
 
 
 def impliedTagToken(name, type="EndTag", attributes=None,
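Note: with normalizeToken gone, adjust_attributes can no longer rely on the parser-level attributeMap alias; rebuilding via type(token['data']) keeps whatever ordered mapping the tokenizer emitted. A rough standalone illustration, using set() in place of six's viewkeys and a made-up token (assumes Python 3.7+ dict ordering):

    def adjust_attributes(token, replacements):
        # Rename foreign-content attributes while preserving the mapping
        # type and the key order produced by the tokenizer.
        if set(token['data']) & set(replacements):
            token['data'] = type(token['data'])(
                (replacements.get(k, k), v) for k, v in token['data'].items())

    token = {'type': 'StartTag', 'name': 'svg',
             'data': {'refx': '2', 'a': '1'}}  # hypothetical token
    adjust_attributes(token, {'refx': 'refX'})
    assert list(token['data'].items()) == [('refX', '2'), ('a', '1')]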

html5lib/tests/test_parser2.py

Lines changed: 2 additions & 38 deletions

@@ -1,12 +1,12 @@
 from __future__ import absolute_import, division, unicode_literals
 
-from six import PY2, text_type, unichr
+from six import PY2, text_type
 
 import io
 
 from . import support  # noqa
 
-from html5lib.constants import namespaces, tokenTypes
+from html5lib.constants import namespaces
 from html5lib import parse, parseFragment, HTMLParser
 
 
@@ -53,42 +53,6 @@ def test_unicode_file():
     assert parse(io.StringIO("a")) is not None
 
 
-def test_maintain_attribute_order():
-    # This is here because we impl it in parser and not tokenizer
-    p = HTMLParser()
-    # generate loads to maximize the chance a hash-based mutation will occur
-    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
-    token = {'name': 'html',
-             'selfClosing': False,
-             'selfClosingAcknowledged': False,
-             'type': tokenTypes["StartTag"],
-             'data': attrs}
-    out = p.normalizeToken(token)
-    attr_order = list(out["data"].keys())
-    assert attr_order == [x for x, i in attrs]
-
-
-def test_duplicate_attribute():
-    # This is here because we impl it in parser and not tokenizer
-    doc = parse('<p class=a class=b>')
-    el = doc[1][0]
-    assert el.get("class") == "a"
-
-
-def test_maintain_duplicate_attribute_order():
-    # This is here because we impl it in parser and not tokenizer
-    p = HTMLParser()
-    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
-    token = {'name': 'html',
-             'selfClosing': False,
-             'selfClosingAcknowledged': False,
-             'type': tokenTypes["StartTag"],
-             'data': attrs + [('a', len(attrs))]}
-    out = p.normalizeToken(token)
-    attr_order = list(out["data"].keys())
-    assert attr_order == [x for x, i in attrs]
-
-
 def test_debug_log():
     parser = HTMLParser(debug=True)
     parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")
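Note: the three deleted tests reappear below in test_tokenizer2.py, rewritten against HTMLTokenizer now that normalisation happens there. The end-to-end behaviour they guarded is unchanged and can still be checked at the parse() level, mirroring the removed test_duplicate_attribute:

    from html5lib import parse

    # The first occurrence of a duplicated attribute still wins after parsing.
    doc = parse('<p class=a class=b>')
    assert doc[1][0].get("class") == "a"  # doc[1] is <body>, doc[1][0] is <p>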

html5lib/tests/test_tokenizer2.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import io
+
+from six import unichr, text_type
+
+from html5lib._tokenizer import HTMLTokenizer
+from html5lib.constants import tokenTypes
+
+
+def ignore_parse_errors(toks):
+    for tok in toks:
+        if tok['type'] != tokenTypes['ParseError']:
+            yield tok
+
+
+def test_maintain_attribute_order():
+    # generate loads to maximize the chance a hash-based mutation will occur
+    attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+    stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + ">")
+
+    toks = HTMLTokenizer(stream)
+    out = list(ignore_parse_errors(toks))
+
+    assert len(out) == 1
+    assert out[0]['type'] == tokenTypes['StartTag']
+
+    attrs_tok = out[0]['data']
+    assert len(attrs_tok) == len(attrs)
+
+    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
+        assert in_name == out_name
+        assert in_value == out_value
+
+
+def test_duplicate_attribute():
+    stream = io.StringIO("<span a=1 a=2 a=3>")
+
+    toks = HTMLTokenizer(stream)
+    out = list(ignore_parse_errors(toks))
+
+    assert len(out) == 1
+    assert out[0]['type'] == tokenTypes['StartTag']
+
+    attrs_tok = out[0]['data']
+    assert len(attrs_tok) == 1
+    assert list(attrs_tok.items()) == [('a', '1')]
+
+
+def test_maintain_duplicate_attribute_order():
+    # generate loads to maximize the chance a hash-based mutation will occur
+    attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+    stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + " a=100>")
+
+    toks = HTMLTokenizer(stream)
+    out = list(ignore_parse_errors(toks))
+
+    assert len(out) == 1
+    assert out[0]['type'] == tokenTypes['StartTag']
+
+    attrs_tok = out[0]['data']
+    assert len(attrs_tok) == len(attrs)
+
+    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
+        assert in_name == out_name
+        assert in_value == out_value

html5lib/tests/test_treewalkers.py

Lines changed: 64 additions & 0 deletions

@@ -1,7 +1,9 @@
 from __future__ import absolute_import, division, unicode_literals
 
 import itertools
+import sys
 
+from six import unichr, text_type
 import pytest
 
 try:
@@ -135,3 +137,65 @@ def test_lxml_xml():
     output = Lint(walker(lxmltree))
 
     assert list(output) == expected
+
+
+@pytest.mark.parametrize("treeName",
+                         [pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
+                                                        pytest.mark.skipif(sys.version_info < (3, 7), reason="dict order undef")])
+                          for treeName in sorted(treeTypes.keys())])
+def test_maintain_attribute_order(treeName):
+    treeAPIs = treeTypes[treeName]
+    if treeAPIs is None:
+        pytest.skip("Treebuilder not loaded")
+
+    # generate loads to maximize the chance a hash-based mutation will occur
+    attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+    data = "<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + ">"
+
+    parser = html5parser.HTMLParser(tree=treeAPIs["builder"])
+    document = parser.parseFragment(data)
+
+    document = treeAPIs.get("adapter", lambda x: x)(document)
+    output = list(Lint(treeAPIs["walker"](document)))
+
+    assert len(output) == 2
+    assert output[0]['type'] == 'StartTag'
+    assert output[1]['type'] == "EndTag"
+
+    attrs_out = output[0]['data']
+    assert len(attrs) == len(attrs_out)
+
+    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_out.items()):
+        assert (None, in_name) == out_name
+        assert in_value == out_value
+
+
+@pytest.mark.parametrize("treeName",
+                         [pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
+                                                        pytest.mark.skipif(sys.version_info < (3, 7), reason="dict order undef")])
+                          for treeName in sorted(treeTypes.keys())])
+def test_maintain_attribute_order_adjusted(treeName):
+    treeAPIs = treeTypes[treeName]
+    if treeAPIs is None:
+        pytest.skip("Treebuilder not loaded")
+
+    # generate loads to maximize the chance a hash-based mutation will occur
+    data = "<svg a=1 refx=2 b=3 xml:lang=4 c=5>"
+
+    parser = html5parser.HTMLParser(tree=treeAPIs["builder"])
+    document = parser.parseFragment(data)
+
+    document = treeAPIs.get("adapter", lambda x: x)(document)
+    output = list(Lint(treeAPIs["walker"](document)))
+
+    assert len(output) == 2
+    assert output[0]['type'] == 'StartTag'
+    assert output[1]['type'] == "EndTag"
+
+    attrs_out = output[0]['data']
+
+    assert list(attrs_out.items()) == [((None, 'a'), '1'),
+                                       ((None, 'refX'), '2'),
+                                       ((None, 'b'), '3'),
+                                       (('http://www.w3.org/XML/1998/namespace', 'lang'), '4'),
+                                       ((None, 'c'), '5')]
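Note: these walker-level tests pin down the same ordering guarantee after tree building. Tree-walker StartTag tokens key their data by (namespace, local name) tuples, and foreign-content adjustment is visible there (refx becomes refX; xml:lang moves into the XML namespace). A quick check along the same lines, a sketch assuming the default etree tree:

    import html5lib
    from html5lib import treewalkers

    fragment = html5lib.parseFragment("<svg refx=2 xml:lang=en>")
    walker = treewalkers.getTreeWalker("etree")
    start = next(tok for tok in walker(fragment) if tok['type'] == 'StartTag')

    # Attribute keys are (namespace, local-name) tuples after adjustment.
    assert start['data'][(None, 'refX')] == '2'
    assert start['data'][('http://www.w3.org/XML/1998/namespace', 'lang')] == 'en'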

html5lib/tests/tokenizer.py

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ def processDoctype(self, token):
 
     def processStartTag(self, token):
         self.outputTokens.append(["StartTag", token["name"],
-                                  dict(token["data"][::-1]), token["selfClosing"]])
+                                  token["data"], token["selfClosing"]])
 
     def processEmptyTag(self, token):
        if token["name"] not in constants.voidElements:
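Note: the dict(token["data"][::-1]) reversal in this test harness replicated the first-wins rule by hand; it is no longer needed because the tokenizer now emits token["data"] as an already-deduplicated, insertion-ordered mapping.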
