Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 96da7f5

Browse files
ambv authored and gsnedders committed
Remove simpletree, changing the default tree builder to etree.
1 parent b0dda81, commit 96da7f5

File tree

15 files changed

+95
-413
lines changed

15 files changed

+95
-413
lines changed

‎CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ Change Log
66

77
Released on XXX, 2013
88

9+
* Removed ``simpletree`` from the package. The default tree builder is
10+
now ``etree`` (using the ``xml.etree.ElementTree/cElementTree``
11+
implementation).
12+
913

1014
0.95
1115
~~~~

‎html5lib/html5parser.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
from .importtreebuilders
1010
from .treebuilders._baseimportMarker
11-
from .treebuildersimportsimpletree
1211

1312
from .importutils
1413
from .importconstants
@@ -20,15 +19,15 @@
2019
from .constantsimporthtmlIntegrationPointElements,mathmlTextIntegrationPointElements
2120

2221

23-
defparse(doc,treebuilder="simpletree",encoding=None,
22+
defparse(doc,treebuilder="etree",encoding=None,
2423
namespaceHTMLElements=True):
2524
"""Parse a string or file-like object into a tree"""
2625
tb=treebuilders.getTreeBuilder(treebuilder)
2726
p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)
2827
returnp.parse(doc,encoding=encoding)
2928

3029

31-
defparseFragment(doc,container="div",treebuilder="simpletree",encoding=None,
30+
defparseFragment(doc,container="div",treebuilder="etree",encoding=None,
3231
namespaceHTMLElements=True):
3332
tb=treebuilders.getTreeBuilder(treebuilder)
3433
p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)
@@ -51,9 +50,8 @@ class HTMLParser(object):
5150
"""HTML parser. Generates a tree structure from a stream of (possibly
5251
malformed) HTML"""
5352

54-
def__init__(self,tree=simpletree.TreeBuilder,
55-
tokenizer=tokenizer.HTMLTokenizer,strict=False,
56-
namespaceHTMLElements=True,debug=False):
53+
def__init__(self,tree=None,tokenizer=tokenizer.HTMLTokenizer,
54+
strict=False,namespaceHTMLElements=True,debug=False):
5755
"""
5856
strict - raise an exception when a parse error is encountered
5957
@@ -69,6 +67,8 @@ def __init__(self, tree=simpletree.TreeBuilder,
6967
# Raise an exception on the first error encountered
7068
self.strict=strict
7169

70+
iftreeisNone:
71+
tree=treebuilders.getTreeBuilder("etree")
7272
self.tree=tree(namespaceHTMLElements)
7373
self.tokenizer_class=tokenizer
7474
self.errors= []

‎html5lib/serializer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from .htmlserializerimportHTMLSerializer
66

77

8-
defserialize(input,tree="simpletree",format="html",encoding=None,
8+
defserialize(input,tree="etree",format="html",encoding=None,
99
**serializer_opts):
1010
# XXX: Should we cache this?
1111
walker=treewalkers.getTreeWalker(tree)

‎html5lib/tests/support.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
delbase_path
1717

1818
# Build a dict of avaliable trees
19-
treeTypes= {"simpletree":treebuilders.getTreeBuilder("simpletree"),
20-
"DOM":treebuilders.getTreeBuilder("dom")}
19+
treeTypes= {"DOM":treebuilders.getTreeBuilder("dom")}
2120

2221
# Try whatever etree implementations are avaliable from a list that are
2322
#"supposed" to work
@@ -64,7 +63,7 @@ def __getitem__(self, key):
6463

6564
classTestData(object):
6665
def__init__(self,filename,newTestHeading="data",encoding="utf8"):
67-
ifencoding==None:
66+
ifencodingisNone:
6867
self.f=open(filename,mode="rb")
6968
else:
7069
self.f=codecs.open(filename,encoding=encoding)

‎html5lib/tests/test_parser2.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from .importsupport# flake8: noqa
66
fromhtml5libimporthtml5parser
77
fromhtml5lib.constantsimportnamespaces
8-
fromhtml5lib.treebuildersimportdom
8+
fromhtml5libimporttreebuilders
99

1010
importunittest
1111

@@ -14,29 +14,42 @@
1414

1515
classMoreParserTests(unittest.TestCase):
1616

17+
defsetUp(self):
18+
self.dom_tree=treebuilders.getTreeBuilder("dom")
19+
1720
deftest_assertDoctypeCloneable(self):
18-
parser=html5parser.HTMLParser(tree=dom.TreeBuilder)
21+
parser=html5parser.HTMLParser(tree=self.dom_tree)
1922
doc=parser.parse('<!DOCTYPE HTML>')
2023
self.assertTrue(doc.cloneNode(True))
2124

2225
deftest_line_counter(self):
2326
# http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
24-
parser=html5parser.HTMLParser(tree=dom.TreeBuilder)
27+
parser=html5parser.HTMLParser(tree=self.dom_tree)
2528
parser.parse("<pre>\nx\n&gt;\n</pre>")
2629

27-
deftest_namespace_html_elements_0(self):
30+
deftest_namespace_html_elements_0_dom(self):
31+
parser=html5parser.HTMLParser(tree=self.dom_tree,namespaceHTMLElements=True)
32+
doc=parser.parse("<html></html>")
33+
self.assertTrue(doc.childNodes[0].namespaceURI==namespaces["html"])
34+
35+
deftest_namespace_html_elements_1_dom(self):
36+
parser=html5parser.HTMLParser(tree=self.dom_tree,namespaceHTMLElements=False)
37+
doc=parser.parse("<html></html>")
38+
self.assertTrue(doc.childNodes[0].namespaceURIisNone)
39+
40+
deftest_namespace_html_elements_0_etree(self):
2841
parser=html5parser.HTMLParser(namespaceHTMLElements=True)
2942
doc=parser.parse("<html></html>")
30-
self.assertTrue(doc.childNodes[0].namespace==namespaces["html"])
43+
self.assertTrue(list(doc)[0].tag=="{%s}html"% (namespaces["html"],))
3144

32-
deftest_namespace_html_elements_1(self):
45+
deftest_namespace_html_elements_1_etree(self):
3346
parser=html5parser.HTMLParser(namespaceHTMLElements=False)
3447
doc=parser.parse("<html></html>")
35-
self.assertTrue(doc.childNodes[0].namespace==None)
48+
self.assertTrue(list(doc)[0].tag=="html")
3649

3750
deftest_unicode_file(self):
3851
parser=html5parser.HTMLParser()
39-
doc=parser.parse(io.StringIO("a"))
52+
parser.parse(io.StringIO("a"))
4053

4154

4255
defbuildTestSuite():

‎html5lib/tests/test_sanitizer.py

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,43 @@
55
exceptImportError:
66
importsimplejsonasjson
77

8-
fromhtml5libimporthtml5parser,sanitizer,constants
8+
fromhtml5libimporthtml5parser,sanitizer,constants,treebuilders
99

1010

11-
defrunSanitizerTest(name,expected,input):
12-
expected=''.join([token.toxml()fortokeninhtml5parser.HTMLParser().
13-
parseFragment(expected).childNodes])
11+
deftoxmlFactory():
12+
tree=treebuilders.getTreeBuilder("etree")
13+
14+
deftoxml(element):
15+
# encode/decode roundtrip required for Python 2.6 compatibility
16+
result_bytes=tree.implementation.tostring(element,encoding="utf-8")
17+
returnresult_bytes.decode("utf-8")
18+
19+
returntoxml
20+
21+
22+
defrunSanitizerTest(name,expected,input,toxml=None):
23+
iftoxmlisNone:
24+
toxml=toxmlFactory()
25+
expected=''.join([toxml(token)fortokeninhtml5parser.HTMLParser().
26+
parseFragment(expected)])
1427
expected=json.loads(json.dumps(expected))
1528
assertexpected==sanitize_html(input)
1629

1730

18-
defsanitize_html(stream):
19-
return''.join([token.toxml()fortokenin
31+
defsanitize_html(stream,toxml=None):
32+
iftoxmlisNone:
33+
toxml=toxmlFactory()
34+
return''.join([toxml(token)fortokenin
2035
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
21-
parseFragment(stream).childNodes])
36+
parseFragment(stream)])
2237

2338

2439
deftest_should_handle_astral_plane_characters():
25-
assert"<p>\U0001d4b5\U0001d538</p>"==sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
40+
assert'<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5\U0001d538</html:p>'==sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
2641

2742

2843
deftest_sanitizer():
44+
toxml=toxmlFactory()
2945
fortag_nameinsanitizer.HTMLSanitizer.allowed_elements:
3046
iftag_namein ['caption','col','colgroup','optgroup','option','table','tbody','td','tfoot','th','thead','tr']:
3147
continue# TODO
@@ -34,25 +50,30 @@ def test_sanitizer():
3450
iftag_name=='image':
3551
yield (runSanitizerTest,"test_should_allow_%s_tag"%tag_name,
3652
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
37-
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name))
53+
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name),
54+
toxml)
3855
eliftag_name=='br':
3956
yield (runSanitizerTest,"test_should_allow_%s_tag"%tag_name,
4057
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
41-
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name))
58+
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name),
59+
toxml)
4260
eliftag_nameinconstants.voidElements:
4361
yield (runSanitizerTest,"test_should_allow_%s_tag"%tag_name,
4462
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"%tag_name,
45-
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name))
63+
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name),
64+
toxml)
4665
else:
4766
yield (runSanitizerTest,"test_should_allow_%s_tag"%tag_name,
4867
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>"% (tag_name,tag_name),
49-
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name))
68+
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name),
69+
toxml)
5070

5171
fortag_nameinsanitizer.HTMLSanitizer.allowed_elements:
5272
tag_name=tag_name.upper()
5373
yield (runSanitizerTest,"test_should_forbid_%s_tag"%tag_name,
5474
"&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;"% (tag_name,tag_name),
55-
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name))
75+
"<%s title='1'>foo <bad>bar</bad> baz</%s>"% (tag_name,tag_name),
76+
toxml)
5677

5778
forattribute_nameinsanitizer.HTMLSanitizer.allowed_attributes:
5879
ifattribute_name!=attribute_name.lower():
@@ -61,20 +82,24 @@ def test_sanitizer():
6182
continue
6283
yield (runSanitizerTest,"test_should_allow_%s_attribute"%attribute_name,
6384
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"%attribute_name,
64-
"<p %s='foo'>foo <bad>bar</bad> baz</p>"%attribute_name)
85+
"<p %s='foo'>foo <bad>bar</bad> baz</p>"%attribute_name,
86+
toxml)
6587

6688
forattribute_nameinsanitizer.HTMLSanitizer.allowed_attributes:
6789
attribute_name=attribute_name.upper()
6890
yield (runSanitizerTest,"test_should_forbid_%s_attribute"%attribute_name,
6991
"<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
70-
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>"%attribute_name)
92+
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>"%attribute_name,
93+
toxml)
7194

7295
forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:
7396
yield (runSanitizerTest,"test_should_allow_%s_uris"%protocol,
7497
"<a href=\"%s\">foo</a>"%protocol,
75-
"""<a href="%s">foo</a>"""%protocol)
98+
"""<a href="%s">foo</a>"""%protocol,
99+
toxml)
76100

77101
forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:
78102
yield (runSanitizerTest,"test_should_allow_uppercase_%s_uris"%protocol,
79103
"<a href=\"%s\">foo</a>"%protocol,
80-
"""<a href="%s">foo</a>"""%protocol)
104+
"""<a href="%s">foo</a>"""%protocol,
105+
toxml)

‎html5lib/tests/test_treewalkers.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,11 @@ def PullDOMAdapter(node):
4545
raiseNotImplementedError("Node type not supported: "+str(node.nodeType))
4646

4747
treeTypes= {
48-
"simpletree": {"builder":treebuilders.getTreeBuilder("simpletree"),
49-
"walker":treewalkers.getTreeWalker("simpletree")},
50-
"DOM": {"builder":treebuilders.getTreeBuilder("dom"),
51-
"walker":treewalkers.getTreeWalker("dom")},
52-
"PullDOM": {"builder":treebuilders.getTreeBuilder("dom"),
53-
"adapter":PullDOMAdapter,
54-
"walker":treewalkers.getTreeWalker("pulldom")},
48+
"DOM": {"builder":treebuilders.getTreeBuilder("dom"),
49+
"walker":treewalkers.getTreeWalker("dom")},
50+
"PullDOM": {"builder":treebuilders.getTreeBuilder("dom"),
51+
"adapter":PullDOMAdapter,
52+
"walker":treewalkers.getTreeWalker("pulldom")},
5553
}
5654

5755
# Try whatever etree implementations are available from a list that are
@@ -103,7 +101,7 @@ def PullDOMAdapter(node):
103101
else:
104102
defGenshiAdapter(tree):
105103
text=None
106-
fortokenintreewalkers.getTreeWalker("simpletree")(tree):
104+
fortokenintreewalkers.getTreeWalker("dom")(tree):
107105
type=token["type"]
108106
iftypein ("Characters","SpaceCharacters"):
109107
iftextisNone:
@@ -147,7 +145,7 @@ def GenshiAdapter(tree):
147145
yieldTEXT,text, (None,-1,-1)
148146

149147
treeTypes["genshi"]= \
150-
{"builder":treebuilders.getTreeBuilder("simpletree"),
148+
{"builder":treebuilders.getTreeBuilder("dom"),
151149
"adapter":GenshiAdapter,
152150
"walker":treewalkers.getTreeWalker("genshi")}
153151

‎html5lib/treebuilders/__init__.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
1) A set of classes for various types of elements: Document, Doctype,
88
Comment, Element. These must implement the interface of
99
_base.treebuilders.Node (although comment nodes have a different
10-
signature for their constructor, see treebuilders.simpletree.Comment)
10+
signature for their constructor, see treebuilders.etree.Comment)
1111
Textual content may also be implemented as another node type, or not, as
1212
your tree implementation requires.
1313
@@ -24,10 +24,6 @@
2424
testSerializer method on your treebuilder which accepts a node and
2525
returns a string containing Node and its children serialized according
2626
to the format used in the unittests
27-
28-
The supplied simpletree module provides a python-only implementation
29-
of a full treebuilder and is a useful reference for the semantics of
30-
the various methods.
3127
"""
3228

3329
from __future__importabsolute_import,division,unicode_literals
@@ -39,10 +35,8 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
3935
"""Get a TreeBuilder class for various types of tree with built-in support
4036
4137
treeType - the name of the tree type required (case-insensitive). Supported
42-
values are "simpletree", "dom", and "etree"
38+
values are:
4339
44-
"simpletree" - a built-in DOM-ish tree type with support for some
45-
more pythonic idioms.
4640
"dom" - A generic builder for DOM implementations, defaulting to
4741
a xml.dom.minidom based implementation for the sake of
4842
backwards compatibility (as releases up until 0.10 had a
@@ -65,9 +59,6 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
6559
implementation=minidom
6660
# XXX: NEVER cache here, caching is done in the dom submodule
6761
returndom.getDomModule(implementation,**kwargs).TreeBuilder
68-
eliftreeType=="simpletree":
69-
from .importsimpletree
70-
treeBuilderCache[treeType]=simpletree.TreeBuilder
7162
eliftreeType=="lxml":
7263
from .importetree_lxml
7364
treeBuilderCache[treeType]=etree_lxml.TreeBuilder

‎html5lib/treebuilders/dom.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ def insertText(self, data, parent=None):
163163
self.dom._child_node_types.append(Node.TEXT_NODE)
164164
self.dom.appendChild(self.dom.createTextNode(data))
165165

166+
implementation=DomImplementation
166167
name=None
167168

168169
deftestSerializer(element):

‎html5lib/treebuilders/etree.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ class TreeBuilder(_base.TreeBuilder):
313313
elementClass=Element
314314
commentClass=Comment
315315
fragmentClass=DocumentFragment
316+
implementation=ElementTreeImplementation
316317

317318
deftestSerializer(self,element):
318319
returntestSerializer(element)

‎html5lib/treebuilders/etree_lxml.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def serializeElement(element, indent=0):
122122
ifelement.text:
123123
rv.append("|%s\"%s\""% (' '* (indent+2),element.text))
124124
indent+=2
125-
forchildinelement.getchildren():
125+
forchildinelement:
126126
serializeElement(child,indent)
127127
ifhasattr(element,"tail")andelement.tail:
128128
rv.append("|%s\"%s\""% (' '* (indent-2),element.tail))
@@ -163,7 +163,7 @@ def serializeElement(element):
163163
ifelement.text:
164164
rv.append(element.text)
165165

166-
forchildinelement.getchildren():
166+
forchildinelement:
167167
serializeElement(child)
168168

169169
rv.append("</%s>"% (element.tag,))
@@ -185,6 +185,7 @@ class TreeBuilder(_base.TreeBuilder):
185185
elementClass=None
186186
commentClass=None
187187
fragmentClass=Document
188+
implementation=etree
188189

189190
def__init__(self,namespaceHTMLElements,fullTree=False):
190191
builder=etree_builders.getETreeModule(etree,fullTree=fullTree)
@@ -280,7 +281,7 @@ def getFragment(self):
280281
element=self.openElements[0]._element
281282
ifelement.text:
282283
fragment.append(element.text)
283-
fragment.extend(element.getchildren())
284+
fragment.extend(list(element))
284285
ifelement.tail:
285286
fragment.append(element.tail)
286287
returnfragment

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp