55except ImportError :
66import simplejson as json
77
8- from html5lib import html5parser ,sanitizer ,constants
8+ from html5lib import html5parser ,sanitizer ,constants , treebuilders
99
1010
11- def runSanitizerTest (name ,expected ,input ):
12- expected = '' .join ([token .toxml ()for token in html5parser .HTMLParser ().
13- parseFragment (expected ).childNodes ])
11+ def toxmlFactory ():
12+ tree = treebuilders .getTreeBuilder ("etree" )
13+
14+ def toxml (element ):
15+ # encode/decode roundtrip required for Python 2.6 compatibility
16+ result_bytes = tree .implementation .tostring (element ,encoding = "utf-8" )
17+ return result_bytes .decode ("utf-8" )
18+
19+ return toxml
20+
21+
22+ def runSanitizerTest (name ,expected ,input ,toxml = None ):
23+ if toxml is None :
24+ toxml = toxmlFactory ()
25+ expected = '' .join ([toxml (token )for token in html5parser .HTMLParser ().
26+ parseFragment (expected )])
1427expected = json .loads (json .dumps (expected ))
1528assert expected == sanitize_html (input )
1629
1730
18- def sanitize_html (stream ):
19- return '' .join ([token .toxml ()for token in
31+ def sanitize_html (stream ,toxml = None ):
32+ if toxml is None :
33+ toxml = toxmlFactory ()
34+ return '' .join ([toxml (token )for token in
2035html5parser .HTMLParser (tokenizer = sanitizer .HTMLSanitizer ).
21- parseFragment (stream ). childNodes ])
36+ parseFragment (stream )])
2237
2338
2439def test_should_handle_astral_plane_characters ():
25- assert "<p >\U0001d4b5 \U0001d538 </p>" == sanitize_html ("<p>𝒵 𝔸</p>" )
40+ assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml" >\U0001d4b5 \U0001d538 </html:p>' == sanitize_html ("<p>𝒵 𝔸</p>" )
2641
2742
2843def test_sanitizer ():
44+ toxml = toxmlFactory ()
2945for tag_name in sanitizer .HTMLSanitizer .allowed_elements :
3046if tag_name in ['caption' ,'col' ,'colgroup' ,'optgroup' ,'option' ,'table' ,'tbody' ,'td' ,'tfoot' ,'th' ,'thead' ,'tr' ]:
3147continue # TODO
@@ -34,25 +50,30 @@ def test_sanitizer():
3450if tag_name == 'image' :
3551yield (runSanitizerTest ,"test_should_allow_%s_tag" % tag_name ,
3652"<img title=\" 1\" />foo <bad>bar</bad> baz" ,
37- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ))
53+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ),
54+ toxml )
3855elif tag_name == 'br' :
3956yield (runSanitizerTest ,"test_should_allow_%s_tag" % tag_name ,
4057"<br title=\" 1\" />foo <bad>bar</bad> baz<br/>" ,
41- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ))
58+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ),
59+ toxml )
4260elif tag_name in constants .voidElements :
4361yield (runSanitizerTest ,"test_should_allow_%s_tag" % tag_name ,
4462"<%s title=\" 1\" />foo <bad>bar</bad> baz" % tag_name ,
45- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ))
63+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ),
64+ toxml )
4665else :
4766yield (runSanitizerTest ,"test_should_allow_%s_tag" % tag_name ,
4867"<%s title=\" 1\" >foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ),
49- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ))
68+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ),
69+ toxml )
5070
5171for tag_name in sanitizer .HTMLSanitizer .allowed_elements :
5272tag_name = tag_name .upper ()
5373yield (runSanitizerTest ,"test_should_forbid_%s_tag" % tag_name ,
5474"<%s title=\" 1\" >foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ),
55- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ))
75+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name ,tag_name ),
76+ toxml )
5677
5778for attribute_name in sanitizer .HTMLSanitizer .allowed_attributes :
5879if attribute_name != attribute_name .lower ():
@@ -61,20 +82,24 @@ def test_sanitizer():
6182continue
6283yield (runSanitizerTest ,"test_should_allow_%s_attribute" % attribute_name ,
6384"<p %s=\" foo\" >foo <bad>bar</bad> baz</p>" % attribute_name ,
64- "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name )
85+ "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name ,
86+ toxml )
6587
6688for attribute_name in sanitizer .HTMLSanitizer .allowed_attributes :
6789attribute_name = attribute_name .upper ()
6890yield (runSanitizerTest ,"test_should_forbid_%s_attribute" % attribute_name ,
6991"<p>foo <bad>bar</bad> baz</p>" ,
70- "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name )
92+ "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name ,
93+ toxml )
7194
7295for protocol in sanitizer .HTMLSanitizer .allowed_protocols :
7396yield (runSanitizerTest ,"test_should_allow_%s_uris" % protocol ,
7497"<a href=\" %s\" >foo</a>" % protocol ,
75- """<a href="%s">foo</a>""" % protocol )
98+ """<a href="%s">foo</a>""" % protocol ,
99+ toxml )
76100
77101for protocol in sanitizer .HTMLSanitizer .allowed_protocols :
78102yield (runSanitizerTest ,"test_should_allow_uppercase_%s_uris" % protocol ,
79103"<a href=\" %s\" >foo</a>" % protocol ,
80- """<a href="%s">foo</a>""" % protocol )
104+ """<a href="%s">foo</a>""" % protocol ,
105+ toxml )