Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit f644865

Browse files
Drew Hubl, gsnedders
Drew Hubl
authored and gsnedders committed
Allow the data URI scheme, a whitelist for content types, and update tests to correctly check URIs
1 parent b51828b, commit f644865

File tree

3 files changed

+49
-12
lines changed

3 files changed

+49
-12
lines changed

‎AUTHORS.rst

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,4 +32,6 @@ Patches and suggestions
3232
- Juan Carlos Garcia Segovia
3333
- Mike West
3434
- Marc DM
35+
- Drew Hubl
36+
- Austin Kumbera
3537
- Jim Baker

‎html5lib/sanitizer.py

Lines changed: 30 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,26 @@
22

33
importre
44
fromxml.sax.saxutilsimportescape,unescape
5+
fromsix.movesimporturllib_parseasurlparse
56

67
from .tokenizerimportHTMLTokenizer
78
from .constantsimporttokenTypes
89

910

11+
content_type_rgx=re.compile(r'''
12+
^
13+
# Match a content type <application>/<type>
14+
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
15+
# Match any character set and encoding
16+
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
17+
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
18+
# Assume the rest is data
19+
,.*
20+
$
21+
''',
22+
re.VERBOSE)
23+
24+
1025
classHTMLSanitizerMixin(object):
1126
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
1227

@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
138153
acceptable_protocols= ['ed2k','ftp','http','https','irc',
139154
'mailto','news','gopher','nntp','telnet','webcal',
140155
'xmpp','callto','feed','urn','aim','rsync','tag',
141-
'ssh','sftp','rtsp','afs']
156+
'ssh','sftp','rtsp','afs','data']
157+
158+
acceptable_content_types= ['image/png','image/jpeg','image/gif','image/webp','image/bmp','text/plain']
142159

143160
# subclasses may define their own versions of these constants
144161
allowed_elements=acceptable_elements+mathml_elements+svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
147164
allowed_css_keywords=acceptable_css_keywords
148165
allowed_svg_properties=acceptable_svg_properties
149166
allowed_protocols=acceptable_protocols
167+
allowed_content_types=acceptable_content_types
150168

151169
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
152170
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):
189207
unescape(attrs[attr])).lower()
190208
# remove replacement characters from unescaped characters
191209
val_unescaped=val_unescaped.replace("\ufffd","")
192-
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped)and
193-
(val_unescaped.split(':')[0]notin
194-
self.allowed_protocols)):
195-
delattrs[attr]
210+
uri=urlparse.urlparse(val_unescaped)
211+
ifuri:
212+
ifuri.schemenotinself.allowed_protocols:
213+
delattrs[attr]
214+
ifuri.scheme=='data':
215+
m=content_type_rgx.match(uri.path)
216+
ifnotm:
217+
delattrs[attr]
218+
ifm.group('content_type')notinself.allowed_content_types:
219+
delattrs[attr]
220+
196221
forattrinself.svg_attr_val_allows_ref:
197222
ifattrinattrs:
198223
attrs[attr]=re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',

‎html5lib/tests/test_sanitizer.py

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -80,9 +80,12 @@ def test_sanitizer():
8080
continue# TODO
8181
ifattribute_name=='style':
8282
continue
83+
attribute_value='foo'
84+
ifattribute_nameinsanitizer.HTMLSanitizer.attr_val_is_uri:
85+
attribute_value='%s://sub.domain.tld/path/object.ext'%sanitizer.HTMLSanitizer.allowed_protocols[0]
8386
yield (runSanitizerTest,"test_should_allow_%s_attribute"%attribute_name,
84-
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"%attribute_name,
85-
"<p %s='foo'>foo <bad>bar</bad> baz</p>"%attribute_name,
87+
"<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"%(attribute_name,attribute_value),
88+
"<p %s='%s'>foo <bad>bar</bad> baz</p>"%(attribute_name,attribute_value),
8689
toxml)
8790

8891
forattribute_nameinsanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@ def test_sanitizer():
9396
toxml)
9497

9598
forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:
96-
yield (runSanitizerTest,"test_should_allow_%s_uris"%protocol,
97-
"<a href=\"%s\">foo</a>"%protocol,
98-
"""<a href="%s">foo</a>"""%protocol,
99+
rest_of_uri='//sub.domain.tld/path/object.ext'
100+
ifprotocol=='data':
101+
rest_of_uri='image/png;base64,aGVsbG8gd29ybGQ='
102+
yield (runSanitizerTest,"test_should_allow_uppercase_%s_uris"%protocol,
103+
"<img src=\"%s:%s\">foo</a>"% (protocol,rest_of_uri),
104+
"""<img src="%s:%s">foo</a>"""% (protocol,rest_of_uri),
99105
toxml)
100106

101107
forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:
108+
rest_of_uri='//sub.domain.tld/path/object.ext'
109+
ifprotocol=='data':
110+
rest_of_uri='image/png;base64,aGVsbG8gd29ybGQ='
111+
protocol=protocol.upper()
102112
yield (runSanitizerTest,"test_should_allow_uppercase_%s_uris"%protocol,
103-
"<a href=\"%s\">foo</a>"%protocol,
104-
"""<a href="%s">foo</a>"""%protocol,
113+
"<img src=\"%s:%s\">foo</a>"%(protocol,rest_of_uri),
114+
"""<img src="%s:%s">foo</a>"""%(protocol,rest_of_uri),
105115
toxml)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp