Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a265d2d

Browse files
Drew HublCaptainCodeman
Drew Hubl
authored and committed
Allow the data URI scheme, a whitelist for content types, and update tests to correctly check URIs
1 parent b0c3975, commit a265d2d

File tree

3 files changed

+65
-12
lines changed

3 files changed

+65
-12
lines changed

‎html5lib/sanitizer.py‎

Lines changed: 47 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,10 @@
22

33
importre
44
fromxml.sax.saxutilsimportescape,unescape
5+
try:
6+
fromurllib.parseimporturlparse
7+
exceptImportError:
8+
fromurlparseimporturlparse
59

610
from .tokenizerimportHTMLTokenizer
711
from .constantsimporttokenTypes
@@ -140,13 +144,16 @@ class HTMLSanitizerMixin(object):
140144
'xmpp','callto','feed','urn','aim','rsync','tag',
141145
'ssh','sftp','rtsp','afs','data']
142146

147+
acceptable_content_types= ['image/png','image/jpeg','image/gif','image/webp','image/bmp','text/plain']
148+
143149
# subclasses may define their own versions of these constants
144150
allowed_elements=acceptable_elements+mathml_elements+svg_elements
145151
allowed_attributes=acceptable_attributes+mathml_attributes+svg_attributes
146152
allowed_css_properties=acceptable_css_properties
147153
allowed_css_keywords=acceptable_css_keywords
148154
allowed_svg_properties=acceptable_svg_properties
149155
allowed_protocols=acceptable_protocols
156+
allowed_content_types=acceptable_content_types
150157

151158
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
152159
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +196,46 @@ def allowed_token(self, token, token_type):
189196
unescape(attrs[attr])).lower()
190197
# remove replacement characters from unescaped characters
191198
val_unescaped=val_unescaped.replace("\ufffd","")
192-
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped)and
193-
(val_unescaped.split(':')[0]notin
194-
self.allowed_protocols)):
195-
delattrs[attr]
199+
uri=urlparse(val_unescaped)
200+
ifuri:
201+
ifuri.schemenotinself.allowed_protocols:
202+
delattrs[attr]
203+
rgx=re.compile(r'''
204+
^
205+
# Match a content type <application>/<type>
206+
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
207+
# Match any character set and encoding
208+
# Note that this does not prevent the
209+
# same one being set twice
210+
# The charset group is currently unused
211+
(?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
212+
# Match the base64-encoded or urlencoded
213+
# data
214+
# The data group is currently unused
215+
(?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
216+
$
217+
''',
218+
re.VERBOSE)
219+
ifuri.scheme=='data':
220+
m=rgx.match(uri.path)
221+
ifnotm:
222+
delattrs[attr]
223+
ifm.group('content_type')notinself.allowed_content_types:
224+
delattrs[attr]
225+
ifm.group('encoding'):
226+
ifm.group('encoding')=='base64':
227+
# If the encoding identifier is base64, then
228+
# make sure the data is encoded in base64
229+
ifnotm.group('base64_encoded_data'):
230+
delattrs[attr]
231+
else:
232+
delattrs[attr]
233+
else:
234+
# If the encoding is not given, expect the data to
235+
# be urlencoded
236+
ifnotm.group('url_encoded_data'):
237+
delattrs[attr]
238+
196239
forattrinself.svg_attr_val_allows_ref:
197240
ifattrinattrs:
198241
attrs[attr]=re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',

‎html5lib/tests/test_sanitizer.py‎

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -80,9 +80,12 @@ def test_sanitizer():
8080
continue# TODO
8181
ifattribute_name=='style':
8282
continue
83+
attribute_value='foo'
84+
ifattribute_nameinsanitizer.HTMLSanitizer.attr_val_is_uri:
85+
attribute_value='http://sub.domain.tld/path/object.ext'
8386
yield (runSanitizerTest,"test_should_allow_%s_attribute"%attribute_name,
84-
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"%attribute_name,
85-
"<p %s='foo'>foo <bad>bar</bad> baz</p>"%attribute_name,
87+
"<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"%(attribute_name,attribute_value),
88+
"<p %s='%s'>foo <bad>bar</bad> baz</p>"%(attribute_name,attribute_value),
8689
toxml)
8790

8891
forattribute_nameinsanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@ def test_sanitizer():
9396
toxml)
9497

9598
forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:
96-
yield (runSanitizerTest,"test_should_allow_%s_uris"%protocol,
97-
"<a href=\"%s\">foo</a>"%protocol,
98-
"""<a href="%s">foo</a>"""%protocol,
99+
rest_of_uri='//sub.domain.tld/path/object.ext'
100+
ifprotocol=='data':
101+
rest_of_uri='image/png;base64,aGVsbG8gd29ybGQ='
102+
yield (runSanitizerTest,"test_should_allow_uppercase_%s_uris"%protocol,
103+
"<img src=\"%s:%s\">foo</a>"% (protocol,rest_of_uri),
104+
"""<img src="%s:%s">foo</a>"""% (protocol,rest_of_uri),
99105
toxml)
100106

101107
forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:
108+
rest_of_uri='//sub.domain.tld/path/object.ext'
109+
ifprotocol=='data':
110+
rest_of_uri='image/png;base64,aGVsbG8gd29ybGQ='
111+
protocol=protocol.upper()
102112
yield (runSanitizerTest,"test_should_allow_uppercase_%s_uris"%protocol,
103-
"<a href=\"%s\">foo</a>"%protocol,
104-
"""<a href="%s">foo</a>"""%protocol,
113+
"<img src=\"%s:%s\">foo</a>"%(protocol,rest_of_uri),
114+
"""<img src="%s:%s">foo</a>"""%(protocol,rest_of_uri),
105115
toxml)

‎html5lib/tests/testdata‎

Submodule testdata updated (62 files)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp