2
2
3
3
import re
4
4
from xml .sax .saxutils import escape ,unescape
5
+ from six .moves import urllib_parse as urlparse
5
6
6
7
from .tokenizer import HTMLTokenizer
7
8
from .constants import tokenTypes
8
9
9
10
11
# Pattern applied to the path component of a parsed ``data:`` URI (the text
# after "data:").  It captures the media type in the ``content_type`` group,
# tolerates an optional charset and/or base64 marker in either order, and
# requires a comma followed by the payload.
content_type_rgx = re.compile(r'''
                                ^
                                # Match a content type <application>/<type>
                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                                # Match any character set and encoding
                                (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                  |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                                # Assume the rest is data
                                ,.*
                                $
                                ''',
                              re.VERBOSE)
23
+
24
+
10
25
class HTMLSanitizerMixin (object ):
11
26
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
12
27
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
138
153
acceptable_protocols = ['ed2k' ,'ftp' ,'http' ,'https' ,'irc' ,
139
154
'mailto' ,'news' ,'gopher' ,'nntp' ,'telnet' ,'webcal' ,
140
155
'xmpp' ,'callto' ,'feed' ,'urn' ,'aim' ,'rsync' ,'tag' ,
141
- 'ssh' ,'sftp' ,'rtsp' ,'afs' ]
156
+ 'ssh' ,'sftp' ,'rtsp' ,'afs' ,'data' ]
157
+
158
+ acceptable_content_types = ['image/png' ,'image/jpeg' ,'image/gif' ,'image/webp' ,'image/bmp' ,'text/plain' ]
142
159
143
160
# subclasses may define their own versions of these constants
144
161
allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
147
164
allowed_css_keywords = acceptable_css_keywords
148
165
allowed_svg_properties = acceptable_svg_properties
149
166
allowed_protocols = acceptable_protocols
167
+ allowed_content_types = acceptable_content_types
150
168
151
169
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
152
170
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):
189
207
unescape (attrs [attr ])).lower ()
190
208
# remove replacement characters from unescaped characters
191
209
val_unescaped = val_unescaped .replace ("\ufffd " ,"" )
192
- if (re .match ("^[a-z0-9][-+.a-z0-9]*:" ,val_unescaped )and
193
- (val_unescaped .split (':' )[0 ]not in
194
- self .allowed_protocols )):
195
- del attrs [attr ]
210
# Parse the cleaned attribute value to find its scheme.  urlparse
# raises ValueError on some malformed input (for example an
# unbalanced "[" in the network location); such a value cannot be
# vetted, so the attribute is dropped.
try:
    uri = urlparse.urlparse(val_unescaped)
except ValueError:
    uri = None
    del attrs[attr]
# Only values that carry an explicit scheme are checked; scheme-less
# (relative) references are kept.  A ParseResult is always truthy, so
# the scheme itself must be tested.
if uri is not None and uri.scheme:
    if uri.scheme not in self.allowed_protocols:
        del attrs[attr]
    elif uri.scheme == 'data':
        # A data: URI is kept only when its media type parses and is
        # whitelisted.  The single combined test avoids calling
        # .group() on a failed match and avoids deleting the
        # attribute twice.
        m = content_type_rgx.match(uri.path)
        if (not m or
                m.group('content_type') not in self.allowed_content_types):
            del attrs[attr]
220
+
196
221
for attr in self .svg_attr_val_allows_ref :
197
222
if attr in attrs :
198
223
attrs [attr ]= re .sub (r'url\s*\(\s*[^#\s][^)]+?\)' ,