22
33import re
44from xml .sax .saxutils import escape ,unescape
5+ from six .moves import urllib_parse as urlparse
56
67from .tokenizer import HTMLTokenizer
78from .constants import tokenTypes
89
910
# Matches the path component of a ``data:`` URI (the text after the
# ``data:`` scheme) and captures its MIME type in the ``content_type`` group.
# Accepted shape: <type>/<subtype>, followed by optional ``;charset=<name>``
# and ``;base64`` parameters in either order, then a comma and the payload.
# Any other parameter (e.g. ``;name=x``) makes the match fail.
# Compiled with re.VERBOSE, so whitespace and ``#`` comments inside the
# pattern literal are ignored by the regex engine.
content_type_rgx = re.compile(r'''
                              ^
                              # Match a content type <application>/<type>
                              (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                              # Match any character set and encoding
                              (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                              # Assume the rest is data
                              ,.*
                              $
                              ''',
                              re.VERBOSE)
23+
24+
1025class HTMLSanitizerMixin (object ):
1126""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
1227
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
138153acceptable_protocols = ['ed2k' ,'ftp' ,'http' ,'https' ,'irc' ,
139154'mailto' ,'news' ,'gopher' ,'nntp' ,'telnet' ,'webcal' ,
140155'xmpp' ,'callto' ,'feed' ,'urn' ,'aim' ,'rsync' ,'tag' ,
141- 'ssh' ,'sftp' ,'rtsp' ,'afs' ]
156+ 'ssh' ,'sftp' ,'rtsp' ,'afs' ,'data' ]
157+
158+ acceptable_content_types = ['image/png' ,'image/jpeg' ,'image/gif' ,'image/webp' ,'image/bmp' ,'text/plain' ]
142159
143160# subclasses may define their own versions of these constants
144161allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
147164allowed_css_keywords = acceptable_css_keywords
148165allowed_svg_properties = acceptable_svg_properties
149166allowed_protocols = acceptable_protocols
167+ allowed_content_types = acceptable_content_types
150168
151169# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
152170# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):
189207unescape (attrs [attr ])).lower ()
190208# remove replacement characters from unescaped characters
191209val_unescaped = val_unescaped .replace ("\ufffd " ,"" )
192- if (re .match ("^[a-z0-9][-+.a-z0-9]*:" ,val_unescaped )and
193- (val_unescaped .split (':' )[0 ]not in
194- self .allowed_protocols )):
195- del attrs [attr ]
210+ uri = urlparse .urlparse (val_unescaped )
211+ if uri :
212+ if uri .scheme not in self .allowed_protocols :
213+ del attrs [attr ]
214+ if uri .scheme == 'data' :
215+ m = content_type_rgx .match (uri .path )
216+ if not m :
217+ del attrs [attr ]
218+ if m .group ('content_type' )not in self .allowed_content_types :
219+ del attrs [attr ]
220+
196221for attr in self .svg_attr_val_allows_ref :
197222if attr in attrs :
198223attrs [attr ]= re .sub (r'url\s*\(\s*[^#\s][^)]+?\)' ,