Commit f644865

Drew Hubl

authored and

gsnedders

committed

Allow the data URI scheme, a whitelist for content types, and update tests to correctly check URIs

1 parent b51828b, commit f644865 (Copy full SHA for f644865)

File tree

3 files changed

+49

-12

lines changed

AUTHORS.rst
html5lib
- sanitizer.py
- tests
  - test_sanitizer.py

3 files changed

+49

-12

lines changed

`‎AUTHORS.rst`

Lines changed: 2 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -32,4 +32,6 @@ Patches and suggestions`
`32`	`32`	`- Juan Carlos Garcia Segovia`
`33`	`33`	`- Mike West`
`34`	`34`	`- Marc DM`
	`35`	`+- Drew Hubl`
	`36`	`+- Austin Kumbera`
`35`	`37`	`- Jim Baker`

`‎html5lib/sanitizer.py`

Lines changed: 30 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -2,11 +2,26 @@`
`2`	`2`
`3`	`3`	`importre`
`4`	`4`	`fromxml.sax.saxutilsimportescape,unescape`
	`5`	`+fromsix.movesimporturllib_parseasurlparse`
`5`	`6`
`6`	`7`	`from .tokenizerimportHTMLTokenizer`
`7`	`8`	`from .constantsimporttokenTypes`
`8`	`9`
`9`	`10`
	`11`	`+content_type_rgx=re.compile(r'''`
	`12`	`+ ^`
	`13`	`+ # Match a content type <application>/<type>`
	`14`	`+ (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)`
	`15`	`+ # Match any character set and encoding`
	`16`	`+ (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)`
	`17`	`+ |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)`
	`18`	`+ # Assume the rest is data`
	`19`	`+ ,.*`
	`20`	`+ $`
	`21`	`+ ''',`
	`22`	`+re.VERBOSE)`
	`23`	`+`
	`24`	`+`
`10`	`25`	`classHTMLSanitizerMixin(object):`
`11`	`26`	`""" sanitization of XHTML+MathML+SVG and of inline style attributes."""`
`12`	`27`
`@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):`
`138`	`153`	`acceptable_protocols= ['ed2k','ftp','http','https','irc',`
`139`	`154`	`'mailto','news','gopher','nntp','telnet','webcal',`
`140`	`155`	`'xmpp','callto','feed','urn','aim','rsync','tag',`
`141`		`-'ssh','sftp','rtsp','afs']`
	`156`	`+'ssh','sftp','rtsp','afs','data']`
	`157`	`+`
	`158`	`+acceptable_content_types= ['image/png','image/jpeg','image/gif','image/webp','image/bmp','text/plain']`
`142`	`159`
`143`	`160`	`# subclasses may define their own versions of these constants`
`144`	`161`	`allowed_elements=acceptable_elements+mathml_elements+svg_elements`
`@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):`
`147`	`164`	`allowed_css_keywords=acceptable_css_keywords`
`148`	`165`	`allowed_svg_properties=acceptable_svg_properties`
`149`	`166`	`allowed_protocols=acceptable_protocols`
	`167`	`+allowed_content_types=acceptable_content_types`
`150`	`168`
`151`	`169`	`# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and`
`152`	`170`	`# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style`
`@@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):`
`189`	`207`	`unescape(attrs[attr])).lower()`
`190`	`208`	`# remove replacement characters from unescaped characters`
`191`	`209`	`val_unescaped=val_unescaped.replace("\ufffd","")`
`192`		`-if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped)and`
`193`		`- (val_unescaped.split(':')[0]notin`
`194`		`-self.allowed_protocols)):`
`195`		`-delattrs[attr]`
	`210`	`+uri=urlparse.urlparse(val_unescaped)`
	`211`	`+ifuri:`
	`212`	`+ifuri.schemenotinself.allowed_protocols:`
	`213`	`+delattrs[attr]`
	`214`	`+ifuri.scheme=='data':`
	`215`	`+m=content_type_rgx.match(uri.path)`
	`216`	`+ifnotm:`
	`217`	`+delattrs[attr]`
	`218`	`+ifm.group('content_type')notinself.allowed_content_types:`
	`219`	`+delattrs[attr]`
	`220`	`+`
`196`	`221`	`forattrinself.svg_attr_val_allows_ref:`
`197`	`222`	`ifattrinattrs:`
`198`	`223`	`attrs[attr]=re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',`

`‎html5lib/tests/test_sanitizer.py`

Lines changed: 17 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -80,9 +80,12 @@ def test_sanitizer():`
`80`	`80`	`continue# TODO`
`81`	`81`	`ifattribute_name=='style':`
`82`	`82`	`continue`
	`83`	`+attribute_value='foo'`
	`84`	`+ifattribute_nameinsanitizer.HTMLSanitizer.attr_val_is_uri:`
	`85`	`+attribute_value='%s://sub.domain.tld/path/object.ext'%sanitizer.HTMLSanitizer.allowed_protocols[0]`
`83`	`86`	`yield (runSanitizerTest,"test_should_allow_%s_attribute"%attribute_name,`
`84`		`-"<p %s=\"foo\">foo <bad>bar</bad> baz</p>"%attribute_name,`
`85`		`-"<p %s='foo'>foo <bad>bar</bad> baz</p>"%attribute_name,`
	`87`	`+"<p %s=\"%s\">foo <bad>bar</bad> baz</p>"%(attribute_name,attribute_value),`
	`88`	`+"<p %s='%s'>foo <bad>bar</bad> baz</p>"%(attribute_name,attribute_value),`
`86`	`89`	`toxml)`
`87`	`90`
`88`	`91`	`forattribute_nameinsanitizer.HTMLSanitizer.allowed_attributes:`
`@@ -93,13 +96,20 @@ def test_sanitizer():`
`93`	`96`	`toxml)`
`94`	`97`
`95`	`98`	`forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:`
`96`		`-yield (runSanitizerTest,"test_should_allow_%s_uris"%protocol,`
`97`		`-"<a href=\"%s\">foo</a>"%protocol,`
`98`		`-"""<a href="%s">foo</a>"""%protocol,`
	`99`	`+rest_of_uri='//sub.domain.tld/path/object.ext'`
	`100`	`+ifprotocol=='data':`
	`101`	`+rest_of_uri='image/png;base64,aGVsbG8gd29ybGQ='`
	`102`	`+yield (runSanitizerTest,"test_should_allow_uppercase_%s_uris"%protocol,`
	`103`	`+"<img src=\"%s:%s\">foo</a>"% (protocol,rest_of_uri),`
	`104`	`+"""<img src="%s:%s">foo</a>"""% (protocol,rest_of_uri),`
`99`	`105`	`toxml)`
`100`	`106`
`101`	`107`	`forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:`
	`108`	`+rest_of_uri='//sub.domain.tld/path/object.ext'`
	`109`	`+ifprotocol=='data':`
	`110`	`+rest_of_uri='image/png;base64,aGVsbG8gd29ybGQ='`
	`111`	`+protocol=protocol.upper()`
`102`	`112`	`yield (runSanitizerTest,"test_should_allow_uppercase_%s_uris"%protocol,`
`103`		`-"<a href=\"%s\">foo</a>"%protocol,`
`104`		`-"""<a href="%s">foo</a>"""%protocol,`
	`113`	`+"<img src=\"%s:%s\">foo</a>"%(protocol,rest_of_uri),`
	`114`	`+"""<img src="%s:%s">foo</a>"""%(protocol,rest_of_uri),`
`105`	`115`	`toxml)`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit f644865

File tree

3 files changed

3 files changed

`‎AUTHORS.rst`

`‎html5lib/sanitizer.py`

`‎html5lib/tests/test_sanitizer.py`

0 commit comments