Commit a265d2d

Drew Hubl

authored and

CaptainCodeman

committed

Allow the data URI scheme, a whitelist for content types, and update tests to correctly check URIs

1 parent b0c3975, commit a265d2d (Copy full SHA for a265d2d)

File tree

3 files changed

+65

-12

lines changed

html5lib
- sanitizer.py
- tests
  - test_sanitizer.py
  - testdata

3 files changed

+65

-12

lines changed

`‎html5lib/sanitizer.py‎`

Lines changed: 47 additions & 4 deletions

Original file line number	Diff line number	Diff line change
`@@ -2,6 +2,10 @@`
`2`	`2`
`3`	`3`	`importre`
`4`	`4`	`fromxml.sax.saxutilsimportescape,unescape`
	`5`	`+try:`
	`6`	`+fromurllib.parseimporturlparse`
	`7`	`+exceptImportError:`
	`8`	`+fromurlparseimporturlparse`
`5`	`9`
`6`	`10`	`from .tokenizerimportHTMLTokenizer`
`7`	`11`	`from .constantsimporttokenTypes`
`@@ -140,13 +144,16 @@ class HTMLSanitizerMixin(object):`
`140`	`144`	`'xmpp','callto','feed','urn','aim','rsync','tag',`
`141`	`145`	`'ssh','sftp','rtsp','afs','data']`
`142`	`146`
	`147`	`+acceptable_content_types= ['image/png','image/jpeg','image/gif','image/webp','image/bmp','text/plain']`
	`148`	`+`
`143`	`149`	`# subclasses may define their own versions of these constants`
`144`	`150`	`allowed_elements=acceptable_elements+mathml_elements+svg_elements`
`145`	`151`	`allowed_attributes=acceptable_attributes+mathml_attributes+svg_attributes`
`146`	`152`	`allowed_css_properties=acceptable_css_properties`
`147`	`153`	`allowed_css_keywords=acceptable_css_keywords`
`148`	`154`	`allowed_svg_properties=acceptable_svg_properties`
`149`	`155`	`allowed_protocols=acceptable_protocols`
	`156`	`+allowed_content_types=acceptable_content_types`
`150`	`157`
`151`	`158`	`# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and`
`152`	`159`	`# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style`
`@@ -189,10 +196,46 @@ def allowed_token(self, token, token_type):`
`189`	`196`	`unescape(attrs[attr])).lower()`
`190`	`197`	`# remove replacement characters from unescaped characters`
`191`	`198`	`val_unescaped=val_unescaped.replace("\ufffd","")`
`192`		`-if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped)and`
`193`		`- (val_unescaped.split(':')[0]notin`
`194`		`-self.allowed_protocols)):`
`195`		`-delattrs[attr]`
	`199`	`+uri=urlparse(val_unescaped)`
	`200`	`+ifuri:`
	`201`	`+ifuri.schemenotinself.allowed_protocols:`
	`202`	`+delattrs[attr]`
	`203`	`+rgx=re.compile(r'''`
	`204`	`+ ^`
	`205`	`+ # Match a content type <application>/<type>`
	`206`	`+ (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)`
	`207`	`+ # Match any character set and encoding`
	`208`	`+ # Note that this does not prevent the`
	`209`	`+ # same one being set twice`
	`210`	`+ # The charset group is currently unused`
	`211`	`+ (?:;charset=(?P<charset>[-a-zA-Z0-9]+)\|;(?P<encoding>base64)){0,2}`
	`212`	`+ # Match the base64-encoded or urlencoded`
	`213`	`+ # data`
	`214`	`+ # The data group is currently unused`
	`215`	`+ (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*\|(?P<url_encoded_data>[a-zA-Z0-9]+\|%[a-fA-F0-9]{2})))`
	`216`	`+ $`
	`217`	`+ ''',`
	`218`	`+re.VERBOSE)`
	`219`	`+ifuri.scheme=='data':`
	`220`	`+m=rgx.match(uri.path)`
	`221`	`+ifnotm:`
	`222`	`+delattrs[attr]`
	`223`	`+ifm.group('content_type')notinself.allowed_content_types:`
	`224`	`+delattrs[attr]`
	`225`	`+ifm.group('encoding'):`
	`226`	`+ifm.group('encoding')=='base64':`
	`227`	`+# If the encoding identifier is base64, then`
	`228`	`+# make sure the data is encoded in base64`
	`229`	`+ifnotm.group('base64_encoded_data'):`
	`230`	`+delattrs[attr]`
	`231`	`+else:`
	`232`	`+delattrs[attr]`
	`233`	`+else:`
	`234`	`+# If the encoding is not given, expect the data to`
	`235`	`+# be urlencoded`
	`236`	`+ifnotm.group('url_encoded_data'):`
	`237`	`+delattrs[attr]`
	`238`	`+`
`196`	`239`	`forattrinself.svg_attr_val_allows_ref:`
`197`	`240`	`ifattrinattrs:`
`198`	`241`	`attrs[attr]=re.sub(r'url\s$\s[^#\s][^)]+?$',`

`‎html5lib/tests/test_sanitizer.py‎`

Lines changed: 17 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -80,9 +80,12 @@ def test_sanitizer():`
`80`	`80`	`continue# TODO`
`81`	`81`	`ifattribute_name=='style':`
`82`	`82`	`continue`
	`83`	`+attribute_value='foo'`
	`84`	`+ifattribute_nameinsanitizer.HTMLSanitizer.attr_val_is_uri:`
	`85`	`+attribute_value='http://sub.domain.tld/path/object.ext'`
`83`	`86`	`yield (runSanitizerTest,"test_should_allow_%s_attribute"%attribute_name,`
`84`		`-"<p %s=\"foo\">foo <bad>bar</bad> baz</p>"%attribute_name,`
`85`		`-"<p %s='foo'>foo <bad>bar</bad> baz</p>"%attribute_name,`
	`87`	`+"<p %s=\"%s\">foo <bad>bar</bad> baz</p>"%(attribute_name,attribute_value),`
	`88`	`+"<p %s='%s'>foo <bad>bar</bad> baz</p>"%(attribute_name,attribute_value),`
`86`	`89`	`toxml)`
`87`	`90`
`88`	`91`	`forattribute_nameinsanitizer.HTMLSanitizer.allowed_attributes:`
`@@ -93,13 +96,20 @@ def test_sanitizer():`
`93`	`96`	`toxml)`
`94`	`97`
`95`	`98`	`forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:`
`96`		`-yield (runSanitizerTest,"test_should_allow_%s_uris"%protocol,`
`97`		`-"<a href=\"%s\">foo</a>"%protocol,`
`98`		`-"""<a href="%s">foo</a>"""%protocol,`
	`99`	`+rest_of_uri='//sub.domain.tld/path/object.ext'`
	`100`	`+ifprotocol=='data':`
	`101`	`+rest_of_uri='image/png;base64,aGVsbG8gd29ybGQ='`
	`102`	`+yield (runSanitizerTest,"test_should_allow_uppercase_%s_uris"%protocol,`
	`103`	`+"<img src=\"%s:%s\">foo</a>"% (protocol,rest_of_uri),`
	`104`	`+"""<img src="%s:%s">foo</a>"""% (protocol,rest_of_uri),`
`99`	`105`	`toxml)`
`100`	`106`
`101`	`107`	`forprotocolinsanitizer.HTMLSanitizer.allowed_protocols:`
	`108`	`+rest_of_uri='//sub.domain.tld/path/object.ext'`
	`109`	`+ifprotocol=='data':`
	`110`	`+rest_of_uri='image/png;base64,aGVsbG8gd29ybGQ='`
	`111`	`+protocol=protocol.upper()`
`102`	`112`	`yield (runSanitizerTest,"test_should_allow_uppercase_%s_uris"%protocol,`
`103`		`-"<a href=\"%s\">foo</a>"%protocol,`
`104`		`-"""<a href="%s">foo</a>"""%protocol,`
	`113`	`+"<img src=\"%s:%s\">foo</a>"%(protocol,rest_of_uri),`
	`114`	`+"""<img src="%s:%s">foo</a>"""%(protocol,rest_of_uri),`
`105`	`115`	`toxml)`

`‎html5lib/tests/testdata‎`

Submodule testdata updated: 62 files

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit a265d2d

File tree

3 files changed

3 files changed

`‎html5lib/sanitizer.py‎`

`‎html5lib/tests/test_sanitizer.py‎`

`‎html5lib/tests/testdata‎`

0 commit comments