May 17, 2016 · May 8, 2016 · May 8, 2016 · May 8, 2016 · May 8, 2016 · May 11, 2016
diff --git a/CHANGES.rst b/CHANGES.rst
 * **Use scripting disabled by default (as we don't implement
  scripting).**

 * **Fix #11, avoiding the XSS bug potentially caused by serializer
  allowing attribute values to be escaped out of in old browser versions,
  changing the quote_attr_values option on serializer to take one of
  three values, "always" (the old True value), "legacy" (the new option,
  and the new default), and "spec" (the old False value, and the old
  default).**


 0.9999999/1.0b8
 ~~~~~~~~~~~~~~~
diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py


 class Filter(_base.Filter):
    def __init__(self, source, require_matching_tags=True):
        super(Filter, self).__init__(source)
        self.require_matching_tags = require_matching_tags

    def __iter__(self):
        open_elements = []
        for token in _base.Filter.__iter__(self):
                    assert type == "EmptyTag"
                else:
                    assert type == "StartTag"
                if type == "StartTag":
                if type == "StartTag" and self.require_matching_tags:
                    open_elements.append((namespace, name))
                for (namespace, name), value in token["data"].items():
                    assert namespace is None or isinstance(namespace, text_type)
                assert name != ""
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
 else:
 elif self.require_matching_tags:
                    start = open_elements.pop()
                    assert start == (namespace, name)

diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type

 try:
    from functools import reduce
 except ImportError:
    pass
 import re

 from ..constants import voidElements, booleanAttributes, spaceCharacters
 from ..constants import rcdataElements, entities, xmlEntities

 spaceCharacters = "".join(spaceCharacters)

 quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
 quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
 quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
                                  "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                  "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                  "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                  "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                  "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                  "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                  "\u3000]")

 try:
    from codecs import register_error, xmlcharrefreplace_errors
 except ImportError:
 class HTMLSerializer(object):

    # attribute quoting options
    quote_attr_values =False
    quote_attr_values ="legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

        inject_meta_charset=True|False
          Whether it insert a meta element to define the character set of the
          document.
        quote_attr_values=True|False
        quote_attr_values="legacy"|"spec"|"always"
          Whether to quote attribute values that don't require quoting
          perHTML5 parsing rules.
          perlegacy browser behaviour, when required by the standard, or always.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_valuesor not v:
                        if self.quote_attr_values== "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = quoteAttributeLegacy.search(v) is not None
                        else:
 quote_attr = reduce(lambda x, y: x or (y in v),
   spaceCharacters + ">\"'=", False)
 raise ValueError("quote_attr_values must be one of: "
 "'always', 'spec', or 'legacy'")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -33,6 +33,13 @@ Released on XXX
		* **Use scripting disabled by default (as we don't implement
		scripting).**

		* **Fix #11, avoiding the XSS bug potentially caused by serializer
		allowing attribute values to be escaped out of in old browser versions,
		changing the quote_attr_values option on serializer to take one of
		three values, "always" (the old True value), "legacy" (the new option,
		and the new default), and "spec" (the old False value, and the old
		default).**


		0.9999999/1.0b8
		~~~~~~~~~~~~~~~
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,6 +10,10 @@


		class Filter(_base.Filter):
		def __init__(self, source, require_matching_tags=True):
		super(Filter, self).__init__(source)
		self.require_matching_tags = require_matching_tags

		def __iter__(self):
		open_elements = []
		for token in _base.Filter.__iter__(self):
Expand All		@@ -26,7 +30,7 @@ def __iter__(self):
		assert type == "EmptyTag"
		else:
		assert type == "StartTag"
		if type == "StartTag":
		if type == "StartTag" and self.require_matching_tags:
		open_elements.append((namespace, name))
		for (namespace, name), value in token["data"].items():
		assert namespace is None or isinstance(namespace, text_type)
Expand All		@@ -44,7 +48,7 @@ def __iter__(self):
		assert name != ""
		if (not namespace or namespace == namespaces["html"]) and name in voidElements:
		assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
		else:
		elif self.require_matching_tags:
		start = open_elements.pop()
		assert start == (namespace, name)

Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,10 +1,7 @@
		from __future__ import absolute_import, division, unicode_literals
		from six import text_type

		try:
		from functools import reduce
		except ImportError:
		pass
		import re

		from ..constants import voidElements, booleanAttributes, spaceCharacters
		from ..constants import rcdataElements, entities, xmlEntities
Expand All		@@ -13,6 +10,17 @@

		spaceCharacters = "".join(spaceCharacters)

		quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
		quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
		quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
		"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
		"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
		"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
		"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
		"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
		"\u3000]")

		try:
		from codecs import register_error, xmlcharrefreplace_errors
		except ImportError:
Expand DownExpand Up		@@ -73,7 +81,7 @@ def htmlentityreplace_errors(exc):
		class HTMLSerializer(object):

		# attribute quoting options
		quote_attr_values =False
		quote_attr_values ="legacy" # be secure by default
		quote_char = '"'
		use_best_quote_char = True

Expand DownExpand Up		@@ -109,9 +117,9 @@ def __init__(self, **kwargs):
		inject_meta_charset=True\|False
		Whether it insert a meta element to define the character set of the
		document.
		quote_attr_values=True\|False
		quote_attr_values="legacy"\|"spec"\|"always"
		Whether to quote attribute values that don't require quoting
		perHTML5 parsing rules.
		perlegacy browser behaviour, when required by the standard, or always.
		quote_char=u'"'\|u"'"
		Use given quote character for attribute quoting. Default is to
		use double quote unless attribute value contains a double quote,
Expand DownExpand Up		@@ -240,11 +248,15 @@ def serialize(self, treewalker, encoding=None):
		(k not in booleanAttributes.get(name, tuple()) and
		k not in booleanAttributes.get("", tuple())):
		yield self.encodeStrict("=")
		if self.quote_attr_valuesor not v:
		if self.quote_attr_values== "always" or len(v) == 0:
		quote_attr = True
		elif self.quote_attr_values == "spec":
		quote_attr = quoteAttributeSpec.search(v) is not None
		elif self.quote_attr_values == "legacy":
		quote_attr = quoteAttributeLegacy.search(v) is not None
		else:
		quote_attr = reduce(lambda x, y: x or (y in v),
		spaceCharacters + ">\"'=", False)
		raise ValueError("quote_attr_values must be one of: "
		"'always', 'spec', or 'legacy'")
		v = v.replace("&", "&")
		if self.escape_lt_in_attrs:
		v = v.replace("<", "<")
Expand Down