Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024
diff --git a/.appveyor.yml b/.appveyor.yml
diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml
        os: [ubuntu-latest, windows-latest]
        deps: [base, optional]
        include:
          - python: "pypy-2.7"
            os: ubuntu-latest
            deps: base
          - python: "pypy-3.10"
            os: ubuntu-latest
            deps: base
diff --git a/README.rst b/README.rst

 By default, the ``document`` will be an ``xml.etree`` element instance.
 Whenever possible, html5lib chooses the accelerated ``ElementTree``
 implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
 implementation.

 Two other tree types are supported: ``xml.dom.minidom`` and
 ``lxml.etree``. To use an alternative format, specify the name of
  with open("mydocument.html", "rb") as f:
      lxml_etree_document = html5lib.parse(f, treebuilder="lxml")

 When using with ``urllib2`` (Python 2), the charset from HTTP should be
 passed into html5lib as follows:

 .. code-block:: python

  from contextlib import closing
  from urllib2 import urlopen
  import html5lib

  with closing(urlopen("http://example.com/")) as f:
      document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))

 When using with ``urllib.request`` (Python 3), the charset from HTTP
 should be passed into html5lib as follows:

 Installation
 ------------

 html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
 html5lib works on CPython 3.8+ and PyPy. To install:

 .. code-block:: bash

diff --git a/debug-info.py b/debug-info.py
 from __future__ import print_function, unicode_literals

 import platform
 import sys
    "maxsize": sys.maxsize
 }

 search_modules = ["chardet", "genshi", "html5lib", "lxml", "six"]
 search_modules = ["chardet", "genshi", "html5lib", "lxml"]
 found_modules = []

 for m in search_modules:
diff --git a/doc/conf.py b/doc/conf.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
 # html5lib documentation build configuration file, created by
 # sphinx-quickstart on Wed May  8 00:04:49 2013.
 }


 classCExtMock(object):
 classCExtMock:
 """Required for autodoc on readthedocs.org where you cannot build C extensions."""
 def__init__(self,*args,**kwargs):
 pass
diff --git a/html5lib/__init__.py b/html5lib/__init__.py
 * :func:`~.serializer.serialize`
 """

 from __future__importabsolute_import,division,unicode_literals

 from .html5parserimportHTMLParser,parse,parseFragment
 from .treebuildersimportgetTreeBuilder
diff --git a/html5lib/_ihatexml.py b/html5lib/_ihatexml.py
 from __future__importabsolute_import,division,unicode_literals

 importre
 importwarnings
 nonPubidCharRegexp=re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


 classInfosetFilter(object):
 classInfosetFilter:
 replacementRegexp=re.compile(r"U[\dA-F]{5,5}")

 def__init__(self,
diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py
 from __future__ import absolute_import, division, unicode_literals

 from siximporttext_type
 from six.movesimporthttp_client,urllib
 importhttp.client
 import urllib.response

 import codecs
 import re
 charsUntilRegEx = {}


 class BufferedStream(object):
 class BufferedStream:
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
 def HTMLInputStream(source, **kwargs):
    # Work around Python bug #20007: read(0) closes the connection.
    # http://bugs.python.org/issue20007
    if (isinstance(source,http_client.HTTPResponse) or
    if (isinstance(source,http.client.HTTPResponse) or
        # Also check for addinfourl wrapping HTTPResponse
        (isinstance(source, urllib.response.addbase) and
         isinstance(source.fp,http_client.HTTPResponse))):
         isinstance(source.fp,http.client.HTTPResponse))):
        isUnicode = False
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
        return HTMLBinaryInputStream(source, **kwargs)


 class HTMLUnicodeInputStream(object):
 class HTMLUnicodeInputStream:
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
        return True


 class EncodingParser(object):
 class EncodingParser:
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
                attrValue.append(c)


 class ContentAttrParser(object):
 class ContentAttrParser:
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data
diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py
 from __future__ import absolute_import, division, unicode_literals

 from six import unichr as chr

 from collections import deque, OrderedDict
 from sys import version_info
    attributeMap = OrderedDict


 class HTMLTokenizer(object):
 class HTMLTokenizer:
    """ This class takes care of tokenizing HTML.

    * self.currentToken
diff --git a/html5lib/_trie/__init__.py b/html5lib/_trie/__init__.py
 from __future__importabsolute_import,division,unicode_literals

 from .pyimportTrie

diff --git a/html5lib/_trie/_base.py b/html5lib/_trie/_base.py
 from __future__ import absolute_import, division, unicode_literals

 try:
    from collections.abc import Mapping
 except ImportError:  # Python 2.7
    from collections import Mapping
 from collections.abc import Mapping


 class Trie(Mapping):
diff --git a/html5lib/_trie/py.py b/html5lib/_trie/py.py
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type

 from bisect import bisect_left

 from ._base import Trie as ABCTrie


 class Trie(ABCTrie):
    def __init__(self, data):
        if not all(isinstance(x,text_type) for x in data.keys()):
        if not all(isinstance(x,str) for x in data.keys()):
            raise TypeError("All keys must be strings")

        self._data = data
diff --git a/html5lib/_utils.py b/html5lib/_utils.py
 from __future__ import absolute_import, division, unicode_literals

 from types import ModuleType

 try:
    from collections.abc import Mapping
 except ImportError:
    from collections import Mapping

 from six import text_type, PY3
 from collections.abc import Mapping

 if PY3:
    import xml.etree.ElementTree as default_etree
 else:
    try:
        import xml.etree.cElementTree as default_etree
    except ImportError:
        import xml.etree.ElementTree as default_etree
 import xml.etree.ElementTree as default_etree


 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
 # escapes.
 try:
    _x = eval('"\\uD800"')  # pylint:disable=eval-used
    if not isinstance(_x,text_type):
    if not isinstance(_x,str):
        # We need this with u"" because of http://bugs.jython.org/issue2039
        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
        assert isinstance(_x,text_type)
        assert isinstance(_x,str)
 except Exception:
    supports_lone_surrogates = False
 else:
    moduleCache = {}

    def moduleFactory(baseModule, *args, **kwargs):
        if isinstance(ModuleType.__name__,type("")):
        if isinstance(ModuleType.__name__,str):
            name = "_%s_factory" % baseModule.__name__
        else:
            name = b"_%s_factory" % baseModule.__name__
diff --git a/html5lib/constants.py b/html5lib/constants.py
 from __future__ import absolute_import, division, unicode_literals

 import string

diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py
 from __future__ import absolute_import, division, unicode_literals

 from . import base

diff --git a/html5lib/filters/base.py b/html5lib/filters/base.py
 from __future__ import absolute_import, division, unicode_literals


 class Filter(object):
 class Filter:
    def __init__(self, source):
        self.source = source

diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py
 from __future__ import absolute_import, division, unicode_literals

 from . import base

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
 from __future__ import absolute_import, division, unicode_literals

 from six import text_type

 from . import base
 from ..constants import namespaces, voidElements
            if type in ("StartTag", "EmptyTag"):
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace,text_type)
                assert namespace is None or isinstance(namespace,str)
                assert namespace != ""
                assert isinstance(name,text_type)
                assert isinstance(name,str)
                assert name != ""
                assert isinstance(token["data"], dict)
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                if type == "StartTag" and self.require_matching_tags:
                    open_elements.append((namespace, name))
                for (namespace, name), value in token["data"].items():
                    assert namespace is None or isinstance(namespace,text_type)
                    assert namespace is None or isinstance(namespace,str)
                    assert namespace != ""
                    assert isinstance(name,text_type)
                    assert isinstance(name,str)
                    assert name != ""
                    assert isinstance(value,text_type)
                    assert isinstance(value,str)

            elif type == "EndTag":
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace,text_type)
                assert namespace is None or isinstance(namespace,str)
                assert namespace != ""
                assert isinstance(name,text_type)
                assert isinstance(name,str)
                assert name != ""
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}

            elif type == "Comment":
                data = token["data"]
                assert isinstance(data,text_type)
                assert isinstance(data,str)

            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                assert isinstance(data,text_type)
                assert isinstance(data,str)
                assert data != ""
                if type == "SpaceCharacters":
                    assert data.strip(spaceCharacters) == ""

            elif type == "Doctype":
                name = token["name"]
                assert name is None or isinstance(name,text_type)
                assert token["publicId"] is None or isinstance(name,text_type)
                assert token["systemId"] is None or isinstance(name,text_type)
                assert name is None or isinstance(name,str)
                assert token["publicId"] is None or isinstance(name,str)
                assert token["systemId"] is None or isinstance(name,str)

            elif type == "Entity":
                assert isinstance(token["name"],text_type)
                assert isinstance(token["name"],str)

            elif type == "SerializerError":
                assert isinstance(token["data"],text_type)
                assert isinstance(token["data"],str)

            else:
                assert False, "Unknown token type: %(type)s" % {"type": type}
diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py
 from __future__ import absolute_import, division, unicode_literals

 from . import base
Original file line number	Diff line number	Diff line change
Expand Up		@@ -12,9 +12,6 @@ jobs:
		os: [ubuntu-latest, windows-latest]
		deps: [base, optional]
		include:
		- python: "pypy-2.7"
		os: ubuntu-latest
		deps: base
		- python: "pypy-3.10"
		os: ubuntu-latest
		deps: base
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -29,7 +29,7 @@ or:

		By default, the ``document`` will be an ``xml.etree`` element instance.
		Whenever possible, html5lib chooses the accelerated ``ElementTree``
		implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
		implementation.

		Two other tree types are supported: ``xml.dom.minidom`` and
		``lxml.etree``. To use an alternative format, specify the name of
Expand All		@@ -41,18 +41,6 @@ a treebuilder:
		with open("mydocument.html", "rb") as f:
		lxml_etree_document = html5lib.parse(f, treebuilder="lxml")

		When using with ``urllib2`` (Python 2), the charset from HTTP should be
		passed into html5lib as follows:

		.. code-block:: python

		from contextlib import closing
		from urllib2 import urlopen
		import html5lib

		with closing(urlopen("http://example.com/")) as f:
		document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))

		When using with ``urllib.request`` (Python 3), the charset from HTTP
		should be passed into html5lib as follows:

Expand DownExpand Up		@@ -90,7 +78,7 @@ More documentation is available at https://html5lib.readthedocs.io/.
		Installation
		------------

		html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
		html5lib works on CPython 3.8+ and PyPy. To install:

		.. code-block:: bash

Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		from __future__ import print_function, unicode_literals

		import platform
		import sys
Expand All		@@ -12,7 +11,7 @@
		"maxsize": sys.maxsize
		}

		search_modules = ["chardet", "genshi", "html5lib", "lxml", "six"]
		search_modules = ["chardet", "genshi", "html5lib", "lxml"]
		found_modules = []

		for m in search_modules:
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,5 +1,4 @@
		#!/usr/bin/env python3
		# -*- coding: utf-8 -*-
		#
		# html5lib documentation build configuration file, created by
		# sphinx-quickstart on Wed May 8 00:04:49 2013.
Expand DownExpand Up		@@ -100,7 +99,7 @@
		}


		classCExtMock(object):
		classCExtMock:
		"""Required for autodoc on readthedocs.org where you cannot build C extensions."""
		def__init__(self,args,*kwargs):
		pass
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -20,7 +20,6 @@
		* :func:`~.serializer.serialize`
		"""

		from __future__importabsolute_import,division,unicode_literals

		from .html5parserimportHTMLParser,parse,parseFragment
		from .treebuildersimportgetTreeBuilder
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		from __future__importabsolute_import,division,unicode_literals

		importre
		importwarnings
Expand DownExpand Up		@@ -181,7 +180,7 @@ def escapeRegexp(string):
		nonPubidCharRegexp=re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


		classInfosetFilter(object):
		classInfosetFilter:
		replacementRegexp=re.compile(r"U[\dA-F]{5,5}")

		def__init__(self,
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,7 +1,6 @@
		from __future__ import absolute_import, division, unicode_literals

		from siximporttext_type
		from six.movesimporthttp_client,urllib
		importhttp.client
		import urllib.response

		import codecs
		import re
Expand DownExpand Up		@@ -48,7 +47,7 @@
		charsUntilRegEx = {}


		class BufferedStream(object):
		class BufferedStream:
		"""Buffering for streams that do not have buffering of their own

		The buffer is implemented as a list of chunks on the assumption that
Expand DownExpand Up		@@ -125,10 +124,10 @@ def _readFromBuffer(self, bytes):
		def HTMLInputStream(source, **kwargs):
		# Work around Python bug #20007: read(0) closes the connection.
		# http://bugs.python.org/issue20007
		if (isinstance(source,http_client.HTTPResponse) or
		if (isinstance(source,http.client.HTTPResponse) or
		# Also check for addinfourl wrapping HTTPResponse
		(isinstance(source, urllib.response.addbase) and
		isinstance(source.fp,http_client.HTTPResponse))):
		isinstance(source.fp,http.client.HTTPResponse))):
		isUnicode = False
		elif hasattr(source, "read"):
		isUnicode = isinstance(source.read(0), text_type)
Expand All		@@ -145,7 +144,7 @@ def HTMLInputStream(source, **kwargs):
		return HTMLBinaryInputStream(source, **kwargs)


		class HTMLUnicodeInputStream(object):
		class HTMLUnicodeInputStream:
		"""Provides a unicode stream of characters to the HTMLTokenizer.

		This class takes care of character encoding and removing or replacing
Expand DownExpand Up		@@ -673,7 +672,7 @@ def jumpTo(self, bytes):
		return True


		class EncodingParser(object):
		class EncodingParser:
		"""Mini parser for detecting character encoding from meta elements"""

		def __init__(self, data):
Expand DownExpand Up		@@ -861,7 +860,7 @@ def getAttribute(self):
		attrValue.append(c)


		class ContentAttrParser(object):
		class ContentAttrParser:
		def __init__(self, data):
		assert isinstance(data, bytes)
		self.data = data
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,6 +1,3 @@
		from __future__ import absolute_import, division, unicode_literals

		from six import unichr as chr

		from collections import deque, OrderedDict
		from sys import version_info
Expand All		@@ -24,7 +21,7 @@
		attributeMap = OrderedDict


		class HTMLTokenizer(object):
		class HTMLTokenizer:
		""" This class takes care of tokenizing HTML.

		* self.currentToken
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		from __future__importabsolute_import,division,unicode_literals

		from .pyimportTrie

Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,9 +1,5 @@
		from __future__ import absolute_import, division, unicode_literals

		try:
		from collections.abc import Mapping
		except ImportError: # Python 2.7
		from collections import Mapping
		from collections.abc import Mapping


		class Trie(Mapping):
Expand Down