Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024
diff --git a/.appveyor.yml b/.appveyor.yml
diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml
        os: [ubuntu-latest, windows-latest]
        deps: [base, optional]
        include:
          - python: "pypy-2.7"
            os: ubuntu-latest
            deps: base
          - python: "pypy-3.10"
            os: ubuntu-latest
            deps: base
diff --git a/README.rst b/README.rst

 By default, the ``document`` will be an ``xml.etree`` element instance.
 Whenever possible, html5lib chooses the accelerated ``ElementTree``
 implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
 implementation.

 Two other tree types are supported: ``xml.dom.minidom`` and
 ``lxml.etree``. To use an alternative format, specify the name of
  with open("mydocument.html", "rb") as f:
      lxml_etree_document = html5lib.parse(f, treebuilder="lxml")

 When using with ``urllib2`` (Python 2), the charset from HTTP should be
 passed into html5lib as follows:

 .. code-block:: python

  from contextlib import closing
  from urllib2 import urlopen
  import html5lib

  with closing(urlopen("http://example.com/")) as f:
      document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))

 When using with ``urllib.request`` (Python 3), the charset from HTTP
 should be passed into html5lib as follows:

 Installation
 ------------

 html5lib works on CPython2.7+, CPython 3.5+ and PyPy. To install:
 html5lib works on CPython3.8+ and PyPy. To install:

 .. code-block:: bash

diff --git a/debug-info.py b/debug-info.py
 from __future__ import print_function, unicode_literals

 import platform
 import sys
diff --git a/doc/conf.py b/doc/conf.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
 # html5lib documentation build configuration file, created by
 # sphinx-quickstart on Wed May  8 00:04:49 2013.
 }


 class CExtMock(object):
 class CExtMock:
    """Required for autodoc on readthedocs.org where you cannot build C extensions."""
    def __init__(self, *args, **kwargs):
        pass
diff --git a/html5lib/__init__.py b/html5lib/__init__.py
 * :func:`~.serializer.serialize`
 """

 from __future__ import absolute_import, division, unicode_literals

 from .html5parser import HTMLParser, parse, parseFragment
 from .treebuilders import getTreeBuilder
diff --git a/html5lib/_ihatexml.py b/html5lib/_ihatexml.py
 from __future__ import absolute_import, division, unicode_literals

 import re
 import warnings
 nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


 class InfosetFilter(object):
 class InfosetFilter:
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

    def __init__(self,
diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py
 from __future__ import absolute_import, division, unicode_literals

 from six import text_type
 from six.moves import http_client, urllib
 charsUntilRegEx = {}


 class BufferedStream(object):
 class BufferedStream:
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
        return HTMLBinaryInputStream(source, **kwargs)


 class HTMLUnicodeInputStream(object):
 class HTMLUnicodeInputStream:
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
        return True


 class EncodingParser(object):
 class EncodingParser:
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
                attrValue.append(c)


 class ContentAttrParser(object):
 class ContentAttrParser:
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data
diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py
 from __future__ import absolute_import, division, unicode_literals

 from six import unichr as chr

    attributeMap = OrderedDict


 class HTMLTokenizer(object):
 class HTMLTokenizer:
    """ This class takes care of tokenizing HTML.

    * self.currentToken
diff --git a/html5lib/_trie/__init__.py b/html5lib/_trie/__init__.py
 from __future__ import absolute_import, division, unicode_literals

 from .py import Trie

diff --git a/html5lib/_trie/_base.py b/html5lib/_trie/_base.py
 from __future__ import absolute_import, division, unicode_literals

 try:
    from collections.abc import Mapping
 except ImportError:  # Python 2.7
    from collections import Mapping
 from collections.abc import Mapping


 class Trie(Mapping):
diff --git a/html5lib/_trie/py.py b/html5lib/_trie/py.py
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type

 from bisect import bisect_left
diff --git a/html5lib/_utils.py b/html5lib/_utils.py
 from __future__ import absolute_import, division, unicode_literals

 from types import ModuleType

 try:
    from collections.abc import Mapping
 except ImportError:
    from collections import Mapping
 from collections.abc import Mapping

 from six import text_type, PY3

 if PY3:
    import xml.etree.ElementTree as default_etree
 else:
    try:
        import xml.etree.cElementTree as default_etree
        import xml.etree.ElementTree as default_etree
    except ImportError:
        import xml.etree.ElementTree as default_etree

    moduleCache = {}

    def moduleFactory(baseModule, *args, **kwargs):
        if isinstance(ModuleType.__name__,type("")):
        if isinstance(ModuleType.__name__,str):
            name = "_%s_factory" % baseModule.__name__
        else:
            name = b"_%s_factory" % baseModule.__name__
diff --git a/html5lib/constants.py b/html5lib/constants.py
 from __future__ import absolute_import, division, unicode_literals

 import string

diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py
 from __future__ import absolute_import, division, unicode_literals

 from . import base

diff --git a/html5lib/filters/base.py b/html5lib/filters/base.py
 from __future__ import absolute_import, division, unicode_literals


 class Filter(object):
 class Filter:
    def __init__(self, source):
        self.source = source

diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py
 from __future__ import absolute_import, division, unicode_literals

 from . import base

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
 from __future__ import absolute_import, division, unicode_literals

 from six import text_type

diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py
 from __future__ import absolute_import, division, unicode_literals

 from . import base

diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py
 if Bleach is unsuitable for your needs.

 """
 from __future__ import absolute_import, division, unicode_literals

 import re
 import warnings
diff --git a/html5lib/filters/whitespace.py b/html5lib/filters/whitespace.py
 from __future__ import absolute_import, division, unicode_literals

 import re

diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
 from __future__ import absolute_import, division, unicode_literals
 from six import viewkeys

 from . import _inputstream
    return p.parseFragment(doc, container=container, **kwargs)


 class HTMLParser(object):
 class HTMLParser:
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.
        self.phase = self.phases["text"]


 class Phase(object):
 class Phase:
    """Base class for helper object that implements each phase of processing
    """
    __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
    def processStartTag(self, token):
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython2.7,3.8) GC cost when parsing many short inputs
        # (CPython 3.8) GC cost when parsing many short inputs
        name = token["name"]
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
    def processEndTag(self, token):
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython2.7,3.8) GC cost when parsing many short inputs
        # (CPython 3.8) GC cost when parsing many short inputs
        name = token["name"]
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
diff --git a/html5lib/serializer.py b/html5lib/serializer.py
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type

 import re
    return s.render(walker(input), encoding)


 class HTMLSerializer(object):
 class HTMLSerializer:

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
diff --git a/html5lib/tests/__init__.py b/html5lib/tests/__init__.py
 from __future__ import absolute_import, division, unicode_literals
diff --git a/html5lib/tests/conftest.py b/html5lib/tests/conftest.py
 from __future__ import print_function
 import os.path
 import sys

        # Check for optional requirements
        req_file = os.path.join(_root, "requirements-optional.txt")
        if os.path.exists(req_file):
            with open(req_file, "r") as fp:
            with open(req_file) as fp:
                for line in fp:
                    if (line.strip() and
                        not (line.startswith("-r") or
        import xml.etree.ElementTree as ElementTree

        try:
            import xml.etree.cElementTree as cElementTree
            import xml.etree.ElementTree as cElementTree
        except ImportError:
            msgs.append("cElementTree unable to be imported")
        else:
diff --git a/html5lib/tests/sanitizer.py b/html5lib/tests/sanitizer.py
 from __future__ import absolute_import, division, unicode_literals

 import codecs
 import json
diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
 from __future__ import absolute_import, division, unicode_literals

 # pylint:disable=wrong-import-position

        return dict.get(self, key, self.default)


 class TestData(object):
 class TestData:
    def __init__(self, filename, newTestHeading="data", encoding="utf8"):
        if encoding is None:
            self.f = open(filename, mode="rb")
diff --git a/html5lib/tests/test_alphabeticalattributes.py b/html5lib/tests/test_alphabeticalattributes.py
 from __future__ import absolute_import, division, unicode_literals

 from collections import OrderedDict

diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
 from __future__ import absolute_import, division, unicode_literals

 import os



 def test_basic_prescan_length():
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode()
    pad = 1024 - len(data) + 1
    data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
    assert len(data) == 1024  # Sanity


 def test_parser_reparse():
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode()
    pad = 10240 - len(data) + 1
    data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
    assert len(data) == 10240  # Sanity
Original file line number	Diff line number	Diff line change
Expand Up		@@ -12,9 +12,6 @@ jobs:
		os: [ubuntu-latest, windows-latest]
		deps: [base, optional]
		include:
		- python: "pypy-2.7"
		os: ubuntu-latest
		deps: base
		- python: "pypy-3.10"
		os: ubuntu-latest
		deps: base
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -29,7 +29,7 @@ or:

		By default, the ``document`` will be an ``xml.etree`` element instance.
		Whenever possible, html5lib chooses the accelerated ``ElementTree``
		implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
		implementation.

		Two other tree types are supported: ``xml.dom.minidom`` and
		``lxml.etree``. To use an alternative format, specify the name of
Expand All		@@ -41,18 +41,6 @@ a treebuilder:
		with open("mydocument.html", "rb") as f:
		lxml_etree_document = html5lib.parse(f, treebuilder="lxml")

		When using with ``urllib2`` (Python 2), the charset from HTTP should be
		passed into html5lib as follows:

		.. code-block:: python

		from contextlib import closing
		from urllib2 import urlopen
		import html5lib

		with closing(urlopen("http://example.com/")) as f:
		document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))

		When using with ``urllib.request`` (Python 3), the charset from HTTP
		should be passed into html5lib as follows:

Expand Down
Expand Up		@@ -90,7 +78,7 @@ More documentation is available at https://html5lib.readthedocs.io/.
		Installation
		------------

		html5lib works on CPython2.7+, CPython 3.5+ and PyPy. To install:
		html5lib works on CPython3.8+ and PyPy. To install:

		.. code-block:: bash

Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		from __future__ import print_function, unicode_literals

		import platform
		import sys
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,5 +1,4 @@
		#!/usr/bin/env python3
		# -*- coding: utf-8 -*-
		#
		# html5lib documentation build configuration file, created by
		# sphinx-quickstart on Wed May 8 00:04:49 2013.
Expand Down
Expand Up		@@ -100,7 +99,7 @@
		}


		class CExtMock(object):
		class CExtMock:
		"""Required for autodoc on readthedocs.org where you cannot build C extensions."""
		def __init__(self, *args, **kwargs):
		pass
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -20,7 +20,6 @@
		* :func:`~.serializer.serialize`
		"""

		from __future__ import absolute_import, division, unicode_literals

		from .html5parser import HTMLParser, parse, parseFragment
		from .treebuilders import getTreeBuilder
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		from __future__ import absolute_import, division, unicode_literals

		import re
		import warnings
Expand Down
Expand Up		@@ -181,7 +180,7 @@ def escapeRegexp(string):
		nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")


		class InfosetFilter(object):
		class InfosetFilter:
		replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

		def __init__(self,
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		from __future__ import absolute_import, division, unicode_literals

		from six import text_type
		from six.moves import http_client, urllib
Expand Down
Expand Up		@@ -48,7 +47,7 @@
		charsUntilRegEx = {}


		class BufferedStream(object):
		class BufferedStream:
		"""Buffering for streams that do not have buffering of their own

		The buffer is implemented as a list of chunks on the assumption that
Expand Down
Expand Up		@@ -145,7 +144,7 @@ def HTMLInputStream(source, **kwargs):
		return HTMLBinaryInputStream(source, **kwargs)


		class HTMLUnicodeInputStream(object):
		class HTMLUnicodeInputStream:
		"""Provides a unicode stream of characters to the HTMLTokenizer.

		This class takes care of character encoding and removing or replacing
Expand Down
Expand Up		@@ -673,7 +672,7 @@ def jumpTo(self, bytes):
		return True


		class EncodingParser(object):
		class EncodingParser:
		"""Mini parser for detecting character encoding from meta elements"""

		def __init__(self, data):
Expand Down
Expand Up		@@ -861,7 +860,7 @@ def getAttribute(self):
		attrValue.append(c)


		class ContentAttrParser(object):
		class ContentAttrParser:
		def __init__(self, data):
		assert isinstance(data, bytes)
		self.data = data
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		from __future__ import absolute_import, division, unicode_literals

		from six import unichr as chr

Expand All		@@ -24,7 +23,7 @@
		attributeMap = OrderedDict


		class HTMLTokenizer(object):
		class HTMLTokenizer:
		""" This class takes care of tokenizing HTML.

		* self.currentToken
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@
		from __future__ import absolute_import, division, unicode_literals

		from .py import Trie

Expand Down
Original file line number	Diff line number	Diff line change
		@@ -1,9 +1,5 @@
		from __future__ import absolute_import, division, unicode_literals

		try:
		from collections.abc import Mapping
		except ImportError: # Python 2.7
		from collections import Mapping
		from collections.abc import Mapping


		class Trie(Mapping):
Expand Down