Jul 14, 2024 · May 13, 2025 · May 13, 2025 · May 13, 2025 · May 13, 2025 · May 13, 2025
diff --git a/Lib/html/parser.py b/Lib/html/parser.py

 starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
 escapable_raw_text_close = re.compile('</(title|textarea)>', re.I)
 commentclose = re.compile(r'--\s*>')
 # Note:
 #  1) if you change tagfind/attrfind remember to update locatestarttagend too;
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")
    ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        self.escapable_raw_text_elem = None
        super().reset()

    def feed(self, data):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_escapable_raw_text_mode(self, elem):
        """Start tracking *elem* as the open escapable-raw-text element.

        The element name is stored lower-cased in
        ``self.escapable_raw_text_elem`` and ``self.interesting`` is replaced
        by a case-insensitive pattern that matches that element's closing
        tag, with optional whitespace on either side of the name.
        """
        tag_name = elem.lower()
        self.escapable_raw_text_elem = tag_name
        closing_tag_pattern = r'</\s*%s\s*>' % tag_name
        self.interesting = re.compile(closing_tag_pattern, re.I)

    def clear_escapable_raw_text_mode(self):
        """Stop tracking an escapable-raw-text element.

        Resets ``self.escapable_raw_text_elem`` to ``None`` and points
        ``self.interesting`` back at ``interesting_normal``.
        """
        self.escapable_raw_text_elem = None
        self.interesting = interesting_normal

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                if match:
                    j = match.start()
                else:
                    if self.escapable_raw_text_elem:
                        break
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS:
                self.set_escapable_raw_text_mode(tag)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        if self.escapable_raw_text_elem is not None: # title or textarea
            if elem != self.escapable_raw_text_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        self.clear_escapable_raw_text_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
        pass

    def unknown_decl(self, data):
        pass
        pass
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
            #'foo = </\nscript>',
            #'foo = </ script>',
        ]
        tags = ['script', 'style', 'textarea', 'title']
        # test the following 'casing' for each tag: script, SCRIPT, Script etc.
        elements = [f(tag) for tag in tags for f in (str.lower, str.upper, str.capitalize)]
        elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
        for content in contents:
            for element in elements:
                element_lower = element.lower()
                                ("endtag", element_lower)],
                            collector=Collector(convert_charrefs=False))

    def test_escapable_raw_text_content(self):
        contents = [
            'foo = "</TITLE" + ">";',
            'foo = <\n/title> ',
            '<!-- document.write("</scr" + "ipt>"); -->',
            '\n//<![CDATA[\n'
            '\n<!-- //\nvar foo = 3.14;\n// -->\n',
            # valid character reference
            '&#65;',
            # ambiguous ampersand example
            '&notaref',
            'foo = "</sty" + "le>";',
            '<!-- \u2603 -->',
            # these two should be invalid according to the HTML 5 spec,
            # section 8.1.2.2
            #'foo = </\nscript>',
            #'foo = </ script>',
        ]
        elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea']
        for content in contents:
            for element in elements:
                element_lower = element.lower()
                s = '<{element}>{content}</{element}>'.format(element=element,
                                                               content=content)
                self._run_check(s, [("starttag", element_lower, []),
                                    ("data", content),
                                    ("endtag", element_lower)])

    def test_EOF_in_cdata(self):
        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> <span></span></style>
                        ('starttag', 'script', []), ('data', text),
                        ('endtag', 'script'), ('data', '"'),
                        ('starttag', 'style', []), ('data', text),
                        ('endtag', 'style'), ('data', '"'),
                        ('starttag', 'title', []), ('data', text),
                        ('endtag', 'title'), ('data', '"'),
                        ('starttag', 'textarea', []), ('data', text),
                        ('endtag', 'textarea'), ('data', '"')]
                        ('endtag', 'style'), ('data', '"')]
            self._run_check('{1}<script>{0}</script>{1}'
                            '<style>{0}</style>{1}'
                            '<title>{0}</title>{1}'
                            '<textarea>{0}</textarea>{1}'.format(text, charref),
                            '<style>{0}</style>{1}'.format(text, charref),
                            expected, collector=collector())
        # check truncated charrefs at the end of the file
        html = '&quo &# &#x'


 if __name__ == "__main__":
    unittest.main()
    unittest.main()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -28,7 +28,6 @@

		starttagopen = re.compile('<[a-zA-Z]')
		piclose = re.compile('>')
		escapable_raw_text_close = re.compile('</(title|textarea)>', re.I)
		commentclose = re.compile(r'--\s*>')
		# Note:
		# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
Expand DownExpand Up		@@ -101,7 +100,6 @@ class HTMLParser(_markupbase.ParserBase):
		"""

		CDATA_CONTENT_ELEMENTS = ("script", "style")
		ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea")

		def __init__(self, *, convert_charrefs=True):
		"""Initialize and reset this instance.
Expand All		@@ -119,7 +117,6 @@ def reset(self):
		self.lasttag = '???'
		self.interesting = interesting_normal
		self.cdata_elem = None
		self.escapable_raw_text_elem = None
		super().reset()

		def feed(self, data):
Expand All		@@ -141,14 +138,6 @@ def get_starttag_text(self):
		"""Return full source of start tag: '<...>'."""
		return self.__starttag_text

		def set_escapable_raw_text_mode(self, elem):
		self.escapable_raw_text_elem = elem.lower()
		self.interesting = re.compile(r'</\s*%s\s*>' % self.escapable_raw_text_elem, re.I)

		def clear_escapable_raw_text_mode(self):
		self.interesting = interesting_normal
		self.escapable_raw_text_elem = None

		def set_cdata_mode(self, elem):
		self.cdata_elem = elem.lower()
		self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
Expand All		@@ -165,7 +154,7 @@ def goahead(self, end):
		i = 0
		n = len(rawdata)
		while i < n:
		if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
		if self.convert_charrefs and not self.cdata_elem:
		j = rawdata.find('<', i)
		if j < 0:
		# if we can't find the next <, either we are at the end
Expand All		@@ -184,13 +173,11 @@ def goahead(self, end):
		if match:
		j = match.start()
		else:
		if self.escapable_raw_text_elem:
		break
		if self.cdata_elem:
		break
		j = n
		if i < j:
		if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
		if self.convert_charrefs and not self.cdata_elem:
		self.handle_data(unescape(rawdata[i:j]))
		else:
		self.handle_data(rawdata[i:j])
Expand DownExpand Up		@@ -367,8 +354,6 @@ def parse_starttag(self, i):
		self.handle_startendtag(tag, attrs)
		else:
		self.handle_starttag(tag, attrs)
		if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS:
		self.set_escapable_raw_text_mode(tag)
		if tag in self.CDATA_CONTENT_ELEMENTS:
		self.set_cdata_mode(tag)
		return endpos
Expand DownExpand Up		@@ -444,14 +429,8 @@ def parse_endtag(self, i):
		self.handle_data(rawdata[i:gtpos])
		return gtpos

		if self.escapable_raw_text_elem is not None: # title or textarea
		if elem != self.escapable_raw_text_elem:
		self.handle_data(rawdata[i:gtpos])
		return gtpos

		self.handle_endtag(elem)
		self.clear_cdata_mode()
		self.clear_escapable_raw_text_mode()
		return gtpos

		# Overridable -- finish processing of start+end tag: <tag.../>
Expand DownExpand Up		@@ -492,4 +471,4 @@ def handle_pi(self, data):
		pass

		def unknown_decl(self, data):
		pass
		pass
Original file line number	Diff line number	Diff line change
Expand Up		@@ -285,9 +285,7 @@ def test_cdata_content(self):
		#'foo = </\nscript>',
		#'foo = </ script>',
		]
		tags = ['script', 'style', 'textarea', 'title']
		# test the following 'casing' for each tag: script, SCRIPT, Script etc.
		elements = [f(tag) for tag in tags for f in (str.lower, str.upper, str.capitalize)]
		elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
		for content in contents:
		for element in elements:
		element_lower = element.lower()
Expand DownExpand Up		@@ -319,34 +317,6 @@ def get_events(self):
		("endtag", element_lower)],
		collector=Collector(convert_charrefs=False))

		def test_escapable_raw_text_content(self):
		contents = [
		'foo = "</TITLE" + ">";',
		'foo = <\n/title> ',
		'<!-- document.write("</scr" + "ipt>"); -->',
		'\n//<![CDATA[\n'
		'\n<!-- //\nvar foo = 3.14;\n// -->\n',
		# valid character reference
		'&#65;',
		# ambiguous ampersand example
		'&notaref',
		'foo = "</sty" + "le>";',
		'<!-- \u2603 -->',
		# these two should be invalid according to the HTML 5 spec,
		# section 8.1.2.2
		#'foo = </\nscript>',
		#'foo = </ script>',
		]
		elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea']
		for content in contents:
		for element in elements:
		element_lower = element.lower()
		s = '<{element}>{content}</{element}>'.format(element=element,
		content=content)
		self._run_check(s, [("starttag", element_lower, []),
		("data", content),
		("endtag", element_lower)])

		def test_EOF_in_cdata(self):
		content = """<!-- not a comment --> &not-an-entity-ref;
		<a href="" /> </p><p> <span></span></style>
Expand DownExpand Up		@@ -407,15 +377,9 @@ def test_convert_charrefs(self):
		('starttag', 'script', []), ('data', text),
		('endtag', 'script'), ('data', '"'),
		('starttag', 'style', []), ('data', text),
		('endtag', 'style'), ('data', '"'),
		('starttag', 'title', []), ('data', text),
		('endtag', 'title'), ('data', '"'),
		('starttag', 'textarea', []), ('data', text),
		('endtag', 'textarea'), ('data', '"')]
		('endtag', 'style'), ('data', '"')]
		self._run_check('{1}<script>{0}</script>{1}'
		'<style>{0}</style>{1}'
		'<title>{0}</title>{1}'
		'<textarea>{0}</textarea>{1}'.format(text, charref),
		'<style>{0}</style>{1}'.format(text, charref),
		expected, collector=collector())
		# check truncated charrefs at the end of the file
		html = '&quo &# &#x'
Expand DownExpand Up		@@ -922,4 +886,4 @@ def test_base_class_methods_called(self, super_reset_method, super_init_method):


		if __name__ == "__main__":
		unittest.main()
		unittest.main()