Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

[3.12] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) #135483

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
ambv merged 1 commit into python:3.12 from serhiy-storchaka:backport-6eb6c5d-3.12
Jul 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions Lib/html/parser.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -25,6 +25,7 @@
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
endtagopen = re.compile('</[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
Expand DownExpand Up@@ -177,25 +178,43 @@ def goahead(self, end):
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
elif (i + 1) < n:
elif (i + 1) < n or end:
self.handle_data("<")
k = i + 1
else:
break
if k < 0:
if not end:
break
k = rawdata.find('>', i + 1)
if k < 0:
k = rawdata.find('<', i + 1)
if k < 0:
k = i + 1
else:
k += 1
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:k]))
if starttagopen.match(rawdata, i): # < + letter
pass
elif startswith("</", i):
if i + 2 == n:
self.handle_data("</")
elif endtagopen.match(rawdata, i): # </ + letter
pass
else:
# bogus comment
self.handle_comment(rawdata[i+2:])
elif startswith("<!--", i):
j = n
for suffix in ("--!", "--", "-"):
if rawdata.endswith(suffix, i+4):
j -= len(suffix)
break
self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i):
self.unknown_decl(rawdata[i+3:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
elif startswith("<!", i):
# bogus comment
self.handle_comment(rawdata[i+2:])
elif startswith("<?", i):
self.handle_pi(rawdata[i+2:])
else:
self.handle_data(rawdata[i:k])
raise AssertionError("we should not get here!")
k = n
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)
Expand Down
94 changes: 82 additions & 12 deletions Lib/test/test_htmlparser.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -5,6 +5,7 @@
importunittest

fromunittest.mockimportpatch
fromtestimportsupport


classEventCollector(html.parser.HTMLParser):
Expand DownExpand Up@@ -393,28 +394,34 @@ def test_tolerant_parsing(self):
('data','<'),
('starttag','bc<', [('a',None)]),
('endtag','html'),
('data','\n<img src="URL>'),
('comment','/img'),
('endtag','html<')])
('data','\n')])

deftest_starttag_junk_chars(self):
self._run_check("<", [('data','<')])
self._run_check("<>", [('data','<>')])
self._run_check("< >", [('data','< >')])
self._run_check("< ", [('data','< ')])
self._run_check("</>", [])
self._run_check("<$>", [('data','<$>')])
self._run_check("</$>", [('comment','$')])
self._run_check("</", [('data','</')])
self._run_check("</a", [('data','</a')])
self._run_check("</a", [])
self._run_check("</ a>", [('endtag','a')])
self._run_check("</ a", [('comment',' a')])
self._run_check("<a<a>", [('starttag','a<a', [])])
self._run_check("</a<a>", [('endtag','a<a')])
self._run_check("<!", [('data','<!')])
self._run_check("<a", [('data','<a')])
self._run_check("<a foo='bar'", [('data',"<a foo='bar'")])
self._run_check("<a foo='bar", [('data',"<a foo='bar")])
self._run_check("<a foo='>'", [('data',"<a foo='>'")])
self._run_check("<a foo='>", [('data',"<a foo='>")])
self._run_check("<!", [('comment','')])
self._run_check("<a", [])
self._run_check("<a foo='bar'", [])
self._run_check("<a foo='bar", [])
self._run_check("<a foo='>'", [])
self._run_check("<a foo='>", [])
self._run_check("<a$>", [('starttag','a$', [])])
self._run_check("<a$b>", [('starttag','a$b', [])])
self._run_check("<a$b/>", [('startendtag','a$b', [])])
self._run_check("<a$b >", [('starttag','a$b', [])])
self._run_check("<a$b />", [('startendtag','a$b', [])])
self._run_check("</a$b>", [('endtag','a$b')])

deftest_slashes_in_starttag(self):
self._run_check('<a foo="var"/>', [('startendtag','a', [('foo','var')])])
Expand DownExpand Up@@ -539,13 +546,56 @@ def test_EOF_in_charref(self):
forhtml,expectedindata:
self._run_check(html,expected)

deftest_broken_comments(self):
html= ('<! not really a comment >'
deftest_eof_in_comments(self):
data= [
('<!--', [('comment','')]),
('<!---', [('comment','')]),
('<!----', [('comment','')]),
('<!-----', [('comment','-')]),
('<!------', [('comment','--')]),
('<!----!', [('comment','')]),
('<!---!', [('comment','-!')]),
('<!---!>', [('comment','-!>')]),
('<!--foo', [('comment','foo')]),
('<!--foo-', [('comment','foo')]),
('<!--foo--', [('comment','foo')]),
('<!--foo--!', [('comment','foo')]),
('<!--<!--', [('comment','<!')]),
('<!--<!--!', [('comment','<!')]),
]
forhtml,expectedindata:
self._run_check(html,expected)

deftest_eof_in_declarations(self):
data= [
('<!', [('comment','')]),
('<!-', [('comment','-')]),
('<![', [('comment','[')]),
('<![CDATA[', [('unknown decl','CDATA[')]),
('<![CDATA[x', [('unknown decl','CDATA[x')]),
('<![CDATA[x]', [('unknown decl','CDATA[x]')]),
('<![CDATA[x]]', [('unknown decl','CDATA[x]]')]),
('<!DOCTYPE', [('decl','DOCTYPE')]),
('<!DOCTYPE ', [('decl','DOCTYPE ')]),
('<!DOCTYPE html', [('decl','DOCTYPE html')]),
('<!DOCTYPE html ', [('decl','DOCTYPE html ')]),
('<!DOCTYPE html PUBLIC', [('decl','DOCTYPE html PUBLIC')]),
('<!DOCTYPE html PUBLIC "foo', [('decl','DOCTYPE html PUBLIC "foo')]),
('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
[('decl','DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
]
forhtml,expectedindata:
self._run_check(html,expected)

deftest_bogus_comments(self):
html= ('<!ELEMENT br EMPTY>'
'<! not really a comment >'
'<! not a comment either -->'
'<! -- close enough -->'
'<!><!<-- this was an empty comment>'
'<!!! another bogus comment !!!>')
expected= [
('comment','ELEMENT br EMPTY'),
('comment',' not really a comment '),
('comment',' not a comment either --'),
('comment',' -- close enough --'),
Expand DownExpand Up@@ -600,6 +650,26 @@ def test_convert_charrefs_dropped_text(self):
('endtag','a'), ('data',' bar & baz')]
)

@support.requires_resource('cpu')
deftest_eof_no_quadratic_complexity(self):
# Each of these examples used to take about an hour.
# Now they take a fraction of a second.
defcheck(source):
parser=html.parser.HTMLParser()
parser.feed(source)
parser.close()
n=120_000
check("<a "*n)
check("<a a="*n)
check("</a "*14*n)
check("</a a="*11*n)
check("<!--"*4*n)
check("<!"*60*n)
check("<?"*19*n)
check("</$"*15*n)
check("<![CDATA["*9*n)
check("<!doctype"*35*n)


classAttributesTestCase(TestCaseBase):

Expand Down
View file
Open in desktop
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
Fix quadratic complexity in processing specially crafted input in
:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
to the HTML5 specs -- comments and declarations are automatically closed,
and incomplete tags are ignored.
Loading

[8]ページ先頭

©2009-2025 Movatter.jp