Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b579dba

Browse files
committed
#1486713: Add a tolerant mode to HTMLParser.
The motivation for adding this option is that the functionality it provides used to be provided by sgmllib in Python2, and was used by, for example, BeautifulSoup. Without this option, the Python3 version of BeautifulSoup and the many programs that use it are crippled. The original patch was by 'kxroberto'. I modified it heavily but kept his heuristics and test. I also added additional heuristics to fix #975556, #1046092, and part of #6191. This patch should be completely backward compatible: the behavior with the default strict=True is unchanged.
1 parent 79cdb66 commit b579dba

File tree

4 files changed

+139
-24
lines changed

4 files changed

+139
-24
lines changed

‎Doc/library/html.parser.rst

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,13 @@
1212
This module defines a class :class:`HTMLParser` which serves as the basis for
1313
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
1414

15-
..class::HTMLParser()
15+
.. class:: HTMLParser(strict=True)
1616

17-
The :class:`HTMLParser` class is instantiated without arguments.
17+
Create a parser instance. If *strict* is ``True`` (the default), invalid
18+
html results in :exc:`~html.parser.HTMLParseError` exceptions [#]_. If
19+
*strict* is ``False``, the parser uses heuristics to make a best guess at
20+
the intention of any invalid html it encounters, similar to the way most
21+
browsers do.
1822

1923
An :class:`HTMLParser` instance is fed HTML data and calls handler functions when tags
2024
begin and end. The :class:`HTMLParser` class is meant to be overridden by the
@@ -191,3 +195,8 @@ As a basic example, below is a very basic HTML parser that uses the
191195
Encountered a html end tag
192196

193197

198+
.. rubric:: Footnotes
199+
200+
.. [#] For backward compatibility reasons *strict* mode does not throw
201+
errors for all non-compliant HTML. That is, some invalid HTML
202+
is tolerated even in *strict* mode.

‎Lib/html/parser.py

Lines changed: 83 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,14 @@
2424
piclose=re.compile('>')
2525
commentclose=re.compile(r'--\s*>')
2626
tagfind=re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
27+
# Note, the strict one of this pair isn't really strict, but we can't
28+
# make it correctly strict without breaking backward compatibility.
2729
attrfind=re.compile(
2830
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
2931
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
30-
32+
attrfind_tolerant=re.compile(
33+
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
34+
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
3135
locatestarttagend=re.compile(r"""
3236
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
3337
(?:\s+ # whitespace before attribute name
@@ -42,6 +46,21 @@
4246
)*
4347
\s* # trailing whitespace
4448
""",re.VERBOSE)
49+
locatestarttagend_tolerant=re.compile(r"""
50+
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
51+
(?:\s* # optional whitespace before attribute name
52+
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
53+
(?:\s*=\s* # value indicator
54+
(?:'[^']*' # LITA-enclosed value
55+
|\"[^\"]*\" # LIT-enclosed value
56+
|[^'\">\s]+ # bare value
57+
)
58+
(?:\s*,)* # possibly followed by a comma
59+
)?
60+
)
61+
)*
62+
\s* # trailing whitespace
63+
""",re.VERBOSE)
4564
endendtag=re.compile('>')
4665
endtagfind=re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
4766

@@ -86,9 +105,15 @@ class HTMLParser(_markupbase.ParserBase):
86105

87106
CDATA_CONTENT_ELEMENTS= ("script","style")
88107

108+
def__init__(self,strict=True):
109+
"""Initialize and reset this instance.
89110
90-
def__init__(self):
91-
"""Initialize and reset this instance."""
111+
If strict is set to True (the default), errors are raised when invalid
112+
HTML is encountered. If set to False, an attempt is instead made to
113+
continue parsing, making "best guesses" about the intended meaning, in
114+
a fashion similar to what browsers typically do.
115+
"""
116+
self.strict=strict
92117
self.reset()
93118

94119
defreset(self):
@@ -160,9 +185,18 @@ def goahead(self, end):
160185
else:
161186
break
162187
ifk<0:
163-
ifend:
188+
ifnotend:
189+
break
190+
ifself.strict:
164191
self.error("EOF in middle of construct")
165-
break
192+
k=rawdata.find('>',i+1)
193+
ifk<0:
194+
k=rawdata.find('<',i+1)
195+
ifk<0:
196+
k=i+1
197+
else:
198+
k+=1
199+
self.handle_data(rawdata[i:k])
166200
i=self.updatepos(i,k)
167201
elifstartswith("&#",i):
168202
match=charref.match(rawdata,i)
@@ -193,7 +227,12 @@ def goahead(self, end):
193227
ifmatch:
194228
# match.group() will contain at least 2 chars
195229
ifendandmatch.group()==rawdata[i:]:
196-
self.error("EOF in middle of entity or char ref")
230+
ifself.strict:
231+
self.error("EOF in middle of entity or char ref")
232+
else:
233+
ifk<=i:
234+
k=n
235+
i=self.updatepos(i,i+1)
197236
# incomplete
198237
break
199238
elif (i+1)<n:
@@ -240,7 +279,10 @@ def parse_starttag(self, i):
240279
self.lasttag=tag=rawdata[i+1:k].lower()
241280

242281
whilek<endpos:
243-
m=attrfind.match(rawdata,k)
282+
ifself.strict:
283+
m=attrfind.match(rawdata,k)
284+
else:
285+
m=attrfind_tolerant.search(rawdata,k)
244286
ifnotm:
245287
break
246288
attrname,rest,attrvalue=m.group(1,2,3)
@@ -262,8 +304,11 @@ def parse_starttag(self, i):
262304
-self.__starttag_text.rfind("\n")
263305
else:
264306
offset=offset+len(self.__starttag_text)
265-
self.error("junk characters in start tag: %r"
266-
% (rawdata[k:endpos][:20],))
307+
ifself.strict:
308+
self.error("junk characters in start tag: %r"
309+
% (rawdata[k:endpos][:20],))
310+
self.handle_data(rawdata[i:endpos])
311+
returnendpos
267312
ifend.endswith('/>'):
268313
# XHTML-style empty tag: <span attr="value" />
269314
self.handle_startendtag(tag,attrs)
@@ -277,7 +322,10 @@ def parse_starttag(self, i):
277322
# or -1 if incomplete.
278323
defcheck_for_whole_start_tag(self,i):
279324
rawdata=self.rawdata
280-
m=locatestarttagend.match(rawdata,i)
325+
ifself.strict:
326+
m=locatestarttagend.match(rawdata,i)
327+
else:
328+
m=locatestarttagend_tolerant.match(rawdata,i)
281329
ifm:
282330
j=m.end()
283331
next=rawdata[j:j+1]
@@ -290,8 +338,13 @@ def check_for_whole_start_tag(self, i):
290338
# buffer boundary
291339
return-1
292340
# else bogus input
293-
self.updatepos(i,j+1)
294-
self.error("malformed empty start tag")
341+
ifself.strict:
342+
self.updatepos(i,j+1)
343+
self.error("malformed empty start tag")
344+
ifj>i:
345+
returnj
346+
else:
347+
returni+1
295348
ifnext=="":
296349
# end of input
297350
return-1
@@ -300,8 +353,13 @@ def check_for_whole_start_tag(self, i):
300353
# end of input in or before attribute value, or we have the
301354
# '/' from a '/>' ending
302355
return-1
303-
self.updatepos(i,j)
304-
self.error("malformed start tag")
356+
ifself.strict:
357+
self.updatepos(i,j)
358+
self.error("malformed start tag")
359+
ifj>i:
360+
returnj
361+
else:
362+
returni+1
305363
raiseAssertionError("we should not get here!")
306364

307365
# Internal -- parse endtag, return end or -1 if incomplete
@@ -314,7 +372,15 @@ def parse_endtag(self, i):
314372
j=match.end()
315373
match=endtagfind.match(rawdata,i)# </ + tag + >
316374
ifnotmatch:
317-
self.error("bad end tag: %r"% (rawdata[i:j],))
375+
ifself.strict:
376+
self.error("bad end tag: %r"% (rawdata[i:j],))
377+
k=rawdata.find('<',i+1,j)
378+
ifk>i:
379+
j=k
380+
ifj<=i:
381+
j=i+1
382+
self.handle_data(rawdata[i:j])
383+
returnj
318384
tag=match.group(1)
319385
self.handle_endtag(tag.lower())
320386
self.clear_cdata_mode()
@@ -358,7 +424,8 @@ def handle_pi(self, data):
358424
pass
359425

360426
defunknown_decl(self,data):
361-
self.error("unknown declaration: %r"% (data,))
427+
ifself.strict:
428+
self.error("unknown declaration: %r"% (data,))
362429

363430
# Internal -- helper to remove special character quoting
364431
entitydefs=None

‎Lib/test/test_htmlparser.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@
88

99
classEventCollector(html.parser.HTMLParser):
1010

11-
def__init__(self):
11+
def__init__(self,*args,**kw):
1212
self.events= []
1313
self.append=self.events.append
14-
html.parser.HTMLParser.__init__(self)
14+
html.parser.HTMLParser.__init__(self,*args,**kw)
1515

1616
defget_events(self):
1717
# Normalize the list of events so that buffer artefacts don't
@@ -72,8 +72,10 @@ def handle_starttag(self, tag, attrs):
7272

7373
classTestCaseBase(unittest.TestCase):
7474

75-
def_run_check(self,source,expected_events,collector=EventCollector):
76-
parser=collector()
75+
def_run_check(self,source,expected_events,collector=None):
76+
ifcollectorisNone:
77+
collector=EventCollector()
78+
parser=collector
7779
forsinsource:
7880
parser.feed(s)
7981
parser.close()
@@ -84,7 +86,7 @@ def _run_check(self, source, expected_events, collector=EventCollector):
8486
"\nReceived:\n"+pprint.pformat(events))
8587

8688
def_run_check_extra(self,source,events):
87-
self._run_check(source,events,EventCollectorExtra)
89+
self._run_check(source,events,EventCollectorExtra())
8890

8991
def_parse_error(self,source):
9092
defparse(source=source):
@@ -321,8 +323,42 @@ def test_entityrefs_in_attributes(self):
321323
])
322324

323325

326+
classHTMLParserTolerantTestCase(TestCaseBase):
327+
328+
defsetUp(self):
329+
self.collector=EventCollector(strict=False)
330+
331+
deftest_tolerant_parsing(self):
332+
self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
333+
'<img src="URL><//img></html</html>', [
334+
('data','<html '),
335+
('starttag','html', []),
336+
('data','te>>xt'),
337+
('entityref','a'),
338+
('data','<<bc'),
339+
('endtag','a'),
340+
('endtag','html'),
341+
('data','\n<img src="URL><//img></html'),
342+
('endtag','html')],
343+
collector=self.collector)
344+
345+
deftest_comma_between_attributes(self):
346+
self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
347+
'method="post">', [
348+
('starttag','form',
349+
[('action','/xxx.php?a=1&b=2&amp'),
350+
('method','post')])],
351+
collector=self.collector)
352+
353+
deftest_weird_chars_in_unquoted_attribute_values(self):
354+
self._run_check('<form action=bogus|&#()value>', [
355+
('starttag','form',
356+
[('action','bogus|&#()value')])],
357+
collector=self.collector)
358+
359+
324360
deftest_main():
325-
support.run_unittest(HTMLParserTestCase)
361+
support.run_unittest(HTMLParserTestCase,HTMLParserTolerantTestCase)
326362

327363

328364
if__name__=="__main__":

‎Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ Core and Builtins
5858
Library
5959
-------
6060

61+
- Issue #1486713: HTMLParser now has an optional tolerant mode where it
62+
tries to guess at the correct parsing of invalid html.
63+
6164
- Issue #10554: Add context manager support to subprocess.Popen objects.
6265

6366
- Issue #8989: email.utils.make_msgid now has a domain parameter that can

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp