NotificationsYou must be signed in to change notification settings
Fork32.3k
Star67.8k

Commitb579dba

committed

#1486713: Add a tolerant mode to HTMLParser.

The motivation for adding this option is that the functionality it provides used to be provided by sgmllib in Python2, and was used by, for example, BeautifulSoup. Without this option, the Python3 version of BeautifulSoup and the many programs that use it are crippled. The original patch was by 'kxroberto'. I modified it heavily but kept his heuristics and test. I also added additional heuristics to fix #975556, #1046092, and part of #6191. This patch should be completely backward compatible: the behavior with the default strict=True is unchanged.

1 parent79cdb66 commitb579dbaCopy full SHA for b579dba

File tree

4 files changed

+139

-24

lines changed

Doc/library
- html.parser.rst
Lib
- html
  - parser.py
- test
  - test_htmlparser.py
Misc
- NEWS

4 files changed

+139

-24

lines changed

`‎Doc/library/html.parser.rst`

Lines changed: 11 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -12,9 +12,13 @@`
`12`	`12`	This module defines a class :class:`HTMLParser` which serves as the basis for
`13`	`13`	`parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.`
`14`	`14`
`15`		`-.. class:: HTMLParser()`
	`15`	`+.. class:: HTMLParser(strict=True)`
`16`	`16`
`17`		- The :class:`HTMLParser` class is instantiated without arguments.
	`17`	+ Create a parser instance. If strict is ``True`` (the default), invalid
	`18`	+ html results in :exc:`~html.parser.HTMLParseError` exceptions [#]_. If
	`19`	+ strict is ``False``, the parser uses heuristics to make a best guess at
	`20`	`+ the intention of any invalid html it encounters, similar to the way most`
	`21`	`+ browsers do.`
`18`	`22`
`19`	`23`	An :class:`HTMLParser` instance is fed HTML data and calls handler functions when tags
`20`	`24`	begin and end. The :class:`HTMLParser` class is meant to be overridden by the
`@@ -191,3 +195,8 @@ As a basic example, below is a very basic HTML parser that uses the`
`191`	`195`	`Encountered a html end tag`
`192`	`196`
`193`	`197`
	`198`	`+.. rubric:: Footnotes`
	`199`	`+`
	`200`	`+.. [#] For backward compatibility reasons strict mode does not throw`
	`201`	`+ errors for all non-compliant HTML. That is, some invalid HTML`
	`202`	`+ is tolerated even in strict mode.`

`‎Lib/html/parser.py`

Lines changed: 83 additions & 16 deletions

Original file line number	Diff line number	Diff line change
`@@ -24,10 +24,14 @@`
`24`	`24`	`piclose=re.compile('>')`
`25`	`25`	`commentclose=re.compile(r'--\s*>')`
`26`	`26`	`tagfind=re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')`
	`27`	`+# Note, the strict one of this pair isn't really strict, but we can't`
	`28`	`+# make it correctly strict without breaking backward compatibility.`
`27`	`29`	`attrfind=re.compile(`
`28`	`30`	`r'\s([a-zA-Z_][-.:a-zA-Z_0-9])(\s=\s'`
`29`	`31`	`r'(\'[^\']\'\|"[^"]"\|[-a-zA-Z0-9./,:;+%?!&$_#=~@]))?')`
`30`		`-`
	`32`	`+attrfind_tolerant=re.compile(`
	`33`	`+r'\s([a-zA-Z_][-.:a-zA-Z_0-9])(\s=\s'`
	`34`	`+r'(\'[^\']\'\|"[^"]"\|[^>\s]*))?')`
`31`	`35`	`locatestarttagend=re.compile(r"""`
`32`	`36`	`<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name`
`33`	`37`	`(?:\s+ # whitespace before attribute name`
`@@ -42,6 +46,21 @@`
`42`	`46`	`)*`
`43`	`47`	`\s* # trailing whitespace`
`44`	`48`	`""",re.VERBOSE)`
	`49`	`+locatestarttagend_tolerant=re.compile(r"""`
	`50`	`+ <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name`
	`51`	`+ (?:\s* # optional whitespace before attribute name`
	`52`	`+ (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name`
	`53`	`+ (?:\s=\s # value indicator`
	`54`	`+ (?:'[^']*' # LITA-enclosed value`
	`55`	`+ \|\"[^\"]*\" # LIT-enclosed value`
	`56`	`+ \|[^'\">\s]+ # bare value`
	`57`	`+ )`
	`58`	`+ (?:\s,) # possibly followed by a comma`
	`59`	`+ )?`
	`60`	`+ )`
	`61`	`+ )*`
	`62`	`+ \s* # trailing whitespace`
	`63`	`+""",re.VERBOSE)`
`45`	`64`	`endendtag=re.compile('>')`
`46`	`65`	`endtagfind=re.compile('</\s([a-zA-Z][-.a-zA-Z0-9:_])\s*>')`
`47`	`66`
`@@ -86,9 +105,15 @@ class HTMLParser(_markupbase.ParserBase):`
`86`	`105`
`87`	`106`	`CDATA_CONTENT_ELEMENTS= ("script","style")`
`88`	`107`
	`108`	`+def__init__(self,strict=True):`
	`109`	`+"""Initialize and reset this instance.`
`89`	`110`
`90`		`-def__init__(self):`
`91`		`-"""Initialize and reset this instance."""`
	`111`	`+ If strict is set to True (the default), errors are raised when invalid`
	`112`	`+ HTML is encountered. If set to False, an attempt is instead made to`
	`113`	`+ continue parsing, making "best guesses" about the intended meaning, in`
	`114`	`+ a fashion similar to what browsers typically do.`
	`115`	`+ """`
	`116`	`+self.strict=strict`
`92`	`117`	`self.reset()`
`93`	`118`
`94`	`119`	`defreset(self):`
`@@ -160,9 +185,18 @@ def goahead(self, end):`
`160`	`185`	`else:`
`161`	`186`	`break`
`162`	`187`	`ifk<0:`
`163`		`-ifend:`
	`188`	`+ifnotend:`
	`189`	`+break`
	`190`	`+ifself.strict:`
`164`	`191`	`self.error("EOF in middle of construct")`
`165`		`-break`
	`192`	`+k=rawdata.find('>',i+1)`
	`193`	`+ifk<0:`
	`194`	`+k=rawdata.find('<',i+1)`
	`195`	`+ifk<0:`
	`196`	`+k=i+1`
	`197`	`+else:`
	`198`	`+k+=1`
	`199`	`+self.handle_data(rawdata[i:k])`
`166`	`200`	`i=self.updatepos(i,k)`
`167`	`201`	`elifstartswith("&#",i):`
`168`	`202`	`match=charref.match(rawdata,i)`
`@@ -193,7 +227,12 @@ def goahead(self, end):`
`193`	`227`	`ifmatch:`
`194`	`228`	`# match.group() will contain at least 2 chars`
`195`	`229`	`ifendandmatch.group()==rawdata[i:]:`
`196`		`-self.error("EOF in middle of entity or char ref")`
	`230`	`+ifself.strict:`
	`231`	`+self.error("EOF in middle of entity or char ref")`
	`232`	`+else:`
	`233`	`+ifk<=i:`
	`234`	`+k=n`
	`235`	`+i=self.updatepos(i,i+1)`
`197`	`236`	`# incomplete`
`198`	`237`	`break`
`199`	`238`	`elif (i+1)<n:`
`@@ -240,7 +279,10 @@ def parse_starttag(self, i):`
`240`	`279`	`self.lasttag=tag=rawdata[i+1:k].lower()`
`241`	`280`
`242`	`281`	`whilek<endpos:`
`243`		`-m=attrfind.match(rawdata,k)`
	`282`	`+ifself.strict:`
	`283`	`+m=attrfind.match(rawdata,k)`
	`284`	`+else:`
	`285`	`+m=attrfind_tolerant.search(rawdata,k)`
`244`	`286`	`ifnotm:`
`245`	`287`	`break`
`246`	`288`	`attrname,rest,attrvalue=m.group(1,2,3)`
`@@ -262,8 +304,11 @@ def parse_starttag(self, i):`
`262`	`304`	`-self.__starttag_text.rfind("\n")`
`263`	`305`	`else:`
`264`	`306`	`offset=offset+len(self.__starttag_text)`
`265`		`-self.error("junk characters in start tag: %r"`
`266`		`-% (rawdata[k:endpos][:20],))`
	`307`	`+ifself.strict:`
	`308`	`+self.error("junk characters in start tag: %r"`
	`309`	`+% (rawdata[k:endpos][:20],))`
	`310`	`+self.handle_data(rawdata[i:endpos])`
	`311`	`+returnendpos`
`267`	`312`	`ifend.endswith('/>'):`
`268`	`313`	`# XHTML-style empty tag: <span attr="value" />`
`269`	`314`	`self.handle_startendtag(tag,attrs)`
`@@ -277,7 +322,10 @@ def parse_starttag(self, i):`
`277`	`322`	`# or -1 if incomplete.`
`278`	`323`	`defcheck_for_whole_start_tag(self,i):`
`279`	`324`	`rawdata=self.rawdata`
`280`		`-m=locatestarttagend.match(rawdata,i)`
	`325`	`+ifself.strict:`
	`326`	`+m=locatestarttagend.match(rawdata,i)`
	`327`	`+else:`
	`328`	`+m=locatestarttagend_tolerant.match(rawdata,i)`
`281`	`329`	`ifm:`
`282`	`330`	`j=m.end()`
`283`	`331`	`next=rawdata[j:j+1]`
`@@ -290,8 +338,13 @@ def check_for_whole_start_tag(self, i):`
`290`	`338`	`# buffer boundary`
`291`	`339`	`return-1`
`292`	`340`	`# else bogus input`
`293`		`-self.updatepos(i,j+1)`
`294`		`-self.error("malformed empty start tag")`
	`341`	`+ifself.strict:`
	`342`	`+self.updatepos(i,j+1)`
	`343`	`+self.error("malformed empty start tag")`
	`344`	`+ifj>i:`
	`345`	`+returnj`
	`346`	`+else:`
	`347`	`+returni+1`
`295`	`348`	`ifnext=="":`
`296`	`349`	`# end of input`
`297`	`350`	`return-1`
`@@ -300,8 +353,13 @@ def check_for_whole_start_tag(self, i):`
`300`	`353`	`# end of input in or before attribute value, or we have the`
`301`	`354`	`# '/' from a '/>' ending`
`302`	`355`	`return-1`
`303`		`-self.updatepos(i,j)`
`304`		`-self.error("malformed start tag")`
	`356`	`+ifself.strict:`
	`357`	`+self.updatepos(i,j)`
	`358`	`+self.error("malformed start tag")`
	`359`	`+ifj>i:`
	`360`	`+returnj`
	`361`	`+else:`
	`362`	`+returni+1`
`305`	`363`	`raiseAssertionError("we should not get here!")`
`306`	`364`
`307`	`365`	`# Internal -- parse endtag, return end or -1 if incomplete`
`@@ -314,7 +372,15 @@ def parse_endtag(self, i):`
`314`	`372`	`j=match.end()`
`315`	`373`	`match=endtagfind.match(rawdata,i)# </ + tag + >`
`316`	`374`	`ifnotmatch:`
`317`		`-self.error("bad end tag: %r"% (rawdata[i:j],))`
	`375`	`+ifself.strict:`
	`376`	`+self.error("bad end tag: %r"% (rawdata[i:j],))`
	`377`	`+k=rawdata.find('<',i+1,j)`
	`378`	`+ifk>i:`
	`379`	`+j=k`
	`380`	`+ifj<=i:`
	`381`	`+j=i+1`
	`382`	`+self.handle_data(rawdata[i:j])`
	`383`	`+returnj`
`318`	`384`	`tag=match.group(1)`
`319`	`385`	`self.handle_endtag(tag.lower())`
`320`	`386`	`self.clear_cdata_mode()`
`@@ -358,7 +424,8 @@ def handle_pi(self, data):`
`358`	`424`	`pass`
`359`	`425`
`360`	`426`	`defunknown_decl(self,data):`
`361`		`-self.error("unknown declaration: %r"% (data,))`
	`427`	`+ifself.strict:`
	`428`	`+self.error("unknown declaration: %r"% (data,))`
`362`	`429`
`363`	`430`	`# Internal -- helper to remove special character quoting`
`364`	`431`	`entitydefs=None`

`‎Lib/test/test_htmlparser.py`

Lines changed: 42 additions & 6 deletions

Original file line number	Diff line number	Diff line change
`@@ -8,10 +8,10 @@`
`8`	`8`
`9`	`9`	`classEventCollector(html.parser.HTMLParser):`
`10`	`10`
`11`		`-def__init__(self):`
	`11`	`+def__init__(self,args,*kw):`
`12`	`12`	`self.events= []`
`13`	`13`	`self.append=self.events.append`
`14`		`-html.parser.HTMLParser.__init__(self)`
	`14`	`+html.parser.HTMLParser.__init__(self,args,*kw)`
`15`	`15`
`16`	`16`	`defget_events(self):`
`17`	`17`	`# Normalize the list of events so that buffer artefacts don't`
`@@ -72,8 +72,10 @@ def handle_starttag(self, tag, attrs):`
`72`	`72`
`73`	`73`	`classTestCaseBase(unittest.TestCase):`
`74`	`74`
`75`		`-def_run_check(self,source,expected_events,collector=EventCollector):`
`76`		`-parser=collector()`
	`75`	`+def_run_check(self,source,expected_events,collector=None):`
	`76`	`+ifcollectorisNone:`
	`77`	`+collector=EventCollector()`
	`78`	`+parser=collector`
`77`	`79`	`forsinsource:`
`78`	`80`	`parser.feed(s)`
`79`	`81`	`parser.close()`
`@@ -84,7 +86,7 @@ def _run_check(self, source, expected_events, collector=EventCollector):`
`84`	`86`	`"\nReceived:\n"+pprint.pformat(events))`
`85`	`87`
`86`	`88`	`def_run_check_extra(self,source,events):`
`87`		`-self._run_check(source,events,EventCollectorExtra)`
	`89`	`+self._run_check(source,events,EventCollectorExtra())`
`88`	`90`
`89`	`91`	`def_parse_error(self,source):`
`90`	`92`	`defparse(source=source):`
`@@ -321,8 +323,42 @@ def test_entityrefs_in_attributes(self):`
`321`	`323`	`])`
`322`	`324`
`323`	`325`
	`326`	`+classHTMLParserTolerantTestCase(TestCaseBase):`
	`327`	`+`
	`328`	`+defsetUp(self):`
	`329`	`+self.collector=EventCollector(strict=False)`
	`330`	`+`
	`331`	`+deftest_tolerant_parsing(self):`
	`332`	`+self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'`
	`333`	`+'<img src="URL><//img></html</html>', [`
	`334`	`+ ('data','<html '),`
	`335`	`+ ('starttag','html', []),`
	`336`	`+ ('data','te>>xt'),`
	`337`	`+ ('entityref','a'),`
	`338`	`+ ('data','<<bc'),`
	`339`	`+ ('endtag','a'),`
	`340`	`+ ('endtag','html'),`
	`341`	`+ ('data','\n<img src="URL><//img></html'),`
	`342`	`+ ('endtag','html')],`
	`343`	`+collector=self.collector)`
	`344`	`+`
	`345`	`+deftest_comma_between_attributes(self):`
	`346`	`+self._run_check('<form action="/xxx.php?a=1&b=2&amp", '`
	`347`	`+'method="post">', [`
	`348`	`+ ('starttag','form',`
	`349`	`+ [('action','/xxx.php?a=1&b=2&amp'),`
	`350`	`+ ('method','post')])],`
	`351`	`+collector=self.collector)`
	`352`	`+`
	`353`	`+deftest_weird_chars_in_unquoted_attribute_values(self):`
	`354`	`+self._run_check('<form action=bogus\|&#()value>', [`
	`355`	`+ ('starttag','form',`
	`356`	`+ [('action','bogus\|&#()value')])],`
	`357`	`+collector=self.collector)`
	`358`	`+`
	`359`	`+`
`324`	`360`	`deftest_main():`
`325`		`-support.run_unittest(HTMLParserTestCase)`
	`361`	`+support.run_unittest(HTMLParserTestCase,HTMLParserTolerantTestCase)`
`326`	`362`
`327`	`363`
`328`	`364`	`if__name__=="__main__":`

`‎Misc/NEWS`

Lines changed: 3 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -58,6 +58,9 @@ Core and Builtins`
`58`	`58`	`Library`
`59`	`59`	`-------`
`60`	`60`
	`61`	`+- Issue #1486713: HTMLParser now has an optional tolerant mode where it`
	`62`	`+ tries to guess at the correct parsing of invalid html.`
	`63`	`+`
`61`	`64`	`- Issue #10554: Add context manager support to subprocess.Popen objects.`
`62`	`65`
`63`	`66`	`- Issue #8989: email.utils.make_msgid now has a domain parameter that can`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commitb579dba

File tree

4 files changed

4 files changed

`‎Doc/library/html.parser.rst`

`‎Lib/html/parser.py`

`‎Lib/test/test_htmlparser.py`

`‎Misc/NEWS`

0 commit comments