Commit 0243f97

serhiy-storchaka

and

ezio-melotti

authored

gh-135661: Fix parsing start and end tags in HTMLParser according to the HTML5 standard (GH-135930)

* Whitespaces are no longer accepted between `</` and the tag name. E.g. `</ script>` does not end the script section.
* Vertical tabulation (`\v`) and non-ASCII whitespaces are no longer recognized as whitespaces. The only whitespaces are `\t\n\r\f `.
* Null character (U+0000) no longer ends the tag name.
* Attributes and slashes after the tag name in end tags are now ignored, instead of terminating after the first `>` in a quoted attribute value. E.g. `</script/foo=">"/>`.
* Multiple slashes and whitespaces between the last attribute and the closing `>` are now ignored in both start and end tags. E.g. `<a foo=bar/ //>`.
* Multiple `=` between attribute name and value are no longer collapsed. E.g. `<a foo==bar>` produces attribute "foo" with value "=bar".
* Whitespaces between the `=` separator and attribute name or value are no longer ignored. E.g. `<a foo =bar>` produces two attributes "foo" and "=bar", both with value None; `<a foo= bar>` produces two attributes: "foo" with value "" and "bar" with value None.
* Fix Sphinx errors.
* Apply suggestions from code review

  Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
* Address review comments.
* Move to Security.

---------

Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>

1 parent 938a5d7, commit 0243f97. Copy full SHA for 0243f97

File tree

3 files changed

+194

-129

lines changed

Lib
- html
  - parser.py
- test
  - test_htmlparser.py
Misc/NEWS.d/next/Security
- 2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst

3 files changed

+194

-129

lines changed

`‎Lib/html/parser.py`

Lines changed: 69 additions & 74 deletions

Original file line number	Diff line number	Diff line change
`@@ -31,15 +31,43 @@`
`31`	`31`	`piclose=re.compile('>')`
`32`	`32`	`commentclose=re.compile(r'--\s*>')`
`33`	`33`	`# Note:`
`34`		`-# 1) if you change tagfind/attrfind remember to updatelocatestarttagend too;`
`35`		`-# 2) if you change tagfind/attrfind and/orlocatestarttagend the parser will`
	`34`	`+# 1) if you change tagfind/attrfind remember to updatelocatetagend too;`
	`35`	`+# 2) if you change tagfind/attrfind and/orlocatetagend the parser will`
`36`	`36`	`# explode, so don't do it.`
`37`		`-# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state`
`38`		`-# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state`
`39`		`-tagfind_tolerant=re.compile(r'([a-zA-Z][^\t\n\r\f />\x00])(?:\s\|/(?!>))')`
`40`		`-attrfind_tolerant=re.compile(`
`41`		`-r'((?<=[\'"\s/])[^\s/>][^\s/=>])(\s=+\s*'`
`42`		`-r'(\'[^\']\'\|"[^"]"\|(?![\'"])[^>\s]))?(?:\s\|/(?!>))')`
	`37`	`+# see the HTML5 specs section "13.2.5.6 Tag open state",`
	`38`	`+# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".`
	`39`	`+# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state`
	`40`	`+# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state`
	`41`	`+# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state`
	`42`	`+tagfind_tolerant=re.compile(r'([a-zA-Z][^\t\n\r\f />])(?:[\t\n\r\f ]\|/(?!>))')`
	`43`	`+attrfind_tolerant=re.compile(r"""`
	`44`	`+ (`
	`45`	`+ (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name`
	`46`	`+ )`
	`47`	`+ (= # value indicator`
	`48`	`+ ('[^']*' # LITA-enclosed value`
	`49`	`+ \|"[^"]*" # LIT-enclosed value`
	`50`	`+ \|(?!['"])[^>\t\n\r\f ]* # bare value`
	`51`	`+ )`
	`52`	`+ )?`
	`53`	`+ (?:[\t\n\r\f ]\|/(?!>))* # possibly followed by a space`
	`54`	`+""",re.VERBOSE)`
	`55`	`+locatetagend=re.compile(r"""`
	`56`	`+ [a-zA-Z][^\t\n\r\f />]* # tag name`
	`57`	`+ [\t\n\r\f /]* # optional whitespace before attribute name`
	`58`	`+ (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name`
	`59`	`+ (?:= # value indicator`
	`60`	`+ (?:'[^']*' # LITA-enclosed value`
	`61`	`+ \|"[^"]*" # LIT-enclosed value`
	`62`	`+ \|(?!['"])[^>\t\n\r\f ]* # bare value`
	`63`	`+ )`
	`64`	`+ )?`
	`65`	`+ [\t\n\r\f /]* # possibly followed by a space`
	`66`	`+ )*`
	`67`	`+ >?`
	`68`	`+""",re.VERBOSE)`
	`69`	`+# The following variables are not used, but are temporarily left for`
	`70`	`+# backward compatibility.`
`43`	`71`	`locatestarttagend_tolerant=re.compile(r"""`
`44`	`72`	`<[a-zA-Z][^\t\n\r\f />\x00]* # tag name`
`45`	`73`	`(?:[\s/]* # optional whitespace before attribute name`
`@@ -56,8 +84,6 @@`
`56`	`84`	`\s* # trailing whitespace`
`57`	`85`	`""",re.VERBOSE)`
`58`	`86`	`endendtag=re.compile('>')`
`59`		`-# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between`
`60`		`-# </ and the tag name, so maybe this should be fixed`
`61`	`87`	`endtagfind=re.compile(r'</\s([a-zA-Z][-.a-zA-Z0-9:_])\s*>')`
`62`	`88`
`63`	`89`	`# Character reference processing logic specific to attribute values`
`@@ -141,7 +167,8 @@ def get_starttag_text(self):`
`141`	`167`
`142`	`168`	`defset_cdata_mode(self,elem):`
`143`	`169`	`self.cdata_elem=elem.lower()`
`144`		`-self.interesting=re.compile(r'</\s%s\s>'%self.cdata_elem,re.I)`
	`170`	`+self.interesting=re.compile(r'</%s(?=[\t\n\r\f />])'%self.cdata_elem,`
	`171`	`+re.IGNORECASE\|re.ASCII)`
`145`	`172`
`146`	`173`	`defclear_cdata_mode(self):`
`147`	`174`	`self.interesting=interesting_normal`
`@@ -166,7 +193,7 @@ def goahead(self, end):`
`166`	`193`	`# & near the end and see if it's followed by a space or ;.`
`167`	`194`	`amppos=rawdata.rfind('&',max(i,n-34))`
`168`	`195`	`if (amppos>=0and`
`169`		`-notre.compile(r'[\s;]').search(rawdata,amppos)):`
	`196`	`+notre.compile(r'[\t\n\r\f;]').search(rawdata,amppos)):`
`170`	`197`	`break# wait till we get all the text`
`171`	`198`	`j=n`
`172`	`199`	`else:`
`@@ -310,7 +337,7 @@ def parse_html_declaration(self, i):`
`310`	`337`	`returnself.parse_bogus_comment(i)`
`311`	`338`
`312`	`339`	`# Internal -- parse bogus comment, return length or -1 if not terminated`
`313`		`-# seehttp://www.w3.org/TR/html5/tokenization.html#bogus-comment-state`
	`340`	`+# seehttps://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state`
`314`	`341`	`defparse_bogus_comment(self,i,report=1):`
`315`	`342`	`rawdata=self.rawdata`
`316`	`343`	`assertrawdata[i:i+2]in ('<!','</'), ('unexpected call to '`
`@@ -336,6 +363,8 @@ def parse_pi(self, i):`
`336`	`363`
`337`	`364`	`# Internal -- handle starttag, return end or -1 if not terminated`
`338`	`365`	`defparse_starttag(self,i):`
	`366`	`+# See the HTML5 specs section "13.2.5.8 Tag name state"`
	`367`	`+# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state`
`339`	`368`	`self.__starttag_text=None`
`340`	`369`	`endpos=self.check_for_whole_start_tag(i)`
`341`	`370`	`ifendpos<0:`
`@@ -381,76 +410,42 @@ def parse_starttag(self, i):`
`381`	`410`	`# or -1 if incomplete.`
`382`	`411`	`defcheck_for_whole_start_tag(self,i):`
`383`	`412`	`rawdata=self.rawdata`
`384`		`-m=locatestarttagend_tolerant.match(rawdata,i)`
`385`		`-ifm:`
`386`		`-j=m.end()`
`387`		`-next=rawdata[j:j+1]`
`388`		`-ifnext==">":`
`389`		`-returnj+1`
`390`		`-ifnext=="/":`
`391`		`-ifrawdata.startswith("/>",j):`
`392`		`-returnj+2`
`393`		`-ifrawdata.startswith("/",j):`
`394`		`-# buffer boundary`
`395`		`-return-1`
`396`		`-# else bogus input`
`397`		`-ifj>i:`
`398`		`-returnj`
`399`		`-else:`
`400`		`-returni+1`
`401`		`-ifnext=="":`
`402`		`-# end of input`
`403`		`-return-1`
`404`		`-ifnextin ("abcdefghijklmnopqrstuvwxyz=/"`
`405`		`-"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):`
`406`		`-# end of input in or before attribute value, or we have the`
`407`		`-# '/' from a '/>' ending`
`408`		`-return-1`
`409`		`-ifj>i:`
`410`		`-returnj`
`411`		`-else:`
`412`		`-returni+1`
`413`		`-raiseAssertionError("we should not get here!")`
	`413`	`+match=locatetagend.match(rawdata,i+1)`
	`414`	`+assertmatch`
	`415`	`+j=match.end()`
	`416`	`+ifrawdata[j-1]!=">":`
	`417`	`+return-1`
	`418`	`+returnj`
`414`	`419`
`415`	`420`	`# Internal -- parse endtag, return end or -1 if incomplete`
`416`	`421`	`defparse_endtag(self,i):`
	`422`	`+# See the HTML5 specs section "13.2.5.7 End tag open state"`
	`423`	`+# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state`
`417`	`424`	`rawdata=self.rawdata`
`418`	`425`	`assertrawdata[i:i+2]=="</","unexpected call to parse_endtag"`
`419`		`-match=endendtag.search(rawdata,i+1)# >`
`420`		`-ifnotmatch:`
	`426`	`+ifrawdata.find('>',i+2)<0:# fast check`
`421`	`427`	`return-1`
`422`		`-gtpos=match.end()`
`423`		`-match=endtagfind.match(rawdata,i)# </ + tag + >`
`424`		`-ifnotmatch:`
`425`		`-ifself.cdata_elemisnotNone:`
`426`		`-self.handle_data(rawdata[i:gtpos])`
`427`		`-returngtpos`
`428`		`-# find the name: w3.org/TR/html5/tokenization.html#tag-name-state`
`429`		`-namematch=tagfind_tolerant.match(rawdata,i+2)`
`430`		`-ifnotnamematch:`
`431`		`-# w3.org/TR/html5/tokenization.html#end-tag-open-state`
`432`		`-ifrawdata[i:i+3]=='</>':`
`433`		`-returni+3`
`434`		`-else:`
`435`		`-returnself.parse_bogus_comment(i)`
`436`		`-tagname=namematch.group(1).lower()`
`437`		`-# consume and ignore other stuff between the name and the >`
`438`		`-# Note: this is not 100% correct, since we might have things like`
`439`		`-# </tag attr=">">, but looking for > after the name should cover`
`440`		`-# most of the cases and is much simpler`
`441`		`-gtpos=rawdata.find('>',namematch.end())`
`442`		`-self.handle_endtag(tagname)`
`443`		`-returngtpos+1`
	`428`	`+ifnotendtagopen.match(rawdata,i):# </ + letter`
	`429`	`+ifrawdata[i+2:i+3]=='>':# </> is ignored`
	`430`	`+# "missing-end-tag-name" parser error`
	`431`	`+returni+3`
	`432`	`+else:`
	`433`	`+returnself.parse_bogus_comment(i)`
`444`	`434`
`445`		`-elem=match.group(1).lower()# script or style`
`446`		`-ifself.cdata_elemisnotNone:`
`447`		`-ifelem!=self.cdata_elem:`
`448`		`-self.handle_data(rawdata[i:gtpos])`
`449`		`-returngtpos`
	`435`	`+match=locatetagend.match(rawdata,i+2)`
	`436`	`+assertmatch`
	`437`	`+j=match.end()`
	`438`	`+ifrawdata[j-1]!=">":`
	`439`	`+return-1`
`450`	`440`
`451`		`-self.handle_endtag(elem)`
	`441`	`+# find the name: "13.2.5.8 Tag name state"`
	`442`	`+# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state`
	`443`	`+match=tagfind_tolerant.match(rawdata,i+2)`
	`444`	`+assertmatch`
	`445`	`+tag=match.group(1).lower()`
	`446`	`+self.handle_endtag(tag)`
`452`	`447`	`self.clear_cdata_mode()`
`453`		`-returngtpos`
	`448`	`+returnj`
`454`	`449`
`455`	`450`	`# Overridable -- finish processing of start+end tag: <tag.../>`
`456`	`451`	`defhandle_startendtag(self,tag,attrs):`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit0243f97

File tree

3 files changed

3 files changed

`‎Lib/html/parser.py`

0 commit comments