Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 0243f97

Browse files
gh-135661: Fix parsing start and end tags in HTMLParser according to the HTML5 standard (GH-135930)
* Whitespaces no longer accepted between `</` and the tag name. E.g. `</ script>` does not end the script section.
* Vertical tabulation (`\v`) and non-ASCII whitespaces no longer recognized as whitespaces. The only whitespaces are `\t\n\r\f `.
* Null character (U+0000) no longer ends the tag name.
* Attributes and slashes after the tag name in end tags are now ignored, instead of terminating after the first `>` in quoted attribute value. E.g. `</script/foo=">"/>`.
* Multiple slashes and whitespaces between the last attribute and closing `>` are now ignored in both start and end tags. E.g. `<a foo=bar/ //>`.
* Multiple `=` between attribute name and value are no longer collapsed. E.g. `<a foo==bar>` produces attribute "foo" with value "=bar".
* Whitespaces between the `=` separator and attribute name or value are no longer ignored. E.g. `<a foo =bar>` produces two attributes "foo" and "=bar", both with value None; `<a foo= bar>` produces two attributes: "foo" with value "" and "bar" with value None.
* Fix Sphinx errors.
* Apply suggestions from code review

Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>

* Address review comments.
* Move to Security.

---------

Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
1 parent 938a5d7 commit 0243f97

File tree

3 files changed

+194
-129
lines changed

3 files changed

+194
-129
lines changed

‎Lib/html/parser.py

Lines changed: 69 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -31,15 +31,43 @@
3131
piclose=re.compile('>')
3232
commentclose=re.compile(r'--\s*>')
3333
# Note:
34-
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
35-
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
34+
# 1) if you change tagfind/attrfind remember to update locatetagend too;
35+
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
3636
# explode, so don't do it.
37-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
38-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
39-
tagfind_tolerant=re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
40-
attrfind_tolerant=re.compile(
41-
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
42-
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
37+
# see the HTML5 specs section "13.2.5.6 Tag open state",
38+
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
39+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
40+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
41+
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
42+
tagfind_tolerant=re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
43+
attrfind_tolerant=re.compile(r"""
44+
(
45+
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
46+
)
47+
(= # value indicator
48+
('[^']*' # LITA-enclosed value
49+
|"[^"]*" # LIT-enclosed value
50+
|(?!['"])[^>\t\n\r\f ]* # bare value
51+
)
52+
)?
53+
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
54+
""",re.VERBOSE)
55+
locatetagend=re.compile(r"""
56+
[a-zA-Z][^\t\n\r\f />]* # tag name
57+
[\t\n\r\f /]* # optional whitespace before attribute name
58+
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
59+
(?:= # value indicator
60+
(?:'[^']*' # LITA-enclosed value
61+
|"[^"]*" # LIT-enclosed value
62+
|(?!['"])[^>\t\n\r\f ]* # bare value
63+
)
64+
)?
65+
[\t\n\r\f /]* # possibly followed by a space
66+
)*
67+
>?
68+
""",re.VERBOSE)
69+
# The following variables are not used, but are temporarily left for
70+
# backward compatibility.
4371
locatestarttagend_tolerant=re.compile(r"""
4472
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4573
(?:[\s/]* # optional whitespace before attribute name
@@ -56,8 +84,6 @@
5684
\s* # trailing whitespace
5785
""",re.VERBOSE)
5886
endendtag=re.compile('>')
59-
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
60-
# </ and the tag name, so maybe this should be fixed
6187
endtagfind=re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
6288

6389
# Character reference processing logic specific to attribute values
@@ -141,7 +167,8 @@ def get_starttag_text(self):
141167

142168
defset_cdata_mode(self,elem):
143169
self.cdata_elem=elem.lower()
144-
self.interesting=re.compile(r'</\s*%s\s*>'%self.cdata_elem,re.I)
170+
self.interesting=re.compile(r'</%s(?=[\t\n\r\f />])'%self.cdata_elem,
171+
re.IGNORECASE|re.ASCII)
145172

146173
defclear_cdata_mode(self):
147174
self.interesting=interesting_normal
@@ -166,7 +193,7 @@ def goahead(self, end):
166193
# & near the end and see if it's followed by a space or ;.
167194
amppos=rawdata.rfind('&',max(i,n-34))
168195
if (amppos>=0and
169-
notre.compile(r'[\s;]').search(rawdata,amppos)):
196+
notre.compile(r'[\t\n\r\f;]').search(rawdata,amppos)):
170197
break# wait till we get all the text
171198
j=n
172199
else:
@@ -310,7 +337,7 @@ def parse_html_declaration(self, i):
310337
returnself.parse_bogus_comment(i)
311338

312339
# Internal -- parse bogus comment, return length or -1 if not terminated
313-
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
340+
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
314341
defparse_bogus_comment(self,i,report=1):
315342
rawdata=self.rawdata
316343
assertrawdata[i:i+2]in ('<!','</'), ('unexpected call to '
@@ -336,6 +363,8 @@ def parse_pi(self, i):
336363

337364
# Internal -- handle starttag, return end or -1 if not terminated
338365
defparse_starttag(self,i):
366+
# See the HTML5 specs section "13.2.5.8 Tag name state"
367+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
339368
self.__starttag_text=None
340369
endpos=self.check_for_whole_start_tag(i)
341370
ifendpos<0:
@@ -381,76 +410,42 @@ def parse_starttag(self, i):
381410
# or -1 if incomplete.
382411
defcheck_for_whole_start_tag(self,i):
383412
rawdata=self.rawdata
384-
m=locatestarttagend_tolerant.match(rawdata,i)
385-
ifm:
386-
j=m.end()
387-
next=rawdata[j:j+1]
388-
ifnext==">":
389-
returnj+1
390-
ifnext=="/":
391-
ifrawdata.startswith("/>",j):
392-
returnj+2
393-
ifrawdata.startswith("/",j):
394-
# buffer boundary
395-
return-1
396-
# else bogus input
397-
ifj>i:
398-
returnj
399-
else:
400-
returni+1
401-
ifnext=="":
402-
# end of input
403-
return-1
404-
ifnextin ("abcdefghijklmnopqrstuvwxyz=/"
405-
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
406-
# end of input in or before attribute value, or we have the
407-
# '/' from a '/>' ending
408-
return-1
409-
ifj>i:
410-
returnj
411-
else:
412-
returni+1
413-
raiseAssertionError("we should not get here!")
413+
match=locatetagend.match(rawdata,i+1)
414+
assertmatch
415+
j=match.end()
416+
ifrawdata[j-1]!=">":
417+
return-1
418+
returnj
414419

415420
# Internal -- parse endtag, return end or -1 if incomplete
416421
defparse_endtag(self,i):
422+
# See the HTML5 specs section "13.2.5.7 End tag open state"
423+
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
417424
rawdata=self.rawdata
418425
assertrawdata[i:i+2]=="</","unexpected call to parse_endtag"
419-
match=endendtag.search(rawdata,i+1)# >
420-
ifnotmatch:
426+
ifrawdata.find('>',i+2)<0:# fast check
421427
return-1
422-
gtpos=match.end()
423-
match=endtagfind.match(rawdata,i)# </ + tag + >
424-
ifnotmatch:
425-
ifself.cdata_elemisnotNone:
426-
self.handle_data(rawdata[i:gtpos])
427-
returngtpos
428-
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
429-
namematch=tagfind_tolerant.match(rawdata,i+2)
430-
ifnotnamematch:
431-
# w3.org/TR/html5/tokenization.html#end-tag-open-state
432-
ifrawdata[i:i+3]=='</>':
433-
returni+3
434-
else:
435-
returnself.parse_bogus_comment(i)
436-
tagname=namematch.group(1).lower()
437-
# consume and ignore other stuff between the name and the >
438-
# Note: this is not 100% correct, since we might have things like
439-
# </tag attr=">">, but looking for > after the name should cover
440-
# most of the cases and is much simpler
441-
gtpos=rawdata.find('>',namematch.end())
442-
self.handle_endtag(tagname)
443-
returngtpos+1
428+
ifnotendtagopen.match(rawdata,i):# </ + letter
429+
ifrawdata[i+2:i+3]=='>':# </> is ignored
430+
# "missing-end-tag-name" parser error
431+
returni+3
432+
else:
433+
returnself.parse_bogus_comment(i)
444434

445-
elem=match.group(1).lower()# script or style
446-
ifself.cdata_elemisnotNone:
447-
ifelem!=self.cdata_elem:
448-
self.handle_data(rawdata[i:gtpos])
449-
returngtpos
435+
match=locatetagend.match(rawdata,i+2)
436+
assertmatch
437+
j=match.end()
438+
ifrawdata[j-1]!=">":
439+
return-1
450440

451-
self.handle_endtag(elem)
441+
# find the name: "13.2.5.8 Tag name state"
442+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
443+
match=tagfind_tolerant.match(rawdata,i+2)
444+
assertmatch
445+
tag=match.group(1).lower()
446+
self.handle_endtag(tag)
452447
self.clear_cdata_mode()
453-
returngtpos
448+
returnj
454449

455450
# Overridable -- finish processing of start+end tag: <tag.../>
456451
defhandle_startendtag(self,tag,attrs):

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp