Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

gh-118350: Add escapable-raw-text mode to html parser #121770

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
timonviola wants to merge 10 commits into python:main from timonviola:fix-issue-118350
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
10 commits
Select commit. Hold shift + click to select a range.
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
PrevPrevious commit
NextNext commit
update to latest main
  • Loading branch information
@timonviola
timonviola committed May 14, 2025
commit a36070a641ff20f2a795476492ff0970dc9c2103
27 changes: 3 additions & 24 deletions — Lib/html/parser.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -28,7 +28,6 @@

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
escapable_raw_text_close = re.compile('</(title|textarea)>', re.I)
commentclose = re.compile(r'--\s*>')
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
Expand DownExpand Up@@ -101,7 +100,6 @@ class HTMLParser(_markupbase.ParserBase):
"""

CDATA_CONTENT_ELEMENTS = ("script", "style")
ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea")

def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
Expand All@@ -119,7 +117,6 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self.escapable_raw_text_elem = None
super().reset()

def feed(self, data):
Expand All@@ -141,14 +138,6 @@ def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text

def set_escapable_raw_text_mode(self, elem):
self.escapable_raw_text_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.escapable_raw_text_elem, re.I)

def clear_escapable_raw_text_mode(self):
self.interesting = interesting_normal
self.escapable_raw_text_elem = None

def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
Expand All@@ -165,7 +154,7 @@ def goahead(self, end):
i = 0
n = len(rawdata)
while i < n:
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
if self.convert_charrefs and not self.cdata_elem:
j = rawdata.find('<', i)
if j < 0:
# if we can't find the next <, either we are at the end
Expand All@@ -184,13 +173,11 @@ def goahead(self, end):
if match:
j = match.start()
else:
if self.escapable_raw_text_elem:
break
if self.cdata_elem:
break
j = n
if i < j:
if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
Expand DownExpand Up@@ -367,8 +354,6 @@ def parse_starttag(self, i):
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS:
self.set_escapable_raw_text_mode(tag)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
Expand DownExpand Up@@ -444,14 +429,8 @@ def parse_endtag(self, i):
self.handle_data(rawdata[i:gtpos])
return gtpos

if self.escapable_raw_text_elem is not None: # title or textarea
if elem != self.escapable_raw_text_elem:
self.handle_data(rawdata[i:gtpos])
return gtpos

self.handle_endtag(elem)
self.clear_cdata_mode()
self.clear_escapable_raw_text_mode()
return gtpos

# Overridable -- finish processing of start+end tag: <tag.../>
Expand DownExpand Up@@ -492,4 +471,4 @@ def handle_pi(self, data):
pass

def unknown_decl(self, data):
pass
pass
44 changes: 4 additions & 40 deletions — Lib/test/test_htmlparser.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -285,9 +285,7 @@ def test_cdata_content(self):
#'foo = </\nscript>',
#'foo = </ script>',
]
tags = ['script', 'style', 'textarea', 'title']
# test the following 'casing' for each tag: script, SCRIPT, Script etc.
elements = [f(tag) for tag in tags for f in (str.lower, str.upper, str.capitalize)]
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
for content in contents:
for element in elements:
element_lower = element.lower()
Expand DownExpand Up@@ -319,34 +317,6 @@ def get_events(self):
("endtag", element_lower)],
collector=Collector(convert_charrefs=False))

def test_escapable_raw_text_content(self):
contents = [
'foo = "</TITLE" + ">";',
'foo = <\n/title> ',
'<!-- document.write("</scr" + "ipt>"); -->',
'\n//<![CDATA[\n'
'\n<!-- //\nvar foo = 3.14;\n// -->\n',
# valid character reference
'&#65;',
# ambiguous ampersand example
'&notaref',
'foo = "</sty" + "le>";',
'<!-- \u2603 -->',
# these two should be invalid according to the HTML 5 spec,
# section 8.1.2.2
#'foo = </\nscript>',
#'foo = </ script>',
]
elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea']
for content in contents:
for element in elements:
element_lower = element.lower()
s = '<{element}>{content}</{element}>'.format(element=element,
content=content)
self._run_check(s, [("starttag", element_lower, []),
("data", content),
("endtag", element_lower)])

def test_EOF_in_cdata(self):
content = """<!-- not a comment --> &not-an-entity-ref;
<a href="" /> </p><p> <span></span></style>
Expand DownExpand Up@@ -407,15 +377,9 @@ def test_convert_charrefs(self):
('starttag', 'script', []), ('data', text),
('endtag', 'script'), ('data', '"'),
('starttag', 'style', []), ('data', text),
('endtag', 'style'), ('data', '"'),
('starttag', 'title', []), ('data', text),
('endtag', 'title'), ('data', '"'),
('starttag', 'textarea', []), ('data', text),
('endtag', 'textarea'), ('data', '"')]
('endtag', 'style'), ('data', '"')]
self._run_check('{1}<script>{0}</script>{1}'
'<style>{0}</style>{1}'
'<title>{0}</title>{1}'
'<textarea>{0}</textarea>{1}'.format(text, charref),
'<style>{0}</style>{1}'.format(text, charref),
expected, collector=collector())
# check truncated charrefs at the end of the file
html = '&quo &# &#x'
Expand DownExpand Up@@ -922,4 +886,4 @@ def test_base_class_methods_called(self, super_reset_method, super_init_method):


if __name__ == "__main__":
unittest.main()
unittest.main()

[8]ページ先頭

©2009-2025 Movatter.jp