Tokenizer: pretranslate lowercase element and attribute names #520


Closed
jayaddison wants to merge 7 commits into html5lib:master from jayaddison:tokenizer/pretranslate-lowercase-names
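
The branch moves ASCII lowercasing of tag, attribute, and doctype names out of emitCurrentToken and into the tokenizer states that build those names, applying str.translate with the asciiUpper2Lower table as each character (or buffered end-tag name) is consumed. A minimal sketch of that lowercasing primitive, assuming a table equivalent to html5lib's asciiUpper2Lower constant:

# Sketch only: a table assumed equivalent to html5lib.constants.asciiUpper2Lower.
# str.translate with this mapping lowercases ASCII A-Z and leaves every other
# code point untouched, which matches the "ASCII lowercase" rule the HTML
# specification applies to tag and attribute names.
import string

asciiUpper2Lower = {ord(c): ord(c.lower()) for c in string.ascii_uppercase}

assert "DIV".translate(asciiUpper2Lower) == "div"
assert "onClick".translate(asciiUpper2Lower) == "onclick"
assert "ÄÖ".translate(asciiUpper2Lower) == "ÄÖ"  # non-ASCII passes through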
93 changes: 52 additions & 41 deletions html5lib/_tokenizer.py
@@ -233,7 +233,6 @@ def emitCurrentToken(self):
         token = self.currentToken
         # Add token to the queue to be yielded
         if (token["type"] in tagTokenTypes):
-            token["name"] = token["name"].translate(asciiUpper2Lower)
             if token["type"] == tokenTypes["StartTag"]:
                 raw = token["data"]
                 data = attributeMap(raw)
@@ -380,7 +379,8 @@ def tagOpenState(self):
             self.state = self.closeTagOpenState
         elif data in asciiLetters:
             self.currentToken = {"type": tokenTypes["StartTag"],
-                                 "name": data, "data": [],
+                                 "name": data.translate(asciiUpper2Lower),
+                                 "data": [],
                                  "selfClosing": False,
                                  "selfClosingAcknowledged": False}
             self.state = self.tagNameState
@@ -410,7 +410,8 @@ def tagOpenState(self):
     def closeTagOpenState(self):
         data = self.stream.char()
         if data in asciiLetters:
-            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": data.translate(asciiUpper2Lower),
                                  "data": [], "selfClosing": False}
             self.state = self.tagNameState
         elif data == ">":
@@ -448,7 +449,7 @@ def tagNameState(self):
                                     "data": "invalid-codepoint"})
             self.currentToken["name"] += "\uFFFD"
         else:
-            self.currentToken["name"] += data
+            self.currentToken["name"] += data.translate(asciiUpper2Lower)
Member

I'm very skeptical about this being a perf win, versus it being in emitCurrentToken. What do the benchmarks say?

Yes, emitCurrentToken's lowercasing becomes redundant in the RCDATA/RAWTEXT/script cases, but I expect the cost of this will negate any gains.

Contributor (author)

That's fair, yep - especially for short element names it seems likely that the translate method call overhead (especially if called repeatedly) could negate any benefits provided by simpler comparisons.

I hadn't assessed the performance of this code path separately; it felt worth maintaining consistency but I don't believe there's a noticeable performance change.
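
If benchmark numbers are wanted, a rough way to probe that overhead is a micro-benchmark along these lines (illustrative only, not part of this PR; the tag name and iteration count are arbitrary), comparing per-character translate calls against a single translate of the finished name:

# Illustrative micro-benchmark (not from this PR): lowercasing one character
# at a time, as the tokenizer states now do, versus one translate() call on
# the finished name, as emitCurrentToken previously did.
import string
import timeit

asciiUpper2Lower = {ord(c): ord(c.lower()) for c in string.ascii_uppercase}
NAME = "ARTICLE"  # hypothetical tag name; element names are typically short

def per_char():
    name = ""
    for ch in NAME:
        name += ch.translate(asciiUpper2Lower)
    return name

def at_emit():
    name = ""
    for ch in NAME:
        name += ch
    return name.translate(asciiUpper2Lower)

for fn in (per_char, at_emit):
    print(fn.__name__, timeit.timeit(fn, number=100000))

Which side wins will depend on name length and Python version, so this mainly serves to confirm whether the difference is measurable at all.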

             # (Don't use charsUntil here, because tag names are
             # very short and it's faster to not do anything fancy)
         return True
@@ -476,26 +477,29 @@ def rcdataEndTagOpenState(self):
         return True

     def rcdataEndTagNameState(self):
-        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
         data = self.stream.char()
+        if data in asciiLetters:
+            self.temporaryBuffer += data
+            return True
+
+        name = self.temporaryBuffer.translate(asciiUpper2Lower)
+        appropriate = self.currentToken and self.currentToken["name"] == name
         if data in spaceCharacters and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.state = self.beforeAttributeNameState
         elif data == "/" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.state = self.selfClosingStartTagState
         elif data == ">" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.emitCurrentToken()
             self.state = self.dataState
-        elif data in asciiLetters:
-            self.temporaryBuffer += data
         else:
             self.tokenQueue.append({"type": tokenTypes["Characters"],
                                     "data": "</" + self.temporaryBuffer})
@@ -526,26 +530,29 @@ def rawtextEndTagOpenState(self):
         return True

     def rawtextEndTagNameState(self):
-        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
         data = self.stream.char()
+        if data in asciiLetters:
+            self.temporaryBuffer += data
+            return True
+
+        name = self.temporaryBuffer.translate(asciiUpper2Lower)
+        appropriate = self.currentToken and self.currentToken["name"] == name
         if data in spaceCharacters and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.state = self.beforeAttributeNameState
         elif data == "/" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.state = self.selfClosingStartTagState
         elif data == ">" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.emitCurrentToken()
             self.state = self.dataState
-        elif data in asciiLetters:
-            self.temporaryBuffer += data
         else:
             self.tokenQueue.append({"type": tokenTypes["Characters"],
                                     "data": "</" + self.temporaryBuffer})
@@ -579,26 +586,29 @@ def scriptDataEndTagOpenState(self):
         return True

     def scriptDataEndTagNameState(self):
-        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
         data = self.stream.char()
+        if data in asciiLetters:
+            self.temporaryBuffer += data
+            return True
+
+        name = self.temporaryBuffer.translate(asciiUpper2Lower)
+        appropriate = self.currentToken and self.currentToken["name"] == name
         if data in spaceCharacters and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.state = self.beforeAttributeNameState
         elif data == "/" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.state = self.selfClosingStartTagState
         elif data == ">" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.emitCurrentToken()
             self.state = self.dataState
-        elif data in asciiLetters:
-            self.temporaryBuffer += data
         else:
             self.tokenQueue.append({"type": tokenTypes["Characters"],
                                     "data": "</" + self.temporaryBuffer})
@@ -715,26 +725,29 @@ def scriptDataEscapedEndTagOpenState(self):
         return True

     def scriptDataEscapedEndTagNameState(self):
-        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
         data = self.stream.char()
+        if data in asciiLetters:
+            self.temporaryBuffer += data
+            return True
+
+        name = self.temporaryBuffer.translate(asciiUpper2Lower)
+        appropriate = self.currentToken and self.currentToken["name"] == name
         if data in spaceCharacters and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.state = self.beforeAttributeNameState
         elif data == "/" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.state = self.selfClosingStartTagState
         elif data == ">" and appropriate:
             self.currentToken = {"type": tokenTypes["EndTag"],
-                                 "name": self.temporaryBuffer,
+                                 "name": name,
                                  "data": [], "selfClosing": False}
             self.emitCurrentToken()
             self.state = self.dataState
-        elif data in asciiLetters:
-            self.temporaryBuffer += data
         else:
             self.tokenQueue.append({"type": tokenTypes["Characters"],
                                     "data": "</" + self.temporaryBuffer})
@@ -776,7 +789,9 @@ def scriptDataDoubleEscapedState(self):
                                     "eof-in-script-in-script"})
             self.state = self.dataState
         else:
-            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+            chars = self.stream.charsUntil(("<", "-", "\u0000"))
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+                                    data + chars})
         return True

     def scriptDataDoubleEscapedDashState(self):
@@ -859,7 +874,8 @@ def beforeAttributeNameState(self):
         if data in spaceCharacters:
             self.stream.charsUntil(spaceCharacters, True)
         elif data in asciiLetters:
-            self.currentToken["data"].append([data, ""])
+            attr_name = data.translate(asciiUpper2Lower)
+            self.currentToken["data"].append([attr_name, ""])
             self.state = self.attributeNameState
         elif data == ">":
             self.emitCurrentToken()
@@ -891,8 +907,7 @@ def attributeNameState(self):
         if data == "=":
             self.state = self.beforeAttributeValueState
         elif data in asciiLetters:
-            self.currentToken["data"][-1][0] += data +\
-                self.stream.charsUntil(asciiLetters, True)
+            self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
             leavingThisState = False
         elif data == ">":
             # XXX If we emit here the attributes are converted to a dict
@@ -919,15 +934,13 @@ def attributeNameState(self):
                                     "data": "eof-in-attribute-name"})
             self.state = self.dataState
         else:
-            self.currentToken["data"][-1][0] += data
+            self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
             leavingThisState = False

         if leavingThisState:
             # Attributes are not dropped at this stage. That happens when the
             # start tag token is emitted so values can still be safely appended
             # to attributes, but we do want to report the parse error in time.
-            self.currentToken["data"][-1][0] = (
-                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
             for name, _ in self.currentToken["data"][:-1]:
                 if self.currentToken["data"][-1][0] == name:
                     self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
@@ -947,7 +960,8 @@ def afterAttributeNameState(self):
         elif data == ">":
             self.emitCurrentToken()
         elif data in asciiLetters:
-            self.currentToken["data"].append([data, ""])
+            attr_name = data.translate(asciiUpper2Lower)
+            self.currentToken["data"].append([attr_name, ""])
             self.state = self.attributeNameState
         elif data == "/":
             self.state = self.selfClosingStartTagState
@@ -1341,17 +1355,15 @@ def beforeDoctypeNameState(self):
             self.tokenQueue.append(self.currentToken)
             self.state = self.dataState
         else:
-            self.currentToken["name"] = data
+            self.currentToken["name"] = data.translate(asciiUpper2Lower)
             self.state = self.doctypeNameState
         return True

     def doctypeNameState(self):
         data = self.stream.char()
         if data in spaceCharacters:
-            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
             self.state = self.afterDoctypeNameState
         elif data == ">":
-            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
             self.tokenQueue.append(self.currentToken)
             self.state = self.dataState
         elif data == "\u0000":
@@ -1363,11 +1375,10 @@ def doctypeNameState(self):
             self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                     "eof-in-doctype-name"})
             self.currentToken["correct"] = False
-            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
             self.tokenQueue.append(self.currentToken)
             self.state = self.dataState
         else:
-            self.currentToken["name"] += data
+            self.currentToken["name"] += data.translate(asciiUpper2Lower)
         return True

     def afterDoctypeNameState(self):
