Tokenizer: pretranslate lowercase element and attribute names #520
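This change moves the ASCII lowercasing of tag, attribute, and doctype names out of `emitCurrentToken` and into the tokenizer states that consume the characters, calling `str.translate` with the `asciiUpper2Lower` table at each site. For readers unfamiliar with the mechanism, here is a minimal sketch of how such a table behaves; the construction below is an assumption about how it is typically built, not code copied from html5lib's constants module. Unlike `str.lower()`, an ASCII-only table leaves non-ASCII characters such as the Kelvin sign untouched, which matches HTML's ASCII-only case-insensitivity.

```python
import string

# Assumed construction; html5lib ships its own asciiUpper2Lower table,
# which behaves the same way for the ASCII letters A-Z.
asciiUpper2Lower = str.maketrans(string.ascii_uppercase, string.ascii_lowercase)

print("TextArea".translate(asciiUpper2Lower))  # textarea
print("\u212a".translate(asciiUpper2Lower))    # Kelvin sign, left unchanged
print("\u212a".lower())                        # 'k' -- str.lower() folds it, HTML must not
```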
Changes from all commits: 183d8a0, 2e86373, 8f96b17, a912842, f9f370e, fa62671, df94e2d
@@ -233,7 +233,6 @@ def emitCurrentToken(self):
        token = self.currentToken
        # Add token to the queue to be yielded
        if (token["type"] in tagTokenTypes):
            if token["type"] == tokenTypes["StartTag"]:
                raw = token["data"]
                data = attributeMap(raw)
@@ -380,7 +379,8 @@ def tagOpenState(self):
            self.state = self.closeTagOpenState
        elif data in asciiLetters:
            self.currentToken = {"type": tokenTypes["StartTag"],
                                 "name": data.translate(asciiUpper2Lower),
                                 "data": [],
                                 "selfClosing": False,
                                 "selfClosingAcknowledged": False}
            self.state = self.tagNameState
@@ -410,7 +410,8 @@ def tagOpenState(self):
    def closeTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": data.translate(asciiUpper2Lower),
                                 "data": [], "selfClosing": False}
            self.state = self.tagNameState
        elif data == ">":
@@ -448,7 +449,7 @@ def tagNameState(self):
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
        else:
            self.currentToken["name"] += data.translate(asciiUpper2Lower)
Review comment: I'm very skeptical about this being a perf win, versus it being in `emitCurrentToken`.

Reply: That's fair, yep - especially for short element names. I hadn't assessed the performance of this code path separately; it felt worth maintaining consistency, but I don't believe there's a noticeable performance change.
            # (Don't use charsUntil here, because tag names are
            # very short and it's faster to not do anything fancy)
        return True
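The performance question raised in the comment above can be sanity-checked with a micro-benchmark. The sketch below is not part of the PR: the table construction is an assumption, and it only contrasts translating each consumed character with translating the finished name once, for a typically short tag name.

```python
import string
import timeit

asciiUpper2Lower = str.maketrans(string.ascii_uppercase, string.ascii_lowercase)
name = "TEXTAREA"  # tag names are usually this short

# Translate every character as it is appended (what this hunk does).
per_char = timeit.timeit(
    lambda: "".join(c.translate(asciiUpper2Lower) for c in name),
    number=100_000)

# Translate the assembled name in one call (the alternative discussed above).
once = timeit.timeit(
    lambda: name.translate(asciiUpper2Lower),
    number=100_000)

print(f"per-char: {per_char:.3f}s  once: {once:.3f}s")
```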
@@ -476,26 +477,29 @@ def rcdataEndTagOpenState(self):
        return True

    def rcdataEndTagNameState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            return True
        name = self.temporaryBuffer.translate(asciiUpper2Lower)
        appropriate = self.currentToken and self.currentToken["name"] == name
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
@@ -526,26 +530,29 @@ def rawtextEndTagOpenState(self):
        return True

    def rawtextEndTagNameState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            return True
        name = self.temporaryBuffer.translate(asciiUpper2Lower)
        appropriate = self.currentToken and self.currentToken["name"] == name
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
@@ -579,26 +586,29 @@ def scriptDataEndTagOpenState(self):
        return True

    def scriptDataEndTagNameState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            return True
        name = self.temporaryBuffer.translate(asciiUpper2Lower)
        appropriate = self.currentToken and self.currentToken["name"] == name
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
@@ -715,26 +725,29 @@ def scriptDataEscapedEndTagOpenState(self):
        return True

    def scriptDataEscapedEndTagNameState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            return True
        name = self.temporaryBuffer.translate(asciiUpper2Lower)
        appropriate = self.currentToken and self.currentToken["name"] == name
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": name,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
@@ -776,7 +789,9 @@ def scriptDataDoubleEscapedState(self):
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            chars = self.stream.charsUntil(("<", "-", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def scriptDataDoubleEscapedDashState(self):
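For context on `charsUntil` as used in this hunk: it consumes characters from the input stream until one of the given stop characters would be next and returns what it read. A rough stand-in, not the html5lib implementation:

```python
def chars_until(buffered_chars, stop_chars):
    # buffered_chars: a list used as a mutable stand-in for the input stream.
    out = []
    while buffered_chars and buffered_chars[0] not in stop_chars:
        out.append(buffered_chars.pop(0))
    return "".join(out)

stream = list("script text-more")
print(chars_until(stream, ("<", "-", "\u0000")))  # "script text"; "-" stays in the stream
```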
@@ -859,7 +874,8 @@ def beforeAttributeNameState(self):
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            attr_name = data.translate(asciiUpper2Lower)
            self.currentToken["data"].append([attr_name, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
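Observable behaviour should not change with these attribute-name hunks; element and attribute names already reached the tree lowercased, only the place where the translation happens moves. A rough usage sketch of what callers see either way (output is the expected result, not verified against this branch):

```python
import html5lib

# Uppercase tag and attribute names in the markup come out lowercased in the tree.
root = html5lib.parse("<DIV CLASS='x'>hi</DIV>", namespaceHTMLElements=False)
div = root.find(".//div")
print(div.tag, div.attrib)  # expected: div {'class': 'x'}
```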
@@ -891,8 +907,7 @@ def attributeNameState(self):
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
@@ -919,15 +934,13 @@ def attributeNameState(self):
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
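The loop above flags a duplicate attribute by comparing the just-finished attribute name against the earlier ones; since names are lowercased as they are read, `HREF` and `href` compare equal at this point. A small standalone sketch of that check (helper name hypothetical):

```python
def last_attribute_is_duplicate(attributes):
    # attributes: [name, value] pairs in source order, with names already
    # lowercased by the tokenizer states above.
    last_name = attributes[-1][0]
    return any(name == last_name for name, _ in attributes[:-1])

assert last_attribute_is_duplicate([["href", "x"], ["href", ""]])       # e.g. <a HREF=x href>
assert not last_attribute_is_duplicate([["href", "x"], ["title", ""]])
```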
@@ -947,7 +960,8 @@ def afterAttributeNameState(self):
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            attr_name = data.translate(asciiUpper2Lower)
            self.currentToken["data"].append([attr_name, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
@@ -1341,17 +1355,15 @@ def beforeDoctypeNameState(self):
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data.translate(asciiUpper2Lower)
            self.state = self.doctypeNameState
        return True

    def doctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.afterDoctypeNameState
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
@@ -1363,11 +1375,10 @@ def doctypeNameState(self):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data.translate(asciiUpper2Lower)
        return True

    def afterDoctypeNameState(self):
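Both doctype hunks lowercase the name character by character as it is consumed, so a shouty doctype still produces a Doctype token named "html". A rough usage sketch through the public parser API; the expected behaviour is unchanged by this PR, since the lowercasing previously happened at a different point:

```python
import html5lib

parser = html5lib.HTMLParser()
parser.parse("<!DOCTYPE HTML><html></html>")
# The doctype name reaches the tree builder lowercased to "html", so this
# well-formed input is expected to produce no parse errors.
print(parser.errors)  # expected: []
```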