Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Tokenizer: use Python objects to represent tokens#521

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
jayaddison wants to merge 19 commits into html5lib:master from jayaddison:tokenizer/object-tokens
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit — Hold shift + click to select a range
183d8a0
Consistency: consume a single character at a time during attribute na…
jayaddisonDec 29, 2020
2e86373
Refactor: pretranslate lowercase element and attribute names
jayaddisonDec 29, 2020
8f96b17
Restore self.currentToken safety check
jayaddisonDec 29, 2020
a912842
Alternate approach: do not pretranslate temporary buffered data
jayaddisonDec 30, 2020
f9f370e
Consistency: character consumption within double-escaped state
jayaddisonDec 30, 2020
bcee8bd
Refactor: use Python objects for tokens within tokenizer
jayaddisonDec 29, 2020
67262f8
Introduce type hierarchy for tag-related tokens
jayaddisonDec 29, 2020
900bdaf
Simplify tag token construction
jayaddisonDec 29, 2020
1f6cae9
Refactor token attribution name/value accumulation
jayaddisonDec 29, 2020
695ac1c
Cleanup: remove leavingThisState / emitToken logic
jayaddisonDec 29, 2020
b1a444b
Remove EmptyTag tokenizer token class
jayaddisonDec 29, 2020
bb7fabc
Refactor: pre-translate strings that are only used in lowercase context
jayaddisonDec 29, 2020
5f4ace9
Cleanup: remove getattr anti-pattern
jayaddisonDec 29, 2020
d744c86
Consistency: use camel-casing to correspond with existing codebase style
jayaddisonDec 29, 2020
1d62e69
Consistency: consume a single character at a time during attribute na…
jayaddisonDec 29, 2020
8772408
Merge branch 'tokenizer/pretranslate-lowercase-names' into tokenizer/…
jayaddisonDec 30, 2020
192cce0
Linting cleanup
jayaddisonDec 30, 2020
e76e0dd
Clarify method name: clearAttribute -> flushAttribute
jayaddisonJan 4, 2021
da37332
Merge branch 'master' into tokenizer/object-tokens
jayaddisonSep 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
881 changes: 372 additions & 509 deletions html5lib/_tokenizer.py
View file
Open in desktop

Large diffs are not rendered by default.

14 changes: 0 additions & 14 deletions html5lib/constants.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -2918,20 +2918,6 @@
0x9F: "\u0178",
}

tokenTypes = {
"Doctype": 0,
"Characters": 1,
"SpaceCharacters": 2,
"StartTag": 3,
"EndTag": 4,
"EmptyTag": 5,
"Comment": 6,
"ParseError": 7
}

tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]])


prefixes = {v: k for k, v in namespaces.items()}
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
Expand Down
404 changes: 197 additions & 207 deletions html5lib/html5parser.py
View file
Open in desktop

Large diffs are not rendered by default.

17 changes: 8 additions & 9 deletions html5lib/tests/test_tokenizer2.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -4,13 +4,12 @@

from six import unichr, text_type

from html5lib._tokenizer import HTMLTokenizer
from html5lib.constants import tokenTypes
from html5lib._tokenizer import HTMLTokenizer, ParseError, StartTag


def ignore_parse_errors(toks):
for tok in toks:
if tok['type'] != tokenTypes['ParseError']:
ifnot isinstance(tok,ParseError):
yield tok


Expand All@@ -23,9 +22,9 @@ def test_maintain_attribute_order():
out = list(ignore_parse_errors(toks))

assert len(out) == 1
assert out[0]['type'] == tokenTypes['StartTag']
assertisinstance(out[0],StartTag)

attrs_tok = out[0]['data']
attrs_tok = out[0].attributes
assert len(attrs_tok) == len(attrs)

for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
Expand All@@ -40,9 +39,9 @@ def test_duplicate_attribute():
out = list(ignore_parse_errors(toks))

assert len(out) == 1
assert out[0]['type'] == tokenTypes['StartTag']
assertisinstance(out[0],StartTag)

attrs_tok = out[0]['data']
attrs_tok = out[0].attributes
assert len(attrs_tok) == 1
assert list(attrs_tok.items()) == [('a', '1')]

Expand All@@ -56,9 +55,9 @@ def test_maintain_duplicate_attribute_order():
out = list(ignore_parse_errors(toks))

assert len(out) == 1
assert out[0]['type'] == tokenTypes['StartTag']
assertisinstance(out[0],StartTag)

attrs_tok = out[0]['data']
attrs_tok = out[0].attributes
assert len(attrs_tok) == len(attrs)

for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
Expand Down
31 changes: 14 additions & 17 deletions html5lib/tests/tokenizer.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -8,7 +8,7 @@
import pytest
from six import unichr

from html5lib._tokenizer import HTMLTokenizer
from html5lib._tokenizer import HTMLTokenizer, StartTag
from html5lib import constants, _utils


Expand All@@ -25,47 +25,44 @@ def parse(self, stream, encoding=None, innerHTML=False):

tokenizer.state = getattr(tokenizer, self._state)
if self._lastStartTag is not None:
tokenizer.currentToken = {"type": "startTag",
"name": self._lastStartTag}
tokenizer.currentToken = StartTag(name=self._lastStartTag)

types = {v: k for k, v in constants.tokenTypes.items()}
for token in tokenizer:
getattr(self, 'process%s' %types[token["type"]])(token)
getattr(self, 'process%s' % token.__class__.__name__)(token)

return self.outputTokens

def processDoctype(self, token):
self.outputTokens.append(["DOCTYPE", token["name"], token["publicId"],
token["systemId"], token["correct"]])
self.outputTokens.append(["DOCTYPE", token.name, token.publicId,
token.systemId, token.correct])

def processStartTag(self, token):
self.outputTokens.append(["StartTag", token["name"],
token["data"], token["selfClosing"]])
self.outputTokens.append(["StartTag", token.name,
token.attributes, token.self_closing])

def processEmptyTag(self, token):
if token["name"] not in constants.voidElements:
if token.name not in constants.voidElements:
self.outputTokens.append("ParseError")
self.outputTokens.append(["StartTag", token["name"], dict(token["data"][::-1])])
self.outputTokens.append(["StartTag", token.name,token.attributes])

def processEndTag(self, token):
self.outputTokens.append(["EndTag", token["name"],
token["selfClosing"]])
self.outputTokens.append(["EndTag", token.name, token.self_closing])

def processComment(self, token):
self.outputTokens.append(["Comment", token["data"]])
self.outputTokens.append(["Comment", token.data])

def processSpaceCharacters(self, token):
self.outputTokens.append(["Character", token["data"]])
self.outputTokens.append(["Character", token.data])
self.processSpaceCharacters = self.processCharacters

def processCharacters(self, token):
self.outputTokens.append(["Character", token["data"]])
self.outputTokens.append(["Character", token.data])

def processEOF(self, token):
pass

def processParseError(self, token):
self.outputTokens.append(["ParseError", token["data"]])
self.outputTokens.append(["ParseError", token.data])


def concatenateCharacterTokens(tokens):
Expand Down
28 changes: 14 additions & 14 deletions html5lib/treebuilders/base.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -2,6 +2,7 @@
from six import text_type

from ..constants import scopingElements, tableInsertModeElements, namespaces
from .._tokenizer import StartTag

# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
Expand DownExpand Up@@ -249,10 +250,9 @@ def reconstructActiveFormattingElements(self):
clone = entry.cloneNode() # Mainly to get a new copy of the attributes

# Step 9
element = self.insertElement({"type": "StartTag",
"name": clone.name,
"namespace": clone.namespace,
"data": clone.attributes})
tag = StartTag(name=clone.name, data=clone.attributes)
tag.namespace = clone.namespace
element = self.insertElement(tag)

# Step 10
self.activeFormattingElements[i] = element
Expand DownExpand Up@@ -286,24 +286,24 @@ def insertRoot(self, token):
self.document.appendChild(element)

def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
name = token.name
publicId = token.publicId
systemId = token.systemId

doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype)

def insertComment(self, token, parent=None):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(self.commentClass(token["data"]))
parent.appendChild(self.commentClass(token.data))

def createElement(self, token):
"""Create an element but don't insert it anywhere"""
name = token["name"]
namespace = token.get("namespace",self.defaultNamespace)
name = token.name
namespace = token.namespace orself.defaultNamespace
element = self.elementClass(name, namespace)
element.attributes = token["data"]
element.attributes = token.attributes
return element

def _getInsertFromTable(self):
Expand All@@ -321,11 +321,11 @@ def _setInsertFromTable(self, value):
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)

def insertElementNormal(self, token):
name = token["name"]
name = token.name
assert isinstance(name, text_type), "Element %s not unicode" % name
namespace = token.get("namespace",self.defaultNamespace)
namespace = token.namespace orself.defaultNamespace
element = self.elementClass(name, namespace)
element.attributes = token["data"]
element.attributes = token.attributes
self.openElements[-1].appendChild(element)
self.openElements.append(element)
return element
Expand Down
6 changes: 3 additions & 3 deletions html5lib/treebuilders/dom.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -126,9 +126,9 @@ def documentClass(self):
return weakref.proxy(self)

def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
name = token.name
publicId = token.publicId
systemId = token.systemId

domimpl = Dom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
Expand Down
14 changes: 7 additions & 7 deletions html5lib/treebuilders/etree_lxml.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -308,9 +308,9 @@ def getFragment(self):
return fragment

def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
name = token.name
publicId = token.publicId
systemId = token.systemId

if not name:
warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
Expand DownExpand Up@@ -359,23 +359,23 @@ def insertRoot(self, token):
else:
docStr += "''"
docStr += ">"
if self.doctype.name != token["name"]:
if self.doctype.name != token.name:
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
root = etree.fromstring(docStr)

# Append the initial comments:
for comment_token in self.initial_comments:
comment = self.commentClass(comment_token["data"])
comment = self.commentClass(comment_token.data)
root.addprevious(comment._element)

# Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()

# Give the root element the right name
name = token["name"]
namespace = token.get("namespace",self.defaultNamespace)
name = token.name
namespace = token.namespace orself.defaultNamespace
if namespace is None:
etree_tag = name
else:
Expand Down

[8]ページ先頭

©2009-2025 Movatter.jp