- Notifications
You must be signed in to change notification settings - Fork 294
Compile html5lib with Cython #524
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
gsnedders wants to merge 22 commits into html5lib:master (Choose a base branch) from gsnedders:cythonzied
base:master
Could not load branches
Branch not found:{{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline, and old review comments may become outdated.
Uh oh!
There was an error while loading. Please reload this page.
Draft
Changes from all commits
Commits
Show all changes
22 commits. Select commit: hold shift + click to select a range
c52e731
Get rid of getPhases
gsnedders8cff6aa
fixup! Get rid of getPhases
gsnedders6eb4d2d
Move tests
gsneddersd2474af
Make InputStream.readChunk default an int
gsnedders0904df3
Remove last trace of Tokenizer.lastFourChars
gsnedders8ebff2e
Move Tokenizer.state to Tokenizer._state
gsnedders4a8e28a
Instead of comparing with a set of ints, use maths
gsnedders2ae13cc
Remove unused Tokenizer.escape/escapeFlag
gsneddersc22d069
Avoid needless setter write, mutate value directly
gsnedders81b3aaf
Reduce list/tuple access
gsnedders47df02b
Move lowercasing to _ascii module
gsnedders7d7a079
Always initialize Parser.tokenizer
gsnedders1acb5dd
Remove long unused Parser.lastPhase/Parser.beforeRCDataPhase
gsneddersb6a6484
Speed-up Parser.mainLoop a bit
gsnedders4822712
Get rid of more frozenset calls around constants
gsneddersf06451e
Add assert for leavingThisState
gsnedders9e9ff5f
Avoid recursion in etree.testSerializer
gsnedders2036738
Get rid of remaining non-decorator property()
gsnedders2c8e0ec
Call super().f() rather than Base.f(self)
gsnedders84cbc20
Move _getEtreeTag out of the class
gsnedders8b89668
Change attributes to be created as dicts from day one
gsnedderse65c433
Start of Cythonizing the tokenizer
gsneddersFile filter
Filter by extension
Conversations
Failed to load comments.
Loading
Uh oh!
There was an error while loading. Please reload this page.
Jump to
Jump to file
Failed to load files.
Loading
Uh oh!
There was an error while loading. Please reload this page.
Diff view
Diff view
There are no files selected for viewing
2 changes: 1 addition & 1 deletion: .gitmodules
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
[submodule "testdata"] | ||
path = tests/testdata | ||
url = https://github.com/html5lib/html5lib-tests.git |
2,642 changes: 1,321 additions & 1,321 deletions: .pytest.expect
Large diffs are not rendered by default.
Oops, something went wrong.
Uh oh!
There was an error while loading. Please reload this page.
1 change: 1 addition & 0 deletions: MANIFEST.in
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.Learn more about bidirectional Unicode characters
10 changes: 5 additions & 5 deletions: benchmarks/bench_html.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion: benchmarks/bench_wpt.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -4,7 +4,7 @@ | ||
import pyperf | ||
#sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")] | ||
import html5lib # noqa: E402 | ||
5 changes: 5 additions & 0 deletions: html5lib/_ascii.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .constants import asciiUpper2Lower | ||
# ascii_lower(s): return s.translate(asciiUpper2Lower), where the table is | ||
# imported from .constants. Presumably this lowercases only ASCII letters and | ||
# leaves every other character unchanged - confirm against constants. | ||
def ascii_lower(s): | ||
return s.translate(asciiUpper2Lower) |
109 changes: 109 additions & 0 deletions: html5lib/_inputstream.pxd
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
# cython: language_level=3 | ||
cimport cython | ||
from cpython cimport array | ||
# rCEf: C function-pointer type taking (HTMLUnicodeInputStream, unicode) and | ||
# returning void. `except *` makes the caller check for a raised exception | ||
# after every call, since a void return cannot signal an error by value. | ||
# It is the declared type of HTMLUnicodeInputStream.reportCharacterErrors. | ||
ctypedef void (*rCEf)(HTMLUnicodeInputStream, unicode) except * | ||
# Module-level variable typed as dict; presumably the cache used by | ||
# charsUntil - confirm in the implementation module. | ||
cdef dict charsUntilCache | ||
# Declarations for the BufferedStream extension type: three attributes typed | ||
# as generic Python objects, a cpdef tell/seek/read interface, and three | ||
# cdef-only helper methods. | ||
cdef class BufferedStream(object): | ||
cdef object stream | ||
cdef object buffer | ||
cdef object position | ||
# cpdef: callable from Python code as well as directly from Cython | ||
cpdef object tell(self) | ||
cpdef object seek(self, object pos) | ||
cpdef object read(self, object bytes) | ||
# cdef: C-level methods, not callable from Python code | ||
cdef object _bufferedBytes(self) | ||
cdef object _readStream(self, object bytes) | ||
cdef object _readFromBuffer(self, object bytes) | ||
#def HTMLInputStream(source, object **kwargs) | ||
# Declarations for the HTMLUnicodeInputStream extension type: typed | ||
# attributes plus C-level signatures for its methods. | ||
cdef class HTMLUnicodeInputStream(object): | ||
# C function pointer of type rCEf. characterErrorsUCS4 and | ||
# characterErrorsUCS2 below have a matching signature; presumably one of | ||
# them is assigned here - confirm in the implementation module. | ||
cdef rCEf reportCharacterErrors | ||
cdef object newLines | ||
# readonly: readable from Python code but not assignable from it | ||
cdef readonly object charEncoding | ||
cdef object dataStream | ||
# chunk is typed unicode; chunkSize and chunkOffset are C Py_ssize_t values | ||
cdef unicode chunk | ||
cdef Py_ssize_t chunkSize | ||
cdef Py_ssize_t chunkOffset | ||
cdef readonly list errors | ||
# number of (complete) lines in previous chunks | ||
cdef Py_ssize_t prevNumLines | ||
# number of columns in the last line of the previous chunk | ||
cdef Py_ssize_t prevNumCols | ||
# Deal with CR LF and surrogates split over chunk boundaries | ||
cdef unicode _bufferedCharacter | ||
cdef object reset(self) | ||
cdef object openStream(self, object source) | ||
# cython.locals assigns C types to local variables of the method that follows | ||
@cython.locals(nLines=Py_ssize_t, lastLinePos=Py_ssize_t) | ||
cdef tuple _position(self, Py_ssize_t offset) | ||
cpdef tuple position(self) | ||
@cython.locals(chunkOffset=Py_ssize_t, char=unicode) | ||
cpdef unicode char(self) | ||
# chunkSize=? marks an optional argument whose default is given in the | ||
# implementation. `except? -1`: a return value of -1 makes the caller check | ||
# whether an exception was raised. | ||
@cython.locals(data=unicode) | ||
cdef bint readChunk(self, Py_ssize_t chunkSize=?) except? -1 | ||
# NOTE(review): `ulong` is not declared anywhere in the visible part of this | ||
# file - confirm it resolves to a C type (cf. `unsigned long`). | ||
@cython.locals(c=ulong) | ||
cdef void characterErrorsUCS4(self, unicode data) except * | ||
cdef void characterErrorsUCS2(self, unicode data) except * | ||
# opposite=? : optional argument, default given in the implementation | ||
cpdef object charsUntil(self, object characters, bint opposite=?) | ||
cpdef object unget(self, object char) | ||
# Declarations for HTMLBinaryInputStream, an extension type that subclasses | ||
# HTMLUnicodeInputStream. | ||
cdef class HTMLBinaryInputStream(HTMLUnicodeInputStream): | ||
cdef object rawStream | ||
# readonly: readable from Python code but not assignable from it | ||
cdef readonly object numBytesMeta | ||
cdef readonly object numBytesChardet | ||
# the five *_encoding attributes are all typed as generic Python objects | ||
cdef object override_encoding | ||
cdef object transport_encoding | ||
cdef object same_origin_parent_encoding | ||
cdef object likely_encoding | ||
cdef object default_encoding | ||
# reset and openStream are declared again with the same signatures as in | ||
# HTMLUnicodeInputStream | ||
cdef object reset(self) | ||
cdef object openStream(self, object source) | ||
# chardet=? : optional argument, default given in the implementation | ||
cdef object determineEncoding(self, object chardet=?) | ||
# cpdef: callable from Python code as well as directly from Cython | ||
cpdef object changeEncoding(self, object newEncoding) | ||
# the local variable `string` is typed bytes inside detectBOM | ||
@cython.locals(string=bytes) | ||
cdef object detectBOM(self) | ||
cdef object detectEncodingMeta(self) | ||
# cdef class EncodingBytes(bytes): | ||
# cdef object previous(self) | ||
# cdef object setPosition(self, object position) | ||
# cdef object getPosition(self) | ||
# cdef object getCurrentByte(self) | ||
# cdef object skip(self, object chars=?) | ||
# cdef object skipUntil(self, object chars) | ||
# cdef object matchBytes(self, object bytes) | ||
# cdef object jumpTo(self, object bytes) | ||
# encstate: C function-pointer type taking an EncodingParser and returning | ||
# bint. `except? -1`: a return value of -1 makes the caller check whether an | ||
# exception was raised. | ||
ctypedef bint (*encstate)(EncodingParser) except? -1 | ||
# Declarations for the EncodingParser extension type. | ||
cdef class EncodingParser(object): | ||
cdef object data | ||
cdef object encoding | ||
# inside getEncoding, `func` is an encstate function pointer and | ||
# `keepParsing` a C boolean | ||
@cython.locals(func=encstate, keepParsing=bint) | ||
cdef object getEncoding(self) | ||
# The handle* methods taking only self match the encstate signature | ||
# (bint return, except? -1); presumably they are what `func` points to - | ||
# confirm in the implementation module. | ||
cdef bint handleComment(self) except? -1 | ||
@cython.locals(hasPragma=bint, name=bytes, value=bytes, tentativeEncoding=bytes) | ||
cdef bint handleMeta(self) except? -1 | ||
cdef bint handlePossibleStartTag(self) except? -1 | ||
cdef bint handlePossibleEndTag(self) except? -1 | ||
# takes an extra bint argument, so it does not match the encstate signature | ||
cdef bint handlePossibleTag(self, bint endTag) except? -1 | ||
cdef bint handleOther(self) except? -1 | ||
# the local variable `c` is typed bytes inside getAttribute | ||
@cython.locals(c=bytes) | ||
cdef tuple getAttribute(self) | ||
# Declarations for the ContentAttrParser extension type: one attribute typed | ||
# as a generic Python object and a single parse method. | ||
cdef class ContentAttrParser(object): | ||
cdef object data | ||
cpdef object parse(self) # this needs to be cpdef for tests | ||
# Module-level cdef function: takes and returns a generic Python object and | ||
# is not callable from Python code. | ||
cdef object lookupEncoding(object encoding) |
Oops, something went wrong.
Uh oh!
There was an error while loading. Please reload this page.
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.