Commit 56c7e58

committed

Attempted optimisation of HTMLInputStream. (Reduces overall parsing time by 15-25% in some typical cases.)

--HG--extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401154

1 parent fb146a3, commit 56c7e58 (Copy full SHA for 56c7e58)

File tree

1 file changed

+63

-45

lines changed

src/html5lib
- inputstream.py

1 file changed

+63

-45

lines changed

`‎src/html5lib/inputstream.py`

Lines changed: 63 additions & 45 deletions

Original file line number	Diff line number	Diff line change
`@@ -13,10 +13,8 @@`
`13`	`13`
`14`	`14`	`invalid_unicode_re=re.compile(u"[\u0001-\u0008]\|[\u000E-\u001F]\|[\u007F-\u009F]\|[\uD800-\uDFFF]\|[\uFDD0-\uFDDF]\|\uFFFE\|\uFFFF\|\U0001FFFE\|\U0001FFFF\|\U0002FFFE\|\U0002FFFF\|\U0003FFFE\|\U0003FFFF\|\U0004FFFE\|\U0004FFFF\|\U0005FFFE\|\U0005FFFF\|\U0006FFFE\|\U0006FFFF\|\U0007FFFE\|\U0007FFFF\|\U0008FFFE\|\U0008FFFF\|\U0009FFFE\|\U0009FFFF\|\U000AFFFE\|\U000AFFFF\|\U000BFFFE\U000BFFFF\|\U000CFFFE\|\U000CFFFF\|\U000DFFFE\|\U000DFFFF\|\U000EFFFE\|\U000EFFFF\|\U000FFFFE\|\U000FFFFF\|\U0010FFFE\|\U0010FFFF")`
`15`	`15`
`16`		`-try:`
`17`		`-fromcollectionsimportdeque`
`18`		`-exceptImportError:`
`19`		`-fromutilsimportdeque`
	`16`	`+# Cache for charsUntil()`
	`17`	`+charsUntilRegEx= {}`
`20`	`18`
`21`	`19`	`classHTMLInputStream(object):`
`22`	`20`	`"""Provides a unicode stream of characters to the HTMLTokenizer.`
`@@ -68,7 +66,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`68`	`66`	`self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,`
`69`	`67`	`'replace')`
`70`	`68`
`71`		`-self.queue=deque([])`
	`69`	`+self.chunk=u""`
	`70`	`+self.chunkOffset=0`
	`71`	`+self.ungetBuffer= []# reversed list of chars from unget()`
`72`	`72`	`self.readChars= []`
`73`	`73`	`self.errors= []`
`74`	`74`
`@@ -247,21 +247,25 @@ def char(self):`
`247`	`247`	`""" Read one character from the stream or queue if available. Return`
`248`	`248`	`EOF when EOF is reached.`
`249`	`249`	`"""`
`250`		`-ifnotself.queue:`
`251`		`-self.readChunk()`
`252`		`-#If we still don't have a character we have reached EOF`
`253`		`-ifnotself.queue:`
`254`		`-returnEOF`
`255`		`-`
`256`		`-char=self.queue.popleft()`
`257`		`-`
	`250`	`+ifself.ungetBuffer:`
	`251`	`+returnself.ungetBuffer.pop()`
	`252`	`+`
	`253`	`+ifself.chunkOffset>=len(self.chunk):`
	`254`	`+ifnotself.readChunk():`
	`255`	`+returnEOF`
	`256`	`+`
	`257`	`+char=self.chunk[self.chunkOffset]`
	`258`	`+self.chunkOffset+=1`
	`259`	`+`
`258`	`260`	`self.readChars.append(char)`
`259`	`261`	`returnchar`
`260`	`262`
`261`	`263`	`defreadChunk(self,chunkSize=10240):`
`262`	`264`	`data=self.dataStream.read(chunkSize)`
`263`	`265`	`ifnotdata:`
`264`		`-return`
	`266`	`+self.chunk=u""`
	`267`	`+self.chunkOffset=0`
	`268`	`+returnFalse`
`265`	`269`	`#Replace null characters`
`266`	`270`	`foriinxrange(data.count(u"\u0000")):`
`267`	`271`	`self.errors.append("null-character")`
`@@ -275,53 +279,67 @@ def readChunk(self, chunkSize=10240):`
`275`	`279`	`self._lastChunkEndsWithCR=data[-1]=="\r"`
`276`	`280`	`data=data.replace("\r\n","\n")`
`277`	`281`	`data=data.replace("\r","\n")`
`278`		`-`
	`282`	`+`
`279`	`283`	`data=unicode(data)`
`280`		`-self.queue.extend(list(data))`
	`284`	`+self.chunk=data`
	`285`	`+self.chunkOffset=0`
`281`	`286`
`282`	`287`	`self.updatePosition()`
	`288`	`+returnTrue`
`283`	`289`
`284`	`290`	`defcharsUntil(self,characters,opposite=False):`
`285`	`291`	`""" Returns a string of characters from the stream up to but not`
`286`		`- including any character in characters or EOF. characters can be`
`287`		`- any container that supports the in method being called on it.`
	`292`	`+ including any character in 'characters' or EOF. 'characters' must be`
	`293`	`+ a container that supports the 'in' method and iteration over its`
	`294`	`+ characters.`
`288`	`295`	`"""`
`289`	`296`
`290`		`-#This method is currently 40-50% of our total runtime and badly needs`
`291`		`-#optimizing`
`292`		`-#Possible improvements:`
`293`		`-# - use regexp to find characters that match the required character set`
`294`		`-# (with regexp cache since we do the same searches many many times)`
`295`		`-# - improve EOF handling for fewer if statements`
`296`		`-`
`297`		`-ifnotself.queue:`
`298`		`-self.readChunk()`
`299`		`-#Break if we have reached EOF`
`300`		`-ifnotself.queueorself.queue[0]==None:`
`301`		`-returnu""`
`302`		`-`
`303`		`-i=0`
`304`		`-while (self.queue[i]incharacters)==opposite:`
`305`		`-i+=1`
`306`		`-ifi==len(self.queue):`
`307`		`-self.readChunk()`
`308`		`-#If the queue doesn't grow we have reached EOF`
`309`		`-ifi==len(self.queue)orself.queue[i]isEOF:`
	`297`	`+rv= []`
	`298`	`+`
	`299`	`+# The unget buffer is typically small and rarely used, so`
	`300`	`+# just check each character individually`
	`301`	`+whileself.ungetBuffer:`
	`302`	`+ifself.ungetBuffer[-1]==EOFor (self.ungetBuffer[-1]incharacters)!=opposite:`
	`303`	`+r=u"".join(rv)`
	`304`	`+self.readChars.extend(list(r))`
	`305`	`+returnr`
	`306`	`+else:`
	`307`	`+rv.append(self.ungetBuffer.pop())`
	`308`	`+`
	`309`	`+# Use a cache of regexps to find the required characters`
	`310`	`+try:`
	`311`	`+chars=charsUntilRegEx[characters]`
	`312`	`+exceptKeyError:`
	`313`	`+forcincharacters:assert(ord(c)<128)`
	`314`	`+regex=u"".join("\\x%02x"%ord(c)forcincharacters)`
	`315`	`+ifnotopposite:`
	`316`	`+regex=u"^%s"%regex`
	`317`	`+chars=charsUntilRegEx[characters]=re.compile(u"[%s]*"%regex)`
	`318`	`+`
	`319`	`+whileTrue:`
	`320`	`+# Find the longest matching prefix`
	`321`	`+m=chars.match(self.chunk,self.chunkOffset)`
	`322`	`+# If not everything matched, return everything up to the part that didn't match`
	`323`	`+ifm.end()!=len(self.chunk):`
	`324`	`+rv.append(self.chunk[self.chunkOffset:m.end()])`
	`325`	`+self.chunkOffset=m.end()`
	`326`	`+break`
	`327`	`+# If the whole chunk matched, use it all and read the next chunk`
	`328`	`+rv.append(self.chunk[self.chunkOffset:])`
	`329`	`+ifnotself.readChunk():`
	`330`	`+# Reached EOF`
`310`	`331`	`break`
`311`	`332`
`312`		`-rv= [self.queue.popleft()forcinrange(i)]`
`313`		`-`
`314`		`-self.readChars.extend(rv)`
`315`		`-`
`316`		`-rv=u"".join(rv)`
`317`		`-returnrv`
	`333`	`+r=u"".join(rv)`
	`334`	`+self.readChars.extend(list(r))`
	`335`	`+returnr`
`318`	`336`
`319`	`337`	`defunget(self,chars):`
`320`	`338`	`self.updatePosition()`
`321`	`339`	`ifchars:`
`322`	`340`	`l=list(chars)`
`323`	`341`	`l.reverse()`
`324`		`-self.queue.extendleft(l)`
	`342`	`+self.ungetBuffer.extend(l)`
`325`	`343`	`#Alter the current line, col position`
`326`	`344`	`forcinchars[::-1]:`
`327`	`345`	`ifcisNone:`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 56c7e58

File tree

1 file changed

1 file changed

`‎src/html5lib/inputstream.py`

0 commit comments