Commit ce43212

committed

Rejiggered the tokeniser so it only ever unconsumes a single character. Simplified the line/column position counters. (Saves about 5% parsing time.)

--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401241

1 parent b7c7de7, commit ce43212 (Copy full SHA for ce43212)

File tree

3 files changed

+268

-192

lines changed

src/html5lib
- inputstream.py
- tokenizer.py
tests
- test_stream.py

3 files changed

+268

-192

lines changed

`‎src/html5lib/inputstream.py‎`

Lines changed: 85 additions & 70 deletions

Original file line number	Diff line number	Diff line change
`@@ -72,11 +72,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`72`	`72`	`self.chunk=u""`
`73`	`73`	`self.chunkSize=0`
`74`	`74`	`self.chunkOffset=0`
`75`		`-self.ungetBuffer= []# reversed list of chars from unget()`
`76`		`-self.readChars= []`
`77`	`75`	`self.errors= []`
`78`		`-`
`79`		`-self.lineLengths= []`
	`76`	`+# Single-character buffer to handle 'unget'`
	`77`	`+self.ungetChar=u""# use u"" to mean 'no character' (because None means EOF)`
	`78`	`+`
	`79`	`+# Remember the current position in the document`
	`80`	`+self.positionLine=1`
	`81`	`+self.positionCol=0`
	`82`	`+# Remember the length of the last line, so unget("\n") can restore`
	`83`	`+# positionCol. (Only one character can be ungot at once, so we only`
	`84`	`+# need to remember the single last line.)`
	`85`	`+self.lastLineLength=None`
`80`	`86`
`81`	`87`	`#Flag to indicate we may have a CR LF broken across a data chunk`
`82`	`88`	`self._lastChunkEndsWithCR=False`
`@@ -219,51 +225,59 @@ def detectEncodingMeta(self):`
`219`	`225`	`encoding=parser.getEncoding()`
`220`	`226`	`returnencoding`
`221`	`227`
`222`		`-defupdatePosition(self):`
`223`		`-#Remove EOF from readChars, if present`
`224`		`-ifnotself.readChars:`
`225`		`-return`
`226`		`-ifself.readCharsandself.readChars[-1]==EOF:`
`227`		`-#There may be more than one EOF in readChars so we cannot assume`
`228`		`-#readChars.index(EOF) == -1`
`229`		`-self.readChars=self.readChars[:self.readChars.index(EOF)]`
`230`		`-readChars="".join(self.readChars)`
`231`		`-lines=readChars.split("\n")`
`232`		`-ifself.lineLengths:`
`233`		`-self.lineLengths[-1]+=len(lines[0])`
	`228`	`+defupdatePosition(self,chars):`
	`229`	`+# Update the position attributes to correspond to some sequence of`
	`230`	`+# read characters`
	`231`	`+`
	`232`	`+# Find the last newline character`
	`233`	`+idx=chars.rfind(u"\n")`
	`234`	`+ifidx==-1:`
	`235`	`+# No newlines in chars`
	`236`	`+self.positionCol+=len(chars)`
`234`	`237`	`else:`
`235`		`-self.lineLengths.append(len(lines[0]))`
`236`		`-forlineinlines[1:]:`
`237`		`-self.lineLengths.append(len(line))`
`238`		`-self.readChars= []`
`239`		`-#print self.lineLengths`
	`238`	`+# Find the last-but-one newline character`
	`239`	`+idx2=chars.rfind(u"\n",0,idx)`
	`240`	`+ifidx2==-1:`
	`241`	`+# Only one newline in chars`
	`242`	`+self.positionLine+=1`
	`243`	`+self.lastLineLength=self.positionCol+idx`
	`244`	`+self.positionCol=len(chars)- (idx+1)`
	`245`	`+else:`
	`246`	`+# At least two newlines in chars`
	`247`	`+newlines=chars.count(u"\n")`
	`248`	`+self.positionLine+=newlines`
	`249`	`+self.lastLineLength=idx- (idx2+1)`
	`250`	`+self.positionCol=len(chars)- (idx+1)`
`240`	`251`
`241`	`252`	`defposition(self):`
`242`	`253`	`"""Returns (line, col) of the current position in the stream."""`
`243`		`-self.updatePosition()`
`244`		`-ifself.lineLengths:`
`245`		`-line,col=len(self.lineLengths),self.lineLengths[-1]`
`246`		`-else:`
`247`		`-line,col=1,0`
`248`		`-return (line,col)`
	`254`	`+return (self.positionLine,self.positionCol)`
`249`	`255`
`250`	`256`	`defchar(self):`
`251`	`257`	`""" Read one character from the stream or queue if available. Return`
`252`	`258`	`EOF when EOF is reached.`
`253`	`259`	`"""`
`254`		`-ifself.ungetBuffer:`
`255`		`-char=self.ungetBuffer.pop()`
`256`		`-self.readChars.append(char)`
`257`		`-returnchar`
`258`		`-`
`259`		`-ifself.chunkOffset>=self.chunkSize:`
`260`		`-ifnotself.readChunk():`
`261`		`-returnEOF`
`262`		`-`
`263`		`-char=self.chunk[self.chunkOffset]`
`264`		`-self.chunkOffset+=1`
	`260`	`+char=self.ungetChar`
	`261`	`+ifchar!=u"":`
	`262`	`+# Use the ungot character, and reset the buffer`
	`263`	`+self.ungetChar=u""`
	`264`	`+else:`
	`265`	`+# Read a new chunk from the input stream if necessary`
	`266`	`+ifself.chunkOffset>=self.chunkSize:`
	`267`	`+ifnotself.readChunk():`
	`268`	`+returnEOF`
	`269`	`+`
	`270`	`+char=self.chunk[self.chunkOffset]`
	`271`	`+self.chunkOffset+=1`
	`272`	`+`
	`273`	`+# Update the position attributes`
	`274`	`+ifchar==u"\n":`
	`275`	`+self.lastLineLength=self.positionCol`
	`276`	`+self.positionCol=0`
	`277`	`+self.positionLine+=1`
	`278`	`+elifcharisnotEOF:`
	`279`	`+self.positionCol+=1`
`265`	`280`
`266`		`-self.readChars.append(char)`
`267`	`281`	`returnchar`
`268`	`282`
`269`	`283`	`defreadChunk(self,chunkSize=_defaultChunkSize):`
`@@ -282,20 +296,18 @@ def readChunk(self, chunkSize=_defaultChunkSize):`
`282`	`296`
`283`	`297`	`data=data.replace(u"\u0000",u"\ufffd")`
`284`	`298`	`#Check for CR LF broken across chunks`
`285`		`-if (self._lastChunkEndsWithCRanddata[0]=="\n"):`
	`299`	`+if (self._lastChunkEndsWithCRanddata[0]==u"\n"):`
`286`	`300`	`data=data[1:]`
`287`	`301`	`# Stop if the chunk is now empty`
`288`	`302`	`ifnotdata:`
`289`	`303`	`returnFalse`
`290`		`-self._lastChunkEndsWithCR=data[-1]=="\r"`
`291`		`-data=data.replace("\r\n","\n")`
`292`		`-data=data.replace("\r","\n")`
	`304`	`+self._lastChunkEndsWithCR=data[-1]==u"\r"`
	`305`	`+data=data.replace(u"\r\n",u"\n")`
	`306`	`+data=data.replace(u"\r",u"\n")`
`293`	`307`
`294`		`-data=unicode(data)`
`295`	`308`	`self.chunk=data`
`296`	`309`	`self.chunkSize=len(data)`
`297`	`310`
`298`		`-self.updatePosition()`
`299`	`311`	`returnTrue`
`300`	`312`
`301`	`313`	`defcharsUntil(self,characters,opposite=False):`
`@@ -307,22 +319,22 @@ def charsUntil(self, characters, opposite = False):`
`307`	`319`
`308`	`320`	`rv= []`
`309`	`321`
`310`		`-# The unget buffer is typically small and rarely used, so`
`311`		`-# just check each character individually`
`312`		`-whileself.ungetBuffer:`
`313`		`-ifself.ungetBuffer[-1]==EOFor (self.ungetBuffer[-1]incharacters)!=opposite:`
`314`		`-r=u"".join(rv)`
`315`		`-self.readChars.extend(list(r))`
`316`		`-returnr`
	`322`	`+# Check the ungot character, if any.`
	`323`	`+# (Since it's only a single character, don't use the regex here)`
	`324`	`+char=self.ungetChar`
	`325`	`+ifchar!=u"":`
	`326`	`+ifcharisEOFor (charincharacters)!=opposite:`
	`327`	`+returnu""`
`317`	`328`	`else:`
`318`		`-rv.append(self.ungetBuffer.pop())`
	`329`	`+rv.append(char)`
	`330`	`+self.ungetChar=u""`
`319`	`331`
`320`	`332`	`# Use a cache of regexps to find the required characters`
`321`	`333`	`try:`
`322`	`334`	`chars=charsUntilRegEx[(characters,opposite)]`
`323`	`335`	`exceptKeyError:`
`324`	`336`	`forcincharacters:assert(ord(c)<128)`
`325`		`-regex=u"".join(["\\x%02x"%ord(c)forcincharacters])`
	`337`	`+regex=u"".join([u"\\x%02x"%ord(c)forcincharacters])`
`326`	`338`	`ifnotopposite:`
`327`	`339`	`regex=u"^%s"%regex`
`328`	`340`	`chars=charsUntilRegEx[(characters,opposite)]=re.compile(u"[%s]*"%regex)`
`@@ -343,24 +355,27 @@ def charsUntil(self, characters, opposite = False):`
`343`	`355`	`break`
`344`	`356`
`345`	`357`	`r=u"".join(rv)`
`346`		`-self.readChars.extend(list(r))`
	`358`	`+self.updatePosition(r)`
`347`	`359`	`returnr`
`348`	`360`
`349`		`-defunget(self,chars):`
`350`		`-self.updatePosition()`
`351`		`-ifchars:`
`352`		`-l=list(chars)`
`353`		`-l.reverse()`
`354`		`-self.ungetBuffer.extend(l)`
`355`		`-#Alter the current line, col position`
`356`		`-forcinchars[::-1]:`
`357`		`-ifcisNone:`
`358`		`-continue`
`359`		`-elifc=='\n':`
`360`		`-assertself.lineLengths[-1]==0`
`361`		`-self.lineLengths.pop()`
`362`		`-else:`
`363`		`-self.lineLengths[-1]-=1`
	`361`	`+defunget(self,char):`
	`362`	`+# Only one character is allowed to be ungotten at once - it must`
	`363`	`+# be consumed again before any further call to unget`
	`364`	`+assertself.ungetChar==u""`
	`365`	`+`
	`366`	`+self.ungetChar=char`
	`367`	`+`
	`368`	`+# Update the position attributes`
	`369`	`+ifcharisNone:`
	`370`	`+pass`
	`371`	`+elifchar==u"\n":`
	`372`	`+assertself.positionLine>=1`
	`373`	`+assertself.lastLineLengthisnotNone`
	`374`	`+self.positionLine-=1`
	`375`	`+self.positionCol=self.lastLineLength`
	`376`	`+self.lastLineLength=None`
	`377`	`+else:`
	`378`	`+self.positionCol-=1`
`364`	`379`
`365`	`380`	`classEncodingBytes(str):`
`366`	`381`	`"""String-like object with an assosiated position and various extra methods`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit ce43212

File tree

3 files changed

3 files changed

`‎src/html5lib/inputstream.py‎`

0 commit comments