Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit ce43212

Browse files
committed
Rejiggered the tokeniser so it only ever unconsumes a single character. Simplified the line/column position counters. (Saves about 5% parsing time.)
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401241
1 parent b7c7de7 commit ce43212

File tree

3 files changed

+268
-192
lines changed

3 files changed

+268
-192
lines changed

‎src/html5lib/inputstream.py

Lines changed: 85 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -72,11 +72,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
7272
self.chunk=u""
7373
self.chunkSize=0
7474
self.chunkOffset=0
75-
self.ungetBuffer= []# reversed list of chars from unget()
76-
self.readChars= []
7775
self.errors= []
78-
79-
self.lineLengths= []
76+
# Single-character buffer to handle 'unget'
77+
self.ungetChar=u""# use u"" to mean 'no character' (because None means EOF)
78+
79+
# Remember the current position in the document
80+
self.positionLine=1
81+
self.positionCol=0
82+
# Remember the length of the last line, so unget("\n") can restore
83+
# positionCol. (Only one character can be ungot at once, so we only
84+
# need to remember the single last line.)
85+
self.lastLineLength=None
8086

8187
#Flag to indicate we may have a CR LF broken across a data chunk
8288
self._lastChunkEndsWithCR=False
@@ -219,51 +225,59 @@ def detectEncodingMeta(self):
219225
encoding=parser.getEncoding()
220226
returnencoding
221227

222-
defupdatePosition(self):
223-
#Remove EOF from readChars, if present
224-
ifnotself.readChars:
225-
return
226-
ifself.readCharsandself.readChars[-1]==EOF:
227-
#There may be more than one EOF in readChars so we cannot assume
228-
#readChars.index(EOF) == -1
229-
self.readChars=self.readChars[:self.readChars.index(EOF)]
230-
readChars="".join(self.readChars)
231-
lines=readChars.split("\n")
232-
ifself.lineLengths:
233-
self.lineLengths[-1]+=len(lines[0])
228+
defupdatePosition(self,chars):
229+
# Update the position attributes to correspond to some sequence of
230+
# read characters
231+
232+
# Find the last newline character
233+
idx=chars.rfind(u"\n")
234+
ifidx==-1:
235+
# No newlines in chars
236+
self.positionCol+=len(chars)
234237
else:
235-
self.lineLengths.append(len(lines[0]))
236-
forlineinlines[1:]:
237-
self.lineLengths.append(len(line))
238-
self.readChars= []
239-
#print self.lineLengths
238+
# Find the last-but-one newline character
239+
idx2=chars.rfind(u"\n",0,idx)
240+
ifidx2==-1:
241+
# Only one newline in chars
242+
self.positionLine+=1
243+
self.lastLineLength=self.positionCol+idx
244+
self.positionCol=len(chars)- (idx+1)
245+
else:
246+
# At least two newlines in chars
247+
newlines=chars.count(u"\n")
248+
self.positionLine+=newlines
249+
self.lastLineLength=idx- (idx2+1)
250+
self.positionCol=len(chars)- (idx+1)
240251

241252
defposition(self):
242253
"""Returns (line, col) of the current position in the stream."""
243-
self.updatePosition()
244-
ifself.lineLengths:
245-
line,col=len(self.lineLengths),self.lineLengths[-1]
246-
else:
247-
line,col=1,0
248-
return (line,col)
254+
return (self.positionLine,self.positionCol)
249255

250256
defchar(self):
251257
""" Read one character from the stream or queue if available. Return
252258
EOF when EOF is reached.
253259
"""
254-
ifself.ungetBuffer:
255-
char=self.ungetBuffer.pop()
256-
self.readChars.append(char)
257-
returnchar
258-
259-
ifself.chunkOffset>=self.chunkSize:
260-
ifnotself.readChunk():
261-
returnEOF
262-
263-
char=self.chunk[self.chunkOffset]
264-
self.chunkOffset+=1
260+
char=self.ungetChar
261+
ifchar!=u"":
262+
# Use the ungot character, and reset the buffer
263+
self.ungetChar=u""
264+
else:
265+
# Read a new chunk from the input stream if necessary
266+
ifself.chunkOffset>=self.chunkSize:
267+
ifnotself.readChunk():
268+
returnEOF
269+
270+
char=self.chunk[self.chunkOffset]
271+
self.chunkOffset+=1
272+
273+
# Update the position attributes
274+
ifchar==u"\n":
275+
self.lastLineLength=self.positionCol
276+
self.positionCol=0
277+
self.positionLine+=1
278+
elifcharisnotEOF:
279+
self.positionCol+=1
265280

266-
self.readChars.append(char)
267281
returnchar
268282

269283
defreadChunk(self,chunkSize=_defaultChunkSize):
@@ -282,20 +296,18 @@ def readChunk(self, chunkSize=_defaultChunkSize):
282296

283297
data=data.replace(u"\u0000",u"\ufffd")
284298
#Check for CR LF broken across chunks
285-
if (self._lastChunkEndsWithCRanddata[0]=="\n"):
299+
if (self._lastChunkEndsWithCRanddata[0]==u"\n"):
286300
data=data[1:]
287301
# Stop if the chunk is now empty
288302
ifnotdata:
289303
returnFalse
290-
self._lastChunkEndsWithCR=data[-1]=="\r"
291-
data=data.replace("\r\n","\n")
292-
data=data.replace("\r","\n")
304+
self._lastChunkEndsWithCR=data[-1]==u"\r"
305+
data=data.replace(u"\r\n",u"\n")
306+
data=data.replace(u"\r",u"\n")
293307

294-
data=unicode(data)
295308
self.chunk=data
296309
self.chunkSize=len(data)
297310

298-
self.updatePosition()
299311
returnTrue
300312

301313
defcharsUntil(self,characters,opposite=False):
@@ -307,22 +319,22 @@ def charsUntil(self, characters, opposite = False):
307319

308320
rv= []
309321

310-
# The unget buffer is typically small and rarely used, so
311-
# just check each character individually
312-
whileself.ungetBuffer:
313-
ifself.ungetBuffer[-1]==EOFor (self.ungetBuffer[-1]incharacters)!=opposite:
314-
r=u"".join(rv)
315-
self.readChars.extend(list(r))
316-
returnr
322+
# Check the ungot character, if any.
323+
# (Since it's only a single character, don't use the regex here)
324+
char=self.ungetChar
325+
ifchar!=u"":
326+
ifcharisEOFor (charincharacters)!=opposite:
327+
returnu""
317328
else:
318-
rv.append(self.ungetBuffer.pop())
329+
rv.append(char)
330+
self.ungetChar=u""
319331

320332
# Use a cache of regexps to find the required characters
321333
try:
322334
chars=charsUntilRegEx[(characters,opposite)]
323335
exceptKeyError:
324336
forcincharacters:assert(ord(c)<128)
325-
regex=u"".join(["\\x%02x"%ord(c)forcincharacters])
337+
regex=u"".join([u"\\x%02x"%ord(c)forcincharacters])
326338
ifnotopposite:
327339
regex=u"^%s"%regex
328340
chars=charsUntilRegEx[(characters,opposite)]=re.compile(u"[%s]*"%regex)
@@ -343,24 +355,27 @@ def charsUntil(self, characters, opposite = False):
343355
break
344356

345357
r=u"".join(rv)
346-
self.readChars.extend(list(r))
358+
self.updatePosition(r)
347359
returnr
348360

349-
defunget(self,chars):
350-
self.updatePosition()
351-
ifchars:
352-
l=list(chars)
353-
l.reverse()
354-
self.ungetBuffer.extend(l)
355-
#Alter the current line, col position
356-
forcinchars[::-1]:
357-
ifcisNone:
358-
continue
359-
elifc=='\n':
360-
assertself.lineLengths[-1]==0
361-
self.lineLengths.pop()
362-
else:
363-
self.lineLengths[-1]-=1
361+
defunget(self,char):
362+
# Only one character is allowed to be ungotten at once - it must
363+
# be consumed again before any further call to unget
364+
assertself.ungetChar==u""
365+
366+
self.ungetChar=char
367+
368+
# Update the position attributes
369+
ifcharisNone:
370+
pass
371+
elifchar==u"\n":
372+
assertself.positionLine>=1
373+
assertself.lastLineLengthisnotNone
374+
self.positionLine-=1
375+
self.positionCol=self.lastLineLength
376+
self.lastLineLength=None
377+
else:
378+
self.positionCol-=1
364379

365380
classEncodingBytes(str):
366381
"""String-like object with an assosiated position and various extra methods

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp