Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 56c7e58

Browse files
committed
Attempted optimisation of HTMLInputStream. (Reduces overall parsing time by 15-25% in some typical cases.)
--HG--extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401154
1 parent fb146a3, commit 56c7e58

File tree

1 file changed

+63
-45
lines changed

1 file changed

+63
-45
lines changed

‎src/html5lib/inputstream.py

Lines changed: 63 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,8 @@
1313

1414
invalid_unicode_re=re.compile(u"[\u0001-\u0008]|[\u000E-\u001F]|[\u007F-\u009F]|[\uD800-\uDFFF]|[\uFDD0-\uFDDF]|\uFFFE|\uFFFF|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE\U000BFFFF|\U000CFFFE|\U000CFFFF|\U000DFFFE|\U000DFFFF|\U000EFFFE|\U000EFFFF|\U000FFFFE|\U000FFFFF|\U0010FFFE|\U0010FFFF")
1515

16-
try:
17-
fromcollectionsimportdeque
18-
exceptImportError:
19-
fromutilsimportdeque
16+
# Cache for charsUntil()
17+
charsUntilRegEx= {}
2018

2119
classHTMLInputStream(object):
2220
"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -68,7 +66,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
6866
self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,
6967
'replace')
7068

71-
self.queue=deque([])
69+
self.chunk=u""
70+
self.chunkOffset=0
71+
self.ungetBuffer= []# reversed list of chars from unget()
7272
self.readChars= []
7373
self.errors= []
7474

@@ -247,21 +247,25 @@ def char(self):
247247
""" Read one character from the stream or queue if available. Return
248248
EOF when EOF is reached.
249249
"""
250-
ifnotself.queue:
251-
self.readChunk()
252-
#If we still don't have a character we have reached EOF
253-
ifnotself.queue:
254-
returnEOF
255-
256-
char=self.queue.popleft()
257-
250+
ifself.ungetBuffer:
251+
returnself.ungetBuffer.pop()
252+
253+
ifself.chunkOffset>=len(self.chunk):
254+
ifnotself.readChunk():
255+
returnEOF
256+
257+
char=self.chunk[self.chunkOffset]
258+
self.chunkOffset+=1
259+
258260
self.readChars.append(char)
259261
returnchar
260262

261263
defreadChunk(self,chunkSize=10240):
262264
data=self.dataStream.read(chunkSize)
263265
ifnotdata:
264-
return
266+
self.chunk=u""
267+
self.chunkOffset=0
268+
returnFalse
265269
#Replace null characters
266270
foriinxrange(data.count(u"\u0000")):
267271
self.errors.append("null-character")
@@ -275,53 +279,67 @@ def readChunk(self, chunkSize=10240):
275279
self._lastChunkEndsWithCR=data[-1]=="\r"
276280
data=data.replace("\r\n","\n")
277281
data=data.replace("\r","\n")
278-
282+
279283
data=unicode(data)
280-
self.queue.extend(list(data))
284+
self.chunk=data
285+
self.chunkOffset=0
281286

282287
self.updatePosition()
288+
returnTrue
283289

284290
defcharsUntil(self,characters,opposite=False):
285291
""" Returns a string of characters from the stream up to but not
286-
including any character in characters or EOF. characters can be
287-
any container that supports the in method being called on it.
292+
including any character in 'characters' or EOF. 'characters' must be
293+
a container that supports the 'in' method and iteration over its
294+
characters.
288295
"""
289296

290-
#This method is currently 40-50% of our total runtime and badly needs
291-
#optimizing
292-
#Possible improvements:
293-
# - use regexp to find characters that match the required character set
294-
# (with regexp cache since we do the same searches many many times)
295-
# - improve EOF handling for fewer if statements
296-
297-
ifnotself.queue:
298-
self.readChunk()
299-
#Break if we have reached EOF
300-
ifnotself.queueorself.queue[0]==None:
301-
returnu""
302-
303-
i=0
304-
while (self.queue[i]incharacters)==opposite:
305-
i+=1
306-
ifi==len(self.queue):
307-
self.readChunk()
308-
#If the queue doesn't grow we have reached EOF
309-
ifi==len(self.queue)orself.queue[i]isEOF:
297+
rv= []
298+
299+
# The unget buffer is typically small and rarely used, so
300+
# just check each character individually
301+
whileself.ungetBuffer:
302+
ifself.ungetBuffer[-1]==EOFor (self.ungetBuffer[-1]incharacters)!=opposite:
303+
r=u"".join(rv)
304+
self.readChars.extend(list(r))
305+
returnr
306+
else:
307+
rv.append(self.ungetBuffer.pop())
308+
309+
# Use a cache of regexps to find the required characters
310+
try:
311+
chars=charsUntilRegEx[characters]
312+
exceptKeyError:
313+
forcincharacters:assert(ord(c)<128)
314+
regex=u"".join("\\x%02x"%ord(c)forcincharacters)
315+
ifnotopposite:
316+
regex=u"^%s"%regex
317+
chars=charsUntilRegEx[characters]=re.compile(u"[%s]*"%regex)
318+
319+
whileTrue:
320+
# Find the longest matching prefix
321+
m=chars.match(self.chunk,self.chunkOffset)
322+
# If not everything matched, return everything up to the part that didn't match
323+
ifm.end()!=len(self.chunk):
324+
rv.append(self.chunk[self.chunkOffset:m.end()])
325+
self.chunkOffset=m.end()
326+
break
327+
# If the whole chunk matched, use it all and read the next chunk
328+
rv.append(self.chunk[self.chunkOffset:])
329+
ifnotself.readChunk():
330+
# Reached EOF
310331
break
311332

312-
rv= [self.queue.popleft()forcinrange(i)]
313-
314-
self.readChars.extend(rv)
315-
316-
rv=u"".join(rv)
317-
returnrv
333+
r=u"".join(rv)
334+
self.readChars.extend(list(r))
335+
returnr
318336

319337
defunget(self,chars):
320338
self.updatePosition()
321339
ifchars:
322340
l=list(chars)
323341
l.reverse()
324-
self.queue.extendleft(l)
342+
self.ungetBuffer.extend(l)
325343
#Alter the current line, col position
326344
forcinchars[::-1]:
327345
ifcisNone:

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp