1313
1414invalid_unicode_re = re .compile (u"[\u0001 -\u0008 ]|[\u000E -\u001F ]|[\u007F -\u009F ]|[\uD800 -\uDFFF ]|[\uFDD0 -\uFDDF ]|\uFFFE |\uFFFF |\U0001FFFE |\U0001FFFF |\U0002FFFE |\U0002FFFF |\U0003FFFE |\U0003FFFF |\U0004FFFE |\U0004FFFF |\U0005FFFE |\U0005FFFF |\U0006FFFE |\U0006FFFF |\U0007FFFE |\U0007FFFF |\U0008FFFE |\U0008FFFF |\U0009FFFE |\U0009FFFF |\U000AFFFE |\U000AFFFF |\U000BFFFE \U000BFFFF |\U000CFFFE |\U000CFFFF |\U000DFFFE |\U000DFFFF |\U000EFFFE |\U000EFFFF |\U000FFFFE |\U000FFFFF |\U0010FFFE |\U0010FFFF " )
1515
16- try :
17- from collections import deque
18- except ImportError :
19- from utils import deque
16+ # Cache for charsUntil()
17+ charsUntilRegEx = {}
2018
2119class HTMLInputStream (object ):
2220"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -68,7 +66,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
6866self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
6967'replace' )
7068
71- self .queue = deque ([])
69+ self .chunk = u""
70+ self .chunkOffset = 0
71+ self .ungetBuffer = []# reversed list of chars from unget()
7272self .readChars = []
7373self .errors = []
7474
@@ -247,21 +247,25 @@ def char(self):
247247""" Read one character from the stream or queue if available. Return
248248 EOF when EOF is reached.
249249 """
250- if not self .queue :
251- self .readChunk ()
252- #If we still don't have a character we have reached EOF
253- if not self .queue :
254- return EOF
255-
256- char = self .queue .popleft ()
257-
250+ if self .ungetBuffer :
251+ return self .ungetBuffer .pop ()
252+
253+ if self .chunkOffset >= len (self .chunk ):
254+ if not self .readChunk ():
255+ return EOF
256+
257+ char = self .chunk [self .chunkOffset ]
258+ self .chunkOffset += 1
259+
258260self .readChars .append (char )
259261return char
260262
261263def readChunk (self ,chunkSize = 10240 ):
262264data = self .dataStream .read (chunkSize )
263265if not data :
264- return
266+ self .chunk = u""
267+ self .chunkOffset = 0
268+ return False
265269#Replace null characters
266270for i in xrange (data .count (u"\u0000 " )):
267271self .errors .append ("null-character" )
@@ -275,53 +279,67 @@ def readChunk(self, chunkSize=10240):
275279self ._lastChunkEndsWithCR = data [- 1 ]== "\r "
276280data = data .replace ("\r \n " ,"\n " )
277281data = data .replace ("\r " ,"\n " )
278-
282+
279283data = unicode (data )
280- self .queue .extend (list (data ))
284+ self .chunk = data
285+ self .chunkOffset = 0
281286
282287self .updatePosition ()
288+ return True
283289
284290def charsUntil (self ,characters ,opposite = False ):
285291""" Returns a string of characters from the stream up to but not
286- including any character in characters or EOF. characters can be
287- any container that supports the in method being called on it.
292+ including any character in 'characters' or EOF. 'characters' must be
293+ a container that supports the 'in' method and iteration over its
294+ characters.
288295 """
289296
290- #This method is currently 40-50% of our total runtime and badly needs
291- #optimizing
292- #Possible improvements:
293- # - use regexp to find characters that match the required character set
294- # (with regexp cache since we do the same searches many many times)
295- # - improve EOF handling for fewer if statements
296-
297- if not self .queue :
298- self .readChunk ()
299- #Break if we have reached EOF
300- if not self .queue or self .queue [0 ]== None :
301- return u""
302-
303- i = 0
304- while (self .queue [i ]in characters )== opposite :
305- i += 1
306- if i == len (self .queue ):
307- self .readChunk ()
308- #If the queue doesn't grow we have reached EOF
309- if i == len (self .queue )or self .queue [i ]is EOF :
297+ rv = []
298+
299+ # The unget buffer is typically small and rarely used, so
300+ # just check each character individually
301+ while self .ungetBuffer :
302+ if self .ungetBuffer [- 1 ]== EOF or (self .ungetBuffer [- 1 ]in characters )!= opposite :
303+ r = u"" .join (rv )
304+ self .readChars .extend (list (r ))
305+ return r
306+ else :
307+ rv .append (self .ungetBuffer .pop ())
308+
309+ # Use a cache of regexps to find the required characters
310+ try :
311+ chars = charsUntilRegEx [characters ]
312+ except KeyError :
313+ for c in characters :assert (ord (c )< 128 )
314+ regex = u"" .join ("\\ x%02x" % ord (c )for c in characters )
315+ if not opposite :
316+ regex = u"^%s" % regex
317+ chars = charsUntilRegEx [characters ]= re .compile (u"[%s]*" % regex )
318+
319+ while True :
320+ # Find the longest matching prefix
321+ m = chars .match (self .chunk ,self .chunkOffset )
322+ # If not everything matched, return everything up to the part that didn't match
323+ if m .end ()!= len (self .chunk ):
324+ rv .append (self .chunk [self .chunkOffset :m .end ()])
325+ self .chunkOffset = m .end ()
326+ break
327+ # If the whole chunk matched, use it all and read the next chunk
328+ rv .append (self .chunk [self .chunkOffset :])
329+ if not self .readChunk ():
330+ # Reached EOF
310331break
311332
312- rv = [self .queue .popleft ()for c in range (i )]
313-
314- self .readChars .extend (rv )
315-
316- rv = u"" .join (rv )
317- return rv
333+ r = u"" .join (rv )
334+ self .readChars .extend (list (r ))
335+ return r
318336
319337def unget (self ,chars ):
320338self .updatePosition ()
321339if chars :
322340l = list (chars )
323341l .reverse ()
324- self .queue . extendleft (l )
342+ self .ungetBuffer . extend (l )
325343#Alter the current line, col position
326344for c in chars [::- 1 ]:
327345if c is None :