@@ -73,8 +73,6 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
7373self .chunkSize = 0
7474self .chunkOffset = 0
7575self .errors = []
76- # Single-character buffer to handle 'unget'
77- self .ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
7876
7977# Remember the current position in the document
8078self .positionLine = 1
@@ -257,18 +255,13 @@ def char(self):
257255""" Read one character from the stream or queue if available. Return
258256 EOF when EOF is reached.
259257 """
260- char = self .ungetChar
261- if char != u"" :
262- # Use the ungot character, and reset the buffer
263- self .ungetChar = u""
264- else :
265- # Read a new chunk from the input stream if necessary
266- if self .chunkOffset >= self .chunkSize :
267- if not self .readChunk ():
268- return EOF
258+ # Read a new chunk from the input stream if necessary
259+ if self .chunkOffset >= self .chunkSize :
260+ if not self .readChunk ():
261+ return EOF
269262
270- char = self .chunk [self .chunkOffset ]
271- self .chunkOffset += 1
263+ char = self .chunk [self .chunkOffset ]
264+ self .chunkOffset += 1
272265
273266# Update the position attributes
274267if char == u"\n " :
@@ -317,18 +310,6 @@ def charsUntil(self, characters, opposite = False):
317310 characters.
318311 """
319312
320- rv = []
321-
322- # Check the ungot character, if any.
323- # (Since it's only a single character, don't use the regex here)
324- char = self .ungetChar
325- if char != u"" :
326- if char is EOF or (char in characters )!= opposite :
327- return u""
328- else :
329- rv .append (char )
330- self .ungetChar = u""
331-
332313# Use a cache of regexps to find the required characters
333314try :
334315chars = charsUntilRegEx [(characters ,opposite )]
@@ -339,6 +320,8 @@ def charsUntil(self, characters, opposite = False):
339320regex = u"^%s" % regex
340321chars = charsUntilRegEx [(characters ,opposite )]= re .compile (u"[%s]+" % regex )
341322
323+ rv = []
324+
342325while True :
343326# Find the longest matching prefix
344327m = chars .match (self .chunk ,self .chunkOffset )
@@ -369,21 +352,29 @@ def charsUntil(self, characters, opposite = False):
def unget(self, char):
    """Push ``char`` back so that the next read returns it again.

    ``char`` is either ``None`` (the EOF marker), in which case nothing
    is done, or a single character.  When the read offset is past the
    start of the current chunk, ``char`` must equal the character just
    before the offset (this is asserted).

    There is no separate unget buffer: the character is restored by
    stepping ``chunkOffset`` back, or by prepending it to ``chunk`` when
    the offset is already 0.  The line/column position is rewound to
    match.
    """
    # EOF was never taken out of the chunk, so there is nothing to undo.
    if char is None:
        return

    if self.chunkOffset == 0:
        # unget is called quite rarely, so it's a good idea to do
        # more work here if it saves a bit of work in the frequently
        # called char and charsUntil.
        # So, just prepend the ungotten character onto the current
        # chunk:
        self.chunk = char + self.chunk
        self.chunkSize += 1
    else:
        # The character is still in the chunk; step back over it.
        self.chunkOffset -= 1
        assert self.chunk[self.chunkOffset] == char

    # Update the position attributes
    if char == u"\n":
        # Go back to the end of the previous line.  lastLineLength is
        # cleared afterwards, so a second newline cannot be ungotten
        # until it has been set again (the assertion below would fail).
        assert self.positionLine >= 1
        assert self.lastLineLength is not None
        self.positionLine -= 1
        self.positionCol = self.lastLineLength
        self.lastLineLength = None
    else:
        self.positionCol -= 1
387378
388379class EncodingBytes (str ):
389380"""String-like object with an associated position and various extra methods