@@ -72,11 +72,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
7272self .chunk = u""
7373self .chunkSize = 0
7474self .chunkOffset = 0
75- self .ungetBuffer = []# reversed list of chars from unget()
76- self .readChars = []
7775self .errors = []
78-
79- self .lineLengths = []
76+ # Single-character buffer to handle 'unget'
77+ self .ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
78+
79+ # Remember the current position in the document
80+ self .positionLine = 1
81+ self .positionCol = 0
82+ # Remember the length of the last line, so unget("\n") can restore
83+ # positionCol. (Only one character can be ungot at once, so we only
84+ # need to remember the single last line.)
85+ self .lastLineLength = None
8086
8187#Flag to indicate we may have a CR LF broken across a data chunk
8288self ._lastChunkEndsWithCR = False
@@ -219,51 +225,59 @@ def detectEncodingMeta(self):
219225encoding = parser .getEncoding ()
220226return encoding
221227
def updatePosition(self, chars):
    """Advance the tracked position past a run of consumed characters.

    ``chars`` is a string of characters that have just been read from the
    stream.  ``positionLine`` is increased by the number of newlines in
    ``chars`` and ``positionCol`` becomes the number of characters that
    follow the last newline.  ``lastLineLength`` is set to the length of
    the line terminated by that last newline, so that a following
    ``unget(u"\\n")`` can restore the column.
    """
    # Find the last newline character
    lastNewline = chars.rfind(u"\n")
    if lastNewline == -1:
        # No newlines in chars: we are still on the same line
        self.positionCol += len(chars)
        return

    # Find the last-but-one newline character
    previousNewline = chars.rfind(u"\n", 0, lastNewline)
    if previousNewline == -1:
        # Only one newline: the line it terminates started before chars,
        # so it includes the columns already counted
        newlineCount = 1
        completedLineLength = self.positionCol + lastNewline
    else:
        # At least two newlines: the last completed line lies entirely
        # inside chars, between the final two newlines
        newlineCount = chars.count(u"\n")
        completedLineLength = lastNewline - (previousNewline + 1)

    self.positionLine += newlineCount
    self.lastLineLength = completedLineLength
    self.positionCol = len(chars) - (lastNewline + 1)
240251
def position(self):
    """Returns (line, col) of the current position in the stream.

    The values are read directly from ``positionLine`` and
    ``positionCol``; this method does not modify any state.
    """
    return (self.positionLine, self.positionCol)
249255
def char(self):
    """Read one character from the stream or the unget buffer.

    If a character was pushed back with ``unget`` it is returned first and
    the buffer is cleared; otherwise the next character of the current
    chunk is returned, calling ``readChunk`` when the chunk is exhausted.
    Returns EOF when ``readChunk`` reports that no more data is available.

    The position attributes are advanced for every character returned,
    except for EOF, which leaves them unchanged.
    """
    c = self.ungetChar
    if c != u"":
        # Use the ungot character, and reset the buffer
        self.ungetChar = u""
    else:
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        c = self.chunk[self.chunkOffset]
        self.chunkOffset += 1

    # Update the position attributes
    if c == u"\n":
        # Remember the finished line's length so unget(u"\n") can undo this
        self.lastLineLength = self.positionCol
        self.positionCol = 0
        self.positionLine += 1
    elif c is not EOF:
        self.positionCol += 1

    return c
268282
269283def readChunk (self ,chunkSize = _defaultChunkSize ):
@@ -282,20 +296,18 @@ def readChunk(self, chunkSize=_defaultChunkSize):
282296
283297data = data .replace (u"\u0000 " ,u"\ufffd " )
284298#Check for CR LF broken across chunks
285- if (self ._lastChunkEndsWithCR and data [0 ]== "\n " ):
299+ if (self ._lastChunkEndsWithCR and data [0 ]== u "\n " ):
286300data = data [1 :]
287301# Stop if the chunk is now empty
288302if not data :
289303return False
290- self ._lastChunkEndsWithCR = data [- 1 ]== "\r "
291- data = data .replace ("\r \n " ,"\n " )
292- data = data .replace ("\r " ,"\n " )
304+ self ._lastChunkEndsWithCR = data [- 1 ]== u "\r "
305+ data = data .replace (u "\r \n " ,u "\n " )
306+ data = data .replace (u "\r " ,u "\n " )
293307
294- data = unicode (data )
295308self .chunk = data
296309self .chunkSize = len (data )
297310
298- self .updatePosition ()
299311return True
300312
301313def charsUntil (self ,characters ,opposite = False ):
@@ -307,22 +319,22 @@ def charsUntil(self, characters, opposite = False):
307319
308320rv = []
309321
310- # The unget buffer is typically small and rarely used, so
311- # just check each character individually
312- while self .ungetBuffer :
313- if self .ungetBuffer [- 1 ]== EOF or (self .ungetBuffer [- 1 ]in characters )!= opposite :
314- r = u"" .join (rv )
315- self .readChars .extend (list (r ))
316- return r
322+ # Check the ungot character, if any.
323+ # (Since it's only a single character, don't use the regex here)
324+ char = self .ungetChar
325+ if char != u"" :
326+ if char is EOF or (char in characters )!= opposite :
327+ return u""
317328else :
318- rv .append (self .ungetBuffer .pop ())
329+ rv .append (char )
330+ self .ungetChar = u""
319331
320332# Use a cache of regexps to find the required characters
321333try :
322334chars = charsUntilRegEx [(characters ,opposite )]
323335except KeyError :
324336for c in characters :assert (ord (c )< 128 )
325- regex = u"" .join (["\\ x%02x" % ord (c )for c in characters ])
337+ regex = u"" .join ([u "\\ x%02x"% ord (c )for c in characters ])
326338if not opposite :
327339regex = u"^%s" % regex
328340chars = charsUntilRegEx [(characters ,opposite )]= re .compile (u"[%s]*" % regex )
@@ -343,24 +355,27 @@ def charsUntil(self, characters, opposite = False):
343355break
344356
345357r = u"" .join (rv )
346- self .readChars . extend ( list ( r ) )
358+ self .updatePosition ( r )
347359return r
348360
def unget(self, char):
    """Push a single character back so the next read returns it again.

    ``char`` is the character most recently read, or ``None`` (which this
    class uses to represent EOF).  Only one character may be ungotten at
    once: it must be consumed again before any further call to ``unget``.
    An empty string is ignored, since ``u""`` is the buffer's marker for
    'no character'.

    The position attributes are moved back by one character; ungetting
    ``None`` leaves them unchanged.
    """
    if char == u"":
        # Nothing to push back; storing u"" would only corrupt the column
        return

    assert self.ungetChar == u""
    self.ungetChar = char

    # Update the position attributes
    if char is None:
        return

    if char == u"\n":
        # A newline can only have been read if we are past the first line
        assert self.positionLine > 1
        assert self.lastLineLength is not None
        self.positionLine -= 1
        self.positionCol = self.lastLineLength
        # Only a single previous line length is remembered
        self.lastLineLength = None
    else:
        self.positionCol -= 1
364379
365380class EncodingBytes (str ):
366381"""String-like object with an associated position and various extra methods