33import types
44
55from constants import EOF ,spaceCharacters ,asciiLetters ,asciiUppercase
6- from constants import encodings
6+ from constants import encodings , ReparseException
77
88#Non-unicode versions of constants for use in the pre-parser
99spaceCharactersBytes = [str (item )for item in spaceCharacters ]
1616
1717# Cache for charsUntil()
1818charsUntilRegEx = {}
19+
class BufferedStream:
    """Buffering for streams that do not have buffering of their own

    Wraps a stream that only offers read() and records everything read
    from it, so that callers can tell() and seek() within the data that
    has already been consumed.

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        """Wrap `stream`, which only needs to provide read(n)."""
        self.stream = stream
        self.buffer = []
        # [chunk number, offset within that chunk]; the chunk number is -1
        # until the first chunk has been read from the underlying stream.
        self.position = [-1, 0]

    def tell(self):
        """Return the current read position, counted from the stream start."""
        consumed = sum(len(chunk) for chunk in self.buffer[:self.position[0]])
        return consumed + self.position[1]

    def seek(self, pos):
        """Move the read position to absolute offset `pos`.

        Only positions inside the already-buffered data (including its
        very end) can be reached; anything else trips the assertion.
        """
        assert 0 <= pos <= self._bufferedBytes()
        if not self.buffer:
            # Nothing has been read yet, so the only legal target is 0 and
            # the initial position already represents it.
            return
        offset = pos
        i = 0
        # Walk the chunks, discounting each chunk's own length. Because
        # pos never exceeds the buffered total, the loop cannot run past
        # the final chunk.
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Return up to `bytes` items, replaying buffered data first."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) - 1 and
              self.position[1] == len(self.buffer[-1])):
            # Positioned at the very end of the buffered data: nothing to
            # replay, so go straight to the underlying stream.
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Return the total length of all buffered chunks."""
        return sum(len(item) for item in self.buffer)

    def _readStream(self, bytes):
        """Read from the underlying stream and record the chunk."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        # The position is always the end of the chunk that was just added.
        self.position = [len(self.buffer) - 1, len(data)]
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from the buffer, topping up from the stream."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]
            available = len(bufferedData) - bufferOffset

            if remainingBytes <= available:
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Drain the rest of this chunk and carry on with the next.
                bytesToRead = available
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            # Every chunk after the first is consumed from its start.
            bufferOffset = 0

        if remainingBytes:
            # The buffer ran dry; fetch the remainder from the stream.
            rv.append(self._readStream(remainingBytes))

        return "".join(rv)
94+
1995
2096class HTMLInputStream :
2197"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -65,6 +141,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
65141if (self .charEncoding [0 ]is None ):
66142self .charEncoding = self .detectEncoding (parseMeta ,chardet )
67143
144+ self .reset ()
145+
146+ def reset (self ):
68147self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
69148'replace' )
70149
@@ -100,6 +179,10 @@ def openStream(self, source):
100179self .charEncoding = ("utf-8" ,"certain" )
101180import cStringIO
102181stream = cStringIO .StringIO (str (source ))
182+
183+ if not (hasattr (stream ,"tell" )and hasattr (stream ,"seek" )):
184+ stream = BufferedStream (stream )
185+
103186return stream
104187
105188def detectEncoding (self ,parseMeta = True ,chardet = True ):
@@ -128,7 +211,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
128211detector .feed (buffer )
129212detector .close ()
130213encoding = detector .result ['encoding' ]
131- self .seek ("" . join ( buffers ), 0 )
214+ self .rawStream . seek (0 )
132215except ImportError :
133216pass
134217# If all else fails use the default encoding
@@ -146,16 +229,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):
146229
def changeEncoding(self, newEncoding):
    """Switch the stream to `newEncoding` and request a reparse if needed.

    `newEncoding` is first normalised with codecName(); any UTF-16
    variant is replaced by UTF-8. If the name is not recognised nothing
    happens. If it matches the current encoding, that encoding is simply
    marked as certain. Otherwise the raw stream is rewound, the decoding
    reader is rebuilt for the new encoding and ReparseException is raised
    so that the caller can start over from the beginning of the input.
    """
    newEncoding = codecName(newEncoding)
    if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
        newEncoding = "utf-8"
    if newEncoding is None:
        return
    elif newEncoding == self.charEncoding[0]:
        self.charEncoding = (self.charEncoding[0], "certain")
    else:
        # Remember the old name before overwriting it so that the
        # exception message reports the real transition.
        oldEncoding = self.charEncoding[0]
        self.rawStream.seek(0)
        # reset() builds the reader from self.charEncoding[0], so the new
        # encoding has to be stored before it is called.
        self.charEncoding = (newEncoding, "certain")
        self.reset()
        raise ReparseException("Encoding changed from %s to %s" %
                               (oldEncoding, newEncoding))
243+
159244def detectBOM (self ):
160245"""Attempts to detect at BOM at the start of the stream. If
161246 an encoding can be determined from the BOM return the name of the
@@ -182,56 +267,21 @@ def detectBOM(self):
182267
183268# Set the read position past the BOM if one was found, otherwise
184269# set it to the start of the stream
185- self .seek (string , encoding and seek or 0 )
270+ self .rawStream . seek (encoding and seek or 0 )
186271
187272return encoding
188273
189- def seek (self ,buffer ,n ):
190- """Unget buffer[n:]"""
191- if hasattr (self .rawStream ,'unget' ):
192- self .rawStream .unget (buffer [n :])
193- return
194-
195- if hasattr (self .rawStream ,'seek' ):
196- try :
197- self .rawStream .seek (n )
198- return
199- except IOError :
200- pass
201-
202- class BufferedStream :
203- def __init__ (self ,data ,stream ):
204- self .data = data
205- self .stream = stream
206- def read (self ,chars = - 1 ):
207- if chars == - 1 or chars > len (self .data ):
208- result = self .data
209- self .data = ''
210- if chars == - 1 :
211- return result + self .stream .read ()
212- else :
213- return result + self .stream .read (chars - len (result ))
214- elif not self .data :
215- return self .stream .read (chars )
216- else :
217- result = self .data [:chars ]
218- self .data = self .data [chars :]
219- return result
220- def unget (self ,data ):
221- if self .data :
222- self .data += data
223- else :
224- self .data = data
225-
226- self .rawStream = BufferedStream (buffer [n :],self .rawStream )
227-
def detectEncodingMeta(self):
    """Report the encoding declared by the meta element

    Reads the first numBytesMeta items of the raw stream, rewinds the
    stream to its start and returns whatever EncodingParser finds in
    them; a UTF-16 variant is reported as "utf-8" instead.
    """
    head = self.rawStream.read(self.numBytesMeta)
    metaParser = EncodingParser(head)
    # Put the raw stream back at the start so the sniffed data is not lost.
    self.rawStream.seek(0)
    declared = metaParser.getEncoding()

    if declared in ("utf-16", "utf-16-be", "utf-16-le"):
        return "utf-8"

    return declared
236286
237287def updatePosition (self ,chars ):
@@ -485,13 +535,6 @@ def getEncoding(self):
485535break
486536if not keepParsing :
487537break
488- if self .encoding is not None :
489- self .encoding = self .encoding .strip ()
490- #Spec violation that complies with hsivonen + mjs
491- if (ascii_punctuation_re .sub ("" ,self .encoding )in
492- ("utf16" ,"utf16be" ,"utf16le" ,
493- "utf32" ,"utf32be" ,"utf32le" )):
494- self .encoding = "utf-8"
495538
496539return self .encoding
497540
@@ -666,11 +709,12 @@ def parse(self):
666709except StopIteration :
667710return None
668711
712+
def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    # isinstance() also accepts subclasses of str/unicode and is False for
    # None, so no separate None check is required.
    if not isinstance(encoding, types.StringTypes):
        return None
    # Look the name up with ASCII punctuation removed and case folded.
    canonicalName = ascii_punctuation_re.sub("", encoding).lower()
    return encodings.get(canonicalName, None)