77from .constants import encodings ,ReparseException
88from .import utils
99
10+ from io import StringIO
11+
12+ try :
13+ from io import BytesIO
14+ except ImportError :
15+ BytesIO = StringIO
16+
17+ try :
18+ from io import BufferedIOBase
19+ except ImportError :
20+ class BufferedIOBase (object ):
21+ pass
22+
# Non-unicode versions of constants for use in the pre-parser.
# Each member of the source constant is passed through str() and the results
# are frozen so the pre-parser can do cheap membership tests.
# NOTE(review): despite the "Bytes" suffix these sets are built with str(),
# not bytes - confirm that this is what the pre-parser expects.
spaceCharactersBytes = frozenset(str(char) for char in spaceCharacters)
asciiLettersBytes = frozenset(str(char) for char in asciiLetters)
@@ -101,10 +114,21 @@ def _readFromBuffer(self, bytes):
101114rv .append (self ._readStream (remainingBytes ))
102115
103116return "" .join (rv )
104-
105117
106118
107- class HTMLInputStream :
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    """Build the input stream class that matches the type of source.

    source is either a file-like object (anything with a read attribute) or
    a string.  Text sources produce an HTMLUnicodeInputStream; anything else
    produces an HTMLBinaryInputStream.

    encoding, parseMeta and chardet are only forwarded to
    HTMLBinaryInputStream; they are not used for text sources.

    """
    # For file-like objects a zero-length read is used as the sample, so the
    # kind of data the stream yields is what decides the stream class.
    sample = source.read(0) if hasattr(source, "read") else source

    if isinstance(sample, str):
        return HTMLUnicodeInputStream(source)
    return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
129+
130+
131+ class HTMLUnicodeInputStream :
108132"""Provides a unicode stream of characters to the HTMLTokenizer.
109133
110134 This class takes care of character encoding and removing or replacing
@@ -114,7 +138,7 @@ class HTMLInputStream:
114138
115139_defaultChunkSize = 10240
116140
117- def __init__ (self ,source , encoding = None , parseMeta = True , chardet = True ):
141+ def __init__ (self ,source ):
118142"""Initialises the HTMLInputStream.
119143
120144 HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -142,32 +166,12 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
142166# List of where new lines occur
143167self .newLines = [0 ]
144168
145- self .charEncoding = (codecName (encoding ),"certain" )
146-
147- # Raw Stream - for unicode objects this will encode to utf-8 and set
148- # self.charEncoding as appropriate
149- self .rawStream = self .openStream (source )
150-
151- # Encoding Information
152- #Number of bytes to use when looking for a meta element with
153- #encoding information
154- self .numBytesMeta = 512
155- #Number of bytes to use when using detecting encoding using chardet
156- self .numBytesChardet = 100
157- #Encoding to use if no other information can be found
158- self .defaultEncoding = "windows-1252"
159-
160- #Detect encoding iff no explicit "transport level" encoding is supplied
161- if (self .charEncoding [0 ]is None ):
162- self .charEncoding = self .detectEncoding (parseMeta ,chardet )
163-
169+ self .charEncoding = ("utf-8" ,"certain" )
170+ self .dataStream = self .openStream (source )
164171
165172self .reset ()
166173
167174def reset (self ):
168- self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
169- 'replace' )
170-
171175self .chunk = ""
172176self .chunkSize = 0
173177self .chunkOffset = 0
@@ -191,128 +195,16 @@ def openStream(self, source):
191195if hasattr (source ,'read' ):
192196stream = source
193197else :
194- # Otherwise treat source as a string and convert to a file object
195- if isinstance (source ,str ):
196- # XXX: we should handle lone surrogates here
197- source = source .encode ('utf-8' ,errors = "replace" )
198- self .charEncoding = ("utf-8" ,"certain" )
199- try :
200- from io import BytesIO
201- except :
202- try :
203- # 2to3 converts this line to: from io import StringIO
204- from io import StringIO as BytesIO
205- except :
206- from io import StringIO as BytesIO
207- stream = BytesIO (source )
198+ stream = StringIO (source )
208199
209- if (not (hasattr (stream ,"tell" )and hasattr (stream ,"seek" ))or
200+ if (#not isinstance(stream, BufferedIOBase) and
201+ not (hasattr (stream ,"tell" )and
202+ hasattr (stream ,"seek" ))or
210203stream is sys .stdin ):
211204stream = BufferedStream (stream )
212205
213206return stream
214207
215- def detectEncoding (self ,parseMeta = True ,chardet = True ):
216- #First look for a BOM
217- #This will also read past the BOM if present
218- encoding = self .detectBOM ()
219- confidence = "certain"
220- #If there is no BOM need to look for meta elements with encoding
221- #information
222- if encoding is None and parseMeta :
223- encoding = self .detectEncodingMeta ()
224- confidence = "tentative"
225- #Guess with chardet, if avaliable
226- if encoding is None and chardet :
227- confidence = "tentative"
228- try :
229- from chardet .universaldetector import UniversalDetector
230- buffers = []
231- detector = UniversalDetector ()
232- while not detector .done :
233- buffer = self .rawStream .read (self .numBytesChardet )
234- assert isinstance (buffer ,bytes )
235- if not buffer :
236- break
237- buffers .append (buffer )
238- detector .feed (buffer )
239- detector .close ()
240- encoding = detector .result ['encoding' ]
241- self .rawStream .seek (0 )
242- except ImportError :
243- pass
244- # If all else fails use the default encoding
245- if encoding is None :
246- confidence = "tentative"
247- encoding = self .defaultEncoding
248-
249- #Substitute for equivalent encodings:
250- encodingSub = {"iso-8859-1" :"windows-1252" }
251-
252- if encoding .lower ()in encodingSub :
253- encoding = encodingSub [encoding .lower ()]
254-
255- return encoding ,confidence
256-
257- def changeEncoding (self ,newEncoding ):
258- newEncoding = codecName (newEncoding )
259- if newEncoding in ("utf-16" ,"utf-16-be" ,"utf-16-le" ):
260- newEncoding = "utf-8"
261- if newEncoding is None :
262- return
263- elif newEncoding == self .charEncoding [0 ]:
264- self .charEncoding = (self .charEncoding [0 ],"certain" )
265- else :
266- self .rawStream .seek (0 )
267- self .reset ()
268- self .charEncoding = (newEncoding ,"certain" )
269- raise ReparseException ("Encoding changed from %s to %s" % (self .charEncoding [0 ],newEncoding ))
270-
271- def detectBOM (self ):
272- """Attempts to detect at BOM at the start of the stream. If
273- an encoding can be determined from the BOM return the name of the
274- encoding otherwise return None"""
275- bomDict = {
276- codecs .BOM_UTF8 :'utf-8' ,
277- codecs .BOM_UTF16_LE :'utf-16-le' ,codecs .BOM_UTF16_BE :'utf-16-be' ,
278- codecs .BOM_UTF32_LE :'utf-32-le' ,codecs .BOM_UTF32_BE :'utf-32-be'
279- }
280-
281- # Go to beginning of file and read in 4 bytes
282- string = self .rawStream .read (4 )
283- assert isinstance (string ,bytes )
284-
285- # Try detecting the BOM using bytes from the string
286- encoding = bomDict .get (string [:3 ])# UTF-8
287- seek = 3
288- if not encoding :
289- # Need to detect UTF-32 before UTF-16
290- encoding = bomDict .get (string )# UTF-32
291- seek = 4
292- if not encoding :
293- encoding = bomDict .get (string [:2 ])# UTF-16
294- seek = 2
295-
296- # Set the read position past the BOM if one was found, otherwise
297- # set it to the start of the stream
298- self .rawStream .seek (encoding and seek or 0 )
299-
300- return encoding
301-
302- def detectEncodingMeta (self ):
303- """Report the encoding declared by the meta element
304- """
305- buffer = self .rawStream .read (self .numBytesMeta )
306- assert isinstance (buffer ,bytes )
307- parser = EncodingParser (buffer )
308- self .rawStream .seek (0 )
309- encoding = parser .getEncoding ()
310-
311- if encoding in ("utf-16" ,"utf-16-be" ,"utf-16-le" ):
312- encoding = "utf-8"
313-
314- return encoding
315-
316208def _position (self ,offset ):
317209chunk = self .chunk
318210nLines = chunk .count ('\n ' ,0 ,offset )
@@ -475,6 +367,177 @@ def unget(self, char):
475367self .chunkOffset -= 1
476368assert self .chunk [self .chunkOffset ]== char
477369
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes a source of bytes, works out which character encoding
    to decode it with (explicit argument, BOM, <meta> pre-parse, chardet,
    then a default), replaces incorrect byte-sequences while decoding and
    also provides column and line tracking.

    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLBinaryInputStream.

        HTMLBinaryInputStream(source, [encoding]) -> Normalized stream from
        source for use by html5lib.

        source can be either a file-like object (anything with a read
        attribute) or a bytes string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        chardet - Try the chardet library (when it can be imported) if
                  neither a BOM nor a <meta> element gives an encoding

        """
        self.charEncoding = (codecName(encoding), "certain")

        # Raw Stream - file-like object over the undecoded bytes of source
        self.rawStream = self.openStream(source)

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes read per chunk when detecting encoding using chardet
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding iff no explicit "transport level" encoding is
        # supplied
        if self.charEncoding[0] is None:
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # HTMLUnicodeInputStream.__init__ overwrites self.charEncoding with
        # ("utf-8", "certain") before it calls reset(), which would throw
        # away the encoding chosen above and decode every source as UTF-8.
        # Remember the choice, restore it after the superclass has run and
        # rebuild the decoding reader with it.
        # NOTE(review): assumes reset() only rebuilds state and does not
        # consume any of rawStream - confirm against the superclass.
        chosenEncoding = self.charEncoding
        HTMLUnicodeInputStream.__init__(self, self.rawStream)
        self.charEncoding = chosenEncoding
        self.reset()

    def reset(self):
        """Wrap rawStream in a decoding reader for the current encoding
        (undecodable bytes are replaced), then reset the superclass state.
        """
        makeReader = codecs.getreader(self.charEncoding[0])
        self.dataStream = makeReader(self.rawStream, 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file-like object or a bytes string.

        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        # The detect* methods call seek() on the raw stream, so anything
        # without tell/seek (and stdin) is wrapped in a BufferedStream.
        if (not (hasattr(stream, "tell") and hasattr(stream, "seek")) or
                stream is sys.stdin):
            stream = BufferedStream(stream)

        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Work out the encoding of rawStream.

        Tries, in order: a BOM, a <meta> declaration (if parseMeta), the
        chardet library (if chardet is true and the library can be
        imported) and finally self.defaultEncoding.

        Returns an (encoding, confidence) tuple; confidence is "certain"
        only when a BOM was found, otherwise "tentative".

        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                # Rewind so decoding starts from the first byte again
                self.rawStream.seek(0)
            except ImportError:
                # chardet is optional; carry on to the default encoding
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to newEncoding after a tentative detection.

        Must only be called while the current encoding is not "certain".
        Any UTF-16 variant is treated as utf-8.  If codecName() gives None
        for newEncoding nothing happens; if it matches the current encoding
        the confidence becomes "certain".  Otherwise the raw stream is
        rewound, the stream is reset for the new encoding and
        ReparseException is raised.

        """
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            # Keep the old name for the message before it is overwritten
            oldEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            # Store the new encoding before reset() so that the rebuilt
            # reader decodes with it rather than with the old codec
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect a BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Read in the first 4 bytes (the longest possible BOM)
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])  # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)  # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(seek if encoding else 0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element

        Only the first numBytesMeta bytes are examined; the raw stream is
        rewound afterwards.  Any UTF-16 variant is reported as utf-8.
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding
540+
478541class EncodingBytes (str ):
479542"""String-like object with an associated position and various extra methods
480543 If the position is ever greater than the string length then an exception is