@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128return b"" .join (rv )
129129
130130
131- def HTMLInputStream (source ,encoding = None ,parseMeta = True , chardet = True ):
131+ def HTMLInputStream (source ,override_encoding = None ,** kwargs ):
132132# Work around Python bug #20007: read(0) closes the connection.
133133# http://bugs.python.org/issue20007
134134if (isinstance (source ,http_client .HTTPResponse )or
@@ -142,12 +142,12 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142isUnicode = isinstance (source ,text_type )
143143
144144if isUnicode :
145- if encoding is not None :
146- raise TypeError ("Cannotexplicitly set an encoding with a unicodestring " )
145+ if override_encoding is not None :
146+ raise TypeError ("Cannot set anoverride encoding with a unicodeinput " )
147147
148148return HTMLUnicodeInputStream (source )
149149else :
150- return HTMLBinaryInputStream (source ,encoding , parseMeta , chardet )
150+ return HTMLBinaryInputStream (source ,override_encoding = override_encoding , ** kwargs )
151151
152152
153153class HTMLUnicodeInputStream (object ):
@@ -173,8 +173,6 @@ def __init__(self, source):
173173 regardless of any BOM or later declaration (such as in a meta
174174 element)
175175
176- parseMeta - Look for a <meta> element containing encoding information
177-
178176 """
179177
180178if not utils .supports_lone_surrogates :
@@ -390,7 +388,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390388
391389 """
392390
393- def __init__ (self ,source ,encoding = None ,parseMeta = True ,chardet = True ):
391+ def __init__ (self ,source ,override_encoding = None ,transport_encoding = None ,
392+ same_origin_parent_encoding = None ,likely_encoding = None ,
393+ default_encoding = "windows-1252" ,useChardet = True ):
394394"""Initialises the HTMLInputStream.
395395
396396 HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +403,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403403 regardless of any BOM or later declaration (such as in a meta
404404 element)
405405
406- parseMeta - Look for a <meta> element containing encoding information
407-
408406 """
409407# Raw Stream - for unicode objects this will encode to utf-8 and set
410408# self.charEncoding as appropriate
411409self .rawStream = self .openStream (source )
412410
413411HTMLUnicodeInputStream .__init__ (self ,self .rawStream )
414412
415- self .charEncoding = (lookupEncoding (encoding ),"certain" )
416-
417413# Encoding Information
418414# Number of bytes to use when looking for a meta element with
419415# encoding information
420416self .numBytesMeta = 1024
421417# Number of bytes to use when detecting encoding using chardet
422418self .numBytesChardet = 100
423- # Encoding to use if no other information can be found
424- self .defaultEncoding = "windows-1252"
419+ # Things from args
420+ self .override_encoding = override_encoding
421+ self .transport_encoding = transport_encoding
422+ self .same_origin_parent_encoding = same_origin_parent_encoding
423+ self .likely_encoding = likely_encoding
424+ self .default_encoding = default_encoding
425425
426- # Detect encoding iff no explicit "transport level" encoding is supplied
427- if (self .charEncoding [0 ]is None ):
428- self .charEncoding = self .detectEncoding (parseMeta ,chardet )
429- assert self .charEncoding [0 ]is not None
426+ # Determine encoding
427+ self .charEncoding = self .determineEncoding (useChardet )
428+ assert self .charEncoding [0 ]is not None
430429
431430# Call superclass
432431self .reset ()
@@ -454,21 +453,45 @@ def openStream(self, source):
454453
455454return stream
456455
457- def detectEncoding (self , parseMeta = True ,chardet = True ):
458- #First look for a BOM
456+ def determineEncoding (self ,chardet = True ):
457+ #BOMs take precedence over everything
459458# This will also read past the BOM if present
460- encoding = self .detectBOM ()
461- confidence = "certain"
462- # If there is no BOM need to look for meta elements with encoding
463- # information
464- if encoding is None and parseMeta :
465- encoding = self .detectEncodingMeta ()
466- confidence = "tentative"
459+ charEncoding = self .detectBOM (),"certain"
460+ if charEncoding [0 ]is not None :
461+ return charEncoding
462+
463+ # If we've been overridden, we've been overridden
464+ charEncoding = lookupEncoding (self .override_encoding ),"certain"
465+ if charEncoding [0 ]is not None :
466+ return charEncoding
467+
468+ # Now check the transport layer
469+ charEncoding = lookupEncoding (self .transport_encoding ),"certain"
470+ if charEncoding [0 ]is not None :
471+ return charEncoding
472+
473+ # Look for meta elements with encoding information
474+ charEncoding = self .detectEncodingMeta (),"tentative"
475+ if charEncoding [0 ]is not None :
476+ return charEncoding
477+
478+ # Parent document encoding
479+ charEncoding = lookupEncoding (self .same_origin_parent_encoding ),"tentative"
480+ if charEncoding [0 ]is not None and not charEncoding [0 ].name .startswith ("utf-16" ):
481+ return charEncoding
482+
483+ # "likely" encoding
484+ charEncoding = lookupEncoding (self .likely_encoding ),"tentative"
485+ if charEncoding [0 ]is not None :
486+ return charEncoding
487+
467488# Guess with chardet, if available
468- if encoding is None and chardet :
469- confidence = "tentative"
489+ if chardet :
470490try :
471491from chardet .universaldetector import UniversalDetector
492+ except ImportError :
493+ pass
494+ else :
472495buffers = []
473496detector = UniversalDetector ()
474497while not detector .done :
@@ -481,14 +504,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481504detector .close ()
482505encoding = lookupEncoding (detector .result ['encoding' ])
483506self .rawStream .seek (0 )
484- except ImportError :
485- pass
486- # If all else fails use the default encoding
487- if encoding is None :
488- confidence = "tentative"
489- encoding = lookupEncoding (self .defaultEncoding )
507+ if encoding is not None :
508+ return encoding ,"tentative"
509+
510+ # Try the default encoding
511+ charEncoding = lookupEncoding (self .default_encoding ),"tentative"
512+ if charEncoding [0 ]is not None :
513+ return charEncoding
490514
491- return encoding ,confidence
515+ # Fallback to html5lib's default if even that hasn't worked
516+ return lookupEncoding ("windows-1252" ),"tentative"
492517
493518def changeEncoding (self ,newEncoding ):
494519assert self .charEncoding [1 ]!= "certain"