@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128return b"" .join (rv )
129129
130130
131- def HTMLInputStream (source ,encoding = None , parseMeta = True , chardet = True ):
131+ def HTMLInputStream (source ,** kwargs ):
132132# Work around Python bug #20007: read(0) closes the connection.
133133# http://bugs.python.org/issue20007
134134if (isinstance (source ,http_client .HTTPResponse )or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142isUnicode = isinstance (source ,text_type )
143143
144144if isUnicode :
145- if encoding is not None :
146- raise TypeError ("Cannot explicitly set an encoding with a unicode string" )
145+ encodings = [x for x in kwargs if x .endswith ("_encoding" )]
146+ if encodings :
147+ raise TypeError ("Cannot set an encoding with a unicode input, set %r" % encodings )
147148
148- return HTMLUnicodeInputStream (source )
149+ return HTMLUnicodeInputStream (source , ** kwargs )
149150else :
150- return HTMLBinaryInputStream (source ,encoding , parseMeta , chardet )
151+ return HTMLBinaryInputStream (source ,** kwargs )
151152
152153
153154class HTMLUnicodeInputStream (object ):
@@ -173,8 +174,6 @@ def __init__(self, source):
173174 regardless of any BOM or later declaration (such as in a meta
174175 element)
175176
176- parseMeta - Look for a <meta> element containing encoding information
177-
178177 """
179178
180179if not utils .supports_lone_surrogates :
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390389
391390 """
392391
393- def __init__ (self ,source ,encoding = None ,parseMeta = True ,chardet = True ):
392+ def __init__ (self ,source ,override_encoding = None ,transport_encoding = None ,
393+ same_origin_parent_encoding = None ,likely_encoding = None ,
394+ default_encoding = "windows-1252" ,useChardet = True ):
394395"""Initialises the HTMLInputStream.
395396
396397 HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403404 regardless of any BOM or later declaration (such as in a meta
404405 element)
405406
406- parseMeta - Look for a <meta> element containing encoding information
407-
408407 """
409408# Raw Stream - for unicode objects this will encode to utf-8 and set
410409# self.charEncoding as appropriate
411410self .rawStream = self .openStream (source )
412411
413412HTMLUnicodeInputStream .__init__ (self ,self .rawStream )
414413
415- self .charEncoding = (lookupEncoding (encoding ),"certain" )
416-
417414# Encoding Information
418415# Number of bytes to use when looking for a meta element with
419416# encoding information
420417self .numBytesMeta = 1024
421418# Number of bytes to use when detecting the encoding with chardet
422419self .numBytesChardet = 100
423- # Encoding to use if no other information can be found
424- self .defaultEncoding = "windows-1252"
420+ # Things from args
421+ self .override_encoding = override_encoding
422+ self .transport_encoding = transport_encoding
423+ self .same_origin_parent_encoding = same_origin_parent_encoding
424+ self .likely_encoding = likely_encoding
425+ self .default_encoding = default_encoding
425426
426- # Detect encoding iff no explicit "transport level" encoding is supplied
427- if (self .charEncoding [0 ]is None ):
428- self .charEncoding = self .detectEncoding (parseMeta ,chardet )
429- assert self .charEncoding [0 ]is not None
427+ # Determine encoding
428+ self .charEncoding = self .determineEncoding (useChardet )
429+ assert self .charEncoding [0 ]is not None
430430
431431# Call superclass
432432self .reset ()
@@ -454,21 +454,45 @@ def openStream(self, source):
454454
455455return stream
456456
457- def detectEncoding (self , parseMeta = True ,chardet = True ):
458- #First look for a BOM
457+ def determineEncoding (self ,chardet = True ):
458+ #BOMs take precedence over everything
459459# This will also read past the BOM if present
460- encoding = self .detectBOM ()
461- confidence = "certain"
462- # If there is no BOM need to look for meta elements with encoding
463- # information
464- if encoding is None and parseMeta :
465- encoding = self .detectEncodingMeta ()
466- confidence = "tentative"
460+ charEncoding = self .detectBOM (),"certain"
461+ if charEncoding [0 ]is not None :
462+ return charEncoding
463+
464+ # If we've been overridden, we've been overridden
465+ charEncoding = lookupEncoding (self .override_encoding ),"certain"
466+ if charEncoding [0 ]is not None :
467+ return charEncoding
468+
469+ # Now check the transport layer
470+ charEncoding = lookupEncoding (self .transport_encoding ),"certain"
471+ if charEncoding [0 ]is not None :
472+ return charEncoding
473+
474+ # Look for meta elements with encoding information
475+ charEncoding = self .detectEncodingMeta (),"tentative"
476+ if charEncoding [0 ]is not None :
477+ return charEncoding
478+
479+ # Parent document encoding
480+ charEncoding = lookupEncoding (self .same_origin_parent_encoding ),"tentative"
481+ if charEncoding [0 ]is not None and not charEncoding [0 ].name .startswith ("utf-16" ):
482+ return charEncoding
483+
484+ # "likely" encoding
485+ charEncoding = lookupEncoding (self .likely_encoding ),"tentative"
486+ if charEncoding [0 ]is not None :
487+ return charEncoding
488+
467489# Guess with chardet, if available
468- if encoding is None and chardet :
469- confidence = "tentative"
490+ if chardet :
470491try :
471492from chardet .universaldetector import UniversalDetector
493+ except ImportError :
494+ pass
495+ else :
472496buffers = []
473497detector = UniversalDetector ()
474498while not detector .done :
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481505detector .close ()
482506encoding = lookupEncoding (detector .result ['encoding' ])
483507self .rawStream .seek (0 )
484- except ImportError :
485- pass
486- # If all else fails use the default encoding
487- if encoding is None :
488- confidence = "tentative"
489- encoding = lookupEncoding (self .defaultEncoding )
508+ if encoding is not None :
509+ return encoding ,"tentative"
510+
511+ # Try the default encoding
512+ charEncoding = lookupEncoding (self .default_encoding ),"tentative"
513+ if charEncoding [0 ]is not None :
514+ return charEncoding
490515
491- return encoding ,confidence
516+ # Fall back to html5lib's default if even that hasn't worked
517+ return lookupEncoding ("windows-1252" ),"tentative"
492518
493519def changeEncoding (self ,newEncoding ):
494520assert self .charEncoding [1 ]!= "certain"