@@ -21,10 +21,10 @@ class BufferedIOBase(object):
2121pass
2222
# Bytes (non-unicode) versions of constants for use in the pre-parser.
# The pre-parser compares length-1 slices of a bytes object against these
# sets, so every member must itself be a bytes object: each item of the
# text constants is ASCII-encoded rather than passed through str().
spaceCharactersBytes = frozenset(
    item.encode("ascii") for item in spaceCharacters)
asciiLettersBytes = frozenset(
    item.encode("ascii") for item in asciiLetters)
asciiUppercaseBytes = frozenset(
    item.encode("ascii") for item in asciiUppercase)
# Whitespace plus the two angle brackets.
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
2828
2929invalid_unicode_re = re .compile ("[\u0001 -\u0008 \u000B \u000E -\u001F \u007F -\u009F \uD800 -\uDFFF \uFDD0 -\uFDEF \uFFFE \uFFFF \U0001FFFE \U0001FFFF \U0002FFFE \U0002FFFF \U0003FFFE \U0003FFFF \U0004FFFE \U0004FFFF \U0005FFFE \U0005FFFF \U0006FFFE \U0006FFFF \U0007FFFE \U0007FFFF \U0008FFFE \U0008FFFF \U0009FFFE \U0009FFFF \U000AFFFE \U000AFFFF \U000BFFFE \U000BFFFF \U000CFFFE \U000CFFFF \U000DFFFE \U000DFFFF \U000EFFFE \U000EFFFF \U000FFFFE \U000FFFFF \U0010FFFE \U0010FFFF ]" )
3030
@@ -391,12 +391,14 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
391391 parseMeta - Look for a <meta> element containing encoding information
392392
393393 """
394- self .charEncoding = (codecName (encoding ),"certain" )
395-
396394# Raw Stream - for unicode objects this will encode to utf-8 and set
397395# self.charEncoding as appropriate
398396self .rawStream = self .openStream (source )
399397
398+ HTMLUnicodeInputStream .__init__ (self ,self .rawStream )
399+
400+ self .charEncoding = (codecName (encoding ),"certain" )
401+
400402# Encoding Information
401403#Number of bytes to use when looking for a meta element with
402404#encoding information
@@ -411,7 +413,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
411413self .charEncoding = self .detectEncoding (parseMeta ,chardet )
412414
413415#Call superclass
414- HTMLUnicodeInputStream . __init__ ( self , self . rawStream )
416+ self . reset ( )
415417
416418def reset (self ):
417419self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
@@ -538,12 +540,13 @@ def detectEncodingMeta(self):
538540
539541return encoding
540542
541- class EncodingBytes (str ):
543+ class EncodingBytes (bytes ):
542544"""String-like object with an associated position and various extra methods
543545 If the position is ever greater than the string length then an exception is
544546 raised"""
545547def __new__ (self ,value ):
546- return str .__new__ (self ,value .lower ())
548+ assert isinstance (value ,bytes )
549+ return bytes .__new__ (self ,value .lower ())
547550
548551def __init__ (self ,value ):
549552self ._position = - 1
@@ -557,7 +560,7 @@ def __next__(self):
557560raise StopIteration
558561elif p < 0 :
559562raise TypeError
560- return self [p ]
563+ return self [p : p + 1 ]
561564
562565def previous (self ):
563566p = self ._position
@@ -566,7 +569,7 @@ def previous(self):
566569elif p < 0 :
567570raise TypeError
568571self ._position = p = p - 1
569- return self [p ]
572+ return self [p : p + 1 ]
570573
571574def setPosition (self ,position ):
572575if self ._position >= len (self ):
@@ -584,15 +587,15 @@ def getPosition(self):
584587position = property (getPosition ,setPosition )
585588
586589def getCurrentByte (self ):
587- return self [self .position ]
590+ return self [self .position : self . position + 1 ]
588591
589592currentByte = property (getCurrentByte )
590593
591594def skip (self ,chars = spaceCharactersBytes ):
592595"""Skip past a list of characters"""
593596p = self .position # use property for the error-checking
594597while p < len (self ):
595- c = self [p ]
598+ c = self [p : p + 1 ]
596599if c not in chars :
597600self ._position = p
598601return c
@@ -603,7 +606,7 @@ def skip(self, chars=spaceCharactersBytes):
603606def skipUntil (self ,chars ):
604607p = self .position
605608while p < len (self ):
606- c = self [p ]
609+ c = self [p : p + 1 ]
607610if c in chars :
608611self ._position = p
609612return c
@@ -645,12 +648,12 @@ def __init__(self, data):
645648
646649def getEncoding (self ):
647650methodDispatch = (
648- ("<!--" ,self .handleComment ),
649- ("<meta" ,self .handleMeta ),
650- ("</" ,self .handlePossibleEndTag ),
651- ("<!" ,self .handleOther ),
652- ("<?" ,self .handleOther ),
653- ("<" ,self .handlePossibleStartTag ))
651+ (b "<!--" ,self .handleComment ),
652+ (b "<meta" ,self .handleMeta ),
653+ (b "</" ,self .handlePossibleEndTag ),
654+ (b "<!" ,self .handleOther ),
655+ (b "<?" ,self .handleOther ),
656+ (b "<" ,self .handlePossibleStartTag ))
654657for byte in self .data :
655658keepParsing = True
656659for key ,method in methodDispatch :
@@ -663,37 +666,48 @@ def getEncoding(self):
663666break
664667if not keepParsing :
665668break
666-
669+
667670return self .encoding
668671
def handleComment(self):
    """Skip over a comment.

    Asks ``self.data`` to jump to the closing ``-->`` marker and returns
    whatever ``jumpTo`` returns. The marker is a bytes literal because
    the data being scanned is bytes, not text.
    """
    return self.data.jumpTo(b"-->")
672675
def handleMeta(self):
    """Scan the attributes of a ``<meta`` tag for an encoding declaration.

    Attributes are pulled one at a time with ``self.getAttribute()``:

    * ``charset=X`` -- if ``codecName(X)`` recognises it, that codec is
      stored in ``self.encoding`` immediately.
    * ``content=...`` -- the value is handed to ``ContentAttrParser``; a
      recognised charset found there is only accepted when the element
      also carries ``http-equiv=content-type``. If the pragma has not
      been seen yet, the codec is remembered in ``pendingEncoding``.
    * ``http-equiv=content-type`` -- marks the pragma as present and
      commits a previously remembered ``content`` charset, if any.

    Returns:
        False as soon as ``self.encoding`` has been set; True when the
        tag yields no usable encoding (not a real meta element, or the
        attributes ran out).
    """
    if self.data.currentByte not in spaceCharactersBytes:
        # "<meta" not followed by whitespace is not a meta element, so
        # just keep going.
        return True

    # We have a valid meta element; search its attributes.
    hasPragma = False
    pendingEncoding = None
    while True:
        # Try to find the next attribute after the current position.
        attr = self.getAttribute()
        if attr is None:
            return True

        name = attr[0]
        if name == b"http-equiv":
            hasPragma = attr[1] == b"content-type"
            if hasPragma and pendingEncoding is not None:
                # A content charset was seen before the pragma itself.
                self.encoding = pendingEncoding
                return False
        elif name == b"charset":
            codec = codecName(attr[1])
            if codec is not None:
                self.encoding = codec
                return False
        elif name == b"content":
            contentParser = ContentAttrParser(EncodingBytes(attr[1]))
            tentativeEncoding = contentParser.parse()
            if tentativeEncoding is not None:
                codec = codecName(tentativeEncoding)
                if codec is not None:
                    if hasPragma:
                        self.encoding = codec
                        return False
                    # Hold the codec until the pragma shows up.
                    pendingEncoding = codec
697711
def handlePossibleStartTag(self):
    """Handle a ``<`` that may begin a start tag.

    Delegates to ``handlePossibleTag`` with ``endTag`` set to False and
    returns its result unchanged.
    """
    return self.handlePossibleTag(False)
@@ -714,7 +728,7 @@ def handlePossibleTag(self, endTag):
714728return True
715729
716730c = data .skipUntil (spacesAngleBrackets )
717- if c == "<" :
731+ if c == b "<" :
718732#return to the first step in the overall "two step" algorithm
719733#reprocessing the < byte
720734data .previous ()
@@ -726,31 +740,31 @@ def handlePossibleTag(self, endTag):
726740return True
727741
def handleOther(self):
    """Skip the rest of a ``<!...`` or ``<?...`` construct.

    Asks ``self.data`` to jump to the next ``>`` byte and returns
    whatever ``jumpTo`` returns. The marker is a bytes literal because
    the data being scanned is bytes, not text.
    """
    return self.data.jumpTo(b">")
730744
731745def getAttribute (self ):
732746"""Return a name,value pair for the next attribute in the stream,
733747 if one is found, or None"""
734748data = self .data
735749# Step 1 (skip chars)
736- c = data .skip (spaceCharactersBytes | frozenset ("/" ))
750+ c = data .skip (spaceCharactersBytes | frozenset ([b"/" ]))
751+ assert c is None or len (c )== 1
737752# Step 2
738- if c in (">" ,None ):
753+ if c in (b ">" ,None ):
739754return None
740755# Step 3
741756attrName = []
742757attrValue = []
743758#Step 4 attribute name
744759while True :
745- if c == "=" and attrName :
760+ if c == b "="and attrName :
746761break
747762elif c in spaceCharactersBytes :
748763#Step 6!
749764c = data .skip ()
750- c = next (data )
751765break
752- elif c in ("/" ,">" ):
753- return "" .join (attrName ),""
766+ elif c in (b "/" ,b ">" ):
767+ return b "" .join (attrName ),b ""
754768elif c in asciiUppercaseBytes :
755769attrName .append (c .lower ())
756770elif c == None :
@@ -760,15 +774,15 @@ def getAttribute(self):
760774#Step 5
761775c = next (data )
762776#Step 7
763- if c != "=" :
777+ if c != b "=" :
764778data .previous ()
765- return "" .join (attrName ),""
779+ return b "" .join (attrName ),b ""
766780#Step 8
767781next (data )
768782#Step 9
769783c = data .skip ()
770784#Step 10
771- if c in ("'" ,'"' ):
785+ if c in (b "'" ,b '"' ):
772786#10.1
773787quoteChar = c
774788while True :
@@ -777,15 +791,15 @@ def getAttribute(self):
777791#10.3
778792if c == quoteChar :
779793next (data )
780- return "" .join (attrName ),"" .join (attrValue )
794+ return b "" .join (attrName ),b "" .join (attrValue )
781795#10.4
782796elif c in asciiUppercaseBytes :
783797attrValue .append (c .lower ())
784798#10.5
785799else :
786800attrValue .append (c )
787- elif c == ">" :
788- return "" .join (attrName ),""
801+ elif c == b ">" :
802+ return b "" .join (attrName ),b ""
789803elif c in asciiUppercaseBytes :
790804attrValue .append (c .lower ())
791805elif c is None :
@@ -796,7 +810,7 @@ def getAttribute(self):
796810while True :
797811c = next (data )
798812if c in spacesAngleBrackets :
799- return "" .join (attrName ),"" .join (attrValue )
813+ return b "" .join (attrName ),b "" .join (attrValue )
800814elif c in asciiUppercaseBytes :
801815attrValue .append (c .lower ())
802816elif c is None :
@@ -807,21 +821,22 @@ def getAttribute(self):
807821
808822class ContentAttrParser (object ):
809823def __init__ (self ,data ):
824+ assert isinstance (data ,bytes )
810825self .data = data
811826def parse (self ):
812827try :
813828#Check if the attr name is charset
814829#otherwise return
815- self .data .jumpTo ("charset" )
830+ self .data .jumpTo (b "charset" )
816831self .data .position += 1
817832self .data .skip ()
818- if not self .data .currentByte == "=" :
833+ if not self .data .currentByte == b "=" :
819834#If there is no = sign keep looking for attrs
820835return None
821836self .data .position += 1
822837self .data .skip ()
823838#Look for an encoding between matching quote marks
824- if self .data .currentByte in ('"' ,"'" ):
839+ if self .data .currentByte in (b '"' ,b "'" ):
825840quoteMark = self .data .currentByte
826841self .data .position += 1
827842oldPosition = self .data .position
@@ -845,6 +860,11 @@ def parse(self):
845860def codecName (encoding ):
846861"""Return the python codec name corresponding to an encoding or None if the
847862 string doesn't correspond to a valid encoding."""
863+ if isinstance (encoding ,bytes ):
864+ try :
865+ encoding = encoding .decode ("ascii" )
866+ except UnicodeDecodeError :
867+ return None
848868if encoding :
849869canonicalName = ascii_punctuation_re .sub ("" ,encoding ).lower ()
850870return encodings .get (canonicalName ,None )