# Byte-string (str) copies of the ASCII letter collections, built by applying
# str() to every member in iteration order.
asciiLettersBytes = list(map(str, asciiLetters))
asciiUppercaseBytes = list(map(str, asciiUppercase))
1313
# Matches a single code point that is reported as an "invalid-codepoint"
# error: the control-character ranges U+0001-U+0008, U+000E-U+001F and
# U+007F-U+009F, the surrogate range U+D800-U+DFFF, U+FDD0-U+FDDF, and the
# U+xFFFE / U+xFFFF pair at the end of every plane.
#
# Every alternative must be separated by "|".  The U+BFFFE / U+BFFFF pair
# previously had no separator between them, so the pattern only matched
# those two characters when they appeared back to back and missed each of
# them on its own.
invalid_unicode_re = re.compile(
    u"[\u0001-\u0008]|[\u000E-\u001F]|[\u007F-\u009F]|[\uD800-\uDFFF]|"
    u"[\uFDD0-\uFDDF]|\uFFFE|\uFFFF|"
    u"\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|"
    u"\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|"
    u"\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|"
    u"\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|"
    u"\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|"
    u"\U000BFFFE|\U000BFFFF|\U000CFFFE|\U000CFFFF|"
    u"\U000DFFFE|\U000DFFFF|\U000EFFFE|\U000EFFFF|"
    u"\U000FFFFE|\U000FFFFF|\U0010FFFE|\U0010FFFF")
15+
1416try :
1517from collections import deque
1618except ImportError :
@@ -28,7 +30,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
2830"""Initialises the HTMLInputStream.
2931
3032 HTMLInputStream(source, [encoding]) -> Normalized stream from source
31- for use bythe HTML5Lib .
33+ for use by html5lib.
3234
3335 source can be either a file-object, local filename or a string.
3436
@@ -59,7 +61,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
5961self .defaultEncoding = "windows-1252"
6062
6163#Detect encoding iff no explicit "transport level" encoding is supplied
62- if self .charEncoding [0 ]is None or not isValidEncoding (self .charEncoding [0 ]):
64+ if (self .charEncoding [0 ]is None or
65+ not isValidEncoding (self .charEncoding [0 ])):
6366self .charEncoding = self .detectEncoding (parseMeta ,chardet )
6467
6568self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
@@ -87,7 +90,7 @@ def openStream(self, source):
8790# Otherwise treat source as a string and convert to a file object
8891if isinstance (source ,unicode ):
8992source = source .encode ('utf-8' )
90- self .charEncoding = "utf-8"
93+ self .charEncoding = ( "utf-8" , "certian" )
9194import cStringIO
9295stream = cStringIO .StringIO (str (source ))
9396return stream
@@ -262,6 +265,9 @@ def readChunk(self, chunkSize=10240):
262265#Replace null characters
263266for i in xrange (data .count (u"\u0000 " )):
264267self .errors .append ("null-character" )
268+ for i in xrange (len (invalid_unicode_re .findall (data ))):
269+ self .errors .append ("invalid-codepoint" )
270+
265271data = data .replace (u"\u0000 " ,u"\ufffd " )
266272#Check for CR LF broken across chunks
267273if (self ._lastChunkEndsWithCR and data [0 ]== "\n " ):
@@ -271,7 +277,7 @@ def readChunk(self, chunkSize=10240):
271277data = data .replace ("\r " ,"\n " )
272278
273279data = unicode (data )
274- self .queue .extend ([ char for char in data ] )
280+ self .queue .extend (list ( data ) )
275281
276282self .updatePosition ()
277283