Commit faa4953

committed

Update Python 3, now passes identical number of tests as Python 2.

1 parent d81e892 · commit faa4953 · Copy full SHA for faa4953

File tree

2 files changed

+204

-144

lines changed

html5lib
- inputstream.py
- tests
  - test_tokenizer.py

2 files changed

+204

-144

lines changed

`‎html5lib/inputstream.py‎`

Lines changed: 204 additions & 141 deletions

Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,19 @@`
`7`	`7`	`from .constantsimportencodings,ReparseException`
`8`	`8`	`from .importutils`
`9`	`9`
	`10`	`+fromioimportStringIO`
	`11`	`+`
	`12`	`+try:`
	`13`	`+fromioimportBytesIO`
	`14`	`+exceptImportError:`
	`15`	`+BytesIO=StringIO`
	`16`	`+`
	`17`	`+try:`
	`18`	`+fromioimportBufferedIOBase`
	`19`	`+exceptImportError:`
	`20`	`+classBufferedIOBase(object):`
	`21`	`+pass`
	`22`	`+`
`10`	`23`	`#Non-unicode versions of constants for use in the pre-parser`
`11`	`24`	`spaceCharactersBytes=frozenset([str(item)foriteminspaceCharacters])`
`12`	`25`	`asciiLettersBytes=frozenset([str(item)foriteminasciiLetters])`
`@@ -101,10 +114,21 @@ def _readFromBuffer(self, bytes):`
`101`	`114`	`rv.append(self._readStream(remainingBytes))`
`102`	`115`
`103`	`116`	`return"".join(rv)`
`104`		`-`
`105`	`117`
`106`	`118`
`107`		`-classHTMLInputStream:`
	`119`	`+defHTMLInputStream(source,encoding=None,parseMeta=True,chardet=True):`
	`120`	`+ifhasattr(source,"read"):`
	`121`	`+isUnicode=isinstance(source.read(0),str)`
	`122`	`+else:`
	`123`	`+isUnicode=isinstance(source,str)`
	`124`	`+`
	`125`	`+ifisUnicode:`
	`126`	`+returnHTMLUnicodeInputStream(source)`
	`127`	`+else:`
	`128`	`+returnHTMLBinaryInputStream(source,encoding,parseMeta,chardet)`
	`129`	`+`
	`130`	`+`
	`131`	`+classHTMLUnicodeInputStream:`
`108`	`132`	`"""Provides a unicode stream of characters to the HTMLTokenizer.`
`109`	`133`
`110`	`134`	`This class takes care of character encoding and removing or replacing`
`@@ -114,7 +138,7 @@ class HTMLInputStream:`
`114`	`138`
`115`	`139`	`_defaultChunkSize=10240`
`116`	`140`
`117`		`-def__init__(self,source,encoding=None,parseMeta=True,chardet=True):`
	`141`	`+def__init__(self,source):`
`118`	`142`	`"""Initialises the HTMLInputStream.`
`119`	`143`
`120`	`144`	`HTMLInputStream(source, [encoding]) -> Normalized stream from source`
`@@ -142,32 +166,12 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`142`	`166`	`# List of where new lines occur`
`143`	`167`	`self.newLines= [0]`
`144`	`168`
`145`		`-self.charEncoding= (codecName(encoding),"certain")`
`146`		`-`
`147`		`-# Raw Stream - for unicode objects this will encode to utf-8 and set`
`148`		`-# self.charEncoding as appropriate`
`149`		`-self.rawStream=self.openStream(source)`
`150`		`-`
`151`		`-# Encoding Information`
`152`		`-#Number of bytes to use when looking for a meta element with`
`153`		`-#encoding information`
`154`		`-self.numBytesMeta=512`
`155`		`-#Number of bytes to use when using detecting encoding using chardet`
`156`		`-self.numBytesChardet=100`
`157`		`-#Encoding to use if no other information can be found`
`158`		`-self.defaultEncoding="windows-1252"`
`159`		`-`
`160`		`-#Detect encoding iff no explicit "transport level" encoding is supplied`
`161`		`-if (self.charEncoding[0]isNone):`
`162`		`-self.charEncoding=self.detectEncoding(parseMeta,chardet)`
`163`		`-`
	`169`	`+self.charEncoding= ("utf-8","certain")`
	`170`	`+self.dataStream=self.openStream(source)`
`164`	`171`
`165`	`172`	`self.reset()`
`166`	`173`
`167`	`174`	`defreset(self):`
`168`		`-self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,`
`169`		`-'replace')`
`170`		`-`
`171`	`175`	`self.chunk=""`
`172`	`176`	`self.chunkSize=0`
`173`	`177`	`self.chunkOffset=0`
`@@ -191,128 +195,16 @@ def openStream(self, source):`
`191`	`195`	`ifhasattr(source,'read'):`
`192`	`196`	`stream=source`
`193`	`197`	`else:`
`194`		`-# Otherwise treat source as a string and convert to a file object`
`195`		`-ifisinstance(source,str):`
`196`		`-# XXX: we should handle lone surrogates here`
`197`		`-source=source.encode('utf-8',errors="replace")`
`198`		`-self.charEncoding= ("utf-8","certain")`
`199`		`-try:`
`200`		`-fromioimportBytesIO`
`201`		`-except:`
`202`		`-try:`
`203`		`-# 2to3 converts this line to: from io import StringIO`
`204`		`-fromioimportStringIOasBytesIO`
`205`		`-except:`
`206`		`-fromioimportStringIOasBytesIO`
`207`		`-stream=BytesIO(source)`
	`198`	`+stream=StringIO(source)`
`208`	`199`
`209`		`-if (not(hasattr(stream,"tell")andhasattr(stream,"seek"))or`
	`200`	`+if (#not isinstance(stream, BufferedIOBase) and`
	`201`	`+not(hasattr(stream,"tell")and`
	`202`	`+hasattr(stream,"seek"))or`
`210`	`203`	`streamissys.stdin):`
`211`	`204`	`stream=BufferedStream(stream)`
`212`	`205`
`213`	`206`	`returnstream`
`214`	`207`
`215`		`-defdetectEncoding(self,parseMeta=True,chardet=True):`
`216`		`-#First look for a BOM`
`217`		`-#This will also read past the BOM if present`
`218`		`-encoding=self.detectBOM()`
`219`		`-confidence="certain"`
`220`		`-#If there is no BOM need to look for meta elements with encoding`
`221`		`-#information`
`222`		`-ifencodingisNoneandparseMeta:`
`223`		`-encoding=self.detectEncodingMeta()`
`224`		`-confidence="tentative"`
`225`		`-#Guess with chardet, if avaliable`
`226`		`-ifencodingisNoneandchardet:`
`227`		`-confidence="tentative"`
`228`		`-try:`
`229`		`-fromchardet.universaldetectorimportUniversalDetector`
`230`		`-buffers= []`
`231`		`-detector=UniversalDetector()`
`232`		`-whilenotdetector.done:`
`233`		`-buffer=self.rawStream.read(self.numBytesChardet)`
`234`		`-assertisinstance(buffer,bytes)`
`235`		`-ifnotbuffer:`
`236`		`-break`
`237`		`-buffers.append(buffer)`
`238`		`-detector.feed(buffer)`
`239`		`-detector.close()`
`240`		`-encoding=detector.result['encoding']`
`241`		`-self.rawStream.seek(0)`
`242`		`-exceptImportError:`
`243`		`-pass`
`244`		`-# If all else fails use the default encoding`
`245`		`-ifencodingisNone:`
`246`		`-confidence="tentative"`
`247`		`-encoding=self.defaultEncoding`
`248`		`-`
`249`		`-#Substitute for equivalent encodings:`
`250`		`-encodingSub= {"iso-8859-1":"windows-1252"}`
`251`		`-`
`252`		`-ifencoding.lower()inencodingSub:`
`253`		`-encoding=encodingSub[encoding.lower()]`
`254`		`-`
`255`		`-returnencoding,confidence`
`256`		`-`
`257`		`-defchangeEncoding(self,newEncoding):`
`258`		`-newEncoding=codecName(newEncoding)`
`259`		`-ifnewEncodingin ("utf-16","utf-16-be","utf-16-le"):`
`260`		`-newEncoding="utf-8"`
`261`		`-ifnewEncodingisNone:`
`262`		`-return`
`263`		`-elifnewEncoding==self.charEncoding[0]:`
`264`		`-self.charEncoding= (self.charEncoding[0],"certain")`
`265`		`-else:`
`266`		`-self.rawStream.seek(0)`
`267`		`-self.reset()`
`268`		`-self.charEncoding= (newEncoding,"certain")`
`269`		`-raiseReparseException("Encoding changed from %s to %s"%(self.charEncoding[0],newEncoding))`
`270`		`-`
`271`		`-defdetectBOM(self):`
`272`		`-"""Attempts to detect at BOM at the start of the stream. If`
`273`		`- an encoding can be determined from the BOM return the name of the`
`274`		`- encoding otherwise return None"""`
`275`		`-bomDict= {`
`276`		`-codecs.BOM_UTF8:'utf-8',`
`277`		`-codecs.BOM_UTF16_LE:'utf-16-le',codecs.BOM_UTF16_BE:'utf-16-be',`
`278`		`-codecs.BOM_UTF32_LE:'utf-32-le',codecs.BOM_UTF32_BE:'utf-32-be'`
`279`		`- }`
`280`		`-`
`281`		`-# Go to beginning of file and read in 4 bytes`
`282`		`-string=self.rawStream.read(4)`
`283`		`-assertisinstance(string,bytes)`
`284`		`-`
`285`		`-# Try detecting the BOM using bytes from the string`
`286`		`-encoding=bomDict.get(string[:3])# UTF-8`
`287`		`-seek=3`
`288`		`-ifnotencoding:`
`289`		`-# Need to detect UTF-32 before UTF-16`
`290`		`-encoding=bomDict.get(string)# UTF-32`
`291`		`-seek=4`
`292`		`-ifnotencoding:`
`293`		`-encoding=bomDict.get(string[:2])# UTF-16`
`294`		`-seek=2`
`295`		`-`
`296`		`-# Set the read position past the BOM if one was found, otherwise`
`297`		`-# set it to the start of the stream`
`298`		`-self.rawStream.seek(encodingandseekor0)`
`299`		`-`
`300`		`-returnencoding`
`301`		`-`
`302`		`-defdetectEncodingMeta(self):`
`303`		`-"""Report the encoding declared by the meta element`
`304`		`- """`
`305`		`-buffer=self.rawStream.read(self.numBytesMeta)`
`306`		`-assertisinstance(buffer,bytes)`
`307`		`-parser=EncodingParser(buffer)`
`308`		`-self.rawStream.seek(0)`
`309`		`-encoding=parser.getEncoding()`
`310`		`-`
`311`		`-ifencodingin ("utf-16","utf-16-be","utf-16-le"):`
`312`		`-encoding="utf-8"`
`313`		`-`
`314`		`-returnencoding`
`315`		`-`
`316`	`208`	`def_position(self,offset):`
`317`	`209`	`chunk=self.chunk`
`318`	`210`	`nLines=chunk.count('\n',0,offset)`
`@@ -475,6 +367,177 @@ def unget(self, char):`
`475`	`367`	`self.chunkOffset-=1`
`476`	`368`	`assertself.chunk[self.chunkOffset]==char`
`477`	`369`
	`370`	`+classHTMLBinaryInputStream(HTMLUnicodeInputStream):`
	`371`	`+"""Provides a unicode stream of characters to the HTMLTokenizer.`
	`372`	`+`
	`373`	`+ This class takes care of character encoding and removing or replacing`
	`374`	`+ incorrect byte-sequences and also provides column and line tracking.`
	`375`	`+`
	`376`	`+ """`
	`377`	`+`
	`378`	`+def__init__(self,source,encoding=None,parseMeta=True,chardet=True):`
	`379`	`+"""Initialises the HTMLInputStream.`
	`380`	`+`
	`381`	`+ HTMLInputStream(source, [encoding]) -> Normalized stream from source`
	`382`	`+ for use by html5lib.`
	`383`	`+`
	`384`	`+ source can be either a file-object, local filename or a string.`
	`385`	`+`
	`386`	`+ The optional encoding parameter must be a string that indicates`
	`387`	`+ the encoding. If specified, that encoding will be used,`
	`388`	`+ regardless of any BOM or later declaration (such as in a meta`
	`389`	`+ element)`
	`390`	`+`
	`391`	`+ parseMeta - Look for a <meta> element containing encoding information`
	`392`	`+`
	`393`	`+ """`
	`394`	`+self.charEncoding= (codecName(encoding),"certain")`
	`395`	`+`
	`396`	`+# Raw Stream - for unicode objects this will encode to utf-8 and set`
	`397`	`+# self.charEncoding as appropriate`
	`398`	`+self.rawStream=self.openStream(source)`
	`399`	`+`
	`400`	`+# Encoding Information`
	`401`	`+#Number of bytes to use when looking for a meta element with`
	`402`	`+#encoding information`
	`403`	`+self.numBytesMeta=512`
	`404`	`+#Number of bytes to use when using detecting encoding using chardet`
	`405`	`+self.numBytesChardet=100`
	`406`	`+#Encoding to use if no other information can be found`
	`407`	`+self.defaultEncoding="windows-1252"`
	`408`	`+`
	`409`	`+#Detect encoding iff no explicit "transport level" encoding is supplied`
	`410`	`+if (self.charEncoding[0]isNone):`
	`411`	`+self.charEncoding=self.detectEncoding(parseMeta,chardet)`
	`412`	`+`
	`413`	`+#Call superclass`
	`414`	`+HTMLUnicodeInputStream.__init__(self,self.rawStream)`
	`415`	`+`
	`416`	`+defreset(self):`
	`417`	`+self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,`
	`418`	`+'replace')`
	`419`	`+HTMLUnicodeInputStream.reset(self)`
	`420`	`+`
	`421`	`+defopenStream(self,source):`
	`422`	`+"""Produces a file object from source.`
	`423`	`+`
	`424`	`+ source can be either a file object, local filename or a string.`
	`425`	`+`
	`426`	`+ """`
	`427`	`+# Already a file object`
	`428`	`+ifhasattr(source,'read'):`
	`429`	`+stream=source`
	`430`	`+else:`
	`431`	`+stream=BytesIO(source)`
	`432`	`+`
	`433`	`+if (not(hasattr(stream,"tell")andhasattr(stream,"seek"))or`
	`434`	`+streamissys.stdin):`
	`435`	`+stream=BufferedStream(stream)`
	`436`	`+`
	`437`	`+returnstream`
	`438`	`+`
	`439`	`+defdetectEncoding(self,parseMeta=True,chardet=True):`
	`440`	`+#First look for a BOM`
	`441`	`+#This will also read past the BOM if present`
	`442`	`+encoding=self.detectBOM()`
	`443`	`+confidence="certain"`
	`444`	`+#If there is no BOM need to look for meta elements with encoding`
	`445`	`+#information`
	`446`	`+ifencodingisNoneandparseMeta:`
	`447`	`+encoding=self.detectEncodingMeta()`
	`448`	`+confidence="tentative"`
	`449`	`+#Guess with chardet, if avaliable`
	`450`	`+ifencodingisNoneandchardet:`
	`451`	`+confidence="tentative"`
	`452`	`+try:`
	`453`	`+fromchardet.universaldetectorimportUniversalDetector`
	`454`	`+buffers= []`
	`455`	`+detector=UniversalDetector()`
	`456`	`+whilenotdetector.done:`
	`457`	`+buffer=self.rawStream.read(self.numBytesChardet)`
	`458`	`+assertisinstance(buffer,bytes)`
	`459`	`+ifnotbuffer:`
	`460`	`+break`
	`461`	`+buffers.append(buffer)`
	`462`	`+detector.feed(buffer)`
	`463`	`+detector.close()`
	`464`	`+encoding=detector.result['encoding']`
	`465`	`+self.rawStream.seek(0)`
	`466`	`+exceptImportError:`
	`467`	`+pass`
	`468`	`+# If all else fails use the default encoding`
	`469`	`+ifencodingisNone:`
	`470`	`+confidence="tentative"`
	`471`	`+encoding=self.defaultEncoding`
	`472`	`+`
	`473`	`+#Substitute for equivalent encodings:`
	`474`	`+encodingSub= {"iso-8859-1":"windows-1252"}`
	`475`	`+`
	`476`	`+ifencoding.lower()inencodingSub:`
	`477`	`+encoding=encodingSub[encoding.lower()]`
	`478`	`+`
	`479`	`+returnencoding,confidence`
	`480`	`+`
	`481`	`+defchangeEncoding(self,newEncoding):`
	`482`	`+assertself.charEncoding[1]!="certain"`
	`483`	`+newEncoding=codecName(newEncoding)`
	`484`	`+ifnewEncodingin ("utf-16","utf-16-be","utf-16-le"):`
	`485`	`+newEncoding="utf-8"`
	`486`	`+ifnewEncodingisNone:`
	`487`	`+return`
	`488`	`+elifnewEncoding==self.charEncoding[0]:`
	`489`	`+self.charEncoding= (self.charEncoding[0],"certain")`
	`490`	`+else:`
	`491`	`+self.rawStream.seek(0)`
	`492`	`+self.reset()`
	`493`	`+self.charEncoding= (newEncoding,"certain")`
	`494`	`+raiseReparseException("Encoding changed from %s to %s"%(self.charEncoding[0],newEncoding))`
	`495`	`+`
	`496`	`+defdetectBOM(self):`
	`497`	`+"""Attempts to detect at BOM at the start of the stream. If`
	`498`	`+ an encoding can be determined from the BOM return the name of the`
	`499`	`+ encoding otherwise return None"""`
	`500`	`+bomDict= {`
	`501`	`+codecs.BOM_UTF8:'utf-8',`
	`502`	`+codecs.BOM_UTF16_LE:'utf-16-le',codecs.BOM_UTF16_BE:'utf-16-be',`
	`503`	`+codecs.BOM_UTF32_LE:'utf-32-le',codecs.BOM_UTF32_BE:'utf-32-be'`
	`504`	`+ }`
	`505`	`+`
	`506`	`+# Go to beginning of file and read in 4 bytes`
	`507`	`+string=self.rawStream.read(4)`
	`508`	`+assertisinstance(string,bytes)`
	`509`	`+`
	`510`	`+# Try detecting the BOM using bytes from the string`
	`511`	`+encoding=bomDict.get(string[:3])# UTF-8`
	`512`	`+seek=3`
	`513`	`+ifnotencoding:`
	`514`	`+# Need to detect UTF-32 before UTF-16`
	`515`	`+encoding=bomDict.get(string)# UTF-32`
	`516`	`+seek=4`
	`517`	`+ifnotencoding:`
	`518`	`+encoding=bomDict.get(string[:2])# UTF-16`
	`519`	`+seek=2`
	`520`	`+`
	`521`	`+# Set the read position past the BOM if one was found, otherwise`
	`522`	`+# set it to the start of the stream`
	`523`	`+self.rawStream.seek(encodingandseekor0)`
	`524`	`+`
	`525`	`+returnencoding`
	`526`	`+`
	`527`	`+defdetectEncodingMeta(self):`
	`528`	`+"""Report the encoding declared by the meta element`
	`529`	`+ """`
	`530`	`+buffer=self.rawStream.read(self.numBytesMeta)`
	`531`	`+assertisinstance(buffer,bytes)`
	`532`	`+parser=EncodingParser(buffer)`
	`533`	`+self.rawStream.seek(0)`
	`534`	`+encoding=parser.getEncoding()`
	`535`	`+`
	`536`	`+ifencodingin ("utf-16","utf-16-be","utf-16-le"):`
	`537`	`+encoding="utf-8"`
	`538`	`+`
	`539`	`+returnencoding`
	`540`	`+`
`478`	`541`	`classEncodingBytes(str):`
`479`	`542`	`"""String-like object with an associated position and various extra methods`
`480`	`543`	`If the position is ever greater than the string length then an exception is`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit faa4953

File tree

2 files changed

2 files changed

`‎html5lib/inputstream.py‎`

0 commit comments