Commit dfa444f

committed

Fix #120: introduce keyword arguments for encodings by source

1 parent 244a6eb, commit dfa444f (Copy full SHA for dfa444f)

File tree

7 files changed

+79

-58

lines changed

7 files changed

+79

-58

lines changed

`‎CHANGES.rst`

Lines changed: 4 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,10 @@ Released on XXX`
`46`	`46`
`47`	`47`	`* Drop support of charade, now that chardet is supported once more.`
`48`	`48`
	`49`	`+* **Replace the charset keyword argument on parse and related methods`
	`50`	`+ with a set of keyword arguments: override_encoding, transport_encoding,`
	`51`	`+ same_origin_parent_encoding, likely_encoding, and default_encoding.**`
	`52`	`+`
`49`	`53`
`50`	`54`	`0.9999999/1.0b8`
`51`	`55`	`~~~~~~~~~~~~~~~`

`‎README.rst`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -51,7 +51,7 @@ pass into html5lib as follows:`
`51`	`51`	`import html5lib`
`52`	`52`
`53`	`53`	`with closing(urlopen("http://example.com/"))as f:`
`54`		`- document= html5lib.parse(f,encoding=f.info().getparam("charset"))`
	`54`	`+ document= html5lib.parse(f,transport_encoding=f.info().getparam("charset"))`
`55`	`55`
`56`	`56`	When using with ``urllib.request`` (Python 3), the charset from HTTP
`57`	`57`	`should be passed into html5lib as follows:`
`@@ -62,7 +62,7 @@ should be passed into html5lib as follows:`
`62`	`62`	`import html5lib`
`63`	`63`
`64`	`64`	`with urlopen("http://example.com/")as f:`
`65`		`- document= html5lib.parse(f,encoding=f.info().get_content_charset())`
	`65`	`+ document= html5lib.parse(f,transport_encoding=f.info().get_content_charset())`
`66`	`66`
`67`	`67`	`To have more control over the parser, create a parser object explicitly.`
`68`	`68`	`For instance, to make the parser raise exceptions on parse errors, use:`

`‎html5lib/html5parser.py`

Lines changed: 6 additions & 14 deletions

Original file line number	Diff line number	Diff line change
`@@ -79,15 +79,12 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa`
`79`	`79`	`self.phases=dict([(name,cls(self,self.tree))forname,clsin`
`80`	`80`	`getPhases(debug).items()])`
`81`	`81`
`82`		`-def_parse(self,stream,innerHTML=False,container="div",encoding=None,`
`83`		`-useChardet=True,scripting=False,**kwargs):`
	`82`	`+def_parse(self,stream,innerHTML=False,container="div",scripting=False,**kwargs):`
`84`	`83`
`85`	`84`	`self.innerHTMLMode=innerHTML`
`86`	`85`	`self.container=container`
`87`	`86`	`self.scripting=scripting`
`88`		`-self.tokenizer=tokenizer.HTMLTokenizer(stream,encoding=encoding,`
`89`		`-useChardet=useChardet,`
`90`		`-parser=self,**kwargs)`
	`87`	`+self.tokenizer=tokenizer.HTMLTokenizer(stream,parser=self,**kwargs)`
`91`	`88`	`self.reset()`
`92`	`89`
`93`	`90`	`try:`
`@@ -222,8 +219,7 @@ def normalizedTokens(self):`
`222`	`219`	`fortokeninself.tokenizer:`
`223`	`220`	`yieldself.normalizeToken(token)`
`224`	`221`
`225`		`-defparse(self,stream,encoding=None,`
`226`		`-useChardet=True,scripting=False):`
	`222`	`+def parse(self, stream, *args, **kwargs):`
`227`	`223`	`"""Parse a HTML document into a well-formed tree`
`228`	`224`
`229`	`225`	`stream - a filelike object or string containing the HTML to be parsed`
`@@ -235,13 +231,10 @@ def parse(self, stream, encoding=None,`
`235`	`231`
`236`	`232`	`scripting - treat noscript elements as if javascript was turned on`
`237`	`233`	`"""`
`238`		`-self._parse(stream,innerHTML=False,encoding=encoding,`
`239`		`-useChardet=useChardet,scripting=scripting)`
	`234`	`+self._parse(stream, False, None, *args, **kwargs)`
`240`	`235`	`returnself.tree.getDocument()`
`241`	`236`
`242`		`-defparseFragment(self,stream,container="div",encoding=None,`
`243`		`-useChardet=True,scripting=False):`
`244`		`-# pylint:disable=unused-argument`
	`237`	`+def parseFragment(self, stream, *args, **kwargs):`
`245`	`238`	`"""Parse a HTML fragment into a well-formed tree fragment`
`246`	`239`
`247`	`240`	`container - name of the element we're setting the innerHTML property`
`@@ -256,8 +249,7 @@ def parseFragment(self, stream, container="div", encoding=None,`
`256`	`249`
`257`	`250`	`scripting - treat noscript elements as if javascript was turned on`
`258`	`251`	`"""`
`259`		`-self._parse(stream,True,container=container,`
`260`		`-encoding=encoding,scripting=scripting)`
	`252`	`+self._parse(stream, True, *args, **kwargs)`
`261`	`253`	`returnself.tree.getFragment()`
`262`	`254`
`263`	`255`	`defparseError(self,errorcode="XXX-undefined-error",datavars=None):`

`‎html5lib/inputstream.py`

Lines changed: 60 additions & 35 deletions

Original file line number	Diff line number	Diff line change
`@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):`
`128`	`128`	`returnb"".join(rv)`
`129`	`129`
`130`	`130`
`131`		`-defHTMLInputStream(source,encoding=None,parseMeta=True,chardet=True):`
	`131`	`+defHTMLInputStream(source,override_encoding=None,**kwargs):`
`132`	`132`	`# Work around Python bug #20007: read(0) closes the connection.`
`133`	`133`	`# http://bugs.python.org/issue20007`
`134`	`134`	`if (isinstance(source,http_client.HTTPResponse)or`
`@@ -142,12 +142,12 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):`
`142`	`142`	`isUnicode=isinstance(source,text_type)`
`143`	`143`
`144`	`144`	`ifisUnicode:`
`145`		`-ifencodingisnotNone:`
`146`		`-raise TypeError("Cannot explicitly set an encoding with a unicode string")`
	`145`	`+ifoverride_encodingisnotNone:`
	`146`	`+raise TypeError("Cannot set an override encoding with a unicode input")`
`147`	`147`
`148`	`148`	`returnHTMLUnicodeInputStream(source)`
`149`	`149`	`else:`
`150`		`-returnHTMLBinaryInputStream(source,encoding,parseMeta,chardet)`
	`150`	`+returnHTMLBinaryInputStream(source,override_encoding=override_encoding,**kwargs)`
`151`	`151`
`152`	`152`
`153`	`153`	`classHTMLUnicodeInputStream(object):`
`@@ -173,8 +173,6 @@ def __init__(self, source):`
`173`	`173`	`regardless of any BOM or later declaration (such as in a meta`
`174`	`174`	`element)`
`175`	`175`
`176`		`- parseMeta - Look for a <meta> element containing encoding information`
`177`		`-`
`178`	`176`	`"""`
`179`	`177`
`180`	`178`	`ifnotutils.supports_lone_surrogates:`
`@@ -390,7 +388,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):`
`390`	`388`
`391`	`389`	`"""`
`392`	`390`
`393`		`-def__init__(self,source,encoding=None,parseMeta=True,chardet=True):`
	`391`	`+def__init__(self,source,override_encoding=None,transport_encoding=None,`
	`392`	`+same_origin_parent_encoding=None,likely_encoding=None,`
	`393`	`+default_encoding="windows-1252",useChardet=True):`
`394`	`394`	`"""Initialises the HTMLInputStream.`
`395`	`395`
`396`	`396`	`HTMLInputStream(source, [encoding]) -> Normalized stream from source`
`@@ -403,30 +403,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`403`	`403`	`regardless of any BOM or later declaration (such as in a meta`
`404`	`404`	`element)`
`405`	`405`
`406`		`- parseMeta - Look for a <meta> element containing encoding information`
`407`		`-`
`408`	`406`	`"""`
`409`	`407`	`# Raw Stream - for unicode objects this will encode to utf-8 and set`
`410`	`408`	`# self.charEncoding as appropriate`
`411`	`409`	`self.rawStream=self.openStream(source)`
`412`	`410`
`413`	`411`	`HTMLUnicodeInputStream.__init__(self,self.rawStream)`
`414`	`412`
`415`		`-self.charEncoding= (lookupEncoding(encoding),"certain")`
`416`		`-`
`417`	`413`	`# Encoding Information`
`418`	`414`	`# Number of bytes to use when looking for a meta element with`
`419`	`415`	`# encoding information`
`420`	`416`	`self.numBytesMeta=1024`
`421`	`417`	`# Number of bytes to use when detecting the encoding using chardet`
`422`	`418`	`self.numBytesChardet=100`
`423`		`-# Encoding to use if no other information can be found`
`424`		`-self.defaultEncoding="windows-1252"`
	`419`	`+# Things from args`
	`420`	`+self.override_encoding=override_encoding`
	`421`	`+self.transport_encoding=transport_encoding`
	`422`	`+self.same_origin_parent_encoding=same_origin_parent_encoding`
	`423`	`+self.likely_encoding=likely_encoding`
	`424`	`+self.default_encoding=default_encoding`
`425`	`425`
`426`		`-# Detect encoding iff no explicit "transport level" encoding is supplied`
`427`		`-if (self.charEncoding[0]isNone):`
`428`		`-self.charEncoding=self.detectEncoding(parseMeta,chardet)`
`429`		`-assertself.charEncoding[0]isnotNone`
	`426`	`+# Determine encoding`
	`427`	`+self.charEncoding=self.determineEncoding(useChardet)`
	`428`	`+assertself.charEncoding[0]isnotNone`
`430`	`429`
`431`	`430`	`# Call superclass`
`432`	`431`	`self.reset()`
`@@ -454,21 +453,45 @@ def openStream(self, source):`
`454`	`453`
`455`	`454`	`returnstream`
`456`	`455`
`457`		`-defdetectEncoding(self,parseMeta=True,chardet=True):`
`458`		`-#First look for a BOM`
	`456`	`+defdetermineEncoding(self,chardet=True):`
	`457`	`+#BOMs take precedence over everything`
`459`	`458`	`# This will also read past the BOM if present`
`460`		`-encoding=self.detectBOM()`
`461`		`-confidence="certain"`
`462`		`-# If there is no BOM need to look for meta elements with encoding`
`463`		`-# information`
`464`		`-ifencodingisNoneandparseMeta:`
`465`		`-encoding=self.detectEncodingMeta()`
`466`		`-confidence="tentative"`
	`459`	`+charEncoding=self.detectBOM(),"certain"`
	`460`	`+ifcharEncoding[0]isnotNone:`
	`461`	`+returncharEncoding`
	`462`	`+`
	`463`	`+# If we've been overridden, we've been overridden`
	`464`	`+charEncoding=lookupEncoding(self.override_encoding),"certain"`
	`465`	`+ifcharEncoding[0]isnotNone:`
	`466`	`+returncharEncoding`
	`467`	`+`
	`468`	`+# Now check the transport layer`
	`469`	`+charEncoding=lookupEncoding(self.transport_encoding),"certain"`
	`470`	`+ifcharEncoding[0]isnotNone:`
	`471`	`+returncharEncoding`
	`472`	`+`
	`473`	`+# Look for meta elements with encoding information`
	`474`	`+charEncoding=self.detectEncodingMeta(),"tentative"`
	`475`	`+ifcharEncoding[0]isnotNone:`
	`476`	`+returncharEncoding`
	`477`	`+`
	`478`	`+# Parent document encoding`
	`479`	`+charEncoding=lookupEncoding(self.same_origin_parent_encoding),"tentative"`
	`480`	`+ifcharEncoding[0]isnotNoneandnotcharEncoding[0].name.startswith("utf-16"):`
	`481`	`+returncharEncoding`
	`482`	`+`
	`483`	`+# "likely" encoding`
	`484`	`+charEncoding=lookupEncoding(self.likely_encoding),"tentative"`
	`485`	`+ifcharEncoding[0]isnotNone:`
	`486`	`+returncharEncoding`
	`487`	`+`
`467`	`488`	`# Guess with chardet, if available`
`468`		`-ifencodingisNoneandchardet:`
`469`		`-confidence="tentative"`
	`489`	`+ifchardet:`
`470`	`490`	`try:`
`471`	`491`	`fromchardet.universaldetectorimportUniversalDetector`
	`492`	`+exceptImportError:`
	`493`	`+pass`
	`494`	`+else:`
`472`	`495`	`buffers= []`
`473`	`496`	`detector=UniversalDetector()`
`474`	`497`	`whilenotdetector.done:`
`@@ -481,14 +504,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):`
`481`	`504`	`detector.close()`
`482`	`505`	`encoding=lookupEncoding(detector.result['encoding'])`
`483`	`506`	`self.rawStream.seek(0)`
`484`		`-exceptImportError:`
`485`		`-pass`
`486`		`-# If all else fails use the default encoding`
`487`		`-ifencodingisNone:`
`488`		`-confidence="tentative"`
`489`		`-encoding=lookupEncoding(self.defaultEncoding)`
	`507`	`+ifencodingisnotNone:`
	`508`	`+returnencoding,"tentative"`
	`509`	`+`
	`510`	`+# Try the default encoding`
	`511`	`+charEncoding=lookupEncoding(self.default_encoding),"tentative"`
	`512`	`+ifcharEncoding[0]isnotNone:`
	`513`	`+returncharEncoding`
`490`	`514`
`491`		`-returnencoding,confidence`
	`515`	`+# Fallback to html5lib's default if even that hasn't worked`
	`516`	`+returnlookupEncoding("windows-1252"),"tentative"`
`492`	`517`
`493`	`518`	`defchangeEncoding(self,newEncoding):`
`494`	`519`	`assertself.charEncoding[1]!="certain"`

`‎html5lib/tests/test_encoding.py`

Lines changed: 3 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@ def test_basic_prescan_length():`
`11`	`11`	`pad=1024-len(data)+1`
`12`	`12`	`data=data.replace(b"-a-",b"-"+ (b"a"*pad)+b"-")`
`13`	`13`	`assertlen(data)==1024# Sanity`
`14`		`-stream=inputstream.HTMLBinaryInputStream(data,chardet=False)`
	`14`	`+stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)`
`15`	`15`	`assert'utf-8'==stream.charEncoding[0].name`
`16`	`16`
`17`	`17`
`@@ -20,7 +20,7 @@ def test_parser_reparse():`
`20`	`20`	`pad=10240-len(data)+1`
`21`	`21`	`data=data.replace(b"-a-",b"-"+ (b"a"*pad)+b"-")`
`22`	`22`	`assertlen(data)==10240# Sanity`
`23`		`-stream=inputstream.HTMLBinaryInputStream(data,chardet=False)`
	`23`	`+stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)`
`24`	`24`	`assert'windows-1252'==stream.charEncoding[0].name`
`25`	`25`	`p=HTMLParser(namespaceHTMLElements=False)`
`26`	`26`	`doc=p.parse(data,useChardet=False)`
`@@ -38,7 +38,7 @@ def runParserEncodingTest(data, encoding):`
`38`	`38`
`39`	`39`
`40`	`40`	`defrunPreScanEncodingTest(data,encoding):`
`41`		`-stream=inputstream.HTMLBinaryInputStream(data,chardet=False)`
	`41`	`+stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)`
`42`	`42`	`encoding=encoding.lower().decode("ascii")`
`43`	`43`
`44`	`44`	`# Very crude way to ignore irrelevant tests`

`‎html5lib/tests/test_stream.py`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -99,13 +99,13 @@ class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):`
`99`	`99`
`100`	`100`
`101`	`101`	`deftest_char_ascii():`
`102`		`-stream=HTMLInputStream(b"'",encoding='ascii')`
	`102`	`+stream=HTMLInputStream(b"'",override_encoding='ascii')`
`103`	`103`	`assertstream.charEncoding[0].name=='windows-1252'`
`104`	`104`	`assertstream.char()=="'"`
`105`	`105`
`106`	`106`
`107`	`107`	`deftest_char_utf8():`
`108`		`-stream=HTMLInputStream('\u2018'.encode('utf-8'),encoding='utf-8')`
	`108`	`+stream=HTMLInputStream('\u2018'.encode('utf-8'),override_encoding='utf-8')`
`109`	`109`	`assertstream.charEncoding[0].name=='utf-8'`
`110`	`110`	`assertstream.char()=='\u2018'`
`111`	`111`

`‎html5lib/tokenizer.py`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -31,9 +31,9 @@ class HTMLTokenizer(object):`
`31`	`31`	`Points to HTMLInputStream object.`
`32`	`32`	`"""`
`33`	`33`
`34`		`-def__init__(self,stream,encoding=None,useChardet=True,parser=None):`
	`34`	`+def__init__(self,stream,parser=None,**kwargs):`
`35`	`35`
`36`		`-self.stream=HTMLInputStream(stream,encoding,True,useChardet)`
	`36`	`+self.stream=HTMLInputStream(stream,**kwargs)`
`37`	`37`	`self.parser=parser`
`38`	`38`
`39`	`39`	`# Setup the initial tokenizer state`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit dfa444f

File tree

7 files changed

7 files changed

`‎CHANGES.rst`

`‎README.rst`

`‎html5lib/html5parser.py`

`‎html5lib/inputstream.py`

`‎html5lib/tests/test_encoding.py`

`‎html5lib/tests/test_stream.py`

`‎html5lib/tokenizer.py`

0 commit comments