Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitdfa444f

Browse files
committed
Fix #120: introduce keyword arguments for encodings by source
1 parent 244a6eb commit dfa444f

File tree

7 files changed

+79
-58
lines changed

7 files changed

+79
-58
lines changed

‎CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ Released on XXX
4646

4747
* **Drop support of charade, now that chardet is supported once more.**
4848

49+
* **Replace the charset keyword argument on parse and related methods
50+
with a set of keyword arguments: override_encoding, transport_encoding,
51+
same_origin_parent_encoding, likely_encoding, and default_encoding.**
52+
4953

5054
0.9999999/1.0b8
5155
~~~~~~~~~~~~~~~

‎README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pass into html5lib as follows:
5151
import html5lib
5252
5353
with closing(urlopen("http://example.com/"))as f:
54-
document= html5lib.parse(f,encoding=f.info().getparam("charset"))
54+
document= html5lib.parse(f,transport_encoding=f.info().getparam("charset"))
5555
5656
When using with ``urllib.request`` (Python 3), the charset from HTTP
5757
should be passed into html5lib as follows:
@@ -62,7 +62,7 @@ should be passed into html5lib as follows:
6262
import html5lib
6363
6464
with urlopen("http://example.com/")as f:
65-
document= html5lib.parse(f,encoding=f.info().get_content_charset())
65+
document= html5lib.parse(f,transport_encoding=f.info().get_content_charset())
6666
6767
To have more control over the parser, create a parser object explicitly.
6868
For instance, to make the parser raise exceptions on parse errors, use:

‎html5lib/html5parser.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,12 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa
7979
self.phases=dict([(name,cls(self,self.tree))forname,clsin
8080
getPhases(debug).items()])
8181

82-
def_parse(self,stream,innerHTML=False,container="div",encoding=None,
83-
useChardet=True,scripting=False,**kwargs):
82+
def_parse(self,stream,innerHTML=False,container="div",scripting=False,**kwargs):
8483

8584
self.innerHTMLMode=innerHTML
8685
self.container=container
8786
self.scripting=scripting
88-
self.tokenizer=tokenizer.HTMLTokenizer(stream,encoding=encoding,
89-
useChardet=useChardet,
90-
parser=self,**kwargs)
87+
self.tokenizer=tokenizer.HTMLTokenizer(stream,parser=self,**kwargs)
9188
self.reset()
9289

9390
try:
@@ -222,8 +219,7 @@ def normalizedTokens(self):
222219
fortokeninself.tokenizer:
223220
yieldself.normalizeToken(token)
224221

225-
defparse(self,stream,encoding=None,
226-
useChardet=True,scripting=False):
222+
defparse(self,stream,*args,**kwargs):
227223
"""Parse a HTML document into a well-formed tree
228224
229225
stream - a filelike object or string containing the HTML to be parsed
@@ -235,13 +231,10 @@ def parse(self, stream, encoding=None,
235231
236232
scripting - treat noscript elements as if javascript was turned on
237233
"""
238-
self._parse(stream,innerHTML=False,encoding=encoding,
239-
useChardet=useChardet,scripting=scripting)
234+
self._parse(stream,False,None,*args,**kwargs)
240235
returnself.tree.getDocument()
241236

242-
defparseFragment(self,stream,container="div",encoding=None,
243-
useChardet=True,scripting=False):
244-
# pylint:disable=unused-argument
237+
defparseFragment(self,stream,*args,**kwargs):
245238
"""Parse a HTML fragment into a well-formed tree fragment
246239
247240
container - name of the element we're setting the innerHTML property
@@ -256,8 +249,7 @@ def parseFragment(self, stream, container="div", encoding=None,
256249
257250
scripting - treat noscript elements as if javascript was turned on
258251
"""
259-
self._parse(stream,True,container=container,
260-
encoding=encoding,scripting=scripting)
252+
self._parse(stream,True,*args,**kwargs)
261253
returnself.tree.getFragment()
262254

263255
defparseError(self,errorcode="XXX-undefined-error",datavars=None):

‎html5lib/inputstream.py

Lines changed: 60 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128
returnb"".join(rv)
129129

130130

131-
defHTMLInputStream(source,encoding=None,parseMeta=True,chardet=True):
131+
defHTMLInputStream(source,override_encoding=None,**kwargs):
132132
# Work around Python bug #20007: read(0) closes the connection.
133133
# http://bugs.python.org/issue20007
134134
if (isinstance(source,http_client.HTTPResponse)or
@@ -142,12 +142,12 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142
isUnicode=isinstance(source,text_type)
143143

144144
ifisUnicode:
145-
ifencodingisnotNone:
146-
raise TypeError("Cannot explicitly set an encoding with a unicode string")
145+
ifoverride_encodingisnotNone:
146+
raise TypeError("Cannot set an override encoding with a unicode input")
147147

148148
returnHTMLUnicodeInputStream(source)
149149
else:
150-
returnHTMLBinaryInputStream(source,encoding,parseMeta,chardet)
150+
returnHTMLBinaryInputStream(source,override_encoding=override_encoding,**kwargs)
151151

152152

153153
classHTMLUnicodeInputStream(object):
@@ -173,8 +173,6 @@ def __init__(self, source):
173173
regardless of any BOM or later declaration (such as in a meta
174174
element)
175175
176-
parseMeta - Look for a <meta> element containing encoding information
177-
178176
"""
179177

180178
ifnotutils.supports_lone_surrogates:
@@ -390,7 +388,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390388
391389
"""
392390

393-
def__init__(self,source,encoding=None,parseMeta=True,chardet=True):
391+
def__init__(self,source,override_encoding=None,transport_encoding=None,
392+
same_origin_parent_encoding=None,likely_encoding=None,
393+
default_encoding="windows-1252",useChardet=True):
394394
"""Initialises the HTMLInputStream.
395395
396396
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +403,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403403
regardless of any BOM or later declaration (such as in a meta
404404
element)
405405
406-
parseMeta - Look for a <meta> element containing encoding information
407-
408406
"""
409407
# Raw Stream - for unicode objects this will encode to utf-8 and set
410408
# self.charEncoding as appropriate
411409
self.rawStream=self.openStream(source)
412410

413411
HTMLUnicodeInputStream.__init__(self,self.rawStream)
414412

415-
self.charEncoding= (lookupEncoding(encoding),"certain")
416-
417413
# Encoding Information
418414
# Number of bytes to use when looking for a meta element with
419415
# encoding information
420416
self.numBytesMeta=1024
421417
# Number of bytes to use when detecting the encoding using chardet
422418
self.numBytesChardet=100
423-
# Encoding to use if no other information can be found
424-
self.defaultEncoding="windows-1252"
419+
# Things from args
420+
self.override_encoding=override_encoding
421+
self.transport_encoding=transport_encoding
422+
self.same_origin_parent_encoding=same_origin_parent_encoding
423+
self.likely_encoding=likely_encoding
424+
self.default_encoding=default_encoding
425425

426-
# Detect encoding iff no explicit "transport level" encoding is supplied
427-
if (self.charEncoding[0]isNone):
428-
self.charEncoding=self.detectEncoding(parseMeta,chardet)
429-
assertself.charEncoding[0]isnotNone
426+
# Determine encoding
427+
self.charEncoding=self.determineEncoding(useChardet)
428+
assertself.charEncoding[0]isnotNone
430429

431430
# Call superclass
432431
self.reset()
@@ -454,21 +453,45 @@ def openStream(self, source):
454453

455454
returnstream
456455

457-
defdetectEncoding(self,parseMeta=True,chardet=True):
458-
#First look for a BOM
456+
defdetermineEncoding(self,chardet=True):
457+
#BOMs take precedence over everything
459458
# This will also read past the BOM if present
460-
encoding=self.detectBOM()
461-
confidence="certain"
462-
# If there is no BOM need to look for meta elements with encoding
463-
# information
464-
ifencodingisNoneandparseMeta:
465-
encoding=self.detectEncodingMeta()
466-
confidence="tentative"
459+
charEncoding=self.detectBOM(),"certain"
460+
ifcharEncoding[0]isnotNone:
461+
returncharEncoding
462+
463+
# If we've been overridden, we've been overridden
464+
charEncoding=lookupEncoding(self.override_encoding),"certain"
465+
ifcharEncoding[0]isnotNone:
466+
returncharEncoding
467+
468+
# Now check the transport layer
469+
charEncoding=lookupEncoding(self.transport_encoding),"certain"
470+
ifcharEncoding[0]isnotNone:
471+
returncharEncoding
472+
473+
# Look for meta elements with encoding information
474+
charEncoding=self.detectEncodingMeta(),"tentative"
475+
ifcharEncoding[0]isnotNone:
476+
returncharEncoding
477+
478+
# Parent document encoding
479+
charEncoding=lookupEncoding(self.same_origin_parent_encoding),"tentative"
480+
ifcharEncoding[0]isnotNoneandnotcharEncoding[0].name.startswith("utf-16"):
481+
returncharEncoding
482+
483+
# "likely" encoding
484+
charEncoding=lookupEncoding(self.likely_encoding),"tentative"
485+
ifcharEncoding[0]isnotNone:
486+
returncharEncoding
487+
467488
# Guess with chardet, if available
468-
ifencodingisNoneandchardet:
469-
confidence="tentative"
489+
ifchardet:
470490
try:
471491
fromchardet.universaldetectorimportUniversalDetector
492+
exceptImportError:
493+
pass
494+
else:
472495
buffers= []
473496
detector=UniversalDetector()
474497
whilenotdetector.done:
@@ -481,14 +504,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481504
detector.close()
482505
encoding=lookupEncoding(detector.result['encoding'])
483506
self.rawStream.seek(0)
484-
exceptImportError:
485-
pass
486-
# If all else fails use the default encoding
487-
ifencodingisNone:
488-
confidence="tentative"
489-
encoding=lookupEncoding(self.defaultEncoding)
507+
ifencodingisnotNone:
508+
returnencoding,"tentative"
509+
510+
# Try the default encoding
511+
charEncoding=lookupEncoding(self.default_encoding),"tentative"
512+
ifcharEncoding[0]isnotNone:
513+
returncharEncoding
490514

491-
returnencoding,confidence
515+
# Fallback to html5lib's default if even that hasn't worked
516+
returnlookupEncoding("windows-1252"),"tentative"
492517

493518
defchangeEncoding(self,newEncoding):
494519
assertself.charEncoding[1]!="certain"

‎html5lib/tests/test_encoding.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def test_basic_prescan_length():
1111
pad=1024-len(data)+1
1212
data=data.replace(b"-a-",b"-"+ (b"a"*pad)+b"-")
1313
assertlen(data)==1024# Sanity
14-
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
14+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)
1515
assert'utf-8'==stream.charEncoding[0].name
1616

1717

@@ -20,7 +20,7 @@ def test_parser_reparse():
2020
pad=10240-len(data)+1
2121
data=data.replace(b"-a-",b"-"+ (b"a"*pad)+b"-")
2222
assertlen(data)==10240# Sanity
23-
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
23+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)
2424
assert'windows-1252'==stream.charEncoding[0].name
2525
p=HTMLParser(namespaceHTMLElements=False)
2626
doc=p.parse(data,useChardet=False)
@@ -38,7 +38,7 @@ def runParserEncodingTest(data, encoding):
3838

3939

4040
defrunPreScanEncodingTest(data,encoding):
41-
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
41+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)
4242
encoding=encoding.lower().decode("ascii")
4343

4444
# Very crude way to ignore irrelevant tests

‎html5lib/tests/test_stream.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,13 @@ class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):
9999

100100

101101
deftest_char_ascii():
102-
stream=HTMLInputStream(b"'",encoding='ascii')
102+
stream=HTMLInputStream(b"'",override_encoding='ascii')
103103
assertstream.charEncoding[0].name=='windows-1252'
104104
assertstream.char()=="'"
105105

106106

107107
deftest_char_utf8():
108-
stream=HTMLInputStream('\u2018'.encode('utf-8'),encoding='utf-8')
108+
stream=HTMLInputStream('\u2018'.encode('utf-8'),override_encoding='utf-8')
109109
assertstream.charEncoding[0].name=='utf-8'
110110
assertstream.char()=='\u2018'
111111

‎html5lib/tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ class HTMLTokenizer(object):
3131
Points to HTMLInputStream object.
3232
"""
3333

34-
def__init__(self,stream,encoding=None,useChardet=True,parser=None):
34+
def__init__(self,stream,parser=None,**kwargs):
3535

36-
self.stream=HTMLInputStream(stream,encoding,True,useChardet)
36+
self.stream=HTMLInputStream(stream,**kwargs)
3737
self.parser=parser
3838

3939
# Setup the initial tokenizer state

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp