Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit fc9f63b

Browse files
committed
Fix html5lib#120: introduce keyword arguments for encodings by source
1 parent 6464fc4 commit fc9f63b

File tree

7 files changed

+133
-65
lines changed

7 files changed

+133
-65
lines changed

‎CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ Released on XXX
4646

4747
* **Drop support of charade, now that chardet is supported once more.**
4848

49+
* **Replace the charset keyword argument on parse and related methods
50+
with a set of keyword arguments: override_encoding, transport_encoding,
51+
same_origin_parent_encoding, likely_encoding, and default_encoding.**
52+
4953

5054
0.9999999/1.0b8
5155
~~~~~~~~~~~~~~~

‎README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pass into html5lib as follows:
5151
import html5lib
5252
5353
with closing(urlopen("http://example.com/"))as f:
54-
document= html5lib.parse(f,encoding=f.info().getparam("charset"))
54+
document= html5lib.parse(f,transport_encoding=f.info().getparam("charset"))
5555
5656
When using with ``urllib.request`` (Python 3), the charset from HTTP
5757
should be passed into html5lib as follows:
@@ -62,7 +62,7 @@ should be pass into html5lib as follows:
6262
import html5lib
6363
6464
with urlopen("http://example.com/")as f:
65-
document= html5lib.parse(f,encoding=f.info().get_content_charset())
65+
document= html5lib.parse(f,transport_encoding=f.info().get_content_charset())
6666
6767
To have more control over the parser, create a parser object explicitly.
6868
For instance, to make the parser raise exceptions on parse errors, use:

‎html5lib/html5parser.py

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,17 @@
2828
)
2929

3030

31-
defparse(doc,treebuilder="etree",encoding=None,
32-
namespaceHTMLElements=True,scripting=False):
31+
defparse(doc,treebuilder="etree",namespaceHTMLElements=True,**kwargs):
3332
"""Parse a string or file-like object into a tree"""
3433
tb=treebuilders.getTreeBuilder(treebuilder)
3534
p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)
36-
returnp.parse(doc,encoding=encoding,scripting=scripting)
35+
returnp.parse(doc,**kwargs)
3736

3837

39-
defparseFragment(doc,container="div",treebuilder="etree",encoding=None,
40-
namespaceHTMLElements=True,scripting=False):
38+
defparseFragment(doc,container="div",treebuilder="etree",namespaceHTMLElements=True,**kwargs):
4139
tb=treebuilders.getTreeBuilder(treebuilder)
4240
p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)
43-
returnp.parseFragment(doc,container=container,encoding=encoding,scripting=scripting)
41+
returnp.parseFragment(doc,container=container,**kwargs)
4442

4543

4644
defmethod_decorator_metaclass(function):
@@ -79,15 +77,12 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa
7977
self.phases=dict([(name,cls(self,self.tree))forname,clsin
8078
getPhases(debug).items()])
8179

82-
def_parse(self,stream,innerHTML=False,container="div",encoding=None,
83-
useChardet=True,scripting=False,**kwargs):
80+
def_parse(self,stream,innerHTML=False,container="div",scripting=False,**kwargs):
8481

8582
self.innerHTMLMode=innerHTML
8683
self.container=container
8784
self.scripting=scripting
88-
self.tokenizer=tokenizer.HTMLTokenizer(stream,encoding=encoding,
89-
useChardet=useChardet,
90-
parser=self,**kwargs)
85+
self.tokenizer=tokenizer.HTMLTokenizer(stream,parser=self,**kwargs)
9186
self.reset()
9287

9388
try:
@@ -225,8 +220,7 @@ def normalizedTokens(self):
225220
fortokeninself.tokenizer:
226221
yieldself.normalizeToken(token)
227222

228-
defparse(self,stream,encoding=None,
229-
useChardet=True,scripting=False):
223+
defparse(self,stream,*args,**kwargs):
230224
"""Parse a HTML document into a well-formed tree
231225
232226
stream - a filelike object or string containing the HTML to be parsed
@@ -238,13 +232,10 @@ def parse(self, stream, encoding=None,
238232
239233
scripting - treat noscript elements as if javascript was turned on
240234
"""
241-
self._parse(stream,innerHTML=False,encoding=encoding,
242-
useChardet=useChardet,scripting=scripting)
235+
self._parse(stream,False,None,*args,**kwargs)
243236
returnself.tree.getDocument()
244237

245-
defparseFragment(self,stream,container="div",encoding=None,
246-
useChardet=True,scripting=False):
247-
# pylint:disable=unused-argument
238+
defparseFragment(self,stream,*args,**kwargs):
248239
"""Parse a HTML fragment into a well-formed tree fragment
249240
250241
container - name of the element we're setting the innerHTML property
@@ -259,8 +250,7 @@ def parseFragment(self, stream, container="div", encoding=None,
259250
260251
scripting - treat noscript elements as if javascript was turned on
261252
"""
262-
self._parse(stream,True,container=container,
263-
encoding=encoding,scripting=scripting)
253+
self._parse(stream,True,*args,**kwargs)
264254
returnself.tree.getFragment()
265255

266256
defparseError(self,errorcode="XXX-undefined-error",datavars=None):

‎html5lib/inputstream.py

Lines changed: 62 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128
returnb"".join(rv)
129129

130130

131-
defHTMLInputStream(source,encoding=None,parseMeta=True,chardet=True):
131+
defHTMLInputStream(source,**kwargs):
132132
# Work around Python bug #20007: read(0) closes the connection.
133133
# http://bugs.python.org/issue20007
134134
if (isinstance(source,http_client.HTTPResponse)or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142
isUnicode=isinstance(source,text_type)
143143

144144
ifisUnicode:
145-
ifencodingisnotNone:
146-
raiseTypeError("Cannot explicitly set an encoding with a unicode string")
145+
encodings= [xforxinkwargsifx.endswith("_encoding")]
146+
ifencodings:
147+
raiseTypeError("Cannot set an encoding with a unicode input, set %r"%encodings)
147148

148-
returnHTMLUnicodeInputStream(source)
149+
returnHTMLUnicodeInputStream(source,**kwargs)
149150
else:
150-
returnHTMLBinaryInputStream(source,encoding,parseMeta,chardet)
151+
returnHTMLBinaryInputStream(source,**kwargs)
151152

152153

153154
classHTMLUnicodeInputStream(object):
@@ -173,8 +174,6 @@ def __init__(self, source):
173174
regardless of any BOM or later declaration (such as in a meta
174175
element)
175176
176-
parseMeta - Look for a <meta> element containing encoding information
177-
178177
"""
179178

180179
ifnotutils.supports_lone_surrogates:
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390389
391390
"""
392391

393-
def__init__(self,source,encoding=None,parseMeta=True,chardet=True):
392+
def__init__(self,source,override_encoding=None,transport_encoding=None,
393+
same_origin_parent_encoding=None,likely_encoding=None,
394+
default_encoding="windows-1252",useChardet=True):
394395
"""Initialises the HTMLInputStream.
395396
396397
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403404
regardless of any BOM or later declaration (such as in a meta
404405
element)
405406
406-
parseMeta - Look for a <meta> element containing encoding information
407-
408407
"""
409408
# Raw Stream - for unicode objects this will encode to utf-8 and set
410409
# self.charEncoding as appropriate
411410
self.rawStream=self.openStream(source)
412411

413412
HTMLUnicodeInputStream.__init__(self,self.rawStream)
414413

415-
self.charEncoding= (lookupEncoding(encoding),"certain")
416-
417414
# Encoding Information
418415
# Number of bytes to use when looking for a meta element with
419416
# encoding information
420417
self.numBytesMeta=1024
421418
# Number of bytes to use when using detecting encoding using chardet
422419
self.numBytesChardet=100
423-
# Encoding to use if no other information can be found
424-
self.defaultEncoding="windows-1252"
420+
# Things from args
421+
self.override_encoding=override_encoding
422+
self.transport_encoding=transport_encoding
423+
self.same_origin_parent_encoding=same_origin_parent_encoding
424+
self.likely_encoding=likely_encoding
425+
self.default_encoding=default_encoding
425426

426-
# Detect encoding iff no explicit "transport level" encoding is supplied
427-
if (self.charEncoding[0]isNone):
428-
self.charEncoding=self.detectEncoding(parseMeta,chardet)
429-
assertself.charEncoding[0]isnotNone
427+
# Determine encoding
428+
self.charEncoding=self.determineEncoding(useChardet)
429+
assertself.charEncoding[0]isnotNone
430430

431431
# Call superclass
432432
self.reset()
@@ -454,21 +454,45 @@ def openStream(self, source):
454454

455455
returnstream
456456

457-
defdetectEncoding(self,parseMeta=True,chardet=True):
458-
#First look for a BOM
457+
defdetermineEncoding(self,chardet=True):
458+
#BOMs take precedence over everything
459459
# This will also read past the BOM if present
460-
encoding=self.detectBOM()
461-
confidence="certain"
462-
# If there is no BOM need to look for meta elements with encoding
463-
# information
464-
ifencodingisNoneandparseMeta:
465-
encoding=self.detectEncodingMeta()
466-
confidence="tentative"
460+
charEncoding=self.detectBOM(),"certain"
461+
ifcharEncoding[0]isnotNone:
462+
returncharEncoding
463+
464+
# If we've been overridden, we've been overridden
465+
charEncoding=lookupEncoding(self.override_encoding),"certain"
466+
ifcharEncoding[0]isnotNone:
467+
returncharEncoding
468+
469+
# Now check the transport layer
470+
charEncoding=lookupEncoding(self.transport_encoding),"certain"
471+
ifcharEncoding[0]isnotNone:
472+
returncharEncoding
473+
474+
# Look for meta elements with encoding information
475+
charEncoding=self.detectEncodingMeta(),"tentative"
476+
ifcharEncoding[0]isnotNone:
477+
returncharEncoding
478+
479+
# Parent document encoding
480+
charEncoding=lookupEncoding(self.same_origin_parent_encoding),"tentative"
481+
ifcharEncoding[0]isnotNoneandnotcharEncoding[0].name.startswith("utf-16"):
482+
returncharEncoding
483+
484+
# "likely" encoding
485+
charEncoding=lookupEncoding(self.likely_encoding),"tentative"
486+
ifcharEncoding[0]isnotNone:
487+
returncharEncoding
488+
467489
# Guess with chardet, if available
468-
ifencodingisNoneandchardet:
469-
confidence="tentative"
490+
ifchardet:
470491
try:
471492
fromchardet.universaldetectorimportUniversalDetector
493+
exceptImportError:
494+
pass
495+
else:
472496
buffers= []
473497
detector=UniversalDetector()
474498
whilenotdetector.done:
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481505
detector.close()
482506
encoding=lookupEncoding(detector.result['encoding'])
483507
self.rawStream.seek(0)
484-
exceptImportError:
485-
pass
486-
# If all else fails use the default encoding
487-
ifencodingisNone:
488-
confidence="tentative"
489-
encoding=lookupEncoding(self.defaultEncoding)
508+
ifencodingisnotNone:
509+
returnencoding,"tentative"
510+
511+
# Try the default encoding
512+
charEncoding=lookupEncoding(self.default_encoding),"tentative"
513+
ifcharEncoding[0]isnotNone:
514+
returncharEncoding
490515

491-
returnencoding,confidence
516+
# Fallback to html5lib's default if even that hasn't worked
517+
returnlookupEncoding("windows-1252"),"tentative"
492518

493519
defchangeEncoding(self,newEncoding):
494520
assertself.charEncoding[1]!="certain"

‎html5lib/tests/test_encoding.py

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
importos
44

5+
importpytest
6+
57
from .supportimportget_data_files,test_dir,errorMessage,TestDataas_TestData
68
fromhtml5libimportHTMLParser,inputstream
79

@@ -11,7 +13,7 @@ def test_basic_prescan_length():
1113
pad=1024-len(data)+1
1214
data=data.replace(b"-a-",b"-"+ (b"a"*pad)+b"-")
1315
assertlen(data)==1024# Sanity
14-
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
16+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)
1517
assert'utf-8'==stream.charEncoding[0].name
1618

1719

@@ -20,14 +22,59 @@ def test_parser_reparse():
2022
pad=10240-len(data)+1
2123
data=data.replace(b"-a-",b"-"+ (b"a"*pad)+b"-")
2224
assertlen(data)==10240# Sanity
23-
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
25+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)
2426
assert'windows-1252'==stream.charEncoding[0].name
2527
p=HTMLParser(namespaceHTMLElements=False)
2628
doc=p.parse(data,useChardet=False)
2729
assert'utf-8'==p.documentEncoding
2830
assertdoc.find(".//title").text=="Caf\u00E9"
2931

3032

33+
@pytest.mark.parametrize("expected,data,kwargs", [
34+
("utf-16le",b"\xFF\xFE", {"override_encoding":"iso-8859-2"}),
35+
("utf-16be",b"\xFE\xFF", {"override_encoding":"iso-8859-2"}),
36+
("utf-8",b"\xEF\xBB\xBF", {"override_encoding":"iso-8859-2"}),
37+
("iso-8859-2",b"", {"override_encoding":"iso-8859-2","transport_encoding":"iso-8859-3"}),
38+
("iso-8859-2",b"<meta charset=iso-8859-3>", {"transport_encoding":"iso-8859-2"}),
39+
("iso-8859-2",b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding":"iso-8859-3"}),
40+
("iso-8859-2",b"", {"same_origin_parent_encoding":"iso-8859-2","likely_encoding":"iso-8859-3"}),
41+
("iso-8859-2",b"", {"same_origin_parent_encoding":"utf-16","likely_encoding":"iso-8859-2"}),
42+
("iso-8859-2",b"", {"same_origin_parent_encoding":"utf-16be","likely_encoding":"iso-8859-2"}),
43+
("iso-8859-2",b"", {"same_origin_parent_encoding":"utf-16le","likely_encoding":"iso-8859-2"}),
44+
("iso-8859-2",b"", {"likely_encoding":"iso-8859-2","default_encoding":"iso-8859-3"}),
45+
("iso-8859-2",b"", {"default_encoding":"iso-8859-2"}),
46+
("windows-1252",b"", {"default_encoding":"totally-bogus-string"}),
47+
("windows-1252",b"", {}),
48+
])
49+
deftest_parser_args(expected,data,kwargs):
50+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False,**kwargs)
51+
assertexpected==stream.charEncoding[0].name
52+
p=HTMLParser()
53+
p.parse(data,useChardet=False,**kwargs)
54+
assertexpected==p.documentEncoding
55+
56+
57+
@pytest.mark.parametrize("kwargs", [
58+
{"override_encoding":"iso-8859-2"},
59+
{"override_encoding":None},
60+
{"transport_encoding":"iso-8859-2"},
61+
{"transport_encoding":None},
62+
{"same_origin_parent_encoding":"iso-8859-2"},
63+
{"same_origin_parent_encoding":None},
64+
{"likely_encoding":"iso-8859-2"},
65+
{"likely_encoding":None},
66+
{"default_encoding":"iso-8859-2"},
67+
{"default_encoding":None},
68+
{"foo_encoding":"iso-8859-2"},
69+
{"foo_encoding":None},
70+
])
71+
deftest_parser_args_raises(kwargs):
72+
withpytest.raises(TypeError)asexc_info:
73+
p=HTMLParser()
74+
p.parse("",useChardet=False,**kwargs)
75+
assertexc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
76+
77+
3178
defrunParserEncodingTest(data,encoding):
3279
p=HTMLParser()
3380
assertp.documentEncodingisNone
@@ -38,7 +85,7 @@ def runParserEncodingTest(data, encoding):
3885

3986

4087
defrunPreScanEncodingTest(data,encoding):
41-
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
88+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)
4289
encoding=encoding.lower().decode("ascii")
4390

4491
# Very crude way to ignore irrelevant tests
@@ -55,6 +102,7 @@ def test_encoding():
55102
yield (runParserEncodingTest,test[b'data'],test[b'encoding'])
56103
yield (runPreScanEncodingTest,test[b'data'],test[b'encoding'])
57104

105+
58106
# pylint:disable=wrong-import-position
59107
try:
60108
importchardet# noqa

‎html5lib/tests/test_stream.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,13 @@ class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):
9999

100100

101101
deftest_char_ascii():
102-
stream=HTMLInputStream(b"'",encoding='ascii')
102+
stream=HTMLInputStream(b"'",override_encoding='ascii')
103103
assertstream.charEncoding[0].name=='windows-1252'
104104
assertstream.char()=="'"
105105

106106

107107
deftest_char_utf8():
108-
stream=HTMLInputStream('\u2018'.encode('utf-8'),encoding='utf-8')
108+
stream=HTMLInputStream('\u2018'.encode('utf-8'),override_encoding='utf-8')
109109
assertstream.charEncoding[0].name=='utf-8'
110110
assertstream.char()=='\u2018'
111111

‎html5lib/tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ class HTMLTokenizer(object):
3131
Points to HTMLInputStream object.
3232
"""
3333

34-
def__init__(self,stream,encoding=None,useChardet=True,parser=None):
34+
def__init__(self,stream,parser=None,**kwargs):
3535

36-
self.stream=HTMLInputStream(stream,encoding,True,useChardet)
36+
self.stream=HTMLInputStream(stream,**kwargs)
3737
self.parser=parser
3838

3939
# Setup the initial tokenizer state

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp