Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit699276b

Browse files
authored
Merge pull request#257 from gsnedders/det_encoding
Update encoding detection; r=nobody!
2 parentsdce9d62 +fc9f63b commit699276b

File tree

7 files changed

+137
-83
lines changed

7 files changed

+137
-83
lines changed

‎CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ Released on XXX
4646

4747
* **Drop support of charade, now that chardet is supported once more.**
4848

49+
* **Replace the charset keyword argument on parse and related methods
50+
with a set of keyword arguments: override_encoding, transport_encoding,
51+
same_origin_parent_encoding, likely_encoding, and default_encoding.**
52+
4953

5054
0.9999999/1.0b8
5155
~~~~~~~~~~~~~~~

‎README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pass into html5lib as follows:
5151
import html5lib
5252
5353
with closing(urlopen("http://example.com/"))as f:
54-
document= html5lib.parse(f,encoding=f.info().getparam("charset"))
54+
document= html5lib.parse(f,transport_encoding=f.info().getparam("charset"))
5555
5656
When using with ``urllib.request`` (Python 3), the charset from HTTP
5757
should be passed into html5lib as follows:
@@ -62,7 +62,7 @@ should be pass into html5lib as follows:
6262
import html5lib
6363
6464
with urlopen("http://example.com/")as f:
65-
document= html5lib.parse(f,encoding=f.info().get_content_charset())
65+
document= html5lib.parse(f,transport_encoding=f.info().get_content_charset())
6666
6767
To have more control over the parser, create a parser object explicitly.
6868
For instance, to make the parser raise exceptions on parse errors, use:

‎html5lib/html5parser.py

Lines changed: 11 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,17 @@
2828
)
2929

3030

31-
defparse(doc,treebuilder="etree",encoding=None,
32-
namespaceHTMLElements=True,scripting=False):
31+
defparse(doc,treebuilder="etree",namespaceHTMLElements=True,**kwargs):
3332
"""Parse a string or file-like object into a tree"""
3433
tb=treebuilders.getTreeBuilder(treebuilder)
3534
p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)
36-
returnp.parse(doc,encoding=encoding,scripting=scripting)
35+
returnp.parse(doc,**kwargs)
3736

3837

39-
defparseFragment(doc,container="div",treebuilder="etree",encoding=None,
40-
namespaceHTMLElements=True,scripting=False):
38+
defparseFragment(doc,container="div",treebuilder="etree",namespaceHTMLElements=True,**kwargs):
4139
tb=treebuilders.getTreeBuilder(treebuilder)
4240
p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)
43-
returnp.parseFragment(doc,container=container,encoding=encoding,scripting=scripting)
41+
returnp.parseFragment(doc,container=container,**kwargs)
4442

4543

4644
defmethod_decorator_metaclass(function):
@@ -59,18 +57,13 @@ class HTMLParser(object):
5957
"""HTML parser. Generates a tree structure from a stream of (possibly
6058
malformed) HTML"""
6159

62-
def__init__(self,tree=None,tokenizer=tokenizer.HTMLTokenizer,
63-
strict=False,namespaceHTMLElements=True,debug=False):
60+
def__init__(self,tree=None,strict=False,namespaceHTMLElements=True,debug=False):
6461
"""
6562
strict - raise an exception when a parse error is encountered
6663
6764
tree - a treebuilder class controlling the type of tree that will be
6865
returned. Built in treebuilders can be accessed through
6966
html5lib.treebuilders.getTreeBuilder(treeType)
70-
71-
tokenizer - a class that provides a stream of tokens to the treebuilder.
72-
This may be replaced for e.g. a sanitizer which converts some tags to
73-
text
7467
"""
7568

7669
# Raise an exception on the first error encountered
@@ -79,22 +72,17 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
7972
iftreeisNone:
8073
tree=treebuilders.getTreeBuilder("etree")
8174
self.tree=tree(namespaceHTMLElements)
82-
self.tokenizer_class=tokenizer
8375
self.errors= []
8476

8577
self.phases=dict([(name,cls(self,self.tree))forname,clsin
8678
getPhases(debug).items()])
8779

88-
def_parse(self,stream,innerHTML=False,container="div",encoding=None,
89-
parseMeta=True,useChardet=True,scripting=False,**kwargs):
80+
def_parse(self,stream,innerHTML=False,container="div",scripting=False,**kwargs):
9081

9182
self.innerHTMLMode=innerHTML
9283
self.container=container
9384
self.scripting=scripting
94-
self.tokenizer=self.tokenizer_class(stream,encoding=encoding,
95-
parseMeta=parseMeta,
96-
useChardet=useChardet,
97-
parser=self,**kwargs)
85+
self.tokenizer=tokenizer.HTMLTokenizer(stream,parser=self,**kwargs)
9886
self.reset()
9987

10088
try:
@@ -232,8 +220,7 @@ def normalizedTokens(self):
232220
fortokeninself.tokenizer:
233221
yieldself.normalizeToken(token)
234222

235-
defparse(self,stream,encoding=None,parseMeta=True,
236-
useChardet=True,scripting=False):
223+
defparse(self,stream,*args,**kwargs):
237224
"""Parse a HTML document into a well-formed tree
238225
239226
stream - a filelike object or string containing the HTML to be parsed
@@ -245,13 +232,10 @@ def parse(self, stream, encoding=None, parseMeta=True,
245232
246233
scripting - treat noscript elements as if javascript was turned on
247234
"""
248-
self._parse(stream,innerHTML=False,encoding=encoding,
249-
parseMeta=parseMeta,useChardet=useChardet,scripting=scripting)
235+
self._parse(stream,False,None,*args,**kwargs)
250236
returnself.tree.getDocument()
251237

252-
defparseFragment(self,stream,container="div",encoding=None,
253-
parseMeta=False,useChardet=True,scripting=False):
254-
# pylint:disable=unused-argument
238+
defparseFragment(self,stream,*args,**kwargs):
255239
"""Parse a HTML fragment into a well-formed tree fragment
256240
257241
container - name of the element we're setting the innerHTML property
@@ -266,8 +250,7 @@ def parseFragment(self, stream, container="div", encoding=None,
266250
267251
scripting - treat noscript elements as if javascript was turned on
268252
"""
269-
self._parse(stream,True,container=container,
270-
encoding=encoding,scripting=scripting)
253+
self._parse(stream,True,*args,**kwargs)
271254
returnself.tree.getFragment()
272255

273256
defparseError(self,errorcode="XXX-undefined-error",datavars=None):

‎html5lib/inputstream.py

Lines changed: 62 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128
returnb"".join(rv)
129129

130130

131-
defHTMLInputStream(source,encoding=None,parseMeta=True,chardet=True):
131+
defHTMLInputStream(source,**kwargs):
132132
# Work around Python bug #20007: read(0) closes the connection.
133133
# http://bugs.python.org/issue20007
134134
if (isinstance(source,http_client.HTTPResponse)or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142
isUnicode=isinstance(source,text_type)
143143

144144
ifisUnicode:
145-
ifencodingisnotNone:
146-
raiseTypeError("Cannot explicitly set an encoding with a unicode string")
145+
encodings= [xforxinkwargsifx.endswith("_encoding")]
146+
ifencodings:
147+
raiseTypeError("Cannot set an encoding with a unicode input, set %r"%encodings)
147148

148-
returnHTMLUnicodeInputStream(source)
149+
returnHTMLUnicodeInputStream(source,**kwargs)
149150
else:
150-
returnHTMLBinaryInputStream(source,encoding,parseMeta,chardet)
151+
returnHTMLBinaryInputStream(source,**kwargs)
151152

152153

153154
classHTMLUnicodeInputStream(object):
@@ -173,8 +174,6 @@ def __init__(self, source):
173174
regardless of any BOM or later declaration (such as in a meta
174175
element)
175176
176-
parseMeta - Look for a <meta> element containing encoding information
177-
178177
"""
179178

180179
ifnotutils.supports_lone_surrogates:
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390389
391390
"""
392391

393-
def__init__(self,source,encoding=None,parseMeta=True,chardet=True):
392+
def__init__(self,source,override_encoding=None,transport_encoding=None,
393+
same_origin_parent_encoding=None,likely_encoding=None,
394+
default_encoding="windows-1252",useChardet=True):
394395
"""Initialises the HTMLInputStream.
395396
396397
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403404
regardless of any BOM or later declaration (such as in a meta
404405
element)
405406
406-
parseMeta - Look for a <meta> element containing encoding information
407-
408407
"""
409408
# Raw Stream - for unicode objects this will encode to utf-8 and set
410409
# self.charEncoding as appropriate
411410
self.rawStream=self.openStream(source)
412411

413412
HTMLUnicodeInputStream.__init__(self,self.rawStream)
414413

415-
self.charEncoding= (lookupEncoding(encoding),"certain")
416-
417414
# Encoding Information
418415
# Number of bytes to use when looking for a meta element with
419416
# encoding information
420417
self.numBytesMeta=1024
421418
# Number of bytes to use when detecting encoding using chardet
422419
self.numBytesChardet=100
423-
# Encoding to use if no other information can be found
424-
self.defaultEncoding="windows-1252"
420+
# Things from args
421+
self.override_encoding=override_encoding
422+
self.transport_encoding=transport_encoding
423+
self.same_origin_parent_encoding=same_origin_parent_encoding
424+
self.likely_encoding=likely_encoding
425+
self.default_encoding=default_encoding
425426

426-
# Detect encoding iff no explicit "transport level" encoding is supplied
427-
if (self.charEncoding[0]isNone):
428-
self.charEncoding=self.detectEncoding(parseMeta,chardet)
429-
assertself.charEncoding[0]isnotNone
427+
# Determine encoding
428+
self.charEncoding=self.determineEncoding(useChardet)
429+
assertself.charEncoding[0]isnotNone
430430

431431
# Call superclass
432432
self.reset()
@@ -454,21 +454,45 @@ def openStream(self, source):
454454

455455
returnstream
456456

457-
defdetectEncoding(self,parseMeta=True,chardet=True):
458-
#First look for a BOM
457+
defdetermineEncoding(self,chardet=True):
458+
#BOMs take precedence over everything
459459
# This will also read past the BOM if present
460-
encoding=self.detectBOM()
461-
confidence="certain"
462-
# If there is no BOM need to look for meta elements with encoding
463-
# information
464-
ifencodingisNoneandparseMeta:
465-
encoding=self.detectEncodingMeta()
466-
confidence="tentative"
460+
charEncoding=self.detectBOM(),"certain"
461+
ifcharEncoding[0]isnotNone:
462+
returncharEncoding
463+
464+
# If we've been overridden, we've been overridden
465+
charEncoding=lookupEncoding(self.override_encoding),"certain"
466+
ifcharEncoding[0]isnotNone:
467+
returncharEncoding
468+
469+
# Now check the transport layer
470+
charEncoding=lookupEncoding(self.transport_encoding),"certain"
471+
ifcharEncoding[0]isnotNone:
472+
returncharEncoding
473+
474+
# Look for meta elements with encoding information
475+
charEncoding=self.detectEncodingMeta(),"tentative"
476+
ifcharEncoding[0]isnotNone:
477+
returncharEncoding
478+
479+
# Parent document encoding
480+
charEncoding=lookupEncoding(self.same_origin_parent_encoding),"tentative"
481+
ifcharEncoding[0]isnotNoneandnotcharEncoding[0].name.startswith("utf-16"):
482+
returncharEncoding
483+
484+
# "likely" encoding
485+
charEncoding=lookupEncoding(self.likely_encoding),"tentative"
486+
ifcharEncoding[0]isnotNone:
487+
returncharEncoding
488+
467489
# Guess with chardet, if available
468-
ifencodingisNoneandchardet:
469-
confidence="tentative"
490+
ifchardet:
470491
try:
471492
fromchardet.universaldetectorimportUniversalDetector
493+
exceptImportError:
494+
pass
495+
else:
472496
buffers= []
473497
detector=UniversalDetector()
474498
whilenotdetector.done:
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481505
detector.close()
482506
encoding=lookupEncoding(detector.result['encoding'])
483507
self.rawStream.seek(0)
484-
exceptImportError:
485-
pass
486-
# If all else fails use the default encoding
487-
ifencodingisNone:
488-
confidence="tentative"
489-
encoding=lookupEncoding(self.defaultEncoding)
508+
ifencodingisnotNone:
509+
returnencoding,"tentative"
510+
511+
# Try the default encoding
512+
charEncoding=lookupEncoding(self.default_encoding),"tentative"
513+
ifcharEncoding[0]isnotNone:
514+
returncharEncoding
490515

491-
returnencoding,confidence
516+
# Fallback to html5lib's default if even that hasn't worked
517+
returnlookupEncoding("windows-1252"),"tentative"
492518

493519
defchangeEncoding(self,newEncoding):
494520
assertself.charEncoding[1]!="certain"

‎html5lib/tests/test_encoding.py

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
importos
44

5+
importpytest
6+
57
from .supportimportget_data_files,test_dir,errorMessage,TestDataas_TestData
68
fromhtml5libimportHTMLParser,inputstream
79

@@ -11,7 +13,7 @@ def test_basic_prescan_length():
1113
pad=1024-len(data)+1
1214
data=data.replace(b"-a-",b"-"+ (b"a"*pad)+b"-")
1315
assertlen(data)==1024# Sanity
14-
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
16+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)
1517
assert'utf-8'==stream.charEncoding[0].name
1618

1719

@@ -20,14 +22,59 @@ def test_parser_reparse():
2022
pad=10240-len(data)+1
2123
data=data.replace(b"-a-",b"-"+ (b"a"*pad)+b"-")
2224
assertlen(data)==10240# Sanity
23-
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
25+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)
2426
assert'windows-1252'==stream.charEncoding[0].name
2527
p=HTMLParser(namespaceHTMLElements=False)
2628
doc=p.parse(data,useChardet=False)
2729
assert'utf-8'==p.documentEncoding
2830
assertdoc.find(".//title").text=="Caf\u00E9"
2931

3032

33+
@pytest.mark.parametrize("expected,data,kwargs", [
34+
("utf-16le",b"\xFF\xFE", {"override_encoding":"iso-8859-2"}),
35+
("utf-16be",b"\xFE\xFF", {"override_encoding":"iso-8859-2"}),
36+
("utf-8",b"\xEF\xBB\xBF", {"override_encoding":"iso-8859-2"}),
37+
("iso-8859-2",b"", {"override_encoding":"iso-8859-2","transport_encoding":"iso-8859-3"}),
38+
("iso-8859-2",b"<meta charset=iso-8859-3>", {"transport_encoding":"iso-8859-2"}),
39+
("iso-8859-2",b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding":"iso-8859-3"}),
40+
("iso-8859-2",b"", {"same_origin_parent_encoding":"iso-8859-2","likely_encoding":"iso-8859-3"}),
41+
("iso-8859-2",b"", {"same_origin_parent_encoding":"utf-16","likely_encoding":"iso-8859-2"}),
42+
("iso-8859-2",b"", {"same_origin_parent_encoding":"utf-16be","likely_encoding":"iso-8859-2"}),
43+
("iso-8859-2",b"", {"same_origin_parent_encoding":"utf-16le","likely_encoding":"iso-8859-2"}),
44+
("iso-8859-2",b"", {"likely_encoding":"iso-8859-2","default_encoding":"iso-8859-3"}),
45+
("iso-8859-2",b"", {"default_encoding":"iso-8859-2"}),
46+
("windows-1252",b"", {"default_encoding":"totally-bogus-string"}),
47+
("windows-1252",b"", {}),
48+
])
49+
deftest_parser_args(expected,data,kwargs):
50+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False,**kwargs)
51+
assertexpected==stream.charEncoding[0].name
52+
p=HTMLParser()
53+
p.parse(data,useChardet=False,**kwargs)
54+
assertexpected==p.documentEncoding
55+
56+
57+
@pytest.mark.parametrize("kwargs", [
58+
{"override_encoding":"iso-8859-2"},
59+
{"override_encoding":None},
60+
{"transport_encoding":"iso-8859-2"},
61+
{"transport_encoding":None},
62+
{"same_origin_parent_encoding":"iso-8859-2"},
63+
{"same_origin_parent_encoding":None},
64+
{"likely_encoding":"iso-8859-2"},
65+
{"likely_encoding":None},
66+
{"default_encoding":"iso-8859-2"},
67+
{"default_encoding":None},
68+
{"foo_encoding":"iso-8859-2"},
69+
{"foo_encoding":None},
70+
])
71+
deftest_parser_args_raises(kwargs):
72+
withpytest.raises(TypeError)asexc_info:
73+
p=HTMLParser()
74+
p.parse("",useChardet=False,**kwargs)
75+
assertexc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
76+
77+
3178
defrunParserEncodingTest(data,encoding):
3279
p=HTMLParser()
3380
assertp.documentEncodingisNone
@@ -38,7 +85,7 @@ def runParserEncodingTest(data, encoding):
3885

3986

4087
defrunPreScanEncodingTest(data,encoding):
41-
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
88+
stream=inputstream.HTMLBinaryInputStream(data,useChardet=False)
4289
encoding=encoding.lower().decode("ascii")
4390

4491
# Very crude way to ignore irrelevant tests
@@ -55,6 +102,7 @@ def test_encoding():
55102
yield (runParserEncodingTest,test[b'data'],test[b'encoding'])
56103
yield (runPreScanEncodingTest,test[b'data'],test[b'encoding'])
57104

105+
58106
# pylint:disable=wrong-import-position
59107
try:
60108
importchardet# noqa

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp