Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d05f439

Browse files
committed
Several changes related to character encoding; convert utf-16 to utf-8 if found in pre-parse algorithm, allow chardet to be switched off, start implementing reparsing if <meta> found during actual parse (not yet complete)
--HG--extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401056
1 parent 84313a8 commit d05f439

File tree

6 files changed

+36
-24
lines changed

6 files changed

+36
-24
lines changed

‎src/html5lib/html5parser.py‎

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,15 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
7878
}
7979

8080
def_parse(self,stream,innerHTML=False,container="div",
81-
encoding=None,**kwargs):
81+
encoding=None,parseMeta=True,useChardet=True,**kwargs):
8282

8383
self.tree.reset()
8484
self.firstStartTag=False
8585
self.errors= []
8686

8787
self.tokenizer=self.tokenizer_class(stream,encoding=encoding,
88-
parseMeta=notinnerHTML,**kwargs)
88+
parseMeta=parseMeta,
89+
useChardet=useChardet,**kwargs)
8990

9091
ifinnerHTML:
9192
self.innerHTML=container.lower()
@@ -131,7 +132,7 @@ def _parse(self, stream, innerHTML=False, container="div",
131132
# When the loop finishes it's EOF
132133
self.phase.processEOF()
133134

134-
defparse(self,stream,encoding=None):
135+
defparse(self,stream,encoding=None,parseMeta=True,useChardet=True):
135136
"""Parse a HTML document into a well-formed tree
136137
137138
stream - a filelike object or string containing the HTML to be parsed
@@ -144,7 +145,8 @@ def parse(self, stream, encoding=None):
144145
self._parse(stream,innerHTML=False,encoding=encoding)
145146
returnself.tree.getDocument()
146147

147-
defparseFragment(self,stream,container="div",encoding=None):
148+
defparseFragment(self,stream,container="div",encoding=None,
149+
parseMeta=False,useChardet=True):
148150
"""Parse a HTML fragment into a well-formed tree fragment
149151
150152
container - name of the element we're setting the innerHTML property

‎src/html5lib/inputstream.py‎

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
3838
# List of where new lines occur
3939
self.newLines= [0]
4040

41-
self.charEncoding=encoding
41+
self.charEncoding=(encoding,"certian")
4242

4343
# Raw Stream - for unicode objects this will encode to utf-8 and set
4444
# self.charEncoding as appropriate
@@ -54,11 +54,11 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
5454
self.defaultEncoding="windows-1252"
5555

5656
#Detect encoding iff no explicit "transport level" encoding is supplied
57-
ifself.charEncodingisNoneornotisValidEncoding(self.charEncoding):
57+
ifself.charEncoding[0]isNoneornotisValidEncoding(self.charEncoding[0]):
5858
self.charEncoding=self.detectEncoding(parseMeta,chardet)
5959

60-
self.dataStream=codecs.getreader(self.charEncoding)(self.rawStream,
61-
'replace')
60+
self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,
61+
'replace')
6262

6363
self.queue=deque([])
6464
self.readChars= []
@@ -92,12 +92,15 @@ def detectEncoding(self, parseMeta=True, chardet=True):
9292
#First look for a BOM
9393
#This will also read past the BOM if present
9494
encoding=self.detectBOM()
95+
confidence="certain"
9596
#If there is no BOM need to look for meta elements with encoding
9697
#information
9798
ifencodingisNoneandparseMeta:
9899
encoding=self.detectEncodingMeta()
100+
confidence="tentative"
99101
#Guess with chardet, if available
100102
ifencodingisNoneandchardet:
103+
confidence="tentative"
101104
try:
102105
fromchardet.universaldetectorimportUniversalDetector
103106
buffers= []
@@ -115,6 +118,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
115118
pass
116119
# If all else fails use the default encoding
117120
ifencodingisNone:
121+
confidence="tentative"
118122
encoding=self.defaultEncoding
119123

120124
#Substitute for equivalent encodings:
@@ -123,7 +127,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
123127
ifencoding.lower()inencodingSub:
124128
encoding=encodingSub[encoding.lower()]
125129

126-
returnencoding
130+
returnencoding,confidence
127131

128132
defdetectBOM(self):
129133
"""Attempts to detect at BOM at the start of the stream. If
@@ -200,7 +204,8 @@ def detectEncodingMeta(self):
200204
buffer=self.rawStream.read(self.numBytesMeta)
201205
parser=EncodingParser(buffer)
202206
self.seek(buffer,0)
203-
returnparser.getEncoding()
207+
encoding=parser.getEncoding()
208+
returnencoding
204209

205210
defupdatePosition(self):
206211
#Remove EOF from readChars, if present
@@ -414,7 +419,12 @@ def getEncoding(self):
414419
ifnotkeepParsing:
415420
break
416421
ifself.encodingisnotNone:
417-
self.encoding=self.encoding.strip()
422+
self.encoding=self.encoding.strip()
423+
#Spec violation that complies with hsivonen + mjs
424+
ifself.encoding.upper()in ("UTF-16","UTF-16BE","UTF-16LE",
425+
"UTF-32","UTF-32BE","UTF-32LE"):
426+
self.encoding="utf-8"
427+
418428
returnself.encoding
419429

420430
defhandleComment(self):
@@ -531,7 +541,7 @@ def getAttribute(self):
531541
#11.5
532542
else:
533543
attrValue.extend(self.data.currentByte)
534-
elifself.data.currentBytein (">",'<'):
544+
elifself.data.currentBytein (">","<"):
535545
return"".join(attrName),""
536546
elifself.data.currentByteinasciiUppercase:
537547
attrValue.extend(self.data.currentByte.lower())
@@ -540,7 +550,7 @@ def getAttribute(self):
540550
whileTrue:
541551
self.data.position+=1
542552
ifself.data.currentBytein (
543-
list(spaceCharacters)+ [">",'<']):
553+
list(spaceCharacters)+ [">","<"]):
544554
return"".join(attrName),"".join(attrValue)
545555
elifself.data.currentByteinasciiUppercase:
546556
attrValue.extend(self.data.currentByte.lower())

‎src/html5lib/sanitizer.py‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,11 +188,11 @@ def sanitize_css(self, style):
188188
return' '.join(clean)
189189

190190
classHTMLSanitizer(HTMLTokenizer,HTMLSanitizerMixin):
191-
def__init__(self,stream,encoding=None,parseMeta=True,
191+
def__init__(self,stream,encoding=None,parseMeta=True,useChardet=True,
192192
lowercaseElementName=False,lowercaseAttrName=False):
193193
#Change case matching defaults as we only output lowercase html anyway
194194
#This solution doesn't seem ideal...
195-
HTMLTokenizer.__init__(self,stream,encoding,parseMeta,
195+
HTMLTokenizer.__init__(self,stream,encoding,parseMeta,useChardet,
196196
lowercaseElementName,lowercaseAttrName)
197197

198198
def__iter__(self):

‎src/html5lib/tokenizer.py‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ class HTMLTokenizer(object):
3030

3131
# XXX need to fix documentation
3232

33-
def__init__(self,stream,encoding=None,parseMeta=True,
33+
def__init__(self,stream,encoding=None,parseMeta=True,useChardet=True,
3434
lowercaseElementName=True,lowercaseAttrName=True,):
35-
self.stream=HTMLInputStream(stream,encoding,parseMeta)
35+
self.stream=HTMLInputStream(stream,encoding,parseMeta,useChardet)
3636

3737
#Perform case conversions?
3838
self.lowercaseElementName=lowercaseElementName

‎tests/test_encoding.py‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def buildTestSuite():
1616
foridx,testinenumerate(tests):
1717
defencodingTest(self,data=test['data'],encoding=test['encoding']):
1818
stream=inputstream.HTMLInputStream(data,chardet=False)
19-
self.assertEquals(encoding.lower(),stream.charEncoding)
19+
self.assertEquals(encoding.lower(),stream.charEncoding[0])
2020
setattr(Html5EncodingTestCase,'test_%s_%d'% (test_name,idx+1),
2121
encodingTest)
2222

@@ -25,7 +25,7 @@ def encodingTest(self, data=test['data'], encoding=test['encoding']):
2525
deftest_chardet(self):
2626
data=open(os.path.join(test_dir,"encoding" ,"chardet","test_big5.txt")).read()
2727
encoding=inputstream.HTMLInputStream(data).charEncoding
28-
assertencoding.lower()=="big5"
28+
assertencoding[0].lower()=="big5"
2929
setattr(Html5EncodingTestCase,'test_chardet',test_chardet)
3030
exceptImportError:
3131
print"chardet not found, skipping chardet tests"

‎tests/test_stream.py‎

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ class HTMLInputStreamTest(unittest.TestCase):
77

88
deftest_char_ascii(self):
99
stream=HTMLInputStream("'",encoding='ascii')
10-
self.assertEquals(stream.charEncoding,'ascii')
10+
self.assertEquals(stream.charEncoding[0],'ascii')
1111
self.assertEquals(stream.char(),"'")
1212

1313
deftest_char_null(self):
@@ -16,24 +16,24 @@ def test_char_null(self):
1616

1717
deftest_char_utf8(self):
1818
stream=HTMLInputStream(u'\u2018'.encode('utf-8'),encoding='utf-8')
19-
self.assertEquals(stream.charEncoding,'utf-8')
19+
self.assertEquals(stream.charEncoding[0],'utf-8')
2020
self.assertEquals(stream.char(),u'\u2018')
2121

2222
deftest_char_win1252(self):
2323
stream=HTMLInputStream(u"\xa9\xf1\u2019".encode('windows-1252'))
24-
self.assertEquals(stream.charEncoding,'windows-1252')
24+
self.assertEquals(stream.charEncoding[0],'windows-1252')
2525
self.assertEquals(stream.char(),u"\xa9")
2626
self.assertEquals(stream.char(),u"\xf1")
2727
self.assertEquals(stream.char(),u"\u2019")
2828

2929
deftest_bom(self):
3030
stream=HTMLInputStream(codecs.BOM_UTF8+"'")
31-
self.assertEquals(stream.charEncoding,'utf-8')
31+
self.assertEquals(stream.charEncoding[0],'utf-8')
3232
self.assertEquals(stream.char(),"'")
3333

3434
deftest_utf_16(self):
3535
stream=HTMLInputStream((' '*1025).encode('utf-16'))
36-
self.assert_(stream.charEncodingin ['utf-16-le','utf-16-be'])
36+
self.assert_(stream.charEncoding[0]in ['utf-16-le','utf-16-be'],stream.charEncoding)
3737
self.assertEquals(len(stream.charsUntil(' ',True)),1025)
3838

3939
deftest_newlines(self):

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp