Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitfaa4953

Browse files
committed
Update Python 3, now passes identical number of tests as Python 2.
1 parentd81e892 commitfaa4953

File tree

2 files changed

+204
-144
lines changed

2 files changed

+204
-144
lines changed

‎html5lib/inputstream.py‎

Lines changed: 204 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@
77
from .constantsimportencodings,ReparseException
88
from .importutils
99

10+
fromioimportStringIO
11+
12+
try:
13+
fromioimportBytesIO
14+
exceptImportError:
15+
BytesIO=StringIO
16+
17+
try:
18+
fromioimportBufferedIOBase
19+
exceptImportError:
20+
classBufferedIOBase(object):
21+
pass
22+
1023
#Non-unicode versions of constants for use in the pre-parser
1124
spaceCharactersBytes=frozenset([str(item)foriteminspaceCharacters])
1225
asciiLettersBytes=frozenset([str(item)foriteminasciiLetters])
@@ -101,10 +114,21 @@ def _readFromBuffer(self, bytes):
101114
rv.append(self._readStream(remainingBytes))
102115

103116
return"".join(rv)
104-
105117

106118

107-
classHTMLInputStream:
119+
defHTMLInputStream(source,encoding=None,parseMeta=True,chardet=True):
120+
ifhasattr(source,"read"):
121+
isUnicode=isinstance(source.read(0),str)
122+
else:
123+
isUnicode=isinstance(source,str)
124+
125+
ifisUnicode:
126+
returnHTMLUnicodeInputStream(source)
127+
else:
128+
returnHTMLBinaryInputStream(source,encoding,parseMeta,chardet)
129+
130+
131+
classHTMLUnicodeInputStream:
108132
"""Provides a unicode stream of characters to the HTMLTokenizer.
109133
110134
This class takes care of character encoding and removing or replacing
@@ -114,7 +138,7 @@ class HTMLInputStream:
114138

115139
_defaultChunkSize=10240
116140

117-
def__init__(self,source,encoding=None,parseMeta=True,chardet=True):
141+
def__init__(self,source):
118142
"""Initialises the HTMLInputStream.
119143
120144
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -142,32 +166,12 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
142166
# List of where new lines occur
143167
self.newLines= [0]
144168

145-
self.charEncoding= (codecName(encoding),"certain")
146-
147-
# Raw Stream - for unicode objects this will encode to utf-8 and set
148-
# self.charEncoding as appropriate
149-
self.rawStream=self.openStream(source)
150-
151-
# Encoding Information
152-
#Number of bytes to use when looking for a meta element with
153-
#encoding information
154-
self.numBytesMeta=512
155-
#Number of bytes to use when using detecting encoding using chardet
156-
self.numBytesChardet=100
157-
#Encoding to use if no other information can be found
158-
self.defaultEncoding="windows-1252"
159-
160-
#Detect encoding iff no explicit "transport level" encoding is supplied
161-
if (self.charEncoding[0]isNone):
162-
self.charEncoding=self.detectEncoding(parseMeta,chardet)
163-
169+
self.charEncoding= ("utf-8","certain")
170+
self.dataStream=self.openStream(source)
164171

165172
self.reset()
166173

167174
defreset(self):
168-
self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,
169-
'replace')
170-
171175
self.chunk=""
172176
self.chunkSize=0
173177
self.chunkOffset=0
@@ -191,128 +195,16 @@ def openStream(self, source):
191195
ifhasattr(source,'read'):
192196
stream=source
193197
else:
194-
# Otherwise treat source as a string and convert to a file object
195-
ifisinstance(source,str):
196-
# XXX: we should handle lone surrogates here
197-
source=source.encode('utf-8',errors="replace")
198-
self.charEncoding= ("utf-8","certain")
199-
try:
200-
fromioimportBytesIO
201-
except:
202-
try:
203-
# 2to3 converts this line to: from io import StringIO
204-
fromioimportStringIOasBytesIO
205-
except:
206-
fromioimportStringIOasBytesIO
207-
stream=BytesIO(source)
198+
stream=StringIO(source)
208199

209-
if (not(hasattr(stream,"tell")andhasattr(stream,"seek"))or
200+
if (#not isinstance(stream, BufferedIOBase) and
201+
not(hasattr(stream,"tell")and
202+
hasattr(stream,"seek"))or
210203
streamissys.stdin):
211204
stream=BufferedStream(stream)
212205

213206
returnstream
214207

215-
defdetectEncoding(self,parseMeta=True,chardet=True):
216-
#First look for a BOM
217-
#This will also read past the BOM if present
218-
encoding=self.detectBOM()
219-
confidence="certain"
220-
#If there is no BOM need to look for meta elements with encoding
221-
#information
222-
ifencodingisNoneandparseMeta:
223-
encoding=self.detectEncodingMeta()
224-
confidence="tentative"
225-
#Guess with chardet, if avaliable
226-
ifencodingisNoneandchardet:
227-
confidence="tentative"
228-
try:
229-
fromchardet.universaldetectorimportUniversalDetector
230-
buffers= []
231-
detector=UniversalDetector()
232-
whilenotdetector.done:
233-
buffer=self.rawStream.read(self.numBytesChardet)
234-
assertisinstance(buffer,bytes)
235-
ifnotbuffer:
236-
break
237-
buffers.append(buffer)
238-
detector.feed(buffer)
239-
detector.close()
240-
encoding=detector.result['encoding']
241-
self.rawStream.seek(0)
242-
exceptImportError:
243-
pass
244-
# If all else fails use the default encoding
245-
ifencodingisNone:
246-
confidence="tentative"
247-
encoding=self.defaultEncoding
248-
249-
#Substitute for equivalent encodings:
250-
encodingSub= {"iso-8859-1":"windows-1252"}
251-
252-
ifencoding.lower()inencodingSub:
253-
encoding=encodingSub[encoding.lower()]
254-
255-
returnencoding,confidence
256-
257-
defchangeEncoding(self,newEncoding):
258-
newEncoding=codecName(newEncoding)
259-
ifnewEncodingin ("utf-16","utf-16-be","utf-16-le"):
260-
newEncoding="utf-8"
261-
ifnewEncodingisNone:
262-
return
263-
elifnewEncoding==self.charEncoding[0]:
264-
self.charEncoding= (self.charEncoding[0],"certain")
265-
else:
266-
self.rawStream.seek(0)
267-
self.reset()
268-
self.charEncoding= (newEncoding,"certain")
269-
raiseReparseException("Encoding changed from %s to %s"%(self.charEncoding[0],newEncoding))
270-
271-
defdetectBOM(self):
272-
"""Attempts to detect at BOM at the start of the stream. If
273-
an encoding can be determined from the BOM return the name of the
274-
encoding otherwise return None"""
275-
bomDict= {
276-
codecs.BOM_UTF8:'utf-8',
277-
codecs.BOM_UTF16_LE:'utf-16-le',codecs.BOM_UTF16_BE:'utf-16-be',
278-
codecs.BOM_UTF32_LE:'utf-32-le',codecs.BOM_UTF32_BE:'utf-32-be'
279-
}
280-
281-
# Go to beginning of file and read in 4 bytes
282-
string=self.rawStream.read(4)
283-
assertisinstance(string,bytes)
284-
285-
# Try detecting the BOM using bytes from the string
286-
encoding=bomDict.get(string[:3])# UTF-8
287-
seek=3
288-
ifnotencoding:
289-
# Need to detect UTF-32 before UTF-16
290-
encoding=bomDict.get(string)# UTF-32
291-
seek=4
292-
ifnotencoding:
293-
encoding=bomDict.get(string[:2])# UTF-16
294-
seek=2
295-
296-
# Set the read position past the BOM if one was found, otherwise
297-
# set it to the start of the stream
298-
self.rawStream.seek(encodingandseekor0)
299-
300-
returnencoding
301-
302-
defdetectEncodingMeta(self):
303-
"""Report the encoding declared by the meta element
304-
"""
305-
buffer=self.rawStream.read(self.numBytesMeta)
306-
assertisinstance(buffer,bytes)
307-
parser=EncodingParser(buffer)
308-
self.rawStream.seek(0)
309-
encoding=parser.getEncoding()
310-
311-
ifencodingin ("utf-16","utf-16-be","utf-16-le"):
312-
encoding="utf-8"
313-
314-
returnencoding
315-
316208
def_position(self,offset):
317209
chunk=self.chunk
318210
nLines=chunk.count('\n',0,offset)
@@ -475,6 +367,177 @@ def unget(self, char):
475367
self.chunkOffset-=1
476368
assertself.chunk[self.chunkOffset]==char
477369

370+
classHTMLBinaryInputStream(HTMLUnicodeInputStream):
371+
"""Provides a unicode stream of characters to the HTMLTokenizer.
372+
373+
This class takes care of character encoding and removing or replacing
374+
incorrect byte-sequences and also provides column and line tracking.
375+
376+
"""
377+
378+
def__init__(self,source,encoding=None,parseMeta=True,chardet=True):
379+
"""Initialises the HTMLInputStream.
380+
381+
HTMLInputStream(source, [encoding]) -> Normalized stream from source
382+
for use by html5lib.
383+
384+
source can be either a file-object, local filename or a string.
385+
386+
The optional encoding parameter must be a string that indicates
387+
the encoding. If specified, that encoding will be used,
388+
regardless of any BOM or later declaration (such as in a meta
389+
element)
390+
391+
parseMeta - Look for a <meta> element containing encoding information
392+
393+
"""
394+
self.charEncoding= (codecName(encoding),"certain")
395+
396+
# Raw Stream - for unicode objects this will encode to utf-8 and set
397+
# self.charEncoding as appropriate
398+
self.rawStream=self.openStream(source)
399+
400+
# Encoding Information
401+
#Number of bytes to use when looking for a meta element with
402+
#encoding information
403+
self.numBytesMeta=512
404+
#Number of bytes to use when detecting encoding using chardet
405+
self.numBytesChardet=100
406+
#Encoding to use if no other information can be found
407+
self.defaultEncoding="windows-1252"
408+
409+
#Detect encoding iff no explicit "transport level" encoding is supplied
410+
if (self.charEncoding[0]isNone):
411+
self.charEncoding=self.detectEncoding(parseMeta,chardet)
412+
413+
#Call superclass
414+
HTMLUnicodeInputStream.__init__(self,self.rawStream)
415+
416+
defreset(self):
417+
self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,
418+
'replace')
419+
HTMLUnicodeInputStream.reset(self)
420+
421+
defopenStream(self,source):
422+
"""Produces a file object from source.
423+
424+
source can be either a file object, local filename or a string.
425+
426+
"""
427+
# Already a file object
428+
ifhasattr(source,'read'):
429+
stream=source
430+
else:
431+
stream=BytesIO(source)
432+
433+
if (not(hasattr(stream,"tell")andhasattr(stream,"seek"))or
434+
streamissys.stdin):
435+
stream=BufferedStream(stream)
436+
437+
returnstream
438+
439+
defdetectEncoding(self,parseMeta=True,chardet=True):
440+
#First look for a BOM
441+
#This will also read past the BOM if present
442+
encoding=self.detectBOM()
443+
confidence="certain"
444+
#If there is no BOM need to look for meta elements with encoding
445+
#information
446+
ifencodingisNoneandparseMeta:
447+
encoding=self.detectEncodingMeta()
448+
confidence="tentative"
449+
#Guess with chardet, if available
450+
ifencodingisNoneandchardet:
451+
confidence="tentative"
452+
try:
453+
fromchardet.universaldetectorimportUniversalDetector
454+
buffers= []
455+
detector=UniversalDetector()
456+
whilenotdetector.done:
457+
buffer=self.rawStream.read(self.numBytesChardet)
458+
assertisinstance(buffer,bytes)
459+
ifnotbuffer:
460+
break
461+
buffers.append(buffer)
462+
detector.feed(buffer)
463+
detector.close()
464+
encoding=detector.result['encoding']
465+
self.rawStream.seek(0)
466+
exceptImportError:
467+
pass
468+
# If all else fails use the default encoding
469+
ifencodingisNone:
470+
confidence="tentative"
471+
encoding=self.defaultEncoding
472+
473+
#Substitute for equivalent encodings:
474+
encodingSub= {"iso-8859-1":"windows-1252"}
475+
476+
ifencoding.lower()inencodingSub:
477+
encoding=encodingSub[encoding.lower()]
478+
479+
returnencoding,confidence
480+
481+
defchangeEncoding(self,newEncoding):
482+
assertself.charEncoding[1]!="certain"
483+
newEncoding=codecName(newEncoding)
484+
ifnewEncodingin ("utf-16","utf-16-be","utf-16-le"):
485+
newEncoding="utf-8"
486+
ifnewEncodingisNone:
487+
return
488+
elifnewEncoding==self.charEncoding[0]:
489+
self.charEncoding= (self.charEncoding[0],"certain")
490+
else:
491+
self.rawStream.seek(0)
492+
self.reset()
493+
self.charEncoding= (newEncoding,"certain")
494+
raiseReparseException("Encoding changed from %s to %s"%(self.charEncoding[0],newEncoding))
495+
496+
defdetectBOM(self):
497+
"""Attempts to detect a BOM at the start of the stream. If
498+
an encoding can be determined from the BOM return the name of the
499+
encoding otherwise return None"""
500+
bomDict= {
501+
codecs.BOM_UTF8:'utf-8',
502+
codecs.BOM_UTF16_LE:'utf-16-le',codecs.BOM_UTF16_BE:'utf-16-be',
503+
codecs.BOM_UTF32_LE:'utf-32-le',codecs.BOM_UTF32_BE:'utf-32-be'
504+
}
505+
506+
# Go to beginning of file and read in 4 bytes
507+
string=self.rawStream.read(4)
508+
assertisinstance(string,bytes)
509+
510+
# Try detecting the BOM using bytes from the string
511+
encoding=bomDict.get(string[:3])# UTF-8
512+
seek=3
513+
ifnotencoding:
514+
# Need to detect UTF-32 before UTF-16
515+
encoding=bomDict.get(string)# UTF-32
516+
seek=4
517+
ifnotencoding:
518+
encoding=bomDict.get(string[:2])# UTF-16
519+
seek=2
520+
521+
# Set the read position past the BOM if one was found, otherwise
522+
# set it to the start of the stream
523+
self.rawStream.seek(encodingandseekor0)
524+
525+
returnencoding
526+
527+
defdetectEncodingMeta(self):
528+
"""Report the encoding declared by the meta element
529+
"""
530+
buffer=self.rawStream.read(self.numBytesMeta)
531+
assertisinstance(buffer,bytes)
532+
parser=EncodingParser(buffer)
533+
self.rawStream.seek(0)
534+
encoding=parser.getEncoding()
535+
536+
ifencodingin ("utf-16","utf-16-be","utf-16-le"):
537+
encoding="utf-8"
538+
539+
returnencoding
540+
478541
classEncodingBytes(str):
479542
"""String-like object with an associated position and various extra methods
480543
If the position is ever greater than the string length then an exception is

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp