Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit afe181d

Browse files
committed
Check for invalid codepoints in input stream
--HG--extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401141
1 parent bd4ad51, commit afe181d

File tree

3 files changed

+20
-11
lines changed

3 files changed

+20
-11
lines changed

‎src/html5lib/constants.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@
1313
E= {
1414
"null-character":
1515
_(u"Null character in input stream, replaced with U+FFFD."),
16+
"invalid-character":
17+
_(u"Invalid codepoint in stream."),
1618
"incorrectly-placed-solidus":
1719
_(u"Solidus (/) incorrectly placed in tag."),
1820
"incorrect-cr-newline-entity":
@@ -1052,4 +1054,4 @@
10521054
))
10531055

10541056
classDataLossWarning(UserWarning):
1055-
pass
1057+
pass

‎src/html5lib/inputstream.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,8 @@
1111
asciiLettersBytes= [str(item)foriteminasciiLetters]
1212
asciiUppercaseBytes= [str(item)foriteminasciiUppercase]
1313

14+
invalid_unicode_re=re.compile(u"[\u0001-\u0008]|[\u000E-\u001F]|[\u007F-\u009F]|[\uD800-\uDFFF]|[\uFDD0-\uFDDF]|\uFFFE|\uFFFF|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE\U000BFFFF|\U000CFFFE|\U000CFFFF|\U000DFFFE|\U000DFFFF|\U000EFFFE|\U000EFFFF|\U000FFFFE|\U000FFFFF|\U0010FFFE|\U0010FFFF")
15+
1416
try:
1517
fromcollectionsimportdeque
1618
exceptImportError:
@@ -28,7 +30,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
2830
"""Initialises the HTMLInputStream.
2931
3032
HTMLInputStream(source, [encoding]) -> Normalized stream from source
31-
for use bythe HTML5Lib.
33+
for use byhtml5lib.
3234
3335
source can be either a file-object, local filename or a string.
3436
@@ -59,7 +61,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
5961
self.defaultEncoding="windows-1252"
6062

6163
#Detect encoding iff no explicit "transport level" encoding is supplied
62-
ifself.charEncoding[0]isNoneornotisValidEncoding(self.charEncoding[0]):
64+
if (self.charEncoding[0]isNoneor
65+
notisValidEncoding(self.charEncoding[0])):
6366
self.charEncoding=self.detectEncoding(parseMeta,chardet)
6467

6568
self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,
@@ -87,7 +90,7 @@ def openStream(self, source):
8790
# Otherwise treat source as a string and convert to a file object
8891
ifisinstance(source,unicode):
8992
source=source.encode('utf-8')
90-
self.charEncoding="utf-8"
93+
self.charEncoding=("utf-8","certian")
9194
importcStringIO
9295
stream=cStringIO.StringIO(str(source))
9396
returnstream
@@ -262,6 +265,9 @@ def readChunk(self, chunkSize=10240):
262265
#Replace null characters
263266
foriinxrange(data.count(u"\u0000")):
264267
self.errors.append("null-character")
268+
foriinxrange(len(invalid_unicode_re.findall(data))):
269+
self.errors.append("invalid-codepoint")
270+
265271
data=data.replace(u"\u0000",u"\ufffd")
266272
#Check for CR LF broken across chunks
267273
if (self._lastChunkEndsWithCRanddata[0]=="\n"):
@@ -271,7 +277,7 @@ def readChunk(self, chunkSize=10240):
271277
data=data.replace("\r","\n")
272278

273279
data=unicode(data)
274-
self.queue.extend([charforcharindata])
280+
self.queue.extend(list(data))
275281

276282
self.updatePosition()
277283

‎src/html5lib/tokenizer.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44
# Import from the sets module for python 2.3
55
fromsetsimportSetasset
66
fromsetsimportImmutableSetasfrozenset
7-
7+
try:
8+
fromcollectionsimportdeque
9+
exceptImportError:
10+
fromutilsimportdeque
11+
812
fromconstantsimportcontentModelFlags,spaceCharacters
913
fromconstantsimportentitiesWindows1252,entities
1014
fromconstantsimportasciiLowercase,asciiLetters,asciiUpper2Lower
@@ -83,24 +87,21 @@ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
8387
# The current token being created
8488
self.currentToken=None
8589

86-
# Tokens to be processed.
87-
self.tokenQueue= []
88-
8990
def__iter__(self):
9091
""" This is where the magic happens.
9192
9293
We do our usually processing through the states and when we have a token
9394
to return we yield the token which pauses processing until the next token
9495
is requested.
9596
"""
96-
self.tokenQueue=[]
97+
self.tokenQueue=deque([])
9798
# Start processing. When EOF is reached self.state will return False
9899
# instead of True and the loop will terminate.
99100
whileself.state():
100101
whileself.stream.errors:
101102
yield {"type":"ParseError","data":self.stream.errors.pop(0)}
102103
whileself.tokenQueue:
103-
yieldself.tokenQueue.pop(0)
104+
yieldself.tokenQueue.popleft()
104105

105106
# Below are various helper functions the tokenizer states use worked out.
106107
defprocessSolidusInTag(self):

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp