Notifications: You must be signed in to change notification settings
Fork0
Star0

Commit afe181d

committed

Check for invalid codepoints in input stream

--HG--extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401141

1 parent bd4ad51, commit afe181d (Copy full SHA for afe181d)

File tree

3 files changed

+20

-11

lines changed

src/html5lib

3 files changed

+20

-11

lines changed

`‎src/html5lib/constants.py‎`

Lines changed: 3 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,8 @@`
`13`	`13`	`E= {`
`14`	`14`	`"null-character":`
`15`	`15`	`_(u"Null character in input stream, replaced with U+FFFD."),`
	`16`	`+"invalid-character":`
	`17`	`+_(u"Invalid codepoint in stream."),`
`16`	`18`	`"incorrectly-placed-solidus":`
`17`	`19`	`_(u"Solidus (/) incorrectly placed in tag."),`
`18`	`20`	`"incorrect-cr-newline-entity":`
`@@ -1052,4 +1054,4 @@`
`1052`	`1054`	`))`
`1053`	`1055`
`1054`	`1056`	`classDataLossWarning(UserWarning):`
`1055`		`-pass`
	`1057`	`+pass`

`‎src/html5lib/inputstream.py‎`

Lines changed: 10 additions & 4 deletions

Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,8 @@`
`11`	`11`	`asciiLettersBytes= [str(item)foriteminasciiLetters]`
`12`	`12`	`asciiUppercaseBytes= [str(item)foriteminasciiUppercase]`
`13`	`13`
	`14`	`+invalid_unicode_re=re.compile(u"[\u0001-\u0008]\|[\u000E-\u001F]\|[\u007F-\u009F]\|[\uD800-\uDFFF]\|[\uFDD0-\uFDDF]\|\uFFFE\|\uFFFF\|\U0001FFFE\|\U0001FFFF\|\U0002FFFE\|\U0002FFFF\|\U0003FFFE\|\U0003FFFF\|\U0004FFFE\|\U0004FFFF\|\U0005FFFE\|\U0005FFFF\|\U0006FFFE\|\U0006FFFF\|\U0007FFFE\|\U0007FFFF\|\U0008FFFE\|\U0008FFFF\|\U0009FFFE\|\U0009FFFF\|\U000AFFFE\|\U000AFFFF\|\U000BFFFE\U000BFFFF\|\U000CFFFE\|\U000CFFFF\|\U000DFFFE\|\U000DFFFF\|\U000EFFFE\|\U000EFFFF\|\U000FFFFE\|\U000FFFFF\|\U0010FFFE\|\U0010FFFF")`
	`15`	`+`
`14`	`16`	`try:`
`15`	`17`	`fromcollectionsimportdeque`
`16`	`18`	`exceptImportError:`
`@@ -28,7 +30,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`28`	`30`	`"""Initialises the HTMLInputStream.`
`29`	`31`
`30`	`32`	`HTMLInputStream(source, [encoding]) -> Normalized stream from source`
`31`		`- for use bythe HTML5Lib.`
	`33`	`+ for use byhtml5lib.`
`32`	`34`
`33`	`35`	`source can be either a file-object, local filename or a string.`
`34`	`36`
`@@ -59,7 +61,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`59`	`61`	`self.defaultEncoding="windows-1252"`
`60`	`62`
`61`	`63`	`#Detect encoding iff no explicit "transport level" encoding is supplied`
`62`		`-ifself.charEncoding[0]isNoneornotisValidEncoding(self.charEncoding[0]):`
	`64`	`+if (self.charEncoding[0]isNoneor`
	`65`	`+notisValidEncoding(self.charEncoding[0])):`
`63`	`66`	`self.charEncoding=self.detectEncoding(parseMeta,chardet)`
`64`	`67`
`65`	`68`	`self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,`
`@@ -87,7 +90,7 @@ def openStream(self, source):`
`87`	`90`	`# Otherwise treat source as a string and convert to a file object`
`88`	`91`	`ifisinstance(source,unicode):`
`89`	`92`	`source=source.encode('utf-8')`
`90`		`-self.charEncoding="utf-8"`
	`93`	`+self.charEncoding=("utf-8","certian")`
`91`	`94`	`importcStringIO`
`92`	`95`	`stream=cStringIO.StringIO(str(source))`
`93`	`96`	`returnstream`
`@@ -262,6 +265,9 @@ def readChunk(self, chunkSize=10240):`
`262`	`265`	`#Replace null characters`
`263`	`266`	`foriinxrange(data.count(u"\u0000")):`
`264`	`267`	`self.errors.append("null-character")`
	`268`	`+foriinxrange(len(invalid_unicode_re.findall(data))):`
	`269`	`+self.errors.append("invalid-codepoint")`
	`270`	`+`
`265`	`271`	`data=data.replace(u"\u0000",u"\ufffd")`
`266`	`272`	`#Check for CR LF broken across chunks`
`267`	`273`	`if (self._lastChunkEndsWithCRanddata[0]=="\n"):`
`@@ -271,7 +277,7 @@ def readChunk(self, chunkSize=10240):`
`271`	`277`	`data=data.replace("\r","\n")`
`272`	`278`
`273`	`279`	`data=unicode(data)`
`274`		`-self.queue.extend([charforcharindata])`
	`280`	`+self.queue.extend(list(data))`
`275`	`281`
`276`	`282`	`self.updatePosition()`
`277`	`283`

`‎src/html5lib/tokenizer.py‎`

Lines changed: 7 additions & 6 deletions

Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,11 @@`
`4`	`4`	`# Import from the sets module for python 2.3`
`5`	`5`	`fromsetsimportSetasset`
`6`	`6`	`fromsetsimportImmutableSetasfrozenset`
`7`		`-`
	`7`	`+try:`
	`8`	`+fromcollectionsimportdeque`
	`9`	`+exceptImportError:`
	`10`	`+fromutilsimportdeque`
	`11`	`+`
`8`	`12`	`fromconstantsimportcontentModelFlags,spaceCharacters`
`9`	`13`	`fromconstantsimportentitiesWindows1252,entities`
`10`	`14`	`fromconstantsimportasciiLowercase,asciiLetters,asciiUpper2Lower`
`@@ -83,24 +87,21 @@ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,`
`83`	`87`	`# The current token being created`
`84`	`88`	`self.currentToken=None`
`85`	`89`
`86`		`-# Tokens to be processed.`
`87`		`-self.tokenQueue= []`
`88`		`-`
`89`	`90`	`def__iter__(self):`
`90`	`91`	`""" This is where the magic happens.`
`91`	`92`
`92`	`93`	`We do our usually processing through the states and when we have a token`
`93`	`94`	`to return we yield the token which pauses processing until the next token`
`94`	`95`	`is requested.`
`95`	`96`	`"""`
`96`		`-self.tokenQueue=[]`
	`97`	`+self.tokenQueue=deque([])`
`97`	`98`	`# Start processing. When EOF is reached self.state will return False`
`98`	`99`	`# instead of True and the loop will terminate.`
`99`	`100`	`whileself.state():`
`100`	`101`	`whileself.stream.errors:`
`101`	`102`	`yield {"type":"ParseError","data":self.stream.errors.pop(0)}`
`102`	`103`	`whileself.tokenQueue:`
`103`		`-yieldself.tokenQueue.pop(0)`
	`104`	`+yieldself.tokenQueue.popleft()`
`104`	`105`
`105`	`106`	`# Below are various helper functions the tokenizer states use worked out.`
`106`	`107`	`defprocessSolidusInTag(self):`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit afe181d

File tree

3 files changed

3 files changed

`‎src/html5lib/constants.py‎`

`‎src/html5lib/inputstream.py‎`

`‎src/html5lib/tokenizer.py‎`

0 commit comments