Commit 447b711

committed

Don't crash and burn when non-ascii characters are found in the pre-parse

--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401060

1 parent b24ea63, commit 447b711 (Copy full SHA for 447b711)

File tree

2 files changed

+18

-13

lines changed

src/html5lib
- inputstream.py
- tokenizer.py

2 files changed

+18

-13

lines changed

`‎src/html5lib/inputstream.py‎`

Lines changed: 17 additions & 12 deletions

Original file line number	Diff line number	Diff line change
`@@ -6,6 +6,11 @@`
`6`	`6`	`fromconstantsimportencodings`
`7`	`7`	`fromutilsimportMethodDispatcher`
`8`	`8`
	`9`	`+#Non-unicode versions of constants for use in the pre-parser`
	`10`	`+spaceCharactersBytes= [str(item)foriteminspaceCharacters]`
	`11`	`+asciiLettersBytes= [str(item)foriteminasciiLetters]`
	`12`	`+asciiUppercaseBytes= [str(item)foriteminasciiUppercase]`
	`13`	`+`
`9`	`14`	`try:`
`10`	`15`	`fromcollectionsimportdeque`
`11`	`16`	`exceptImportError:`
`@@ -357,7 +362,7 @@ def getCurrentByte(self):`
`357`	`362`
`358`	`363`	`currentByte=property(getCurrentByte)`
`359`	`364`
`360`		`-defskip(self,chars=spaceCharacters):`
	`365`	`+defskip(self,chars=spaceCharactersBytes):`
`361`	`366`	`"""Skip past a list of characters"""`
`362`	`367`	`whileself.currentByteinchars:`
`363`	`368`	`self.position+=1`
`@@ -432,7 +437,7 @@ def handleComment(self):`
`432`	`437`	`returnself.data.jumpTo("-->")`
`433`	`438`
`434`	`439`	`defhandleMeta(self):`
`435`		`-ifself.data.currentBytenotinspaceCharacters:`
	`440`	`+ifself.data.currentBytenotinspaceCharactersBytes:`
`436`	`441`	`#if we have <meta not followed by a space so just keep going`
`437`	`442`	`returnTrue`
`438`	`443`	`#We have a valid meta element we want to search for attributes`
`@@ -462,7 +467,7 @@ def handlePossibleEndTag(self):`
`462`	`467`	`returnself.handlePossibleTag(True)`
`463`	`468`
`464`	`469`	`defhandlePossibleTag(self,endTag):`
`465`		`-ifself.data.currentBytenotinasciiLetters:`
	`470`	`+ifself.data.currentBytenotinasciiLettersBytes:`
`466`	`471`	`#If the next byte is not an ascii letter either ignore this`
`467`	`472`	`#fragment (possible start tag case) or treat it according to`
`468`	`473`	`#handleOther`
`@@ -471,7 +476,7 @@ def handlePossibleTag(self, endTag):`
`471`	`476`	`self.handleOther()`
`472`	`477`	`returnTrue`
`473`	`478`
`474`		`-self.data.findNext(list(spaceCharacters)+ ["<",">"])`
	`479`	`+self.data.findNext(list(spaceCharactersBytes)+ ["<",">"])`
`475`	`480`	`ifself.data.currentByte=="<":`
`476`	`481`	`#return to the first step in the overall "two step" algorithm`
`477`	`482`	`#reprocessing the < byte`
`@@ -489,7 +494,7 @@ def handleOther(self):`
`489`	`494`	`defgetAttribute(self):`
`490`	`495`	`"""Return a name,value pair for the next attribute in the stream,`
`491`	`496`	`if one is found, or None"""`
`492`		`-self.data.skip(list(spaceCharacters)+["/"])`
	`497`	`+self.data.skip(list(spaceCharactersBytes)+["/"])`
`493`	`498`	`ifself.data.currentByte=="<":`
`494`	`499`	`self.data.position-=1`
`495`	`500`	`returnNone`
`@@ -502,12 +507,12 @@ def getAttribute(self):`
`502`	`507`	`whileTrue:`
`503`	`508`	`ifself.data.currentByte=="="andattrName:`
`504`	`509`	`break`
`505`		`-elifself.data.currentByteinspaceCharacters:`
	`510`	`+elifself.data.currentByteinspaceCharactersBytes:`
`506`	`511`	`spaceFound=True`
`507`	`512`	`break`
`508`	`513`	`elifself.data.currentBytein ("/","<",">"):`
`509`	`514`	`return"".join(attrName),""`
`510`		`-elifself.data.currentByteinasciiUppercase:`
	`515`	`+elifself.data.currentByteinasciiUppercaseBytes:`
`511`	`516`	`attrName.extend(self.data.currentByte.lower())`
`512`	`517`	`else:`
`513`	`518`	`attrName.extend(self.data.currentByte)`
`@@ -536,23 +541,23 @@ def getAttribute(self):`
`536`	`541`	`self.data.position+=1`
`537`	`542`	`return"".join(attrName),"".join(attrValue)`
`538`	`543`	`#11.4`
`539`		`-elifself.data.currentByteinasciiUppercase:`
	`544`	`+elifself.data.currentByteinasciiUppercaseBytes:`
`540`	`545`	`attrValue.extend(self.data.currentByte.lower())`
`541`	`546`	`#11.5`
`542`	`547`	`else:`
`543`	`548`	`attrValue.extend(self.data.currentByte)`
`544`	`549`	`elifself.data.currentBytein (">","<"):`
`545`	`550`	`return"".join(attrName),""`
`546`		`-elifself.data.currentByteinasciiUppercase:`
	`551`	`+elifself.data.currentByteinasciiUppercaseBytes:`
`547`	`552`	`attrValue.extend(self.data.currentByte.lower())`
`548`	`553`	`else:`
`549`	`554`	`attrValue.extend(self.data.currentByte)`
`550`	`555`	`whileTrue:`
`551`	`556`	`self.data.position+=1`
`552`	`557`	`ifself.data.currentBytein (`
`553`		`-list(spaceCharacters)+ [">","<"]):`
	`558`	`+list(spaceCharactersBytes)+ [">","<"]):`
`554`	`559`	`return"".join(attrName),"".join(attrValue)`
`555`		`-elifself.data.currentByteinasciiUppercase:`
	`560`	`+elifself.data.currentByteinasciiUppercaseBytes:`
`556`	`561`	`attrValue.extend(self.data.currentByte.lower())`
`557`	`562`	`else:`
`558`	`563`	`attrValue.extend(self.data.currentByte)`
`@@ -588,7 +593,7 @@ def parse(self):`
`588`	`593`	`#Unquoted value`
`589`	`594`	`oldPosition=self.data.position`
`590`	`595`	`try:`
`591`		`-self.data.findNext(spaceCharacters)`
	`596`	`+self.data.findNext(spaceCharactersBytes)`
`592`	`597`	`returnself.data[oldPosition:self.data.position]`
`593`	`598`	`exceptStopIteration:`
`594`	`599`	`#Return the whole remaining value`

`‎src/html5lib/tokenizer.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ class HTMLTokenizer(object):`
`31`	`31`	`# XXX need to fix documentation`
`32`	`32`
`33`	`33`	`def__init__(self,stream,encoding=None,parseMeta=True,useChardet=True,`
`34`		`-lowercaseElementName=True,lowercaseAttrName=True,):`
	`34`	`+lowercaseElementName=True,lowercaseAttrName=True):`
`35`	`35`	`self.stream=HTMLInputStream(stream,encoding,parseMeta,useChardet)`
`36`	`36`
`37`	`37`	`#Perform case conversions?`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 447b711

File tree

2 files changed

2 files changed

`‎src/html5lib/inputstream.py‎`

`‎src/html5lib/tokenizer.py‎`

0 commit comments