Commit db43ce2

committed

Get encoding tests passing, and test the pre-scan separately

1 parent: 2816de7 · commit: db43ce2 (Copy full SHA for db43ce2)

File tree

3 files changed

+85

-49

lines changed

html5lib
- html5parser.py
- inputstream.py
- tests
  - test_encoding.py

3 files changed

+85

-49

lines changed

`‎html5lib/html5parser.py‎`

Lines changed: 3 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -777,7 +777,9 @@ def startTagMeta(self, token):`
`777`	`777`	`ifself.parser.tokenizer.stream.charEncoding[1]=="tentative":`
`778`	`778`	`if"charset"inattributes:`
`779`	`779`	`self.parser.tokenizer.stream.changeEncoding(attributes["charset"])`
`780`		`-elif"content"inattributes:`
	`780`	`+elif ("content"inattributesand`
	`781`	`+"http-equiv"inattributesand`
	`782`	`+attributes["http-equiv"].lower()=="content-type"):`
`781`	`783`	`# Encoding it as UTF-8 here is a hack, as really we should pass`
`782`	`784`	`# the abstract Unicode string, and just use the`
`783`	`785`	`# ContentAttrParser on that, but using UTF-8 allows all chars`

`‎html5lib/inputstream.py‎`

Lines changed: 66 additions & 46 deletions

Original file line number	Diff line number	Diff line change
`@@ -21,10 +21,10 @@ class BufferedIOBase(object):`
`21`	`21`	`pass`
`22`	`22`
`23`	`23`	`#Non-unicode versions of constants for use in the pre-parser`
`24`		`-spaceCharactersBytes=frozenset([str(item)foriteminspaceCharacters])`
`25`		`-asciiLettersBytes=frozenset([str(item)foriteminasciiLetters])`
`26`		`-asciiUppercaseBytes=frozenset([str(item)foriteminasciiUppercase])`
`27`		`-spacesAngleBrackets=spaceCharactersBytes\|frozenset([">","<"])`
	`24`	`+spaceCharactersBytes=frozenset([item.encode("ascii")foriteminspaceCharacters])`
	`25`	`+asciiLettersBytes=frozenset([item.encode("ascii")foriteminasciiLetters])`
	`26`	`+asciiUppercaseBytes=frozenset([item.encode("ascii")foriteminasciiUppercase])`
	`27`	`+spacesAngleBrackets=spaceCharactersBytes\|frozenset([b">",b"<"])`
`28`	`28`
`29`	`29`	`invalid_unicode_re=re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")`
`30`	`30`
`@@ -391,12 +391,14 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`391`	`391`	`parseMeta - Look for a <meta> element containing encoding information`
`392`	`392`
`393`	`393`	`"""`
`394`		`-self.charEncoding= (codecName(encoding),"certain")`
`395`		`-`
`396`	`394`	`# Raw Stream - for unicode objects this will encode to utf-8 and set`
`397`	`395`	`# self.charEncoding as appropriate`
`398`	`396`	`self.rawStream=self.openStream(source)`
`399`	`397`
	`398`	`+HTMLUnicodeInputStream.__init__(self,self.rawStream)`
	`399`	`+`
	`400`	`+self.charEncoding= (codecName(encoding),"certain")`
	`401`	`+`
`400`	`402`	`# Encoding Information`
`401`	`403`	`#Number of bytes to use when looking for a meta element with`
`402`	`404`	`#encoding information`
`@@ -411,7 +413,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`411`	`413`	`self.charEncoding=self.detectEncoding(parseMeta,chardet)`
`412`	`414`
`413`	`415`	`#Call superclass`
`414`		`-HTMLUnicodeInputStream.__init__(self,self.rawStream)`
	`416`	`+self.reset()`
`415`	`417`
`416`	`418`	`defreset(self):`
`417`	`419`	`self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,`
`@@ -538,12 +540,13 @@ def detectEncodingMeta(self):`
`538`	`540`
`539`	`541`	`returnencoding`
`540`	`542`
`541`		`-classEncodingBytes(str):`
	`543`	`+classEncodingBytes(bytes):`
`542`	`544`	`"""String-like object with an associated position and various extra methods`
`543`	`545`	`If the position is ever greater than the string length then an exception is`
`544`	`546`	`raised"""`
`545`	`547`	`def__new__(self,value):`
`546`		`-returnstr.__new__(self,value.lower())`
	`548`	`+assertisinstance(value,bytes)`
	`549`	`+returnbytes.__new__(self,value.lower())`
`547`	`550`
`548`	`551`	`def__init__(self,value):`
`549`	`552`	`self._position=-1`
`@@ -557,7 +560,7 @@ def __next__(self):`
`557`	`560`	`raiseStopIteration`
`558`	`561`	`elifp<0:`
`559`	`562`	`raiseTypeError`
`560`		`-returnself[p]`
	`563`	`+returnself[p:p+1]`
`561`	`564`
`562`	`565`	`defprevious(self):`
`563`	`566`	`p=self._position`
`@@ -566,7 +569,7 @@ def previous(self):`
`566`	`569`	`elifp<0:`
`567`	`570`	`raiseTypeError`
`568`	`571`	`self._position=p=p-1`
`569`		`-returnself[p]`
	`572`	`+returnself[p:p+1]`
`570`	`573`
`571`	`574`	`defsetPosition(self,position):`
`572`	`575`	`ifself._position>=len(self):`
`@@ -584,15 +587,15 @@ def getPosition(self):`
`584`	`587`	`position=property(getPosition,setPosition)`
`585`	`588`
`586`	`589`	`defgetCurrentByte(self):`
`587`		`-returnself[self.position]`
	`590`	`+returnself[self.position:self.position+1]`
`588`	`591`
`589`	`592`	`currentByte=property(getCurrentByte)`
`590`	`593`
`591`	`594`	`defskip(self,chars=spaceCharactersBytes):`
`592`	`595`	`"""Skip past a list of characters"""`
`593`	`596`	`p=self.position# use property for the error-checking`
`594`	`597`	`whilep<len(self):`
`595`		`-c=self[p]`
	`598`	`+c=self[p:p+1]`
`596`	`599`	`ifcnotinchars:`
`597`	`600`	`self._position=p`
`598`	`601`	`returnc`
`@@ -603,7 +606,7 @@ def skip(self, chars=spaceCharactersBytes):`
`603`	`606`	`defskipUntil(self,chars):`
`604`	`607`	`p=self.position`
`605`	`608`	`whilep<len(self):`
`606`		`-c=self[p]`
	`609`	`+c=self[p:p+1]`
`607`	`610`	`ifcinchars:`
`608`	`611`	`self._position=p`
`609`	`612`	`returnc`
`@@ -645,12 +648,12 @@ def __init__(self, data):`
`645`	`648`
`646`	`649`	`defgetEncoding(self):`
`647`	`650`	`methodDispatch= (`
`648`		`- ("<!--",self.handleComment),`
`649`		`- ("<meta",self.handleMeta),`
`650`		`- ("</",self.handlePossibleEndTag),`
`651`		`- ("<!",self.handleOther),`
`652`		`- ("<?",self.handleOther),`
`653`		`- ("<",self.handlePossibleStartTag))`
	`651`	`+ (b"<!--",self.handleComment),`
	`652`	`+ (b"<meta",self.handleMeta),`
	`653`	`+ (b"</",self.handlePossibleEndTag),`
	`654`	`+ (b"<!",self.handleOther),`
	`655`	`+ (b"<?",self.handleOther),`
	`656`	`+ (b"<",self.handlePossibleStartTag))`
`654`	`657`	`forbyteinself.data:`
`655`	`658`	`keepParsing=True`
`656`	`659`	`forkey,methodinmethodDispatch:`
`@@ -663,37 +666,48 @@ def getEncoding(self):`
`663`	`666`	`break`
`664`	`667`	`ifnotkeepParsing:`
`665`	`668`	`break`
`666`		`-`
	`669`	`+`
`667`	`670`	`returnself.encoding`
`668`	`671`
`669`	`672`	`defhandleComment(self):`
`670`	`673`	`"""Skip over comments"""`
`671`		`-returnself.data.jumpTo("-->")`
	`674`	`+returnself.data.jumpTo(b"-->")`
`672`	`675`
`673`	`676`	`defhandleMeta(self):`
`674`	`677`	`ifself.data.currentBytenotinspaceCharactersBytes:`
`675`	`678`	`#if we have <meta not followed by a space so just keep going`
`676`	`679`	`returnTrue`
`677`	`680`	`#We have a valid meta element we want to search for attributes`
	`681`	`+hasPragma=False`
	`682`	`+pendingEncoding=None`
`678`	`683`	`whileTrue:`
`679`	`684`	`#Try to find the next attribute after the current position`
`680`	`685`	`attr=self.getAttribute()`
`681`	`686`	`ifattrisNone:`
`682`	`687`	`returnTrue`
`683`	`688`	`else:`
`684`		`-ifattr[0]=="charset":`
	`689`	`+ifattr[0]==b"http-equiv":`
	`690`	`+hasPragma=attr[1]==b"content-type"`
	`691`	`+ifhasPragmaandpendingEncodingisnotNone:`
	`692`	`+self.encoding=pendingEncoding`
	`693`	`+returnFalse`
	`694`	`+elifattr[0]==b"charset":`
`685`	`695`	`tentativeEncoding=attr[1]`
`686`	`696`	`codec=codecName(tentativeEncoding)`
`687`	`697`	`ifcodecisnotNone:`
`688`	`698`	`self.encoding=codec`
`689`	`699`	`returnFalse`
`690`		`-elifattr[0]=="content":`
	`700`	`+elifattr[0]==b"content":`
`691`	`701`	`contentParser=ContentAttrParser(EncodingBytes(attr[1]))`
`692`	`702`	`tentativeEncoding=contentParser.parse()`
`693`		`-codec=codecName(tentativeEncoding)`
`694`		`-ifcodecisnotNone:`
`695`		`-self.encoding=codec`
`696`		`-returnFalse`
	`703`	`+iftentativeEncodingisnotNone:`
	`704`	`+codec=codecName(tentativeEncoding)`
	`705`	`+ifcodecisnotNone:`
	`706`	`+ifhasPragma:`
	`707`	`+self.encoding=codec`
	`708`	`+returnFalse`
	`709`	`+else:`
	`710`	`+pendingEncoding=codec`
`697`	`711`
`698`	`712`	`defhandlePossibleStartTag(self):`
`699`	`713`	`returnself.handlePossibleTag(False)`
`@@ -714,7 +728,7 @@ def handlePossibleTag(self, endTag):`
`714`	`728`	`returnTrue`
`715`	`729`
`716`	`730`	`c=data.skipUntil(spacesAngleBrackets)`
`717`		`-ifc=="<":`
	`731`	`+ifc==b"<":`
`718`	`732`	`#return to the first step in the overall "two step" algorithm`
`719`	`733`	`#reprocessing the < byte`
`720`	`734`	`data.previous()`
`@@ -726,31 +740,31 @@ def handlePossibleTag(self, endTag):`
`726`	`740`	`returnTrue`
`727`	`741`
`728`	`742`	`defhandleOther(self):`
`729`		`-returnself.data.jumpTo(">")`
	`743`	`+returnself.data.jumpTo(b">")`
`730`	`744`
`731`	`745`	`defgetAttribute(self):`
`732`	`746`	`"""Return a name,value pair for the next attribute in the stream,`
`733`	`747`	`if one is found, or None"""`
`734`	`748`	`data=self.data`
`735`	`749`	`# Step 1 (skip chars)`
`736`		`-c=data.skip(spaceCharactersBytes\|frozenset("/"))`
	`750`	`+c=data.skip(spaceCharactersBytes\|frozenset([b"/"]))`
	`751`	`+assertcisNoneorlen(c)==1`
`737`	`752`	`# Step 2`
`738`		`-ifcin (">",None):`
	`753`	`+ifcin (b">",None):`
`739`	`754`	`returnNone`
`740`	`755`	`# Step 3`
`741`	`756`	`attrName= []`
`742`	`757`	`attrValue= []`
`743`	`758`	`#Step 4 attribute name`
`744`	`759`	`whileTrue:`
`745`		`-ifc=="="andattrName:`
	`760`	`+ifc==b"="andattrName:`
`746`	`761`	`break`
`747`	`762`	`elifcinspaceCharactersBytes:`
`748`	`763`	`#Step 6!`
`749`	`764`	`c=data.skip()`
`750`		`-c=next(data)`
`751`	`765`	`break`
`752`		`-elifcin ("/",">"):`
`753`		`-return"".join(attrName),""`
	`766`	`+elifcin (b"/",b">"):`
	`767`	`+returnb"".join(attrName),b""`
`754`	`768`	`elifcinasciiUppercaseBytes:`
`755`	`769`	`attrName.append(c.lower())`
`756`	`770`	`elifc==None:`
`@@ -760,15 +774,15 @@ def getAttribute(self):`
`760`	`774`	`#Step 5`
`761`	`775`	`c=next(data)`
`762`	`776`	`#Step 7`
`763`		`-ifc!="=":`
	`777`	`+ifc!=b"=":`
`764`	`778`	`data.previous()`
`765`		`-return"".join(attrName),""`
	`779`	`+returnb"".join(attrName),b""`
`766`	`780`	`#Step 8`
`767`	`781`	`next(data)`
`768`	`782`	`#Step 9`
`769`	`783`	`c=data.skip()`
`770`	`784`	`#Step 10`
`771`		`-ifcin ("'",'"'):`
	`785`	`+ifcin (b"'",b'"'):`
`772`	`786`	`#10.1`
`773`	`787`	`quoteChar=c`
`774`	`788`	`whileTrue:`
`@@ -777,15 +791,15 @@ def getAttribute(self):`
`777`	`791`	`#10.3`
`778`	`792`	`ifc==quoteChar:`
`779`	`793`	`next(data)`
`780`		`-return"".join(attrName),"".join(attrValue)`
	`794`	`+returnb"".join(attrName),b"".join(attrValue)`
`781`	`795`	`#10.4`
`782`	`796`	`elifcinasciiUppercaseBytes:`
`783`	`797`	`attrValue.append(c.lower())`
`784`	`798`	`#10.5`
`785`	`799`	`else:`
`786`	`800`	`attrValue.append(c)`
`787`		`-elifc==">":`
`788`		`-return"".join(attrName),""`
	`801`	`+elifc==b">":`
	`802`	`+returnb"".join(attrName),b""`
`789`	`803`	`elifcinasciiUppercaseBytes:`
`790`	`804`	`attrValue.append(c.lower())`
`791`	`805`	`elifcisNone:`
`@@ -796,7 +810,7 @@ def getAttribute(self):`
`796`	`810`	`whileTrue:`
`797`	`811`	`c=next(data)`
`798`	`812`	`ifcinspacesAngleBrackets:`
`799`		`-return"".join(attrName),"".join(attrValue)`
	`813`	`+returnb"".join(attrName),b"".join(attrValue)`
`800`	`814`	`elifcinasciiUppercaseBytes:`
`801`	`815`	`attrValue.append(c.lower())`
`802`	`816`	`elifcisNone:`
`@@ -807,21 +821,22 @@ def getAttribute(self):`
`807`	`821`
`808`	`822`	`classContentAttrParser(object):`
`809`	`823`	`def__init__(self,data):`
	`824`	`+assertisinstance(data,bytes)`
`810`	`825`	`self.data=data`
`811`	`826`	`defparse(self):`
`812`	`827`	`try:`
`813`	`828`	`#Check if the attr name is charset`
`814`	`829`	`#otherwise return`
`815`		`-self.data.jumpTo("charset")`
	`830`	`+self.data.jumpTo(b"charset")`
`816`	`831`	`self.data.position+=1`
`817`	`832`	`self.data.skip()`
`818`		`-ifnotself.data.currentByte=="=":`
	`833`	`+ifnotself.data.currentByte==b"=":`
`819`	`834`	`#If there is no = sign keep looking for attrs`
`820`	`835`	`returnNone`
`821`	`836`	`self.data.position+=1`
`822`	`837`	`self.data.skip()`
`823`	`838`	`#Look for an encoding between matching quote marks`
`824`		`-ifself.data.currentBytein ('"',"'"):`
	`839`	`+ifself.data.currentBytein (b'"',b"'"):`
`825`	`840`	`quoteMark=self.data.currentByte`
`826`	`841`	`self.data.position+=1`
`827`	`842`	`oldPosition=self.data.position`
`@@ -845,6 +860,11 @@ def parse(self):`
`845`	`860`	`defcodecName(encoding):`
`846`	`861`	`"""Return the python codec name corresponding to an encoding or None if the`
`847`	`862`	`string doesn't correspond to a valid encoding."""`
	`863`	`+ifisinstance(encoding,bytes):`
	`864`	`+try:`
	`865`	`+encoding=encoding.decode("ascii")`
	`866`	`+exceptUnicodeDecodeError:`
	`867`	`+returnNone`
`848`	`868`	`ifencoding:`
`849`	`869`	`canonicalName=ascii_punctuation_re.sub("",encoding).lower()`
`850`	`870`	`returnencodings.get(canonicalName,None)`

`‎html5lib/tests/test_encoding.py‎`

Lines changed: 16 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@ def test_codec_name_c(self):`
`23`	`23`	`deftest_codec_name_d(self):`
`24`	`24`	`self.assertEqual(inputstream.codecName("ISO_8859--1"),"windows-1252")`
`25`	`25`
`26`		`-defrunEncodingTest(data,encoding):`
	`26`	`+defrunParserEncodingTest(data,encoding):`
`27`	`27`	`p=HTMLParser()`
`28`	`28`	`t=p.parse(data,useChardet=False)`
`29`	`29`	`encoding=encoding.lower().decode("ascii")`
`@@ -33,13 +33,27 @@ def runEncodingTest(data, encoding):`
`33`	`33`	`repr(p.tokenizer.stream.charEncoding[0])))`
`34`	`34`	`assertencoding==p.tokenizer.stream.charEncoding[0],errorMessage`
`35`	`35`
	`36`	`+`
	`37`	`+defrunPreScanEncodingTest(data,encoding):`
	`38`	`+stream=inputstream.HTMLBinaryInputStream(data,chardet=False)`
	`39`	`+encoding=encoding.lower().decode("ascii")`
	`40`	`+`
	`41`	`+iflen(data)>stream.numBytesMeta:`
	`42`	`+return`
	`43`	`+`
	`44`	`+errorMessage= ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n"%`
	`45`	`+ (data,repr(encoding),`
	`46`	`+repr(stream.charEncoding[0])))`
	`47`	`+assertencoding==stream.charEncoding[0],errorMessage`
	`48`	`+`
`36`	`49`	`deftest_encoding():`
`37`	`50`	`forfilenameinget_data_files("encoding"):`
`38`	`51`	`test_name=os.path.basename(filename).replace('.dat',''). \`
`39`	`52`	`replace('-','')`
`40`	`53`	`tests=TestData(filename,b"data",encoding=None)`
`41`	`54`	`foridx,testinenumerate(tests):`
`42`		`-yield (runEncodingTest,test[b'data'],test[b'encoding'])`
	`55`	`+yield (runParserEncodingTest,test[b'data'],test[b'encoding'])`
	`56`	`+yield (runPreScanEncodingTest,test[b'data'],test[b'encoding'])`
`43`	`57`
`44`	`58`	`try:`
`45`	`59`	`importchardet`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit db43ce2

File tree

3 files changed

3 files changed

`‎html5lib/html5parser.py‎`

`‎html5lib/inputstream.py‎`

`‎html5lib/tests/test_encoding.py‎`

0 commit comments