Commit d05f439

committed

Several changes related to character encoding: convert UTF-16 to UTF-8 if found in the pre-parse algorithm, allow chardet to be switched off, and start implementing reparsing if a <meta> is found during the actual parse (not yet complete).

--HG--extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401056

1 parent 84313a8 commit d05f439 Copy full SHA for d05f439

File tree

6 files changed

+36

-24

lines changed

src/html5lib
tests
- test_encoding.py
- test_stream.py

6 files changed

+36

-24

lines changed

`‎src/html5lib/html5parser.py‎`

Lines changed: 6 additions & 4 deletions

Original file line number	Diff line number	Diff line change
`@@ -78,14 +78,15 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,`
`78`	`78`	`}`
`79`	`79`
`80`	`80`	`def_parse(self,stream,innerHTML=False,container="div",`
`81`		`-encoding=None,**kwargs):`
	`81`	`+encoding=None,parseMeta=True,useChardet=True,**kwargs):`
`82`	`82`
`83`	`83`	`self.tree.reset()`
`84`	`84`	`self.firstStartTag=False`
`85`	`85`	`self.errors= []`
`86`	`86`
`87`	`87`	`self.tokenizer=self.tokenizer_class(stream,encoding=encoding,`
`88`		`-parseMeta=notinnerHTML,**kwargs)`
	`88`	`+parseMeta=parseMeta,`
	`89`	`+useChardet=useChardet,**kwargs)`
`89`	`90`
`90`	`91`	`ifinnerHTML:`
`91`	`92`	`self.innerHTML=container.lower()`
`@@ -131,7 +132,7 @@ def _parse(self, stream, innerHTML=False, container="div",`
`131`	`132`	`# When the loop finishes it's EOF`
`132`	`133`	`self.phase.processEOF()`
`133`	`134`
`134`		`-defparse(self,stream,encoding=None):`
	`135`	`+defparse(self,stream,encoding=None,parseMeta=True,useChardet=True):`
`135`	`136`	`"""Parse a HTML document into a well-formed tree`
`136`	`137`
`137`	`138`	`stream - a filelike object or string containing the HTML to be parsed`
`@@ -144,7 +145,8 @@ def parse(self, stream, encoding=None):`
`144`	`145`	`self._parse(stream,innerHTML=False,encoding=encoding)`
`145`	`146`	`returnself.tree.getDocument()`
`146`	`147`
`147`		`-defparseFragment(self,stream,container="div",encoding=None):`
	`148`	`+defparseFragment(self,stream,container="div",encoding=None,`
	`149`	`+parseMeta=False,useChardet=True):`
`148`	`150`	`"""Parse a HTML fragment into a well-formed tree fragment`
`149`	`151`
`150`	`152`	`container - name of the element we're setting the innerHTML property`

`‎src/html5lib/inputstream.py‎`

Lines changed: 19 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`38`	`38`	`# List of where new lines occur`
`39`	`39`	`self.newLines= [0]`
`40`	`40`
`41`		`-self.charEncoding=encoding`
	`41`	`+self.charEncoding=(encoding,"certian")`
`42`	`42`
`43`	`43`	`# Raw Stream - for unicode objects this will encode to utf-8 and set`
`44`	`44`	`# self.charEncoding as appropriate`
`@@ -54,11 +54,11 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`54`	`54`	`self.defaultEncoding="windows-1252"`
`55`	`55`
`56`	`56`	`#Detect encoding iff no explicit "transport level" encoding is supplied`
`57`		`-ifself.charEncodingisNoneornotisValidEncoding(self.charEncoding):`
	`57`	`+ifself.charEncoding[0]isNoneornotisValidEncoding(self.charEncoding[0]):`
`58`	`58`	`self.charEncoding=self.detectEncoding(parseMeta,chardet)`
`59`	`59`
`60`		`-self.dataStream=codecs.getreader(self.charEncoding)(self.rawStream,`
`61`		`-'replace')`
	`60`	`+self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,`
	`61`	`+'replace')`
`62`	`62`
`63`	`63`	`self.queue=deque([])`
`64`	`64`	`self.readChars= []`
`@@ -92,12 +92,15 @@ def detectEncoding(self, parseMeta=True, chardet=True):`
`92`	`92`	`#First look for a BOM`
`93`	`93`	`#This will also read past the BOM if present`
`94`	`94`	`encoding=self.detectBOM()`
	`95`	`+confidence="certain"`
`95`	`96`	`#If there is no BOM need to look for meta elements with encoding`
`96`	`97`	`#information`
`97`	`98`	`ifencodingisNoneandparseMeta:`
`98`	`99`	`encoding=self.detectEncodingMeta()`
	`100`	`+confidence="tentative"`
`99`	`101`	`#Guess with chardet, if avaliable`
`100`	`102`	`ifencodingisNoneandchardet:`
	`103`	`+confidence="tentative"`
`101`	`104`	`try:`
`102`	`105`	`fromchardet.universaldetectorimportUniversalDetector`
`103`	`106`	`buffers= []`
`@@ -115,6 +118,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):`
`115`	`118`	`pass`
`116`	`119`	`# If all else fails use the default encoding`
`117`	`120`	`ifencodingisNone:`
	`121`	`+confidence="tentative"`
`118`	`122`	`encoding=self.defaultEncoding`
`119`	`123`
`120`	`124`	`#Substitute for equivalent encodings:`
`@@ -123,7 +127,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):`
`123`	`127`	`ifencoding.lower()inencodingSub:`
`124`	`128`	`encoding=encodingSub[encoding.lower()]`
`125`	`129`
`126`		`-returnencoding`
	`130`	`+returnencoding,confidence`
`127`	`131`
`128`	`132`	`defdetectBOM(self):`
`129`	`133`	`"""Attempts to detect at BOM at the start of the stream. If`
`@@ -200,7 +204,8 @@ def detectEncodingMeta(self):`
`200`	`204`	`buffer=self.rawStream.read(self.numBytesMeta)`
`201`	`205`	`parser=EncodingParser(buffer)`
`202`	`206`	`self.seek(buffer,0)`
`203`		`-returnparser.getEncoding()`
	`207`	`+encoding=parser.getEncoding()`
	`208`	`+returnencoding`
`204`	`209`
`205`	`210`	`defupdatePosition(self):`
`206`	`211`	`#Remove EOF from readChars, if present`
`@@ -414,7 +419,12 @@ def getEncoding(self):`
`414`	`419`	`ifnotkeepParsing:`
`415`	`420`	`break`
`416`	`421`	`ifself.encodingisnotNone:`
`417`		`-self.encoding=self.encoding.strip()`
	`422`	`+self.encoding=self.encoding.strip()`
	`423`	`+#Spec violation that complies with hsivonen + mjs`
	`424`	`+ifself.encoding.upper()in ("UTF-16","UTF-16BE","UTF-16LE",`
	`425`	`+"UTF-32","UTF-32BE","UTF-32LE"):`
	`426`	`+self.encoding="utf-8"`
	`427`	`+`
`418`	`428`	`returnself.encoding`
`419`	`429`
`420`	`430`	`defhandleComment(self):`
`@@ -531,7 +541,7 @@ def getAttribute(self):`
`531`	`541`	`#11.5`
`532`	`542`	`else:`
`533`	`543`	`attrValue.extend(self.data.currentByte)`
`534`		`-elifself.data.currentBytein (">",'<'):`
	`544`	`+elifself.data.currentBytein (">","<"):`
`535`	`545`	`return"".join(attrName),""`
`536`	`546`	`elifself.data.currentByteinasciiUppercase:`
`537`	`547`	`attrValue.extend(self.data.currentByte.lower())`
`@@ -540,7 +550,7 @@ def getAttribute(self):`
`540`	`550`	`whileTrue:`
`541`	`551`	`self.data.position+=1`
`542`	`552`	`ifself.data.currentBytein (`
`543`		`-list(spaceCharacters)+ [">",'<']):`
	`553`	`+list(spaceCharacters)+ [">","<"]):`
`544`	`554`	`return"".join(attrName),"".join(attrValue)`
`545`	`555`	`elifself.data.currentByteinasciiUppercase:`
`546`	`556`	`attrValue.extend(self.data.currentByte.lower())`

`‎src/html5lib/sanitizer.py‎`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -188,11 +188,11 @@ def sanitize_css(self, style):`
`188`	`188`	`return' '.join(clean)`
`189`	`189`
`190`	`190`	`classHTMLSanitizer(HTMLTokenizer,HTMLSanitizerMixin):`
`191`		`-def__init__(self,stream,encoding=None,parseMeta=True,`
	`191`	`+def__init__(self,stream,encoding=None,parseMeta=True,useChardet=True,`
`192`	`192`	`lowercaseElementName=False,lowercaseAttrName=False):`
`193`	`193`	`#Change case matching defaults as we only output lowercase html anyway`
`194`	`194`	`#This solution doesn't seem ideal...`
`195`		`-HTMLTokenizer.__init__(self,stream,encoding,parseMeta,`
	`195`	`+HTMLTokenizer.__init__(self,stream,encoding,parseMeta,useChardet,`
`196`	`196`	`lowercaseElementName,lowercaseAttrName)`
`197`	`197`
`198`	`198`	`def__iter__(self):`

`‎src/html5lib/tokenizer.py‎`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -30,9 +30,9 @@ class HTMLTokenizer(object):`
`30`	`30`
`31`	`31`	`# XXX need to fix documentation`
`32`	`32`
`33`		`-def__init__(self,stream,encoding=None,parseMeta=True,`
	`33`	`+def__init__(self,stream,encoding=None,parseMeta=True,useChardet=True,`
`34`	`34`	`lowercaseElementName=True,lowercaseAttrName=True,):`
`35`		`-self.stream=HTMLInputStream(stream,encoding,parseMeta)`
	`35`	`+self.stream=HTMLInputStream(stream,encoding,parseMeta,useChardet)`
`36`	`36`
`37`	`37`	`#Perform case conversions?`
`38`	`38`	`self.lowercaseElementName=lowercaseElementName`

`‎tests/test_encoding.py‎`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@ def buildTestSuite():`
`16`	`16`	`foridx,testinenumerate(tests):`
`17`	`17`	`defencodingTest(self,data=test['data'],encoding=test['encoding']):`
`18`	`18`	`stream=inputstream.HTMLInputStream(data,chardet=False)`
`19`		`-self.assertEquals(encoding.lower(),stream.charEncoding)`
	`19`	`+self.assertEquals(encoding.lower(),stream.charEncoding[0])`
`20`	`20`	`setattr(Html5EncodingTestCase,'test_%s_%d'% (test_name,idx+1),`
`21`	`21`	`encodingTest)`
`22`	`22`
`@@ -25,7 +25,7 @@ def encodingTest(self, data=test['data'], encoding=test['encoding']):`
`25`	`25`	`deftest_chardet(self):`
`26`	`26`	`data=open(os.path.join(test_dir,"encoding" ,"chardet","test_big5.txt")).read()`
`27`	`27`	`encoding=inputstream.HTMLInputStream(data).charEncoding`
`28`		`-assertencoding.lower()=="big5"`
	`28`	`+assertencoding[0].lower()=="big5"`
`29`	`29`	`setattr(Html5EncodingTestCase,'test_chardet',test_chardet)`
`30`	`30`	`exceptImportError:`
`31`	`31`	`print"chardet not found, skipping chardet tests"`

`‎tests/test_stream.py‎`

Lines changed: 5 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ class HTMLInputStreamTest(unittest.TestCase):`
`7`	`7`
`8`	`8`	`deftest_char_ascii(self):`
`9`	`9`	`stream=HTMLInputStream("'",encoding='ascii')`
`10`		`-self.assertEquals(stream.charEncoding,'ascii')`
	`10`	`+self.assertEquals(stream.charEncoding[0],'ascii')`
`11`	`11`	`self.assertEquals(stream.char(),"'")`
`12`	`12`
`13`	`13`	`deftest_char_null(self):`
`@@ -16,24 +16,24 @@ def test_char_null(self):`
`16`	`16`
`17`	`17`	`deftest_char_utf8(self):`
`18`	`18`	`stream=HTMLInputStream(u'\u2018'.encode('utf-8'),encoding='utf-8')`
`19`		`-self.assertEquals(stream.charEncoding,'utf-8')`
	`19`	`+self.assertEquals(stream.charEncoding[0],'utf-8')`
`20`	`20`	`self.assertEquals(stream.char(),u'\u2018')`
`21`	`21`
`22`	`22`	`deftest_char_win1252(self):`
`23`	`23`	`stream=HTMLInputStream(u"\xa9\xf1\u2019".encode('windows-1252'))`
`24`		`-self.assertEquals(stream.charEncoding,'windows-1252')`
	`24`	`+self.assertEquals(stream.charEncoding[0],'windows-1252')`
`25`	`25`	`self.assertEquals(stream.char(),u"\xa9")`
`26`	`26`	`self.assertEquals(stream.char(),u"\xf1")`
`27`	`27`	`self.assertEquals(stream.char(),u"\u2019")`
`28`	`28`
`29`	`29`	`deftest_bom(self):`
`30`	`30`	`stream=HTMLInputStream(codecs.BOM_UTF8+"'")`
`31`		`-self.assertEquals(stream.charEncoding,'utf-8')`
	`31`	`+self.assertEquals(stream.charEncoding[0],'utf-8')`
`32`	`32`	`self.assertEquals(stream.char(),"'")`
`33`	`33`
`34`	`34`	`deftest_utf_16(self):`
`35`	`35`	`stream=HTMLInputStream((' '*1025).encode('utf-16'))`
`36`		`-self.assert_(stream.charEncodingin ['utf-16-le','utf-16-be'])`
	`36`	`+self.assert_(stream.charEncoding[0]in ['utf-16-le','utf-16-be'],stream.charEncoding)`
`37`	`37`	`self.assertEquals(len(stream.charsUntil(' ',True)),1025)`
`38`	`38`
`39`	`39`	`deftest_newlines(self):`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit d05f439

File tree

6 files changed

6 files changed

`‎src/html5lib/html5parser.py‎`

`‎src/html5lib/inputstream.py‎`

`‎src/html5lib/sanitizer.py‎`

`‎src/html5lib/tokenizer.py‎`

`‎tests/test_encoding.py‎`

`‎tests/test_stream.py‎`

0 commit comments