Commit 72f2169

committed

Fix #51: DataLossWarning hiding real exceptions in parser.

Previously we stopped parser tests once DataLossWarning was raised; we now let the parser run to completion before checking for raised warnings. The old behaviour hid several cases where real exceptions were raised after DataLossWarning was raised. This commit reveals these real exceptions in the test suite and fixes the current failure. Similar fixes are needed for the other tests.

1 parent 19e1b9b, commit 72f2169 (Copy full SHA for 72f2169)

File tree

3 files changed

+58

-27

lines changed

html5lib
- ihatexml.py
- tests
  - test_parser.py
- treebuilders
  - etree_lxml.py

3 files changed

+58

-27

lines changed

`‎html5lib/ihatexml.py‎`

Lines changed: 19 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -179,6 +179,9 @@ def escapeRegexp(string):`
`179`	`179`
`180`	`180`	nonXmlNameFirstBMPRegexp=re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020
\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
`181`	`181`
	`182`	`+# Simpler things`
	`183`	`+nonPubidCharRegexp=re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")`
	`184`	`+`
`182`	`185`
`183`	`186`	`classInfosetFilter(object):`
`184`	`187`	`replacementRegexp=re.compile(r"U[\dA-F]{5,5}")`
`@@ -188,7 +191,8 @@ def __init__(self, replaceChars=None,`
`188`	`191`	`dropXmlnsAttrNs=False,`
`189`	`192`	`preventDoubleDashComments=False,`
`190`	`193`	`preventDashAtCommentEnd=False,`
`191`		`-replaceFormFeedCharacters=True):`
	`194`	`+replaceFormFeedCharacters=True,`
	`195`	`+preventSingleQuotePubid=False):`
`192`	`196`
`193`	`197`	`self.dropXmlnsLocalName=dropXmlnsLocalName`
`194`	`198`	`self.dropXmlnsAttrNs=dropXmlnsAttrNs`
`@@ -198,6 +202,8 @@ def __init__(self, replaceChars=None,`
`198`	`202`
`199`	`203`	`self.replaceFormFeedCharacters=replaceFormFeedCharacters`
`200`	`204`
	`205`	`+self.preventSingleQuotePubid=preventSingleQuotePubid`
	`206`	`+`
`201`	`207`	`self.replaceCache= {}`
`202`	`208`
`203`	`209`	`defcoerceAttribute(self,name,namespace=None):`
`@@ -229,6 +235,17 @@ def coerceCharacters(self, data):`
`229`	`235`	`# Other non-xml characters`
`230`	`236`	`returndata`
`231`	`237`
	`238`	`+defcoercePubid(self,data):`
	`239`	`+dataOutput=data`
	`240`	`+forcharinnonPubidCharRegexp.findall(data):`
	`241`	`+warnings.warn("Coercing non-XML pubid",DataLossWarning)`
	`242`	`+replacement=self.getReplacementCharacter(char)`
	`243`	`+dataOutput=dataOutput.replace(char,replacement)`
	`244`	`+ifself.preventSingleQuotePubidanddataOutput.find("'")>=0:`
	`245`	`+warnings.warn("Pubid cannot contain single quote",DataLossWarning)`
	`246`	`+dataOutput=dataOutput.replace("'",self.getReplacementCharacter("'"))`
	`247`	`+returndataOutput`
	`248`	`+`
`232`	`249`	`deftoXmlName(self,name):`
`233`	`250`	`nameFirst=name[0]`
`234`	`251`	`nameRest=name[1:]`
`@@ -260,7 +277,7 @@ def fromXmlName(self, name):`
`260`	`277`	`returnname`
`261`	`278`
`262`	`279`	`defescapeChar(self,char):`
`263`		`-replacement="U"+hex(ord(char))[2:].upper().rjust(5,"0")`
	`280`	`+replacement="U%05X"%ord(char)`
`264`	`281`	`self.replaceCache[char]=replacement`
`265`	`282`	`returnreplacement`
`266`	`283`

`‎html5lib/tests/test_parser.py‎`

Lines changed: 15 additions & 18 deletions

Original file line number	Diff line number	Diff line change
`@@ -27,28 +27,25 @@ def convertTreeDump(data):`
`27`	`27`
`28`	`28`	`defrunParserTest(innerHTML,input,expected,errors,treeClass,`
`29`	`29`	`namespaceHTMLElements):`
`30`		`-warnings.resetwarnings()`
`31`		`-warnings.simplefilter("error")`
`32`		`-# XXX - move this out into the setup function`
`33`		`-# concatenate all consecutive character tokens into a single token`
`34`		`-try:`
	`30`	`+withwarnings.catch_warnings(record=True)asw:`
	`31`	`+warnings.simplefilter("always")`
`35`	`32`	`p=html5parser.HTMLParser(tree=treeClass,`
`36`	`33`	`namespaceHTMLElements=namespaceHTMLElements)`
`37`		`-exceptconstants.DataLossWarning:`
`38`		`-return`
`39`	`34`
`40`		`-try:`
`41`		`-ifinnerHTML:`
`42`		`-document=p.parseFragment(input,innerHTML)`
`43`		`-else:`
`44`		`-try:`
	`35`	`+try:`
	`36`	`+ifinnerHTML:`
	`37`	`+document=p.parseFragment(input,innerHTML)`
	`38`	`+else:`
`45`	`39`	`document=p.parse(input)`
`46`		`-exceptconstants.DataLossWarning:`
`47`		`-return`
`48`		`-except:`
`49`		`-errorMsg="\n".join(["\n\nInput:",input,"\nExpected:",expected,`
`50`		`-"\nTraceback:",traceback.format_exc()])`
`51`		`-assertFalse,errorMsg`
	`40`	`+except:`
	`41`	`+errorMsg="\n".join(["\n\nInput:",input,"\nExpected:",expected,`
	`42`	`+"\nTraceback:",traceback.format_exc()])`
	`43`	`+assertFalse,errorMsg`
	`44`	`+`
	`45`	`+otherW= [xforxinwifnotissubclass(x.category,constants.DataLossWarning)]`
	`46`	`+assertlen(otherW)==0, [(x.category,x.message)forxinotherW]`
	`47`	`+iflen(w):`
	`48`	`+return`
`52`	`49`
`53`	`50`	`output=convertTreeDump(p.tree.testSerializer(document))`
`54`	`51`

`‎html5lib/treebuilders/etree_lxml.py‎`

Lines changed: 24 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -291,11 +291,16 @@ def insertDoctype(self, token):`
`291`	`291`	`publicId=token["publicId"]`
`292`	`292`	`systemId=token["systemId"]`
`293`	`293`
`294`		`-ifnotnameorihatexml.nonXmlNameBMPRegexp.search(name)orname[0]=='"':`
`295`		`-warnings.warn("lxml cannot represent null or non-xml doctype",DataLossWarning)`
	`294`	`+ifnotname:`
	`295`	`+warnings.warn("lxml cannot represent empty doctype",DataLossWarning)`
	`296`	`+self.doctype=None`
	`297`	`+else:`
	`298`	`+coercedName=self.infosetFilter.coerceElement(name)`
	`299`	`+ifcoercedName!=name:`
	`300`	`+warnings.warn("lxml cannot represent non-xml doctype",DataLossWarning)`
`296`	`301`
`297`		`-doctype=self.doctypeClass(name,publicId,systemId)`
`298`		`-self.doctype=doctype`
	`302`	`+doctype=self.doctypeClass(coercedName,publicId,systemId)`
	`303`	`+self.doctype=doctype`
`299`	`304`
`300`	`305`	`definsertCommentInitial(self,data,parent=None):`
`301`	`306`	`self.initial_comments.append(data)`
`@@ -313,12 +318,24 @@ def insertRoot(self, token):`
`313`	`318`	`# Therefore we need to use the built-in parser to create our iniial`
`314`	`319`	`# tree, after which we can add elements like normal`
`315`	`320`	`docStr=""`
`316`		`-ifself.doctypeandself.doctype.nameandnotself.doctype.name.startswith('"'):`
	`321`	`+ifself.doctype:`
	`322`	`+assertself.doctype.name`
`317`	`323`	`docStr+="<!DOCTYPE %s"%self.doctype.name`
`318`	`324`	`if (self.doctype.publicIdisnotNoneor`
`319`	`325`	`self.doctype.systemIdisnotNone):`
`320`		`-docStr+=' PUBLIC "%s" "%s"'% (self.doctype.publicIdor"",`
`321`		`-self.doctype.systemIdor"")`
	`326`	`+docStr+= (' PUBLIC "%s" '%`
	`327`	`+ (self.infosetFilter.coercePubid(self.doctype.publicIdor"")))`
	`328`	`+ifself.doctype.systemId:`
	`329`	`+sysid=self.doctype.systemId`
	`330`	`+ifsysid.find("'")>=0andsysid.find('"')>=0:`
	`331`	`+warnings.warn("DOCTYPE system cannot contain single and double quotes",DataLossWarning)`
	`332`	`+sysid=sysid.replace("'",'U00027')`
	`333`	`+ifsysid.find("'")>=0:`
	`334`	`+docStr+='"%s"'%sysid`
	`335`	`+else:`
	`336`	`+docStr+="'%s'"%sysid`
	`337`	`+else:`
	`338`	`+docStr+="''"`
`322`	`339`	`docStr+=">"`
`323`	`340`	`ifself.doctype.name!=token["name"]:`
`324`	`341`	`warnings.warn("lxml cannot represent doctype with a different name to the root element",DataLossWarning)`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit72f2169

File tree

3 files changed

3 files changed

`‎html5lib/ihatexml.py‎`

`‎html5lib/tests/test_parser.py‎`

`‎html5lib/treebuilders/etree_lxml.py‎`

0 commit comments