Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 72f2169

Browse files
committed
Fix #51: DataLossWarning hiding real exceptions in parser.
Previously we stopped parser tests once DataLossWarning was raised; we now let the parser run to completion before checking for raised warnings. The old behaviour hid several cases where real exceptions were raised after DataLossWarning was raised. This commit reveals these real exceptions in the testsuite and fixes the current failure. Similar fixes are needed for the other tests.
1 parent 19e1b9b, commit 72f2169

File tree

3 files changed

+58
-27
lines changed

3 files changed

+58
-27
lines changed

‎html5lib/ihatexml.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ def escapeRegexp(string):
179179

180180
nonXmlNameFirstBMPRegexp=re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u304
0\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
181181

182+
# Simpler things
183+
nonPubidCharRegexp=re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
184+
182185

183186
classInfosetFilter(object):
184187
replacementRegexp=re.compile(r"U[\dA-F]{5,5}")
@@ -188,7 +191,8 @@ def __init__(self, replaceChars=None,
188191
dropXmlnsAttrNs=False,
189192
preventDoubleDashComments=False,
190193
preventDashAtCommentEnd=False,
191-
replaceFormFeedCharacters=True):
194+
replaceFormFeedCharacters=True,
195+
preventSingleQuotePubid=False):
192196

193197
self.dropXmlnsLocalName=dropXmlnsLocalName
194198
self.dropXmlnsAttrNs=dropXmlnsAttrNs
@@ -198,6 +202,8 @@ def __init__(self, replaceChars=None,
198202

199203
self.replaceFormFeedCharacters=replaceFormFeedCharacters
200204

205+
self.preventSingleQuotePubid=preventSingleQuotePubid
206+
201207
self.replaceCache= {}
202208

203209
defcoerceAttribute(self,name,namespace=None):
@@ -229,6 +235,17 @@ def coerceCharacters(self, data):
229235
# Other non-xml characters
230236
returndata
231237

238+
defcoercePubid(self,data):
239+
dataOutput=data
240+
forcharinnonPubidCharRegexp.findall(data):
241+
warnings.warn("Coercing non-XML pubid",DataLossWarning)
242+
replacement=self.getReplacementCharacter(char)
243+
dataOutput=dataOutput.replace(char,replacement)
244+
ifself.preventSingleQuotePubidanddataOutput.find("'")>=0:
245+
warnings.warn("Pubid cannot contain single quote",DataLossWarning)
246+
dataOutput=dataOutput.replace("'",self.getReplacementCharacter("'"))
247+
returndataOutput
248+
232249
deftoXmlName(self,name):
233250
nameFirst=name[0]
234251
nameRest=name[1:]
@@ -260,7 +277,7 @@ def fromXmlName(self, name):
260277
returnname
261278

262279
defescapeChar(self,char):
263-
replacement="U"+hex(ord(char))[2:].upper().rjust(5,"0")
280+
replacement="U%05X"%ord(char)
264281
self.replaceCache[char]=replacement
265282
returnreplacement
266283

‎html5lib/tests/test_parser.py

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,28 +27,25 @@ def convertTreeDump(data):
2727

2828
defrunParserTest(innerHTML,input,expected,errors,treeClass,
2929
namespaceHTMLElements):
30-
warnings.resetwarnings()
31-
warnings.simplefilter("error")
32-
# XXX - move this out into the setup function
33-
# concatenate all consecutive character tokens into a single token
34-
try:
30+
withwarnings.catch_warnings(record=True)asw:
31+
warnings.simplefilter("always")
3532
p=html5parser.HTMLParser(tree=treeClass,
3633
namespaceHTMLElements=namespaceHTMLElements)
37-
exceptconstants.DataLossWarning:
38-
return
3934

40-
try:
41-
ifinnerHTML:
42-
document=p.parseFragment(input,innerHTML)
43-
else:
44-
try:
35+
try:
36+
ifinnerHTML:
37+
document=p.parseFragment(input,innerHTML)
38+
else:
4539
document=p.parse(input)
46-
exceptconstants.DataLossWarning:
47-
return
48-
except:
49-
errorMsg="\n".join(["\n\nInput:",input,"\nExpected:",expected,
50-
"\nTraceback:",traceback.format_exc()])
51-
assertFalse,errorMsg
40+
except:
41+
errorMsg="\n".join(["\n\nInput:",input,"\nExpected:",expected,
42+
"\nTraceback:",traceback.format_exc()])
43+
assertFalse,errorMsg
44+
45+
otherW= [xforxinwifnotissubclass(x.category,constants.DataLossWarning)]
46+
assertlen(otherW)==0, [(x.category,x.message)forxinotherW]
47+
iflen(w):
48+
return
5249

5350
output=convertTreeDump(p.tree.testSerializer(document))
5451

‎html5lib/treebuilders/etree_lxml.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -291,11 +291,16 @@ def insertDoctype(self, token):
291291
publicId=token["publicId"]
292292
systemId=token["systemId"]
293293

294-
ifnotnameorihatexml.nonXmlNameBMPRegexp.search(name)orname[0]=='"':
295-
warnings.warn("lxml cannot represent null or non-xml doctype",DataLossWarning)
294+
ifnotname:
295+
warnings.warn("lxml cannot represent empty doctype",DataLossWarning)
296+
self.doctype=None
297+
else:
298+
coercedName=self.infosetFilter.coerceElement(name)
299+
ifcoercedName!=name:
300+
warnings.warn("lxml cannot represent non-xml doctype",DataLossWarning)
296301

297-
doctype=self.doctypeClass(name,publicId,systemId)
298-
self.doctype=doctype
302+
doctype=self.doctypeClass(coercedName,publicId,systemId)
303+
self.doctype=doctype
299304

300305
definsertCommentInitial(self,data,parent=None):
301306
self.initial_comments.append(data)
@@ -313,12 +318,24 @@ def insertRoot(self, token):
313318
# Therefore we need to use the built-in parser to create our iniial
314319
# tree, after which we can add elements like normal
315320
docStr=""
316-
ifself.doctypeandself.doctype.nameandnotself.doctype.name.startswith('"'):
321+
ifself.doctype:
322+
assertself.doctype.name
317323
docStr+="<!DOCTYPE %s"%self.doctype.name
318324
if (self.doctype.publicIdisnotNoneor
319325
self.doctype.systemIdisnotNone):
320-
docStr+=' PUBLIC "%s" "%s"'% (self.doctype.publicIdor"",
321-
self.doctype.systemIdor"")
326+
docStr+= (' PUBLIC "%s" '%
327+
(self.infosetFilter.coercePubid(self.doctype.publicIdor"")))
328+
ifself.doctype.systemId:
329+
sysid=self.doctype.systemId
330+
ifsysid.find("'")>=0andsysid.find('"')>=0:
331+
warnings.warn("DOCTYPE system cannot contain single and double quotes",DataLossWarning)
332+
sysid=sysid.replace("'",'U00027')
333+
ifsysid.find("'")>=0:
334+
docStr+='"%s"'%sysid
335+
else:
336+
docStr+="'%s'"%sysid
337+
else:
338+
docStr+="''"
322339
docStr+=">"
323340
ifself.doctype.name!=token["name"]:
324341
warnings.warn("lxml cannot represent doctype with a different name to the root element",DataLossWarning)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp