Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 2a4154e

Browse files
committed
Resync my tree with the trunk. Adds support for coercing trees to XML infosets, in particular for lxml (still need to wire up the tests), and some speed improvements in the parser. Big apologies for the large check-in; there are some regressions in the liberal XML parser and the sanitizer that need to be fixed.
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401248
1 parent ddfddb9 commit 2a4154e

17 files changed

+548
-229
lines changed

‎parse.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
#RELEASE remove
1313
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
1414
#END RELEASE
15-
fromhtml5libimporthtml5parser,liberalxmlparser
15+
fromhtml5libimporthtml5parser,liberalxmlparser,sanitizer,tokenizer
1616
fromhtml5libimporttreebuilders,serializer,treewalkers
1717
fromhtml5libimportconstants
1818

@@ -46,17 +46,23 @@ def parse():
4646

4747
treebuilder=treebuilders.getTreeBuilder(opts.treebuilder)
4848

49+
ifopts.sanitize:
50+
tokenizer=sanitizer.HTMLSanitizer
51+
else:
52+
tokenizer=HTMLTokenizer
53+
4954
ifopts.xml:
50-
p=liberalxmlparser.XHTMLParser(tree=treebuilder)
55+
p=liberalxmlparser.XHTMLParser(tree=treebuilder,tokenizer=tokenizer)
5156
else:
52-
p=html5parser.HTMLParser(tree=treebuilder)
57+
p=html5parser.HTMLParser(tree=treebuilder,tokenizer=tokenizer)
5358

5459
ifopts.fragment:
5560
parseMethod=p.parseFragment
5661
else:
5762
parseMethod=p.parse
5863

5964
ifopts.profile:
65+
#XXX should import cProfile instead and use that
6066
importhotshot
6167
importhotshot.stats
6268
prof=hotshot.Profile('stats.prof')

‎src/html5lib/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,5 +11,7 @@
1111
p = html5lib.HTMLParser()
1212
tree = p.parse(f)
1313
"""
14-
fromhtml5parserimportHTMLParser
14+
fromhtml5parserimportHTMLParser,parse
15+
fromtreebuildersimportgetTreeBuilder
16+
1517
fromliberalxmlparserimportXMLParser,XHTMLParser

‎src/html5lib/constants.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -370,7 +370,6 @@
370370
spaceCharacters=frozenset((
371371
u"\t",
372372
u"\n",
373-
u"\u000B",
374373
u"\u000C",
375374
u" ",
376375
u"\r"
@@ -1088,5 +1087,16 @@
10881087
'windows936':'gbk',
10891088
'x-x-big5':'big5'}
10901089

1090+
tokenTypes= {
1091+
"Doctype":0,
1092+
"Characters":1,
1093+
"SpaceCharacters":2,
1094+
"StartTag":3,
1095+
"EndTag":4,
1096+
"EmptyTag":5,
1097+
"Comment":6,
1098+
"ParseError":7
1099+
}
1100+
10911101
classDataLossWarning(UserWarning):
10921102
pass

‎src/html5lib/filters/validator.py

Lines changed: 35 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222
importiso639codes
2323
importrfc3987
2424
importrfc2046
25-
fromhtml5lib.constantsimportE,spaceCharacters,digits
25+
fromhtml5lib.constantsimportE,spaceCharacters,digits,tokenTypes
2626
fromhtml5libimporttokenizer
2727
importgettext
2828
_=gettext.gettext
@@ -267,8 +267,9 @@ def __init__(self, stream, encoding, parseMeta, **kwargs):
267267
self.IDsWeHaveKnownAndLoved= []
268268

269269
def__iter__(self):
270+
types=dict((v,k)fork,vintokenTypes.iteritems())
270271
fortokenin_base.Filter.__iter__(self):
271-
fakeToken= {"type":token.get("type","-"),
272+
fakeToken= {"type":types.get(token.get("type","-"),"-"),
272273
"name":token.get("name","-").capitalize()}
273274
method=getattr(self,"validate%(type)s%(name)s"%fakeToken,None)
274275
ifmethod:
@@ -301,23 +302,23 @@ def validateStartTagInput(self, token):
301302
attrDict=dict([(name.lower(),value)forname,valueintoken.get("data", [])])
302303
inputType=attrDict.get("type","text")
303304
ifinputTypenotininputTypeAllowedAttributeMap.keys():
304-
yield {"type":"ParseError",
305+
yield {"type":tokenTypes["ParseError"],
305306
"data":"unknown-input-type",
306307
"datavars": {"attrValue":inputType}}
307308
allowedAttributes=inputTypeAllowedAttributeMap.get(inputType, [])
308309
forattrName,attrValueinattrDict.items():
309310
ifattrNamenotinallowedAttributeMap['input']:
310-
yield {"type":"ParseError",
311+
yield {"type":tokenTypes["ParseError"],
311312
"data":"unknown-attribute",
312313
"datavars": {"tagName":"input",
313314
"attributeName":attrName}}
314315
elifattrNamenotinallowedAttributes:
315-
yield {"type":"ParseError",
316+
yield {"type":tokenTypes["ParseError"],
316317
"data":"attribute-not-allowed-on-this-input-type",
317318
"datavars": {"attributeName":attrName,
318319
"inputType":inputType}}
319320
ifattrNameininputTypeDeprecatedAttributeMap.get(inputType, []):
320-
yield {"type":"ParseError",
321+
yield {"type":tokenTypes["ParseError"],
321322
"data":"deprecated-attribute",
322323
"datavars": {"attributeName":attrName,
323324
"inputType":inputType}}
@@ -330,7 +331,7 @@ def checkUnknownStartTag(self, token):
330331
# check for recognized tag name
331332
name=token.get("name","").lower()
332333
ifnamenotinallowedAttributeMap.keys():
333-
yield {"type":"ParseError",
334+
yield {"type":tokenTypes["ParseError"],
334335
"data":"unknown-start-tag",
335336
"datavars": {"tagName":name}}
336337

@@ -342,7 +343,7 @@ def checkStartTagRequiredAttributes(self, token):
342343
intoken.get("data", [])]
343344
forattrNameinrequiredAttributeMap[name]:
344345
ifattrNamenotinattrsPresent:
345-
yield {"type":"ParseError",
346+
yield {"type":tokenTypes["ParseError"],
346347
"data":"missing-required-attribute",
347348
"datavars": {"tagName":name,
348349
"attributeName":attrName}}
@@ -353,7 +354,7 @@ def checkStartTagUnknownAttributes(self, token):
353354
allowedAttributes=globalAttributes|allowedAttributeMap.get(name,frozenset(()))
354355
forattrName,attrValueintoken.get("data", []):
355356
ifattrName.lower()notinallowedAttributes:
356-
yield {"type":"ParseError",
357+
yield {"type":tokenTypes["ParseError"],
357358
"data":"unknown-attribute",
358359
"datavars": {"tagName":name,
359360
"attributeName":attrName}}
@@ -365,40 +366,40 @@ def checkStartTagUnknownAttributes(self, token):
365366
# def checkURI(self, token, tagName, attrName, attrValue):
366367
# isValid, errorCode = rfc3987.isValidURI(attrValue)
367368
# if not isValid:
368-
# yield {"type": "ParseError",
369+
# yield {"type":tokenTypes["ParseError"],
369370
# "data": errorCode,
370371
# "datavars": {"tagName": tagName,
371372
# "attributeName": attrName}}
372-
# yield {"type": "ParseError",
373+
# yield {"type":tokenTypes["ParseError"],
373374
# "data": "invalid-attribute-value",
374375
# "datavars": {"tagName": tagName,
375376
# "attributeName": attrName}}
376377

377378
defcheckIRI(self,token,tagName,attrName,attrValue):
378379
isValid,errorCode=rfc3987.isValidIRI(attrValue)
379380
ifnotisValid:
380-
yield {"type":"ParseError",
381+
yield {"type":tokenTypes["ParseError"],
381382
"data":errorCode,
382383
"datavars": {"tagName":tagName,
383384
"attributeName":attrName}}
384-
yield {"type":"ParseError",
385+
yield {"type":tokenTypes["ParseError"],
385386
"data":"invalid-attribute-value",
386387
"datavars": {"tagName":tagName,
387388
"attributeName":attrName}}
388389

389390
defcheckID(self,token,tagName,attrName,attrValue):
390391
ifnotattrValue:
391-
yield {"type":"ParseError",
392+
yield {"type":tokenTypes["ParseError"],
392393
"data":"attribute-value-can-not-be-blank",
393394
"datavars": {"tagName":tagName,
394395
"attributeName":attrName}}
395396
forcinattrValue:
396397
ifcinspaceCharacters:
397-
yield {"type":"ParseError",
398+
yield {"type":tokenTypes["ParseError"],
398399
"data":"space-in-id",
399400
"datavars": {"tagName":tagName,
400401
"attributeName":attrName}}
401-
yield {"type":"ParseError",
402+
yield {"type":tokenTypes["ParseError"],
402403
"data":"invalid-attribute-value",
403404
"datavars": {"tagName":tagName,
404405
"attributeName":attrName}}
@@ -427,7 +428,7 @@ def checkTokenList(self, tagName, attrName, attrValue):
427428
valueDict= {}
428429
forcurrentValueinvalueList:
429430
ifvalueDict.has_key(currentValue):
430-
yield {"type":"ParseError",
431+
yield {"type":tokenTypes["ParseError"],
431432
"data":"duplicate-value-in-token-list",
432433
"datavars": {"tagName":tagName,
433434
"attributeName":attrName,
@@ -437,32 +438,32 @@ def checkTokenList(self, tagName, attrName, attrValue):
437438

438439
defcheckEnumeratedValue(self,token,tagName,attrName,attrValue,enumeratedValues):
439440
ifnotattrValueand (''notinenumeratedValues):
440-
yield {"type":"ParseError",
441+
yield {"type":tokenTypes["ParseError"],
441442
"data":"attribute-value-can-not-be-blank",
442443
"datavars": {"tagName":tagName,
443444
"attributeName":attrName}}
444445
return
445446
attrValue=attrValue.lower()
446447
ifattrValuenotinenumeratedValues:
447-
yield {"type":"ParseError",
448+
yield {"type":tokenTypes["ParseError"],
448449
"data":"invalid-enumerated-value",
449450
"datavars": {"tagName":tagName,
450451
"attributeName":attrName,
451452
"enumeratedValues":tuple(enumeratedValues)}}
452-
yield {"type":"ParseError",
453+
yield {"type":tokenTypes["ParseError"],
453454
"data":"invalid-attribute-value",
454455
"datavars": {"tagName":tagName,
455456
"attributeName":attrName}}
456457

457458
defcheckBoolean(self,token,tagName,attrName,attrValue):
458459
enumeratedValues=frozenset((attrName,''))
459460
ifattrValuenotinenumeratedValues:
460-
yield {"type":"ParseError",
461+
yield {"type":tokenTypes["ParseError"],
461462
"data":"invalid-boolean-value",
462463
"datavars": {"tagName":tagName,
463464
"attributeName":attrName,
464465
"enumeratedValues":tuple(enumeratedValues)}}
465-
yield {"type":"ParseError",
466+
yield {"type":tokenTypes["ParseError"],
466467
"data":"invalid-attribute-value",
467468
"datavars": {"tagName":tagName,
468469
"attributeName":attrName}}
@@ -471,7 +472,7 @@ def checkInteger(self, token, tagName, attrName, attrValue):
471472
sign=1
472473
numberString=''
473474
state='begin'# ('begin', 'initial-number', 'number', 'trailing-junk')
474-
error= {"type":"ParseError",
475+
error= {"type":tokenTypes["ParseError"],
475476
"data":"invalid-integer-value",
476477
"datavars": {"tagName":tagName,
477478
"attributeName":attrName,
@@ -503,7 +504,7 @@ def checkInteger(self, token, tagName, attrName, attrValue):
503504
elifstate=='trailing-junk':
504505
pass
505506
ifnotnumberString:
506-
yield {"type":"ParseError",
507+
yield {"type":tokenTypes["ParseError"],
507508
"data":"attribute-value-can-not-be-blank",
508509
"datavars": {"tagName":tagName,
509510
"attributeName":attrName}}
@@ -517,15 +518,15 @@ def checkBrowsingContext(self, token, tagName, attrName, attrValue):
517518
ifattrValue[0]!='_':return
518519
attrValue=attrValue.lower()
519520
ifattrValueinfrozenset(('_self','_parent','_top','_blank')):return
520-
yield {"type":"ParseError",
521+
yield {"type":tokenTypes["ParseError"],
521522
"data":"invalid-browsing-context",
522523
"datavars": {"tagName":tagName,
523524
"attributeName":attrName}}
524525

525526
defcheckLangCode(self,token,tagName,attrName,attrValue):
526527
ifnotattrValue:return# blank is OK
527528
ifnotiso639codes.isValidLangCode(attrValue):
528-
yield {"type":"ParseError",
529+
yield {"type":tokenTypes["ParseError"],
529530
"data":"invalid-lang-code",
530531
"datavars": {"tagName":tagName,
531532
"attributeName":attrName,
@@ -534,13 +535,13 @@ def checkLangCode(self, token, tagName, attrName, attrValue):
534535
defcheckMIMEType(self,token,tagName,attrName,attrValue):
535536
# XXX needs tests
536537
ifnotattrValue:
537-
yield {"type":"ParseError",
538+
yield {"type":tokenTypes["ParseError"],
538539
"data":"attribute-value-can-not-be-blank",
539540
"datavars": {"tagName":tagName,
540541
"attributeName":attrName}}
541542

542543
ifnotrfc2046.isValidMIMEType(attrValue):
543-
yield {"type":"ParseError",
544+
yield {"type":tokenTypes["ParseError"],
544545
"data":"invalid-mime-type",
545546
"datavars": {"tagName":tagName,
546547
"attributeName":attrName,
@@ -556,7 +557,7 @@ def checkLinkRelation(self, token, tagName, attrName, attrValue):
556557
allowedValues= (tagName=='link')andlinkRelValuesoraRelValues
557558
forcurrentValueinvalueList:
558559
ifcurrentValuenotinallowedValues:
559-
yield {"type":"ParseError",
560+
yield {"type":tokenTypes["ParseError"],
560561
"data":"invalid-rel",
561562
"datavars": {"tagName":tagName,
562563
"attributeName":attrName}}
@@ -593,7 +594,7 @@ def checkAttributeValues(self, token):
593594
defvalidateAttributeValueClass(self,token,tagName,attrName,attrValue):
594595
fortinself.checkTokenList(tagName,attrName,attrValue)or []:
595596
yieldt
596-
yield {"type":"ParseError",
597+
yield {"type":tokenTypes["ParseError"],
597598
"data":"invalid-attribute-value",
598599
"datavars": {"tagName":tagName,
599600
"attributeName":attrName}}
@@ -623,7 +624,7 @@ def validateAttributeValueId(self, token, tagName, attrName, attrValue):
623624
fortinself.checkID(token,tagName,attrName,attrValue)or []:yieldt
624625
ifnotattrValue:return
625626
ifattrValueinself.IDsWeHaveKnownAndLoved:
626-
yield {"type":"ParseError",
627+
yield {"type":tokenTypes["ParseError"],
627628
"data":"duplicate-id",
628629
"datavars": {"tagName":tagName}}
629630
self.IDsWeHaveKnownAndLoved.append(attrValue)
@@ -641,7 +642,7 @@ def validateAttributeValueTemplate(self, token, tagName, attrName, attrValue):
641642

642643
defvalidateAttributeValueHtmlXmlns(self,token,tagName,attrName,attrValue):
643644
ifattrValue!="http://www.w3.org/1999/xhtml":
644-
yield {"type":"ParseError",
645+
yield {"type":tokenTypes["ParseError"],
645646
"data":"invalid-root-namespace",
646647
"datavars": {"tagName":tagName,
647648
"attributeName":attrName}}
@@ -699,7 +700,7 @@ def eof(self):
699700
# hooray for obscure side effects!
700701
attrValue=attrsDict.get("contextmenu","")
701702
ifattrValueand (attrValuenotinself.IDsWeHaveKnownAndLoved):
702-
yield {"type":"ParseError",
703+
yield {"type":tokenTypes["ParseError"],
703704
"data":"id-does-not-exist",
704705
"datavars": {"tagName":tagName,
705706
"attributeName":"contextmenu",
@@ -710,6 +711,6 @@ def eof(self):
710711
ifnotid:continue
711712
ifid==attrValue:
712713
ifrefToken.get("name","").lower()!="menu":
713-
yield {"type":"ParseError",
714+
yield {"type":tokenTypes["ParseError"],
714715
"data":"contextmenu-must-point-to-menu"}
715716
break

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp