2121import _base
2222import iso639codes
2323import rfc3987
24+ import rfc2046
2425from html5lib .constants import E ,spaceCharacters ,digits
2526from html5lib import tokenizer
2627import gettext
6566_ (u"Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted." ),
6667"invalid-browsing-context" :
6768_ (u"Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_': '%(attributeName)s' attribute on <%(tagName)s>." ),
69+ "invalid-tag-uri" :
70+ _ (u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>." ),
71+ "invalid-urn" :
72+ _ (u"Invalid URN: '%(attributeName)s' attribute on <%(tagName)s>." ),
73+ "invalid-uri-char" :
74+ _ (u"Illegal character in URI: '%(attributeName)s' attribute on <%(tagName)s>." ),
75+ "uri-not-iri" :
76+ _ (u"Expected a URI but found an IRI: '%(attributeName)s' attribute on <%(tagName)s>." ),
77+ "invalid-uri" :
78+ _ (u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>." ),
79+ "invalid-http-or-ftp-uri" :
80+ _ (u"Invalid URI: '%(attributeName)s' attribute on <%(tagName)s>." ),
81+ "invalid-scheme" :
82+ _ (u"Unregistered URI scheme: '%(attributeName)s' attribute on <%(tagName)s>." ),
83+ "invalid-rel" :
84+ _ (u"Invalid link relation: '%(attributeName)s' attribute on <%(tagName)s>." ),
85+ "invalid-mime-type" :
86+ _ (u"Invalid MIME type: '%(attributeName)s' attribute on <%(tagName)s>." ),
6887})
6988
7089globalAttributes = frozenset (('class' ,'contenteditable' ,'contextmenu' ,'dir' ,
236255'password' :frozenset (('size' ,))
237256}
238257
# Link relation keywords accepted in the rel attribute of <link>.
linkRelValues = frozenset([
    'alternate', 'archive', 'archives', 'author', 'contact', 'feed',
    'first', 'begin', 'start', 'help', 'icon', 'index', 'top', 'contents',
    'toc', 'last', 'end', 'license', 'copyright', 'next', 'pingback',
    'prefetch', 'prev', 'previous', 'search', 'stylesheet', 'sidebar',
    'tag', 'up',
])

# Link relation keywords accepted in the rel attribute of any other element
# checked by checkLinkRelation (currently <a>).
aRelValues = frozenset([
    'alternate', 'archive', 'archives', 'author', 'contact', 'feed',
    'first', 'begin', 'start', 'help', 'index', 'top', 'contents', 'toc',
    'last', 'end', 'license', 'copyright', 'next', 'prev', 'previous',
    'search', 'sidebar', 'tag', 'up', 'bookmark', 'external', 'nofollow',
])
260+
239261class HTMLConformanceChecker (_base .Filter ):
240262def __init__ (self ,stream ,encoding ,parseMeta ,** kwargs ):
241263_base .Filter .__init__ (self ,tokenizer .HTMLTokenizer (
@@ -340,17 +362,17 @@ def checkStartTagUnknownAttributes(self, token):
340362# Attribute validation helpers
341363##########################################################################
342364
343- def checkURI (self ,token ,tagName ,attrName ,attrValue ):
344- isValid ,errorCode = rfc3987 .isValidURI (attrValue )
345- if not isValid :
346- yield {"type" :"ParseError" ,
347- "data" :errorCode ,
348- "datavars" : {"tagName" :tagName ,
349- "attributeName" :attrName }}
350- yield {"type" :"ParseError" ,
351- "data" :"invalid-attribute-value" ,
352- "datavars" : {"tagName" :tagName ,
353- "attributeName" :attrName }}
365+ # def checkURI(self, token, tagName, attrName, attrValue):
366+ # isValid, errorCode = rfc3987.isValidURI(attrValue)
367+ # if not isValid:
368+ # yield {"type": "ParseError",
369+ # "data": errorCode,
370+ # "datavars": {"tagName": tagName,
371+ # "attributeName": attrName}}
372+ # yield {"type": "ParseError",
373+ # "data": "invalid-attribute-value",
374+ # "datavars": {"tagName": tagName,
375+ # "attributeName": attrName}}
354376
355377def checkIRI (self ,token ,tagName ,attrName ,attrValue ):
356378isValid ,errorCode = rfc3987 .isValidIRI (attrValue )
@@ -382,26 +404,36 @@ def checkID(self, token, tagName, attrName, attrValue):
382404"attributeName" :attrName }}
383405break
384406
385- def checkTokenList (self ,tagName ,attrName ,attrValue ):
386- # The "token" in the method name refers to tokens in an attribute value
387- # i.e. http://www.whatwg.org/specs/web-apps/current-work/#set-of
388- # but the "token" parameter refers to the token generated from
389- # HTMLTokenizer. Sorry for the confusion.
def parseTokenList(self, value):
    """Split ``value`` into the tokens separated by space characters.

    A token is a maximal run of characters that are not in
    ``spaceCharacters``.  Empty tokens are never produced, and repeated
    tokens are kept, in their original order.

    Returns a list of strings.
    """
    tokens = []
    pending = []
    # A single space is appended so the loop itself flushes the last token.
    for ch in value + ' ':
        if ch not in spaceCharacters:
            pending.append(ch)
        elif pending:
            tokens.append(''.join(pending))
            pending = []
    # NOTE(review): this flush can only fire if ' ' is not a member of
    # spaceCharacters -- presumably never; confirm before removing.
    if pending:
        tokens.append(''.join(pending))
    return tokens
420+
def checkTokenList(self, tagName, attrName, attrValue):
    """Report the first duplicated token in a space-separated attribute value.

    Yields a single "duplicate-value-in-token-list" ParseError for the
    first token that repeats an earlier one, then stops; yields nothing
    when all tokens are distinct.
    """
    # The "token" in the method name refers to tokens in an attribute value
    # i.e. http://www.whatwg.org/specs/web-apps/current-work/#set-of
    # but the "token" parameter refers to the token generated from
    # HTMLTokenizer.  Sorry for the confusion.
    seenValues = set()
    for currentValue in self.parseTokenList(attrValue):
        # Membership test with "in" instead of dict.has_key(), which only
        # exists on Python 2 dictionaries.
        if currentValue in seenValues:
            yield {"type": "ParseError",
                   "data": "duplicate-value-in-token-list",
                   "datavars": {"tagName": tagName,
                                "attributeName": attrName,
                                "attributeValue": currentValue}}
            # Only the first duplicate is reported.
            break
        seenValues.add(currentValue)
405437
406438def checkEnumeratedValue (self ,token ,tagName ,attrName ,attrValue ,enumeratedValues ):
407439if not attrValue and ('' not in enumeratedValues ):
@@ -422,7 +454,7 @@ def checkEnumeratedValue(self, token, tagName, attrName, attrValue, enumeratedVa
422454"datavars" : {"tagName" :tagName ,
423455"attributeName" :attrName }}
424456
425- def checkBooleanValue (self ,token ,tagName ,attrName ,attrValue ):
457+ def checkBoolean (self ,token ,tagName ,attrName ,attrValue ):
426458enumeratedValues = frozenset ((attrName ,'' ))
427459if attrValue not in enumeratedValues :
428460yield {"type" :"ParseError" ,
@@ -435,7 +467,7 @@ def checkBooleanValue(self, token, tagName, attrName, attrValue):
435467"datavars" : {"tagName" :tagName ,
436468"attributeName" :attrName }}
437469
438- def checkIntegerValue (self ,token ,tagName ,attrName ,attrValue ):
470+ def checkInteger (self ,token ,tagName ,attrName ,attrValue ):
439471sign = 1
440472numberString = ''
441473state = 'begin' # ('begin', 'initial-number', 'number', 'trailing-junk')
@@ -476,6 +508,10 @@ def checkIntegerValue(self, token, tagName, attrName, attrValue):
476508"datavars" : {"tagName" :tagName ,
477509"attributeName" :attrName }}
478510
def checkFloatingPointNumber(self, token, tagName, attrName, attrValue):
    """Placeholder for floating point number validation.

    No checks are performed yet.  Returns None, so callers must not assume
    an iterable result.
    """
    # XXX not implemented
    return None
514+
479515def checkBrowsingContext (self ,token ,tagName ,attrName ,attrValue ):
480516if not attrValue :return
481517if attrValue [0 ]!= '_' :return
@@ -486,6 +522,56 @@ def checkBrowsingContext(self, token, tagName, attrName, attrValue):
486522"datavars" : {"tagName" :tagName ,
487523"attributeName" :attrName }}
488524
def checkLangCode(self, token, tagName, attrName, attrValue):
    """Validate a language code attribute value.

    A blank value is accepted.  Any other value rejected by
    iso639codes.isValidLangCode yields one "invalid-lang-code" ParseError.
    """
    if attrValue and not iso639codes.isValidLangCode(attrValue):
        errorVars = {"tagName": tagName,
                     "attributeName": attrName,
                     "attributeValue": attrValue}
        yield {"type": "ParseError",
               "data": "invalid-lang-code",
               "datavars": errorVars}
533+
def checkMIMEType(self, token, tagName, attrName, attrValue):
    """Validate a MIME type attribute value.

    Yields "attribute-value-can-not-be-blank" when the value is empty.
    Otherwise yields "invalid-mime-type" when rfc2046.isValidMIMEType
    rejects the value.  At most one ParseError is produced.
    """
    # XXX needs tests
    if not attrValue:
        yield {"type": "ParseError",
               "data": "attribute-value-can-not-be-blank",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName}}
        # A blank value has already been reported; do not also run it
        # through the MIME type validator and report it a second time.
        return

    if not rfc2046.isValidMIMEType(attrValue):
        yield {"type": "ParseError",
               "data": "invalid-mime-type",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName,
                            "attributeValue": attrValue}}
548+
def checkMediaQuery(self, token, tagName, attrName, attrValue):
    """Placeholder for media query validation.

    No checks are performed yet.  Returns None, so callers must not assume
    an iterable result.
    """
    # XXX not implemented
    return None
552+
def checkLinkRelation(self, token, tagName, attrName, attrValue):
    """Validate a space-separated list of link relation keywords.

    First yields whatever checkTokenList reports for the value, then
    yields one "invalid-rel" ParseError for every token that is not an
    allowed keyword.  <link> is checked against linkRelValues; every other
    tag name is checked against aRelValues.
    """
    for error in self.checkTokenList(tagName, attrName, attrValue) or []:
        yield error

    if tagName == 'link':
        allowedValues = linkRelValues
    else:
        allowedValues = aRelValues

    for relation in self.parseTokenList(attrValue):
        if relation in allowedValues:
            continue
        yield {"type": "ParseError",
               "data": "invalid-rel",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName}}
563+
def checkDateTime(self, token, tagName, attrName, attrValue):
    """Placeholder for date/time validation.

    No checks are performed yet.  Returns None, so callers must not assume
    an iterable result.
    """
    # XXX not implemented.  Sketch of the intended state machine:
    #
    # for c in attrValue:
    #     if state == 'begin':
    #         if c in spaceCharacters:
    #             continue
    #         elif c in digits:
    #             state = ...
    state = 'begin'  # ('begin', '...
    return None
573+
574+
489575##########################################################################
490576# Attribute validation
491577##########################################################################
@@ -521,17 +607,8 @@ def validateAttributeValueDir(self, token, tagName, attrName, attrValue):
def validateAttributeValueDraggable(self, token, tagName, attrName, attrValue):
    """Validate the attribute as an enumerated value of 'true' or 'false'.

    Re-yields every error produced by checkEnumeratedValue.
    """
    allowedValues = frozenset(('true', 'false'))
    errors = self.checkEnumeratedValue(token, tagName, attrName, attrValue,
                                       allowedValues)
    # Tolerate a checker that returns None instead of an iterable.
    for t in errors or []:
        yield t
523609
524- def validateAttributeValueIrrelevant (self ,token ,tagName ,attrName ,attrValue ):
525- for t in self .checkBooleanValue (token ,tagName ,attrName ,attrValue )or []:yield t
526-
527- def validateAttributeValueLang (self ,token ,tagName ,attrName ,attrValue ):
528- if not attrValue :return # blank is OK
529- if not iso639codes .isValidLangCode (attrValue ):
530- yield {"type" :"ParseError" ,
531- "data" :"invalid-lang-code" ,
532- "datavars" : {"tagName" :tagName ,
533- "attributeName" :attrName ,
534- "attributeValue" :attrValue }}
610+ validateAttributeValueIrrelevant = checkBoolean
611+ validateAttributeValueLang = checkLangCode
535612
536613def validateAttributeValueContextmenu (self ,token ,tagName ,attrName ,attrValue ):
537614for t in self .checkID (token ,tagName ,attrName ,attrValue )or []:yield t
@@ -552,7 +629,7 @@ def validateAttributeValueId(self, token, tagName, attrName, attrValue):
552629self .IDsWeHaveKnownAndLoved .append (attrValue )
553630self .thingsThatDefineAnID .append (token )
554631
555- validateAttributeValueTabindex = checkIntegerValue
632+ validateAttributeValueTabindex = checkInteger
556633
557634def validateAttributeValueRef (self ,token ,tagName ,attrName ,attrValue ):
558635# XXX
@@ -569,13 +646,47 @@ def validateAttributeValueHtmlXmlns(self, token, tagName, attrName, attrValue):
569646"datavars" : {"tagName" :tagName ,
570647"attributeName" :attrName }}
571648
572- def validateAttributeValueBaseHref (self ,token ,tagName ,attrName ,attrValue ):
573- # XXX
574- pass
575-
576649validateAttributeValueBaseHref = checkIRI
577650validateAttributeValueBaseTarget = checkBrowsingContext
578651validateAttributeValueLinkHref = checkIRI
652+ validateAttributeValueLinkRel = checkLinkRelation
653+ validateAttributeValueLinkMedia = checkMediaQuery
654+ validateAttributeValueLinkHreflang = checkLangCode
655+ validateAttributeValueLinkType = checkMIMEType
656+ # XXX <meta> attributes
657+ validateAttributeValueStyleMedia = checkMediaQuery
658+ validateAttributeValueStyleType = checkMIMEType
659+ validateAttributeValueStyleScoped = checkBoolean
660+ validateAttributeValueBlockquoteCite = checkIRI
661+ validateAttributeValueOlStart = checkInteger
662+ validateAttributeValueLiValue = checkInteger
663+ # XXX need tests from here on
664+ validateAttributeValueAHref = checkIRI
665+ validateAttributeValueATarget = checkBrowsingContext
666+
def validateAttributeValueAPing(self, token, tagName, attrName, attrValue):
    """Validate each space-separated token of the attribute value as an IRI.

    Splits the value with parseTokenList and re-yields every error that
    checkIRI reports for each individual token.
    """
    for currentValue in self.parseTokenList(attrValue):
        # Check the individual token, not the whole attribute value:
        # passing attrValue here validated the unsplit list once per token.
        for t in self.checkIRI(token, tagName, attrName, currentValue) or []:
            yield t
671+
672+ validateAttributeValueARel = checkLinkRelation
673+ validateAttributeValueAMedia = checkMediaQuery
674+ validateAttributeValueAHreflang = checkLangCode
675+ validateAttributeValueAType = checkMIMEType
676+ validateAttributeValueQCite = checkIRI
677+ validateAttributeValueTimeDatetime = checkDateTime
678+ validateAttributeValueMeterValue = checkFloatingPointNumber
679+ validateAttributeValueMeterMin = checkFloatingPointNumber
680+ validateAttributeValueMeterLow = checkFloatingPointNumber
681+ validateAttributeValueMeterHigh = checkFloatingPointNumber
682+ validateAttributeValueMeterMax = checkFloatingPointNumber
683+ validateAttributeValueMeterOptimum = checkFloatingPointNumber
684+ validateAttributeValueProgressValue = checkFloatingPointNumber
685+ validateAttributeValueProgressMax = checkFloatingPointNumber
686+ validateAttributeValueInsCite = checkIRI
687+ validateAttributeValueInsDatetime = checkDateTime
688+ validateAttributeValueDelCite = checkIRI
689+ validateAttributeValueDelDatetime = checkDateTime
579690
580691##########################################################################
581692# Whole document validation (IDs, etc.)