99>>> p.parse('<!doctype html>\n <html foo=bar></html>')
1010<<class 'html5lib.treebuilders.simpletree.Document'> None>
1111>>> p.errors
12- [((2, 14), 'unrecognized -attribute', {'attributeName': u'foo', 'tagName': u'html'})]
12+ [((2, 14), 'unknown -attribute', {'attributeName': u'foo', 'tagName': u'html'})]
1313"""
1414
15+ try :
16+ frozenset
17+ except NameError :
18+ # Import from the sets module for python 2.3
19+ from sets import Set as set
20+ from sets import ImmutableSet as frozenset
1521import _base
1622from html5lib .constants import E
1723from html5lib import tokenizer
1824import gettext
1925_ = gettext .gettext
2026
# Register the human-readable messages for the error codes this filter emits.
# The %(...)s placeholders are presumably filled from the "datavars" dict of
# the corresponding ParseError token, so the names used here must match the
# keys supplied where each error is yielded.
E.update({
    "unknown-start-tag":
        _(u"Unknown start tag <%(tagName)s>"),
    "unknown-attribute":
        _(u"Unknown '%(attributeName)s' attribute on <%(tagName)s>"),
    "missing-required-attribute":
        _(u"Missing required '%(attributeName)s' attribute on <%(tagName)s>"),
    "unknown-input-type":
        _(u"Unknown value for input type: '%(inputType)s'"),
    "attribute-not-allowed-on-this-input-type":
        _(u"'%(attributeName)s' attribute is not allowed on <input type='%(inputType)s'>"),
})
2739
# Attributes that are accepted on every element, whatever its tag name.
globalAttributes = frozenset([
    # core attributes
    'class', 'contenteditable', 'contextmenu', 'dir', 'draggable', 'id',
    'irrelevant', 'lang', 'ref', 'tabindex', 'template', 'title',
    # event handler attributes
    'onabort', 'onbeforeunload', 'onblur', 'onchange', 'onclick',
    'oncontextmenu', 'ondblclick', 'ondrag', 'ondragend', 'ondragenter',
    'ondragleave', 'ondragover', 'ondragstart', 'ondrop', 'onerror',
    'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload', 'onmessage',
    'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup',
    'onmousewheel', 'onresize', 'onscroll', 'onselect', 'onsubmit',
    'onunload',
])
# XXX lang in HTML only, xml:lang in XHTML only
3749
# Element-specific attributes, keyed by lower-case tag name.  These are
# accepted in addition to globalAttributes.  A tag name that is missing from
# this map is what checkUnknownStartTag reports as "unknown-start-tag".
allowedAttributeMap = {
    'html': frozenset(['xmlns']),
    'head': frozenset(),
    'title': frozenset(),
    'base': frozenset(['href', 'target']),
    'link': frozenset(['href', 'rel', 'media', 'hreflang', 'type']),
    # XXX charset in HTML only
    'meta': frozenset(['name', 'http-equiv', 'content', 'charset']),
    'style': frozenset(['media', 'type', 'scoped']),
    'body': frozenset(),
    'section': frozenset(),
    'nav': frozenset(),
    'article': frozenset(),
    'blockquote': frozenset(['cite']),
    'aside': frozenset(),
    'h1': frozenset(),
    'h2': frozenset(),
    'h3': frozenset(),
    'h4': frozenset(),
    'h5': frozenset(),
    'h6': frozenset(),
    'header': frozenset(),
    'footer': frozenset(),
    'address': frozenset(),
    'p': frozenset(),
    'hr': frozenset(),
    'br': frozenset(),
    'dialog': frozenset(),
    'pre': frozenset(),
    'ol': frozenset(['start']),
    'ul': frozenset(),
    # XXX depends on parent
    'li': frozenset(['value']),
    'dl': frozenset(),
    'dt': frozenset(),
    'dd': frozenset(),
    'a': frozenset(['href', 'target', 'ping', 'rel', 'media', 'hreflang',
                    'type']),
    'q': frozenset(['cite']),
    'cite': frozenset(),
    'em': frozenset(),
    'strong': frozenset(),
    'small': frozenset(),
    'm': frozenset(),
    'dfn': frozenset(),
    'abbr': frozenset(),
    'time': frozenset(['datetime']),
    'meter': frozenset(['value', 'min', 'low', 'high', 'max', 'optimum']),
    'progress': frozenset(['value', 'max']),
    'code': frozenset(),
    'var': frozenset(),
    'samp': frozenset(),
    'kbd': frozenset(),
    'sup': frozenset(),
    'sub': frozenset(),
    'span': frozenset(),
    'i': frozenset(),
    'b': frozenset(),
    'bdo': frozenset(),
    'ins': frozenset(['cite', 'datetime']),
    'del': frozenset(['cite', 'datetime']),
    'figure': frozenset(),
    # XXX ismap depends on parent
    'img': frozenset(['alt', 'src', 'usemap', 'ismap', 'height', 'width']),
    'iframe': frozenset(['src']),
    # <embed> handled separately
    'object': frozenset(['data', 'type', 'usemap', 'height', 'width']),
    'param': frozenset(['name', 'value']),
    'video': frozenset(['src', 'autoplay', 'start', 'loopstart', 'loopend',
                        'end', 'loopcount', 'controls']),
    'audio': frozenset(['src', 'autoplay', 'start', 'loopstart', 'loopend',
                        'end', 'loopcount', 'controls']),
    'source': frozenset(['src', 'type', 'media']),
    'canvas': frozenset(['height', 'width']),
    'map': frozenset(),
    'area': frozenset(['alt', 'coords', 'shape', 'href', 'target', 'ping',
                       'rel', 'media', 'hreflang', 'type']),
    'table': frozenset(),
    'caption': frozenset(),
    # XXX only if element contains no <col> elements
    'colgroup': frozenset(['span']),
    'col': frozenset(['span']),
    'tbody': frozenset(),
    'thead': frozenset(),
    'tfoot': frozenset(),
    'tr': frozenset(),
    'td': frozenset(['colspan', 'rowspan']),
    'th': frozenset(['colspan', 'rowspan', 'scope']),
    # 'form': frozenset(('action', 'method', 'enctype', 'accept', 'name', 'onsubmit',
    #                    'onreset', 'accept-charset', 'data', 'replace')),
    # all possible <input> attributes are listed here but <input> is really
    # handled separately
    'input': frozenset(['accept', 'accesskey', 'action', 'alt',
                        'autocomplete', 'autofocus', 'checked', 'disabled',
                        'enctype', 'form', 'inputmode', 'list', 'maxlength',
                        'method', 'min', 'max', 'name', 'pattern', 'step',
                        'readonly', 'replace', 'required', 'size', 'src',
                        'tabindex', 'target', 'template', 'value']),
    # 'button': frozenset(('name', 'value', 'type', 'disabled', 'form', 'autofocus')),
    # 'select': frozenset(('name', 'size', 'multiple', 'disabled', 'data', 'accesskey',
    #                      'form', 'autofocus')),
    # 'optgroup': frozenset(('disabled', 'label', 'form', 'autofocus')),
    # 'option': frozenset(('selected', 'disabled', 'label', 'value', 'form', 'autofocus')),
    # 'textarea': frozenset(('name', 'rows', 'cols', 'disabled', 'readonly', 'required',
    #                        'form', 'autofocus', 'wrap', 'accept')),
    # 'label': frozenset(('for', 'accesskey', 'form')),
    # 'fieldset': frozenset(('disabled', 'form')),
    # 'output': frozenset(('form', 'name', 'for', 'onforminput', 'onformchange')),
    # 'datalist': frozenset(('data',)),
    # # XXX repetition model for repeating form controls
    'script': frozenset(['src', 'defer', 'async', 'type']),
    'noscript': frozenset(),
    'noembed': frozenset(),
    'event-source': frozenset(['src']),
    'details': frozenset(['open']),
    'datagrid': frozenset(['multiple', 'disabled']),
    'command': frozenset(['type', 'label', 'icon', 'hidden', 'disabled',
                          'checked', 'radiogroup', 'default']),
    'menu': frozenset(['type', 'label', 'autosubmit']),
    'datatemplate': frozenset(),
    'rule': frozenset(),
    'nest': frozenset(),
    'legend': frozenset(),
    'div': frozenset(),
    'font': frozenset(['style']),
}
80165
# Attributes that must be present on a start tag, keyed by lower-case tag
# name.  Elements that are not listed have no required attributes.
requiredAttributeMap = {
    'link': frozenset(['href', 'rel']),
    'bdo': frozenset(['dir']),
    'img': frozenset(['src']),
    'embed': frozenset(['src']),
    # XXX one of 'data' or 'type' is required
    'object': frozenset(),
    'param': frozenset(['name', 'value']),
    'source': frozenset(['src']),
    'map': frozenset(['id']),
}
176+
# Attributes permitted on <input>, keyed by the value of its type attribute.
# Types with a set of their own are spelled out first; types whose sets are
# identical are then added in groups that share one (immutable) frozenset.
inputTypeAllowedAttributeMap = {
    'text': frozenset([
        'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
        'inputmode', 'list', 'maxlength', 'name', 'pattern', 'readonly',
        'required', 'size', 'tabindex', 'value']),
    'password': frozenset([
        'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
        'inputmode', 'maxlength', 'name', 'pattern', 'readonly', 'required',
        'size', 'tabindex', 'value']),
    'submit': frozenset([
        'accesskey', 'action', 'autofocus', 'disabled', 'enctype', 'form',
        'method', 'name', 'replace', 'tabindex', 'target', 'value']),
    'add': frozenset([
        'accesskey', 'autofocus', 'disabled', 'form', 'name', 'tabindex',
        'template', 'value']),
    'file': frozenset([
        'accept', 'accesskey', 'autofocus', 'disabled', 'form', 'min', 'max',
        'name', 'required', 'tabindex']),
    'hidden': frozenset(['disabled', 'form', 'name', 'value']),
    'image': frozenset([
        'accesskey', 'action', 'alt', 'autofocus', 'disabled', 'enctype',
        'form', 'method', 'name', 'replace', 'src', 'tabindex', 'target']),
}

# Checkable controls.
inputTypeAllowedAttributeMap.update(dict.fromkeys(
    ('checkbox', 'radio'),
    frozenset(['accesskey', 'autofocus', 'checked', 'disabled', 'form',
               'name', 'required', 'tabindex', 'value'])))

# Plain and repetition-model buttons.
inputTypeAllowedAttributeMap.update(dict.fromkeys(
    ('button', 'reset', 'remove', 'move-up', 'move-down'),
    frozenset(['accesskey', 'autofocus', 'disabled', 'form', 'name',
               'tabindex', 'value'])))

# Date, time and numeric controls.
inputTypeAllowedAttributeMap.update(dict.fromkeys(
    ('datetime', 'datetime-local', 'date', 'month', 'week', 'time',
     'number', 'range'),
    frozenset(['accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
               'list', 'min', 'max', 'name', 'step', 'readonly', 'required',
               'tabindex', 'value'])))

# Text-like controls without a size attribute.
inputTypeAllowedAttributeMap.update(dict.fromkeys(
    ('email', 'url'),
    frozenset(['accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
               'inputmode', 'list', 'maxlength', 'name', 'pattern',
               'readonly', 'required', 'tabindex', 'value'])))
91203
92204class HTMLConformanceChecker (_base .Filter ):
@@ -96,31 +208,76 @@ def __init__(self, stream, encoding, parseMeta, **kwargs):
96208
def __iter__(self):
    """Yield every token of the wrapped stream, preceded by its errors.

    For each token a validator method is looked up by name on this object:
    first ``validate<Type><Name>`` (the token name capitalized, e.g.
    ``validateStartTagInput``), and only if that does not exist the generic
    ``validate<Type>``.  Whatever the chosen validator yields is emitted
    before the token itself.  Tokens with no matching validator pass
    through untouched.
    """
    for token in _base.Filter.__iter__(self):
        # "-" stands in for a missing field; it can never match a method.
        tokenType = token.get("type", "-")
        tokenName = token.get("name", "-").capitalize()
        validator = getattr(
            self, "validate%s%s" % (tokenType, tokenName), None)
        if not validator:
            validator = getattr(self, "validate%s" % tokenType, None)
        if validator:
            # A validator may return None instead of an iterable.
            for error in validator(token) or []:
                yield error
        yield token
221+
def validateStartTag(self, token):
    """Run the generic start-tag checks and yield the errors they report.

    The checks run in a fixed order: unknown tag name, missing required
    attributes, then unknown attributes.
    """
    checks = (self.checkUnknownStartTag,
              self.checkStartTagRequiredAttributes,
              self.checkStartTagUnknownAttributes)
    for check in checks:
        # A check may return None instead of an iterable.
        for error in check(token) or []:
            yield error
226+
def validateStartTagEmbed(self, token):
    """Validate an <embed> start tag: only required attributes are checked.

    The spec says <embed> takes "any attributes w/o namespace", so the
    unknown-attribute check is deliberately not run here.
    """
    errors = self.checkStartTagRequiredAttributes(token) or []
    for error in errors:
        yield error
231+
def validateStartTagInput(self, token):
    """Validate the attributes of an <input> start tag against its type.

    The input type is the value of the ``type`` attribute, defaulting to
    "text" when the attribute is absent.  Yields ParseError tokens for:

    * "unknown-input-type" when the type is not a key of
      inputTypeAllowedAttributeMap;
    * "unknown-attribute" for an attribute that is neither an <input>
      attribute nor a global attribute;
    * "attribute-not-allowed-on-this-input-type" for an <input> attribute
      that the (known) input type does not accept.

    ``token["data"]`` is read as a sequence of (name, value) pairs;
    attribute names are compared lower-cased.
    """
    attrDict = dict([(name.lower(), value) for name, value in token["data"]])
    inputType = attrDict.get("type", "text")
    allowedForType = inputTypeAllowedAttributeMap.get(inputType)
    if allowedForType is None:
        # The message for this error formats %(inputType)s; "attrValue" is
        # kept as well for any consumer that reads the older key.
        yield {"type": "ParseError",
               "data": "unknown-input-type",
               "datavars": {"attrValue": inputType,
                            "inputType": inputType}}
    inputAttributes = allowedAttributeMap['input']
    for attrName in attrDict:
        if attrName == "type":
            # The attribute that selects the type is always legitimate.
            continue
        if attrName in inputAttributes:
            # Per-type rules can only be applied when the type is known;
            # an unknown type has already been reported once above.
            if allowedForType is not None and attrName not in allowedForType:
                yield {"type": "ParseError",
                       "data": "attribute-not-allowed-on-this-input-type",
                       "datavars": {"attributeName": attrName,
                                    "inputType": inputType}}
        elif attrName not in globalAttributes:
            # Global attributes (class, id, ...) are valid on <input> just
            # as they are on every other element.
            yield {"type": "ParseError",
                   "data": "unknown-attribute",
                   "datavars": {"tagName": "input",
                                "attributeName": attrName}}
251+
def checkUnknownStartTag(self, token):
    """Yield an "unknown-start-tag" ParseError for an unrecognized tag.

    A tag is recognized when its lower-cased name is a key of
    allowedAttributeMap.
    """
    tagName = token["name"].lower()
    if tagName in allowedAttributeMap:
        return
    yield {"type": "ParseError",
           "data": "unknown-start-tag",
           "datavars": {"tagName": tagName}}
259+
def checkStartTagRequiredAttributes(self, token):
    """Yield a "missing-required-attribute" ParseError per absent attribute.

    The required attributes for the lower-cased tag name come from
    requiredAttributeMap; tags that are not listed there require nothing.
    ``token["data"]`` is read as a sequence of (name, value) pairs.
    """
    name = token["name"].lower()
    requiredAttributes = requiredAttributeMap.get(name)
    if not requiredAttributes:
        return
    # Compare lower-cased names, as the unknown-attribute check does, so the
    # two checks agree on which attributes are present.
    attrsPresent = frozenset([attrName.lower()
                              for attrName, attrValue in token["data"]])
    for attrName in requiredAttributes:
        if attrName not in attrsPresent:
            yield {"type": "ParseError",
                   "data": "missing-required-attribute",
                   "datavars": {"tagName": name,
                                "attributeName": attrName}}
272+
def checkStartTagUnknownAttributes(self, token):
    """Yield an "unknown-attribute" ParseError per unrecognized attribute.

    An attribute is recognized when its lower-cased name is a global
    attribute or is listed for the lower-cased tag name in
    allowedAttributeMap.  The error reports the attribute name exactly as it
    appears in the token.
    """
    tagName = token["name"].lower()
    elementAttributes = allowedAttributeMap.get(tagName, frozenset(()))
    for attrName, attrValue in token["data"]:
        lowered = attrName.lower()
        if lowered in globalAttributes or lowered in elementAttributes:
            continue
        yield {"type": "ParseError",
               "data": "unknown-attribute",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName}}
283+