
Commit 38ec086

Move token normalisation to the tokenizer

1 parent b2e4802 commit 38ec086

6 files changed: +151 −68 lines changed

html5lib/_tokenizer.py

Lines changed: 15 additions & 1 deletion

@@ -2,7 +2,8 @@
 
 from six import unichr as chr
 
-from collections import deque
+from collections import deque, OrderedDict
+from sys import version_info
 
 from .constants import spaceCharacters
 from .constants import entities
@@ -17,6 +18,11 @@
 
 entitiesTrie = Trie(entities)
 
+if version_info >= (3, 7):
+    attributeMap = dict
+else:
+    attributeMap = OrderedDict
+
 
 class HTMLTokenizer(object):
     """ This class takes care of tokenizing HTML.
@@ -228,6 +234,14 @@ def emitCurrentToken(self):
         # Add token to the queue to be yielded
         if (token["type"] in tagTokenTypes):
             token["name"] = token["name"].translate(asciiUpper2Lower)
+            if token["type"] == tokenTypes["StartTag"]:
+                raw = token["data"]
+                data = attributeMap(raw)
+                if len(raw) > len(data):
+                    # we had some duplicated attribute, fix so first wins
+                    data.update(raw[::-1])
+                token["data"] = data
+
         if token["type"] == tokenTypes["EndTag"]:
             if token["data"]:
                 self.tokenQueue.append({"type": tokenTypes["ParseError"],
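Note: the duplicate-attribute handling added above leans on two facts: plain dict preserves insertion order from Python 3.7 onward (hence the attributeMap alias, falling back to OrderedDict on older interpreters), and rebuilding the mapping from the reversed pair list makes the first occurrence of each attribute win while keeping first-seen key order. A minimal sketch of the trick in isolation (the pair list is illustrative, not from the commit):

    import sys
    from collections import OrderedDict

    # Same selection as the diff: dict only guarantees insertion order on 3.7+.
    attributeMap = dict if sys.version_info >= (3, 7) else OrderedDict

    raw = [('a', '1'), ('b', '2'), ('a', '3')]  # hypothetical duplicated 'a'
    data = attributeMap(raw)                    # last occurrence wins: a == '3'
    if len(raw) > len(data):                    # a length mismatch signals duplicates
        # Re-apply the pairs in reverse so the first occurrence lands last;
        # update() overwrites values but keeps the existing key order.
        data.update(raw[::-1])

    assert list(data.items()) == [('a', '1'), ('b', '2')]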

html5lib/html5parser.py

Lines changed: 3 additions & 28 deletions

@@ -3,9 +3,6 @@
 
 import types
 
-from collections import OrderedDict
-from sys import version_info
-
 from . import _inputstream
 from . import _tokenizer
 
@@ -26,12 +23,6 @@
 )
 
 
-if version_info >= (3, 7):
-    attributeMap = dict
-else:
-    attributeMap = OrderedDict
-
-
 def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
     """Parse an HTML document as a string or file-like object into a tree
 
@@ -210,7 +201,7 @@ def mainLoop(self):
         DoctypeToken = tokenTypes["Doctype"]
         ParseErrorToken = tokenTypes["ParseError"]
 
-        for token in self.normalizedTokens():
+        for token in self.tokenizer:
            prev_token = None
            new_token = token
            while new_token is not None:
@@ -268,10 +259,6 @@ def mainLoop(self):
            if reprocess:
                assert self.phase not in phases
 
-    def normalizedTokens(self):
-        for token in self.tokenizer:
-            yield self.normalizeToken(token)
-
    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree
 
@@ -333,18 +320,6 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        if self.strict:
            raise ParseError(E[errorcode] % datavars)
 
-    def normalizeToken(self, token):
-        # HTML5 specific normalizations to the token stream
-        if token["type"] == tokenTypes["StartTag"]:
-            raw = token["data"]
-            data = attributeMap(raw)
-            if len(raw) > len(data):
-                # we had some duplicated attribute, fix so first wins
-                data.update(raw[::-1])
-            token["data"] = data
-
-        return token
-
    def adjustMathMLAttributes(self, token):
        adjust_attributes(token, adjustMathMLAttributes)
 
@@ -2803,8 +2778,8 @@ def processEndTag(self, token):
 def adjust_attributes(token, replacements):
     needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
     if needs_adjustment:
-        token['data'] = attributeMap((replacements.get(k, k), v)
-                                     for k, v in token['data'].items())
+        token['data'] = type(token['data'])((replacements.get(k, k), v)
+                                            for k, v in token['data'].items())
 
 
 def impliedTagToken(name, type="EndTag", attributes=None,
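Note: with normalizeToken gone, adjust_attributes can no longer rely on the parser-level attributeMap alias; rebuilding via type(token['data']) keeps whatever ordered mapping the tokenizer emitted. A rough standalone illustration, using set() in place of six's viewkeys and a made-up token (assumes Python 3.7+ dict ordering):

    def adjust_attributes(token, replacements):
        # Rename foreign-content attributes while preserving the mapping
        # type and the key order produced by the tokenizer.
        if set(token['data']) & set(replacements):
            token['data'] = type(token['data'])(
                (replacements.get(k, k), v) for k, v in token['data'].items())

    token = {'type': 'StartTag', 'name': 'svg',
             'data': {'refx': '2', 'a': '1'}}  # hypothetical token
    adjust_attributes(token, {'refx': 'refX'})
    assert list(token['data'].items()) == [('refX', '2'), ('a', '1')]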

html5lib/tests/test_parser2.py

Lines changed: 2 additions & 38 deletions

@@ -1,12 +1,12 @@
 from __future__ import absolute_import, division, unicode_literals
 
-from six import PY2, text_type, unichr
+from six import PY2, text_type
 
 import io
 
 from . import support  # noqa
 
-from html5lib.constants import namespaces, tokenTypes
+from html5lib.constants import namespaces
 from html5lib import parse, parseFragment, HTMLParser
 
 
@@ -53,42 +53,6 @@ def test_unicode_file():
     assert parse(io.StringIO("a")) is not None
 
 
-def test_maintain_attribute_order():
-    # This is here because we impl it in parser and not tokenizer
-    p = HTMLParser()
-    # generate loads to maximize the chance a hash-based mutation will occur
-    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
-    token = {'name': 'html',
-             'selfClosing': False,
-             'selfClosingAcknowledged': False,
-             'type': tokenTypes["StartTag"],
-             'data': attrs}
-    out = p.normalizeToken(token)
-    attr_order = list(out["data"].keys())
-    assert attr_order == [x for x, i in attrs]
-
-
-def test_duplicate_attribute():
-    # This is here because we impl it in parser and not tokenizer
-    doc = parse('<p class=a class=b>')
-    el = doc[1][0]
-    assert el.get("class") == "a"
-
-
-def test_maintain_duplicate_attribute_order():
-    # This is here because we impl it in parser and not tokenizer
-    p = HTMLParser()
-    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
-    token = {'name': 'html',
-             'selfClosing': False,
-             'selfClosingAcknowledged': False,
-             'type': tokenTypes["StartTag"],
-             'data': attrs + [('a', len(attrs))]}
-    out = p.normalizeToken(token)
-    attr_order = list(out["data"].keys())
-    assert attr_order == [x for x, i in attrs]
-
-
 def test_debug_log():
     parser = HTMLParser(debug=True)
     parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")
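Note: the three deleted tests reappear below in test_tokenizer2.py, rewritten against HTMLTokenizer now that normalisation happens there. The end-to-end behaviour they guarded is unchanged and can still be checked at the parse() level, mirroring the removed test_duplicate_attribute:

    from html5lib import parse

    # The first occurrence of a duplicated attribute still wins after parsing.
    doc = parse('<p class=a class=b>')
    assert doc[1][0].get("class") == "a"  # doc[1] is <body>, doc[1][0] is <p>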

html5lib/tests/test_tokenizer2.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import io
+
+from six import unichr, text_type
+
+from html5lib._tokenizer import HTMLTokenizer
+from html5lib.constants import tokenTypes
+
+
+def ignore_parse_errors(toks):
+    for tok in toks:
+        if tok['type'] != tokenTypes['ParseError']:
+            yield tok
+
+
+def test_maintain_attribute_order():
+    # generate loads to maximize the chance a hash-based mutation will occur
+    attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+    stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + ">")
+
+    toks = HTMLTokenizer(stream)
+    out = list(ignore_parse_errors(toks))
+
+    assert len(out) == 1
+    assert out[0]['type'] == tokenTypes['StartTag']
+
+    attrs_tok = out[0]['data']
+    assert len(attrs_tok) == len(attrs)
+
+    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
+        assert in_name == out_name
+        assert in_value == out_value
+
+
+def test_duplicate_attribute():
+    stream = io.StringIO("<span a=1 a=2 a=3>")
+
+    toks = HTMLTokenizer(stream)
+    out = list(ignore_parse_errors(toks))
+
+    assert len(out) == 1
+    assert out[0]['type'] == tokenTypes['StartTag']
+
+    attrs_tok = out[0]['data']
+    assert len(attrs_tok) == 1
+    assert list(attrs_tok.items()) == [('a', '1')]
+
+
+def test_maintain_duplicate_attribute_order():
+    # generate loads to maximize the chance a hash-based mutation will occur
+    attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+    stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + " a=100>")
+
+    toks = HTMLTokenizer(stream)
+    out = list(ignore_parse_errors(toks))
+
+    assert len(out) == 1
+    assert out[0]['type'] == tokenTypes['StartTag']
+
+    attrs_tok = out[0]['data']
+    assert len(attrs_tok) == len(attrs)
+
+    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
+        assert in_name == out_name
+        assert in_value == out_value

html5lib/tests/test_treewalkers.py

Lines changed: 64 additions & 0 deletions

@@ -1,7 +1,9 @@
 from __future__ import absolute_import, division, unicode_literals
 
 import itertools
+import sys
 
+from six import unichr, text_type
 import pytest
 
 try:
@@ -135,3 +137,65 @@ def test_lxml_xml():
     output = Lint(walker(lxmltree))
 
     assert list(output) == expected
+
+
+@pytest.mark.parametrize("treeName",
+                         [pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
+                                                        pytest.mark.skipif(sys.version_info < (3, 7), reason="dict order undef")])
+                          for treeName in sorted(treeTypes.keys())])
+def test_maintain_attribute_order(treeName):
+    treeAPIs = treeTypes[treeName]
+    if treeAPIs is None:
+        pytest.skip("Treebuilder not loaded")
+
+    # generate loads to maximize the chance a hash-based mutation will occur
+    attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
+    data = "<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + ">"
+
+    parser = html5parser.HTMLParser(tree=treeAPIs["builder"])
+    document = parser.parseFragment(data)
+
+    document = treeAPIs.get("adapter", lambda x: x)(document)
+    output = list(Lint(treeAPIs["walker"](document)))
+
+    assert len(output) == 2
+    assert output[0]['type'] == 'StartTag'
+    assert output[1]['type'] == "EndTag"
+
+    attrs_out = output[0]['data']
+    assert len(attrs) == len(attrs_out)
+
+    for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_out.items()):
+        assert (None, in_name) == out_name
+        assert in_value == out_value
+
+
+@pytest.mark.parametrize("treeName",
+                         [pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
+                                                        pytest.mark.skipif(sys.version_info < (3, 7), reason="dict order undef")])
+                          for treeName in sorted(treeTypes.keys())])
+def test_maintain_attribute_order_adjusted(treeName):
+    treeAPIs = treeTypes[treeName]
+    if treeAPIs is None:
+        pytest.skip("Treebuilder not loaded")
+
+    # generate loads to maximize the chance a hash-based mutation will occur
+    data = "<svg a=1 refx=2 b=3 xml:lang=4 c=5>"
+
+    parser = html5parser.HTMLParser(tree=treeAPIs["builder"])
+    document = parser.parseFragment(data)
+
+    document = treeAPIs.get("adapter", lambda x: x)(document)
+    output = list(Lint(treeAPIs["walker"](document)))
+
+    assert len(output) == 2
+    assert output[0]['type'] == 'StartTag'
+    assert output[1]['type'] == "EndTag"
+
+    attrs_out = output[0]['data']
+
+    assert list(attrs_out.items()) == [((None, 'a'), '1'),
+                                       ((None, 'refX'), '2'),
+                                       ((None, 'b'), '3'),
+                                       (('http://www.w3.org/XML/1998/namespace', 'lang'), '4'),
+                                       ((None, 'c'), '5')]
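Note: these walker-level tests pin down the same ordering guarantee after tree building. Tree-walker StartTag tokens key their data by (namespace, local name) tuples, and foreign-content adjustment is visible there (refx becomes refX; xml:lang moves into the XML namespace). A quick check along the same lines, a sketch assuming the default etree tree:

    import html5lib
    from html5lib import treewalkers

    fragment = html5lib.parseFragment("<svg refx=2 xml:lang=en>")
    walker = treewalkers.getTreeWalker("etree")
    start = next(tok for tok in walker(fragment) if tok['type'] == 'StartTag')

    # Attribute keys are (namespace, local-name) tuples after adjustment.
    assert start['data'][(None, 'refX')] == '2'
    assert start['data'][('http://www.w3.org/XML/1998/namespace', 'lang')] == 'en'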

html5lib/tests/tokenizer.py

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ def processDoctype(self, token):
 
     def processStartTag(self, token):
         self.outputTokens.append(["StartTag", token["name"],
-                                  dict(token["data"][::-1]), token["selfClosing"]])
+                                  token["data"], token["selfClosing"]])
 
     def processEmptyTag(self, token):
        if token["name"] not in constants.voidElements:
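Note: the dict(token["data"][::-1]) reversal in this test harness replicated the first-wins rule by hand; it is no longer needed because the tokenizer now emits token["data"] as an already-deduplicated, insertion-ordered mapping.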
