awesome-python/html5lib-python (Public)

Notifications: You must be signed in to change notification settings.
Fork: 0
Star: 1

Commit 768ba79

committed

More stuff working, including treewalkers, parts of parse.py, dom, (c)ElementTree

--HG--
branch : svgmathml
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401266

1 parent 10b9010, commit 768ba79 (Copy full SHA for 768ba79)

File tree

31 files changed

+303

-264

lines changed

parse.py
src/html5lib
- __init__.py
- filters
- html5parser.py
- ihatexml.py
- inputstream.py
- serializer
- treebuilders
- treewalkers
- utils.py
tests

31 files changed

+303

-264

lines changed

`‎parse.py`

Lines changed: 22 additions & 16 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-#!/usr/bin/envpython`
	`1`	`+#!/usr/bin/envpython3.0`
`2`	`2`	`"""usage: %prog [options] filename`
`3`	`3`
`4`	`4`	`Parse a document to a simpletree tree, with optional profiling`
`@@ -9,11 +9,16 @@`
`9`	`9`	`importos`
`10`	`10`	`fromoptparseimportOptionParser`
`11`	`11`
	`12`	`+print(sys.stdout.encoding)`
	`13`	`+`
`12`	`14`	`#RELEASE remove`
`13`	`15`	`sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))`
`14`	`16`	`#END RELEASE`
`15`		`-fromhtml5libimporthtml5parser,liberalxmlparser,sanitizer`
	`17`	`+print(sys.path)`
	`18`	`+importhtml5lib`
	`19`	`+importhtml5lib.html5parserashtml5parser`
`16`	`20`	`fromhtml5lib.tokenizerimportHTMLTokenizer`
	`21`	`+fromhtml5libimporttreebuilders`
`17`	`22`	`fromhtml5libimporttreebuilders,serializer,treewalkers`
`18`	`23`	`fromhtml5libimportconstants`
`19`	`24`
`@@ -27,8 +32,8 @@ def parse():`
`27`	`32`	`# Try opening from the internet`
`28`	`33`	`iff.startswith('http://'):`
`29`	`34`	`try:`
`30`		`-importurllib,cgi`
`31`		`-f=urllib.urlopen(f)`
	`35`	`+fromurllibimportrequest`
	`36`	`+f=request.urlopen(f)`
`32`	`37`	`contentType=f.headers.get('content-type')`
`33`	`38`	`ifcontentType:`
`34`	`39`	`(mediaType,params)=cgi.parse_header(contentType)`
`@@ -39,7 +44,7 @@ def parse():`
`39`	`44`	`else:`
`40`	`45`	`try:`
`41`	`46`	`# Try opening from file system`
`42`		`-f=open(f)`
	`47`	`+f=open(f,"rb")`
`43`	`48`	`exceptIOError:pass`
`44`	`49`	`exceptIndexError:`
`45`	`50`	`sys.stderr.write("No filename provided. Use -h for help\n")`
`@@ -64,16 +69,16 @@ def parse():`
`64`	`69`
`65`	`70`	`ifopts.profile:`
`66`	`71`	`#XXX should import cProfile instead and use that`
`67`		`-importhotshot`
`68`		`-importhotshot.stats`
`69`		`-prof=hotshot.Profile('stats.prof')`
`70`		`-prof.runcall(parseMethod,f,encoding=encoding)`
	`72`	`+try:`
	`73`	`+importcProfileasprofile`
	`74`	`+exceptImportError:`
	`75`	`+importprofile`
	`76`	`+importpstats`
	`77`	`+prof=profile.run('parseMethod(f, encoding=encoding)','prof.out')`
`71`	`78`	`prof.close()`
`72`	`79`	`# XXX - We should use a temp file here`
`73`		`-stats=hotshot.stats.load('stats.prof')`
`74`		`-stats.strip_dirs()`
`75`		`-stats.sort_stats('time')`
`76`		`-stats.print_stats()`
	`80`	`+stats=pstats.stats('prof.out')`
	`81`	`+stats.strip_dirs().sort_stats('time').print_stats()`
`77`	`82`	`elifopts.time:`
`78`	`83`	`importtime`
`79`	`84`	`t0=time.time()`
`@@ -88,13 +93,14 @@ def parse():`
`88`	`93`
`89`	`94`	`defprintOutput(parser,document,opts):`
`90`	`95`	`ifopts.encoding:`
`91`		`-print"Encoding:",parser.tokenizer.stream.charEncoding`
	`96`	`+print("Encoding:",parser.tokenizer.stream.charEncoding)`
`92`	`97`	`ifopts.xml:`
`93`	`98`	`sys.stdout.write(document.toxml("utf-8"))`
`94`	`99`	`elifopts.tree:`
`95`	`100`	`ifnothasattr(document,'__getitem__'):document= [document]`
`96`	`101`	`forfragmentindocument:`
`97`		`-printparser.tree.testSerializer(fragment).encode("utf-8")`
	`102`	`+sys.stdout.write(parser.tree.testSerializer(fragment))`
	`103`	`+sys.stdout.write("\n")`
`98`	`104`	`elifopts.hilite:`
`99`	`105`	`sys.stdout.write(document.hilite("utf-8"))`
`100`	`106`	`elifopts.html:`
`@@ -103,7 +109,7 @@ def printOutput(parser, document, opts):`
`103`	`109`	`kwargs[opt]=getattr(opts,opt)`
`104`	`110`	`ifnotkwargs['quote_char']:delkwargs['quote_char']`
`105`	`111`	`tokens=treewalkers.getTreeWalker(opts.treebuilder)(document)`
`106`		`-fortextinserializer.HTMLSerializer(**kwargs).serialize(tokens,encoding='utf-8'):`
	`112`	`+fortextinserializer.HTMLSerializer(**kwargs).serialize(tokens):`
`107`	`113`	`sys.stdout.write(text)`
`108`	`114`	`ifnottext.endswith('\n'):sys.stdout.write('\n')`
`109`	`115`	`ifopts.error:`

`src/html5lib/__init__.py`

Lines changed: 2 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -10,10 +10,9 @@`
`10`	`10`	`f = open("my_document.html")`
`11`	`11`	`tree = html5lib.parse(f)`
`12`	`12`	`"""`
`13`		`-print(__path__)`
`14`	`13`
`15`		`-#from .html5parser import HTMLParser, parse`
`16`		`-#from treebuilders import getTreeBuilder`
	`14`	`+from .html5parserimportHTMLParser,parse`
	`15`	`+from.treebuildersimportgetTreeBuilder`
`17`	`16`
`18`	`17`	`#from .liberalxmlparser import XMLParser, XHTMLParser`
`19`	`18`

`‎src/html5lib/filters/formfiller.py`

Lines changed: 12 additions & 12 deletions

Original file line number	Diff line number	Diff line change
`@@ -4,10 +4,10 @@`
`4`	`4`	`# See http://www.whatwg.org/specs/web-forms/current-work/#seeding`
`5`	`5`	`#`
`6`	`6`
`7`		`-import_base`
	`7`	`+from .import_base`
`8`	`8`
`9`	`9`	`fromhtml5lib.constantsimportspaceCharacters`
`10`		`-spaceCharacters=u"".join(spaceCharacters)`
	`10`	`+spaceCharacters="".join(spaceCharacters)`
`11`	`11`
`12`	`12`	`classSimpleFilter(_base.Filter):`
`13`	`13`	`def__init__(self,source,fieldStorage):`
`@@ -29,13 +29,13 @@ def __iter__(self):`
`29`	`29`	`input_checked_index=-1`
`30`	`30`	`fori,(n,v)inenumerate(token["data"]):`
`31`	`31`	`n=n.lower()`
`32`		`-ifn==u"name":`
	`32`	`+ifn=="name":`
`33`	`33`	`field_name=v.strip(spaceCharacters)`
`34`		`-elifn==u"type":`
	`34`	`+elifn=="type":`
`35`	`35`	`field_type=v.strip(spaceCharacters)`
`36`		`-elifn==u"checked":`
	`36`	`+elifn=="checked":`
`37`	`37`	`input_checked_index=i`
`38`		`-elifn==u"value":`
	`38`	`+elifn=="value":`
`39`	`39`	`input_value_index=i`
`40`	`40`
`41`	`41`	`value_list=self.fieldStorage.getlist(field_name)`
`@@ -45,20 +45,20 @@ def __iter__(self):`
`45`	`45`	`else:`
`46`	`46`	`value=""`
`47`	`47`
`48`		`-iffield_typein (u"checkbox",u"radio"):`
	`48`	`+iffield_typein ("checkbox","radio"):`
`49`	`49`	`ifvalue_list:`
`50`	`50`	`iftoken["data"][input_value_index][1]==value:`
`51`	`51`	`ifinput_checked_index<0:`
`52`		`-token["data"].append((u"checked",u""))`
	`52`	`+token["data"].append(("checked",""))`
`53`	`53`	`field_indices[field_name]=field_index+1`
`54`	`54`	`elifinput_checked_index>=0:`
`55`	`55`	`deltoken["data"][input_checked_index]`
`56`	`56`
`57`		`-eliffield_typenotin (u"button",u"submit",u"reset"):`
	`57`	`+eliffield_typenotin ("button","submit","reset"):`
`58`	`58`	`ifinput_value_index>=0:`
`59`		`-token["data"][input_value_index]= (u"value",value)`
	`59`	`+token["data"][input_value_index]= ("value",value)`
`60`	`60`	`else:`
`61`		`-token["data"].append((u"value",value))`
	`61`	`+token["data"].append(("value",value))`
`62`	`62`	`field_indices[field_name]=field_index+1`
`63`	`63`
`64`	`64`	`field_type=None`
`@@ -96,7 +96,7 @@ def __iter__(self):`
`96`	`96`	`value=""`
`97`	`97`	`if (is_select_multipleornotis_selected_option_found)andoption_value==value:`
`98`	`98`	`ifoption_selected_index<0:`
`99`		`-token["data"].append((u"selected",u""))`
	`99`	`+token["data"].append(("selected",""))`
`100`	`100`	`field_indices[field_name]=field_index+1`
`101`	`101`	`is_selected_option_found=True`
`102`	`102`	`elifoption_selected_index>=0:`

`‎src/html5lib/filters/inject_meta_charset.py`

Lines changed: 3 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-import_base`
	`1`	`+from .import_base`
`2`	`2`
`3`	`3`	`classFilter(_base.Filter):`
`4`	`4`	`def__init__(self,source,encoding):`
`@@ -23,7 +23,7 @@ def __iter__(self):`
`23`	`23`	`content_index=-1`
`24`	`24`	`fori,(name,value)inenumerate(token["data"]):`
`25`	`25`	`ifname.lower()=='charset':`
`26`		`-token["data"][i]= (u'charset',self.encoding)`
	`26`	`+token["data"][i]= ('charset',self.encoding)`
`27`	`27`	`meta_found=True`
`28`	`28`	`break`
`29`	`29`	`elifname=='http-equiv'andvalue.lower()=='content-type':`
`@@ -32,7 +32,7 @@ def __iter__(self):`
`32`	`32`	`content_index=i`
`33`	`33`	`else:`
`34`	`34`	`ifhas_http_equiv_content_typeandcontent_index>=0:`
`35`		`-token["data"][content_index]= (u'content',u'text/html; charset=%s'%self.encoding)`
	`35`	`+token["data"][content_index]= ('content','text/html; charset=%s'%self.encoding)`
`36`	`36`	`meta_found=True`
`37`	`37`
`38`	`38`	`eliftoken["name"].lower()=="head"andnotmeta_found:`

`‎src/html5lib/filters/iso639codes.py`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -746,4 +746,4 @@ def isValidLangCode(value):`
`746`	`746`	`lang,sublang=value.split('-',1)`
`747`	`747`	`else:`
`748`	`748`	`lang=value`
`749`		`-returnisoLang.has_key(unicode.lower(unicode(lang)))`
	`749`	`+returnstr.lower(str(lang))inisoLang`

`‎src/html5lib/filters/lint.py`

Lines changed: 21 additions & 21 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,11 +1,11 @@`
`1`	`1`	`fromgettextimportgettext`
`2`	`2`	`_=gettext`
`3`	`3`
`4`		`-import_base`
	`4`	`+from .import_base`
`5`	`5`	`fromhtml5lib.constantsimportcdataElements,rcdataElements,voidElements`
`6`	`6`
`7`	`7`	`fromhtml5lib.constantsimportspaceCharacters`
`8`		`-spaceCharacters=u"".join(spaceCharacters)`
	`8`	`+spaceCharacters="".join(spaceCharacters)`
`9`	`9`
`10`	`10`	`classLintError(Exception):pass`
`11`	`11`
`@@ -19,22 +19,22 @@ def __iter__(self):`
`19`	`19`	`name=token["name"]`
`20`	`20`	`ifcontentModelFlag!="PCDATA":`
`21`	`21`	`raiseLintError(_("StartTag not in PCDATA content model flag: %s")%name)`
`22`		`-ifnotisinstance(name,unicode):`
`23`		`-raiseLintError(_(u"Tag name is not a string: %r")%name)`
	`22`	`+ifnotisinstance(name,str):`
	`23`	`+raiseLintError(_("Tag name is not a string: %r")%name)`
`24`	`24`	`ifnotname:`
`25`		`-raiseLintError(_(u"Empty tag name"))`
	`25`	`+raiseLintError(_("Empty tag name"))`
`26`	`26`	`iftype=="StartTag"andnameinvoidElements:`
`27`		`-raiseLintError(_(u"Void element reported as StartTag token: %s")%name)`
	`27`	`+raiseLintError(_("Void element reported as StartTag token: %s")%name)`
`28`	`28`	`eliftype=="EmptyTag"andnamenotinvoidElements:`
`29`		`-raiseLintError(_(u"Non-void element reported as EmptyTag token: %s")%token["name"])`
	`29`	`+raiseLintError(_("Non-void element reported as EmptyTag token: %s")%token["name"])`
`30`	`30`	`iftype=="StartTag":`
`31`	`31`	`open_elements.append(name)`
`32`	`32`	`forname,valueintoken["data"]:`
`33`		`-ifnotisinstance(name,unicode):`
	`33`	`+ifnotisinstance(name,str):`
`34`	`34`	`raiseLintError(_("Attribute name is not a string: %r")%name)`
`35`	`35`	`ifnotname:`
`36`		`-raiseLintError(_(u"Empty attribute name"))`
`37`		`-ifnotisinstance(value,unicode):`
	`36`	`+raiseLintError(_("Empty attribute name"))`
	`37`	`+ifnotisinstance(value,str):`
`38`	`38`	`raiseLintError(_("Attribute value is not a string: %r")%value)`
`39`	`39`	`ifnameincdataElements:`
`40`	`40`	`contentModelFlag="CDATA"`
`@@ -45,15 +45,15 @@ def __iter__(self):`
`45`	`45`
`46`	`46`	`eliftype=="EndTag":`
`47`	`47`	`name=token["name"]`
`48`		`-ifnotisinstance(name,unicode):`
`49`		`-raiseLintError(_(u"Tag name is not a string: %r")%name)`
	`48`	`+ifnotisinstance(name,str):`
	`49`	`+raiseLintError(_("Tag name is not a string: %r")%name)`
`50`	`50`	`ifnotname:`
`51`		`-raiseLintError(_(u"Empty tag name"))`
	`51`	`+raiseLintError(_("Empty tag name"))`
`52`	`52`	`ifnameinvoidElements:`
`53`		`-raiseLintError(_(u"Void element reported as EndTag token: %s")%name)`
	`53`	`+raiseLintError(_("Void element reported as EndTag token: %s")%name)`
`54`	`54`	`start_name=open_elements.pop()`
`55`	`55`	`ifstart_name!=name:`
`56`		`-raiseLintError(_(u"EndTag (%s) does not match StartTag (%s)")% (name,start_name))`
	`56`	`+raiseLintError(_("EndTag (%s) does not match StartTag (%s)")% (name,start_name))`
`57`	`57`	`contentModelFlag="PCDATA"`
`58`	`58`
`59`	`59`	`eliftype=="Comment":`
`@@ -62,27 +62,27 @@ def __iter__(self):`
`62`	`62`
`63`	`63`	`eliftypein ("Characters","SpaceCharacters"):`
`64`	`64`	`data=token["data"]`
`65`		`-ifnotisinstance(data,unicode):`
	`65`	`+ifnotisinstance(data,str):`
`66`	`66`	`raiseLintError(_("Attribute name is not a string: %r")%data)`
`67`	`67`	`ifnotdata:`
`68`		`-raiseLintError(_(u"%s token with empty data")%type)`
	`68`	`+raiseLintError(_("%s token with empty data")%type)`
`69`	`69`	`iftype=="SpaceCharacters":`
`70`	`70`	`data=data.strip(spaceCharacters)`
`71`	`71`	`ifdata:`
`72`		`-raiseLintError(_(u"Non-space character(s) found in SpaceCharacters token: ")%data)`
	`72`	`+raiseLintError(_("Non-space character(s) found in SpaceCharacters token: ")%data)`
`73`	`73`
`74`	`74`	`eliftype=="Doctype":`
`75`	`75`	`name=token["name"]`
`76`	`76`	`ifcontentModelFlag!="PCDATA":`
`77`	`77`	`raiseLintError(_("Doctype not in PCDATA content model flag: %s")%name)`
`78`		`-ifnotisinstance(name,unicode):`
`79`		`-raiseLintError(_(u"Tag name is not a string: %r")%name)`
	`78`	`+ifnotisinstance(name,str):`
	`79`	`+raiseLintError(_("Tag name is not a string: %r")%name)`
`80`	`80`	`# XXX: what to do with token["data"] ?`
`81`	`81`
`82`	`82`	`eliftypein ("ParseError","SerializeError"):`
`83`	`83`	`pass`
`84`	`84`
`85`	`85`	`else:`
`86`		`-raiseLintError(_(u"Unknown token type: %s")%type)`
	`86`	`+raiseLintError(_("Unknown token type: %s")%type)`
`87`	`87`
`88`	`88`	`yieldtoken`

`‎src/html5lib/filters/optionaltags.py`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-import_base`
	`1`	`+from .import_base`
`2`	`2`
`3`	`3`	`classFilter(_base.Filter):`
`4`	`4`	`defslider(self):`

`‎src/html5lib/filters/sanitizer.py`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-import_base`
	`1`	`+from .import_base`
`2`	`2`	`fromhtml5lib.sanitizerimportHTMLSanitizerMixin`
`3`	`3`
`4`	`4`	`classFilter(_base.Filter,HTMLSanitizerMixin):`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit768ba79

File tree

31 files changed

31 files changed

`‎parse.py`

`src/html5lib/__init__.py`

`‎src/html5lib/filters/formfiller.py`

`‎src/html5lib/filters/inject_meta_charset.py`

`‎src/html5lib/filters/iso639codes.py`

`‎src/html5lib/filters/lint.py`

`‎src/html5lib/filters/optionaltags.py`

`‎src/html5lib/filters/sanitizer.py`

0 commit comments