Commit805f272

committed

Document html5parser module

1 parentdc9443d commit805f272Copy full SHA for 805f272

File tree

1 file changed

+94

-29

lines changed

html5lib
- html5parser.py

1 file changed

+94

-29

lines changed

`‎html5lib/html5parser.py`

Lines changed: 94 additions & 29 deletions

Original file line number	Diff line number	Diff line change
`@@ -25,13 +25,48 @@`
`25`	`25`
`26`	`26`
`27`	`27`	`defparse(doc,treebuilder="etree",namespaceHTMLElements=True,**kwargs):`
`28`		`-"""Parse a string or file-like object into a tree"""`
	`28`	`+"""Parse an HTML document as a string or file-like object into a tree`
	`29`	`+`
	`30`	`+ :arg doc: the document to parse as a string or file-like object`
	`31`	`+`
	`32`	`+ :arg treebuilder: the treebuilder to use when parsing`
	`33`	`+`
	`34`	`+ :arg namespaceHTMLElements: whether or not to namespace HTML elements`
	`35`	`+`
	`36`	`+ :returns: parsed tree`
	`37`	`+`
	`38`	`+ Example:`
	`39`	`+`
	`40`	`+ >>> from html5lib.html5parser import parse`
	`41`	`+ >>> parse('<html><body><p>This is a doc</p></body></html>')`
	`42`	`+ <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>`
	`43`	`+`
	`44`	`+ """`
`29`	`45`	`tb=treebuilders.getTreeBuilder(treebuilder)`
`30`	`46`	`p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)`
`31`	`47`	`returnp.parse(doc,**kwargs)`
`32`	`48`
`33`	`49`
`34`	`50`	`defparseFragment(doc,container="div",treebuilder="etree",namespaceHTMLElements=True,**kwargs):`
	`51`	`+"""Parse an HTML fragment as a string or file-like object into a tree`
	`52`	`+`
	`53`	`+ :arg doc: the fragment to parse as a string or file-like object`
	`54`	`+`
	`55`	`+ :arg container: the container context to parse the fragment in`
	`56`	`+`
	`57`	`+ :arg treebuilder: the treebuilder to use when parsing`
	`58`	`+`
	`59`	`+ :arg namespaceHTMLElements: whether or not to namespace HTML elements`
	`60`	`+`
	`61`	`+ :returns: parsed tree`
	`62`	`+`
	`63`	`+ Example:`
	`64`	`+`
	`65`	`+ >>> from html5lib.html5parser import parseFragment`
	`66`	`+ >>> parseFragment('<b>this is a fragment</b>')`
	`67`	`+ <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>`
	`68`	`+`
	`69`	`+ """`
`35`	`70`	`tb=treebuilders.getTreeBuilder(treebuilder)`
`36`	`71`	`p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)`
`37`	`72`	`returnp.parseFragment(doc,container=container,**kwargs)`
`@@ -50,16 +85,30 @@ def __new__(meta, classname, bases, classDict):`
`50`	`85`
`51`	`86`
`52`	`87`	`classHTMLParser(object):`
`53`		`-"""HTML parser. Generates a tree structure from a stream of (possibly`
`54`		`- malformed) HTML"""`
	`88`	`+"""HTML parser`
	`89`	`+`
	`90`	`+ Generates a tree structure from a stream of (possibly malformed) HTML.`
	`91`	`+`
	`92`	`+ """`
`55`	`93`
`56`	`94`	`def__init__(self,tree=None,strict=False,namespaceHTMLElements=True,debug=False):`
`57`	`95`	`"""`
`58`		`- strict - raise an exception when a parse error is encountered`
	`96`	`+ :arg tree: a treebuilder class controlling the type of tree that will be`
	`97`	`+ returned. Built in treebuilders can be accessed through`
	`98`	`+ html5lib.treebuilders.getTreeBuilder(treeType)`
	`99`	`+`
	`100`	`+ :arg strict: raise an exception when a parse error is encountered`
	`101`	`+`
	`102`	`+ :arg namespaceHTMLElements: whether or not to namespace HTML elements`
	`103`	`+`
	`104`	`+ :arg debug: whether or not to enable debug mode which logs things`
	`105`	`+`
	`106`	`+ Example:`
	`107`	`+`
	`108`	`+ >>> from html5lib.html5parser import HTMLParser`
	`109`	`+ >>> parser = HTMLParser() # generates parser with etree builder`
	`110`	`+ >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict`
`59`	`111`
`60`		`- tree - a treebuilder class controlling the type of tree that will be`
`61`		`- returned. Built in treebuilders can be accessed through`
`62`		`- html5lib.treebuilders.getTreeBuilder(treeType)`
`63`	`112`	`"""`
`64`	`113`
`65`	`114`	`# Raise an exception on the first error encountered`
`@@ -123,9 +172,8 @@ def reset(self):`
`123`	`172`
`124`	`173`	`@property`
`125`	`174`	`defdocumentEncoding(self):`
`126`		`-"""The name of the character encoding`
`127`		`- that was used to decode the input stream,`
`128`		- or :obj:`None` if that is not determined yet.
	`175`	`+"""Name of the character encoding that was used to decode the input stream, or`
	`176`	+ :obj:`None` if that is not determined yet
`129`	`177`
`130`	`178`	`"""`
`131`	`179`	`ifnothasattr(self,'tokenizer'):`
`@@ -219,32 +267,52 @@ def normalizedTokens(self):`
`219`	`267`	`defparse(self,stream,*args,**kwargs):`
`220`	`268`	`"""Parse a HTML document into a well-formed tree`
`221`	`269`
`222`		`- stream - a filelike object or string containing the HTML to be parsed`
	`270`	`+ :arg stream: a file-like object or string containing the HTML to be parsed`
	`271`	`+`
	`272`	`+ The optional encoding parameter must be a string that indicates`
	`273`	`+ the encoding. If specified, that encoding will be used,`
	`274`	`+ regardless of any BOM or later declaration (such as in a meta`
	`275`	`+ element).`
	`276`	`+`
	`277`	`+ :arg scripting: treat noscript elements as if JavaScript was turned on`
`223`	`278`
`224`		`- The optional encoding parameter must be a string that indicates`
`225`		`- the encoding. If specified, that encoding will be used,`
`226`		`- regardless of any BOM or later declaration (such as in a meta`
`227`		`- element)`
	`279`	`+ :returns: parsed tree`
	`280`	`+`
	`281`	`+ Example:`
	`282`	`+`
	`283`	`+ >>> from html5lib.html5parser import HTMLParser`
	`284`	`+ >>> parser = HTMLParser()`
	`285`	`+ >>> parser.parse('<html><body><p>This is a doc</p></body></html>')`
	`286`	`+ <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>`
`228`	`287`
`229`		`- scripting - treat noscript elements as if javascript was turned on`
`230`	`288`	`"""`
`231`	`289`	`self._parse(stream,False,None,*args,**kwargs)`
`232`	`290`	`returnself.tree.getDocument()`
`233`	`291`
`234`	`292`	`defparseFragment(self,stream,*args,**kwargs):`
`235`	`293`	`"""Parse a HTML fragment into a well-formed tree fragment`
`236`	`294`
`237`		`- container - name of the element we're setting the innerHTML property`
`238`		`- if set to None, default to 'div'`
	`295`	`+ :arg container: name of the element we're setting the innerHTML`
	`296`	`+ property on; if set to None, defaults to 'div'`
	`297`	`+`
	`298`	`+ :arg stream: a file-like object or string containing the HTML to be parsed`
	`299`	`+`
	`300`	`+ The optional encoding parameter must be a string that indicates`
	`301`	`+ the encoding. If specified, that encoding will be used,`
	`302`	`+ regardless of any BOM or later declaration (such as in a meta`
	`303`	`+ element)`
`239`	`304`
`240`		`-stream - a filelike object or string containing the HTML to be parsed`
	`305`	`+:arg scripting: treat noscript elements as if JavaScript was turned on`
`241`	`306`
`242`		`- The optional encoding parameter must be a string that indicates`
`243`		`- the encoding. If specified, that encoding will be used,`
`244`		`- regardless of any BOM or later declaration (such as in a meta`
`245`		`- element)`
	`307`	`+ :returns: parsed tree`
	`308`	`+`
	`309`	`+ Example:`
	`310`	`+`
	`311`	`+ >>> from html5lib.html5parser import HTMLParser`
	`312`	`+ >>> parser = HTMLParser()`
	`313`	`+ >>> parser.parseFragment('<b>this is a fragment</b>')`
	`314`	`+ <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>`
`246`	`315`
`247`		`- scripting - treat noscript elements as if javascript was turned on`
`248`	`316`	`"""`
`249`	`317`	`self._parse(stream,True,*args,**kwargs)`
`250`	`318`	`returnself.tree.getFragment()`
`@@ -258,8 +326,7 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):`
`258`	`326`	`raiseParseError(E[errorcode]%datavars)`
`259`	`327`
`260`	`328`	`defnormalizeToken(self,token):`
`261`		`-""" HTML5 specific normalizations to the token stream """`
`262`		`-`
	`329`	`+# HTML5 specific normalizations to the token stream`
`263`	`330`	`iftoken["type"]==tokenTypes["StartTag"]:`
`264`	`331`	`raw=token["data"]`
`265`	`332`	`token["data"]=OrderedDict(raw)`
`@@ -327,9 +394,7 @@ def resetInsertionMode(self):`
`327`	`394`	`self.phase=new_phase`
`328`	`395`
`329`	`396`	`defparseRCDataRawtext(self,token,contentType):`
`330`		`-"""Generic RCDATA/RAWTEXT Parsing algorithm`
`331`		`- contentType - RCDATA or RAWTEXT`
`332`		`- """`
	`397`	`+# Generic RCDATA/RAWTEXT Parsing algorithm`
`333`	`398`	`assertcontentTypein ("RAWTEXT","RCDATA")`
`334`	`399`
`335`	`400`	`self.tree.insertElement(token)`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit805f272

File tree

1 file changed

1 file changed

`‎html5lib/html5parser.py`

0 commit comments