2525
2626
2727def parse (doc ,treebuilder = "etree" ,namespaceHTMLElements = True ,** kwargs ):
28- """Parse a string or file-like object into a tree"""
28+ """Parse an HTML document as a string or file-like object into a tree
29+
30+ :arg doc: the document to parse as a string or file-like object
31+
32+ :arg treebuilder: the treebuilder to use when parsing
33+
34+ :arg namespaceHTMLElements: whether or not to namespace HTML elements
35+
36+ :returns: parsed tree
37+
38+ Example:
39+
40+ >>> from html5lib.html5parser import parse
41+ >>> parse('<html><body><p>This is a doc</p></body></html>')
42+ <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
43+
44+ """
2945tb = treebuilders .getTreeBuilder (treebuilder )
3046p = HTMLParser (tb ,namespaceHTMLElements = namespaceHTMLElements )
3147return p .parse (doc ,** kwargs )
3248
3349
3450def parseFragment (doc ,container = "div" ,treebuilder = "etree" ,namespaceHTMLElements = True ,** kwargs ):
51+ """Parse an HTML fragment as a string or file-like object into a tree
52+
53+ :arg doc: the fragment to parse as a string or file-like object
54+
55+ :arg container: the container context to parse the fragment in
56+
57+ :arg treebuilder: the treebuilder to use when parsing
58+
59+ :arg namespaceHTMLElements: whether or not to namespace HTML elements
60+
61+ :returns: parsed tree
62+
63+ Example:
64+
65+ >>> from html5lib.html5parser import parseFragment
66+ >>> parseFragment('<b>this is a fragment</b>')
67+ <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
68+
69+ """
3570tb = treebuilders .getTreeBuilder (treebuilder )
3671p = HTMLParser (tb ,namespaceHTMLElements = namespaceHTMLElements )
3772return p .parseFragment (doc ,container = container ,** kwargs )
@@ -50,16 +85,30 @@ def __new__(meta, classname, bases, classDict):
5085
5186
5287class HTMLParser (object ):
53- """HTML parser. Generates a tree structure from a stream of (possibly
54- malformed) HTML"""
88+ """HTML parser
89+
90+ Generates a tree structure from a stream of (possibly malformed) HTML.
91+
92+ """
5593
5694def __init__ (self ,tree = None ,strict = False ,namespaceHTMLElements = True ,debug = False ):
5795"""
58- strict - raise an exception when a parse error is encountered
96+ :arg tree: a treebuilder class controlling the type of tree that will be
97+ returned. Built in treebuilders can be accessed through
98+ html5lib.treebuilders.getTreeBuilder(treeType)
99+
100+ :arg strict: raise an exception when a parse error is encountered
101+
102+ :arg namespaceHTMLElements: whether or not to namespace HTML elements
103+
104+ :arg debug: whether or not to enable debug mode which logs things
105+
106+ Example:
107+
108+ >>> from html5lib.html5parser import HTMLParser
109+ >>> parser = HTMLParser() # generates parser with etree builder
110+ >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
59111
60- tree - a treebuilder class controlling the type of tree that will be
61- returned. Built in treebuilders can be accessed through
62- html5lib.treebuilders.getTreeBuilder(treeType)
63112 """
64113
65114# Raise an exception on the first error encountered
@@ -123,9 +172,8 @@ def reset(self):
123172
124173@property
125174def documentEncoding (self ):
126- """The name of the character encoding
127- that was used to decode the input stream,
128- or :obj:`None` if that is not determined yet.
175+ """Name of the character encoding that was used to decode the input stream, or
176+ :obj:`None` if that is not determined yet
129177
130178 """
131179if not hasattr (self ,'tokenizer' ):
@@ -219,32 +267,52 @@ def normalizedTokens(self):
219267def parse (self ,stream ,* args ,** kwargs ):
220268"""Parse an HTML document into a well-formed tree
221269
222- stream - a filelike object or string containing the HTML to be parsed
270+ :arg stream: a file-like object or string containing the HTML to be parsed
271+
272+ The optional encoding parameter must be a string that indicates
273+ the encoding. If specified, that encoding will be used,
274+ regardless of any BOM or later declaration (such as in a meta
275+ element).
276+
277+ :arg scripting: treat noscript elements as if JavaScript was turned on
223278
224- The optional encoding parameter must be a string that indicates
225- the encoding. If specified, that encoding will be used,
226- regardless of any BOM or later declaration (such as in a meta
227- element)
279+ :returns: parsed tree
280+
281+ Example:
282+
283+ >>> from html5lib.html5parser import HTMLParser
284+ >>> parser = HTMLParser()
285+ >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
286+ <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
228287
229- scripting - treat noscript elements as if javascript was turned on
230288 """
231289self ._parse (stream ,False ,None ,* args ,** kwargs )
232290return self .tree .getDocument ()
233291
234292def parseFragment (self ,stream ,* args ,** kwargs ):
235293"""Parse an HTML fragment into a well-formed tree fragment
236294
237- container - name of the element we're setting the innerHTML property
238- if set to None, default to 'div'
295+ :arg container: name of the element whose innerHTML property is being
296+ set; if set to None, defaults to 'div'
297+
298+ :arg stream: a file-like object or string containing the HTML to be parsed
299+
300+ The optional encoding parameter must be a string that indicates
301+ the encoding. If specified, that encoding will be used,
302+ regardless of any BOM or later declaration (such as in a meta
303+ element).
239304
240- stream - a filelike object or string containing the HTML to be parsed
305+ :arg scripting: treat noscript elements as if JavaScript was turned on
241306
242- The optional encoding parameter must be a string that indicates
243- the encoding. If specified, that encoding will be used,
244- regardless of any BOM or later declaration (such as in a meta
245- element)
307+ :returns: parsed tree
308+
309+ Example:
310+
311+ >>> from html5lib.html5parser import HTMLParser
312+ >>> parser = HTMLParser()
313+ >>> parser.parseFragment('<b>this is a fragment</b>')
314+ <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
246315
247- scripting - treat noscript elements as if javascript was turned on
248316 """
249317self ._parse (stream ,True ,* args ,** kwargs )
250318return self .tree .getFragment ()
@@ -258,8 +326,7 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
258326raise ParseError (E [errorcode ]% datavars )
259327
260328def normalizeToken (self ,token ):
261- """ HTML5 specific normalizations to the token stream """
262-
329+ # HTML5 specific normalizations to the token stream
263330if token ["type" ]== tokenTypes ["StartTag" ]:
264331raw = token ["data" ]
265332token ["data" ]= OrderedDict (raw )
@@ -327,9 +394,7 @@ def resetInsertionMode(self):
327394self .phase = new_phase
328395
329396def parseRCDataRawtext (self ,token ,contentType ):
330- """Generic RCDATA/RAWTEXT Parsing algorithm
331- contentType - RCDATA or RAWTEXT
332- """
397+ # Generic RCDATA/RAWTEXT Parsing algorithm
333398assert contentType in ("RAWTEXT" ,"RCDATA" )
334399
335400self .tree .insertElement (token )