Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit805f272

Browse files
committed
Document html5parser module
1 parentdc9443d commit805f272

File tree

1 file changed

+94
-29
lines changed

1 file changed

+94
-29
lines changed

‎html5lib/html5parser.py

Lines changed: 94 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,48 @@
2525

2626

2727
defparse(doc,treebuilder="etree",namespaceHTMLElements=True,**kwargs):
28-
"""Parse a string or file-like object into a tree"""
28+
"""Parse an HTML document as a string or file-like object into a tree
29+
30+
:arg doc: the document to parse as a string or file-like object
31+
32+
:arg treebuilder: the treebuilder to use when parsing
33+
34+
:arg namespaceHTMLElements: whether or not to namespace HTML elements
35+
36+
:returns: parsed tree
37+
38+
Example:
39+
40+
>>> from html5lib.html5parser import parse
41+
>>> parse('<html><body><p>This is a doc</p></body></html>')
42+
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
43+
44+
"""
2945
tb=treebuilders.getTreeBuilder(treebuilder)
3046
p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)
3147
returnp.parse(doc,**kwargs)
3248

3349

3450
defparseFragment(doc,container="div",treebuilder="etree",namespaceHTMLElements=True,**kwargs):
51+
"""Parse an HTML fragment as a string or file-like object into a tree
52+
53+
:arg doc: the fragment to parse as a string or file-like object
54+
55+
:arg container: the container context to parse the fragment in
56+
57+
:arg treebuilder: the treebuilder to use when parsing
58+
59+
:arg namespaceHTMLElements: whether or not to namespace HTML elements
60+
61+
:returns: parsed tree
62+
63+
Example:
64+
65+
>>> from html5lib.html5parser import parseFragment
66+
>>> parseFragment('<b>this is a fragment</b>')
67+
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
68+
69+
"""
3570
tb=treebuilders.getTreeBuilder(treebuilder)
3671
p=HTMLParser(tb,namespaceHTMLElements=namespaceHTMLElements)
3772
returnp.parseFragment(doc,container=container,**kwargs)
@@ -50,16 +85,30 @@ def __new__(meta, classname, bases, classDict):
5085

5186

5287
classHTMLParser(object):
53-
"""HTML parser. Generates a tree structure from a stream of (possibly
54-
malformed) HTML"""
88+
"""HTML parser
89+
90+
Generates a tree structure from a stream of (possibly malformed) HTML.
91+
92+
"""
5593

5694
def__init__(self,tree=None,strict=False,namespaceHTMLElements=True,debug=False):
5795
"""
58-
strict - raise an exception when a parse error is encountered
96+
:arg tree: a treebuilder class controlling the type of tree that will be
97+
returned. Built in treebuilders can be accessed through
98+
html5lib.treebuilders.getTreeBuilder(treeType)
99+
100+
:arg strict: raise an exception when a parse error is encountered
101+
102+
:arg namespaceHTMLElements: whether or not to namespace HTML elements
103+
104+
:arg debug: whether or not to enable debug mode which logs things
105+
106+
Example:
107+
108+
>>> from html5lib.html5parser import HTMLParser
109+
>>> parser = HTMLParser() # generates parser with etree builder
110+
>>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
59111
60-
tree - a treebuilder class controlling the type of tree that will be
61-
returned. Built in treebuilders can be accessed through
62-
html5lib.treebuilders.getTreeBuilder(treeType)
63112
"""
64113

65114
# Raise an exception on the first error encountered
@@ -123,9 +172,8 @@ def reset(self):
123172

124173
@property
125174
defdocumentEncoding(self):
126-
"""The name of the character encoding
127-
that was used to decode the input stream,
128-
or :obj:`None` if that is not determined yet.
175+
"""Name of the character encoding that was used to decode the input stream, or
176+
:obj:`None` if that is not determined yet
129177
130178
"""
131179
ifnothasattr(self,'tokenizer'):
@@ -219,32 +267,52 @@ def normalizedTokens(self):
219267
defparse(self,stream,*args,**kwargs):
220268
"""Parse a HTML document into a well-formed tree
221269
222-
stream - a filelike object or string containing the HTML to be parsed
270+
:arg stream: a file-like object or string containing the HTML to be parsed
271+
272+
The optional encoding parameter must be a string that indicates
273+
the encoding. If specified, that encoding will be used,
274+
regardless of any BOM or later declaration (such as in a meta
275+
element).
276+
277+
:arg scripting: treat noscript elements as if JavaScript was turned on
223278
224-
The optional encoding parameter must be a string that indicates
225-
the encoding. If specified, that encoding will be used,
226-
regardless of any BOM or later declaration (such as in a meta
227-
element)
279+
:returns: parsed tree
280+
281+
Example:
282+
283+
>>> from html5lib.html5parser import HTMLParser
284+
>>> parser = HTMLParser()
285+
>>> parser.parse('<html><body><p>This is a doc</p></body></html>')
286+
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
228287
229-
scripting - treat noscript elements as if javascript was turned on
230288
"""
231289
self._parse(stream,False,None,*args,**kwargs)
232290
returnself.tree.getDocument()
233291

234292
defparseFragment(self,stream,*args,**kwargs):
235293
"""Parse a HTML fragment into a well-formed tree fragment
236294
237-
container - name of the element we're setting the innerHTML property
238-
if set to None, default to 'div'
295+
:arg container: name of the element we're setting the innerHTML
296+
property on; if set to None, defaults to 'div'
297+
298+
:arg stream: a file-like object or string containing the HTML to be parsed
299+
300+
The optional encoding parameter must be a string that indicates
301+
the encoding. If specified, that encoding will be used,
302+
regardless of any BOM or later declaration (such as in a meta
303+
element).
239304
240-
stream - a filelike object or string containing the HTML to be parsed
305+
:arg scripting: treat noscript elements as if JavaScript was turned on
241306
242-
The optional encoding parameter must be a string that indicates
243-
the encoding. If specified, that encoding will be used,
244-
regardless of any BOM or later declaration (such as in a meta
245-
element)
307+
:returns: parsed tree
308+
309+
Example:
310+
311+
>>> from html5lib.html5parser import HTMLParser
312+
>>> parser = HTMLParser()
313+
>>> parser.parseFragment('<b>this is a fragment</b>')
314+
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
246315
247-
scripting - treat noscript elements as if javascript was turned on
248316
"""
249317
self._parse(stream,True,*args,**kwargs)
250318
returnself.tree.getFragment()
@@ -258,8 +326,7 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
258326
raiseParseError(E[errorcode]%datavars)
259327

260328
defnormalizeToken(self,token):
261-
""" HTML5 specific normalizations to the token stream """
262-
329+
# HTML5 specific normalizations to the token stream
263330
iftoken["type"]==tokenTypes["StartTag"]:
264331
raw=token["data"]
265332
token["data"]=OrderedDict(raw)
@@ -327,9 +394,7 @@ def resetInsertionMode(self):
327394
self.phase=new_phase
328395

329396
defparseRCDataRawtext(self,token,contentType):
330-
"""Generic RCDATA/RAWTEXT Parsing algorithm
331-
contentType - RCDATA or RAWTEXT
332-
"""
397+
# Generic RCDATA/RAWTEXT Parsing algorithm
333398
assertcontentTypein ("RAWTEXT","RCDATA")
334399

335400
self.tree.insertElement(token)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp