@@ -68,10 +68,33 @@ def htmlentityreplace_errors(exc):
6868else :
6969return xmlcharrefreplace_errors (exc )
7070
71+
7172register_error ("htmlentityreplace" ,htmlentityreplace_errors )
7273
7374
7475def serialize (input ,tree = "etree" ,encoding = None ,** serializer_opts ):
76+ """Serializes the input token stream using the specified treewalker
77+
78+ :arg input: the token stream to serialize
79+
80+ :arg tree: the treewalker to use
81+
82+ :arg encoding: the encoding to use
83+
84+ :arg serializer_opts: any options to pass to the
85+ :py:class:`html5lib.serializer.HTMLSerializer` that gets created
86+
87+ :returns: the tree serialized as a string
88+
89+ Example:
90+
91+ >>> from html5lib.html5parser import parse
92+ >>> from html5lib.serializer import serialize
93+ >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
94+ >>> serialize(token_stream, omit_optional_tags=False)
95+ '<html><head></head><body><p>Hi!</p></body></html>'
96+
97+ """
7598# XXX: Should we cache this?
7699walker = treewalkers .getTreeWalker (tree )
77100s = HTMLSerializer (** serializer_opts )
@@ -110,50 +133,83 @@ class HTMLSerializer(object):
110133"strip_whitespace" ,"sanitize" )
111134
112135def __init__ (self ,** kwargs ):
113- """Initialize HTMLSerializer.
114-
115- Keyword options (default given first unless specified) include:
116-
117- inject_meta_charset=True|False
118- Whether it insert a meta element to define the character set of the
119- document.
120- quote_attr_values="legacy"|"spec"|"always"
121- Whether to quote attribute values that don't require quoting
122- per legacy browser behaviour, when required by the standard, or always.
123- quote_char=u'"'|u"'"
124- Use given quote character for attribute quoting. Default is to
125- use double quote unless attribute value contains a double quote,
126- in which case single quotes are used instead.
127- escape_lt_in_attrs=False|True
128- Whether to escape < in attribute values.
129- escape_rcdata=False|True
130- Whether to escape characters that need to be escaped within normal
131- elements within rcdata elements such as style.
132- resolve_entities=True|False
133- Whether to resolve named character entities that appear in the
134- source tree. The XML predefined entities < > & " '
135- are unaffected by this setting.
136- strip_whitespace=False|True
137- Whether to remove semantically meaningless whitespace. (This
138- compresses all whitespace to a single space except within pre.)
139- minimize_boolean_attributes=True|False
140- Shortens boolean attributes to give just the attribute value,
141- for example <input disabled="disabled"> becomes <input disabled>.
142- use_trailing_solidus=False|True
143- Includes a close-tag slash at the end of the start tag of void
144- elements (empty elements whose end tag is forbidden). E.g. <hr/>.
145- space_before_trailing_solidus=True|False
146- Places a space immediately before the closing slash in a tag
147- using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
148- sanitize=False|True
149- Strip all unsafe or unknown constructs from output.
150- See `html5lib user documentation`_
151- omit_optional_tags=True|False
152- Omit start/end tags that are optional.
153- alphabetical_attributes=False|True
154- Reorder attributes to be in alphabetical order.
155-
156- .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
136+ """Initialize HTMLSerializer
137+
138+ :arg inject_meta_charset: Whether or not to inject the meta charset.
139+
140+ Defaults to ``True``.
141+
142+ :arg quote_attr_values: Whether to quote attribute values that don't
143+ require quoting per legacy browser behavior (``"legacy"``), when
144+ required by the standard (``"spec"``), or always (``"always"``).
145+
146+ Defaults to ``"legacy"``.
147+
148+ :arg quote_char: Use given quote character for attribute quoting.
149+
150+ Defaults to ``"`` which will use double quotes unless attribute
151+ value contains a double quote, in which case single quotes are
152+ used.
153+
154+ :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
155+ values.
156+
157+ Defaults to ``False``.
158+
159+ :arg escape_rcdata: Whether to escape, within rcdata elements such as
160+ ``style``, the characters that need to be escaped within normal
161+ elements.
162+
163+ Defaults to ``False``.
164+
165+ :arg resolve_entities: Whether to resolve named character entities that
165+ appear in the source tree. The XML predefined entities &lt; &gt;
166+ &amp; &quot; &apos; are unaffected by this setting.
168+
169+ Defaults to ``True``.
170+
171+ :arg strip_whitespace: Whether to remove semantically meaningless
172+ whitespace. (This compresses all whitespace to a single space
173+ except within ``pre``.)
174+
175+ Defaults to ``False``.
176+
177+ :arg minimize_boolean_attributes: Shortens boolean attributes to give
178+ just the attribute value, for example::
179+
180+ <input disabled="disabled">
181+
182+ becomes::
183+
184+ <input disabled>
185+
186+ Defaults to ``True``.
187+
188+ :arg use_trailing_solidus: Includes a close-tag slash at the end of the
189+ start tag of void elements (empty elements whose end tag is
190+ forbidden). E.g. ``<hr/>``.
191+
192+ Defaults to ``False``.
193+
194+ :arg space_before_trailing_solidus: Places a space immediately before
195+ the closing slash in a tag using a trailing solidus. E.g.
196+ ``<hr />``. Requires ``use_trailing_solidus=True``.
197+
198+ Defaults to ``True``.
199+
200+ :arg sanitize: Strip all unsafe or unknown constructs from output.
201+ See :py:class:`html5lib.filters.sanitizer.Filter`.
202+
203+ Defaults to ``False``.
204+
205+ :arg omit_optional_tags: Omit start/end tags that are optional.
206+
207+ Defaults to ``True``.
208+
209+ :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
210+
211+ Defaults to ``False``.
212+
157213 """
158214unexpected_args = frozenset (kwargs )- frozenset (self .options )
159215if len (unexpected_args )> 0 :
@@ -317,6 +373,25 @@ def serialize(self, treewalker, encoding=None):
317373self .serializeError (token ["data" ])
318374
319375def render (self ,treewalker ,encoding = None ):
376+ """Serializes the stream from the treewalker into a string
377+
378+ :arg treewalker: the treewalker to serialize
379+
380+ :arg encoding: the string encoding to use
381+
382+ :returns: the serialized tree
383+
384+ Example:
385+
386+ >>> from html5lib import parse, getTreeWalker
387+ >>> from html5lib.serializer import HTMLSerializer
388+ >>> token_stream = parse('<html><body>Hi!</body></html>')
389+ >>> walker = getTreeWalker('etree')
390+ >>> serializer = HTMLSerializer(omit_optional_tags=False)
391+ >>> serializer.render(walker(token_stream))
392+ '<html><head></head><body>Hi!</body></html>'
393+
394+ """
320395if encoding :
321396return b"" .join (list (self .serialize (treewalker ,encoding )))
322397else :