@@ -76,14 +76,12 @@ def htmlentityreplace_errors(exc):
7676
7777del register_error
7878
79- def encode (text ,encoding ):
80- return text .encode (encoding ,unicode_encode_errors )
8179
8280class HTMLSerializer (object ):
8381
8482# attribute quoting options
8583quote_attr_values = False
86- quote_char = '"'
84+ quote_char = u '"'
8785use_best_quote_char = True
8886
8987# tag syntax options
@@ -159,7 +157,22 @@ def __init__(self, **kwargs):
159157self .errors = []
160158self .strict = False
161159
def encode(self, string):
    """Encode ``string`` for output with the module's error handler.

    ``string`` must be a ``unicode`` object (checked with an assertion).

    Returns ``string`` unchanged when ``self.encoding`` is falsy; otherwise
    returns ``string`` encoded to ``self.encoding``, passing
    ``unicode_encode_errors`` as the codec error-handler name.
    """
    assert isinstance(string, unicode)
    # No output encoding configured: hand the text back untouched.
    if not self.encoding:
        return string
    return string.encode(self.encoding, unicode_encode_errors)
166+
def encodeStrict(self, string):
    """Encode ``string`` for output using the ``"strict"`` error handler.

    ``string`` must be a ``unicode`` object (checked with an assertion).

    Returns ``string`` unchanged when ``self.encoding`` is falsy; otherwise
    returns ``string`` encoded to ``self.encoding``. Because the handler is
    ``"strict"``, a character the codec cannot represent raises
    ``UnicodeEncodeError`` instead of being substituted.
    """
    assert isinstance(string, unicode)
    # No output encoding configured: hand the text back untouched.
    if not self.encoding:
        return string
    return string.encode(self.encoding, "strict")
173+
162174def serialize (self ,treewalker ,encoding = None ):
175+ self .encoding = encoding
163176in_cdata = False
164177self .errors = []
165178if encoding and self .inject_meta_charset :
@@ -195,27 +208,19 @@ def serialize(self, treewalker, encoding=None):
195208doctype += u" %s%s%s" % (quote_char ,token ["systemId" ],quote_char )
196209
197210doctype += u">"
198-
199- if encoding :
200- yield doctype .encode (encoding )
201- else :
202- yield doctype
211+ yield self .encodeStrict (doctype )
203212
204213elif type in ("Characters" ,"SpaceCharacters" ):
205214if type == "SpaceCharacters" or in_cdata :
206215if in_cdata and token ["data" ].find ("</" )>= 0 :
207216self .serializeError (_ ("Unexpected </ in CDATA" ))
208- if encoding :
209- yield token ["data" ].encode (encoding ,"strict" )
210- else :
211- yield token ["data" ]
212- elif encoding :
213- yield encode (escape (token ["data" ]),encoding )
217+ yield self .encode (token ["data" ])
214218else :
215- yield escape (token ["data" ])
219+ yield self . encode ( escape (token ["data" ]) )
216220
217221elif type in ("StartTag" ,"EmptyTag" ):
218222name = token ["name" ]
223+ yield self .encodeStrict (u"<%s" % name )
219224if name in rcdataElements and not self .escape_rcdata :
220225in_cdata = True
221226elif in_cdata :
@@ -225,69 +230,56 @@ def serialize(self, treewalker, encoding=None):
225230#TODO: Add namespace support here
226231k = attr_name
227232v = attr_value
228- if encoding :
229- k = k .encode (encoding ,"strict" )
230- attributes .append (' ' )
233+ yield self .encodeStrict (u' ' )
231234
232- attributes . append (k )
235+ yield self . encodeStrict (k )
233236if not self .minimize_boolean_attributes or \
234237 (k not in booleanAttributes .get (name ,tuple ()) \
235238and k not in booleanAttributes .get ("" ,tuple ())):
236- attributes . append ( "=" )
239+ yield self . encodeStrict ( u "=" )
237240if self .quote_attr_values or not v :
238241quote_attr = True
239242else :
240243quote_attr = reduce (lambda x ,y :x or (y in v ),
241- spaceCharacters + ">\" '=" ,False )
242- v = v .replace ("&" ,"&" )
243- if self .escape_lt_in_attrs :v = v .replace ("<" ,"<" )
244- if encoding :
245- v = encode (v ,encoding )
244+ spaceCharacters + u">\" '=" ,False )
245+ v = v .replace (u"&" ,u"&" )
246+ if self .escape_lt_in_attrs :v = v .replace (u"<" ,u"<" )
246247if quote_attr :
247248quote_char = self .quote_char
248249if self .use_best_quote_char :
249- if "'" in v and '"' not in v :
250- quote_char = '"'
251- elif '"' in v and "'" not in v :
252- quote_char = "'"
253- if quote_char == "'" :
254- v = v .replace ("'" ,"'" )
250+ if u "'"in v and u '"'not in v :
251+ quote_char = u '"'
252+ elif u '"'in v and u "'"not in v :
253+ quote_char = u "'"
254+ if quote_char == u "'" :
255+ v = v .replace (u "'" ,u "'" )
255256else :
256- v = v .replace ('"' ,""" )
257- attributes . append (quote_char )
258- attributes . append (v )
259- attributes . append (quote_char )
257+ v = v .replace (u '"' ,u """ )
258+ yield self . encodeStrict (quote_char )
259+ yield self . encode (v )
260+ yield self . encodeStrict (quote_char )
260261else :
261- attributes . append (v )
262+ yield self . encode (v )
262263if name in voidElements and self .use_trailing_solidus :
263264if self .space_before_trailing_solidus :
264- attributes . append ( " /" )
265+ yield self . encodeStrict ( u " /" )
265266else :
266- attributes .append ("/" )
267- if encoding :
268- yield "<%s%s>" % (name .encode (encoding ,"strict" ),"" .join (attributes ))
269- else :
270- yield u"<%s%s>" % (name ,u"" .join (attributes ))
267+ yield self .encodeStrict (u"/" )
268+ yield self .encode (u">" )
271269
272270elif type == "EndTag" :
273271name = token ["name" ]
274272if name in rcdataElements :
275273in_cdata = False
276274elif in_cdata :
277275self .serializeError (_ ("Unexpected child element of a CDATA element" ))
278- end_tag = u"</%s>" % name
279- if encoding :
280- end_tag = end_tag .encode (encoding ,"strict" )
281- yield end_tag
276+ yield self .encodeStrict (u"</%s>" % name )
282277
283278elif type == "Comment" :
284279data = token ["data" ]
285280if data .find ("--" )>= 0 :
286281self .serializeError (_ ("Comment contains --" ))
287- comment = u"<!--%s-->" % token ["data" ]
288- if encoding :
289- comment = comment .encode (encoding ,unicode_encode_errors )
290- yield comment
282+ yield self .encodeStrict (u"<!--%s-->" % token ["data" ])
291283
292284elif type == "Entity" :
293285name = token ["name" ]
@@ -298,9 +290,7 @@ def serialize(self, treewalker, encoding=None):
298290data = entities [key ]
299291else :
300292data = u"&%s;" % name
301- if encoding :
302- data = data .encode (encoding ,unicode_encode_errors )
303- yield data
293+ yield self .encodeStrict (data )
304294
305295else :
306296self .serializeError (token ["data" ])