1+ import _base
2+ import new
3+ import warnings
4+ from html5lib .constants import DataLossWarning
5+ import etree as etree_builders
6+ try :
7+ import lxml .html as etree
8+ except ImportError :
9+ import lxml .etree as etree
10+
# Module-level switch read by TreeBuilder.getDocument: when true the whole
# ElementTree is returned, otherwise only the tree's root element.
fullTree = True
12+
13+ """Module for supporting the lxml.etree library. The idea here is to use as much
14+ of the native library as possible, without using fragile hacks like custom element
15+ names that break between releases. The downside of this is that we cannot represent
16+ all possible trees; specifically the following are known to cause problems:
17+
18+ Text or comments as siblings of the root element
19+ Doctypes with mixed case names
Doctypes with no name
21+
22+ When any of these things occur, we emit a DataLossWarning
23+ """
24+
class DocumentType(object):
    """Plain record of a doctype's name, public identifier and system identifier.

    A DataLossWarning ("lxml does not preserve doctype case") is emitted when
    the name contains upper-case characters.

    name     -- doctype name; may be None or empty when the doctype has no name
    publicId -- public identifier, or None
    systemId -- system identifier, or None
    """

    def __init__(self, name, publicId=None, systemId=None):
        self.name = name
        # A missing name has no case to lose; guarding here also avoids
        # calling .lower() on None.
        if name and name != name.lower():
            warnings.warn("lxml does not preserve doctype case", DataLossWarning)
        self.publicId = publicId
        self.systemId = systemId
32+
class Document(object):
    """Stand-in document node holding an element tree and a child-node list."""

    def __init__(self):
        # Both start empty; they are filled in from outside this class.
        self._childNodes = []
        self._elementTree = None

    def appendChild(self, element):
        """Discard *element* and emit a DataLossWarning instead of storing it."""
        message = "lxml does not support comments as siblings of the root node"
        warnings.warn(message, DataLossWarning)

    def _getChildNodes(self):
        """Return the list that backs the read-only ``childNodes`` property."""
        return self._childNodes

    childNodes = property(fget=_getChildNodes)
45+
def testSerializer(element):
    """Render a tree as indented, ``|``-prefixed lines, one node per line.

    element -- either a document object (anything without a ``tag``
               attribute; it must provide ``docinfo`` and ``getroot()``) or
               an element/comment node.

    Returns the lines joined with newlines.  Each nesting level adds two
    spaces of indentation; attributes and text are shown one level below
    their element, and tail text is shown at the level of the node it
    follows.
    """
    rv = []

    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            # No ``tag`` attribute: treat as the document object.
            rv.append("#document")
            docinfo = element.docinfo
            if docinfo.internalDTD:
                # Fall back to a bare doctype when no doctype string is given.
                dtd_str = docinfo.doctype or "<!DOCTYPE %s>" % docinfo.root_name
                rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
            serializeElement(element.getroot(), indent + 2)
        elif type(element.tag) == type(etree.Comment):
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
        else:
            rv.append("|%s<%s>" % (' ' * indent, element.tag))
            if hasattr(element, "attrib"):
                for name, value in element.attrib.items():
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            # Iterating an element yields its children.
            for child in element:
                serializeElement(child, indent + 2)
        # Tail text is a sibling of the node, so it shares the node's own
        # indent -- for comments as well as for ordinary elements.
        if hasattr(element, "tail") and element.tail:
            rv.append("|%s\"%s\"" % (' ' * indent, element.tail))

    serializeElement(element, 0)
    return "\n".join(rv)
78+
def tostring(element):
    """Serialize an element and its child nodes to a string.

    element -- either a document object (anything without a ``tag``
               attribute; it must provide ``docinfo`` and ``getroot()``) or
               an element/comment node.

    Returns the markup as a single string.  Every ordinary element is
    written with an explicit end tag.
    NOTE: text and attribute values are interpolated as-is; no escaping is
    applied.
    """
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            # No ``tag`` attribute: treat as the document object.
            docinfo = element.docinfo
            if docinfo.internalDTD:
                # Fall back to a bare doctype when no doctype string is given.
                rv.append(docinfo.doctype or "<!DOCTYPE %s>" % docinfo.root_name)
            serializeElement(element.getroot())

        elif type(element.tag) == type(etree.Comment):
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            # Iterating an element yields its children.
            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        # Tail text follows the node's own markup.
        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)
    return "".join(rv)
121+
class TreeBuilder(_base.TreeBuilder):
    """Tree builder whose document is backed by an lxml element tree.

    Element, comment and fragment classes are taken from the generic etree
    builder module, instantiated for the lxml implementation imported by
    this file.  Comments seen before the root element and doctypes without a
    name cannot be stored; a DataLossWarning is emitted for them.
    """
    documentClass = Document
    doctypeClass = DocumentType
    # Filled in per instance by __init__.
    elementClass = None
    commentClass = None
    fragmentClass = None

    def __init__(self, fullTree=False):
        """Set up the node classes.

        fullTree -- forwarded to etree_builders.getETreeModule.
        """
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        self.elementClass = builder.Element
        self.commentClass = builder.Comment
        self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self)

    def reset(self):
        """Reset the base builder, forget any doctype and drop early comments again."""
        _base.TreeBuilder.reset(self)
        # Until insertRoot runs there is no tree to attach comments to.
        self.insertComment = self.insertCommentInitial
        self.doctype = None

    def testSerializer(self, element):
        """Return the module-level testSerializer dump of *element*."""
        return testSerializer(element)

    def getDocument(self):
        """Return the built element tree, or only its root element.

        NOTE(review): this reads the module-level ``fullTree`` flag, not the
        ``fullTree`` argument given to __init__ -- confirm that is intended.
        """
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        """Return the ``_element`` attribute of the fragment built by the base class."""
        return _base.TreeBuilder.getFragment(self)._element

    def insertDoctype(self, name, publicId, systemId):
        """Remember the doctype so insertRoot can write it into the initial document.

        A falsy *name* triggers a DataLossWarning.
        """
        if not name:
            warnings.warn("lxml cannot represent null doctype", DataLossWarning)
        doctype = self.doctypeClass(name)
        doctype.publicId = publicId
        doctype.systemId = systemId
        self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        """Discard a comment seen before the root element, with a DataLossWarning."""
        warnings.warn("lxml does not support comments as siblings of the root node", DataLossWarning)

    def _quoteDoctypeId(self, value):
        """Wrap a doctype identifier in a quote character it does not contain."""
        if '"' not in value:
            return '"%s"' % value
        if "'" in value:
            # Neither quote character can delimit the value unchanged.
            warnings.warn("lxml cannot represent doctype identifiers containing both quote characters", DataLossWarning)
            value = value.replace("'", "")
        return "'%s'" % value

    def _doctypeString(self):
        """Return the doctype declaration for the stored doctype, or "" if none can be written."""
        doctype = self.doctype
        # A doctype without a name cannot be written out; insertDoctype has
        # already warned about it.
        if not doctype or not doctype.name:
            return ""
        parts = ["<!DOCTYPE %s" % doctype.name]
        if doctype.publicId is not None:
            # A public identifier must be followed by a system identifier,
            # so an absent one is written as an empty literal.
            parts.append(" PUBLIC %s %s" % (self._quoteDoctypeId(doctype.publicId),
                                            self._quoteDoctypeId(doctype.systemId or "")))
        elif doctype.systemId:
            parts.append(" SYSTEM %s" % self._quoteDoctypeId(doctype.systemId))
        parts.append(">")
        return "".join(parts)

    def insertRoot(self, name):
        """Create the document root"""
        # Because of the way libxml2 works, it doesn't seem to be possible to alter information
        # like the doctype after the tree has been parsed. Therefore we need to use the built-in
        # parser to create our initial tree, after which we can add elements like normal
        docStr = self._doctypeString() + "<html></html>"

        root = etree.fromstring(docStr)

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Add the root element to the internal child/open data structures.
        # The wrapper is created for *name* but is re-pointed at the parsed
        # root, whose markup is the hard-coded <html></html> above.
        root_element = self.elementClass(name)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = super(TreeBuilder, self).insertComment