Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commite51c9f6

Browse files
committed
Experimental new approach to lxml.etree that seems to fit better with the library philosophy but can't represent all possible html documents
--HG--extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401092
1 parent447b711 commite51c9f6

File tree

5 files changed

+204
-3
lines changed

5 files changed

+204
-3
lines changed

‎src/html5lib/constants.py‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,3 +1041,6 @@
10411041
"tis-620",
10421042
"hz-gb-2312",
10431043
))
1044+
1045+
classDataLossWarning(UserWarning):
1046+
pass

‎src/html5lib/html5parser.py‎

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -427,9 +427,7 @@ def processEndTag(self, name):
427427
classRootElementPhase(Phase):
428428
# helper methods
429429
definsertHtmlElement(self):
430-
element=self.tree.createElement("html", {})
431-
self.tree.openElements.append(element)
432-
self.tree.document.appendChild(element)
430+
self.tree.insertRoot("html")
433431
self.parser.phase=self.parser.phases["beforeHead"]
434432

435433
# other

‎src/html5lib/treebuilders/__init__.py‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
5858
eliftreeType=="beautifulsoup":
5959
importsoup
6060
treeBuilderCache[treeType]=soup.TreeBuilder
61+
eliftreeType=="lxml":
62+
importetree_lxml
63+
treeBuilderCache[treeType]=etree_lxml.TreeBuilder
6164
eliftreeType=="etree":
6265
importetree
6366
# XXX: NEVER cache here, caching is done in the etree submodule

‎src/html5lib/treebuilders/_base.py‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,11 @@ def elementInActiveFormattingElements(self, name):
207207
returnitem
208208
returnFalse
209209

210+
definsertRoot(self,name):
211+
element=self.createElement("html", {})
212+
self.openElements.append(element)
213+
self.document.appendChild(element)
214+
210215
definsertDoctype(self,name,publicId,systemId):
211216
doctype=self.doctypeClass(name)
212217
doctype.publicId=publicId
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import_base
2+
importnew
3+
importwarnings
4+
fromhtml5lib.constantsimportDataLossWarning
5+
importetreeasetree_builders
6+
try:
7+
importlxml.htmlasetree
8+
exceptImportError:
9+
importlxml.etreeasetree
10+
11+
fullTree=True
12+
13+
"""Module for supporting the lxml.etree library. The idea here is to use as much
14+
of the native library as possible, without using fragile hacks like custom element
15+
names that break between releases. The downside of this is that we cannot represent
16+
all possible trees; specifically the following are known to cause problems:
17+
18+
Text or comments as siblings of the root element
19+
Doctypes with mixed case names
20+
Doctypes with no name
21+
22+
When any of these things occur, we emit a DataLossWarning
23+
"""
24+
25+
classDocumentType(object):
26+
def__init__(self,name,publicId=None,systemId=None):
27+
self.name=name
28+
ifname!=name.lower():
29+
warnings.warn("lxml does not preserve doctype case",DataLossWarning)
30+
self.publicId=publicId
31+
self.systemId=systemId
32+
33+
classDocument(object):
34+
def__init__(self):
35+
self._elementTree=None
36+
self._childNodes= []
37+
38+
defappendChild(self,element):
39+
warnings.warn("lxml does not support comments as siblings of the root node",DataLossWarning)
40+
41+
def_getChildNodes(self):
42+
returnself._childNodes
43+
44+
childNodes=property(_getChildNodes)
45+
46+
deftestSerializer(element):
47+
rv= []
48+
finalText=None
49+
defserializeElement(element,indent=0):
50+
ifnothasattr(element,"tag"):
51+
rv.append("#document")
52+
ifelement.docinfo.internalDTD:
53+
dtd_str=element.docinfo.doctype
54+
ifnotdtd_str:
55+
dtd_str="<!DOCTYPE %s>"%element.docinfo.root_name
56+
rv.append("|%s%s"%(' '*(indent+2),dtd_str))
57+
serializeElement(element.getroot(),indent+2)
58+
eliftype(element.tag)==type(etree.Comment):
59+
rv.append("|%s<!-- %s -->"%(' '*indent,element.text))
60+
else:
61+
rv.append("|%s<%s>"%(' '*indent,element.tag))
62+
ifhasattr(element,"attrib"):
63+
forname,valueinelement.attrib.iteritems():
64+
rv.append('|%s%s="%s"'% (' '*(indent+2),name,value))
65+
ifelement.text:
66+
rv.append("|%s\"%s\""%(' '*(indent+2),element.text))
67+
indent+=2
68+
forchildinelement.getchildren():
69+
serializeElement(child,indent)
70+
ifhasattr(element,"tail")andelement.tail:
71+
rv.append("|%s\"%s\""%(' '*(indent-2),element.tail))
72+
serializeElement(element,0)
73+
74+
iffinalTextisnotNone:
75+
rv.append("|%s\"%s\""%(' '*2,finalText))
76+
77+
return"\n".join(rv)
78+
79+
deftostring(element):
80+
"""Serialize an element and its child nodes to a string"""
81+
rv= []
82+
finalText=None
83+
defserializeElement(element):
84+
ifnothasattr(element,"tag"):
85+
ifelement.docinfo.internalDTD:
86+
ifelement.docinfo.doctype:
87+
dtd_str=element.docinfo.doctype
88+
else:
89+
dtd_str="<!DOCTYPE %s>"%element.docinfo.root_name
90+
rv.append(dtd_str)
91+
serializeElement(element.getroot())
92+
93+
eliftype(element.tag)==type(etree.Comment):
94+
rv.append("<!--%s-->"%(element.text,))
95+
96+
else:
97+
#This is assumed to be an ordinary element
98+
ifnotelement.attrib:
99+
rv.append("<%s>"%(element.tag,))
100+
else:
101+
attr=" ".join(["%s=\"%s\""%(name,value)
102+
forname,valueinelement.attrib.iteritems()])
103+
rv.append("<%s %s>"%(element.tag,attr))
104+
ifelement.text:
105+
rv.append(element.text)
106+
107+
forchildinelement.getchildren():
108+
serializeElement(child)
109+
110+
rv.append("</%s>"%(element.tag,))
111+
112+
ifhasattr(element,"tail")andelement.tail:
113+
rv.append(element.tail)
114+
115+
serializeElement(element)
116+
117+
iffinalTextisnotNone:
118+
rv.append("%s\""%(' '*2,finalText))
119+
120+
return"".join(rv)
121+
122+
classTreeBuilder(_base.TreeBuilder):
123+
documentClass=Document
124+
doctypeClass=DocumentType
125+
elementClass=None
126+
commentClass=None
127+
fragmentClass=None
128+
129+
def__init__(self,fullTree=False):
130+
builder=etree_builders.getETreeModule(etree,fullTree=fullTree)
131+
self.elementClass=builder.Element
132+
self.commentClass=builder.Comment
133+
self.fragmentClass=builder.DocumentFragment
134+
_base.TreeBuilder.__init__(self)
135+
136+
defreset(self):
137+
_base.TreeBuilder.reset(self)
138+
self.insertComment=self.insertCommentInitial
139+
self.doctype=None
140+
141+
deftestSerializer(self,element):
142+
returntestSerializer(element)
143+
144+
defgetDocument(self):
145+
iffullTree:
146+
returnself.document._elementTree
147+
else:
148+
returnself.document._elementTree.getroot()
149+
150+
defgetFragment(self):
151+
return_base.TreeBuilder.getFragment(self)._element
152+
153+
definsertDoctype(self,name,publicId,systemId):
154+
ifnotname:
155+
warnings.warn("lxml cannot represent null doctype",DataLossWarning)
156+
doctype=self.doctypeClass(name)
157+
doctype.publicId=publicId
158+
doctype.systemId=systemId
159+
self.doctype=doctype
160+
161+
definsertCommentInitial(self,data,parent=None):
162+
warnings.warn("lxml does not support comments as siblings of the root node",DataLossWarning)
163+
164+
definsertRoot(self,name):
165+
"""Create the document root"""
166+
#Because of the way libxml2 works, it doesn't seem to be possible to alter information
167+
#like the doctype after the tree has been parsed. Therefore we need to use the built-in
168+
#parser to create our initial tree, after which we can add elements like normal
169+
docStr=""
170+
ifself.doctype:
171+
docStr+="<!DOCTYPE %s"%self.doctype.name
172+
ifself.doctype.publicIdisnotNone:
173+
docStr+="PUBLIC %s"%self.doctype.publicId
174+
ifself.doctype.systemId:
175+
docStr+="SYSTEM %s"%self.doctype.systemId
176+
docStr+=">"
177+
docStr+="<html></html>"
178+
179+
root=etree.fromstring(docStr)
180+
181+
#Create the root document and add the ElementTree to it
182+
self.document=self.documentClass()
183+
self.document._elementTree=root.getroottree()
184+
185+
#Add the root element to the internal child/open data structures
186+
root_element=self.elementClass(name)
187+
root_element._element=root
188+
self.document._childNodes.append(root_element)
189+
self.openElements.append(root_element)
190+
191+
#Reset to the default insert comment function
192+
self.insertComment=super(TreeBuilder,self).insertComment

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp