Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit e834288

Browse files
committed
Work with lxml entities (based on patch by fantasai)
1 parent 6fdd5d7, commit e834288

File tree

5 files changed

+78
-5
lines changed

5 files changed

+78
-5
lines changed

‎src/html5lib/constants.py‎

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -508,6 +508,8 @@
508508
376# 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
509509
)
510510

511+
# Entity names (each including its trailing ";") that the HTML serializer
# always writes back out as "&name;" references instead of replacing them
# with the character they stand for, even when entity resolution is enabled.
xmlEntities=frozenset(('lt;','gt;','amp;','apos;','quot;'))
512+
511513
entities= {
512514
"AElig;":u"\u00C6",
513515
"AElig":u"\u00C6",

‎src/html5lib/serializer/htmlserializer.py‎

Lines changed: 23 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
_=gettext.gettext
99

1010
fromhtml5lib.constantsimportvoidElements,booleanAttributes,spaceCharacters
11-
fromhtml5lib.constantsimportrcdataElements
11+
fromhtml5lib.constantsimportrcdataElements,entities,xmlEntities
1212

1313
fromxml.sax.saxutilsimportescape
1414

@@ -54,26 +54,32 @@ def encode(text, encoding):
5454

5555
classHTMLSerializer(object):
5656

57+
# attribute quoting options
5758
quote_attr_values=False
5859
quote_char='"'
5960
use_best_quote_char=True
60-
minimize_boolean_attributes=True
6161

62+
# tag syntax options
63+
omit_optional_tags=True
64+
minimize_boolean_attributes=True
6265
use_trailing_solidus=False
6366
space_before_trailing_solidus=True
67+
68+
# escaping options
6469
escape_lt_in_attrs=False
6570
escape_rcdata=False
71+
resolve_entities=True
6672

73+
# miscellaneous options
6774
inject_meta_charset=True
6875
strip_whitespace=False
6976
sanitize=False
70-
omit_optional_tags=True
7177

7278
options= ("quote_attr_values","quote_char","use_best_quote_char",
7379
"minimize_boolean_attributes","use_trailing_solidus",
7480
"space_before_trailing_solidus","omit_optional_tags",
7581
"strip_whitespace","inject_meta_charset","escape_lt_in_attrs",
76-
"escape_rcdata",'use_trailing_solidus',"sanitize")
82+
"escape_rcdata","resolve_entities","sanitize")
7783

7884
def__init__(self,**kwargs):
7985
ifkwargs.has_key('quote_char'):
@@ -214,6 +220,19 @@ def serialize(self, treewalker, encoding=None):
214220
comment=comment.encode(encoding,unicode_encode_errors)
215221
yieldcomment
216222

223+
eliftype=="Entity":
224+
name=token["name"]
225+
key=name+";"
226+
ifnotkeyinentities:
227+
self.serializeError(_("Entity %s not recognized"%name))
228+
ifself.resolve_entitiesandkeynotinxmlEntities:
229+
data=entities[key]
230+
else:
231+
data=u"&%s;"%name
232+
ifencoding:
233+
data=data.encode(encoding,unicode_encode_errors)
234+
yielddata
235+
217236
else:
218237
self.serializeError(token["data"])
219238

‎src/html5lib/treewalkers/_base.py‎

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,9 @@ def doctype(self, name, publicId=None, systemId=None, correct=True):
6464
"systemId":systemId,
6565
"correct":correct}
6666

67+
def entity(self, name):
    """Build the token describing an entity reference.

    ``name`` is coerced to ``unicode`` and stored under the "name" key of
    a token whose "type" is "Entity".
    """
    token = {"type": "Entity"}
    token["name"] = unicode(name)
    return token
69+
6770
defunknown(self,nodeType):
6871
returnself.error(_("Unknown node type: ")+nodeType)
6972

@@ -89,6 +92,7 @@ def element(self, node, namespace, name, attrs, hasChildren):
8992
TEXT=Node.TEXT_NODE
9093
ELEMENT=Node.ELEMENT_NODE
9194
COMMENT=Node.COMMENT_NODE
95+
ENTITY=Node.ENTITY_NODE
9296
UNKNOWN="<#UNKNOWN#>"
9397

9498
classNonRecursiveTreeWalker(TreeWalker):
@@ -133,6 +137,9 @@ def __iter__(self):
133137
eliftype==COMMENT:
134138
yieldself.comment(details[0])
135139

140+
eliftype==ENTITY:
141+
yieldself.entity(details[0])
142+
136143
eliftype==DOCUMENT:
137144
hasChildren=True
138145

‎src/html5lib/treewalkers/lxmletree.py‎

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -126,6 +126,9 @@ def getNodeDetails(self, node):
126126
elifnode.tag==etree.Comment:
127127
return_base.COMMENT,node.text
128128

129+
elifnode.tag==etree.Entity:
130+
return_base.ENTITY,node.text[1:-1]# strip &;
131+
129132
else:
130133
#This is assumed to be an ordinary element
131134
match=tag_regexp.match(node.tag)

‎tests/test_serializer.py‎

Lines changed: 43 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,18 @@
22
importunittest
33
fromsupportimportsimplejson,html5lib_test_files
44

5+
importhtml5lib
56
fromhtml5libimporthtml5parser,serializer,constants
67
fromhtml5lib.treewalkers._baseimportTreeWalker
78

9+
optionals_loaded= []
10+
11+
try:
12+
fromlxmlimportetree
13+
optionals_loaded.append("lxml")
14+
exceptImportError:
15+
pass
16+
817
default_namespace=constants.namespaces["html"]
918

1019
classJsonWalker(TreeWalker):
@@ -80,7 +89,32 @@ def serialize_xhtml(self, input, options):
8089
returnu''.join(serializer.XHTMLSerializer(**options).
8190
serialize(JsonWalker(input),options.get("encoding",None)))
8291

83-
defbuildTestSuite():
92+
class LxmlTestCase(unittest.TestCase):
    """Serializer tests for entity references held in an lxml tree.

    Every document is parsed with ``resolve_entities=False`` and then run
    through ``serializer.serialize`` using the "lxml" tree walker with
    optional-tag omission switched off.
    """

    def setUp(self):
        # Fixtures shared by the tests; the parser is the one the tests use
        # to build their trees.
        self.parser = etree.XMLParser(resolve_entities=False)
        self.treewalker = html5lib.getTreeWalker("lxml")
        self.serializer = serializer.HTMLSerializer()

    def _serialize(self, doc, **options):
        """Parse ``doc`` with the fixture parser and serialize its tree.

        Extra keyword arguments are forwarded to ``serializer.serialize``
        as serializer options.
        """
        tree = etree.fromstring(doc, parser=self.parser).getroottree()
        return serializer.serialize(tree, tree="lxml",
                                    omit_optional_tags=False, **options)

    # NOTE: the test methods carry comments rather than docstrings so that
    # unittest keeps reporting them by method name.

    def testEntityReplacement(self):
        # With default options, &beta; is replaced by U+03B2.
        doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
        result = self._serialize(doc)
        self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)

    def testEntityXML(self):
        # &gt; must come back out as a reference, not as a bare ">".
        doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
        result = self._serialize(doc)
        self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)

    def testEntityNoResolve(self):
        # With resolve_entities=False, &beta; is left as a reference.
        doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
        result = self._serialize(doc, resolve_entities=False)
        self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
116+
117+
defbuildBasicTestSuite():
84118
forfilenameinhtml5lib_test_files('serializer','*.test'):
85119
test_name=os.path.basename(filename).replace('.test','')
86120
tests=simplejson.load(file(filename))
@@ -92,6 +126,14 @@ def buildTestSuite():
92126
test.get("options", {}))
93127
returnunittest.TestLoader().loadTestsFromTestCase(TestCase)
94128

129+
def buildTestSuite():
    """Assemble the full serializer test suite.

    The data-driven suite from ``buildBasicTestSuite`` is always included;
    the ``LxmlTestCase`` tests are added only when "lxml" was recorded in
    ``optionals_loaded``.
    """
    suites = [buildBasicTestSuite()]
    lxml_available = "lxml" in optionals_loaded
    if lxml_available:
        loader = unittest.TestLoader()
        suites.append(loader.loadTestsFromTestCase(LxmlTestCase))
    return unittest.TestSuite(suites)
135+
136+
95137
defmain():
96138
buildTestSuite()
97139
unittest.main()

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp