Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit9cc3c2a

Browse files
committed
Fix for issue 143; deal with handling non-BMP codepoints in serialization to non-unicode encodings.
1 parent4fa9fda commit9cc3c2a

File tree

4 files changed

+42
-14
lines changed

4 files changed

+42
-14
lines changed

‎src/html5lib/inputstream.py‎

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
fromconstantsimportEOF,spaceCharacters,asciiLetters,asciiUppercase
77
fromconstantsimportencodings,ReparseException
8+
importutils
89

910
#Non-unicode versions of constants for use in the pre-parser
1011
spaceCharactersBytes=frozenset([str(item)foriteminspaceCharacters])
@@ -381,14 +382,9 @@ def characterErrorsUCS2(self, data):
381382
codepoint=ord(match.group())
382383
pos=match.start()
383384
#Pretty sure there should be endianness issues here
384-
if (codepoint>=0xD800andcodepoint<=0xDBFFand
385-
pos<len(data)-1and
386-
ord(data[pos+1])>=0xDC00and
387-
ord(data[pos+1])<=0xDFFF):
385+
ifutils.isSurrogatePair(data[pos:pos+2]):
388386
#We have a surrogate pair!
389-
#From a perl manpage
390-
char_val= (0x10000+ (codepoint-0xD800)*0x400+
391-
(ord(data[pos+1])-0xDC00))
387+
char_val=utils.surrogatePairToCodepoint(data[pos:pos+2])
392388
ifchar_valinnon_bmp_invalid_codepoints:
393389
self.errors.append("invalid-codepoint")
394390
skip=True

‎src/html5lib/serializer/htmlserializer.py‎

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
fromhtml5lib.constantsimportvoidElements,booleanAttributes,spaceCharacters
1111
fromhtml5lib.constantsimportrcdataElements,entities,xmlEntities
12-
12+
fromhtml5libimportutils
1313
fromxml.sax.saxutilsimportescape
1414

1515
spaceCharacters=u"".join(spaceCharacters)
@@ -27,20 +27,33 @@
2727
for k, v in entities.items():
    # encode_entity_map is keyed by codepoint (ord(v)), so the existing-entry
    # lookup has to use the same key. Looking up the raw character never
    # matches an integer key, which would disable the lowercase preference
    # below and let whichever spelling is iterated last win.
    # NOTE(review): ord(v) requires every entity value to be a single
    # character -- confirm against the entities table.
    if v != "&" and encode_entity_map.get(ord(v)) != k.lower():
        # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
        encode_entity_map[ord(v)] = k
3131

3232
def htmlentityreplace_errors(exc):
    """Replace unencodable characters with HTML entity references.

    Shaped like a codecs error handler: it receives the exception and
    returns a ``(replacement, resume_position)`` tuple. (The registration
    call itself is not visible in this part of the file.)

    For UnicodeEncodeError / UnicodeTranslateError, every character in
    ``exc.object[exc.start:exc.end]`` is turned into a codepoint. A high
    surrogate immediately followed by a low surrogate is combined into a
    single non-BMP codepoint first. Each codepoint is then emitted as a
    named entity when ``encode_entity_map`` has one, otherwise as a
    hexadecimal numeric character reference.

    Any other exception type is delegated to ``xmlcharrefreplace_errors``.
    """
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        res = []
        codepoints = []
        skip = False
        for i, c in enumerate(exc.object[exc.start:exc.end]):
            if skip:
                # Second half of a surrogate pair consumed on the previous
                # iteration.
                skip = False
                continue
            index = i + exc.start
            # Clamp the look-ahead to exc.end so a pair is only recognised
            # when both halves lie inside the failing range.
            if utils.isSurrogatePair(exc.object[index:min(exc.end, index + 2)]):
                codepoint = utils.surrogatePairToCodepoint(
                    exc.object[index:index + 2])
                skip = True
            else:
                codepoint = ord(c)
            codepoints.append(codepoint)
        for cp in codepoints:
            e = encode_entity_map.get(cp)
            if e:
                res.append("&")
                res.append(e)
                # Some entity names are stored without the trailing ';'.
                if not e.endswith(";"):
                    res.append(";")
            else:
                # Lower-case hex digits, no "0x" prefix.
                res.append("&#x%x;" % cp)
        return (u"".join(res), exc.end)
    else:
        return xmlcharrefreplace_errors(exc)

‎src/html5lib/utils.py‎

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,4 +153,23 @@ def __deepcopy__(self, memo={}):
153153
result=self.__class__()
154154
memo[id(self)]=result
155155
result.__init__(deepcopy(tuple(self),memo))
156-
returnresult
156+
returnresult
157+
158+
#Some utility functions to deal with weirdness around UCS2 vs UCS4
159+
#python builds
160+
161+
def encodingType():
    """Report whether this interpreter is a UCS2 or UCS4 build.

    On a narrow (UCS2) build a codepoint above U+FFFF is stored as a
    surrogate pair, so a one-character non-BMP literal has length 2.
    On a wide (UCS4) build the same literal has length 1.

    Returns:
        The string "UCS2" or "UCS4".
    """
    # len() must be given something to measure; probe with the highest
    # Unicode codepoint, which is guaranteed to lie outside the BMP.
    if len(u"\U0010FFFF") == 2:
        return "UCS2"
    else:
        return "UCS4"
166+
167+
def isSurrogatePair(data):
    """Return True if *data* is a high surrogate followed by a low surrogate.

    Args:
        data: a sequence of characters (typically a two-character slice).

    Returns:
        True only when *data* has exactly two items, the first in the
        range U+D800..U+DBFF and the second in U+DC00..U+DFFF; False for
        any other length or content.
    """
    # The length test comes first so the indexing below cannot fail on
    # short input.
    return (len(data) == 2 and
            0xD800 <= ord(data[0]) <= 0xDBFF and
            0xDC00 <= ord(data[1]) <= 0xDFFF)
171+
172+
def surrogatePairToCodepoint(data):
    """Combine a UTF-16 surrogate pair into the codepoint it encodes.

    Args:
        data: a two-item sequence, high surrogate first and low surrogate
            second. The input is not validated here; callers check it with
            isSurrogatePair beforehand.

    Returns:
        The integer codepoint (0x10000 or above for a valid pair).
    """
    # Each surrogate carries 10 bits of the value offset from 0x10000.
    high_bits = ord(data[0]) - 0xD800
    low_bits = ord(data[1]) - 0xDC00
    return 0x10000 + high_bits * 0x400 + low_bits

‎tests/test_serializer.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def buildTestSuite():
132132
allTests.append(unittest.TestLoader().loadTestsFromTestCase(LxmlTestCase))
133133

134134
returnunittest.TestSuite(allTests)
135-
135+
136136

137137
defmain():
138138
buildTestSuite()

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp