Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b51828b

Browse files
committed
Allow for Python implementations that don't support lone surrogates (read: Jython).
This is based on earlier work by Jim Baker (thanks!). The two major parts of this are: * Avoiding having lone surrogates in any string literals, and * Avoiding tests that contain lone surrogates. As part of this, the decoder for double-escaped tokenizer tests is rewritten to avoid unicode_escape as that has bogus behaviour with non-ASCII characters.
1 parent b293489 commit b51828b

File tree

5 files changed

+87
-14
lines changed

5 files changed

+87
-14
lines changed

‎AUTHORS.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,3 +32,4 @@ Patches and suggestions
3232
- Juan Carlos Garcia Segovia
3333
- Mike West
3434
- Marc DM
35+
- Jim Baker

‎CHANGES.rst

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,10 @@ Change Log
44
0.9999
55
~~~~~~
66

7-
Released on XXX,2014
7+
Released on XXX,2015
88

9-
* XXX
9+
* Add support for Python implementations that don't support lone surrogates
10+
(read: Jython).
1011

1112

1213
0.999

‎html5lib/inputstream.py

Lines changed: 26 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,18 @@ class BufferedIOBase(object):
2828
asciiUppercaseBytes=frozenset([item.encode("ascii")foriteminasciiUppercase])
2929
spacesAngleBrackets=spaceCharactersBytes|frozenset([b">",b"<"])
3030

31-
invalid_unicode_re=re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
31+
32+
invalid_unicode_no_surrogate="[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
33+
34+
ifutils.supports_lone_surrogates:
35+
# Use one extra step of indirection and create surrogates with
36+
# unichr. Not using this indirection would introduce an illegal
37+
# unicode literal on platforms not supporting such lone
38+
# surrogates.
39+
invalid_unicode_re=re.compile(invalid_unicode_no_surrogate+
40+
eval('"\\uD800-\\uDFFF"'))
41+
else:
42+
invalid_unicode_re=re.compile(invalid_unicode_no_surrogate)
3243

3344
non_bmp_invalid_codepoints=set([0x1FFFE,0x1FFFF,0x2FFFE,0x2FFFF,0x3FFFE,
3445
0x3FFFF,0x4FFFE,0x4FFFF,0x5FFFE,0x5FFFF,
@@ -164,13 +175,18 @@ def __init__(self, source):
164175
165176
"""
166177

167-
# Craziness
168-
iflen("\U0010FFFF")==1:
178+
ifnotutils.supports_lone_surrogates:
179+
# Such platforms will have already checked for such
180+
# surrogate errors, so no need to do this checking.
181+
self.reportCharacterErrors=None
182+
self.replaceCharactersRegexp=None
183+
eliflen("\U0010FFFF")==1:
169184
self.reportCharacterErrors=self.characterErrorsUCS4
170-
self.replaceCharactersRegexp=re.compile("[\uD800-\uDFFF]")
185+
self.replaceCharactersRegexp=re.compile(eval('"[\\uD800-\\uDFFF]"'))
171186
else:
172187
self.reportCharacterErrors=self.characterErrorsUCS2
173-
self.replaceCharactersRegexp=re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
188+
self.replaceCharactersRegexp=re.compile(
189+
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
174190

175191
# List of where new lines occur
176192
self.newLines= [0]
@@ -265,11 +281,12 @@ def readChunk(self, chunkSize=None):
265281
self._bufferedCharacter=data[-1]
266282
data=data[:-1]
267283

268-
self.reportCharacterErrors(data)
284+
ifself.reportCharacterErrors:
285+
self.reportCharacterErrors(data)
269286

270-
# Replace invalid characters
271-
# Note U+0000 is dealt with in the tokenizer
272-
data=self.replaceCharactersRegexp.sub("\ufffd",data)
287+
# Replace invalid characters
288+
# Note U+0000 is dealt with in the tokenizer
289+
data=self.replaceCharactersRegexp.sub("\ufffd",data)
273290

274291
data=data.replace("\r\n","\n")
275292
data=data.replace("\r","\n")

‎html5lib/tests/test_tokenizer.py

Lines changed: 35 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,12 @@
44
importwarnings
55
importre
66

7+
fromsiximportunichr
8+
79
from .supportimportget_data_files
810

911
fromhtml5lib.tokenizerimportHTMLTokenizer
10-
fromhtml5libimportconstants
12+
fromhtml5libimportconstants,utils
1113

1214

1315
classTokenizerTestParser(object):
@@ -122,9 +124,38 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
122124
returntokens["expected"]==tokens["received"]
123125

124126

127+
_surrogateRe=re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
128+
129+
125130
defunescape(test):
126131
defdecode(inp):
127-
returninp.encode("utf-8").decode("unicode-escape")
132+
"""Decode\\uXXXX escapes
133+
134+
This decodes\\uXXXX escapes, possibly into non-BMP characters when
135+
two surrogate character escapes are adjacent to each other.
136+
"""
137+
# This cannot be implemented using the unicode_escape codec
138+
# because that requires its input be ISO-8859-1, and we need
139+
# arbitrary unicode as input.
140+
defrepl(m):
141+
ifm.group(2)isnotNone:
142+
high=int(m.group(1),16)
143+
low=int(m.group(2),16)
144+
if0xD800<=high<=0xDBFFand0xDC00<=low<=0xDFFF:
145+
cp= ((high-0xD800)<<10)+ (low-0xDC00)+0x10000
146+
returnunichr(cp)
147+
else:
148+
returnunichr(high)+unichr(low)
149+
else:
150+
returnunichr(int(m.group(1),16))
151+
try:
152+
return_surrogateRe.sub(repl,inp)
153+
exceptValueError:
154+
# This occurs when unichr throws ValueError, which should
155+
# only be for a lone-surrogate.
156+
ifutils.supports_lone_surrogates:
157+
raise
158+
returnNone
128159

129160
test["input"]=decode(test["input"])
130161
fortokenintest["output"]:
@@ -183,6 +214,8 @@ def testTokenizer():
183214
test["initialStates"]= ["Data state"]
184215
if'doubleEscaped'intest:
185216
test=unescape(test)
217+
iftest["input"]isNone:
218+
continue# Not valid input for this platform
186219
forinitialStateintest["initialStates"]:
187220
test["initialState"]=capitalize(initialState)
188221
yieldrunTokenizerTest,test

‎html5lib/utils.py

Lines changed: 22 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,14 +2,35 @@
22

33
fromtypesimportModuleType
44

5+
fromsiximporttext_type
6+
57
try:
68
importxml.etree.cElementTreeasdefault_etree
79
exceptImportError:
810
importxml.etree.ElementTreeasdefault_etree
911

1012

1113
__all__= ["default_etree","MethodDispatcher","isSurrogatePair",
12-
"surrogatePairToCodepoint","moduleFactoryFactory"]
14+
"surrogatePairToCodepoint","moduleFactoryFactory",
15+
"supports_lone_surrogates"]
16+
17+
18+
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
19+
# caught by the below test. In general this would be any platform
20+
# using UTF-16 as its encoding of unicode strings, such as
21+
# Jython. This is because UTF-16 itself is based on the use of such
22+
# surrogates, and there is no mechanism to further escape such
23+
# escapes.
24+
try:
25+
_x=eval('"\\uD800"')
26+
ifnotisinstance(_x,text_type):
27+
# We need this with u"" because of http://bugs.jython.org/issue2039
28+
_x=eval('u"\\uD800"')
29+
assertisinstance(_x,text_type)
30+
except:
31+
supports_lone_surrogates=False
32+
else:
33+
supports_lone_surrogates=True
1334

1435

1536
classMethodDispatcher(dict):

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp