Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 93d5302

Browse files
committed
fix(unicode): use surrogateescape in bytes.decode
That way, we will try to decode as default encoding (usually utf-8), but allow ourselves to simply keep bytes that don't match within the resulting unicode string. That way, we allow for lossless decode/encode cycles while still assuring that decoding never fails.
NOTE: I was too lazy to create a test that would verify it, but manually executed https://github.com/petertodd/gitpython-unicode-error.
fixes #532
1 parent ff389af commit 93d5302

File tree

3 files changed

+193
-7
lines changed

3 files changed

+193
-7
lines changed

‎git/compat.py

Lines changed: 191 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,8 @@
1010
importlocale
1111
importos
1212
importsys
13+
importcodecs
14+
1315

1416
fromgitdb.utils.compatimport (
1517
xrange,
@@ -67,7 +69,7 @@ def safe_decode(s):
6769
ifisinstance(s,unicode):
6870
returns
6971
elifisinstance(s,bytes):
70-
returns.decode(defenc,'replace')
72+
returns.decode(defenc,'surrogateescape')
7173
elifsisnotNone:
7274
raiseTypeError('Expected bytes or text, but got %r'% (s,))
7375

@@ -121,3 +123,191 @@ def __str__(self):
121123
else:# Python 2
122124
def__str__(self):
123125
returnself.__unicode__().encode(defenc)
126+
127+
128+
"""
129+
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
130+
handler of Python 3.
131+
Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
132+
"""
133+
134+
# This code is released under the Python license and the BSD 2-clause license
135+
136+
137+
FS_ERRORS='surrogateescape'
138+
139+
# # -- Python 2/3 compatibility -------------------------------------
140+
# FS_ERRORS = 'my_surrogateescape'
141+
142+
defu(text):
143+
ifPY3:
144+
returntext
145+
else:
146+
returntext.decode('unicode_escape')
147+
148+
defb(data):
149+
ifPY3:
150+
returndata.encode('latin1')
151+
else:
152+
returndata
153+
154+
ifPY3:
155+
_unichr=chr
156+
bytes_chr=lambdacode:bytes((code,))
157+
else:
158+
_unichr=unichr
159+
bytes_chr=chr
160+
161+
defsurrogateescape_handler(exc):
162+
"""
163+
Pure Python implementation of the PEP 383: the "surrogateescape" error
164+
handler of Python 3. Undecodable bytes will be replaced by a Unicode
165+
character U+DCxx on decoding, and these are translated into the
166+
original bytes on encoding.
167+
"""
168+
mystring=exc.object[exc.start:exc.end]
169+
170+
try:
171+
ifisinstance(exc,UnicodeDecodeError):
172+
# mystring is a byte-string in this case
173+
decoded=replace_surrogate_decode(mystring)
174+
elifisinstance(exc,UnicodeEncodeError):
175+
# In the case of u'\udcc3'.encode('ascii',
176+
# 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
177+
# exception anyway after this function is called, even though I think
178+
# it's doing what it should. It seems that the strict encoder is called
179+
# to encode the unicode string that this function returns ...
180+
decoded=replace_surrogate_encode(mystring)
181+
else:
182+
raiseexc
183+
exceptNotASurrogateError:
184+
raiseexc
185+
return (decoded,exc.end)
186+
187+
188+
classNotASurrogateError(Exception):
189+
pass
190+
191+
192+
defreplace_surrogate_encode(mystring):
193+
"""
194+
Returns a (unicode) string, not the more logical bytes, because the codecs
195+
register_error functionality expects this.
196+
"""
197+
decoded= []
198+
forchinmystring:
199+
# if PY3:
200+
# code = ch
201+
# else:
202+
code=ord(ch)
203+
204+
# The following magic comes from Py3.3's Python/codecs.c file:
205+
ifnot0xD800<=code<=0xDCFF:
206+
# Not a surrogate. Fail with the original exception.
207+
raiseexc
208+
# mybytes = [0xe0 | (code >> 12),
209+
# 0x80 | ((code >> 6) & 0x3f),
210+
# 0x80 | (code & 0x3f)]
211+
# Is this a good idea?
212+
if0xDC00<=code<=0xDC7F:
213+
decoded.append(_unichr(code-0xDC00))
214+
elifcode<=0xDCFF:
215+
decoded.append(_unichr(code-0xDC00))
216+
else:
217+
raiseNotASurrogateError
218+
returnstr().join(decoded)
219+
220+
221+
defreplace_surrogate_decode(mybytes):
222+
"""
223+
Returns a (unicode) string
224+
"""
225+
decoded= []
226+
forchinmybytes:
227+
# We may be parsing newbytes (in which case ch is an int) or a native
228+
# str on Py2
229+
ifisinstance(ch,int):
230+
code=ch
231+
else:
232+
code=ord(ch)
233+
if0x80<=code<=0xFF:
234+
decoded.append(_unichr(0xDC00+code))
235+
elifcode<=0x7F:
236+
decoded.append(_unichr(code))
237+
else:
238+
# # It may be a bad byte
239+
# # Try swallowing it.
240+
# continue
241+
# print("RAISE!")
242+
raiseNotASurrogateError
243+
returnstr().join(decoded)
244+
245+
246+
defencodefilename(fn):
247+
ifFS_ENCODING=='ascii':
248+
# ASCII encoder of Python 2 expects that the error handler returns a
249+
# Unicode string encodable to ASCII, whereas our surrogateescape error
250+
# handler has to return bytes in 0x80-0xFF range.
251+
encoded= []
252+
forindex,chinenumerate(fn):
253+
code=ord(ch)
254+
ifcode<128:
255+
ch=bytes_chr(code)
256+
elif0xDC80<=code<=0xDCFF:
257+
ch=bytes_chr(code-0xDC00)
258+
else:
259+
raiseUnicodeEncodeError(FS_ENCODING,
260+
fn,index,index+1,
261+
'ordinal not in range(128)')
262+
encoded.append(ch)
263+
returnbytes().join(encoded)
264+
elifFS_ENCODING=='utf-8':
265+
# UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
266+
# doesn't go through our error handler
267+
encoded= []
268+
forindex,chinenumerate(fn):
269+
code=ord(ch)
270+
if0xD800<=code<=0xDFFF:
271+
if0xDC80<=code<=0xDCFF:
272+
ch=bytes_chr(code-0xDC00)
273+
encoded.append(ch)
274+
else:
275+
raiseUnicodeEncodeError(
276+
FS_ENCODING,
277+
fn,index,index+1,'surrogates not allowed')
278+
else:
279+
ch_utf8=ch.encode('utf-8')
280+
encoded.append(ch_utf8)
281+
returnbytes().join(encoded)
282+
else:
283+
returnfn.encode(FS_ENCODING,FS_ERRORS)
284+
285+
defdecodefilename(fn):
286+
returnfn.decode(FS_ENCODING,FS_ERRORS)
287+
288+
FS_ENCODING='ascii';fn=b('[abc\xff]');encoded=u('[abc\udcff]')
289+
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
290+
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
291+
292+
293+
# normalize the filesystem encoding name.
294+
# For example, we expect "utf-8", not "UTF8".
295+
FS_ENCODING=codecs.lookup(FS_ENCODING).name
296+
297+
298+
defregister_surrogateescape():
299+
"""
300+
Registers the surrogateescape error handler on Python 2 (only)
301+
"""
302+
ifPY3:
303+
return
304+
try:
305+
codecs.lookup_error(FS_ERRORS)
306+
exceptLookupError:
307+
codecs.register_error(FS_ERRORS,surrogateescape_handler)
308+
309+
310+
try:
311+
"hello".decode(defenc,"surrogateescape")
312+
except:
313+
register_surrogateescape()

‎git/objects/fun.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -76,11 +76,7 @@ def tree_entries_from_data(data):
7676
# default encoding for strings in git is utf8
7777
# Only use the respective unicode object if the byte stream was encoded
7878
name=data[ns:i]
79-
try:
80-
name=name.decode(defenc)
81-
exceptUnicodeDecodeError:
82-
pass
83-
# END handle encoding
79+
name=name.decode(defenc,'surrogateescape')
8480

8581
# byte is NULL, get next 20
8682
i+=1

‎git/test/performance/test_commit.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ def test_iteration(self):
5252
# END for each object
5353
# END for each commit
5454
elapsed_time=time()-st
55-
print("Traversed %i Trees and a total of %iunchached objects in %s [s] ( %f objs/s )"
55+
print("Traversed %i Trees and a total of %iuncached objects in %s [s] ( %f objs/s )"
5656
% (nc,no,elapsed_time,no/elapsed_time),file=sys.stderr)
5757

5858
deftest_commit_traversal(self):

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp