Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit ec731f4

Browse files
committed
Merge with #532, fix unicode filenames with surrogateescape
2 parents b2efa1b + 9e4a454, commit ec731f4

File tree

7 files changed

+209
-18
lines changed

7 files changed

+209
-18
lines changed

‎VERSION

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
2.0.9dev0
1+
2.0.10dev0

‎git/compat.py

Lines changed: 191 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,8 @@
1010
importlocale
1111
importos
1212
importsys
13+
importcodecs
14+
1315

1416
fromgitdb.utils.compatimport (
1517
xrange,
@@ -67,7 +69,7 @@ def safe_decode(s):
6769
ifisinstance(s,unicode):
6870
returns
6971
elifisinstance(s,bytes):
70-
returns.decode(defenc,'replace')
72+
returns.decode(defenc,'surrogateescape')
7173
elifsisnotNone:
7274
raiseTypeError('Expected bytes or text, but got %r'% (s,))
7375

@@ -121,3 +123,191 @@ def __str__(self):
121123
else:# Python 2
122124
def__str__(self):
123125
returnself.__unicode__().encode(defenc)
126+
127+
128+
"""
129+
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
130+
handler of Python 3.
131+
Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
132+
"""
133+
134+
# This code is released under the Python license and the BSD 2-clause license
135+
136+
137+
FS_ERRORS='surrogateescape'
138+
139+
# # -- Python 2/3 compatibility -------------------------------------
140+
# FS_ERRORS = 'my_surrogateescape'
141+
142+
defu(text):
143+
ifPY3:
144+
returntext
145+
else:
146+
returntext.decode('unicode_escape')
147+
148+
defb(data):
149+
ifPY3:
150+
returndata.encode('latin1')
151+
else:
152+
returndata
153+
154+
ifPY3:
155+
_unichr=chr
156+
bytes_chr=lambdacode:bytes((code,))
157+
else:
158+
_unichr=unichr
159+
bytes_chr=chr
160+
161+
defsurrogateescape_handler(exc):
162+
"""
163+
Pure Python implementation of the PEP 383: the "surrogateescape" error
164+
handler of Python 3. Undecodable bytes will be replaced by a Unicode
165+
character U+DCxx on decoding, and these are translated into the
166+
original bytes on encoding.
167+
"""
168+
mystring=exc.object[exc.start:exc.end]
169+
170+
try:
171+
ifisinstance(exc,UnicodeDecodeError):
172+
# mystring is a byte-string in this case
173+
decoded=replace_surrogate_decode(mystring)
174+
elifisinstance(exc,UnicodeEncodeError):
175+
# In the case of u'\udcc3'.encode('ascii',
176+
# 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
177+
# exception anyway after this function is called, even though I think
178+
# it's doing what it should. It seems that the strict encoder is called
179+
# to encode the unicode string that this function returns ...
180+
decoded=replace_surrogate_encode(mystring)
181+
else:
182+
raiseexc
183+
exceptNotASurrogateError:
184+
raiseexc
185+
return (decoded,exc.end)
186+
187+
188+
classNotASurrogateError(Exception):
189+
pass
190+
191+
192+
defreplace_surrogate_encode(mystring):
193+
"""
194+
Returns a (unicode) string, not the more logical bytes, because the codecs
195+
register_error functionality expects this.
196+
"""
197+
decoded= []
198+
forchinmystring:
199+
# if PY3:
200+
# code = ch
201+
# else:
202+
code=ord(ch)
203+
204+
# The following magic comes from Py3.3's Python/codecs.c file:
205+
ifnot0xD800<=code<=0xDCFF:
206+
# Not a surrogate. Fail with the original exception.
207+
raiseexc
208+
# mybytes = [0xe0 | (code >> 12),
209+
# 0x80 | ((code >> 6) & 0x3f),
210+
# 0x80 | (code & 0x3f)]
211+
# Is this a good idea?
212+
if0xDC00<=code<=0xDC7F:
213+
decoded.append(_unichr(code-0xDC00))
214+
elifcode<=0xDCFF:
215+
decoded.append(_unichr(code-0xDC00))
216+
else:
217+
raiseNotASurrogateError
218+
returnstr().join(decoded)
219+
220+
221+
defreplace_surrogate_decode(mybytes):
222+
"""
223+
Returns a (unicode) string
224+
"""
225+
decoded= []
226+
forchinmybytes:
227+
# We may be parsing newbytes (in which case ch is an int) or a native
228+
# str on Py2
229+
ifisinstance(ch,int):
230+
code=ch
231+
else:
232+
code=ord(ch)
233+
if0x80<=code<=0xFF:
234+
decoded.append(_unichr(0xDC00+code))
235+
elifcode<=0x7F:
236+
decoded.append(_unichr(code))
237+
else:
238+
# # It may be a bad byte
239+
# # Try swallowing it.
240+
# continue
241+
# print("RAISE!")
242+
raiseNotASurrogateError
243+
returnstr().join(decoded)
244+
245+
246+
defencodefilename(fn):
247+
ifFS_ENCODING=='ascii':
248+
# ASCII encoder of Python 2 expects that the error handler returns a
249+
# Unicode string encodable to ASCII, whereas our surrogateescape error
250+
# handler has to return bytes in 0x80-0xFF range.
251+
encoded= []
252+
forindex,chinenumerate(fn):
253+
code=ord(ch)
254+
ifcode<128:
255+
ch=bytes_chr(code)
256+
elif0xDC80<=code<=0xDCFF:
257+
ch=bytes_chr(code-0xDC00)
258+
else:
259+
raiseUnicodeEncodeError(FS_ENCODING,
260+
fn,index,index+1,
261+
'ordinal not in range(128)')
262+
encoded.append(ch)
263+
returnbytes().join(encoded)
264+
elifFS_ENCODING=='utf-8':
265+
# UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
266+
# doesn't go through our error handler
267+
encoded= []
268+
forindex,chinenumerate(fn):
269+
code=ord(ch)
270+
if0xD800<=code<=0xDFFF:
271+
if0xDC80<=code<=0xDCFF:
272+
ch=bytes_chr(code-0xDC00)
273+
encoded.append(ch)
274+
else:
275+
raiseUnicodeEncodeError(
276+
FS_ENCODING,
277+
fn,index,index+1,'surrogates not allowed')
278+
else:
279+
ch_utf8=ch.encode('utf-8')
280+
encoded.append(ch_utf8)
281+
returnbytes().join(encoded)
282+
else:
283+
returnfn.encode(FS_ENCODING,FS_ERRORS)
284+
285+
defdecodefilename(fn):
286+
returnfn.decode(FS_ENCODING,FS_ERRORS)
287+
288+
FS_ENCODING='ascii';fn=b('[abc\xff]');encoded=u('[abc\udcff]')
289+
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
290+
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
291+
292+
293+
# normalize the filesystem encoding name.
294+
# For example, we expect "utf-8", not "UTF8".
295+
FS_ENCODING=codecs.lookup(FS_ENCODING).name
296+
297+
298+
defregister_surrogateescape():
299+
"""
300+
Registers the surrogateescape error handler on Python 2 (only)
301+
"""
302+
ifPY3:
303+
return
304+
try:
305+
codecs.lookup_error(FS_ERRORS)
306+
exceptLookupError:
307+
codecs.register_error(FS_ERRORS,surrogateescape_handler)
308+
309+
310+
try:
311+
b"100644\x9f\0aaa".decode(defenc,"surrogateescape")
312+
except:
313+
register_surrogateescape()

‎git/objects/fun.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
fromstatimportS_ISDIR
33
fromgit.compatimport (
44
byte_ord,
5+
safe_decode,
56
defenc,
67
xrange,
78
text_type,
@@ -76,11 +77,7 @@ def tree_entries_from_data(data):
7677
# default encoding for strings in git is utf8
7778
# Only use the respective unicode object if the byte stream was encoded
7879
name=data[ns:i]
79-
try:
80-
name=name.decode(defenc)
81-
exceptUnicodeDecodeError:
82-
pass
83-
# END handle encoding
80+
name=safe_decode(name)
8481

8582
# byte is NULL, get next 20
8683
i+=1

‎git/test/performance/test_commit.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ def test_iteration(self):
5252
# END for each object
5353
# END for each commit
5454
elapsed_time=time()-st
55-
print("Traversed %i Trees and a total of %iunchached objects in %s [s] ( %f objs/s )"
55+
print("Traversed %i Trees and a total of %iuncached objects in %s [s] ( %f objs/s )"
5656
% (nc,no,elapsed_time,no/elapsed_time),file=sys.stderr)
5757

5858
deftest_commit_traversal(self):

‎git/test/test_fun.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,8 @@
11
fromioimportBytesIO
2-
fromstatimport (
3-
S_IFDIR,
4-
S_IFREG,
5-
S_IFLNK
6-
)
2+
fromstatimportS_IFDIR,S_IFREG,S_IFLNK
3+
fromunittest.caseimportskipIf
74

5+
fromgit.compatimportPY3
86
fromgit.indeximportIndexFile
97
fromgit.index.funimport (
108
aggressive_tree_merge
@@ -253,6 +251,12 @@ def test_tree_traversal_single(self):
253251
assertentries
254252
# END for each commit
255253

256-
deftest_tree_entries_from_data_with_failing_name_decode(self):
254+
@skipIf(PY3,'odd types returned ... maybe figure it out one day')
255+
deftest_tree_entries_from_data_with_failing_name_decode_py2(self):
256+
r=tree_entries_from_data(b'100644\x9f\0aaa')
257+
assertr== [('aaa',33188,u'\udc9f')],r
258+
259+
@skipIf(notPY3,'odd types returned ... maybe figure it out one day')
260+
deftest_tree_entries_from_data_with_failing_name_decode_py3(self):
257261
r=tree_entries_from_data(b'100644\x9f\0aaa')
258-
assertr== [(b'aaa',33188,b'\x9f')],r
262+
assertr== [(b'aaa',33188,'\udc9f')],r

‎setup.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ def _stamp_version(filename):
6464
else:
6565
print("WARNING: Couldn't find version line in file %s"%filename,file=sys.stderr)
6666

67-
install_requires= ['gitdb >=0.6.4']
67+
install_requires= ['gitdb2 >=2.0.0']
6868
extras_require= {
6969
':python_version == "2.6"': ['ordereddict'],
7070
}
@@ -100,7 +100,7 @@ def _stamp_version(filename):
100100
package_data={'git.test': ['fixtures/*']},
101101
package_dir={'git':'git'},
102102
license="BSD License",
103-
requires=['gitdb (>=0.6.4)'],
103+
requires=['gitdb2 (>=2.0.0)'],
104104
install_requires=install_requires,
105105
test_requirements=test_requires+install_requires,
106106
zip_safe=False,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp