|
10 | 10 | importlocale
|
11 | 11 | importos
|
12 | 12 | importsys
|
| 13 | +importcodecs |
| 14 | + |
13 | 15 |
|
14 | 16 | fromgitdb.utils.compatimport (
|
15 | 17 | xrange,
|
@@ -67,7 +69,7 @@ def safe_decode(s):
|
67 | 69 | ifisinstance(s,unicode):
|
68 | 70 | returns
|
69 | 71 | elifisinstance(s,bytes):
|
70 |
| -returns.decode(defenc,'replace') |
| 72 | +returns.decode(defenc,'surrogateescape') |
71 | 73 | elifsisnotNone:
|
72 | 74 | raiseTypeError('Expected bytes or text, but got %r'% (s,))
|
73 | 75 |
|
@@ -121,3 +123,191 @@ def __str__(self):
|
121 | 123 | else:# Python 2
|
122 | 124 | def__str__(self):
|
123 | 125 | returnself.__unicode__().encode(defenc)
|
| 126 | + |
| 127 | + |
| 128 | +""" |
| 129 | +This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error |
| 130 | +handler of Python 3. |
| 131 | +Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc |
| 132 | +""" |
| 133 | + |
| 134 | +# This code is released under the Python license and the BSD 2-clause license |
| 135 | + |
| 136 | + |
# Name under which the error handler is looked up and registered with the
# codecs machinery, and which is passed as the ``errors`` argument when
# encoding or decoding file names.
FS_ERRORS = 'surrogateescape'

# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'
| 141 | + |
def u(text):
    """
    Return ``text`` as a text string.

    When PY3 is true the argument is returned unchanged; otherwise it is
    decoded with the 'unicode_escape' codec.
    """
    return text if PY3 else text.decode('unicode_escape')
| 147 | + |
def b(data):
    """
    Return ``data`` as a byte string.

    When PY3 is true the argument is encoded with the 'latin1' codec;
    otherwise it is returned unchanged.
    """
    return data.encode('latin1') if PY3 else data
| 153 | + |
# Select the per-interpreter helpers that turn an integer code into a
# one-character text string (_unichr) or a one-byte byte string (bytes_chr).
if not PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        return bytes((code,))
| 160 | + |
def surrogateescape_handler(exc):
    """
    Pure Python implementation of the PEP 383 "surrogateescape" error
    handler of Python 3.

    On decoding, undecodable bytes are replaced by U+DCxx characters; on
    encoding, those characters are translated back.

    ``exc`` is the UnicodeDecodeError or UnicodeEncodeError handed over by
    the codec machinery. Returns a ``(replacement, resume_position)`` tuple.
    Any other exception type, or a failing span that cannot be converted,
    results in ``exc`` itself being raised.
    """
    segment = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # segment is a byte-string in this case
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii',
        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
        # exception anyway after this function is called, even though I think
        # it's doing what it should. It seems that the strict encoder is called
        # to encode the unicode string that this function returns ...
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(segment)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
| 186 | + |
| 187 | + |
class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers for input they cannot convert."""
| 190 | + |
| 191 | + |
def replace_surrogate_encode(mystring):
    """
    Translate escaped characters (U+DC00-U+DCFF) back to the characters
    whose ordinals are the original values (code - 0xDC00).

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Raises NotASurrogateError as soon as a character outside U+DC00-U+DCFF
    is met, so that the caller can fail with the original exception.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Only U+DC00-U+DCFF can be mapped back. Anything else used to end
        # in a NameError (raising an undefined name) or, for U+D800-U+DBFF,
        # in a ValueError from a negative ordinal; signal it explicitly.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
| 219 | + |
| 220 | + |
def replace_surrogate_decode(mybytes):
    """
    Convert a run of bytes into a (unicode) string.

    Values 0x80-0xFF become the characters U+DC80-U+DCFF; values up to 0x7F
    become the character with the same ordinal. Any larger value raises
    NotASurrogateError.
    """
    chars = []
    for item in mybytes:
        # Iterating yields ints for a Python 3 style bytes object and
        # one-character strings for a native str on Py2.
        value = item if isinstance(item, int) else ord(item)
        if value > 0xFF:
            raise NotASurrogateError
        if value >= 0x80:
            chars.append(_unichr(0xDC00 + value))
        else:
            chars.append(_unichr(value))
    return str().join(chars)
| 244 | + |
| 245 | + |
def encodefilename(fn):
    """
    Encode the text string ``fn`` to bytes using FS_ENCODING.

    For 'ascii' and 'utf-8' the encoding is done by hand, character by
    character; every other encoding is delegated to ``fn.encode`` with the
    FS_ERRORS handler. Characters U+DC80-U+DCFF are turned back into the
    single bytes 0x80-0xFF. Raises UnicodeEncodeError for characters that
    cannot be represented.
    """
    if FS_ENCODING not in ('ascii', 'utf-8'):
        return fn.encode(FS_ENCODING, FS_ERRORS)

    # Why the two encodings are special-cased:
    # * ASCII encoder of Python 2 expects that the error handler returns a
    #   Unicode string encodable to ASCII, whereas our surrogateescape error
    #   handler has to return bytes in 0x80-0xFF range.
    # * UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
    #   doesn't go through our error handler.
    ascii_only = FS_ENCODING == 'ascii'
    pieces = []
    for pos, char in enumerate(fn):
        codepoint = ord(char)
        if 0xDC80 <= codepoint <= 0xDCFF:
            # Escaped byte: restore the original value.
            pieces.append(bytes_chr(codepoint - 0xDC00))
        elif ascii_only:
            if codepoint >= 128:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, pos, pos + 1,
                    'ordinal not in range(128)')
            pieces.append(bytes_chr(codepoint))
        elif 0xD800 <= codepoint <= 0xDFFF:
            raise UnicodeEncodeError(
                FS_ENCODING, fn, pos, pos + 1,
                'surrogates not allowed')
        else:
            pieces.append(char.encode('utf-8'))
    return bytes().join(pieces)
| 284 | + |
def decodefilename(fn):
    """Decode the byte string ``fn`` with FS_ENCODING and the FS_ERRORS handler."""
    encoding, errors = FS_ENCODING, FS_ERRORS
    return fn.decode(encoding, errors)
| 287 | + |
# Active configuration: ASCII encoding, plus a sample byte string holding one
# non-ASCII byte and the matching text string with that byte escaped.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
| 296 | + |
| 297 | + |
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only)

    Does nothing when PY3 is true, or when a handler is already registered
    under the FS_ERRORS name.
    """
    if not PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler under that name yet: install the pure-Python one.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
| 308 | + |
| 309 | + |
# Install the pure-Python handler when the interpreter has none of its own.
# Error handlers are looked up lazily, only once a codec actually runs into
# a problem, so the probe has to contain an undecodable byte: decoding plain
# ASCII text succeeds even when no handler is registered, and the fallback
# would then never be installed.
# NOTE(review): assumes defenc cannot decode the byte 0xFF (true for ASCII
# and UTF-8) - confirm against the definition of defenc.
try:
    b"\xff".decode(defenc, "surrogateescape")
except Exception:
    register_surrogateescape()