Commit93d5302

committed

fix(unicode): use surrogateescape in bytes.decode

That way, we will try to decode as default encoding (usually utf-8), but allow ourselves to simply keep bytes that don't match within the resulting unicode string. That way, we allow for lossless decode/encode cycles while still assuring that decoding never fails. NOTE: I was too lazy to create a test that would verify it, but manually executed https://github.com/petertodd/gitpython-unicode-error. Fixes #532.

1 parentff389af commit93d5302Copy full SHA for 93d5302

File tree

3 files changed

+193

-7

lines changed

git
- compat.py
- objects
  - fun.py
- test/performance
  - test_commit.py

3 files changed

+193

-7

lines changed

`‎git/compat.py`

Lines changed: 191 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,8 @@`
`10`	`10`	`importlocale`
`11`	`11`	`importos`
`12`	`12`	`importsys`
	`13`	`+importcodecs`
	`14`	`+`
`13`	`15`
`14`	`16`	`fromgitdb.utils.compatimport (`
`15`	`17`	`xrange,`
`@@ -67,7 +69,7 @@ def safe_decode(s):`
`67`	`69`	`ifisinstance(s,unicode):`
`68`	`70`	`returns`
`69`	`71`	`elifisinstance(s,bytes):`
`70`		`-returns.decode(defenc,'replace')`
	`72`	`+returns.decode(defenc,'surrogateescape')`
`71`	`73`	`elifsisnotNone:`
`72`	`74`	`raiseTypeError('Expected bytes or text, but got %r'% (s,))`
`73`	`75`
`@@ -121,3 +123,191 @@ def __str__(self):`
`121`	`123`	`else:# Python 2`
`122`	`124`	`def__str__(self):`
`123`	`125`	`returnself.__unicode__().encode(defenc)`
	`126`	`+`
	`127`	`+`
	`128`	`+"""`
	`129`	`+This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error`
	`130`	`+handler of Python 3.`
	`131`	`+Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc`
	`132`	`+"""`
	`133`	`+`
	`134`	`+# This code is released under the Python license and the BSD 2-clause license`
	`135`	`+`
	`136`	`+`
	`137`	`+FS_ERRORS='surrogateescape'`
	`138`	`+`
	`139`	`+# # -- Python 2/3 compatibility -------------------------------------`
	`140`	`+# FS_ERRORS = 'my_surrogateescape'`
	`141`	`+`
	`142`	`+defu(text):`
	`143`	`+ifPY3:`
	`144`	`+returntext`
	`145`	`+else:`
	`146`	`+returntext.decode('unicode_escape')`
	`147`	`+`
	`148`	`+defb(data):`
	`149`	`+ifPY3:`
	`150`	`+returndata.encode('latin1')`
	`151`	`+else:`
	`152`	`+returndata`
	`153`	`+`
	`154`	`+ifPY3:`
	`155`	`+_unichr=chr`
	`156`	`+bytes_chr=lambdacode:bytes((code,))`
	`157`	`+else:`
	`158`	`+_unichr=unichr`
	`159`	`+bytes_chr=chr`
	`160`	`+`
	`161`	`+defsurrogateescape_handler(exc):`
	`162`	`+"""`
	`163`	`+ Pure Python implementation of the PEP 383: the "surrogateescape" error`
	`164`	`+ handler of Python 3. Undecodable bytes will be replaced by a Unicode`
	`165`	`+ character U+DCxx on decoding, and these are translated into the`
	`166`	`+ original bytes on encoding.`
	`167`	`+ """`
	`168`	`+mystring=exc.object[exc.start:exc.end]`
	`169`	`+`
	`170`	`+try:`
	`171`	`+ifisinstance(exc,UnicodeDecodeError):`
	`172`	`+# mystring is a byte-string in this case`
	`173`	`+decoded=replace_surrogate_decode(mystring)`
	`174`	`+elifisinstance(exc,UnicodeEncodeError):`
	`175`	`+# In the case of u'\udcc3'.encode('ascii',`
	`176`	`+# 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an`
	`177`	`+# exception anyway after this function is called, even though I think`
	`178`	`+# it's doing what it should. It seems that the strict encoder is called`
	`179`	`+# to encode the unicode string that this function returns ...`
	`180`	`+decoded=replace_surrogate_encode(mystring)`
	`181`	`+else:`
	`182`	`+raiseexc`
	`183`	`+exceptNotASurrogateError:`
	`184`	`+raiseexc`
	`185`	`+return (decoded,exc.end)`
	`186`	`+`
	`187`	`+`
	`188`	`+classNotASurrogateError(Exception):`
	`189`	`+pass`
	`190`	`+`
	`191`	`+`
	`192`	`+defreplace_surrogate_encode(mystring):`
	`193`	`+"""`
	`194`	`+ Returns a (unicode) string, not the more logical bytes, because the codecs`
	`195`	`+ register_error functionality expects this.`
	`196`	`+ """`
	`197`	`+decoded= []`
	`198`	`+forchinmystring:`
	`199`	`+# if PY3:`
	`200`	`+# code = ch`
	`201`	`+# else:`
	`202`	`+code=ord(ch)`
	`203`	`+`
	`204`	`+# The following magic comes from Py3.3's Python/codecs.c file:`
	`205`	`+ifnot0xD800<=code<=0xDCFF:`
	`206`	`+# Not a surrogate. Fail with the original exception.`
	`207`	`+raiseexc`
	`208`	`+# mybytes = [0xe0 \| (code >> 12),`
	`209`	`+# 0x80 \| ((code >> 6) & 0x3f),`
	`210`	`+# 0x80 \| (code & 0x3f)]`
	`211`	`+# Is this a good idea?`
	`212`	`+if0xDC00<=code<=0xDC7F:`
	`213`	`+decoded.append(_unichr(code-0xDC00))`
	`214`	`+elifcode<=0xDCFF:`
	`215`	`+decoded.append(_unichr(code-0xDC00))`
	`216`	`+else:`
	`217`	`+raiseNotASurrogateError`
	`218`	`+returnstr().join(decoded)`
	`219`	`+`
	`220`	`+`
	`221`	`+defreplace_surrogate_decode(mybytes):`
	`222`	`+"""`
	`223`	`+ Returns a (unicode) string`
	`224`	`+ """`
	`225`	`+decoded= []`
	`226`	`+forchinmybytes:`
	`227`	`+# We may be parsing newbytes (in which case ch is an int) or a native`
	`228`	`+# str on Py2`
	`229`	`+ifisinstance(ch,int):`
	`230`	`+code=ch`
	`231`	`+else:`
	`232`	`+code=ord(ch)`
	`233`	`+if0x80<=code<=0xFF:`
	`234`	`+decoded.append(_unichr(0xDC00+code))`
	`235`	`+elifcode<=0x7F:`
	`236`	`+decoded.append(_unichr(code))`
	`237`	`+else:`
	`238`	`+# # It may be a bad byte`
	`239`	`+# # Try swallowing it.`
	`240`	`+# continue`
	`241`	`+# print("RAISE!")`
	`242`	`+raiseNotASurrogateError`
	`243`	`+returnstr().join(decoded)`
	`244`	`+`
	`245`	`+`
	`246`	`+defencodefilename(fn):`
	`247`	`+ifFS_ENCODING=='ascii':`
	`248`	`+# ASCII encoder of Python 2 expects that the error handler returns a`
	`249`	`+# Unicode string encodable to ASCII, whereas our surrogateescape error`
	`250`	`+# handler has to return bytes in 0x80-0xFF range.`
	`251`	`+encoded= []`
	`252`	`+forindex,chinenumerate(fn):`
	`253`	`+code=ord(ch)`
	`254`	`+ifcode<128:`
	`255`	`+ch=bytes_chr(code)`
	`256`	`+elif0xDC80<=code<=0xDCFF:`
	`257`	`+ch=bytes_chr(code-0xDC00)`
	`258`	`+else:`
	`259`	`+raiseUnicodeEncodeError(FS_ENCODING,`
	`260`	`+fn,index,index+1,`
	`261`	`+'ordinal not in range(128)')`
	`262`	`+encoded.append(ch)`
	`263`	`+returnbytes().join(encoded)`
	`264`	`+elifFS_ENCODING=='utf-8':`
	`265`	`+# UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF`
	`266`	`+# doesn't go through our error handler`
	`267`	`+encoded= []`
	`268`	`+forindex,chinenumerate(fn):`
	`269`	`+code=ord(ch)`
	`270`	`+if0xD800<=code<=0xDFFF:`
	`271`	`+if0xDC80<=code<=0xDCFF:`
	`272`	`+ch=bytes_chr(code-0xDC00)`
	`273`	`+encoded.append(ch)`
	`274`	`+else:`
	`275`	`+raiseUnicodeEncodeError(`
	`276`	`+FS_ENCODING,`
	`277`	`+fn,index,index+1,'surrogates not allowed')`
	`278`	`+else:`
	`279`	`+ch_utf8=ch.encode('utf-8')`
	`280`	`+encoded.append(ch_utf8)`
	`281`	`+returnbytes().join(encoded)`
	`282`	`+else:`
	`283`	`+returnfn.encode(FS_ENCODING,FS_ERRORS)`
	`284`	`+`
	`285`	`+defdecodefilename(fn):`
	`286`	`+returnfn.decode(FS_ENCODING,FS_ERRORS)`
	`287`	`+`
	`288`	`+FS_ENCODING='ascii';fn=b('[abc\xff]');encoded=u('[abc\udcff]')`
	`289`	`+# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')`
	`290`	`+# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')`
	`291`	`+`
	`292`	`+`
	`293`	`+# normalize the filesystem encoding name.`
	`294`	`+# For example, we expect "utf-8", not "UTF8".`
	`295`	`+FS_ENCODING=codecs.lookup(FS_ENCODING).name`
	`296`	`+`
	`297`	`+`
	`298`	`+defregister_surrogateescape():`
	`299`	`+"""`
	`300`	`+ Registers the surrogateescape error handler on Python 2 (only)`
	`301`	`+ """`
	`302`	`+ifPY3:`
	`303`	`+return`
	`304`	`+try:`
	`305`	`+codecs.lookup_error(FS_ERRORS)`
	`306`	`+exceptLookupError:`
	`307`	`+codecs.register_error(FS_ERRORS,surrogateescape_handler)`
	`308`	`+`
	`309`	`+`
	`310`	`+try:`
	`311`	`+"hello".decode(defenc,"surrogateescape")`
	`312`	`+except:`
	`313`	`+register_surrogateescape()`

`‎git/objects/fun.py`

Lines changed: 1 addition & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -76,11 +76,7 @@ def tree_entries_from_data(data):`
`76`	`76`	`# default encoding for strings in git is utf8`
`77`	`77`	`# Only use the respective unicode object if the byte stream was encoded`
`78`	`78`	`name=data[ns:i]`
`79`		`-try:`
`80`		`-name=name.decode(defenc)`
`81`		`-exceptUnicodeDecodeError:`
`82`		`-pass`
`83`		`-# END handle encoding`
	`79`	`+name=name.decode(defenc,'surrogateescape')`
`84`	`80`
`85`	`81`	`# byte is NULL, get next 20`
`86`	`82`	`i+=1`

`‎git/test/performance/test_commit.py`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,7 @@ def test_iteration(self):`
`52`	`52`	`# END for each object`
`53`	`53`	`# END for each commit`
`54`	`54`	`elapsed_time=time()-st`
`55`		`-print("Traversed %i Trees and a total of %iunchached objects in %s [s] ( %f objs/s )"`
	`55`	`+print("Traversed %i Trees and a total of %iuncached objects in %s [s] ( %f objs/s )"`
`56`	`56`	`% (nc,no,elapsed_time,no/elapsed_time),file=sys.stderr)`
`57`	`57`
`58`	`58`	`deftest_commit_traversal(self):`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit93d5302

File tree

3 files changed

3 files changed

`‎git/compat.py`

`‎git/objects/fun.py`

`‎git/test/performance/test_commit.py`

0 commit comments