Commit 4b4a514

Added performance comparison to cgit ... and yes, git-python is faster :)

1 parent: 26e138c
File tree: 2 files changed, +87 −11 lines

lib/git/odb/utils.py
Lines changed: 27 additions & 4 deletions
@@ -103,10 +103,12 @@ class DecompressMemMapReader(object):
     times we actually allocate. An own zlib implementation would be good here
     to better support streamed reading - it would only need to keep the mmap
     and decompress it into chunks, that's all ... """
-    __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_cs', '_close')
+    __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close')

-    def __init__(self, m, close_on_deletion, cs=128*1024):
-        """Initialize with mmap and chunk_size for stream reading"""
+    max_read_size = 512*1024
+
+    def __init__(self, m, close_on_deletion):
+        """Initialize with mmap for stream reading"""
         self._m = m
         self._zip = zlib.decompressobj()
         self._buf = None                 # buffer of decompressed bytes
@@ -115,7 +117,6 @@ def __init__(self, m, close_on_deletion, cs = 128*1024):
         self._br = 0                     # num uncompressed bytes read
         self._cws = 0                    # start byte of compression window
         self._cwe = 0                    # end byte of compression window
-        self._cs = cs                    # chunk size (when reading from zip)
         self._close = close_on_deletion  # close the memmap on deletion ?

     def __del__(self):
@@ -163,6 +164,28 @@ def read(self, size=-1):
             return str()
         # END handle depletion

+        # Protect from memory peaks:
+        # if the caller tries to read large chunks, our memory patterns get really bad,
+        # as we end up copying a possibly huge chunk from our memory map right into
+        # memory. This might not even be possible. Nonetheless, try to dampen the
+        # effect a bit by reading in chunks, returning a huge string in the end.
+        # Our performance now depends on StringIO. This way we don't need two large
+        # buffers at peak times, but only one large one in the end, which is
+        # the return buffer.
+        if size > self.max_read_size:
+            sio = StringIO()
+            while size:
+                read_size = min(self.max_read_size, size)
+                data = self.read(read_size)
+                sio.write(data)
+                size -= len(data)
+                if len(data) < read_size:
+                    break
+            # END data loop
+            sio.seek(0)
+            return sio.getvalue()
+        # END handle maxread
+
         # deplete the buffer, then just continue using the decompress object
         # which has an own buffer. We just need this to transparently parse the
         # header from the zlib stream
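The new branch in read() caps how much is pulled from the decompressor per call: any request above max_read_size (512 KiB) is served in bounded slices and stitched together through a StringIO, so only the final return string ever exists as one large buffer. Below is a minimal standalone sketch of that chunked-read pattern; it is illustrative only, written for Python 3 with io.BytesIO, and the helper name capped_read is not part of the commit.

    import io

    MAX_READ_SIZE = 512 * 1024  # mirrors DecompressMemMapReader.max_read_size

    def capped_read(reader, size, max_read_size=MAX_READ_SIZE):
        """Read `size` bytes from `reader`, never requesting more than
        `max_read_size` bytes per call; the slices are collected in a single
        BytesIO so only one large buffer (the result) exists at the end."""
        if size <= max_read_size:
            return reader.read(size)
        buf = io.BytesIO()
        while size > 0:
            chunk = reader.read(min(max_read_size, size))
            if not chunk:  # source depleted before `size` bytes were available
                break
            buf.write(chunk)
            size -= len(chunk)
        return buf.getvalue()

    # Example: read 10 MiB from a file-like source in 512 KiB slices.
    # data = capped_read(open('large.blob', 'rb'), 10 * 1024 * 1024)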

test/git/performance/test_streams.py
Lines changed: 60 additions & 7 deletions
@@ -10,6 +10,7 @@
 import sys
 import stat
 import random
+import subprocess


 from lib import (
@@ -51,23 +52,24 @@ def test_large_data_streaming(self, rwrepo):
             # writing - due to the compression it will seem faster than it is
             st = time()
             sha = ldb.to_object('blob', size, stream)
-            elapsed = time() - st
+            elapsed_add = time() - st
             assert ldb.has_object(sha)
-            fsize_kib = os.path.getsize(ldb.readable_db_object_path(sha)) / 1000
+            db_file = ldb.readable_db_object_path(sha)
+            fsize_kib = os.path.getsize(db_file) / 1000


             size_kib = size / 1000
-            print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed, size_kib / elapsed)
+            print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed_add, size_kib / elapsed_add)

             # reading all at once
             st = time()
             type, size, shastream = ldb.object(sha)
             shadata = shastream.read()
-            elapsed = time() - st
+            elapsed_readall = time() - st

             stream.seek(0)
             assert shadata == stream.getvalue()
-            print >> sys.stderr, "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed, size_kib / elapsed)
+            print >> sys.stderr, "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed_readall, size_kib / elapsed_readall)


             # reading in chunks of 1 MiB
@@ -81,11 +83,62 @@ def test_large_data_streaming(self, rwrepo):
                 if len(data) < cs:
                     break
             # END read in chunks
-            elapsed = time() - st
+            elapsed_readchunks = time() - st

             stream.seek(0)
             assert ''.join(chunks) == stream.getvalue()

             cs_kib = cs / 1000
-            print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed, size_kib / elapsed)
+            print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / elapsed_readchunks)
+
+            # delete the db file so git has something to do
+            os.remove(db_file)
+
+            # VS. CGIT
+            ##########
+            # CGIT! Can using the cgit programs be faster?
+            proc = rwrepo.git.hash_object('-w', '--stdin', as_process=True, istream=subprocess.PIPE)
+
+            # write file - pump everything in at once to be as fast as possible
+            data = stream.getvalue()    # cache it
+            st = time()
+            proc.stdin.write(data)
+            proc.stdin.close()
+            gitsha = proc.stdout.read().strip()
+            proc.wait()
+            gelapsed_add = time() - st
+            del(data)
+            assert gitsha == sha    # we do it the same way, right ?
+
+            # as it's the same sha, we reuse our path
+            fsize_kib = os.path.getsize(db_file) / 1000
+            print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data using git-hash-object in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, gelapsed_add, size_kib / gelapsed_add)

+            # compare ...
+            print >> sys.stderr, "Git-Python is %f %% faster than git when adding big %s files" % (100.0 - (elapsed_add / gelapsed_add) * 100, desc)
+
+
+            # read all
+            st = time()
+            s, t, size, data = rwrepo.git.get_object_data(gitsha)
+            gelapsed_readall = time() - st
+            print >> sys.stderr, "Read %i KiB of %s data at once using git-cat-file in %f s ( %f Read KiB / s)" % (size_kib, desc, gelapsed_readall, size_kib / gelapsed_readall)
+
+            # compare
+            print >> sys.stderr, "Git-Python is %f %% faster than git when reading big %s files" % (100.0 - (elapsed_readall / gelapsed_readall) * 100, desc)
+
+
+            # read chunks
+            st = time()
+            s, t, size, stream = rwrepo.git.stream_object_data(gitsha)
+            while True:
+                data = stream.read(cs)
+                if len(data) < cs:
+                    break
+            # END read stream
+            gelapsed_readchunks = time() - st
+            print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from git-cat-file in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, gelapsed_readchunks, size_kib / gelapsed_readchunks)
+
+            # compare
+            print >> sys.stderr, "Git-Python is %f %% faster than git when reading big %s files in chunks" % (100.0 - (elapsed_readchunks / gelapsed_readchunks) * 100, desc)
         # END for each randomization factor
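The added benchmark block mirrors each GitPython measurement with a cgit counterpart: the same blob is pumped into git hash-object -w --stdin over a pipe, read back through git cat-file (via get_object_data / stream_object_data), and each pair of timings is reported as a percentage difference. A rough standalone sketch of the write-side measurement and of the percentage formula used in the prints follows; it is Python 3 using plain subprocess instead of the rwrepo.git wrapper, and the helper names are hypothetical.

    import subprocess
    import time

    def time_git_hash_object(data, repo_dir):
        # Pipe `data` (bytes) through `git hash-object -w --stdin` inside
        # `repo_dir`; return the object sha and the elapsed wall-clock time.
        st = time.time()
        proc = subprocess.Popen(['git', 'hash-object', '-w', '--stdin'],
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                cwd=repo_dir)
        out, _ = proc.communicate(data)  # write everything at once, read the sha back
        return out.strip(), time.time() - st

    def report(label, size_kib, elapsed_py, elapsed_git):
        # Same formula as the test's stderr prints: how much faster (in %)
        # the pure-Python path was relative to the git child process.
        pct = 100.0 - (elapsed_py / elapsed_git) * 100
        print("%s: %i KiB, python %.3f s vs git %.3f s -> %.1f %% faster"
              % (label, size_kib, elapsed_py, elapsed_git, pct))

Feeding the whole payload to the child process in one go keeps the pipe saturated, which matches the test's "pump everything in at once" approach for the write measurement.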

0 commit comments

