Commit ac62760

index.add: now uses gitdb.store functionality instead of git-hash-object. The Python version is about as fast, but could support multithreading using async.

1 parent f164627 · commit ac62760

5 files changed: +678 -647 lines

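For orientation, here is a minimal sketch (not part of the commit) of the storage path that index.add now uses internally: instead of piping paths to git-hash-object, the file content is read in Python and handed to the repository's object database as an IStream. The repository location and file path below are hypothetical.

    import os
    import stat
    from cStringIO import StringIO

    from git import Repo, Blob
    from gitdb.base import IStream

    repo = Repo("/path/to/repo")           # hypothetical repository
    filepath = "some/file.txt"             # hypothetical path inside the working tree

    st = os.lstat(filepath)                # lstat so a symlink's target becomes the blob content
    if stat.S_ISLNK(st.st_mode):
        stream = StringIO(os.readlink(filepath))
    else:
        stream = open(filepath, 'rb')

    istream = repo.odb.store(IStream(Blob.type, st.st_size, stream))
    print(istream.sha)                     # sha that ends up in the BaseIndexEntry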

‎lib/git/index/base.py

Lines changed: 46 additions & 44 deletions

@@ -12,6 +12,7 @@
 import stat
 import subprocess
 import glob
+from cStringIO import StringIO
 
 from typ import *
 from util import (
@@ -48,6 +49,10 @@
 	)
 
 
+from gitdb.base import (
+						IStream
+					)
+
 __all__ = ('IndexFile', 'CheckoutError' )
 
 
@@ -255,9 +260,6 @@ def write(self, file_path = None, ignore_tree_extension_data=False):
 
 		Returns
 			self
-
-		Note
-			Index writing based on the dulwich implementation
 		"""
 		lfd = LockedFD(file_path or self._file_path)
 		stream = lfd.open(write=True, stream=True)
@@ -634,12 +636,10 @@ def _preprocess_add_items(self, items):
 		# END for each item
 		return (paths, entries)
 
-
 	@clear_cache
 	@default_index
 	def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=None):
-		"""
-		Add files from the working tree, specific blobs or BaseIndexEntries
+		"""Add files from the working tree, specific blobs or BaseIndexEntries
 		to the index. The underlying index file will be written immediately, hence
 		you should provide as many items as possible to minimize the amounts of writes
 
@@ -695,7 +695,7 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=None):
 
 		:param fprogress:
 			Function with signature f(path, done=False, item=item) called for each
-			path to be added, once once it is about to be added where done==False
+			path to be added, one time once it is about to be added where done==False
 			and once after it was added where done=True.
 			item is set to the actual item we handle, either a Path or a BaseIndexEntry
 			Please note that the processed path is not guaranteed to be present
@@ -713,8 +713,8 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=None):
 		:return:
 			List(BaseIndexEntries) representing the entries just actually added.
 
-		Raises
-			GitCommandError if a supplied Path did not exist. Please note that BaseIndexEntry
+		:raise OSError:
+			if a supplied Path did not exist. Please note that BaseIndexEntry
 			Objects that do not have a null sha will be added even if their paths
 			do not exist.
 		"""
@@ -734,28 +734,45 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=None):
 			del(paths[:])
 		# END rewrite paths
 
+
+		def store_path(filepath):
+			"""Store file at filepath in the database and return the base index entry"""
+			st = os.lstat(filepath)		# handles non-symlinks as well
+			stream = None
+			if stat.S_ISLNK(st.st_mode):
+				stream = StringIO(os.readlink(filepath))
+			else:
+				stream = open(filepath, 'rb')
+			# END handle stream
+			fprogress(filepath, False, filepath)
+			istream = self.repo.odb.store(IStream(Blob.type, st.st_size, stream))
+			fprogress(filepath, True, filepath)
+
+			return BaseIndexEntry((st.st_mode, istream.sha, 0, filepath))
+		# END utility method
+
+
 		# HANDLE PATHS
 		if paths:
-			# to get suitable progress information, pipe paths to stdin
-			args = ("--add", "--replace", "--verbose", "--stdin")
-			proc = self.repo.git.update_index(*args, **{'as_process':True, 'istream':subprocess.PIPE})
-			make_exc = lambda : GitCommandError(("git-update-index",)+args, 128, proc.stderr.read())
+			assert len(entries_added) == 0
 			added_files = list()
-
 			for filepath in self._iter_expand_paths(paths):
-				self._write_path_to_stdin(proc, filepath, filepath, make_exc,
-											fprogress, read_from_stdout=False)
-				added_files.append(filepath)
+				entries_added.append(store_path(filepath))
 			# END for each filepath
-			self._flush_stdin_and_wait(proc, ignore_stdout=True)	# ignore stdout
+
+			# add the new entries to this instance, and write it
+			for entry in entries_added:
+				self.entries[(entry.path, 0)] = IndexEntry.from_base(entry)
 
-			# force rereading our entries once it is all done
-			self._delete_entries_cache()
-			entries_added.extend(self.entries[(f,0)] for f in added_files)
+			# finally write the changed index
+			self.write()
 		# END path handling
 
+
 		# HANDLE ENTRIES
 		if entries:
+			# TODO: Add proper IndexEntries to ourselves, and write the index
+			# just once. Currently its done twice at least
 			null_mode_entries = [ e for e in entries if e.mode == 0 ]
 			if null_mode_entries:
 				raise ValueError("At least one Entry has a null-mode - please use index.remove to remove files for clarity")
@@ -765,37 +782,22 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=None):
 			# create objects if required, otherwise go with the existing shas
 			null_entries_indices = [ i for i,e in enumerate(entries) if e.sha == Object.NULL_HEX_SHA ]
 			if null_entries_indices:
-				# creating object ids is the time consuming part. Hence we will
-				# send progress for these now.
-				args = ("-w", "--stdin-paths")
-				proc = self.repo.git.hash_object(*args, **{'istream':subprocess.PIPE, 'as_process':True})
-				make_exc = lambda : GitCommandError(("git-hash-object",)+args, 128, proc.stderr.read())
-				obj_ids = list()
 				for ei in null_entries_indices:
-					entry = entries[ei]
-					obj_ids.append(self._write_path_to_stdin(proc, entry.path, entry,
-												make_exc, fprogress, read_from_stdout=True))
+					null_entry = entries[ei]
+					new_entry = store_path(null_entry.path)
+
+					# update null entry
+					entries[ei] = BaseIndexEntry((null_entry.mode, new_entry.sha, null_entry.stage, null_entry.path))
 				# END for each entry index
-				assert len(obj_ids) == len(null_entries_indices), "git-hash-object did not produce all requested objects: want %i, got %i" % ( len(null_entries_indices), len(obj_ids) )
-
-				# update IndexEntries with new object id
-				for i, new_sha in zip(null_entries_indices, obj_ids):
-					e = entries[i]
-
-					new_entry = BaseIndexEntry((e.mode, new_sha, e.stage, e.path))
-					entries[i] = new_entry
-				# END for each index
 			# END null_entry handling
 
 			# REWRITE PATHS
 			# If we have to rewrite the entries, do so now, after we have generated
 			# all object sha's
 			if path_rewriter:
-				new_entries = list()
-				for e in entries:
-					new_entries.append(BaseIndexEntry((e.mode, e.sha, e.stage, path_rewriter(e))))
+				for i,e in enumerate(entries):
+					entries[i] = BaseIndexEntry((e.mode, e.sha, e.stage, path_rewriter(e)))
 				# END for each entry
-				entries = new_entries
 			# END handle path rewriting
 
 			# feed pure entries to stdin
@@ -821,7 +823,7 @@ def add(self, items, force=True, fprogress=lambda *args: None, path_rewriter=None):
 			self._flush_stdin_and_wait(proc, ignore_stdout=True)
 			entries_added.extend(entries)
 		# END if there are base entries
-		
+
 		return entries_added
 
 	def _items_to_rela_paths(self, items):
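As a usage sketch (paths are hypothetical, not from this commit): add() still accepts working-tree paths, calls fprogress twice per item, and returns the BaseIndexEntry objects it actually added, but the blobs are now written through repo.odb rather than git-update-index / git-hash-object.

    from git import Repo

    repo = Repo("/path/to/repo")                 # hypothetical repository

    def fprogress(path, done, item):
        # called once before (done=False) and once after (done=True) each item
        print("%s %s" % ("stored" if done else "storing", path))

    entries = repo.index.add(['README', 'doc/*.txt'], fprogress=fprogress)
    print("%i entries added" % len(entries))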

‎lib/git/repo.py

Lines changed: 11 additions & 9 deletions

@@ -723,16 +723,17 @@ def init(cls, path=None, mkdir=True, **kwargs):
 		return Repo(path)
 
 	def clone(self, path, **kwargs):
-		"""
-		Create a clone from this repository.
-
-		``path``
+		"""Create a clone from this repository.
+		:param path:
 			is the full path of the new repo (traditionally ends with ./<name>.git).
 
-		``kwargs``
-			keyword arguments to be given to the git-clone command
-
-		Returns
+		:param kwargs:
+			odbt = ObjectDatabase Type, allowing to determine the object database
+			implementation used by the returned Repo instance
+
+			All remaining keyword arguments are given to the git-clone command
+
+		:return:
 			``git.Repo`` (the newly cloned repo)
 		"""
 		# special handling for windows for path at which the clone should be
@@ -741,6 +742,7 @@ def clone(self, path, **kwargs):
 		# we at least give a proper error instead of letting git fail
 		prev_cwd = None
 		prev_path = None
+		odbt = kwargs.pop('odbt', GitCmdObjectDB)
 		if os.name == 'nt':
 			if '~' in path:
 				raise OSError("Git cannot handle the ~ character in path %r correctly" % path)
@@ -767,7 +769,7 @@ def clone(self, path, **kwargs):
 			path = prev_path
 		# END reset previous working dir
 		# END bad windows handling
-		return Repo(path)
+		return Repo(path, odbt=odbt)
 
 
 	def archive(self, ostream, treeish=None, prefix=None, **kwargs):
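A short sketch of the new odbt keyword (paths are hypothetical): it selects the object database implementation of the returned Repo, while every other keyword argument still goes to git-clone.

    from git import Repo
    from git.db import GitCmdObjectDB, GitDB

    repo = Repo("/path/to/source")                                          # hypothetical source repository
    cmd_clone = repo.clone("/tmp/clone-cmd.git", bare=True)                 # default backend: GitCmdObjectDB
    pure_clone = repo.clone("/tmp/clone-pure.git", bare=True, odbt=GitDB)   # pure-Python object database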

‎test/git/performance/lib.py

Lines changed: 18 additions & 5 deletions

@@ -4,6 +4,11 @@
 import shutil
 import tempfile
 
+from git.db import (
+	GitCmdObjectDB,
+	GitDB
+	)
+
 from git import (
 	Repo
 	)
@@ -31,9 +36,14 @@ class TestBigRepoR(TestBase):
 	"""TestCase providing access to readonly 'big' repositories using the following
	member variables:
 	
-	* gitrepo
+	* gitrorepo
 	
-	* Read-Only git repository - actually the repo of git itself"""
+	* Read-Only git repository - actually the repo of git itself
+	
+	* puregitrorepo
+	
+	* As gitrepo, but uses pure python implementation
+	"""
 	
 	#{ Invariants
 	head_sha_2k = '235d521da60e4699e5bd59ac658b5b48bd76ddca'
@@ -43,20 +53,23 @@ class TestBigRepoR(TestBase):
 	@classmethod
 	def setUpAll(cls):
 		super(TestBigRepoR, cls).setUpAll()
-		cls.gitrorepo = Repo(resolve_or_fail(k_env_git_repo))
+		repo_path = resolve_or_fail(k_env_git_repo)
+		cls.gitrorepo = Repo(repo_path, odbt=GitCmdObjectDB)
+		cls.puregitrorepo = Repo(repo_path, odbt=GitDB)
 
 
 class TestBigRepoRW(TestBigRepoR):
 	"""As above, but provides a big repository that we can write to.
 	
-	Provides ``self.gitrwrepo``"""
+	Provides ``self.gitrwrepo`` and ``self.puregitrwrepo``"""
 	
 	@classmethod
 	def setUpAll(cls):
 		super(TestBigRepoRW, cls).setUpAll()
 		dirname = tempfile.mktemp()
 		os.mkdir(dirname)
-		cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True)
+		cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True, odbt=GitCmdObjectDB)
+		cls.puregitrwrepo = Repo(dirname, odbt=GitDB)
 
 	@classmethod
 	def tearDownAll(cls):
‎test/git/performance/test_odb.py

Lines changed: 52 additions & 44 deletions

@@ -12,50 +12,58 @@
 class TestObjDBPerformance(TestBigRepoR):
 	
 	def test_random_access(self):
-		
-		# GET COMMITS
-		# TODO: use the actual db for this
-		st = time()
-		root_commit = self.gitrorepo.commit(self.head_sha_2k)
-		commits = list(root_commit.traverse())
-		nc = len(commits)
-		elapsed = time() - st
-		
-		print >> sys.stderr, "Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (nc, elapsed, nc / elapsed)
+		results = [ ["Iterate Commits"], ["Iterate Blobs"], ["Retrieve Blob Data"] ]
+		for repo in (self.gitrorepo, self.puregitrorepo):
+			# GET COMMITS
+			st = time()
+			root_commit = repo.commit(self.head_sha_2k)
+			commits = list(root_commit.traverse())
+			nc = len(commits)
+			elapsed = time() - st
 		
+			print >> sys.stderr, "%s: Retrieved %i commits from ObjectStore in %g s ( %f commits / s )" % (type(repo.odb), nc, elapsed, nc / elapsed)
+			results[0].append(elapsed)
+			
+			# GET TREES
+			# walk all trees of all commits
+			st = time()
+			blobs_per_commit = list()
+			nt = 0
+			for commit in commits:
+				tree = commit.tree
+				blobs = list()
+				for item in tree.traverse():
+					nt += 1
+					if item.type == 'blob':
+						blobs.append(item)
+					# direct access for speed
+				# END while trees are there for walking
+				blobs_per_commit.append(blobs)
+			# END for each commit
+			elapsed = time() - st
 		
-		# GET TREES
-		# walk all trees of all commits
-		st = time()
-		blobs_per_commit = list()
-		nt = 0
-		for commit in commits:
-			tree = commit.tree
-			blobs = list()
-			for item in tree.traverse():
-				nt += 1
-				if item.type == 'blob':
-					blobs.append(item)
-				# direct access for speed
-			# END while trees are there for walking
-			blobs_per_commit.append(blobs)
-		# END for each commit
-		elapsed = time() - st
-		
-		print >> sys.stderr, "Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (nt, len(commits), elapsed, nt / elapsed)
-		
-		# GET BLOBS
-		st = time()
-		nb = 0
-		too_many = 15000
-		for blob_list in blobs_per_commit:
-			for blob in blob_list:
-				blob.data
-			# END for each blobsha
-			nb += len(blob_list)
-			if nb > too_many:
-				break
-		# END for each bloblist
-		elapsed = time() - st
+			print >> sys.stderr, "%s: Retrieved %i objects from %i commits in %g s ( %f objects / s )" % (type(repo.odb), nt, len(commits), elapsed, nt / elapsed)
+			results[1].append(elapsed)
+			
+			# GET BLOBS
+			st = time()
+			nb = 0
+			too_many = 15000
+			for blob_list in blobs_per_commit:
+				for blob in blob_list:
+					blob.data
+				# END for each blobsha
+				nb += len(blob_list)
+				if nb > too_many:
+					break
+			# END for each bloblist
+			elapsed = time() - st
+			
+			print >> sys.stderr, "%s: Retrieved %i blob and their data in %g s ( %f blobs / s )" % (type(repo.odb), nb, elapsed, nb / elapsed)
+			results[2].append(elapsed)
+		# END for each repo type
 		
-		print >> sys.stderr, "Retrieved %i blob and their data in %g s ( %f blobs / s )" % (nb, elapsed, nb / elapsed)
+		# final results
+		for test_name, a, b in results:
+			print >> sys.stderr, "%s: %f s vs %f s, pure is %f times slower" % (test_name, a, b, b / a)
+		# END for each result
