From be6cbbefaf6c204a6564154ceebd0942d6aaf46a Mon Sep 17 00:00:00 2001 From: Purdea Andrei Date: Tue, 14 Jul 2015 00:48:40 +0300 Subject: [PATCH 1/3] Refactor the "git fat checkout" operation (and therefore also "git fat pull") to significantly improve its speed. The main change here is that git checkout-index is now called only a single time for all files. In some tests the performance improvement is six-fold. With this change it doesn't make sense anymore to print out each file as it is being processed, so only the number of files that need to be restored is shown. The actual filenames are only printed if git-fat is verbose. --- git-fat | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/git-fat b/git-fat index dd6af72..df18970 100755 --- a/git-fat +++ b/git-fat @@ -403,21 +403,38 @@ class GitFat(object): def checkout(self, show_orphans=False): 'Update any stale files in the present working tree' self.assert_init_done() - for digest, fname in self.orphan_files(): - objpath = os.path.join(self.objdir, digest) - if os.access(objpath, os.R_OK): - print('Restoring %s -> %s' % (digest, fname)) - # The output of our smudge filter depends on the existence of - # the file in .git/fat/objects, but git caches the file stat - # from the previous time the file was smudged, therefore it - # won't try to re-smudge. I don't know a git command that - # specifically invalidates that cache, but touching the file - # also does the trick. - os.utime(fname, None) - # This re-smudge is essentially a copy that restores permissions. - subprocess.check_call(['git', 'checkout-index', '--index', '--force', fname]) - elif show_orphans: - print('Data unavailable: %s %s' % (digest,fname)) + orphan_files_is_accessible = [(digest, fname, os.access(os.path.join(self.objdir, digest), os.R_OK)) for digest, fname in self.orphan_files()] + filenames_to_restore = [fname for digest, fname, accessible in orphan_files_is_accessible if accessible] + + for fname in filenames_to_restore: + # The output of our smudge filter depends on the existence of + # the file in .git/fat/objects, but git caches the file stat + # from the previous time the file was smudged, therefore it + # won't try to re-smudge. I don't know a git command that + # specifically invalidates that cache, but touching the file + # also does the trick. + os.utime(fname, None) + + if show_orphans: + for digest, fname, accessible in orphan_files_is_accessible: + if not accessible: + print('Data unavailable: %s %s' % (digest,fname)) + + filenames_str = "\n".join(filenames_to_restore) + "\n" + + print('Restoring %d Files' % (len(filenames_to_restore),)) + self.verbose(filenames_str) + + if filenames_to_restore: + # This re-smudge is essentially a copy that restores permissions. + cmd = ['git', 'checkout-index', '--stdin', '--index', '--force'] + p = subprocess.Popen(cmd, stdin = subprocess.PIPE) + p.communicate(filenames_str) + retcode = p.wait() + if retcode != 0: + error = subprocess.CalledProcessError(retcode, " ".join(cmd)) + raise error + def cmd_pull(self, args): 'Pull anything that I have referenced, but not stored' self.setup() From 8449d4aaf456ff9c2a3ed2e648b4403853e86e7f Mon Sep 17 00:00:00 2001 From: Purdea Andrei Date: Tue, 14 Jul 2015 01:33:42 +0300 Subject: [PATCH 2/3] git fat checkout / git fat pull: Print digest of restored filenames too in verbose mode. --- git-fat | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/git-fat b/git-fat index df18970..3cc600d 100755 --- a/git-fat +++ b/git-fat @@ -125,7 +125,8 @@ def gitconfig_set(name, value, file=None): class GitFat(object): DecodeError = RuntimeError def __init__(self): - self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore + self.is_verbose = os.environ.get('GIT_FAT_VERBOSE') + self.verbose = verbose_stderr if self.is_verbose else verbose_ignore try: self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() except subprocess.CalledProcessError: @@ -422,8 +423,13 @@ class GitFat(object): filenames_str = "\n".join(filenames_to_restore) + "\n" - print('Restoring %d Files' % (len(filenames_to_restore),)) - self.verbose(filenames_str) + if not self.is_verbose: + print('Restoring %d Files' % (len(filenames_to_restore),)) + else: + print('Restoring %d Files:' % (len(filenames_to_restore),)) + for digest, fname, accessible in orphan_files_is_accessible: + if accessible: + print('%s: %s' % (digest, fname)) if filenames_to_restore: # This re-smudge is essentially a copy that restores permissions. From 3641e8a2664c2964cb90e01ca30ee67efae67e1a Mon Sep 17 00:00:00 2001 From: Purdea Andrei Date: Tue, 14 Jul 2015 01:34:42 +0300 Subject: [PATCH 3/3] git fat checkout, git fat pull: use null-terminated filename strings when talking to git. --- git-fat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/git-fat b/git-fat index 3cc600d..2d00762 100755 --- a/git-fat +++ b/git-fat @@ -421,7 +421,7 @@ class GitFat(object): if not accessible: print('Data unavailable: %s %s' % (digest,fname)) - filenames_str = "\n".join(filenames_to_restore) + "\n" + filenames_nullterm = "\x00".join(filenames_to_restore) if not self.is_verbose: print('Restoring %d Files' % (len(filenames_to_restore),)) @@ -433,9 +433,9 @@ class GitFat(object): if filenames_to_restore: # This re-smudge is essentially a copy that restores permissions. - cmd = ['git', 'checkout-index', '--stdin', '--index', '--force'] + cmd = ['git', 'checkout-index', '--stdin', '-z', '--index', '--force'] p = subprocess.Popen(cmd, stdin = subprocess.PIPE) - p.communicate(filenames_str) + p.communicate(filenames_nullterm) retcode = p.wait() if retcode != 0: error = subprocess.CalledProcessError(retcode, " ".join(cmd))