From aa338d35087fdfc268d6f386a40464056acc6237 Mon Sep 17 00:00:00 2001
From: Jeff King
Date: Mon, 20 Nov 2017 15:26:43 -0500
Subject: p5550: factor out nonsense-pack creation

We have a function to create a bunch of irrelevant packs to measure
the expense of reprepare_packed_git(). Let's make that available to
other perf scripts.

Signed-off-by: Jeff King
Signed-off-by: Junio C Hamano
---
 t/perf/lib-pack.sh         | 29 +++++++++++++++++++++++++++++
 t/perf/p5550-fetch-tags.sh | 25 ++-----------------------
 2 files changed, 31 insertions(+), 23 deletions(-)
 create mode 100644 t/perf/lib-pack.sh

diff --git a/t/perf/lib-pack.sh b/t/perf/lib-pack.sh
new file mode 100644
index 0000000000..501bb7b272
--- /dev/null
+++ b/t/perf/lib-pack.sh
@@ -0,0 +1,29 @@
+# Helpers for dealing with large numbers of packs.
+
+# create $1 nonsense packs, each with a single blob
+create_packs () {
+	perl -le '
+		my ($n) = @ARGV;
+		for (1..$n) {
+			print "blob";
+			print "data <

From: Jeff King
Date: Mon, 20 Nov 2017 15:27:19 -0500
Subject: t/perf/lib-pack: use fast-import checkpoint to create packs

We currently use fast-import only to create a large number of
objects, and then run O(n) invocations of pack-objects to turn them
into packs.

We can do this faster by just asking fast-import to checkpoint and
create a pack for each (after telling it not to turn loose tiny
packs).

Signed-off-by: Jeff King
Signed-off-by: Junio C Hamano
---
 t/perf/lib-pack.sh | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/t/perf/lib-pack.sh b/t/perf/lib-pack.sh
index 501bb7b272..d3865db286 100644
--- a/t/perf/lib-pack.sh
+++ b/t/perf/lib-pack.sh
@@ -9,15 +9,10 @@ create_packs () {
 			print "data <

From: Jeff King
Date: Mon, 20 Nov 2017 15:28:28 -0500
Subject: p5551: add a script to test fetch pack-dir rescans

Since fetch often deals with object-ids we don't have (yet), it's an
easy mistake for it to use a function like parse_object() that gives
the correct result (e.g., NULL) but does so very slowly (because
after failing to find the object, we re-scan the pack directory
looking for new packs).

The regular test suite won't catch this because the end result is
correct, but we would want to know about performance regressions,
too. Let's add a test to the regression suite.

Note that this uses a synthetic repository that has a large number of
packs. That's not ideal, as it means we're not testing what "normal"
users see (in fact, some of these problems have existed for ages
without anybody noticing simply because a rescan on a normal
repository just isn't that expensive).

So what we're really looking for here is the spike you'd notice in a
pathological case (a lot of unknown objects coming into a repo with a
lot of packs). If that's fast, then the normal cases should be, too.

Note that the test also makes liberal use of $MODERN_GIT for setup;
some of these regressions go back a ways, and we should be able to
use it to find the problems there.
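The many-pack setup in this test comes from the setup_many_packs helper in
lib-pack.sh, added by the first two patches of this series. As a rough sketch
of that checkpoint-based technique (not the verbatim upstream script; the
pack count of 500 and the exact comments are assumptions here), the helpers
could look something like this, leaning on fast-import's `checkpoint` command
and the fastimport.unpackLimit config:

    # Illustrative sketch only, not the literal lib-pack.sh:
    # create $1 tiny packs, each containing a single throwaway blob.
    create_packs () {
        perl -le '
            my ($n) = @ARGV;
            for (1..$n) {
                print "blob";
                print "data <<EOF";
                print "$_";
                print "EOF";
                # ask fast-import to finish the current packfile now
                print "checkpoint";
            }
        ' "$@" |
        # unpackLimit=0 keeps even one-object packs as packs instead of
        # exploding them into loose objects
        git -c fastimport.unpackLimit=0 fast-import
    }

    # the exact count is arbitrary; it only has to be large enough that
    # re-scanning the pack directory becomes noticeably expensive
    setup_many_packs () {
        create_packs 500
    }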
Signed-off-by: Jeff King
Signed-off-by: Junio C Hamano
---
 t/perf/p5551-fetch-rescan.sh | 55 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100755 t/perf/p5551-fetch-rescan.sh

diff --git a/t/perf/p5551-fetch-rescan.sh b/t/perf/p5551-fetch-rescan.sh
new file mode 100755
index 0000000000..b99dc23e32
--- /dev/null
+++ b/t/perf/p5551-fetch-rescan.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+
+test_description='fetch performance with many packs
+
+It is common for fetch to consider objects that we might not have, and it is an
+easy mistake for the code to use a function like `parse_object` that might
+give the correct _answer_ on such an object, but do so slowly (due to
+re-scanning the pack directory for lookup failures).
+
+The resulting performance drop can be hard to notice in a real repository, but
+becomes quite large in a repository with a large number of packs. So this
+test creates a more pathological case, since any mistakes would produce a more
+noticeable slowdown.
+'
+. ./perf-lib.sh
+. "$TEST_DIRECTORY"/perf/lib-pack.sh
+
+test_expect_success 'create parent and child' '
+	git init parent &&
+	git clone parent child
+'
+
+
+test_expect_success 'create refs in the parent' '
+	(
+		cd parent &&
+		git commit --allow-empty -m foo &&
+		head=$(git rev-parse HEAD) &&
+		test_seq 1000 |
+		sed "s,.*,update refs/heads/& $head," |
+		$MODERN_GIT update-ref --stdin
+	)
+'
+
+test_expect_success 'create many packs in the child' '
+	(
+		cd child &&
+		setup_many_packs
+	)
+'
+
+test_perf 'fetch' '
+	# start at the same state for each iteration
+	obj=$($MODERN_GIT -C parent rev-parse HEAD) &&
+	(
+		cd child &&
+		$MODERN_GIT for-each-ref --format="delete %(refname)" refs/remotes |
+		$MODERN_GIT update-ref --stdin &&
+		rm -vf .git/objects/$(echo $obj | sed "s|^..|&/|") &&
+
+		git fetch
+	)
+'
+
+test_done
-- 
cgit v1.2.3

From c291293b2ecec8ca77dfd218fa820dd7a0137a2b Mon Sep 17 00:00:00 2001
From: Jeff King
Date: Mon, 20 Nov 2017 15:29:21 -0500
Subject: everything_local: use "quick" object existence check

In b495697b82 (fetch-pack: avoid repeatedly re-scanning pack
directory, 2013-01-26), we noticed that everything_local() could
waste time trying to find and parse objects which we _expect_ to be
missing. The solution was to put has_sha1_file() in front of
parse_object() to skip the more-expensive parse attempt.

That optimization was negated later when has_sha1_file() learned to
do the same re-scan in 45e8a74873 (has_sha1_file: re-check pack
directory before giving up, 2013-08-30). We can restore it by using
the "quick" flag to tell has_sha1_file (actually has_object_file
these days) that we prefer speed to thoroughness for this call.

See also the fixes in 5827a0354 and 0eeb077be7 for prior art and
discussion on using the "quick" flag for these cases.

The recently-added performance regression test in p5551 demonstrates
the problem. You can see the original fix:

Test            b495697b82^       b495697b82
--------------------------------------------------------
5551.4: fetch   1.68(1.33+0.35)   0.87(0.69+0.18) -48.2%

and then the regression:

Test            45e8a74873^       45e8a74873
---------------------------------------------------------
5551.4: fetch   0.96(0.77+0.19)   2.55(2.04+0.50) +165.6%

and now our fix:

Test            HEAD^             HEAD
--------------------------------------------------------
5551.4: fetch   7.21(6.58+0.63)   5.47(5.04+0.43) -24.1%

You can also see that other things have gotten a lot slower since
2013. We'll deal with those in separate patches.
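As a usage note, timing tables like the ones above are what the t/perf
harness prints when comparing revisions. Assuming a clone of git.git and a
working build environment (the revisions below are simply the ones quoted in
the middle table), a before-and-after run of this one script looks roughly
like:

    # ./run builds each named revision and runs the given perf script
    # against both builds, printing a comparison table
    cd t/perf
    ./run 45e8a74873^ 45e8a74873 -- p5551-fetch-rescan.sh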
Signed-off-by: Jeff King
Signed-off-by: Junio C Hamano
---
 fetch-pack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fetch-pack.c b/fetch-pack.c
index fbbc99c888..eb553cb631 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -715,7 +715,8 @@ static int everything_local(struct fetch_pack_args *args,
 	for (ref = *refs; ref; ref = ref->next) {
 		struct object *o;
 
-		if (!has_object_file(&ref->old_oid))
+		if (!has_object_file_with_flags(&ref->old_oid,
+						OBJECT_INFO_QUICK))
 			continue;
 
 		o = parse_object(&ref->old_oid);
-- 
cgit v1.2.3

From 87b5e236a14d4783b063041588c2f77f3cc6ee89 Mon Sep 17 00:00:00 2001
From: Jeff King
Date: Tue, 21 Nov 2017 18:17:39 -0500
Subject: sha1_file: fast-path null sha1 as a missing object

In theory nobody should ever ask the low-level object code for a null
sha1. It's used as a sentinel for "no such object" in lots of places,
so leaking through to this level is a sign that the higher-level code
is not being careful about its error-checking. In practice, though,
quite a few code paths seem to rely on the null sha1 lookup failing
as a way to quietly propagate non-existence (e.g., by feeding it to
lookup_commit_reference_gently(), which then returns NULL).

When this happens, we do two inefficient things:

  1. We actually search for the null sha1 in packs and in the loose
     object directory.

  2. When we fail to find it, we re-scan the pack directory in case a
     simultaneous repack happened to move it from loose to packed.
     This can be very expensive if you have a large number of packs.

Only the second one actually causes noticeable performance problems,
so we could treat them independently. But for the sake of simplicity
(both of code and of reasoning about it), it makes sense to just
declare that the null sha1 cannot be a real on-disk object, and
looking it up will always return "no such object".

There's no real loss of functionality to do so; its use as a sentinel
value means that anybody who is unlucky enough to hit the 2^-160th
chance of generating an object with that sha1 is already going to
find the object largely unusable.

In an ideal world, we'd simply fix all of the callers to notice the
null sha1 and avoid passing it to us. But a simple experiment to
catch this with a BUG() shows that there are a large number of code
paths that do so.

So in the meantime, let's fix the performance problem by taking a
fast exit from the object lookup when we see a null sha1. p5551 shows
off the improvement (when a fetched ref is new, the "old" sha1 is
0{40}, which ends up being passed for fast-forward checks, the status
table abbreviations, etc):

Test            HEAD^             HEAD
--------------------------------------------------------
5551.4: fetch   5.51(5.03+0.48)   0.17(0.10+0.06) -96.9%

Signed-off-by: Jeff King
Signed-off-by: Junio C Hamano
---
 sha1_file.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sha1_file.c b/sha1_file.c
index bd5f82e664..fbb73f557a 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -2971,6 +2971,9 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi,
 		lookup_replace_object(sha1) :
 		sha1;
 
+	if (is_null_sha1(real))
+		return -1;
+
 	if (!oi)
 		oi = &blank_oi;
 
-- 
cgit v1.2.3
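As a rough interactive illustration of the last patch (not part of the series
itself, and making no claim about the exact error text, which varies by
version), asking the object layer about the all-zero sha1 simply reports a
missing object, now without probing loose objects or re-scanning the pack
directory:

    # hypothetical demonstration in any repository
    null=$(printf "%040d" 0)   # the 40-hex all-zero sentinel
    git cat-file -t "$null"    # fails: the sentinel is never a real object
    echo "exit: $?"            # non-zero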