From 27e1e22d5ee3005f228b67ea94b5af29547b54fe Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Oct 2014 18:38:55 -0400 Subject: prune: factor out loose-object directory traversal Prune has to walk $GIT_DIR/objects/?? in order to find the set of loose objects to prune. Other parts of the code (e.g., count-objects) want to do the same. Let's factor it out into a reusable for_each-style function. Note that this is not quite a straight code movement. The original code had strange behavior when it found a file of the form "[0-9a-f]{2}/.{38}" that did _not_ contain all hex digits. It executed a "break" from the loop, meaning that we stopped pruning in that directory (but still pruned other directories!). This was probably a bug; we do not want to process the file as an object, but we should keep going otherwise (and that is how the new code handles it). We are also a little more careful with loose object directories which fail to open. The original code silently ignored any failures, but the new code will complain about any problems besides ENOENT. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/prune.c | 87 +++++++++++++++++---------------------------------------- 1 file changed, 26 insertions(+), 61 deletions(-) (limited to 'builtin') diff --git a/builtin/prune.c b/builtin/prune.c index 144a3bdb33..763f53e06a 100644 --- a/builtin/prune.c +++ b/builtin/prune.c @@ -31,11 +31,23 @@ static int prune_tmp_file(const char *fullpath) return 0; } -static int prune_object(const char *fullpath, const unsigned char *sha1) +static int prune_object(const unsigned char *sha1, const char *fullpath, + void *data) { struct stat st; - if (lstat(fullpath, &st)) - return error("Could not stat '%s'", fullpath); + + /* + * Do we know about this object? + * It must have been reachable + */ + if (lookup_object(sha1)) + return 0; + + if (lstat(fullpath, &st)) { + /* report errors, but do not stop pruning */ + error("Could not stat '%s'", fullpath); + return 0; + } if (st.st_mtime > expire) return 0; if (show_only || verbose) { @@ -48,68 +60,20 @@ static int prune_object(const char *fullpath, const unsigned char *sha1) return 0; } -static int prune_dir(int i, struct strbuf *path) +static int prune_cruft(const char *basename, const char *path, void *data) { - size_t baselen = path->len; - DIR *dir = opendir(path->buf); - struct dirent *de; - - if (!dir) - return 0; - - while ((de = readdir(dir)) != NULL) { - char name[100]; - unsigned char sha1[20]; - - if (is_dot_or_dotdot(de->d_name)) - continue; - if (strlen(de->d_name) == 38) { - sprintf(name, "%02x", i); - memcpy(name+2, de->d_name, 39); - if (get_sha1_hex(name, sha1) < 0) - break; - - /* - * Do we know about this object? - * It must have been reachable - */ - if (lookup_object(sha1)) - continue; - - strbuf_addf(path, "/%s", de->d_name); - prune_object(path->buf, sha1); - strbuf_setlen(path, baselen); - continue; - } - if (starts_with(de->d_name, "tmp_obj_")) { - strbuf_addf(path, "/%s", de->d_name); - prune_tmp_file(path->buf); - strbuf_setlen(path, baselen); - continue; - } - fprintf(stderr, "bad sha1 file: %s/%s\n", path->buf, de->d_name); - } - closedir(dir); - if (!show_only) - rmdir(path->buf); + if (starts_with(basename, "tmp_obj_")) + prune_tmp_file(path); + else + fprintf(stderr, "bad sha1 file: %s\n", path); return 0; } -static void prune_object_dir(const char *path) +static int prune_subdir(int nr, const char *path, void *data) { - struct strbuf buf = STRBUF_INIT; - size_t baselen; - int i; - - strbuf_addstr(&buf, path); - strbuf_addch(&buf, '/'); - baselen = buf.len; - - for (i = 0; i < 256; i++) { - strbuf_addf(&buf, "%02x", i); - prune_dir(i, &buf); - strbuf_setlen(&buf, baselen); - } + if (!show_only) + rmdir(path); + return 0; } /* @@ -173,7 +137,8 @@ int cmd_prune(int argc, const char **argv, const char *prefix) mark_reachable_objects(&revs, 1, progress); stop_progress(&progress); - prune_object_dir(get_object_directory()); + for_each_loose_file_in_objdir(get_object_directory(), prune_object, + prune_cruft, prune_subdir, NULL); prune_packed_objects(show_only ? PRUNE_PACKED_DRY_RUN : 0); remove_temporary_files(get_object_directory()); -- cgit v1.2.3 From 0d3b729680e9cab0f567122e796dab2bc9bc8cfb Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Oct 2014 18:40:53 -0400 Subject: prune-packed: use for_each_loose_file_in_objdir This saves us from manually traversing the directory structure ourselves. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/prune-packed.c | 69 +++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 46 deletions(-) (limited to 'builtin') diff --git a/builtin/prune-packed.c b/builtin/prune-packed.c index d430731d70..f24a2c2bdc 100644 --- a/builtin/prune-packed.c +++ b/builtin/prune-packed.c @@ -10,65 +10,42 @@ static const char * const prune_packed_usage[] = { static struct progress *progress; -static void prune_dir(int i, DIR *dir, struct strbuf *pathname, int opts) +static int prune_subdir(int nr, const char *path, void *data) { - struct dirent *de; - char hex[40]; - int top_len = pathname->len; + int *opts = data; + display_progress(progress, nr + 1); + if (!(*opts & PRUNE_PACKED_DRY_RUN)) + rmdir(path); + return 0; +} + +static int prune_object(const unsigned char *sha1, const char *path, + void *data) +{ + int *opts = data; - sprintf(hex, "%02x", i); - while ((de = readdir(dir)) != NULL) { - unsigned char sha1[20]; - if (strlen(de->d_name) != 38) - continue; - memcpy(hex + 2, de->d_name, 38); - if (get_sha1_hex(hex, sha1)) - continue; - if (!has_sha1_pack(sha1)) - continue; + if (!has_sha1_pack(sha1)) + return 0; - strbuf_add(pathname, de->d_name, 38); - if (opts & PRUNE_PACKED_DRY_RUN) - printf("rm -f %s\n", pathname->buf); - else - unlink_or_warn(pathname->buf); - display_progress(progress, i + 1); - strbuf_setlen(pathname, top_len); - } + if (*opts & PRUNE_PACKED_DRY_RUN) + printf("rm -f %s\n", path); + else + unlink_or_warn(path); + return 0; } void prune_packed_objects(int opts) { - int i; - const char *dir = get_object_directory(); - struct strbuf pathname = STRBUF_INIT; - int top_len; - - strbuf_addstr(&pathname, dir); if (opts & PRUNE_PACKED_VERBOSE) progress = start_progress_delay(_("Removing duplicate objects"), 256, 95, 2); - if (pathname.len && pathname.buf[pathname.len - 1] != '/') - strbuf_addch(&pathname, '/'); - - top_len = pathname.len; - for (i = 0; i < 256; i++) { - DIR *d; + for_each_loose_file_in_objdir(get_object_directory(), + prune_object, NULL, prune_subdir, &opts); - display_progress(progress, i + 1); - strbuf_setlen(&pathname, top_len); - strbuf_addf(&pathname, "%02x/", i); - d = opendir(pathname.buf); - if (!d) - continue; - prune_dir(i, d, &pathname, opts); - closedir(d); - strbuf_setlen(&pathname, top_len + 2); - rmdir(pathname.buf); - } + /* Ensure we show 100% before finishing progress */ + display_progress(progress, 256); stop_progress(&progress); - strbuf_release(&pathname); } int cmd_prune_packed(int argc, const char **argv, const char *prefix) -- cgit v1.2.3 From cac05d4dfd148071462939a61ecd44cf932a0b02 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Oct 2014 18:40:58 -0400 Subject: count-objects: do not use xsize_t when counting object size The point of xsize_t is to safely cast an off_t into a size_t (because we are about to mmap). But in count-objects, we are summing the sizes in an off_t. Using xsize_t means that count-objects could fail on a 32-bit system with a 4G object (not likely, as other parts of git would fail, but we should at least be correct here). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/count-objects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'builtin') diff --git a/builtin/count-objects.c b/builtin/count-objects.c index a7f70cb858..316a805a83 100644 --- a/builtin/count-objects.c +++ b/builtin/count-objects.c @@ -53,7 +53,7 @@ static void count_objects(DIR *d, char *path, int len, int verbose, if (lstat(path, &st) || !S_ISREG(st.st_mode)) bad = 1; else - (*loose_size) += xsize_t(on_disk_bytes(st)); + (*loose_size) += on_disk_bytes(st); } if (bad) { if (verbose) { -- cgit v1.2.3 From 4a1e693a30c816fa5293c5f9e83e1e98ee87584e Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Oct 2014 18:41:11 -0400 Subject: count-objects: use for_each_loose_file_in_objdir This drops our line count considerably, and should make things more readable by keeping the counting logic separate from the traversal. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/count-objects.c | 101 ++++++++++++++---------------------------------- 1 file changed, 30 insertions(+), 71 deletions(-) (limited to 'builtin') diff --git a/builtin/count-objects.c b/builtin/count-objects.c index 316a805a83..e47ef0b1af 100644 --- a/builtin/count-objects.c +++ b/builtin/count-objects.c @@ -11,6 +11,9 @@ static unsigned long garbage; static off_t size_garbage; +static int verbose; +static unsigned long loose, packed, packed_loose; +static off_t loose_size; static void real_report_garbage(const char *desc, const char *path) { @@ -21,61 +24,31 @@ static void real_report_garbage(const char *desc, const char *path) garbage++; } -static void count_objects(DIR *d, char *path, int len, int verbose, - unsigned long *loose, - off_t *loose_size, - unsigned long *packed_loose) +static void loose_garbage(const char *path) { - struct dirent *ent; - while ((ent = readdir(d)) != NULL) { - char hex[41]; - unsigned char sha1[20]; - const char *cp; - int bad = 0; + if (verbose) + report_garbage("garbage found", path); +} - if (is_dot_or_dotdot(ent->d_name)) - continue; - for (cp = ent->d_name; *cp; cp++) { - int ch = *cp; - if (('0' <= ch && ch <= '9') || - ('a' <= ch && ch <= 'f')) - continue; - bad = 1; - break; - } - if (cp - ent->d_name != 38) - bad = 1; - else { - struct stat st; - memcpy(path + len + 3, ent->d_name, 38); - path[len + 2] = '/'; - path[len + 41] = 0; - if (lstat(path, &st) || !S_ISREG(st.st_mode)) - bad = 1; - else - (*loose_size) += on_disk_bytes(st); - } - if (bad) { - if (verbose) { - struct strbuf sb = STRBUF_INIT; - strbuf_addf(&sb, "%.*s/%s", - len + 2, path, ent->d_name); - report_garbage("garbage found", sb.buf); - strbuf_release(&sb); - } - continue; - } - (*loose)++; - if (!verbose) - continue; - memcpy(hex, path+len, 2); - memcpy(hex+2, ent->d_name, 38); - hex[40] = 0; - if (get_sha1_hex(hex, sha1)) - die("internal error"); - if (has_sha1_pack(sha1)) - (*packed_loose)++; +static int count_loose(const unsigned char *sha1, const char *path, void *data) +{ + struct stat st; + + if (lstat(path, &st) || !S_ISREG(st.st_mode)) + loose_garbage(path); + else { + loose_size += on_disk_bytes(st); + loose++; + if (verbose && has_sha1_pack(sha1)) + packed_loose++; } + return 0; +} + +static int count_cruft(const char *basename, const char *path, void *data) +{ + loose_garbage(path); + return 0; } static char const * const count_objects_usage[] = { @@ -85,12 +58,7 @@ static char const * const count_objects_usage[] = { int cmd_count_objects(int argc, const char **argv, const char *prefix) { - int i, verbose = 0, human_readable = 0; - const char *objdir = get_object_directory(); - int len = strlen(objdir); - char *path = xmalloc(len + 50); - unsigned long loose = 0, packed = 0, packed_loose = 0; - off_t loose_size = 0; + int human_readable = 0; struct option opts[] = { OPT__VERBOSE(&verbose, N_("be verbose")), OPT_BOOL('H', "human-readable", &human_readable, @@ -104,19 +72,10 @@ int cmd_count_objects(int argc, const char **argv, const char *prefix) usage_with_options(count_objects_usage, opts); if (verbose) report_garbage = real_report_garbage; - memcpy(path, objdir, len); - if (len && objdir[len-1] != '/') - path[len++] = '/'; - for (i = 0; i < 256; i++) { - DIR *d; - sprintf(path + len, "%02x", i); - d = opendir(path); - if (!d) - continue; - count_objects(d, path, len, verbose, - &loose, &loose_size, &packed_loose); - closedir(d); - } + + for_each_loose_file_in_objdir(get_object_directory(), + count_loose, count_cruft, NULL, NULL); + if (verbose) { struct packed_git *p; unsigned long num_pack = 0; -- cgit v1.2.3 From d3038d22f91aad9620bd8e6fc43fc67c16219738 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Oct 2014 18:41:35 -0400 Subject: prune: keep objects reachable from recent objects Our current strategy with prune is that an object falls into one of three categories: 1. Reachable (from ref tips, reflogs, index, etc). 2. Not reachable, but recent (based on the --expire time). 3. Not reachable and not recent. We keep objects from (1) and (2), but prune objects in (3). The point of (2) is that these objects may be part of an in-progress operation that has not yet updated any refs. However, it is not always the case that objects for an in-progress operation will have a recent mtime. For example, the object database may have an old copy of a blob (from an abandoned operation, a branch that was deleted, etc). If we create a new tree that points to it, a simultaneous prune will leave our tree, but delete the blob. Referencing that tree with a commit will then work (we check that the tree is in the object database, but not that all of its referred objects are), as will mentioning the commit in a ref. But the resulting repo is corrupt; we are missing the blob reachable from a ref. One way to solve this is to be more thorough when referencing a sha1: make sure that not only do we have that sha1, but that we have objects it refers to, and so forth recursively. The problem is that this is very expensive. Creating a parent link would require traversing the entire object graph! Instead, this patch pushes the extra work onto prune, which runs less frequently (and has to look at the whole object graph anyway). It creates a new category of objects: objects which are not recent, but which are reachable from a recent object. We do not prune these objects, just like the reachable and recent ones. This lets us avoid the recursive check above, because if we have an object, even if it is unreachable, we should have its referent. We can make a simple inductive argument that with this patch, this property holds (that there are no objects with missing referents in the repository): 0. When we have no objects, we have nothing to refer or be referred to, so the property holds. 1. If we add objects to the repository, their direct referents must generally exist (e.g., if you create a tree, the blobs it references must exist; if you create a commit to point at the tree, the tree must exist). This is already the case before this patch. And it is not 100% foolproof (you can make bogus objects using `git hash-object`, for example), but it should be the case for normal usage. Therefore for any sequence of object additions, the property will continue to hold. 2. If we remove objects from the repository, then we will not remove a child object (like a blob) if an object that refers to it is being kept. That is the part implemented by this patch. Note, however, that our reachability check and the actual pruning are not atomic. So it _is_ still possible to violate the property (e.g., an object becomes referenced just as we are deleting it). This patch is shooting for eliminating problems where the mtimes of dependent objects differ by hours or days, and one is dropped without the other. It does nothing to help with short races. Naively, the simplest way to implement this would be to add all recent objects as tips to the reachability traversal. However, this does not perform well. In a recently-packed repository, all reachable objects will also be recent, and therefore we have to look at each object twice. This patch instead performs the reachability traversal, then follows up with a second traversal for recent objects, skipping any that have already been marked. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/prune.c | 2 +- builtin/reflog.c | 2 +- reachable.c | 112 +++++++++++++++++++++++++++++++++++++++++++++ reachable.h | 3 +- t/t6501-freshen-objects.sh | 88 +++++++++++++++++++++++++++++++++++ 5 files changed, 204 insertions(+), 3 deletions(-) create mode 100755 t/t6501-freshen-objects.sh (limited to 'builtin') diff --git a/builtin/prune.c b/builtin/prune.c index 763f53e06a..04d3b12ae4 100644 --- a/builtin/prune.c +++ b/builtin/prune.c @@ -135,7 +135,7 @@ int cmd_prune(int argc, const char **argv, const char *prefix) if (show_progress) progress = start_progress_delay(_("Checking connectivity"), 0, 0, 2); - mark_reachable_objects(&revs, 1, progress); + mark_reachable_objects(&revs, 1, expire, progress); stop_progress(&progress); for_each_loose_file_in_objdir(get_object_directory(), prune_object, prune_cruft, prune_subdir, NULL); diff --git a/builtin/reflog.c b/builtin/reflog.c index e8a8fb13b9..80bddc259b 100644 --- a/builtin/reflog.c +++ b/builtin/reflog.c @@ -649,7 +649,7 @@ static int cmd_reflog_expire(int argc, const char **argv, const char *prefix) init_revisions(&cb.revs, prefix); if (cb.verbose) printf("Marking reachable objects..."); - mark_reachable_objects(&cb.revs, 0, NULL); + mark_reachable_objects(&cb.revs, 0, 0, NULL); if (cb.verbose) putchar('\n'); } diff --git a/reachable.c b/reachable.c index d03f8294ee..55589a02eb 100644 --- a/reachable.c +++ b/reachable.c @@ -97,7 +97,109 @@ static void mark_commit(struct commit *c, void *data) mark_object(&c->object, NULL, NULL, data); } +struct recent_data { + struct rev_info *revs; + unsigned long timestamp; +}; + +static void add_recent_object(const unsigned char *sha1, + unsigned long mtime, + struct recent_data *data) +{ + struct object *obj; + enum object_type type; + + if (mtime <= data->timestamp) + return; + + /* + * We do not want to call parse_object here, because + * inflating blobs and trees could be very expensive. + * However, we do need to know the correct type for + * later processing, and the revision machinery expects + * commits and tags to have been parsed. + */ + type = sha1_object_info(sha1, NULL); + if (type < 0) + die("unable to get object info for %s", sha1_to_hex(sha1)); + + switch (type) { + case OBJ_TAG: + case OBJ_COMMIT: + obj = parse_object_or_die(sha1, NULL); + break; + case OBJ_TREE: + obj = (struct object *)lookup_tree(sha1); + break; + case OBJ_BLOB: + obj = (struct object *)lookup_blob(sha1); + break; + default: + die("unknown object type for %s: %s", + sha1_to_hex(sha1), typename(type)); + } + + if (!obj) + die("unable to lookup %s", sha1_to_hex(sha1)); + + add_pending_object(data->revs, obj, ""); +} + +static int add_recent_loose(const unsigned char *sha1, + const char *path, void *data) +{ + struct stat st; + struct object *obj = lookup_object(sha1); + + if (obj && obj->flags & SEEN) + return 0; + + if (stat(path, &st) < 0) { + /* + * It's OK if an object went away during our iteration; this + * could be due to a simultaneous repack. But anything else + * we should abort, since we might then fail to mark objects + * which should not be pruned. + */ + if (errno == ENOENT) + return 0; + return error("unable to stat %s: %s", + sha1_to_hex(sha1), strerror(errno)); + } + + add_recent_object(sha1, st.st_mtime, data); + return 0; +} + +static int add_recent_packed(const unsigned char *sha1, + struct packed_git *p, uint32_t pos, + void *data) +{ + struct object *obj = lookup_object(sha1); + + if (obj && obj->flags & SEEN) + return 0; + add_recent_object(sha1, p->mtime, data); + return 0; +} + +static int add_unseen_recent_objects_to_traversal(struct rev_info *revs, + unsigned long timestamp) +{ + struct recent_data data; + int r; + + data.revs = revs; + data.timestamp = timestamp; + + r = for_each_loose_object(add_recent_loose, &data); + if (r) + return r; + return for_each_packed_object(add_recent_packed, &data); +} + void mark_reachable_objects(struct rev_info *revs, int mark_reflog, + unsigned long mark_recent, struct progress *progress) { struct connectivity_progress cp; @@ -133,5 +235,15 @@ void mark_reachable_objects(struct rev_info *revs, int mark_reflog, if (prepare_revision_walk(revs)) die("revision walk setup failed"); traverse_commit_list(revs, mark_commit, mark_object, &cp); + + if (mark_recent) { + revs->ignore_missing_links = 1; + if (add_unseen_recent_objects_to_traversal(revs, mark_recent)) + die("unable to mark recent objects"); + if (prepare_revision_walk(revs)) + die("revision walk setup failed"); + traverse_commit_list(revs, mark_commit, mark_object, &cp); + } + display_progress(cp.progress, cp.count); } diff --git a/reachable.h b/reachable.h index 5d082adfec..141fe30879 100644 --- a/reachable.h +++ b/reachable.h @@ -2,6 +2,7 @@ #define REACHEABLE_H struct progress; -extern void mark_reachable_objects(struct rev_info *revs, int mark_reflog, struct progress *); +extern void mark_reachable_objects(struct rev_info *revs, int mark_reflog, + unsigned long mark_recent, struct progress *); #endif diff --git a/t/t6501-freshen-objects.sh b/t/t6501-freshen-objects.sh new file mode 100755 index 0000000000..de941c2cb2 --- /dev/null +++ b/t/t6501-freshen-objects.sh @@ -0,0 +1,88 @@ +#!/bin/sh +# +# This test covers the handling of objects which might have old +# mtimes in the filesystem (because they were used previously) +# and are just now becoming referenced again. +# +# We're going to do two things that are a little bit "fake" to +# help make our simulation easier: +# +# 1. We'll turn off reflogs. You can still run into +# problems with reflogs on, but your objects +# don't get pruned until both the reflog expiration +# has passed on their references, _and_ they are out +# of prune's expiration period. Dropping reflogs +# means we only have to deal with one variable in our tests, +# but the results generalize. +# +# 2. We'll use a temporary index file to create our +# works-in-progress. Most workflows would mention +# referenced objects in the index, which prune takes +# into account. However, many operations don't. For +# example, a partial commit with "git commit foo" +# will use a temporary index. Or they may not need +# an index at all (e.g., creating a new commit +# to refer to an existing tree). + +test_description='check pruning of dependent objects' +. ./test-lib.sh + +# We care about reachability, so we do not want to use +# the normal test_commit, which creates extra tags. +add () { + echo "$1" >"$1" && + git add "$1" +} +commit () { + test_tick && + add "$1" && + git commit -m "$1" +} + +test_expect_success 'disable reflogs' ' + git config core.logallrefupdates false && + rm -rf .git/logs +' + +test_expect_success 'setup basic history' ' + commit base +' + +test_expect_success 'create and abandon some objects' ' + git checkout -b experiment && + commit abandon && + git checkout master && + git branch -D experiment +' + +test_expect_success 'simulate time passing' ' + find .git/objects -type f | + xargs test-chmtime -v -86400 +' + +test_expect_success 'start writing new commit with old blob' ' + tree=$( + GIT_INDEX_FILE=index.tmp && + export GIT_INDEX_FILE && + git read-tree HEAD && + add unrelated && + add abandon && + git write-tree + ) +' + +test_expect_success 'simultaneous gc' ' + git gc --prune=12.hours.ago +' + +test_expect_success 'finish writing out commit' ' + commit=$(echo foo | git commit-tree -p HEAD $tree) && + git update-ref HEAD $commit +' + +# "abandon" blob should have been rescued by reference from new tree +test_expect_success 'repository passes fsck' ' + git fsck +' + +test_done -- cgit v1.2.3 From d0d46abc167d18fdbdbf2ece8ca6df704517c62f Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Oct 2014 18:41:53 -0400 Subject: pack-objects: refactor unpack-unreachable expiration check When we are loosening unreachable packed objects, we do not bother to process objects that would simply be pruned immediately anyway. The "would be pruned" check is a simple comparison, but is about to get more complicated. Let's pull it out into a separate function. Note that this is slightly less efficient than the original, which avoided even opening old packs, since no object in them could pass the current check, which cares only about the pack mtime. But the new rules will depend on the exact object, so we need to perform the check even for old packs. Note also that we fix a minor buglet when the pack mtime is exactly the same as the expiration time. The prune code considers that worth pruning, whereas our check here considered it worth keeping. This wasn't a big deal. Besides being unlikely to happen, the result was simply that the object was loosened and then pruned, missing the optimization. Still, we can easily fix it while we are here. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'builtin') diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index d39193453a..2fe2ab0603 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -2407,6 +2407,16 @@ static int has_sha1_pack_kept_or_nonlocal(const unsigned char *sha1) return 0; } +static int loosened_object_can_be_discarded(const unsigned char *sha1, + unsigned long mtime) +{ + if (!unpack_unreachable_expiration) + return 0; + if (mtime > unpack_unreachable_expiration) + return 0; + return 1; +} + static void loosen_unused_packed_objects(struct rev_info *revs) { struct packed_git *p; @@ -2417,17 +2427,14 @@ static void loosen_unused_packed_objects(struct rev_info *revs) if (!p->pack_local || p->pack_keep) continue; - if (unpack_unreachable_expiration && - p->mtime < unpack_unreachable_expiration) - continue; - if (open_pack_index(p)) die("cannot open pack index"); for (i = 0; i < p->num_objects; i++) { sha1 = nth_packed_object_sha1(p, i); if (!packlist_find(&to_pack, sha1, NULL) && - !has_sha1_pack_kept_or_nonlocal(sha1)) + !has_sha1_pack_kept_or_nonlocal(sha1) && + !loosened_object_can_be_discarded(sha1, p->mtime)) if (force_object_loose(sha1, p->mtime)) die("unable to force loose object"); } -- cgit v1.2.3 From abcb86553d3ec4afffa4e3963089dffe0559740e Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Oct 2014 18:42:09 -0400 Subject: pack-objects: match prune logic for discarding objects A recent commit taught git-prune to keep non-recent objects that are reachable from recent ones. However, pack-objects, when loosening unreachable objects, tries to optimize out the write in the case that the object will be immediately pruned. It now gets this wrong, since its rule does not reflect the new prune code (and this can be seen by running t6501 with a strategically placed repack). Let's teach pack-objects similar logic. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 39 +++++++++++++++++++ reachable.c | 4 +- reachable.h | 2 + t/t6501-freshen-objects.sh | 93 +++++++++++++++++++++++++++------------------- 4 files changed, 98 insertions(+), 40 deletions(-) (limited to 'builtin') diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 2fe2ab0603..4df9499040 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -20,6 +20,8 @@ #include "streaming.h" #include "thread-utils.h" #include "pack-bitmap.h" +#include "reachable.h" +#include "sha1-array.h" static const char *pack_usage[] = { N_("git pack-objects --stdout [options...] [< ref-list | < object-list]"), @@ -2407,6 +2409,15 @@ static int has_sha1_pack_kept_or_nonlocal(const unsigned char *sha1) return 0; } +/* + * Store a list of sha1s that are should not be discarded + * because they are either written too recently, or are + * reachable from another object that was. + * + * This is filled by get_object_list. + */ +static struct sha1_array recent_objects; + static int loosened_object_can_be_discarded(const unsigned char *sha1, unsigned long mtime) { @@ -2414,6 +2425,8 @@ static int loosened_object_can_be_discarded(const unsigned char *sha1, return 0; if (mtime > unpack_unreachable_expiration) return 0; + if (sha1_array_lookup(&recent_objects, sha1) >= 0) + return 0; return 1; } @@ -2470,6 +2483,19 @@ static int get_object_list_from_bitmap(struct rev_info *revs) return 0; } +static void record_recent_object(struct object *obj, + const struct name_path *path, + const char *last, + void *data) +{ + sha1_array_append(&recent_objects, obj->sha1); +} + +static void record_recent_commit(struct commit *commit, void *data) +{ + sha1_array_append(&recent_objects, commit->object.sha1); +} + static void get_object_list(int ac, const char **av) { struct rev_info revs; @@ -2517,10 +2543,23 @@ static void get_object_list(int ac, const char **av) mark_edges_uninteresting(&revs, show_edge); traverse_commit_list(&revs, show_commit, show_object, NULL); + if (unpack_unreachable_expiration) { + revs.ignore_missing_links = 1; + if (add_unseen_recent_objects_to_traversal(&revs, + unpack_unreachable_expiration)) + die("unable to add recent objects"); + if (prepare_revision_walk(&revs)) + die("revision walk setup failed"); + traverse_commit_list(&revs, record_recent_commit, + record_recent_object, NULL); + } + if (keep_unreachable) add_objects_in_unpacked_packs(&revs); if (unpack_unreachable) loosen_unused_packed_objects(&revs); + + sha1_array_clear(&recent_objects); } static int option_parse_index_version(const struct option *opt, diff --git a/reachable.c b/reachable.c index 55589a02eb..0176a88b81 100644 --- a/reachable.c +++ b/reachable.c @@ -183,8 +183,8 @@ static int add_recent_packed(const unsigned char *sha1, return 0; } -static int add_unseen_recent_objects_to_traversal(struct rev_info *revs, - unsigned long timestamp) +int add_unseen_recent_objects_to_traversal(struct rev_info *revs, + unsigned long timestamp) { struct recent_data data; int r; diff --git a/reachable.h b/reachable.h index 141fe30879..d23efc36ec 100644 --- a/reachable.h +++ b/reachable.h @@ -2,6 +2,8 @@ #define REACHEABLE_H struct progress; +extern int add_unseen_recent_objects_to_traversal(struct rev_info *revs, + unsigned long timestamp); extern void mark_reachable_objects(struct rev_info *revs, int mark_reflog, unsigned long mark_recent, struct progress *); diff --git a/t/t6501-freshen-objects.sh b/t/t6501-freshen-objects.sh index de941c2cb2..e25c47dd5c 100755 --- a/t/t6501-freshen-objects.sh +++ b/t/t6501-freshen-objects.sh @@ -39,50 +39,67 @@ commit () { git commit -m "$1" } -test_expect_success 'disable reflogs' ' - git config core.logallrefupdates false && - rm -rf .git/logs -' +maybe_repack () { + if test -n "$repack"; then + git repack -ad + fi +} + +for repack in '' true; do + title=${repack:+repack} + title=${title:-loose} + + test_expect_success "make repo completely empty ($title)" ' + rm -rf .git && + git init + ' + + test_expect_success "disable reflogs ($title)" ' + git config core.logallrefupdates false && + rm -rf .git/logs + ' -test_expect_success 'setup basic history' ' - commit base -' + test_expect_success "setup basic history ($title)" ' + commit base + ' -test_expect_success 'create and abandon some objects' ' - git checkout -b experiment && - commit abandon && - git checkout master && - git branch -D experiment -' + test_expect_success "create and abandon some objects ($title)" ' + git checkout -b experiment && + commit abandon && + maybe_repack && + git checkout master && + git branch -D experiment + ' -test_expect_success 'simulate time passing' ' - find .git/objects -type f | - xargs test-chmtime -v -86400 -' + test_expect_success "simulate time passing ($title)" ' + find .git/objects -type f | + xargs test-chmtime -v -86400 + ' -test_expect_success 'start writing new commit with old blob' ' - tree=$( - GIT_INDEX_FILE=index.tmp && - export GIT_INDEX_FILE && - git read-tree HEAD && - add unrelated && - add abandon && - git write-tree - ) -' + test_expect_success "start writing new commit with old blob ($title)" ' + tree=$( + GIT_INDEX_FILE=index.tmp && + export GIT_INDEX_FILE && + git read-tree HEAD && + add unrelated && + add abandon && + git write-tree + ) + ' -test_expect_success 'simultaneous gc' ' - git gc --prune=12.hours.ago -' + test_expect_success "simultaneous gc ($title)" ' + git gc --prune=12.hours.ago + ' -test_expect_success 'finish writing out commit' ' - commit=$(echo foo | git commit-tree -p HEAD $tree) && - git update-ref HEAD $commit -' + test_expect_success "finish writing out commit ($title)" ' + commit=$(echo foo | git commit-tree -p HEAD $tree) && + git update-ref HEAD $commit + ' -# "abandon" blob should have been rescued by reference from new tree -test_expect_success 'repository passes fsck' ' - git fsck -' + # "abandon" blob should have been rescued by reference from new tree + test_expect_success "repository passes fsck ($title)" ' + git fsck + ' +done test_done -- cgit v1.2.3 From 9e0c3c4fcdf3775a9e0256ee231efa4698297a0e Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 15 Oct 2014 18:42:57 -0400 Subject: make add_object_array_with_context interface more sane When you resolve a sha1, you can optionally keep any context found during the resolution, including the path and mode of a tree entry (e.g., when looking up "HEAD:subdir/file.c"). The add_object_array_with_context function lets you then attach that context to an entry in a list. Unfortunately, the interface for doing so is horrible. The object_context structure is large and most object_array users do not use it. Therefore we keep a pointer to the structure to avoid burdening other users too much. But that means when we do use it that we must allocate the struct ourselves. And the struct contains a fixed PATH_MAX-sized buffer, which makes this wholly unsuitable for any large arrays. We can observe that there is only a single user of the "with_context" variant: builtin/grep.c. And in that use case, the only element we care about is the path. We can therefore store only the path as a pointer (the context's mode field was redundant with the object_array_entry itself, and nobody actually cared about the surrounding tree). This still requires a strdup of the pathname, but at least we are only consuming the minimum amount of memory for each string. We can also handle the copying ourselves in add_object_array_*, and free it as appropriate in object_array_release_entry. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/grep.c | 8 ++++---- object.c | 23 +++++++++-------------- object.h | 4 ++-- 3 files changed, 15 insertions(+), 20 deletions(-) (limited to 'builtin') diff --git a/builtin/grep.c b/builtin/grep.c index c86a142f30..4063882f06 100644 --- a/builtin/grep.c +++ b/builtin/grep.c @@ -456,10 +456,10 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec, } static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec, - struct object *obj, const char *name, struct object_context *oc) + struct object *obj, const char *name, const char *path) { if (obj->type == OBJ_BLOB) - return grep_sha1(opt, obj->sha1, name, 0, oc ? oc->path : NULL); + return grep_sha1(opt, obj->sha1, name, 0, path); if (obj->type == OBJ_COMMIT || obj->type == OBJ_TREE) { struct tree_desc tree; void *data; @@ -501,7 +501,7 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec, for (i = 0; i < nr; i++) { struct object *real_obj; real_obj = deref_tag(list->objects[i].item, NULL, 0); - if (grep_object(opt, pathspec, real_obj, list->objects[i].name, list->objects[i].context)) { + if (grep_object(opt, pathspec, real_obj, list->objects[i].name, list->objects[i].path)) { hit = 1; if (opt->status_only) break; @@ -821,7 +821,7 @@ int cmd_grep(int argc, const char **argv, const char *prefix) struct object *object = parse_object_or_die(sha1, arg); if (!seen_dashdash) verify_non_filename(prefix, arg); - add_object_array_with_context(object, arg, &list, xmemdupz(&oc, sizeof(struct object_context))); + add_object_array_with_path(object, arg, &list, oc.mode, oc.path); continue; } if (!strcmp(arg, "--")) { diff --git a/object.c b/object.c index 6aeb1bbbe3..df86bdd5a5 100644 --- a/object.c +++ b/object.c @@ -307,10 +307,9 @@ int object_list_contains(struct object_list *list, struct object *obj) */ static char object_array_slopbuf[1]; -static void add_object_array_with_mode_context(struct object *obj, const char *name, - struct object_array *array, - unsigned mode, - struct object_context *context) +void add_object_array_with_path(struct object *obj, const char *name, + struct object_array *array, + unsigned mode, const char *path) { unsigned nr = array->nr; unsigned alloc = array->alloc; @@ -333,7 +332,10 @@ static void add_object_array_with_mode_context(struct object *obj, const char *n else entry->name = xstrdup(name); entry->mode = mode; - entry->context = context; + if (path) + entry->path = xstrdup(path); + else + entry->path = NULL; array->nr = ++nr; } @@ -344,15 +346,7 @@ void add_object_array(struct object *obj, const char *name, struct object_array void add_object_array_with_mode(struct object *obj, const char *name, struct object_array *array, unsigned mode) { - add_object_array_with_mode_context(obj, name, array, mode, NULL); -} - -void add_object_array_with_context(struct object *obj, const char *name, struct object_array *array, struct object_context *context) -{ - if (context) - add_object_array_with_mode_context(obj, name, array, context->mode, context); - else - add_object_array_with_mode_context(obj, name, array, S_IFINVALID, context); + add_object_array_with_path(obj, name, array, mode, NULL); } /* @@ -363,6 +357,7 @@ static void object_array_release_entry(struct object_array_entry *ent) { if (ent->name != object_array_slopbuf) free(ent->name); + free(ent->path); } void object_array_filter(struct object_array *array, diff --git a/object.h b/object.h index 2a755a2373..e5178a516d 100644 --- a/object.h +++ b/object.h @@ -18,8 +18,8 @@ struct object_array { * empty string. */ char *name; + char *path; unsigned mode; - struct object_context *context; } *objects; }; @@ -115,7 +115,7 @@ int object_list_contains(struct object_list *list, struct object *obj); /* Object array handling .. */ void add_object_array(struct object *obj, const char *name, struct object_array *array); void add_object_array_with_mode(struct object *obj, const char *name, struct object_array *array, unsigned mode); -void add_object_array_with_context(struct object *obj, const char *name, struct object_array *array, struct object_context *context); +void add_object_array_with_path(struct object *obj, const char *name, struct object_array *array, unsigned mode, const char *path); typedef int (*object_array_each_func_t)(struct object_array_entry *, void *); -- cgit v1.2.3 From edfbb2aa538148b38ee0ba6d62c95b7edda5d817 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 16 Oct 2014 20:44:35 -0400 Subject: pack-objects: use argv_array This saves us from having to bump the rp_av count when we add new traversal options. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'builtin') diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 4df9499040..b26276b424 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -22,6 +22,7 @@ #include "pack-bitmap.h" #include "reachable.h" #include "sha1-array.h" +#include "argv-array.h" static const char *pack_usage[] = { N_("git pack-objects --stdout [options...] [< ref-list | < object-list]"), @@ -2614,8 +2615,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) int use_internal_rev_list = 0; int thin = 0; int all_progress_implied = 0; - const char *rp_av[6]; - int rp_ac = 0; + struct argv_array rp = ARGV_ARRAY_INIT; int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0; struct option pack_objects_options[] = { OPT_SET_INT('q', "quiet", &progress, @@ -2705,24 +2705,24 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (pack_to_stdout != !base_name || argc) usage_with_options(pack_usage, pack_objects_options); - rp_av[rp_ac++] = "pack-objects"; + argv_array_push(&rp, "pack-objects"); if (thin) { use_internal_rev_list = 1; - rp_av[rp_ac++] = "--objects-edge"; + argv_array_push(&rp, "--objects-edge"); } else - rp_av[rp_ac++] = "--objects"; + argv_array_push(&rp, "--objects"); if (rev_list_all) { use_internal_rev_list = 1; - rp_av[rp_ac++] = "--all"; + argv_array_push(&rp, "--all"); } if (rev_list_reflog) { use_internal_rev_list = 1; - rp_av[rp_ac++] = "--reflog"; + argv_array_push(&rp, "--reflog"); } if (rev_list_unpacked) { use_internal_rev_list = 1; - rp_av[rp_ac++] = "--unpacked"; + argv_array_push(&rp, "--unpacked"); } if (!reuse_object) @@ -2766,8 +2766,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (!use_internal_rev_list) read_object_list_from_stdin(); else { - rp_av[rp_ac] = NULL; - get_object_list(rp_ac, rp_av); + get_object_list(rp.argc, rp.argv); + argv_array_clear(&rp); } cleanup_preferred_base(); if (include_tag && nr_result) -- cgit v1.2.3 From c90f9e13abae630551ada3e895633bdc2cf4e080 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 16 Oct 2014 20:44:49 -0400 Subject: repack: pack objects mentioned by the index When we pack all objects, we use only the objects reachable from references and reflogs. This misses any objects which are reachable from the index, but not yet referenced. By itself this isn't a big deal; the objects can remain loose until they are actually used in a commit. However, it does create a problem when we drop packed but unreachable objects. We try to optimize out the writing of objects that we will immediately prune, which means we must follow the same rules as prune in determining what is reachable. And prune uses the index for this purpose. This is rather uncommon in practice, as objects in the index would not usually have been packed in the first place. But it could happen in a sequence like: 1. You make a commit on a branch that references blob X. 2. You repack, moving X into the pack. 3. You delete the branch (and its reflog), so that X is unreferenced. 4. You "git add" blob X so that it is now referenced only by the index. 5. You repack again with git-gc. The pack-objects we invoke will see that X is neither referenced nor recent and not bother loosening it. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 8 ++++++++ builtin/repack.c | 1 + t/t7701-repack-unpack-unreachable.sh | 13 +++++++++++++ 3 files changed, 22 insertions(+) (limited to 'builtin') diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index b26276b424..0cf95c9901 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -2617,6 +2617,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) int all_progress_implied = 0; struct argv_array rp = ARGV_ARRAY_INIT; int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0; + int rev_list_index = 0; struct option pack_objects_options[] = { OPT_SET_INT('q', "quiet", &progress, N_("do not show progress meter"), 0), @@ -2663,6 +2664,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) { OPTION_SET_INT, 0, "reflog", &rev_list_reflog, NULL, N_("include objects referred by reflog entries"), PARSE_OPT_NOARG | PARSE_OPT_NONEG, NULL, 1 }, + { OPTION_SET_INT, 0, "indexed-objects", &rev_list_index, NULL, + N_("include objects referred to by the index"), + PARSE_OPT_NOARG | PARSE_OPT_NONEG, NULL, 1 }, OPT_BOOL(0, "stdout", &pack_to_stdout, N_("output pack to stdout")), OPT_BOOL(0, "include-tag", &include_tag, @@ -2720,6 +2724,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) use_internal_rev_list = 1; argv_array_push(&rp, "--reflog"); } + if (rev_list_index) { + use_internal_rev_list = 1; + argv_array_push(&rp, "--indexed-objects"); + } if (rev_list_unpacked) { use_internal_rev_list = 1; argv_array_push(&rp, "--unpacked"); diff --git a/builtin/repack.c b/builtin/repack.c index 2aae05d364..28456206c5 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -209,6 +209,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) argv_array_push(&cmd_args, "--non-empty"); argv_array_push(&cmd_args, "--all"); argv_array_push(&cmd_args, "--reflog"); + argv_array_push(&cmd_args, "--indexed-objects"); if (window) argv_array_pushf(&cmd_args, "--window=%s", window); if (window_memory) diff --git a/t/t7701-repack-unpack-unreachable.sh b/t/t7701-repack-unpack-unreachable.sh index b8d4cdea8c..aad8a9c64d 100755 --- a/t/t7701-repack-unpack-unreachable.sh +++ b/t/t7701-repack-unpack-unreachable.sh @@ -109,4 +109,17 @@ test_expect_success 'do not bother loosening old objects' ' test_must_fail git cat-file -p $obj2 ' +test_expect_success 'keep packed objects found only in index' ' + echo my-unique-content >file && + git add file && + git commit -m "make it reachable" && + git gc && + git reset HEAD^ && + git reflog expire --expire=now --all && + git add file && + test-chmtime =-86400 .git/objects/pack/* && + git gc --prune=1.hour.ago && + git cat-file blob :file +' + test_done -- cgit v1.2.3 From b1e757f36377df1f2a6c165ebf171b09a8ad957b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 16 Oct 2014 20:44:54 -0400 Subject: pack-objects: double-check options before discarding objects When we are given an expiration time like --unpack-unreachable=2.weeks.ago, we avoid writing out old, unreachable loose objects entirely, under the assumption that running "prune" would simply delete them immediately anyway. However, this is only valid if we computed the same set of reachable objects as prune would. In practice, this is the case, because only git-repack uses the --unpack-unreachable option with an expiration, and it always feeds as many objects into the pack as possible. But we can double-check at runtime just to be sure. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'builtin') diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 0cf95c9901..64123d4222 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -2757,6 +2757,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (keep_unreachable && unpack_unreachable) die("--keep-unreachable and --unpack-unreachable are incompatible."); + if (!rev_list_all || !rev_list_reflog || !rev_list_index) + unpack_unreachable_expiration = 0; if (!use_internal_rev_list || !pack_to_stdout || is_repository_shallow()) use_bitmap_index = 0; -- cgit v1.2.3