Diffstat:
-rw-r--r--   fs/bcachefs/bcachefs_format.h         |    6
-rw-r--r--   fs/bcachefs/btree_gc.c                |   57
-rw-r--r--   fs/bcachefs/btree_io.c                |   17
-rw-r--r--   fs/bcachefs/btree_iter.c              |   25
-rw-r--r--   fs/bcachefs/btree_types.h             |    3
-rw-r--r--   fs/bcachefs/btree_update.h            |    5
-rw-r--r--   fs/bcachefs/btree_update_interior.h   |   23
-rw-r--r--   fs/bcachefs/btree_update_leaf.c       |  228
-rw-r--r--   fs/bcachefs/buckets.c                 |   13
-rw-r--r--   fs/bcachefs/buckets.h                 |    2
-rw-r--r--   fs/bcachefs/extent_update.c           |  410
-rw-r--r--   fs/bcachefs/extent_update.h           |    5
-rw-r--r--   fs/bcachefs/fsck.c                    |   56
-rw-r--r--   fs/bcachefs/recovery.c                |  154
-rw-r--r--   fs/bcachefs/recovery.h                |    2
15 files changed, 404 insertions(+), 602 deletions(-)
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index d1c0a5d5580e..1ad5ff449a5b 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1315,12 +1315,14 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(inline_data, 8) \
x(new_extent_overwrite, 9) \
x(incompressible, 10) \
- x(btree_ptr_v2, 11)
+ x(btree_ptr_v2, 11) \
+ x(extents_above_btree_updates, 12)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
- (1ULL << BCH_FEATURE_btree_ptr_v2))
+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \
+ (1ULL << BCH_FEATURE_extents_above_btree_updates))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
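
The feature list is an x-macro: each x(name, bit) row expands once into
enum bch_sb_feature and once into the BCH_SB_FEATURES_ALL mask, so the new
extents_above_btree_updates feature is a one-line addition in each place.
A compilable miniature of the same pattern, using only the bits visible in
this hunk (toy names, not the real header):

#include <stdint.h>
#include <stdio.h>

#define FEATURES()				\
	x(inline_data,			 8)	\
	x(new_extent_overwrite,		 9)	\
	x(incompressible,		10)	\
	x(btree_ptr_v2,			11)	\
	x(extents_above_btree_updates,	12)

enum feature {
#define x(f, n) FEATURE_##f = (n),
	FEATURES()
#undef x
};

#define FEATURES_ALL					\
	((1ULL << FEATURE_new_extent_overwrite)|	\
	 (1ULL << FEATURE_btree_ptr_v2)|		\
	 (1ULL << FEATURE_extents_above_btree_updates))

int main(void)
{
	uint64_t sb_features = FEATURES_ALL;

	/* The check bch2_fs_recovery() gains later in this patch,
	 * in miniature: refuse dirty filesystems without the bit. */
	if (!(sb_features & (1ULL << FEATURE_extents_above_btree_updates)))
		printf("needs fsck from older bcachefs-tools\n");
	else
		printf("extents_above_btree_updates is set\n");
	return 0;
}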
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index a5fe3b316e06..f85fbc057fb3 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -186,8 +186,16 @@ fsck_err:
return ret;
}
-static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
- u8 *max_stale, bool initial)
+static bool pos_in_journal_keys(struct journal_keys *journal_keys,
+ enum btree_id id, struct bpos pos)
+{
+ struct journal_key *k = journal_key_search(journal_keys, id, pos);
+
+ return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
+}
+
+static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
+ struct journal_keys *journal_keys, bool initial)
{
struct btree_node_iter iter;
struct bkey unpacked;
@@ -201,6 +209,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
for_each_btree_node_key_unpack(b, k, &iter,
&unpacked) {
+ if (!b->c.level && journal_keys &&
+ pos_in_journal_keys(journal_keys, b->c.btree_id, k.k->p))
+ continue;
+
bch2_bkey_debugcheck(c, b, k);
ret = bch2_gc_mark_key(c, k, max_stale, initial);
@@ -212,6 +224,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
+ struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
struct btree_trans trans;
@@ -239,7 +252,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
gc_pos_set(c, gc_pos_btree_node(b));
- ret = btree_gc_mark_node(c, b, &max_stale, initial);
+ ret = btree_gc_mark_node(c, b, &max_stale,
+ journal_keys, initial);
if (ret)
break;
@@ -281,36 +295,6 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
(int) btree_id_to_gc_phase(r);
}
-static int mark_journal_key(struct bch_fs *c, enum btree_id id,
- struct bkey_i *insert)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- u8 max_stale;
- int ret = 0;
-
- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
- if (ret)
- return ret;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
- BTREE_ITER_SLOTS, k, ret) {
- percpu_down_read(&c->mark_lock);
- ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
- BTREE_TRIGGER_GC|
- BTREE_TRIGGER_NOATOMIC);
- percpu_up_read(&c->mark_lock);
-
- if (!ret)
- break;
- }
-
- return bch2_trans_exit(&trans) ?: ret;
-}
-
static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
@@ -325,18 +309,21 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
enum btree_id id = ids[i];
enum btree_node_type type = __btree_node_type(0, id);
- int ret = bch2_gc_btree(c, id, initial, metadata_only);
+ int ret = bch2_gc_btree(c, id, journal_keys,
+ initial, metadata_only);
if (ret)
return ret;
if (journal_keys && !metadata_only &&
btree_node_type_needs_gc(type)) {
struct journal_key *j;
+ u8 max_stale;
int ret;
for_each_journal_key(*journal_keys, j)
if (j->btree_id == id) {
- ret = mark_journal_key(c, id, j->k);
+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
+ &max_stale, initial);
if (ret)
return ret;
}
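
Mark-and-sweep GC now skips leaf keys that journal replay will supersede
and marks the journal keys directly via bch2_gc_mark_key() instead. A
self-contained sketch of the pos_in_journal_keys() check, assuming (as the
helper's use of journal_key_search() suggests) a lower-bound binary search
over keys sorted by (btree_id, pos); every type and name below is a
stand-in, not the real bcachefs one:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct bpos_t { uint64_t inode, offset; };
struct jkey { uint8_t btree_id; struct bpos_t p; };

static int bpos_cmp(struct bpos_t l, struct bpos_t r)
{
	if (l.inode != r.inode)
		return l.inode < r.inode ? -1 : 1;
	if (l.offset != r.offset)
		return l.offset < r.offset ? -1 : 1;
	return 0;
}

static int jkey_cmp(uint8_t id, struct bpos_t pos, const struct jkey *k)
{
	return id != k->btree_id
		? (id < k->btree_id ? -1 : 1)
		: bpos_cmp(pos, k->p);
}

/* First key >= (id, pos), or NULL — the search the helper relies on: */
static const struct jkey *jkey_search(const struct jkey *d, size_t nr,
				      uint8_t id, struct bpos_t pos)
{
	size_t l = 0, r = nr;

	while (l < r) {
		size_t m = l + (r - l) / 2;

		if (jkey_cmp(id, pos, &d[m]) > 0)
			l = m + 1;
		else
			r = m;
	}
	return l < nr ? &d[l] : NULL;
}

static bool pos_in_jkeys(const struct jkey *d, size_t nr,
			 uint8_t id, struct bpos_t pos)
{
	const struct jkey *k = jkey_search(d, nr, id, pos);

	/* Only an exact (id, pos) hit shadows the btree key: */
	return k && k->btree_id == id && !bpos_cmp(k->p, pos);
}

int main(void)
{
	const struct jkey keys[] = {
		{ 0, { 1, 8 } }, { 0, { 1, 16 } }, { 1, { 4, 0 } },
	};

	return !pos_in_jkeys(keys, 3, 0, (struct bpos_t) { 1, 16 });
}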
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index a4732bf13a11..d0b761417903 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -708,9 +708,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
unsigned *whiteout_u64s, int write,
bool have_retry)
{
- struct bkey_packed *k;
- struct bkey prev = KEY(0, 0, 0);
- struct bpos prev_data = POS_MIN;
+ struct bkey_packed *k, *prev = NULL;
bool seen_non_whiteout = false;
unsigned version;
const char *err;
@@ -852,15 +850,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!seen_non_whiteout &&
(!bkey_whiteout(k) ||
- (bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) {
+ (prev && bkey_iter_cmp(b, prev, k) > 0))) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
- } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 ||
- bkey_cmp(prev.p, u.k->p) > 0) {
+ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) {
char buf1[80];
char buf2[80];
+ struct bkey up = bkey_unpack_key(b, prev);
- bch2_bkey_to_text(&PBUF(buf1), &prev);
+ bch2_bkey_to_text(&PBUF(buf1), &up);
bch2_bkey_to_text(&PBUF(buf2), u.k);
bch2_dump_bset(b, i, 0);
@@ -870,10 +868,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
/* XXX: repair this */
}
- if (!bkey_deleted(u.k))
- prev_data = u.k->p;
- prev = *u.k;
-
+ prev = k;
k = bkey_next_skip_noops(k, vstruct_last(i));
}
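
Tracking a pointer to the previous packed key lets one bkey_iter_cmp()
call replace the old pair of unpacked-position comparisons. The invariant
being enforced, on a toy sorted array (illustrative only — the real
comparator orders packed bkeys inside a btree node):

#include <stdio.h>

static int check_sorted(const int *keys, unsigned nr)
{
	const int *prev = NULL;
	unsigned i;

	for (i = 0; i < nr; i++) {
		/* mirrors: prev && bkey_iter_cmp(b, prev, k) > 0 */
		if (prev && *prev > keys[i]) {
			fprintf(stderr, "keys out of order at %u\n", i);
			return -1;
		}
		prev = &keys[i];
	}
	return 0;
}

int main(void)
{
	const int keys[] = { 1, 2, 2, 5 };

	return check_sorted(keys, 4);
}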
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 347477c62779..5f918c6c3efb 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1504,12 +1504,12 @@ static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
struct btree_trans *trans = iter->trans;
struct btree_insert_entry *i;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if ((cmp_int(iter->btree_id, i->iter->btree_id) ?:
bkey_cmp(pos, i->k->k.p)) <= 0)
break;
- return i < trans->updates + trans->nr_updates &&
+ return i < trans->updates2 + trans->nr_updates2 &&
iter->btree_id == i->iter->btree_id
? bkey_i_to_s_c(i->k)
: bkey_s_c_null;
@@ -1821,7 +1821,7 @@ int bch2_trans_iter_free(struct btree_trans *trans,
static int bch2_trans_realloc_iters(struct btree_trans *trans,
unsigned new_size)
{
- void *new_iters, *new_updates;
+ void *p, *new_iters, *new_updates, *new_updates2;
size_t iters_bytes;
size_t updates_bytes;
@@ -1839,21 +1839,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
iters_bytes = sizeof(struct btree_iter) * new_size;
updates_bytes = sizeof(struct btree_insert_entry) * new_size;
- new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS);
- if (new_iters)
+ p = kmalloc(iters_bytes +
+ updates_bytes +
+ updates_bytes, GFP_NOFS);
+ if (p)
goto success;
- new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
new_size = BTREE_ITER_MAX;
trans->used_mempool = true;
success:
- new_updates = new_iters + iters_bytes;
+ new_iters = p; p += iters_bytes;
+ new_updates = p; p += updates_bytes;
+ new_updates2 = p; p += updates_bytes;
memcpy(new_iters, trans->iters,
sizeof(struct btree_iter) * trans->nr_iters);
memcpy(new_updates, trans->updates,
sizeof(struct btree_insert_entry) * trans->nr_updates);
+ memcpy(new_updates2, trans->updates2,
+ sizeof(struct btree_insert_entry) * trans->nr_updates2);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
memset(trans->iters, POISON_FREE,
@@ -1865,6 +1871,7 @@ success:
trans->iters = new_iters;
trans->updates = new_updates;
+ trans->updates2 = new_updates2;
trans->size = new_size;
if (trans->iters_live) {
@@ -2126,6 +2133,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
trans->need_reset = 0;
trans->nr_updates = 0;
+ trans->nr_updates2 = 0;
trans->mem_top = 0;
if (trans->fs_usage_deltas) {
@@ -2157,6 +2165,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
trans->size = ARRAY_SIZE(trans->iters_onstack);
trans->iters = trans->iters_onstack;
trans->updates = trans->updates_onstack;
+ trans->updates2 = trans->updates2_onstack;
trans->fs_usage_deltas = NULL;
if (expected_nr_iters > trans->size)
@@ -2194,5 +2203,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
sizeof(struct btree_iter) * nr +
sizeof(struct btree_insert_entry) * nr +
- sizeof(u8) * nr);
+ sizeof(struct btree_insert_entry) * nr);
}
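
The mempool element is now sized for three arrays, and
bch2_trans_realloc_iters() carves a single allocation into
iters/updates/updates2 so the fallback path still deals in one object. The
carving pattern, standalone (toy element types):

#include <stdlib.h>

struct iter  { int x; };
struct entry { int y; };

static int alloc_arrays(unsigned n, struct iter **iters,
			struct entry **updates, struct entry **updates2)
{
	size_t iters_bytes   = sizeof(struct iter)  * n;
	size_t updates_bytes = sizeof(struct entry) * n;
	void *p = malloc(iters_bytes + updates_bytes + updates_bytes);

	if (!p)
		return -1;

	/* one buffer, three regions: */
	*iters    = p; p = (char *) p + iters_bytes;
	*updates  = p; p = (char *) p + updates_bytes;
	*updates2 = p;
	return 0;
}

int main(void)
{
	struct iter *it;
	struct entry *u, *u2;
	int ret = alloc_arrays(8, &it, &u, &u2);

	if (!ret)
		free(it);	/* one free releases all three regions */
	return ret;
}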
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index d1d5385d1eb7..fdfa7a265850 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -283,6 +283,7 @@ struct btree_trans {
u8 nr_iters;
u8 nr_updates;
+ u8 nr_updates2;
u8 size;
unsigned used_mempool:1;
unsigned error:1;
@@ -295,6 +296,7 @@ struct btree_trans {
struct btree_iter *iters;
struct btree_insert_entry *updates;
+ struct btree_insert_entry *updates2;
/* update path: */
struct journal_res journal_res;
@@ -308,6 +310,7 @@ struct btree_trans {
struct btree_iter iters_onstack[2];
struct btree_insert_entry updates_onstack[2];
+ struct btree_insert_entry updates2_onstack[2];
};
#define BTREE_FLAG(flag) \
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index d1cd839ac08f..12127a33906b 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -132,4 +132,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
+#define trans_for_each_update2(_trans, _i) \
+ for ((_i) = (_trans)->updates2; \
+ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \
+ (_i)++)
+
#endif /* _BCACHEFS_BTREE_UPDATE_H */
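
A toy, compilable illustration of the new iteration macro; the structs are
stand-ins for btree_trans and btree_insert_entry:

#include <stdio.h>

struct entry { int k; };
struct trans { struct entry updates2[4]; unsigned nr_updates2; };

#define trans_for_each_update2(_trans, _i)			\
	for ((_i) = (_trans)->updates2;				\
	     (_i) < (_trans)->updates2 + (_trans)->nr_updates2;	\
	     (_i)++)

int main(void)
{
	struct trans t = { .updates2 = { {1}, {2}, {3} }, .nr_updates2 = 3 };
	struct entry *i;

	/* walks the flattened update list in order: */
	trans_for_each_update2(&t, i)
		printf("update %d\n", i->k);
	return 0;
}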
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index e3204f32cc68..f6aceed89427 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -303,18 +303,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
}
static inline void push_whiteout(struct bch_fs *c, struct btree *b,
- struct bkey_packed *k)
+ struct bpos pos)
{
- unsigned u64s = bkeyp_key_u64s(&b->format, k);
- struct bkey_packed *dst;
+ struct bkey_packed k;
- BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b));
+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
- b->whiteout_u64s += bkeyp_key_u64s(&b->format, k);
- dst = unwritten_whiteouts_start(c, b);
- memcpy_u64s(dst, k, u64s);
- dst->u64s = u64s;
- dst->type = KEY_TYPE_deleted;
+ if (!bkey_pack_pos(&k, pos, b)) {
+ struct bkey *u = (void *) &k;
+
+ bkey_init(u);
+ u->p = pos;
+ }
+
+ k.needs_whiteout = true;
+
+ b->whiteout_u64s += k.u64s;
+ bkey_copy(unwritten_whiteouts_start(c, b), &k);
}
/*
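
push_whiteout() now takes a struct bpos and packs it itself, falling back
to a full unpacked key when the node's format can't represent the position
— hence the reserve check against BKEY_U64s rather than a packed size. A
toy model of the pack-or-fall-back pattern (none of these types or helpers
are real bcachefs APIs):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct pos_t { uint64_t inode, offset; };

struct key_t {
	uint8_t	     u64s;	/* key size, in u64s */
	bool	     needs_whiteout;
	struct pos_t p;
};

struct fmt_t { uint64_t max_offset; };	/* what this node can pack */

static bool pack_pos(struct key_t *k, struct pos_t p, const struct fmt_t *f)
{
	if (p.offset > f->max_offset)
		return false;		/* pos not representable packed */
	*k = (struct key_t) { .u64s = 2, .p = p };
	return true;
}

static void push_whiteout_sketch(struct key_t *dst, struct pos_t p,
				 const struct fmt_t *f)
{
	struct key_t k;

	if (!pack_pos(&k, p, f)) {
		/* fallback: a full-size, unpacked deleted key */
		memset(&k, 0, sizeof(k));
		k.u64s = 3;		/* BKEY_U64s stand-in */
		k.p = p;
	}
	k.needs_whiteout = true;
	*dst = k;			/* bkey_copy() to the whiteout area */
}

int main(void)
{
	struct fmt_t f = { .max_offset = 1 << 20 };
	struct key_t w;

	push_whiteout_sketch(&w, (struct pos_t) { 1, 4096 }, &f);
	return !w.needs_whiteout;
}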
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 94418c9b42e8..f0efc52c7590 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -23,11 +23,10 @@
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- return i != trans->updates &&
+ return i != trans->updates2 &&
i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
-
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
{
@@ -61,6 +60,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ EBUG_ON(insert->k.u64s >
+ bch_btree_keys_u64s_remaining(iter->trans->c, b));
+ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_packed(b, k, &insert->k))
@@ -79,7 +81,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
- push_whiteout(iter->trans->c, b, k);
+ push_whiteout(iter->trans->c, b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@@ -195,20 +197,6 @@ void bch2_btree_journal_key(struct btree_trans *trans,
set_btree_node_dirty(b);
}
-static void bch2_insert_fixup_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- EBUG_ON(iter->level);
- EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, l->b));
-
- if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert)))
- bch2_btree_journal_key(trans, iter, insert);
-}
-
/**
 * btree_insert_key - insert a single key into a leaf node
*/
@@ -223,12 +211,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
+ EBUG_ON(iter->level);
+
insert->k.needs_whiteout = false;
- if (!btree_node_is_extents(b))
- bch2_insert_fixup_key(trans, iter, insert);
- else
- bch2_insert_fixup_extent(trans, iter, insert);
+ if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
+ bch2_btree_journal_key(trans, iter, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -254,12 +242,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct bch_fs *c = trans->c;
BUG_ON(iter->level);
- BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos));
- EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0);
-
+ BUG_ON(bkey_cmp(insert->k.p, iter->pos));
BUG_ON(debug_check_bkeys(c) &&
- !bkey_deleted(&insert->k) &&
bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
}
@@ -312,9 +296,16 @@ btree_key_can_insert(struct btree_trans *trans,
if (unlikely(btree_node_fake(b)))
return BTREE_INSERT_BTREE_NODE_FULL;
+ /*
+ * old bch2_extent_sort_fix_overlapping() algorithm won't work with new
+ * style extent updates:
+ */
+ if (unlikely(btree_node_old_extent_overwrite(b)))
+ return BTREE_INSERT_BTREE_NODE_FULL;
+
ret = !btree_node_is_extents(b)
? BTREE_INSERT_OK
- : bch2_extent_can_insert(trans, iter, insert, u64s);
+ : bch2_extent_can_insert(trans, iter, insert);
if (ret)
return ret;
@@ -383,7 +374,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
prefetch(&trans->c->journal.flags);
- trans_for_each_update(trans, i) {
+ trans_for_each_update2(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
@@ -422,10 +413,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
else if (inject_invalid_keys(c))
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
i->k->k.version = MAX_VERSION;
}
@@ -448,7 +439,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (unlikely(c->gc_pos.phase))
bch2_trans_mark_gc(trans);
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
do_btree_insert_one(trans, i->iter, i->k);
err:
if (marking) {
@@ -469,7 +460,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
struct btree_iter *iter;
int ret;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
BUG_ON(!btree_node_intent_locked(i->iter, 0));
ret = bch2_journal_preres_get(&trans->c->journal,
@@ -497,18 +488,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
}
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
btree_insert_entry_checks(trans, i->iter, i->k);
bch2_btree_trans_verify_locks(trans);
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(trans->c,
i->iter->l[0].b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
i->iter);
@@ -525,14 +516,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (trans->flags & BTREE_INSERT_NOUNLOCK)
trans->nounlock = true;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_foreground_maybe_merge(trans->c, i->iter,
0, trans->flags);
trans->nounlock = false;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
bch2_btree_iter_downgrade(i->iter);
return 0;
@@ -655,6 +646,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
return 0;
}
+static void bch2_trans_update2(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ struct btree_insert_entry *i, n = (struct btree_insert_entry) {
+ .iter = iter, .k = insert
+ };
+
+ btree_insert_entry_checks(trans, n.iter, n.k);
+
+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+
+ EBUG_ON(trans->nr_updates2 >= trans->nr_iters);
+
+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+
+ trans_for_each_update2(trans, i) {
+ if (btree_iter_cmp(n.iter, i->iter) == 0) {
+ *i = n;
+ return;
+ }
+
+ if (btree_iter_cmp(n.iter, i->iter) <= 0)
+ break;
+ }
+
+ array_insert_item(trans->updates2, trans->nr_updates2,
+ i - trans->updates2, n);
+}
+
+static int extent_update_to_keys(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert)
+{
+ struct btree_iter *iter;
+
+ if (bkey_deleted(&insert->k))
+ return 0;
+
+ iter = bch2_trans_copy_iter(trans, orig_iter);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ iter->flags |= BTREE_ITER_INTENT;
+ __bch2_btree_iter_set_pos(iter, insert->k.p, false);
+ bch2_trans_update2(trans, iter, insert);
+ bch2_trans_iter_put(trans, iter);
+ return 0;
+}
+
+static int extent_handle_overwrites(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos start, struct bpos end)
+{
+ struct btree_iter *iter = NULL, *update_iter;
+ struct bkey_i *update;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT);
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret)
+ return ret;
+
+ k = bch2_btree_iter_peek_with_updates(iter);
+
+ while (k.k && !(ret = bkey_err(k))) {
+ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0)
+ break;
+
+ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
+ update_iter = bch2_trans_copy_iter(trans, iter);
+ if ((ret = PTR_ERR_OR_ZERO(update_iter)))
+ goto err;
+
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+ bch2_cut_back(start, update);
+
+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ bch2_trans_update2(trans, update_iter, update);
+ bch2_trans_iter_put(trans, update_iter);
+ }
+
+ if (bkey_cmp(k.k->p, end) > 0) {
+ update_iter = bch2_trans_copy_iter(trans, iter);
+ if ((ret = PTR_ERR_OR_ZERO(update_iter)))
+ goto err;
+
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+ bch2_cut_front(end, update);
+
+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ bch2_trans_update2(trans, update_iter, update);
+ bch2_trans_iter_put(trans, update_iter);
+ } else {
+ update_iter = bch2_trans_copy_iter(trans, iter);
+ if ((ret = PTR_ERR_OR_ZERO(update_iter)))
+ goto err;
+
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ update->k = *k.k;
+ set_bkey_val_u64s(&update->k, 0);
+ update->k.type = KEY_TYPE_deleted;
+ update->k.size = 0;
+
+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ bch2_trans_update2(trans, update_iter, update);
+ bch2_trans_iter_put(trans, update_iter);
+ }
+
+ k = bch2_btree_iter_next_with_updates(iter);
+ }
+err:
+ if (!IS_ERR_OR_NULL(iter))
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
int __bch2_trans_commit(struct btree_trans *trans)
{
struct btree_insert_entry *i = NULL;
@@ -724,7 +844,36 @@ int __bch2_trans_commit(struct btree_trans *trans)
}
} while (trans_trigger_run);
+ /* Turn extents updates into keys: */
+ trans_for_each_update(trans, i)
+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
+ struct bpos start = bkey_start_pos(&i->k->k);
+
+ while (i + 1 < trans->updates + trans->nr_updates &&
+ i[0].iter->btree_id == i[1].iter->btree_id &&
+ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)))
+ i++;
+
+ ret = extent_handle_overwrites(trans, i->iter->btree_id,
+ start, i->k->k.p);
+ if (ret)
+ goto out;
+ }
+
trans_for_each_update(trans, i) {
+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
+ ret = extent_update_to_keys(trans, i->iter, i->k);
+ if (ret)
+ goto out;
+ } else {
+ bch2_trans_update2(trans, i->iter, i->k);
+ }
+ }
+
+ trans_for_each_update2(trans, i) {
+ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
+ BUG_ON(i->iter->locks_want < 1);
+
u64s = jset_u64s(i->k->k.u64s);
if (0)
trans->journal_preres_u64s += u64s;
@@ -773,7 +922,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
.trigger_flags = flags, .iter = iter, .k = k
};
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k)));
+ EBUG_ON(bkey_cmp(iter->pos,
+ (iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_start_pos(&k->k)
+ : k->k.p));
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
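
The core of the patch: before commit, extent updates are lowered to
ordinary key updates. extent_handle_overwrites() walks the insert range
[start, end) and queues trimmed copies or whiteouts for whatever it
covers; extent_update_to_keys() then queues the new key at its end
position, where it replaces any old key ending there. A compilable toy of
the trim/whiteout decision (illustrative types; half-open ranges):

#include <stdio.h>

struct ext { unsigned long long start, end; };	/* half-open [start, end) */

static void handle_overwrite(struct ext old, unsigned long long start,
			     unsigned long long end)
{
	if (end <= old.start || old.end <= start) {
		printf("no overlap, keep [%llu,%llu)\n", old.start, old.end);
		return;
	}
	if (old.start < start)		/* front survives: bch2_cut_back() */
		printf("keep front [%llu,%llu)\n", old.start, start);
	if (old.end > end)		/* back survives: bch2_cut_front() */
		printf("keep back [%llu,%llu)\n", end, old.end);
	else				/* old key's slot fully superseded */
		printf("whiteout at %llu\n", old.end);
}

int main(void)
{
	handle_overwrite((struct ext){  0, 100 }, 20,  60); /* middle split */
	handle_overwrite((struct ext){ 10,  50 },  0, 100); /* fully covered */
	handle_overwrite((struct ext){ 50,  80 },  0,  60); /* front overlap */
	return 0;
}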
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 7b0f0583b1a5..cd54c2b1eff2 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1254,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_i *new,
struct bch_fs_usage *fs_usage,
- unsigned flags)
+ unsigned flags,
+ bool is_extents)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter->l[0].b;
unsigned offset = 0;
- s64 sectors = 0;
+ s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
- if (btree_node_is_extents(b)
+ if (is_extents
? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
: bkey_cmp(new->k.p, old.k->p))
return 0;
- if (btree_node_is_extents(b)) {
+ if (is_extents) {
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
offset = 0;
@@ -1341,7 +1341,8 @@ int bch2_mark_update(struct btree_trans *trans,
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
ret = bch2_mark_overwrite(trans, iter, k, insert,
- fs_usage, flags);
+ fs_usage, flags,
+ btree_node_type_is_extents(iter->btree_id));
if (ret <= 0)
break;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 4c84787575f5..29ebc07a2497 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -268,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *,
int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
struct bkey_s_c, struct bkey_i *,
- struct bch_fs_usage *, unsigned);
+ struct bch_fs_usage *, unsigned, bool);
int bch2_mark_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct bch_fs_usage *, unsigned);
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 846d77dc2530..fa6c0698f385 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -39,6 +39,12 @@ static int count_iters_for_insert(struct btree_trans *trans,
{
int ret = 0;
+ /*
+ * The extent update path requires an _additional_ iterator for each
+ * extent we're inserting and overwriting:
+ */
+ *nr_iters += 1;
+
switch (k.k->type) {
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
@@ -167,402 +173,40 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
- unsigned *u64s)
+ struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *_k;
+ struct bkey_s_c k;
struct bkey unpacked;
int sectors;
- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
- KEY_TYPE_discard))) {
- struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked);
- enum bch_extent_overlap overlap =
- bch2_extent_overlap(&insert->k, k.k);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
- break;
-
- overlap = bch2_extent_overlap(&insert->k, k.k);
-
- /*
- * If we're overwriting an existing extent, we may need to emit
- * a whiteout - unless we're inserting a new extent at the same
- * position:
- */
- if (k.k->needs_whiteout &&
- (!bkey_whiteout(&insert->k) ||
- bkey_cmp(k.k->p, insert->k.p)))
- *u64s += BKEY_U64s;
-
- /*
- * If we're partially overwriting an existing extent which has
- * been written out to disk, we'll need to emit a new version of
- * that extent:
- */
- if (bkey_written(l->b, _k) &&
- overlap != BCH_EXTENT_OVERLAP_ALL)
- *u64s += _k->u64s;
-
- /* And we may be splitting an existing extent: */
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- *u64s += _k->u64s;
-
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bch2_bkey_sectors_compressed(k))) {
- int flags = trans->flags & BTREE_INSERT_NOFAIL
- ? BCH_DISK_RESERVATION_NOFAIL : 0;
-
- switch (bch2_disk_reservation_add(trans->c,
- trans->disk_res,
- sectors, flags)) {
- case 0:
- break;
- case -ENOSPC:
- return BTREE_INSERT_ENOSPC;
- default:
- BUG();
- }
- }
-
- if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
- overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- break;
-
- bch2_btree_node_iter_advance(&node_iter, l->b);
- }
-
- return BTREE_INSERT_OK;
-}
-
-static void verify_extent_nonoverlapping(struct bch_fs *c,
- struct btree *b,
- struct btree_node_iter *_iter,
- struct bkey_i *insert)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct btree_node_iter iter;
- struct bkey_packed *k;
- struct bkey uk;
-
- if (!expensive_debug_checks(c))
- return;
-
- iter = *_iter;
- k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
-
- iter = *_iter;
- k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
-#if 0
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
-#else
- if (k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
- char buf1[100];
- char buf2[100];
-
- bch2_bkey_to_text(&PBUF(buf1), &insert->k);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
-
- bch2_dump_btree_node(b);
- panic("insert > next :\n"
- "insert %s\n"
- "next %s\n",
- buf1, buf2);
- }
-#endif
-
-#endif
-}
-
-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *k =
- bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
-
- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
-
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
- verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
-
- if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
-
- bch2_bset_insert(l->b, &l->iter, k, insert, 0);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
-}
-
-static void pack_push_whiteout(struct bch_fs *c, struct btree *b,
- struct bpos pos)
-{
- struct bkey_packed k;
-
- if (!bkey_pack_pos(&k, pos, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = pos;
- bkey_copy(&k, &tmp);
- }
-
- k.needs_whiteout = true;
- push_whiteout(c, b, &k);
-}
-
-static void
-extent_drop(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_packed *_k, struct bkey_s k)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- if (!bkey_whiteout(k.k))
- btree_account_key_drop(l->b, _k);
-
- k.k->size = 0;
- k.k->type = KEY_TYPE_deleted;
-
- if (!btree_node_old_extent_overwrite(l->b) &&
- k.k->needs_whiteout) {
- pack_push_whiteout(c, l->b, k.k->p);
- k.k->needs_whiteout = false;
- }
-
- if (_k >= btree_bset_last(l->b)->start) {
- unsigned u64s = _k->u64s;
-
- bch2_bset_delete(l->b, _k, _k->u64s);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0);
- } else {
- extent_save(l->b, _k, k.k);
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
- }
-}
-
-static void
-extent_squash(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert,
- struct bkey_packed *_k, struct bkey_s k,
- enum bch_extent_overlap overlap)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_on_stack tmp, split;
-
- bkey_on_stack_init(&tmp);
- bkey_on_stack_init(&split);
-
- if (!btree_node_old_extent_overwrite(l->b)) {
- if (!bkey_whiteout(&insert->k) &&
- !bkey_cmp(k.k->p, insert->k.p)) {
- insert->k.needs_whiteout = k.k->needs_whiteout;
- k.k->needs_whiteout = false;
- }
- } else {
- insert->k.needs_whiteout |= k.k->needs_whiteout;
- }
-
- switch (overlap) {
- case BCH_EXTENT_OVERLAP_FRONT:
- if (bkey_written(l->b, _k)) {
- bkey_on_stack_reassemble(&tmp, c, k.s_c);
- bch2_cut_front(insert->k.p, tmp.k);
-
- /*
- * needs_whiteout was propagated to new version of @k,
- * @tmp:
- */
- if (!btree_node_old_extent_overwrite(l->b))
- k.k->needs_whiteout = false;
-
- extent_drop(c, iter, _k, k);
- extent_bset_insert(c, iter, tmp.k);
- } else {
- btree_keys_account_val_delta(l->b, _k,
- bch2_cut_front_s(insert->k.p, k));
-
- extent_save(l->b, _k, k.k);
- /*
- * No need to call bset_fix_invalidated_key, start of
- * extent changed but extents are indexed by where they
- * end
- */
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
- }
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- if (bkey_written(l->b, _k)) {
- bkey_on_stack_reassemble(&tmp, c, k.s_c);
- bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);
-
- /*
- * @tmp has different position than @k, needs_whiteout
- * should not be propagated:
- */
- if (!btree_node_old_extent_overwrite(l->b))
- tmp.k->k.needs_whiteout = false;
-
- extent_drop(c, iter, _k, k);
- extent_bset_insert(c, iter, tmp.k);
- } else {
- /*
- * position of @k is changing, emit a whiteout if
- * needs_whiteout is set:
- */
- if (!btree_node_old_extent_overwrite(l->b) &&
- k.k->needs_whiteout) {
- pack_push_whiteout(c, l->b, k.k->p);
- k.k->needs_whiteout = false;
- }
-
- btree_keys_account_val_delta(l->b, _k,
- bch2_cut_back_s(bkey_start_pos(&insert->k), k));
- extent_save(l->b, _k, k.k);
-
- bch2_bset_fix_invalidated_key(l->b, _k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
- }
- break;
- case BCH_EXTENT_OVERLAP_ALL:
- extent_drop(c, iter, _k, k);
- break;
- case BCH_EXTENT_OVERLAP_MIDDLE:
- bkey_on_stack_reassemble(&split, c, k.s_c);
- bch2_cut_back(bkey_start_pos(&insert->k), split.k);
-
- if (!btree_node_old_extent_overwrite(l->b))
- split.k->k.needs_whiteout = false;
-
- /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
- if (bkey_written(l->b, _k)) {
- bkey_on_stack_reassemble(&tmp, c, k.s_c);
- bch2_cut_front(insert->k.p, tmp.k);
-
- if (!btree_node_old_extent_overwrite(l->b))
- k.k->needs_whiteout = false;
-
- extent_drop(c, iter, _k, k);
- extent_bset_insert(c, iter, tmp.k);
- } else {
- btree_keys_account_val_delta(l->b, _k,
- bch2_cut_front_s(insert->k.p, k));
-
- extent_save(l->b, _k, k.k);
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
- }
-
- extent_bset_insert(c, iter, split.k);
- break;
- }
-
- bkey_on_stack_exit(&split, c);
- bkey_on_stack_exit(&tmp, c);
-}
+ _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
+ KEY_TYPE_discard);
+ if (!_k)
+ return BTREE_INSERT_OK;
-/**
- * bch_extent_insert_fixup - insert a new extent and deal with overlaps
- *
- * this may result in not actually doing the insert, or inserting some subset
- * of the insert key. For cmpxchg operations this is where that logic lives.
- *
- * All subsets of @insert that need to be inserted are inserted using
- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
- * returns false, setting @iter->pos for the prefix of @insert that actually got
- * inserted.
- *
- * BSET INVARIANTS: this function is responsible for maintaining all the
- * invariants for bsets of extents in memory. things get really hairy with 0
- * size extents
- *
- * within one bset:
- *
- * bkey_start_pos(bkey_next(k)) >= k
- * or bkey_start_offset(bkey_next(k)) >= k->offset
- *
- * i.e. strict ordering, no overlapping extents.
- *
- * multiple bsets (i.e. full btree node):
- *
- * ∀ k, j
- * k.size != 0 ∧ j.size != 0 →
- * ¬ (k > bkey_start_pos(j) ∧ k < j)
- *
- * i.e. no two overlapping keys _of nonzero size_
- *
- * We can't realistically maintain this invariant for zero size keys because of
- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
- * there may be another 0 size key between them in another bset, and it will
- * thus overlap with the merged key.
- *
- * In addition, the end of iter->pos indicates how much has been processed.
- * If the end of iter->pos is not the same as the end of insert, then
- * key insertion needs to continue/be retried.
- */
-void bch2_insert_fixup_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter = l->iter;
- bool do_update = !bkey_whiteout(&insert->k);
- struct bkey_packed *_k;
- struct bkey unpacked;
+ k = bkey_disassemble(l->b, _k, &unpacked);
- EBUG_ON(iter->level);
- EBUG_ON(!insert->k.size);
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+ /* Check if we're splitting a compressed extent: */
- while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
- KEY_TYPE_discard))) {
- struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
- enum bch_extent_overlap overlap =
- bch2_extent_overlap(&insert->k, k.k);
+ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 &&
+ bkey_cmp(insert->k.p, k.k->p) < 0 &&
+ (sectors = bch2_bkey_sectors_compressed(k))) {
+ int flags = trans->flags & BTREE_INSERT_NOFAIL
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+ switch (bch2_disk_reservation_add(trans->c, trans->disk_res,
+ sectors, flags)) {
+ case 0:
break;
-
- if (!bkey_whiteout(k.k))
- do_update = true;
-
- if (!do_update) {
- struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
-
- bch2_cut_front(cur_end, insert);
- bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
- } else {
- extent_squash(c, iter, insert, _k, k, overlap);
+ case -ENOSPC:
+ return BTREE_INSERT_ENOSPC;
+ default:
+ BUG();
}
-
- node_iter = l->iter;
-
- if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
- overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- break;
}
- l->iter = node_iter;
- bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
-
- if (do_update) {
- if (insert->k.type == KEY_TYPE_deleted)
- insert->k.type = KEY_TYPE_discard;
-
- if (!bkey_whiteout(&insert->k) ||
- btree_node_old_extent_overwrite(l->b))
- extent_bset_insert(c, iter, insert);
-
- bch2_btree_journal_key(trans, iter, insert);
- }
-
- bch2_cut_front(insert->k.p, insert);
+ return BTREE_INSERT_OK;
}
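
With overwrites resolved before commit, bch2_extent_can_insert() keeps one
job: reserving space when an insert splits a compressed extent, since both
halves then reference the full compressed payload. The predicate in
miniature (names illustrative):

#include <stdbool.h>
#include <stdint.h>

/* A reservation is needed only when the insert is strictly interior to
 * the existing key, i.e. both a front and a back piece survive: */
static inline bool splits_compressed(uint64_t k_start, uint64_t k_end,
				     uint64_t i_start, uint64_t i_end)
{
	return i_start > k_start && i_end < k_end;
}

On a hit, the real function calls bch2_disk_reservation_add() with the
extent's compressed sector count, using BCH_DISK_RESERVATION_NOFAIL when
the transaction carries BTREE_INSERT_NOFAIL.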
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index e9dc8091ba3f..38dc084627d2 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, unsigned *);
-void bch2_insert_fixup_extent(struct btree_trans *,
- struct btree_iter *,
- struct bkey_i *);
+ struct bkey_i *);
#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index eca723121a2c..902c8da9dc15 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -422,6 +422,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
POS(inode_nr + 1, 0), NULL);
}
+static int bch2_fix_overlapping_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k, struct bpos cut_at)
+{
+ struct btree_iter *u_iter;
+ struct bkey_i *u;
+ int ret;
+
+ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(u, k);
+ bch2_cut_front(cut_at, u);
+
+ u_iter = bch2_trans_copy_iter(trans, iter);
+ ret = PTR_ERR_OR_ZERO(u_iter);
+ if (ret)
+ return ret;
+
+ /*
+ * We don't want to go through the
+ * extent_handle_overwrites path:
+ */
+ __bch2_btree_iter_set_pos(u_iter, u->k.p, false);
+
+ /*
+ * XXX: this is going to leave disk space
+ * accounting slightly wrong
+ */
+ ret = bch2_trans_update(trans, u_iter, u, 0);
+ bch2_trans_iter_put(trans, u_iter);
+ return ret;
+}
+
/*
* Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
@@ -433,6 +469,7 @@ static int check_extents(struct bch_fs *c)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
+ struct bkey prev = KEY(0, 0, 0);
u64 i_sectors;
int ret = 0;
@@ -444,6 +481,25 @@ static int check_extents(struct bch_fs *c)
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
for_each_btree_key_continue(iter, 0, k, ret) {
+ if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {
+ char buf1[100];
+ char buf2[100];
+
+ bch2_bkey_to_text(&PBUF(buf1), &prev);
+ bch2_bkey_to_text(&PBUF(buf2), k.k);
+
+ if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_fix_overlapping_extent(&trans,
+ iter, k, prev.p));
+ if (ret)
+ goto err;
+ }
+ }
+ prev = *k.k;
+
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
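
Because journal keys are no longer pre-merged against each other,
check_extents() can now meet genuinely overlapping extents; it tracks the
previous key and repairs an overlap by cutting the front of the newer key
at the previous key's end. The same scan over a plain array (toy types —
the real repair goes through bch2_fix_overlapping_extent() above):

#include <stdio.h>

struct ext { unsigned long long start, end; };

static void check_overlaps(struct ext *d, unsigned nr)
{
	struct ext prev = { 0, 0 };
	unsigned i;

	for (i = 0; i < nr; i++) {
		if (prev.end > d[i].start) {
			printf("overlap: [%llu,%llu) vs [%llu,%llu)\n",
			       prev.start, prev.end, d[i].start, d[i].end);
			d[i].start = prev.end;	/* bch2_cut_front(prev.p, u) */
		}
		prev = d[i];
	}
}

int main(void)
{
	struct ext d[] = { { 0, 10 }, { 8, 16 }, { 16, 32 } };

	check_overlaps(d, 3);
	return 0;
}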
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index bd0edda7abf9..27378cc9cdd5 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -161,13 +161,16 @@ static void journal_entries_free(struct list_head *list)
}
}
+/*
+ * When keys compare equal, oldest compares first:
+ */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
return cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->pos, r->pos) ?:
+ bkey_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
@@ -179,25 +182,11 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->pos, r->pos);
-}
-
-static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i)
-{
- while (i + 1 < keys->d + keys->nr &&
- journal_sort_key_cmp(i, i + 1) > 0) {
- swap(i[0], i[1]);
- i++;
- }
+ bkey_cmp(l->k->k.p, r->k->k.p);
}
static void journal_keys_free(struct journal_keys *keys)
{
- struct journal_key *i;
-
- for_each_journal_key(*keys, i)
- if (i->allocated)
- kfree(i->k);
kvfree(keys->d);
keys->d = NULL;
keys->nr = 0;
@@ -208,15 +197,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
struct journal_replay *p;
struct jset_entry *entry;
struct bkey_i *k, *_n;
- struct journal_keys keys = { NULL }, keys_deduped = { NULL };
- struct journal_key *i;
+ struct journal_keys keys = { NULL };
+ struct journal_key *src, *dst;
size_t nr_keys = 0;
list_for_each_entry(p, journal_entries, list)
for_each_jset_key(k, _n, entry, &p->j)
nr_keys++;
- keys.journal_seq_base = keys_deduped.journal_seq_base =
+ keys.journal_seq_base =
le64_to_cpu(list_first_entry(journal_entries,
struct journal_replay,
list)->j.seq);
@@ -225,96 +214,31 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if (!keys.d)
goto err;
- keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL);
- if (!keys_deduped.d)
- goto err;
-
list_for_each_entry(p, journal_entries, list)
- for_each_jset_key(k, _n, entry, &p->j) {
- if (bkey_deleted(&k->k) &&
- btree_node_type_is_extents(entry->btree_id))
- continue;
-
+ for_each_jset_key(k, _n, entry, &p->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
- .pos = bkey_start_pos(&k->k),
.k = k,
.journal_seq = le64_to_cpu(p->j.seq) -
keys.journal_seq_base,
.journal_offset = k->_data - p->j._data,
};
- }
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
- i = keys.d;
- while (i < keys.d + keys.nr) {
- if (i + 1 < keys.d + keys.nr &&
- i[0].btree_id == i[1].btree_id &&
- !bkey_cmp(i[0].pos, i[1].pos)) {
- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
- i++;
- } else {
- bch2_cut_front(i[1].k->k.p, i[0].k);
- i[0].pos = i[1].k->k.p;
- journal_keys_sift(&keys, i);
- }
- continue;
- }
-
- if (i + 1 < keys.d + keys.nr &&
- i[0].btree_id == i[1].btree_id &&
- bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) {
- if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
- cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
- bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k);
- } else {
- struct bkey_i *split =
- kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
-
- if (!split)
- goto err;
-
- bkey_copy(split, i[0].k);
- bch2_cut_back(bkey_start_pos(&i[1].k->k), split);
- keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
- .btree_id = i[0].btree_id,
- .allocated = true,
- .pos = bkey_start_pos(&split->k),
- .k = split,
- .journal_seq = i[0].journal_seq,
- .journal_offset = i[0].journal_offset,
- };
-
- bch2_cut_front(i[1].k->k.p, i[0].k);
- i[0].pos = i[1].k->k.p;
- journal_keys_sift(&keys, i);
- continue;
- }
- } else {
- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) {
- i[1] = i[0];
- i++;
- continue;
- } else {
- bch2_cut_front(i[0].k->k.p, i[1].k);
- i[1].pos = i[0].k->k.p;
- journal_keys_sift(&keys, i + 1);
- continue;
- }
- }
- }
+ src = dst = keys.d;
+ while (src < keys.d + keys.nr) {
+ while (src + 1 < keys.d + keys.nr &&
+ src[0].btree_id == src[1].btree_id &&
+ !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+ src++;
- keys_deduped.d[keys_deduped.nr++] = *i++;
+ *dst++ = *src++;
}
- kvfree(keys.d);
- return keys_deduped;
+ keys.nr = dst - keys.d;
err:
- journal_keys_free(&keys_deduped);
- kvfree(keys.d);
- return (struct journal_keys) { NULL };
+ return keys;
}
/* journal replay: */
@@ -365,11 +289,6 @@ retry:
atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
- split_iter = bch2_trans_copy_iter(&trans, iter);
- ret = PTR_ERR_OR_ZERO(split_iter);
- if (ret)
- goto err;
-
split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
ret = PTR_ERR_OR_ZERO(split);
if (ret)
@@ -388,12 +307,25 @@ retry:
}
bkey_copy(split, k);
- bch2_cut_front(split_iter->pos, split);
+ bch2_cut_front(iter->pos, split);
bch2_cut_back(atomic_end, split);
+ split_iter = bch2_trans_copy_iter(&trans, iter);
+ ret = PTR_ERR_OR_ZERO(split_iter);
+ if (ret)
+ goto err;
+
+ /*
+ * It's important that we don't go through the
+ * extent_handle_overwrites() and extent_update_to_keys() path
+ * here: journal replay is supposed to treat extents like
+ * regular keys
+ */
+ __bch2_btree_iter_set_pos(split_iter, split->k.p, false);
bch2_trans_update(&trans, split_iter, split, !remark
? BTREE_TRIGGER_NORUN
: BTREE_TRIGGER_NOOVERWRITES);
+
bch2_btree_iter_set_pos(iter, split->k.p);
} while (bkey_cmp(iter->pos, k->k.p) < 0);
@@ -424,11 +356,18 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
struct btree_iter *iter;
int ret;
- iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
+ iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
+ /*
+ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
+ * extent_handle_overwrites() and extent_update_to_keys() - but we don't
+ * want that here, journal replay is supposed to treat extents like
+ * regular keys:
+ */
+ __bch2_btree_iter_set_pos(iter, k->k.p, false);
+
ret = bch2_btree_iter_traverse(iter) ?:
bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
bch2_trans_iter_put(trans, iter);
@@ -459,7 +398,7 @@ static int bch2_journal_replay(struct bch_fs *c,
if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
- else if (btree_node_type_is_extents(i->btree_id))
+ else if (i->k->k.size)
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
else
ret = bch2_journal_replay_key(c, i->btree_id, i->k);
@@ -859,6 +798,15 @@ int bch2_fs_recovery(struct bch_fs *c)
journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
+ if (!c->sb.clean &&
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+
ret = journal_replay_early(c, clean, &journal_entries);
if (ret)
goto err;
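
journal_keys_sort() above is now just sort plus dedup: the comparator
orders by (btree_id, pos, journal_seq, journal_offset), so entries for the
same position land oldest-first, and keeping the last entry of each run
retains the newest version. The same two steps over a toy array (stand-in
types):

#include <stdio.h>
#include <stdlib.h>

struct jk { int id; unsigned long long pos; unsigned seq; };

static int jk_cmp(const void *_l, const void *_r)
{
	const struct jk *l = _l, *r = _r;

	if (l->id != r->id)
		return l->id < r->id ? -1 : 1;
	if (l->pos != r->pos)
		return l->pos < r->pos ? -1 : 1;
	if (l->seq != r->seq)
		return l->seq < r->seq ? -1 : 1;
	return 0;
}

static size_t dedup_keep_newest(struct jk *d, size_t nr)
{
	struct jk *src = d, *dst = d;

	while (src < d + nr) {
		while (src + 1 < d + nr &&
		       src[0].id == src[1].id && src[0].pos == src[1].pos)
			src++;			/* skip older versions */
		*dst++ = *src++;
	}
	return dst - d;
}

int main(void)
{
	struct jk d[] = {
		{ 0, 8, 2 }, { 0, 8, 5 }, { 0, 16, 1 }, { 1, 8, 3 },
	};
	size_t nr = sizeof(d) / sizeof(d[0]);

	qsort(d, nr, sizeof(d[0]), jk_cmp);
	nr = dedup_keep_newest(d, nr);
	printf("%zu keys after dedup\n", nr);	/* prints 3 */
	return 0;
}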
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index ccd84a8fe60d..c91309301563 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -5,8 +5,6 @@
struct journal_keys {
struct journal_key {
enum btree_id btree_id:8;
- unsigned allocated:1;
- struct bpos pos;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;