Diffstat
-rw-r--r--  fs/bcachefs/bcachefs_format.h       |   6
-rw-r--r--  fs/bcachefs/btree_gc.c              |  57
-rw-r--r--  fs/bcachefs/btree_io.c              |  17
-rw-r--r--  fs/bcachefs/btree_iter.c            |  25
-rw-r--r--  fs/bcachefs/btree_types.h           |   3
-rw-r--r--  fs/bcachefs/btree_update.h          |   5
-rw-r--r--  fs/bcachefs/btree_update_interior.h |  23
-rw-r--r--  fs/bcachefs/btree_update_leaf.c     | 228
-rw-r--r--  fs/bcachefs/buckets.c               |  13
-rw-r--r--  fs/bcachefs/buckets.h               |   2
-rw-r--r--  fs/bcachefs/extent_update.c         | 410
-rw-r--r--  fs/bcachefs/extent_update.h         |   5
-rw-r--r--  fs/bcachefs/fsck.c                  |  56
-rw-r--r--  fs/bcachefs/recovery.c              | 154
-rw-r--r--  fs/bcachefs/recovery.h              |   2
15 files changed, 404 insertions, 602 deletions
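
The substance of this change: extent updates no longer rewrite the btree in place through bch2_insert_fixup_extent(). Instead, __bch2_trans_commit() resolves each extent update against existing keys before anything is inserted — extent_handle_overwrites() trims or whites out overlapping extents, extent_update_to_keys() re-poses the insert as an ordinary key — and the resulting non-overlapping "updates2" list is what actually gets committed. Below is a minimal standalone sketch of the splitting rules only, using hypothetical integer extents in place of the kernel's bkey machinery; it is an illustration, not the kernel code.

#include <stdio.h>

/*
 * Hypothetical flat extent covering [start, end); stands in for a bkey
 * whose position is its end offset, as in bcachefs.
 */
struct extent {
	unsigned start, end;
};

/*
 * The splitting rules of extent_handle_overwrites(), reduced to integers:
 * given an existing extent and an overwrite of [start, end), keep any
 * surviving front/back fragment and white out what is fully covered.
 */
static void handle_overwrite(struct extent old, unsigned start, unsigned end)
{
	if (old.end <= start || old.start >= end)
		return;				/* no overlap */

	if (old.start < start)			/* front survives: cut_back() */
		printf("keep  [%u, %u)\n", old.start, start);

	if (old.end > end)			/* back survives: cut_front() */
		printf("keep  [%u, %u)\n", end, old.end);
	else					/* fully covered: whiteout */
		printf("erase key at %u\n", old.end);
}

int main(void)
{
	struct extent existing[] = { { 0, 8 }, { 8, 16 }, { 16, 24 } };
	unsigned i;

	/* Overwriting [4, 20) trims the ends, erases the middle extent: */
	for (i = 0; i < 3; i++)
		handle_overwrite(existing[i], 4, 20);
	return 0;
}
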
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d1c0a5d5580e..1ad5ff449a5b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1315,12 +1315,14 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(inline_data, 8) \ x(new_extent_overwrite, 9) \ x(incompressible, 10) \ - x(btree_ptr_v2, 11) + x(btree_ptr_v2, 11) \ + x(extents_above_btree_updates, 12) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ - (1ULL << BCH_FEATURE_btree_ptr_v2)) + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index a5fe3b316e06..f85fbc057fb3 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -186,8 +186,16 @@ fsck_err: return ret; } -static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, - u8 *max_stale, bool initial) +static bool pos_in_journal_keys(struct journal_keys *journal_keys, + enum btree_id id, struct bpos pos) +{ + struct journal_key *k = journal_key_search(journal_keys, id, pos); + + return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos); +} + +static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + struct journal_keys *journal_keys, bool initial) { struct btree_node_iter iter; struct bkey unpacked; @@ -201,6 +209,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { + if (!b->c.level && journal_keys && + pos_in_journal_keys(journal_keys, b->c.btree_id, k.k->p)) + continue; + bch2_bkey_debugcheck(c, b, k); ret = bch2_gc_mark_key(c, k, max_stale, initial); @@ -212,6 +224,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + struct journal_keys *journal_keys, bool initial, bool metadata_only) { struct btree_trans trans; @@ -239,7 +252,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree_node(b)); - ret = btree_gc_mark_node(c, b, &max_stale, initial); + ret = btree_gc_mark_node(c, b, &max_stale, + journal_keys, initial); if (ret) break; @@ -281,36 +295,6 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int mark_journal_key(struct bch_fs *c, enum btree_id id, - struct bkey_i *insert) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - u8 max_stale; - int ret = 0; - - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); - if (ret) - return ret; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), - BTREE_ITER_SLOTS, k, ret) { - percpu_down_read(&c->mark_lock); - ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, - BTREE_TRIGGER_GC| - BTREE_TRIGGER_NOATOMIC); - percpu_up_read(&c->mark_lock); - - if (!ret) - break; - } - - return bch2_trans_exit(&trans) ?: ret; -} - static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, bool initial, bool metadata_only) { @@ -325,18 +309,21 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, enum btree_id id = ids[i]; enum btree_node_type type = __btree_node_type(0, id); - int ret = bch2_gc_btree(c, id, initial, metadata_only); + int ret = bch2_gc_btree(c, id, journal_keys, + initial, 
metadata_only); if (ret) return ret; if (journal_keys && !metadata_only && btree_node_type_needs_gc(type)) { struct journal_key *j; + u8 max_stale; int ret; for_each_journal_key(*journal_keys, j) if (j->btree_id == id) { - ret = mark_journal_key(c, id, j->k); + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k), + &max_stale, initial); if (ret) return ret; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a4732bf13a11..d0b761417903 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -708,9 +708,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, unsigned *whiteout_u64s, int write, bool have_retry) { - struct bkey_packed *k; - struct bkey prev = KEY(0, 0, 0); - struct bpos prev_data = POS_MIN; + struct bkey_packed *k, *prev = NULL; bool seen_non_whiteout = false; unsigned version; const char *err; @@ -852,15 +850,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b, if (!seen_non_whiteout && (!bkey_whiteout(k) || - (bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) { + (prev && bkey_iter_cmp(b, prev, k) > 0))) { *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; - } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || - bkey_cmp(prev.p, u.k->p) > 0) { + } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { char buf1[80]; char buf2[80]; + struct bkey up = bkey_unpack_key(b, prev); - bch2_bkey_to_text(&PBUF(buf1), &prev); + bch2_bkey_to_text(&PBUF(buf1), &up); bch2_bkey_to_text(&PBUF(buf2), u.k); bch2_dump_bset(b, i, 0); @@ -870,10 +868,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, /* XXX: repair this */ } - if (!bkey_deleted(u.k)) - prev_data = u.k->p; - prev = *u.k; - + prev = k; k = bkey_next_skip_noops(k, vstruct_last(i)); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 347477c62779..5f918c6c3efb 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1504,12 +1504,12 @@ static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) struct btree_trans *trans = iter->trans; struct btree_insert_entry *i; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: bkey_cmp(pos, i->k->k.p)) <= 0) break; - return i < trans->updates + trans->nr_updates && + return i < trans->updates2 + trans->nr_updates2 && iter->btree_id == i->iter->btree_id ? 
bkey_i_to_s_c(i->k) : bkey_s_c_null; @@ -1821,7 +1821,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { - void *new_iters, *new_updates; + void *p, *new_iters, *new_updates, *new_updates2; size_t iters_bytes; size_t updates_bytes; @@ -1839,21 +1839,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, iters_bytes = sizeof(struct btree_iter) * new_size; updates_bytes = sizeof(struct btree_insert_entry) * new_size; - new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS); - if (new_iters) + p = kmalloc(iters_bytes + + updates_bytes + + updates_bytes, GFP_NOFS); + if (p) goto success; - new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); new_size = BTREE_ITER_MAX; trans->used_mempool = true; success: - new_updates = new_iters + iters_bytes; + new_iters = p; p += iters_bytes; + new_updates = p; p += updates_bytes; + new_updates2 = p; p += updates_bytes; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); memcpy(new_updates, trans->updates, sizeof(struct btree_insert_entry) * trans->nr_updates); + memcpy(new_updates2, trans->updates2, + sizeof(struct btree_insert_entry) * trans->nr_updates2); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) memset(trans->iters, POISON_FREE, @@ -1865,6 +1871,7 @@ success: trans->iters = new_iters; trans->updates = new_updates; + trans->updates2 = new_updates2; trans->size = new_size; if (trans->iters_live) { @@ -2126,6 +2133,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->need_reset = 0; trans->nr_updates = 0; + trans->nr_updates2 = 0; trans->mem_top = 0; if (trans->fs_usage_deltas) { @@ -2157,6 +2165,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + trans->updates2 = trans->updates2_onstack; trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) @@ -2194,5 +2203,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + - sizeof(u8) * nr); + sizeof(struct btree_insert_entry) * nr); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index d1d5385d1eb7..fdfa7a265850 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -283,6 +283,7 @@ struct btree_trans { u8 nr_iters; u8 nr_updates; + u8 nr_updates2; u8 size; unsigned used_mempool:1; unsigned error:1; @@ -295,6 +296,7 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; + struct btree_insert_entry *updates2; /* update path: */ struct journal_res journal_res; @@ -308,6 +310,7 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[2]; + struct btree_insert_entry updates2_onstack[2]; }; #define BTREE_FLAG(flag) \ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index d1cd839ac08f..12127a33906b 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -132,4 +132,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +#define trans_for_each_update2(_trans, _i) \ + for ((_i) = (_trans)->updates2; \ + (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ + (_i)++) + #endif /* 
_BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index e3204f32cc68..f6aceed89427 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -303,18 +303,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, } static inline void push_whiteout(struct bch_fs *c, struct btree *b, - struct bkey_packed *k) + struct bpos pos) { - unsigned u64s = bkeyp_key_u64s(&b->format, k); - struct bkey_packed *dst; + struct bkey_packed k; - BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b)); + BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); - b->whiteout_u64s += bkeyp_key_u64s(&b->format, k); - dst = unwritten_whiteouts_start(c, b); - memcpy_u64s(dst, k, u64s); - dst->u64s = u64s; - dst->type = KEY_TYPE_deleted; + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey *u = (void *) &k; + + bkey_init(u); + u->p = pos; + } + + k.needs_whiteout = true; + + b->whiteout_u64s += k.u64s; + bkey_copy(unwritten_whiteouts_start(c, b), &k); } /* diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 94418c9b42e8..f0efc52c7590 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -23,11 +23,10 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { - return i != trans->updates && + return i != trans->updates2 && i[0].iter->l[0].b == i[-1].iter->l[0].b; } - inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { @@ -61,6 +60,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(iter->trans->c, b)); + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_packed(b, k, &insert->k)) @@ -79,7 +81,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(iter->trans->c, b, k); + push_whiteout(iter->trans->c, b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -195,20 +197,6 @@ void bch2_btree_journal_key(struct btree_trans *trans, set_btree_node_dirty(b); } -static void bch2_insert_fixup_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - - EBUG_ON(iter->level); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, l->b)); - - if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert))) - bch2_btree_journal_key(trans, iter, insert); -} - /** * btree_insert_key - insert a key one key into a leaf node */ @@ -223,12 +211,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + EBUG_ON(iter->level); + insert->k.needs_whiteout = false; - if (!btree_node_is_extents(b)) - bch2_insert_fixup_key(trans, iter, insert); - else - bch2_insert_fixup_extent(trans, iter, insert); + if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert))) + bch2_btree_journal_key(trans, iter, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -254,12 +242,8 @@ static inline void btree_insert_entry_checks(struct btree_trans 
*trans, struct bch_fs *c = trans->c; BUG_ON(iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos)); - EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0); - + BUG_ON(bkey_cmp(insert->k.p, iter->pos)); BUG_ON(debug_check_bkeys(c) && - !bkey_deleted(&insert->k) && bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); } @@ -312,9 +296,16 @@ btree_key_can_insert(struct btree_trans *trans, if (unlikely(btree_node_fake(b))) return BTREE_INSERT_BTREE_NODE_FULL; + /* + * old bch2_extent_sort_fix_overlapping() algorithm won't work with new + * style extent updates: + */ + if (unlikely(btree_node_old_extent_overwrite(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + ret = !btree_node_is_extents(b) ? BTREE_INSERT_OK - : bch2_extent_can_insert(trans, iter, insert, u64s); + : bch2_extent_can_insert(trans, iter, insert); if (ret) return ret; @@ -383,7 +374,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); - trans_for_each_update(trans, i) { + trans_for_each_update2(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; @@ -422,10 +413,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) i->k->k.version.lo = trans->journal_res.seq; else if (inject_invalid_keys(c)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) i->k->k.version = MAX_VERSION; } @@ -448,7 +439,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) do_btree_insert_one(trans, i->iter, i->k); err: if (marking) { @@ -469,7 +460,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_iter *iter; int ret; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) BUG_ON(!btree_node_intent_locked(i->iter, 0)); ret = bch2_journal_preres_get(&trans->c->journal, @@ -497,18 +488,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) btree_insert_entry_checks(trans, i->iter, i->k); bch2_btree_trans_verify_locks(trans); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(trans->c, i->iter->l[0].b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, i->iter); @@ -525,14 +516,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_foreground_maybe_merge(trans->c, i->iter, 0, trans->flags); trans->nounlock = false; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) bch2_btree_iter_downgrade(i->iter); return 0; @@ -655,6 +646,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } +static void bch2_trans_update2(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_insert_entry *i, n = (struct 
btree_insert_entry) { + .iter = iter, .k = insert + }; + + btree_insert_entry_checks(trans, n.iter, n.k); + + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + EBUG_ON(trans->nr_updates2 >= trans->nr_iters); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + trans_for_each_update2(trans, i) { + if (btree_iter_cmp(n.iter, i->iter) == 0) { + *i = n; + return; + } + + if (btree_iter_cmp(n.iter, i->iter) <= 0) + break; + } + + array_insert_item(trans->updates2, trans->nr_updates2, + i - trans->updates2, n); +} + +static int extent_update_to_keys(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert) +{ + struct btree_iter *iter; + + if (bkey_deleted(&insert->k)) + return 0; + + iter = bch2_trans_copy_iter(trans, orig_iter); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + iter->flags |= BTREE_ITER_INTENT; + __bch2_btree_iter_set_pos(iter, insert->k.p, false); + bch2_trans_update2(trans, iter, insert); + bch2_trans_iter_put(trans, iter); + return 0; +} + +static int extent_handle_overwrites(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos start, struct bpos end) +{ + struct btree_iter *iter = NULL, *update_iter; + struct bkey_i *update; + struct bkey_s_c k; + int ret = 0; + + iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; + + k = bch2_btree_iter_peek_with_updates(iter); + + while (k.k && !(ret = bkey_err(k))) { + if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) + break; + + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_back(start, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } + + if (bkey_cmp(k.k->p, end) > 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_front(end, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } else { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + update->k = *k.k; + set_bkey_val_u64s(&update->k, 0); + update->k.type = KEY_TYPE_deleted; + update->k.size = 0; + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } + + k = bch2_btree_iter_next_with_updates(iter); + } +err: + if (!IS_ERR_OR_NULL(iter)) + bch2_trans_iter_put(trans, iter); + return ret; +} + int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; @@ -724,7 +844,36 @@ int __bch2_trans_commit(struct btree_trans *trans) } } while (trans_trigger_run); + /* Turn extents updates into keys: */ + trans_for_each_update(trans, i) + if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + struct bpos start = bkey_start_pos(&i->k->k); + + while (i + 1 < trans->updates + trans->nr_updates && + 
i[0].iter->btree_id == i[1].iter->btree_id && + !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) + i++; + + ret = extent_handle_overwrites(trans, i->iter->btree_id, + start, i->k->k.p); + if (ret) + goto out; + } + trans_for_each_update(trans, i) { + if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + ret = extent_update_to_keys(trans, i->iter, i->k); + if (ret) + goto out; + } else { + bch2_trans_update2(trans, i->iter, i->k); + } + } + + trans_for_each_update2(trans, i) { + BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); + BUG_ON(i->iter->locks_want < 1); + u64s = jset_u64s(i->k->k.u64s); if (0) trans->journal_preres_u64s += u64s; @@ -773,7 +922,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .trigger_flags = flags, .iter = iter, .k = k }; - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k))); + EBUG_ON(bkey_cmp(iter->pos, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_start_pos(&k->k) + : k->k.p)); iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 7b0f0583b1a5..cd54c2b1eff2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1254,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, struct bkey_s_c old, struct bkey_i *new, struct bch_fs_usage *fs_usage, - unsigned flags) + unsigned flags, + bool is_extents) { struct bch_fs *c = trans->c; - struct btree *b = iter->l[0].b; unsigned offset = 0; - s64 sectors = 0; + s64 sectors = -((s64) old.k->size); flags |= BTREE_TRIGGER_OVERWRITE; - if (btree_node_is_extents(b) + if (is_extents ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 : bkey_cmp(new->k.p, old.k->p)) return 0; - if (btree_node_is_extents(b)) { + if (is_extents) { switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: offset = 0; @@ -1341,7 +1341,8 @@ int bch2_mark_update(struct btree_trans *trans, struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ret = bch2_mark_overwrite(trans, iter, k, insert, - fs_usage, flags); + fs_usage, flags, + btree_node_type_is_extents(iter->btree_id)); if (ret <= 0) break; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4c84787575f5..29ebc07a2497 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -268,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *, int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, struct bkey_s_c, struct bkey_i *, - struct bch_fs_usage *, unsigned); + struct bch_fs_usage *, unsigned, bool); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 846d77dc2530..fa6c0698f385 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -39,6 +39,12 @@ static int count_iters_for_insert(struct btree_trans *trans, { int ret = 0; + /* + * The extent update path requires an _additional_ iterator for each + * extent we're inserting and overwriting: + */ + *nr_iters += 1; + switch (k.k->type) { case KEY_TYPE_extent: case KEY_TYPE_reflink_v: @@ -167,402 +173,40 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, - unsigned *u64s) + struct bkey_i *insert) { struct btree_iter_level *l = &iter->l[0]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *_k; + struct bkey_s_c k; struct bkey unpacked; int sectors; - 
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - overlap = bch2_extent_overlap(&insert->k, k.k); - - /* - * If we're overwriting an existing extent, we may need to emit - * a whiteout - unless we're inserting a new extent at the same - * position: - */ - if (k.k->needs_whiteout && - (!bkey_whiteout(&insert->k) || - bkey_cmp(k.k->p, insert->k.p))) - *u64s += BKEY_U64s; - - /* - * If we're partially overwriting an existing extent which has - * been written out to disk, we'll need to emit a new version of - * that extent: - */ - if (bkey_written(l->b, _k) && - overlap != BCH_EXTENT_OVERLAP_ALL) - *u64s += _k->u64s; - - /* And we may be splitting an existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_bkey_sectors_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } - } - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; - - bch2_btree_node_iter_advance(&node_iter, l->b); - } - - return BTREE_INSERT_OK; -} - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif - -#endif -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} - -static void pack_push_whiteout(struct bch_fs *c, struct btree *b, - struct bpos pos) -{ - struct bkey_packed k; - - if (!bkey_pack_pos(&k, pos, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = pos; - bkey_copy(&k, &tmp); - } - - k.needs_whiteout = true; - push_whiteout(c, b, &k); -} - -static void -extent_drop(struct bch_fs *c, struct btree_iter 
*iter, - struct bkey_packed *_k, struct bkey_s k) -{ - struct btree_iter_level *l = &iter->l[0]; - - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); - - k.k->size = 0; - k.k->type = KEY_TYPE_deleted; - - if (!btree_node_old_extent_overwrite(l->b) && - k.k->needs_whiteout) { - pack_push_whiteout(c, l->b, k.k->p); - k.k->needs_whiteout = false; - } - - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; - - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); - } else { - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } -} - -static void -extent_squash(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_on_stack tmp, split; - - bkey_on_stack_init(&tmp); - bkey_on_stack_init(&split); - - if (!btree_node_old_extent_overwrite(l->b)) { - if (!bkey_whiteout(&insert->k) && - !bkey_cmp(k.k->p, insert->k.p)) { - insert->k.needs_whiteout = k.k->needs_whiteout; - k.k->needs_whiteout = false; - } - } else { - insert->k.needs_whiteout |= k.k->needs_whiteout; - } - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_front(insert->k.p, tmp.k); - - /* - * needs_whiteout was propagated to new version of @k, - * @tmp: - */ - if (!btree_node_old_extent_overwrite(l->b)) - k.k->needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - btree_keys_account_val_delta(l->b, _k, - bch2_cut_front_s(insert->k.p, k)); - - extent_save(l->b, _k, k.k); - /* - * No need to call bset_fix_invalidated_key, start of - * extent changed but extents are indexed by where they - * end - */ - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - break; - case BCH_EXTENT_OVERLAP_BACK: - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); - - /* - * @tmp has different position than @k, needs_whiteout - * should not be propagated: - */ - if (!btree_node_old_extent_overwrite(l->b)) - tmp.k->k.needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - /* - * position of @k is changing, emit a whiteout if - * needs_whiteout is set: - */ - if (!btree_node_old_extent_overwrite(l->b) && - k.k->needs_whiteout) { - pack_push_whiteout(c, l->b, k.k->p); - k.k->needs_whiteout = false; - } - - btree_keys_account_val_delta(l->b, _k, - bch2_cut_back_s(bkey_start_pos(&insert->k), k)); - extent_save(l->b, _k, k.k); - - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - } - break; - case BCH_EXTENT_OVERLAP_ALL: - extent_drop(c, iter, _k, k); - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - bkey_on_stack_reassemble(&split, c, k.s_c); - bch2_cut_back(bkey_start_pos(&insert->k), split.k); - - if (!btree_node_old_extent_overwrite(l->b)) - split.k->k.needs_whiteout = false; - - /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */ - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_front(insert->k.p, tmp.k); - - if (!btree_node_old_extent_overwrite(l->b)) - k.k->needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - btree_keys_account_val_delta(l->b, _k, - bch2_cut_front_s(insert->k.p, 
k)); - - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - - extent_bset_insert(c, iter, split.k); - break; - } - - bkey_on_stack_exit(&split, c); - bkey_on_stack_exit(&tmp, c); -} + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard); + if (!_k) + return BTREE_INSERT_OK; -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. - */ -void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct bch_fs *c = trans->c; - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - bool do_update = !bkey_whiteout(&insert->k); - struct bkey_packed *_k; - struct bkey unpacked; + k = bkey_disassemble(l->b, _k, &unpacked); - EBUG_ON(iter->level); - EBUG_ON(!insert->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + /* Check if we're splitting a compressed extent: */ - while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); + if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && + bkey_cmp(insert->k.p, k.k->p) < 0 && + (sectors = bch2_bkey_sectors_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? 
BCH_DISK_RESERVATION_NOFAIL : 0; - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + switch (bch2_disk_reservation_add(trans->c, trans->disk_res, + sectors, flags)) { + case 0: break; - - if (!bkey_whiteout(k.k)) - do_update = true; - - if (!do_update) { - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - - bch2_cut_front(cur_end, insert); - bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - } else { - extent_squash(c, iter, insert, _k, k, overlap); + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + default: + BUG(); } - - node_iter = l->iter; - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; } - l->iter = node_iter; - bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); - - if (do_update) { - if (insert->k.type == KEY_TYPE_deleted) - insert->k.type = KEY_TYPE_discard; - - if (!bkey_whiteout(&insert->k) || - btree_node_old_extent_overwrite(l->b)) - extent_bset_insert(c, iter, insert); - - bch2_btree_journal_key(trans, iter, insert); - } - - bch2_cut_front(insert->k.p, insert); + return BTREE_INSERT_OK; } diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index e9dc8091ba3f..38dc084627d2 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, - struct bkey_i *, unsigned *); -void bch2_insert_fixup_extent(struct btree_trans *, - struct btree_iter *, - struct bkey_i *); + struct bkey_i *); #endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index eca723121a2c..902c8da9dc15 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -422,6 +422,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) POS(inode_nr + 1, 0), NULL); } +static int bch2_fix_overlapping_extent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, struct bpos cut_at) +{ + struct btree_iter *u_iter; + struct bkey_i *u; + int ret; + + u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bkey_reassemble(u, k); + bch2_cut_front(cut_at, u); + + u_iter = bch2_trans_copy_iter(trans, iter); + ret = PTR_ERR_OR_ZERO(u_iter); + if (ret) + return ret; + + /* + * We don't want to go through the + * extent_handle_overwrites path: + */ + __bch2_btree_iter_set_pos(u_iter, u->k.p, false); + + /* + * XXX: this is going to leave disk space + * accounting slightly wrong + */ + ret = bch2_trans_update(trans, u_iter, u, 0); + bch2_trans_iter_put(trans, u_iter); + return ret; +} + /* * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent @@ -433,6 +469,7 @@ static int check_extents(struct bch_fs *c) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + struct bkey prev = KEY(0, 0, 0); u64 i_sectors; int ret = 0; @@ -444,6 +481,25 @@ static int check_extents(struct bch_fs *c) POS(BCACHEFS_ROOT_INO, 0), 0); retry: for_each_btree_key_continue(iter, 0, k, ret) { + if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &prev); + bch2_bkey_to_text(&PBUF(buf2), k.k); + + if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_fix_overlapping_extent(&trans, + iter, k, prev.p)); + if (ret) + goto err; + } + 
} + prev = *k.k; + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index bd0edda7abf9..27378cc9cdd5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -161,13 +161,16 @@ static void journal_entries_free(struct list_head *list) } } +/* + * When keys compare equal, oldest compares first: + */ static int journal_sort_key_cmp(const void *_l, const void *_r) { const struct journal_key *l = _l; const struct journal_key *r = _r; return cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->pos, r->pos) ?: + bkey_cmp(l->k->k.p, r->k->k.p) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } @@ -179,25 +182,11 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->pos, r->pos); -} - -static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i) -{ - while (i + 1 < keys->d + keys->nr && - journal_sort_key_cmp(i, i + 1) > 0) { - swap(i[0], i[1]); - i++; - } + bkey_cmp(l->k->k.p, r->k->k.p); } static void journal_keys_free(struct journal_keys *keys) { - struct journal_key *i; - - for_each_journal_key(*keys, i) - if (i->allocated) - kfree(i->k); kvfree(keys->d); keys->d = NULL; keys->nr = 0; @@ -208,15 +197,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) struct journal_replay *p; struct jset_entry *entry; struct bkey_i *k, *_n; - struct journal_keys keys = { NULL }, keys_deduped = { NULL }; - struct journal_key *i; + struct journal_keys keys = { NULL }; + struct journal_key *src, *dst; size_t nr_keys = 0; list_for_each_entry(p, journal_entries, list) for_each_jset_key(k, _n, entry, &p->j) nr_keys++; - keys.journal_seq_base = keys_deduped.journal_seq_base = + keys.journal_seq_base = le64_to_cpu(list_first_entry(journal_entries, struct journal_replay, list)->j.seq); @@ -225,96 +214,31 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if (!keys.d) goto err; - keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL); - if (!keys_deduped.d) - goto err; - list_for_each_entry(p, journal_entries, list) - for_each_jset_key(k, _n, entry, &p->j) { - if (bkey_deleted(&k->k) && - btree_node_type_is_extents(entry->btree_id)) - continue; - + for_each_jset_key(k, _n, entry, &p->j) keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, - .pos = bkey_start_pos(&k->k), .k = k, .journal_seq = le64_to_cpu(p->j.seq) - keys.journal_seq_base, .journal_offset = k->_data - p->j._data, }; - } sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); - i = keys.d; - while (i < keys.d + keys.nr) { - if (i + 1 < keys.d + keys.nr && - i[0].btree_id == i[1].btree_id && - !bkey_cmp(i[0].pos, i[1].pos)) { - if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { - i++; - } else { - bch2_cut_front(i[1].k->k.p, i[0].k); - i[0].pos = i[1].k->k.p; - journal_keys_sift(&keys, i); - } - continue; - } - - if (i + 1 < keys.d + keys.nr && - i[0].btree_id == i[1].btree_id && - bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) { - if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: - cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { - if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { - bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k); - } else { - struct bkey_i *split = - kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); - - if (!split) - goto err; - - bkey_copy(split, i[0].k); - 
bch2_cut_back(bkey_start_pos(&i[1].k->k), split); - keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { - .btree_id = i[0].btree_id, - .allocated = true, - .pos = bkey_start_pos(&split->k), - .k = split, - .journal_seq = i[0].journal_seq, - .journal_offset = i[0].journal_offset, - }; - - bch2_cut_front(i[1].k->k.p, i[0].k); - i[0].pos = i[1].k->k.p; - journal_keys_sift(&keys, i); - continue; - } - } else { - if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) { - i[1] = i[0]; - i++; - continue; - } else { - bch2_cut_front(i[0].k->k.p, i[1].k); - i[1].pos = i[0].k->k.p; - journal_keys_sift(&keys, i + 1); - continue; - } - } - } + src = dst = keys.d; + while (src < keys.d + keys.nr) { + while (src + 1 < keys.d + keys.nr && + src[0].btree_id == src[1].btree_id && + !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) + src++; - keys_deduped.d[keys_deduped.nr++] = *i++; + *dst++ = *src++; } - kvfree(keys.d); - return keys_deduped; + keys.nr = dst - keys.d; err: - journal_keys_free(&keys_deduped); - kvfree(keys.d); - return (struct journal_keys) { NULL }; + return keys; } /* journal replay: */ @@ -365,11 +289,6 @@ retry: atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); - split_iter = bch2_trans_copy_iter(&trans, iter); - ret = PTR_ERR_OR_ZERO(split_iter); - if (ret) - goto err; - split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); ret = PTR_ERR_OR_ZERO(split); if (ret) @@ -388,12 +307,25 @@ retry: } bkey_copy(split, k); - bch2_cut_front(split_iter->pos, split); + bch2_cut_front(iter->pos, split); bch2_cut_back(atomic_end, split); + split_iter = bch2_trans_copy_iter(&trans, iter); + ret = PTR_ERR_OR_ZERO(split_iter); + if (ret) + goto err; + + /* + * It's important that we don't go through the + * extent_handle_overwrites() and extent_update_to_keys() path + * here: journal replay is supposed to treat extents like + * regular keys + */ + __bch2_btree_iter_set_pos(split_iter, split->k.p, false); bch2_trans_update(&trans, split_iter, split, !remark ? 
BTREE_TRIGGER_NORUN : BTREE_TRIGGER_NOOVERWRITES); + bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); @@ -424,11 +356,18 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, struct btree_iter *iter; int ret; - iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); + iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); + /* + * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run + * extent_handle_overwrites() and extent_update_to_keys() - but we don't + * want that here, journal replay is supposed to treat extents like + * regular keys: + */ + __bch2_btree_iter_set_pos(iter, k->k.p, false); + ret = bch2_btree_iter_traverse(iter) ?: bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); bch2_trans_iter_put(trans, iter); @@ -459,7 +398,7 @@ static int bch2_journal_replay(struct bch_fs *c, if (i->btree_id == BTREE_ID_ALLOC) ret = bch2_alloc_replay_key(c, i->k); - else if (btree_node_type_is_extents(i->btree_id)) + else if (i->k->k.size) ret = bch2_extent_replay_key(c, i->btree_id, i->k); else ret = bch2_journal_replay_key(c, i->btree_id, i->k); @@ -859,6 +798,15 @@ int bch2_fs_recovery(struct bch_fs *c) journal_seq = le64_to_cpu(clean->journal_seq) + 1; } + if (!c->sb.clean && + !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { + bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); + ret = -EINVAL; + goto err; + } + + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; + ret = journal_replay_early(c, clean, &journal_entries); if (ret) goto err; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index ccd84a8fe60d..c91309301563 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -5,8 +5,6 @@ struct journal_keys { struct journal_key { enum btree_id btree_id:8; - unsigned allocated:1; - struct bpos pos; struct bkey_i *k; u32 journal_seq; u32 journal_offset; |
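
A note on the new updates2 list introduced above: bch2_trans_update2() keeps it sorted by iterator position and overwrites an entry in place when two updates land on the same position. Roughly, with a hypothetical flat entry type standing in for btree iterators (array_insert_item() becomes a plain memmove):

#include <string.h>

/* Hypothetical update entry, ordered by an integer position. */
struct update {
	int pos;
	int val;
};

/*
 * bch2_trans_update2()'s bookkeeping, reduced to a plain array: keep
 * pending updates sorted by position, replacing in place when an update
 * for the same position already exists.
 */
static void updates2_add(struct update *d, size_t *nr, struct update n)
{
	size_t i;

	for (i = 0; i < *nr; i++) {
		if (d[i].pos == n.pos) {	/* same position: overwrite */
			d[i] = n;
			return;
		}
		if (d[i].pos > n.pos)		/* found the insertion point */
			break;
	}

	/* Shift the tail up one slot and insert (cf. array_insert_item()): */
	memmove(&d[i + 1], &d[i], (*nr - i) * sizeof(n));
	d[i] = n;
	(*nr)++;
}
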
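The recovery.c simplification follows from the same invariant: journal extents no longer need to be cut against one another, so journal_keys_sort() can sort with oldest-first on equal positions and dedup in one last-wins pass. The pattern, sketched with a hypothetical key type rather than struct journal_key:

#include <stdlib.h>

/* Hypothetical journal key: btree position plus journal sequence number. */
struct jkey {
	unsigned pos;
	unsigned seq;
};

/* When positions compare equal, the oldest sorts first
 * (cf. journal_sort_key_cmp()): */
static int jkey_cmp(const void *_l, const void *_r)
{
	const struct jkey *l = _l, *r = _r;

	if (l->pos != r->pos)
		return l->pos < r->pos ? -1 : 1;
	return l->seq < r->seq ? -1 : l->seq > r->seq;
}

/*
 * One pass over the sorted array keeps only the newest key per position:
 * skip to the end of each run of equal positions, copy that entry down.
 */
static size_t dedup_keep_newest(struct jkey *d, size_t nr)
{
	struct jkey *src = d, *dst = d;

	while (src < d + nr) {
		while (src + 1 < d + nr && src[0].pos == src[1].pos)
			src++;
		*dst++ = *src++;
	}
	return dst - d;
}

int main(void)
{
	struct jkey keys[] = {
		{ 10, 1 }, { 20, 3 }, { 10, 2 }, { 20, 1 },
	};
	size_t nr = sizeof(keys) / sizeof(keys[0]);

	qsort(keys, nr, sizeof(keys[0]), jkey_cmp);
	nr = dedup_keep_newest(keys, nr);	/* leaves {10,2} and {20,3} */
	return 0;
}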