diff options
39 files changed, 469 insertions, 309 deletions
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e11989a57ca0..47455a85c909 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -501,7 +501,7 @@ found: prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); bch2_bkey_val_to_text(&buf, c, extent2); - struct nonce nonce = extent_nonce(extent.k->version, p.crc); + struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), trans, dup_backpointer_to_bad_csum_extent, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c711d4c27a03..f4151ee51b03 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -594,6 +594,7 @@ struct bch_dev { #define BCH_FS_FLAGS() \ x(new_fs) \ x(started) \ + x(clean_recovery) \ x(btree_running) \ x(accounting_replay_done) \ x(may_go_rw) \ @@ -776,7 +777,7 @@ struct bch_fs { unsigned nsec_per_time_unit; u64 features; u64 compat; - unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; + unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; u64 btrees_lost_data; } sb; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 8c4addddd07e..203ee627cab5 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -217,7 +217,7 @@ struct bkey { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ __u8 pad[1]; - struct bversion version; + struct bversion bversion; __u32 size; /* extent size, in sectors */ struct bpos p; #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ @@ -328,8 +328,8 @@ enum bch_bkey_fields { bkey_format_field(OFFSET, p.offset), \ bkey_format_field(SNAPSHOT, p.snapshot), \ bkey_format_field(SIZE, size), \ - bkey_format_field(VERSION_HI, version.hi), \ - bkey_format_field(VERSION_LO, version.lo), \ + bkey_format_field(VERSION_HI, bversion.hi), \ + bkey_format_field(VERSION_LO, bversion.lo), \ }, \ }) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index e34cb2bf329c..41df24a53d97 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -214,9 +214,9 @@ static __always_inline int bversion_cmp(struct bversion l, struct bversion r) #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) #define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) -static __always_inline int bversion_zero(struct bversion v) +static __always_inline bool bversion_zero(struct bversion v) { - return !bversion_cmp(v, ZERO_VERSION); + return bversion_cmp(v, ZERO_VERSION) == 0; } #ifdef CONFIG_BCACHEFS_DEBUG @@ -554,8 +554,8 @@ static inline void bch2_bkey_pack_test(void) {} x(BKEY_FIELD_OFFSET, p.offset) \ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ x(BKEY_FIELD_SIZE, size) \ - x(BKEY_FIELD_VERSION_HI, version.hi) \ - x(BKEY_FIELD_VERSION_LO, version.lo) + x(BKEY_FIELD_VERSION_HI, bversion.hi) \ + x(BKEY_FIELD_VERSION_LO, bversion.lo) struct bkey_format_state { u64 field_min[BKEY_NR_FIELDS]; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 88d8958281e8..e7ac227ba7e8 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -289,7 +289,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) bch2_bpos_to_text(out, k->p); - prt_printf(out, " len %u ver %llu", k->size, k->version.lo); + prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo); } else { prt_printf(out, "(null)"); } diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 3df3dd2723a1..018fb72e32d3 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -70,7 +70,7 @@ bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) { return l->type == r->type && - !bversion_cmp(l->version, r->version) && + !bversion_cmp(l->bversion, r->bversion) && bpos_eq(l->p, bkey_start_pos(r)); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b5e0692f03c6..660d2fa02da2 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -513,6 +513,8 @@ int bch2_check_topology(struct bch_fs *c) struct bpos pulled_from_scan = POS_MIN; int ret = 0; + bch2_trans_srcu_unlock(trans); + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); bool reconstructed_root = false; @@ -599,15 +601,15 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (initial) { BUG_ON(bch2_journal_seq_verify && - k.k->version.lo > atomic64_read(&c->journal.seq)); + k.k->bversion.lo > atomic64_read(&c->journal.seq)); if (fsck_err_on(btree_id != BTREE_ID_accounting && - k.k->version.lo > atomic64_read(&c->key_version), + k.k->bversion.lo > atomic64_read(&c->key_version), trans, bkey_version_in_future, "key version number higher than recorded %llu\n %s", atomic64_read(&c->key_version), (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - atomic64_set(&c->key_version, k.k->version.lo); + atomic64_set(&c->key_version, k.k->bversion.lo); } if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cb48a9477514..1c1448b52207 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1195,6 +1195,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset(b, b->set, &b->data->keys); b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); + memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, + btree_buf_bytes(b) - + sizeof(struct btree_node) - + b->nr.live_u64s * sizeof(u64)); u64s = le16_to_cpu(sorted->keys.u64s); *sorted = *b->data; @@ -1219,7 +1223,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ret = bch2_bkey_val_validate(c, u.s_c, READ); if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && - !bversion_cmp(u.k->version, MAX_VERSION))) { + !bversion_cmp(u.k->bversion, MAX_VERSION))) { btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index b28c649c6838..1e694fedc5da 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -275,7 +275,7 @@ static int read_btree_nodes(struct find_btree_nodes *f) w->ca = ca; t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); - ret = IS_ERR_OR_NULL(t); + ret = PTR_ERR_OR_ZERO(t); if (ret) { percpu_ref_put(&ca->io_ref); closure_put(&cl); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 91884da4e30a..1a74a1a252ee 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -684,10 +684,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, !(flags & BCH_TRANS_COMMIT_no_journal_res)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) - i->k->k.version.lo = trans->journal_res.seq; + i->k->k.bversion.lo = trans->journal_res.seq; else if (bch2_inject_invalid_keys) trans_for_each_update(trans, i) - i->k->k.version = MAX_VERSION; + i->k->k.bversion = MAX_VERSION; } h = trans->hooks; @@ -700,27 +700,31 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct jset_entry *entry = trans->journal_entries; - if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { - percpu_down_read(&c->mark_lock); + percpu_down_read(&c->mark_lock); + + for (entry = trans->journal_entries; + entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && + entry->start->k.type == KEY_TYPE_accounting) { + BUG_ON(!trans->journal_res.ref); + + struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); - for (entry = trans->journal_entries; - entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - entry = vstruct_next(entry)) - if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) { - struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, + (u64 *) entry - (u64 *) trans->journal_entries); + BUG_ON(bversion_zero(a->k.bversion)); - a->k.version = journal_pos_to_bversion(&trans->journal_res, - (u64 *) entry - (u64 *) trans->journal_entries); - BUG_ON(bversion_zero(a->k.version)); - ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false); + if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { + ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal); if (ret) goto revert_fs_usage; } - percpu_up_read(&c->mark_lock); + } + percpu_up_read(&c->mark_lock); - /* XXX: we only want to run this if deltas are nonzero */ - bch2_trans_account_disk_usage_change(trans); - } + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); trans_for_each_update(trans, i) if (btree_node_type_has_atomic_triggers(i->bkey_type)) { @@ -735,6 +739,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } + trans_for_each_update(trans, i) { + enum bch_validate_flags invalid_flags = 0; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; + + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); + goto fatal_err; + } + btree_insert_entry_checks(trans, i); + } + + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) { + enum bch_validate_flags invalid_flags = 0; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; + + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, invalid_flags); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); + goto fatal_err; + } + } + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -798,7 +836,7 @@ revert_fs_usage: struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, false, false); + bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); bch2_accounting_neg(a); } percpu_up_read(&c->mark_lock); @@ -1019,40 +1057,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) goto out_reset; - trans_for_each_update(trans, i) { - enum bch_validate_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags); - if (unlikely(ret)){ - bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - return ret; - } - btree_insert_entry_checks(trans, i); - } - - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) { - enum bch_validate_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags); - if (unlikely(ret)) { - bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", - trans->fn); - return ret; - } - } - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); goto out_reset; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 60393e98084d..6a454f2fa005 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -220,7 +220,8 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t if (type && k.k->type != type) return ERR_PTR(-ENOENT); - mut = bch2_trans_kmalloc_nomemzero(trans, bytes); + /* extra padding for varint_decode_fast... */ + mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8); if (!IS_ERR(mut)) { bkey_reassemble(mut, k); diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 757b9884ef55..462b1a2fe1ad 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -639,7 +639,7 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_write_op_init(&m->op, c, io_opts); m->op.pos = bkey_start_pos(k.k); - m->op.version = k.k->version; + m->op.version = k.k->bversion; m->op.target = data_opts.target; m->op.write_point = wp; m->op.nr_replicas = 0; diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index e972e2bca546..9f3133e3e7e5 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -134,6 +134,10 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, void *end = &acc_k + 1; int ret = 0; + bkey_fsck_err_on(bversion_zero(k.k->bversion), + c, accounting_key_version_0, + "accounting key with version=0"); + switch (acc_k.type) { case BCH_DISK_ACCOUNTING_nr_inodes: end = field_end(acc_k, nr_inodes); @@ -291,7 +295,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun struct accounting_mem_entry n = { .pos = a.k->p, - .version = a.k->version, + .bversion = a.k->bversion, .nr_counters = bch2_accounting_counters(a.k), .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), sizeof(u64), GFP_KERNEL), @@ -319,11 +323,13 @@ err: return -BCH_ERR_ENOMEM_disk_accounting; } -int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) { struct bch_replicas_padded r; - if (accounting_to_replicas(&r.e, a.k->p) && + if (mode != BCH_ACCOUNTING_read && + accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) return -BCH_ERR_btree_insert_need_mark_replicas; @@ -566,7 +572,9 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false); + bch2_accounting_mem_mod_locked(trans, + bkey_i_to_s_c_accounting(&k_i.k), + BCH_ACCOUNTING_normal); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -589,30 +597,14 @@ fsck_err: static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; if (k.k->type != KEY_TYPE_accounting) return 0; percpu_down_read(&c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true); + int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), + BCH_ACCOUNTING_read); percpu_up_read(&c->mark_lock); - - if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && - ret == -BCH_ERR_btree_insert_need_mark_replicas) - ret = 0; - - struct disk_accounting_pos acc; - bpos_to_disk_accounting_pos(&acc, k.k->p); - - if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas, - trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n %s", - (bch2_accounting_key_to_text(&buf, &acc), - buf.buf))) - ret = bch2_accounting_update_sb_one(c, k.k->p); -fsck_err: - printbuf_exit(&buf); return ret; } @@ -624,6 +616,7 @@ int bch2_accounting_read(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; int ret = for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, @@ -647,7 +640,7 @@ int bch2_accounting_read(struct bch_fs *c) accounting_pos_cmp, &k.k->p); bool applied = idx < acc->k.nr && - bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0; + bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0; if (applied) continue; @@ -655,7 +648,7 @@ int bch2_accounting_read(struct bch_fs *c) if (i + 1 < &darray_top(*keys) && i[1].k->k.type == KEY_TYPE_accounting && !journal_key_cmp(i, i + 1)) { - BUG_ON(bversion_cmp(i[0].k->k.version, i[1].k->k.version) >= 0); + WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); i[1].journal_seq = i[0].journal_seq; @@ -674,6 +667,45 @@ int bch2_accounting_read(struct bch_fs *c) keys->gap = keys->nr = dst - keys->data; percpu_down_read(&c->mark_lock); + for (unsigned i = 0; i < acc->k.nr; i++) { + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + + if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters)) + continue; + + struct bch_replicas_padded r; + if (!accounting_to_replicas(&r.e, acc->k.data[i].pos)) + continue; + + /* + * If the replicas entry is invalid it'll get cleaned up by + * check_allocations: + */ + if (bch2_replicas_entry_validate(&r.e, c, &buf)) + continue; + + struct disk_accounting_pos k; + bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); + + if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), + trans, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n %s", + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &k), + buf.buf))) { + /* + * We're not RW yet and still single threaded, dropping + * and retaking lock is ok: + */ + percpu_up_read(&c->mark_lock); + ret = bch2_mark_replicas(c, &r.e); + if (ret) + goto fsck_err; + percpu_down_read(&c->mark_lock); + } + } + preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); @@ -709,8 +741,10 @@ int bch2_accounting_read(struct bch_fs *c) } } preempt_enable(); +fsck_err: percpu_up_read(&c->mark_lock); err: + printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f29fd0dd9581..4ea6c8a092bc 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -36,8 +36,8 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) dst->v.d[i] += src.v->d[i]; - if (bversion_cmp(dst->k.version, src.k->version) < 0) - dst->k.version = src.k->version; + if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) + dst->k.bversion = src.k->bversion; } static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, @@ -103,23 +103,35 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) return bpos_cmp(*l, *r); } -int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); +enum bch_accounting_mode { + BCH_ACCOUNTING_normal, + BCH_ACCOUNTING_gc, + BCH_ACCOUNTING_read, +}; + +int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); /* * Update in memory counters so they match the btree update we're doing; called * from transaction commit path */ -static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read) +static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, + struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) { struct bch_fs *c = trans->c; + struct bch_accounting_mem *acc = &c->accounting; struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, a.k->p); + bool gc = mode == BCH_ACCOUNTING_gc; + + EBUG_ON(gc && !acc->gc_running); if (acc_k.type == BCH_DISK_ACCOUNTING_inum) return 0; - if (!gc && !read) { + if (mode == BCH_ACCOUNTING_normal) { switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; @@ -140,14 +152,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru } } - struct bch_accounting_mem *acc = &c->accounting; unsigned idx; - EBUG_ON(gc && !acc->gc_running); - while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = bch2_accounting_mem_insert(c, a, gc); + int ret = bch2_accounting_mem_insert(c, a, mode); if (ret) return ret; } @@ -164,7 +173,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal); percpu_up_read(&trans->c->mark_lock); return ret; } diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h index 1687a45177a7..b1982131b206 100644 --- a/fs/bcachefs/disk_accounting_types.h +++ b/fs/bcachefs/disk_accounting_types.h @@ -6,7 +6,7 @@ struct accounting_mem_entry { struct bpos pos; - struct bversion version; + struct bversion bversion; unsigned nr_counters; u64 __percpu *v[2]; }; diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 95afa7bf2020..3a16b535b6c3 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -239,7 +239,19 @@ int __bch2_fsck_err(struct bch_fs *c, if (!c) c = trans->c; - WARN_ON(!trans && bch2_current_has_btree_trans(c)); + /* + * Ugly: if there's a transaction in the current task it has to be + * passed in to unlock if we prompt for user input. + * + * But, plumbing a transaction and transaction restarts into + * bkey_validate() is problematic. + * + * So: + * - make all bkey errors AUTOFIX, they're simple anyways (we just + * delete the key) + * - and we don't need to warn if we're not prompting + */ + WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c)); if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 2f1b86978f36..21ee7211b03e 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -184,7 +184,7 @@ do { \ ret = -BCH_ERR_fsck_delete_bkey; \ goto fsck_err; \ } \ - int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \ + int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX|FSCK_AUTOFIX,\ BCH_FSCK_ERR_##_err_type, \ _err_msg, ##__VA_ARGS__); \ if (_ret != -BCH_ERR_fsck_fix && \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9b3470a97546..0d8b782b63fb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -21,6 +21,49 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + if (d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum) + return 0; + return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; +} + +static void dirent_inode_mismatch_msg(struct printbuf *out, + struct bch_fs *c, + struct bkey_s_c_dirent dirent, + struct bch_inode_unpacked *inode) +{ + prt_str(out, "inode points to dirent that does not point back:"); + prt_newline(out); + bch2_bkey_val_to_text(out, c, dirent.s_c); + prt_newline(out); + bch2_inode_unpacked_to_text(out, inode); +} + +static int dirent_points_to_inode(struct bch_fs *c, + struct bkey_s_c_dirent dirent, + struct bch_inode_unpacked *inode) +{ + int ret = dirent_points_to_inode_nowarn(dirent, inode); + if (ret) { + struct printbuf buf = PRINTBUF; + dirent_inode_mismatch_msg(&buf, c, dirent, inode); + bch_warn(c, "%s", buf.buf); + printbuf_exit(&buf); + } + return ret; +} + /* * XXX: this is handling transaction restarts without returning * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: @@ -346,14 +389,17 @@ static int reattach_inode(struct btree_trans *trans, static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret; + if (!inode->bi_dir) + return 0; - d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c_dirent d = + bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, + SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, dirent); - ret = bkey_err(d) ?: + int ret = bkey_err(d) ?: + dirent_points_to_inode(c, d, inode) ?: __remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; @@ -371,7 +417,8 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume return ret; ret = remove_backpointer(trans, &inode); - bch_err_msg(c, ret, "removing dirent"); + if (!bch2_err_matches(ret, ENOENT)) + bch_err_msg(c, ret, "removing dirent"); if (ret) return ret; @@ -626,12 +673,12 @@ static int ref_visible2(struct bch_fs *c, struct inode_walker_entry { struct bch_inode_unpacked inode; u32 snapshot; - bool seen_this_pos; u64 count; }; struct inode_walker { bool first_this_inode; + bool have_inodes; bool recalculate_sums; struct bpos last_pos; @@ -669,6 +716,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, struct bkey_s_c k; int ret; + /* + * We no longer have inodes for w->last_pos; clear this to avoid + * screwing up check_i_sectors/check_subdir_count if we take a + * transaction restart here: + */ + w->have_inodes = false; w->recalculate_sums = false; w->inodes.nr = 0; @@ -686,6 +739,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return ret; w->first_this_inode = true; + w->have_inodes = true; return 0; } @@ -740,9 +794,6 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); if (ret) return ERR_PTR(ret); - } else if (bkey_cmp(w->last_pos, k.k->p)) { - darray_for_each(w->inodes, i) - i->seen_this_pos = false; } w->last_pos = k.k->p; @@ -896,21 +947,6 @@ static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); } -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static bool dirent_points_to_inode(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - return d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum; -} - static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { struct btree_iter iter; @@ -920,13 +956,14 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) return ret; } -static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, +static int check_inode_dirent_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, - u32 inode_snapshot, bool *write_inode) + bool *write_inode) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + u32 inode_snapshot = inode->bi_snapshot; struct btree_iter dirent_iter = {}; struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); int ret = bkey_err(d); @@ -936,13 +973,13 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i if (fsck_err_on(ret, trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", - (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || - fsck_err_on(!ret && !dirent_points_to_inode(d, inode), + (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || + fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), trans, inode_points_to_wrong_dirent, - "inode points to dirent that does not point back:\n%s", - (bch2_bkey_val_to_text(&buf, c, inode_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + "%s", + (printbuf_reset(&buf), + dirent_inode_mismatch_msg(&buf, c, d, inode), + buf.buf))) { /* * We just clear the backpointer fields for now. If we find a * dirent that points to this inode in check_dirents(), we'll @@ -963,7 +1000,7 @@ fsck_err: return ret; } -static bool bch2_inode_open(struct bch_fs *c, struct bpos p) +static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { subvol_inum inum = { .subvol = snapshot_t(c, p.snapshot)->subvol, @@ -972,7 +1009,7 @@ static bool bch2_inode_open(struct bch_fs *c, struct bpos p) /* snapshot tree corruption, can't safely delete */ if (!inum.subvol) { - bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot); + bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot); return true; } @@ -1045,30 +1082,44 @@ static int check_inode(struct btree_trans *trans, } if (u.bi_flags & BCH_INODE_unlinked) { - ret = check_inode_deleted_list(trans, k.k->p); - if (ret < 0) - return ret; + if (!test_bit(BCH_FS_started, &c->flags)) { + /* + * If we're not in online fsck, don't delete unlinked + * inodes, just make sure they're on the deleted list. + * + * They might be referred to by a logged operation - + * i.e. we might have crashed in the middle of a + * truncate on an unlinked but open file - so we want to + * let the delete_dead_inodes kill it after resuming + * logged ops. + */ + ret = check_inode_deleted_list(trans, k.k->p); + if (ret < 0) + return ret; - fsck_err_on(!ret, - trans, unlinked_inode_not_on_deleted_list, - "inode %llu:%u unlinked, but not on deleted list", - u.bi_inum, k.k->p.snapshot); - ret = 0; - } + fsck_err_on(!ret, + trans, unlinked_inode_not_on_deleted_list, + "inode %llu:%u unlinked, but not on deleted list", + u.bi_inum, k.k->p.snapshot); - if (u.bi_flags & BCH_INODE_unlinked && - !bch2_inode_open(c, k.k->p) && - (!c->sb.clean || - fsck_err(trans, inode_unlinked_but_clean, - "filesystem marked clean, but inode %llu unlinked", - u.bi_inum))) { - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); - bch_err_msg(c, ret, "in fsck deleting inode"); - return ret; + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1); + if (ret) + goto err; + } else { + if (fsck_err_on(bch2_inode_is_open(c, k.k->p), + trans, inode_unlinked_and_not_open, + "inode %llu%u unlinked and not open", + u.bi_inum, u.bi_snapshot)) { + ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck deleting inode"); + return ret; + } + } } + /* i_size_dirty is vestigal, since we now have logged ops for truncate * */ if (u.bi_flags & BCH_INODE_i_size_dirty && - (!c->sb.clean || + (!test_bit(BCH_FS_clean_recovery, &c->flags) || fsck_err(trans, inode_i_size_dirty_but_clean, "filesystem marked clean, but inode %llu has i_size dirty", u.bi_inum))) { @@ -1097,8 +1148,9 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */ if (u.bi_flags & BCH_INODE_i_sectors_dirty && - (!c->sb.clean || + (!test_bit(BCH_FS_clean_recovery, &c->flags) || fsck_err(trans, inode_i_sectors_dirty_but_clean, "filesystem marked clean, but inode %llu has i_sectors dirty", u.bi_inum))) { @@ -1126,7 +1178,7 @@ static int check_inode(struct btree_trans *trans, } if (u.bi_dir || u.bi_dir_offset) { - ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); + ret = check_inode_dirent_inode(trans, &u, &do_update); if (ret) goto err; } @@ -1555,10 +1607,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct inode_walker *inode, struct snapshots_seen *s, - struct extent_ends *extent_ends) + struct extent_ends *extent_ends, + struct disk_reservation *res) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; int ret = 0; @@ -1568,7 +1620,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - if (inode->last_pos.inode != k.k->p.inode) { + if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) { ret = check_i_sectors(trans, inode); if (ret) goto err; @@ -1578,12 +1630,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - i = walk_inode(trans, inode, k); - ret = PTR_ERR_OR_ZERO(i); + struct inode_walker_entry *extent_i = walk_inode(trans, inode, k); + ret = PTR_ERR_OR_ZERO(extent_i); if (ret) goto err; - ret = check_key_has_inode(trans, iter, inode, i, k); + ret = check_key_has_inode(trans, iter, inode, extent_i, k); if (ret) goto err; @@ -1592,24 +1644,19 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, &inode->recalculate_sums); if (ret) goto err; - } - /* - * Check inodes in reverse order, from oldest snapshots to newest, - * starting from the inode that matches this extent's snapshot. If we - * didn't have one, iterate over all inodes: - */ - if (!i) - i = &darray_last(inode->inodes); - - for (; - inode->inodes.data && i >= inode->inodes.data; - --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) - continue; + /* + * Check inodes in reverse order, from oldest snapshots to + * newest, starting from the inode that matches this extent's + * snapshot. If we didn't have one, iterate over all inodes: + */ + for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); + inode->inodes.data && i >= inode->inodes.data; + --i) { + if (i->snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + continue; - if (k.k->type != KEY_TYPE_whiteout) { if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), @@ -1629,13 +1676,25 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; iter->k.type = KEY_TYPE_whiteout; + break; } - - if (bkey_extent_is_allocation(k.k)) - i->count += k.k->size; } + } - i->seen_this_pos = true; + ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + if (bkey_extent_is_allocation(k.k)) { + for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); + inode->inodes.data && i >= inode->inodes.data; + --i) { + if (i->snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + continue; + + i->count += k.k->size; + } } if (k.k->type != KEY_TYPE_whiteout) { @@ -1666,13 +1725,11 @@ int bch2_check_extents(struct bch_fs *c) extent_ends_init(&extent_ends); int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_extents, + for_each_btree_key(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, - BCH_TRANS_COMMIT_no_enospc, ({ + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: + check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: check_extent_overbig(trans, &iter, k); })) ?: check_i_sectors_notnested(trans, &w)); @@ -1758,6 +1815,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; int ret = 0; if (inode_points_to_dirent(target, d)) @@ -1770,7 +1828,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, prt_printf(&buf, "\n "), bch2_inode_unpacked_to_text(&buf, target), buf.buf))) - goto out_noiter; + goto err; if (!target->bi_dir && !target->bi_dir_offset) { @@ -1779,7 +1837,6 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target, target_snapshot); } - struct btree_iter bp_iter = { NULL }; struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); ret = bkey_err(bp_dirent); @@ -1840,7 +1897,6 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); -out_noiter: printbuf_exit(&buf); bch_err_fn(c, ret); return ret; @@ -2075,7 +2131,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type == KEY_TYPE_whiteout) goto out; - if (dir->last_pos.inode != k.k->p.inode) { + if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { ret = check_subdir_count(trans, dir); if (ret) goto err; @@ -2137,11 +2193,15 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } - - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) - i->count++; } + + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + i->count++; out: err: fsck_err: @@ -2164,12 +2224,9 @@ int bch2_check_dirents(struct bch_fs *c) snapshots_seen_init(&s); int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, - k, - NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: check_subdir_count_notnested(trans, &dir)); @@ -2314,22 +2371,6 @@ static bool darray_u32_has(darray_u32 *d, u32 v) return false; } -/* - * We've checked that inode backpointers point to valid dirents; here, it's - * sufficient to check that the subvolume root has a dirent: - */ -static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s) -{ - struct bch_inode_unpacked inode; - int ret = bch2_inode_find_by_inum_trans(trans, - (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, - &inode); - if (ret) - return ret; - - return inode.bi_dir != 0; -} - static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; @@ -2348,14 +2389,24 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); - ret = subvol_has_dirent(trans, s); - if (ret < 0) + struct bch_inode_unpacked subvol_root; + ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &subvol_root); + if (ret) break; - if (fsck_err_on(!ret, + /* + * We've checked that inode backpointers point to valid dirents; + * here, it's sufficient to check that the subvolume root has a + * dirent: + */ + if (fsck_err_on(!subvol_root.bi_dir, trans, subvol_unreachable, "unreachable subvolume %s", (bch2_bkey_val_to_text(&buf, c, s.s_c), + prt_newline(&buf), + bch2_inode_unpacked_to_text(&buf, &subvol_root), buf.buf))) { ret = reattach_subvol(trans, s); break; @@ -2450,10 +2501,8 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino if (ret && !bch2_err_matches(ret, ENOENT)) break; - if (!ret && !dirent_points_to_inode(d, &inode)) { + if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) bch2_trans_iter_exit(trans, &dirent_iter); - ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; - } if (bch2_err_matches(ret, ENOENT)) { ret = 0; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 6ac0ff7e074b..753c208896c3 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -320,9 +320,11 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - if (likely(k.k->type == KEY_TYPE_inode_v3)) - return bch2_inode_unpack_v3(k, unpacked); - return bch2_inode_unpack_slowpath(k, unpacked); + unpacked->bi_snapshot = k.k->p.snapshot; + + return likely(k.k->type == KEY_TYPE_inode_v3) + ? bch2_inode_unpack_v3(k, unpacked) + : bch2_inode_unpack_slowpath(k, unpacked); } int bch2_inode_peek_nowarn(struct btree_trans *trans, @@ -557,7 +559,7 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "inum: %llu ", inode->bi_inum); + prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot); __bch2_inode_unpacked_to_text(out, inode); } @@ -1111,7 +1113,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, pos.offset, pos.snapshot)) goto delete; - if (c->sb.clean && + if (test_bit(BCH_FS_clean_recovery, &c->flags) && !fsck_err(trans, deleted_inode_but_clean, "filesystem marked as clean but have deleted inode %llu:%u", pos.offset, pos.snapshot)) { diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index f1fcb4c58039..695abd707cb6 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -69,6 +69,7 @@ typedef u64 u96; struct bch_inode_unpacked { u64 bi_inum; + u32 bi_snapshot; u64 bi_journal_seq; __le64 bi_hash_seed; u64 bi_size; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index b2f50e74bb76..e4fc17c548fd 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -517,7 +517,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if ((ret = bkey_err(k))) goto out; - if (bversion_cmp(k.k->version, rbio->version) || + if (bversion_cmp(k.k->bversion, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; @@ -1031,7 +1031,7 @@ get_bio: rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; - rbio->version = k.k->version; + rbio->version = k.k->bversion; rbio->promote = promote; INIT_WORK(&rbio->work, NULL); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index d3b5be7fd9bf..b5fe9e0dc155 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -697,7 +697,7 @@ static void init_append_extent(struct bch_write_op *op, e = bkey_extent_init(op->insert_keys.top); e->k.p = op->pos; e->k.size = crc.uncompressed_size; - e->k.version = version; + e->k.bversion = version; if (crc.csum_type || crc.compression_type || @@ -1544,7 +1544,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) id = bkey_inline_data_init(op->insert_keys.top); id->k.p = op->pos; - id->k.version = op->version; + id->k.bversion = op->version; id->k.size = sectors; iter = bio->bi_iter; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 30460bce04be..954f6a96e0f4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -605,7 +605,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, goto out; } - if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), + if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), c, version, jset, entry, journal_entry_data_usage_bad_size, "invalid journal entry usage: %s", err.buf)) { diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index f49fdca1d07d..6f4a4e1083c9 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -37,6 +37,14 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); struct bkey_buf sk; u32 restart_count = trans->restart_count; + struct printbuf buf = PRINTBUF; + int ret = 0; + + fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags), + trans, logged_op_but_clean, + "filesystem marked as clean but have logged op\n%s", + (bch2_bkey_val_to_text(&buf, c, k), + buf.buf)); if (!fn) return 0; @@ -47,8 +55,9 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, fn->resume(trans, sk.k); bch2_bkey_buf_exit(&sk, c); - - return trans_was_restarted(trans, restart_count); +fsck_err: + printbuf_exit(&buf); + return ret ?: trans_was_restarted(trans, restart_count); } int bch2_resume_logged_ops(struct bch_fs *c) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index be1e7ca4362f..6db72d3bad7d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -151,7 +151,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); /* Has this delta already been applied to the btree? */ - if (bversion_cmp(old.k->version, k->k->k.version) >= 0) { + if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) { ret = 0; goto out; } @@ -717,6 +717,8 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck) set_bit(BCH_FS_fsck_running, &c->flags); + if (c->sb.clean) + set_bit(BCH_FS_clean_recovery, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { @@ -862,6 +864,9 @@ use_clean: clear_bit(BCH_FS_fsck_running, &c->flags); + /* in case we don't run journal replay, i.e. norecovery mode */ + set_bit(BCH_FS_accounting_replay_done, &c->flags); + /* fsync if we fixed errors */ if (test_bit(BCH_FS_errors_fixed, &c->flags) && bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) { diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 8c7dee5983d2..50406ce0e4ef 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -50,7 +50,7 @@ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ + x(delete_dead_inodes, 32, PASS_ALWAYS) \ x(fix_reflink_p, 33, 0) \ x(set_fs_needs_rebalance, 34, 0) \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index e59c0abb4772..f457925fa362 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -367,7 +367,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_v->k.type = bkey_type_to_indirect(&orig->k); r_v->k.p = reflink_iter.pos; bch2_key_resize(&r_v->k, orig->k.size); - r_v->k.version = orig->k.version; + r_v->k.bversion = orig->k.bversion; set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 998c0bd06802..bcb3276747e0 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } -int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, - struct printbuf *err) +static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r, + struct bch_sb *sb, + struct printbuf *err) { if (!r->nr_devs) { prt_printf(err, "no devices in entry "); @@ -94,6 +94,16 @@ bad: return -BCH_ERR_invalid_replicas_entry; } +int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, + struct bch_fs *c, + struct printbuf *err) +{ + mutex_lock(&c->sb_lock); + int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err); + mutex_unlock(&c->sb_lock); + return ret; +} + void bch2_cpu_replicas_to_text(struct printbuf *out, struct bch_replicas_cpu *r) { @@ -676,7 +686,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); - int ret = bch2_replicas_entry_validate(e, sb, err); + int ret = bch2_replicas_entry_validate_locked(e, sb, err); if (ret) return ret; diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 622482559c3d..5aba2c1ce133 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -10,7 +10,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry_v1 *); int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, - struct bch_sb *, struct printbuf *); + struct bch_fs *, struct printbuf *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); static inline struct bch_replicas_entry_v1 * diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 025848a9c4c0..005275281804 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -167,6 +167,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) ret = bch2_sb_clean_validate_late(c, clean, READ); if (ret) { + kfree(clean); mutex_unlock(&c->sb_lock); return ERR_PTR(ret); } diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index c7e4cdd3f6a5..5102059a0f1d 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -312,8 +312,7 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, if (!first) prt_char(out, ','); first = false; - unsigned e = le16_to_cpu(i->errors[j]); - prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)"); + bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j])); } prt_newline(out); } @@ -353,7 +352,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c) for (unsigned i = 0; i < src->nr_errors; i++) dst->errors[i] = cpu_to_le16(src->errors[i]); - downgrade_table_extra(c, &table); + ret = downgrade_table_extra(c, &table); + if (ret) + goto out; if (!dst->recovery_passes[0] && !dst->recovery_passes[1] && @@ -399,7 +400,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { unsigned e = le16_to_cpu(i->errors[j]); - if (e < BCH_SB_ERR_MAX) + if (e < BCH_FSCK_ERR_MAX) __set_bit(e, c->sb.errors_silent); if (e < sizeof(ext->errors_silent) * 8) __set_bit_le64(e, ext->errors_silent); diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index c1270d790e43..013a96883b4e 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -7,12 +7,12 @@ const char * const bch2_sb_error_strs[] = { #define x(t, n, ...) [n] = #t, BCH_SB_ERRS() - NULL +#undef x }; -static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) +void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) { - if (id < BCH_SB_ERR_MAX) + if (id < BCH_FSCK_ERR_MAX) prt_str(out, bch2_sb_error_strs[id]); else prt_printf(out, "(unknown error %u)", id); diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h index 8889001e7db4..b2357b8e6107 100644 --- a/fs/bcachefs/sb-errors.h +++ b/fs/bcachefs/sb-errors.h @@ -6,6 +6,8 @@ extern const char * const bch2_sb_error_strs[]; +void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); + extern const struct bch_sb_field_ops bch_sb_field_ops_errors; void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index f0c14702f9e6..ed5dca5e1161 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -210,22 +210,23 @@ enum bch_fsck_flags { x(inode_snapshot_mismatch, 196, 0) \ x(inode_unlinked_but_clean, 197, 0) \ x(inode_unlinked_but_nlink_nonzero, 198, 0) \ + x(inode_unlinked_and_not_open, 281, 0) \ x(inode_checksum_type_invalid, 199, 0) \ x(inode_compression_type_invalid, 200, 0) \ x(inode_subvol_root_but_not_dir, 201, 0) \ - x(inode_i_size_dirty_but_clean, 202, 0) \ - x(inode_i_sectors_dirty_but_clean, 203, 0) \ - x(inode_i_sectors_wrong, 204, 0) \ - x(inode_dir_wrong_nlink, 205, 0) \ - x(inode_dir_multiple_links, 206, 0) \ - x(inode_multiple_links_but_nlink_0, 207, 0) \ - x(inode_wrong_backpointer, 208, 0) \ - x(inode_wrong_nlink, 209, 0) \ - x(inode_unreachable, 210, 0) \ - x(deleted_inode_but_clean, 211, 0) \ - x(deleted_inode_missing, 212, 0) \ - x(deleted_inode_is_dir, 213, 0) \ - x(deleted_inode_not_unlinked, 214, 0) \ + x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \ + x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \ + x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \ + x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \ + x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ + x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ + x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ + x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ + x(inode_unreachable, 210, FSCK_AUTOFIX) \ + x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ + x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ + x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ + x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ x(extent_overlapping, 215, 0) \ x(key_in_missing_inode, 216, 0) \ x(key_in_wrong_inode_type, 217, 0) \ @@ -255,7 +256,7 @@ enum bch_fsck_flags { x(dir_loop, 241, 0) \ x(hash_table_key_duplicate, 242, 0) \ x(hash_table_key_wrong_offset, 243, 0) \ - x(unlinked_inode_not_on_deleted_list, 244, 0) \ + x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ x(reflink_p_front_pad_bad, 245, 0) \ x(journal_entry_dup_same_device, 246, 0) \ x(inode_bi_subvol_missing, 247, 0) \ @@ -270,7 +271,7 @@ enum bch_fsck_flags { x(subvol_children_not_set, 256, 0) \ x(subvol_children_bad, 257, 0) \ x(subvol_loop, 258, 0) \ - x(subvol_unreachable, 259, 0) \ + x(subvol_unreachable, 259, FSCK_AUTOFIX) \ x(btree_node_bkey_bad_u64s, 260, 0) \ x(btree_node_topology_empty_interior_node, 261, 0) \ x(btree_ptr_v2_min_key_bad, 262, 0) \ @@ -282,8 +283,8 @@ enum bch_fsck_flags { x(btree_ptr_v2_written_0, 268, 0) \ x(subvol_snapshot_bad, 269, 0) \ x(subvol_inode_bad, 270, 0) \ - x(alloc_key_stripe_sectors_wrong, 271, 0) \ - x(accounting_mismatch, 272, 0) \ + x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ + x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ @@ -292,12 +293,14 @@ enum bch_fsck_flags { x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ + x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ + x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ + x(MAX, 284, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, BCH_SB_ERRS() #undef x - BCH_SB_ERR_MAX }; struct bch_sb_field_errors { diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 9cbd3c14c94f..617d07e53b20 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -169,11 +169,17 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, ret = -1 - SIX_LOCK_write; } } else if (type == SIX_LOCK_write && lock->readers) { - if (try) { + if (try) atomic_add(SIX_LOCK_HELD_write, &lock->state); - smp_mb__after_atomic(); - } + /* + * Make sure atomic_add happens before pcpu_read_count and + * six_set_bitmask in slow path happens before pcpu_read_count. + * + * Paired with the smp_mb() in read lock fast path (per-cpu mode) + * and the one before atomic_read in read unlock path. + */ + smp_mb(); ret = !pcpu_read_count(lock); if (try && !ret) { diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 8b18a9b483a4..1809442b00ee 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -469,6 +469,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) u32 id = snapshot_root; u32 subvol = 0, s; + rcu_read_lock(); while (id) { s = snapshot_t(c, id)->subvol; @@ -477,6 +478,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) id = bch2_snapshot_tree_next(c, id); } + rcu_read_unlock(); return subvol; } @@ -1782,6 +1784,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, new->k.p.snapshot = leaf_id; ret = bch2_trans_update(trans, &iter, new, 0); out: + bch2_set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index dbe834cb349f..6845dde1b339 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -92,34 +92,32 @@ static int check_subvol(struct btree_trans *trans, } struct bch_inode_unpacked inode; - struct btree_iter inode_iter = {}; - ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, + ret = bch2_inode_find_by_inum_nowarn_trans(trans, (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, - 0); - bch2_trans_iter_exit(trans, &inode_iter); - - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - - if (fsck_err_on(ret, - trans, subvol_to_missing_root, - "subvolume %llu points to missing subvolume root %llu:%u", - k.k->p.offset, le64_to_cpu(subvol.v->inode), - le32_to_cpu(subvol.v->snapshot))) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); - return ret ?: -BCH_ERR_transaction_restart_nested; - } - - if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, - trans, subvol_root_wrong_bi_subvol, - "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", - inode.bi_inum, inode_iter.k.p.snapshot, - inode.bi_subvol, subvol.k->p.offset)) { - inode.bi_subvol = subvol.k->p.offset; - ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); - if (ret) + &inode); + if (!ret) { + if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, + trans, subvol_root_wrong_bi_subvol, + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", + inode.bi_inum, inode.bi_snapshot, + inode.bi_subvol, subvol.k->p.offset)) { + inode.bi_subvol = subvol.k->p.offset; + ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); + if (ret) + goto err; + } + } else if (bch2_err_matches(ret, ENOENT)) { + if (fsck_err(trans, subvol_to_missing_root, + "subvolume %llu points to missing subvolume root %llu:%u", + k.k->p.offset, le64_to_cpu(subvol.v->inode), + le32_to_cpu(subvol.v->snapshot))) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + ret = ret ?: -BCH_ERR_transaction_restart_nested; goto err; + } + } else { + goto err; } if (!BCH_SUBVOLUME_SNAP(subvol.v)) { @@ -137,7 +135,7 @@ static int check_subvol(struct btree_trans *trans, "%s: snapshot tree %u not found", __func__, snapshot_tree); if (ret) - return ret; + goto err; if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, trans, subvol_not_master_and_not_snapshot, @@ -147,7 +145,7 @@ static int check_subvol(struct btree_trans *trans, bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); ret = PTR_ERR_OR_ZERO(s); if (ret) - return ret; + goto err; SET_BCH_SUBVOLUME_SNAP(&s->v, true); } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index d86d5dae54c9..ce7410d72089 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -799,8 +799,10 @@ retry: i < layout.sb_offset + layout.nr_superblocks; i++) { offset = le64_to_cpu(*i); - if (offset == opt_get(*opts, sb)) + if (offset == opt_get(*opts, sb)) { + ret = -BCH_ERR_invalid; continue; + } ret = read_one_super(sb, offset, &err); if (!ret) @@ -1188,7 +1190,8 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); prt_printf(out, "Errors to silently fix:\t"); - prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8); + prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, + min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8)); prt_newline(out); kfree(errors_silent); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 01b768c9b767..b2f209743afe 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -394,7 +394,7 @@ static int insert_test_extent(struct bch_fs *c, k.k_i.k.p.offset = end; k.k_i.k.p.snapshot = U32_MAX; k.k_i.k.size = end - start; - k.k_i.k.version.lo = test_version++; + k.k_i.k.bversion.lo = test_version++; ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); bch_err_fn(c, ret); |