diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Makefile | 2 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-core.h | 23 | ||||
-rw-r--r-- | drivers/md/dm-ima.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-io-rewind.c | 166 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-kcopyd.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 7 | ||||
-rw-r--r-- | drivers/md/dm-rq.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 318 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 7 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-verity-target.c | 7 | ||||
-rw-r--r-- | drivers/md/dm-writecache.c | 43 | ||||
-rw-r--r-- | drivers/md/dm-zone.c | 7 | ||||
-rw-r--r-- | drivers/md/dm.c | 462 | ||||
-rw-r--r-- | drivers/md/dm.h | 4 |
19 files changed, 676 insertions, 394 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 0454b0885b01..270f694850ec 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -5,7 +5,7 @@ dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \ - dm-rq.o + dm-rq.o dm-io-rewind.o dm-multipath-y += dm-path-selector.o dm-mpath.o dm-historical-service-time-y += dm-ps-historical-service-time.o dm-io-affinity-y += dm-ps-io-affinity.o diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 179ed5bf81a3..0905f2c1615e 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -131,7 +131,7 @@ void dm_cache_dump(struct dm_cache_metadata *cmd); * hints will be lost. * * The hints are indexed by the cblock, but many policies will not - * neccessarily have a fast way of accessing efficiently via cblock. So + * necessarily have a fast way of accessing efficiently via cblock. So * rather than querying the policy for each cblock, we let it walk its data * structures and fill in the hints in whatever order it wishes. */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 28c5de8eca4a..54a8d5c9a44e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2775,7 +2775,7 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, /* * The discard block size in the on disk metadata is not - * neccessarily the same as we're currently using. So we have to + * necessarily the same as we're currently using. So we have to * be careful to only set the discarded attribute if we know it * covers a complete block of the new size. */ diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index c954ff91870e..6c6bd24774f2 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -22,6 +22,8 @@ #define DM_RESERVED_MAX_IOS 1024 +struct dm_io; + struct dm_kobject_holder { struct kobject kobj; struct completion completion; @@ -91,6 +93,14 @@ struct mapped_device { spinlock_t deferred_lock; struct bio_list deferred; + /* + * requeue work context is needed for cloning one new bio + * to represent the dm_io to be requeued, since each + * dm_io may point to the original bio from FS. + */ + struct work_struct requeue_work; + struct dm_io *requeue_list; + void *interface_ptr; /* @@ -216,6 +226,13 @@ struct dm_table { #endif }; +static inline struct dm_target *dm_table_get_target(struct dm_table *t, + unsigned int index) +{ + BUG_ON(index >= t->num_targets); + return t->targets + index; +} + /* * One of these is allocated per clone bio. */ @@ -230,6 +247,9 @@ struct dm_target_io { sector_t old_sector; struct bio clone; }; +#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) +#define DM_IO_BIO_OFFSET \ + (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) /* * dm_target_io flags @@ -272,7 +292,6 @@ struct dm_io { atomic_t io_count; struct mapped_device *md; - struct bio *split_bio; /* The three fields represent mapped part of original bio */ struct bio *orig_bio; unsigned int sector_offset; /* offset to end of orig_bio */ @@ -300,6 +319,8 @@ static inline void dm_io_set_flag(struct dm_io *io, unsigned int bit) io->flags |= (1U << bit); } +void dm_io_rewind(struct dm_io *io, struct bio_set *bs); + static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index 1842d3a958ef..a1bd7cd52b1b 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -208,7 +208,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (!target_data_buf) goto error; - num_targets = dm_table_get_num_targets(table); + num_targets = table->num_targets; if (dm_ima_alloc_and_copy_device_data(table->md, &device_data_buf, num_targets, noio)) goto error; @@ -237,9 +237,6 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl for (i = 0; i < num_targets; i++) { struct dm_target *ti = dm_table_get_target(table, i); - if (!ti) - goto error; - last_target_measured = 0; /* diff --git a/drivers/md/dm-io-rewind.c b/drivers/md/dm-io-rewind.c new file mode 100644 index 000000000000..0db53ccb94ba --- /dev/null +++ b/drivers/md/dm-io-rewind.c @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2022 Red Hat, Inc. + */ + +#include <linux/bio.h> +#include <linux/blk-crypto.h> +#include <linux/blk-integrity.h> + +#include "dm-core.h" + +static inline bool dm_bvec_iter_rewind(const struct bio_vec *bv, + struct bvec_iter *iter, + unsigned int bytes) +{ + int idx; + + iter->bi_size += bytes; + if (bytes <= iter->bi_bvec_done) { + iter->bi_bvec_done -= bytes; + return true; + } + + bytes -= iter->bi_bvec_done; + idx = iter->bi_idx - 1; + + while (idx >= 0 && bytes && bytes > bv[idx].bv_len) { + bytes -= bv[idx].bv_len; + idx--; + } + + if (WARN_ONCE(idx < 0 && bytes, + "Attempted to rewind iter beyond bvec's boundaries\n")) { + iter->bi_size -= bytes; + iter->bi_bvec_done = 0; + iter->bi_idx = 0; + return false; + } + + iter->bi_idx = idx; + iter->bi_bvec_done = bv[idx].bv_len - bytes; + return true; +} + +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +/** + * dm_bio_integrity_rewind - Rewind integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes to rewind + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and rewind the + * integrity vector accordingly. + */ +static void dm_bio_integrity_rewind(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); + + bip->bip_iter.bi_sector -= bio_integrity_intervals(bi, bytes_done >> 9); + dm_bvec_iter_rewind(bip->bip_vec, &bip->bip_iter, bytes); +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +static inline void dm_bio_integrity_rewind(struct bio *bio, + unsigned int bytes_done) +{ + return; +} + +#endif + +#if defined(CONFIG_BLK_INLINE_ENCRYPTION) + +/* Decrements @dun by @dec, treating @dun as a multi-limb integer. */ +static void dm_bio_crypt_dun_decrement(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + unsigned int dec) +{ + int i; + + for (i = 0; dec && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { + u64 prev = dun[i]; + + dun[i] -= dec; + if (dun[i] > prev) + dec = 1; + else + dec = 0; + } +} + +static void dm_bio_crypt_rewind(struct bio *bio, unsigned int bytes) +{ + struct bio_crypt_ctx *bc = bio->bi_crypt_context; + + dm_bio_crypt_dun_decrement(bc->bc_dun, + bytes >> bc->bc_key->data_unit_size_bits); +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline void dm_bio_crypt_rewind(struct bio *bio, unsigned int bytes) +{ + return; +} + +#endif + +static inline void dm_bio_rewind_iter(const struct bio *bio, + struct bvec_iter *iter, unsigned int bytes) +{ + iter->bi_sector -= bytes >> 9; + + /* No advance means no rewind */ + if (bio_no_advance_iter(bio)) + iter->bi_size += bytes; + else + dm_bvec_iter_rewind(bio->bi_io_vec, iter, bytes); +} + +/** + * dm_bio_rewind - update ->bi_iter of @bio by rewinding @bytes. + * @bio: bio to rewind + * @bytes: how many bytes to rewind + * + * WARNING: + * Caller must ensure that @bio has a fixed end sector, to allow + * rewinding from end of bio and restoring its original position. + * Caller is also responsibile for restoring bio's size. + */ +static void dm_bio_rewind(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + dm_bio_integrity_rewind(bio, bytes); + + if (bio_has_crypt_ctx(bio)) + dm_bio_crypt_rewind(bio, bytes); + + dm_bio_rewind_iter(bio, &bio->bi_iter, bytes); +} + +void dm_io_rewind(struct dm_io *io, struct bio_set *bs) +{ + struct bio *orig = io->orig_bio; + struct bio *new_orig = bio_alloc_clone(orig->bi_bdev, orig, + GFP_NOIO, bs); + /* + * dm_bio_rewind can restore to previous position since the + * end sector is fixed for original bio, but we still need + * to restore bio's size manually (using io->sectors). + */ + dm_bio_rewind(new_orig, ((io->sector_offset << 9) - + orig->bi_iter.bi_size)); + bio_trim(new_orig, 0, io->sectors); + + bio_chain(new_orig, orig); + /* + * __bi_remaining was increased (by dm_split_and_process_bio), + * so must drop the one added in bio_chain. + */ + atomic_dec(&orig->__bi_remaining); + io->orig_bio = new_orig; +} diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 87310fceb0d8..98976aaa9db9 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -832,7 +832,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { if (get_disk_ro(disk)) param->flags |= DM_READONLY_FLAG; - param->target_count = dm_table_get_num_targets(table); + param->target_count = table->num_targets; } param->flags |= DM_ACTIVE_PRESENT_FLAG; @@ -845,7 +845,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) if (table) { if (!(dm_table_get_mode(table) & FMODE_WRITE)) param->flags |= DM_READONLY_FLAG; - param->target_count = dm_table_get_num_targets(table); + param->target_count = table->num_targets; } dm_put_live_table(md, srcu_idx); } @@ -1248,7 +1248,7 @@ static void retrieve_status(struct dm_table *table, type = STATUSTYPE_INFO; /* Get all the target info */ - num_targets = dm_table_get_num_targets(table); + num_targets = table->num_targets; for (i = 0; i < num_targets; i++) { struct dm_target *ti = dm_table_get_target(table, i); size_t l; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 9c8f3544e99d..4d3bbbea2e9a 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -219,7 +219,7 @@ static struct page_list *alloc_pl(gfp_t gfp) if (!pl) return NULL; - pl->page = alloc_page(gfp); + pl->page = alloc_page(gfp | __GFP_HIGHMEM); if (!pl->page) { kfree(pl); return NULL; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 66486b14ec33..1ec17c32867f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1369,7 +1369,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, } rs->md.bitmap_info.daemon_sleep = value; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET))) { - /* Userspace passes new data_offset after having extended the the data image LV */ + /* Userspace passes new data_offset after having extended the data image LV */ if (test_and_set_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) { rs->ti->error = "Only one data_offset argument pair allowed"; return -EINVAL; @@ -3097,6 +3097,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; ti->num_flush_bios = 1; + ti->needs_bio_set_dev = true; /* Restore any requested new layout for conversion decision */ rs_config_restore(rs, &rs_layout); @@ -3509,7 +3510,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, { struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; - struct r5conf *conf = mddev->private; + struct r5conf *conf = rs_is_raid456(rs) ? mddev->private : NULL; int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ @@ -3819,7 +3820,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); - for (i = 0; i < mddev->raid_disks; i++) { + for (i = 0; i < rs->raid_disks; i++) { r = &rs->dev[i].rdev; /* HM FIXME: enhance journal device recovery processing */ if (test_bit(Journal, &r->flags)) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index a83b98a8d2a9..4f49bbcce4f1 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -43,7 +43,6 @@ unsigned dm_get_reserved_rq_based_ios(void) return __dm_get_module_param(&reserved_rq_based_ios, RESERVED_REQUEST_BASED_IOS, DM_RESERVED_MAX_IOS); } -EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); static unsigned dm_get_blk_mq_nr_hw_queues(void) { diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 0d336b5ec571..d1c2f84d27e3 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2026,7 +2026,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) /* * Write to snapshot - higher level takes care of RW/RO * flags so we should only get this if we are - * writeable. + * writable. */ if (bio_data_dir(bio) == WRITE) { pe = __lookup_pending_exception(s, chunk); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index df904b7e95ce..332f96b58252 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -6,6 +6,7 @@ */ #include "dm-core.h" +#include "dm-rq.h" #include <linux/module.h> #include <linux/vmalloc.h> @@ -174,8 +175,6 @@ static void dm_table_destroy_crypto_profile(struct dm_table *t); void dm_table_destroy(struct dm_table *t) { - unsigned int i; - if (!t) return; @@ -184,13 +183,13 @@ void dm_table_destroy(struct dm_table *t) kvfree(t->index[t->depth - 2]); /* free the targets */ - for (i = 0; i < t->num_targets; i++) { - struct dm_target *tgt = t->targets + i; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); - if (tgt->type->dtr) - tgt->type->dtr(tgt); + if (ti->type->dtr) + ti->type->dtr(ti); - dm_put_target_type(tgt->type); + dm_put_target_type(ti->type); } kvfree(t->highs); @@ -450,14 +449,14 @@ EXPORT_SYMBOL(dm_put_device); /* * Checks to see if the target joins onto the end of the table. */ -static int adjoin(struct dm_table *table, struct dm_target *ti) +static int adjoin(struct dm_table *t, struct dm_target *ti) { struct dm_target *prev; - if (!table->num_targets) + if (!t->num_targets) return !ti->begin; - prev = &table->targets[table->num_targets - 1]; + prev = &t->targets[t->num_targets - 1]; return (ti->begin == (prev->begin + prev->len)); } @@ -564,8 +563,8 @@ int dm_split_args(int *argc, char ***argvp, char *input) * two or more targets, the size of each piece it gets split into must * be compatible with the logical_block_size of the target processing it. */ -static int validate_hardware_logical_block_alignment(struct dm_table *table, - struct queue_limits *limits) +static int validate_hardware_logical_block_alignment(struct dm_table *t, + struct queue_limits *limits) { /* * This function uses arithmetic modulo the logical_block_size @@ -587,13 +586,13 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, struct dm_target *ti; struct queue_limits ti_limits; - unsigned i; + unsigned int i; /* * Check each entry in the table in turn. */ - for (i = 0; i < dm_table_get_num_targets(table); i++) { - ti = dm_table_get_target(table, i); + for (i = 0; i < t->num_targets; i++) { + ti = dm_table_get_target(t, i); blk_set_stacking_limits(&ti_limits); @@ -621,7 +620,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, if (remaining) { DMWARN("%s: table line %u (start sect %llu len %llu) " "not aligned to h/w logical block size %u", - dm_device_name(table->md), i, + dm_device_name(t->md), i, (unsigned long long) ti->begin, (unsigned long long) ti->len, limits->logical_block_size); @@ -636,7 +635,7 @@ int dm_table_add_target(struct dm_table *t, const char *type, { int r = -EINVAL, argc; char **argv; - struct dm_target *tgt; + struct dm_target *ti; if (t->singleton) { DMERR("%s: target type %s must appear alone in table", @@ -646,87 +645,87 @@ int dm_table_add_target(struct dm_table *t, const char *type, BUG_ON(t->num_targets >= t->num_allocated); - tgt = t->targets + t->num_targets; - memset(tgt, 0, sizeof(*tgt)); + ti = t->targets + t->num_targets; + memset(ti, 0, sizeof(*ti)); if (!len) { DMERR("%s: zero-length target", dm_device_name(t->md)); return -EINVAL; } - tgt->type = dm_get_target_type(type); - if (!tgt->type) { + ti->type = dm_get_target_type(type); + if (!ti->type) { DMERR("%s: %s: unknown target type", dm_device_name(t->md), type); return -EINVAL; } - if (dm_target_needs_singleton(tgt->type)) { + if (dm_target_needs_singleton(ti->type)) { if (t->num_targets) { - tgt->error = "singleton target type must appear alone in table"; + ti->error = "singleton target type must appear alone in table"; goto bad; } t->singleton = true; } - if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) { - tgt->error = "target type may not be included in a read-only table"; + if (dm_target_always_writeable(ti->type) && !(t->mode & FMODE_WRITE)) { + ti->error = "target type may not be included in a read-only table"; goto bad; } if (t->immutable_target_type) { - if (t->immutable_target_type != tgt->type) { - tgt->error = "immutable target type cannot be mixed with other target types"; + if (t->immutable_target_type != ti->type) { + ti->error = "immutable target type cannot be mixed with other target types"; goto bad; } - } else if (dm_target_is_immutable(tgt->type)) { + } else if (dm_target_is_immutable(ti->type)) { if (t->num_targets) { - tgt->error = "immutable target type cannot be mixed with other target types"; + ti->error = "immutable target type cannot be mixed with other target types"; goto bad; } - t->immutable_target_type = tgt->type; + t->immutable_target_type = ti->type; } - if (dm_target_has_integrity(tgt->type)) + if (dm_target_has_integrity(ti->type)) t->integrity_added = 1; - tgt->table = t; - tgt->begin = start; - tgt->len = len; - tgt->error = "Unknown error"; + ti->table = t; + ti->begin = start; + ti->len = len; + ti->error = "Unknown error"; /* * Does this target adjoin the previous one ? */ - if (!adjoin(t, tgt)) { - tgt->error = "Gap in table"; + if (!adjoin(t, ti)) { + ti->error = "Gap in table"; goto bad; } r = dm_split_args(&argc, &argv, params); if (r) { - tgt->error = "couldn't split parameters"; + ti->error = "couldn't split parameters"; goto bad; } - r = tgt->type->ctr(tgt, argc, argv); + r = ti->type->ctr(ti, argc, argv); kfree(argv); if (r) goto bad; - t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + t->highs[t->num_targets++] = ti->begin + ti->len - 1; - if (!tgt->num_discard_bios && tgt->discards_supported) + if (!ti->num_discard_bios && ti->discards_supported) DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", dm_device_name(t->md), type); - if (tgt->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) + if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) static_branch_enable(&swap_bios_enabled); return 0; bad: - DMERR("%s: %s: %s (%pe)", dm_device_name(t->md), type, tgt->error, ERR_PTR(r)); - dm_put_target_type(tgt->type); + DMERR("%s: %s: %s (%pe)", dm_device_name(t->md), type, ti->error, ERR_PTR(r)); + dm_put_target_type(ti->type); return r; } @@ -825,14 +824,11 @@ static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_de } static bool dm_table_supports_dax(struct dm_table *t, - iterate_devices_callout_fn iterate_fn) + iterate_devices_callout_fn iterate_fn) { - struct dm_target *ti; - unsigned i; - /* Ensure that all targets support DAX. */ - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->type->direct_access) return false; @@ -860,9 +856,8 @@ static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, static int dm_table_determine_type(struct dm_table *t) { - unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - struct dm_target *tgt; + struct dm_target *ti; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -876,11 +871,11 @@ static int dm_table_determine_type(struct dm_table *t) goto verify_rq_based; } - for (i = 0; i < t->num_targets; i++) { - tgt = t->targets + i; - if (dm_target_hybrid(tgt)) + for (unsigned int i = 0; i < t->num_targets; i++) { + ti = dm_table_get_target(t, i); + if (dm_target_hybrid(ti)) hybrid = 1; - else if (dm_target_request_based(tgt)) + else if (dm_target_request_based(ti)) request_based = 1; else bio_based = 1; @@ -942,18 +937,18 @@ verify_rq_based: return 0; } - tgt = dm_table_get_immutable_target(t); - if (!tgt) { + ti = dm_table_get_immutable_target(t); + if (!ti) { DMERR("table load rejected: immutable target is required"); return -EINVAL; - } else if (tgt->max_io_len) { + } else if (ti->max_io_len) { DMERR("table load rejected: immutable target that splits IO is not supported"); return -EINVAL; } /* Non-request-stackable devices can't be used for request-based dm */ - if (!tgt->type->iterate_devices || - !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) { + if (!ti->type->iterate_devices || + !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } @@ -983,11 +978,9 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t) struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) { - struct dm_target *ti; - unsigned i; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); if (dm_target_is_wildcard(ti->type)) return ti; } @@ -1010,32 +1003,56 @@ static bool dm_table_supports_poll(struct dm_table *t); static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); - unsigned per_io_data_size = 0; - unsigned min_pool_size = 0; - struct dm_target *ti; - unsigned i; - bool poll_supported = false; + unsigned int per_io_data_size = 0, front_pad, io_front_pad; + unsigned int min_pool_size = 0, pool_size; + struct dm_md_mempools *pools; if (unlikely(type == DM_TYPE_NONE)) { DMWARN("no table type is set, can't allocate mempools"); return -EINVAL; } - if (__table_type_bio_based(type)) { - for (i = 0; i < t->num_targets; i++) { - ti = t->targets + i; - per_io_data_size = max(per_io_data_size, ti->per_io_data_size); - min_pool_size = max(min_pool_size, ti->num_flush_bios); - } - poll_supported = dm_table_supports_poll(t); + pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); + if (!pools) + return -ENOMEM; + + if (type == DM_TYPE_REQUEST_BASED) { + pool_size = dm_get_reserved_rq_based_ios(); + front_pad = offsetof(struct dm_rq_clone_bio_info, clone); + goto init_bs; } - t->mempools = dm_alloc_md_mempools(md, type, per_io_data_size, min_pool_size, - t->integrity_supported, poll_supported); - if (!t->mempools) - return -ENOMEM; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + per_io_data_size = max(per_io_data_size, ti->per_io_data_size); + min_pool_size = max(min_pool_size, ti->num_flush_bios); + } + pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); + front_pad = roundup(per_io_data_size, + __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; + + io_front_pad = roundup(per_io_data_size, + __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; + if (bioset_init(&pools->io_bs, pool_size, io_front_pad, + dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0)) + goto out_free_pools; + if (t->integrity_supported && + bioset_integrity_create(&pools->io_bs, pool_size)) + goto out_free_pools; +init_bs: + if (bioset_init(&pools->bs, pool_size, front_pad, 0)) + goto out_free_pools; + if (t->integrity_supported && + bioset_integrity_create(&pools->bs, pool_size)) + goto out_free_pools; + + t->mempools = pools; return 0; + +out_free_pools: + dm_free_md_mempools(pools); + return -ENOMEM; } static int setup_indexes(struct dm_table *t) @@ -1100,10 +1117,10 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) struct list_head *devices = dm_table_get_devices(t); struct dm_dev_internal *dd = NULL; struct gendisk *prev_disk = NULL, *template_disk = NULL; - unsigned i; - for (i = 0; i < dm_table_get_num_targets(t); i++) { + for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); + if (!dm_target_passes_integrity(ti->type)) goto no_integrity; } @@ -1217,18 +1234,19 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, struct dm_keyslot_evict_args args = { key }; struct dm_table *t; int srcu_idx; - int i; - struct dm_target *ti; t = dm_get_live_table(md, &srcu_idx); if (!t) return 0; - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + if (!ti->type->iterate_devices) continue; ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args); } + dm_put_live_table(md, srcu_idx); return args.err; } @@ -1277,7 +1295,6 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) { struct dm_crypto_profile *dmcp; struct blk_crypto_profile *profile; - struct dm_target *ti; unsigned int i; bool empty_profile = true; @@ -1293,8 +1310,8 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) memset(profile->modes_supported, 0xFF, sizeof(profile->modes_supported)); - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!dm_target_passes_crypto(ti->type)) { blk_crypto_intersect_capabilities(profile, NULL); @@ -1444,14 +1461,6 @@ inline sector_t dm_table_get_size(struct dm_table *t) } EXPORT_SYMBOL(dm_table_get_size); -struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) -{ - if (index >= t->num_targets) - return NULL; - - return t->targets + index; -} - /* * Search the btree for the correct target. * @@ -1512,11 +1521,8 @@ static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_any_dev_attr(struct dm_table *t, iterate_devices_callout_fn func, void *data) { - struct dm_target *ti; - unsigned int i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (ti->type->iterate_devices && ti->type->iterate_devices(ti, func, data)) @@ -1538,11 +1544,8 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_poll(struct dm_table *t) { - struct dm_target *ti; - unsigned i = 0; - - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->type->iterate_devices || ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) @@ -1558,18 +1561,15 @@ static bool dm_table_supports_poll(struct dm_table *t) * Returns false if the result is unknown because a target doesn't * support iterate_devices. */ -bool dm_table_has_no_data_devices(struct dm_table *table) +bool dm_table_has_no_data_devices(struct dm_table *t) { - struct dm_target *ti; - unsigned i, num_devices; - - for (i = 0; i < dm_table_get_num_targets(table); i++) { - ti = dm_table_get_target(table, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + unsigned num_devices = 0; if (!ti->type->iterate_devices) return false; - num_devices = 0; ti->type->iterate_devices(ti, count_device, &num_devices); if (num_devices) return false; @@ -1597,11 +1597,8 @@ static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_zoned_model(struct dm_table *t, enum blk_zoned_model zoned_model) { - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (dm_target_supports_zoned_hm(ti->type)) { if (!ti->type->iterate_devices || @@ -1632,16 +1629,16 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev * * zone sectors, if the destination device is a zoned block device, it shall * have the specified zone_sectors. */ -static int validate_hardware_zoned_model(struct dm_table *table, +static int validate_hardware_zoned_model(struct dm_table *t, enum blk_zoned_model zoned_model, unsigned int zone_sectors) { if (zoned_model == BLK_ZONED_NONE) return 0; - if (!dm_table_supports_zoned_model(table, zoned_model)) { + if (!dm_table_supports_zoned_model(t, zoned_model)) { DMERR("%s: zoned model is not consistent across all devices", - dm_device_name(table->md)); + dm_device_name(t->md)); return -EINVAL; } @@ -1649,9 +1646,9 @@ static int validate_hardware_zoned_model(struct dm_table *table, if (!zone_sectors || !is_power_of_2(zone_sectors)) return -EINVAL; - if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) { + if (dm_table_any_dev_attr(t, device_not_matches_zone_sectors, &zone_sectors)) { DMERR("%s: zone sectors is not consistent across all zoned devices", - dm_device_name(table->md)); + dm_device_name(t->md)); return -EINVAL; } @@ -1661,21 +1658,19 @@ static int validate_hardware_zoned_model(struct dm_table *table, /* * Establish the new table's queue_limits and validate them. */ -int dm_calculate_queue_limits(struct dm_table *table, +int dm_calculate_queue_limits(struct dm_table *t, struct queue_limits *limits) { - struct dm_target *ti; struct queue_limits ti_limits; - unsigned i; enum blk_zoned_model zoned_model = BLK_ZONED_NONE; unsigned int zone_sectors = 0; blk_set_stacking_limits(limits); - for (i = 0; i < dm_table_get_num_targets(table); i++) { - blk_set_stacking_limits(&ti_limits); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); - ti = dm_table_get_target(table, i); + blk_set_stacking_limits(&ti_limits); if (!ti->type->iterate_devices) goto combine_limits; @@ -1716,7 +1711,7 @@ combine_limits: DMWARN("%s: adding target device " "(start sect %llu len %llu) " "caused an alignment inconsistency", - dm_device_name(table->md), + dm_device_name(t->md), (unsigned long long) ti->begin, (unsigned long long) ti->len); } @@ -1736,10 +1731,10 @@ combine_limits: zoned_model = limits->zoned; zone_sectors = limits->chunk_sectors; } - if (validate_hardware_zoned_model(table, zoned_model, zone_sectors)) + if (validate_hardware_zoned_model(t, zoned_model, zone_sectors)) return -EINVAL; - return validate_hardware_logical_block_alignment(table, limits); + return validate_hardware_logical_block_alignment(t, limits); } /* @@ -1783,17 +1778,14 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) { - struct dm_target *ti; - unsigned i; - /* * Require at least one underlying device to support flushes. * t->devices includes internal dm devices such as mirror logs * so we need to use iterate_devices here, which targets * supporting flushes must provide. */ - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->num_flush_bios) continue; @@ -1847,11 +1839,8 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev * static bool dm_table_supports_write_zeroes(struct dm_table *t) { - struct dm_target *ti; - unsigned i = 0; - - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->num_write_zeroes_bios) return false; @@ -1874,11 +1863,8 @@ static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_nowait(struct dm_table *t) { - struct dm_target *ti; - unsigned i = 0; - - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!dm_target_supports_nowait(ti->type)) return false; @@ -1899,11 +1885,8 @@ static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, static bool dm_table_supports_discards(struct dm_table *t) { - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->num_discard_bios) return false; @@ -1931,11 +1914,8 @@ static int device_not_secure_erase_capable(struct dm_target *ti, static bool dm_table_supports_secure_erase(struct dm_table *t) { - struct dm_target *ti; - unsigned int i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->num_secure_erase_bios) return false; @@ -2065,11 +2045,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, return 0; } -unsigned int dm_table_get_num_targets(struct dm_table *t) -{ - return t->num_targets; -} - struct list_head *dm_table_get_devices(struct dm_table *t) { return &t->devices; @@ -2089,12 +2064,11 @@ enum suspend_mode { static void suspend_targets(struct dm_table *t, enum suspend_mode mode) { - int i = t->num_targets; - struct dm_target *ti = t->targets; - lockdep_assert_held(&t->md->suspend_lock); - while (i--) { + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + switch (mode) { case PRESUSPEND: if (ti->type->presuspend) @@ -2109,7 +2083,6 @@ static void suspend_targets(struct dm_table *t, enum suspend_mode mode) ti->type->postsuspend(ti); break; } - ti++; } } @@ -2139,12 +2112,13 @@ void dm_table_postsuspend_targets(struct dm_table *t) int dm_table_resume_targets(struct dm_table *t) { - int i, r = 0; + unsigned int i; + int r = 0; lockdep_assert_held(&t->md->suspend_lock); for (i = 0; i < t->num_targets; i++) { - struct dm_target *ti = t->targets + i; + struct dm_target *ti = dm_table_get_target(t, i); if (!ti->type->preresume) continue; @@ -2158,7 +2132,7 @@ int dm_table_resume_targets(struct dm_table *t) } for (i = 0; i < t->num_targets; i++) { - struct dm_target *ti = t->targets + i; + struct dm_target *ti = dm_table_get_target(t, i); if (ti->type->resume) ti->type->resume(ti); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 2db7030aba00..a27395c8621f 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -2045,10 +2045,13 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, dm_sm_threshold_fn fn, void *context) { - int r; + int r = -EINVAL; pmd_write_lock_in_core(pmd); - r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); + if (!pmd->fail_io) { + r = dm_sm_register_threshold_callback(pmd->metadata_sm, + threshold, fn, context); + } pmd_write_unlock(pmd); return r; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 84c083f76673..e76c96c760a9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3375,8 +3375,10 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) calc_metadata_threshold(pt), metadata_low_callback, pool); - if (r) + if (r) { + ti->error = "Error registering metadata threshold"; goto out_flags_changed; + } dm_pool_register_pre_commit_callback(pool->pmd, metadata_pre_commit_callback, pool); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index d6dbd47492a8..75b66dd67633 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -527,11 +527,10 @@ static int verity_verify_io(struct dm_verity_io *io) if (v->validated_blocks) set_bit(cur_block, v->validated_blocks); continue; - } - else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, - cur_block, NULL, &start) == 0) + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, + cur_block, NULL, &start) == 0) { continue; - else { + } else { if (bio->bi_status) { /* * Error correction failed; Just return error diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 2b994b3e22a7..1fc161d65673 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -22,7 +22,7 @@ #define HIGH_WATERMARK 50 #define LOW_WATERMARK 45 -#define MAX_WRITEBACK_JOBS 0 +#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16) #define ENDIO_LATENCY 16 #define WRITEBACK_LATENCY 64 #define AUTOCOMMIT_BLOCKS_SSD 65536 @@ -1325,8 +1325,8 @@ enum wc_map_op { WC_MAP_ERROR, }; -static enum wc_map_op writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio, - struct wc_entry *e) +static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio, + struct wc_entry *e) { if (e) { sector_t next_boundary = @@ -1334,8 +1334,6 @@ static enum wc_map_op writecache_map_remap_origin(struct dm_writecache *wc, stru if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) dm_accept_partial_bio(bio, next_boundary); } - - return WC_MAP_REMAP_ORIGIN; } static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio) @@ -1362,14 +1360,16 @@ read_next_block: map_op = WC_MAP_REMAP; } } else { - map_op = writecache_map_remap_origin(wc, bio, e); + writecache_map_remap_origin(wc, bio, e); + wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; + map_op = WC_MAP_REMAP_ORIGIN; } return map_op; } -static enum wc_map_op writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio, - struct wc_entry *e, bool search_used) +static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio, + struct wc_entry *e, bool search_used) { unsigned bio_size = wc->block_size; sector_t start_cache_sec = cache_sector(wc, e); @@ -1409,14 +1409,15 @@ static enum wc_map_op writecache_bio_copy_ssd(struct dm_writecache *wc, struct b bio->bi_iter.bi_sector = start_cache_sec; dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); + wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; + wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; + if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { wc->uncommitted_blocks = 0; queue_work(wc->writeback_wq, &wc->flush_work); } else { writecache_schedule_autocommit(wc); } - - return WC_MAP_REMAP; } static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio) @@ -1426,9 +1427,10 @@ static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio do { bool found_entry = false; bool search_used = false; - wc->stats.writes++; - if (writecache_has_error(wc)) + if (writecache_has_error(wc)) { + wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; return WC_MAP_ERROR; + } e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); if (e) { if (!writecache_entry_is_committed(wc, e)) { @@ -1452,9 +1454,11 @@ static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio if (unlikely(!e)) { if (!WC_MODE_PMEM(wc) && !found_entry) { direct_write: - wc->stats.writes_around++; e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); - return writecache_map_remap_origin(wc, bio, e); + writecache_map_remap_origin(wc, bio, e); + wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits; + wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; + return WC_MAP_REMAP_ORIGIN; } wc->stats.writes_blocked_on_freelist++; writecache_wait_on_freelist(wc); @@ -1465,10 +1469,13 @@ direct_write: wc->uncommitted_blocks++; wc->stats.writes_allocate++; bio_copy: - if (WC_MODE_PMEM(wc)) + if (WC_MODE_PMEM(wc)) { bio_copy_block(wc, bio, memory_data(wc, e)); - else - return writecache_bio_copy_ssd(wc, bio, e, search_used); + wc->stats.writes++; + } else { + writecache_bio_copy_ssd(wc, bio, e, search_used); + return WC_MAP_REMAP; + } } while (bio->bi_iter.bi_size); if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks)) @@ -1503,7 +1510,7 @@ static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio) { - wc->stats.discards++; + wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits; if (writecache_has_error(wc)) return WC_MAP_ERROR; diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 4d10f302c62e..3dafc0e8b7a9 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -273,11 +273,8 @@ static int device_not_zone_append_capable(struct dm_target *ti, static bool dm_table_supports_zone_append(struct dm_table *t) { - struct dm_target *ti; - unsigned int i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); if (ti->emulate_zone_append) return false; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 54c2a23f4e55..99642f69bfa7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -88,10 +88,6 @@ struct clone_info { bool submit_as_polled:1; }; -#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) -#define DM_IO_BIO_OFFSET \ - (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) - static inline struct dm_target_io *clone_to_tio(struct bio *clone) { return container_of(clone, struct dm_target_io, clone); @@ -415,7 +411,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, struct block_device **bdev) { - struct dm_target *tgt; + struct dm_target *ti; struct dm_table *map; int r; @@ -426,17 +422,17 @@ retry: return r; /* We only support devices that have a single target */ - if (dm_table_get_num_targets(map) != 1) + if (map->num_targets != 1) return r; - tgt = dm_table_get_target(map, 0); - if (!tgt->type->prepare_ioctl) + ti = dm_table_get_target(map, 0); + if (!ti->type->prepare_ioctl) return r; if (dm_suspended_md(md)) return -EAGAIN; - r = tgt->type->prepare_ioctl(tgt, bdev); + r = ti->type->prepare_ioctl(ti, bdev); if (r == -ENOTCONN && !fatal_signal_pending(current)) { dm_put_live_table(md, *srcu_idx); msleep(10); @@ -578,9 +574,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct bio *clone; clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs); - /* Set default bdev, but target must bio_set_dev() before issuing IO */ - clone->bi_bdev = md->disk->part0; - tio = clone_to_tio(clone); tio->flags = 0; dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); @@ -594,7 +587,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) atomic_set(&io->io_count, 2); this_cpu_inc(*md->pending_io); io->orig_bio = bio; - io->split_bio = NULL; io->md = md; spin_lock_init(&io->lock); io->start_time = jiffies; @@ -614,6 +606,7 @@ static void free_io(struct dm_io *io) static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) { + struct mapped_device *md = ci->io->md; struct dm_target_io *tio; struct bio *clone; @@ -623,14 +616,10 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* alloc_io() already initialized embedded clone */ clone = &tio->clone; } else { - struct mapped_device *md = ci->io->md; - clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->mempools->bs); if (!clone) return NULL; - /* Set default bdev, but target must bio_set_dev() before issuing IO */ - clone->bi_bdev = md->disk->part0; /* REQ_DM_POLL_LIST shouldn't be inherited */ clone->bi_opf &= ~REQ_DM_POLL_LIST; @@ -646,6 +635,11 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, tio->len_ptr = len; tio->old_sector = 0; + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; + if (unlikely(ti->needs_bio_set_dev)) + bio_set_dev(clone, md->disk->part0); + if (len) { clone->bi_iter.bi_size = to_bytes(*len); if (bio_integrity(clone)) @@ -884,22 +878,63 @@ static int __noflush_suspending(struct mapped_device *md) return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); } -static void dm_io_complete(struct dm_io *io) +static void dm_requeue_add_io(struct dm_io *io, bool first_stage) { - blk_status_t io_error; struct mapped_device *md = io->md; - struct bio *bio = io->split_bio ? io->split_bio : io->orig_bio; - if (io->status == BLK_STS_DM_REQUEUE) { + if (first_stage) { + struct dm_io *next = md->requeue_list; + + md->requeue_list = io; + io->next = next; + } else { + bio_list_add_head(&md->deferred, io->orig_bio); + } +} + +static void dm_kick_requeue(struct mapped_device *md, bool first_stage) +{ + if (first_stage) + queue_work(md->wq, &md->requeue_work); + else + queue_work(md->wq, &md->work); +} + +/* + * Return true if the dm_io's original bio is requeued. + * io->status is updated with error if requeue disallowed. + */ +static bool dm_handle_requeue(struct dm_io *io, bool first_stage) +{ + struct bio *bio = io->orig_bio; + bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE); + bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) && + (bio->bi_opf & REQ_POLLED)); + struct mapped_device *md = io->md; + bool requeued = false; + + if (handle_requeue || handle_polled_eagain) { unsigned long flags; + + if (bio->bi_opf & REQ_POLLED) { + /* + * Upper layer won't help us poll split bio + * (io->orig_bio may only reflect a subset of the + * pre-split original) so clear REQ_POLLED. + */ + bio_clear_polled(bio); + } + /* - * Target requested pushing back the I/O. + * Target requested pushing back the I/O or + * polled IO hit BLK_STS_AGAIN. */ spin_lock_irqsave(&md->deferred_lock, flags); - if (__noflush_suspending(md) && - !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { - /* NOTE early return due to BLK_STS_DM_REQUEUE below */ - bio_list_add_head(&md->deferred, bio); + if ((__noflush_suspending(md) && + !WARN_ON_ONCE(dm_is_zone_write(md, bio))) || + handle_polled_eagain || first_stage) { + dm_requeue_add_io(io, first_stage); + requeued = true; } else { /* * noflush suspend was interrupted or this is @@ -910,6 +945,23 @@ static void dm_io_complete(struct dm_io *io) spin_unlock_irqrestore(&md->deferred_lock, flags); } + if (requeued) + dm_kick_requeue(md, first_stage); + + return requeued; +} + +static void __dm_io_complete(struct dm_io *io, bool first_stage) +{ + struct bio *bio = io->orig_bio; + struct mapped_device *md = io->md; + blk_status_t io_error; + bool requeued; + + requeued = dm_handle_requeue(io, first_stage); + if (requeued && first_stage) + return; + io_error = io->status; if (dm_io_flagged(io, DM_IO_ACCOUNTED)) dm_end_io_acct(io); @@ -929,23 +981,9 @@ static void dm_io_complete(struct dm_io *io) if (unlikely(wq_has_sleeper(&md->wait))) wake_up(&md->wait); - if (io_error == BLK_STS_DM_REQUEUE || io_error == BLK_STS_AGAIN) { - if (bio->bi_opf & REQ_POLLED) { - /* - * Upper layer won't help us poll split bio (io->orig_bio - * may only reflect a subset of the pre-split original) - * so clear REQ_POLLED in case of requeue. - */ - bio_clear_polled(bio); - if (io_error == BLK_STS_AGAIN) { - /* io_uring doesn't handle BLK_STS_AGAIN (yet) */ - queue_io(md, bio); - return; - } - } - if (io_error == BLK_STS_DM_REQUEUE) - return; - } + /* Return early if the original bio was requeued */ + if (requeued) + return; if (bio_is_flush_with_data(bio)) { /* @@ -962,6 +1000,58 @@ static void dm_io_complete(struct dm_io *io) } } +static void dm_wq_requeue_work(struct work_struct *work) +{ + struct mapped_device *md = container_of(work, struct mapped_device, + requeue_work); + unsigned long flags; + struct dm_io *io; + + /* reuse deferred lock to simplify dm_handle_requeue */ + spin_lock_irqsave(&md->deferred_lock, flags); + io = md->requeue_list; + md->requeue_list = NULL; + spin_unlock_irqrestore(&md->deferred_lock, flags); + + while (io) { + struct dm_io *next = io->next; + + dm_io_rewind(io, &md->queue->bio_split); + + io->next = NULL; + __dm_io_complete(io, false); + io = next; + } +} + +/* + * Two staged requeue: + * + * 1) io->orig_bio points to the real original bio, and the part mapped to + * this io must be requeued, instead of other parts of the original bio. + * + * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io. + */ +static void dm_io_complete(struct dm_io *io) +{ + bool first_requeue; + + /* + * Only dm_io that has been split needs two stage requeue, otherwise + * we may run into long bio clone chain during suspend and OOM could + * be triggered. + * + * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they + * also aren't handled via the first stage requeue. + */ + if (dm_io_flagged(io, DM_IO_WAS_SPLIT)) + first_requeue = true; + else + first_requeue = false; + + __dm_io_complete(io, first_requeue); +} + /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. @@ -1240,6 +1330,7 @@ out: void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = clone_to_tio(bio); + struct dm_io *io = tio->io; unsigned bio_sectors = bio_sectors(bio); BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); @@ -1255,8 +1346,9 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) * __split_and_process_bio() may have already saved mapped part * for accounting but it is being reduced so update accordingly. */ - dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT); - tio->io->sectors = n_sectors; + dm_io_set_flag(io, DM_IO_WAS_SPLIT); + io->sectors = n_sectors; + io->sector_offset = bio_sectors(io->orig_bio); } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); @@ -1379,17 +1471,7 @@ static void setup_split_accounting(struct clone_info *ci, unsigned len) */ dm_io_set_flag(io, DM_IO_WAS_SPLIT); io->sectors = len; - } - - if (static_branch_unlikely(&stats_enabled) && - unlikely(dm_stats_used(&io->md->stats))) { - /* - * Save bi_sector in terms of its offset from end of - * original bio, only needed for DM-stats' benefit. - * - saved regardless of whether split needed so that - * dm_accept_partial_bio() doesn't need to. - */ - io->sector_offset = bio_end_sector(ci->bio) - ci->sector; + io->sector_offset = bio_sectors(ci->bio); } } @@ -1423,11 +1505,11 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, unsigned *len) + unsigned int num_bios, unsigned *len) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; - int ret = 0; + unsigned int ret = 0; switch (num_bios) { case 0: @@ -1455,8 +1537,7 @@ static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, static void __send_empty_flush(struct clone_info *ci) { - unsigned target_nr = 0; - struct dm_target *ti; + struct dm_table *t = ci->map; struct bio flush_bio; /* @@ -1471,8 +1552,9 @@ static void __send_empty_flush(struct clone_info *ci) ci->sector_count = 0; ci->io->tio.clone.bi_iter.bi_size = 0; - while ((ti = dm_table_get_target(ci->map, target_nr++))) { - int bios; + for (unsigned int i = 0; i < t->num_targets; i++) { + unsigned int bios; + struct dm_target *ti = dm_table_get_target(t, i); atomic_add(ti->num_flush_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); @@ -1492,7 +1574,7 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target unsigned num_bios) { unsigned len; - int bios; + unsigned int bios; len = min_t(sector_t, ci->sector_count, max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); @@ -1691,11 +1773,9 @@ static void dm_split_and_process_bio(struct mapped_device *md, * Remainder must be passed to submit_bio_noacct() so it gets handled * *after* bios already submitted have been completely processed. */ - WARN_ON_ONCE(!dm_io_flagged(io, DM_IO_WAS_SPLIT)); - io->split_bio = bio_split(bio, io->sectors, GFP_NOIO, - &md->queue->bio_split); - bio_chain(io->split_bio, bio); - trace_block_split(io->split_bio, bio->bi_iter.bi_sector); + bio_trim(bio, io->sectors, ci.sector_count); + trace_block_split(bio, bio->bi_iter.bi_sector); + bio_inc_remaining(bio); submit_bio_noacct(bio); out: /* @@ -1971,9 +2051,11 @@ static struct mapped_device *alloc_dev(int minor) init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); + INIT_WORK(&md->requeue_work, dm_wq_requeue_work); init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); + md->requeue_list = NULL; md->swap_bios = get_swap_bios(); sema_init(&md->swap_bios_semaphore, md->swap_bios); mutex_init(&md->swap_bios_lock); @@ -2980,54 +3062,6 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned per_io_data_size, unsigned min_pool_size, - bool integrity, bool poll) -{ - struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); - unsigned int pool_size = 0; - unsigned int front_pad, io_front_pad; - int ret; - - if (!pools) - return NULL; - - switch (type) { - case DM_TYPE_BIO_BASED: - case DM_TYPE_DAX_BIO_BASED: - pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); - front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; - io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; - ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? BIOSET_PERCPU_CACHE : 0); - if (ret) - goto out; - if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) - goto out; - break; - case DM_TYPE_REQUEST_BASED: - pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); - front_pad = offsetof(struct dm_rq_clone_bio_info, clone); - /* per_io_data_size is used for blk-mq pdu at queue allocation */ - break; - default: - BUG(); - } - - ret = bioset_init(&pools->bs, pool_size, front_pad, 0); - if (ret) - goto out; - - if (integrity && bioset_integrity_create(&pools->bs, pool_size)) - goto out; - - return pools; - -out: - dm_free_md_mempools(pools); - - return NULL; -} - void dm_free_md_mempools(struct dm_md_mempools *pools) { if (!pools) @@ -3043,11 +3077,14 @@ struct dm_pr { u64 old_key; u64 new_key; u32 flags; + bool abort; bool fail_early; + int ret; + enum pr_type type; }; static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, - void *data) + struct dm_pr *pr) { struct mapped_device *md = bdev->bd_disk->private_data; struct dm_table *table; @@ -3059,15 +3096,21 @@ static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, goto out; /* We only support devices that have a single target */ - if (dm_table_get_num_targets(table) != 1) + if (table->num_targets != 1) goto out; ti = dm_table_get_target(table, 0); + if (dm_suspended_md(md)) { + ret = -EAGAIN; + goto out; + } + ret = -EINVAL; if (!ti->type->iterate_devices) goto out; - ret = ti->type->iterate_devices(ti, fn, data); + ti->type->iterate_devices(ti, fn, pr); + ret = 0; out: dm_put_live_table(md, srcu_idx); return ret; @@ -3081,10 +3124,24 @@ static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, { struct dm_pr *pr = data; const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + int ret; + + if (!ops || !ops->pr_register) { + pr->ret = -EOPNOTSUPP; + return -1; + } - if (!ops || !ops->pr_register) - return -EOPNOTSUPP; - return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); + ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); + if (!ret) + return 0; + + if (!pr->ret) + pr->ret = ret; + + if (pr->fail_early) + return -1; + + return 0; } static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, @@ -3095,82 +3152,145 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, .new_key = new_key, .flags = flags, .fail_early = true, + .ret = 0, }; int ret; ret = dm_call_pr(bdev, __dm_pr_register, &pr); - if (ret && new_key) { - /* unregister all paths if we failed to register any path */ - pr.old_key = new_key; - pr.new_key = 0; - pr.flags = 0; - pr.fail_early = false; - dm_call_pr(bdev, __dm_pr_register, &pr); + if (ret) { + /* Didn't even get to register a path */ + return ret; } + if (!pr.ret) + return 0; + ret = pr.ret; + + if (!new_key) + return ret; + + /* unregister all paths if we failed to register any path */ + pr.old_key = new_key; + pr.new_key = 0; + pr.flags = 0; + pr.fail_early = false; + (void) dm_call_pr(bdev, __dm_pr_register, &pr); return ret; } + +static int __dm_pr_reserve(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + + if (!ops || !ops->pr_reserve) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags); + if (!pr->ret) + return -1; + + return 0; +} + static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, u32 flags) { - struct mapped_device *md = bdev->bd_disk->private_data; - const struct pr_ops *ops; - int r, srcu_idx; + struct dm_pr pr = { + .old_key = key, + .flags = flags, + .type = type, + .fail_early = false, + .ret = 0, + }; + int ret; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) - goto out; + ret = dm_call_pr(bdev, __dm_pr_reserve, &pr); + if (ret) + return ret; - ops = bdev->bd_disk->fops->pr_ops; - if (ops && ops->pr_reserve) - r = ops->pr_reserve(bdev, key, type, flags); - else - r = -EOPNOTSUPP; -out: - dm_unprepare_ioctl(md, srcu_idx); - return r; + return pr.ret; +} + +/* + * If there is a non-All Registrants type of reservation, the release must be + * sent down the holding path. For the cases where there is no reservation or + * the path is not the holder the device will also return success, so we must + * try each path to make sure we got the correct path. + */ +static int __dm_pr_release(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + + if (!ops || !ops->pr_release) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type); + if (pr->ret) + return -1; + + return 0; } static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { - struct mapped_device *md = bdev->bd_disk->private_data; - const struct pr_ops *ops; - int r, srcu_idx; + struct dm_pr pr = { + .old_key = key, + .type = type, + .fail_early = false, + }; + int ret; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) - goto out; + ret = dm_call_pr(bdev, __dm_pr_release, &pr); + if (ret) + return ret; - ops = bdev->bd_disk->fops->pr_ops; - if (ops && ops->pr_release) - r = ops->pr_release(bdev, key, type); - else - r = -EOPNOTSUPP; -out: - dm_unprepare_ioctl(md, srcu_idx); - return r; + return pr.ret; +} + +static int __dm_pr_preempt(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_pr *pr = data; + const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; + + if (!ops || !ops->pr_preempt) { + pr->ret = -EOPNOTSUPP; + return -1; + } + + pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type, + pr->abort); + if (!pr->ret) + return -1; + + return 0; } static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, enum pr_type type, bool abort) { - struct mapped_device *md = bdev->bd_disk->private_data; - const struct pr_ops *ops; - int r, srcu_idx; + struct dm_pr pr = { + .new_key = new_key, + .old_key = old_key, + .type = type, + .fail_early = false, + }; + int ret; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) - goto out; + ret = dm_call_pr(bdev, __dm_pr_preempt, &pr); + if (ret) + return ret; - ops = bdev->bd_disk->fops->pr_ops; - if (ops && ops->pr_preempt) - r = ops->pr_preempt(bdev, old_key, new_key, type, abort); - else - r = -EOPNOTSUPP; -out: - dm_unprepare_ioctl(md, srcu_idx); - return r; + return pr.ret; } static int dm_pr_clear(struct block_device *bdev, u64 key) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a8405ce305a9..5201df03ce40 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -53,7 +53,6 @@ struct dm_io; *---------------------------------------------------------------*/ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); -struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); bool dm_table_has_no_data_devices(struct dm_table *table); int dm_calculate_queue_limits(struct dm_table *table, @@ -218,9 +217,6 @@ void dm_kcopyd_exit(void); /* * Mempool operations */ -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned per_io_data_size, unsigned min_pool_size, - bool integrity, bool poll); void dm_free_md_mempools(struct dm_md_mempools *pools); /* |