diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 9 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-bio-list.h | 26 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 91 | ||||
-rw-r--r-- | drivers/md/dm-delay.c | 383 | ||||
-rw-r--r-- | drivers/md/dm-exception-store.c | 54 | ||||
-rw-r--r-- | drivers/md/dm-hw-handler.h | 1 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 232 | ||||
-rw-r--r-- | drivers/md/dm-io.h | 83 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 77 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 187 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 10 | ||||
-rw-r--r-- | drivers/md/dm.c | 1 | ||||
-rw-r--r-- | drivers/md/kcopyd.c | 28 | ||||
-rw-r--r-- | drivers/md/md.c | 164 | ||||
-rw-r--r-- | drivers/md/raid1.c | 33 | ||||
-rw-r--r-- | drivers/md/raid5.c | 4 |
19 files changed, 1026 insertions, 369 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4540ade6b6b5..7df934d69134 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -262,6 +262,15 @@ config DM_MULTIPATH_EMC ---help--- Multipath support for EMC CX/AX series hardware. +config DM_DELAY + tristate "I/O delaying target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + A target that delays reads and/or writes and can send + them to different devices. Useful for testing. + + If unsure, say N. + endmenu endif diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 34957a68d921..38754084eac7 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o +obj-$(CONFIG_DM_DELAY) += dm-delay.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index e61e0efe9ec7..5a4a74c1097c 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1456,10 +1456,10 @@ int bitmap_create(mddev_t *mddev) bitmap->offset = mddev->bitmap_offset; if (file) { get_file(file); - do_sync_file_range(file, 0, LLONG_MAX, - SYNC_FILE_RANGE_WAIT_BEFORE | - SYNC_FILE_RANGE_WRITE | - SYNC_FILE_RANGE_WAIT_AFTER); + do_sync_mapping_range(file->f_mapping, 0, LLONG_MAX, + SYNC_FILE_RANGE_WAIT_BEFORE | + SYNC_FILE_RANGE_WRITE | + SYNC_FILE_RANGE_WAIT_AFTER); } /* read superblock from bitmap file (this sets bitmap->chunksize) */ err = bitmap_read_sb(bitmap); diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h index da4349649f7f..c6be88826fae 100644 --- a/drivers/md/dm-bio-list.h +++ b/drivers/md/dm-bio-list.h @@ -8,17 +8,43 @@ #define DM_BIO_LIST_H #include <linux/bio.h> +#include <linux/prefetch.h> struct bio_list { struct bio *head; struct bio *tail; }; +static inline int bio_list_empty(const struct bio_list *bl) +{ + return bl->head == NULL; +} + +#define BIO_LIST_INIT { .head = NULL, .tail = NULL } + +#define BIO_LIST(bl) \ + struct bio_list bl = BIO_LIST_INIT + static inline void bio_list_init(struct bio_list *bl) { bl->head = bl->tail = NULL; } +#define bio_list_for_each(bio, bl) \ + for (bio = (bl)->head; bio && ({ prefetch(bio->bi_next); 1; }); \ + bio = bio->bi_next) + +static inline unsigned bio_list_size(const struct bio_list *bl) +{ + unsigned sz = 0; + struct bio *bio; + + bio_list_for_each(bio, bl) + sz++; + + return sz; +} + static inline void bio_list_add(struct bio_list *bl, struct bio *bio) { bio->bi_next = NULL; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d8121234c347..7b0fcfc9eaa5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -33,7 +33,6 @@ struct crypt_io { struct dm_target *target; struct bio *base_bio; - struct bio *first_clone; struct work_struct work; atomic_t pending; int error; @@ -107,6 +106,8 @@ struct crypt_config { static struct kmem_cache *_crypt_io_pool; +static void clone_init(struct crypt_io *, struct bio *); + /* * Different IV generation algorithms: * @@ -120,6 +121,9 @@ static struct kmem_cache *_crypt_io_pool; * benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1 * (needed for LRW-32-AES and possible other narrow block modes) * + * null: the initial vector is always zero. Provides compatibility with + * obsolete loop_fish2 devices. Do not use for new devices. + * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ @@ -256,6 +260,13 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) return 0; } +static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +{ + memset(iv, 0, cc->iv_size); + + return 0; +} + static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -272,6 +283,10 @@ static struct crypt_iv_operations crypt_iv_benbi_ops = { .generator = crypt_iv_benbi_gen }; +static struct crypt_iv_operations crypt_iv_null_ops = { + .generator = crypt_iv_null_gen +}; + static int crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, struct scatterlist *in, unsigned int length, @@ -378,36 +393,21 @@ static int crypt_convert(struct crypt_config *cc, * This should never violate the device limitations * May return a smaller bio when running out of pages */ -static struct bio * -crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, - struct bio *base_bio, unsigned int *bio_vec_idx) +static struct bio *crypt_alloc_buffer(struct crypt_io *io, unsigned int size) { + struct crypt_config *cc = io->target->private; struct bio *clone; unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; unsigned int i; - if (base_bio) { - clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs); - __bio_clone(clone, base_bio); - } else - clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); - + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); if (!clone) return NULL; - clone->bi_destructor = dm_crypt_bio_destructor; - - /* if the last bio was not complete, continue where that one ended */ - clone->bi_idx = *bio_vec_idx; - clone->bi_vcnt = *bio_vec_idx; - clone->bi_size = 0; - clone->bi_flags &= ~(1 << BIO_SEG_VALID); - - /* clone->bi_idx pages have already been allocated */ - size -= clone->bi_idx * PAGE_SIZE; + clone_init(io, clone); - for (i = clone->bi_idx; i < nr_iovecs; i++) { + for (i = 0; i < nr_iovecs; i++) { struct bio_vec *bv = bio_iovec_idx(clone, i); bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask); @@ -419,7 +419,7 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, * return a partially allocated bio, the caller will then try * to allocate additional bios while submitting this partial bio */ - if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1)) + if (i == (MIN_BIO_PAGES - 1)) gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; bv->bv_offset = 0; @@ -438,12 +438,6 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, return NULL; } - /* - * Remember the last bio_vec allocated to be able - * to correctly continue after the splitting. - */ - *bio_vec_idx = clone->bi_vcnt; - return clone; } @@ -495,9 +489,6 @@ static void dec_pending(struct crypt_io *io, int error) if (!atomic_dec_and_test(&io->pending)) return; - if (io->first_clone) - bio_put(io->first_clone); - bio_endio(io->base_bio, io->base_bio->bi_size, io->error); mempool_free(io, cc->io_pool); @@ -562,6 +553,7 @@ static void clone_init(struct crypt_io *io, struct bio *clone) clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; clone->bi_rw = io->base_bio->bi_rw; + clone->bi_destructor = dm_crypt_bio_destructor; } static void process_read(struct crypt_io *io) @@ -585,7 +577,6 @@ static void process_read(struct crypt_io *io) } clone_init(io, clone); - clone->bi_destructor = dm_crypt_bio_destructor; clone->bi_idx = 0; clone->bi_vcnt = bio_segments(base_bio); clone->bi_size = base_bio->bi_size; @@ -604,7 +595,6 @@ static void process_write(struct crypt_io *io) struct convert_context ctx; unsigned remaining = base_bio->bi_size; sector_t sector = base_bio->bi_sector - io->target->begin; - unsigned bvec_idx = 0; atomic_inc(&io->pending); @@ -615,14 +605,14 @@ static void process_write(struct crypt_io *io) * so repeat the whole process until all the data can be handled. */ while (remaining) { - clone = crypt_alloc_buffer(cc, base_bio->bi_size, - io->first_clone, &bvec_idx); + clone = crypt_alloc_buffer(io, remaining); if (unlikely(!clone)) { dec_pending(io, -ENOMEM); return; } ctx.bio_out = clone; + ctx.idx_out = 0; if (unlikely(crypt_convert(cc, &ctx) < 0)) { crypt_free_buffer_pages(cc, clone, clone->bi_size); @@ -631,31 +621,26 @@ static void process_write(struct crypt_io *io) return; } - clone_init(io, clone); - clone->bi_sector = cc->start + sector; - - if (!io->first_clone) { - /* - * hold a reference to the first clone, because it - * holds the bio_vec array and that can't be freed - * before all other clones are released - */ - bio_get(clone); - io->first_clone = clone; - } + /* crypt_convert should have filled the clone bio */ + BUG_ON(ctx.idx_out < clone->bi_vcnt); + clone->bi_sector = cc->start + sector; remaining -= clone->bi_size; sector += bio_sectors(clone); - /* prevent bio_put of first_clone */ + /* Grab another reference to the io struct + * before we kick off the request */ if (remaining) atomic_inc(&io->pending); generic_make_request(clone); + /* Do not reference clone after this - it + * may be gone already. */ + /* out of memory -> run queues */ if (remaining) - congestion_wait(bio_data_dir(clone), HZ/100); + congestion_wait(WRITE, HZ/100); } } @@ -832,6 +817,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops = &crypt_iv_essiv_ops; else if (strcmp(ivmode, "benbi") == 0) cc->iv_gen_ops = &crypt_iv_benbi_ops; + else if (strcmp(ivmode, "null") == 0) + cc->iv_gen_ops = &crypt_iv_null_ops; else { ti->error = "Invalid IV mode"; goto bad2; @@ -954,10 +941,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, struct crypt_config *cc = ti->private; struct crypt_io *io; + if (bio_barrier(bio)) + return -EOPNOTSUPP; + io = mempool_alloc(cc->io_pool, GFP_NOIO); io->target = ti; io->base_bio = bio; - io->first_clone = NULL; io->error = io->post_process = 0; atomic_set(&io->pending, 0); kcryptd_queue_io(io); @@ -1057,7 +1046,7 @@ error: static struct target_type crypt_target = { .name = "crypt", - .version= {1, 3, 0}, + .version= {1, 5, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c new file mode 100644 index 000000000000..52c7cf9e5803 --- /dev/null +++ b/drivers/md/dm-delay.c @@ -0,0 +1,383 @@ +/* + * Copyright (C) 2005-2007 Red Hat GmbH + * + * A target that delays reads and/or writes and can send + * them to different devices. + * + * This file is released under the GPL. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> + +#include "dm.h" +#include "dm-bio-list.h" + +#define DM_MSG_PREFIX "delay" + +struct delay_c { + struct timer_list delay_timer; + struct semaphore timer_lock; + struct work_struct flush_expired_bios; + struct list_head delayed_bios; + atomic_t may_delay; + mempool_t *delayed_pool; + + struct dm_dev *dev_read; + sector_t start_read; + unsigned read_delay; + unsigned reads; + + struct dm_dev *dev_write; + sector_t start_write; + unsigned write_delay; + unsigned writes; +}; + +struct delay_info { + struct delay_c *context; + struct list_head list; + struct bio *bio; + unsigned long expires; +}; + +static DEFINE_MUTEX(delayed_bios_lock); + +static struct workqueue_struct *kdelayd_wq; +static struct kmem_cache *delayed_cache; + +static void handle_delayed_timer(unsigned long data) +{ + struct delay_c *dc = (struct delay_c *)data; + + queue_work(kdelayd_wq, &dc->flush_expired_bios); +} + +static void queue_timeout(struct delay_c *dc, unsigned long expires) +{ + down(&dc->timer_lock); + + if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires) + mod_timer(&dc->delay_timer, expires); + + up(&dc->timer_lock); +} + +static void flush_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = n; + } +} + +static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) +{ + struct delay_info *delayed, *next; + unsigned long next_expires = 0; + int start_timer = 0; + BIO_LIST(flush_bios); + + mutex_lock(&delayed_bios_lock); + list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { + if (flush_all || time_after_eq(jiffies, delayed->expires)) { + list_del(&delayed->list); + bio_list_add(&flush_bios, delayed->bio); + if ((bio_data_dir(delayed->bio) == WRITE)) + delayed->context->writes--; + else + delayed->context->reads--; + mempool_free(delayed, dc->delayed_pool); + continue; + } + + if (!start_timer) { + start_timer = 1; + next_expires = delayed->expires; + } else + next_expires = min(next_expires, delayed->expires); + } + + mutex_unlock(&delayed_bios_lock); + + if (start_timer) + queue_timeout(dc, next_expires); + + return bio_list_get(&flush_bios); +} + +static void flush_expired_bios(struct work_struct *work) +{ + struct delay_c *dc; + + dc = container_of(work, struct delay_c, flush_expired_bios); + flush_bios(flush_delayed_bios(dc, 0)); +} + +/* + * Mapping parameters: + * <device> <offset> <delay> [<write_device> <write_offset> <write_delay>] + * + * With separate write parameters, the first set is only used for reads. + * Delays are specified in milliseconds. + */ +static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct delay_c *dc; + unsigned long long tmpll; + + if (argc != 3 && argc != 6) { + ti->error = "requires exactly 3 or 6 arguments"; + return -EINVAL; + } + + dc = kmalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + + dc->reads = dc->writes = 0; + + if (sscanf(argv[1], "%llu", &tmpll) != 1) { + ti->error = "Invalid device sector"; + goto bad; + } + dc->start_read = tmpll; + + if (sscanf(argv[2], "%u", &dc->read_delay) != 1) { + ti->error = "Invalid delay"; + goto bad; + } + + if (dm_get_device(ti, argv[0], dc->start_read, ti->len, + dm_table_get_mode(ti->table), &dc->dev_read)) { + ti->error = "Device lookup failed"; + goto bad; + } + + if (argc == 3) { + dc->dev_write = NULL; + goto out; + } + + if (sscanf(argv[4], "%llu", &tmpll) != 1) { + ti->error = "Invalid write device sector"; + goto bad; + } + dc->start_write = tmpll; + + if (sscanf(argv[5], "%u", &dc->write_delay) != 1) { + ti->error = "Invalid write delay"; + goto bad; + } + + if (dm_get_device(ti, argv[3], dc->start_write, ti->len, + dm_table_get_mode(ti->table), &dc->dev_write)) { + ti->error = "Write device lookup failed"; + dm_put_device(ti, dc->dev_read); + goto bad; + } + +out: + dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache); + if (!dc->delayed_pool) { + DMERR("Couldn't create delayed bio pool."); + goto bad; + } + + init_timer(&dc->delay_timer); + dc->delay_timer.function = handle_delayed_timer; + dc->delay_timer.data = (unsigned long)dc; + + INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); + INIT_LIST_HEAD(&dc->delayed_bios); + init_MUTEX(&dc->timer_lock); + atomic_set(&dc->may_delay, 1); + + ti->private = dc; + return 0; + +bad: + kfree(dc); + return -EINVAL; +} + +static void delay_dtr(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + flush_workqueue(kdelayd_wq); + + dm_put_device(ti, dc->dev_read); + + if (dc->dev_write) + dm_put_device(ti, dc->dev_write); + + mempool_destroy(dc->delayed_pool); + kfree(dc); +} + +static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) +{ + struct delay_info *delayed; + unsigned long expires = 0; + + if (!delay || !atomic_read(&dc->may_delay)) + return 1; + + delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); + + delayed->context = dc; + delayed->bio = bio; + delayed->expires = expires = jiffies + (delay * HZ / 1000); + + mutex_lock(&delayed_bios_lock); + + if (bio_data_dir(bio) == WRITE) + dc->writes++; + else + dc->reads++; + + list_add_tail(&delayed->list, &dc->delayed_bios); + + mutex_unlock(&delayed_bios_lock); + + queue_timeout(dc, expires); + + return 0; +} + +static void delay_presuspend(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + atomic_set(&dc->may_delay, 0); + del_timer_sync(&dc->delay_timer); + flush_bios(flush_delayed_bios(dc, 1)); +} + +static void delay_resume(struct dm_target *ti) +{ + struct delay_c *dc = ti->private; + + atomic_set(&dc->may_delay, 1); +} + +static int delay_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct delay_c *dc = ti->private; + + if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { + bio->bi_bdev = dc->dev_write->bdev; + bio->bi_sector = dc->start_write + + (bio->bi_sector - ti->begin); + + return delay_bio(dc, dc->write_delay, bio); + } + + bio->bi_bdev = dc->dev_read->bdev; + bio->bi_sector = dc->start_read + + (bio->bi_sector - ti->begin); + + return delay_bio(dc, dc->read_delay, bio); +} + +static int delay_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + struct delay_c *dc = ti->private; + int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%u %u", dc->reads, dc->writes); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u", dc->dev_read->name, + (unsigned long long) dc->start_read, + dc->read_delay); + if (dc->dev_write) + DMEMIT("%s %llu %u", dc->dev_write->name, + (unsigned long long) dc->start_write, + dc->write_delay); + break; + } + + return 0; +} + +static struct target_type delay_target = { + .name = "delay", + .version = {1, 0, 2}, + .module = THIS_MODULE, + .ctr = delay_ctr, + .dtr = delay_dtr, + .map = delay_map, + .presuspend = delay_presuspend, + .resume = delay_resume, + .status = delay_status, +}; + +static int __init dm_delay_init(void) +{ + int r = -ENOMEM; + + kdelayd_wq = create_workqueue("kdelayd"); + if (!kdelayd_wq) { + DMERR("Couldn't start kdelayd"); + goto bad_queue; + } + + delayed_cache = kmem_cache_create("dm-delay", + sizeof(struct delay_info), + __alignof__(struct delay_info), + 0, NULL, NULL); + if (!delayed_cache) { + DMERR("Couldn't create delayed bio cache."); + goto bad_memcache; + } + + r = dm_register_target(&delay_target); + if (r < 0) { + DMERR("register failed %d", r); + goto bad_register; + } + + return 0; + +bad_register: + kmem_cache_destroy(delayed_cache); +bad_memcache: + destroy_workqueue(kdelayd_wq); +bad_queue: + return r; +} + +static void __exit dm_delay_exit(void) +{ + int r = dm_unregister_target(&delay_target); + + if (r < 0) + DMERR("unregister failed %d", r); + + kmem_cache_destroy(delayed_cache); + destroy_workqueue(kdelayd_wq); +} + +/* Module hooks */ +module_init(dm_delay_init); +module_exit(dm_delay_exit); + +MODULE_DESCRIPTION(DM_NAME " delay target"); +MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 99cdffa7fbfe..07e0a0c84f6e 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -1,7 +1,8 @@ /* - * dm-snapshot.c + * dm-exception-store.c * * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2006 Red Hat GmbH * * This file is released under the GPL. */ @@ -123,6 +124,7 @@ struct pstore { atomic_t pending_count; uint32_t callback_count; struct commit_callback *callbacks; + struct dm_io_client *io_client; }; static inline unsigned int sectors_to_pages(unsigned int sectors) @@ -159,14 +161,20 @@ static void free_area(struct pstore *ps) */ static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) { - struct io_region where; - unsigned long bits; - - where.bdev = ps->snap->cow->bdev; - where.sector = ps->snap->chunk_size * chunk; - where.count = ps->snap->chunk_size; - - return dm_io_sync_vm(1, &where, rw, ps->area, &bits); + struct io_region where = { + .bdev = ps->snap->cow->bdev, + .sector = ps->snap->chunk_size * chunk, + .count = ps->snap->chunk_size, + }; + struct dm_io_request io_req = { + .bi_rw = rw, + .mem.type = DM_IO_VMA, + .mem.ptr.vma = ps->area, + .client = ps->io_client, + .notify.fn = NULL, + }; + + return dm_io(&io_req, 1, &where, NULL); } /* @@ -213,17 +221,18 @@ static int read_header(struct pstore *ps, int *new_snapshot) chunk_size_supplied = 0; } - r = dm_io_get(sectors_to_pages(ps->snap->chunk_size)); - if (r) - return r; + ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap-> + chunk_size)); + if (IS_ERR(ps->io_client)) + return PTR_ERR(ps->io_client); r = alloc_area(ps); if (r) - goto bad1; + return r; r = chunk_io(ps, 0, READ); if (r) - goto bad2; + goto bad; dh = (struct disk_header *) ps->area; @@ -235,7 +244,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { DMWARN("Invalid or corrupt snapshot"); r = -ENXIO; - goto bad2; + goto bad; } *new_snapshot = 0; @@ -252,27 +261,22 @@ static int read_header(struct pstore *ps, int *new_snapshot) (unsigned long long)ps->snap->chunk_size); /* We had a bogus chunk_size. Fix stuff up. */ - dm_io_put(sectors_to_pages(ps->snap->chunk_size)); free_area(ps); ps->snap->chunk_size = chunk_size; ps->snap->chunk_mask = chunk_size - 1; ps->snap->chunk_shift = ffs(chunk_size) - 1; - r = dm_io_get(sectors_to_pages(chunk_size)); + r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size), + ps->io_client); if (r) return r; r = alloc_area(ps); - if (r) - goto bad1; - - return 0; + return r; -bad2: +bad: free_area(ps); -bad1: - dm_io_put(sectors_to_pages(ps->snap->chunk_size)); return r; } @@ -405,7 +409,7 @@ static void persistent_destroy(struct exception_store *store) { struct pstore *ps = get_info(store); - dm_io_put(sectors_to_pages(ps->snap->chunk_size)); + dm_io_client_destroy(ps->io_client); vfree(ps->callbacks); free_area(ps); kfree(ps); diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h index 32eff28e4adc..e0832e6fcf36 100644 --- a/drivers/md/dm-hw-handler.h +++ b/drivers/md/dm-hw-handler.h @@ -16,6 +16,7 @@ struct hw_handler_type; struct hw_handler { struct hw_handler_type *type; + struct mapped_device *md; void *context; }; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 8bdc8a87b249..352c6fbeac53 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2003 Sistina Software + * Copyright (C) 2006 Red Hat GmbH * * This file is released under the GPL. */ @@ -12,13 +13,17 @@ #include <linux/sched.h> #include <linux/slab.h> -static struct bio_set *_bios; +struct dm_io_client { + mempool_t *pool; + struct bio_set *bios; +}; /* FIXME: can we shrink this ? */ struct io { unsigned long error; atomic_t count; struct task_struct *sleeper; + struct dm_io_client *client; io_notify_fn callback; void *context; }; @@ -26,63 +31,58 @@ struct io { /* * io contexts are only dynamically allocated for asynchronous * io. Since async io is likely to be the majority of io we'll - * have the same number of io contexts as buffer heads ! (FIXME: - * must reduce this). + * have the same number of io contexts as bios! (FIXME: must reduce this). */ -static unsigned _num_ios; -static mempool_t *_io_pool; static unsigned int pages_to_ios(unsigned int pages) { return 4 * pages; /* too many ? */ } -static int resize_pool(unsigned int new_ios) +/* + * Create a client with mempool and bioset. + */ +struct dm_io_client *dm_io_client_create(unsigned num_pages) { - int r = 0; - - if (_io_pool) { - if (new_ios == 0) { - /* free off the pool */ - mempool_destroy(_io_pool); - _io_pool = NULL; - bioset_free(_bios); - - } else { - /* resize the pool */ - r = mempool_resize(_io_pool, new_ios, GFP_KERNEL); - } + unsigned ios = pages_to_ios(num_pages); + struct dm_io_client *client; - } else { - /* create new pool */ - _io_pool = mempool_create_kmalloc_pool(new_ios, - sizeof(struct io)); - if (!_io_pool) - return -ENOMEM; - - _bios = bioset_create(16, 16); - if (!_bios) { - mempool_destroy(_io_pool); - _io_pool = NULL; - return -ENOMEM; - } - } + client = kmalloc(sizeof(*client), GFP_KERNEL); + if (!client) + return ERR_PTR(-ENOMEM); + + client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); + if (!client->pool) + goto bad; - if (!r) - _num_ios = new_ios; + client->bios = bioset_create(16, 16); + if (!client->bios) + goto bad; - return r; + return client; + + bad: + if (client->pool) + mempool_destroy(client->pool); + kfree(client); + return ERR_PTR(-ENOMEM); } +EXPORT_SYMBOL(dm_io_client_create); -int dm_io_get(unsigned int num_pages) +int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client) { - return resize_pool(_num_ios + pages_to_ios(num_pages)); + return mempool_resize(client->pool, pages_to_ios(num_pages), + GFP_KERNEL); } +EXPORT_SYMBOL(dm_io_client_resize); -void dm_io_put(unsigned int num_pages) +void dm_io_client_destroy(struct dm_io_client *client) { - resize_pool(_num_ios - pages_to_ios(num_pages)); + mempool_destroy(client->pool); + bioset_free(client->bios); + kfree(client); } +EXPORT_SYMBOL(dm_io_client_destroy); /*----------------------------------------------------------------- * We need to keep track of which region a bio is doing io for. @@ -118,7 +118,7 @@ static void dec_count(struct io *io, unsigned int region, int error) io_notify_fn fn = io->callback; void *context = io->context; - mempool_free(io, _io_pool); + mempool_free(io, io->client->pool); fn(r, context); } } @@ -126,7 +126,8 @@ static void dec_count(struct io *io, unsigned int region, int error) static int endio(struct bio *bio, unsigned int done, int error) { - struct io *io = (struct io *) bio->bi_private; + struct io *io; + unsigned region; /* keep going until we've finished */ if (bio->bi_size) @@ -135,10 +136,17 @@ static int endio(struct bio *bio, unsigned int done, int error) if (error && bio_data_dir(bio) == READ) zero_fill_bio(bio); - dec_count(io, bio_get_region(bio), error); + /* + * The bio destructor in bio_put() may use the io object. + */ + io = bio->bi_private; + region = bio_get_region(bio); + bio->bi_max_vecs++; bio_put(bio); + dec_count(io, region, error); + return 0; } @@ -209,6 +217,9 @@ static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) dp->context_ptr = bvec; } +/* + * Functions for getting the pages from a VMA. + */ static void vm_get_page(struct dpages *dp, struct page **p, unsigned long *len, unsigned *offset) { @@ -233,7 +244,34 @@ static void vm_dp_init(struct dpages *dp, void *data) static void dm_bio_destructor(struct bio *bio) { - bio_free(bio, _bios); + struct io *io = bio->bi_private; + + bio_free(bio, io->client->bios); +} + +/* + * Functions for getting the pages from kernel memory. + */ +static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len, + unsigned *offset) +{ + *p = virt_to_page(dp->context_ptr); + *offset = dp->context_u; + *len = PAGE_SIZE - dp->context_u; +} + +static void km_next_page(struct dpages *dp) +{ + dp->context_ptr += PAGE_SIZE - dp->context_u; + dp->context_u = 0; +} + +static void km_dp_init(struct dpages *dp, void *data) +{ + dp->get_page = km_get_page; + dp->next_page = km_next_page; + dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); + dp->context_ptr = data; } /*----------------------------------------------------------------- @@ -256,7 +294,7 @@ static void do_region(int rw, unsigned int region, struct io_region *where, * to hide it from bio_add_page(). */ num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2; - bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios); + bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; @@ -311,8 +349,9 @@ static void dispatch_io(int rw, unsigned int num_regions, dec_count(io, 0, 0); } -static int sync_io(unsigned int num_regions, struct io_region *where, - int rw, struct dpages *dp, unsigned long *error_bits) +static int sync_io(struct dm_io_client *client, unsigned int num_regions, + struct io_region *where, int rw, struct dpages *dp, + unsigned long *error_bits) { struct io io; @@ -324,6 +363,7 @@ static int sync_io(unsigned int num_regions, struct io_region *where, io.error = 0; atomic_set(&io.count, 1); /* see dispatch_io() */ io.sleeper = current; + io.client = client; dispatch_io(rw, num_regions, where, dp, &io, 1); @@ -340,12 +380,15 @@ static int sync_io(unsigned int num_regions, struct io_region *where, if (atomic_read(&io.count)) return -EINTR; - *error_bits = io.error; + if (error_bits) + *error_bits = io.error; + return io.error ? -EIO : 0; } -static int async_io(unsigned int num_regions, struct io_region *where, int rw, - struct dpages *dp, io_notify_fn fn, void *context) +static int async_io(struct dm_io_client *client, unsigned int num_regions, + struct io_region *where, int rw, struct dpages *dp, + io_notify_fn fn, void *context) { struct io *io; @@ -355,10 +398,11 @@ static int async_io(unsigned int num_regions, struct io_region *where, int rw, return -EIO; } - io = mempool_alloc(_io_pool, GFP_NOIO); + io = mempool_alloc(client->pool, GFP_NOIO); io->error = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ io->sleeper = NULL; + io->client = client; io->callback = fn; io->context = context; @@ -366,61 +410,51 @@ static int async_io(unsigned int num_regions, struct io_region *where, int rw, return 0; } -int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - unsigned long *error_bits) +static int dp_init(struct dm_io_request *io_req, struct dpages *dp) { - struct dpages dp; - list_dp_init(&dp, pl, offset); - return sync_io(num_regions, where, rw, &dp, error_bits); -} + /* Set up dpages based on memory type */ + switch (io_req->mem.type) { + case DM_IO_PAGE_LIST: + list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); + break; + + case DM_IO_BVEC: + bvec_dp_init(dp, io_req->mem.ptr.bvec); + break; + + case DM_IO_VMA: + vm_dp_init(dp, io_req->mem.ptr.vma); + break; + + case DM_IO_KMEM: + km_dp_init(dp, io_req->mem.ptr.addr); + break; + + default: + return -EINVAL; + } -int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, - struct bio_vec *bvec, unsigned long *error_bits) -{ - struct dpages dp; - bvec_dp_init(&dp, bvec); - return sync_io(num_regions, where, rw, &dp, error_bits); + return 0; } -int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, unsigned long *error_bits) +/* + * New collapsed (a)synchronous interface + */ +int dm_io(struct dm_io_request *io_req, unsigned num_regions, + struct io_region *where, unsigned long *sync_error_bits) { + int r; struct dpages dp; - vm_dp_init(&dp, data); - return sync_io(num_regions, where, rw, &dp, error_bits); -} -int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - io_notify_fn fn, void *context) -{ - struct dpages dp; - list_dp_init(&dp, pl, offset); - return async_io(num_regions, where, rw, &dp, fn, context); -} + r = dp_init(io_req, &dp); + if (r) + return r; -int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, - struct bio_vec *bvec, io_notify_fn fn, void *context) -{ - struct dpages dp; - bvec_dp_init(&dp, bvec); - return async_io(num_regions, where, rw, &dp, fn, context); -} + if (!io_req->notify.fn) + return sync_io(io_req->client, num_regions, where, + io_req->bi_rw, &dp, sync_error_bits); -int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, io_notify_fn fn, void *context) -{ - struct dpages dp; - vm_dp_init(&dp, data); - return async_io(num_regions, where, rw, &dp, fn, context); + return async_io(io_req->client, num_regions, where, io_req->bi_rw, + &dp, io_req->notify.fn, io_req->notify.context); } - -EXPORT_SYMBOL(dm_io_get); -EXPORT_SYMBOL(dm_io_put); -EXPORT_SYMBOL(dm_io_sync); -EXPORT_SYMBOL(dm_io_async); -EXPORT_SYMBOL(dm_io_sync_bvec); -EXPORT_SYMBOL(dm_io_async_bvec); -EXPORT_SYMBOL(dm_io_sync_vm); -EXPORT_SYMBOL(dm_io_async_vm); +EXPORT_SYMBOL(dm_io); diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h index f9035bfd1a9f..f647e2cceaa6 100644 --- a/drivers/md/dm-io.h +++ b/drivers/md/dm-io.h @@ -12,7 +12,7 @@ struct io_region { struct block_device *bdev; sector_t sector; - sector_t count; + sector_t count; /* If this is zero the region is ignored. */ }; struct page_list { @@ -20,55 +20,60 @@ struct page_list { struct page *page; }; - -/* - * 'error' is a bitset, with each bit indicating whether an error - * occurred doing io to the corresponding region. - */ typedef void (*io_notify_fn)(unsigned long error, void *context); +enum dm_io_mem_type { + DM_IO_PAGE_LIST,/* Page list */ + DM_IO_BVEC, /* Bio vector */ + DM_IO_VMA, /* Virtual memory area */ + DM_IO_KMEM, /* Kernel memory */ +}; + +struct dm_io_memory { + enum dm_io_mem_type type; + + union { + struct page_list *pl; + struct bio_vec *bvec; + void *vma; + void *addr; + } ptr; + + unsigned offset; +}; + +struct dm_io_notify { + io_notify_fn fn; /* Callback for asynchronous requests */ + void *context; /* Passed to callback */ +}; /* - * Before anyone uses the IO interface they should call - * dm_io_get(), specifying roughly how many pages they are - * expecting to perform io on concurrently. - * - * This function may block. + * IO request structure */ -int dm_io_get(unsigned int num_pages); -void dm_io_put(unsigned int num_pages); +struct dm_io_client; +struct dm_io_request { + int bi_rw; /* READ|WRITE - not READA */ + struct dm_io_memory mem; /* Memory to use for io */ + struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */ + struct dm_io_client *client; /* Client memory handler */ +}; /* - * Synchronous IO. + * For async io calls, users can alternatively use the dm_io() function below + * and dm_io_client_create() to create private mempools for the client. * - * Please ensure that the rw flag in the next two functions is - * either READ or WRITE, ie. we don't take READA. Any - * regions with a zero count field will be ignored. + * Create/destroy may block. */ -int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - unsigned long *error_bits); - -int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, - struct bio_vec *bvec, unsigned long *error_bits); - -int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, unsigned long *error_bits); +struct dm_io_client *dm_io_client_create(unsigned num_pages); +int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client); +void dm_io_client_destroy(struct dm_io_client *client); /* - * Aynchronous IO. - * - * The 'where' array may be safely allocated on the stack since - * the function takes a copy. + * IO interface using private per-client pools. + * Each bit in the optional 'sync_error_bits' bitset indicates whether an + * error occurred doing io to the corresponding region. */ -int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, - struct page_list *pl, unsigned int offset, - io_notify_fn fn, void *context); - -int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, - struct bio_vec *bvec, io_notify_fn fn, void *context); - -int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw, - void *data, io_notify_fn fn, void *context); +int dm_io(struct dm_io_request *io_req, unsigned num_regions, + struct io_region *region, unsigned long *sync_error_bits); #endif diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 6a9261351848..a66428d860fe 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -149,9 +149,12 @@ struct log_c { FORCESYNC, /* Force a sync to happen */ } sync; + struct dm_io_request io_req; + /* * Disk log fields */ + int log_dev_failed; struct dm_dev *log_dev; struct log_header header; @@ -199,13 +202,20 @@ static void header_from_disk(struct log_header *core, struct log_header *disk) core->nr_regions = le64_to_cpu(disk->nr_regions); } +static int rw_header(struct log_c *lc, int rw) +{ + lc->io_req.bi_rw = rw; + lc->io_req.mem.ptr.vma = lc->disk_header; + lc->io_req.notify.fn = NULL; + + return dm_io(&lc->io_req, 1, &lc->header_location, NULL); +} + static int read_header(struct log_c *log) { int r; - unsigned long ebits; - r = dm_io_sync_vm(1, &log->header_location, READ, - log->disk_header, &ebits); + r = rw_header(log, READ); if (r) return r; @@ -233,11 +243,8 @@ static int read_header(struct log_c *log) static inline int write_header(struct log_c *log) { - unsigned long ebits; - header_to_disk(&log->header, log->disk_header); - return dm_io_sync_vm(1, &log->header_location, WRITE, - log->disk_header, &ebits); + return rw_header(log, WRITE); } /*---------------------------------------------------------------- @@ -256,6 +263,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti, uint32_t region_size; unsigned int region_count; size_t bitset_size, buf_size; + int r; if (argc < 1 || argc > 2) { DMWARN("wrong number of arguments to mirror log"); @@ -315,6 +323,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti, lc->disk_header = NULL; } else { lc->log_dev = dev; + lc->log_dev_failed = 0; lc->header_location.bdev = lc->log_dev->bdev; lc->header_location.sector = 0; @@ -324,6 +333,15 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti, buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, ti->limits.hardsect_size); lc->header_location.count = buf_size >> SECTOR_SHIFT; + lc->io_req.mem.type = DM_IO_VMA; + lc->io_req.client = dm_io_client_create(dm_div_up(buf_size, + PAGE_SIZE)); + if (IS_ERR(lc->io_req.client)) { + r = PTR_ERR(lc->io_req.client); + DMWARN("couldn't allocate disk io client"); + kfree(lc); + return -ENOMEM; + } lc->disk_header = vmalloc(buf_size); if (!lc->disk_header) { @@ -424,6 +442,7 @@ static void disk_dtr(struct dirty_log *log) dm_put_device(lc->ti, lc->log_dev); vfree(lc->disk_header); + dm_io_client_destroy(lc->io_req.client); destroy_log_context(lc); } @@ -437,6 +456,15 @@ static int count_bits32(uint32_t *addr, unsigned size) return count; } +static void fail_log_device(struct log_c *lc) +{ + if (lc->log_dev_failed) + return; + + lc->log_dev_failed = 1; + dm_table_event(lc->ti->table); +} + static int disk_resume(struct dirty_log *log) { int r; @@ -446,8 +474,19 @@ static int disk_resume(struct dirty_log *log) /* read the disk header */ r = read_header(lc); - if (r) - return r; + if (r) { + DMWARN("%s: Failed to read header on mirror log device", + lc->log_dev->name); + fail_log_device(lc); + /* + * If the log device cannot be read, we must assume + * all regions are out-of-sync. If we simply return + * here, the state will be uninitialized and could + * lead us to return 'in-sync' status for regions + * that are actually 'out-of-sync'. + */ + lc->header.nr_regions = 0; + } /* set or clear any new bits -- device has grown */ if (lc->sync == NOSYNC) @@ -472,7 +511,14 @@ static int disk_resume(struct dirty_log *log) lc->header.nr_regions = lc->region_count; /* write the new header */ - return write_header(lc); + r = write_header(lc); + if (r) { + DMWARN("%s: Failed to write header on mirror log device", + lc->log_dev->name); + fail_log_device(lc); + } + + return r; } static uint32_t core_get_region_size(struct dirty_log *log) @@ -516,7 +562,9 @@ static int disk_flush(struct dirty_log *log) return 0; r = write_header(lc); - if (!r) + if (r) + fail_log_device(lc); + else lc->touched = 0; return r; @@ -591,6 +639,7 @@ static int core_status(struct dirty_log *log, status_type_t status, switch(status) { case STATUSTYPE_INFO: + DMEMIT("1 %s", log->type->name); break; case STATUSTYPE_TABLE: @@ -606,17 +655,17 @@ static int disk_status(struct dirty_log *log, status_type_t status, char *result, unsigned int maxlen) { int sz = 0; - char buffer[16]; struct log_c *lc = log->context; switch(status) { case STATUSTYPE_INFO: + DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, + lc->log_dev_failed ? 'D' : 'A'); break; case STATUSTYPE_TABLE: - format_dev_t(buffer, lc->log_dev->bdev->bd_dev); DMEMIT("%s %u %s %u ", log->type->name, - lc->sync == DEFAULTSYNC ? 2 : 3, buffer, + lc->sync == DEFAULTSYNC ? 2 : 3, lc->log_dev->name, lc->region_size); DMEMIT_SYNC; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 3aa013506967..de54b39e6ffe 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -668,6 +668,9 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) return -EINVAL; } + m->hw_handler.md = dm_table_get_md(ti->table); + dm_put(m->hw_handler.md); + r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); if (r) { dm_put_hw_handler(hwht); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 23a642619bed..ef124b71ccc8 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -21,15 +21,11 @@ #include <linux/workqueue.h> #define DM_MSG_PREFIX "raid1" +#define DM_IO_PAGES 64 -static struct workqueue_struct *_kmirrord_wq; -static struct work_struct _kmirrord_work; -static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); +#define DM_RAID1_HANDLE_ERRORS 0x01 -static inline void wake(void) -{ - queue_work(_kmirrord_wq, &_kmirrord_work); -} +static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); /*----------------------------------------------------------------- * Region hash @@ -125,17 +121,23 @@ struct mirror_set { struct list_head list; struct region_hash rh; struct kcopyd_client *kcopyd_client; + uint64_t features; spinlock_t lock; /* protects the next two lists */ struct bio_list reads; struct bio_list writes; + struct dm_io_client *io_client; + /* recovery */ region_t nr_regions; int in_sync; struct mirror *default_mirror; /* Default mirror */ + struct workqueue_struct *kmirrord_wq; + struct work_struct kmirrord_work; + unsigned int nr_mirrors; struct mirror mirror[0]; }; @@ -153,6 +155,11 @@ static inline sector_t region_to_sector(struct region_hash *rh, region_t region) return region << rh->region_shift; } +static void wake(struct mirror_set *ms) +{ + queue_work(ms->kmirrord_wq, &ms->kmirrord_work); +} + /* FIXME move this */ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); @@ -398,8 +405,7 @@ static void rh_update_states(struct region_hash *rh) mempool_free(reg, rh->region_pool); } - if (!list_empty(&recovered)) - rh->log->type->flush(rh->log); + rh->log->type->flush(rh->log); list_for_each_entry_safe (reg, next, &clean, list) mempool_free(reg, rh->region_pool); @@ -471,7 +477,7 @@ static void rh_dec(struct region_hash *rh, region_t region) spin_unlock_irqrestore(&rh->region_lock, flags); if (should_wake) - wake(); + wake(rh->ms); } /* @@ -558,7 +564,7 @@ static void rh_recovery_end(struct region *reg, int success) list_add(®->list, ®->rh->recovered_regions); spin_unlock_irq(&rh->region_lock); - wake(); + wake(rh->ms); } static void rh_flush(struct region_hash *rh) @@ -592,7 +598,7 @@ static void rh_start_recovery(struct region_hash *rh) for (i = 0; i < MAX_RECOVERY; i++) up(&rh->recovery_count); - wake(); + wake(rh->ms); } /* @@ -735,7 +741,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) /* * We can only read balance if the region is in sync. */ - if (rh_in_sync(&ms->rh, region, 0)) + if (rh_in_sync(&ms->rh, region, 1)) m = choose_mirror(ms, bio->bi_sector); else m = ms->default_mirror; @@ -792,6 +798,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio) unsigned int i; struct io_region io[KCOPYD_MAX_REGIONS+1]; struct mirror *m; + struct dm_io_request io_req = { + .bi_rw = WRITE, + .mem.type = DM_IO_BVEC, + .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .notify.fn = write_callback, + .notify.context = bio, + .client = ms->io_client, + }; for (i = 0; i < ms->nr_mirrors; i++) { m = ms->mirror + i; @@ -802,9 +816,8 @@ static void do_write(struct mirror_set *ms, struct bio *bio) } bio_set_ms(bio, ms); - dm_io_async_bvec(ms->nr_mirrors, io, WRITE, - bio->bi_io_vec + bio->bi_idx, - write_callback, bio); + + (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); } static void do_writes(struct mirror_set *ms, struct bio_list *writes) @@ -870,11 +883,10 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) /*----------------------------------------------------------------- * kmirrord *---------------------------------------------------------------*/ -static LIST_HEAD(_mirror_sets); -static DECLARE_RWSEM(_mirror_sets_lock); - -static void do_mirror(struct mirror_set *ms) +static void do_mirror(struct work_struct *work) { + struct mirror_set *ms =container_of(work, struct mirror_set, + kmirrord_work); struct bio_list reads, writes; spin_lock(&ms->lock); @@ -890,16 +902,6 @@ static void do_mirror(struct mirror_set *ms) do_writes(ms, &writes); } -static void do_work(struct work_struct *ignored) -{ - struct mirror_set *ms; - - down_read(&_mirror_sets_lock); - list_for_each_entry (ms, &_mirror_sets, list) - do_mirror(ms); - up_read(&_mirror_sets_lock); -} - /*----------------------------------------------------------------- * Target functions *---------------------------------------------------------------*/ @@ -931,6 +933,13 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ms->in_sync = 0; ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; + ms->io_client = dm_io_client_create(DM_IO_PAGES); + if (IS_ERR(ms->io_client)) { + ti->error = "Error creating dm_io client"; + kfree(ms); + return NULL; + } + if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { ti->error = "Error creating dirty region hash"; kfree(ms); @@ -946,6 +955,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti, while (m--) dm_put_device(ti, ms->mirror[m].dev); + dm_io_client_destroy(ms->io_client); rh_exit(&ms->rh); kfree(ms); } @@ -978,23 +988,6 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, return 0; } -static int add_mirror_set(struct mirror_set *ms) -{ - down_write(&_mirror_sets_lock); - list_add_tail(&ms->list, &_mirror_sets); - up_write(&_mirror_sets_lock); - wake(); - - return 0; -} - -static void del_mirror_set(struct mirror_set *ms) -{ - down_write(&_mirror_sets_lock); - list_del(&ms->list); - up_write(&_mirror_sets_lock); -} - /* * Create dirty log: log_type #log_params <log_params> */ @@ -1037,16 +1030,55 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti, return dl; } +static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, + unsigned *args_used) +{ + unsigned num_features; + struct dm_target *ti = ms->ti; + + *args_used = 0; + + if (!argc) + return 0; + + if (sscanf(argv[0], "%u", &num_features) != 1) { + ti->error = "Invalid number of features"; + return -EINVAL; + } + + argc--; + argv++; + (*args_used)++; + + if (num_features > argc) { + ti->error = "Not enough arguments to support feature count"; + return -EINVAL; + } + + if (!strcmp("handle_errors", argv[0])) + ms->features |= DM_RAID1_HANDLE_ERRORS; + else { + ti->error = "Unrecognised feature requested"; + return -EINVAL; + } + + (*args_used)++; + + return 0; +} + /* * Construct a mirror mapping: * * log_type #log_params <log_params> * #mirrors [mirror_path offset]{2,} + * [#features <features>] * * log_type is "core" or "disk" * #log_params is between 1 and 3 + * + * If present, features must be "handle_errors". */ -#define DM_IO_PAGES 64 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int r; @@ -1070,8 +1102,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) argv++, argc--; - if (argc != nr_mirrors * 2) { - ti->error = "Wrong number of mirror arguments"; + if (argc < nr_mirrors * 2) { + ti->error = "Too few mirror arguments"; dm_destroy_dirty_log(dl); return -EINVAL; } @@ -1096,13 +1128,37 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = ms; ti->split_io = ms->rh.region_size; + ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); + if (!ms->kmirrord_wq) { + DMERR("couldn't start kmirrord"); + free_context(ms, ti, m); + return -ENOMEM; + } + INIT_WORK(&ms->kmirrord_work, do_mirror); + + r = parse_features(ms, argc, argv, &args_used); + if (r) { + free_context(ms, ti, ms->nr_mirrors); + return r; + } + + argv += args_used; + argc -= args_used; + + if (argc) { + ti->error = "Too many mirror arguments"; + free_context(ms, ti, ms->nr_mirrors); + return -EINVAL; + } + r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); if (r) { + destroy_workqueue(ms->kmirrord_wq); free_context(ms, ti, ms->nr_mirrors); return r; } - add_mirror_set(ms); + wake(ms); return 0; } @@ -1110,8 +1166,9 @@ static void mirror_dtr(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; - del_mirror_set(ms); + flush_workqueue(ms->kmirrord_wq); kcopyd_client_destroy(ms->kcopyd_client); + destroy_workqueue(ms->kmirrord_wq); free_context(ms, ti, ms->nr_mirrors); } @@ -1127,7 +1184,7 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) spin_unlock(&ms->lock); if (should_wake) - wake(); + wake(ms); } /* @@ -1222,11 +1279,9 @@ static void mirror_resume(struct dm_target *ti) static int mirror_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { - unsigned int m, sz; + unsigned int m, sz = 0; struct mirror_set *ms = (struct mirror_set *) ti->private; - sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); - switch (type) { case STATUSTYPE_INFO: DMEMIT("%d ", ms->nr_mirrors); @@ -1237,13 +1292,21 @@ static int mirror_status(struct dm_target *ti, status_type_t type, (unsigned long long)ms->rh.log->type-> get_sync_count(ms->rh.log), (unsigned long long)ms->nr_regions); + + sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); + break; case STATUSTYPE_TABLE: + sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); + DMEMIT("%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) DMEMIT(" %s %llu", ms->mirror[m].dev->name, (unsigned long long)ms->mirror[m].offset); + + if (ms->features & DM_RAID1_HANDLE_ERRORS) + DMEMIT(" 1 handle_errors"); } return 0; @@ -1251,7 +1314,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 2}, + .version = {1, 0, 3}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, @@ -1270,20 +1333,11 @@ static int __init dm_mirror_init(void) if (r) return r; - _kmirrord_wq = create_singlethread_workqueue("kmirrord"); - if (!_kmirrord_wq) { - DMERR("couldn't start kmirrord"); - dm_dirty_log_exit(); - return r; - } - INIT_WORK(&_kmirrord_work, do_work); - r = dm_register_target(&mirror_target); if (r < 0) { DMERR("%s: Failed to register mirror target", mirror_target.name); dm_dirty_log_exit(); - destroy_workqueue(_kmirrord_wq); } return r; @@ -1297,7 +1351,6 @@ static void __exit dm_mirror_exit(void) if (r < 0) DMERR("%s: unregister failed %d", mirror_target.name, r); - destroy_workqueue(_kmirrord_wq); dm_dirty_log_exit(); } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 05befa91807a..2fc199b0016b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -425,13 +425,15 @@ static void close_dev(struct dm_dev *d, struct mapped_device *md) } /* - * If possible (ie. blk_size[major] is set), this checks an area - * of a destination device is valid. + * If possible, this checks an area of a destination device is valid. */ static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) { - sector_t dev_size; - dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT; + sector_t dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT; + + if (!dev_size) + return 1; + return ((start < dev_size) && (len <= (dev_size - start))); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 11a98df298ec..2717a355dc5b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1236,6 +1236,7 @@ void dm_put(struct mapped_device *md) free_dev(md); } } +EXPORT_SYMBOL_GPL(dm_put); /* * Process the deferred bios diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index b46f6c575f7e..dbc234e3c69f 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2002 Sistina Software (UK) Limited. + * Copyright (C) 2006 Red Hat GmbH * * This file is released under the GPL. * @@ -45,6 +46,8 @@ struct kcopyd_client { unsigned int nr_pages; unsigned int nr_free_pages; + struct dm_io_client *io_client; + wait_queue_head_t destroyq; atomic_t nr_jobs; }; @@ -342,16 +345,20 @@ static void complete_io(unsigned long error, void *context) static int run_io_job(struct kcopyd_job *job) { int r; + struct dm_io_request io_req = { + .bi_rw = job->rw, + .mem.type = DM_IO_PAGE_LIST, + .mem.ptr.pl = job->pages, + .mem.offset = job->offset, + .notify.fn = complete_io, + .notify.context = job, + .client = job->kc->io_client, + }; if (job->rw == READ) - r = dm_io_async(1, &job->source, job->rw, - job->pages, - job->offset, complete_io, job); - + r = dm_io(&io_req, 1, &job->source, NULL); else - r = dm_io_async(job->num_dests, job->dests, job->rw, - job->pages, - job->offset, complete_io, job); + r = dm_io(&io_req, job->num_dests, job->dests, NULL); return r; } @@ -670,8 +677,9 @@ int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) return r; } - r = dm_io_get(nr_pages); - if (r) { + kc->io_client = dm_io_client_create(nr_pages); + if (IS_ERR(kc->io_client)) { + r = PTR_ERR(kc->io_client); client_free_pages(kc); kfree(kc); kcopyd_exit(); @@ -691,7 +699,7 @@ void kcopyd_client_destroy(struct kcopyd_client *kc) /* Wait for completion of all jobs submitted by this client. */ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); - dm_io_put(kc->nr_pages); + dm_io_client_destroy(kc->io_client); client_free_pages(kc); client_del(kc); kfree(kc); diff --git a/drivers/md/md.c b/drivers/md/md.c index 509171ca7fa8..c10ce91b64e9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,6 +33,7 @@ */ #include <linux/module.h> +#include <linux/kernel.h> #include <linux/kthread.h> #include <linux/linkage.h> #include <linux/raid/md.h> @@ -273,6 +274,7 @@ static mddev_t * mddev_find(dev_t unit) atomic_set(&new->active, 1); spin_lock_init(&new->write_lock); init_waitqueue_head(&new->sb_wait); + new->reshape_position = MaxSector; new->queue = blk_alloc_queue(GFP_KERNEL); if (!new->queue) { @@ -589,14 +591,41 @@ abort: return ret; } + +static u32 md_csum_fold(u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + return (csum & 0xffff) + (csum >> 16); +} + static unsigned int calc_sb_csum(mdp_super_t * sb) { + u64 newcsum = 0; + u32 *sb32 = (u32*)sb; + int i; unsigned int disk_csum, csum; disk_csum = sb->sb_csum; sb->sb_csum = 0; - csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + + for (i = 0; i < MD_SB_BYTES/4 ; i++) + newcsum += sb32[i]; + csum = (newcsum & 0xffffffff) + (newcsum>>32); + + +#ifdef CONFIG_ALPHA + /* This used to use csum_partial, which was wrong for several + * reasons including that different results are returned on + * different architectures. It isn't critical that we get exactly + * the same return value as before (we always csum_fold before + * testing, and that removes any differences). However as we + * know that csum_partial always returned a 16bit value on + * alphas, do a fold to maximise conformity to previous behaviour. + */ + sb->sb_csum = md_csum_fold(disk_csum); +#else sb->sb_csum = disk_csum; +#endif return csum; } @@ -684,7 +713,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version if (sb->raid_disks <= 0) goto abort; - if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { + if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { printk(KERN_WARNING "md: invalid superblock checksum on %s\n", b); goto abort; @@ -694,6 +723,17 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->data_offset = 0; rdev->sb_size = MD_SB_BYTES; + if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { + if (sb->level != 1 && sb->level != 4 + && sb->level != 5 && sb->level != 6 + && sb->level != 10) { + /* FIXME use a better test */ + printk(KERN_WARNING + "md: bitmaps not supported for this level.\n"); + goto abort; + } + } + if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; else @@ -792,16 +832,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && - mddev->bitmap_file == NULL) { - if (mddev->level != 1 && mddev->level != 4 - && mddev->level != 5 && mddev->level != 6 - && mddev->level != 10) { - /* FIXME use a better test */ - printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); - return -EINVAL; - } + mddev->bitmap_file == NULL) mddev->bitmap_offset = mddev->default_bitmap_offset; - } } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling */ @@ -1058,6 +1090,18 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) bdevname(rdev->bdev,b)); return -EINVAL; } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { + if (sb->level != cpu_to_le32(1) && + sb->level != cpu_to_le32(4) && + sb->level != cpu_to_le32(5) && + sb->level != cpu_to_le32(6) && + sb->level != cpu_to_le32(10)) { + printk(KERN_WARNING + "md: bitmaps not supported for this level.\n"); + return -EINVAL; + } + } + rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); @@ -1141,14 +1185,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = (4096-256)/2; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && - mddev->bitmap_file == NULL ) { - if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 - && mddev->level != 10) { - printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); - return -EINVAL; - } + mddev->bitmap_file == NULL ) mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); - } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); mddev->delta_disks = le32_to_cpu(sb->delta_disks); @@ -2204,6 +2243,10 @@ static ssize_t layout_show(mddev_t *mddev, char *page) { /* just a number, not meaningful for all levels */ + if (mddev->reshape_position != MaxSector && + mddev->layout != mddev->new_layout) + return sprintf(page, "%d (%d)\n", + mddev->new_layout, mddev->layout); return sprintf(page, "%d\n", mddev->layout); } @@ -2212,13 +2255,16 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); - if (mddev->pers) - return -EBUSY; if (!*buf || (*e && *e != '\n')) return -EINVAL; - mddev->layout = n; + if (mddev->pers) + return -EBUSY; + if (mddev->reshape_position != MaxSector) + mddev->new_layout = n; + else + mddev->layout = n; return len; } static struct md_sysfs_entry md_layout = @@ -2230,6 +2276,10 @@ raid_disks_show(mddev_t *mddev, char *page) { if (mddev->raid_disks == 0) return 0; + if (mddev->reshape_position != MaxSector && + mddev->delta_disks != 0) + return sprintf(page, "%d (%d)\n", mddev->raid_disks, + mddev->raid_disks - mddev->delta_disks); return sprintf(page, "%d\n", mddev->raid_disks); } @@ -2247,7 +2297,11 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) rv = update_raid_disks(mddev, n); - else + else if (mddev->reshape_position != MaxSector) { + int olddisks = mddev->raid_disks - mddev->delta_disks; + mddev->delta_disks = n - olddisks; + mddev->raid_disks = n; + } else mddev->raid_disks = n; return rv ? rv : len; } @@ -2257,6 +2311,10 @@ __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t chunk_size_show(mddev_t *mddev, char *page) { + if (mddev->reshape_position != MaxSector && + mddev->chunk_size != mddev->new_chunk) + return sprintf(page, "%d (%d)\n", mddev->new_chunk, + mddev->chunk_size); return sprintf(page, "%d\n", mddev->chunk_size); } @@ -2267,12 +2325,15 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) char *e; unsigned long n = simple_strtoul(buf, &e, 10); - if (mddev->pers) - return -EBUSY; if (!*buf || (*e && *e != '\n')) return -EINVAL; - mddev->chunk_size = n; + if (mddev->pers) + return -EBUSY; + else if (mddev->reshape_position != MaxSector) + mddev->new_chunk = n; + else + mddev->chunk_size = n; return len; } static struct md_sysfs_entry md_chunk_size = @@ -2637,8 +2698,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len) minor = simple_strtoul(buf, &e, 10); if (e==buf || (*e && *e != '\n') ) return -EINVAL; - if (major >= sizeof(super_types)/sizeof(super_types[0]) || - super_types[major].name == NULL) + if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) return -ENOENT; mddev->major_version = major; mddev->minor_version = minor; @@ -2859,6 +2919,37 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); +static ssize_t +reshape_position_show(mddev_t *mddev, char *page) +{ + if (mddev->reshape_position != MaxSector) + return sprintf(page, "%llu\n", + (unsigned long long)mddev->reshape_position); + strcpy(page, "none\n"); + return 5; +} + +static ssize_t +reshape_position_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long new = simple_strtoull(buf, &e, 10); + if (mddev->pers) + return -EBUSY; + if (buf == e || (*e && *e != '\n')) + return -EINVAL; + mddev->reshape_position = new; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + return len; +} + +static struct md_sysfs_entry md_reshape_position = +__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, + reshape_position_store); + static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -2871,6 +2962,7 @@ static struct attribute *md_default_attrs[] = { &md_new_device.attr, &md_safe_delay.attr, &md_array_state.attr, + &md_reshape_position.attr, NULL, }; @@ -3080,7 +3172,7 @@ static int do_md_run(mddev_t * mddev) if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); - invalidate_bdev(rdev->bdev, 0); + invalidate_bdev(rdev->bdev); } md_probe(mddev->unit, NULL, NULL); @@ -3409,6 +3501,7 @@ static int do_md_stop(mddev_t * mddev, int mode) mddev->size = 0; mddev->raid_disks = 0; mddev->recovery_cp = 0; + mddev->reshape_position = MaxSector; } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", @@ -4019,7 +4112,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) if (info->raid_disks == 0) { /* just setting version number for superblock loading */ if (info->major_version < 0 || - info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || + info->major_version >= ARRAY_SIZE(super_types) || super_types[info->major_version].name == NULL) { /* maybe try to auto-load a module? */ printk(KERN_INFO @@ -4941,15 +5034,6 @@ static int md_seq_open(struct inode *inode, struct file *file) return error; } -static int md_seq_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = file->private_data; - struct mdstat_info *mi = m->private; - m->private = NULL; - kfree(mi); - return seq_release(inode, file); -} - static unsigned int mdstat_poll(struct file *filp, poll_table *wait) { struct seq_file *m = filp->private_data; @@ -4971,7 +5055,7 @@ static const struct file_operations md_seq_fops = { .open = md_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = md_seq_release, + .release = seq_release_private, .poll = mdstat_poll, }; @@ -5019,7 +5103,7 @@ static int is_mddev_idle(mddev_t *mddev) * * Note: the following is an unsigned comparison. */ - if ((curr_events - rdev->last_events + 4096) > 8192) { + if ((long)curr_events - (long)rdev->last_events > 4096) { rdev->last_events = curr_events; idle = 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 97ee870b265d..3a95cc5e029c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -271,21 +271,25 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int */ update_head_pos(mirror, r1_bio); - if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) { - /* - * Set R1BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. + if (uptodate) + set_bit(R1BIO_Uptodate, &r1_bio->state); + else { + /* If all other devices have failed, we want to return + * the error upwards rather than fail the last device. + * Here we redefine "uptodate" to mean "Don't want to retry" */ - if (uptodate) - set_bit(R1BIO_Uptodate, &r1_bio->state); + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + if (r1_bio->mddev->degraded == conf->raid_disks || + (r1_bio->mddev->degraded == conf->raid_disks-1 && + !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))) + uptodate = 1; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + if (uptodate) raid_end_bio_io(r1_bio); - } else { + else { /* * oops, read error: */ @@ -992,13 +996,14 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; + set_bit(Faulty, &rdev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); - } - set_bit(Faulty, &rdev->flags); + } else + set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8d59914f2057..061375ee6592 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -353,8 +353,8 @@ static int grow_stripes(raid5_conf_t *conf, int num) struct kmem_cache *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev)); - sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev)); + sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); + sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); conf->active_name = 0; sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), |