summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig9
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/bitmap.c8
-rw-r--r--drivers/md/dm-bio-list.h26
-rw-r--r--drivers/md/dm-crypt.c91
-rw-r--r--drivers/md/dm-delay.c383
-rw-r--r--drivers/md/dm-exception-store.c54
-rw-r--r--drivers/md/dm-hw-handler.h1
-rw-r--r--drivers/md/dm-io.c232
-rw-r--r--drivers/md/dm-io.h83
-rw-r--r--drivers/md/dm-log.c77
-rw-r--r--drivers/md/dm-mpath.c3
-rw-r--r--drivers/md/dm-raid1.c187
-rw-r--r--drivers/md/dm-table.c10
-rw-r--r--drivers/md/dm.c1
-rw-r--r--drivers/md/kcopyd.c28
-rw-r--r--drivers/md/md.c164
-rw-r--r--drivers/md/raid1.c33
-rw-r--r--drivers/md/raid5.c4
19 files changed, 1026 insertions, 369 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 4540ade6b6b5..7df934d69134 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -262,6 +262,15 @@ config DM_MULTIPATH_EMC
---help---
Multipath support for EMC CX/AX series hardware.
+config DM_DELAY
+ tristate "I/O delaying target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ A target that delays reads and/or writes and can send
+ them to different devices. Useful for testing.
+
+ If unsure, say N.
+
endmenu
endif
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 34957a68d921..38754084eac7 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
+obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e61e0efe9ec7..5a4a74c1097c 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1456,10 +1456,10 @@ int bitmap_create(mddev_t *mddev)
bitmap->offset = mddev->bitmap_offset;
if (file) {
get_file(file);
- do_sync_file_range(file, 0, LLONG_MAX,
- SYNC_FILE_RANGE_WAIT_BEFORE |
- SYNC_FILE_RANGE_WRITE |
- SYNC_FILE_RANGE_WAIT_AFTER);
+ do_sync_mapping_range(file->f_mapping, 0, LLONG_MAX,
+ SYNC_FILE_RANGE_WAIT_BEFORE |
+ SYNC_FILE_RANGE_WRITE |
+ SYNC_FILE_RANGE_WAIT_AFTER);
}
/* read superblock from bitmap file (this sets bitmap->chunksize) */
err = bitmap_read_sb(bitmap);
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
index da4349649f7f..c6be88826fae 100644
--- a/drivers/md/dm-bio-list.h
+++ b/drivers/md/dm-bio-list.h
@@ -8,17 +8,43 @@
#define DM_BIO_LIST_H
#include <linux/bio.h>
+#include <linux/prefetch.h>
struct bio_list {
struct bio *head;
struct bio *tail;
};
+static inline int bio_list_empty(const struct bio_list *bl)
+{
+ return bl->head == NULL;
+}
+
+#define BIO_LIST_INIT { .head = NULL, .tail = NULL }
+
+#define BIO_LIST(bl) \
+ struct bio_list bl = BIO_LIST_INIT
+
static inline void bio_list_init(struct bio_list *bl)
{
bl->head = bl->tail = NULL;
}
+#define bio_list_for_each(bio, bl) \
+ for (bio = (bl)->head; bio && ({ prefetch(bio->bi_next); 1; }); \
+ bio = bio->bi_next)
+
+static inline unsigned bio_list_size(const struct bio_list *bl)
+{
+ unsigned sz = 0;
+ struct bio *bio;
+
+ bio_list_for_each(bio, bl)
+ sz++;
+
+ return sz;
+}
+
static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
{
bio->bi_next = NULL;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d8121234c347..7b0fcfc9eaa5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -33,7 +33,6 @@
struct crypt_io {
struct dm_target *target;
struct bio *base_bio;
- struct bio *first_clone;
struct work_struct work;
atomic_t pending;
int error;
@@ -107,6 +106,8 @@ struct crypt_config {
static struct kmem_cache *_crypt_io_pool;
+static void clone_init(struct crypt_io *, struct bio *);
+
/*
* Different IV generation algorithms:
*
@@ -120,6 +121,9 @@ static struct kmem_cache *_crypt_io_pool;
* benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1
* (needed for LRW-32-AES and possible other narrow block modes)
*
+ * null: the initial vector is always zero. Provides compatibility with
+ * obsolete loop_fish2 devices. Do not use for new devices.
+ *
* plumb: unimplemented, see:
* http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
*/
@@ -256,6 +260,13 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
return 0;
}
+static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+{
+ memset(iv, 0, cc->iv_size);
+
+ return 0;
+}
+
static struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -272,6 +283,10 @@ static struct crypt_iv_operations crypt_iv_benbi_ops = {
.generator = crypt_iv_benbi_gen
};
+static struct crypt_iv_operations crypt_iv_null_ops = {
+ .generator = crypt_iv_null_gen
+};
+
static int
crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
struct scatterlist *in, unsigned int length,
@@ -378,36 +393,21 @@ static int crypt_convert(struct crypt_config *cc,
* This should never violate the device limitations
* May return a smaller bio when running out of pages
*/
-static struct bio *
-crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
- struct bio *base_bio, unsigned int *bio_vec_idx)
+static struct bio *crypt_alloc_buffer(struct crypt_io *io, unsigned int size)
{
+ struct crypt_config *cc = io->target->private;
struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
unsigned int i;
- if (base_bio) {
- clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
- __bio_clone(clone, base_bio);
- } else
- clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
-
+ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
if (!clone)
return NULL;
- clone->bi_destructor = dm_crypt_bio_destructor;
-
- /* if the last bio was not complete, continue where that one ended */
- clone->bi_idx = *bio_vec_idx;
- clone->bi_vcnt = *bio_vec_idx;
- clone->bi_size = 0;
- clone->bi_flags &= ~(1 << BIO_SEG_VALID);
-
- /* clone->bi_idx pages have already been allocated */
- size -= clone->bi_idx * PAGE_SIZE;
+ clone_init(io, clone);
- for (i = clone->bi_idx; i < nr_iovecs; i++) {
+ for (i = 0; i < nr_iovecs; i++) {
struct bio_vec *bv = bio_iovec_idx(clone, i);
bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask);
@@ -419,7 +419,7 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
* return a partially allocated bio, the caller will then try
* to allocate additional bios while submitting this partial bio
*/
- if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1))
+ if (i == (MIN_BIO_PAGES - 1))
gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
bv->bv_offset = 0;
@@ -438,12 +438,6 @@ crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
return NULL;
}
- /*
- * Remember the last bio_vec allocated to be able
- * to correctly continue after the splitting.
- */
- *bio_vec_idx = clone->bi_vcnt;
-
return clone;
}
@@ -495,9 +489,6 @@ static void dec_pending(struct crypt_io *io, int error)
if (!atomic_dec_and_test(&io->pending))
return;
- if (io->first_clone)
- bio_put(io->first_clone);
-
bio_endio(io->base_bio, io->base_bio->bi_size, io->error);
mempool_free(io, cc->io_pool);
@@ -562,6 +553,7 @@ static void clone_init(struct crypt_io *io, struct bio *clone)
clone->bi_end_io = crypt_endio;
clone->bi_bdev = cc->dev->bdev;
clone->bi_rw = io->base_bio->bi_rw;
+ clone->bi_destructor = dm_crypt_bio_destructor;
}
static void process_read(struct crypt_io *io)
@@ -585,7 +577,6 @@ static void process_read(struct crypt_io *io)
}
clone_init(io, clone);
- clone->bi_destructor = dm_crypt_bio_destructor;
clone->bi_idx = 0;
clone->bi_vcnt = bio_segments(base_bio);
clone->bi_size = base_bio->bi_size;
@@ -604,7 +595,6 @@ static void process_write(struct crypt_io *io)
struct convert_context ctx;
unsigned remaining = base_bio->bi_size;
sector_t sector = base_bio->bi_sector - io->target->begin;
- unsigned bvec_idx = 0;
atomic_inc(&io->pending);
@@ -615,14 +605,14 @@ static void process_write(struct crypt_io *io)
* so repeat the whole process until all the data can be handled.
*/
while (remaining) {
- clone = crypt_alloc_buffer(cc, base_bio->bi_size,
- io->first_clone, &bvec_idx);
+ clone = crypt_alloc_buffer(io, remaining);
if (unlikely(!clone)) {
dec_pending(io, -ENOMEM);
return;
}
ctx.bio_out = clone;
+ ctx.idx_out = 0;
if (unlikely(crypt_convert(cc, &ctx) < 0)) {
crypt_free_buffer_pages(cc, clone, clone->bi_size);
@@ -631,31 +621,26 @@ static void process_write(struct crypt_io *io)
return;
}
- clone_init(io, clone);
- clone->bi_sector = cc->start + sector;
-
- if (!io->first_clone) {
- /*
- * hold a reference to the first clone, because it
- * holds the bio_vec array and that can't be freed
- * before all other clones are released
- */
- bio_get(clone);
- io->first_clone = clone;
- }
+ /* crypt_convert should have filled the clone bio */
+ BUG_ON(ctx.idx_out < clone->bi_vcnt);
+ clone->bi_sector = cc->start + sector;
remaining -= clone->bi_size;
sector += bio_sectors(clone);
- /* prevent bio_put of first_clone */
+ /* Grab another reference to the io struct
+ * before we kick off the request */
if (remaining)
atomic_inc(&io->pending);
generic_make_request(clone);
+ /* Do not reference clone after this - it
+ * may be gone already. */
+
/* out of memory -> run queues */
if (remaining)
- congestion_wait(bio_data_dir(clone), HZ/100);
+ congestion_wait(WRITE, HZ/100);
}
}
@@ -832,6 +817,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops = &crypt_iv_essiv_ops;
else if (strcmp(ivmode, "benbi") == 0)
cc->iv_gen_ops = &crypt_iv_benbi_ops;
+ else if (strcmp(ivmode, "null") == 0)
+ cc->iv_gen_ops = &crypt_iv_null_ops;
else {
ti->error = "Invalid IV mode";
goto bad2;
@@ -954,10 +941,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
struct crypt_config *cc = ti->private;
struct crypt_io *io;
+ if (bio_barrier(bio))
+ return -EOPNOTSUPP;
+
io = mempool_alloc(cc->io_pool, GFP_NOIO);
io->target = ti;
io->base_bio = bio;
- io->first_clone = NULL;
io->error = io->post_process = 0;
atomic_set(&io->pending, 0);
kcryptd_queue_io(io);
@@ -1057,7 +1046,7 @@ error:
static struct target_type crypt_target = {
.name = "crypt",
- .version= {1, 3, 0},
+ .version= {1, 5, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
new file mode 100644
index 000000000000..52c7cf9e5803
--- /dev/null
+++ b/drivers/md/dm-delay.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright (C) 2005-2007 Red Hat GmbH
+ *
+ * A target that delays reads and/or writes and can send
+ * them to different devices.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+
+#define DM_MSG_PREFIX "delay"
+
+struct delay_c {
+ struct timer_list delay_timer;
+ struct semaphore timer_lock;
+ struct work_struct flush_expired_bios;
+ struct list_head delayed_bios;
+ atomic_t may_delay;
+ mempool_t *delayed_pool;
+
+ struct dm_dev *dev_read;
+ sector_t start_read;
+ unsigned read_delay;
+ unsigned reads;
+
+ struct dm_dev *dev_write;
+ sector_t start_write;
+ unsigned write_delay;
+ unsigned writes;
+};
+
+struct delay_info {
+ struct delay_c *context;
+ struct list_head list;
+ struct bio *bio;
+ unsigned long expires;
+};
+
+static DEFINE_MUTEX(delayed_bios_lock);
+
+static struct workqueue_struct *kdelayd_wq;
+static struct kmem_cache *delayed_cache;
+
+static void handle_delayed_timer(unsigned long data)
+{
+ struct delay_c *dc = (struct delay_c *)data;
+
+ queue_work(kdelayd_wq, &dc->flush_expired_bios);
+}
+
+static void queue_timeout(struct delay_c *dc, unsigned long expires)
+{
+ down(&dc->timer_lock);
+
+ if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
+ mod_timer(&dc->delay_timer, expires);
+
+ up(&dc->timer_lock);
+}
+
+static void flush_bios(struct bio *bio)
+{
+ struct bio *n;
+
+ while (bio) {
+ n = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = n;
+ }
+}
+
+static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
+{
+ struct delay_info *delayed, *next;
+ unsigned long next_expires = 0;
+ int start_timer = 0;
+ BIO_LIST(flush_bios);
+
+ mutex_lock(&delayed_bios_lock);
+ list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
+ if (flush_all || time_after_eq(jiffies, delayed->expires)) {
+ list_del(&delayed->list);
+ bio_list_add(&flush_bios, delayed->bio);
+ if ((bio_data_dir(delayed->bio) == WRITE))
+ delayed->context->writes--;
+ else
+ delayed->context->reads--;
+ mempool_free(delayed, dc->delayed_pool);
+ continue;
+ }
+
+ if (!start_timer) {
+ start_timer = 1;
+ next_expires = delayed->expires;
+ } else
+ next_expires = min(next_expires, delayed->expires);
+ }
+
+ mutex_unlock(&delayed_bios_lock);
+
+ if (start_timer)
+ queue_timeout(dc, next_expires);
+
+ return bio_list_get(&flush_bios);
+}
+
+static void flush_expired_bios(struct work_struct *work)
+{
+ struct delay_c *dc;
+
+ dc = container_of(work, struct delay_c, flush_expired_bios);
+ flush_bios(flush_delayed_bios(dc, 0));
+}
+
+/*
+ * Mapping parameters:
+ * <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
+ *
+ * With separate write parameters, the first set is only used for reads.
+ * Delays are specified in milliseconds.
+ */
+static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct delay_c *dc;
+ unsigned long long tmpll;
+
+ if (argc != 3 && argc != 6) {
+ ti->error = "requires exactly 3 or 6 arguments";
+ return -EINVAL;
+ }
+
+ dc = kmalloc(sizeof(*dc), GFP_KERNEL);
+ if (!dc) {
+ ti->error = "Cannot allocate context";
+ return -ENOMEM;
+ }
+
+ dc->reads = dc->writes = 0;
+
+ if (sscanf(argv[1], "%llu", &tmpll) != 1) {
+ ti->error = "Invalid device sector";
+ goto bad;
+ }
+ dc->start_read = tmpll;
+
+ if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
+ ti->error = "Invalid delay";
+ goto bad;
+ }
+
+ if (dm_get_device(ti, argv[0], dc->start_read, ti->len,
+ dm_table_get_mode(ti->table), &dc->dev_read)) {
+ ti->error = "Device lookup failed";
+ goto bad;
+ }
+
+ if (argc == 3) {
+ dc->dev_write = NULL;
+ goto out;
+ }
+
+ if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+ ti->error = "Invalid write device sector";
+ goto bad;
+ }
+ dc->start_write = tmpll;
+
+ if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
+ ti->error = "Invalid write delay";
+ goto bad;
+ }
+
+ if (dm_get_device(ti, argv[3], dc->start_write, ti->len,
+ dm_table_get_mode(ti->table), &dc->dev_write)) {
+ ti->error = "Write device lookup failed";
+ dm_put_device(ti, dc->dev_read);
+ goto bad;
+ }
+
+out:
+ dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
+ if (!dc->delayed_pool) {
+ DMERR("Couldn't create delayed bio pool.");
+ goto bad;
+ }
+
+ init_timer(&dc->delay_timer);
+ dc->delay_timer.function = handle_delayed_timer;
+ dc->delay_timer.data = (unsigned long)dc;
+
+ INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
+ INIT_LIST_HEAD(&dc->delayed_bios);
+ init_MUTEX(&dc->timer_lock);
+ atomic_set(&dc->may_delay, 1);
+
+ ti->private = dc;
+ return 0;
+
+bad:
+ kfree(dc);
+ return -EINVAL;
+}
+
+static void delay_dtr(struct dm_target *ti)
+{
+ struct delay_c *dc = ti->private;
+
+ flush_workqueue(kdelayd_wq);
+
+ dm_put_device(ti, dc->dev_read);
+
+ if (dc->dev_write)
+ dm_put_device(ti, dc->dev_write);
+
+ mempool_destroy(dc->delayed_pool);
+ kfree(dc);
+}
+
+static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
+{
+ struct delay_info *delayed;
+ unsigned long expires = 0;
+
+ if (!delay || !atomic_read(&dc->may_delay))
+ return 1;
+
+ delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO);
+
+ delayed->context = dc;
+ delayed->bio = bio;
+ delayed->expires = expires = jiffies + (delay * HZ / 1000);
+
+ mutex_lock(&delayed_bios_lock);
+
+ if (bio_data_dir(bio) == WRITE)
+ dc->writes++;
+ else
+ dc->reads++;
+
+ list_add_tail(&delayed->list, &dc->delayed_bios);
+
+ mutex_unlock(&delayed_bios_lock);
+
+ queue_timeout(dc, expires);
+
+ return 0;
+}
+
+static void delay_presuspend(struct dm_target *ti)
+{
+ struct delay_c *dc = ti->private;
+
+ atomic_set(&dc->may_delay, 0);
+ del_timer_sync(&dc->delay_timer);
+ flush_bios(flush_delayed_bios(dc, 1));
+}
+
+static void delay_resume(struct dm_target *ti)
+{
+ struct delay_c *dc = ti->private;
+
+ atomic_set(&dc->may_delay, 1);
+}
+
+static int delay_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ struct delay_c *dc = ti->private;
+
+ if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
+ bio->bi_bdev = dc->dev_write->bdev;
+ bio->bi_sector = dc->start_write +
+ (bio->bi_sector - ti->begin);
+
+ return delay_bio(dc, dc->write_delay, bio);
+ }
+
+ bio->bi_bdev = dc->dev_read->bdev;
+ bio->bi_sector = dc->start_read +
+ (bio->bi_sector - ti->begin);
+
+ return delay_bio(dc, dc->read_delay, bio);
+}
+
+static int delay_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ struct delay_c *dc = ti->private;
+ int sz = 0;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%u %u", dc->reads, dc->writes);
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %llu %u", dc->dev_read->name,
+ (unsigned long long) dc->start_read,
+ dc->read_delay);
+ if (dc->dev_write)
+ DMEMIT("%s %llu %u", dc->dev_write->name,
+ (unsigned long long) dc->start_write,
+ dc->write_delay);
+ break;
+ }
+
+ return 0;
+}
+
+static struct target_type delay_target = {
+ .name = "delay",
+ .version = {1, 0, 2},
+ .module = THIS_MODULE,
+ .ctr = delay_ctr,
+ .dtr = delay_dtr,
+ .map = delay_map,
+ .presuspend = delay_presuspend,
+ .resume = delay_resume,
+ .status = delay_status,
+};
+
+static int __init dm_delay_init(void)
+{
+ int r = -ENOMEM;
+
+ kdelayd_wq = create_workqueue("kdelayd");
+ if (!kdelayd_wq) {
+ DMERR("Couldn't start kdelayd");
+ goto bad_queue;
+ }
+
+ delayed_cache = kmem_cache_create("dm-delay",
+ sizeof(struct delay_info),
+ __alignof__(struct delay_info),
+ 0, NULL, NULL);
+ if (!delayed_cache) {
+ DMERR("Couldn't create delayed bio cache.");
+ goto bad_memcache;
+ }
+
+ r = dm_register_target(&delay_target);
+ if (r < 0) {
+ DMERR("register failed %d", r);
+ goto bad_register;
+ }
+
+ return 0;
+
+bad_register:
+ kmem_cache_destroy(delayed_cache);
+bad_memcache:
+ destroy_workqueue(kdelayd_wq);
+bad_queue:
+ return r;
+}
+
+static void __exit dm_delay_exit(void)
+{
+ int r = dm_unregister_target(&delay_target);
+
+ if (r < 0)
+ DMERR("unregister failed %d", r);
+
+ kmem_cache_destroy(delayed_cache);
+ destroy_workqueue(kdelayd_wq);
+}
+
+/* Module hooks */
+module_init(dm_delay_init);
+module_exit(dm_delay_exit);
+
+MODULE_DESCRIPTION(DM_NAME " delay target");
+MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 99cdffa7fbfe..07e0a0c84f6e 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -1,7 +1,8 @@
/*
- * dm-snapshot.c
+ * dm-exception-store.c
*
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006 Red Hat GmbH
*
* This file is released under the GPL.
*/
@@ -123,6 +124,7 @@ struct pstore {
atomic_t pending_count;
uint32_t callback_count;
struct commit_callback *callbacks;
+ struct dm_io_client *io_client;
};
static inline unsigned int sectors_to_pages(unsigned int sectors)
@@ -159,14 +161,20 @@ static void free_area(struct pstore *ps)
*/
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
- struct io_region where;
- unsigned long bits;
-
- where.bdev = ps->snap->cow->bdev;
- where.sector = ps->snap->chunk_size * chunk;
- where.count = ps->snap->chunk_size;
-
- return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
+ struct io_region where = {
+ .bdev = ps->snap->cow->bdev,
+ .sector = ps->snap->chunk_size * chunk,
+ .count = ps->snap->chunk_size,
+ };
+ struct dm_io_request io_req = {
+ .bi_rw = rw,
+ .mem.type = DM_IO_VMA,
+ .mem.ptr.vma = ps->area,
+ .client = ps->io_client,
+ .notify.fn = NULL,
+ };
+
+ return dm_io(&io_req, 1, &where, NULL);
}
/*
@@ -213,17 +221,18 @@ static int read_header(struct pstore *ps, int *new_snapshot)
chunk_size_supplied = 0;
}
- r = dm_io_get(sectors_to_pages(ps->snap->chunk_size));
- if (r)
- return r;
+ ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
+ chunk_size));
+ if (IS_ERR(ps->io_client))
+ return PTR_ERR(ps->io_client);
r = alloc_area(ps);
if (r)
- goto bad1;
+ return r;
r = chunk_io(ps, 0, READ);
if (r)
- goto bad2;
+ goto bad;
dh = (struct disk_header *) ps->area;
@@ -235,7 +244,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
DMWARN("Invalid or corrupt snapshot");
r = -ENXIO;
- goto bad2;
+ goto bad;
}
*new_snapshot = 0;
@@ -252,27 +261,22 @@ static int read_header(struct pstore *ps, int *new_snapshot)
(unsigned long long)ps->snap->chunk_size);
/* We had a bogus chunk_size. Fix stuff up. */
- dm_io_put(sectors_to_pages(ps->snap->chunk_size));
free_area(ps);
ps->snap->chunk_size = chunk_size;
ps->snap->chunk_mask = chunk_size - 1;
ps->snap->chunk_shift = ffs(chunk_size) - 1;
- r = dm_io_get(sectors_to_pages(chunk_size));
+ r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
+ ps->io_client);
if (r)
return r;
r = alloc_area(ps);
- if (r)
- goto bad1;
-
- return 0;
+ return r;
-bad2:
+bad:
free_area(ps);
-bad1:
- dm_io_put(sectors_to_pages(ps->snap->chunk_size));
return r;
}
@@ -405,7 +409,7 @@ static void persistent_destroy(struct exception_store *store)
{
struct pstore *ps = get_info(store);
- dm_io_put(sectors_to_pages(ps->snap->chunk_size));
+ dm_io_client_destroy(ps->io_client);
vfree(ps->callbacks);
free_area(ps);
kfree(ps);
diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h
index 32eff28e4adc..e0832e6fcf36 100644
--- a/drivers/md/dm-hw-handler.h
+++ b/drivers/md/dm-hw-handler.h
@@ -16,6 +16,7 @@
struct hw_handler_type;
struct hw_handler {
struct hw_handler_type *type;
+ struct mapped_device *md;
void *context;
};
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 8bdc8a87b249..352c6fbeac53 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2003 Sistina Software
+ * Copyright (C) 2006 Red Hat GmbH
*
* This file is released under the GPL.
*/
@@ -12,13 +13,17 @@
#include <linux/sched.h>
#include <linux/slab.h>
-static struct bio_set *_bios;
+struct dm_io_client {
+ mempool_t *pool;
+ struct bio_set *bios;
+};
/* FIXME: can we shrink this ? */
struct io {
unsigned long error;
atomic_t count;
struct task_struct *sleeper;
+ struct dm_io_client *client;
io_notify_fn callback;
void *context;
};
@@ -26,63 +31,58 @@ struct io {
/*
* io contexts are only dynamically allocated for asynchronous
* io. Since async io is likely to be the majority of io we'll
- * have the same number of io contexts as buffer heads ! (FIXME:
- * must reduce this).
+ * have the same number of io contexts as bios! (FIXME: must reduce this).
*/
-static unsigned _num_ios;
-static mempool_t *_io_pool;
static unsigned int pages_to_ios(unsigned int pages)
{
return 4 * pages; /* too many ? */
}
-static int resize_pool(unsigned int new_ios)
+/*
+ * Create a client with mempool and bioset.
+ */
+struct dm_io_client *dm_io_client_create(unsigned num_pages)
{
- int r = 0;
-
- if (_io_pool) {
- if (new_ios == 0) {
- /* free off the pool */
- mempool_destroy(_io_pool);
- _io_pool = NULL;
- bioset_free(_bios);
-
- } else {
- /* resize the pool */
- r = mempool_resize(_io_pool, new_ios, GFP_KERNEL);
- }
+ unsigned ios = pages_to_ios(num_pages);
+ struct dm_io_client *client;
- } else {
- /* create new pool */
- _io_pool = mempool_create_kmalloc_pool(new_ios,
- sizeof(struct io));
- if (!_io_pool)
- return -ENOMEM;
-
- _bios = bioset_create(16, 16);
- if (!_bios) {
- mempool_destroy(_io_pool);
- _io_pool = NULL;
- return -ENOMEM;
- }
- }
+ client = kmalloc(sizeof(*client), GFP_KERNEL);
+ if (!client)
+ return ERR_PTR(-ENOMEM);
+
+ client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
+ if (!client->pool)
+ goto bad;
- if (!r)
- _num_ios = new_ios;
+ client->bios = bioset_create(16, 16);
+ if (!client->bios)
+ goto bad;
- return r;
+ return client;
+
+ bad:
+ if (client->pool)
+ mempool_destroy(client->pool);
+ kfree(client);
+ return ERR_PTR(-ENOMEM);
}
+EXPORT_SYMBOL(dm_io_client_create);
-int dm_io_get(unsigned int num_pages)
+int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
{
- return resize_pool(_num_ios + pages_to_ios(num_pages));
+ return mempool_resize(client->pool, pages_to_ios(num_pages),
+ GFP_KERNEL);
}
+EXPORT_SYMBOL(dm_io_client_resize);
-void dm_io_put(unsigned int num_pages)
+void dm_io_client_destroy(struct dm_io_client *client)
{
- resize_pool(_num_ios - pages_to_ios(num_pages));
+ mempool_destroy(client->pool);
+ bioset_free(client->bios);
+ kfree(client);
}
+EXPORT_SYMBOL(dm_io_client_destroy);
/*-----------------------------------------------------------------
* We need to keep track of which region a bio is doing io for.
@@ -118,7 +118,7 @@ static void dec_count(struct io *io, unsigned int region, int error)
io_notify_fn fn = io->callback;
void *context = io->context;
- mempool_free(io, _io_pool);
+ mempool_free(io, io->client->pool);
fn(r, context);
}
}
@@ -126,7 +126,8 @@ static void dec_count(struct io *io, unsigned int region, int error)
static int endio(struct bio *bio, unsigned int done, int error)
{
- struct io *io = (struct io *) bio->bi_private;
+ struct io *io;
+ unsigned region;
/* keep going until we've finished */
if (bio->bi_size)
@@ -135,10 +136,17 @@ static int endio(struct bio *bio, unsigned int done, int error)
if (error && bio_data_dir(bio) == READ)
zero_fill_bio(bio);
- dec_count(io, bio_get_region(bio), error);
+ /*
+ * The bio destructor in bio_put() may use the io object.
+ */
+ io = bio->bi_private;
+ region = bio_get_region(bio);
+
bio->bi_max_vecs++;
bio_put(bio);
+ dec_count(io, region, error);
+
return 0;
}
@@ -209,6 +217,9 @@ static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
dp->context_ptr = bvec;
}
+/*
+ * Functions for getting the pages from a VMA.
+ */
static void vm_get_page(struct dpages *dp,
struct page **p, unsigned long *len, unsigned *offset)
{
@@ -233,7 +244,34 @@ static void vm_dp_init(struct dpages *dp, void *data)
static void dm_bio_destructor(struct bio *bio)
{
- bio_free(bio, _bios);
+ struct io *io = bio->bi_private;
+
+ bio_free(bio, io->client->bios);
+}
+
+/*
+ * Functions for getting the pages from kernel memory.
+ */
+static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
+ unsigned *offset)
+{
+ *p = virt_to_page(dp->context_ptr);
+ *offset = dp->context_u;
+ *len = PAGE_SIZE - dp->context_u;
+}
+
+static void km_next_page(struct dpages *dp)
+{
+ dp->context_ptr += PAGE_SIZE - dp->context_u;
+ dp->context_u = 0;
+}
+
+static void km_dp_init(struct dpages *dp, void *data)
+{
+ dp->get_page = km_get_page;
+ dp->next_page = km_next_page;
+ dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
+ dp->context_ptr = data;
}
/*-----------------------------------------------------------------
@@ -256,7 +294,7 @@ static void do_region(int rw, unsigned int region, struct io_region *where,
* to hide it from bio_add_page().
*/
num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2;
- bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios);
+ bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
bio->bi_end_io = endio;
@@ -311,8 +349,9 @@ static void dispatch_io(int rw, unsigned int num_regions,
dec_count(io, 0, 0);
}
-static int sync_io(unsigned int num_regions, struct io_region *where,
- int rw, struct dpages *dp, unsigned long *error_bits)
+static int sync_io(struct dm_io_client *client, unsigned int num_regions,
+ struct io_region *where, int rw, struct dpages *dp,
+ unsigned long *error_bits)
{
struct io io;
@@ -324,6 +363,7 @@ static int sync_io(unsigned int num_regions, struct io_region *where,
io.error = 0;
atomic_set(&io.count, 1); /* see dispatch_io() */
io.sleeper = current;
+ io.client = client;
dispatch_io(rw, num_regions, where, dp, &io, 1);
@@ -340,12 +380,15 @@ static int sync_io(unsigned int num_regions, struct io_region *where,
if (atomic_read(&io.count))
return -EINTR;
- *error_bits = io.error;
+ if (error_bits)
+ *error_bits = io.error;
+
return io.error ? -EIO : 0;
}
-static int async_io(unsigned int num_regions, struct io_region *where, int rw,
- struct dpages *dp, io_notify_fn fn, void *context)
+static int async_io(struct dm_io_client *client, unsigned int num_regions,
+ struct io_region *where, int rw, struct dpages *dp,
+ io_notify_fn fn, void *context)
{
struct io *io;
@@ -355,10 +398,11 @@ static int async_io(unsigned int num_regions, struct io_region *where, int rw,
return -EIO;
}
- io = mempool_alloc(_io_pool, GFP_NOIO);
+ io = mempool_alloc(client->pool, GFP_NOIO);
io->error = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->sleeper = NULL;
+ io->client = client;
io->callback = fn;
io->context = context;
@@ -366,61 +410,51 @@ static int async_io(unsigned int num_regions, struct io_region *where, int rw,
return 0;
}
-int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
- struct page_list *pl, unsigned int offset,
- unsigned long *error_bits)
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
{
- struct dpages dp;
- list_dp_init(&dp, pl, offset);
- return sync_io(num_regions, where, rw, &dp, error_bits);
-}
+ /* Set up dpages based on memory type */
+ switch (io_req->mem.type) {
+ case DM_IO_PAGE_LIST:
+ list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
+ break;
+
+ case DM_IO_BVEC:
+ bvec_dp_init(dp, io_req->mem.ptr.bvec);
+ break;
+
+ case DM_IO_VMA:
+ vm_dp_init(dp, io_req->mem.ptr.vma);
+ break;
+
+ case DM_IO_KMEM:
+ km_dp_init(dp, io_req->mem.ptr.addr);
+ break;
+
+ default:
+ return -EINVAL;
+ }
-int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
- struct bio_vec *bvec, unsigned long *error_bits)
-{
- struct dpages dp;
- bvec_dp_init(&dp, bvec);
- return sync_io(num_regions, where, rw, &dp, error_bits);
+ return 0;
}
-int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
- void *data, unsigned long *error_bits)
+/*
+ * New collapsed (a)synchronous interface
+ */
+int dm_io(struct dm_io_request *io_req, unsigned num_regions,
+ struct io_region *where, unsigned long *sync_error_bits)
{
+ int r;
struct dpages dp;
- vm_dp_init(&dp, data);
- return sync_io(num_regions, where, rw, &dp, error_bits);
-}
-int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
- struct page_list *pl, unsigned int offset,
- io_notify_fn fn, void *context)
-{
- struct dpages dp;
- list_dp_init(&dp, pl, offset);
- return async_io(num_regions, where, rw, &dp, fn, context);
-}
+ r = dp_init(io_req, &dp);
+ if (r)
+ return r;
-int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
- struct bio_vec *bvec, io_notify_fn fn, void *context)
-{
- struct dpages dp;
- bvec_dp_init(&dp, bvec);
- return async_io(num_regions, where, rw, &dp, fn, context);
-}
+ if (!io_req->notify.fn)
+ return sync_io(io_req->client, num_regions, where,
+ io_req->bi_rw, &dp, sync_error_bits);
-int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
- void *data, io_notify_fn fn, void *context)
-{
- struct dpages dp;
- vm_dp_init(&dp, data);
- return async_io(num_regions, where, rw, &dp, fn, context);
+ return async_io(io_req->client, num_regions, where, io_req->bi_rw,
+ &dp, io_req->notify.fn, io_req->notify.context);
}
-
-EXPORT_SYMBOL(dm_io_get);
-EXPORT_SYMBOL(dm_io_put);
-EXPORT_SYMBOL(dm_io_sync);
-EXPORT_SYMBOL(dm_io_async);
-EXPORT_SYMBOL(dm_io_sync_bvec);
-EXPORT_SYMBOL(dm_io_async_bvec);
-EXPORT_SYMBOL(dm_io_sync_vm);
-EXPORT_SYMBOL(dm_io_async_vm);
+EXPORT_SYMBOL(dm_io);
diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h
index f9035bfd1a9f..f647e2cceaa6 100644
--- a/drivers/md/dm-io.h
+++ b/drivers/md/dm-io.h
@@ -12,7 +12,7 @@
struct io_region {
struct block_device *bdev;
sector_t sector;
- sector_t count;
+ sector_t count; /* If this is zero the region is ignored. */
};
struct page_list {
@@ -20,55 +20,60 @@ struct page_list {
struct page *page;
};
-
-/*
- * 'error' is a bitset, with each bit indicating whether an error
- * occurred doing io to the corresponding region.
- */
typedef void (*io_notify_fn)(unsigned long error, void *context);
+enum dm_io_mem_type {
+ DM_IO_PAGE_LIST,/* Page list */
+ DM_IO_BVEC, /* Bio vector */
+ DM_IO_VMA, /* Virtual memory area */
+ DM_IO_KMEM, /* Kernel memory */
+};
+
+struct dm_io_memory {
+ enum dm_io_mem_type type;
+
+ union {
+ struct page_list *pl;
+ struct bio_vec *bvec;
+ void *vma;
+ void *addr;
+ } ptr;
+
+ unsigned offset;
+};
+
+struct dm_io_notify {
+ io_notify_fn fn; /* Callback for asynchronous requests */
+ void *context; /* Passed to callback */
+};
/*
- * Before anyone uses the IO interface they should call
- * dm_io_get(), specifying roughly how many pages they are
- * expecting to perform io on concurrently.
- *
- * This function may block.
+ * IO request structure
*/
-int dm_io_get(unsigned int num_pages);
-void dm_io_put(unsigned int num_pages);
+struct dm_io_client;
+struct dm_io_request {
+ int bi_rw; /* READ|WRITE - not READA */
+ struct dm_io_memory mem; /* Memory to use for io */
+ struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */
+ struct dm_io_client *client; /* Client memory handler */
+};
/*
- * Synchronous IO.
+ * For async io calls, users can alternatively use the dm_io() function below
+ * and dm_io_client_create() to create private mempools for the client.
*
- * Please ensure that the rw flag in the next two functions is
- * either READ or WRITE, ie. we don't take READA. Any
- * regions with a zero count field will be ignored.
+ * Create/destroy may block.
*/
-int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
- struct page_list *pl, unsigned int offset,
- unsigned long *error_bits);
-
-int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
- struct bio_vec *bvec, unsigned long *error_bits);
-
-int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
- void *data, unsigned long *error_bits);
+struct dm_io_client *dm_io_client_create(unsigned num_pages);
+int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client);
+void dm_io_client_destroy(struct dm_io_client *client);
/*
- * Aynchronous IO.
- *
- * The 'where' array may be safely allocated on the stack since
- * the function takes a copy.
+ * IO interface using private per-client pools.
+ * Each bit in the optional 'sync_error_bits' bitset indicates whether an
+ * error occurred doing io to the corresponding region.
*/
-int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
- struct page_list *pl, unsigned int offset,
- io_notify_fn fn, void *context);
-
-int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
- struct bio_vec *bvec, io_notify_fn fn, void *context);
-
-int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
- void *data, io_notify_fn fn, void *context);
+int dm_io(struct dm_io_request *io_req, unsigned num_regions,
+ struct io_region *region, unsigned long *sync_error_bits);
#endif
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6a9261351848..a66428d860fe 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -149,9 +149,12 @@ struct log_c {
FORCESYNC, /* Force a sync to happen */
} sync;
+ struct dm_io_request io_req;
+
/*
* Disk log fields
*/
+ int log_dev_failed;
struct dm_dev *log_dev;
struct log_header header;
@@ -199,13 +202,20 @@ static void header_from_disk(struct log_header *core, struct log_header *disk)
core->nr_regions = le64_to_cpu(disk->nr_regions);
}
+static int rw_header(struct log_c *lc, int rw)
+{
+ lc->io_req.bi_rw = rw;
+ lc->io_req.mem.ptr.vma = lc->disk_header;
+ lc->io_req.notify.fn = NULL;
+
+ return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
+}
+
static int read_header(struct log_c *log)
{
int r;
- unsigned long ebits;
- r = dm_io_sync_vm(1, &log->header_location, READ,
- log->disk_header, &ebits);
+ r = rw_header(log, READ);
if (r)
return r;
@@ -233,11 +243,8 @@ static int read_header(struct log_c *log)
static inline int write_header(struct log_c *log)
{
- unsigned long ebits;
-
header_to_disk(&log->header, log->disk_header);
- return dm_io_sync_vm(1, &log->header_location, WRITE,
- log->disk_header, &ebits);
+ return rw_header(log, WRITE);
}
/*----------------------------------------------------------------
@@ -256,6 +263,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti,
uint32_t region_size;
unsigned int region_count;
size_t bitset_size, buf_size;
+ int r;
if (argc < 1 || argc > 2) {
DMWARN("wrong number of arguments to mirror log");
@@ -315,6 +323,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti,
lc->disk_header = NULL;
} else {
lc->log_dev = dev;
+ lc->log_dev_failed = 0;
lc->header_location.bdev = lc->log_dev->bdev;
lc->header_location.sector = 0;
@@ -324,6 +333,15 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti,
buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
bitset_size, ti->limits.hardsect_size);
lc->header_location.count = buf_size >> SECTOR_SHIFT;
+ lc->io_req.mem.type = DM_IO_VMA;
+ lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
+ PAGE_SIZE));
+ if (IS_ERR(lc->io_req.client)) {
+ r = PTR_ERR(lc->io_req.client);
+ DMWARN("couldn't allocate disk io client");
+ kfree(lc);
+ return -ENOMEM;
+ }
lc->disk_header = vmalloc(buf_size);
if (!lc->disk_header) {
@@ -424,6 +442,7 @@ static void disk_dtr(struct dirty_log *log)
dm_put_device(lc->ti, lc->log_dev);
vfree(lc->disk_header);
+ dm_io_client_destroy(lc->io_req.client);
destroy_log_context(lc);
}
@@ -437,6 +456,15 @@ static int count_bits32(uint32_t *addr, unsigned size)
return count;
}
+static void fail_log_device(struct log_c *lc)
+{
+ if (lc->log_dev_failed)
+ return;
+
+ lc->log_dev_failed = 1;
+ dm_table_event(lc->ti->table);
+}
+
static int disk_resume(struct dirty_log *log)
{
int r;
@@ -446,8 +474,19 @@ static int disk_resume(struct dirty_log *log)
/* read the disk header */
r = read_header(lc);
- if (r)
- return r;
+ if (r) {
+ DMWARN("%s: Failed to read header on mirror log device",
+ lc->log_dev->name);
+ fail_log_device(lc);
+ /*
+ * If the log device cannot be read, we must assume
+ * all regions are out-of-sync. If we simply return
+ * here, the state will be uninitialized and could
+ * lead us to return 'in-sync' status for regions
+ * that are actually 'out-of-sync'.
+ */
+ lc->header.nr_regions = 0;
+ }
/* set or clear any new bits -- device has grown */
if (lc->sync == NOSYNC)
@@ -472,7 +511,14 @@ static int disk_resume(struct dirty_log *log)
lc->header.nr_regions = lc->region_count;
/* write the new header */
- return write_header(lc);
+ r = write_header(lc);
+ if (r) {
+ DMWARN("%s: Failed to write header on mirror log device",
+ lc->log_dev->name);
+ fail_log_device(lc);
+ }
+
+ return r;
}
static uint32_t core_get_region_size(struct dirty_log *log)
@@ -516,7 +562,9 @@ static int disk_flush(struct dirty_log *log)
return 0;
r = write_header(lc);
- if (!r)
+ if (r)
+ fail_log_device(lc);
+ else
lc->touched = 0;
return r;
@@ -591,6 +639,7 @@ static int core_status(struct dirty_log *log, status_type_t status,
switch(status) {
case STATUSTYPE_INFO:
+ DMEMIT("1 %s", log->type->name);
break;
case STATUSTYPE_TABLE:
@@ -606,17 +655,17 @@ static int disk_status(struct dirty_log *log, status_type_t status,
char *result, unsigned int maxlen)
{
int sz = 0;
- char buffer[16];
struct log_c *lc = log->context;
switch(status) {
case STATUSTYPE_INFO:
+ DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
+ lc->log_dev_failed ? 'D' : 'A');
break;
case STATUSTYPE_TABLE:
- format_dev_t(buffer, lc->log_dev->bdev->bd_dev);
DMEMIT("%s %u %s %u ", log->type->name,
- lc->sync == DEFAULTSYNC ? 2 : 3, buffer,
+ lc->sync == DEFAULTSYNC ? 2 : 3, lc->log_dev->name,
lc->region_size);
DMEMIT_SYNC;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 3aa013506967..de54b39e6ffe 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -668,6 +668,9 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
return -EINVAL;
}
+ m->hw_handler.md = dm_table_get_md(ti->table);
+ dm_put(m->hw_handler.md);
+
r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
if (r) {
dm_put_hw_handler(hwht);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 23a642619bed..ef124b71ccc8 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -21,15 +21,11 @@
#include <linux/workqueue.h>
#define DM_MSG_PREFIX "raid1"
+#define DM_IO_PAGES 64
-static struct workqueue_struct *_kmirrord_wq;
-static struct work_struct _kmirrord_work;
-static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
+#define DM_RAID1_HANDLE_ERRORS 0x01
-static inline void wake(void)
-{
- queue_work(_kmirrord_wq, &_kmirrord_work);
-}
+static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
/*-----------------------------------------------------------------
* Region hash
@@ -125,17 +121,23 @@ struct mirror_set {
struct list_head list;
struct region_hash rh;
struct kcopyd_client *kcopyd_client;
+ uint64_t features;
spinlock_t lock; /* protects the next two lists */
struct bio_list reads;
struct bio_list writes;
+ struct dm_io_client *io_client;
+
/* recovery */
region_t nr_regions;
int in_sync;
struct mirror *default_mirror; /* Default mirror */
+ struct workqueue_struct *kmirrord_wq;
+ struct work_struct kmirrord_work;
+
unsigned int nr_mirrors;
struct mirror mirror[0];
};
@@ -153,6 +155,11 @@ static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
return region << rh->region_shift;
}
+static void wake(struct mirror_set *ms)
+{
+ queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
+}
+
/* FIXME move this */
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
@@ -398,8 +405,7 @@ static void rh_update_states(struct region_hash *rh)
mempool_free(reg, rh->region_pool);
}
- if (!list_empty(&recovered))
- rh->log->type->flush(rh->log);
+ rh->log->type->flush(rh->log);
list_for_each_entry_safe (reg, next, &clean, list)
mempool_free(reg, rh->region_pool);
@@ -471,7 +477,7 @@ static void rh_dec(struct region_hash *rh, region_t region)
spin_unlock_irqrestore(&rh->region_lock, flags);
if (should_wake)
- wake();
+ wake(rh->ms);
}
/*
@@ -558,7 +564,7 @@ static void rh_recovery_end(struct region *reg, int success)
list_add(&reg->list, &reg->rh->recovered_regions);
spin_unlock_irq(&rh->region_lock);
- wake();
+ wake(rh->ms);
}
static void rh_flush(struct region_hash *rh)
@@ -592,7 +598,7 @@ static void rh_start_recovery(struct region_hash *rh)
for (i = 0; i < MAX_RECOVERY; i++)
up(&rh->recovery_count);
- wake();
+ wake(rh->ms);
}
/*
@@ -735,7 +741,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
/*
* We can only read balance if the region is in sync.
*/
- if (rh_in_sync(&ms->rh, region, 0))
+ if (rh_in_sync(&ms->rh, region, 1))
m = choose_mirror(ms, bio->bi_sector);
else
m = ms->default_mirror;
@@ -792,6 +798,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
unsigned int i;
struct io_region io[KCOPYD_MAX_REGIONS+1];
struct mirror *m;
+ struct dm_io_request io_req = {
+ .bi_rw = WRITE,
+ .mem.type = DM_IO_BVEC,
+ .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+ .notify.fn = write_callback,
+ .notify.context = bio,
+ .client = ms->io_client,
+ };
for (i = 0; i < ms->nr_mirrors; i++) {
m = ms->mirror + i;
@@ -802,9 +816,8 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
}
bio_set_ms(bio, ms);
- dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
- bio->bi_io_vec + bio->bi_idx,
- write_callback, bio);
+
+ (void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
}
static void do_writes(struct mirror_set *ms, struct bio_list *writes)
@@ -870,11 +883,10 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
/*-----------------------------------------------------------------
* kmirrord
*---------------------------------------------------------------*/
-static LIST_HEAD(_mirror_sets);
-static DECLARE_RWSEM(_mirror_sets_lock);
-
-static void do_mirror(struct mirror_set *ms)
+static void do_mirror(struct work_struct *work)
{
+ struct mirror_set *ms =container_of(work, struct mirror_set,
+ kmirrord_work);
struct bio_list reads, writes;
spin_lock(&ms->lock);
@@ -890,16 +902,6 @@ static void do_mirror(struct mirror_set *ms)
do_writes(ms, &writes);
}
-static void do_work(struct work_struct *ignored)
-{
- struct mirror_set *ms;
-
- down_read(&_mirror_sets_lock);
- list_for_each_entry (ms, &_mirror_sets, list)
- do_mirror(ms);
- up_read(&_mirror_sets_lock);
-}
-
/*-----------------------------------------------------------------
* Target functions
*---------------------------------------------------------------*/
@@ -931,6 +933,13 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
ms->in_sync = 0;
ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
+ ms->io_client = dm_io_client_create(DM_IO_PAGES);
+ if (IS_ERR(ms->io_client)) {
+ ti->error = "Error creating dm_io client";
+ kfree(ms);
+ return NULL;
+ }
+
if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
ti->error = "Error creating dirty region hash";
kfree(ms);
@@ -946,6 +955,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
while (m--)
dm_put_device(ti, ms->mirror[m].dev);
+ dm_io_client_destroy(ms->io_client);
rh_exit(&ms->rh);
kfree(ms);
}
@@ -978,23 +988,6 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
return 0;
}
-static int add_mirror_set(struct mirror_set *ms)
-{
- down_write(&_mirror_sets_lock);
- list_add_tail(&ms->list, &_mirror_sets);
- up_write(&_mirror_sets_lock);
- wake();
-
- return 0;
-}
-
-static void del_mirror_set(struct mirror_set *ms)
-{
- down_write(&_mirror_sets_lock);
- list_del(&ms->list);
- up_write(&_mirror_sets_lock);
-}
-
/*
* Create dirty log: log_type #log_params <log_params>
*/
@@ -1037,16 +1030,55 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti,
return dl;
}
+static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
+ unsigned *args_used)
+{
+ unsigned num_features;
+ struct dm_target *ti = ms->ti;
+
+ *args_used = 0;
+
+ if (!argc)
+ return 0;
+
+ if (sscanf(argv[0], "%u", &num_features) != 1) {
+ ti->error = "Invalid number of features";
+ return -EINVAL;
+ }
+
+ argc--;
+ argv++;
+ (*args_used)++;
+
+ if (num_features > argc) {
+ ti->error = "Not enough arguments to support feature count";
+ return -EINVAL;
+ }
+
+ if (!strcmp("handle_errors", argv[0]))
+ ms->features |= DM_RAID1_HANDLE_ERRORS;
+ else {
+ ti->error = "Unrecognised feature requested";
+ return -EINVAL;
+ }
+
+ (*args_used)++;
+
+ return 0;
+}
+
/*
* Construct a mirror mapping:
*
* log_type #log_params <log_params>
* #mirrors [mirror_path offset]{2,}
+ * [#features <features>]
*
* log_type is "core" or "disk"
* #log_params is between 1 and 3
+ *
+ * If present, features must be "handle_errors".
*/
-#define DM_IO_PAGES 64
static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
int r;
@@ -1070,8 +1102,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
argv++, argc--;
- if (argc != nr_mirrors * 2) {
- ti->error = "Wrong number of mirror arguments";
+ if (argc < nr_mirrors * 2) {
+ ti->error = "Too few mirror arguments";
dm_destroy_dirty_log(dl);
return -EINVAL;
}
@@ -1096,13 +1128,37 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = ms;
ti->split_io = ms->rh.region_size;
+ ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
+ if (!ms->kmirrord_wq) {
+ DMERR("couldn't start kmirrord");
+ free_context(ms, ti, m);
+ return -ENOMEM;
+ }
+ INIT_WORK(&ms->kmirrord_work, do_mirror);
+
+ r = parse_features(ms, argc, argv, &args_used);
+ if (r) {
+ free_context(ms, ti, ms->nr_mirrors);
+ return r;
+ }
+
+ argv += args_used;
+ argc -= args_used;
+
+ if (argc) {
+ ti->error = "Too many mirror arguments";
+ free_context(ms, ti, ms->nr_mirrors);
+ return -EINVAL;
+ }
+
r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
if (r) {
+ destroy_workqueue(ms->kmirrord_wq);
free_context(ms, ti, ms->nr_mirrors);
return r;
}
- add_mirror_set(ms);
+ wake(ms);
return 0;
}
@@ -1110,8 +1166,9 @@ static void mirror_dtr(struct dm_target *ti)
{
struct mirror_set *ms = (struct mirror_set *) ti->private;
- del_mirror_set(ms);
+ flush_workqueue(ms->kmirrord_wq);
kcopyd_client_destroy(ms->kcopyd_client);
+ destroy_workqueue(ms->kmirrord_wq);
free_context(ms, ti, ms->nr_mirrors);
}
@@ -1127,7 +1184,7 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
spin_unlock(&ms->lock);
if (should_wake)
- wake();
+ wake(ms);
}
/*
@@ -1222,11 +1279,9 @@ static void mirror_resume(struct dm_target *ti)
static int mirror_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen)
{
- unsigned int m, sz;
+ unsigned int m, sz = 0;
struct mirror_set *ms = (struct mirror_set *) ti->private;
- sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
-
switch (type) {
case STATUSTYPE_INFO:
DMEMIT("%d ", ms->nr_mirrors);
@@ -1237,13 +1292,21 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
(unsigned long long)ms->rh.log->type->
get_sync_count(ms->rh.log),
(unsigned long long)ms->nr_regions);
+
+ sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
+
break;
case STATUSTYPE_TABLE:
+ sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
+
DMEMIT("%d", ms->nr_mirrors);
for (m = 0; m < ms->nr_mirrors; m++)
DMEMIT(" %s %llu", ms->mirror[m].dev->name,
(unsigned long long)ms->mirror[m].offset);
+
+ if (ms->features & DM_RAID1_HANDLE_ERRORS)
+ DMEMIT(" 1 handle_errors");
}
return 0;
@@ -1251,7 +1314,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 0, 2},
+ .version = {1, 0, 3},
.module = THIS_MODULE,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
@@ -1270,20 +1333,11 @@ static int __init dm_mirror_init(void)
if (r)
return r;
- _kmirrord_wq = create_singlethread_workqueue("kmirrord");
- if (!_kmirrord_wq) {
- DMERR("couldn't start kmirrord");
- dm_dirty_log_exit();
- return r;
- }
- INIT_WORK(&_kmirrord_work, do_work);
-
r = dm_register_target(&mirror_target);
if (r < 0) {
DMERR("%s: Failed to register mirror target",
mirror_target.name);
dm_dirty_log_exit();
- destroy_workqueue(_kmirrord_wq);
}
return r;
@@ -1297,7 +1351,6 @@ static void __exit dm_mirror_exit(void)
if (r < 0)
DMERR("%s: unregister failed %d", mirror_target.name, r);
- destroy_workqueue(_kmirrord_wq);
dm_dirty_log_exit();
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 05befa91807a..2fc199b0016b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -425,13 +425,15 @@ static void close_dev(struct dm_dev *d, struct mapped_device *md)
}
/*
- * If possible (ie. blk_size[major] is set), this checks an area
- * of a destination device is valid.
+ * If possible, this checks an area of a destination device is valid.
*/
static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len)
{
- sector_t dev_size;
- dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT;
+ sector_t dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT;
+
+ if (!dev_size)
+ return 1;
+
return ((start < dev_size) && (len <= (dev_size - start)));
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 11a98df298ec..2717a355dc5b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1236,6 +1236,7 @@ void dm_put(struct mapped_device *md)
free_dev(md);
}
}
+EXPORT_SYMBOL_GPL(dm_put);
/*
* Process the deferred bios
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index b46f6c575f7e..dbc234e3c69f 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006 Red Hat GmbH
*
* This file is released under the GPL.
*
@@ -45,6 +46,8 @@ struct kcopyd_client {
unsigned int nr_pages;
unsigned int nr_free_pages;
+ struct dm_io_client *io_client;
+
wait_queue_head_t destroyq;
atomic_t nr_jobs;
};
@@ -342,16 +345,20 @@ static void complete_io(unsigned long error, void *context)
static int run_io_job(struct kcopyd_job *job)
{
int r;
+ struct dm_io_request io_req = {
+ .bi_rw = job->rw,
+ .mem.type = DM_IO_PAGE_LIST,
+ .mem.ptr.pl = job->pages,
+ .mem.offset = job->offset,
+ .notify.fn = complete_io,
+ .notify.context = job,
+ .client = job->kc->io_client,
+ };
if (job->rw == READ)
- r = dm_io_async(1, &job->source, job->rw,
- job->pages,
- job->offset, complete_io, job);
-
+ r = dm_io(&io_req, 1, &job->source, NULL);
else
- r = dm_io_async(job->num_dests, job->dests, job->rw,
- job->pages,
- job->offset, complete_io, job);
+ r = dm_io(&io_req, job->num_dests, job->dests, NULL);
return r;
}
@@ -670,8 +677,9 @@ int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
return r;
}
- r = dm_io_get(nr_pages);
- if (r) {
+ kc->io_client = dm_io_client_create(nr_pages);
+ if (IS_ERR(kc->io_client)) {
+ r = PTR_ERR(kc->io_client);
client_free_pages(kc);
kfree(kc);
kcopyd_exit();
@@ -691,7 +699,7 @@ void kcopyd_client_destroy(struct kcopyd_client *kc)
/* Wait for completion of all jobs submitted by this client. */
wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
- dm_io_put(kc->nr_pages);
+ dm_io_client_destroy(kc->io_client);
client_free_pages(kc);
client_del(kc);
kfree(kc);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 509171ca7fa8..c10ce91b64e9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,6 +33,7 @@
*/
#include <linux/module.h>
+#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
@@ -273,6 +274,7 @@ static mddev_t * mddev_find(dev_t unit)
atomic_set(&new->active, 1);
spin_lock_init(&new->write_lock);
init_waitqueue_head(&new->sb_wait);
+ new->reshape_position = MaxSector;
new->queue = blk_alloc_queue(GFP_KERNEL);
if (!new->queue) {
@@ -589,14 +591,41 @@ abort:
return ret;
}
+
+static u32 md_csum_fold(u32 csum)
+{
+ csum = (csum & 0xffff) + (csum >> 16);
+ return (csum & 0xffff) + (csum >> 16);
+}
+
static unsigned int calc_sb_csum(mdp_super_t * sb)
{
+ u64 newcsum = 0;
+ u32 *sb32 = (u32*)sb;
+ int i;
unsigned int disk_csum, csum;
disk_csum = sb->sb_csum;
sb->sb_csum = 0;
- csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+
+ for (i = 0; i < MD_SB_BYTES/4 ; i++)
+ newcsum += sb32[i];
+ csum = (newcsum & 0xffffffff) + (newcsum>>32);
+
+
+#ifdef CONFIG_ALPHA
+ /* This used to use csum_partial, which was wrong for several
+ * reasons including that different results are returned on
+ * different architectures. It isn't critical that we get exactly
+ * the same return value as before (we always csum_fold before
+ * testing, and that removes any differences). However as we
+ * know that csum_partial always returned a 16bit value on
+ * alphas, do a fold to maximise conformity to previous behaviour.
+ */
+ sb->sb_csum = md_csum_fold(disk_csum);
+#else
sb->sb_csum = disk_csum;
+#endif
return csum;
}
@@ -684,7 +713,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
if (sb->raid_disks <= 0)
goto abort;
- if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
+ if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
b);
goto abort;
@@ -694,6 +723,17 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
rdev->data_offset = 0;
rdev->sb_size = MD_SB_BYTES;
+ if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
+ if (sb->level != 1 && sb->level != 4
+ && sb->level != 5 && sb->level != 6
+ && sb->level != 10) {
+ /* FIXME use a better test */
+ printk(KERN_WARNING
+ "md: bitmaps not supported for this level.\n");
+ goto abort;
+ }
+ }
+
if (sb->level == LEVEL_MULTIPATH)
rdev->desc_nr = -1;
else
@@ -792,16 +832,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mddev->max_disks = MD_SB_DISKS;
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
- mddev->bitmap_file == NULL) {
- if (mddev->level != 1 && mddev->level != 4
- && mddev->level != 5 && mddev->level != 6
- && mddev->level != 10) {
- /* FIXME use a better test */
- printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
- return -EINVAL;
- }
+ mddev->bitmap_file == NULL)
mddev->bitmap_offset = mddev->default_bitmap_offset;
- }
} else if (mddev->pers == NULL) {
/* Insist on good event counter while assembling */
@@ -1058,6 +1090,18 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
bdevname(rdev->bdev,b));
return -EINVAL;
}
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
+ if (sb->level != cpu_to_le32(1) &&
+ sb->level != cpu_to_le32(4) &&
+ sb->level != cpu_to_le32(5) &&
+ sb->level != cpu_to_le32(6) &&
+ sb->level != cpu_to_le32(10)) {
+ printk(KERN_WARNING
+ "md: bitmaps not supported for this level.\n");
+ return -EINVAL;
+ }
+ }
+
rdev->preferred_minor = 0xffff;
rdev->data_offset = le64_to_cpu(sb->data_offset);
atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
@@ -1141,14 +1185,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mddev->max_disks = (4096-256)/2;
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
- mddev->bitmap_file == NULL ) {
- if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
- && mddev->level != 10) {
- printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
- return -EINVAL;
- }
+ mddev->bitmap_file == NULL )
mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
- }
+
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
mddev->reshape_position = le64_to_cpu(sb->reshape_position);
mddev->delta_disks = le32_to_cpu(sb->delta_disks);
@@ -2204,6 +2243,10 @@ static ssize_t
layout_show(mddev_t *mddev, char *page)
{
/* just a number, not meaningful for all levels */
+ if (mddev->reshape_position != MaxSector &&
+ mddev->layout != mddev->new_layout)
+ return sprintf(page, "%d (%d)\n",
+ mddev->new_layout, mddev->layout);
return sprintf(page, "%d\n", mddev->layout);
}
@@ -2212,13 +2255,16 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
{
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
- if (mddev->pers)
- return -EBUSY;
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
- mddev->layout = n;
+ if (mddev->pers)
+ return -EBUSY;
+ if (mddev->reshape_position != MaxSector)
+ mddev->new_layout = n;
+ else
+ mddev->layout = n;
return len;
}
static struct md_sysfs_entry md_layout =
@@ -2230,6 +2276,10 @@ raid_disks_show(mddev_t *mddev, char *page)
{
if (mddev->raid_disks == 0)
return 0;
+ if (mddev->reshape_position != MaxSector &&
+ mddev->delta_disks != 0)
+ return sprintf(page, "%d (%d)\n", mddev->raid_disks,
+ mddev->raid_disks - mddev->delta_disks);
return sprintf(page, "%d\n", mddev->raid_disks);
}
@@ -2247,7 +2297,11 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
if (mddev->pers)
rv = update_raid_disks(mddev, n);
- else
+ else if (mddev->reshape_position != MaxSector) {
+ int olddisks = mddev->raid_disks - mddev->delta_disks;
+ mddev->delta_disks = n - olddisks;
+ mddev->raid_disks = n;
+ } else
mddev->raid_disks = n;
return rv ? rv : len;
}
@@ -2257,6 +2311,10 @@ __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
static ssize_t
chunk_size_show(mddev_t *mddev, char *page)
{
+ if (mddev->reshape_position != MaxSector &&
+ mddev->chunk_size != mddev->new_chunk)
+ return sprintf(page, "%d (%d)\n", mddev->new_chunk,
+ mddev->chunk_size);
return sprintf(page, "%d\n", mddev->chunk_size);
}
@@ -2267,12 +2325,15 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
- if (mddev->pers)
- return -EBUSY;
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
- mddev->chunk_size = n;
+ if (mddev->pers)
+ return -EBUSY;
+ else if (mddev->reshape_position != MaxSector)
+ mddev->new_chunk = n;
+ else
+ mddev->chunk_size = n;
return len;
}
static struct md_sysfs_entry md_chunk_size =
@@ -2637,8 +2698,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
minor = simple_strtoul(buf, &e, 10);
if (e==buf || (*e && *e != '\n') )
return -EINVAL;
- if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
- super_types[major].name == NULL)
+ if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
return -ENOENT;
mddev->major_version = major;
mddev->minor_version = minor;
@@ -2859,6 +2919,37 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
+static ssize_t
+reshape_position_show(mddev_t *mddev, char *page)
+{
+ if (mddev->reshape_position != MaxSector)
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->reshape_position);
+ strcpy(page, "none\n");
+ return 5;
+}
+
+static ssize_t
+reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long new = simple_strtoull(buf, &e, 10);
+ if (mddev->pers)
+ return -EBUSY;
+ if (buf == e || (*e && *e != '\n'))
+ return -EINVAL;
+ mddev->reshape_position = new;
+ mddev->delta_disks = 0;
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk = mddev->chunk_size;
+ return len;
+}
+
+static struct md_sysfs_entry md_reshape_position =
+__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
+ reshape_position_store);
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
@@ -2871,6 +2962,7 @@ static struct attribute *md_default_attrs[] = {
&md_new_device.attr,
&md_safe_delay.attr,
&md_array_state.attr,
+ &md_reshape_position.attr,
NULL,
};
@@ -3080,7 +3172,7 @@ static int do_md_run(mddev_t * mddev)
if (test_bit(Faulty, &rdev->flags))
continue;
sync_blockdev(rdev->bdev);
- invalidate_bdev(rdev->bdev, 0);
+ invalidate_bdev(rdev->bdev);
}
md_probe(mddev->unit, NULL, NULL);
@@ -3409,6 +3501,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
mddev->size = 0;
mddev->raid_disks = 0;
mddev->recovery_cp = 0;
+ mddev->reshape_position = MaxSector;
} else if (mddev->pers)
printk(KERN_INFO "md: %s switched to read-only mode.\n",
@@ -4019,7 +4112,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
if (info->raid_disks == 0) {
/* just setting version number for superblock loading */
if (info->major_version < 0 ||
- info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+ info->major_version >= ARRAY_SIZE(super_types) ||
super_types[info->major_version].name == NULL) {
/* maybe try to auto-load a module? */
printk(KERN_INFO
@@ -4941,15 +5034,6 @@ static int md_seq_open(struct inode *inode, struct file *file)
return error;
}
-static int md_seq_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = file->private_data;
- struct mdstat_info *mi = m->private;
- m->private = NULL;
- kfree(mi);
- return seq_release(inode, file);
-}
-
static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
{
struct seq_file *m = filp->private_data;
@@ -4971,7 +5055,7 @@ static const struct file_operations md_seq_fops = {
.open = md_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = md_seq_release,
+ .release = seq_release_private,
.poll = mdstat_poll,
};
@@ -5019,7 +5103,7 @@ static int is_mddev_idle(mddev_t *mddev)
*
* Note: the following is an unsigned comparison.
*/
- if ((curr_events - rdev->last_events + 4096) > 8192) {
+ if ((long)curr_events - (long)rdev->last_events > 4096) {
rdev->last_events = curr_events;
idle = 0;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 97ee870b265d..3a95cc5e029c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -271,21 +271,25 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
*/
update_head_pos(mirror, r1_bio);
- if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) {
- /*
- * Set R1BIO_Uptodate in our master bio, so that
- * we will return a good error code for to the higher
- * levels even if IO on some other mirrored buffer fails.
- *
- * The 'master' represents the composite IO operation to
- * user-side. So if something waits for IO, then it will
- * wait for the 'master' bio.
+ if (uptodate)
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ else {
+ /* If all other devices have failed, we want to return
+ * the error upwards rather than fail the last device.
+ * Here we redefine "uptodate" to mean "Don't want to retry"
*/
- if (uptodate)
- set_bit(R1BIO_Uptodate, &r1_bio->state);
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (r1_bio->mddev->degraded == conf->raid_disks ||
+ (r1_bio->mddev->degraded == conf->raid_disks-1 &&
+ !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+ uptodate = 1;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+ if (uptodate)
raid_end_bio_io(r1_bio);
- } else {
+ else {
/*
* oops, read error:
*/
@@ -992,13 +996,14 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
+ set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_ERR, &mddev->recovery);
- }
- set_bit(Faulty, &rdev->flags);
+ } else
+ set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
" Operation continuing on %d devices\n",
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8d59914f2057..061375ee6592 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -353,8 +353,8 @@ static int grow_stripes(raid5_conf_t *conf, int num)
struct kmem_cache *sc;
int devs = conf->raid_disks;
- sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
- sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
+ sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
+ sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
conf->active_name = 0;
sc = kmem_cache_create(conf->cache_name[conf->active_name],
sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),