diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 665 |
1 files changed, 553 insertions, 112 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index f19b874753a9..2fe32c261922 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,17 +33,16 @@ */ #include <linux/module.h> -#include <linux/config.h> #include <linux/kthread.h> #include <linux/linkage.h> #include <linux/raid/md.h> #include <linux/raid/bitmap.h> #include <linux/sysctl.h> -#include <linux/devfs_fs_kernel.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ #include <linux/suspend.h> #include <linux/poll.h> #include <linux/mutex.h> +#include <linux/ctype.h> #include <linux/init.h> @@ -72,6 +71,10 @@ static void autostart_arrays (int part); static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); +static void md_print_devices(void); + +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } + /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. @@ -170,7 +173,7 @@ EXPORT_SYMBOL_GPL(md_new_event); /* Alternate version that can be called from interrupts * when calling sysfs_notify isn't needed. */ -void md_new_event_inintr(mddev_t *mddev) +static void md_new_event_inintr(mddev_t *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); @@ -732,6 +735,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) { mdp_disk_t *desc; mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + __u64 ev1 = md_event(sb); rdev->raid_disk = -1; rdev->flags = 0; @@ -748,7 +752,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; mddev->size = sb->size; - mddev->events = md_event(sb); + mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = MD_SB_BYTES >> 9; @@ -797,7 +801,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling */ - __u64 ev1 = md_event(sb); ++ev1; if (ev1 < mddev->events) return -EINVAL; @@ -805,19 +808,21 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. */ - __u64 ev1 = md_event(sb); if (ev1 < mddev->bitmap->events_cleared) return 0; - } else /* just a hot-add of a new device, leave raid_disk at -1 */ - return 0; + } else { + if (ev1 < mddev->events) + /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + } if (mddev->level != LEVEL_MULTIPATH) { desc = sb->disks + rdev->desc_nr; if (desc->state & (1<<MD_DISK_FAULTY)) set_bit(Faulty, &rdev->flags); - else if (desc->state & (1<<MD_DISK_SYNC) && - desc->raid_disk < mddev->raid_disks) { + else if (desc->state & (1<<MD_DISK_SYNC) /* && + desc->raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; } @@ -1100,6 +1105,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) { struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; rdev->flags = 0; @@ -1115,7 +1121,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->size = le64_to_cpu(sb->size)/2; - mddev->events = le64_to_cpu(sb->events); + mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = 1024 >> 9; @@ -1149,7 +1155,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling */ - __u64 ev1 = le64_to_cpu(sb->events); ++ev1; if (ev1 < mddev->events) return -EINVAL; @@ -1157,12 +1162,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) /* If adding to array with a bitmap, then we can accept an * older device, but not too old. */ - __u64 ev1 = le64_to_cpu(sb->events); if (ev1 < mddev->bitmap->events_cleared) return 0; - } else /* just a hot-add of a new device, leave raid_disk at -1 */ - return 0; - + } else { + if (ev1 < mddev->events) + /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + } if (mddev->level != LEVEL_MULTIPATH) { int role; rdev->desc_nr = le32_to_cpu(sb->dev_number); @@ -1174,7 +1180,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); break; default: - set_bit(In_sync, &rdev->flags); + if ((le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + else + set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; } @@ -1198,6 +1208,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->feature_map = 0; sb->pad0 = 0; + sb->recovery_offset = cpu_to_le64(0); memset(sb->pad1, 0, sizeof(sb->pad1)); memset(sb->pad2, 0, sizeof(sb->pad2)); memset(sb->pad3, 0, sizeof(sb->pad3)); @@ -1218,6 +1229,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } + + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset > 0) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + } + if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); sb->reshape_position = cpu_to_le64(mddev->reshape_position); @@ -1242,11 +1261,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->dev_roles[i] = cpu_to_le16(0xfffe); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(0xffff); } - sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ sb->sb_csum = calc_sb_1_csum(sb); } @@ -1507,7 +1527,7 @@ static void print_rdev(mdk_rdev_t *rdev) printk(KERN_INFO "md: no rdev superblock!\n"); } -void md_print_devices(void) +static void md_print_devices(void) { struct list_head *tmp, *tmp2; mdk_rdev_t *rdev; @@ -1536,15 +1556,30 @@ void md_print_devices(void) } -static void sync_sbs(mddev_t * mddev) +static void sync_sbs(mddev_t * mddev, int nospares) { + /* Update each superblock (in-memory image), but + * if we are allowed to, skip spares which already + * have the right event counter, or have one earlier + * (which would mean they aren't being marked as dirty + * with the rest of the array) + */ mdk_rdev_t *rdev; struct list_head *tmp; ITERATE_RDEV(mddev,rdev,tmp) { - super_types[mddev->major_version]. - sync_super(mddev, rdev); - rdev->sb_loaded = 1; + if (rdev->sb_events == mddev->events || + (nospares && + rdev->raid_disk < 0 && + (rdev->sb_events&1)==0 && + rdev->sb_events+1 == mddev->events)) { + /* Don't update this superblock */ + rdev->sb_loaded = 2; + } else { + super_types[mddev->major_version]. + sync_super(mddev, rdev); + rdev->sb_loaded = 1; + } } } @@ -1554,12 +1589,42 @@ void md_update_sb(mddev_t * mddev) struct list_head *tmp; mdk_rdev_t *rdev; int sync_req; + int nospares = 0; repeat: spin_lock_irq(&mddev->write_lock); sync_req = mddev->in_sync; mddev->utime = get_seconds(); - mddev->events ++; + if (mddev->sb_dirty == 3) + /* just a clean<-> dirty transition, possibly leave spares alone, + * though if events isn't the right even/odd, we will have to do + * spares after all + */ + nospares = 1; + + /* If this is just a dirty<->clean transition, and the array is clean + * and 'events' is odd, we can roll back to the previous clean state */ + if (mddev->sb_dirty == 3 + && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && (mddev->events & 1)) + mddev->events--; + else { + /* otherwise we have to go forward and ... */ + mddev->events ++; + if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ + /* .. if the array isn't clean, insist on an odd 'events' */ + if ((mddev->events&1)==0) { + mddev->events++; + nospares = 0; + } + } else { + /* otherwise insist on an even 'events' (for clean states) */ + if ((mddev->events&1)) { + mddev->events++; + nospares = 0; + } + } + } if (!mddev->events) { /* @@ -1571,7 +1636,7 @@ repeat: mddev->events --; } mddev->sb_dirty = 2; - sync_sbs(mddev); + sync_sbs(mddev, nospares); /* * do not write anything to disk if using @@ -1593,6 +1658,8 @@ repeat: ITERATE_RDEV(mddev,rdev,tmp) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); + if (rdev->sb_loaded != 1) + continue; /* no noise on spare devices */ if (test_bit(Faulty, &rdev->flags)) dprintk("(skipping faulty "); @@ -1604,6 +1671,7 @@ repeat: dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdevname(rdev->bdev,b), (unsigned long long)rdev->sb_offset); + rdev->sb_events = mddev->events; } else dprintk(")\n"); @@ -1667,6 +1735,10 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%sin_sync",sep); sep = ","; } + if (test_bit(WriteMostly, &rdev->flags)) { + len += sprintf(page+len, "%swrite_mostly",sep); + sep = ","; + } if (!test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { len += sprintf(page+len, "%sspare", sep); @@ -1675,8 +1747,40 @@ state_show(mdk_rdev_t *rdev, char *page) return len+sprintf(page+len, "\n"); } +static ssize_t +state_store(mdk_rdev_t *rdev, const char *buf, size_t len) +{ + /* can write + * faulty - simulates and error + * remove - disconnects the device + * writemostly - sets write_mostly + * -writemostly - clears write_mostly + */ + int err = -EINVAL; + if (cmd_match(buf, "faulty") && rdev->mddev->pers) { + md_error(rdev->mddev, rdev); + err = 0; + } else if (cmd_match(buf, "remove")) { + if (rdev->raid_disk >= 0) + err = -EBUSY; + else { + mddev_t *mddev = rdev->mddev; + kick_rdev_from_array(rdev); + md_update_sb(mddev); + md_new_event(mddev); + err = 0; + } + } else if (cmd_match(buf, "writemostly")) { + set_bit(WriteMostly, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-writemostly")) { + clear_bit(WriteMostly, &rdev->flags); + err = 0; + } + return err ? err : len; +} static struct rdev_sysfs_entry -rdev_state = __ATTR_RO(state); +rdev_state = __ATTR(state, 0644, state_show, state_store); static ssize_t super_show(mdk_rdev_t *rdev, char *page) @@ -1873,6 +1977,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi rdev->desc_nr = -1; rdev->flags = 0; rdev->data_offset = 0; + rdev->sb_events = 0; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); @@ -1978,6 +2083,54 @@ static void analyze_sbs(mddev_t * mddev) } static ssize_t +safe_delay_show(mddev_t *mddev, char *page) +{ + int msec = (mddev->safemode_delay*1000)/HZ; + return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); +} +static ssize_t +safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) +{ + int scale=1; + int dot=0; + int i; + unsigned long msec; + char buf[30]; + char *e; + /* remove a period, and count digits after it */ + if (len >= sizeof(buf)) + return -EINVAL; + strlcpy(buf, cbuf, len); + buf[len] = 0; + for (i=0; i<len; i++) { + if (dot) { + if (isdigit(buf[i])) { + buf[i-1] = buf[i]; + scale *= 10; + } + buf[i] = 0; + } else if (buf[i] == '.') { + dot=1; + buf[i] = 0; + } + } + msec = simple_strtoul(buf, &e, 10); + if (e == buf || (*e && *e != '\n')) + return -EINVAL; + msec = (msec * 1000) / scale; + if (msec == 0) + mddev->safemode_delay = 0; + else { + mddev->safemode_delay = (msec*HZ)/1000; + if (mddev->safemode_delay == 0) + mddev->safemode_delay = 1; + } + return len; +} +static struct md_sysfs_entry md_safe_delay = +__ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store); + +static ssize_t level_show(mddev_t *mddev, char *page) { struct mdk_personality *p = mddev->pers; @@ -2012,6 +2165,32 @@ level_store(mddev_t *mddev, const char *buf, size_t len) static struct md_sysfs_entry md_level = __ATTR(level, 0644, level_show, level_store); + +static ssize_t +layout_show(mddev_t *mddev, char *page) +{ + /* just a number, not meaningful for all levels */ + return sprintf(page, "%d\n", mddev->layout); +} + +static ssize_t +layout_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + if (mddev->pers) + return -EBUSY; + + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + mddev->layout = n; + return len; +} +static struct md_sysfs_entry md_layout = +__ATTR(layout, 0655, layout_show, layout_store); + + static ssize_t raid_disks_show(mddev_t *mddev, char *page) { @@ -2067,6 +2246,200 @@ static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store); static ssize_t +resync_start_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); +} + +static ssize_t +resync_start_store(mddev_t *mddev, const char *buf, size_t len) +{ + /* can only set chunk_size if array is not yet active */ + char *e; + unsigned long long n = simple_strtoull(buf, &e, 10); + + if (mddev->pers) + return -EBUSY; + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + mddev->recovery_cp = n; + return len; +} +static struct md_sysfs_entry md_resync_start = +__ATTR(resync_start, 0644, resync_start_show, resync_start_store); + +/* + * The array state can be: + * + * clear + * No devices, no size, no level + * Equivalent to STOP_ARRAY ioctl + * inactive + * May have some settings, but array is not active + * all IO results in error + * When written, doesn't tear down array, but just stops it + * suspended (not supported yet) + * All IO requests will block. The array can be reconfigured. + * Writing this, if accepted, will block until array is quiessent + * readonly + * no resync can happen. no superblocks get written. + * write requests fail + * read-auto + * like readonly, but behaves like 'clean' on a write request. + * + * clean - no pending writes, but otherwise active. + * When written to inactive array, starts without resync + * If a write request arrives then + * if metadata is known, mark 'dirty' and switch to 'active'. + * if not known, block and switch to write-pending + * If written to an active array that has pending writes, then fails. + * active + * fully active: IO and resync can be happening. + * When written to inactive array, starts with resync + * + * write-pending + * clean, but writes are blocked waiting for 'active' to be written. + * + * active-idle + * like active, but no writes have been seen for a while (100msec). + * + */ +enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, + write_pending, active_idle, bad_word}; +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", + "write-pending", "active-idle", NULL }; + +static int match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (cmd_match(word, list[n])) + break; + return n; +} + +static ssize_t +array_state_show(mddev_t *mddev, char *page) +{ + enum array_state st = inactive; + + if (mddev->pers) + switch(mddev->ro) { + case 1: + st = readonly; + break; + case 2: + st = read_auto; + break; + case 0: + if (mddev->in_sync) + st = clean; + else if (mddev->safemode) + st = active_idle; + else + st = active; + } + else { + if (list_empty(&mddev->disks) && + mddev->raid_disks == 0 && + mddev->size == 0) + st = clear; + else + st = inactive; + } + return sprintf(page, "%s\n", array_states[st]); +} + +static int do_md_stop(mddev_t * mddev, int ro); +static int do_md_run(mddev_t * mddev); +static int restart_array(mddev_t *mddev); + +static ssize_t +array_state_store(mddev_t *mddev, const char *buf, size_t len) +{ + int err = -EINVAL; + enum array_state st = match_word(buf, array_states); + switch(st) { + case bad_word: + break; + case clear: + /* stopping an active array */ + if (mddev->pers) { + if (atomic_read(&mddev->active) > 1) + return -EBUSY; + err = do_md_stop(mddev, 0); + } + break; + case inactive: + /* stopping an active array */ + if (mddev->pers) { + if (atomic_read(&mddev->active) > 1) + return -EBUSY; + err = do_md_stop(mddev, 2); + } + break; + case suspended: + break; /* not supported yet */ + case readonly: + if (mddev->pers) + err = do_md_stop(mddev, 1); + else { + mddev->ro = 1; + err = do_md_run(mddev); + } + break; + case read_auto: + /* stopping an active array */ + if (mddev->pers) { + err = do_md_stop(mddev, 1); + if (err == 0) + mddev->ro = 2; /* FIXME mark devices writable */ + } else { + mddev->ro = 2; + err = do_md_run(mddev); + } + break; + case clean: + if (mddev->pers) { + restart_array(mddev); + spin_lock_irq(&mddev->write_lock); + if (atomic_read(&mddev->writes_pending) == 0) { + mddev->in_sync = 1; + mddev->sb_dirty = 1; + } + spin_unlock_irq(&mddev->write_lock); + } else { + mddev->ro = 0; + mddev->recovery_cp = MaxSector; + err = do_md_run(mddev); + } + break; + case active: + if (mddev->pers) { + restart_array(mddev); + mddev->sb_dirty = 0; + wake_up(&mddev->sb_wait); + err = 0; + } else { + mddev->ro = 0; + err = do_md_run(mddev); + } + break; + case write_pending: + case active_idle: + /* these cannot be set */ + break; + } + if (err) + return err; + else + return len; +} +static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store); + +static ssize_t null_show(mddev_t *mddev, char *page) { return -EINVAL; @@ -2428,11 +2801,15 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); static struct attribute *md_default_attrs[] = { &md_level.attr, + &md_layout.attr, &md_raid_disks.attr, &md_chunk_size.attr, &md_size.attr, + &md_resync_start.attr, &md_metadata.attr, &md_new_device.attr, + &md_safe_delay.attr, + &md_array_state.attr, NULL, }; @@ -2532,13 +2909,10 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) } disk->major = MAJOR(dev); disk->first_minor = unit << shift; - if (partitioned) { + if (partitioned) sprintf(disk->disk_name, "md_d%d", unit); - sprintf(disk->devfs_name, "md/d%d", unit); - } else { + else sprintf(disk->disk_name, "md%d", unit); - sprintf(disk->devfs_name, "md/%d", unit); - } disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; @@ -2553,8 +2927,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) return NULL; } -void md_wakeup_thread(mdk_thread_t *thread); - static void md_safemode_timeout(unsigned long data) { mddev_t *mddev = (mddev_t *) data; @@ -2708,7 +3080,7 @@ static int do_md_run(mddev_t * mddev) mddev->safemode = 0; mddev->safemode_timer.function = md_safemode_timeout; mddev->safemode_timer.data = (unsigned long) mddev; - mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ + mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; ITERATE_RDEV(mddev,rdev,tmp) @@ -2736,6 +3108,36 @@ static int do_md_run(mddev_t * mddev) mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; + /* If there is a partially-recovered drive we need to + * start recovery here. If we leave it to md_check_recovery, + * it will remove the drives and not do the right thing + */ + if (mddev->degraded) { + struct list_head *rtmp; + int spares = 0; + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + /* complete an interrupted recovery */ + spares++; + if (spares && mddev->pers->sync_request) { + mddev->recovery = 0; + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "%s_resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + mddev->recovery = 0; + } else + md_wakeup_thread(mddev->sync_thread); + } + } + mddev->changed = 1; md_new_event(mddev); return 0; @@ -2769,18 +3171,47 @@ static int restart_array(mddev_t *mddev) */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); err = 0; - } else { - printk(KERN_ERR "md: %s has no personality assigned.\n", - mdname(mddev)); + } else err = -EINVAL; - } out: return err; } -static int do_md_stop(mddev_t * mddev, int ro) +/* similar to deny_write_access, but accounts for our holding a reference + * to the file ourselves */ +static int deny_bitmap_write_access(struct file * file) +{ + struct inode *inode = file->f_mapping->host; + + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_writecount) > 1) { + spin_unlock(&inode->i_lock); + return -ETXTBSY; + } + atomic_set(&inode->i_writecount, -1); + spin_unlock(&inode->i_lock); + + return 0; +} + +static void restore_bitmap_write_access(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + + spin_lock(&inode->i_lock); + atomic_set(&inode->i_writecount, 1); + spin_unlock(&inode->i_lock); +} + +/* mode: + * 0 - completely stop and dis-assemble array + * 1 - switch to readonly + * 2 - stop but do not disassemble array + */ +static int do_md_stop(mddev_t * mddev, int mode) { int err = 0; struct gendisk *disk = mddev->gendisk; @@ -2792,6 +3223,7 @@ static int do_md_stop(mddev_t * mddev, int ro) } if (mddev->sync_thread) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_unregister_thread(mddev->sync_thread); mddev->sync_thread = NULL; @@ -2801,12 +3233,15 @@ static int do_md_stop(mddev_t * mddev, int ro) invalidate_partition(disk, 0); - if (ro) { + switch(mode) { + case 1: /* readonly */ err = -ENXIO; if (mddev->ro==1) goto out; mddev->ro = 1; - } else { + break; + case 0: /* disassemble */ + case 2: /* stop */ bitmap_flush(mddev); md_super_wait(mddev); if (mddev->ro) @@ -2821,19 +3256,20 @@ static int do_md_stop(mddev_t * mddev, int ro) if (mddev->ro) mddev->ro = 0; } - if (!mddev->in_sync) { + if (!mddev->in_sync || mddev->sb_dirty) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; md_update_sb(mddev); } - if (ro) + if (mode == 1) set_disk_ro(disk, 1); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } /* * Free resources if final stop */ - if (!ro) { + if (mode == 0) { mdk_rdev_t *rdev; struct list_head *tmp; struct gendisk *disk; @@ -2841,7 +3277,7 @@ static int do_md_stop(mddev_t * mddev, int ro) bitmap_destroy(mddev); if (mddev->bitmap_file) { - atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); + restore_bitmap_write_access(mddev->bitmap_file); fput(mddev->bitmap_file); mddev->bitmap_file = NULL; } @@ -2857,11 +3293,15 @@ static int do_md_stop(mddev_t * mddev, int ro) export_array(mddev); mddev->array_size = 0; + mddev->size = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + disk = mddev->gendisk; if (disk) set_capacity(disk, 0); mddev->changed = 1; - } else + } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); err = 0; @@ -3264,6 +3704,17 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); + if (!err && !mddev->pers->hot_remove_disk) { + /* If there is hot_add_disk but no hot_remove_disk + * then added disks for geometry changes, + * and should be added immediately. + */ + super_types[mddev->major_version]. + validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); + if (err) + unbind_rdev_from_array(rdev); + } if (err) export_rdev(rdev); @@ -3434,23 +3885,6 @@ abort_export: return err; } -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); - - return 0; -} - static int set_bitmap_file(mddev_t *mddev, int fd) { int err; @@ -3491,12 +3925,17 @@ static int set_bitmap_file(mddev_t *mddev, int fd) mddev->pers->quiesce(mddev, 1); if (fd >= 0) err = bitmap_create(mddev); - if (fd < 0 || err) + if (fd < 0 || err) { bitmap_destroy(mddev); + fd = -1; /* make sure to put the file */ + } mddev->pers->quiesce(mddev, 0); - } else if (fd < 0) { - if (mddev->bitmap_file) + } + if (fd < 0) { + if (mddev->bitmap_file) { + restore_bitmap_write_access(mddev->bitmap_file); fput(mddev->bitmap_file); + } mddev->bitmap_file = NULL; } @@ -3977,11 +4416,6 @@ static int md_ioctl(struct inode *inode, struct file *file, goto done_unlock; default: - if (_IOC_TYPE(cmd) == MD_MAJOR) - printk(KERN_WARNING "md: %s(pid %d) used" - " obsolete MD ioctl, upgrade your" - " software to use new ictls.\n", - current->comm, current->pid); err = -EINVAL; goto abort_unlock; } @@ -4586,7 +5020,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { mddev->in_sync = 0; - mddev->sb_dirty = 1; + mddev->sb_dirty = 3; md_wakeup_thread(mddev->thread); } spin_unlock_irq(&mddev->write_lock); @@ -4599,7 +5033,7 @@ void md_write_end(mddev_t *mddev) if (atomic_dec_and_test(&mddev->writes_pending)) { if (mddev->safemode == 2) md_wakeup_thread(mddev->thread); - else + else if (mddev->safemode_delay) mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); } } @@ -4620,10 +5054,14 @@ void md_do_sync(mddev_t *mddev) struct list_head *tmp; sector_t last_check; int skipped = 0; + struct list_head *rtmp; + mdk_rdev_t *rdev; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; + if (mddev->ro) /* never try to sync a read-only array */ + return; /* we overload curr_resync somewhat here. * 0 == not engaged in resync at all @@ -4682,17 +5120,30 @@ void md_do_sync(mddev_t *mddev) } } while (mddev->curr_resync < 2); + j = 0; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* resync follows the size requested by the personality, * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; mddev->resync_mismatches = 0; + /* we don't use the checkpoint if there's a bitmap */ + if (!mddev->bitmap && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->size << 1; - else + else { /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; + j = MaxSector; + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < j) + j = rdev->recovery_offset; + } printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" @@ -4702,12 +5153,7 @@ void md_do_sync(mddev_t *mddev) speed_max(mddev)); is_mddev_idle(mddev); /* this also initializes IO event counters */ - /* we don't use the checkpoint if there's a bitmap */ - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap - && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - j = mddev->recovery_cp; - else - j = 0; + io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { mark[m] = jiffies; @@ -4828,15 +5274,28 @@ void md_do_sync(mddev_t *mddev) if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > 2 && - mddev->curr_resync >= mddev->recovery_cp) { - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - printk(KERN_INFO - "md: checkpointing recovery of %s.\n", - mdname(mddev)); - mddev->recovery_cp = mddev->curr_resync; - } else - mddev->recovery_cp = MaxSector; + mddev->curr_resync > 2) { + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + if (mddev->curr_resync >= mddev->recovery_cp) { + printk(KERN_INFO + "md: checkpointing recovery of %s.\n", + mdname(mddev)); + mddev->recovery_cp = mddev->curr_resync; + } + } else + mddev->recovery_cp = MaxSector; + } else { + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + mddev->curr_resync = MaxSector; + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + mddev->sb_dirty = 1; + } } skip: @@ -4908,7 +5367,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; - mddev->sb_dirty = 1; + mddev->sb_dirty = 3; } if (mddev->safemode == 1) mddev->safemode = 0; @@ -4957,6 +5416,8 @@ void md_check_recovery(mddev_t *mddev) clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + goto unlock; /* no recovery is running. * remove any failed drives, then * add spares if possible. @@ -4979,6 +5440,7 @@ void md_check_recovery(mddev_t *mddev) ITERATE_RDEV(mddev,rdev,rtmp) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { + rdev->recovery_offset = 0; if (mddev->pers->hot_add_disk(mddev,rdev)) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); @@ -5071,8 +5533,6 @@ static void md_geninit(void) static int __init md_init(void) { - int minor; - printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," " MD_SB_DISKS=%d\n", MD_MAJOR_VERSION, MD_MINOR_VERSION, @@ -5086,23 +5546,11 @@ static int __init md_init(void) unregister_blkdev(MAJOR_NR, "md"); return -1; } - devfs_mk_dir("md"); blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, md_probe, NULL, NULL); blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, md_probe, NULL, NULL); - for (minor=0; minor < MAX_MD_DEVS; ++minor) - devfs_mk_bdev(MKDEV(MAJOR_NR, minor), - S_IFBLK|S_IRUSR|S_IWUSR, - "md/%d", minor); - - for (minor=0; minor < MAX_MD_DEVS; ++minor) - devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), - S_IFBLK|S_IRUSR|S_IWUSR, - "md/mdp%d", minor); - - register_reboot_notifier(&md_notifier); raid_table_header = register_sysctl_table(raid_root_table, 1); @@ -5158,15 +5606,9 @@ static __exit void md_exit(void) { mddev_t *mddev; struct list_head *tmp; - int i; + blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); - for (i=0; i < MAX_MD_DEVS; i++) - devfs_remove("md/%d", i); - for (i=0; i < MAX_MD_DEVS; i++) - devfs_remove("md/d%d", i); - - devfs_remove("md"); unregister_blkdev(MAJOR_NR,"md"); unregister_blkdev(mdp_major, "mdp"); @@ -5216,7 +5658,6 @@ EXPORT_SYMBOL(md_write_end); EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); -EXPORT_SYMBOL(md_print_devices); EXPORT_SYMBOL(md_check_recovery); MODULE_LICENSE("GPL"); MODULE_ALIAS("md"); |