From ba714450698a966d184f5337235b100cbfa8685e Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 1 Sep 2011 15:10:34 +0200 Subject: FIX: Mdmon crashes after changing RAID level from 1 to 0 Description of the bug: Sometimes mdmon crashes after changing RAID level from 1 to 0 (takeover). Cause of the bug: The managemon marks an active_array for removal from monitoring by setting a->container to NULL (in the "manage_member" function). Sometimes (during stress testing) this happens right when the monitor is in the "read_and_act" function and the a->container pointer is in use. This causes the monitor to crash. Solution: The active array has to be marked for removal in some way other than setting the pointer to NULL while it may be in use. A new field "to_remove" was added to the "active_array" structure. It is used in the managemon to mark the array for removal (instead of the old assignment: a->container = NULL) and the monitor checks it to determine if the array should be removed. The field "to_remove" should also be checked in some other places to avoid managing an array that is about to be removed. Signed-off-by: Lukasz Dorau Signed-off-by: NeilBrown --- monitor.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'monitor.c') diff --git a/monitor.c b/monitor.c index 7ac59072..b002e90f 100644 --- a/monitor.c +++ b/monitor.c @@ -479,7 +479,7 @@ static void reconcile_failed(struct active_array *aa, struct mdinfo *failed) struct mdinfo *victim; for (a = aa; a; a = a->next) { - if (!a->container) + if (!a->container || a->to_remove) continue; victim = find_device(a, failed->disk.major, failed->disk.minor); if (!victim) @@ -539,7 +539,7 @@ static int wait_and_act(struct supertype *container, int nowait) /* once an array has been deactivated we want to * ask the manager to discard it. 
*/ - if (!a->container) { + if (!a->container || a->to_remove) { if (discard_this) { ap = &(*ap)->next; continue; @@ -642,7 +642,7 @@ static int wait_and_act(struct supertype *container, int nowait) /* FIXME check if device->state_fd need to be cleared?*/ signal_manager(); } - if (a->container) { + if (a->container && !a->to_remove) { is_dirty = read_and_act(a); rv |= 1; dirty_arrays += is_dirty; @@ -657,7 +657,7 @@ static int wait_and_act(struct supertype *container, int nowait) /* propagate failures across container members */ for (a = *aap; a ; a = a->next) { - if (!a->container) + if (!a->container || a->to_remove) continue; for (mdi = a->info.devs ; mdi ; mdi = mdi->next) if (mdi->curr_state & DS_FAULTY) -- cgit v1.2.3