mdmon: delegate removal to managemon

Starting from [1], the kernel requires the suspend lock on the member drive remove path. This causes a deadlock with external management because the monitor thread may be blocked on suspend and is then unable to switch the array to active, for example if a badblock is reported during this time. Removal is a blocking action now, so it must be delegated to the managemon thread, but we must ensure that monitor does the metadata update first, just after detecting "faulty". This patch adds the appropriate support. The monitor thread detects "faulty" and updates the metadata. After that, it asks the manager thread to remove the device. The manager must be careful because closing descriptors used by select() may lead to an abort with -D_FORTIFY_SOURCE=2. First, it must ensure that the device descriptors are not used by monitor. There is an unlimited number of remove retries, and recovery is blocked until all failed drives are removed. It is safe because a "faulty" device is no longer used by MD. The issue will also be mitigated by an optimization on the badblock recording path in the kernel. It will check that the device is not failed before a badblock is recorded, but relying on this is not ideologically correct. Userspace must keep compatibility with the kernel, and since removal is a blocking action, we must treat it as a blocking action. [1] kernel commit cfa078c8b80d ("md: use new apis to suspend array for adding/removing rdev from state_store()") Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
author: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com> 2024-07-30 14:12:21 +0200
committer: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com> 2024-11-04 10:29:52 +0100
commit: 07ad253044f5cf7b9cc5883f0d0a1cdb9ec42821 (patch)
tree: 9d61b6f6ddc660b397fa72e024252d9733386422 /monitor.c
parent: monitor: Add DS_EXTERNAL_BB flag (diff)
download: mdadm-07ad253044f5cf7b9cc5883f0d0a1cdb9ec42821.tar.xz
mdadm-07ad253044f5cf7b9cc5883f0d0a1cdb9ec42821.zip
1 files changed, 30 insertions, 32 deletions
diff --git a/monitor.c b/monitor.c
index 6429afc6..81ae8893 100644
--- a/monitor.c
+++ b/monitor.c
@@ -399,8 +399,9 @@ static void signal_manager(void)
 static int read_and_act(struct active_array *a)
 {
 	unsigned long long sync_completed;
-	int check_degraded = 0;
-	int check_reshape = 0;
+	bool disks_to_remove = false;
+	bool check_degraded = false;
+	bool check_reshape = false;
 	int deactivate = 0;
 	struct mdinfo *mdi;
 	int ret = 0;
@@ -425,7 +426,7 @@ static int read_and_act(struct active_array *a)
 		mdi->next_state = 0;
 		mdi->curr_state = 0;
 
-		if (!is_fd_valid(mdi->state_fd))
+		if (mdi->man_disk_to_remove)
 			/* We are removing this device, skip it then */
 			continue;
 
@@ -624,21 +625,12 @@ static int read_and_act(struct active_array *a)
 			write_attr("-blocked", mdi->state_fd);
 		}
 
-		if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) {
-			/* The kernel may not be able to immediately remove the
-			 * disk.  In that case we wait a little while and
-			 * try again.
-			 */
-			if (write_attr("remove", mdi->state_fd) == MDADM_STATUS_SUCCESS) {
-				dprintf_cont(" %d:removed", mdi->disk.raid_disk);
-				close(mdi->state_fd);
-				close(mdi->recovery_fd);
-				close(mdi->bb_fd);
-				close(mdi->ubb_fd);
-				mdi->state_fd = -1;
-			} else
-				ret |= ARRAY_BUSY;
+		if ((mdi->next_state & DS_REMOVE) && !mdi->man_disk_to_remove) {
+			dprintf_cont(" %d:disk_to_remove", mdi->disk.raid_disk);
+			mdi->man_disk_to_remove = true;
+			disks_to_remove = true;
 		}
+
 		if (mdi->next_state & DS_INSYNC) {
 			write_attr("+in_sync", mdi->state_fd);
 			dprintf_cont(" %d:+in_sync", mdi->disk.raid_disk);
@@ -651,17 +643,14 @@ static int read_and_act(struct active_array *a)
 
 	a->prev_action = a->curr_action;
 
-	for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+	for (mdi = a->info.devs; mdi ; mdi = mdi->next)
 		mdi->prev_state = mdi->curr_state;
-		mdi->next_state = 0;
-	}
 
-	if (check_degraded || check_reshape) {
-		/* manager will do the actual check */
-		if (check_degraded)
-			a->check_degraded = 1;
-		if (check_reshape)
-			a->check_reshape = 1;
+	if (check_degraded || check_reshape || disks_to_remove) {
+
+		a->check_member_remove |= disks_to_remove;
+		a->check_degraded |= check_degraded;
+		a->check_reshape |= check_reshape;
 		signal_manager();
 	}
 
@@ -734,13 +723,11 @@ int monitor_loop_cnt;
 
 static int wait_and_act(struct supertype *container, int nowait)
 {
-	fd_set rfds;
-	int maxfd = 0;
-	struct active_array **aap = &container->arrays;
-	struct active_array *a, **ap;
-	int rv;
-	struct mdinfo *mdi;
+	struct active_array *a, **ap, **aap = &container->arrays;
 	static unsigned int dirty_arrays = ~0; /* start at some non-zero value */
+	struct mdinfo *mdi;
+	int rv, maxfd = 0;
+	fd_set rfds;
 
 	FD_ZERO(&rfds);
 
@@ -764,7 +751,18 @@ static int wait_and_act(struct supertype *container, int nowait)
 		add_fd(&rfds, &maxfd, a->info.state_fd);
 		add_fd(&rfds, &maxfd, a->action_fd);
 		add_fd(&rfds, &maxfd, a->sync_completed_fd);
+
 		for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+			if (mdi->man_disk_to_remove) {
+				mdi->mon_descriptors_not_used = true;
+
+				/* Managemon could be blocked on suspend in kernel.
+				 * Monitor must respond if any badblock is recorded in this time.
+				 */
+				container->retry_soon = 1;
+				continue;
+			}
+
 			add_fd(&rfds, &maxfd, mdi->state_fd);
 			add_fd(&rfds, &maxfd, mdi->bb_fd);
 			add_fd(&rfds, &maxfd, mdi->ubb_fd);
author	Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>	2024-07-30 14:12:21 +0200
committer	Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>	2024-11-04 10:29:52 +0100
commit	07ad253044f5cf7b9cc5883f0d0a1cdb9ec42821 (patch)
tree	9d61b6f6ddc660b397fa72e024252d9733386422 /monitor.c
parent	monitor: Add DS_EXTERNAL_BB flag (diff)
download	mdadm-07ad253044f5cf7b9cc5883f0d0a1cdb9ec42821.tar.xz mdadm-07ad253044f5cf7b9cc5883f0d0a1cdb9ec42821.zip