From 96a2ff2a6373259cd3bbc5dcaa79865ce271fa4b Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 20 Mar 2023 17:45:02 -0400 Subject: dm bufio: remove unused dm_bufio_release_move interface Was used by multi-snapshot DM target that never went upstream. Signed-off-by: Joe Thornber Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 77 --------------------------------------------------- 1 file changed, 77 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cf077f9b30c3..79434b38f368 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1415,83 +1415,6 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -/* - * We first delete any other buffer that may be at that new location. - * - * Then, we write the buffer to the original location if it was dirty. - * - * Then, if we are the only one who is holding the buffer, relink the buffer - * in the buffer tree for the new location. - * - * If there was someone else holding the buffer, we write it to the new - * location but not relink it, because that other user needs to have the buffer - * at the same place. - */ -void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block) -{ - struct dm_bufio_client *c = b->c; - struct dm_buffer *new; - - BUG_ON(dm_bufio_in_request()); - - dm_bufio_lock(c); - -retry: - new = __find(c, new_block); - if (new) { - if (new->hold_count) { - __wait_for_free_buffer(c); - goto retry; - } - - /* - * FIXME: Is there any point waiting for a write that's going - * to be overwritten in a bit? - */ - __make_buffer_clean(new); - __unlink_buffer(new); - __free_buffer_wake(new); - } - - BUG_ON(!b->hold_count); - BUG_ON(test_bit(B_READING, &b->state)); - - __write_dirty_buffer(b, NULL); - if (b->hold_count == 1) { - wait_on_bit_io(&b->state, B_WRITING, - TASK_UNINTERRUPTIBLE); - set_bit(B_DIRTY, &b->state); - b->dirty_start = 0; - b->dirty_end = c->block_size; - __unlink_buffer(b); - __link_buffer(b, new_block, LIST_DIRTY); - } else { - sector_t old_block; - - wait_on_bit_lock_io(&b->state, B_WRITING, - TASK_UNINTERRUPTIBLE); - /* - * Relink buffer to "new_block" so that write_callback - * sees "new_block" as a block number. - * After the write, link the buffer back to old_block. - * All this must be done in bufio lock, so that block number - * change isn't visible to other threads. - */ - old_block = b->block; - __unlink_buffer(b); - __link_buffer(b, new_block, b->list_mode); - submit_io(b, REQ_OP_WRITE, write_endio); - wait_on_bit_io(&b->state, B_WRITING, - TASK_UNINTERRUPTIBLE); - __unlink_buffer(b); - __link_buffer(b, old_block, b->list_mode); - } - - dm_bufio_unlock(c); - dm_bufio_release(b); -} -EXPORT_SYMBOL_GPL(dm_bufio_release_move); - static void forget_buffer_locked(struct dm_buffer *b) { if (likely(!b->hold_count) && likely(!smp_load_acquire(&b->state))) { -- cgit v1.2.3 From 555977dd6818da3bcf39ca5a3c1cb7e8ca4298c1 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 24 Mar 2023 21:17:43 -0400 Subject: dm bufio: use WARN_ON in dm_bufio_client_destroy and dm_bufio_exit Using BUG_ON when tearing down is excessive. Relax these to WARN_ONs. 
Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 79434b38f368..dac9f1f84c34 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1828,8 +1828,8 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) mutex_unlock(&dm_bufio_clients_lock); - BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree)); - BUG_ON(c->need_reserved_buffers); + WARN_ON(!RB_EMPTY_ROOT(&c->buffer_tree)); + WARN_ON(c->need_reserved_buffers); while (!list_empty(&c->reserved_buffers)) { struct dm_buffer *b = list_entry(c->reserved_buffers.next, @@ -1843,7 +1843,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]); for (i = 0; i < LIST_SIZE; i++) - BUG_ON(c->n_buffers[i]); + WARN_ON(c->n_buffers[i]); kmem_cache_destroy(c->slab_cache); kmem_cache_destroy(c->slab_buffer); @@ -2082,7 +2082,7 @@ static void __exit dm_bufio_exit(void) bug = 1; } - BUG_ON(bug); + WARN_ON(bug); /* leaks are not worth crashing the system */ } module_init(dm_bufio_init) -- cgit v1.2.3 From 0511228752eaa5264ada8b1f4baeded916c65945 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 24 Mar 2023 21:36:26 -0400 Subject: dm bufio: never crash if dm_bufio_in_request() All these instances are entirely avoidable given that they speak to coding mistakes that result in inappropriate use. Proper testing during development will catch any such coding bug so its best to relax all of these from BUG_ON to WARN_ON_ONCE. Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index dac9f1f84c34..d63f94ab1d9f 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1152,7 +1152,8 @@ EXPORT_SYMBOL_GPL(dm_bufio_get); void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp) { - BUG_ON(dm_bufio_in_request()); + if (WARN_ON_ONCE(dm_bufio_in_request())) + return ERR_PTR(-EINVAL); return new_read(c, block, NF_READ, bp); } @@ -1161,7 +1162,8 @@ EXPORT_SYMBOL_GPL(dm_bufio_read); void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp) { - BUG_ON(dm_bufio_in_request()); + if (WARN_ON_ONCE(dm_bufio_in_request())) + return ERR_PTR(-EINVAL); return new_read(c, block, NF_FRESH, bp); } @@ -1174,7 +1176,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, LIST_HEAD(write_list); - BUG_ON(dm_bufio_in_request()); + if (WARN_ON_ONCE(dm_bufio_in_request())) + return; /* should never happen */ blk_start_plug(&plug); dm_bufio_lock(c); @@ -1281,7 +1284,8 @@ void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) { LIST_HEAD(write_list); - BUG_ON(dm_bufio_in_request()); + if (WARN_ON_ONCE(dm_bufio_in_request())) + return; /* should never happen */ dm_bufio_lock(c); __write_dirty_buffers_async(c, 0, &write_list); @@ -1386,7 +1390,8 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) .count = 0, }; - BUG_ON(dm_bufio_in_request()); + if (WARN_ON_ONCE(dm_bufio_in_request())) + return -EINVAL; return dm_io(&io_req, 1, &io_reg, NULL); } @@ -1409,7 +1414,8 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c .count = block_to_sector(c, count), }; - BUG_ON(dm_bufio_in_request()); + if (WARN_ON_ONCE(dm_bufio_in_request())) + return -EINVAL; /* discards are optional */ return dm_io(&io_req, 1, &io_reg, NULL); } -- cgit 
v1.2.3 From b75a80f4f5a956f3d335ebf29c7cee1b234ac3cf Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 24 Mar 2023 20:49:25 -0400 Subject: dm bufio: don't bug for clear developer oversight Reasonable to relax to WARN_ON because these are easily avoided but do offer some assurance future coding mistakes won't occur (if changes tested properly). Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index d63f94ab1d9f..64fb5fd39bd9 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -378,8 +378,10 @@ static void adjust_total_allocated(struct dm_buffer *b, bool unlink) */ static void __cache_size_refresh(void) { - BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); - BUG_ON(dm_bufio_client_count < 0); + if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock))) + return; + if (WARN_ON(dm_bufio_client_count < 0)) + return; dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size); @@ -1536,7 +1538,8 @@ static void drop_buffers(struct dm_bufio_client *c) int i; bool warned = false; - BUG_ON(dm_bufio_in_request()); + if (WARN_ON(dm_bufio_in_request())) + return; /* should never happen */ /* * An optimization so that the buffers are not written one-by-one. @@ -1556,7 +1559,7 @@ static void drop_buffers(struct dm_bufio_client *c) (unsigned long long)b->block, b->hold_count, i); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING stack_trace_print(b->stack_entries, b->stack_len, 1); - /* mark unclaimed to avoid BUG_ON below */ + /* mark unclaimed to avoid WARN_ON below */ b->hold_count = 0; #endif } @@ -1567,7 +1570,7 @@ static void drop_buffers(struct dm_bufio_client *c) #endif for (i = 0; i < LIST_SIZE; i++) - BUG_ON(!list_empty(&c->lru[i])); + WARN_ON(!list_empty(&c->lru[i])); dm_bufio_unlock(c); } -- cgit v1.2.3 From be845babda1bb168d3f9f47c171f0a24a3312cba Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 10 Mar 2023 15:37:20 +0000 Subject: dm bufio: add LRU abstraction A CLOCK algorithm is used in this LRU abstraction. This avoids relinking list nodes, which would require a write lock protecting it. None of the LRU methods are threadsafe; locking must be done at a higher level. Code that uses this new LRU will be introduced in the next 2 commits. As such, this commit will cause "defined but not used" compiler warnings that will be resolved by the next 2 commits. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 64fb5fd39bd9..a0bb0de0d4e7 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -66,6 +66,241 @@ #define LIST_DIRTY 1 #define LIST_SIZE 2 +/*--------------------------------------------------------------*/ + +/* + * Rather than use an LRU list, we use a clock algorithm where entries + * are held in a circular list. When an entry is 'hit' a reference bit + * is set. The least recently used entry is approximated by running a + * cursor around the list selecting unreferenced entries. Referenced + * entries have their reference bit cleared as the cursor passes them. 
+ */ +struct lru_entry { + struct list_head list; + atomic_t referenced; +}; + +struct lru_iter { + struct lru *lru; + struct list_head list; + struct lru_entry *stop; + struct lru_entry *e; +}; + +struct lru { + struct list_head *cursor; + unsigned long count; + + struct list_head iterators; +}; + +/*--------------*/ + +static void lru_init(struct lru *lru) +{ + lru->cursor = NULL; + lru->count = 0; + INIT_LIST_HEAD(&lru->iterators); +} + +static void lru_destroy(struct lru *lru) +{ + WARN_ON_ONCE(lru->cursor); + WARN_ON_ONCE(!list_empty(&lru->iterators)); +} + +/* + * Insert a new entry into the lru. + */ +static void lru_insert(struct lru *lru, struct lru_entry *le) +{ + /* + * Don't be tempted to set to 1, makes the lru aspect + * perform poorly. + */ + atomic_set(&le->referenced, 0); + + if (lru->cursor) { + list_add_tail(&le->list, lru->cursor); + } else { + INIT_LIST_HEAD(&le->list); + lru->cursor = &le->list; + } + lru->count++; +} + +/*--------------*/ + +/* + * Convert a list_head pointer to an lru_entry pointer. + */ +static inline struct lru_entry *to_le(struct list_head *l) +{ + return container_of(l, struct lru_entry, list); +} + +/* + * Initialize an lru_iter and add it to the list of cursors in the lru. + */ +static void lru_iter_begin(struct lru *lru, struct lru_iter *it) +{ + it->lru = lru; + it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL; + it->e = lru->cursor ? to_le(lru->cursor) : NULL; + list_add(&it->list, &lru->iterators); +} + +/* + * Remove an lru_iter from the list of cursors in the lru. + */ +static inline void lru_iter_end(struct lru_iter *it) +{ + list_del(&it->list); +} + +/* Predicate function type to be used with lru_iter_next */ +typedef bool (*iter_predicate)(struct lru_entry *le, void *context); + +/* + * Advance the cursor to the next entry that passes the + * predicate, and return that entry. Returns NULL if the + * iteration is complete. + */ +static struct lru_entry *lru_iter_next(struct lru_iter *it, + iter_predicate pred, void *context) +{ + struct lru_entry *e; + + while (it->e) { + e = it->e; + + /* advance the cursor */ + if (it->e == it->stop) + it->e = NULL; + else + it->e = to_le(it->e->list.next); + + if (pred(e, context)) + return e; + } + + return NULL; +} + +/* + * Invalidate a specific lru_entry and update all cursors in + * the lru accordingly. + */ +static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e) +{ + struct lru_iter *it; + + list_for_each_entry(it, &lru->iterators, list) { + /* Move c->e forwards if necc. */ + if (it->e == e) { + it->e = to_le(it->e->list.next); + if (it->e == e) + it->e = NULL; + } + + /* Move it->stop backwards if necc. */ + if (it->stop == e) { + it->stop = to_le(it->stop->list.prev); + if (it->stop == e) + it->stop = NULL; + } + } +} + +/*--------------*/ + +/* + * Remove a specific entry from the lru. + */ +static void lru_remove(struct lru *lru, struct lru_entry *le) +{ + lru_iter_invalidate(lru, le); + if (lru->count == 1) { + lru->cursor = NULL; + } else { + if (lru->cursor == &le->list) + lru->cursor = lru->cursor->next; + list_del(&le->list); + } + lru->count--; +} + +/* + * Mark as referenced. + */ +static inline void lru_reference(struct lru_entry *le) +{ + atomic_set(&le->referenced, 1); +} + +/*--------------*/ + +/* + * Remove the least recently used entry (approx), that passes the predicate. + * Returns NULL on failure. 
+ */ +enum evict_result { + ER_EVICT, + ER_DONT_EVICT, + ER_STOP, /* stop looking for something to evict */ +}; + +typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context); + +static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context) +{ + unsigned long tested = 0; + struct list_head *h = lru->cursor; + struct lru_entry *le; + + if (!h) + return NULL; + /* + * In the worst case we have to loop around twice. Once to clear + * the reference flags, and then again to discover the predicate + * fails for all entries. + */ + while (tested < lru->count) { + le = container_of(h, struct lru_entry, list); + + if (atomic_read(&le->referenced)) { + atomic_set(&le->referenced, 0); + } else { + tested++; + switch (pred(le, context)) { + case ER_EVICT: + /* + * Adjust the cursor, so we start the next + * search from here. + */ + lru->cursor = le->list.next; + lru_remove(lru, le); + return le; + + case ER_DONT_EVICT: + break; + + case ER_STOP: + lru->cursor = le->list.next; + return NULL; + } + } + + h = h->next; + + cond_resched(); + } + + return NULL; +} + +/*--------------------------------------------------------------*/ + /* * Linking of buffers: * All buffers are linked to buffer_tree with their node field. -- cgit v1.2.3 From 2cd7a6d41fe8ec85f82e1ca31bceb0dc0d849f60 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 10 Mar 2023 16:02:29 +0000 Subject: dm bufio: add dm_buffer_cache abstraction The buffer cache is responsible for managing the holder count, tracking clean/dirty state, and choosing buffers via predicates. Higher level code is responsible for allocation of buffers, IO and eviction/cache sizing. The buffer cache has thread safe methods for acquiring a reference to an existing buffer. All other methods in buffer cache are _not_ threadsafe, and only contain enough locking to guarantee the safe methods. Rather than a single mutex, sharded rw_semaphores are used to allow concurrent threads to 'get' buffers. Each rw_semaphore protects its own rbtree of buffer entries. Code that uses this new dm_buffer_cache abstraction will be introduced in a following commit. This commit moves the dm_buffer struct in preparation for finer grained dm_buffer changes, in the next commit, to be more easily seen. It also introduces temporary dm_buffer struct members to allow compilation of this intermediate commit (they will be elided in the next commit). This commit will cause "defined but not used" compiler warnings that will be resolved by the next commit. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 588 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 526 insertions(+), 62 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index a0bb0de0d4e7..9daff9b77cee 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -301,6 +301,517 @@ static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *con /*--------------------------------------------------------------*/ +/* + * Buffer state bits. + */ +#define B_READING 0 +#define B_WRITING 1 +#define B_DIRTY 2 + +/* + * Describes how the block was allocated: + * kmem_cache_alloc(), __get_free_pages() or vmalloc(). + * See the comment at alloc_buffer_data. 
+ */ +enum data_mode { + DATA_MODE_SLAB = 0, + DATA_MODE_GET_FREE_PAGES = 1, + DATA_MODE_VMALLOC = 2, + DATA_MODE_LIMIT = 3 +}; + +struct dm_buffer { + struct rb_node node; + struct list_head lru_list; + struct list_head global_list; + + sector_t block; + void *data; + unsigned char data_mode; /* DATA_MODE_* */ + + unsigned int accessed; + unsigned int __hold_count; + unsigned long last_accessed; + + unsigned char list_mode; /* LIST_* */ + blk_status_t read_error; + blk_status_t write_error; + unsigned long state; + unsigned int dirty_start; + unsigned int dirty_end; + unsigned int write_start; + unsigned int write_end; + struct dm_bufio_client *c; + struct list_head write_list; + void (*end_io)(struct dm_buffer *b, blk_status_t bs); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING +#define MAX_STACK 10 + unsigned int stack_len; + unsigned long stack_entries[MAX_STACK]; +#endif + /* Temp members to allow dm_buffer_cache code to compile */ + atomic_t hold_count; + struct lru_entry lru; +}; + +/*--------------------------------------------------------------*/ + +/* + * The buffer cache manages buffers, particularly: + * - inc/dec of holder count + * - setting the last_accessed field + * - maintains clean/dirty state along with lru + * - selecting buffers that match predicates + * + * It does *not* handle: + * - allocation/freeing of buffers. + * - IO + * - Eviction or cache sizing. + * + * cache_get() and cache_put() are threadsafe, you do not need to + * protect these calls with a surrounding mutex. All the other + * methods are not threadsafe; they do use locking primitives, but + * only enough to ensure get/put are threadsafe. + */ + +#define NR_LOCKS 64 +#define LOCKS_MASK (NR_LOCKS - 1) + +struct buffer_tree { + struct rw_semaphore lock; + struct rb_root root; +} ____cacheline_aligned_in_smp; + +struct dm_buffer_cache { + /* + * We spread entries across multiple trees to reduce contention + * on the locks. 
+ */ + struct buffer_tree trees[NR_LOCKS]; + struct lru lru[LIST_SIZE]; +}; + +static inline unsigned int cache_index(sector_t block) +{ + return block & LOCKS_MASK; +} + +static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block) +{ + down_read(&bc->trees[cache_index(block)].lock); +} + +static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block) +{ + up_read(&bc->trees[cache_index(block)].lock); +} + +static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block) +{ + down_write(&bc->trees[cache_index(block)].lock); +} + +static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block) +{ + up_write(&bc->trees[cache_index(block)].lock); +} + +static inline struct dm_buffer *le_to_buffer(struct lru_entry *le) +{ + return container_of(le, struct dm_buffer, lru); +} + +static struct dm_buffer *list_to_buffer(struct list_head *l) +{ + struct lru_entry *le = list_entry(l, struct lru_entry, list); + + if (!le) + return NULL; + + return le_to_buffer(le); +} + +static void cache_init(struct dm_buffer_cache *bc) +{ + unsigned int i; + + for (i = 0; i < NR_LOCKS; i++) { + init_rwsem(&bc->trees[i].lock); + bc->trees[i].root = RB_ROOT; + } + + lru_init(&bc->lru[LIST_CLEAN]); + lru_init(&bc->lru[LIST_DIRTY]); +} + +static void cache_destroy(struct dm_buffer_cache *bc) +{ + unsigned int i; + + for (i = 0; i < NR_LOCKS; i++) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root)); + + lru_destroy(&bc->lru[LIST_CLEAN]); + lru_destroy(&bc->lru[LIST_DIRTY]); +} + +/*--------------*/ + +/* + * not threadsafe, or racey depending how you look at it + */ +static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode) +{ + return bc->lru[list_mode].count; +} + +static inline unsigned long cache_total(struct dm_buffer_cache *bc) +{ + return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY); +} + +/*--------------*/ + +/* + * Gets a specific buffer, indexed by block. + * If the buffer is found then its holder count will be incremented and + * lru_reference will be called. + * + * threadsafe + */ +static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block) +{ + struct rb_node *n = root->rb_node; + struct dm_buffer *b; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + n = block < b->block ? n->rb_left : n->rb_right; + } + + return NULL; +} + +static void __cache_inc_buffer(struct dm_buffer *b) +{ + atomic_inc(&b->hold_count); + WRITE_ONCE(b->last_accessed, jiffies); +} + +static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block) +{ + struct dm_buffer *b; + + cache_read_lock(bc, block); + b = __cache_get(&bc->trees[cache_index(block)].root, block); + if (b) { + lru_reference(&b->lru); + __cache_inc_buffer(b); + } + cache_read_unlock(bc, block); + + return b; +} + +/*--------------*/ + +/* + * Returns true if the hold count hits zero. + * threadsafe + */ +static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b) +{ + bool r; + + cache_read_lock(bc, b->block); + BUG_ON(!atomic_read(&b->hold_count)); + r = atomic_dec_and_test(&b->hold_count); + cache_read_unlock(bc, b->block); + + return r; +} + +/*--------------*/ + +typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *); + +/* + * Evicts a buffer based on a predicate. The oldest buffer that + * matches the predicate will be selected. In addition to the + * predicate the hold_count of the selected buffer will be zero. 
+ */ +struct evict_wrapper { + b_predicate pred; + void *context; +}; + +/* + * Wraps the buffer predicate turning it into an lru predicate. Adds + * extra test for hold_count. + */ +static enum evict_result __evict_pred(struct lru_entry *le, void *context) +{ + struct evict_wrapper *w = context; + struct dm_buffer *b = le_to_buffer(le); + + if (atomic_read(&b->hold_count)) + return ER_DONT_EVICT; + + return w->pred(b, w->context); +} + +static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode, + b_predicate pred, void *context) +{ + struct evict_wrapper w = {.pred = pred, .context = context}; + struct lru_entry *le; + struct dm_buffer *b; + + le = lru_evict(&bc->lru[list_mode], __evict_pred, &w); + if (!le) + return NULL; + + b = le_to_buffer(le); + /* __evict_pred will have locked the appropriate tree. */ + rb_erase(&b->node, &bc->trees[cache_index(b->block)].root); + + return b; +} + +/*--------------*/ + +/* + * Mark a buffer as clean or dirty. Not threadsafe. + */ +static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode) +{ + cache_write_lock(bc, b->block); + if (list_mode != b->list_mode) { + lru_remove(&bc->lru[b->list_mode], &b->lru); + b->list_mode = list_mode; + lru_insert(&bc->lru[b->list_mode], &b->lru); + } + cache_write_unlock(bc, b->block); +} + +/*--------------*/ + +/* + * Runs through the lru associated with 'old_mode', if the predicate matches then + * it moves them to 'new_mode'. Not threadsafe. + */ +static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode, + b_predicate pred, void *context) +{ + struct lru_entry *le; + struct dm_buffer *b; + struct evict_wrapper w = {.pred = pred, .context = context}; + + while (true) { + le = lru_evict(&bc->lru[old_mode], __evict_pred, &w); + if (!le) + break; + + b = le_to_buffer(le); + b->list_mode = new_mode; + lru_insert(&bc->lru[b->list_mode], &b->lru); + } +} + +/*--------------*/ + +/* + * Iterates through all clean or dirty entries calling a function for each + * entry. The callback may terminate the iteration early. Not threadsafe. + */ + +/* + * Iterator functions should return one of these actions to indicate + * how the iteration should proceed. + */ +enum it_action { + IT_NEXT, + IT_COMPLETE, +}; + +typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context); + +static void cache_iterate(struct dm_buffer_cache *bc, int list_mode, + iter_fn fn, void *context) +{ + struct lru *lru = &bc->lru[list_mode]; + struct lru_entry *le, *first; + + if (!lru->cursor) + return; + + first = le = to_le(lru->cursor); + do { + struct dm_buffer *b = le_to_buffer(le); + + switch (fn(b, context)) { + case IT_NEXT: + break; + + case IT_COMPLETE: + return; + } + cond_resched(); + + le = to_le(le->list.next); + } while (le != first); +} + +/*--------------*/ + +/* + * Passes ownership of the buffer to the cache. Returns false if the + * buffer was already present (in which case ownership does not pass). + * eg, a race with another thread. + * + * Holder count should be 1 on insertion. + * + * Not threadsafe. + */ +static bool __cache_insert(struct rb_root *root, struct dm_buffer *b) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + struct dm_buffer *found; + + while (*new) { + found = container_of(*new, struct dm_buffer, node); + + if (found->block == b->block) + return false; + + parent = *new; + new = b->block < found->block ? 
+ &found->node.rb_left : &found->node.rb_right; + } + + rb_link_node(&b->node, parent, new); + rb_insert_color(&b->node, root); + + return true; +} + +static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b) +{ + bool r; + + if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE)) + return false; + + cache_write_lock(bc, b->block); + BUG_ON(atomic_read(&b->hold_count) != 1); + r = __cache_insert(&bc->trees[cache_index(b->block)].root, b); + if (r) + lru_insert(&bc->lru[b->list_mode], &b->lru); + cache_write_unlock(bc, b->block); + + return r; +} + +/*--------------*/ + +/* + * Removes buffer from cache, ownership of the buffer passes back to the caller. + * Fails if the hold_count is not one (ie. the caller holds the only reference). + * + * Not threadsafe. + */ +static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b) +{ + bool r; + + cache_write_lock(bc, b->block); + + if (atomic_read(&b->hold_count) != 1) { + r = false; + } else { + r = true; + rb_erase(&b->node, &bc->trees[cache_index(b->block)].root); + lru_remove(&bc->lru[b->list_mode], &b->lru); + } + + cache_write_unlock(bc, b->block); + + return r; +} + +/*--------------*/ + +typedef void (*b_release)(struct dm_buffer *); + +static struct dm_buffer *__find_next(struct rb_root *root, sector_t block) +{ + struct rb_node *n = root->rb_node; + struct dm_buffer *b; + struct dm_buffer *best = NULL; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + if (block <= b->block) { + n = n->rb_left; + best = b; + } else { + n = n->rb_right; + } + } + + return best; +} + +static void __remove_range(struct dm_buffer_cache *bc, + struct rb_root *root, + sector_t begin, sector_t end, + b_predicate pred, b_release release) +{ + struct dm_buffer *b; + + while (true) { + cond_resched(); + + b = __find_next(root, begin); + if (!b || (b->block >= end)) + break; + + begin = b->block + 1; + + if (atomic_read(&b->hold_count)) + continue; + + if (pred(b, NULL) == ER_EVICT) { + rb_erase(&b->node, root); + lru_remove(&bc->lru[b->list_mode], &b->lru); + release(b); + } + } +} + +static void cache_remove_range(struct dm_buffer_cache *bc, + sector_t begin, sector_t end, + b_predicate pred, b_release release) +{ + unsigned int i; + + for (i = 0; i < NR_LOCKS; i++) { + down_write(&bc->trees[i].lock); + __remove_range(bc, &bc->trees[i].root, begin, end, pred, release); + up_write(&bc->trees[i].lock); + } +} + +/*----------------------------------------------------------------*/ + /* * Linking of buffers: * All buffers are linked to buffer_tree with their node field. @@ -352,53 +863,6 @@ struct dm_bufio_client { atomic_long_t need_shrink; }; -/* - * Buffer state bits. - */ -#define B_READING 0 -#define B_WRITING 1 -#define B_DIRTY 2 - -/* - * Describes how the block was allocated: - * kmem_cache_alloc(), __get_free_pages() or vmalloc(). - * See the comment at alloc_buffer_data. 
- */ -enum data_mode { - DATA_MODE_SLAB = 0, - DATA_MODE_GET_FREE_PAGES = 1, - DATA_MODE_VMALLOC = 2, - DATA_MODE_LIMIT = 3 -}; - -struct dm_buffer { - struct rb_node node; - struct list_head lru_list; - struct list_head global_list; - sector_t block; - void *data; - unsigned char data_mode; /* DATA_MODE_* */ - unsigned char list_mode; /* LIST_* */ - blk_status_t read_error; - blk_status_t write_error; - unsigned int accessed; - unsigned int hold_count; - unsigned long state; - unsigned long last_accessed; - unsigned int dirty_start; - unsigned int dirty_end; - unsigned int write_start; - unsigned int write_end; - struct dm_bufio_client *c; - struct list_head write_list; - void (*end_io)(struct dm_buffer *buf, blk_status_t stat); -#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING -#define MAX_STACK 10 - unsigned int stack_len; - unsigned long stack_entries[MAX_STACK]; -#endif -}; - static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); /*----------------------------------------------------------------*/ @@ -516,7 +980,7 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) return NULL; } -static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block) +static struct dm_buffer *__find_next_old(struct dm_bufio_client *c, sector_t block) { struct rb_node *n = c->buffer_tree.rb_node; struct dm_buffer *b; @@ -1040,7 +1504,7 @@ static void __flush_write_list(struct list_head *write_list) */ static void __make_buffer_clean(struct dm_buffer *b) { - BUG_ON(b->hold_count); + BUG_ON(b->__hold_count); /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */ if (!smp_load_acquire(&b->state)) /* fast case */ @@ -1067,7 +1531,7 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) unlikely(test_bit_acquire(B_READING, &b->state))) continue; - if (!b->hold_count) { + if (!b->__hold_count) { __make_buffer_clean(b); __unlink_buffer(b); return b; @@ -1081,7 +1545,7 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { BUG_ON(test_bit(B_READING, &b->state)); - if (!b->hold_count) { + if (!b->__hold_count) { __make_buffer_clean(b); __unlink_buffer(b); return b; @@ -1283,7 +1747,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, __check_watermark(c, write_list); b = new_b; - b->hold_count = 1; + b->__hold_count = 1; b->read_error = 0; b->write_error = 0; __link_buffer(b, block, LIST_CLEAN); @@ -1311,7 +1775,7 @@ found_buffer: if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) return NULL; - b->hold_count++; + b->__hold_count++; __relink_lru(b, test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state)); return b; @@ -1460,10 +1924,10 @@ void dm_bufio_release(struct dm_buffer *b) dm_bufio_lock(c); - BUG_ON(!b->hold_count); + BUG_ON(!b->__hold_count); - b->hold_count--; - if (!b->hold_count) { + b->__hold_count--; + if (!b->__hold_count) { wake_up(&c->free_buffer_wait); /* @@ -1564,12 +2028,12 @@ again: if (test_bit(B_WRITING, &b->state)) { if (buffers_processed < c->n_buffers[LIST_DIRTY]) { dropped_lock = 1; - b->hold_count++; + b->__hold_count++; dm_bufio_unlock(c); wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); dm_bufio_lock(c); - b->hold_count--; + b->__hold_count--; } else wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); @@ -1660,7 +2124,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); static void forget_buffer_locked(struct dm_buffer *b) { - if (likely(!b->hold_count) && 
likely(!smp_load_acquire(&b->state))) { + if (likely(!b->__hold_count) && likely(!smp_load_acquire(&b->state))) { __unlink_buffer(b); __free_buffer_wake(b); } @@ -1694,7 +2158,7 @@ void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t while (block < end_block) { dm_bufio_lock(c); - b = __find_next(c, block); + b = __find_next_old(c, block); if (b) { block = b->block + 1; forget_buffer_locked(b); @@ -1791,7 +2255,7 @@ static void drop_buffers(struct dm_bufio_client *c) WARN_ON(!warned); warned = true; DMERR("leaked buffer %llx, hold count %u, list %d", - (unsigned long long)b->block, b->hold_count, i); + (unsigned long long)b->block, b->__hold_count, i); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING stack_trace_print(b->stack_entries, b->stack_len, 1); /* mark unclaimed to avoid WARN_ON below */ @@ -1828,7 +2292,7 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) return false; } - if (b->hold_count) + if (b->__hold_count) return false; __make_buffer_clean(b); -- cgit v1.2.3 From 450e8dee51aa6fa1dd0f64073e88235f1a77b035 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 10 Mar 2023 16:02:29 +0000 Subject: dm bufio: improve concurrent IO performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When multiple threads perform IO to a thin device, the underlying dm_bufio object can become a bottleneck; slowing down access to btree nodes that store the thin metadata. Prior to this commit, each bufio instance had a single mutex that was taken for every bufio operation. This commit concentrates on improving the common case where: a user of dm_bufio wishes to access, but not modify, a buffer which is already within the dm_bufio cache. Implementation:: The code has been refactored; pulling out an 'lru' abstraction and a 'buffer cache' abstraction (see 2 previous commits). This commit updates higher level bufio code (that performs allocation of buffers, IO and eviction/cache sizing) to leverage both abstractions. It also deals with the delicate locking requirements of both abstractions to provide finer grained locking. The result is significantly better concurrent IO performance. Before this commit, bufio has a global lru list it used to evict the oldest, clean buffers from _all_ clients. With the new locking we don’t want different ways to access the same buffer, so instead do_global_cleanup() loops around the clients asking them to free buffers older than a certain time. This commit also converts many old BUG_ONs to WARN_ON_ONCE, see the lru_evict and cache_evict code in particular. They will return ER_DONT_EVICT if a given buffer somehow meets the invariants that should _never_ happen. [Aside from revising this commit's header and fixing coding style and whitespace nits: this switching to WARN_ON_ONCE is Mike Snitzer's lone contribution to this commit] Testing:: Some of the low level functions have been unit tested using dm-unit: https://github.com/jthornber/dm-unit/blob/main/src/tests/bufio.rs Higher level concurrency and IO is tested via a test only target found here: https://github.com/jthornber/linux/blob/2023-03-24-thin-concurrency-9/drivers/md/dm-bufio-test.c The associated userland side of these tests is here: https://github.com/jthornber/dmtest-python/blob/main/src/dmtest/bufio/bufio_tests.py In addition the full dmtest suite of tests (dm-thin, dm-cache, etc) has been run (~450 tests). Performance:: Most bufio operations have unchanged performance. 
But if multiple threads are attempting to get buffers concurrently, and these buffers are already in the cache then there's a big speed up. Eg, one test has 16 'hotspot' threads simulating btree lookups while another thread dirties the whole device. In this case the hotspot threads acquire the buffers about 25 times faster. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 941 +++++++++++++++++++++++++------------------------- 1 file changed, 478 insertions(+), 463 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 9daff9b77cee..1e000ec73bd6 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -321,37 +321,42 @@ enum data_mode { }; struct dm_buffer { + /* protected by the locks in dm_buffer_cache */ struct rb_node node; - struct list_head lru_list; - struct list_head global_list; + /* immutable, so don't need protecting */ sector_t block; void *data; unsigned char data_mode; /* DATA_MODE_* */ - unsigned int accessed; - unsigned int __hold_count; + /* + * These two fields are used in isolation, so do not need + * a surrounding lock. + */ + atomic_t hold_count; unsigned long last_accessed; + /* + * Everything else is protected by the mutex in + * dm_bufio_client + */ + unsigned long state; + struct lru_entry lru; unsigned char list_mode; /* LIST_* */ blk_status_t read_error; blk_status_t write_error; - unsigned long state; unsigned int dirty_start; unsigned int dirty_end; unsigned int write_start; unsigned int write_end; - struct dm_bufio_client *c; struct list_head write_list; + struct dm_bufio_client *c; void (*end_io)(struct dm_buffer *b, blk_status_t bs); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING #define MAX_STACK 10 unsigned int stack_len; unsigned long stack_entries[MAX_STACK]; #endif - /* Temp members to allow dm_buffer_cache code to compile */ - atomic_t hold_count; - struct lru_entry lru; }; /*--------------------------------------------------------------*/ @@ -814,7 +819,7 @@ static void cache_remove_range(struct dm_buffer_cache *bc, /* * Linking of buffers: - * All buffers are linked to buffer_tree with their node field. + * All buffers are linked to buffer_cache with their node field. * * Clean buffers that are not being written (B_WRITING not set) * are linked to lru[LIST_CLEAN] with their lru_list field. @@ -832,9 +837,6 @@ struct dm_bufio_client { spinlock_t spinlock; bool no_sleep; - struct list_head lru[LIST_SIZE]; - unsigned long n_buffers[LIST_SIZE]; - struct block_device *bdev; unsigned int block_size; s8 sectors_per_block_bits; @@ -849,7 +851,7 @@ struct dm_bufio_client { unsigned int minimum_buffers; - struct rb_root buffer_tree; + struct dm_buffer_cache cache; wait_queue_head_t free_buffer_wait; sector_t start; @@ -861,6 +863,11 @@ struct dm_bufio_client { struct shrinker shrinker; struct work_struct shrink_work; atomic_long_t need_shrink; + + /* + * Used by global_cleanup to sort the clients list. 
+ */ + unsigned long oldest_buffer; }; static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); @@ -877,14 +884,6 @@ static void dm_bufio_lock(struct dm_bufio_client *c) mutex_lock_nested(&c->lock, dm_bufio_in_request()); } -static int dm_bufio_trylock(struct dm_bufio_client *c) -{ - if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) - return spin_trylock_bh(&c->spinlock); - else - return mutex_trylock(&c->lock); -} - static void dm_bufio_unlock(struct dm_bufio_client *c) { if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) @@ -913,10 +912,6 @@ static unsigned long dm_bufio_cache_size_latch; static DEFINE_SPINLOCK(global_spinlock); -static LIST_HEAD(global_queue); - -static unsigned long global_num; - /* * Buffers are freed after this timeout */ @@ -958,78 +953,6 @@ static void buffer_record_stack(struct dm_buffer *b) } #endif -/* - *---------------------------------------------------------------- - * A red/black tree acts as an index for all the buffers. - *---------------------------------------------------------------- - */ -static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) -{ - struct rb_node *n = c->buffer_tree.rb_node; - struct dm_buffer *b; - - while (n) { - b = container_of(n, struct dm_buffer, node); - - if (b->block == block) - return b; - - n = block < b->block ? n->rb_left : n->rb_right; - } - - return NULL; -} - -static struct dm_buffer *__find_next_old(struct dm_bufio_client *c, sector_t block) -{ - struct rb_node *n = c->buffer_tree.rb_node; - struct dm_buffer *b; - struct dm_buffer *best = NULL; - - while (n) { - b = container_of(n, struct dm_buffer, node); - - if (b->block == block) - return b; - - if (block <= b->block) { - n = n->rb_left; - best = b; - } else { - n = n->rb_right; - } - } - - return best; -} - -static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) -{ - struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; - struct dm_buffer *found; - - while (*new) { - found = container_of(*new, struct dm_buffer, node); - - if (found->block == b->block) { - BUG_ON(found != b); - return; - } - - parent = *new; - new = b->block < found->block ? 
- &found->node.rb_left : &found->node.rb_right; - } - - rb_link_node(&b->node, parent, new); - rb_insert_color(&b->node, &c->buffer_tree); -} - -static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) -{ - rb_erase(&b->node, &c->buffer_tree); -} - /*----------------------------------------------------------------*/ static void adjust_total_allocated(struct dm_buffer *b, bool unlink) @@ -1057,16 +980,9 @@ static void adjust_total_allocated(struct dm_buffer *b, bool unlink) if (dm_bufio_current_allocated > dm_bufio_peak_allocated) dm_bufio_peak_allocated = dm_bufio_current_allocated; - b->accessed = 1; - if (!unlink) { - list_add(&b->global_list, &global_queue); - global_num++; if (dm_bufio_current_allocated > dm_bufio_cache_size) queue_work(dm_bufio_wq, &dm_bufio_replacement_work); - } else { - list_del(&b->global_list); - global_num--; } spin_unlock(&global_spinlock); @@ -1196,6 +1112,7 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) kmem_cache_free(c->slab_buffer, b); return NULL; } + adjust_total_allocated(b, false); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING b->stack_len = 0; @@ -1210,61 +1127,11 @@ static void free_buffer(struct dm_buffer *b) { struct dm_bufio_client *c = b->c; + adjust_total_allocated(b, true); free_buffer_data(c, b->data, b->data_mode); kmem_cache_free(c->slab_buffer, b); } -/* - * Link buffer to the buffer tree and clean or dirty queue. - */ -static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) -{ - struct dm_bufio_client *c = b->c; - - c->n_buffers[dirty]++; - b->block = block; - b->list_mode = dirty; - list_add(&b->lru_list, &c->lru[dirty]); - __insert(b->c, b); - b->last_accessed = jiffies; - - adjust_total_allocated(b, false); -} - -/* - * Unlink buffer from the buffer tree and dirty or clean queue. - */ -static void __unlink_buffer(struct dm_buffer *b) -{ - struct dm_bufio_client *c = b->c; - - BUG_ON(!c->n_buffers[b->list_mode]); - - c->n_buffers[b->list_mode]--; - __remove(b->c, b); - list_del(&b->lru_list); - - adjust_total_allocated(b, true); -} - -/* - * Place the buffer to the head of dirty or clean LRU queue. - */ -static void __relink_lru(struct dm_buffer *b, int dirty) -{ - struct dm_bufio_client *c = b->c; - - b->accessed = 1; - - BUG_ON(!c->n_buffers[b->list_mode]); - - c->n_buffers[b->list_mode]--; - c->n_buffers[dirty]++; - b->list_mode = dirty; - list_move(&b->lru_list, &c->lru[dirty]); - b->last_accessed = jiffies; -} - /* *-------------------------------------------------------------------------- * Submit I/O on the buffer. 
@@ -1504,7 +1371,7 @@ static void __flush_write_list(struct list_head *write_list) */ static void __make_buffer_clean(struct dm_buffer *b) { - BUG_ON(b->__hold_count); + BUG_ON(atomic_read(&b->hold_count)); /* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */ if (!smp_load_acquire(&b->state)) /* fast case */ @@ -1515,6 +1382,36 @@ static void __make_buffer_clean(struct dm_buffer *b) wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); } +static enum evict_result is_clean(struct dm_buffer *b, void *context) +{ + struct dm_bufio_client *c = context; + + /* These should never happen */ + if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state))) + return ER_DONT_EVICT; + if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state))) + return ER_DONT_EVICT; + if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN)) + return ER_DONT_EVICT; + + if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep && + unlikely(test_bit(B_READING, &b->state))) + return ER_DONT_EVICT; + + return ER_EVICT; +} + +static enum evict_result is_dirty(struct dm_buffer *b, void *context) +{ + /* These should never happen */ + if (WARN_ON_ONCE(test_bit(B_READING, &b->state))) + return ER_DONT_EVICT; + if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY)) + return ER_DONT_EVICT; + + return ER_EVICT; +} + /* * Find some buffer that is not held by anybody, clean it, unlink it and * return it. @@ -1523,34 +1420,20 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) { struct dm_buffer *b; - list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) { - BUG_ON(test_bit(B_WRITING, &b->state)); - BUG_ON(test_bit(B_DIRTY, &b->state)); - - if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep && - unlikely(test_bit_acquire(B_READING, &b->state))) - continue; - - if (!b->__hold_count) { - __make_buffer_clean(b); - __unlink_buffer(b); - return b; - } - cond_resched(); + b = cache_evict(&c->cache, LIST_CLEAN, is_clean, c); + if (b) { + /* this also waits for pending reads */ + __make_buffer_clean(b); + return b; } if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep) return NULL; - list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { - BUG_ON(test_bit(B_READING, &b->state)); - - if (!b->__hold_count) { - __make_buffer_clean(b); - __unlink_buffer(b); - return b; - } - cond_resched(); + b = cache_evict(&c->cache, LIST_DIRTY, is_dirty, NULL); + if (b) { + __make_buffer_clean(b); + return b; } return NULL; @@ -1571,7 +1454,12 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) set_current_state(TASK_UNINTERRUPTIBLE); dm_bufio_unlock(c); - io_schedule(); + /* + * It's possible to miss a wake up event since we don't always + * hold c->lock when wake_up is called. So we have a timeout here, + * just in case. 
+ */ + io_schedule_timeout(5 * HZ); remove_wait_queue(&c->free_buffer_wait, &wait); @@ -1629,9 +1517,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client } if (!list_empty(&c->reserved_buffers)) { - b = list_entry(c->reserved_buffers.next, - struct dm_buffer, lru_list); - list_del(&b->lru_list); + b = list_to_buffer(c->reserved_buffers.next); + list_del(&b->lru.list); c->need_reserved_buffers++; return b; @@ -1665,36 +1552,56 @@ static void __free_buffer_wake(struct dm_buffer *b) { struct dm_bufio_client *c = b->c; + b->block = -1; if (!c->need_reserved_buffers) free_buffer(b); else { - list_add(&b->lru_list, &c->reserved_buffers); + list_add(&b->lru.list, &c->reserved_buffers); c->need_reserved_buffers--; } wake_up(&c->free_buffer_wait); } -static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, - struct list_head *write_list) +static enum evict_result cleaned(struct dm_buffer *b, void *context) { - struct dm_buffer *b, *tmp; + if (WARN_ON_ONCE(test_bit(B_READING, &b->state))) + return ER_DONT_EVICT; /* should never happen */ - list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { - BUG_ON(test_bit(B_READING, &b->state)); + if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state)) + return ER_DONT_EVICT; + else + return ER_EVICT; +} - if (!test_bit(B_DIRTY, &b->state) && - !test_bit(B_WRITING, &b->state)) { - __relink_lru(b, LIST_CLEAN); - continue; - } +static void __move_clean_buffers(struct dm_bufio_client *c) +{ + cache_mark_many(&c->cache, LIST_DIRTY, LIST_CLEAN, cleaned, NULL); +} - if (no_wait && test_bit(B_WRITING, &b->state)) - return; +struct write_context { + int no_wait; + struct list_head *write_list; +}; - __write_dirty_buffer(b, write_list); - cond_resched(); - } +static enum it_action write_one(struct dm_buffer *b, void *context) +{ + struct write_context *wc = context; + + if (wc->no_wait && test_bit(B_WRITING, &b->state)) + return IT_COMPLETE; + + __write_dirty_buffer(b, wc->write_list); + return IT_NEXT; +} + +static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, + struct list_head *write_list) +{ + struct write_context wc = {.no_wait = no_wait, .write_list = write_list}; + + __move_clean_buffers(c); + cache_iterate(&c->cache, LIST_DIRTY, write_one, &wc); } /* @@ -1705,7 +1612,8 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, static void __check_watermark(struct dm_bufio_client *c, struct list_head *write_list) { - if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO) + if (cache_count(&c->cache, LIST_DIRTY) > + cache_count(&c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO) __write_dirty_buffers_async(c, 1, write_list); } @@ -1715,6 +1623,21 @@ static void __check_watermark(struct dm_bufio_client *c, *-------------------------------------------------------------- */ +static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b) +{ + /* + * Relying on waitqueue_active() is racey, but we sleep + * with schedule_timeout anyway. + */ + if (cache_put(&c->cache, b) && + unlikely(waitqueue_active(&c->free_buffer_wait))) + wake_up(&c->free_buffer_wait); +} + +/* + * This assumes you have already checked the cache to see if the buffer + * is already present (it will recheck after dropping the lock for allocation). 
+ */ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, enum new_flag nf, int *need_submit, struct list_head *write_list) @@ -1723,11 +1646,8 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, *need_submit = 0; - b = __find(c, block); - if (b) - goto found_buffer; - - if (nf == NF_GET) + /* This can't be called with NF_GET */ + if (WARN_ON_ONCE(nf == NF_GET)) return NULL; new_b = __alloc_buffer_wait(c, nf); @@ -1738,7 +1658,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, * We've had a period where the mutex was unlocked, so need to * recheck the buffer tree. */ - b = __find(c, block); + b = cache_get(&c->cache, block); if (b) { __free_buffer_wake(new_b); goto found_buffer; @@ -1747,24 +1667,35 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, __check_watermark(c, write_list); b = new_b; - b->__hold_count = 1; + atomic_set(&b->hold_count, 1); + WRITE_ONCE(b->last_accessed, jiffies); + b->block = block; b->read_error = 0; b->write_error = 0; - __link_buffer(b, block, LIST_CLEAN); + b->list_mode = LIST_CLEAN; - if (nf == NF_FRESH) { + if (nf == NF_FRESH) b->state = 0; - return b; + else { + b->state = 1 << B_READING; + *need_submit = 1; } - b->state = 1 << B_READING; - *need_submit = 1; + /* + * We mustn't insert into the cache until the B_READING state + * is set. Otherwise another thread could get it and use + * it before it had been read. + */ + cache_insert(&c->cache, b); return b; found_buffer: - if (nf == NF_PREFETCH) + if (nf == NF_PREFETCH) { + cache_put_and_wake(c, b); return NULL; + } + /* * Note: it is essential that we don't wait for the buffer to be * read if dm_bufio_get function is used. Both dm_bufio_get and @@ -1772,12 +1703,11 @@ found_buffer: * If the user called both dm_bufio_prefetch and dm_bufio_get on * the same buffer, it would deadlock if we waited. */ - if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) + if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) { + cache_put_and_wake(c, b); return NULL; + } - b->__hold_count++; - __relink_lru(b, test_bit(B_DIRTY, &b->state) || - test_bit(B_WRITING, &b->state)); return b; } @@ -1807,18 +1737,50 @@ static void read_endio(struct dm_buffer *b, blk_status_t status) static void *new_read(struct dm_bufio_client *c, sector_t block, enum new_flag nf, struct dm_buffer **bp) { - int need_submit; + int need_submit = 0; struct dm_buffer *b; LIST_HEAD(write_list); - dm_bufio_lock(c); - b = __bufio_new(c, block, nf, &need_submit, &write_list); + *bp = NULL; + + /* + * Fast path, hopefully the block is already in the cache. No need + * to get the client lock for this. + */ + b = cache_get(&c->cache, block); + if (b) { + if (nf == NF_PREFETCH) { + cache_put_and_wake(c, b); + return NULL; + } + + /* + * Note: it is essential that we don't wait for the buffer to be + * read if dm_bufio_get function is used. Both dm_bufio_get and + * dm_bufio_prefetch can be used in the driver request routine. + * If the user called both dm_bufio_prefetch and dm_bufio_get on + * the same buffer, it would deadlock if we waited. 
+ */ + if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) { + cache_put_and_wake(c, b); + return NULL; + } + } + + if (!b) { + if (nf == NF_GET) + return NULL; + + dm_bufio_lock(c); + b = __bufio_new(c, block, nf, &need_submit, &write_list); + dm_bufio_unlock(c); + } + #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - if (b && b->hold_count == 1) + if (b && (atomic_read(&b->hold_count) == 1)) buffer_record_stack(b); #endif - dm_bufio_unlock(c); __flush_write_list(&write_list); @@ -1881,12 +1843,19 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, return; /* should never happen */ blk_start_plug(&plug); - dm_bufio_lock(c); for (; n_blocks--; block++) { int need_submit; struct dm_buffer *b; + b = cache_get(&c->cache, block); + if (b) { + /* already in cache */ + cache_put_and_wake(c, b); + continue; + } + + dm_bufio_lock(c); b = __bufio_new(c, block, NF_PREFETCH, &need_submit, &write_list); if (unlikely(!list_empty(&write_list))) { @@ -1909,10 +1878,9 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, goto flush_plug; dm_bufio_lock(c); } + dm_bufio_unlock(c); } - dm_bufio_unlock(c); - flush_plug: blk_finish_plug(&plug); } @@ -1922,29 +1890,28 @@ void dm_bufio_release(struct dm_buffer *b) { struct dm_bufio_client *c = b->c; - dm_bufio_lock(c); - - BUG_ON(!b->__hold_count); - - b->__hold_count--; - if (!b->__hold_count) { - wake_up(&c->free_buffer_wait); + /* + * If there were errors on the buffer, and the buffer is not + * to be written, free the buffer. There is no point in caching + * invalid buffer. + */ + if ((b->read_error || b->write_error) && + !test_bit_acquire(B_READING, &b->state) && + !test_bit(B_WRITING, &b->state) && + !test_bit(B_DIRTY, &b->state)) { + dm_bufio_lock(c); - /* - * If there were errors on the buffer, and the buffer is not - * to be written, free the buffer. There is no point in caching - * invalid buffer. - */ - if ((b->read_error || b->write_error) && - !test_bit_acquire(B_READING, &b->state) && - !test_bit(B_WRITING, &b->state) && - !test_bit(B_DIRTY, &b->state)) { - __unlink_buffer(b); + /* cache remove can fail if there are other holders */ + if (cache_remove(&c->cache, b)) { __free_buffer_wake(b); + dm_bufio_unlock(c); + return; } + + dm_bufio_unlock(c); } - dm_bufio_unlock(c); + cache_put_and_wake(c, b); } EXPORT_SYMBOL_GPL(dm_bufio_release); @@ -1963,7 +1930,7 @@ void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b, if (!test_and_set_bit(B_DIRTY, &b->state)) { b->dirty_start = start; b->dirty_end = end; - __relink_lru(b, LIST_DIRTY); + cache_mark(&c->cache, b, LIST_DIRTY); } else { if (start < b->dirty_start) b->dirty_start = start; @@ -2002,11 +1969,19 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); * * Finally, we flush hardware disk cache. 
*/ +static bool is_writing(struct lru_entry *e, void *context) +{ + struct dm_buffer *b = le_to_buffer(e); + + return test_bit(B_WRITING, &b->state); +} + int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) { int a, f; - unsigned long buffers_processed = 0; - struct dm_buffer *b, *tmp; + unsigned long nr_buffers; + struct lru_entry *e; + struct lru_iter it; LIST_HEAD(write_list); @@ -2016,52 +1991,32 @@ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) __flush_write_list(&write_list); dm_bufio_lock(c); -again: - list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { - int dropped_lock = 0; - - if (buffers_processed < c->n_buffers[LIST_DIRTY]) - buffers_processed++; + nr_buffers = cache_count(&c->cache, LIST_DIRTY); + lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it); + while ((e = lru_iter_next(&it, is_writing, c))) { + struct dm_buffer *b = le_to_buffer(e); + __cache_inc_buffer(b); BUG_ON(test_bit(B_READING, &b->state)); - if (test_bit(B_WRITING, &b->state)) { - if (buffers_processed < c->n_buffers[LIST_DIRTY]) { - dropped_lock = 1; - b->__hold_count++; - dm_bufio_unlock(c); - wait_on_bit_io(&b->state, B_WRITING, - TASK_UNINTERRUPTIBLE); - dm_bufio_lock(c); - b->__hold_count--; - } else - wait_on_bit_io(&b->state, B_WRITING, - TASK_UNINTERRUPTIBLE); + if (nr_buffers) { + nr_buffers--; + dm_bufio_unlock(c); + wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); + dm_bufio_lock(c); + } else { + wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); } - if (!test_bit(B_DIRTY, &b->state) && - !test_bit(B_WRITING, &b->state)) - __relink_lru(b, LIST_CLEAN); + if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state)) + cache_mark(&c->cache, b, LIST_CLEAN); - cond_resched(); + cache_put_and_wake(c, b); - /* - * If we dropped the lock, the list is no longer consistent, - * so we must restart the search. - * - * In the most common case, the buffer just processed is - * relinked to the clean list, so we won't loop scanning the - * same buffer again and again. - * - * This may livelock if there is another thread simultaneously - * dirtying buffers, so we count the number of buffers walked - * and if it exceeds the total number of buffers, it means that - * someone is doing some writes simultaneously with us. In - * this case, stop, dropping the lock. - */ - if (dropped_lock) - goto again; + cond_resched(); } + lru_iter_end(&it); + wake_up(&c->free_buffer_wait); dm_bufio_unlock(c); @@ -2122,12 +2077,23 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -static void forget_buffer_locked(struct dm_buffer *b) +static bool forget_buffer(struct dm_bufio_client *c, sector_t block) { - if (likely(!b->__hold_count) && likely(!smp_load_acquire(&b->state))) { - __unlink_buffer(b); - __free_buffer_wake(b); + struct dm_buffer *b; + + b = cache_get(&c->cache, block); + if (b) { + if (likely(!smp_load_acquire(&b->state))) { + if (cache_remove(&c->cache, b)) + __free_buffer_wake(b); + else + cache_put_and_wake(c, b); + } else { + cache_put_and_wake(c, b); + } } + + return b ? 
true : false; } /* @@ -2138,38 +2104,22 @@ static void forget_buffer_locked(struct dm_buffer *b) */ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) { - struct dm_buffer *b; - dm_bufio_lock(c); - - b = __find(c, block); - if (b) - forget_buffer_locked(b); - + forget_buffer(c, block); dm_bufio_unlock(c); } EXPORT_SYMBOL_GPL(dm_bufio_forget); -void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks) +static enum evict_result idle(struct dm_buffer *b, void *context) { - struct dm_buffer *b; - sector_t end_block = block + n_blocks; - - while (block < end_block) { - dm_bufio_lock(c); - - b = __find_next_old(c, block); - if (b) { - block = b->block + 1; - forget_buffer_locked(b); - } - - dm_bufio_unlock(c); - - if (!b) - break; - } + return b->state ? ER_DONT_EVICT : ER_EVICT; +} +void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks) +{ + dm_bufio_lock(c); + cache_remove_range(&c->cache, block, block + n_blocks, idle, __free_buffer_wake); + dm_bufio_unlock(c); } EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers); @@ -2231,11 +2181,26 @@ struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b) } EXPORT_SYMBOL_GPL(dm_bufio_get_client); +static enum it_action warn_leak(struct dm_buffer *b, void *context) +{ + bool *warned = context; + + WARN_ON(!(*warned)); + *warned = true; + DMERR("leaked buffer %llx, hold count %u, list %d", + (unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + stack_trace_print(b->stack_entries, b->stack_len, 1); + /* mark unclaimed to avoid WARN_ON at end of drop_buffers() */ + atomic_set(&b->hold_count, 0); +#endif + return IT_NEXT; +} + static void drop_buffers(struct dm_bufio_client *c) { - struct dm_buffer *b; int i; - bool warned = false; + struct dm_buffer *b; if (WARN_ON(dm_bufio_in_request())) return; /* should never happen */ @@ -2250,18 +2215,11 @@ static void drop_buffers(struct dm_bufio_client *c) while ((b = __get_unclaimed_buffer(c))) __free_buffer_wake(b); - for (i = 0; i < LIST_SIZE; i++) - list_for_each_entry(b, &c->lru[i], lru_list) { - WARN_ON(!warned); - warned = true; - DMERR("leaked buffer %llx, hold count %u, list %d", - (unsigned long long)b->block, b->__hold_count, i); -#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - stack_trace_print(b->stack_entries, b->stack_len, 1); - /* mark unclaimed to avoid WARN_ON below */ - b->hold_count = 0; -#endif - } + for (i = 0; i < LIST_SIZE; i++) { + bool warned = false; + + cache_iterate(&c->cache, i, warn_leak, &warned); + } #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING while ((b = __get_unclaimed_buffer(c))) @@ -2269,39 +2227,11 @@ static void drop_buffers(struct dm_bufio_client *c) #endif for (i = 0; i < LIST_SIZE; i++) - WARN_ON(!list_empty(&c->lru[i])); + WARN_ON(cache_count(&c->cache, i)); dm_bufio_unlock(c); } -/* - * We may not be able to evict this buffer if IO pending or the client - * is still using it. Caller is expected to know buffer is too old. - * - * And if GFP_NOFS is used, we must not do any I/O because we hold - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets - * rerouted to different bufio client. 
- */ -static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) -{ - if (!(gfp & __GFP_FS) || - (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { - if (test_bit_acquire(B_READING, &b->state) || - test_bit(B_WRITING, &b->state) || - test_bit(B_DIRTY, &b->state)) - return false; - } - - if (b->__hold_count) - return false; - - __make_buffer_clean(b); - __unlink_buffer(b); - __free_buffer_wake(b); - - return true; -} - static unsigned long get_retain_buffers(struct dm_bufio_client *c) { unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); @@ -2317,22 +2247,28 @@ static unsigned long get_retain_buffers(struct dm_bufio_client *c) static void __scan(struct dm_bufio_client *c) { int l; - struct dm_buffer *b, *tmp; + struct dm_buffer *b; unsigned long freed = 0; - unsigned long count = c->n_buffers[LIST_CLEAN] + - c->n_buffers[LIST_DIRTY]; unsigned long retain_target = get_retain_buffers(c); + unsigned long count = cache_total(&c->cache); for (l = 0; l < LIST_SIZE; l++) { - list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { + while (true) { if (count - freed <= retain_target) atomic_long_set(&c->need_shrink, 0); if (!atomic_long_read(&c->need_shrink)) - return; - if (__try_evict_buffer(b, GFP_KERNEL)) { - atomic_long_dec(&c->need_shrink); - freed++; - } + break; + + b = cache_evict(&c->cache, l, + l == LIST_CLEAN ? is_clean : is_dirty, c); + if (!b) + break; + + __make_buffer_clean(b); + __free_buffer_wake(b); + + atomic_long_dec(&c->need_shrink); + freed++; cond_resched(); } } @@ -2361,8 +2297,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); - unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) + - READ_ONCE(c->n_buffers[LIST_DIRTY]); + unsigned long count = cache_total(&c->cache); unsigned long retain_target = get_retain_buffers(c); unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink); @@ -2390,7 +2325,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign { int r; struct dm_bufio_client *c; - unsigned int i; char slab_name[27]; if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) { @@ -2404,7 +2338,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign r = -ENOMEM; goto bad_client; } - c->buffer_tree = RB_ROOT; + cache_init(&c->cache); c->bdev = bdev; c->block_size = block_size; @@ -2421,11 +2355,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign static_branch_inc(&no_sleep_enabled); } - for (i = 0; i < LIST_SIZE; i++) { - INIT_LIST_HEAD(&c->lru[i]); - c->n_buffers[i] = 0; - } - mutex_init(&c->lock); spin_lock_init(&c->spinlock); INIT_LIST_HEAD(&c->reserved_buffers); @@ -2497,9 +2426,9 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign bad: while (!list_empty(&c->reserved_buffers)) { - struct dm_buffer *b = list_entry(c->reserved_buffers.next, - struct dm_buffer, lru_list); - list_del(&b->lru_list); + struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next); + + list_del(&b->lru.list); free_buffer(b); } kmem_cache_destroy(c->slab_cache); @@ -2536,23 +2465,23 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) mutex_unlock(&dm_bufio_clients_lock); - WARN_ON(!RB_EMPTY_ROOT(&c->buffer_tree)); WARN_ON(c->need_reserved_buffers); while (!list_empty(&c->reserved_buffers)) { - 
struct dm_buffer *b = list_entry(c->reserved_buffers.next, - struct dm_buffer, lru_list); - list_del(&b->lru_list); + struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next); + + list_del(&b->lru.list); free_buffer(b); } for (i = 0; i < LIST_SIZE; i++) - if (c->n_buffers[i]) - DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]); + if (cache_count(&c->cache, i)) + DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i)); for (i = 0; i < LIST_SIZE; i++) - WARN_ON(c->n_buffers[i]); + WARN_ON(cache_count(&c->cache, i)); + cache_destroy(&c->cache); kmem_cache_destroy(c->slab_cache); kmem_cache_destroy(c->slab_buffer); dm_io_client_destroy(c->dm_io); @@ -2569,6 +2498,8 @@ void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start) } EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); +/*--------------------------------------------------------------*/ + static unsigned int get_max_age_hz(void) { unsigned int max_age = READ_ONCE(dm_bufio_max_age); @@ -2581,13 +2512,74 @@ static unsigned int get_max_age_hz(void) static bool older_than(struct dm_buffer *b, unsigned long age_hz) { - return time_after_eq(jiffies, b->last_accessed + age_hz); + return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz); } -static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) +struct evict_params { + gfp_t gfp; + unsigned long age_hz; + + /* + * This gets updated with the largest last_accessed (ie. most + * recently used) of the evicted buffers. It will not be reinitialised + * by __evict_many(), so you can use it across multiple invocations. + */ + unsigned long last_accessed; +}; + +/* + * We may not be able to evict this buffer if IO pending or the client + * is still using it. + * + * And if GFP_NOFS is used, we must not do any I/O because we hold + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets + * rerouted to different bufio client. + */ +static enum evict_result select_for_evict(struct dm_buffer *b, void *context) +{ + struct evict_params *params = context; + + if (!(params->gfp & __GFP_FS) || + (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { + if (test_bit_acquire(B_READING, &b->state) || + test_bit(B_WRITING, &b->state) || + test_bit(B_DIRTY, &b->state)) + return ER_DONT_EVICT; + } + + return older_than(b, params->age_hz) ? 
ER_EVICT : ER_STOP; +} + +static unsigned long __evict_many(struct dm_bufio_client *c, + struct evict_params *params, + int list_mode, unsigned long max_count) { - struct dm_buffer *b, *tmp; - unsigned long retain_target = get_retain_buffers(c); + unsigned long count; + unsigned long last_accessed; + struct dm_buffer *b; + + for (count = 0; count < max_count; count++) { + b = cache_evict(&c->cache, list_mode, select_for_evict, params); + if (!b) + break; + + last_accessed = READ_ONCE(b->last_accessed); + if (time_after_eq(params->last_accessed, last_accessed)) + params->last_accessed = last_accessed; + + __make_buffer_clean(b); + __free_buffer_wake(b); + + cond_resched(); + } + + return count; +} + +static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) +{ + struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0}; + unsigned long retain = get_retain_buffers(c); unsigned long count; LIST_HEAD(write_list); @@ -2600,112 +2592,135 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) dm_bufio_lock(c); } - count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; - list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { - if (count <= retain_target) - break; - - if (!older_than(b, age_hz)) - break; - - if (__try_evict_buffer(b, 0)) - count--; - - cond_resched(); - } + count = cache_total(&c->cache); + if (count > retain) + __evict_many(c, ¶ms, LIST_CLEAN, count - retain); dm_bufio_unlock(c); } -static void do_global_cleanup(struct work_struct *w) +static void cleanup_old_buffers(void) { - struct dm_bufio_client *locked_client = NULL; - struct dm_bufio_client *current_client; - struct dm_buffer *b; - unsigned int spinlock_hold_count; - unsigned long threshold = dm_bufio_cache_size - - dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO; - unsigned long loops = global_num * 2; + unsigned long max_age_hz = get_max_age_hz(); + struct dm_bufio_client *c; mutex_lock(&dm_bufio_clients_lock); - while (1) { - cond_resched(); + __cache_size_refresh(); - spin_lock(&global_spinlock); - if (unlikely(dm_bufio_current_allocated <= threshold)) - break; + list_for_each_entry(c, &dm_bufio_all_clients, client_list) + evict_old_buffers(c, max_age_hz); - spinlock_hold_count = 0; -get_next: - if (!loops--) - break; - if (unlikely(list_empty(&global_queue))) - break; - b = list_entry(global_queue.prev, struct dm_buffer, global_list); - - if (b->accessed) { - b->accessed = 0; - list_move(&b->global_list, &global_queue); - if (likely(++spinlock_hold_count < 16)) - goto get_next; - spin_unlock(&global_spinlock); - continue; - } + mutex_unlock(&dm_bufio_clients_lock); +} - current_client = b->c; - if (unlikely(current_client != locked_client)) { - if (locked_client) - dm_bufio_unlock(locked_client); +static void work_fn(struct work_struct *w) +{ + cleanup_old_buffers(); - if (!dm_bufio_trylock(current_client)) { - spin_unlock(&global_spinlock); - dm_bufio_lock(current_client); - locked_client = current_client; - continue; - } + queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, + DM_BUFIO_WORK_TIMER_SECS * HZ); +} - locked_client = current_client; - } +/*--------------------------------------------------------------*/ - spin_unlock(&global_spinlock); +/* + * Global cleanup tries to evict the oldest buffers from across _all_ + * the clients. It does this by repeatedly evicting a few buffers from + * the client that holds the oldest buffer. It's approximate, but hopefully + * good enough. 
+ */ +static struct dm_bufio_client *__pop_client(void) +{ + struct list_head *h; - if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) { - spin_lock(&global_spinlock); - list_move(&b->global_list, &global_queue); - spin_unlock(&global_spinlock); - } + if (list_empty(&dm_bufio_all_clients)) + return NULL; + + h = dm_bufio_all_clients.next; + list_del(h); + return container_of(h, struct dm_bufio_client, client_list); +} + +/* + * Inserts the client in the global client list based on its + * 'oldest_buffer' field. + */ +static void __insert_client(struct dm_bufio_client *new_client) +{ + struct dm_bufio_client *c; + struct list_head *h = dm_bufio_all_clients.next; + + while (h != &dm_bufio_all_clients) { + c = container_of(h, struct dm_bufio_client, client_list); + if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer)) + break; + h = h->next; } - spin_unlock(&global_spinlock); + list_add_tail(&new_client->client_list, h); +} - if (locked_client) - dm_bufio_unlock(locked_client); +static unsigned long __evict_a_few(unsigned long nr_buffers) +{ + unsigned long count; + struct dm_bufio_client *c; + struct evict_params params = { + .gfp = GFP_KERNEL, + .age_hz = 0, + /* set to jiffies in case there are no buffers in this client */ + .last_accessed = jiffies + }; - mutex_unlock(&dm_bufio_clients_lock); + c = __pop_client(); + if (!c) + return 0; + + dm_bufio_lock(c); + count = __evict_many(c, ¶ms, LIST_CLEAN, nr_buffers); + dm_bufio_unlock(c); + + if (count) + c->oldest_buffer = params.last_accessed; + __insert_client(c); + + return count; } -static void cleanup_old_buffers(void) +static void check_watermarks(void) { - unsigned long max_age_hz = get_max_age_hz(); + LIST_HEAD(write_list); struct dm_bufio_client *c; mutex_lock(&dm_bufio_clients_lock); + list_for_each_entry(c, &dm_bufio_all_clients, client_list) { + dm_bufio_lock(c); + __check_watermark(c, &write_list); + dm_bufio_unlock(c); + } + mutex_unlock(&dm_bufio_clients_lock); - __cache_size_refresh(); + __flush_write_list(&write_list); +} - list_for_each_entry(c, &dm_bufio_all_clients, client_list) - __evict_old_buffers(c, max_age_hz); +static void evict_old(void) +{ + unsigned long threshold = dm_bufio_cache_size - + dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO; + mutex_lock(&dm_bufio_clients_lock); + while (dm_bufio_current_allocated > threshold) { + if (!__evict_a_few(64)) + break; + cond_resched(); + } mutex_unlock(&dm_bufio_clients_lock); } -static void work_fn(struct work_struct *w) +static void do_global_cleanup(struct work_struct *w) { - cleanup_old_buffers(); - - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, - DM_BUFIO_WORK_TIMER_SECS * HZ); + check_watermarks(); + evict_old(); } /* -- cgit v1.2.3 From 791188065be0d3715e6696412ab9c7a5f6b52bd3 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 24 Mar 2023 23:01:03 -0400 Subject: dm bufio: add lock_history optimization for cache iterators Sometimes it is beneficial to repeatedly get and drop locks as part of an iteration. Introduce lock_history struct to help avoid redundant drop and gets of the same lock. Optimizes cache_iterate, cache_mark_many and cache_evict. 
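The intended calling pattern, shown here only as a sketch with a hypothetical caller (the real users are the cache_evict, cache_mark_many and cache_iterate wrappers in the diff below), is:

	/*
	 * Sketch only: walk a caller-supplied array of blocks, taking the
	 * per-tree lock lazily.  lh_next() only drops and re-acquires when
	 * the block hashes to a different tree than the previous block.
	 */
	static void example_walk(struct dm_buffer_cache *bc,
				 sector_t *blocks, unsigned int count)
	{
		unsigned int i;
		struct lock_history lh;

		lh_init(&lh, bc, false);	/* false = take read locks */
		for (i = 0; i < count; i++) {
			lh_next(&lh, blocks[i]);
			/* ... inspect the buffer for blocks[i] under that lock ... */
		}
		lh_exit(&lh);			/* releases the last held lock */
	}
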
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 111 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 1e000ec73bd6..9ac50024006d 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -421,6 +421,70 @@ static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block up_write(&bc->trees[cache_index(block)].lock); } +/* + * Sometimes we want to repeatedly get and drop locks as part of an iteration. + * This struct helps avoid redundant drop and gets of the same lock. + */ +struct lock_history { + struct dm_buffer_cache *cache; + bool write; + unsigned int previous; +}; + +static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write) +{ + lh->cache = cache; + lh->write = write; + lh->previous = NR_LOCKS; /* indicates no previous */ +} + +static void __lh_lock(struct lock_history *lh, unsigned int index) +{ + if (lh->write) + down_write(&lh->cache->trees[index].lock); + else + down_read(&lh->cache->trees[index].lock); +} + +static void __lh_unlock(struct lock_history *lh, unsigned int index) +{ + if (lh->write) + up_write(&lh->cache->trees[index].lock); + else + up_read(&lh->cache->trees[index].lock); +} + +/* + * Make sure you call this since it will unlock the final lock. + */ +static void lh_exit(struct lock_history *lh) +{ + if (lh->previous != NR_LOCKS) { + __lh_unlock(lh, lh->previous); + lh->previous = NR_LOCKS; + } +} + +/* + * Named 'next' because there is no corresponding + * 'up/unlock' call since it's done automatically. + */ +static void lh_next(struct lock_history *lh, sector_t b) +{ + unsigned int index = cache_index(b); + + if (lh->previous != NR_LOCKS) { + if (lh->previous != index) { + __lh_unlock(lh, lh->previous); + __lh_lock(lh, index); + lh->previous = index; + } + } else { + __lh_lock(lh, index); + lh->previous = index; + } +} + static inline struct dm_buffer *le_to_buffer(struct lru_entry *le) { return container_of(le, struct dm_buffer, lru); @@ -550,6 +614,7 @@ typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *); * predicate the hold_count of the selected buffer will be zero. 
*/ struct evict_wrapper { + struct lock_history *lh; b_predicate pred; void *context; }; @@ -563,16 +628,19 @@ static enum evict_result __evict_pred(struct lru_entry *le, void *context) struct evict_wrapper *w = context; struct dm_buffer *b = le_to_buffer(le); + lh_next(w->lh, b->block); + if (atomic_read(&b->hold_count)) return ER_DONT_EVICT; return w->pred(b, w->context); } -static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode, - b_predicate pred, void *context) +static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode, + b_predicate pred, void *context, + struct lock_history *lh) { - struct evict_wrapper w = {.pred = pred, .context = context}; + struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context}; struct lru_entry *le; struct dm_buffer *b; @@ -587,6 +655,19 @@ static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode, return b; } +static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode, + b_predicate pred, void *context) +{ + struct dm_buffer *b; + struct lock_history lh; + + lh_init(&lh, bc, true); + b = __cache_evict(bc, list_mode, pred, context, &lh); + lh_exit(&lh); + + return b; +} + /*--------------*/ /* @@ -609,12 +690,12 @@ static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list * Runs through the lru associated with 'old_mode', if the predicate matches then * it moves them to 'new_mode'. Not threadsafe. */ -static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode, - b_predicate pred, void *context) +static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode, + b_predicate pred, void *context, struct lock_history *lh) { struct lru_entry *le; struct dm_buffer *b; - struct evict_wrapper w = {.pred = pred, .context = context}; + struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context}; while (true) { le = lru_evict(&bc->lru[old_mode], __evict_pred, &w); @@ -627,6 +708,16 @@ static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mo } } +static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode, + b_predicate pred, void *context) +{ + struct lock_history lh; + + lh_init(&lh, bc, true); + __cache_mark_many(bc, old_mode, new_mode, pred, context, &lh); + lh_exit(&lh); +} + /*--------------*/ /* @@ -645,8 +736,8 @@ enum it_action { typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context); -static void cache_iterate(struct dm_buffer_cache *bc, int list_mode, - iter_fn fn, void *context) +static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode, + iter_fn fn, void *context, struct lock_history *lh) { struct lru *lru = &bc->lru[list_mode]; struct lru_entry *le, *first; @@ -658,6 +749,8 @@ static void cache_iterate(struct dm_buffer_cache *bc, int list_mode, do { struct dm_buffer *b = le_to_buffer(le); + lh_next(lh, b->block); + switch (fn(b, context)) { case IT_NEXT: break; @@ -671,6 +764,16 @@ static void cache_iterate(struct dm_buffer_cache *bc, int list_mode, } while (le != first); } +static void cache_iterate(struct dm_buffer_cache *bc, int list_mode, + iter_fn fn, void *context) +{ + struct lock_history lh; + + lh_init(&lh, bc, false); + __cache_iterate(bc, list_mode, fn, context, &lh); + lh_exit(&lh); +} + /*--------------*/ /* -- cgit v1.2.3 From 530f683ddcd211a4dc5d60a13e6f4918a541bb8d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 21 Mar 2023 12:06:38 -0400 Subject: dm bufio: move dm_bufio_client members 
to avoid spanning cachelines Movement also consolidates holes in dm_bufio_client struct. But the overall size of the struct isn't changed. Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 9ac50024006d..e5459741335d 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -936,13 +936,16 @@ static void cache_remove_range(struct dm_buffer_cache *bc, * context. */ struct dm_bufio_client { - struct mutex lock; - spinlock_t spinlock; - bool no_sleep; - struct block_device *bdev; unsigned int block_size; s8 sectors_per_block_bits; + + bool no_sleep; + struct mutex lock; + spinlock_t spinlock; + + int async_write_error; + void (*alloc_callback)(struct dm_buffer *buf); void (*write_callback)(struct dm_buffer *buf); struct kmem_cache *slab_buffer; @@ -954,23 +957,22 @@ struct dm_bufio_client { unsigned int minimum_buffers; - struct dm_buffer_cache cache; - wait_queue_head_t free_buffer_wait; - sector_t start; - int async_write_error; - - struct list_head client_list; - struct shrinker shrinker; struct work_struct shrink_work; atomic_long_t need_shrink; + wait_queue_head_t free_buffer_wait; + + struct list_head client_list; + /* * Used by global_cleanup to sort the clients list. */ unsigned long oldest_buffer; + + struct dm_buffer_cache cache; }; static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); -- cgit v1.2.3 From f5f93541202f2e619c1395f5d98868b70cf60f50 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 23 Mar 2023 11:25:49 -0400 Subject: dm bufio: use waitqueue_active in __free_buffer_wake Save one spinlock by using waitqueue_active. We hold the bufio lock at this place, so no one can add entries to the waitqueue at this point. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index e5459741335d..cca43ed13fd1 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1665,7 +1665,12 @@ static void __free_buffer_wake(struct dm_buffer *b) c->need_reserved_buffers--; } - wake_up(&c->free_buffer_wait); + /* + * We hold the bufio lock here, so no one can add entries to the + * wait queue anyway. + */ + if (unlikely(waitqueue_active(&c->free_buffer_wait))) + wake_up(&c->free_buffer_wait); } static enum evict_result cleaned(struct dm_buffer *b, void *context) -- cgit v1.2.3 From 56c5de4406f8234123ad08d4615865dbc13f743a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 21 Mar 2023 10:18:10 -0400 Subject: dm bufio: use multi-page bio vector The kernel supports multi page bio vector entries, so we can use them in dm-bufio as an optimization. 
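For example, with a 64KiB bufio block and 4KiB pages the old code reserved one bio_vec per page and called bio_add_page() sixteen times; a single multi-page segment can now describe the whole range. A simplified sketch of the idea follows (assuming the buffer data is physically contiguous; bdev, sector and ptr stand in for the values use_bio() already has, and end_io setup is omitted):

	/* sketch: one multi-page bio_vec for a page-aligned 64KiB buffer */
	bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
	bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_OP_WRITE);
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, virt_to_page(ptr), 64 << 10, offset_in_page(ptr));
	submit_bio(bio);
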
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cca43ed13fd1..ae552644a0b4 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1312,19 +1312,14 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, { struct bio *bio; char *ptr; - unsigned int vec_size, len; + unsigned int len; - vec_size = b->c->block_size >> PAGE_SHIFT; - if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT)) - vec_size += 2; - - bio = bio_kmalloc(vec_size, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); + bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); if (!bio) { -dmio: use_dmio(b, op, sector, n_sectors, offset); return; } - bio_init(bio, b->c->bdev, bio->bi_inline_vecs, vec_size, op); + bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op); bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_complete; bio->bi_private = b; @@ -1332,18 +1327,7 @@ dmio: ptr = (char *)b->data + offset; len = n_sectors << SECTOR_SHIFT; - do { - unsigned int this_step = min((unsigned int)(PAGE_SIZE - offset_in_page(ptr)), len); - - if (!bio_add_page(bio, virt_to_page(ptr), this_step, - offset_in_page(ptr))) { - bio_put(bio); - goto dmio; - } - - len -= this_step; - ptr += this_step; - } while (len > 0); + __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr)); submit_bio(bio); } -- cgit v1.2.3 From bb46c56165faf284cf42c197317bff24f899835a Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 22 Mar 2023 13:34:49 +0000 Subject: dm thin: speed up cell_defer_no_holder() Reduce the time that a spinlock is held in cell_defer_no_holder(). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 13d4677baafd..00323428919e 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -883,15 +883,17 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c { struct pool *pool = tc->pool; unsigned long flags; - int has_work; + struct bio_list bios; - spin_lock_irqsave(&tc->lock, flags); - cell_release_no_holder(pool, cell, &tc->deferred_bio_list); - has_work = !bio_list_empty(&tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + bio_list_init(&bios); + cell_release_no_holder(pool, cell, &bios); - if (has_work) + if (!bio_list_empty(&bios)) { + spin_lock_irqsave(&tc->lock, flags); + bio_list_merge(&tc->deferred_bio_list, &bios); + spin_unlock_irqrestore(&tc->lock, flags); wake_worker(pool); + } } static void thin_defer_bio(struct thin_c *tc, struct bio *bio); -- cgit v1.2.3 From 06961c487a33a222fd3d84998dc6398ed0449373 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 1 Mar 2023 12:48:43 -0500 Subject: dm: split discards further if target sets max_discard_granularity The block core (bio_split_discard) will already split discards based on the 'discard_granularity' and 'max_discard_sectors' queue_limits. But the DM thin target also needs to ensure that it doesn't receive a discard that spans a 'max_discard_sectors' boundary. Introduce a dm_target 'max_discard_granularity' flag that if set will cause DM core to split discard bios relative to 'max_discard_sectors'. 
This treats 'discard_granularity' as a "min_discard_granularity" and 'max_discard_sectors' as a "max_discard_granularity". Requested-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 25 +++++++++++++++++++------ include/linux/device-mapper.h | 6 ++++++ include/uapi/linux/dm-ioctl.h | 4 ++-- 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dfde0088147a..20c6b72a0245 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1162,7 +1162,8 @@ static inline sector_t max_io_len_target_boundary(struct dm_target *ti, return ti->len - target_offset; } -static sector_t max_io_len(struct dm_target *ti, sector_t sector) +static sector_t __max_io_len(struct dm_target *ti, sector_t sector, + unsigned int max_granularity) { sector_t target_offset = dm_target_offset(ti, sector); sector_t len = max_io_len_target_boundary(ti, target_offset); @@ -1173,11 +1174,16 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector) * explains why stacked chunk_sectors based splitting via * bio_split_to_limits() isn't possible here. */ - if (!ti->max_io_len) + if (!max_granularity) return len; return min_t(sector_t, len, min(queue_max_sectors(ti->table->md->queue), - blk_chunk_sectors_left(target_offset, ti->max_io_len))); + blk_chunk_sectors_left(target_offset, max_granularity))); +} + +static inline sector_t max_io_len(struct dm_target *ti, sector_t sector) +{ + return __max_io_len(ti, sector, ti->max_io_len); } int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) @@ -1565,12 +1571,13 @@ static void __send_empty_flush(struct clone_info *ci) } static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, - unsigned int num_bios) + unsigned int num_bios, + unsigned int max_granularity) { unsigned int len, bios; len = min_t(sector_t, ci->sector_count, - max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); + __max_io_len(ti, ci->sector, max_granularity)); atomic_add(num_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, num_bios, &len); @@ -1606,10 +1613,16 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, struct dm_target *ti) { unsigned int num_bios = 0; + unsigned int max_granularity = 0; switch (bio_op(ci->bio)) { case REQ_OP_DISCARD: num_bios = ti->num_discard_bios; + if (ti->max_discard_granularity) { + struct queue_limits *limits = + dm_get_queue_limits(ti->table->md); + max_granularity = limits->max_discard_sectors; + } break; case REQ_OP_SECURE_ERASE: num_bios = ti->num_secure_erase_bios; @@ -1630,7 +1643,7 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, if (unlikely(!num_bios)) return BLK_STS_NOTSUPP; - __send_changing_extent_only(ci, ti, num_bios); + __send_changing_extent_only(ci, ti, num_bios, max_granularity); return BLK_STS_OK; } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 7975483816e4..8aa6b3ea91fa 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -358,6 +358,12 @@ struct dm_target { */ bool discards_supported:1; + /* + * Set if this target requires that discards be split on both + * 'discard_granularity' and 'max_discard_sectors' boundaries. + */ + bool max_discard_granularity:1; + /* * Set if we need to limit the number of in-flight bios when swapping. 
*/ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 7edf335778ba..1990b5700f69 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -286,9 +286,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 47 +#define DM_VERSION_MINOR 48 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2022-07-28)" +#define DM_VERSION_EXTRA "-ioctl (2023-03-01)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From e2dd8aca2d76f937f1f08d03041684f3532e9df4 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 2 Mar 2023 09:35:37 +0000 Subject: dm bio prison v1: improve concurrent IO performance Split the bio prison into multiple regions, with a separate rbtree and associated lock for each region. To get fast bio prison locking and not damage the performance of discards too much the bio-prison now stipulates that discards should not cross a BIO_PRISON_MAX_RANGE boundary. Because the range of a key (block_end - block_begin) must not exceed BIO_PRISON_MAX_RANGE: break_up_discard_bio() now ensures the data range reflected in PHYSICAL key doesn't exceed BIO_PRISON_MAX_RANGE. And splitting the thin target's discards (handled with VIRTUAL key) is achieved by updating dm-thin.c to set limits->max_discard_sectors in terms of BIO_PRISON_MAX_RANGE _and_ setting the thin and thin-pool targets' max_discard_granularity to true. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison-v1.c | 87 ++++++++++++++++++++++++++-------------- drivers/md/dm-bio-prison-v1.h | 10 +++++ drivers/md/dm-thin.c | 92 +++++++++++++++++++++++++------------------ 3 files changed, 121 insertions(+), 68 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index c4c05d5d8909..2b8af861e5f6 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -16,11 +16,17 @@ /*----------------------------------------------------------------*/ +#define NR_LOCKS 64 +#define LOCK_MASK (NR_LOCKS - 1) #define MIN_CELLS 1024 -struct dm_bio_prison { +struct prison_region { spinlock_t lock; - struct rb_root cells; + struct rb_root cell; +} ____cacheline_aligned_in_smp; + +struct dm_bio_prison { + struct prison_region regions[NR_LOCKS]; mempool_t cell_pool; }; @@ -34,13 +40,17 @@ static struct kmem_cache *_cell_cache; */ struct dm_bio_prison *dm_bio_prison_create(void) { - struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL); int ret; + unsigned i; + struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL); if (!prison) return NULL; - spin_lock_init(&prison->lock); + for (i = 0; i < NR_LOCKS; i++) { + spin_lock_init(&prison->regions[i].lock); + prison->regions[i].cell = RB_ROOT; + } ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache); if (ret) { @@ -48,8 +58,6 @@ struct dm_bio_prison *dm_bio_prison_create(void) return NULL; } - prison->cells = RB_ROOT; - return prison; } EXPORT_SYMBOL_GPL(dm_bio_prison_create); @@ -107,14 +115,26 @@ static int cmp_keys(struct dm_cell_key *lhs, return 0; } -static int __bio_detain(struct dm_bio_prison *prison, +static unsigned lock_nr(struct dm_cell_key *key) +{ + return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK; +} + +static void check_range(struct dm_cell_key *key) +{ + BUG_ON(key->block_end - key->block_begin > BIO_PRISON_MAX_RANGE); + BUG_ON((key->block_begin 
>> BIO_PRISON_MAX_RANGE_SHIFT) != + ((key->block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT)); +} + +static int __bio_detain(struct rb_root *root, struct dm_cell_key *key, struct bio *inmate, struct dm_bio_prison_cell *cell_prealloc, struct dm_bio_prison_cell **cell_result) { int r; - struct rb_node **new = &prison->cells.rb_node, *parent = NULL; + struct rb_node **new = &root->rb_node, *parent = NULL; while (*new) { struct dm_bio_prison_cell *cell = @@ -139,7 +159,7 @@ static int __bio_detain(struct dm_bio_prison *prison, *cell_result = cell_prealloc; rb_link_node(&cell_prealloc->node, parent, new); - rb_insert_color(&cell_prealloc->node, &prison->cells); + rb_insert_color(&cell_prealloc->node, root); return 0; } @@ -151,10 +171,12 @@ static int bio_detain(struct dm_bio_prison *prison, struct dm_bio_prison_cell **cell_result) { int r; + unsigned l = lock_nr(key); + check_range(key); - spin_lock_irq(&prison->lock); - r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); - spin_unlock_irq(&prison->lock); + spin_lock_irq(&prison->regions[l].lock); + r = __bio_detain(&prison->regions[l].cell, key, inmate, cell_prealloc, cell_result); + spin_unlock_irq(&prison->regions[l].lock); return r; } @@ -181,11 +203,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell); /* * @inmates must have been initialised prior to this call */ -static void __cell_release(struct dm_bio_prison *prison, +static void __cell_release(struct rb_root *root, struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - rb_erase(&cell->node, &prison->cells); + rb_erase(&cell->node, root); if (inmates) { if (cell->holder) @@ -198,20 +220,22 @@ void dm_cell_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *bios) { - spin_lock_irq(&prison->lock); - __cell_release(prison, cell, bios); - spin_unlock_irq(&prison->lock); + unsigned l = lock_nr(&cell->key); + + spin_lock_irq(&prison->regions[l].lock); + __cell_release(&prison->regions[l].cell, cell, bios); + spin_unlock_irq(&prison->regions[l].lock); } EXPORT_SYMBOL_GPL(dm_cell_release); /* * Sometimes we don't want the holder, just the additional bios. 
*/ -static void __cell_release_no_holder(struct dm_bio_prison *prison, +static void __cell_release_no_holder(struct rb_root *root, struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - rb_erase(&cell->node, &prison->cells); + rb_erase(&cell->node, root); bio_list_merge(inmates, &cell->bios); } @@ -219,11 +243,12 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *inmates) { + unsigned l = lock_nr(&cell->key); unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); - __cell_release_no_holder(prison, cell, inmates); - spin_unlock_irqrestore(&prison->lock, flags); + spin_lock_irqsave(&prison->regions[l].lock, flags); + __cell_release_no_holder(&prison->regions[l].cell, cell, inmates); + spin_unlock_irqrestore(&prison->regions[l].lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); @@ -248,18 +273,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison, void *context, struct dm_bio_prison_cell *cell) { - spin_lock_irq(&prison->lock); + unsigned l = lock_nr(&cell->key); + spin_lock_irq(&prison->regions[l].lock); visit_fn(context, cell); - rb_erase(&cell->node, &prison->cells); - spin_unlock_irq(&prison->lock); + rb_erase(&cell->node, &prison->regions[l].cell); + spin_unlock_irq(&prison->regions[l].lock); } EXPORT_SYMBOL_GPL(dm_cell_visit_release); -static int __promote_or_release(struct dm_bio_prison *prison, +static int __promote_or_release(struct rb_root *root, struct dm_bio_prison_cell *cell) { if (bio_list_empty(&cell->bios)) { - rb_erase(&cell->node, &prison->cells); + rb_erase(&cell->node, root); return 1; } @@ -271,10 +297,11 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell) { int r; + unsigned l = lock_nr(&cell->key); - spin_lock_irq(&prison->lock); - r = __promote_or_release(prison, cell); - spin_unlock_irq(&prison->lock); + spin_lock_irq(&prison->regions[l].lock); + r = __promote_or_release(&prison->regions[l].cell, cell); + spin_unlock_irq(&prison->regions[l].lock); return r; } diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h index dfbf1e94cb75..0b8acd6708fb 100644 --- a/drivers/md/dm-bio-prison-v1.h +++ b/drivers/md/dm-bio-prison-v1.h @@ -34,6 +34,16 @@ struct dm_cell_key { dm_block_t block_begin, block_end; }; +/* + * The range of a key (block_end - block_begin) must not + * exceed BIO_PRISON_MAX_RANGE. Also the range must not + * cross a similarly sized boundary. + * + * Must be a power of 2. + */ +#define BIO_PRISON_MAX_RANGE 1024 +#define BIO_PRISON_MAX_RANGE_SHIFT 10 + /* * Treat this as opaque, only in header so callers can manage allocation * themselves. diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 00323428919e..33ad5695f959 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1674,54 +1674,69 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t struct dm_cell_key data_key; struct dm_bio_prison_cell *data_cell; struct dm_thin_new_mapping *m; - dm_block_t virt_begin, virt_end, data_begin; + dm_block_t virt_begin, virt_end, data_begin, data_end; + dm_block_t len, next_boundary; while (begin != end) { - r = ensure_next_mapping(pool); - if (r) - /* we did our best */ - return; - r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end, &data_begin, &maybe_shared); - if (r) + if (r) { /* * Silently fail, letting any mappings we've * created complete. 
*/ break; - - build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key); - if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) { - /* contention, we'll give up with this range */ - begin = virt_end; - continue; } - /* - * IO may still be going to the destination block. We must - * quiesce before we can do the removal. - */ - m = get_next_mapping(pool); - m->tc = tc; - m->maybe_shared = maybe_shared; - m->virt_begin = virt_begin; - m->virt_end = virt_end; - m->data_block = data_begin; - m->cell = data_cell; - m->bio = bio; + data_end = data_begin + (virt_end - virt_begin); /* - * The parent bio must not complete before sub discard bios are - * chained to it (see end_discard's bio_chain)! - * - * This per-mapping bi_remaining increment is paired with - * the implicit decrement that occurs via bio_endio() in - * end_discard(). + * Make sure the data region obeys the bio prison restrictions. */ - bio_inc_remaining(bio); - if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) - pool->process_prepared_discard(m); + while (data_begin < data_end) { + r = ensure_next_mapping(pool); + if (r) + return; /* we did our best */ + + next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1) + << BIO_PRISON_MAX_RANGE_SHIFT; + len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin); + + build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key); + if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) { + /* contention, we'll give up with this range */ + data_begin += len; + continue; + } + + /* + * IO may still be going to the destination block. We must + * quiesce before we can do the removal. + */ + m = get_next_mapping(pool); + m->tc = tc; + m->maybe_shared = maybe_shared; + m->virt_begin = virt_begin; + m->virt_end = virt_begin + len; + m->data_block = data_begin; + m->cell = data_cell; + m->bio = bio; + + /* + * The parent bio must not complete before sub discard bios are + * chained to it (see end_discard's bio_chain)! + * + * This per-mapping bi_remaining increment is paired with + * the implicit decrement that occurs via bio_endio() in + * end_discard(). + */ + bio_inc_remaining(bio); + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); + + virt_begin += len; + data_begin += len; + } begin = virt_end; } @@ -3380,13 +3395,13 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv) */ if (pf.discard_enabled && pf.discard_passdown) { ti->num_discard_bios = 1; - /* * Setting 'discards_supported' circumvents the normal * stacking of discard limits (this keeps the pool and * thin devices' discard limits consistent). 
*/ ti->discards_supported = true; + ti->max_discard_granularity = true; } ti->private = pt; @@ -4096,7 +4111,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 22, 0}, + .version = {1, 23, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4261,6 +4276,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; ti->num_discard_bios = 1; + ti->max_discard_granularity = true; } mutex_unlock(&dm_thin_pool_table.mutex); @@ -4476,12 +4492,12 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) return; limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; - limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */ + limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE; } static struct target_type thin_target = { .name = "thin", - .version = {1, 22, 0}, + .version = {1, 23, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, -- cgit v1.2.3 From 3f8d3f5432078a558151e27230e20bcf93c23ffe Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 25 Mar 2023 17:52:01 -0400 Subject: dm bio prison v1: add dm_cell_key_has_valid_range Don't have bio_detain() BUG_ON if a dm_cell_key is beyond BIO_PRISON_MAX_RANGE or spans a boundary. Update dm-thin.c:build_key() to use dm_cell_key_has_valid_range() which will do this checking without using BUG_ON. Also update process_discard_bio() to check the discard bio that DM core passes in (having first imposed max_discard_granularity based splitting). dm_cell_key_has_valid_range() will merely WARN_ON_ONCE if it returns false because if it does: it is programmer error that should be caught with proper testing. So relax the BUG_ONs to be WARN_ON_ONCE. 
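A concrete illustration of the range rule (example numbers only, using the BIO_PRISON_MAX_RANGE of 1024 defined earlier in this series):

	/*
	 * With BIO_PRISON_MAX_RANGE == 1024:
	 *
	 *   block_begin = 0,    block_end = 1024  -> valid (stays within region 0)
	 *   block_begin = 1024, block_end = 2048  -> valid (stays within region 1)
	 *   block_begin = 0,    block_end = 2048  -> invalid: range longer than 1024
	 *   block_begin = 1000, block_end = 1100  -> invalid: only 100 blocks long,
	 *                                            but it crosses the boundary at 1024
	 *
	 * dm_cell_key_has_valid_range() WARNs once and returns false for the
	 * invalid cases; build_key() then propagates that result to its caller.
	 */
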
Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison-v1.c | 14 +++++++++----- drivers/md/dm-bio-prison-v1.h | 5 +++++ drivers/md/dm-thin.c | 21 +++++++++++++++------ 3 files changed, 29 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index 2b8af861e5f6..78bb559b521c 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -120,12 +120,17 @@ static unsigned lock_nr(struct dm_cell_key *key) return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK; } -static void check_range(struct dm_cell_key *key) +bool dm_cell_key_has_valid_range(struct dm_cell_key *key) { - BUG_ON(key->block_end - key->block_begin > BIO_PRISON_MAX_RANGE); - BUG_ON((key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) != - ((key->block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT)); + if (WARN_ON_ONCE(key->block_end - key->block_begin > BIO_PRISON_MAX_RANGE)) + return false; + if (WARN_ON_ONCE((key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) != + (key->block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT)) + return false; + + return true; } +EXPORT_SYMBOL(dm_cell_key_has_valid_range); static int __bio_detain(struct rb_root *root, struct dm_cell_key *key, @@ -172,7 +177,6 @@ static int bio_detain(struct dm_bio_prison *prison, { int r; unsigned l = lock_nr(key); - check_range(key); spin_lock_irq(&prison->regions[l].lock); r = __bio_detain(&prison->regions[l].cell, key, inmate, cell_prealloc, cell_result); diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h index 0b8acd6708fb..2a097ed0d85e 100644 --- a/drivers/md/dm-bio-prison-v1.h +++ b/drivers/md/dm-bio-prison-v1.h @@ -83,6 +83,11 @@ int dm_get_cell(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell_prealloc, struct dm_bio_prison_cell **cell_result); +/* + * Returns false if key is beyond BIO_PRISON_MAX_RANGE or spans a boundary. + */ +bool dm_cell_key_has_valid_range(struct dm_cell_key *key); + /* * An atomic op that combines retrieving or creating a cell, and adding a * bio to it. 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 33ad5695f959..2b13c949bd72 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -118,25 +118,27 @@ enum lock_space { PHYSICAL }; -static void build_key(struct dm_thin_device *td, enum lock_space ls, +static bool build_key(struct dm_thin_device *td, enum lock_space ls, dm_block_t b, dm_block_t e, struct dm_cell_key *key) { key->virtual = (ls == VIRTUAL); key->dev = dm_thin_dev_id(td); key->block_begin = b; key->block_end = e; + + return dm_cell_key_has_valid_range(key); } static void build_data_key(struct dm_thin_device *td, dm_block_t b, struct dm_cell_key *key) { - build_key(td, PHYSICAL, b, b + 1llu, key); + (void) build_key(td, PHYSICAL, b, b + 1llu, key); } static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, struct dm_cell_key *key) { - build_key(td, VIRTUAL, b, b + 1llu, key); + (void) build_key(td, VIRTUAL, b, b + 1llu, key); } /*----------------------------------------------------------------*/ @@ -1702,7 +1704,8 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t << BIO_PRISON_MAX_RANGE_SHIFT; len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin); - build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key); + /* This key is certainly within range given the above splitting */ + (void) build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key); if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) { /* contention, we'll give up with this range */ data_begin += len; @@ -1778,8 +1781,13 @@ static void process_discard_bio(struct thin_c *tc, struct bio *bio) return; } - build_key(tc->td, VIRTUAL, begin, end, &virt_key); - if (bio_detain(tc->pool, &virt_key, bio, &virt_cell)) + if (unlikely(!build_key(tc->td, VIRTUAL, begin, end, &virt_key))) { + DMERR_LIMIT("Discard doesn't respect bio prison limits"); + bio_endio(bio); + return; + } + + if (bio_detain(tc->pool, &virt_key, bio, &virt_cell)) { /* * Potential starvation issue: We're relying on the * fs/application being well behaved, and not trying to @@ -1788,6 +1796,7 @@ static void process_discard_bio(struct thin_c *tc, struct bio *bio) * cell will never be granted. */ return; + } tc->pool->process_discard_cell(tc, virt_cell); } -- cgit v1.2.3 From 0bac3f2f28b87b520e50a196c42409485acfe5cf Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Mar 2023 13:59:25 -0400 Subject: dm: add dm_num_hash_locks() Simple helper to use when DM core code needs to appropriately size, based on num_online_cpus(), its data structures that split locks. dm_num_hash_locks() rounds up num_online_cpus() to next power of 2 but caps return at DM_HASH_LOCKS_MAX (64). This heuristic may evolve as warranted, but as-is it will serve as a more informed basis for sizing the sharded lock structs in dm-bufio's dm_buffer_cache (buffer_trees) and dm-bio-prison-v1's dm_bio_prison (prison_regions). 
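For example (illustrative CPU counts; the helper itself is in the diff below):

	/*
	 *   num_online_cpus() ==  4  -> roundup_pow_of_two(4)  =   4 ->  4 locks
	 *   num_online_cpus() ==  6  -> roundup_pow_of_two(6)  =   8 ->  8 locks
	 *   num_online_cpus() == 64  -> roundup_pow_of_two(64) =  64 -> 64 locks
	 *   num_online_cpus() == 96  -> roundup_pow_of_two(96) = 128 -> capped at
	 *                               DM_HASH_LOCKS_MAX (64)
	 */
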
Signed-off-by: Mike Snitzer --- drivers/md/dm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers') diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 22eaed188907..a1a5defddb07 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "dm-stats.h" @@ -228,4 +229,13 @@ void dm_free_md_mempools(struct dm_md_mempools *pools); */ unsigned int dm_get_reserved_bio_based_ios(void); +#define DM_HASH_LOCKS_MAX 64 + +static inline unsigned int dm_num_hash_locks(void) +{ + unsigned int num_locks = roundup_pow_of_two(num_online_cpus()); + + return min_t(unsigned int, num_locks, DM_HASH_LOCKS_MAX); +} + #endif -- cgit v1.2.3 From 36c18b86390824f93ed1580500df9698978e2006 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Mar 2023 14:09:45 -0400 Subject: dm bufio: prepare to intelligently size dm_buffer_cache's buffer_trees Add num_locks member to dm_buffer_cache struct and use it rather than the NR_LOCKS magic value (64). Next commit will size the dm_buffer_cache's buffer_trees according to dm_num_hash_locks(). Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ae552644a0b4..2250799a70e4 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -380,7 +380,6 @@ struct dm_buffer { */ #define NR_LOCKS 64 -#define LOCKS_MASK (NR_LOCKS - 1) struct buffer_tree { struct rw_semaphore lock; @@ -388,37 +387,38 @@ struct buffer_tree { } ____cacheline_aligned_in_smp; struct dm_buffer_cache { + struct lru lru[LIST_SIZE]; /* * We spread entries across multiple trees to reduce contention * on the locks. 
*/ + unsigned int num_locks; struct buffer_tree trees[NR_LOCKS]; - struct lru lru[LIST_SIZE]; }; -static inline unsigned int cache_index(sector_t block) +static inline unsigned int cache_index(sector_t block, unsigned int num_locks) { - return block & LOCKS_MASK; + return block & (num_locks - 1); } static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block) { - down_read(&bc->trees[cache_index(block)].lock); + down_read(&bc->trees[cache_index(block, bc->num_locks)].lock); } static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block) { - up_read(&bc->trees[cache_index(block)].lock); + up_read(&bc->trees[cache_index(block, bc->num_locks)].lock); } static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block) { - down_write(&bc->trees[cache_index(block)].lock); + down_write(&bc->trees[cache_index(block, bc->num_locks)].lock); } static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block) { - up_write(&bc->trees[cache_index(block)].lock); + up_write(&bc->trees[cache_index(block, bc->num_locks)].lock); } /* @@ -429,13 +429,15 @@ struct lock_history { struct dm_buffer_cache *cache; bool write; unsigned int previous; + unsigned int no_previous; }; static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write) { lh->cache = cache; lh->write = write; - lh->previous = NR_LOCKS; /* indicates no previous */ + lh->no_previous = cache->num_locks; + lh->previous = lh->no_previous; } static void __lh_lock(struct lock_history *lh, unsigned int index) @@ -459,9 +461,9 @@ static void __lh_unlock(struct lock_history *lh, unsigned int index) */ static void lh_exit(struct lock_history *lh) { - if (lh->previous != NR_LOCKS) { + if (lh->previous != lh->no_previous) { __lh_unlock(lh, lh->previous); - lh->previous = NR_LOCKS; + lh->previous = lh->no_previous; } } @@ -471,9 +473,9 @@ static void lh_exit(struct lock_history *lh) */ static void lh_next(struct lock_history *lh, sector_t b) { - unsigned int index = cache_index(b); + unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */ - if (lh->previous != NR_LOCKS) { + if (lh->previous != lh->no_previous) { if (lh->previous != index) { __lh_unlock(lh, lh->previous); __lh_lock(lh, index); @@ -500,11 +502,13 @@ static struct dm_buffer *list_to_buffer(struct list_head *l) return le_to_buffer(le); } -static void cache_init(struct dm_buffer_cache *bc) +static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks) { unsigned int i; - for (i = 0; i < NR_LOCKS; i++) { + bc->num_locks = num_locks; + + for (i = 0; i < bc->num_locks; i++) { init_rwsem(&bc->trees[i].lock); bc->trees[i].root = RB_ROOT; } @@ -517,7 +521,7 @@ static void cache_destroy(struct dm_buffer_cache *bc) { unsigned int i; - for (i = 0; i < NR_LOCKS; i++) + for (i = 0; i < bc->num_locks; i++) WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root)); lru_destroy(&bc->lru[LIST_CLEAN]); @@ -576,7 +580,7 @@ static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block) struct dm_buffer *b; cache_read_lock(bc, block); - b = __cache_get(&bc->trees[cache_index(block)].root, block); + b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block); if (b) { lru_reference(&b->lru); __cache_inc_buffer(b); @@ -650,7 +654,7 @@ static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode b = le_to_buffer(le); /* __evict_pred will have locked the appropriate tree. 
*/ - rb_erase(&b->node, &bc->trees[cache_index(b->block)].root); + rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root); return b; } @@ -816,7 +820,7 @@ static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b) cache_write_lock(bc, b->block); BUG_ON(atomic_read(&b->hold_count) != 1); - r = __cache_insert(&bc->trees[cache_index(b->block)].root, b); + r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b); if (r) lru_insert(&bc->lru[b->list_mode], &b->lru); cache_write_unlock(bc, b->block); @@ -842,7 +846,7 @@ static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b) r = false; } else { r = true; - rb_erase(&b->node, &bc->trees[cache_index(b->block)].root); + rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root); lru_remove(&bc->lru[b->list_mode], &b->lru); } @@ -911,7 +915,7 @@ static void cache_remove_range(struct dm_buffer_cache *bc, { unsigned int i; - for (i = 0; i < NR_LOCKS; i++) { + for (i = 0; i < bc->num_locks; i++) { down_write(&bc->trees[i].lock); __remove_range(bc, &bc->trees[i].root, begin, end, pred, release); up_write(&bc->trees[i].lock); @@ -2432,7 +2436,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign r = -ENOMEM; goto bad_client; } - cache_init(&c->cache); + cache_init(&c->cache, NR_LOCKS); c->bdev = bdev; c->block_size = block_size; -- cgit v1.2.3 From 1e84c4b7322d44a5b7d7f83eced1f60cc0ecf1a4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Mar 2023 14:30:46 -0400 Subject: dm bufio: intelligently size dm_buffer_cache's buffer_trees Size the dm_buffer_cache's number of buffer_tree structs using dm_num_hash_locks(). Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2250799a70e4..c1126ad45bdb 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -21,6 +21,8 @@ #include #include +#include "dm.h" + #define DM_MSG_PREFIX "bufio" /* @@ -379,8 +381,6 @@ struct dm_buffer { * only enough to ensure get/put are threadsafe. */ -#define NR_LOCKS 64 - struct buffer_tree { struct rw_semaphore lock; struct rb_root root; @@ -393,7 +393,7 @@ struct dm_buffer_cache { * on the locks. 
*/ unsigned int num_locks; - struct buffer_tree trees[NR_LOCKS]; + struct buffer_tree trees[]; }; static inline unsigned int cache_index(sector_t block, unsigned int num_locks) @@ -976,7 +976,7 @@ struct dm_bufio_client { */ unsigned long oldest_buffer; - struct dm_buffer_cache cache; + struct dm_buffer_cache cache; /* must be last member */ }; static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); @@ -2422,6 +2422,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign unsigned int flags) { int r; + unsigned int num_locks; struct dm_bufio_client *c; char slab_name[27]; @@ -2431,12 +2432,13 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign goto bad_client; } - c = kzalloc(sizeof(*c), GFP_KERNEL); + num_locks = dm_num_hash_locks(); + c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL); if (!c) { r = -ENOMEM; goto bad_client; } - cache_init(&c->cache, NR_LOCKS); + cache_init(&c->cache, num_locks); c->bdev = bdev; c->block_size = block_size; -- cgit v1.2.3 From c6273411d1803ed03d4f0628bc629a8c45774351 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Mar 2023 14:09:45 -0400 Subject: dm bio prison v1: prepare to intelligently size dm_bio_prison's prison_regions Add num_locks member to dm_bio_prison struct and use it rather than the NR_LOCKS magic value (64). Next commit will size the dm_bio_prison's prison_regions according to dm_num_hash_locks(). Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison-v1.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index 78bb559b521c..a7930ad1878b 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -17,7 +17,6 @@ /*----------------------------------------------------------------*/ #define NR_LOCKS 64 -#define LOCK_MASK (NR_LOCKS - 1) #define MIN_CELLS 1024 struct prison_region { @@ -26,8 +25,9 @@ struct prison_region { } ____cacheline_aligned_in_smp; struct dm_bio_prison { - struct prison_region regions[NR_LOCKS]; mempool_t cell_pool; + unsigned int num_locks; + struct prison_region regions[NR_LOCKS]; }; static struct kmem_cache *_cell_cache; @@ -46,8 +46,9 @@ struct dm_bio_prison *dm_bio_prison_create(void) if (!prison) return NULL; + prison->num_locks = NR_LOCKS; - for (i = 0; i < NR_LOCKS; i++) { + for (i = 0; i < prison->num_locks; i++) { spin_lock_init(&prison->regions[i].lock); prison->regions[i].cell = RB_ROOT; } @@ -115,9 +116,9 @@ static int cmp_keys(struct dm_cell_key *lhs, return 0; } -static unsigned lock_nr(struct dm_cell_key *key) +static unsigned lock_nr(struct dm_cell_key *key, unsigned int num_locks) { - return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK; + return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & (num_locks - 1); } bool dm_cell_key_has_valid_range(struct dm_cell_key *key) @@ -176,7 +177,7 @@ static int bio_detain(struct dm_bio_prison *prison, struct dm_bio_prison_cell **cell_result) { int r; - unsigned l = lock_nr(key); + unsigned l = lock_nr(key, prison->num_locks); spin_lock_irq(&prison->regions[l].lock); r = __bio_detain(&prison->regions[l].cell, key, inmate, cell_prealloc, cell_result); @@ -224,7 +225,7 @@ void dm_cell_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *bios) { - unsigned l = lock_nr(&cell->key); + unsigned l = lock_nr(&cell->key, prison->num_locks); spin_lock_irq(&prison->regions[l].lock); 
__cell_release(&prison->regions[l].cell, cell, bios); @@ -247,7 +248,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - unsigned l = lock_nr(&cell->key); + unsigned l = lock_nr(&cell->key, prison->num_locks); unsigned long flags; spin_lock_irqsave(&prison->regions[l].lock, flags); @@ -277,7 +278,7 @@ void dm_cell_visit_release(struct dm_bio_prison *prison, void *context, struct dm_bio_prison_cell *cell) { - unsigned l = lock_nr(&cell->key); + unsigned l = lock_nr(&cell->key, prison->num_locks); spin_lock_irq(&prison->regions[l].lock); visit_fn(context, cell); rb_erase(&cell->node, &prison->regions[l].cell); @@ -301,7 +302,7 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell) { int r; - unsigned l = lock_nr(&cell->key); + unsigned l = lock_nr(&cell->key, prison->num_locks); spin_lock_irq(&prison->regions[l].lock); r = __promote_or_release(&prison->regions[l].cell, cell); -- cgit v1.2.3 From b6279f82eb11a1f380af3a26acf921c37505fc86 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Mar 2023 14:30:46 -0400 Subject: dm bio prison v1: intelligently size dm_bio_prison's prison_regions Size the dm_bio_prison's number of prison_region structs using dm_num_hash_locks(). Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison-v1.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index a7930ad1878b..6b726b8b300d 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -16,7 +16,6 @@ /*----------------------------------------------------------------*/ -#define NR_LOCKS 64 #define MIN_CELLS 1024 struct prison_region { @@ -27,7 +26,7 @@ struct prison_region { struct dm_bio_prison { mempool_t cell_pool; unsigned int num_locks; - struct prison_region regions[NR_LOCKS]; + struct prison_region regions[]; }; static struct kmem_cache *_cell_cache; @@ -41,12 +40,14 @@ static struct kmem_cache *_cell_cache; struct dm_bio_prison *dm_bio_prison_create(void) { int ret; - unsigned i; - struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL); + unsigned int i, num_locks; + struct dm_bio_prison *prison; + num_locks = dm_num_hash_locks(); + prison = kzalloc(struct_size(prison, regions, num_locks), GFP_KERNEL); if (!prison) return NULL; - prison->num_locks = NR_LOCKS; + prison->num_locks = num_locks; for (i = 0; i < prison->num_locks; i++) { spin_lock_init(&prison->regions[i].lock); -- cgit v1.2.3 From 363b7fd76c91dc611a56d992e9550bb1ba070e1a Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 29 Mar 2023 10:29:34 -0400 Subject: dm: improve hash_locks sizing and hash function Both bufio and bio-prison-v1 use the identical model for splitting their respective locks and rbtrees. Improve dm_num_hash_locks() to distribute across more rbtrees to improve overall performance -- but the maximum number of locks/rbtrees is still 64. 
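[Editorial illustration, not part of the patch: the shared striping model described above can be sketched roughly as below. Names are made up; it assumes <linux/types.h>, <linux/rbtree.h> and <linux/rwsem.h>.]

/*
 * Minimal sketch of the lock-striping scheme shared by bufio and
 * bio-prison-v1: one lock/rbtree pair per stripe, selected from the
 * block number. Illustrative only, not the kernel's code.
 */
struct stripe {
	struct rw_semaphore lock;
	struct rb_root root;
};

struct striped_cache {
	unsigned int num_locks;		/* power of two, capped at DM_HASH_LOCKS_MAX (64) */
	struct stripe stripes[];	/* flexible array sized at allocation time */
};

static inline unsigned int stripe_index(sector_t block, unsigned int num_locks)
{
	/* simple mask; this commit replaces it with dm_hash_locks_index() */
	return block & (num_locks - 1);
}
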
Also factor out a common hash function named dm_hash_locks_index(); the magic numbers used were determined to be best using this program: https://gist.github.com/jthornber/e05c47daa7b500c56dc339269c5467fc Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison-v1.c | 5 +++-- drivers/md/dm-bufio.c | 2 +- drivers/md/dm.h | 14 +++++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index 6b726b8b300d..92afdca760ae 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -117,9 +117,10 @@ static int cmp_keys(struct dm_cell_key *lhs, return 0; } -static unsigned lock_nr(struct dm_cell_key *key, unsigned int num_locks) +static inline unsigned int lock_nr(struct dm_cell_key *key, unsigned int num_locks) { - return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & (num_locks - 1); + return dm_hash_locks_index((key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT), + num_locks); } bool dm_cell_key_has_valid_range(struct dm_cell_key *key) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index c1126ad45bdb..8a448185662c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -398,7 +398,7 @@ struct dm_buffer_cache { static inline unsigned int cache_index(sector_t block, unsigned int num_locks) { - return block & (num_locks - 1); + return dm_hash_locks_index(block, num_locks); } static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a1a5defddb07..a856e0aee73b 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -233,9 +233,21 @@ unsigned int dm_get_reserved_bio_based_ios(void); static inline unsigned int dm_num_hash_locks(void) { - unsigned int num_locks = roundup_pow_of_two(num_online_cpus()); + unsigned int num_locks = roundup_pow_of_two(num_online_cpus()) << 1; return min_t(unsigned int, num_locks, DM_HASH_LOCKS_MAX); } +#define DM_HASH_LOCKS_MULT 4294967291ULL +#define DM_HASH_LOCKS_SHIFT 6 + +static inline unsigned int dm_hash_locks_index(sector_t block, + unsigned int num_locks) +{ + sector_t h1 = (block * DM_HASH_LOCKS_MULT) >> DM_HASH_LOCKS_SHIFT; + sector_t h2 = h1 >> DM_HASH_LOCKS_SHIFT; + + return (h1 ^ h2) & (num_locks - 1); +} + #endif -- cgit v1.2.3
From e8c5d45f82ce0c238a4817739892fe8897a3dcc3 Mon Sep 17 00:00:00 2001 From: Yeongjin Gil Date: Mon, 20 Mar 2023 15:59:32 +0900 Subject: dm verity: fix error handling for check_at_most_once on FEC In verity_end_io(), if bi_status is not BLK_STS_OK, it can be returned directly. But if FEC is configured, it is desirable to correct the data page through verity_verify_io(). The return value will then be converted to a blk_status and passed to verity_finish_io(). However, when a bit is set in v->validated_blocks, verity_verify_io() skips verification for the corresponding bio regardless of any I/O error. In this case, the I/O error cannot be returned properly, and as a result, corrupt data could be read for the corresponding block. To fix this problem, when an I/O error occurs, do not skip verification even if the related bit is set in v->validated_blocks.
Fixes: 843f38d382b1 ("dm verity: add 'check_at_most_once' option to only validate hashes once") Cc: stable@vger.kernel.org Reviewed-by: Sungjong Seo Signed-off-by: Yeongjin Gil Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index ade83ef3b439..9316399b920e 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -523,7 +523,7 @@ static int verity_verify_io(struct dm_verity_io *io) sector_t cur_block = io->block + b; struct ahash_request *req = verity_io_hash_req(v, io); - if (v->validated_blocks && + if (v->validated_blocks && bio->bi_status == BLK_STS_OK && likely(test_bit(cur_block, v->validated_blocks))) { verity_bv_skip_block(v, io, iter); continue; -- cgit v1.2.3
From 074c44664f60f62ef932eae9ee89d663563cf307 Mon Sep 17 00:00:00 2001 From: Michael Weiß Date: Wed, 1 Mar 2023 12:34:15 +0100 Subject: dm verity: emit audit events on verification failure and more MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dm-verity signals integrity violations by returning I/O errors to user space. To identify integrity violations by a controlling instance, the kernel audit subsystem can be used to emit audit events to user space. Analogous to dm-integrity, we also use the dm-audit submodule, which allows emitting audit events on verification failures of metadata and data blocks, as well as when the maximum number of corrupted errors is reached. The construction and destruction of verity device mappings are also relevant for auditing a system. Thus, those events are also logged as audit events. Tested by starting a container with the container manager (cmld) of GyroidOS, which uses a dm-verity protected rootfs image root.img mapped to /dev/mapper/-root. One block was manipulated in the underlying image file, and repeated reads of the verity device were performed until the maximum number of corrupted errors was reached, e.g.: dd if=/dev/urandom of=root.img bs=512 count=1 seek=1000 for i in {1..101}; do \ dd if=/dev/mapper/-root of=/dev/null bs=4096 \ count=1 skip=1000 \ done The resulting audit log looks as follows: type=DM_CTRL msg=audit(1677618791.876:962): module=verity op=ctr ppid=4876 pid=29102 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=(none) ses=44 comm="cmld" exe="/usr/sbin/cml/cmld" subj=unconfined dev=254:3 error_msg='success' res=1 type=DM_EVENT msg=audit(1677619463.786:1074): module=verity op=verify-data dev=7:0 sector=1000 res=0 ... type=DM_EVENT msg=audit(1677619596.727:1162): module=verity op=verify-data dev=7:0 sector=1000 res=0 type=DM_EVENT msg=audit(1677619596.731:1163): module=verity op=max-corrupted-errors dev=254:3 sector=?
res=0 Signed-off-by: Michael WeiĂź Acked-by: Paul Moore Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-target.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 9316399b920e..b2f0c3c0abdb 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -16,6 +16,7 @@ #include "dm-verity.h" #include "dm-verity-fec.h" #include "dm-verity-verify-sig.h" +#include "dm-audit.h" #include #include #include @@ -248,8 +249,10 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type, DMERR_LIMIT("%s: %s block %llu is corrupted", v->data_dev->name, type_str, block); - if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS) + if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS) { DMERR("%s: reached maximum errors", v->data_dev->name); + dm_audit_log_target(DM_MSG_PREFIX, "max-corrupted-errors", v->ti, 0); + } snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu", DM_VERITY_ENV_VAR_NAME, type, block); @@ -340,6 +343,11 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, hash_block)) { + struct bio *bio = + dm_bio_from_per_bio_data(io, + v->ti->per_io_data_size); + dm_audit_log_bio(DM_MSG_PREFIX, "verify-metadata", bio, + block, 0); r = -EIO; goto release_ret_r; } @@ -590,8 +598,11 @@ static int verity_verify_io(struct dm_verity_io *io) return -EIO; } if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - cur_block)) + cur_block)) { + dm_audit_log_bio(DM_MSG_PREFIX, "verify-data", + bio, cur_block, 0); return -EIO; + } } } @@ -975,6 +986,8 @@ static void verity_dtr(struct dm_target *ti) static_branch_dec(&use_tasklet_enabled); kfree(v); + + dm_audit_log_dtr(DM_MSG_PREFIX, ti, 1); } static int verity_alloc_most_once(struct dm_verity *v) @@ -1429,11 +1442,14 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) verity_verify_sig_opts_cleanup(&verify_args); + dm_audit_log_ctr(DM_MSG_PREFIX, ti, 1); + return 0; bad: verity_verify_sig_opts_cleanup(&verify_args); + dm_audit_log_ctr(DM_MSG_PREFIX, ti, 0); verity_dtr(ti); return r; -- cgit v1.2.3 From 85c938e8914f8f0e276b8e82d4f2e4bcab044e96 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 4 Apr 2023 11:14:26 -0400 Subject: dm table: allow targets without devices to set ->io_hints In dm_calculate_queue_limits, add call to ->io_hints hook if the target doesn't provide ->iterate_devices. This is needed so the "error" and "zero" targets may support discards. The 2 following commits will add their respective discard support. Signed-off-by: Mikulas Patocka Tested-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2055a758541d..119db5e01080 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1670,8 +1670,12 @@ int dm_calculate_queue_limits(struct dm_table *t, blk_set_stacking_limits(&ti_limits); - if (!ti->type->iterate_devices) + if (!ti->type->iterate_devices) { + /* Set I/O hints portion of queue limits */ + if (ti->type->io_hints) + ti->type->io_hints(ti, &ti_limits); goto combine_limits; + } /* * Combine queue limits of all the devices this target uses. 
-- cgit v1.2.3
From 00065f925efb077ade3e7fea49150d798cf87d05 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 4 Apr 2023 11:19:33 -0400 Subject: dm zero: add discard support Add zero_io_hints() and set discard limits so that the zero target advertises support for discards. The zero target will ignore discards. This is useful when the user combines dm-zero with other discard-supporting targets in the same table; without dm-zero support, discards would be disabled for the whole combined device. Signed-off-by: Mikulas Patocka Tested-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-zero.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 2601cd520384..6215d8eb7e41 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -27,6 +27,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) * Silently drop discards, avoiding -EOPNOTSUPP. */ ti->num_discard_bios = 1; + ti->discards_supported = true; return 0; } @@ -45,6 +46,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio) zero_fill_bio(bio); break; case REQ_OP_WRITE: + case REQ_OP_DISCARD: /* writes get silently dropped */ break; default: @@ -57,13 +59,21 @@ static int zero_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static void zero_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + limits->max_discard_sectors = UINT_MAX; + limits->max_hw_discard_sectors = UINT_MAX; + limits->discard_granularity = 512; +} + static struct target_type zero_target = { .name = "zero", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .features = DM_TARGET_NOWAIT, .module = THIS_MODULE, .ctr = zero_ctr, .map = zero_map, + .io_hints = zero_io_hints, }; static int __init dm_zero_init(void) -- cgit v1.2.3
From b6bcb84446810df0c9364ee6e23e07866316beaf Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 4 Apr 2023 11:22:10 -0400 Subject: dm error: add discard support Add io_err_io_hints() and set discard limits so that the error target advertises support for discards. The error target will return -EIO for discards. This is useful when the user combines dm-error with other discard-supporting targets in the same table; without dm-error support, discards would be disabled for the whole combined device.
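[Editorial usage illustration, not part of either patch; the device name, sizes and table layout are made up. With the dm-zero and dm-error discard patches, a table that mixes these targets with real devices keeps discard support enabled for the combined device:]

# combined.table: 200MiB linear, 200MiB zero, 200MiB error (units are 512-byte sectors)
0      409600 linear /dev/sdb 0
409600 409600 zero
819200 409600 error

dmsetup create combined combined.table
blkdiscard /dev/mapper/combined
# discards are no longer disabled table-wide: the zero segment ignores them,
# the error segment returns -EIO for its portion
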
Signed-off-by: Mikulas Patocka Tested-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-target.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 26ea22b1a0d7..97a75f3eed93 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -119,6 +119,7 @@ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) * Return error for discards instead of -EOPNOTSUPP */ tt->num_discard_bios = 1; + tt->discards_supported = true; return 0; } @@ -145,6 +146,13 @@ static void io_err_release_clone_rq(struct request *clone, { } +static void io_err_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + limits->max_discard_sectors = UINT_MAX; + limits->max_hw_discard_sectors = UINT_MAX; + limits->discard_granularity = 512; +} + static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, pfn_t *pfn) @@ -154,13 +162,14 @@ static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, static struct target_type error_target = { .name = "error", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .features = DM_TARGET_WILDCARD, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, .clone_and_map_rq = io_err_clone_and_map_rq, .release_clone_rq = io_err_release_clone_rq, + .io_hints = io_err_io_hints, .direct_access = io_err_dax_direct_access, }; -- cgit v1.2.3 From 6827af4a9a9f5bb664c42abf7c11af4978d72201 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 4 Apr 2023 11:59:00 -0400 Subject: dm clone: call kmem_cache_destroy() in dm_clone_init() error path Otherwise the _hydration_cache will leak if dm_register_target() fails. Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-clone-target.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index f38a27604c7a..fc30ebd67622 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2205,6 +2205,7 @@ static int __init dm_clone_init(void) r = dm_register_target(&clone_target); if (r < 0) { DMERR("Failed to register clone target"); + kmem_cache_destroy(_hydration_cache); return r; } -- cgit v1.2.3 From 6b79a428c02769f2a11f8ae76bf866226d134887 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 4 Apr 2023 13:34:28 -0400 Subject: dm integrity: call kmem_cache_destroy() in dm_integrity_init() error path Otherwise the journal_io_cache will leak if dm_register_target() fails. 
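[Editorial sketch with a hypothetical "example" target, not code from either fix above; it only illustrates the init error-path pattern both commits restore: undo earlier allocations when dm_register_target() fails.]

static struct kmem_cache *example_cache;
static struct target_type example_target;	/* assumed to be defined elsewhere */

static int __init dm_example_init(void)
{
	int r;

	example_cache = kmem_cache_create("example_cache", 64, 0, 0, NULL);
	if (!example_cache)
		return -ENOMEM;

	r = dm_register_target(&example_target);
	if (r < 0) {
		/* without this the cache leaks, as in the bugs fixed above */
		kmem_cache_destroy(example_cache);
		return r;
	}

	return 0;
}
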
Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index b0d5057fbdd9..54830b07b829 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4703,11 +4703,13 @@ static int __init dm_integrity_init(void) } r = dm_register_target(&integrity_target); - - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + kmem_cache_destroy(journal_io_cache); + return r; + } - return r; + return 0; } static void __exit dm_integrity_exit(void) -- cgit v1.2.3 From b362c733ed7bf312ed729847bc26ba89febc556e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 18 Mar 2023 21:16:33 +0800 Subject: dm: push error reporting down to dm_register_target() Simplifies each DM target's init method by making dm_register_target() responsible for its error reporting (on behalf of targets). Signed-off-by: Yangtao Li Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 1 - drivers/md/dm-clone-target.c | 1 - drivers/md/dm-crypt.c | 8 +------- drivers/md/dm-delay.c | 13 +------------ drivers/md/dm-dust.c | 7 +------ drivers/md/dm-ebs-target.c | 7 +------ drivers/md/dm-era-target.c | 10 +--------- drivers/md/dm-flakey.c | 7 +------ drivers/md/dm-integrity.c | 1 - drivers/md/dm-log-writes.c | 7 +------ drivers/md/dm-mpath.c | 5 +---- drivers/md/dm-raid1.c | 10 +++------- drivers/md/dm-snap.c | 12 +++--------- drivers/md/dm-switch.c | 8 +------- drivers/md/dm-target.c | 9 ++++++--- drivers/md/dm-verity-target.c | 8 +------- drivers/md/dm-writecache.c | 10 +--------- drivers/md/dm-zero.c | 7 +------ 18 files changed, 24 insertions(+), 107 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index dbbcfa580078..872896218550 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3459,7 +3459,6 @@ static int __init dm_cache_init(void) r = dm_register_target(&cache_target); if (r) { - DMERR("cache target registration failed: %d", r); kmem_cache_destroy(migration_cache); return r; } diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index fc30ebd67622..f467cdb5a022 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2204,7 +2204,6 @@ static int __init dm_clone_init(void) r = dm_register_target(&clone_target); if (r < 0) { - DMERR("Failed to register clone target"); kmem_cache_destroy(_hydration_cache); return r; } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3ba53dc3cc3f..52615a258e13 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3662,13 +3662,7 @@ static struct target_type crypt_target = { static int __init dm_crypt_init(void) { - int r; - - r = dm_register_target(&crypt_target); - if (r < 0) - DMERR("register failed %d", r); - - return r; + return dm_register_target(&crypt_target); } static void __exit dm_crypt_exit(void) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index a425046f88c7..00d18b2070a5 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -370,18 +370,7 @@ static struct target_type delay_target = { static int __init dm_delay_init(void) { - int r; - - r = dm_register_target(&delay_target); - if (r < 0) { - DMERR("register failed %d", r); - goto bad_register; - } - - return 0; - -bad_register: - return r; + return dm_register_target(&delay_target); } static void __exit dm_delay_exit(void) diff --git 
a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 7ae9936752de..9bf3cdf548de 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -573,12 +573,7 @@ static struct target_type dust_target = { static int __init dm_dust_init(void) { - int r = dm_register_target(&dust_target); - - if (r < 0) - DMERR("dm_register_target failed %d", r); - - return r; + return dm_register_target(&dust_target); } static void __exit dm_dust_exit(void) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index b1068a68bc46..38da4de3ecbf 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -455,12 +455,7 @@ static struct target_type ebs_target = { static int __init dm_ebs_init(void) { - int r = dm_register_target(&ebs_target); - - if (r < 0) - DMERR("register failed %d", r); - - return r; + return dm_register_target(&ebs_target); } static void dm_ebs_exit(void) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index c2e7780cdd2d..554d234fca18 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1756,15 +1756,7 @@ static struct target_type era_target = { static int __init dm_era_init(void) { - int r; - - r = dm_register_target(&era_target); - if (r) { - DMERR("era target registration failed: %d", r); - return r; - } - - return 0; + return dm_register_target(&era_target); } static void __exit dm_era_exit(void) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 5b7556d2a9d9..14179355e6a1 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -509,12 +509,7 @@ static struct target_type flakey_target = { static int __init dm_flakey_init(void) { - int r = dm_register_target(&flakey_target); - - if (r < 0) - DMERR("register failed %d", r); - - return r; + return dm_register_target(&flakey_target); } static void __exit dm_flakey_exit(void) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 54830b07b829..ed612bf38b83 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4704,7 +4704,6 @@ static int __init dm_integrity_init(void) r = dm_register_target(&integrity_target); if (r < 0) { - DMERR("register failed %d", r); kmem_cache_destroy(journal_io_cache); return r; } diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index cbd0f81f4a35..0ce9b01d1393 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -940,12 +940,7 @@ static struct target_type log_writes_target = { static int __init dm_log_writes_init(void) { - int r = dm_register_target(&log_writes_target); - - if (r < 0) - DMERR("register failed %d", r); - - return r; + return dm_register_target(&log_writes_target); } static void __exit dm_log_writes_exit(void) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 61ab1a8d2c9c..bea3cda9938e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -2235,11 +2235,8 @@ static int __init dm_multipath_init(void) } r = dm_register_target(&multipath_target); - if (r < 0) { - DMERR("request-based register failed %d", r); - r = -EINVAL; + if (r < 0) goto bad_register_target; - } return 0; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index bc417a5e5b89..3e947655746c 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1498,23 +1498,19 @@ static struct target_type mirror_target = { static int __init dm_mirror_init(void) { - int r = -ENOMEM; + int r; dm_raid1_wq = alloc_workqueue("dm_raid1_wq", 0, 0); if (!dm_raid1_wq) - goto bad_target; + return -ENOMEM; r = 
dm_register_target(&mirror_target); if (r < 0) { destroy_workqueue(dm_raid1_wq); - goto bad_target; + return r; } return 0; - -bad_target: - DMERR("Failed to register mirror target"); - return r; } static void __exit dm_mirror_exit(void) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f766c21408f1..9c49f53760d0 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2815,22 +2815,16 @@ static int __init dm_snapshot_init(void) } r = dm_register_target(&snapshot_target); - if (r < 0) { - DMERR("snapshot target register failed %d", r); + if (r < 0) goto bad_register_snapshot_target; - } r = dm_register_target(&origin_target); - if (r < 0) { - DMERR("Origin target register failed %d", r); + if (r < 0) goto bad_register_origin_target; - } r = dm_register_target(&merge_target); - if (r < 0) { - DMERR("Merge target register failed %d", r); + if (r < 0) goto bad_register_merge_target; - } return 0; diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index ee2432927e90..5a5976b0cfb8 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -568,13 +568,7 @@ static struct target_type switch_target = { static int __init dm_switch_init(void) { - int r; - - r = dm_register_target(&switch_target); - if (r < 0) - DMERR("dm_register_target() failed %d", r); - - return r; + return dm_register_target(&switch_target); } static void __exit dm_switch_exit(void) diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 97a75f3eed93..27e2992ff249 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -85,12 +85,15 @@ int dm_register_target(struct target_type *tt) int rv = 0; down_write(&_lock); - if (__find_target_type(tt->name)) + if (__find_target_type(tt->name)) { + DMERR("%s: '%s' target already registered", + __func__, tt->name); rv = -EEXIST; - else + } else { list_add(&tt->list, &_targets); - + } up_write(&_lock); + return rv; } EXPORT_SYMBOL(dm_register_target); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index b2f0c3c0abdb..260f04d12982 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1517,13 +1517,7 @@ static struct target_type verity_target = { static int __init dm_verity_init(void) { - int r; - - r = dm_register_target(&verity_target); - if (r < 0) - DMERR("register failed %d", r); - - return r; + return dm_register_target(&verity_target); } static void __exit dm_verity_exit(void) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 3aa5874f0aef..81b60b75a9fa 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -2776,15 +2776,7 @@ static struct target_type writecache_target = { static int __init dm_writecache_init(void) { - int r; - - r = dm_register_target(&writecache_target); - if (r < 0) { - DMERR("register failed %d", r); - return r; - } - - return 0; + return dm_register_target(&writecache_target); } static void __exit dm_writecache_exit(void) diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 6215d8eb7e41..884ac726a743 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -78,12 +78,7 @@ static struct target_type zero_target = { static int __init dm_zero_init(void) { - int r = dm_register_target(&zero_target); - - if (r < 0) - DMERR("register failed %d", r); - - return r; + return dm_register_target(&zero_target); } static void __exit dm_zero_exit(void) -- cgit v1.2.3 From 990f61e43c4d3235486707568edf59d67488a575 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 4 Apr 2023 11:44:33 -0400 Subject: 
dm mirror: add DMERR message if alloc_workqueue fails Signed-off-by: Yangtao Li Signed-off-by: Mike Snitzer --- drivers/md/dm-raid1.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 3e947655746c..4b2b40f81a1d 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1501,8 +1501,10 @@ static int __init dm_mirror_init(void) int r; dm_raid1_wq = alloc_workqueue("dm_raid1_wq", 0, 0); - if (!dm_raid1_wq) + if (!dm_raid1_wq) { + DMERR("Failed to alloc workqueue"); return -ENOMEM; + } r = dm_register_target(&mirror_target); if (r < 0) { -- cgit v1.2.3 From 26cb62a285802ab6d26cdbf11305cd8516871d1a Mon Sep 17 00:00:00 2001 From: Yu Zhe Date: Fri, 17 Mar 2023 09:35:54 +0800 Subject: dm: remove unnecessary (void*) conversions Pointer variables of void * type do not require type cast. Signed-off-by: Yu Zhe Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 6 +++--- drivers/md/dm-io.c | 4 ++-- drivers/md/dm-kcopyd.c | 4 ++-- drivers/md/dm-linear.c | 6 +++--- drivers/md/dm-log-writes.c | 2 +- drivers/md/dm-log.c | 24 ++++++++++++------------ drivers/md/dm-raid1.c | 10 +++++----- drivers/md/dm-snap-persistent.c | 2 +- drivers/md/dm-stripe.c | 4 ++-- drivers/md/dm-verity-fec.c | 4 ++-- drivers/md/dm-zoned-metadata.c | 6 +++--- 11 files changed, 36 insertions(+), 36 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ed612bf38b83..31838b13ea54 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3118,7 +3118,7 @@ static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, voi static void dm_integrity_postsuspend(struct dm_target *ti) { - struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + struct dm_integrity_c *ic = ti->private; int r; WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier)); @@ -3167,7 +3167,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti) static void dm_integrity_resume(struct dm_target *ti) { - struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + struct dm_integrity_c *ic = ti->private; __u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); int r; @@ -3290,7 +3290,7 @@ static void dm_integrity_resume(struct dm_target *ti) static void dm_integrity_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { - struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + struct dm_integrity_c *ic = ti->private; unsigned int arg_count; size_t sz = 0; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index dc2df76999b0..f053ce245814 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -187,7 +187,7 @@ static void list_get_page(struct dpages *dp, struct page **p, unsigned long *len, unsigned int *offset) { unsigned int o = dp->context_u; - struct page_list *pl = (struct page_list *) dp->context_ptr; + struct page_list *pl = dp->context_ptr; *p = pl->page; *len = PAGE_SIZE - o; @@ -196,7 +196,7 @@ static void list_get_page(struct dpages *dp, static void list_next_page(struct dpages *dp) { - struct page_list *pl = (struct page_list *) dp->context_ptr; + struct page_list *pl = dp->context_ptr; dp->context_ptr = pl->next; dp->context_u = 0; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index a158c6e5fbd7..d01807c50f20 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -519,7 +519,7 @@ static int run_complete_job(struct 
kcopyd_job *job) static void complete_io(unsigned long error, void *context) { - struct kcopyd_job *job = (struct kcopyd_job *) context; + struct kcopyd_job *job = context; struct dm_kcopyd_client *kc = job->kc; io_job_finish(kc->throttle); @@ -696,7 +696,7 @@ static void segment_complete(int read_err, unsigned long write_err, /* FIXME: tidy this function */ sector_t progress = 0; sector_t count = 0; - struct kcopyd_job *sub_job = (struct kcopyd_job *) context; + struct kcopyd_job *sub_job = context; struct kcopyd_job *job = sub_job->master_job; struct dm_kcopyd_client *kc = job->kc; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 3e622dcc9dbd..f4448d520ee9 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -72,7 +72,7 @@ bad: static void linear_dtr(struct dm_target *ti) { - struct linear_c *lc = (struct linear_c *) ti->private; + struct linear_c *lc = ti->private; dm_put_device(ti, lc->dev); kfree(lc); @@ -98,7 +98,7 @@ static int linear_map(struct dm_target *ti, struct bio *bio) static void linear_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { - struct linear_c *lc = (struct linear_c *) ti->private; + struct linear_c *lc = ti->private; size_t sz = 0; switch (type) { @@ -120,7 +120,7 @@ static void linear_status(struct dm_target *ti, status_type_t type, static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { - struct linear_c *lc = (struct linear_c *) ti->private; + struct linear_c *lc = ti->private; struct dm_dev *dev = lc->dev; *bdev = dev->bdev; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 0ce9b01d1393..6d7436d2fd7f 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -429,7 +429,7 @@ static inline sector_t logdev_last_sector(struct log_writes_c *lc) static int log_writes_kthread(void *arg) { - struct log_writes_c *lc = (struct log_writes_c *)arg; + struct log_writes_c *lc = arg; sector_t sector = 0; while (!kthread_should_stop()) { diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index afd94d2e7295..f9f84236dfcd 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -530,7 +530,7 @@ static void destroy_log_context(struct log_c *lc) static void core_dtr(struct dm_dirty_log *log) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; vfree(lc->clean_bits); destroy_log_context(lc); @@ -569,7 +569,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, static void disk_dtr(struct dm_dirty_log *log) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; dm_put_device(lc->ti, lc->log_dev); vfree(lc->disk_header); @@ -590,7 +590,7 @@ static int disk_resume(struct dm_dirty_log *log) { int r; unsigned int i; - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; size_t size = lc->bitset_uint32_count * sizeof(uint32_t); /* read the disk header */ @@ -652,14 +652,14 @@ static int disk_resume(struct dm_dirty_log *log) static uint32_t core_get_region_size(struct dm_dirty_log *log) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; return lc->region_size; } static int core_resume(struct dm_dirty_log *log) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; lc->sync_search = 0; return 0; @@ -667,14 +667,14 @@ static int core_resume(struct dm_dirty_log *log) static int core_is_clean(struct dm_dirty_log *log, region_t 
region) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; return log_test_bit(lc->clean_bits, region); } static int core_in_sync(struct dm_dirty_log *log, region_t region, int block) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; return log_test_bit(lc->sync_bits, region); } @@ -727,14 +727,14 @@ static int disk_flush(struct dm_dirty_log *log) static void core_mark_region(struct dm_dirty_log *log, region_t region) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; log_clear_bit(lc, lc->clean_bits, region); } static void core_clear_region(struct dm_dirty_log *log, region_t region) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; if (likely(!lc->flush_failed)) log_set_bit(lc, lc->clean_bits, region); @@ -742,7 +742,7 @@ static void core_clear_region(struct dm_dirty_log *log, region_t region) static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; if (lc->sync_search >= lc->region_count) return 0; @@ -765,7 +765,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) static void core_set_region_sync(struct dm_dirty_log *log, region_t region, int in_sync) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; log_clear_bit(lc, lc->recovering_bits, region); if (in_sync) { @@ -779,7 +779,7 @@ static void core_set_region_sync(struct dm_dirty_log *log, region_t region, static region_t core_get_sync_count(struct dm_dirty_log *log) { - struct log_c *lc = (struct log_c *) log->context; + struct log_c *lc = log->context; return lc->sync_count; } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 4b2b40f81a1d..ddcb2bc4a617 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -604,7 +604,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) static void write_callback(unsigned long error, void *context) { unsigned int i; - struct bio *bio = (struct bio *) context; + struct bio *bio = context; struct mirror_set *ms; int should_wake = 0; unsigned long flags; @@ -1180,7 +1180,7 @@ err_free_context: static void mirror_dtr(struct dm_target *ti) { - struct mirror_set *ms = (struct mirror_set *) ti->private; + struct mirror_set *ms = ti->private; del_timer_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); @@ -1246,7 +1246,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) { int rw = bio_data_dir(bio); - struct mirror_set *ms = (struct mirror_set *) ti->private; + struct mirror_set *ms = ti->private; struct mirror *m = NULL; struct dm_bio_details *bd = NULL; struct dm_raid1_bio_record *bio_record = @@ -1311,7 +1311,7 @@ out: static void mirror_presuspend(struct dm_target *ti) { - struct mirror_set *ms = (struct mirror_set *) ti->private; + struct mirror_set *ms = ti->private; struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); struct bio_list holds; @@ -1407,7 +1407,7 @@ static void mirror_status(struct dm_target *ti, status_type_t type, { unsigned int m, sz = 0; int num_feature_args = 0; - struct mirror_set *ms = (struct mirror_set *) ti->private; + struct mirror_set *ms = ti->private; struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); char buffer[MAX_NR_MIRRORS + 1]; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index f14e5df27874..15649921f2a9 100644 --- 
a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -567,7 +567,7 @@ ret_destroy_bufio: static struct pstore *get_info(struct dm_exception_store *store) { - return (struct pstore *) store->context; + return store->context; } static void persistent_usage(struct dm_exception_store *store, diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 8d6951157106..e2854a3cbd28 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -189,7 +189,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) static void stripe_dtr(struct dm_target *ti) { unsigned int i; - struct stripe_c *sc = (struct stripe_c *) ti->private; + struct stripe_c *sc = ti->private; for (i = 0; i < sc->stripes; i++) dm_put_device(ti, sc->stripe[i].dev); @@ -360,7 +360,7 @@ static size_t stripe_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, static void stripe_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { - struct stripe_c *sc = (struct stripe_c *) ti->private; + struct stripe_c *sc = ti->private; unsigned int sz = 0; unsigned int i; diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 962fc32c947c..a9ee2faa75a2 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -567,14 +567,14 @@ out: static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data) { - struct dm_verity *v = (struct dm_verity *)pool_data; + struct dm_verity *v = pool_data; return init_rs_gfp(8, 0x11d, 0, 1, v->fec->roots, gfp_mask); } static void fec_rs_free(void *element, void *pool_data) { - struct rs_control *rs = (struct rs_control *)element; + struct rs_control *rs = element; if (rs) free_rs(rs); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index cf9402064aba..8f0896a6990b 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1701,7 +1701,7 @@ static int dmz_load_mapping(struct dmz_metadata *zmd) if (IS_ERR(dmap_mblk)) return PTR_ERR(dmap_mblk); zmd->map_mblk[i] = dmap_mblk; - dmap = (struct dmz_map *) dmap_mblk->data; + dmap = dmap_mblk->data; i++; e = 0; } @@ -1832,7 +1832,7 @@ static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, unsigned int dzone_id, unsigned int bzone_id) { struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; - struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; + struct dmz_map *dmap = dmap_mblk->data; int map_idx = chunk & DMZ_MAP_ENTRIES_MASK; dmap[map_idx].dzone_id = cpu_to_le32(dzone_id); @@ -2045,7 +2045,7 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, enum req_op op) { struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; - struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; + struct dmz_map *dmap = dmap_mblk->data; int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK; unsigned int dzone_id; struct dm_zone *dzone = NULL; -- cgit v1.2.3 From 306fbc2e041c227be7c934efe8a49ddb87bd31f1 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 30 Mar 2023 17:27:53 -0400 Subject: dm raid: remove unused d variable clang with W=1 reports drivers/md/dm-raid.c:2212:15: error: variable 'd' set but not used [-Werror,-Wunused-but-set-variable] unsigned int d; ^ This variable is not used so remove it. 
Signed-off-by: Tom Rix Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 60632b409b80..2dfd51509647 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2209,7 +2209,6 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) { int role; - unsigned int d; struct mddev *mddev = &rs->md; uint64_t events_sb; uint64_t failed_devices[DISKS_ARRAY_ELEMS]; @@ -2324,7 +2323,6 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) * to provide capacity for redundancy or during reshape * to add capacity to grow the raid set. */ - d = 0; rdev_for_each(r, mddev) { if (test_bit(Journal, &rdev->flags)) continue; @@ -2340,8 +2338,6 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) if (test_bit(FirstUse, &r->flags)) rebuild_and_new++; } - - d++; } if (new_devs == rs->raid_disks || !rebuilds) { -- cgit v1.2.3 From 3664ff82dae1ef9f14f7763d3dd30565e7ef9e14 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 10 Apr 2023 00:43:37 +0800 Subject: dm: add helper macro for simple DM target module init and exit Eliminate duplicate boilerplate code for simple modules that contain a single DM target driver without any additional setup code. Add a new module_dm() macro, which replaces the module_init() and module_exit() with template functions that call dm_register_target() and dm_unregister_target() respectively. Signed-off-by: Yangtao Li Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 14 +------------- drivers/md/dm-delay.c | 15 +-------------- drivers/md/dm-dust.c | 14 +------------- drivers/md/dm-ebs-target.c | 14 +------------- drivers/md/dm-era-target.c | 14 +------------- drivers/md/dm-flakey.c | 15 +-------------- drivers/md/dm-log-writes.c | 14 +------------- drivers/md/dm-raid.c | 18 +----------------- drivers/md/dm-switch.c | 14 +------------- drivers/md/dm-unstripe.c | 14 +------------- drivers/md/dm-verity-target.c | 14 +------------- drivers/md/dm-writecache.c | 14 +------------- drivers/md/dm-zero.c | 14 +------------- drivers/md/dm-zoned-target.c | 16 ++-------------- include/linux/device-mapper.h | 20 ++++++++++++++++++++ 15 files changed, 35 insertions(+), 189 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 52615a258e13..8b47b913ee83 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3659,19 +3659,7 @@ static struct target_type crypt_target = { .iterate_devices = crypt_iterate_devices, .io_hints = crypt_io_hints, }; - -static int __init dm_crypt_init(void) -{ - return dm_register_target(&crypt_target); -} - -static void __exit dm_crypt_exit(void) -{ - dm_unregister_target(&crypt_target); -} - -module_init(dm_crypt_init); -module_exit(dm_crypt_exit); +module_dm(crypt); MODULE_AUTHOR("Jana Saout "); MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption"); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 00d18b2070a5..7433525e5985 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -367,20 +367,7 @@ static struct target_type delay_target = { .status = delay_status, .iterate_devices = delay_iterate_devices, }; - -static int __init dm_delay_init(void) -{ - return dm_register_target(&delay_target); -} - -static void __exit dm_delay_exit(void) -{ - dm_unregister_target(&delay_target); -} - -/* Module 
hooks */ -module_init(dm_delay_init); -module_exit(dm_delay_exit); +module_dm(delay); MODULE_DESCRIPTION(DM_NAME " delay target"); MODULE_AUTHOR("Heinz Mauelshagen "); diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 9bf3cdf548de..12a377e06d02 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -570,19 +570,7 @@ static struct target_type dust_target = { .status = dust_status, .prepare_ioctl = dust_prepare_ioctl, }; - -static int __init dm_dust_init(void) -{ - return dm_register_target(&dust_target); -} - -static void __exit dm_dust_exit(void) -{ - dm_unregister_target(&dust_target); -} - -module_init(dm_dust_init); -module_exit(dm_dust_exit); +module_dm(dust); MODULE_DESCRIPTION(DM_NAME " dust test target"); MODULE_AUTHOR("Bryan Gurney "); diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 38da4de3ecbf..435b45201f4d 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -452,19 +452,7 @@ static struct target_type ebs_target = { .prepare_ioctl = ebs_prepare_ioctl, .iterate_devices = ebs_iterate_devices, }; - -static int __init dm_ebs_init(void) -{ - return dm_register_target(&ebs_target); -} - -static void dm_ebs_exit(void) -{ - dm_unregister_target(&ebs_target); -} - -module_init(dm_ebs_init); -module_exit(dm_ebs_exit); +module_dm(ebs); MODULE_AUTHOR("Heinz Mauelshagen "); MODULE_DESCRIPTION(DM_NAME " emulated block size target"); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 554d234fca18..0d70914217ee 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1753,19 +1753,7 @@ static struct target_type era_target = { .iterate_devices = era_iterate_devices, .io_hints = era_io_hints }; - -static int __init dm_era_init(void) -{ - return dm_register_target(&era_target); -} - -static void __exit dm_era_exit(void) -{ - dm_unregister_target(&era_target); -} - -module_init(dm_era_init); -module_exit(dm_era_exit); +module_dm(era); MODULE_DESCRIPTION(DM_NAME " era target"); MODULE_AUTHOR("Joe Thornber "); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 14179355e6a1..ebcfb99b186b 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -506,20 +506,7 @@ static struct target_type flakey_target = { .prepare_ioctl = flakey_prepare_ioctl, .iterate_devices = flakey_iterate_devices, }; - -static int __init dm_flakey_init(void) -{ - return dm_register_target(&flakey_target); -} - -static void __exit dm_flakey_exit(void) -{ - dm_unregister_target(&flakey_target); -} - -/* Module hooks */ -module_init(dm_flakey_init); -module_exit(dm_flakey_exit); +module_dm(flakey); MODULE_DESCRIPTION(DM_NAME " flakey target"); MODULE_AUTHOR("Joe Thornber "); diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 6d7436d2fd7f..f17a6cf2284e 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -937,19 +937,7 @@ static struct target_type log_writes_target = { .dax_zero_page_range = log_writes_dax_zero_page_range, .dax_recovery_write = log_writes_dax_recovery_write, }; - -static int __init dm_log_writes_init(void) -{ - return dm_register_target(&log_writes_target); -} - -static void __exit dm_log_writes_exit(void) -{ - dm_unregister_target(&log_writes_target); -} - -module_init(dm_log_writes_init); -module_exit(dm_log_writes_exit); +module_dm(log_writes); MODULE_DESCRIPTION(DM_NAME " log writes target"); MODULE_AUTHOR("Josef Bacik "); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2dfd51509647..c8821fcb8299 100644 --- 
a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -4077,23 +4077,7 @@ static struct target_type raid_target = { .preresume = raid_preresume, .resume = raid_resume, }; - -static int __init dm_raid_init(void) -{ - DMINFO("Loading target version %u.%u.%u", - raid_target.version[0], - raid_target.version[1], - raid_target.version[2]); - return dm_register_target(&raid_target); -} - -static void __exit dm_raid_exit(void) -{ - dm_unregister_target(&raid_target); -} - -module_init(dm_raid_init); -module_exit(dm_raid_exit); +module_dm(raid); module_param(devices_handle_discard_safely, bool, 0644); MODULE_PARM_DESC(devices_handle_discard_safely, diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 5a5976b0cfb8..dfd9fb52a6f3 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -565,19 +565,7 @@ static struct target_type switch_target = { .prepare_ioctl = switch_prepare_ioctl, .iterate_devices = switch_iterate_devices, }; - -static int __init dm_switch_init(void) -{ - return dm_register_target(&switch_target); -} - -static void __exit dm_switch_exit(void) -{ - dm_unregister_target(&switch_target); -} - -module_init(dm_switch_init); -module_exit(dm_switch_exit); +module_dm(switch); MODULE_DESCRIPTION(DM_NAME " dynamic path switching target"); MODULE_AUTHOR("Kevin D. O'Kelley "); diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index e7b7d5983a16..48587c16c445 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -192,19 +192,7 @@ static struct target_type unstripe_target = { .iterate_devices = unstripe_iterate_devices, .io_hints = unstripe_io_hints, }; - -static int __init dm_unstripe_init(void) -{ - return dm_register_target(&unstripe_target); -} - -static void __exit dm_unstripe_exit(void) -{ - dm_unregister_target(&unstripe_target); -} - -module_init(dm_unstripe_init); -module_exit(dm_unstripe_exit); +module_dm(unstripe); MODULE_DESCRIPTION(DM_NAME " unstriped target"); MODULE_ALIAS("dm-unstriped"); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 260f04d12982..e35c16e06d06 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1514,19 +1514,7 @@ static struct target_type verity_target = { .iterate_devices = verity_iterate_devices, .io_hints = verity_io_hints, }; - -static int __init dm_verity_init(void) -{ - return dm_register_target(&verity_target); -} - -static void __exit dm_verity_exit(void) -{ - dm_unregister_target(&verity_target); -} - -module_init(dm_verity_init); -module_exit(dm_verity_exit); +module_dm(verity); MODULE_AUTHOR("Mikulas Patocka "); MODULE_AUTHOR("Mandeep Baines "); diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 81b60b75a9fa..074cb785eafc 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -2773,19 +2773,7 @@ static struct target_type writecache_target = { .iterate_devices = writecache_iterate_devices, .io_hints = writecache_io_hints, }; - -static int __init dm_writecache_init(void) -{ - return dm_register_target(&writecache_target); -} - -static void __exit dm_writecache_exit(void) -{ - dm_unregister_target(&writecache_target); -} - -module_init(dm_writecache_init); -module_exit(dm_writecache_exit); +module_dm(writecache); MODULE_DESCRIPTION(DM_NAME " writecache target"); MODULE_AUTHOR("Mikulas Patocka "); diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 884ac726a743..3b13e6eb1aa4 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -75,19 +75,7 @@ static struct target_type 
zero_target = { .map = zero_map, .io_hints = zero_io_hints, }; - -static int __init dm_zero_init(void) -{ - return dm_register_target(&zero_target); -} - -static void __exit dm_zero_exit(void) -{ - dm_unregister_target(&zero_target); -} - -module_init(dm_zero_init) -module_exit(dm_zero_exit) +module_dm(zero); MODULE_AUTHOR("Jana Saout "); MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros"); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index ad4764dcd013..ad8e670a2f9b 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1138,7 +1138,7 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv, return r; } -static struct target_type dmz_type = { +static struct target_type zoned_target = { .name = "zoned", .version = {2, 0, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL, @@ -1154,19 +1154,7 @@ static struct target_type dmz_type = { .status = dmz_status, .message = dmz_message, }; - -static int __init dmz_init(void) -{ - return dm_register_target(&dmz_type); -} - -static void __exit dmz_exit(void) -{ - dm_unregister_target(&dmz_type); -} - -module_init(dmz_init); -module_exit(dmz_exit); +module_dm(zoned); MODULE_DESCRIPTION(DM_NAME " target for zoned block devices"); MODULE_AUTHOR("Damien Le Moal "); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 8aa6b3ea91fa..f2d9afb8a7e9 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -631,6 +631,26 @@ void dm_destroy_crypto_profile(struct blk_crypto_profile *profile); DMEMIT("target_name=%s,target_version=%u.%u.%u", \ (y)->name, (y)->version[0], (y)->version[1], (y)->version[2]) +/** + * module_dm() - Helper macro for DM targets that don't do anything + * special in their module_init and module_exit. + * Each module may only use this macro once, and calling it replaces + * module_init() and module_exit(). + * + * @name: DM target's name + */ +#define module_dm(name) \ +static int __init dm_##name##_init(void) \ +{ \ + return dm_register_target(&(name##_target)); \ +} \ +module_init(dm_##name##_init) \ +static void __exit dm_##name##_exit(void) \ +{ \ + dm_unregister_target(&(name##_target)); \ +} \ +module_exit(dm_##name##_exit) + /* * Definitions of return values from target end_io function. */ -- cgit v1.2.3 From 13f6facf3faeed34ca381aef4c9b153c7aed3972 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 14 Apr 2023 12:07:27 -0400 Subject: dm: allow targets to require splitting WRITE_ZEROES and SECURE_ERASE Introduce max_write_zeroes_granularity and max_secure_erase_granularity flags in the dm_target struct. If a target sets these then DM core will split IO of these operation types accordingly (in terms of max_write_zeroes_sectors and max_secure_erase_sectors respectively). 
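[Editorial sketch, not from this patch: a hypothetical constructor fragment showing how a target would opt in to the new flags introduced below. The matching max_write_zeroes_sectors / max_secure_erase_sectors limits would typically be advertised from the target's ->io_hints hook.]

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	/* one clone per operation; DM core splits on the queue limits */
	ti->num_write_zeroes_bios = 1;
	ti->max_write_zeroes_granularity = true;

	ti->num_secure_erase_bios = 1;
	ti->max_secure_erase_granularity = true;

	return 0;
}
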
Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 10 ++++++---- include/linux/device-mapper.h | 16 ++++++++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 20c6b72a0245..244ebb8c316b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1614,21 +1614,23 @@ static blk_status_t __process_abnormal_io(struct clone_info *ci, { unsigned int num_bios = 0; unsigned int max_granularity = 0; + struct queue_limits *limits = dm_get_queue_limits(ti->table->md); switch (bio_op(ci->bio)) { case REQ_OP_DISCARD: num_bios = ti->num_discard_bios; - if (ti->max_discard_granularity) { - struct queue_limits *limits = - dm_get_queue_limits(ti->table->md); + if (ti->max_discard_granularity) max_granularity = limits->max_discard_sectors; - } break; case REQ_OP_SECURE_ERASE: num_bios = ti->num_secure_erase_bios; + if (ti->max_secure_erase_granularity) + max_granularity = limits->max_secure_erase_sectors; break; case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; + if (ti->max_write_zeroes_granularity) + max_granularity = limits->max_write_zeroes_sectors; break; default: break; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index f2d9afb8a7e9..983f1f0402b5 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -359,11 +359,23 @@ struct dm_target { bool discards_supported:1; /* - * Set if this target requires that discards be split on both - * 'discard_granularity' and 'max_discard_sectors' boundaries. + * Set if this target requires that discards be split on + * 'max_discard_sectors' boundaries. */ bool max_discard_granularity:1; + /* + * Set if this target requires that secure_erases be split on + * 'max_secure_erase_sectors' boundaries. + */ + bool max_secure_erase_granularity:1; + + /* + * Set if this target requires that write_zeroes be split on + * 'max_write_zeroes_sectors' boundaries. + */ + bool max_write_zeroes_granularity:1; + /* * Set if we need to limit the number of in-flight bios when swapping. */ -- cgit v1.2.3 From f7995089c508a5a11b0491c7b348d5c07217a4e8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 14 Apr 2023 12:32:20 -0400 Subject: dm: unexport dm_get_queue_limits() There are no dm_get_queue_limits() callers outside of DM core and there shouldn't be. Also, remove its BUG_ON(!atomic_read(&md->holders)) to micro-optimize __process_abnormal_io(). Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 20 +++++++++----------- include/linux/device-mapper.h | 2 -- 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 244ebb8c316b..3b694ba3a106 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1072,6 +1072,15 @@ static void dm_io_dec_pending(struct dm_io *io, blk_status_t error) __dm_io_dec_pending(io); } +/* + * The queue_limits are only valid as long as you have a reference + * count on 'md'. But _not_ imposing verification to avoid atomic_read(), + */ +static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md) +{ + return &md->queue->limits; +} + void disable_discard(struct mapped_device *md) { struct queue_limits *limits = dm_get_queue_limits(md); @@ -2311,17 +2320,6 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md) return md->immutable_target_type; } -/* - * The queue_limits are only valid as long as you have a reference - * count on 'md'. 
- */ -struct queue_limits *dm_get_queue_limits(struct mapped_device *md) -{ - BUG_ON(!atomic_read(&md->holders)); - return &md->queue->limits; -} -EXPORT_SYMBOL_GPL(dm_get_queue_limits); - /* * Setup the DM device's queue based on md's type */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 983f1f0402b5..a52d2b9a6846 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -530,8 +530,6 @@ int __init dm_early_create(struct dm_ioctl *dmi, struct dm_target_spec **spec_array, char **target_params_array); -struct queue_limits *dm_get_queue_limits(struct mapped_device *md); - /* * Geometry functions. */ -- cgit v1.2.3 From 3d32aaa7e66d5c1479a3c31d6c2c5d45dd0d3b89 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 17 Apr 2023 11:59:56 -0400 Subject: dm ioctl: fix nested locking in table_clear() to remove deadlock concern syzkaller found the following problematic rwsem locking (with write lock already held): down_read+0x9d/0x450 kernel/locking/rwsem.c:1509 dm_get_inactive_table+0x2b/0xc0 drivers/md/dm-ioctl.c:773 __dev_status+0x4fd/0x7c0 drivers/md/dm-ioctl.c:844 table_clear+0x197/0x280 drivers/md/dm-ioctl.c:1537 In table_clear, it first acquires a write lock https://elixir.bootlin.com/linux/v6.2/source/drivers/md/dm-ioctl.c#L1520 down_write(&_hash_lock); Then before the lock is released at L1539, there is a path shown above: table_clear -> __dev_status -> dm_get_inactive_table -> down_read https://elixir.bootlin.com/linux/v6.2/source/drivers/md/dm-ioctl.c#L773 down_read(&_hash_lock); It tries to acquire the same read lock again, resulting in the deadlock problem. Fix this by moving table_clear()'s __dev_status() call to after its up_write(&_hash_lock); Cc: stable@vger.kernel.org Reported-by: Zheng Zhang Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 50a1259294d1..7d5c9c582ed2 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1556,11 +1556,12 @@ static int table_clear(struct file *filp, struct dm_ioctl *param, size_t param_s has_new_map = true; } - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - - __dev_status(hc->md, param); md = hc->md; up_write(&_hash_lock); + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + __dev_status(md, param); + if (old_map) { dm_sync_table(md); dm_table_destroy(old_map); -- cgit v1.2.3 From 98dba02d9a93eec11bffbb93c7c51624290702d2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2023 15:57:47 -0400 Subject: dm flakey: fix a crash with invalid table line This command will crash with NULL pointer dereference: dmsetup create flakey --table \ "0 `blockdev --getsize /dev/ram0` flakey /dev/ram0 0 0 1 2 corrupt_bio_byte 512" Fix the crash by checking if arg_name is non-NULL before comparing it. Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-flakey.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index ebcfb99b186b..ef07b294e550 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -125,9 +125,9 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, * Direction r or w? 
*/ arg_name = dm_shift_arg(as); - if (!strcasecmp(arg_name, "w")) + if (arg_name && !strcasecmp(arg_name, "w")) fc->corrupt_bio_rw = WRITE; - else if (!strcasecmp(arg_name, "r")) + else if (arg_name && !strcasecmp(arg_name, "r")) fc->corrupt_bio_rw = READ; else { ti->error = "Invalid corrupt bio direction (r or w)"; -- cgit v1.2.3 From e3675dc1e7ea2e4e5a6527fa7068e9fcbc2475cc Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2023 15:56:09 -0400 Subject: dm flakey: remove trailing space in the table line Don't return a trailing space in the output of STATUSTYPE_TABLE. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-flakey.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index ef07b294e550..948b6b5ee5f2 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -430,21 +430,21 @@ static void flakey_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - DMEMIT("%s %llu %u %u ", fc->dev->name, + DMEMIT("%s %llu %u %u", fc->dev->name, (unsigned long long)fc->start, fc->up_interval, fc->down_interval); drop_writes = test_bit(DROP_WRITES, &fc->flags); error_writes = test_bit(ERROR_WRITES, &fc->flags); - DMEMIT("%u ", drop_writes + error_writes + (fc->corrupt_bio_byte > 0) * 5); + DMEMIT(" %u", drop_writes + error_writes + (fc->corrupt_bio_byte > 0) * 5); if (drop_writes) - DMEMIT("drop_writes "); + DMEMIT(" drop_writes"); else if (error_writes) - DMEMIT("error_writes "); + DMEMIT(" error_writes"); if (fc->corrupt_bio_byte) - DMEMIT("corrupt_bio_byte %u %c %u %u ", + DMEMIT(" corrupt_bio_byte %u %c %u %u", fc->corrupt_bio_byte, (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', fc->corrupt_bio_value, fc->corrupt_bio_flags); -- cgit v1.2.3 From aa7d7bc99fed71664e9a241b32294ee15a88d938 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2023 15:56:41 -0400 Subject: dm flakey: add an "error_reads" option dm-flakey returns error on reads if no other argument is specified. This commit simplifies associated logic while formalizing an "error_reads" argument and an ERROR_READS flag. If no argument is specified, set ERROR_READS flag so that it behaves just like before this commit. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-flakey.rst | 4 +++ drivers/md/dm-flakey.c | 39 +++++++++++++++------- 2 files changed, 31 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/Documentation/admin-guide/device-mapper/dm-flakey.rst b/Documentation/admin-guide/device-mapper/dm-flakey.rst index 86138735879d..f7104c01b0f7 100644 --- a/Documentation/admin-guide/device-mapper/dm-flakey.rst +++ b/Documentation/admin-guide/device-mapper/dm-flakey.rst @@ -39,6 +39,10 @@ Optional feature parameters: If no feature parameters are present, during the periods of unreliability, all I/O returns errors. + error_reads: + All read I/O is failed with an error signalled. + Write I/O is handled correctly. + drop_writes: All write I/O is silently ignored. Read I/O is handled correctly. 
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 948b6b5ee5f2..bd80bcafbe50 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -37,6 +37,7 @@ struct flakey_c { }; enum feature_flag_bits { + ERROR_READS, DROP_WRITES, ERROR_WRITES }; @@ -53,7 +54,7 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, const char *arg_name; static const struct dm_arg _args[] = { - {0, 6, "Invalid number of feature args"}, + {0, 7, "Invalid number of feature args"}, {1, UINT_MAX, "Invalid corrupt bio byte"}, {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, {0, UINT_MAX, "Invalid corrupt bio flags mask"}, @@ -76,6 +77,17 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, return -EINVAL; } + /* + * error_reads + */ + if (!strcasecmp(arg_name, "error_reads")) { + if (test_and_set_bit(ERROR_READS, &fc->flags)) { + ti->error = "Feature error_reads duplicated"; + return -EINVAL; + } + continue; + } + /* * drop_writes */ @@ -171,6 +183,12 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, return -EINVAL; } + if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) && + !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags)) { + set_bit(ERROR_WRITES, &fc->flags); + set_bit(ERROR_READS, &fc->flags); + } + return 0; } @@ -346,8 +364,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) * Otherwise, flakey_end_io() will decide if the reads should be modified. */ if (bio_data_dir(bio) == READ) { - if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags) && - !test_bit(ERROR_WRITES, &fc->flags)) + if (test_bit(ERROR_READS, &fc->flags)) return DM_MAPIO_KILL; goto map_bio; } @@ -373,11 +390,6 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) } goto map_bio; } - - /* - * By default, error all I/O. - */ - return DM_MAPIO_KILL; } map_bio: @@ -404,8 +416,8 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, */ corrupt_bio_data(bio, fc); } - } else if (!test_bit(DROP_WRITES, &fc->flags) && - !test_bit(ERROR_WRITES, &fc->flags)) { + } + if (test_bit(ERROR_READS, &fc->flags)) { /* * Error read during the down_interval if drop_writes * and error_writes were not configured. @@ -422,7 +434,7 @@ static void flakey_status(struct dm_target *ti, status_type_t type, { unsigned int sz = 0; struct flakey_c *fc = ti->private; - unsigned int drop_writes, error_writes; + unsigned int error_reads, drop_writes, error_writes; switch (type) { case STATUSTYPE_INFO: @@ -434,10 +446,13 @@ static void flakey_status(struct dm_target *ti, status_type_t type, (unsigned long long)fc->start, fc->up_interval, fc->down_interval); + error_reads = test_bit(ERROR_READS, &fc->flags); drop_writes = test_bit(DROP_WRITES, &fc->flags); error_writes = test_bit(ERROR_WRITES, &fc->flags); - DMEMIT(" %u", drop_writes + error_writes + (fc->corrupt_bio_byte > 0) * 5); + DMEMIT(" %u", error_reads + drop_writes + error_writes + (fc->corrupt_bio_byte > 0) * 5); + if (error_reads) + DMEMIT(" error_reads"); if (drop_writes) DMEMIT(" drop_writes"); else if (error_writes) -- cgit v1.2.3 From 38d11da522aacaa05898c734a1cec86f1e611129 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Tue, 18 Apr 2023 16:38:04 +0800 Subject: dm: don't lock fs when the map is NULL in process of resume Commit fa247089de99 ("dm: requeue IO if mapping table not yet available") added a detection of whether the mapping table is available in the IO submission process. 
If the mapping table is unavailable, it returns BLK_STS_RESOURCE and requeues the IO. This can lead to the following deadlock problem:

dm create                                    mount
ioctl(DM_DEV_CREATE_CMD)
ioctl(DM_TABLE_LOAD_CMD)
                                             do_mount
                                              vfs_get_tree
                                               ext4_get_tree
                                                get_tree_bdev
                                                 sget_fc
                                                  alloc_super
                                                   // got &s->s_umount
                                                   down_write_nested(&s->s_umount, ...);
                                                   ext4_fill_super
                                                    ext4_load_super
                                                     ext4_read_bh
                                                      submit_bio
                                                      // submit and wait io end
ioctl(DM_DEV_SUSPEND_CMD)
 dev_suspend
  do_resume
   dm_suspend
    __dm_suspend
     lock_fs
      freeze_bdev
       get_active_super
        grab_super
         // wait for &s->s_umount
         down_write(&s->s_umount);
   dm_swap_table
    __bind
     // set md->map (can't get here)

IO will be continuously requeued while holding the lock, since the mapping table is NULL. At the same time, the mapping table won't be set, since the lock is not available. Bio-based DM has the same problem as request-based DM.

It's not proper to just abort IO if the mapping table is not available. So clear DM_SKIP_LOCKFS_FLAG when the mapping table is NULL; this allows the DM table to be loaded and the IO submitted upon resume.

Fixes: fa247089de99 ("dm: requeue IO if mapping table not yet available")
Cc: stable@vger.kernel.org
Signed-off-by: Li Lingfeng
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 7d5c9c582ed2..cc77cf3d4109 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1168,10 +1168,13 @@ static int do_resume(struct dm_ioctl *param)
 	/* Do we need to load a new map ? */
 	if (new_map) {
 		sector_t old_size, new_size;
+		int srcu_idx;

 		/* Suspend if it isn't already suspended */
-		if (param->flags & DM_SKIP_LOCKFS_FLAG)
+		old_map = dm_get_live_table(md, &srcu_idx);
+		if ((param->flags & DM_SKIP_LOCKFS_FLAG) || !old_map)
 			suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+		dm_put_live_table(md, srcu_idx);
 		if (param->flags & DM_NOFLUSH_FLAG)
 			suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
 		if (!dm_suspended_md(md))
-- cgit v1.2.3
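To make the new hooks in this series concrete, here is a minimal sketch of a hypothetical bio-based target written against the updated include/linux/device-mapper.h. It is illustrative only, not an upstream target and not part of the patches above: the "example" name and the example_ctr()/example_map() bodies are invented for this sketch. It shows a target registering with module_dm() and opting in to the new secure-erase/write-zeroes splitting flags.

/* Illustrative sketch only; not part of the series above. */
#include <linux/module.h>
#include <linux/device-mapper.h>

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	/* Advertise support for these abnormal IO types, one bio each. */
	ti->num_secure_erase_bios = 1;
	ti->num_write_zeroes_bios = 1;

	/*
	 * New in this series: ask DM core to split these operations on
	 * the queue's max_secure_erase_sectors / max_write_zeroes_sectors.
	 */
	ti->max_secure_erase_granularity = 1;
	ti->max_write_zeroes_granularity = 1;

	return 0;
}

static int example_map(struct dm_target *ti, struct bio *bio)
{
	/* A real target would remap the bio; this sketch just fails all IO. */
	return DM_MAPIO_KILL;
}

static struct target_type example_target = {
	.name    = "example",
	.version = {1, 0, 0},
	.module  = THIS_MODULE,
	.ctr     = example_ctr,
	.map     = example_map,
};

/* Replaces the old hand-rolled module_init()/module_exit() pair. */
module_dm(example);

MODULE_DESCRIPTION(DM_NAME " illustrative example target");
MODULE_LICENSE("GPL");

With the two granularity bits set, __process_abnormal_io() (as modified above) caps each cloned secure-erase or write-zeroes bio at the underlying queue's limit, so the target never sees an operation larger than max_secure_erase_sectors or max_write_zeroes_sectors.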