From 737f98cfe7de8df7433a4d846850aa8efa44bd48 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Wed, 22 Feb 2017 18:13:59 +0800
Subject: blk-mq: initialize mq kobjects in blk_mq_init_allocated_queue()

Both q->mq_kobj and sw queues' kobjects should have been initialized
once, instead of doing that each add_disk context.

Also this patch removes clearing of ctx in blk_mq_init_cpu_queues()
because percpu allocator fills zero to allocated variable.

This patch fixes one issue[1] reported from Omar.

[1] kernel wearning when doing unbind/bind on one scsi-mq device

[   19.347924] kobject (ffff8800791ea0b8): tried to init an initialized object, something is seriously wrong.
[   19.349781] CPU: 1 PID: 84 Comm: kworker/u8:1 Not tainted 4.10.0-rc7-00210-g53f39eeaa263 #34
[   19.350686] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-20161122_114906-anatol 04/01/2014
[   19.350920] Workqueue: events_unbound async_run_entry_fn
[   19.350920] Call Trace:
[   19.350920]  dump_stack+0x63/0x83
[   19.350920]  kobject_init+0x77/0x90
[   19.350920]  blk_mq_register_dev+0x40/0x130
[   19.350920]  blk_register_queue+0xb6/0x190
[   19.350920]  device_add_disk+0x1ec/0x4b0
[   19.350920]  sd_probe_async+0x10d/0x1c0 [sd_mod]
[   19.350920]  async_run_entry_fn+0x48/0x150
[   19.350920]  process_one_work+0x1d0/0x480
[   19.350920]  worker_thread+0x48/0x4e0
[   19.350920]  kthread+0x101/0x140
[   19.350920]  ? process_one_work+0x480/0x480
[   19.350920]  ? kthread_create_on_node+0x60/0x60
[   19.350920]  ret_from_fork+0x2c/0x40

Cc: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c | 4 +---
 block/blk-mq.c       | 4 +++-
 block/blk-mq.h       | 1 +
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 295e69670c39..124305407c80 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -277,7 +277,7 @@ void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
 	kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
 }
 
-static void blk_mq_sysfs_init(struct request_queue *q)
+void blk_mq_sysfs_init(struct request_queue *q)
 {
 	struct blk_mq_ctx *ctx;
 	int cpu;
@@ -297,8 +297,6 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
 
 	blk_mq_disable_hotplug();
 
-	blk_mq_sysfs_init(q);
-
 	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
 	if (ret < 0)
 		goto out;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b2fd175e84d7..ed4b55176cdd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2045,7 +2045,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
 		struct blk_mq_hw_ctx *hctx;
 
-		memset(__ctx, 0, sizeof(*__ctx));
 		__ctx->cpu = i;
 		spin_lock_init(&__ctx->lock);
 		INIT_LIST_HEAD(&__ctx->rq_list);
@@ -2352,6 +2351,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (!q->queue_ctx)
 		goto err_exit;
 
+	/* init q->mq_kobj and sw queues' kobjects */
+	blk_mq_sysfs_init(q);
+
 	q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
 						GFP_KERNEL, set->numa_node);
 	if (!q->queue_hw_ctx)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 088ced003c13..ad8bfd7473ef 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -77,6 +77,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 /*
  * sysfs helpers
  */
+extern void blk_mq_sysfs_init(struct request_queue *q);
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
-- 
cgit v1.2.3


From 7ea5fe31c12dd8bcf4a9c5a4a7e8e23826a9a3b8 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Wed, 22 Feb 2017 18:14:00 +0800
Subject: blk-mq: make lifetime consitent between q/ctx and its kobject

Currently from kobject view, both q->mq_kobj and ctx->kobj can
be released during one cycle of blk_mq_register_dev() and
blk_mq_unregister_dev(). Actually, sw queue's lifetime is
same with its request queue's, which is covered by request_queue->kobj.

So we don't need to call kobject_put() for the two kinds of
kobject in __blk_mq_unregister_dev(), instead we do that
in release handler of request queue.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c | 20 +++++++++++++-------
 block/blk-mq.c       |  7 ++++++-
 block/blk-mq.h       |  1 +
 3 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 124305407c80..77fb238af2be 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -242,15 +242,11 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
 static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
-	int i, j;
+	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		blk_mq_unregister_hctx(hctx);
 
-		hctx_for_each_ctx(hctx, ctx, j)
-			kobject_put(&ctx->kobj);
-
 		kobject_put(&hctx->kobj);
 	}
 
@@ -258,8 +254,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 
 	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
 	kobject_del(&q->mq_kobj);
-	kobject_put(&q->mq_kobj);
-
 	kobject_put(&dev->kobj);
 
 	q->mq_sysfs_init_done = false;
@@ -277,6 +271,18 @@ void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
 	kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
 }
 
+void blk_mq_sysfs_deinit(struct request_queue *q)
+{
+	struct blk_mq_ctx *ctx;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		ctx = per_cpu_ptr(q->queue_ctx, cpu);
+		kobject_put(&ctx->kobj);
+	}
+	kobject_put(&q->mq_kobj);
+}
+
 void blk_mq_sysfs_init(struct request_queue *q)
 {
 	struct blk_mq_ctx *ctx;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ed4b55176cdd..b985c236f50f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2264,7 +2264,12 @@ void blk_mq_release(struct request_queue *q)
 
 	kfree(q->queue_hw_ctx);
 
-	/* ctx kobj stays in queue_ctx */
+	/*
+	 * release .mq_kobj and sw queue's kobject now because
+	 * both share lifetime with request queue.
+	 */
+	blk_mq_sysfs_deinit(q);
+
 	free_percpu(q->queue_ctx);
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ad8bfd7473ef..b79f9a7d8cf6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -78,6 +78,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
  * sysfs helpers
  */
 extern void blk_mq_sysfs_init(struct request_queue *q);
+extern void blk_mq_sysfs_deinit(struct request_queue *q);
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
-- 
cgit v1.2.3


From 6c8b232efea1ad3d263ff8b9c16b7e8767a77488 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Wed, 22 Feb 2017 18:14:01 +0800
Subject: blk-mq: make lifetime consistent between hctx and its kobject

This patch removes kobject_put() over hctx in __blk_mq_unregister_dev(),
and trys to keep lifetime consistent between hctx and hctx's kobject.

Now blk_mq_sysfs_register() and blk_mq_sysfs_unregister() become
totally symmetrical, and kobject's refcounter drops to zero just
when the hctx is freed.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c | 15 ++++++++++-----
 block/blk-mq.c       |  5 +----
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 77fb238af2be..cb19ec16a7fc 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -17,6 +17,14 @@ static void blk_mq_sysfs_release(struct kobject *kobj)
 {
 }
 
+static void blk_mq_hw_sysfs_release(struct kobject *kobj)
+{
+	struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
+						  kobj);
+	kfree(hctx->ctxs);
+	kfree(hctx);
+}
+
 struct blk_mq_ctx_sysfs_entry {
 	struct attribute attr;
 	ssize_t (*show)(struct blk_mq_ctx *, char *);
@@ -200,7 +208,7 @@ static struct kobj_type blk_mq_ctx_ktype = {
 static struct kobj_type blk_mq_hw_ktype = {
 	.sysfs_ops	= &blk_mq_hw_sysfs_ops,
 	.default_attrs	= default_hw_ctx_attrs,
-	.release	= blk_mq_sysfs_release,
+	.release	= blk_mq_hw_sysfs_release,
 };
 
 static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
@@ -244,12 +252,9 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
-	queue_for_each_hw_ctx(q, hctx, i) {
+	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_unregister_hctx(hctx);
 
-		kobject_put(&hctx->kobj);
-	}
-
 	blk_mq_debugfs_unregister_hctxs(q);
 
 	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b985c236f50f..f70595e5fb86 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2256,8 +2256,7 @@ void blk_mq_release(struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx)
 			continue;
-		kfree(hctx->ctxs);
-		kfree(hctx);
+		kobject_put(&hctx->kobj);
 	}
 
 	q->mq_map = NULL;
@@ -2336,8 +2335,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 			blk_mq_exit_hctx(q, set, hctx, j);
 			free_cpumask_var(hctx->cpumask);
 			kobject_put(&hctx->kobj);
-			kfree(hctx->ctxs);
-			kfree(hctx);
 			hctxs[j] = NULL;
 
 		}
-- 
cgit v1.2.3


From 01388df37627d2e89f0b835377c0eb39d81f671c Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Wed, 22 Feb 2017 18:14:02 +0800
Subject: blk-mq: free hctx->cpumask in release handler of hctx's kobject

It is obviously that hctx->cpumask is per hctx, and both
share same lifetime, so this patch moves freeing of hctx->cpumask
into release handler of hctx's kobject.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c |  1 +
 block/blk-mq.c       | 12 ------------
 2 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index cb19ec16a7fc..d745ab81033a 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -21,6 +21,7 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
 {
 	struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
 						  kobj);
+	free_cpumask_var(hctx->cpumask);
 	kfree(hctx->ctxs);
 	kfree(hctx);
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f70595e5fb86..159187a28d66 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1955,16 +1955,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 	}
 }
 
-static void blk_mq_free_hw_queues(struct request_queue *q,
-		struct blk_mq_tag_set *set)
-{
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		free_cpumask_var(hctx->cpumask);
-}
-
 static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -2333,7 +2323,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 			if (hctx->tags)
 				blk_mq_free_map_and_requests(set, j);
 			blk_mq_exit_hctx(q, set, hctx, j);
-			free_cpumask_var(hctx->cpumask);
 			kobject_put(&hctx->kobj);
 			hctxs[j] = NULL;
 
@@ -2446,7 +2435,6 @@ void blk_mq_free_queue(struct request_queue *q)
 	blk_mq_del_queue_tag_set(q);
 
 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
-	blk_mq_free_hw_queues(q, set);
 }
 
 /* Basically redo blk_mq_init_queue with queue frozen */
-- 
cgit v1.2.3


From b0bfdfc2bf7fa85317824c6a389fc373dfcef5bc Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Mon, 6 Mar 2017 08:41:04 -0700
Subject: block/sed: Fix opal user range check and unused variables

Fixes check that the opal user is within the range, and cleans up unused
method variables.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Reviewed-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/sed-opal.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 1e18dca360fc..14035f826b5e 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1023,7 +1023,6 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
 
 static int gen_key(struct opal_dev *dev, void *data)
 {
-	const u8 *method;
 	u8 uid[OPAL_UID_LENGTH];
 	int err = 0;
 
@@ -1031,7 +1030,6 @@ static int gen_key(struct opal_dev *dev, void *data)
 	set_comid(dev, dev->comid);
 
 	memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len));
-	method = opalmethod[OPAL_GENKEY];
 	kfree(dev->prev_data);
 	dev->prev_data = NULL;
 
@@ -1669,7 +1667,6 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
 static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
 {
 	u8 lr_buffer[OPAL_UID_LENGTH];
-	const u8 *method;
 	struct opal_lock_unlock *lkul = data;
 	u8 read_locked = 1, write_locked = 1;
 	int err = 0;
@@ -1677,7 +1674,6 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
 	clear_opal_cmd(dev);
 	set_comid(dev, dev->comid);
 
-	method = opalmethod[OPAL_SET];
 	if (build_locking_range(lr_buffer, sizeof(lr_buffer),
 				lkul->session.opal_key.lr) < 0)
 		return -ERANGE;
@@ -1733,14 +1729,12 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
 {
 	u8 lr_buffer[OPAL_UID_LENGTH];
 	u8 read_locked = 1, write_locked = 1;
-	const u8 *method;
 	struct opal_lock_unlock *lkul = data;
 	int ret;
 
 	clear_opal_cmd(dev);
 	set_comid(dev, dev->comid);
 
-	method = opalmethod[OPAL_SET];
 	if (build_locking_range(lr_buffer, sizeof(lr_buffer),
 				lkul->session.opal_key.lr) < 0)
 		return -ERANGE;
@@ -2133,7 +2127,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
 		pr_err("Locking state was not RO or RW\n");
 		return -EINVAL;
 	}
-	if (lk_unlk->session.who < OPAL_USER1 &&
+	if (lk_unlk->session.who < OPAL_USER1 ||
 	    lk_unlk->session.who > OPAL_USER9) {
 		pr_err("Authority was not within the range of users: %d\n",
 		       lk_unlk->session.who);
@@ -2316,7 +2310,7 @@ static int opal_activate_user(struct opal_dev *dev,
 	int ret;
 
 	/* We can't activate Admin1 it's active as manufactured */
-	if (opal_session->who < OPAL_USER1 &&
+	if (opal_session->who < OPAL_USER1 ||
 	    opal_session->who > OPAL_USER9) {
 		pr_err("Who was not a valid user: %d\n", opal_session->who);
 		return -EINVAL;
-- 
cgit v1.2.3


From 90f16fddcc2802726142b8386c65ccb89f044613 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Mar 2017 17:48:33 +0100
Subject: block: Make del_gendisk() safer for disks without queues

Commit 165a5e22fafb "block: Move bdi_unregister() to del_gendisk()"
added disk->queue dereference to del_gendisk(). Although del_gendisk()
is not supposed to be called without disk->queue valid and
blk_unregister_queue() warns in that case, this change will make it oops
instead. Return to the old more robust behavior of just warning when
del_gendisk() gets called for gendisk with disk->queue being NULL.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Tested-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index b26a5ea115d0..94f323842b52 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -681,12 +681,16 @@ void del_gendisk(struct gendisk *disk)
 	disk->flags &= ~GENHD_FL_UP;
 
 	sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
-	/*
-	 * Unregister bdi before releasing device numbers (as they can get
-	 * reused and we'd get clashes in sysfs).
-	 */
-	bdi_unregister(disk->queue->backing_dev_info);
-	blk_unregister_queue(disk);
+	if (disk->queue) {
+		/*
+		 * Unregister bdi before releasing device numbers (as they can
+		 * get reused and we'd get clashes in sysfs).
+		 */
+		bdi_unregister(disk->queue->backing_dev_info);
+		blk_unregister_queue(disk);
+	} else {
+		WARN_ON(1);
+	}
 	blk_unregister_region(disk_devt(disk), disk->minors);
 
 	part_stat_set_all(&disk->part0, 0);
-- 
cgit v1.2.3


From c01228db4ba965986511a5b28c478bddd7e2726e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Mar 2017 17:48:34 +0100
Subject: Revert "scsi, block: fix duplicate bdi name registration crashes"

This reverts commit 0dba1314d4f81115dce711292ec7981d17231064. It causes
leaking of device numbers for SCSI when SCSI registers multiple gendisks
for one request_queue in succession. It can be easily reproduced using
Omar's script [1] on kernel with CONFIG_DEBUG_TEST_DRIVER_REMOVE.
Furthermore the protection provided by this commit is not needed anymore
as the problem it was fixing got also fixed by commit 165a5e22fafb
"block: Move bdi_unregister() to del_gendisk()".

[1]: http://marc.info/?l=linux-block&m=148554717109098&w=2

Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  2 --
 block/genhd.c          | 21 ---------------------
 drivers/scsi/sd.c      | 41 ++++++++---------------------------------
 include/linux/blkdev.h |  1 -
 include/linux/genhd.h  |  8 --------
 5 files changed, 8 insertions(+), 65 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 1086dac8724c..a76895c9776d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -578,8 +578,6 @@ void blk_cleanup_queue(struct request_queue *q)
 		q->queue_lock = &q->__queue_lock;
 	spin_unlock_irq(lock);
 
-	put_disk_devt(q->disk_devt);
-
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
 }
diff --git a/block/genhd.c b/block/genhd.c
index 94f323842b52..a9c516a8b37d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -572,20 +572,6 @@ exit:
 	disk_part_iter_exit(&piter);
 }
 
-void put_disk_devt(struct disk_devt *disk_devt)
-{
-	if (disk_devt && atomic_dec_and_test(&disk_devt->count))
-		disk_devt->release(disk_devt);
-}
-EXPORT_SYMBOL(put_disk_devt);
-
-void get_disk_devt(struct disk_devt *disk_devt)
-{
-	if (disk_devt)
-		atomic_inc(&disk_devt->count);
-}
-EXPORT_SYMBOL(get_disk_devt);
-
 /**
  * device_add_disk - add partitioning information to kernel list
  * @parent: parent device for the disk
@@ -626,13 +612,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
 
 	disk_alloc_events(disk);
 
-	/*
-	 * Take a reference on the devt and assign it to queue since it
-	 * must not be reallocated while the bdi is registered
-	 */
-	disk->queue->disk_devt = disk->disk_devt;
-	get_disk_devt(disk->disk_devt);
-
 	/* Register BDI before referencing it from bdev */
 	bdi = disk->queue->backing_dev_info;
 	bdi_register_owner(bdi, disk_to_dev(disk));
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index c7839f6c35cc..d277e8620e3e 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3075,23 +3075,6 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
 	put_device(&sdkp->dev);
 }
 
-struct sd_devt {
-	int idx;
-	struct disk_devt disk_devt;
-};
-
-static void sd_devt_release(struct disk_devt *disk_devt)
-{
-	struct sd_devt *sd_devt = container_of(disk_devt, struct sd_devt,
-			disk_devt);
-
-	spin_lock(&sd_index_lock);
-	ida_remove(&sd_index_ida, sd_devt->idx);
-	spin_unlock(&sd_index_lock);
-
-	kfree(sd_devt);
-}
-
 /**
  *	sd_probe - called during driver initialization and whenever a
  *	new scsi device is attached to the system. It is called once
@@ -3113,7 +3096,6 @@ static void sd_devt_release(struct disk_devt *disk_devt)
 static int sd_probe(struct device *dev)
 {
 	struct scsi_device *sdp = to_scsi_device(dev);
-	struct sd_devt *sd_devt;
 	struct scsi_disk *sdkp;
 	struct gendisk *gd;
 	int index;
@@ -3139,13 +3121,9 @@ static int sd_probe(struct device *dev)
 	if (!sdkp)
 		goto out;
 
-	sd_devt = kzalloc(sizeof(*sd_devt), GFP_KERNEL);
-	if (!sd_devt)
-		goto out_free;
-
 	gd = alloc_disk(SD_MINORS);
 	if (!gd)
-		goto out_free_devt;
+		goto out_free;
 
 	do {
 		if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
@@ -3161,11 +3139,6 @@ static int sd_probe(struct device *dev)
 		goto out_put;
 	}
 
-	atomic_set(&sd_devt->disk_devt.count, 1);
-	sd_devt->disk_devt.release = sd_devt_release;
-	sd_devt->idx = index;
-	gd->disk_devt = &sd_devt->disk_devt;
-
 	error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
 	if (error) {
 		sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n");
@@ -3205,12 +3178,11 @@ static int sd_probe(struct device *dev)
 	return 0;
 
  out_free_index:
-	put_disk_devt(&sd_devt->disk_devt);
-	sd_devt = NULL;
+	spin_lock(&sd_index_lock);
+	ida_remove(&sd_index_ida, index);
+	spin_unlock(&sd_index_lock);
  out_put:
 	put_disk(gd);
- out_free_devt:
-	kfree(sd_devt);
  out_free:
 	kfree(sdkp);
  out:
@@ -3271,7 +3243,10 @@ static void scsi_disk_release(struct device *dev)
 	struct scsi_disk *sdkp = to_scsi_disk(dev);
 	struct gendisk *disk = sdkp->disk;
 	
-	put_disk_devt(disk->disk_devt);
+	spin_lock(&sd_index_lock);
+	ida_remove(&sd_index_ida, sdkp->index);
+	spin_unlock(&sd_index_lock);
+
 	disk->private_data = NULL;
 	put_disk(disk);
 	put_device(&sdkp->device->sdev_gendev);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 796016e63c1d..5a7da607ca04 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -435,7 +435,6 @@ struct request_queue {
 	struct delayed_work	delay_work;
 
 	struct backing_dev_info	*backing_dev_info;
-	struct disk_devt	*disk_devt;
 
 	/*
 	 * The queue owner gets to use this for whatever they like.
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a999d281a2f1..76f39754e7b0 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -167,13 +167,6 @@ struct blk_integrity {
 };
 
 #endif	/* CONFIG_BLK_DEV_INTEGRITY */
-struct disk_devt {
-	atomic_t count;
-	void (*release)(struct disk_devt *disk_devt);
-};
-
-void put_disk_devt(struct disk_devt *disk_devt);
-void get_disk_devt(struct disk_devt *disk_devt);
 
 struct gendisk {
 	/* major, first_minor and minors are input parameters only,
@@ -183,7 +176,6 @@ struct gendisk {
 	int first_minor;
 	int minors;                     /* maximum number of minors, =1 for
                                          * disks that can't be partitioned. */
-	struct disk_devt *disk_devt;
 
 	char disk_name[DISK_NAME_LEN];	/* name of major driver */
 	char *(*devnode)(struct gendisk *gd, umode_t *mode);
-- 
cgit v1.2.3


From 79bd99596b7305ab08109a8bf44a6a4511dbf1cd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 8 Mar 2017 07:38:05 +1100
Subject: blk: improve order of bio handling in generic_make_request()

To avoid recursion on the kernel stack when stacked block devices
are in use, generic_make_request() will, when called recursively,
queue new requests for later handling.  They will be handled when the
make_request_fn for the current bio completes.

If any bios are submitted by a make_request_fn, these will ultimately
be handled seqeuntially.  If the handling of one of those generates
further requests, they will be added to the end of the queue.

This strict first-in-first-out behaviour can lead to deadlocks in
various ways, normally because a request might need to wait for a
previous request to the same device to complete.  This can happen when
they share a mempool, and can happen due to interdependencies
particular to the device.  Both md and dm have examples where this happens.

These deadlocks can be erradicated by more selective ordering of bios.
Specifically by handling them in depth-first order.  That is: when the
handling of one bio generates one or more further bios, they are
handled immediately after the parent, before any siblings of the
parent.  That way, when generic_make_request() calls make_request_fn
for some particular device, we can be certain that all previously
submited requests for that device have been completely handled and are
not waiting for anything in the queue of requests maintained in
generic_make_request().

An easy way to achieve this would be to use a last-in-first-out stack
instead of a queue.  However this will change the order of consecutive
bios submitted by a make_request_fn, which could have unexpected consequences.
Instead we take a slightly more complex approach.
A fresh queue is created for each call to a make_request_fn.  After it completes,
any bios for a different device are placed on the front of the main queue, followed
by any bios for the same device, followed by all bios that were already on
the queue before the make_request_fn was called.
This provides the depth-first approach without reordering bios on the same level.

This, by itself, it not enough to remove all deadlocks.  It just makes
it possible for drivers to take the extra step required themselves.

To avoid deadlocks, drivers must never risk waiting for a request
after submitting one to generic_make_request.  This includes never
allocing from a mempool twice in the one call to a make_request_fn.

A common pattern in drivers is to call bio_split() in a loop, handling
the first part and then looping around to possibly split the next part.
Instead, a driver that finds it needs to split a bio should queue
(with generic_make_request) the second part, handle the first part,
and then return.  The new code in generic_make_request will ensure the
requests to underlying bios are processed first, then the second bio
that was split off.  If it splits again, the same process happens.  In
each case one bio will be completely handled before the next one is attempted.

With this is place, it should be possible to disable the
punt_bios_to_recover() recovery thread for many block devices, and
eventually it may be possible to remove it completely.

Ref: http://www.spinics.net/lists/raid/msg54680.html
Tested-by: Jinpu Wang <jinpu.wang@profitbricks.com>
Inspired-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index a76895c9776d..0eeb99ef654f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2015,17 +2015,34 @@ blk_qc_t generic_make_request(struct bio *bio)
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
 		if (likely(blk_queue_enter(q, false) == 0)) {
+			struct bio_list hold;
+			struct bio_list lower, same;
+
+			/* Create a fresh bio_list for all subordinate requests */
+			hold = bio_list_on_stack;
+			bio_list_init(&bio_list_on_stack);
 			ret = q->make_request_fn(q, bio);
 
 			blk_queue_exit(q);
 
-			bio = bio_list_pop(current->bio_list);
+			/* sort new bios into those for a lower level
+			 * and those for the same level
+			 */
+			bio_list_init(&lower);
+			bio_list_init(&same);
+			while ((bio = bio_list_pop(&bio_list_on_stack)) != NULL)
+				if (q == bdev_get_queue(bio->bi_bdev))
+					bio_list_add(&same, bio);
+				else
+					bio_list_add(&lower, bio);
+			/* now assemble so we handle the lowest level first */
+			bio_list_merge(&bio_list_on_stack, &lower);
+			bio_list_merge(&bio_list_on_stack, &same);
+			bio_list_merge(&bio_list_on_stack, &hold);
 		} else {
-			struct bio *bio_next = bio_list_pop(current->bio_list);
-
 			bio_io_error(bio);
-			bio = bio_next;
 		}
+		bio = bio_list_pop(current->bio_list);
 	} while (bio);
 	current->bio_list = NULL; /* deactivate */
 
-- 
cgit v1.2.3


From f5fe1b51905df7cfe4fdfd85c5fb7bc5b71a094f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 10 Mar 2017 17:00:47 +1100
Subject: blk: Ensure users for current->bio_list can see the full list.

Commit 79bd99596b73 ("blk: improve order of bio handling in generic_make_request()")
changed current->bio_list so that it did not contain *all* of the
queued bios, but only those submitted by the currently running
make_request_fn.

There are two places which walk the list and requeue selected bios,
and others that check if the list is empty.  These are no longer
correct.

So redefine current->bio_list to point to an array of two lists, which
contain all queued bios, and adjust various code to test or walk both
lists.

Signed-off-by: NeilBrown <neilb@suse.com>
Fixes: 79bd99596b73 ("blk: improve order of bio handling in generic_make_request()")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c         | 12 +++++++++---
 block/blk-core.c    | 30 ++++++++++++++++++------------
 drivers/md/dm.c     | 29 ++++++++++++++++-------------
 drivers/md/raid10.c |  3 ++-
 4 files changed, 45 insertions(+), 29 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index 5eec5e08417f..e75878f8b14a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -376,10 +376,14 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
 	bio_list_init(&punt);
 	bio_list_init(&nopunt);
 
-	while ((bio = bio_list_pop(current->bio_list)))
+	while ((bio = bio_list_pop(&current->bio_list[0])))
 		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+	current->bio_list[0] = nopunt;
 
-	*current->bio_list = nopunt;
+	bio_list_init(&nopunt);
+	while ((bio = bio_list_pop(&current->bio_list[1])))
+		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+	current->bio_list[1] = nopunt;
 
 	spin_lock(&bs->rescue_lock);
 	bio_list_merge(&bs->rescue_list, &punt);
@@ -466,7 +470,9 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		 * we retry with the original gfp_flags.
 		 */
 
-		if (current->bio_list && !bio_list_empty(current->bio_list))
+		if (current->bio_list &&
+		    (!bio_list_empty(&current->bio_list[0]) ||
+		     !bio_list_empty(&current->bio_list[1])))
 			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
diff --git a/block/blk-core.c b/block/blk-core.c
index 0eeb99ef654f..d772c221cc17 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1973,7 +1973,14 @@ end_io:
  */
 blk_qc_t generic_make_request(struct bio *bio)
 {
-	struct bio_list bio_list_on_stack;
+	/*
+	 * bio_list_on_stack[0] contains bios submitted by the current
+	 * make_request_fn.
+	 * bio_list_on_stack[1] contains bios that were submitted before
+	 * the current make_request_fn, but that haven't been processed
+	 * yet.
+	 */
+	struct bio_list bio_list_on_stack[2];
 	blk_qc_t ret = BLK_QC_T_NONE;
 
 	if (!generic_make_request_checks(bio))
@@ -1990,7 +1997,7 @@ blk_qc_t generic_make_request(struct bio *bio)
 	 * should be added at the tail
 	 */
 	if (current->bio_list) {
-		bio_list_add(current->bio_list, bio);
+		bio_list_add(&current->bio_list[0], bio);
 		goto out;
 	}
 
@@ -2009,18 +2016,17 @@ blk_qc_t generic_make_request(struct bio *bio)
 	 * bio_list, and call into ->make_request() again.
 	 */
 	BUG_ON(bio->bi_next);
-	bio_list_init(&bio_list_on_stack);
-	current->bio_list = &bio_list_on_stack;
+	bio_list_init(&bio_list_on_stack[0]);
+	current->bio_list = bio_list_on_stack;
 	do {
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
 		if (likely(blk_queue_enter(q, false) == 0)) {
-			struct bio_list hold;
 			struct bio_list lower, same;
 
 			/* Create a fresh bio_list for all subordinate requests */
-			hold = bio_list_on_stack;
-			bio_list_init(&bio_list_on_stack);
+			bio_list_on_stack[1] = bio_list_on_stack[0];
+			bio_list_init(&bio_list_on_stack[0]);
 			ret = q->make_request_fn(q, bio);
 
 			blk_queue_exit(q);
@@ -2030,19 +2036,19 @@ blk_qc_t generic_make_request(struct bio *bio)
 			 */
 			bio_list_init(&lower);
 			bio_list_init(&same);
-			while ((bio = bio_list_pop(&bio_list_on_stack)) != NULL)
+			while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
 				if (q == bdev_get_queue(bio->bi_bdev))
 					bio_list_add(&same, bio);
 				else
 					bio_list_add(&lower, bio);
 			/* now assemble so we handle the lowest level first */
-			bio_list_merge(&bio_list_on_stack, &lower);
-			bio_list_merge(&bio_list_on_stack, &same);
-			bio_list_merge(&bio_list_on_stack, &hold);
+			bio_list_merge(&bio_list_on_stack[0], &lower);
+			bio_list_merge(&bio_list_on_stack[0], &same);
+			bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
 		} else {
 			bio_io_error(bio);
 		}
-		bio = bio_list_pop(current->bio_list);
+		bio = bio_list_pop(&bio_list_on_stack[0]);
 	} while (bio);
 	current->bio_list = NULL; /* deactivate */
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f4ffd1eb8f44..dfb75979e455 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -989,26 +989,29 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
 	struct dm_offload *o = container_of(cb, struct dm_offload, cb);
 	struct bio_list list;
 	struct bio *bio;
+	int i;
 
 	INIT_LIST_HEAD(&o->cb.list);
 
 	if (unlikely(!current->bio_list))
 		return;
 
-	list = *current->bio_list;
-	bio_list_init(current->bio_list);
-
-	while ((bio = bio_list_pop(&list))) {
-		struct bio_set *bs = bio->bi_pool;
-		if (unlikely(!bs) || bs == fs_bio_set) {
-			bio_list_add(current->bio_list, bio);
-			continue;
+	for (i = 0; i < 2; i++) {
+		list = current->bio_list[i];
+		bio_list_init(&current->bio_list[i]);
+
+		while ((bio = bio_list_pop(&list))) {
+			struct bio_set *bs = bio->bi_pool;
+			if (unlikely(!bs) || bs == fs_bio_set) {
+				bio_list_add(&current->bio_list[i], bio);
+				continue;
+			}
+
+			spin_lock(&bs->rescue_lock);
+			bio_list_add(&bs->rescue_list, bio);
+			queue_work(bs->rescue_workqueue, &bs->rescue_work);
+			spin_unlock(&bs->rescue_lock);
 		}
-
-		spin_lock(&bs->rescue_lock);
-		bio_list_add(&bs->rescue_list, bio);
-		queue_work(bs->rescue_workqueue, &bs->rescue_work);
-		spin_unlock(&bs->rescue_lock);
 	}
 }
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 063c43d83b72..0536658c9d40 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -974,7 +974,8 @@ static void wait_barrier(struct r10conf *conf)
 				    !conf->barrier ||
 				    (atomic_read(&conf->nr_pending) &&
 				     current->bio_list &&
-				     !bio_list_empty(current->bio_list)),
+				     (!bio_list_empty(&current->bio_list[0]) ||
+				      !bio_list_empty(&current->bio_list[1]))),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
-- 
cgit v1.2.3


From 0067d4b020ea07a58540acb2c5fcd3364bf326e0 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 13 Mar 2017 16:10:11 +0200
Subject: blk-mq: Fix tagset reinit in the presence of cpu hot-unplug

In case cpu was unplugged, we need to make sure not to assume
that the tags for that cpu are still allocated. so check
for null tags when reinitializing a tagset.

Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'block')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e48bc2c72615..9d97bfc4d465 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -295,6 +295,9 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
 	for (i = 0; i < set->nr_hw_queues; i++) {
 		struct blk_mq_tags *tags = set->tags[i];
 
+		if (!tags)
+			continue;
+
 		for (j = 0; j < tags->nr_tags; j++) {
 			if (!tags->static_rqs[j])
 				continue;
-- 
cgit v1.2.3


From 9c62110454b088b4914ffe375c2dbc19643eac34 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 14 Mar 2017 11:51:59 -0600
Subject: blk-mq-sched: don't run the queue async from
 blk_mq_try_issue_directly()

If we have scheduling enabled, we jump directly to insert-and-run.
That's fine, but we run the queue async and we don't pass in information
on whether we can block from this context or not. Fixup both these
cases.

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 159187a28d66..a4546f060e80 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1434,7 +1434,8 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
 }
 
-static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
+static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
+				      bool may_sleep)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_queue_data bd = {
@@ -1475,7 +1476,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 	}
 
 insert:
-	blk_mq_sched_insert_request(rq, false, true, true, false);
+	blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
 }
 
 /*
@@ -1569,11 +1570,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 		if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
 			rcu_read_lock();
-			blk_mq_try_issue_directly(old_rq, &cookie);
+			blk_mq_try_issue_directly(old_rq, &cookie, false);
 			rcu_read_unlock();
 		} else {
 			srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
-			blk_mq_try_issue_directly(old_rq, &cookie);
+			blk_mq_try_issue_directly(old_rq, &cookie, true);
 			srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
 		}
 		goto done;
-- 
cgit v1.2.3


From efd4b81abbe1ac753717f2f10cd3dab8bed6c103 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Thu, 16 Mar 2017 09:46:14 -0600
Subject: blk-stat: fix blk_stat_sum() if all samples are batched

We need to flush the batch _before_ we check the number of samples,
otherwise we'll miss all of the batched samples.

Fixes: cf43e6b ("block: add scalable completion tracking of requests")
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-stat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-stat.c b/block/blk-stat.c
index 9b43efb8933f..186fcb981e9b 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -30,11 +30,11 @@ static void blk_stat_flush_batch(struct blk_rq_stat *stat)
 
 static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 {
+	blk_stat_flush_batch(src);
+
 	if (!src->nr_samples)
 		return;
 
-	blk_stat_flush_batch(src);
-
 	dst->min = min(dst->min, src->min);
 	dst->max = max(dst->max, src->max);
 
-- 
cgit v1.2.3


From 95a49603707d982b25d17c5b70e220a05556a2f9 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Wed, 22 Mar 2017 10:14:43 +0800
Subject: blk-mq: don't complete un-started request in timeout handler

When iterating busy requests in timeout handler,
if the STARTED flag of one request isn't set, that means
the request is being processed in block layer or driver, and
isn't submitted to hardware yet.

In current implementation of blk_mq_check_expired(),
if the request queue becomes dying, un-started requests are
handled as being completed/freed immediately. This way is
wrong, and can cause rq corruption or double allocation[1][2],
when doing I/O and removing&resetting NVMe device at the sametime.

This patch fixes several issues reported by Yi Zhang.

[1]. oops log 1
[  581.789754] ------------[ cut here ]------------
[  581.789758] kernel BUG at block/blk-mq.c:374!
[  581.789760] invalid opcode: 0000 [#1] SMP
[  581.789761] Modules linked in: vfat fat ipmi_ssif intel_rapl sb_edac
edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm nvme
irqbypass crct10dif_pclmul nvme_core crc32_pclmul ghash_clmulni_intel
intel_cstate ipmi_si mei_me ipmi_devintf intel_uncore sg ipmi_msghandler
intel_rapl_perf iTCO_wdt mei iTCO_vendor_support mxm_wmi lpc_ich dcdbas shpchp
pcspkr acpi_power_meter wmi nfsd auth_rpcgss nfs_acl lockd dm_multipath grace
sunrpc ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper
syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm ahci libahci
crc32c_intel tg3 libata megaraid_sas i2c_core ptp fjes pps_core dm_mirror
dm_region_hash dm_log dm_mod
[  581.789796] CPU: 1 PID: 1617 Comm: kworker/1:1H Not tainted 4.10.0.bz1420297+ #4
[  581.789797] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.2.5 09/06/2016
[  581.789804] Workqueue: kblockd blk_mq_timeout_work
[  581.789806] task: ffff8804721c8000 task.stack: ffffc90006ee4000
[  581.789809] RIP: 0010:blk_mq_end_request+0x58/0x70
[  581.789810] RSP: 0018:ffffc90006ee7d50 EFLAGS: 00010202
[  581.789811] RAX: 0000000000000001 RBX: ffff8802e4195340 RCX: ffff88028e2f4b88
[  581.789812] RDX: 0000000000001000 RSI: 0000000000001000 RDI: 0000000000000000
[  581.789813] RBP: ffffc90006ee7d60 R08: 0000000000000003 R09: ffff88028e2f4b00
[  581.789814] R10: 0000000000001000 R11: 0000000000000001 R12: 00000000fffffffb
[  581.789815] R13: ffff88042abe5780 R14: 000000000000002d R15: ffff88046fbdff80
[  581.789817] FS:  0000000000000000(0000) GS:ffff88047fc00000(0000) knlGS:0000000000000000
[  581.789818] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  581.789819] CR2: 00007f64f403a008 CR3: 000000014d078000 CR4: 00000000001406e0
[  581.789820] Call Trace:
[  581.789825]  blk_mq_check_expired+0x76/0x80
[  581.789828]  bt_iter+0x45/0x50
[  581.789830]  blk_mq_queue_tag_busy_iter+0xdd/0x1f0
[  581.789832]  ? blk_mq_rq_timed_out+0x70/0x70
[  581.789833]  ? blk_mq_rq_timed_out+0x70/0x70
[  581.789840]  ? __switch_to+0x140/0x450
[  581.789841]  blk_mq_timeout_work+0x88/0x170
[  581.789845]  process_one_work+0x165/0x410
[  581.789847]  worker_thread+0x137/0x4c0
[  581.789851]  kthread+0x101/0x140
[  581.789853]  ? rescuer_thread+0x3b0/0x3b0
[  581.789855]  ? kthread_park+0x90/0x90
[  581.789860]  ret_from_fork+0x2c/0x40
[  581.789861] Code: 48 85 c0 74 0d 44 89 e6 48 89 df ff d0 5b 41 5c 5d c3 48
8b bb 70 01 00 00 48 85 ff 75 0f 48 89 df e8 7d f0 ff ff 5b 41 5c 5d c3 <0f>
0b e8 71 f0 ff ff 90 eb e9 0f 1f 40 00 66 2e 0f 1f 84 00 00
[  581.789882] RIP: blk_mq_end_request+0x58/0x70 RSP: ffffc90006ee7d50
[  581.789889] ---[ end trace bcaf03d9a14a0a70 ]---

[2]. oops log2
[ 6984.857362] BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
[ 6984.857372] IP: nvme_queue_rq+0x6e6/0x8cd [nvme]
[ 6984.857373] PGD 0
[ 6984.857374]
[ 6984.857376] Oops: 0000 [#1] SMP
[ 6984.857379] Modules linked in: ipmi_ssif vfat fat intel_rapl sb_edac
edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm
irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel ipmi_si iTCO_wdt
iTCO_vendor_support mxm_wmi ipmi_devintf intel_cstate sg dcdbas intel_uncore
mei_me intel_rapl_perf mei pcspkr lpc_ich ipmi_msghandler shpchp
acpi_power_meter wmi nfsd auth_rpcgss dm_multipath nfs_acl lockd grace sunrpc
ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper syscopyarea
sysfillrect crc32c_intel sysimgblt fb_sys_fops ttm nvme drm nvme_core ahci
libahci i2c_core tg3 libata ptp megaraid_sas pps_core fjes dm_mirror
dm_region_hash dm_log dm_mod
[ 6984.857416] CPU: 7 PID: 1635 Comm: kworker/7:1H Not tainted
4.10.0-2.el7.bz1420297.x86_64 #1
[ 6984.857417] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.2.5 09/06/2016
[ 6984.857427] Workqueue: kblockd blk_mq_run_work_fn
[ 6984.857429] task: ffff880476e3da00 task.stack: ffffc90002e90000
[ 6984.857432] RIP: 0010:nvme_queue_rq+0x6e6/0x8cd [nvme]
[ 6984.857433] RSP: 0018:ffffc90002e93c50 EFLAGS: 00010246
[ 6984.857434] RAX: 0000000000000000 RBX: ffff880275646600 RCX: 0000000000001000
[ 6984.857435] RDX: 0000000000000fff RSI: 00000002fba2a000 RDI: ffff8804734e6950
[ 6984.857436] RBP: ffffc90002e93d30 R08: 0000000000002000 R09: 0000000000001000
[ 6984.857437] R10: 0000000000001000 R11: 0000000000000000 R12: ffff8804741d8000
[ 6984.857438] R13: 0000000000000040 R14: ffff880475649f80 R15: ffff8804734e6780
[ 6984.857439] FS:  0000000000000000(0000) GS:ffff88047fcc0000(0000) knlGS:0000000000000000
[ 6984.857440] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6984.857442] CR2: 0000000000000010 CR3: 0000000001c09000 CR4: 00000000001406e0
[ 6984.857443] Call Trace:
[ 6984.857451]  ? mempool_free+0x2b/0x80
[ 6984.857455]  ? bio_free+0x4e/0x60
[ 6984.857459]  blk_mq_dispatch_rq_list+0xf5/0x230
[ 6984.857462]  blk_mq_process_rq_list+0x133/0x170
[ 6984.857465]  __blk_mq_run_hw_queue+0x8c/0xa0
[ 6984.857467]  blk_mq_run_work_fn+0x12/0x20
[ 6984.857473]  process_one_work+0x165/0x410
[ 6984.857475]  worker_thread+0x137/0x4c0
[ 6984.857478]  kthread+0x101/0x140
[ 6984.857480]  ? rescuer_thread+0x3b0/0x3b0
[ 6984.857481]  ? kthread_park+0x90/0x90
[ 6984.857489]  ret_from_fork+0x2c/0x40
[ 6984.857490] Code: 8b bd 70 ff ff ff 89 95 50 ff ff ff 89 8d 58 ff ff ff 44
89 95 60 ff ff ff e8 b7 dd 12 e1 8b 95 50 ff ff ff 48 89 85 68 ff ff ff <4c>
8b 48 10 44 8b 58 18 8b 8d 58 ff ff ff 44 8b 95 60 ff ff ff
[ 6984.857511] RIP: nvme_queue_rq+0x6e6/0x8cd [nvme] RSP: ffffc90002e93c50
[ 6984.857512] CR2: 0000000000000010
[ 6984.895359] ---[ end trace 2d7ceb528432bf83 ]---

Cc: stable@vger.kernel.org
Reported-by: Yi Zhang <yizhan@redhat.com>
Tested-by: Yi Zhang <yizhan@redhat.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a4546f060e80..08a49c69738b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -697,17 +697,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 {
 	struct blk_mq_timeout_data *data = priv;
 
-	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
-		/*
-		 * If a request wasn't started before the queue was
-		 * marked dying, kill it here or it'll go unnoticed.
-		 */
-		if (unlikely(blk_queue_dying(rq->q))) {
-			rq->errors = -EIO;
-			blk_mq_end_request(rq, rq->errors);
-		}
+	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 		return;
-	}
 
 	if (time_after_eq(jiffies, rq->deadline)) {
 		if (!blk_mark_rq_complete(rq))
-- 
cgit v1.2.3