From dd840087086f3b93ac20f7472b4fca59aff7b79f Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Fri, 15 Aug 2014 23:16:32 +0800
Subject: blk-mq: fix WARNING "percpu_ref_kill() called more than once!"

Before doing queue release, the queue has been freezed already
by blk_cleanup_queue(), so needn't to freeze queue for deleting
tag set.

This patch fixes the WARNING of "percpu_ref_kill() called more than once!"
which is triggered during unloading block driver.

Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5189cb1e478a..ac8a0413664e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1713,14 +1713,10 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 
-	blk_mq_freeze_queue(q);
-
 	mutex_lock(&set->tag_list_lock);
 	list_del_init(&q->tag_set_list);
 	blk_mq_update_tag_set_depth(set);
 	mutex_unlock(&set->tag_list_lock);
-
-	blk_mq_unfreeze_queue(q);
 }
 
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
-- 
cgit v1.2.3


From 274a5843ff2f08a89464589d90c64eb65f2c0847 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 15 Aug 2014 12:44:08 -0600
Subject: blk-mq: don't allow merges if turned off for the queue

blk-mq uses BLK_MQ_F_SHOULD_MERGE, as set by the driver at init time,
to determine whether it should merge IO or not. However, this could
also be disabled by the admin, if merging is switched off through
sysfs. So check the general queue state as well before attempting
to merge IO.

Reported-by: Rob Elliott <Elliott@hp.com>
Tested-by: Rob Elliott <Elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ac8a0413664e..c9e89a8792e3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1068,13 +1068,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 		blk_account_io_start(rq, 1);
 }
 
+static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
+{
+	return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
+		!blk_queue_nomerges(hctx->queue);
+}
+
 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
 					 struct blk_mq_ctx *ctx,
 					 struct request *rq, struct bio *bio)
 {
-	struct request_queue *q = hctx->queue;
-
-	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
+	if (!hctx_allow_merges(hctx)) {
 		blk_mq_bio_to_request(rq, bio);
 		spin_lock(&ctx->lock);
 insert_rq:
@@ -1082,6 +1086,8 @@ insert_rq:
 		spin_unlock(&ctx->lock);
 		return false;
 	} else {
+		struct request_queue *q = hctx->queue;
+
 		spin_lock(&ctx->lock);
 		if (!blk_mq_attempt_merge(q, ctx, bio)) {
 			blk_mq_bio_to_request(rq, bio);
-- 
cgit v1.2.3


From 16f408dc6b1c87f5e3a767626df09c1399c6bf70 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagig@mellanox.com>
Date: Wed, 13 Aug 2014 14:49:31 +0300
Subject: block: Fix BUG_ON when pi errors occur

When getting a pi error we get to bio_integrity_end_io with
bi_remaining already decremented to 0 where we will eventually
need to call bio_endio with restored original bio completion handler.
Calling bio_endio invokes a BUG_ON(). We should call bio_endio_nodec
instead, like what is done in bio_integrity_verify_fn.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index bc423f7b02da..f14b4abbebd8 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -520,7 +520,7 @@ void bio_integrity_endio(struct bio *bio, int error)
 	 */
 	if (error) {
 		bio->bi_end_io = bip->bip_end_io;
-		bio_endio(bio, error);
+		bio_endio_nodec(bio, error);
 
 		return;
 	}
-- 
cgit v1.2.3


From a68aafa5b297d99c2d0c38689089a752126e9e79 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 15 Aug 2014 13:19:15 -0600
Subject: blk-mq: correct a few wrong/bad comments

Just grammar or spelling errors, nothing major.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c9e89a8792e3..a0565bb20fd5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1580,7 +1580,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		hctx->tags = set->tags[i];
 
 		/*
-		 * Allocate space for all possible cpus to avoid allocation in
+		 * Allocate space for all possible cpus to avoid allocation at
 		 * runtime
 		 */
 		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
@@ -1668,8 +1668,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		/*
-		 * If not software queues are mapped to this hardware queue,
-		 * disable it and free the request entries
+		 * If no software queues are mapped to this hardware queue,
+		 * disable it and free the request entries.
 		 */
 		if (!hctx->nr_ctx) {
 			struct blk_mq_tag_set *set = q->tag_set;
-- 
cgit v1.2.3


From cddd5d17642cc6881352732693c2ae6930e9ce65 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 16 Aug 2014 08:02:24 -0400
Subject: blk-mq: blk_mq_freeze_queue() should allow nesting

While converting to percpu_ref for freezing, add703fda981 ("blk-mq:
use percpu_ref for mq usage count") incorrectly made
blk_mq_freeze_queue() misbehave when freezing is nested due to
percpu_ref_kill() being invoked on an already killed ref.

Fix it by making blk_mq_freeze_queue() kill and kick the queue only
for the outermost freeze attempt.  All the nested ones can simply wait
for the ref to reach zero.

While at it, remove unnecessary @wake initialization from
blk_mq_unfreeze_queue().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a0565bb20fd5..7950f8d7c1bb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -112,18 +112,22 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
  */
 void blk_mq_freeze_queue(struct request_queue *q)
 {
+	bool freeze;
+
 	spin_lock_irq(q->queue_lock);
-	q->mq_freeze_depth++;
+	freeze = !q->mq_freeze_depth++;
 	spin_unlock_irq(q->queue_lock);
 
-	percpu_ref_kill(&q->mq_usage_counter);
-	blk_mq_run_queues(q, false);
+	if (freeze) {
+		percpu_ref_kill(&q->mq_usage_counter);
+		blk_mq_run_queues(q, false);
+	}
 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 }
 
 static void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake = false;
+	bool wake;
 
 	spin_lock_irq(q->queue_lock);
 	wake = !--q->mq_freeze_depth;
-- 
cgit v1.2.3


From 2cada584b20007470931080c49310d85908449b0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 20:38:27 -0500
Subject: block: cleanup error handling in sg_io

Make sure we always clean up through the out label and just have
a single place to put the request.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/scsi_ioctl.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 51bf5155ee75..ba8706f6cded 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -279,7 +279,6 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
 	r = blk_rq_unmap_user(bio);
 	if (!ret)
 		ret = r;
-	blk_put_request(rq);
 
 	return ret;
 }
@@ -322,10 +321,9 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 		return -ENOMEM;
 	blk_rq_set_block_pc(rq);
 
-	if (blk_fill_sghdr_rq(q, rq, hdr, mode)) {
-		blk_put_request(rq);
-		return -EFAULT;
-	}
+	ret = -EFAULT;
+	if (blk_fill_sghdr_rq(q, rq, hdr, mode))
+		goto out;
 
 	if (hdr->iovec_count) {
 		size_t iov_data_len;
@@ -376,7 +374,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 
 	hdr->duration = jiffies_to_msecs(jiffies - start_time);
 
-	return blk_complete_sghdr_rq(rq, hdr, bio);
+	ret = blk_complete_sghdr_rq(rq, hdr, bio);
 out:
 	blk_put_request(rq);
 	return ret;
-- 
cgit v1.2.3


From a57821cac6bb6e46abea118e34d0e86444ec1410 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 21 Aug 2014 20:39:53 -0500
Subject: block: support > 16 byte CDBs for SG_IO

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Boaz Harrosh <boaz@plexistor.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/scsi_ioctl.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index ba8706f6cded..1d78e6cf9d61 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -296,8 +296,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 
 	if (hdr->interface_id != 'S')
 		return -EINVAL;
-	if (hdr->cmd_len > BLK_MAX_CDB)
-		return -EINVAL;
 
 	if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
 		return -EIO;
@@ -316,14 +314,21 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	if (hdr->flags & SG_FLAG_Q_AT_HEAD)
 		at_head = 1;
 
+	ret = -ENOMEM;
 	rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
 	if (!rq)
-		return -ENOMEM;
+		goto out;
 	blk_rq_set_block_pc(rq);
 
+	if (hdr->cmd_len > BLK_MAX_CDB) {
+		rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
+		if (!rq->cmd)
+			goto out_put_request;
+	}
+
 	ret = -EFAULT;
 	if (blk_fill_sghdr_rq(q, rq, hdr, mode))
-		goto out;
+		goto out_free_cdb;
 
 	if (hdr->iovec_count) {
 		size_t iov_data_len;
@@ -333,7 +338,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 					    0, NULL, &iov);
 		if (ret < 0) {
 			kfree(iov);
-			goto out;
+			goto out_free_cdb;
 		}
 
 		iov_data_len = ret;
@@ -356,7 +361,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 				      GFP_KERNEL);
 
 	if (ret)
-		goto out;
+		goto out_free_cdb;
 
 	bio = rq->bio;
 	memset(sense, 0, sizeof(sense));
@@ -375,8 +380,13 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	hdr->duration = jiffies_to_msecs(jiffies - start_time);
 
 	ret = blk_complete_sghdr_rq(rq, hdr, bio);
-out:
+
+out_free_cdb:
+	if (rq->cmd != rq->__cmd)
+		kfree(rq->cmd);
+out_put_request:
 	blk_put_request(rq);
+out:
 	return ret;
 }
 
-- 
cgit v1.2.3


From 6f4a16266fb3e58cd3e200eab51d2220ef92d604 Mon Sep 17 00:00:00 2001
From: Tony Battersby <tonyb@cybernetics.com>
Date: Fri, 22 Aug 2014 15:53:39 -0400
Subject: scsi-mq: fix requests that use a separate CDB buffer

This patch fixes code such as the following with scsi-mq enabled:

    rq = blk_get_request(...);
    blk_rq_set_block_pc(rq);

    rq->cmd = my_cmd_buffer; /* separate CDB buffer */

    blk_execute_rq_nowait(...);

Code like this appears in e.g. sg_start_req() in drivers/scsi/sg.c (for
large CDBs only).  Without this patch, scsi_mq_prep_fn() will set
rq->cmd back to rq->__cmd, causing the wrong CDB to be sent to the device.

Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c        | 1 -
 block/blk-mq.c          | 2 ++
 drivers/scsi/scsi_lib.c | 1 -
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index c359d72e9d76..bf930f481d43 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1252,7 +1252,6 @@ void blk_rq_set_block_pc(struct request *rq)
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
 	memset(rq->__cmd, 0, sizeof(rq->__cmd));
-	rq->cmd = rq->__cmd;
 }
 EXPORT_SYMBOL(blk_rq_set_block_pc);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7950f8d7c1bb..4aac82615a46 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -176,6 +176,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	/* tag was already set */
 	rq->errors = 0;
 
+	rq->cmd = rq->__cmd;
+
 	rq->extra_len = 0;
 	rq->sense_len = 0;
 	rq->resid_len = 0;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9c44392b748f..d86808f051e0 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1808,7 +1808,6 @@ static int scsi_mq_prep_fn(struct request *req)
 
 	cmd->tag = req->tag;
 
-	req->cmd = req->__cmd;
 	cmd->cmnd = req->cmd;
 	cmd->prot_op = SCSI_PROT_NORMAL;
 
-- 
cgit v1.2.3


From 2ba136daa3ae1e881c9f586f283fcaa164767dce Mon Sep 17 00:00:00 2001
From: Tony Battersby <tonyb@cybernetics.com>
Date: Fri, 22 Aug 2014 15:53:35 -0400
Subject: fix regression in SCSI_IOCTL_SEND_COMMAND

blk_rq_set_block_pc() memsets rq->cmd to 0, so it should come
immediately after blk_get_request() to avoid overwriting the
user-supplied CDB.  Also check for failure to allocate rq.

Fixes: f27b087b81b7 ("block: add blk_rq_set_block_pc()")
Cc: <stable@vger.kernel.org> # 3.16.x
Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/scsi_ioctl.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 1d78e6cf9d61..5dd477bfb4bc 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -456,6 +456,11 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 	}
 
 	rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
+	if (!rq) {
+		err = -ENOMEM;
+		goto error;
+	}
+	blk_rq_set_block_pc(rq);
 
 	cmdlen = COMMAND_SIZE(opcode);
 
@@ -509,7 +514,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 	memset(sense, 0, sizeof(sense));
 	rq->sense = sense;
 	rq->sense_len = 0;
-	blk_rq_set_block_pc(rq);
 
 	blk_execute_rq(q, disk, rq, 0);
 
@@ -529,7 +533,8 @@ out:
 	
 error:
 	kfree(buffer);
-	blk_put_request(rq);
+	if (rq)
+		blk_put_request(rq);
 	return err;
 }
 EXPORT_SYMBOL_GPL(sg_scsi_ioctl);
-- 
cgit v1.2.3


From d19d744685f47f1bb3d39b3ec51eb50afe5ff47d Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 26 Aug 2014 16:14:02 +0200
Subject: block: fix error handling in sg_io

Before commit 2cada584b200 ("block: cleanup error handling in sg_io"),
we had ret = 0 before entering the last big if block of sg_io.

Since 2cada584b200, ret = -EFAULT, which breaks hdparm:

/dev/sda:
 setting Advanced Power Management level to 0xc8 (200)
 HDIO_DRIVE_CMD failed: Bad address
 APM_level      = 128

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Fixes: 2cada584b200 ("block: cleanup error handling in sg_io")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/scsi_ioctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 5dd477bfb4bc..9b8eaeca6a79 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -330,6 +330,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	if (blk_fill_sghdr_rq(q, rq, hdr, mode))
 		goto out_free_cdb;
 
+	ret = 0;
 	if (hdr->iovec_count) {
 		size_t iov_data_len;
 		struct iovec *iov = NULL;
-- 
cgit v1.2.3


From e15693ef18e13e3e6bffe891fe140f18b8ff6d07 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Tue, 26 Aug 2014 20:56:36 +0900
Subject: cfq-iosched: Fix wrong children_weight calculation

cfq_group_service_tree_add() is applying new_weight at the beginning of
the function via cfq_update_group_weight().
This actually allows weight to change between adding it to and subtracting
it from children_weight, and triggers WARN_ON_ONCE() in
cfq_group_service_tree_del(), or even causes oops by divide error during
vfr calculation in cfq_group_service_tree_add().

The detailed scenario is as follows:
1. Create blkio cgroups X and Y as a child of X.
   Set X's weight to 500 and perform some I/O to apply new_weight.
   This X's I/O completes before starting Y's I/O.
2. Y starts I/O and cfq_group_service_tree_add() is called with Y.
3. cfq_group_service_tree_add() walks up the tree during children_weight
   calculation and adds parent X's weight (500) to children_weight of root.
   children_weight becomes 500.
4. Set X's weight to 1000.
5. X starts I/O and cfq_group_service_tree_add() is called with X.
6. cfq_group_service_tree_add() applies its new_weight (1000).
7. I/O of Y completes and cfq_group_service_tree_del() is called with Y.
8. I/O of X completes and cfq_group_service_tree_del() is called with X.
9. cfq_group_service_tree_del() subtracts X's weight (1000) from
   children_weight of root. children_weight becomes -500.
   This triggers WARN_ON_ONCE().
10. Set X's weight to 500.
11. X starts I/O and cfq_group_service_tree_add() is called with X.
12. cfq_group_service_tree_add() applies its new_weight (500) and adds it
    to children_weight of root. children_weight becomes 0. Calcularion of
    vfr triggers oops by divide error.

weight should be updated right before adding it to children_weight.

Reported-by: Ruki Sekiya <sekiya.ruki@lab.ntt.co.jp>
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index cadc37841744..d7494637c5db 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1275,12 +1275,16 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 static void
 cfq_update_group_weight(struct cfq_group *cfqg)
 {
-	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
-
 	if (cfqg->new_weight) {
 		cfqg->weight = cfqg->new_weight;
 		cfqg->new_weight = 0;
 	}
+}
+
+static void
+cfq_update_group_leaf_weight(struct cfq_group *cfqg)
+{
+	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
 
 	if (cfqg->new_leaf_weight) {
 		cfqg->leaf_weight = cfqg->new_leaf_weight;
@@ -1299,7 +1303,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	/* add to the service tree */
 	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
 
-	cfq_update_group_weight(cfqg);
+	cfq_update_group_leaf_weight(cfqg);
 	__cfq_group_service_tree_add(st, cfqg);
 
 	/*
@@ -1323,6 +1327,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	 */
 	while ((parent = cfqg_parent(pos))) {
 		if (propagate) {
+			cfq_update_group_weight(pos);
 			propagate = !parent->nr_active++;
 			parent->children_weight += pos->weight;
 		}
-- 
cgit v1.2.3


From 7b5af5cffce569298b1d03af1ddf1dc43570ab42 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Thu, 28 Aug 2014 17:14:58 +0900
Subject: cfq-iosched: Add comments on update timing of weight

Explain that weight has to be updated on activation.
This complements previous fix e15693ef18e1 ("cfq-iosched: Fix wrong
children_weight calculation").

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d7494637c5db..3f31cf9508e6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1272,6 +1272,9 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	rb_insert_color(&cfqg->rb_node, &st->rb);
 }
 
+/*
+ * This has to be called only on activation of cfqg
+ */
 static void
 cfq_update_group_weight(struct cfq_group *cfqg)
 {
@@ -1303,6 +1306,11 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	/* add to the service tree */
 	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
 
+	/*
+	 * Update leaf_weight.  We cannot update weight at this point
+	 * because cfqg might already have been activated and is
+	 * contributing its current weight to the parent's child_weight.
+	 */
 	cfq_update_group_leaf_weight(cfqg);
 	__cfq_group_service_tree_add(st, cfqg);
 
-- 
cgit v1.2.3


From 0738854939e6ec9b9111a8cfc0ca1dfa3cff6b2e Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 2 Sep 2014 23:02:59 +0800
Subject: blk-merge: fix blk_recount_segments

QUEUE_FLAG_NO_SG_MERGE is set at default for blk-mq devices,
so bio->bi_phys_segment computed may be bigger than
queue_max_segments(q) for blk-mq devices, then drivers will
fail to handle the case, for example, BUG_ON() in
virtio_queue_rq() can be triggerd for virtio-blk:

	https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1359146

This patch fixes the issue by ignoring the QUEUE_FLAG_NO_SG_MERGE
flag if the computed bio->bi_phys_segment is bigger than
queue_max_segments(q), and the regression is caused by commit
05f1dd53152173(block: add queue flag for disabling SG merging).

Reported-by: Kick In <pierre-andre.morey@canonical.com>
Tested-by: Chris J Arges <chris.j.arges@canonical.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 54535831f1e1..77881798f793 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -10,10 +10,11 @@
 #include "blk.h"
 
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
-					     struct bio *bio)
+					     struct bio *bio,
+					     bool no_sg_merge)
 {
 	struct bio_vec bv, bvprv = { NULL };
-	int cluster, high, highprv = 1, no_sg_merge;
+	int cluster, high, highprv = 1;
 	unsigned int seg_size, nr_phys_segs;
 	struct bio *fbio, *bbio;
 	struct bvec_iter iter;
@@ -35,7 +36,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	cluster = blk_queue_cluster(q);
 	seg_size = 0;
 	nr_phys_segs = 0;
-	no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
 	high = 0;
 	for_each_bio(bio) {
 		bio_for_each_segment(bv, bio, iter) {
@@ -88,18 +88,23 @@ new_segment:
 
 void blk_recalc_rq_segments(struct request *rq)
 {
-	rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
+	bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
+			&rq->q->queue_flags);
+
+	rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
+			no_sg_merge);
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-	if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags))
+	if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
+			bio->bi_vcnt < queue_max_segments(q))
 		bio->bi_phys_segments = bio->bi_vcnt;
 	else {
 		struct bio *nxt = bio->bi_next;
 
 		bio->bi_next = NULL;
-		bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+		bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
 		bio->bi_next = nxt;
 	}
 
-- 
cgit v1.2.3


From 5676e7b6db02b80eafc2e3ad316d5f2fee817ecb Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hp.com>
Date: Tue, 2 Sep 2014 11:38:44 -0500
Subject: blk-mq: cleanup after blk_mq_init_rq_map failures

In blk-mq.c blk_mq_alloc_tag_set, if:
	set->tags = kmalloc_node()
succeeds, but one of the blk_mq_init_rq_map() calls fails,
	goto out_unwind;
needs to free set->tags so the caller is not obligated
to do so.  None of the current callers (null_blk,
virtio_blk, virtio_blk, or the forthcoming scsi-mq)
do so.

set->tags needs to be set to NULL after doing so,
so other tag cleanup logic doesn't try to free
a stale pointer later.  Also set it to NULL
in blk_mq_free_tag_set.

Tested with error injection on the forthcoming
scsi-mq + hpsa combination.

Signed-off-by: Robert Elliott <elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4aac82615a46..f9b85e83d9ba 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1982,6 +1982,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 out_unwind:
 	while (--i >= 0)
 		blk_mq_free_rq_map(set, set->tags[i], i);
+	kfree(set->tags);
+	set->tags = NULL;
 out:
 	return -ENOMEM;
 }
@@ -1997,6 +1999,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	}
 
 	kfree(set->tags);
+	set->tags = NULL;
 }
 EXPORT_SYMBOL(blk_mq_free_tag_set);
 
-- 
cgit v1.2.3


From 2da78092dda13f1efd26edbbf99a567776913750 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 26 Aug 2014 09:05:36 -0600
Subject: block: Fix dev_t minor allocation lifetime

Releases the dev_t minor when all references are closed to prevent
another device from acquiring the same major/minor.

Since the partition's release may be invoked from call_rcu's soft-irq
context, the ext_dev_idr's mutex had to be replaced with a spinlock so
as not so sleep.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c             | 24 ++++++++++++++----------
 block/partition-generic.c |  2 +-
 2 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 791f41943132..09da5e4a8e03 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -28,10 +28,10 @@ struct kobject *block_depr;
 /* for extended dynamic devt allocation, currently only one major is used */
 #define NR_EXT_DEVT		(1 << MINORBITS)
 
-/* For extended devt allocation.  ext_devt_mutex prevents look up
+/* For extended devt allocation.  ext_devt_lock prevents look up
  * results from going away underneath its user.
  */
-static DEFINE_MUTEX(ext_devt_mutex);
+static DEFINE_SPINLOCK(ext_devt_lock);
 static DEFINE_IDR(ext_devt_idr);
 
 static struct device_type disk_type;
@@ -420,9 +420,13 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
 	}
 
 	/* allocate ext devt */
-	mutex_lock(&ext_devt_mutex);
-	idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_KERNEL);
-	mutex_unlock(&ext_devt_mutex);
+	idr_preload(GFP_KERNEL);
+
+	spin_lock(&ext_devt_lock);
+	idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
+	spin_unlock(&ext_devt_lock);
+
+	idr_preload_end();
 	if (idx < 0)
 		return idx == -ENOSPC ? -EBUSY : idx;
 
@@ -447,9 +451,9 @@ void blk_free_devt(dev_t devt)
 		return;
 
 	if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
-		mutex_lock(&ext_devt_mutex);
+		spin_lock(&ext_devt_lock);
 		idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
-		mutex_unlock(&ext_devt_mutex);
+		spin_unlock(&ext_devt_lock);
 	}
 }
 
@@ -665,7 +669,6 @@ void del_gendisk(struct gendisk *disk)
 		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
 	device_del(disk_to_dev(disk));
-	blk_free_devt(disk_to_dev(disk)->devt);
 }
 EXPORT_SYMBOL(del_gendisk);
 
@@ -690,13 +693,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 	} else {
 		struct hd_struct *part;
 
-		mutex_lock(&ext_devt_mutex);
+		spin_lock(&ext_devt_lock);
 		part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 		if (part && get_disk(part_to_disk(part))) {
 			*partno = part->partno;
 			disk = part_to_disk(part);
 		}
-		mutex_unlock(&ext_devt_mutex);
+		spin_unlock(&ext_devt_lock);
 	}
 
 	return disk;
@@ -1098,6 +1101,7 @@ static void disk_release(struct device *dev)
 {
 	struct gendisk *disk = dev_to_disk(dev);
 
+	blk_free_devt(dev->devt);
 	disk_release_events(disk);
 	kfree(disk->random);
 	disk_replace_part_tbl(disk, NULL);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 789cdea05893..0d9e5f97f0a8 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -211,6 +211,7 @@ static const struct attribute_group *part_attr_groups[] = {
 static void part_release(struct device *dev)
 {
 	struct hd_struct *p = dev_to_part(dev);
+	blk_free_devt(dev->devt);
 	free_part_stats(p);
 	free_part_info(p);
 	kfree(p);
@@ -253,7 +254,6 @@ void delete_partition(struct gendisk *disk, int partno)
 	rcu_assign_pointer(ptbl->last_lookup, NULL);
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
-	blk_free_devt(part_devt(part));
 
 	hd_struct_put(part);
 }
-- 
cgit v1.2.3


From df35c7c912fe668797681842b3b74c61b0664050 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Tue, 9 Sep 2014 11:50:58 -0400
Subject: Block: fix unbalanced bypass-disable in blk_register_queue

When a queue is registered, the block layer turns off the bypass
setting (because bypass is enabled when the queue is created).  This
doesn't work well for queues that are unregistered and then registered
again; we get a WARNING because of the unbalanced calls to
blk_queue_bypass_end().

This patch fixes the problem by making blk_register_queue() call
blk_queue_bypass_end() only the first time the queue is registered.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Tejun Heo <tj@kernel.org>
CC: James Bottomley <James.Bottomley@HansenPartnership.com>
CC: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-sysfs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 4db5abf96b9e..17f5c84ce7bf 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -554,8 +554,10 @@ int blk_register_queue(struct gendisk *disk)
 	 * Initialization must be complete by now.  Finish the initial
 	 * bypass from queue allocation.
 	 */
-	queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
-	blk_queue_bypass_end(q);
+	if (!blk_queue_init_done(q)) {
+		queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
+		blk_queue_bypass_end(q);
+	}
 
 	ret = blk_trace_init_sysfs(dev);
 	if (ret)
-- 
cgit v1.2.3


From a516440542afcb9647f88d12c35640baf02d07ea Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 10 Sep 2014 09:02:03 -0600
Subject: blk-mq: scale depth and rq map appropriate if low on memory

If we are running in a kdump environment, resources are scarce.
For some SCSI setups with a huge set of shared tags, we run out
of memory allocating what the drivers is asking for. So implement
a scale back logic to reduce the tag depth for those cases, allowing
the driver to successfully load.

We should extend this to detect low memory situations, and implement
a sane fallback for those (1 queue, 64 tags, or something like that).

Tested-by: Robert Elliott <elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 88 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 19 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f9b85e83d9ba..383ea0cb1f0a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1321,6 +1321,7 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
 				continue;
 			set->ops->exit_request(set->driver_data, tags->rqs[i],
 						hctx_idx, i);
+			tags->rqs[i] = NULL;
 		}
 	}
 
@@ -1354,8 +1355,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 
 	INIT_LIST_HEAD(&tags->page_list);
 
-	tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
-					GFP_KERNEL, set->numa_node);
+	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+				 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+				 set->numa_node);
 	if (!tags->rqs) {
 		blk_mq_free_tags(tags);
 		return NULL;
@@ -1379,8 +1381,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			this_order--;
 
 		do {
-			page = alloc_pages_node(set->numa_node, GFP_KERNEL,
-						this_order);
+			page = alloc_pages_node(set->numa_node,
+				GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+				this_order);
 			if (page)
 				break;
 			if (!this_order--)
@@ -1404,8 +1407,10 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			if (set->ops->init_request) {
 				if (set->ops->init_request(set->driver_data,
 						tags->rqs[i], hctx_idx, i,
-						set->numa_node))
+						set->numa_node)) {
+					tags->rqs[i] = NULL;
 					goto fail;
+				}
 			}
 
 			p += rq_size;
@@ -1416,7 +1421,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 	return tags;
 
 fail:
-	pr_warn("%s: failed to allocate requests\n", __func__);
 	blk_mq_free_rq_map(set, tags, hctx_idx);
 	return NULL;
 }
@@ -1936,6 +1940,61 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+{
+	int i;
+
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		set->tags[i] = blk_mq_init_rq_map(set, i);
+		if (!set->tags[i])
+			goto out_unwind;
+	}
+
+	return 0;
+
+out_unwind:
+	while (--i >= 0)
+		blk_mq_free_rq_map(set, set->tags[i], i);
+
+	set->tags = NULL;
+	return -ENOMEM;
+}
+
+/*
+ * Allocate the request maps associated with this tag_set. Note that this
+ * may reduce the depth asked for, if memory is tight. set->queue_depth
+ * will be updated to reflect the allocated depth.
+ */
+static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+{
+	unsigned int depth;
+	int err;
+
+	depth = set->queue_depth;
+	do {
+		err = __blk_mq_alloc_rq_maps(set);
+		if (!err)
+			break;
+
+		set->queue_depth >>= 1;
+		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
+			err = -ENOMEM;
+			break;
+		}
+	} while (set->queue_depth);
+
+	if (!set->queue_depth || err) {
+		pr_err("blk-mq: failed to allocate request map\n");
+		return -ENOMEM;
+	}
+
+	if (depth != set->queue_depth)
+		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
+						depth, set->queue_depth);
+
+	return 0;
+}
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -1944,8 +2003,6 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
-	int i;
-
 	if (!set->nr_hw_queues)
 		return -EINVAL;
 	if (!set->queue_depth)
@@ -1966,25 +2023,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 				 sizeof(struct blk_mq_tags *),
 				 GFP_KERNEL, set->numa_node);
 	if (!set->tags)
-		goto out;
+		return -ENOMEM;
 
-	for (i = 0; i < set->nr_hw_queues; i++) {
-		set->tags[i] = blk_mq_init_rq_map(set, i);
-		if (!set->tags[i])
-			goto out_unwind;
-	}
+	if (blk_mq_alloc_rq_maps(set))
+		goto enomem;
 
 	mutex_init(&set->tag_list_lock);
 	INIT_LIST_HEAD(&set->tag_list);
 
 	return 0;
-
-out_unwind:
-	while (--i >= 0)
-		blk_mq_free_rq_map(set, set->tags[i], i);
+enomem:
 	kfree(set->tags);
 	set->tags = NULL;
-out:
 	return -ENOMEM;
 }
 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
-- 
cgit v1.2.3