Diffstat (limited to 'block')
-rw-r--r-- | block/bdev.c | 2
-rw-r--r-- | block/bfq-iosched.c | 20
-rw-r--r-- | block/bio.c | 3
-rw-r--r-- | block/blk-cgroup.c | 5
-rw-r--r-- | block/blk-cgroup.h | 17
-rw-r--r-- | block/blk-core.c | 38
-rw-r--r-- | block/blk-iolatency.c | 2
-rw-r--r-- | block/blk-lib.c | 88
-rw-r--r-- | block/blk-merge.c | 71
-rw-r--r-- | block/blk-mq-debugfs.c | 6
-rw-r--r-- | block/blk-mq-debugfs.h | 2
-rw-r--r-- | block/blk-mq-sched.c | 18
-rw-r--r-- | block/blk-mq-sysfs.c | 16
-rw-r--r-- | block/blk-mq-tag.c | 4
-rw-r--r-- | block/blk-mq.c | 274
-rw-r--r-- | block/blk-mq.h | 2
-rw-r--r-- | block/blk-rq-qos.h | 20
-rw-r--r-- | block/blk-settings.c | 16
-rw-r--r-- | block/blk-sysfs.c | 28
-rw-r--r-- | block/blk-throttle.c | 48
-rw-r--r-- | block/blk-throttle.h | 3
-rw-r--r-- | block/blk-zoned.c | 1
-rw-r--r-- | block/blk.h | 3
-rw-r--r-- | block/bounce.c | 3
-rw-r--r-- | block/elevator.c | 6
-rw-r--r-- | block/fops.c | 3
-rw-r--r-- | block/genhd.c | 45 |
27 files changed, 375 insertions, 369 deletions
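Two patterns recur across the blk-mq hunks below: the per-queue array of hardware contexts (q->queue_hw_ctx) is replaced by an xarray (q->hctx_table), and every hctx iteration index changes from int to unsigned long to match xarray iteration. A minimal sketch of the resulting access pattern follows (an illustration only, not the kernel's actual implementation; the example_* function names are invented for this sketch):

#include <linux/blk-mq.h>
#include <linux/xarray.h>

/* look up a hardware context by index; formerly q->queue_hw_ctx[idx] */
static struct blk_mq_hw_ctx *example_get_hctx(struct request_queue *q,
					      unsigned int idx)
{
	return xa_load(&q->hctx_table, idx);
}

/* iterate all hardware contexts; the index must now be unsigned long */
static void example_stop_all_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}

The corresponding conversions appear throughout blk-mq.c, blk-mq.h, blk-mq-sched.c, blk-mq-sysfs.c, blk-mq-tag.c and blk-mq-debugfs.c in the raw diff below.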
diff --git a/block/bdev.c b/block/bdev.c index ce8de42a89be..13de871fa816 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -385,7 +385,7 @@ static struct kmem_cache * bdev_cachep __read_mostly; static struct inode *bdev_alloc_inode(struct super_block *sb) { - struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); + struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL); if (!ei) return NULL; diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d23b5112c682..2e0dd68a3cbe 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2153,7 +2153,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->waker_detection_started = now_ns; bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name, MAX_BFQQ_NAME_LENGTH); - bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name); + bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name); } else /* Same tentative waker queue detected again */ bfqq->num_waker_detections++; @@ -2782,6 +2782,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; + /* + * The above assignment schedules the following redirections: + * each time some I/O for bfqq arrives, the process that + * generated that I/O is disassociated from bfqq and + * associated with new_bfqq. Here we increases new_bfqq->ref + * in advance, adding the number of processes that are + * expected to be associated with new_bfqq as they happen to + * issue I/O. + */ new_bfqq->ref += process_refs; return new_bfqq; } @@ -2844,6 +2853,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + /* * Check delayed stable merge for rotational or non-queueing * devs. 
For this branch to be executed, bfqq must not be @@ -2945,9 +2958,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfq_too_late_for_merging(bfqq)) return NULL; - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; @@ -5449,7 +5459,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { bfq_release_process_ref(bfqd, bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true); + bfqq = bfq_get_queue(bfqd, bio, false, bic, true); bic_set_bfqq(bic, bfqq, false); } diff --git a/block/bio.c b/block/bio.c index 3c57b3ba727d..cdd7b2915c53 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1514,8 +1514,7 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) - rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio); + rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa063c6c0338..d53b0d69dd73 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -82,6 +82,8 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + if (blkg->q) + blk_put_queue(blkg->q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); @@ -167,6 +169,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg->iostat_cpu) goto err_free; + if (!blk_get_queue(q)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); spin_lock_init(&blkg->async_bio_lock); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 3e91803c4a55..47e1e38390c9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,7 @@ */ #include <linux/blk-cgroup.h> +#include <linux/blk-mq.h> /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) @@ -428,6 +429,21 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } +/** + * blk_cgroup_mergeable - Determine whether to allow or disallow merges + * @rq: request to merge into + * @bio: bio to merge + * + * @bio and @rq should belong to the same cgroup and their issue_as_root should + * match. The latter is necessary as we don't want to throttle e.g. a metadata + * update because it happens to be next to a regular IO. 
+ */ +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) +{ + return rq->bio->bi_blkg == bio->bi_blkg && + bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); +} + void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ @@ -467,6 +483,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) { } static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) diff --git a/block/blk-core.c b/block/blk-core.c index 3bb5a551bb90..937bb6b86331 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -50,6 +50,7 @@ #include "blk-pm.h" #include "blk-cgroup.h" #include "blk-throttle.h" +#include "blk-rq-qos.h" struct dentry *blk_debugfs_root; @@ -122,7 +123,6 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), - REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), @@ -315,6 +315,9 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); + /* cleanup rq qos structures for queue without disk */ + rq_qos_exit(q); + blk_queue_flag_set(QUEUE_FLAG_DEAD, q); blk_sync_queue(q); @@ -336,8 +339,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_mq_sched_free_rqs(q); mutex_unlock(&q->sysfs_lock); - percpu_ref_exit(&q->q_usage_counter); - /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } @@ -490,17 +491,12 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_stats; - if (blkcg_init_queue(q)) - goto fail_ref; - blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; -fail_ref: - percpu_ref_exit(&q->q_usage_counter); fail_stats: blk_free_queue_stats(q->stats); fail_split: @@ -691,7 +687,7 @@ static void __submit_bio(struct bio *bio) * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current - * ->submit_bio_bio, but that haven't been processed yet. + * ->submit_bio, but that haven't been processed yet. */ static void __submit_bio_noacct(struct bio *bio) { @@ -831,10 +827,6 @@ void submit_bio_noacct(struct bio *bio) if (!blk_queue_secure_erase(q)) goto not_supported; break; - case REQ_OP_WRITE_SAME: - if (!q->limits.max_write_same_sectors) - goto not_supported; - break; case REQ_OP_ZONE_APPEND: status = blk_check_zone_append(q, bio); if (status != BLK_STS_OK) @@ -906,13 +898,7 @@ void submit_bio(struct bio *bio) * go through the normal accounting stuff before submission. 
*/ if (bio_has_data(bio)) { - unsigned int count; - - if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = queue_logical_block_size( - bdev_get_queue(bio->bi_bdev)) >> 9; - else - count = bio_sectors(bio); + unsigned int count = bio_sectors(bio); if (op_is_write(bio_op(bio))) { count_vm_events(PGPGOUT, count); @@ -958,7 +944,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); blk_qc_t cookie = READ_ONCE(bio->bi_cookie); - int ret; + int ret = 0; if (cookie == BLK_QC_T_NONE || !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) @@ -968,10 +954,14 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) return 0; - if (WARN_ON_ONCE(!queue_is_mq(q))) - ret = 0; /* not yet implemented, should not happen */ - else + if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); + } else { + struct gendisk *disk = q->disk; + + if (disk && disk->fops->poll_bio) + ret = disk->fops->poll_bio(bio, iob, flags); + } blk_queue_exit(q); return ret; } diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 010e658d44a8..2f33932e72e3 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -598,7 +598,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) int inflight = 0; blkg = bio->bi_blkg; - if (!blkg || !bio_flagged(bio, BIO_TRACKED)) + if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED)) return; iolat = blkg_to_lat(bio->bi_blkg); diff --git a/block/blk-lib.c b/block/blk-lib.c index fc6ea52e7482..237d60d8b585 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -132,94 +132,6 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_discard); -/** - * __blkdev_issue_write_same - generate number of bios with same page - * @bdev: target blockdev - * @sector: start sector - * @nr_sects: number of sectors to write - * @gfp_mask: memory allocation flags (for bio_alloc) - * @page: page containing data to write - * @biop: pointer to anchor bio - * - * Description: - * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page. 
- */ -static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct page *page, - struct bio **biop) -{ - struct request_queue *q = bdev_get_queue(bdev); - unsigned int max_write_same_sectors; - struct bio *bio = *biop; - sector_t bs_mask; - - if (bdev_read_only(bdev)) - return -EPERM; - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; - - if (!bdev_write_same(bdev)) - return -EOPNOTSUPP; - - /* Ensure that max_write_same_sectors doesn't overflow bi_size */ - max_write_same_sectors = bio_allowed_max_sectors(q); - - while (nr_sects) { - bio = blk_next_bio(bio, bdev, 1, REQ_OP_WRITE_SAME, gfp_mask); - bio->bi_iter.bi_sector = sector; - bio->bi_vcnt = 1; - bio->bi_io_vec->bv_page = page; - bio->bi_io_vec->bv_offset = 0; - bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); - - if (nr_sects > max_write_same_sectors) { - bio->bi_iter.bi_size = max_write_same_sectors << 9; - nr_sects -= max_write_same_sectors; - sector += max_write_same_sectors; - } else { - bio->bi_iter.bi_size = nr_sects << 9; - nr_sects = 0; - } - cond_resched(); - } - - *biop = bio; - return 0; -} - -/** - * blkdev_issue_write_same - queue a write same operation - * @bdev: target blockdev - * @sector: start sector - * @nr_sects: number of sectors to write - * @gfp_mask: memory allocation flags (for bio_alloc) - * @page: page containing data - * - * Description: - * Issue a write same request for the sectors in question. - */ -int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, - struct page *page) -{ - struct bio *bio = NULL; - struct blk_plug plug; - int ret; - - blk_start_plug(&plug); - ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page, - &bio); - if (ret == 0 && bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - } - blk_finish_plug(&plug); - return ret; -} -EXPORT_SYMBOL(blkdev_issue_write_same); - static int __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) diff --git a/block/blk-merge.c b/block/blk-merge.c index 0e871d4e7cb8..7771dacc99cb 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -9,6 +9,7 @@ #include <linux/blk-integrity.h> #include <linux/scatterlist.h> #include <linux/part_stat.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> @@ -152,22 +153,6 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs); } -static struct bio *blk_bio_write_same_split(struct request_queue *q, - struct bio *bio, - struct bio_set *bs, - unsigned *nsegs) -{ - *nsegs = 1; - - if (!q->limits.max_write_same_sectors) - return NULL; - - if (bio_sectors(bio) <= q->limits.max_write_same_sectors) - return NULL; - - return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); -} - /* * Return the maximum number of sectors from the start of a bio that may be * submitted as a single request to a block device. 
If enough sectors remain, @@ -351,10 +336,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, nr_segs); break; - case REQ_OP_WRITE_SAME: - split = blk_bio_write_same_split(q, *bio, &q->bio_split, - nr_segs); - break; default: split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break; @@ -414,8 +395,6 @@ unsigned int blk_recalc_rq_segments(struct request *rq) return 1; case REQ_OP_WRITE_ZEROES: return 0; - case REQ_OP_WRITE_SAME: - return 1; } rq_for_each_bvec(bv, rq, iter) @@ -553,8 +532,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); - else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) - nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg); else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); @@ -598,6 +575,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { + if (!blk_cgroup_mergeable(req, bio)) + goto no_merge; + if (blk_integrity_merge_bio(req->q, req, bio) == false) goto no_merge; @@ -694,6 +674,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; + if (!blk_cgroup_mergeable(req, next->bio)) + return 0; + if (blk_integrity_merge_rq(q, req, next) == false) return 0; @@ -755,13 +738,6 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } -static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) -{ - if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b)) - return true; - return false; -} - /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. 
@@ -778,10 +754,6 @@ static struct request *attempt_merge(struct request_queue *q, if (rq_data_dir(req) != rq_data_dir(next)) return NULL; - if (req_op(req) == REQ_OP_WRITE_SAME && - !blk_write_same_mergeable(req->bio, next->bio)) - return NULL; - if (req->ioprio != next->ioprio) return NULL; @@ -895,6 +867,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; + /* don't merge across cgroup boundaries */ + if (!blk_cgroup_mergeable(rq, bio)) + return false; + /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; @@ -903,11 +879,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; - /* must be using the same buffer */ - if (req_op(rq) == REQ_OP_WRITE_SAME && - !blk_write_same_mergeable(rq->bio, bio)) - return false; - if (rq->ioprio != bio_prio(bio)) return false; @@ -1073,12 +1044,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, if (!plug || rq_list_empty(plug->mq_list)) return false; - /* check the previously added entry for a quick merge attempt */ - rq = rq_list_peek(&plug->mq_list); - if (rq->q == q) { - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == - BIO_MERGE_OK) - return true; + rq_list_for_each(&plug->mq_list, rq) { + if (rq->q == q) { + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; + break; + } + + /* + * Only keep iterating plug list for merges if we have multiple + * queues + */ + if (!plug->multiple_queues) + break; } return false; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index c2904c75c160..aa0349e9f083 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -683,7 +683,7 @@ static void debugfs_create_files(struct dentry *parent, void *data, void blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); @@ -756,7 +756,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); @@ -765,7 +765,7 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q) void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a68aa6041a10..69918f4170d6 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -6,6 +6,8 @@ #include <linux/seq_file.h> +struct blk_mq_hw_ctx; + struct blk_mq_debugfs_attr { const char *name; umode_t mode; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 55488ba97823..9e56a69422b6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -180,11 +180,18 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { + unsigned long end = jiffies + HZ; int ret; do { ret = __blk_mq_do_dispatch_sched(hctx); - } while (ret == 1); + if (ret != 1) + break; + if (need_resched() || time_is_before_jiffies(end)) { + blk_mq_delay_run_hw_queue(hctx, 0); + break; + } + } while (1); return ret; } @@ -515,7 +522,7 @@ static void 
blk_mq_exit_sched_shared_tags(struct request_queue *queue) static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { @@ -550,9 +557,10 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue) int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { - unsigned int i, flags = q->tag_set->flags; + unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; + unsigned long i; int ret; if (!e) { @@ -618,7 +626,7 @@ err_free_map_and_rqs: void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; if (blk_mq_is_shared_tags(q->tag_set->flags)) { blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, @@ -635,7 +643,7 @@ void blk_mq_sched_free_rqs(struct request_queue *q) void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 674786574075..c08426975856 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -206,7 +206,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; lockdep_assert_held(&q->sysfs_dir_lock); @@ -255,7 +255,8 @@ void blk_mq_sysfs_init(struct request_queue *q) int __blk_mq_register_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int ret, i; + unsigned long i, j; + int ret; WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_dir_lock); @@ -278,8 +279,10 @@ out: return ret; unreg: - while (--i >= 0) - blk_mq_unregister_hctx(q->queue_hw_ctx[i]); + queue_for_each_hw_ctx(q, hctx, j) { + if (j < i) + blk_mq_unregister_hctx(hctx); + } kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); @@ -290,7 +293,7 @@ unreg: void blk_mq_sysfs_unregister(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) @@ -306,7 +309,8 @@ unlock: int blk_mq_sysfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i, ret = 0; + unsigned long i; + int ret = 0; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 0fd409b8e86e..68ac23d0b640 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -498,7 +498,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { /* - * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. 
*/ @@ -515,7 +515,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, bt_for_each(NULL, q, btags, fn, priv, false); } else { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index 64d5c2edb817..e6f24fa4a4c2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -71,7 +71,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, blk_qc_t qc) { - return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT]; + return xa_load(&q->hctx_table, + (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT); } static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, @@ -312,7 +313,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) @@ -573,7 +574,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue. */ ret = -EXDEV; - data.hctx = q->queue_hw_ctx[hctx_idx]; + data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); @@ -887,10 +888,15 @@ static inline void blk_account_io_done(struct request *req, u64 now) static void __blk_account_io_start(struct request *rq) { - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. 
+ */ + if (rq->bio) rq->part = rq->bio->bi_bdev; - else if (rq->q->disk) + else rq->part = rq->q->disk->part0; part_stat_lock(); @@ -1446,7 +1452,7 @@ static void blk_mq_timeout_work(struct work_struct *work) container_of(work, struct request_queue, timeout_work); unsigned long next = 0; struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting @@ -2147,7 +2153,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2175,7 +2181,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2213,7 +2219,7 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); bool blk_mq_queue_stopped(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hctx_stopped(hctx)) @@ -2252,7 +2258,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue); void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); @@ -2270,7 +2276,7 @@ EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); @@ -2290,7 +2296,7 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async); @@ -2570,13 +2576,36 @@ static void __blk_mq_flush_plug_list(struct request_queue *q, q->mq_ops->queue_rqs(&plug->mq_list); } +static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) +{ + struct blk_mq_hw_ctx *this_hctx = NULL; + struct blk_mq_ctx *this_ctx = NULL; + struct request *requeue_list = NULL; + unsigned int depth = 0; + LIST_HEAD(list); + + do { + struct request *rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + rq_list_add(&requeue_list, rq); + continue; + } + list_add_tail(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + plug->mq_list = requeue_list; + trace_block_unplug(this_hctx->queue, depth, !from_sched); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct blk_mq_hw_ctx *this_hctx; - struct blk_mq_ctx *this_ctx; struct request *rq; - unsigned int depth; - LIST_HEAD(list); if (rq_list_empty(plug->mq_list)) return; @@ -2612,35 +2641,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; } - this_hctx = NULL; - this_ctx = NULL; - depth = 0; do { - rq = rq_list_pop(&plug->mq_list); - - if (!this_hctx) { - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { - trace_block_unplug(this_hctx->queue, depth, - !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, - &list, 
from_schedule); - depth = 0; - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - - } - - list_add(&rq->queuelist, &list); - depth++; + blk_mq_dispatch_plug_list(plug, from_schedule); } while (!rq_list_empty(plug->mq_list)); - - if (!list_empty(&list)) { - trace_block_unplug(this_hctx->queue, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, - from_schedule); - } } void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2727,7 +2730,8 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio) + struct bio *bio, + unsigned int nsegs) { struct blk_mq_alloc_data data = { .q = q, @@ -2739,6 +2743,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, if (unlikely(bio_queue_enter(bio))) return NULL; + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + goto queue_exit; + + rq_qos_throttle(q, bio); + if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; @@ -2751,12 +2760,13 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); +queue_exit: blk_queue_exit(q); return NULL; } static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio *bio) + struct blk_plug *plug, struct bio **bio, unsigned int nsegs) { struct request *rq; @@ -2766,12 +2776,19 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, if (!rq || rq->q != q) return NULL; - if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) + if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { + *bio = NULL; + return NULL; + } + + rq_qos_throttle(q, *bio); + + if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) return NULL; - rq->cmd_flags = bio->bi_opf; + rq->cmd_flags = (*bio)->bi_opf; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); return rq; @@ -2806,14 +2823,11 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; - if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) - return; - - rq_qos_throttle(q, bio); - - rq = blk_mq_get_cached_request(q, plug, bio); + rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); if (!rq) { - rq = blk_mq_get_new_requests(q, plug, bio); + if (!bio) + return; + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) return; } @@ -3068,6 +3082,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct blk_mq_tags *drv_tags; struct page *page; + if (list_empty(&tags->page_list)) + return; + if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else @@ -3110,15 +3127,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags) blk_mq_free_tags(tags); } +static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + int i; + + for (i = 0; i < set->nr_maps; i++) { + unsigned int start = set->map[i].queue_offset; + unsigned int end = start + set->map[i].nr_queues; + + if (hctx_idx >= start && hctx_idx < end) + break; + } + + if (i >= set->nr_maps) + i = HCTX_TYPE_DEFAULT; + + return i; +} + +static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + enum hctx_type type = hctx_idx_to_type(set, hctx_idx); + + return 
blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); +} + static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { + int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3167,10 +3210,9 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; + int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3415,6 +3457,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_mq_remove_cpuhp(hctx); + xa_erase(&q->hctx_table, hctx_idx); + spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); @@ -3424,12 +3468,11 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; - blk_mq_debugfs_unregister_hctx(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } @@ -3454,8 +3497,15 @@ static int blk_mq_init_hctx(struct request_queue *q, if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; + + if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) + goto exit_flush_rq; + return 0; + exit_flush_rq: + if (set->ops->exit_request) + set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -3615,7 +3665,8 @@ static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, j, hctx_idx; + unsigned int j, hctx_idx; + unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -3722,7 +3773,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { @@ -3822,7 +3873,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q) void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); @@ -3833,7 +3884,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because @@ -3922,52 +3973,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - int i, j, end; - struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; - - if (q->nr_hw_queues < set->nr_hw_queues) { - struct blk_mq_hw_ctx **new_hctxs; - - new_hctxs = kcalloc_node(set->nr_hw_queues, - sizeof(*new_hctxs), GFP_KERNEL, - set->numa_node); - if (!new_hctxs) - return; - if (hctxs) - memcpy(new_hctxs, hctxs, q->nr_hw_queues * - sizeof(*hctxs)); - q->queue_hw_ctx = new_hctxs; - kfree(hctxs); - hctxs = new_hctxs; - } + struct blk_mq_hw_ctx *hctx; + unsigned long i, j; /* 
protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { - int node; - struct blk_mq_hw_ctx *hctx; + int old_node; + int node = blk_mq_get_hctx_node(set, i); + struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); - /* - * If the hw queue has been mapped to another numa node, - * we need to realloc the hctx. If allocation fails, fallback - * to use the previous one. - */ - if (hctxs[i] && (hctxs[i]->numa_node == node)) - continue; + if (old_hctx) { + old_node = old_hctx->numa_node; + blk_mq_exit_hctx(q, set, old_hctx, i); + } - hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); - if (hctx) { - if (hctxs[i]) - blk_mq_exit_hctx(q, set, hctxs[i], i); - hctxs[i] = hctx; - } else { - if (hctxs[i]) - pr_warn("Allocate new hctx on node %d fails,\ - fallback to previous one on node %d\n", - node, hctxs[i]->numa_node); - else + if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { + if (!old_hctx) break; + pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", + node, old_node); + hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); + WARN_ON_ONCE(!hctx); } } /* @@ -3976,24 +4003,27 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; - end = i; } else { j = i; - end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - for (; j < end; j++) { - struct blk_mq_hw_ctx *hctx = hctxs[j]; - - if (hctx) { - blk_mq_exit_hctx(q, set, hctx, j); - hctxs[j] = NULL; - } - } + xa_for_each_start(&q->hctx_table, j, hctx, j) + blk_mq_exit_hctx(q, set, hctx, j); mutex_unlock(&q->sysfs_lock); } +static void blk_mq_update_poll_flag(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); + else + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); +} + int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -4018,6 +4048,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); + xa_init(&q->hctx_table); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4028,9 +4060,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); + blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); @@ -4049,7 +4079,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return 0; err_hctxs: - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); q->nr_hw_queues = 0; blk_mq_sysfs_deinit(q); err_poll: @@ -4337,7 +4367,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; - int i, ret; + int ret; + unsigned long i; if (!set) return -EINVAL; @@ -4496,6 +4527,7 @@ fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); + blk_mq_update_poll_flag(q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; @@ -4712,7 +4744,7 @@ void blk_mq_cancel_work_sync(struct request_queue *q) { if (queue_is_mq(q)) { struct 
blk_mq_hw_ctx *hctx; - int i; + unsigned long i; cancel_delayed_work_sync(&q->requeue_work); diff --git a/block/blk-mq.h b/block/blk-mq.h index 948791ea2a3e..2615bd58bad3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -83,7 +83,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * enum hctx_type type, unsigned int cpu) { - return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; + return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags) diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 3cfbc8668cba..68267007da1c 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -177,20 +177,20 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) __rq_qos_requeue(q->rq_qos, rq); } -static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +static inline void rq_qos_done_bio(struct bio *bio) { - if (q->rq_qos) - __rq_qos_done_bio(q->rq_qos, bio); + if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || + bio_flagged(bio, BIO_QOS_MERGED))) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + if (q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); + } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - /* - * BIO_TRACKED lets controllers know that a bio went through the - * normal rq_qos path. - */ if (q->rq_qos) { - bio_set_flag(bio, BIO_TRACKED); + bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } } @@ -205,8 +205,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq, static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); + } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) diff --git a/block/blk-settings.c b/block/blk-settings.c index b880c70e22e4..b83df3d2eebc 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -42,7 +42,6 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; lim->max_dev_sectors = 0; lim->chunk_sectors = 0; - lim->max_write_same_sectors = 0; lim->max_write_zeroes_sectors = 0; lim->max_zone_append_sectors = 0; lim->max_discard_sectors = 0; @@ -79,7 +78,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; - lim->max_write_same_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_zone_append_sectors = UINT_MAX; } @@ -179,18 +177,6 @@ void blk_queue_max_discard_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** - * blk_queue_max_write_same_sectors - set max sectors for a single write same - * @q: the request queue for the device - * @max_write_same_sectors: maximum number of sectors to write per command - **/ -void blk_queue_max_write_same_sectors(struct request_queue *q, - unsigned int max_write_same_sectors) -{ - q->limits.max_write_same_sectors = max_write_same_sectors; -} -EXPORT_SYMBOL(blk_queue_max_write_same_sectors); - -/** * blk_queue_max_write_zeroes_sectors - set max sectors for a single * write zeroes * @q: the request queue for the device @@ -519,8 +505,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors 
= min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); - t->max_write_same_sectors = min(t->max_write_same_sectors, - b->max_write_same_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); t->max_zone_append_sectors = min(t->max_zone_append_sectors, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 241ded62f458..88bd41d4cb59 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -214,8 +214,7 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) { - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_write_same_sectors << 9); + return queue_var_show(0, page); } static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) @@ -739,27 +738,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); } -/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ -static void blk_exit_queue(struct request_queue *q) -{ - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. - */ - if (q->elevator) { - ioc_clear_queue(q); - elevator_exit(q); - } - - /* - * Remove all references to @q from the block cgroup controller before - * restoring @q->queue_lock to avoid that restoring this pointer causes - * e.g. blkcg_print_blkgs() to crash. - */ - blkcg_exit_queue(q); -} - /** * blk_release_queue - releases all allocated resources of the request_queue * @kobj: pointer to a kobject, whose container is a request_queue @@ -787,12 +765,12 @@ static void blk_release_queue(struct kobject *kobj) might_sleep(); + percpu_ref_exit(&q->q_usage_counter); + if (q->poll_stat) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); - blk_exit_queue(q); - blk_free_queue_stats(q->stats); kfree(q->poll_stat); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a3b3ebc72dd4..469c483719be 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -874,7 +874,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ - if (bps_limit == U64_MAX && iops_limit == UINT_MAX) { + if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || + tg->flags & THROTL_TG_CANCELING) { if (wait) *wait = 0; return true; @@ -1137,12 +1138,22 @@ static void throtl_pending_timer_fn(struct timer_list *t) struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); - struct request_queue *q = td->queue; struct throtl_service_queue *parent_sq; + struct request_queue *q; bool dispatched; int ret; + /* throtl_data may be gone, so figure out request queue by blkg */ + if (tg) + q = tg->pd.blkg->q; + else + q = td->queue; + spin_lock_irq(&q->queue_lock); + + if (!q->root_blkg) + goto out_unlock; + if (throtl_can_upgrade(td, NULL)) throtl_upgrade_state(td); @@ -1766,6 +1777,39 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) return false; } +void blk_throtl_cancel_bios(struct request_queue *q) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + spin_lock_irq(&q->queue_lock); + /* + * queue_lock is held, rcu lock is not 
needed here technically. + * However, rcu lock is still held to emphasize that following + * path need RCU protection and to prevent warning from lockdep. + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * Set the flag to make sure throtl_pending_timer_fn() won't + * stop until all throttled bios are dispatched. + */ + blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; + /* + * Update disptime after setting the above flag to make sure + * throtl_select_dispatch() won't exit without dispatching. + */ + tg_update_disptime(tg); + + throtl_schedule_pending_timer(sq, jiffies + 1); + } + rcu_read_unlock(); + spin_unlock_irq(&q->queue_lock); +} + static bool throtl_can_upgrade(struct throtl_data *td, struct throtl_grp *this_tg) { diff --git a/block/blk-throttle.h b/block/blk-throttle.h index b23a9f3abb82..c1b602996127 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -56,6 +56,7 @@ enum tg_state_flags { THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */ + THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ }; enum { @@ -162,11 +163,13 @@ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } +static inline void blk_throtl_cancel_bios(struct request_queue *q) { } #else /* CONFIG_BLK_DEV_THROTTLING */ int blk_throtl_init(struct request_queue *q); void blk_throtl_exit(struct request_queue *q); void blk_throtl_register_queue(struct request_queue *q); bool __blk_throtl_bio(struct bio *bio); +void blk_throtl_cancel_bios(struct request_queue *q); static inline bool blk_throtl_bio(struct bio *bio) { struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 602bef54c813..38cd840d8838 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -65,7 +65,6 @@ bool blk_req_needs_zone_write_lock(struct request *rq) switch (req_op(rq)) { case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE: return blk_rq_zone_is_seq(rq); default: diff --git a/block/blk.h b/block/blk.h index ebaa59ca46ca..8ccbc6e07636 100644 --- a/block/blk.h +++ b/block/blk.h @@ -286,7 +286,6 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio) case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: return true; /* non-trivial splitting decisions */ default: break; @@ -325,7 +324,7 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk; + return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq); } void update_io_ticks(struct block_device *part, unsigned long now, bool end); diff --git a/block/bounce.c b/block/bounce.c index 9db1256d57d5..467be46d0e65 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -177,9 +177,6 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: break; - case REQ_OP_WRITE_SAME: - bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - break; default: bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; diff 
--git a/block/elevator.c b/block/elevator.c index 9a9e52374e27..c319765892bb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -192,6 +192,9 @@ void elevator_exit(struct request_queue *q) { struct elevator_queue *e = q->elevator; + ioc_clear_queue(q); + blk_mq_sched_free_rqs(q); + mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); @@ -594,8 +597,6 @@ int elevator_switch_mq(struct request_queue *q, if (q->elevator) { elv_unregister_queue(q); - ioc_clear_queue(q); - blk_mq_sched_free_rqs(q); elevator_exit(q); } @@ -606,7 +607,6 @@ int elevator_switch_mq(struct request_queue *q, if (new_e) { ret = elv_register_queue(q, true); if (ret) { - blk_mq_sched_free_rqs(q); elevator_exit(q); goto out; } diff --git a/block/fops.c b/block/fops.c index 1c732b72de72..9f2ecec406b0 100644 --- a/block/fops.c +++ b/block/fops.c @@ -425,7 +425,8 @@ static int blkdev_writepages(struct address_space *mapping, } const struct address_space_operations def_blk_aops = { - .set_page_dirty = __set_page_dirty_buffers, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, .readpage = blkdev_readpage, .readahead = blkdev_readahead, .writepage = blkdev_writepage, diff --git a/block/genhd.c b/block/genhd.c index 11c761afd64f..c9a4fc90d3e9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,10 +25,12 @@ #include <linux/pm_runtime.h> #include <linux/badblocks.h> #include <linux/part_stat.h> +#include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-cgroup.h" static struct kobject *block_depr; @@ -410,6 +412,10 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, struct device *ddev = disk_to_dev(disk); int ret; + /* Only makes sense for bio-based to set ->poll_bio */ + if (queue_is_mq(disk->queue) && disk->fops->poll_bio) + return -EINVAL; + /* * The disk queue should now be all set with enough information about * the device for the elevator code to pick an adequate default @@ -640,7 +646,8 @@ void del_gendisk(struct gendisk *disk) blk_mq_freeze_queue_wait(q); - rq_qos_exit(q); + blk_throtl_cancel_bios(disk->queue); + blk_sync_queue(q); blk_flush_integrity(); /* @@ -1111,6 +1118,31 @@ static const struct attribute_group *disk_attr_groups[] = { NULL }; +static void disk_release_mq(struct request_queue *q) +{ + blk_mq_cancel_work_sync(q); + + /* + * There can't be any non non-passthrough bios in flight here, but + * requests stay around longer, including passthrough ones so we + * still need to freeze the queue here. + */ + blk_mq_freeze_queue(q); + + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. 
+ */ + if (q->elevator) { + mutex_lock(&q->sysfs_lock); + elevator_exit(q); + mutex_unlock(&q->sysfs_lock); + } + rq_qos_exit(q); + __blk_mq_unfreeze_queue(q, true); +} + /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk @@ -1132,11 +1164,15 @@ static void disk_release(struct device *dev) might_sleep(); WARN_ON_ONCE(disk_live(disk)); - blk_mq_cancel_work_sync(disk->queue); + if (queue_is_mq(disk->queue)) + disk_release_mq(disk->queue); + + blkcg_exit_queue(disk->queue); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); + disk->queue->disk = NULL; blk_put_queue(disk->queue); @@ -1342,6 +1378,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; + if (blkcg_init_queue(q)) + goto out_erase_part0; + rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; @@ -1354,6 +1393,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, #endif return disk; +out_erase_part0: + xa_erase(&disk->part_tbl, 0); out_destroy_part_tbl: xa_destroy(&disk->part_tbl); disk->part0->bd_disk = NULL; |
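For the cgroup-aware merging added in blk-cgroup.h and wired into blk-merge.c above, the intent is that a bio only merges into a request when both map to the same blkcg_gq and agree on bio_issue_as_root_blkg(); the !CONFIG_BLK_CGROUP stub simply returns true. A hedged sketch of a caller is shown below, assuming a file under block/ that can include the private blk-cgroup.h header; may_merge_bio() is a hypothetical name used only for illustration:

#include <linux/bio.h>
#include <linux/blk-mq.h>
#include "blk-cgroup.h"

static bool may_merge_bio(struct request *rq, struct bio *bio)
{
	/* data direction must match, as in blk_rq_merge_ok() */
	if (bio_data_dir(bio) != rq_data_dir(rq))
		return false;

	/*
	 * Don't merge across cgroup boundaries: same blkg and same
	 * issue_as_root state, per the blk_cgroup_mergeable() helper
	 * introduced in this series.
	 */
	if (!blk_cgroup_mergeable(rq, bio))
		return false;

	return true;
}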