From aa8dcccaf32bfdc09f2aff089d5d60c37da5b7b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 08:05:49 +0100 Subject: block: check that there is a plug in blk_flush_plug Rename blk_flush_plug to __blk_flush_plug and add a wrapper that includes the NULL check instead of open coding that check everywhere. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220127070549.1377856-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index d93e3bb9a769..61f6a0dc4511 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -991,8 +991,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; - if (current->plug) - blk_flush_plug(current->plug, false); + blk_flush_plug(current->plug, false); if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) return 0; @@ -1274,7 +1273,7 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, } EXPORT_SYMBOL(blk_check_plugged); -void blk_flush_plug(struct blk_plug *plug, bool from_schedule) +void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); @@ -1303,7 +1302,7 @@ void blk_flush_plug(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { if (plug == current->plug) { - blk_flush_plug(plug, false); + __blk_flush_plug(plug, false); current->plug = NULL; } } -- cgit v1.2.3 From 2651bf680bc2ad9a078b7222b0873145ab4ece07 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Feb 2022 11:28:25 -0800 Subject: block: introduce BLK_STS_OFFLINE Currently, drivers reports BLK_STS_IOERR for devices that are not full online or being removed. This behavior could cause confusion for users, as they are not really I/O errors from the device. Solve this issue with a new state BLK_STS_OFFLINE, which reports "device offline error" in dmesg instead of "I/O error". EIO is intentionally kept to not change user visible return value. Signed-off-by: Song Liu Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220203192827.1370270-2-song@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + include/linux/blk_types.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 61f6a0dc4511..24035dd2eef1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -164,6 +164,7 @@ static const struct { [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, + [BLK_STS_OFFLINE] = { -EIO, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fe065c394fff..5561e58d158a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -153,6 +153,13 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +/* + * BLK_STS_OFFLINE is returned from the driver when the target device is offline + * or is being taken offline. This could help differentiate the case where a + * device is intentionally being shut down from a real I/O error. 
+ */ +#define BLK_STS_OFFLINE ((__force blk_status_t)17) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with -- cgit v1.2.3 From 7d32c027a21ef7aa0a400763397644d44b3576a9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Feb 2022 11:28:26 -0800 Subject: block: return -ENODEV for BLK_STS_OFFLINE Change the user visible return value for BLK_STS_OFFLINE to -ENODEV, which is more descriptive than existing -EIO. Signed-off-by: Song Liu Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220203192827.1370270-3-song@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 24035dd2eef1..be8812f5489d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -164,7 +164,7 @@ static const struct { [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, - [BLK_STS_OFFLINE] = { -EIO, "device offline" }, + [BLK_STS_OFFLINE] = { -ENODEV, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, -- cgit v1.2.3 From 472e4314c039d6cf36e28783b1c63f87b5b394c2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Feb 2022 18:11:48 +0800 Subject: block: move initialization of q->blkg_list into blkcg_init_queue q->blkg_list is only used by blkcg code, so move it into blkcg_init_queue. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220211101149.2368042-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 ++ block/blk-core.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 671debbae941..35deaceba1f0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1175,6 +1175,8 @@ int blkcg_init_queue(struct request_queue *q) bool preloaded; int ret; + INIT_LIST_HEAD(&q->blkg_list); + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); if (!new_blkg) return -ENOMEM; diff --git a/block/blk-core.c b/block/blk-core.c index be8812f5489d..ff972b968f25 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -476,9 +476,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); -#ifdef CONFIG_BLK_CGROUP - INIT_LIST_HEAD(&q->blkg_list); -#endif kobject_init(&q->kobj, &blk_queue_ktype); -- cgit v1.2.3 From 672fdcf0e7de3b1e39416ac85abf178f023271f1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Feb 2022 18:11:49 +0800 Subject: block: partition include/linux/blk-cgroup.h Partition include/linux/blk-cgroup.h into two parts: one is public part, the other is block layer private part. Suggested by Christoph Hellwig. 
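A minimal sketch of how the split is consumed, condensed from the include changes in the diff below (illustrative only, not part of the patch itself):

/*
 * Sketch only, condensed from the include changes below: code outside
 * the block layer keeps the public header, while block-layer internals
 * (e.g. block/bio.c, block/blk-iocost.c) switch to the private one,
 * which now carries struct blkcg_policy, blkg_lookup() and friends.
 */

/* drivers, filesystems, ...: public blkcg interface */
#include <linux/blk-cgroup.h>

/* block/ internals: blkcg policy machinery */
#include "blk-cgroup.h"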
Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220211101149.2368042-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 1 - block/bio.c | 2 +- block/blk-cgroup-rwstat.h | 2 +- block/blk-cgroup.c | 2 +- block/blk-cgroup.h | 477 ++++++++++++++++++++++++++++++++++++++++++++ block/blk-core.c | 2 +- block/blk-crypto-fallback.c | 2 +- block/blk-iocost.c | 2 +- block/blk-iolatency.c | 2 +- block/blk-ioprio.c | 2 +- block/blk-sysfs.c | 2 +- block/blk-throttle.c | 1 - block/bounce.c | 2 +- block/elevator.c | 2 +- include/linux/blk-cgroup.h | 459 +----------------------------------------- 15 files changed, 493 insertions(+), 467 deletions(-) create mode 100644 block/blk-cgroup.h (limited to 'block/blk-core.c') diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 07288b9da389..72255ec44f8f 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -8,7 +8,6 @@ #include #include -#include #include "blk-cgroup-rwstat.h" diff --git a/block/bio.c b/block/bio.c index 18d34b33351b..b15f5466ce08 100644 --- a/block/bio.c +++ b/block/bio.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -24,6 +23,7 @@ #include #include "blk.h" #include "blk-rq-qos.h" +#include "blk-cgroup.h" struct bio_alloc_cache { struct bio *free_list; diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h index ee746919c41f..9f2723b34b75 100644 --- a/block/blk-cgroup-rwstat.h +++ b/block/blk-cgroup-rwstat.h @@ -6,7 +6,7 @@ #ifndef _BLK_CGROUP_RWSTAT_H #define _BLK_CGROUP_RWSTAT_H -#include +#include "blk-cgroup.h" enum blkg_rwstat_type { BLKG_RWSTAT_READ, diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 35deaceba1f0..4108d445c73a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -26,11 +26,11 @@ #include #include #include -#include #include #include #include #include "blk.h" +#include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-throttle.h" diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h new file mode 100644 index 000000000000..3e91803c4a55 --- /dev/null +++ b/block/blk-cgroup.h @@ -0,0 +1,477 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BLK_CGROUP_PRIVATE_H +#define _BLK_CGROUP_PRIVATE_H +/* + * block cgroup private header + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include + +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + +#ifdef CONFIG_BLK_CGROUP + +/* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. + * + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. + */ +struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; +}; + +/* + * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. 
A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. + * cpd_init() is invoked to let each policy handle per-blkcg data. + */ +struct blkcg_policy_data { + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; + int plid; +}; + +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, + struct request_queue *q, struct blkcg *blkcg); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); +typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, + struct seq_file *s); + +struct blkcg_policy { + int plid; + /* cgroup files for the policy */ + struct cftype *dfl_cftypes; + struct cftype *legacy_cftypes; + + /* operations */ + blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; + blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_bind_cpd_fn *cpd_bind_fn; + + blkcg_pol_alloc_pd_fn *pd_alloc_fn; + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_free_pd_fn *pd_free_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; + blkcg_pol_stat_pd_fn *pd_stat_fn; +}; + +extern struct blkcg blkcg_root; +extern bool blkcg_debug_stats; + +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); +int blkcg_init_queue(struct request_queue *q); +void blkcg_exit_queue(struct request_queue *q); + +/* Blkio controller policy registration */ +int blkcg_policy_register(struct blkcg_policy *pol); +void blkcg_policy_unregister(struct blkcg_policy *pol); +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol); + +const char *blkg_dev_name(struct blkcg_gq *blkg); +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); + +struct blkg_conf_ctx { + struct block_device *bdev; + struct blkcg_gq *blkg; + char *body; +}; + +struct block_device *blkcg_conf_open_bdev(char **inputp); +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + char *input, struct blkg_conf_ctx *ctx); +void blkg_conf_finish(struct blkg_conf_ctx *ctx); + +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. + */ +static inline struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} + +/** + * __bio_blkcg - internal, inconsistent version to get blkcg + * + * DO NOT USE. 
+ * This function is inconsistent and consequently is dangerous to use. The + * first part of the function returns a blkcg where a reference is owned by the + * bio. This means it does not need to be rcu protected as it cannot go away + * with the bio owning a reference to it. However, the latter potentially gets + * it from task_css(). This can race against task migration and the cgroup + * dying. It is also semantically different as it must be called rcu protected + * and is susceptible to failure when trying to get a reference to it. + * Therefore, it is not ok to assume that *_get() will always succeed on the + * blkcg returned here. + */ +static inline struct blkcg *__bio_blkcg(struct bio *bio) +{ + if (bio && bio->bi_blkg) + return bio->bi_blkg->blkcg; + return css_to_blkcg(blkcg_css()); +} + +/** + * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg + * @return: true if this bio needs to be submitted with the root blkg context. + * + * In order to avoid priority inversions we sometimes need to issue a bio as if + * it were attached to the root blkg, and then backcharge to the actual owning + * blkg. The idea is we do bio_blkcg() to look up the actual context for the + * bio and attach the appropriate blkg to the bio. Then we call this helper and + * if it is true run with the root blkg for that queue and then do any + * backcharging to the originating cgroup once the io is complete. + */ +static inline bool bio_issue_as_root_blkg(struct bio *bio) +{ + return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; +} + +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. + */ +static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; + + if (blkcg == &blkcg_root) + return q->root_blkg; + + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + return blkg_lookup_slowpath(blkcg, q, update_hint); +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __blkg_lookup(blkcg, q, false); +} + +/** + * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair + * @q: request_queue of interest + * + * Lookup blkg for @q at the root level. See also blkg_lookup(). + */ +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ + return q->root_blkg; +} + +/** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) +{ + return blkg ? blkg->pd[pol->plid] : NULL; +} + +static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, + struct blkcg_policy *pol) +{ + return blkcg ? 
blkcg->cpd[pol->plid] : NULL; +} + +/** + * pdata_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) +{ + return pd ? pd->blkg : NULL; +} + +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? cpd->blkcg : NULL; +} + +/** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ +static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) +{ + return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); +} + +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. + */ +static inline void blkg_get(struct blkcg_gq *blkg) +{ + percpu_ref_get(&blkg->refcnt); +} + +/** + * blkg_tryget - try and get a blkg reference + * @blkg: blkg to get + * + * This is for use when doing an RCU lookup of the blkg. We may be in the midst + * of freeing this blkg, so we can only use it if the refcnt is not zero. + */ +static inline bool blkg_tryget(struct blkcg_gq *blkg) +{ + return blkg && percpu_ref_tryget(&blkg->refcnt); +} + +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ +static inline void blkg_put(struct blkcg_gq *blkg) +{ + percpu_ref_put(&blkg->refcnt); +} + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. + */ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +/** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. 
+ */ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q, false))) + +bool __blkcg_punt_bio_submit(struct bio *bio); + +static inline bool blkcg_punt_bio_submit(struct bio *bio) +{ + if (bio->bi_opf & REQ_CGROUP_PUNT) + return __blkcg_punt_bio_submit(bio); + else + return false; +} + +static inline void blkcg_bio_issue_init(struct bio *bio) +{ + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +} + +static inline void blkcg_use_delay(struct blkcg_gq *blkg) +{ + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; + if (atomic_add_return(1, &blkg->use_delay) == 1) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); +} + +static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + if (WARN_ON_ONCE(old < 0)) + return 0; + if (old == 0) + return 0; + + /* + * We do this song and dance because we can race with somebody else + * adding or removing delay. If we just did an atomic_dec we'd end up + * negative and we'd already be in trouble. We need to subtract 1 and + * then check to see if we were the last delay so we can drop the + * congestion count on the cgroup. + */ + while (old) { + int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); + if (cur == old) + break; + old = cur; + } + + if (old == 0) + return 0; + if (old == 1) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + return 1; +} + +/** + * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount + * @blkg: target blkg + * @delay: delay duration in nsecs + * + * When enabled with this function, the delay is not decayed and must be + * explicitly cleared with blkcg_clear_delay(). Must not be mixed with + * blkcg_[un]use_delay() and blkcg_add_delay() usages. + */ +static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person setting the congestion count for this blkg. */ + if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); + + atomic64_set(&blkg->delay_nsec, delay); +} + +/** + * blkcg_clear_delay - Disable allocator delay mechanism + * @blkg: target blkg + * + * Disable use_delay mechanism. See blkcg_set_delay(). + */ +static inline void blkcg_clear_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person clearing the congestion count for this blkg. 
*/ + if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); +} + +void blk_cgroup_bio_start(struct bio *bio); +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); +#else /* CONFIG_BLK_CGROUP */ + +struct blkg_policy_data { +}; + +struct blkcg_policy_data { +}; + +struct blkcg_policy { +}; + +#ifdef CONFIG_BLOCK + +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ return NULL; } +static inline int blkcg_init_queue(struct request_queue *q) { return 0; } +static inline void blkcg_exit_queue(struct request_queue *q) { } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } +static inline int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { return 0; } +static inline void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) { } + +static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } + +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } +static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } +static inline void blkg_get(struct blkcg_gq *blkg) { } +static inline void blkg_put(struct blkcg_gq *blkg) { } + +static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } +static inline void blkcg_bio_issue_init(struct bio *bio) { } +static inline void blk_cgroup_bio_start(struct bio *bio) { } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + +#endif /* CONFIG_BLOCK */ +#endif /* CONFIG_BLK_CGROUP */ + +#endif /* _BLK_CGROUP_PRIVATE_H */ diff --git a/block/blk-core.c b/block/blk-core.c index ff972b968f25..5a4a59041629 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -49,6 +48,7 @@ #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" +#include "blk-cgroup.h" #include "blk-throttle.h" struct dentry *blk_debugfs_root; diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c87aba8584c6..18c8eafe20b9 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -10,7 +10,6 @@ #define pr_fmt(fmt) "blk-crypto-fallback: " fmt #include -#include #include #include #include @@ -20,6 +19,7 @@ #include #include +#include "blk-cgroup.h" #include "blk-crypto-internal.h" static unsigned int num_prealloc_bounce_pg = 32; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 769b64394298..70a0a3d680a3 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -178,12 +178,12 @@ #include #include #include -#include #include #include #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" +#include "blk-cgroup.h" #ifdef CONFIG_TRACEPOINTS diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 6593c7123b97..010e658d44a8 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -74,9 +74,9 @@ #include #include #include -#include #include "blk-rq-qos.h" #include "blk-stat.h" +#include "blk-cgroup.h" #include "blk.h" #define DEFAULT_SCALE_COOKIE 1000000U diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 2e7f10e1c03f..79e797f5d194 100644 --- 
a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -12,11 +12,11 @@ * Documentation/admin-guide/cgroup-v2.rst. */ -#include #include #include #include #include +#include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9f32882ceb2f..4c6b7dff71e5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include "blk.h" @@ -18,6 +17,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-wbt.h" +#include "blk-cgroup.h" #include "blk-throttle.h" struct queue_sysfs_entry { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7c462c006b26..73640d80e99e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -10,7 +10,6 @@ #include #include #include -#include #include "blk.h" #include "blk-cgroup-rwstat.h" #include "blk-stat.h" diff --git a/block/bounce.c b/block/bounce.c index 3fd3bc6fd5db..3d50d19cde72 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -24,6 +23,7 @@ #include #include "blk.h" +#include "blk-cgroup.h" #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 diff --git a/block/elevator.c b/block/elevator.c index ec98aed39c4f..6847ab6e7aa5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -35,7 +35,6 @@ #include #include #include -#include #include @@ -44,6 +43,7 @@ #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-wbt.h" +#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bdc49bd4eef0..f2ad8ed8f777 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -25,12 +25,8 @@ #include #include -/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ -#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) - #define FC_APPID_LEN 129 - #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { @@ -42,6 +38,7 @@ enum blkg_iostat_type { }; struct blkcg_gq; +struct blkg_policy_data; struct blkcg { struct cgroup_subsys_state css; @@ -74,36 +71,6 @@ struct blkg_iostat_set { struct blkg_iostat last; }; -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each blkg:policy pair is - * represented by a blkg_policy_data which is allocated and freed by each - * policy's pd_alloc/free_fn() methods. A policy can allocate private data - * area by allocating larger data structure which embeds blkg_policy_data - * at the beginning. - */ -struct blkg_policy_data { - /* the blkg and policy id this per-policy data belongs to */ - struct blkcg_gq *blkg; - int plid; -}; - -/* - * Policies that need to keep per-blkcg data which is independent from any - * request_queue associated to it should implement cpd_alloc/free_fn() - * methods. A policy can allocate private data area by allocating larger - * data structure which embeds blkcg_policy_data at the beginning. - * cpd_init() is invoked to let each policy handle per-blkcg data. 
- */ -struct blkcg_policy_data { - /* the blkcg and policy id this per-policy data belongs to */ - struct blkcg *blkcg; - int plid; -}; - /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ @@ -139,120 +106,17 @@ struct blkcg_gq { struct rcu_head rcu_head; }; -typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); -typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); -typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); -typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); -typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, - struct request_queue *q, struct blkcg *blkcg); -typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); -typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, - struct seq_file *s); - -struct blkcg_policy { - int plid; - /* cgroup files for the policy */ - struct cftype *dfl_cftypes; - struct cftype *legacy_cftypes; - - /* operations */ - blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; - blkcg_pol_init_cpd_fn *cpd_init_fn; - blkcg_pol_free_cpd_fn *cpd_free_fn; - blkcg_pol_bind_cpd_fn *cpd_bind_fn; - - blkcg_pol_alloc_pd_fn *pd_alloc_fn; - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_online_pd_fn *pd_online_fn; - blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_free_pd_fn *pd_free_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; - blkcg_pol_stat_pd_fn *pd_stat_fn; -}; - -extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; -extern bool blkcg_debug_stats; - -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint); -int blkcg_init_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -const char *blkg_dev_name(struct blkcg_gq *blkg); -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); - -struct blkg_conf_ctx { - struct block_device *bdev; - struct blkcg_gq *blkg; - char *body; -}; - -struct block_device *blkcg_conf_open_bdev(char **inputp); -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); -/** - * blkcg_css - find the current css - * - * Find the css associated with either the kthread or the current task. - * This may return a dying css, so it is up to the caller to use tryget logic - * to confirm it is alive and well. 
- */ -static inline struct cgroup_subsys_state *blkcg_css(void) -{ - struct cgroup_subsys_state *css; - - css = kthread_blkcg(); - if (css) - return css; - return task_css(current, io_cgrp_id); -} +void blkcg_destroy_blkgs(struct blkcg *blkcg); +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_maybe_throttle_current(void); static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } -/** - * __bio_blkcg - internal, inconsistent version to get blkcg - * - * DO NOT USE. - * This function is inconsistent and consequently is dangerous to use. The - * first part of the function returns a blkcg where a reference is owned by the - * bio. This means it does not need to be rcu protected as it cannot go away - * with the bio owning a reference to it. However, the latter potentially gets - * it from task_css(). This can race against task migration and the cgroup - * dying. It is also semantically different as it must be called rcu protected - * and is susceptible to failure when trying to get a reference to it. - * Therefore, it is not ok to assume that *_get() will always succeed on the - * blkcg returned here. - */ -static inline struct blkcg *__bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return css_to_blkcg(blkcg_css()); -} - /** * bio_blkcg - grab the blkcg associated with a bio * @bio: target bio @@ -288,22 +152,6 @@ static inline bool blk_cgroup_congested(void) return ret; } -/** - * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg - * @return: true if this bio needs to be submitted with the root blkg context. - * - * In order to avoid priority inversions we sometimes need to issue a bio as if - * it were attached to the root blkg, and then backcharge to the actual owning - * blkg. The idea is we do bio_blkcg() to look up the actual context for the - * bio and attach the appropriate blkg to the bio. Then we call this helper and - * if it is true run with the root blkg for that queue and then do any - * backcharging to the originating cgroup once the io is complete. - */ -static inline bool bio_issue_as_root_blkg(struct bio *bio) -{ - return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; -} - /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -315,96 +163,6 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) return css_to_blkcg(blkcg->css.parent); } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q, - bool update_hint) -{ - struct blkcg_gq *blkg; - - if (blkcg == &blkcg_root) - return q->root_blkg; - - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - - return blkg_lookup_slowpath(blkcg, q, update_hint); -} - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. 
This function should be called - * under RCU read lock. - */ -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __blkg_lookup(blkcg, q, false); -} - -/** - * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair - * @q: request_queue of interest - * - * Lookup blkg for @q at the root level. See also blkg_lookup(). - */ -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ - return q->root_blkg; -} - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? blkg->pd[pol->plid] : NULL; -} - -static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, - struct blkcg_policy *pol) -{ - return blkcg ? blkcg->cpd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) -{ - return cpd ? cpd->blkcg : NULL; -} - -extern void blkcg_destroy_blkgs(struct blkcg *blkcg); - /** * blkcg_pin_online - pin online state * @blkcg: blkcg of interest @@ -437,231 +195,24 @@ static inline void blkcg_unpin_online(struct blkcg *blkcg) } while (blkcg); } -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - percpu_ref_get(&blkg->refcnt); -} - -/** - * blkg_tryget - try and get a blkg reference - * @blkg: blkg to get - * - * This is for use when doing an RCU lookup of the blkg. We may be in the midst - * of freeing this blkg, so we can only use it if the refcnt is not zero. - */ -static inline bool blkg_tryget(struct blkcg_gq *blkg) -{ - return blkg && percpu_ref_tryget(&blkg->refcnt); -} - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - percpu_ref_put(&blkg->refcnt); -} - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_css by calling css_rightmost_descendant() to skip subtree. - * @p_blkg is included in the iteration and the first node to be visited. 
- */ -#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blkg_for_each_descendant_post - post-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. @p_blkg is - * included in the iteration and the last node to be visited. - */ -#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -bool __blkcg_punt_bio_submit(struct bio *bio); - -static inline bool blkcg_punt_bio_submit(struct bio *bio) -{ - if (bio->bi_opf & REQ_CGROUP_PUNT) - return __blkcg_punt_bio_submit(bio); - else - return false; -} - -static inline void blkcg_bio_issue_init(struct bio *bio) -{ - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -} - -static inline void blkcg_use_delay(struct blkcg_gq *blkg) -{ - if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) - return; - if (atomic_add_return(1, &blkg->use_delay) == 1) - atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); -} - -static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) -{ - int old = atomic_read(&blkg->use_delay); - - if (WARN_ON_ONCE(old < 0)) - return 0; - if (old == 0) - return 0; - - /* - * We do this song and dance because we can race with somebody else - * adding or removing delay. If we just did an atomic_dec we'd end up - * negative and we'd already be in trouble. We need to subtract 1 and - * then check to see if we were the last delay so we can drop the - * congestion count on the cgroup. - */ - while (old) { - int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); - if (cur == old) - break; - old = cur; - } - - if (old == 0) - return 0; - if (old == 1) - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); - return 1; -} - -/** - * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount - * @blkg: target blkg - * @delay: delay duration in nsecs - * - * When enabled with this function, the delay is not decayed and must be - * explicitly cleared with blkcg_clear_delay(). Must not be mixed with - * blkcg_[un]use_delay() and blkcg_add_delay() usages. - */ -static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) -{ - int old = atomic_read(&blkg->use_delay); - - /* We only want 1 person setting the congestion count for this blkg. */ - if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) - atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); - - atomic64_set(&blkg->delay_nsec, delay); -} - -/** - * blkcg_clear_delay - Disable allocator delay mechanism - * @blkg: target blkg - * - * Disable use_delay mechanism. See blkcg_set_delay(). - */ -static inline void blkcg_clear_delay(struct blkcg_gq *blkg) -{ - int old = atomic_read(&blkg->use_delay); - - /* We only want 1 person clearing the congestion count for this blkg. 
*/ - if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); -} - -void blk_cgroup_bio_start(struct bio *bio); -void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); -void blkcg_maybe_throttle_current(void); #else /* CONFIG_BLK_CGROUP */ struct blkcg { }; -struct blkg_policy_data { -}; - -struct blkcg_policy_data { -}; - struct blkcg_gq { }; -struct blkcg_policy { -}; - #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } #ifdef CONFIG_BLOCK - static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } - -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { } - -static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } +#endif /* CONFIG_BLOCK */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } -static inline void blkcg_bio_issue_init(struct bio *bio) { } -static inline void blk_cgroup_bio_start(struct bio *bio) { } - -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) - -#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_BLK_CGROUP_FC_APPID -- cgit v1.2.3 From a650628bde77f6ac5b1d532092346feff7b58c52 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:07 +0800 Subject: block: move submit_bio_checks() into submit_bio_noacct It is more clean & readable to check bio when starting to submit it, instead of just before calling ->submit_bio() or blk_mq_submit_bio(). Also it provides us chance to optimize bio submission without checking bio. 
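A condensed sketch of the resulting call flow (illustrative, matching the hunks below):

/*
 * Sketch of the flow after this change: the bio is validated once when
 * submission starts, and the dispatch path no longer re-checks it.
 *
 *   submit_bio(bio)
 *     submit_bio_noacct(bio)            // submit_bio_checks() now runs here
 *       __submit_bio(bio)               // check dropped from the fast path
 *         blk_mq_submit_bio(bio)        // blk-mq based drivers
 *         disk->fops->submit_bio(bio)   // bio based drivers
 */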
Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220216044514.2903784-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 5a4a59041629..d4a023667ac1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -797,9 +797,6 @@ static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - if (unlikely(!submit_bio_checks(bio))) - return; - if (!disk->fops->submit_bio) blk_mq_submit_bio(bio); else @@ -893,6 +890,9 @@ static void __submit_bio_noacct_mq(struct bio *bio) */ void submit_bio_noacct(struct bio *bio) { + if (unlikely(!submit_bio_checks(bio))) + return; + /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list -- cgit v1.2.3 From 7f36b7d02a287ed18d02ae821868aa07b0235521 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:08 +0800 Subject: block: move blk_crypto_bio_prep() out of blk-mq.c blk_crypto_bio_prep() is called for both bio based and blk-mq drivers, so move it out of blk-mq.c, then we can unify this kind of handling. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220216044514.2903784-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 21 ++++++++------------- block/blk-mq.c | 3 --- 2 files changed, 8 insertions(+), 16 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index d4a023667ac1..f03fff1fa391 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -783,24 +783,19 @@ end_io: return false; } -static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) -{ - if (blk_crypto_bio_prep(&bio)) { - if (likely(bio_queue_enter(bio) == 0)) { - disk->fops->submit_bio(bio); - blk_queue_exit(disk->queue); - } - } -} - static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - if (!disk->fops->submit_bio) + if (unlikely(!blk_crypto_bio_prep(&bio))) + return; + + if (!disk->fops->submit_bio) { blk_mq_submit_bio(bio); - else - __submit_bio_fops(disk, bio); + } else if (likely(bio_queue_enter(bio) == 0)) { + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); + } } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 886836a54064..7ca0b47246a6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2788,9 +2788,6 @@ void blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs = 1; blk_status_t ret; - if (unlikely(!blk_crypto_bio_prep(&bio))) - return; - blk_queue_bounce(q, &bio); if (blk_may_split(q, bio)) __blk_queue_split(q, &bio, &nr_segs); -- cgit v1.2.3 From 29ff23624e21c89d3321d6429dec8ad3847b534a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:09 +0800 Subject: block: don't declare submit_bio_checks in local header submit_bio_checks() won't be called outside of block/blk-core.c any more since commit 9d497e2941c3 ("block: don't protect submit_bio_checks by q_usage_counter"), so mark it as one local helper. 
Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220216044514.2903784-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index f03fff1fa391..5248b94d276b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -676,7 +676,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -noinline_for_stack bool submit_bio_checks(struct bio *bio) +static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); diff --git a/block/blk.h b/block/blk.h index abb663a2a147..b2516cb4f98e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -46,7 +46,6 @@ void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); -bool submit_bio_checks(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { -- cgit v1.2.3 From 3f98c753717c600eb5708e9b78b3eba6664bddf1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:10 +0800 Subject: block: don't check bio in blk_throtl_dispatch_work_fn The bio has been checked already before throttling, so no need to check it again before dispatching it from throttle queue. Add a helper of submit_bio_noacct_nocheck() for this purpose. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220216044514.2903784-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 30 +++++++++++++++++------------- block/blk-throttle.c | 2 +- block/blk.h | 1 + 3 files changed, 19 insertions(+), 14 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 5248b94d276b..72b7b2214c70 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -874,20 +874,8 @@ static void __submit_bio_noacct_mq(struct bio *bio) current->bio_list = NULL; } -/** - * submit_bio_noacct - re-submit a bio to the block device layer for I/O - * @bio: The bio describing the location in memory and on the device. - * - * This is a version of submit_bio() that shall only be used for I/O that is - * resubmitted to lower level drivers by stacking block drivers. All file - * systems and other upper level users of the block layer should use - * submit_bio() instead. - */ -void submit_bio_noacct(struct bio *bio) +void submit_bio_noacct_nocheck(struct bio *bio) { - if (unlikely(!submit_bio_checks(bio))) - return; - /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list @@ -901,6 +889,22 @@ void submit_bio_noacct(struct bio *bio) else __submit_bio_noacct(bio); } + +/** + * submit_bio_noacct - re-submit a bio to the block device layer for I/O + * @bio: The bio describing the location in memory and on the device. + * + * This is a version of submit_bio() that shall only be used for I/O that is + * resubmitted to lower level drivers by stacking block drivers. All file + * systems and other upper level users of the block layer should use + * submit_bio() instead. 
+ */ +void submit_bio_noacct(struct bio *bio) +{ + if (unlikely(!submit_bio_checks(bio))) + return; + submit_bio_noacct_nocheck(bio); +} EXPORT_SYMBOL(submit_bio_noacct); /** diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 73640d80e99e..8770768f1000 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1218,7 +1218,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) - submit_bio_noacct(bio); + submit_bio_noacct_nocheck(bio); blk_finish_plug(&plug); } } diff --git a/block/blk.h b/block/blk.h index b2516cb4f98e..ebaa59ca46ca 100644 --- a/block/blk.h +++ b/block/blk.h @@ -46,6 +46,7 @@ void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); +void submit_bio_noacct_nocheck(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { -- cgit v1.2.3 From d24c670ec1f9f1dc320e59004e61f3491ae24546 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:11 +0800 Subject: block: merge submit_bio_checks() into submit_bio_noacct Now submit_bio_checks() is only called by submit_bio_noacct(), so merge it into submit_bio_noacct(). Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220216044514.2903784-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 209 +++++++++++++++++++++++++++---------------------------- 1 file changed, 101 insertions(+), 108 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 72b7b2214c70..94bf37f8e61d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -676,113 +676,6 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -static noinline_for_stack bool submit_bio_checks(struct bio *bio) -{ - struct block_device *bdev = bio->bi_bdev; - struct request_queue *q = bdev_get_queue(bdev); - blk_status_t status = BLK_STS_IOERR; - struct blk_plug *plug; - - might_sleep(); - - plug = blk_mq_plug(q, bio); - if (plug && plug->nowait) - bio->bi_opf |= REQ_NOWAIT; - - /* - * For a REQ_NOWAIT based request, return -EOPNOTSUPP - * if queue does not support NOWAIT. - */ - if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) - goto not_supported; - - if (should_fail_bio(bio)) - goto end_io; - if (unlikely(bio_check_ro(bio))) - goto end_io; - if (!bio_flagged(bio, BIO_REMAPPED)) { - if (unlikely(bio_check_eod(bio))) - goto end_io; - if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) - goto end_io; - } - - /* - * Filter flush bio's early so that bio based drivers without flush - * support don't have to worry about them. 
- */ - if (op_is_flush(bio->bi_opf) && - !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); - if (!bio_sectors(bio)) { - status = BLK_STS_OK; - goto end_io; - } - } - - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - bio_clear_polled(bio); - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - if (!blk_queue_discard(q)) - goto not_supported; - break; - case REQ_OP_SECURE_ERASE: - if (!blk_queue_secure_erase(q)) - goto not_supported; - break; - case REQ_OP_WRITE_SAME: - if (!q->limits.max_write_same_sectors) - goto not_supported; - break; - case REQ_OP_ZONE_APPEND: - status = blk_check_zone_append(q, bio); - if (status != BLK_STS_OK) - goto end_io; - break; - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_OPEN: - case REQ_OP_ZONE_CLOSE: - case REQ_OP_ZONE_FINISH: - if (!blk_queue_is_zoned(q)) - goto not_supported; - break; - case REQ_OP_ZONE_RESET_ALL: - if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) - goto not_supported; - break; - case REQ_OP_WRITE_ZEROES: - if (!q->limits.max_write_zeroes_sectors) - goto not_supported; - break; - default: - break; - } - - if (blk_throtl_bio(bio)) - return false; - - blk_cgroup_bio_start(bio); - blkcg_bio_issue_init(bio); - - if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_queue(bio); - /* Now that enqueuing has been traced, we need to trace - * completion as well. - */ - bio_set_flag(bio, BIO_TRACE_COMPLETION); - } - return true; - -not_supported: - status = BLK_STS_NOTSUPP; -end_io: - bio->bi_status = status; - bio_endio(bio); - return false; -} - static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; @@ -901,9 +794,109 @@ void submit_bio_noacct_nocheck(struct bio *bio) */ void submit_bio_noacct(struct bio *bio) { - if (unlikely(!submit_bio_checks(bio))) + struct block_device *bdev = bio->bi_bdev; + struct request_queue *q = bdev_get_queue(bdev); + blk_status_t status = BLK_STS_IOERR; + struct blk_plug *plug; + + might_sleep(); + + plug = blk_mq_plug(q, bio); + if (plug && plug->nowait) + bio->bi_opf |= REQ_NOWAIT; + + /* + * For a REQ_NOWAIT based request, return -EOPNOTSUPP + * if queue does not support NOWAIT. + */ + if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) + goto not_supported; + + if (should_fail_bio(bio)) + goto end_io; + if (unlikely(bio_check_ro(bio))) + goto end_io; + if (!bio_flagged(bio, BIO_REMAPPED)) { + if (unlikely(bio_check_eod(bio))) + goto end_io; + if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) + goto end_io; + } + + /* + * Filter flush bio's early so that bio based drivers without flush + * support don't have to worry about them. 
+	 */
+	if (op_is_flush(bio->bi_opf) &&
+	    !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
+		bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
+		if (!bio_sectors(bio)) {
+			status = BLK_STS_OK;
+			goto end_io;
+		}
+	}
+
+	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+		bio_clear_polled(bio);
+
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+		if (!blk_queue_discard(q))
+			goto not_supported;
+		break;
+	case REQ_OP_SECURE_ERASE:
+		if (!blk_queue_secure_erase(q))
+			goto not_supported;
+		break;
+	case REQ_OP_WRITE_SAME:
+		if (!q->limits.max_write_same_sectors)
+			goto not_supported;
+		break;
+	case REQ_OP_ZONE_APPEND:
+		status = blk_check_zone_append(q, bio);
+		if (status != BLK_STS_OK)
+			goto end_io;
+		break;
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_OPEN:
+	case REQ_OP_ZONE_CLOSE:
+	case REQ_OP_ZONE_FINISH:
+		if (!blk_queue_is_zoned(q))
+			goto not_supported;
+		break;
+	case REQ_OP_ZONE_RESET_ALL:
+		if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
+			goto not_supported;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		if (!q->limits.max_write_zeroes_sectors)
+			goto not_supported;
+		break;
+	default:
+		break;
+	}
+
+	if (blk_throtl_bio(bio))
 		return;
+
+	blk_cgroup_bio_start(bio);
+	blkcg_bio_issue_init(bio);
+
+	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+		trace_block_bio_queue(bio);
+		/* Now that enqueuing has been traced, we need to trace
+		 * completion as well.
+		 */
+		bio_set_flag(bio, BIO_TRACE_COMPLETION);
+	}
 	submit_bio_noacct_nocheck(bio);
+	return;
+
+not_supported:
+	status = BLK_STS_NOTSUPP;
+end_io:
+	bio->bi_status = status;
+	bio_endio(bio);
 }
 EXPORT_SYMBOL(submit_bio_noacct);

-- cgit v1.2.3


From 57e95e4670d1126c103305bcf34a9442f49f6d6a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 4 Mar 2022 19:00:56 +0100
Subject: block: fix and cleanup bio_check_ro

Don't use a WARN_ON when printing a potentially user-triggered
condition. Also don't print the partno when the block device name
already includes it, and use the %pg specifier to simplify printing
the block device name.

Signed-off-by: Christoph Hellwig
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20220304180105.409765-2-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 94bf37f8e61d..34e1b7fdb7c8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -580,14 +580,10 @@ late_initcall(fail_make_request_debugfs);
 static inline bool bio_check_ro(struct bio *bio)
 {
 	if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
-		char b[BDEVNAME_SIZE];
-
 		if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
 			return false;
-
-		WARN_ONCE(1,
-		       "Trying to write to read-only block-device %s (partno %d)\n",
-			bio_devname(bio, b), bio->bi_bdev->bd_partno);
+		pr_warn("Trying to write to read-only block-device %pg\n",
+			bio->bi_bdev);
 		/* Older lvm-tools actually trigger this */
 		return false;
 	}
-- cgit v1.2.3


From ad740780bbc2fe37856f944dbbaff07aac9db9e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 4 Mar 2022 19:00:57 +0100
Subject: block: remove handle_bad_sector

Use the %pg format specifier instead of the stack-hungry bdevname
function, and remove handle_bad_sector given that it is now pointless.
Signed-off-by: Christoph Hellwig
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Johannes Thumshirn
Link: https://lore.kernel.org/r/20220304180105.409765-3-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 34e1b7fdb7c8..4d858fc08f8b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -540,17 +540,6 @@ bool blk_get_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_get_queue);
 
-static void handle_bad_sector(struct bio *bio, sector_t maxsector)
-{
-	char b[BDEVNAME_SIZE];
-
-	pr_info_ratelimited("%s: attempt to access beyond end of device\n"
-			    "%s: rw=%d, want=%llu, limit=%llu\n",
-			    current->comm,
-			    bio_devname(bio, b), bio->bi_opf,
-			    bio_end_sector(bio), maxsector);
-}
-
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 
 static DECLARE_FAULT_ATTR(fail_make_request);
@@ -612,7 +601,11 @@ static inline int bio_check_eod(struct bio *bio)
 	if (nr_sectors && maxsector &&
 	    (nr_sectors > maxsector ||
 	     bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
-		handle_bad_sector(bio, maxsector);
+		pr_info_ratelimited("%s: attempt to access beyond end of device\n"
+				    "%pg: rw=%d, want=%llu, limit=%llu\n",
+				    current->comm,
+				    bio->bi_bdev, bio->bi_opf,
+				    bio_end_sector(bio), maxsector);
 		return -EIO;
 	}
 	return 0;
-- cgit v1.2.3


From 1059699f87eb0b3aa9d574b91a572d534897134a Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 8 Mar 2022 06:51:55 +0100
Subject: block: move blkcg initialization/destroy into disk allocation/release handler

blkcg works on the FS bio level, so it is reasonable to have blkcg and
gendisk share the same lifetime. Meanwhile there won't be any FS IO
when the disk is being released, so it is safe to move blkcg
initialization/destroy into the disk allocation/release handlers.

Long term, we can move blkcg into gendisk completely.

Signed-off-by: Ming Lei
Signed-off-by: Christoph Hellwig
Reviewed-by: Bart Van Assche
Reviewed-by: Chaitanya Kulkarni
Link: https://lore.kernel.org/r/20220308055200.735835-10-hch@lst.de
[axboe: fixup missing blk-cgroup.h include]
Signed-off-by: Jens Axboe
---
 block/blk-core.c  | 5 -----
 block/blk-sysfs.c | 7 -------
 block/genhd.c     | 9 +++++++++
 3 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 4d858fc08f8b..3fa2f08d3750 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -496,17 +496,12 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 			PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
 		goto fail_stats;
 
-	if (blkcg_init_queue(q))
-		goto fail_ref;
-
 	blk_queue_dma_alignment(q, 511);
 	blk_set_default_limits(&q->limits);
 	q->nr_requests = BLKDEV_DEFAULT_RQ;
 
 	return q;
 
-fail_ref:
-	percpu_ref_exit(&q->q_usage_counter);
 fail_stats:
 	blk_free_queue_stats(q->stats);
 fail_split:
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 241ded62f458..220085109d7f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -751,13 +751,6 @@ static void blk_exit_queue(struct request_queue *q)
 		ioc_clear_queue(q);
 		elevator_exit(q);
 	}
-
-	/*
-	 * Remove all references to @q from the block cgroup controller before
-	 * restoring @q->queue_lock to avoid that restoring this pointer causes
-	 * e.g. blkcg_print_blkgs() to crash.
-	 */
-	blkcg_exit_queue(q);
 }
 
 /**
diff --git a/block/genhd.c b/block/genhd.c
index 54f60ded2ee6..fc10aedab209 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -29,6 +29,7 @@
 #include "blk.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
+#include "blk-cgroup.h"
 
 static struct kobject *block_depr;
 
@@ -1120,9 +1121,12 @@ static void disk_release(struct device *dev)
 
 	blk_mq_cancel_work_sync(disk->queue);
 
+	blkcg_exit_queue(disk->queue);
+
 	disk_release_events(disk);
 	kfree(disk->random);
 	xa_destroy(&disk->part_tbl);
+
 	disk->queue->disk = NULL;
 	blk_put_queue(disk->queue);
 
@@ -1328,6 +1332,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
 		goto out_destroy_part_tbl;
 
+	if (blkcg_init_queue(q))
+		goto out_erase_part0;
+
 	rand_initialize_disk(disk);
 	disk_to_dev(disk)->class = &block_class;
 	disk_to_dev(disk)->type = &disk_type;
@@ -1340,6 +1347,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 #endif
 	return disk;
 
+out_erase_part0:
+	xa_erase(&disk->part_tbl, 0);
 out_destroy_part_tbl:
 	xa_destroy(&disk->part_tbl);
 	disk->part0->bd_disk = NULL;
-- cgit v1.2.3


From ba3e845665fbbb0252336f27200cd5cf288a3573 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 8 Mar 2022 06:51:57 +0100
Subject: block: move q_usage_counter release into blk_queue_release

After blk_cleanup_queue() returns, the disk may not have been released
yet, so bios may still be submitted and ->q_usage_counter may still be
touched. So far this appears to be safe, but it is not good from the
API's point of view.

Move the release of q_usage_counter into blk_release_queue().

Signed-off-by: Ming Lei
Reviewed-by: Bart Van Assche
Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20220308055200.735835-12-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-core.c  | 2 --
 block/blk-sysfs.c | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'block/blk-core.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 3fa2f08d3750..a97918d107a0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -342,8 +342,6 @@ void blk_cleanup_queue(struct request_queue *q)
 		blk_mq_sched_free_rqs(q);
 	mutex_unlock(&q->sysfs_lock);
 
-	percpu_ref_exit(&q->q_usage_counter);
-
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 220085109d7f..af5a6d86073f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -780,6 +780,8 @@ static void blk_release_queue(struct kobject *kobj)
 
 	might_sleep();
 
+	percpu_ref_exit(&q->q_usage_counter);
+
 	if (q->poll_stat)
 		blk_stat_remove_callback(q, q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);
-- cgit v1.2.3
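
Taken together, the last two patches apply the same lifetime rule: the cleanup
path (blk_cleanup_queue()) only stops new users, while state that a late holder
can still reach is torn down in the release handler (disk_release(),
blk_release_queue()), once the final reference is gone. Below is a minimal,
self-contained userspace sketch of that ordering; struct resource and its
helpers are hypothetical stand-ins used only for illustration, not kernel APIs,
and the kernel functions named in the comments are referenced by analogy only.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct resource {
	int refs;	/* object references, like the queue kobject */
	bool draining;	/* set by cleanup, like the queue dying flag  */
};

/* Refuse new users once draining has started (cf. bio_queue_enter()). */
static bool resource_enter(struct resource *r)
{
	return !r->draining;
}

/* Cleanup only stops new users; nothing is freed here. */
static void resource_cleanup(struct resource *r)
{
	r->draining = true;
}

/* Release runs once, after the last reference has been dropped. */
static void resource_release(struct resource *r)
{
	printf("release: final teardown of the usage counter\n");
	free(r);
}

static void resource_put(struct resource *r)
{
	if (--r->refs == 0)
		resource_release(r);
}

int main(void)
{
	struct resource *r = calloc(1, sizeof(*r));

	if (!r)
		return 1;
	r->refs = 2;				/* creator + one late holder   */
	printf("enter before cleanup: %d\n", resource_enter(r));
	resource_cleanup(r);			/* cf. blk_cleanup_queue()     */
	printf("enter after cleanup:  %d\n", resource_enter(r));
	resource_put(r);			/* creator drops its reference */
	resource_put(r);			/* last ref gone -> release    */
	return 0;
}

Destroying the counter in resource_release() rather than in resource_cleanup()
mirrors moving percpu_ref_exit() out of blk_cleanup_queue() and into
blk_release_queue().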