summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-10-13 21:12:44 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2020-10-13 21:12:44 +0200
commit3ad11d7ac8872b1c8da54494721fad8907ee41f7 (patch)
tree439d7cb75466978be936250c65a27ff05e82d9bc /include
parentMerge tag 'x86_urgent_for_v5.10-rc1' of git://git.kernel.org/pub/scm/linux/ke... (diff)
parentblock: fix uapi blkzoned.h comments (diff)
downloadlinux-3ad11d7ac8872b1c8da54494721fad8907ee41f7.tar.xz
linux-3ad11d7ac8872b1c8da54494721fad8907ee41f7.zip
Merge tag 'block-5.10-2020-10-12' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: - Series of merge handling cleanups (Baolin, Christoph) - Series of blk-throttle fixes and cleanups (Baolin) - Series cleaning up BDI, separating the block device from the backing_dev_info (Christoph) - Removal of bdget() as a generic API (Christoph) - Removal of blkdev_get() as a generic API (Christoph) - Cleanup of is-partition checks (Christoph) - Series reworking disk revalidation (Christoph) - Series cleaning up bio flags (Christoph) - bio crypt fixes (Eric) - IO stats inflight tweak (Gabriel) - blk-mq tags fixes (Hannes) - Buffer invalidation fixes (Jan) - Allow soft limits for zone append (Johannes) - Shared tag set improvements (John, Kashyap) - Allow IOPRIO_CLASS_RT for CAP_SYS_NICE (Khazhismel) - DM no-wait support (Mike, Konstantin) - Request allocation improvements (Ming) - Allow md/dm/bcache to use IO stat helpers (Song) - Series improving blk-iocost (Tejun) - Various cleanups (Geert, Damien, Danny, Julia, Tetsuo, Tian, Wang, Xianting, Yang, Yufen, yangerkun) * tag 'block-5.10-2020-10-12' of git://git.kernel.dk/linux-block: (191 commits) block: fix uapi blkzoned.h comments blk-mq: move cancel of hctx->run_work to the front of blk_exit_queue blk-mq: get rid of the dead flush handle code path block: get rid of unnecessary local variable block: fix comment and add lockdep assert blk-mq: use helper function to test hw stopped block: use helper function to test queue register block: remove redundant mq check block: invoke blk_mq_exit_sched no matter whether have .exit_sched percpu_ref: don't refer to ref->data if it isn't allocated block: ratelimit handle_bad_sector() message blk-throttle: Re-use the throtl_set_slice_end() blk-throttle: Open code __throtl_de/enqueue_tg() blk-throttle: Move service tree validation out of the throtl_rb_first() blk-throttle: Move the list operation after list validation blk-throttle: Fix IO hang for a corner case blk-throttle: Avoid tracking latency if low limit is invalid blk-throttle: 
Avoid getting the current time if tg->last_finish_time is 0 blk-throttle: Remove a meaningless parameter for throtl_downgrade_state() block: Remove redundant 'return' statement ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/backing-dev.h78
-rw-r--r--include/linux/blk-crypto.h20
-rw-r--r--include/linux/blk-mq.h15
-rw-r--r--include/linux/blk_types.h7
-rw-r--r--include/linux/blkdev.h84
-rw-r--r--include/linux/device-mapper.h6
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/genhd.h15
-rw-r--r--include/linux/ide.h2
-rw-r--r--include/linux/percpu-refcount.h52
-rw-r--r--include/linux/suspend.h4
-rw-r--r--include/linux/swap.h3
-rw-r--r--include/trace/events/iocost.h67
-rw-r--r--include/uapi/linux/blkzoned.h15
-rw-r--r--include/uapi/linux/capability.h2
15 files changed, 199 insertions, 173 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0b06b2d26c9a..44df4fcef65c 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -110,33 +110,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
/*
* Flags in backing_dev_info::capability
*
- * The first three flags control whether dirty pages will contribute to the
- * VM's accounting and whether writepages() should be called for dirty pages
- * (something that would not, for example, be appropriate for ramfs)
- *
- * WARNING: these flags are closely related and should not normally be
- * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these
- * three flags into a single convenience macro.
- *
- * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting
- * BDI_CAP_NO_WRITEBACK: Don't write pages back
- * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages
- * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold.
- *
- * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
- * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be
- * inefficient.
+ * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages
+ * should contribute to accounting
+ * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages
+ * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold
*/
-#define BDI_CAP_NO_ACCT_DIRTY 0x00000001
-#define BDI_CAP_NO_WRITEBACK 0x00000002
-#define BDI_CAP_NO_ACCT_WB 0x00000004
-#define BDI_CAP_STABLE_WRITES 0x00000008
-#define BDI_CAP_STRICTLIMIT 0x00000010
-#define BDI_CAP_CGROUP_WRITEBACK 0x00000020
-#define BDI_CAP_SYNCHRONOUS_IO 0x00000040
-
-#define BDI_CAP_NO_ACCT_AND_WRITEBACK \
- (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
+#define BDI_CAP_WRITEBACK (1 << 0)
+#define BDI_CAP_WRITEBACK_ACCT (1 << 1)
+#define BDI_CAP_STRICTLIMIT (1 << 2)
extern struct backing_dev_info noop_backing_dev_info;
@@ -175,41 +156,9 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
long congestion_wait(int sync, long timeout);
long wait_iff_congested(int sync, long timeout);
-static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi)
-{
- return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO;
-}
-
-static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi)
-{
- return bdi->capabilities & BDI_CAP_STABLE_WRITES;
-}
-
-static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
-{
- return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK);
-}
-
-static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi)
-{
- return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY);
-}
-
-static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
-{
- /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */
- return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB |
- BDI_CAP_NO_WRITEBACK));
-}
-
-static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
-{
- return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host));
-}
-
-static inline bool mapping_cap_account_dirty(struct address_space *mapping)
+static inline bool mapping_can_writeback(struct address_space *mapping)
{
- return bdi_cap_account_dirty(inode_to_bdi(mapping->host));
+ return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
}
static inline int bdi_sched_wait(void *word)
@@ -233,9 +182,9 @@ int inode_congested(struct inode *inode, int cong_bits);
* inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
* @inode: inode of interest
*
- * cgroup writeback requires support from both the bdi and filesystem.
- * Also, both memcg and iocg have to be on the default hierarchy. Test
- * whether all conditions are met.
+ * Cgroup writeback requires support from the filesystem. Also, both memcg and
+ * iocg have to be on the default hierarchy. Test whether all conditions are
+ * met.
*
* Note that the test result may change dynamically on the same inode
* depending on how memcg and iocg are configured.
@@ -246,8 +195,7 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
cgroup_subsys_on_dfl(io_cgrp_subsys) &&
- bdi_cap_account_dirty(bdi) &&
- (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
+ (bdi->capabilities & BDI_CAP_WRITEBACK) &&
(inode->i_sb->s_iflags & SB_I_CGROUPWB);
}
diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h
index e82342907f2b..69b24fe92cbf 100644
--- a/include/linux/blk-crypto.h
+++ b/include/linux/blk-crypto.h
@@ -112,12 +112,24 @@ static inline bool bio_has_crypt_ctx(struct bio *bio)
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
-void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask);
-static inline void bio_crypt_clone(struct bio *dst, struct bio *src,
- gfp_t gfp_mask)
+int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask);
+/**
+ * bio_crypt_clone - clone bio encryption context
+ * @dst: destination bio
+ * @src: source bio
+ * @gfp_mask: memory allocation flags
+ *
+ * If @src has an encryption context, clone it to @dst.
+ *
+ * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if
+ * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM.
+ */
+static inline int bio_crypt_clone(struct bio *dst, struct bio *src,
+ gfp_t gfp_mask)
{
if (bio_has_crypt_ctx(src))
- __bio_crypt_clone(dst, src, gfp_mask);
+ return __bio_crypt_clone(dst, src, gfp_mask);
+ return 0;
}
#endif /* __LINUX_BLK_CRYPTO_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9d2d5ad367a4..b23eeca4d677 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -139,6 +139,10 @@ struct blk_mq_hw_ctx {
* shared across request queues.
*/
atomic_t nr_active;
+ /**
+ * @elevator_queued: Number of queued requests on hctx.
+ */
+ atomic_t elevator_queued;
/** @cpuhp_online: List to store request if CPU is going to die */
struct hlist_node cpuhp_online;
@@ -231,6 +235,9 @@ enum hctx_type {
* @flags: Zero or more BLK_MQ_F_* flags.
* @driver_data: Pointer to data owned by the block driver that created this
* tag set.
+ * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
+ * @__breserved_tags:
+ * A shared reserved tags sbitmap, used over all hctx's
* @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues
* elements.
* @tag_list_lock: Serializes tag_list accesses.
@@ -249,7 +256,10 @@ struct blk_mq_tag_set {
unsigned int timeout;
unsigned int flags;
void *driver_data;
+ atomic_t active_queues_shared_sbitmap;
+ struct sbitmap_queue __bitmap_tags;
+ struct sbitmap_queue __breserved_tags;
struct blk_mq_tags **tags;
struct mutex tag_list_lock;
@@ -378,12 +388,13 @@ struct blk_mq_ops {
enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
- BLK_MQ_F_TAG_SHARED = 1 << 1,
+ BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
/*
* Set when this device requires underlying blk-mq device for
* completing IO:
*/
BLK_MQ_F_STACKING = 1 << 2,
+ BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
BLK_MQ_F_BLOCKING = 1 << 5,
BLK_MQ_F_NO_SCHED = 1 << 6,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
@@ -489,8 +500,6 @@ void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
-bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
- struct bio *bio, unsigned int nr_segs);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index b3fc5d3dd8ea..7d7c13238fdb 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -20,7 +20,7 @@ typedef void (bio_end_io_t) (struct bio *);
struct bio_crypt_ctx;
struct block_device {
- dev_t bd_dev; /* not a kdev_t - it's a search key */
+ dev_t bd_dev;
int bd_openers;
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
@@ -37,7 +37,8 @@ struct block_device {
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
- int bd_invalidated;
+
+ spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
struct gendisk * bd_disk;
struct backing_dev_info *bd_bdi;
@@ -255,8 +256,6 @@ enum {
BIO_NO_PAGE_REF, /* don't put release vec pages */
BIO_CLONED, /* doesn't own data */
BIO_BOUNCED, /* bio is a bounce bio */
- BIO_USER_MAPPED, /* contains user pages */
- BIO_NULL_MAPPED, /* contains invalid user pages */
BIO_WORKINGSET, /* contains userspace workingset pages */
BIO_QUIET, /* Make BIO Quiet */
BIO_CHAIN, /* chained bio, ->bi_remaining in effect */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 868e11face00..1d99bf70a90a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -24,6 +24,7 @@
#include <linux/percpu-refcount.h>
#include <linux/scatterlist.h>
#include <linux/blkzoned.h>
+#include <linux/pm.h>
struct module;
struct scsi_ioctl_command;
@@ -398,6 +399,8 @@ struct request_queue {
struct request *last_merge;
struct elevator_queue *elevator;
+ struct percpu_ref q_usage_counter;
+
struct blk_queue_stats *stats;
struct rq_qos *rq_qos;
@@ -460,7 +463,7 @@ struct request_queue {
#ifdef CONFIG_PM
struct device *dev;
- int rpm_status;
+ enum rpm_status rpm_status;
unsigned int nr_pending;
#endif
@@ -486,6 +489,8 @@ struct request_queue {
struct timer_list timeout;
struct work_struct timeout_work;
+ atomic_t nr_active_requests_shared_sbitmap;
+
struct list_head icq_list;
#ifdef CONFIG_BLK_CGROUP
DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
@@ -568,7 +573,6 @@ struct request_queue {
* percpu_ref_kill() and percpu_ref_reinit().
*/
struct mutex mq_freeze_lock;
- struct percpu_ref q_usage_counter;
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
@@ -605,6 +609,7 @@ struct request_queue {
#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */
#define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */
#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */
+#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */
#define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */
#define QUEUE_FLAG_WC 17 /* Write back caching */
#define QUEUE_FLAG_FUA 18 /* device supports FUA writes */
@@ -617,9 +622,12 @@ struct request_queue {
#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */
#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */
#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
+#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
+#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
- (1 << QUEUE_FLAG_SAME_COMP))
+ (1 << QUEUE_FLAG_SAME_COMP) | \
+ (1 << QUEUE_FLAG_NOWAIT))
void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
@@ -633,6 +641,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_noxmerges(q) \
test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
+#define blk_queue_stable_writes(q) \
+ test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
@@ -659,6 +669,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_pm_only(q) atomic_read(&(q)->pm_only)
#define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
+#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);
@@ -1061,11 +1072,17 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
static inline unsigned int blk_max_size_offset(struct request_queue *q,
sector_t offset)
{
- if (!q->limits.chunk_sectors)
+ unsigned int chunk_sectors = q->limits.chunk_sectors;
+
+ if (!chunk_sectors)
return q->limits.max_sectors;
- return min(q->limits.max_sectors, (unsigned int)(q->limits.chunk_sectors -
- (offset & (q->limits.chunk_sectors - 1))));
+ if (likely(is_power_of_2(chunk_sectors)))
+ chunk_sectors -= offset & (chunk_sectors - 1);
+ else
+ chunk_sectors -= sector_div(offset, chunk_sectors);
+
+ return min(q->limits.max_sectors, chunk_sectors);
}
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
@@ -1132,6 +1149,7 @@ extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_alignment_offset(struct request_queue *q,
unsigned int alignment);
+void blk_queue_update_readahead(struct request_queue *q);
extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
@@ -1341,6 +1359,11 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
extern int blk_verify_command(unsigned char *cmd, fmode_t mode);
+static inline bool bdev_is_partition(struct block_device *bdev)
+{
+ return bdev->bd_partno;
+}
+
enum blk_default_limits {
BLK_MAX_SEGMENTS = 128,
BLK_SAFE_MAX_SECTORS = 255,
@@ -1386,7 +1409,10 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
{
- return q->limits.max_zone_append_sectors;
+
+ const struct queue_limits *l = &q->limits;
+
+ return min(l->max_zone_append_sectors, l->max_sectors);
}
static inline unsigned queue_logical_block_size(const struct request_queue *q)
@@ -1457,10 +1483,9 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
if (q->limits.misaligned)
return -1;
-
- if (bdev != bdev->bd_contains)
- return bdev->bd_part->alignment_offset;
-
+ if (bdev_is_partition(bdev))
+ return queue_limit_alignment_offset(&q->limits,
+ bdev->bd_part->start_sect);
return q->limits.alignment_offset;
}
@@ -1499,9 +1524,9 @@ static inline int bdev_discard_alignment(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
- if (bdev != bdev->bd_contains)
- return bdev->bd_part->discard_alignment;
-
+ if (bdev_is_partition(bdev))
+ return queue_limit_discard_alignment(&q->limits,
+ bdev->bd_part->start_sect);
return q->limits.discard_alignment;
}
@@ -1644,10 +1669,6 @@ extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
struct scatterlist *);
extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
-extern bool blk_integrity_merge_rq(struct request_queue *, struct request *,
- struct request *);
-extern bool blk_integrity_merge_bio(struct request_queue *, struct request *,
- struct bio *);
static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
{
@@ -1775,18 +1796,6 @@ static inline unsigned short queue_max_integrity_segments(const struct request_q
{
return 0;
}
-static inline bool blk_integrity_merge_rq(struct request_queue *rq,
- struct request *r1,
- struct request *r2)
-{
- return true;
-}
-static inline bool blk_integrity_merge_bio(struct request_queue *rq,
- struct request *r,
- struct bio *b)
-{
- return true;
-}
static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
unsigned int sectors)
@@ -1932,6 +1941,11 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
void disk_end_io_acct(struct gendisk *disk, unsigned int op,
unsigned long start_time);
+unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
+ struct bio *bio);
+void part_end_io_acct(struct hd_struct *part, struct bio *bio,
+ unsigned long start_time);
+
/**
* bio_start_io_acct - start I/O accounting for bio based drivers
* @bio: bio to start account for
@@ -1969,7 +1983,6 @@ void blkdev_show(struct seq_file *seqf, off_t offset);
#define BLKDEV_MAJOR_MAX 0
#endif
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
void *holder);
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder);
@@ -1980,17 +1993,24 @@ void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
void blkdev_put(struct block_device *bdev, fmode_t mode);
struct block_device *I_BDEV(struct inode *inode);
-struct block_device *bdget(dev_t);
+struct block_device *bdget_part(struct hd_struct *part);
struct block_device *bdgrab(struct block_device *bdev);
void bdput(struct block_device *);
#ifdef CONFIG_BLOCK
void invalidate_bdev(struct block_device *bdev);
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
+ loff_t lend);
int sync_blockdev(struct block_device *bdev);
#else
static inline void invalidate_bdev(struct block_device *bdev)
{
}
+static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+ loff_t lstart, loff_t lend)
+{
+ return 0;
+}
static inline int sync_blockdev(struct block_device *bdev)
{
return 0;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 93096e524e43..d6f8d4ba8d48 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -252,6 +252,12 @@ struct target_type {
#define DM_TARGET_ZONED_HM 0x00000040
#define dm_target_supports_zoned_hm(type) ((type)->features & DM_TARGET_ZONED_HM)
+/*
+ * A target handles REQ_NOWAIT
+ */
+#define DM_TARGET_NOWAIT 0x00000080
+#define dm_target_supports_nowait(type) ((type)->features & DM_TARGET_NOWAIT)
+
struct dm_target {
struct dm_table *table;
struct target_type *type;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 34ad5fe166a1..0b1e2f1f388b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1377,7 +1377,7 @@ extern int send_sigurg(struct fown_struct *fown);
#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */
-#define SB_I_MULTIROOT 0x00000008 /* Multiple roots to the dentry tree */
+#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */
/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 4ab853461dff..38f23d757013 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -65,8 +65,6 @@ struct hd_struct {
struct disk_stats __percpu *dkstats;
struct percpu_ref ref;
- sector_t alignment_offset;
- unsigned int discard_alignment;
struct device __dev;
struct kobject *holder_dir;
int policy, partno;
@@ -193,6 +191,8 @@ struct gendisk {
void *private_data;
int flags;
+ unsigned long state;
+#define GD_NEED_PART_SCAN 0
struct rw_semaphore lookup_sem;
struct kobject *slave_dir;
@@ -315,9 +315,8 @@ static inline int get_disk_ro(struct gendisk *disk)
extern void disk_block_events(struct gendisk *disk);
extern void disk_unblock_events(struct gendisk *disk);
extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
-extern void set_capacity_revalidate_and_notify(struct gendisk *disk,
- sector_t size, bool revalidate);
-extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask);
+void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
+ bool update_bdev);
/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk) __latent_entropy;
@@ -372,10 +371,10 @@ extern void blk_unregister_region(dev_t devt, unsigned long range);
int register_blkdev(unsigned int major, const char *name);
void unregister_blkdev(unsigned int major, const char *name);
-int revalidate_disk(struct gendisk *disk);
-int check_disk_change(struct block_device *bdev);
+void revalidate_disk_size(struct gendisk *disk, bool verbose);
+bool bdev_check_media_change(struct block_device *bdev);
int __invalidate_device(struct block_device *bdev, bool kill_dirty);
-void bd_set_size(struct block_device *bdev, loff_t size);
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors);
/* for drivers/char/raw.c: */
int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index a254841bd315..62653769509f 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -490,8 +490,6 @@ enum {
IDE_DFLAG_NOPROBE = BIT(9),
/* need to do check_media_change() */
IDE_DFLAG_REMOVABLE = BIT(10),
- /* needed for removable devices */
- IDE_DFLAG_ATTACH = BIT(11),
IDE_DFLAG_FORCED_GEOM = BIT(12),
/* disallow setting unmask bit */
IDE_DFLAG_NO_UNMASK = BIT(13),
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 87d8a38bdea1..16c35a728b4c 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -92,18 +92,30 @@ enum {
PERCPU_REF_ALLOW_REINIT = 1 << 2,
};
-struct percpu_ref {
+struct percpu_ref_data {
atomic_long_t count;
- /*
- * The low bit of the pointer indicates whether the ref is in percpu
- * mode; if set, then get/put will manipulate the atomic_t.
- */
- unsigned long percpu_count_ptr;
percpu_ref_func_t *release;
percpu_ref_func_t *confirm_switch;
bool force_atomic:1;
bool allow_reinit:1;
struct rcu_head rcu;
+ struct percpu_ref *ref;
+};
+
+struct percpu_ref {
+ /*
+ * The low bit of the pointer indicates whether the ref is in percpu
+ * mode; if set, then get/put will manipulate the atomic_t.
+ */
+ unsigned long percpu_count_ptr;
+
+ /*
+ * 'percpu_ref' is often embedded into user structure, and only
+ * 'percpu_count_ptr' is required in fast path, move other fields
+ * into 'percpu_ref_data', so we can reduce memory footprint in
+ * fast path.
+ */
+ struct percpu_ref_data *data;
};
int __must_check percpu_ref_init(struct percpu_ref *ref,
@@ -118,6 +130,7 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill);
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
+bool percpu_ref_is_zero(struct percpu_ref *ref);
/**
* percpu_ref_kill - drop the initial ref
@@ -191,7 +204,7 @@ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
if (__ref_is_percpu(ref, &percpu_count))
this_cpu_add(*percpu_count, nr);
else
- atomic_long_add(nr, &ref->count);
+ atomic_long_add(nr, &ref->data->count);
rcu_read_unlock();
}
@@ -231,7 +244,7 @@ static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
this_cpu_add(*percpu_count, nr);
ret = true;
} else {
- ret = atomic_long_add_unless(&ref->count, nr, 0);
+ ret = atomic_long_add_unless(&ref->data->count, nr, 0);
}
rcu_read_unlock();
@@ -279,7 +292,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
this_cpu_inc(*percpu_count);
ret = true;
} else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
- ret = atomic_long_inc_not_zero(&ref->count);
+ ret = atomic_long_inc_not_zero(&ref->data->count);
}
rcu_read_unlock();
@@ -305,8 +318,8 @@ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
if (__ref_is_percpu(ref, &percpu_count))
this_cpu_sub(*percpu_count, nr);
- else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
- ref->release(ref);
+ else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count)))
+ ref->data->release(ref);
rcu_read_unlock();
}
@@ -339,21 +352,4 @@ static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
return ref->percpu_count_ptr & __PERCPU_REF_DEAD;
}
-/**
- * percpu_ref_is_zero - test whether a percpu refcount reached zero
- * @ref: percpu_ref to test
- *
- * Returns %true if @ref reached zero.
- *
- * This function is safe to call as long as @ref is between init and exit.
- */
-static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
-{
- unsigned long __percpu *percpu_count;
-
- if (__ref_is_percpu(ref, &percpu_count))
- return false;
- return !atomic_long_read(&ref->count);
-}
-
#endif
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index cb9afad82a90..8af13ba60c7e 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -473,9 +473,9 @@ static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) {
#endif /* CONFIG_HIBERNATION */
#ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV
-int is_hibernate_resume_dev(const struct inode *);
+int is_hibernate_resume_dev(dev_t dev);
#else
-static inline int is_hibernate_resume_dev(const struct inode *i) { return 0; }
+static inline int is_hibernate_resume_dev(dev_t dev) { return 0; }
#endif
/* Hibernation and suspend events */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 661046994db4..4340a7b6e7a1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -467,7 +467,8 @@ extern int swapcache_prepare(swp_entry_t);
extern void swap_free(swp_entry_t);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
extern int free_swap_and_cache(swp_entry_t);
-extern int swap_type_of(dev_t, sector_t, struct block_device **);
+int swap_type_of(dev_t device, sector_t offset);
+int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h
index c2f580fd371b..0b6869980ba2 100644
--- a/include/trace/events/iocost.h
+++ b/include/trace/events/iocost.h
@@ -26,7 +26,6 @@ TRACE_EVENT(iocost_iocg_activate,
__field(u64, vrate)
__field(u64, last_period)
__field(u64, cur_period)
- __field(u64, last_vtime)
__field(u64, vtime)
__field(u32, weight)
__field(u32, inuse)
@@ -42,7 +41,6 @@ TRACE_EVENT(iocost_iocg_activate,
__entry->vrate = now->vrate;
__entry->last_period = last_period;
__entry->cur_period = cur_period;
- __entry->last_vtime = iocg->last_vtime;
__entry->vtime = vtime;
__entry->weight = iocg->weight;
__entry->inuse = iocg->inuse;
@@ -51,13 +49,12 @@ TRACE_EVENT(iocost_iocg_activate,
),
TP_printk("[%s:%s] now=%llu:%llu vrate=%llu "
- "period=%llu->%llu vtime=%llu->%llu "
+ "period=%llu->%llu vtime=%llu "
"weight=%u/%u hweight=%llu/%llu",
__get_str(devname), __get_str(cgroup),
__entry->now, __entry->vnow, __entry->vrate,
__entry->last_period, __entry->cur_period,
- __entry->last_vtime, __entry->vtime,
- __entry->inuse, __entry->weight,
+ __entry->vtime, __entry->inuse, __entry->weight,
__entry->hweight_inuse, __entry->hweight_active
)
);
@@ -98,7 +95,7 @@ DECLARE_EVENT_CLASS(iocg_inuse_update,
)
);
-DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback,
+DEFINE_EVENT(iocg_inuse_update, iocost_inuse_shortage,
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
u32 old_inuse, u32 new_inuse,
@@ -108,7 +105,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback,
old_hw_inuse, new_hw_inuse)
);
-DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway,
+DEFINE_EVENT(iocg_inuse_update, iocost_inuse_transfer,
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
u32 old_inuse, u32 new_inuse,
@@ -118,7 +115,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway,
old_hw_inuse, new_hw_inuse)
);
-DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset,
+DEFINE_EVENT(iocg_inuse_update, iocost_inuse_adjust,
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
u32 old_inuse, u32 new_inuse,
@@ -131,11 +128,9 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset,
TRACE_EVENT(iocost_ioc_vrate_adj,
TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 *missed_ppm,
- u32 rq_wait_pct, int nr_lagging, int nr_shortages,
- int nr_surpluses),
+ u32 rq_wait_pct, int nr_lagging, int nr_shortages),
- TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages,
- nr_surpluses),
+ TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages),
TP_STRUCT__entry (
__string(devname, ioc_name(ioc))
@@ -147,7 +142,6 @@ TRACE_EVENT(iocost_ioc_vrate_adj,
__field(u32, rq_wait_pct)
__field(int, nr_lagging)
__field(int, nr_shortages)
- __field(int, nr_surpluses)
),
TP_fast_assign(
@@ -160,15 +154,54 @@ TRACE_EVENT(iocost_ioc_vrate_adj,
__entry->rq_wait_pct = rq_wait_pct;
__entry->nr_lagging = nr_lagging;
__entry->nr_shortages = nr_shortages;
- __entry->nr_surpluses = nr_surpluses;
),
- TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d",
+ TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d",
__get_str(devname), __entry->old_vrate, __entry->new_vrate,
__entry->busy_level,
__entry->read_missed_ppm, __entry->write_missed_ppm,
- __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages,
- __entry->nr_surpluses
+ __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages
+ )
+);
+
+TRACE_EVENT(iocost_iocg_forgive_debt,
+
+ TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
+ u32 usage_pct, u64 old_debt, u64 new_debt,
+ u64 old_delay, u64 new_delay),
+
+ TP_ARGS(iocg, path, now, usage_pct,
+ old_debt, new_debt, old_delay, new_delay),
+
+ TP_STRUCT__entry (
+ __string(devname, ioc_name(iocg->ioc))
+ __string(cgroup, path)
+ __field(u64, now)
+ __field(u64, vnow)
+ __field(u32, usage_pct)
+ __field(u64, old_debt)
+ __field(u64, new_debt)
+ __field(u64, old_delay)
+ __field(u64, new_delay)
+ ),
+
+ TP_fast_assign(
+ __assign_str(devname, ioc_name(iocg->ioc));
+ __assign_str(cgroup, path);
+ __entry->now = now->now;
+ __entry->vnow = now->vnow;
+ __entry->usage_pct = usage_pct;
+ __entry->old_debt = old_debt;
+ __entry->new_debt = new_debt;
+ __entry->old_delay = old_delay;
+ __entry->new_delay = new_delay;
+ ),
+
+ TP_printk("[%s:%s] now=%llu:%llu usage=%u debt=%llu->%llu delay=%llu->%llu",
+ __get_str(devname), __get_str(cgroup),
+ __entry->now, __entry->vnow, __entry->usage_pct,
+ __entry->old_debt, __entry->new_debt,
+ __entry->old_delay, __entry->new_delay
)
);
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index 42c3366cc25f..656a326821a2 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -93,12 +93,15 @@ enum blk_zone_report_flags {
* @non_seq: Flag indicating that the zone is using non-sequential resources
* (for host-aware zoned block devices only).
* @reset: Flag indicating that a zone reset is recommended.
- * @reserved: Padding to 64 B to match the ZBC/ZAC defined zone descriptor size.
+ * @resv: Padding for 8B alignment.
+ * @capacity: Zone usable capacity in 512 B sector units
+ * @reserved: Padding to 64 B to match the ZBC, ZAC and ZNS defined zone
+ * descriptor size.
*
- * start, len and wp use the regular 512 B sector unit, regardless of the
- * device logical block size. The overall structure size is 64 B to match the
- * ZBC/ZAC defined zone descriptor and allow support for future additional
- * zone information.
+ * start, len, capacity and wp use the regular 512 B sector unit, regardless
+ * of the device logical block size. The overall structure size is 64 B to
+ * match the ZBC, ZAC and ZNS defined zone descriptor and allow support for
+ * future additional zone information.
*/
struct blk_zone {
__u64 start; /* Zone start sector */
@@ -118,7 +121,7 @@ struct blk_zone {
*
* @sector: starting sector of report
* @nr_zones: IN maximum / OUT actual
- * @reserved: padding to 16 byte alignment
+ * @flags: one or more flags as defined by enum blk_zone_report_flags.
* @zones: Space to hold @nr_zones @zones entries on reply.
*
* The array of at most @nr_zones must follow this structure in memory.
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 395dd0df8d08..c6ca33034147 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -288,6 +288,8 @@ struct vfs_ns_cap_data {
processes and setting the scheduling algorithm used by another
process. */
/* Allow setting cpu affinity on other processes */
+/* Allow setting realtime ioprio class */
+/* Allow setting ioprio class on other processes */
#define CAP_SYS_NICE 23