summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst2
-rw-r--r--Documentation/admin-guide/sysctl/fs.rst5
-rw-r--r--Documentation/filesystems/nfs/exporting.rst7
-rw-r--r--MAINTAINERS1
-rw-r--r--fs/aio.c1
-rw-r--r--fs/btrfs/extent_io.c7
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/coredump.c1
-rw-r--r--fs/dcache.c16
-rw-r--r--fs/eventpoll.c11
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/f2fs/data.c9
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/freevxfs/vxfs_dir.h2
-rw-r--r--fs/fs-writeback.c40
-rw-r--r--fs/fs_parser.c1
-rw-r--r--fs/gfs2/export.c1
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h3
-rw-r--r--fs/hfsplus/wrapper.c2
-rw-r--r--fs/hugetlbfs/inode.c17
-rw-r--r--fs/inode.c29
-rw-r--r--fs/iomap/buffered-io.c2
-rw-r--r--fs/lockd/svclock.c7
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/namei.c41
-rw-r--r--fs/namespace.c161
-rw-r--r--fs/nfsd/nfs4state.c19
-rw-r--r--fs/ocfs2/export.c1
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/posix_acl.c13
-rw-r--r--fs/seq_file.c2
-rw-r--r--include/linux/eventpoll.h2
-rw-r--r--include/linux/exportfs.h13
-rw-r--r--include/linux/filelock.h5
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/posix_acl.h6
-rw-r--r--include/linux/wait.h1
-rw-r--r--include/linux/writeback.h32
-rw-r--r--include/trace/events/hugetlbfs.h156
-rw-r--r--include/uapi/linux/mount.h14
-rw-r--r--init/initramfs.c15
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/page-writeback.c4
46 files changed, 531 insertions, 141 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 6d02168d78be..2cb58daf3089 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2954,7 +2954,7 @@ following two functions.
a queue (device) has been associated with the bio and
before submission.
- wbc_account_cgroup_owner(@wbc, @page, @bytes)
+ wbc_account_cgroup_owner(@wbc, @folio, @bytes)
Should be called for each data segment being written out.
While this function doesn't care exactly when it's called
during the writeback session, it's the easiest and most
diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst
index 47499a1742bd..30c61474dec5 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -38,6 +38,11 @@ requests. ``aio-max-nr`` allows you to change the maximum value
``aio-max-nr`` does not result in the
pre-allocation or re-sizing of any kernel data structures.
+dentry-negative
+----------------------------
+
+Policy for negative dentries. Set to 1 to to always delete the dentry when a
+file is removed, and 0 to disable it. By default, this behavior is disabled.
dentry-state
------------
diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst
index f04ce1215a03..de64d2d002a2 100644
--- a/Documentation/filesystems/nfs/exporting.rst
+++ b/Documentation/filesystems/nfs/exporting.rst
@@ -238,10 +238,3 @@ following flags are defined:
all of an inode's dirty data on last close. Exports that behave this
way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip
waiting for writeback when closing such files.
-
- EXPORT_OP_ASYNC_LOCK - Indicates a capable filesystem to do async lock
- requests from lockd. Only set EXPORT_OP_ASYNC_LOCK if the filesystem has
- it's own ->lock() functionality as core posix_lock_file() implementation
- has no async lock request handling yet. For more information about how to
- indicate an async lock request from a ->lock() file_operations struct, see
- fs/locks.c and comment for the function vfs_lock_file().
diff --git a/MAINTAINERS b/MAINTAINERS
index b878ddc99f94..32a8387dcdd0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10493,6 +10493,7 @@ F: Documentation/mm/hugetlbfs_reserv.rst
F: Documentation/mm/vmemmap_dedup.rst
F: fs/hugetlbfs/
F: include/linux/hugetlb.h
+F: include/trace/events/hugetlbfs.h
F: mm/hugetlb.c
F: mm/hugetlb_vmemmap.c
F: mm/hugetlb_vmemmap.h
diff --git a/fs/aio.c b/fs/aio.c
index e8920178b50f..72e3970f4225 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- /* TODO: use a hash or array, this sucks. */
list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
if (kiocb->ki_res.obj == obj) {
ret = kiocb->ki_cancel(&kiocb->rw);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 872cca54cc6c..42c9899d9241 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -786,7 +786,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
}
if (bio_ctrl->wbc)
- wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page,
+ wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
len);
size -= len;
@@ -1708,7 +1708,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
ret = bio_add_folio(&bbio->bio, folio, eb->len,
eb->start - folio_pos(folio));
ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+ wbc_account_cgroup_owner(wbc, folio, eb->len);
folio_unlock(folio);
} else {
int num_folios = num_extent_folios(eb);
@@ -1722,8 +1722,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
folio_start_writeback(folio);
ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
- eb->folio_size);
+ wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
wbc->nr_to_write -= folio_nr_pages(folio);
folio_unlock(folio);
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e4ca1e7d2e5..df20f2e7ac1e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1729,7 +1729,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* need full accuracy. Just account the whole thing
* against the first page.
*/
- wbc_account_cgroup_owner(wbc, &locked_folio->page,
+ wbc_account_cgroup_owner(wbc, locked_folio,
cur_end - start);
async_chunk[i].locked_folio = locked_folio;
locked_folio = NULL;
diff --git a/fs/buffer.c b/fs/buffer.c
index 1fc9a50def0b..32bd0f4c4223 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2803,7 +2803,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_write_hint = write_hint;
- __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
@@ -2813,7 +2813,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
if (wbc) {
wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
+ wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
}
submit_bio(bio);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 57cc096c498a..c2ddb998f3c9 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -562,8 +562,8 @@ int cdev_device_add(struct cdev *cdev, struct device *dev)
/**
* cdev_device_del() - inverse of cdev_device_add
- * @dev: the device structure
* @cdev: the cdev structure
+ * @dev: the device structure
*
* cdev_device_del() is a helper function to call cdev_del and device_del.
* It should be used whenever cdev_device_add is used.
diff --git a/fs/coredump.c b/fs/coredump.c
index 45737b43dda5..d48edb37bc35 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
} else {
dump_skip(cprm, PAGE_SIZE);
}
+ cond_resched();
}
dump_page_free(dump_page);
return 1;
diff --git a/fs/dcache.c b/fs/dcache.c
index 0f6b16ba30d0..0099077a2982 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -135,6 +135,7 @@ struct dentry_stat_t {
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);
+static int dentry_negative_policy;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
/* Statistics gathering. */
@@ -199,6 +200,15 @@ static struct ctl_table fs_dcache_sysctls[] = {
.mode = 0444,
.proc_handler = proc_nr_dentry,
},
+ {
+ .procname = "dentry-negative",
+ .data = &dentry_negative_policy,
+ .maxlen = sizeof(dentry_negative_policy),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
};
static int __init init_fs_dcache_sysctls(void)
@@ -2039,8 +2049,8 @@ EXPORT_SYMBOL(d_obtain_root);
/**
* d_add_ci - lookup or allocate new dentry with case-exact name
- * @inode: the inode case-insensitive lookup has found
* @dentry: the negative dentry that was passed to the parent's lookup func
+ * @inode: the inode case-insensitive lookup has found
* @name: the case-exact name to be associated with the returned dentry
*
* This is to avoid filling the dcache with case-insensitive names to the
@@ -2093,8 +2103,8 @@ EXPORT_SYMBOL(d_add_ci);
/**
* d_same_name - compare dentry name with case-exact name
- * @parent: parent dentry
* @dentry: the negative dentry that was passed to the parent's lookup func
+ * @parent: parent dentry
* @name: the case-exact name to be associated with the returned dentry
*
* Return: true if names are same, or false
@@ -2401,6 +2411,8 @@ void d_delete(struct dentry * dentry)
* Are we the only user?
*/
if (dentry->d_lockref.count == 1) {
+ if (dentry_negative_policy)
+ __d_drop(dentry);
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_unlink_inode(dentry);
} else {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1ae4542f0bd8..1a06e462b6ef 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
to_free = NULL;
head = file->f_ep;
if (head->first == &epi->fllink && !epi->fllink.next) {
- file->f_ep = NULL;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, NULL);
if (!is_file_epoll(file)) {
struct epitems_head *v;
v = container_of(head, struct epitems_head, epitems);
@@ -1372,7 +1373,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up(&ep->wq);
+ if (sync)
+ wake_up_sync(&ep->wq);
+ else
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
@@ -1603,7 +1607,8 @@ allocate:
spin_unlock(&file->f_lock);
goto allocate;
}
- file->f_ep = head;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, head);
to_free = NULL;
}
hlist_add_head_rcu(&epi->fllink, file->f_ep);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ad5543866d21..b7b9261fec3b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -421,7 +421,7 @@ submit_and_retry:
io_submit_init_bio(io, bh);
if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
goto submit_and_retry;
- wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
+ wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
io->io_next_block++;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 94f7b084f601..e3ce763cce18 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
}
if (fio->io_wbc && !is_read_io(fio->op))
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page) : WB_DATA_TYPE(fio->page, false));
@@ -911,7 +912,8 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
@@ -1011,7 +1013,8 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
io->last_block_in_bio = fio->new_blkaddr;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 22dd9dcce7ec..3d89de31066a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -397,6 +397,9 @@ static long f_dupfd_query(int fd, struct file *filp)
{
CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+
/*
* We can do the 'fdput()' immediately, as the only thing that
* matters is the pointer value which isn't changed by the fdput.
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index fbcd603365ad..8c67627f2a3d 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -25,7 +25,7 @@
struct vxfs_dirblk {
__fs16 d_free; /* free space in dirblock */
__fs16 d_nhash; /* no of hash chains */
- __fs16 d_hash[1]; /* hash chain */
+ __fs16 d_hash[]; /* hash chain */
};
/*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d8bec3c1bb1f..3cd99e2dc6ac 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio)
if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
wb_put(wb);
}
-EXPORT_SYMBOL_GPL(__inode_attach_wb);
/**
* inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
@@ -731,8 +730,9 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
* writeback completion, wbc_detach_inode() should be called. This is used
* to track the cgroup writeback context.
*/
-void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- struct inode *inode)
+static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock)
{
if (!inode_cgwb_enabled(inode)) {
spin_unlock(&inode->i_lock);
@@ -762,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
inode_switch_wbs(inode, wbc->wb_id);
}
-EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
+
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ inode_attach_wb(inode, NULL);
+ wbc_attach_and_unlock_inode(wbc, inode);
+}
+EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode);
/**
* wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -890,17 +907,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
/**
* wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
* @wbc: writeback_control of the writeback in progress
- * @page: page being written out
+ * @folio: folio being written out
* @bytes: number of bytes being written out
*
- * @bytes from @page are about to written out during the writeback
+ * @bytes from @folio are about to written out during the writeback
* controlled by @wbc. Keep the book for foreign inode detection. See
* wbc_detach_inode().
*/
-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
size_t bytes)
{
- struct folio *folio;
struct cgroup_subsys_state *css;
int id;
@@ -913,7 +929,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
if (!wbc->wb || wbc->no_cgroup_owner)
return;
- folio = page_folio(page);
css = mem_cgroup_css_from_folio(folio);
/* dead cgroups shouldn't contribute to inode ownership arbitration */
if (!(css->flags & CSS_ONLINE))
@@ -1227,6 +1242,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
}
}
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock)
+{
+ spin_unlock(&inode->i_lock);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 24727ec34e5a..698464f3e26a 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -156,6 +156,7 @@ int fs_lookup_param(struct fs_context *fc,
f = getname_kernel(param->string);
if (IS_ERR(f))
return PTR_ERR(f);
+ param->dirfd = AT_FDCWD;
put_f = true;
break;
case fs_value_is_filename:
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d418d8b5367f..3334c394ce9c 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = {
.fh_to_parent = gfs2_fh_to_parent,
.get_name = gfs2_get_name,
.get_parent = gfs2_get_parent,
- .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index f7dd64856c9b..1e73cf87ff88 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1586,6 +1586,7 @@ const struct file_operations gfs2_file_fops = {
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
+ .fop_flags = FOP_ASYNC_LOCK,
};
const struct file_operations gfs2_dir_fops = {
@@ -1598,6 +1599,7 @@ const struct file_operations gfs2_dir_fops = {
.lock = gfs2_lock,
.flock = gfs2_flock,
.llseek = default_llseek,
+ .fop_flags = FOP_ASYNC_LOCK,
};
#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index b733ef0c1547..2f089bff0095 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -157,6 +157,7 @@ struct hfsplus_sb_info {
/* Runtime variables */
u32 blockoffset;
+ u32 min_io_size;
sector_t part_start;
sector_t sect_count;
int fs_shift;
@@ -308,7 +309,7 @@ struct hfsplus_readdir_data {
*/
static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
{
- return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev),
+ return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size,
HFSPLUS_SECTOR_SIZE);
}
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 9592ffcb44e5..74801911bc1c 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -172,6 +172,8 @@ int hfsplus_read_wrapper(struct super_block *sb)
if (!blocksize)
goto out;
+ sbi->min_io_size = blocksize;
+
if (hfsplus_get_last_session(sb, &part_start, &part_size))
goto out;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5cf327337e22..2dea122e5b93 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -39,6 +39,9 @@
#include <linux/uaccess.h>
#include <linux/sched/mm.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/hugetlbfs.h>
+
static const struct address_space_operations hugetlbfs_aops;
static const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
@@ -687,6 +690,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
+ trace_hugetlbfs_evict_inode(inode);
remove_inode_hugepages(inode, 0, LLONG_MAX);
/*
@@ -814,8 +818,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- return hugetlbfs_punch_hole(inode, offset, len);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = hugetlbfs_punch_hole(inode, offset, len);
+ goto out_nolock;
+ }
/*
* Default preallocate case.
@@ -919,6 +925,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
inode_set_ctime_current(inode);
out:
inode_unlock(inode);
+
+out_nolock:
+ trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
return error;
}
@@ -935,6 +944,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
if (error)
return error;
+ trace_hugetlbfs_setattr(inode, dentry, attr);
+
if (ia_valid & ATTR_SIZE) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
@@ -1033,6 +1044,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
+ trace_hugetlbfs_alloc_inode(inode, dir, mode);
} else {
if (resv_map)
kref_put(&resv_map->refs, resv_map_release);
@@ -1272,6 +1284,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
static void hugetlbfs_free_inode(struct inode *inode)
{
+ trace_hugetlbfs_free_inode(inode);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}
diff --git a/fs/inode.c b/fs/inode.c
index 70a2f8c717e0..b13b778257ae 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -819,7 +819,7 @@ static void evict(struct inode *inode)
* ___wait_var_event() either sees the bit cleared or
* waitqueue_active() check in wake_up_var() sees the waiter.
*/
- smp_mb();
+ smp_mb__after_spinlock();
inode_wake_up_bit(inode, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
spin_unlock(&inode->i_lock);
@@ -1312,16 +1312,15 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
- * and if present it is return it with an increased reference count. This is
- * a variant of iget5_locked() for callers that don't want to fail on memory
- * allocation of inode.
+ * and if present return it with an increased reference count. This is a
+ * variant of iget5_locked() that doesn't allocate an inode.
*
- * If the inode is not in cache, insert the pre-allocated inode to cache and
+ * If the inode is not present in the cache, insert the pre-allocated inode and
* return it locked, hashed, and with the I_NEW flag set. The file system gets
* to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note that both @test and @set are called with the inode_hash_lock held, so
+ * they can't sleep.
*/
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *),
@@ -1385,16 +1384,16 @@ EXPORT_SYMBOL(inode_insert5);
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
- * and if present it is return it with an increased reference count. This is
- * a generalized version of iget_locked() for file systems where the inode
+ * and if present return it with an increased reference count. This is a
+ * generalized version of iget_locked() for file systems where the inode
* number is not sufficient for unique identification of an inode.
*
- * If the inode is not in cache, allocate a new inode and return it locked,
- * hashed, and with the I_NEW flag set. The file system gets to fill it in
- * before unlocking it via unlock_new_inode().
+ * If the inode is not present in the cache, allocate and insert a new inode
+ * and return it locked, hashed, and with the I_NEW flag set. The file system
+ * gets to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note that both @test and @set are called with the inode_hash_lock held, so
+ * they can't sleep.
*/
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *),
@@ -2912,7 +2911,7 @@ EXPORT_SYMBOL(inode_set_ctime_deleg);
* @inode: inode to check
* @vfsgid: the new/current vfsgid of @inode
*
- * Check wether @vfsgid is in the caller's group list or if the caller is
+ * Check whether @vfsgid is in the caller's group list or if the caller is
* privileged with CAP_FSETID over @inode. This can be used to determine
* whether the setgid bit can be kept or must be dropped.
*
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ef0b68bccbb6..ce73d2a48c1e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1784,7 +1784,7 @@ new_ioend:
if (ifs)
atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
- wbc_account_cgroup_owner(wbc, &folio->page, len);
+ wbc_account_cgroup_owner(wbc, folio, len);
return 0;
}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 1f2149db10f2..2359347c9fbd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/nlm.h>
#include <linux/lockd/lockd.h>
-#include <linux/exportfs.h>
#define NLMDBG_FACILITY NLMDBG_SVCLOCK
@@ -481,7 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie, int reclaim)
{
- struct inode *inode = nlmsvc_file_inode(file);
+ struct inode *inode __maybe_unused = nlmsvc_file_inode(file);
struct nlm_block *block = NULL;
int error;
int mode;
@@ -496,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
- if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) {
+ if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) {
async_block = wait;
wait = 0;
}
@@ -550,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
* requests on the underlaying ->lock() implementation but
* only one nlm_block to being granted by lm_grant().
*/
- if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) &&
+ if (locks_can_async_lock(nlmsvc_file_file(file)->f_op) &&
!list_empty(&block->b_list)) {
spin_unlock(&nlm_blocked_lock);
ret = nlm_lck_blocked;
diff --git a/fs/mpage.c b/fs/mpage.c
index b5b5ddf9d513..82aecf372743 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -606,7 +606,7 @@ alloc_new:
* the confused fail path above (OOM) will be very confused when
* it finds all bh marked clean (i.e. it will not write anything)
*/
- wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
+ wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
length = first_unmapped << blkbits;
if (!bio_add_folio(bio, folio, length, 0)) {
bio = mpage_bio_submit_write(bio);
diff --git a/fs/namei.c b/fs/namei.c
index 4a4a22a08ac2..05a8d544fb35 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -326,6 +326,25 @@ static int check_acl(struct mnt_idmap *idmap,
return -EAGAIN;
}
+/*
+ * Very quick optimistic "we know we have no ACL's" check.
+ *
+ * Note that this is purely for ACL_TYPE_ACCESS, and purely
+ * for the "we have cached that there are no ACLs" case.
+ *
+ * If this returns true, we know there are no ACLs. But if
+ * it returns false, we might still not have ACLs (it could
+ * be the is_uncached_acl() case).
+ */
+static inline bool no_acl_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_POSIX_ACL
+ return likely(!READ_ONCE(inode->i_acl));
+#else
+ return true;
+#endif
+}
+
/**
* acl_permission_check - perform basic UNIX permission checking
* @idmap: idmap of the mount the inode was found from
@@ -348,6 +367,28 @@ static int acl_permission_check(struct mnt_idmap *idmap,
unsigned int mode = inode->i_mode;
vfsuid_t vfsuid;
+ /*
+ * Common cheap case: everybody has the requested
+ * rights, and there are no ACLs to check. No need
+ * to do any owner/group checks in that case.
+ *
+ * - 'mask&7' is the requested permission bit set
+ * - multiplying by 0111 spreads them out to all of ugo
+ * - '& ~mode' looks for missing inode permission bits
+ * - the '!' is for "no missing permissions"
+ *
+ * After that, we just need to check that there are no
+ * ACL's on the inode - do the 'IS_POSIXACL()' check last
+ * because it will dereference the ->i_sb pointer and we
+ * want to avoid that if at all possible.
+ */
+ if (!((mask & 7) * 0111 & ~mode)) {
+ if (no_acl_inode(inode))
+ return 0;
+ if (!IS_POSIXACL(inode))
+ return 0;
+ }
+
/* Are we the owner? If so, ACL's don't matter */
vfsuid = i_uid_into_vfsuid(idmap, inode);
if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
diff --git a/fs/namespace.c b/fs/namespace.c
index d26f5e6d2ca3..206fc54feeba 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3901,7 +3901,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
}
new_ns->ns.ops = &mntns_operations;
if (!anon)
- new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
+ new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
@@ -5006,6 +5006,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+
+ if (sb->s_subtype)
+ seq_puts(seq, sb->s_subtype);
+}
+
+static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+ struct mount *r = real_mount(s->mnt);
+
+ if (sb->s_op->show_devname) {
+ size_t start = seq->count;
+ int ret;
+
+ ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
+ if (ret)
+ return ret;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ /* Unescape the result */
+ seq->buf[seq->count] = '\0';
+ seq->count = start;
+ seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
+ } else if (r->mnt_devname) {
+ seq_puts(seq, r->mnt_devname);
+ }
+ return 0;
+}
+
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
s->sm.mask |= STATMOUNT_MNT_NS_ID;
@@ -5040,35 +5074,134 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static inline int statmount_opt_unescape(struct seq_file *seq, char *buf_start)
+{
+ char *buf_end, *opt_start, *opt_end;
+ int count = 0;
+
+ buf_end = seq->buf + seq->count;
+ *buf_end = '\0';
+ for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) {
+ opt_end = strchrnul(opt_start, ',');
+ *opt_end = '\0';
+ buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1;
+ if (WARN_ON_ONCE(++count == INT_MAX))
+ return -EOVERFLOW;
+ }
+ seq->count = buf_start - 1 - seq->buf;
+ return count;
+}
+
+static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct super_block *sb = mnt->mnt_sb;
+ size_t start = seq->count;
+ char *buf_start;
+ int err;
+
+ if (!sb->s_op->show_options)
+ return 0;
+
+ buf_start = seq->buf + start;
+ err = sb->s_op->show_options(seq, mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (seq->count == start)
+ return 0;
+
+ err = statmount_opt_unescape(seq, buf_start);
+ if (err < 0)
+ return err;
+
+ s->sm.opt_num = err;
+ return 0;
+}
+
+static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct super_block *sb = mnt->mnt_sb;
+ size_t start = seq->count;
+ char *buf_start;
+ int err;
+
+ buf_start = seq->buf + start;
+
+ err = security_sb_show_options(seq, sb);
+ if (!err)
+ return err;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (seq->count == start)
+ return 0;
+
+ err = statmount_opt_unescape(seq, buf_start);
+ if (err < 0)
+ return err;
+
+ s->sm.opt_sec_num = err;
+ return 0;
+}
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
- int ret;
+ int ret = 0;
size_t kbufsize;
struct seq_file *seq = &s->seq;
struct statmount *sm = &s->sm;
+ u32 start = seq->count;
switch (flag) {
case STATMOUNT_FS_TYPE:
- sm->fs_type = seq->count;
+ sm->fs_type = start;
ret = statmount_fs_type(s, seq);
break;
case STATMOUNT_MNT_ROOT:
- sm->mnt_root = seq->count;
+ sm->mnt_root = start;
ret = statmount_mnt_root(s, seq);
break;
case STATMOUNT_MNT_POINT:
- sm->mnt_point = seq->count;
+ sm->mnt_point = start;
ret = statmount_mnt_point(s, seq);
break;
case STATMOUNT_MNT_OPTS:
- sm->mnt_opts = seq->count;
+ sm->mnt_opts = start;
ret = statmount_mnt_opts(s, seq);
break;
+ case STATMOUNT_OPT_ARRAY:
+ sm->opt_array = start;
+ ret = statmount_opt_array(s, seq);
+ break;
+ case STATMOUNT_OPT_SEC_ARRAY:
+ sm->opt_sec_array = start;
+ ret = statmount_opt_sec_array(s, seq);
+ break;
+ case STATMOUNT_FS_SUBTYPE:
+ sm->fs_subtype = start;
+ statmount_fs_subtype(s, seq);
+ break;
+ case STATMOUNT_SB_SOURCE:
+ sm->sb_source = start;
+ ret = statmount_sb_source(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
}
+ /*
+ * If nothing was emitted, return to avoid setting the flag
+ * and terminating the buffer.
+ */
+ if (seq->count == start)
+ return ret;
if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
return -EOVERFLOW;
if (kbufsize >= s->bufsize)
@@ -5203,6 +5336,18 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!err && s->mask & STATMOUNT_MNT_OPTS)
err = statmount_string(s, STATMOUNT_MNT_OPTS);
+ if (!err && s->mask & STATMOUNT_OPT_ARRAY)
+ err = statmount_string(s, STATMOUNT_OPT_ARRAY);
+
+ if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
+ err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
+
+ if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
+ err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
+
+ if (!err && s->mask & STATMOUNT_SB_SOURCE)
+ err = statmount_string(s, STATMOUNT_SB_SOURCE);
+
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);
@@ -5224,7 +5369,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
}
#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
- STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS)
+ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
+ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
+ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY)
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 551d2958ec29..d80406f8b568 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -8001,9 +8001,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
- if (nfsd4_has_session(cstate) ||
- exportfs_lock_op_is_async(sb->s_export_op))
- flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
spin_lock(&fp->fi_lock);
@@ -8014,9 +8011,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
- if (nfsd4_has_session(cstate) ||
- exportfs_lock_op_is_async(sb->s_export_op))
- flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
spin_lock(&fp->fi_lock);
@@ -8036,15 +8030,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- /*
- * Most filesystems with their own ->lock operations will block
- * the nfsd thread waiting to acquire the lock. That leads to
- * deadlocks (we don't want every nfsd thread tied up waiting
- * for file locks), so don't attempt blocking lock notifications
- * on those filesystems:
- */
- if (!exportfs_lock_op_is_async(sb->s_export_op))
- flags &= ~FL_SLEEP;
+ if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) &&
+ nfsd4_has_session(cstate) &&
+ locks_can_async_lock(nf->nf_file->f_op))
+ flags |= FL_SLEEP;
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
if (!nbl) {
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 96b684763b39..b95724b767e1 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = {
.fh_to_dentry = ocfs2_fh_to_dentry,
.fh_to_parent = ocfs2_fh_to_parent,
.get_parent = ocfs2_get_parent,
- .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 06af21982c16..4fa6c840d20b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2812,6 +2812,7 @@ const struct file_operations ocfs2_fops = {
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
.remap_file_range = ocfs2_remap_file_range,
+ .fop_flags = FOP_ASYNC_LOCK,
};
WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
@@ -2828,6 +2829,7 @@ const struct file_operations ocfs2_dops = {
#endif
.lock = ocfs2_lock,
.flock = ocfs2_flock,
+ .fop_flags = FOP_ASYNC_LOCK,
};
/*
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 6c66a37522d0..4050942ab52f 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init);
* Allocate a new ACL with the specified number of entries.
*/
struct posix_acl *
-posix_acl_alloc(int count, gfp_t flags)
+posix_acl_alloc(unsigned int count, gfp_t flags)
{
- const size_t size = sizeof(struct posix_acl) +
- count * sizeof(struct posix_acl_entry);
- struct posix_acl *acl = kmalloc(size, flags);
+ struct posix_acl *acl;
+
+ acl = kmalloc(struct_size(acl, a_entries, count), flags);
if (acl)
posix_acl_init(acl, count);
return acl;
@@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
struct posix_acl *clone = NULL;
if (acl) {
- int size = sizeof(struct posix_acl) + acl->a_count *
- sizeof(struct posix_acl_entry);
- clone = kmemdup(acl, size, flags);
+ clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count),
+ flags);
if (clone)
refcount_set(&clone->a_refcount, 1);
}
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e676c8b0cf5d..8bbb1ad46335 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -343,8 +343,8 @@ EXPORT_SYMBOL(seq_lseek);
/**
* seq_release - free the structures associated with sequential file.
- * @file: file in question
* @inode: its inode
+ * @file: file in question
*
* Frees the structures associated with sequential file; can be used
* as ->f_op->release() if you don't have private data to destroy.
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 3337745d81bd..0c0d00fcd131 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -42,7 +42,7 @@ static inline void eventpoll_release(struct file *file)
* because the file in on the way to be removed and nobody ( but
* eventpoll ) has still a reference to this file.
*/
- if (likely(!file->f_ep))
+ if (likely(!READ_ONCE(file->f_ep)))
return;
/*
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 893a1d21dc1c..1ab165c2939f 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -250,19 +250,6 @@ struct export_operations {
unsigned long flags;
};
-/**
- * exportfs_lock_op_is_async() - export op supports async lock operation
- * @export_ops: the nfs export operations to check
- *
- * Returns true if the nfs export_operations structure has
- * EXPORT_OP_ASYNC_LOCK in their flags set
- */
-static inline bool
-exportfs_lock_op_is_async(const struct export_operations *export_ops)
-{
- return export_ops->flags & EXPORT_OP_ASYNC_LOCK;
-}
-
extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
int *max_len, struct inode *parent,
int flags);
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index bb44224c6676..c412ded9171e 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -180,6 +180,11 @@ static inline void locks_wake_up(struct file_lock *fl)
wake_up(&fl->c.flc_wait);
}
+static inline bool locks_can_async_lock(const struct file_operations *fops)
+{
+ return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK;
+}
+
/* fs/locks.c */
void locks_free_lock_context(struct inode *inode);
void locks_free_lock(struct file_lock *fl);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d32c6f6298b1..6b062f62cbb8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2124,6 +2124,8 @@ struct file_operations {
#define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
/* Treat loff_t as unsigned (e.g., /dev/mem) */
#define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5))
+/* Supports asynchronous lock callbacks */
+#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 0e65b3d634d9..e2d47eb1a7f3 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -28,9 +28,9 @@ struct posix_acl_entry {
struct posix_acl {
refcount_t a_refcount;
- struct rcu_head a_rcu;
unsigned int a_count;
- struct posix_acl_entry a_entries[];
+ struct rcu_head a_rcu;
+ struct posix_acl_entry a_entries[] __counted_by(a_count);
};
#define FOREACH_ACL_ENTRY(pa, acl, pe) \
@@ -62,7 +62,7 @@ posix_acl_release(struct posix_acl *acl)
/* posix_acl.c */
extern void posix_acl_init(struct posix_acl *, int);
-extern struct posix_acl *posix_acl_alloc(int, gfp_t);
+extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 8aa3372f21a0..2b322a9b88a2 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -221,6 +221,7 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head);
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0)
+#define wake_up_sync(x) __wake_up_sync(x, TASK_NORMAL)
#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d6db822e4bb3..d11b903c2edb 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -213,11 +213,8 @@ static inline void wait_on_inode(struct inode *inode)
#include <linux/bio.h>
void __inode_attach_wb(struct inode *inode, struct folio *folio);
-void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- struct inode *inode)
- __releases(&inode->i_lock);
void wbc_detach_inode(struct writeback_control *wbc);
-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
size_t bytes);
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
enum wb_reason reason, struct wb_completion *done);
@@ -254,22 +251,8 @@ static inline void inode_detach_wb(struct inode *inode)
}
}
-/**
- * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
- * @wbc: writeback_control of interest
- * @inode: target inode
- *
- * This function is to be used by __filemap_fdatawrite_range(), which is an
- * alternative entry point into writeback code, and first ensures @inode is
- * associated with a bdi_writeback and attaches it to @wbc.
- */
-static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
- struct inode *inode)
-{
- spin_lock(&inode->i_lock);
- inode_attach_wb(inode, NULL);
- wbc_attach_and_unlock_inode(wbc, inode);
-}
+void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+ struct inode *inode);
/**
* wbc_init_bio - writeback specific initializtion of bio
@@ -303,13 +286,6 @@ static inline void inode_detach_wb(struct inode *inode)
{
}
-static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- struct inode *inode)
- __releases(&inode->i_lock)
-{
- spin_unlock(&inode->i_lock);
-}
-
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
struct inode *inode)
{
@@ -324,7 +300,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
}
static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
- struct page *page, size_t bytes)
+ struct folio *folio, size_t bytes)
{
}
diff --git a/include/trace/events/hugetlbfs.h b/include/trace/events/hugetlbfs.h
new file mode 100644
index 000000000000..8331c904a9ba
--- /dev/null
+++ b/include/trace/events/hugetlbfs.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hugetlbfs
+
+#if !defined(_TRACE_HUGETLBFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HUGETLBFS_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(hugetlbfs_alloc_inode,
+
+ TP_PROTO(struct inode *inode, struct inode *dir, int mode),
+
+ TP_ARGS(inode, dir, mode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(ino_t, dir)
+ __field(__u16, mode)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->dir = dir->i_ino;
+ __entry->mode = mode;
+ ),
+
+ TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ (unsigned long) __entry->dir, __entry->mode)
+);
+
+DECLARE_EVENT_CLASS(hugetlbfs__inode,
+
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(__u16, mode)
+ __field(loff_t, size)
+ __field(unsigned int, nlink)
+ __field(unsigned int, seals)
+ __field(blkcnt_t, blocks)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->mode = inode->i_mode;
+ __entry->size = inode->i_size;
+ __entry->nlink = inode->i_nlink;
+ __entry->seals = HUGETLBFS_I(inode)->seals;
+ __entry->blocks = inode->i_blocks;
+ ),
+
+ TP_printk("dev %d,%d ino %lu mode 0%o size %lld nlink %u seals %u blocks %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino,
+ __entry->mode, __entry->size, __entry->nlink, __entry->seals,
+ (unsigned long long)__entry->blocks)
+);
+
+DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_evict_inode,
+
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode)
+);
+
+DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_free_inode,
+
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode)
+);
+
+TRACE_EVENT(hugetlbfs_setattr,
+
+ TP_PROTO(struct inode *inode, struct dentry *dentry,
+ struct iattr *attr),
+
+ TP_ARGS(inode, dentry, attr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(unsigned int, d_len)
+ __string(d_name, dentry->d_name.name)
+ __field(unsigned int, ia_valid)
+ __field(unsigned int, ia_mode)
+ __field(loff_t, old_size)
+ __field(loff_t, ia_size)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->d_len = dentry->d_name.len;
+ __assign_str(d_name);
+ __entry->ia_valid = attr->ia_valid;
+ __entry->ia_mode = attr->ia_mode;
+ __entry->old_size = inode->i_size;
+ __entry->ia_size = attr->ia_size;
+ ),
+
+ TP_printk("dev %d,%d ino %lu name %.*s valid %#x mode 0%o old_size %lld size %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino,
+ __entry->d_len, __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
+ __entry->old_size, __entry->ia_size)
+);
+
+TRACE_EVENT(hugetlbfs_fallocate,
+
+ TP_PROTO(struct inode *inode, int mode,
+ loff_t offset, loff_t len, int ret),
+
+ TP_ARGS(inode, mode, offset, len, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(int, mode)
+ __field(loff_t, offset)
+ __field(loff_t, len)
+ __field(loff_t, size)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->mode = mode;
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->size = inode->i_size;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("dev %d,%d ino %lu mode 0%o offset %lld len %lld size %lld ret %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long)__entry->ino, __entry->mode,
+ (unsigned long long)__entry->offset,
+ (unsigned long long)__entry->len,
+ (unsigned long long)__entry->size,
+ __entry->ret)
+);
+
+#endif /* _TRACE_HUGETLBFS_H */
+
+ /* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 225bc366ffcb..c07008816aca 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -154,7 +154,7 @@ struct mount_attr {
*/
struct statmount {
__u32 size; /* Total size, including strings */
- __u32 mnt_opts; /* [str] Mount options of the mount */
+ __u32 mnt_opts; /* [str] Options (comma separated, escaped) */
__u64 mask; /* What results were written */
__u32 sb_dev_major; /* Device ID */
__u32 sb_dev_minor;
@@ -173,7 +173,13 @@ struct statmount {
__u32 mnt_root; /* [str] Root of mount relative to root of fs */
__u32 mnt_point; /* [str] Mountpoint relative to current root */
__u64 mnt_ns_id; /* ID of the mount namespace */
- __u64 __spare2[49];
+ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */
+ __u32 sb_source; /* [str] Source string of the mount */
+ __u32 opt_num; /* Number of fs options */
+ __u32 opt_array; /* [str] Array of nul terminated fs options */
+ __u32 opt_sec_num; /* Number of security options */
+ __u32 opt_sec_array; /* [str] Array of nul terminated security options */
+ __u64 __spare2[46];
char str[]; /* Variable size part containing strings */
};
@@ -207,6 +213,10 @@ struct mnt_id_req {
#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */
#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */
#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */
+#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */
+#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */
+#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */
+#define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */
/*
* Special @mnt_id values that can be passed to listmount
diff --git a/init/initramfs.c b/init/initramfs.c
index bc911e466d5b..b2f7583bb1f5 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -360,6 +360,15 @@ static int __init do_name(void)
{
state = SkipIt;
next_state = Reset;
+
+ /* name_len > 0 && name_len <= PATH_MAX checked in do_header */
+ if (collected[name_len - 1] != '\0') {
+ pr_err("initramfs name without nulterm: %.*s\n",
+ (int)name_len, collected);
+ error("malformed archive");
+ return 1;
+ }
+
if (strcmp(collected, "TRAILER!!!") == 0) {
free_hash();
return 0;
@@ -424,6 +433,12 @@ static int __init do_copy(void)
static int __init do_symlink(void)
{
+ if (collected[name_len - 1] != '\0') {
+ pr_err("initramfs symlink without nulterm: %.*s\n",
+ (int)name_len, collected);
+ error("malformed archive");
+ return 1;
+ }
collected[N_ALIGN(name_len) + body_len] = '\0';
clean_path(collected, 0);
init_symlink(collected + N_ALIGN(name_len), collected);
diff --git a/mm/filemap.c b/mm/filemap.c
index 56fa431c52af..e7fe07f7963d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2620,6 +2620,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
loff_t isize, end_offset;
loff_t last_pos = ra->prev_pos;
+ if (unlikely(iocb->ki_pos < 0))
+ return -EINVAL;
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
if (unlikely(!iov_iter_count(iter)))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fcd4c1439cb9..72a5d8836425 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -54,7 +54,7 @@
#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
/*
- * Estimate write bandwidth at 200ms intervals.
+ * Estimate write bandwidth or update dirty limit at 200ms intervals.
*/
#define BANDWIDTH_INTERVAL max(HZ/5, 1)
@@ -586,7 +586,7 @@ static void wb_domain_writeout_add(struct wb_domain *dom,
/* First event after period switching was turned off? */
if (unlikely(!dom->period_time)) {
/*
- * We can race with other __bdi_writeout_inc calls here but
+ * We can race with other wb_domain_writeout_add calls here but
* it does not cause any harm since the resulting time when
* timer will fire and what is in writeout_period_time will be
* roughly the same.