Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/acl.c | 2
-rw-r--r--  fs/9p/v9fs.c | 21
-rw-r--r--  fs/9p/v9fs.h | 1
-rw-r--r--  fs/9p/vfs_dir.c | 19
-rw-r--r--  fs/9p/vfs_file.c | 24
-rw-r--r--  fs/btrfs/compression.c | 6
-rw-r--r--  fs/btrfs/ctree.c | 17
-rw-r--r--  fs/btrfs/delayed-ref.c | 50
-rw-r--r--  fs/btrfs/extent-tree.c | 37
-rw-r--r--  fs/btrfs/extent_io.c | 12
-rw-r--r--  fs/btrfs/file.c | 12
-rw-r--r--  fs/btrfs/free-space-cache.c | 32
-rw-r--r--  fs/btrfs/inode.c | 15
-rw-r--r--  fs/btrfs/transaction.c | 9
-rw-r--r--  fs/btrfs/tree-log.c | 5
-rw-r--r--  fs/buffer.c | 14
-rw-r--r--  fs/ceph/acl.c | 13
-rw-r--r--  fs/ceph/addr.c | 2
-rw-r--r--  fs/ceph/caps.c | 21
-rw-r--r--  fs/ceph/file.c | 573
-rw-r--r--  fs/ceph/inode.c | 13
-rw-r--r--  fs/ceph/mds_client.c | 9
-rw-r--r--  fs/ceph/super.c | 13
-rw-r--r--  fs/ceph/super.h | 3
-rw-r--r--  fs/ceph/xattr.c | 3
-rw-r--r--  fs/compat_ioctl.c | 131
-rw-r--r--  fs/cramfs/inode.c | 7
-rw-r--r--  fs/dax.c | 917
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/ext2/acl.c | 4
-rw-r--r--  fs/ext2/ext2.h | 4
-rw-r--r--  fs/ext2/super.c | 5
-rw-r--r--  fs/ext4/inode.c | 2
-rw-r--r--  fs/f2fs/data.c | 6
-rw-r--r--  fs/f2fs/dir.c | 2
-rw-r--r--  fs/f2fs/f2fs.h | 2
-rw-r--r--  fs/f2fs/inline.c | 2
-rw-r--r--  fs/f2fs/node.c | 6
-rw-r--r--  fs/fat/dir.c | 6
-rw-r--r--  fs/fat/fat.h | 4
-rw-r--r--  fs/fat/file.c | 17
-rw-r--r--  fs/fat/inode.c | 9
-rw-r--r--  fs/fat/misc.c | 91
-rw-r--r--  fs/fat/namei_msdos.c | 17
-rw-r--r--  fs/fat/namei_vfat.c | 15
-rw-r--r--  fs/fs-writeback.c | 25
-rw-r--r--  fs/fuse/Makefile | 2
-rw-r--r--  fs/fuse/control.c | 34
-rw-r--r--  fs/fuse/dev.c | 221
-rw-r--r--  fs/fuse/dir.c | 381
-rw-r--r--  fs/fuse/file.c | 158
-rw-r--r--  fs/fuse/fuse_i.h | 124
-rw-r--r--  fs/fuse/inode.c | 53
-rw-r--r--  fs/fuse/readdir.c | 569
-rw-r--r--  fs/gfs2/aops.c | 2
-rw-r--r--  fs/hfs/brec.c | 5
-rw-r--r--  fs/hfs/btree.c | 41
-rw-r--r--  fs/hfs/btree.h | 1
-rw-r--r--  fs/hfs/catalog.c | 16
-rw-r--r--  fs/hfs/extent.c | 10
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfsplus/attributes.c | 10
-rw-r--r--  fs/hfsplus/brec.c | 5
-rw-r--r--  fs/hfsplus/btree.c | 44
-rw-r--r--  fs/hfsplus/catalog.c | 24
-rw-r--r--  fs/hfsplus/extents.c | 8
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 2
-rw-r--r--  fs/hfsplus/inode.c | 1
-rw-r--r--  fs/inode.c | 4
-rw-r--r--  fs/isofs/dir.c | 2
-rw-r--r--  fs/lockd/host.c | 2
-rw-r--r--  fs/namespace.c | 2
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r--  fs/nfs/dns_resolve.c | 15
-rw-r--r--  fs/nfsd/cache.h | 20
-rw-r--r--  fs/nfsd/export.c | 14
-rw-r--r--  fs/nfsd/export.h | 2
-rw-r--r--  fs/nfsd/netns.h | 8
-rw-r--r--  fs/nfsd/nfs4callback.c | 98
-rw-r--r--  fs/nfsd/nfs4idmap.c | 11
-rw-r--r--  fs/nfsd/nfs4proc.c | 289
-rw-r--r--  fs/nfsd/nfs4state.c | 41
-rw-r--r--  fs/nfsd/nfs4xdr.c | 50
-rw-r--r--  fs/nfsd/nfscache.c | 142
-rw-r--r--  fs/nfsd/nfsctl.c | 1
-rw-r--r--  fs/nfsd/state.h | 10
-rw-r--r--  fs/nfsd/vfs.c | 5
-rw-r--r--  fs/nfsd/xdr4.h | 28
-rw-r--r--  fs/nfsd/xdr4cb.h | 10
-rw-r--r--  fs/nilfs2/btnode.c | 26
-rw-r--r--  fs/nilfs2/page.c | 29
-rw-r--r--  fs/notify/fanotify/fanotify.c | 17
-rw-r--r--  fs/notify/fanotify/fanotify.h | 4
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 103
-rw-r--r--  fs/notify/fdinfo.c | 29
-rw-r--r--  fs/notify/fsnotify.c | 42
-rw-r--r--  fs/notify/fsnotify.h | 11
-rw-r--r--  fs/notify/inotify/inotify_user.c | 2
-rw-r--r--  fs/notify/mark.c | 43
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/page.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 2
-rw-r--r--  fs/proc/vmcore.c | 4
-rw-r--r--  fs/pstore/Kconfig | 1
-rw-r--r--  fs/reiserfs/Makefile | 9
-rw-r--r--  fs/reiserfs/xattr.c | 7
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/udf/balloc.c | 30
-rw-r--r--  fs/udf/super.c | 139
-rw-r--r--  fs/udf/udf_sb.h | 10
110 files changed, 3318 insertions(+), 1865 deletions(-)
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 082d227fa56b..6261719f6f2a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
switch (handler->flags) {
case ACL_TYPE_ACCESS:
if (acl) {
- struct iattr iattr;
+ struct iattr iattr = { 0 };
struct posix_acl *old_acl = acl;
retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 89bac3d2f05b..619128b55837 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -61,6 +61,8 @@ enum {
Opt_cache_loose, Opt_fscache, Opt_mmap,
/* Access options */
Opt_access, Opt_posixacl,
+ /* Lock timeout option */
+ Opt_locktimeout,
/* Error token */
Opt_err
};
@@ -80,6 +82,7 @@ static const match_table_t tokens = {
{Opt_cachetag, "cachetag=%s"},
{Opt_access, "access=%s"},
{Opt_posixacl, "posixacl"},
+ {Opt_locktimeout, "locktimeout=%u"},
{Opt_err, NULL}
};
@@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#ifdef CONFIG_9P_FSCACHE
v9ses->cachetag = NULL;
#endif
+ v9ses->session_lock_timeout = P9_LOCK_TIMEOUT;
if (!opts)
return 0;
@@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#endif
break;
+ case Opt_locktimeout:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
+ if (option < 1) {
+ p9_debug(P9_DEBUG_ERROR,
+ "locktimeout must be a greater than zero integer.\n");
+ ret = -EINVAL;
+ continue;
+ }
+ v9ses->session_lock_timeout = (long)option * HZ;
+ break;
+
default:
continue;
}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 982e017acadb..129e5243a6bf 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -116,6 +116,7 @@ struct v9fs_session_info {
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
struct rw_semaphore rename_sem;
+ long session_lock_timeout; /* retry interval for blocking locks */
};
/* cache_validity flags */
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b0405d6aac85..cb6c4031af55 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -76,15 +76,6 @@ static inline int dt_type(struct p9_wstat *mistat)
return rettype;
}
-static void p9stat_init(struct p9_wstat *stbuf)
-{
- stbuf->name = NULL;
- stbuf->uid = NULL;
- stbuf->gid = NULL;
- stbuf->muid = NULL;
- stbuf->extension = NULL;
-}
-
/**
* v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
* @filp: opened file structure
@@ -114,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
int err = 0;
struct p9_fid *fid;
int buflen;
- int reclen = 0;
struct p9_rdir *rdir;
struct kvec kvec;
@@ -145,15 +135,12 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
rdir->tail = n;
}
while (rdir->head < rdir->tail) {
- p9stat_init(&st);
err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
rdir->tail - rdir->head, &st);
- if (err) {
+ if (err <= 0) {
p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
- p9stat_free(&st);
return -EIO;
}
- reclen = st.size+2;
over = !dir_emit(ctx, st.name, strlen(st.name),
v9fs_qid2ino(&st.qid), dt_type(&st));
@@ -161,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
if (over)
return 0;
- rdir->head += reclen;
- ctx->pos += reclen;
+ rdir->head += err;
+ ctx->pos += err;
}
}
}
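
The contract change in this hunk: p9stat_read() now returns the number of bytes it consumed from the buffer on success (and a value <= 0 on failure), so the caller advances by the return value instead of recomputing the record length as st.size + 2, and p9stat_read() itself takes care of initializing and freeing the wstat on error (hence the removal of p9stat_init() and of the p9stat_free() call). A minimal sketch of the resulting consumption loop, assuming that return convention (kernel context, not standalone):

        while (rdir->head < rdir->tail) {
                /* err is the number of bytes consumed, or <= 0 on error */
                err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
                                  rdir->tail - rdir->head, &st);
                if (err <= 0)
                        return -EIO;    /* short or malformed stat record */
                /* ... emit the entry, then advance by what was parsed ... */
                rdir->head += err;
                ctx->pos += err;
        }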
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5f2e48d41d72..a25efa782fcc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -154,6 +154,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
uint8_t status = P9_LOCK_ERROR;
int res = 0;
unsigned char fl_type;
+ struct v9fs_session_info *v9ses;
fid = filp->private_data;
BUG_ON(fid == NULL);
@@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
if (IS_SETLKW(cmd))
flock.flags = P9_LOCK_FLAGS_BLOCK;
+ v9ses = v9fs_inode2v9ses(file_inode(filp));
+
/*
* if its a blocked request and we get P9_LOCK_BLOCKED as the status
* for lock request, keep on trying
@@ -202,8 +205,17 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
break;
- if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+ if (schedule_timeout_interruptible(v9ses->session_lock_timeout)
+ != 0)
break;
+ /*
+ * p9_client_lock_dotl overwrites flock.client_id with the
+ * server message, free and reuse the client name
+ */
+ if (flock.client_id != fid->clnt->name) {
+ kfree(flock.client_id);
+ flock.client_id = fid->clnt->name;
+ }
}
/* map 9p status to VFS status */
@@ -216,7 +228,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
default:
WARN_ONCE(1, "unknown lock status code: %d\n", status);
- /* fallthough */
+ /* fall through */
case P9_LOCK_ERROR:
case P9_LOCK_GRACE:
res = -ENOLCK;
@@ -235,6 +247,8 @@ out_unlock:
locks_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
+ if (flock.client_id != fid->clnt->name)
+ kfree(flock.client_id);
out:
return res;
}
@@ -269,7 +283,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
res = p9_client_getlock_dotl(fid, &glock);
if (res < 0)
- return res;
+ goto out;
/* map 9p lock type to os lock type */
switch (glock.type) {
case P9_LOCK_TYPE_RDLCK:
@@ -290,7 +304,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
fl->fl_end = glock.start + glock.length - 1;
fl->fl_pid = -glock.proc_id;
}
- kfree(glock.client_id);
+out:
+ if (glock.client_id != fid->clnt->name)
+ kfree(glock.client_id);
return res;
}
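
Two independent fixes interact in the blocking-lock path above: the retry sleep is now the per-session interval parsed from the locktimeout= mount option (stored in jiffies, e.g. locktimeout=30 yields 30 * HZ), and p9_client_lock_dotl() may overwrite flock.client_id with a server-allocated string that must be freed before the client name is reused or the function returns. A condensed sketch of the loop under those assumptions (kernel context, simplified from the hunks above):

        while (p9_client_lock_dotl(fid, &flock, &status) == 0 &&
               status == P9_LOCK_BLOCKED && IS_SETLKW(cmd)) {
                if (schedule_timeout_interruptible(v9ses->session_lock_timeout))
                        break;                          /* hit by a signal */
                if (flock.client_id != fid->clnt->name) {
                        kfree(flock.client_id);         /* server's copy */
                        flock.client_id = fid->clnt->name;
                }
        }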
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8703ce68fe9d..2955a4ea2fa8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -437,10 +437,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&mapping->i_pages, pg_index);
- rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ page = xa_load(&mapping->i_pages, pg_index);
+ if (page && !xa_is_value(page)) {
misses++;
if (misses > 4)
break;
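
xa_load() is the XArray replacement for the open-coded RCU + radix tree pair: it takes the RCU read lock internally, and xa_is_value() supersedes radix_tree_exceptional_entry() as the test for shadow entries that are not real pages. The before/after shape of this conversion, as a sketch (kernel context):

        /* before: the caller brackets the lookup with RCU */
        rcu_read_lock();
        page = radix_tree_lookup(&mapping->i_pages, pg_index);
        rcu_read_unlock();
        if (page && !radix_tree_exceptional_entry(page)) { /* cached */ }

        /* after: xa_load() handles RCU; xa_is_value() spots value entries */
        page = xa_load(&mapping->i_pages, pg_index);
        if (page && !xa_is_value(page)) { /* cached */ }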
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2ee43b6a4f09..539901fb5165 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1014,9 +1014,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
+ /*
+ * If we are COWing a node/leaf from the extent, chunk or device trees,
+ * make sure that we do not finish block group creation of pending block
+ * groups. We do this to avoid a deadlock.
+ * COWing can result in allocation of a new chunk, and flushing pending
+ * block groups (btrfs_create_pending_block_groups()) can be triggered
+ * when finishing allocation of a new chunk. Creation of a pending block
+ * group modifies the extent, chunk and device trees, therefore we could
+ * deadlock with ourselves since we are holding a lock on an extent
+ * buffer that btrfs_create_pending_block_groups() may try to COW later.
+ */
+ if (root == fs_info->extent_root ||
+ root == fs_info->chunk_root ||
+ root == fs_info->dev_root)
+ trans->can_flush_pending_bgs = false;
+
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
search_start, empty_size);
+ trans->can_flush_pending_bgs = true;
if (IS_ERR(cow))
return PTR_ERR(cow);
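
The window in which flushing is disabled is deliberately narrow: the flag is cleared only when the tree being COWed is one of the trees that btrfs_create_pending_block_groups() itself modifies, and it is restored unconditionally as soon as the allocation returns, so only a chunk allocation nested inside this COW observes it (the companion change in extent-tree.c makes btrfs_create_pending_block_groups() honor the flag with an early return). Reduced to its shape (kernel context, arguments elided):

        if (root == fs_info->extent_root ||
            root == fs_info->chunk_root ||
            root == fs_info->dev_root)
                trans->can_flush_pending_bgs = false;   /* no re-entry */

        cow = btrfs_alloc_tree_block(/* ... */);        /* may alloc a chunk */
        trans->can_flush_pending_bgs = true;            /* always restore */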
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 5149165b49a4..9301b3ad9217 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -164,14 +164,27 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
return NULL;
}
+static struct btrfs_delayed_ref_head *find_first_ref_head(
+ struct btrfs_delayed_ref_root *dr)
+{
+ struct rb_node *n;
+ struct btrfs_delayed_ref_head *entry;
+
+ n = rb_first_cached(&dr->href_root);
+ if (!n)
+ return NULL;
+
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+ return entry;
+}
+
/*
- * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot.
- * If return_bigger is given, the next bigger entry is returned if no exact
- * match is found. But if no bigger one is found then the first node of the
- * ref head tree will be returned.
+ * Find a head entry based on bytenr. This returns the delayed ref head if it
+ * was able to find one, or NULL if nothing was in that spot. If return_bigger
+ * is given, the next bigger entry is returned if no exact match is found.
*/
-static struct btrfs_delayed_ref_head* find_ref_head(
+static struct btrfs_delayed_ref_head *find_ref_head(
struct btrfs_delayed_ref_root *dr, u64 bytenr,
bool return_bigger)
{
@@ -195,10 +208,9 @@ static struct btrfs_delayed_ref_head* find_ref_head(
if (bytenr > entry->bytenr) {
n = rb_next(&entry->href_node);
if (!n)
- n = rb_first_cached(&dr->href_root);
+ return NULL;
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
- return entry;
}
return entry;
}
@@ -355,33 +367,25 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs)
{
struct btrfs_delayed_ref_head *head;
- u64 start;
- bool loop = false;
again:
- start = delayed_refs->run_delayed_start;
- head = find_ref_head(delayed_refs, start, true);
- if (!head && !loop) {
+ head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
+ true);
+ if (!head && delayed_refs->run_delayed_start != 0) {
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
- head = find_ref_head(delayed_refs, start, true);
- if (!head)
- return NULL;
- } else if (!head && loop) {
- return NULL;
+ head = find_first_ref_head(delayed_refs);
}
+ if (!head)
+ return NULL;
while (head->processing) {
struct rb_node *node;
node = rb_next(&head->href_node);
if (!node) {
- if (loop)
+ if (delayed_refs->run_delayed_start == 0)
return NULL;
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
goto again;
}
head = rb_entry(node, struct btrfs_delayed_ref_head,
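
After this change, head selection reads as a cursor search with at most one wrap: look for the first head at or after run_delayed_start; if nothing is found and the cursor is non-zero, reset it and take the very first head; return NULL only when the tree is empty (or, in the busy-head loop, when a full pass found everything in-flight). A small standalone C model of that cursor-with-one-wrap search, using a sorted array in place of the rbtree (illustrative only, not the kernel code):

#include <stdio.h>
#include <stddef.h>

/* first entry >= start, or NULL (stands in for the rbtree search) */
static const unsigned long *find_ge(const unsigned long *v, size_t n,
                                    unsigned long start)
{
        for (size_t i = 0; i < n; i++)
                if (v[i] >= start)
                        return &v[i];
        return NULL;
}

static const unsigned long *select_head(const unsigned long *v, size_t n,
                                        unsigned long *cursor)
{
        const unsigned long *head = find_ge(v, n, *cursor);

        if (!head && *cursor != 0) {    /* wrap around exactly once */
                *cursor = 0;
                head = n ? &v[0] : NULL;
        }
        return head;
}

int main(void)
{
        const unsigned long heads[] = { 4096, 8192, 16384 };
        unsigned long cursor = 20000;   /* past the last head: forces a wrap */
        const unsigned long *h = select_head(heads, 3, &cursor);

        printf("selected %lu, cursor now %lu\n", h ? *h : 0UL, cursor);
        return 0;
}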
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a4cd0221bc8d..a1febf155747 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2366,6 +2366,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
insert_reserved);
else
BUG();
+ if (ret && insert_reserved)
+ btrfs_pin_extent(trans->fs_info, node->bytenr,
+ node->num_bytes, 1);
return ret;
}
@@ -2954,7 +2957,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2971,7 +2973,6 @@ again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -3002,7 +3003,6 @@ again:
goto again;
}
out:
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -4568,6 +4568,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
goto out;
} else {
ret = 1;
+ space_info->max_extent_size = 0;
}
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
@@ -4589,11 +4590,9 @@ out:
* the block groups that were made dirty during the lifetime of the
* transaction.
*/
- if (trans->can_flush_pending_bgs &&
- trans->chunk_bytes_reserved >= (u64)SZ_2M) {
+ if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
btrfs_create_pending_block_groups(trans);
- btrfs_trans_release_chunk_metadata(trans);
- }
+
return ret;
}
@@ -6464,6 +6463,7 @@ static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+ space_info->max_extent_size = 0;
if (delalloc)
cache->delalloc_bytes -= num_bytes;
@@ -7260,6 +7260,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
+ u64 max_free_space = 0;
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
@@ -7555,8 +7556,8 @@ unclustered_alloc:
spin_lock(&ctl->tree_lock);
if (ctl->free_space <
num_bytes + empty_cluster + empty_size) {
- if (ctl->free_space > max_extent_size)
- max_extent_size = ctl->free_space;
+ max_free_space = max(max_free_space,
+ ctl->free_space);
spin_unlock(&ctl->tree_lock);
goto loop;
}
@@ -7723,6 +7724,8 @@ loop:
}
out:
if (ret == -ENOSPC) {
+ if (!max_extent_size)
+ max_extent_size = max_free_space;
spin_lock(&space_info->lock);
space_info->max_extent_size = max_extent_size;
spin_unlock(&space_info->lock);
@@ -8004,21 +8007,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
+ if (!path)
return -ENOMEM;
- }
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
btrfs_free_path(path);
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
return ret;
}
@@ -10132,9 +10128,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
- trans->can_flush_pending_bgs = false;
+ if (!trans->can_flush_pending_bgs)
+ return;
+
while (!list_empty(&trans->new_bgs)) {
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group_cache,
@@ -10159,7 +10156,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
next:
list_del_init(&block_group->bg_list);
}
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
+ btrfs_trans_release_chunk_metadata(trans);
}
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6877a74c7469..d228f706ff3e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3784,7 +3784,7 @@ int btree_write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
@@ -3909,7 +3909,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int range_whole = 0;
int scanned = 0;
- int tag;
+ xa_mark_t tag;
/*
* We have to hold onto the inode so that ordered extents can do their
@@ -5159,11 +5159,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page)) {
- radix_tree_tag_clear(&page->mapping->i_pages,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- }
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 15b925142793..97c7a086f7bd 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2078,6 +2078,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
inode_lock(inode);
+
+ /*
+ * We take the dio_sem here because the tree log stuff can race with
+ * lockless dio writes and get an extent map logged for an extent we
+ * never waited on. We need it this high up for lockdep reasons.
+ */
+ down_write(&BTRFS_I(inode)->dio_sem);
+
atomic_inc(&root->log_batch);
/*
@@ -2086,6 +2094,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2109,6 +2118,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2127,6 +2137,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2148,6 +2159,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
/*
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 67441219d6c9..4ba0aedc878b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1772,6 +1772,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
return -1;
}
+static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
+{
+ if (entry->bitmap)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
@@ -1793,8 +1800,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
for (node = &entry->offset_index; node; node = rb_next(node)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bytes < *bytes) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1812,8 +1819,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bytes < *bytes + align_off) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1825,8 +1832,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
*offset = tmp;
*bytes = size;
return entry;
- } else if (size > *max_extent_size) {
- *max_extent_size = size;
+ } else {
+ *max_extent_size =
+ max(get_max_extent_size(entry),
+ *max_extent_size);
}
continue;
}
@@ -2449,6 +2458,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
struct rb_node *n;
int count = 0;
+ spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
@@ -2457,6 +2467,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
+ spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
btrfs_info(fs_info,
@@ -2685,8 +2696,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
- if (search_bytes > *max_extent_size)
- *max_extent_size = search_bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
return 0;
}
@@ -2723,8 +2734,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
while (1) {
- if (entry->bytes < bytes && entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ if (entry->bytes < bytes)
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
if (entry->bytes < bytes ||
(!entry->bitmap && entry->offset < min_start)) {
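
The get_max_extent_size() helper exists because the two entry types report size differently: for a plain extent entry, ->bytes is a single contiguous run, while for a bitmap entry ->bytes is the sum of possibly many fragments, and only ->max_extent_size bounds the largest allocation it can actually satisfy. Reporting ->bytes for bitmaps inflated max_extent_size and made callers retry allocations that could never succeed. A standalone sketch of the distinction, with hypothetical values:

#include <stdbool.h>
#include <stdio.h>

struct free_space {                     /* reduced model, not the real struct */
        unsigned long long bytes;               /* total free bytes */
        unsigned long long max_extent_size;     /* largest run (bitmaps) */
        bool bitmap;
};

static unsigned long long get_max_extent_size(const struct free_space *e)
{
        if (e->bitmap)
                return e->max_extent_size;
        return e->bytes;        /* extent entries are one contiguous run */
}

int main(void)
{
        /* 1 MiB free in total, but fragmented into 4 KiB pieces */
        struct free_space bm  = { 1 << 20, 4096, true };
        struct free_space ext = { 1 << 20, 0, false };

        printf("bitmap: at most %llu contiguous bytes\n",
               get_max_extent_size(&bm));       /* 4096, not 1 MiB */
        printf("extent: %llu contiguous bytes\n",
               get_max_extent_size(&ext));      /* the full 1 MiB */
        return 0;
}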
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 181c58b23110..d3df5b52278c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -502,6 +502,7 @@ again:
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
+ nr_pages = 0;
goto cont;
}
@@ -2940,6 +2941,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
+ bool clear_reserved_extent = true;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
@@ -3043,10 +3045,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
- if (!ret)
+ if (!ret) {
+ clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
ordered_extent->start,
ordered_extent->disk_len);
+ }
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -3107,8 +3111,13 @@ out:
* wrong we need to return the space for this ordered extent
* back to the allocator. We only free the extent in the
* truncated case if we didn't write out the extent at all.
+ *
+ * If we made it past insert_reserved_file_extent before we
+ * errored out then we don't need to do this as the accounting
+ * has already been done.
*/
if ((ret || !logical_len) &&
+ clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(fs_info,
@@ -5259,11 +5268,13 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+ unsigned state_flags;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
start = state->start;
end = state->end;
+ state_flags = state->state;
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, &cached_state);
@@ -5276,7 +5287,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
- if (state->state & EXTENT_DELALLOC)
+ if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
clear_extent_bit(io_tree, start, end,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5686290a50e1..d1eeef9ec5da 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2283,15 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- /*
- * If fs has been frozen, we can not handle delayed iputs, otherwise
- * it'll result in deadlock about SB_FREEZE_FS.
- */
- if (current != fs_info->transaction_kthread &&
- current != fs_info->cleaner_kthread &&
- !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
- btrfs_run_delayed_iputs(fs_info);
-
return ret;
scrub_continue:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0dba09334a16..e07f3376b7df 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4390,7 +4390,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&extents);
- down_write(&inode->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
logged_start = start;
@@ -4456,7 +4455,6 @@ process:
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- up_write(&inode->dio_sem);
btrfs_release_path(path);
if (!ret)
@@ -4652,7 +4650,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
ASSERT(len == i_size ||
(len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE));
+ BTRFS_COMPRESS_NONE) ||
+ (len < i_size && i_size < fs_info->sectorsize));
return 0;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 109f55196866..d60d61e8ed7d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -562,7 +562,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
EXPORT_SYMBOL(mark_buffer_dirty_inode);
/*
- * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
* dirty.
*
* If warn is true, then emit a warning if the page is not uptodate and has
@@ -579,8 +579,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping,
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, page_index(page),
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -1050,7 +1050,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* The relationship between dirty buffers and dirty pages:
*
* Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page is tagged dirty in its radix tree.
+ * the page is tagged dirty in the page cache.
*
* At all times, the dirtiness of the buffers represents the dirtiness of
* subsections of the page. If the page has buffers, the page dirty bit is
@@ -1073,9 +1073,9 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* mark_buffer_dirty - mark a buffer_head as needing writeout
* @bh: the buffer_head to mark dirty
*
- * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- * backing page dirty, then tag the page as dirty in its address_space's radix
- * tree and then attach the address_space's inode to its superblock's dirty
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set
+ * its backing page dirty, then tag the page as dirty in the page cache
+ * and then attach the address_space's inode to its superblock's dirty
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 027408d55aee..5f0103f40079 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -104,6 +104,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
struct timespec64 old_ctime = inode->i_ctime;
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto out;
+ }
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -138,11 +143,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
goto out_free;
}
- if (ceph_snap(inode) != CEPH_NOSNAP) {
- ret = -EROFS;
- goto out_free;
- }
-
if (new_mode != old_mode) {
newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
@@ -206,10 +206,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
if (!tmp_buf)
goto out_err;
- pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
+ pagelist = ceph_pagelist_alloc(GFP_KERNEL);
if (!pagelist)
goto out_err;
- ceph_pagelist_init(pagelist);
err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
if (err)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9c332a6f6667..8eade7a993c1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -322,7 +322,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
/* caller of readpages does not hold buffer and read caps
* (fadvise, madvise and readahead cases) */
int want = CEPH_CAP_FILE_CACHE;
- ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got);
+ ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got);
if (ret < 0) {
dout("start_read %p, error getting cap\n", inode);
} else if (!(got & want)) {
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index dd7dfdd2ba13..f3496db4bb3e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -519,9 +519,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci)
+ struct ceph_inode_info *ci,
+ bool set_timeout)
{
- __cap_set_timeouts(mdsc, ci);
dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
@@ -531,6 +531,8 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
+ if (set_timeout)
+ __cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
spin_unlock(&mdsc->cap_delay_lock);
@@ -720,7 +722,7 @@ void ceph_add_cap(struct inode *inode,
dout(" issued %s, mds wanted %s, actual %s, queueing\n",
ceph_cap_string(issued), ceph_cap_string(wanted),
ceph_cap_string(actual_wanted));
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, true);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -1647,7 +1649,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, true);
return dirty;
}
@@ -2065,7 +2067,7 @@ ack:
/* Reschedule delayed caps release if we delayed anything */
if (delayed)
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, false);
spin_unlock(&ci->i_ceph_lock);
@@ -2125,7 +2127,7 @@ retry:
if (delayed) {
spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, true);
spin_unlock(&ci->i_ceph_lock);
}
} else {
@@ -2671,17 +2673,18 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
-int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
+int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
+ bool nonblock, int *got)
{
int ret, err = 0;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
- BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
ret = ceph_pool_perm_check(ci, need);
if (ret < 0)
return ret;
- ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
+ ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err);
if (ret) {
if (err == -EAGAIN) {
ret = 0;
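
The new bool plumbs the blocking decision out to the callers: try_get_cap_refs() used to be invoked here with nonblock hard-coded to true, while the copy_file_range path added in file.c wants the option of waiting for FILE_RD|FILE_SHARED on the source inode. Call-site shapes under the new signature, as a sketch (kernel context, not standalone):

        /* readahead keeps the old behaviour: fail fast rather than wait */
        ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
                                true /* nonblock */, &got);

        /* get_rd_wr_caps() in file.c is prepared to wait on the source caps */
        ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD,
                                CEPH_CAP_FILE_SHARED,
                                false /* may block */, src_got);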
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 92ab20433682..f788496fafcc 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/striper.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -557,90 +558,26 @@ enum {
};
/*
- * Read a range of bytes striped over one or more objects. Iterate over
- * objects we stripe over. (That's not atomic, but good enough for now.)
- *
- * If we get a short result from the OSD, check against i_size; we need to
- * only return a short read to the caller if we hit EOF.
- */
-static int striped_read(struct inode *inode,
- u64 pos, u64 len,
- struct page **pages, int num_pages,
- int page_align, int *checkeof)
-{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_inode_info *ci = ceph_inode(inode);
- u64 this_len;
- loff_t i_size;
- int page_idx;
- int ret, read = 0;
- bool hit_stripe, was_short;
-
- /*
- * we may need to do multiple reads. not atomic, unfortunately.
- */
-more:
- this_len = len;
- page_idx = (page_align + read) >> PAGE_SHIFT;
- ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
- &ci->i_layout, pos, &this_len,
- ci->i_truncate_seq, ci->i_truncate_size,
- pages + page_idx, num_pages - page_idx,
- ((page_align + read) & ~PAGE_MASK));
- if (ret == -ENOENT)
- ret = 0;
- hit_stripe = this_len < len;
- was_short = ret >= 0 && ret < this_len;
- dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
- ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
-
- i_size = i_size_read(inode);
- if (ret >= 0) {
- if (was_short && (pos + ret < i_size)) {
- int zlen = min(this_len - ret, i_size - pos - ret);
- int zoff = page_align + read + ret;
- dout(" zero gap %llu to %llu\n",
- pos + ret, pos + ret + zlen);
- ceph_zero_page_vector_range(zoff, zlen, pages);
- ret += zlen;
- }
-
- read += ret;
- pos += ret;
- len -= ret;
-
- /* hit stripe and need continue*/
- if (len && hit_stripe && pos < i_size)
- goto more;
- }
-
- if (read > 0) {
- ret = read;
- /* did we bounce off eof? */
- if (pos + len > i_size)
- *checkeof = CHECK_EOF;
- }
-
- dout("striped_read returns %d\n", ret);
- return ret;
-}
-
-/*
* Completely synchronous read and write methods. Direct from __user
* buffer to osd, or directly to user pages (if O_DIRECT).
*
- * If the read spans object boundary, just do multiple reads.
+ * If the read spans object boundary, just do multiple reads. (That's not
+ * atomic, but good enough for now.)
+ *
+ * If we get a short result from the OSD, check against i_size; we need to
+ * only return a short read to the caller if we hit EOF.
*/
static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
- int *checkeof)
+ int *retry_op)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct page **pages;
- u64 off = iocb->ki_pos;
- int num_pages;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
ssize_t ret;
- size_t len = iov_iter_count(to);
+ u64 off = iocb->ki_pos;
+ u64 len = iov_iter_count(to);
dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
@@ -653,61 +590,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, off,
- off + len);
+ ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
if (ret < 0)
return ret;
- if (unlikely(to->type & ITER_PIPE)) {
+ ret = 0;
+ while ((len = iov_iter_count(to)) > 0) {
+ struct ceph_osd_request *req;
+ struct page **pages;
+ int num_pages;
size_t page_off;
- ret = iov_iter_get_pages_alloc(to, &pages, len,
- &page_off);
- if (ret <= 0)
- return -ENOMEM;
- num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+ u64 i_size;
+ bool more;
+
+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
+ ci->i_vino, off, &len, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ more = len < iov_iter_count(to);
- ret = striped_read(inode, off, ret, pages, num_pages,
- page_off, checkeof);
- if (ret > 0) {
- iov_iter_advance(to, ret);
- off += ret;
+ if (unlikely(to->type & ITER_PIPE)) {
+ ret = iov_iter_get_pages_alloc(to, &pages, len,
+ &page_off);
+ if (ret <= 0) {
+ ceph_osdc_put_request(req);
+ ret = -ENOMEM;
+ break;
+ }
+ num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+ if (ret < len) {
+ len = ret;
+ osd_req_op_extent_update(req, 0, len);
+ more = false;
+ }
} else {
- iov_iter_advance(to, 0);
+ num_pages = calc_pages_for(off, len);
+ page_off = off & ~PAGE_MASK;
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
+ break;
+ }
}
- ceph_put_page_vector(pages, num_pages, false);
- } else {
- num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
- ret = striped_read(inode, off, len, pages, num_pages,
- (off & ~PAGE_MASK), checkeof);
- if (ret > 0) {
- int l, k = 0;
- size_t left = ret;
-
- while (left) {
- size_t page_off = off & ~PAGE_MASK;
- size_t copy = min_t(size_t, left,
- PAGE_SIZE - page_off);
- l = copy_page_to_iter(pages[k++], page_off,
- copy, to);
- off += l;
- left -= l;
- if (l < copy)
+
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
+ false, false);
+ ret = ceph_osdc_start_request(osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_put_request(req);
+
+ i_size = i_size_read(inode);
+ dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
+ off, len, ret, i_size, (more ? " MORE" : ""));
+
+ if (ret == -ENOENT)
+ ret = 0;
+ if (ret >= 0 && ret < len && (off + ret < i_size)) {
+ int zlen = min(len - ret, i_size - off - ret);
+ int zoff = page_off + ret;
+ dout("sync_read zero gap %llu~%llu\n",
+ off + ret, off + ret + zlen);
+ ceph_zero_page_vector_range(zoff, zlen, pages);
+ ret += zlen;
+ }
+
+ if (unlikely(to->type & ITER_PIPE)) {
+ if (ret > 0) {
+ iov_iter_advance(to, ret);
+ off += ret;
+ } else {
+ iov_iter_advance(to, 0);
+ }
+ ceph_put_page_vector(pages, num_pages, false);
+ } else {
+ int idx = 0;
+ size_t left = ret > 0 ? ret : 0;
+ while (left > 0) {
+ size_t len, copied;
+ page_off = off & ~PAGE_MASK;
+ len = min_t(size_t, left, PAGE_SIZE - page_off);
+ copied = copy_page_to_iter(pages[idx++],
+ page_off, len, to);
+ off += copied;
+ left -= copied;
+ if (copied < len) {
+ ret = -EFAULT;
break;
+ }
}
+ ceph_release_page_vector(pages, num_pages);
}
- ceph_release_page_vector(pages, num_pages);
+
+ if (ret <= 0 || off >= i_size || !more)
+ break;
}
if (off > iocb->ki_pos) {
+ if (ret >= 0 &&
+ iov_iter_count(to) > 0 && off >= i_size_read(inode))
+ *retry_op = CHECK_EOF;
ret = off - iocb->ki_pos;
iocb->ki_pos = off;
}
- dout("sync_read result %zd\n", ret);
+ dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
return ret;
}
@@ -865,7 +859,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
}
spin_unlock(&ci->i_ceph_lock);
- req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
+ req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1,
false, GFP_NOFS);
if (!req) {
ret = -ENOMEM;
@@ -877,6 +871,11 @@ static void ceph_aio_retry_work(struct work_struct *work)
ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
+ req->r_ops[0] = orig_req->r_ops[0];
+
+ req->r_mtime = aio_req->mtime;
+ req->r_data_offset = req->r_ops[0].extent.offset;
+
ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
if (ret) {
ceph_osdc_put_request(req);
@@ -884,11 +883,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
goto out;
}
- req->r_ops[0] = orig_req->r_ops[0];
-
- req->r_mtime = aio_req->mtime;
- req->r_data_offset = req->r_ops[0].extent.offset;
-
ceph_osdc_put_request(orig_req);
req->r_callback = ceph_aio_complete_req;
@@ -1735,7 +1729,6 @@ static long ceph_fallocate(struct file *file, int mode,
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_cap_flush *prealloc_cf;
int want, got = 0;
int dirty;
@@ -1743,10 +1736,7 @@ static long ceph_fallocate(struct file *file, int mode,
loff_t endoff = 0;
loff_t size;
- if ((offset + length) > max(i_size_read(inode), fsc->max_file_size))
- return -EFBIG;
-
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
if (!S_ISREG(inode->i_mode))
@@ -1763,18 +1753,6 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
- ceph_quota_is_max_bytes_exceeded(inode, offset + length)) {
- ret = -EDQUOT;
- goto unlock;
- }
-
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL) &&
- !(mode & FALLOC_FL_PUNCH_HOLE)) {
- ret = -ENOSPC;
- goto unlock;
- }
-
if (ci->i_inline_version != CEPH_INLINE_NONE) {
ret = ceph_uninline_data(file, NULL);
if (ret < 0)
@@ -1782,12 +1760,12 @@ static long ceph_fallocate(struct file *file, int mode,
}
size = i_size_read(inode);
- if (!(mode & FALLOC_FL_KEEP_SIZE)) {
- endoff = offset + length;
- ret = inode_newsize_ok(inode, endoff);
- if (ret)
- goto unlock;
- }
+
+ /* Are we punching a hole beyond EOF? */
+ if (offset >= size)
+ goto unlock;
+ if ((offset + length) > size)
+ length = size - offset;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
@@ -1798,16 +1776,8 @@ static long ceph_fallocate(struct file *file, int mode,
if (ret < 0)
goto unlock;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- if (offset < size)
- ceph_zero_pagecache_range(inode, offset, length);
- ret = ceph_zero_objects(inode, offset, length);
- } else if (endoff > size) {
- truncate_pagecache_range(inode, size, -1);
- if (ceph_inode_set_size(inode, endoff))
- ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY, NULL);
- }
+ ceph_zero_pagecache_range(inode, offset, length);
+ ret = ceph_zero_objects(inode, offset, length);
if (!ret) {
spin_lock(&ci->i_ceph_lock);
@@ -1817,9 +1787,6 @@ static long ceph_fallocate(struct file *file, int mode,
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
- if ((endoff > size) &&
- ceph_quota_is_max_bytes_approaching(inode, endoff))
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
}
ceph_put_cap_refs(ci, got);
@@ -1829,6 +1796,300 @@ unlock:
return ret;
}
+/*
+ * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
+ * src_ci. Two attempts are made to obtain both caps, and an error is returned if
+ * this fails; zero is returned on success.
+ */
+static int get_rd_wr_caps(struct ceph_inode_info *src_ci,
+ loff_t src_endoff, int *src_got,
+ struct ceph_inode_info *dst_ci,
+ loff_t dst_endoff, int *dst_got)
+{
+ int ret = 0;
+ bool retrying = false;
+
+retry_caps:
+ ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+ dst_endoff, dst_got, NULL);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Since we're already holding the FILE_WR capability for the dst file,
+ * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
+ * retry dance instead to try to get both capabilities.
+ */
+ ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
+ false, src_got);
+ if (ret <= 0) {
+ /* Start by dropping dst_ci caps and getting src_ci caps */
+ ceph_put_cap_refs(dst_ci, *dst_got);
+ if (retrying) {
+ if (!ret)
+ /* ceph_try_get_caps masks EAGAIN */
+ ret = -EAGAIN;
+ return ret;
+ }
+ ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD,
+ CEPH_CAP_FILE_SHARED, src_endoff,
+ src_got, NULL);
+ if (ret < 0)
+ return ret;
+ /* ... drop src_ci caps too, and retry */
+ ceph_put_cap_refs(src_ci, *src_got);
+ retrying = true;
+ goto retry_caps;
+ }
+ return ret;
+}
+
+static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
+ struct ceph_inode_info *dst_ci, int dst_got)
+{
+ ceph_put_cap_refs(src_ci, src_got);
+ ceph_put_cap_refs(dst_ci, dst_got);
+}
+
+/*
+ * This function does several size-related checks, returning an error if:
+ * - source file is smaller than off+len
+ * - destination file size is not OK (inode_newsize_ok())
+ * - max bytes quota is exceeded
+ */
+static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
+ loff_t src_off, loff_t dst_off, size_t len)
+{
+ loff_t size, endoff;
+
+ size = i_size_read(src_inode);
+ /*
+ * Don't copy beyond source file EOF. Instead of simply setting length
+ * to (size - src_off), just drop to VFS default implementation, as the
+ * local i_size may be stale due to other clients writing to the source
+ * inode.
+ */
+ if (src_off + len > size) {
+ dout("Copy beyond EOF (%llu + %zu > %llu)\n",
+ src_off, len, size);
+ return -EOPNOTSUPP;
+ }
+ size = i_size_read(dst_inode);
+
+ endoff = dst_off + len;
+ if (inode_newsize_ok(dst_inode, endoff))
+ return -EOPNOTSUPP;
+
+ if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff))
+ return -EDQUOT;
+
+ return 0;
+}
+
+static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off,
+ size_t len, unsigned int flags)
+{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *dst_inode = file_inode(dst_file);
+ struct ceph_inode_info *src_ci = ceph_inode(src_inode);
+ struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
+ struct ceph_cap_flush *prealloc_cf;
+ struct ceph_object_locator src_oloc, dst_oloc;
+ struct ceph_object_id src_oid, dst_oid;
+ loff_t endoff = 0, size;
+ ssize_t ret = -EIO;
+ u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
+ u32 src_objlen, dst_objlen, object_size;
+ int src_got = 0, dst_got = 0, err, dirty;
+ bool do_final_copy = false;
+
+ if (src_inode == dst_inode)
+ return -EINVAL;
+ if (ceph_snap(dst_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ /*
+ * Some of the checks below will return -EOPNOTSUPP, which will force a
+ * fallback to the default VFS copy_file_range implementation. This is
+ * desirable in several cases (for ex, the 'len' is smaller than the
+ * size of the objects, or in cases where that would be more
+ * efficient).
+ */
+
+ if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM))
+ return -EOPNOTSUPP;
+
+ if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
+ (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) ||
+ (src_ci->i_layout.object_size != dst_ci->i_layout.object_size))
+ return -EOPNOTSUPP;
+
+ if (len < src_ci->i_layout.object_size)
+ return -EOPNOTSUPP; /* no remote copy will be done */
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ /* Start by sync'ing the source file */
+ ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
+ * clients may have dirty data in their caches. And OSDs know nothing
+ * about caps, so they can't safely do the remote object copies.
+ */
+ err = get_rd_wr_caps(src_ci, (src_off + len), &src_got,
+ dst_ci, (dst_off + len), &dst_got);
+ if (err < 0) {
+ dout("get_rd_wr_caps returned %d\n", err);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
+ if (ret < 0)
+ goto out_caps;
+
+ size = i_size_read(dst_inode);
+ endoff = dst_off + len;
+
+ /* Drop dst file cached pages */
+ ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
+ dst_off >> PAGE_SHIFT,
+ endoff >> PAGE_SHIFT);
+ if (ret < 0) {
+ dout("Failed to invalidate inode pages (%zd)\n", ret);
+ ret = 0; /* XXX */
+ }
+ src_oloc.pool = src_ci->i_layout.pool_id;
+ src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+ dst_oloc.pool = dst_ci->i_layout.pool_id;
+ dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+
+ ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
+ src_ci->i_layout.object_size,
+ &src_objnum, &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
+ dst_ci->i_layout.object_size,
+ &dst_objnum, &dst_objoff, &dst_objlen);
+ /* object-level offsets need to be the same */
+ if (src_objoff != dst_objoff) {
+ ret = -EOPNOTSUPP;
+ goto out_caps;
+ }
+
+ /*
+ * Do a manual copy if the object offset isn't object aligned.
+ * 'src_objlen' contains the bytes left until the end of the object,
+ * starting at the src_off
+ */
+ if (src_objoff) {
+ /*
+ * we need to temporarily drop all caps as we'll be calling
+ * {read,write}_iter, which will get caps again.
+ */
+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+ ret = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, src_objlen, flags);
+ if (ret < 0) {
+ dout("do_splice_direct returned %d\n", err);
+ goto out;
+ }
+ len -= ret;
+ err = get_rd_wr_caps(src_ci, (src_off + len),
+ &src_got, dst_ci,
+ (dst_off + len), &dst_got);
+ if (err < 0)
+ goto out;
+ err = is_file_size_ok(src_inode, dst_inode,
+ src_off, dst_off, len);
+ if (err < 0)
+ goto out_caps;
+ }
+ object_size = src_ci->i_layout.object_size;
+ while (len >= object_size) {
+ ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
+ object_size, &src_objnum,
+ &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
+ object_size, &dst_objnum,
+ &dst_objoff, &dst_objlen);
+ ceph_oid_init(&src_oid);
+ ceph_oid_printf(&src_oid, "%llx.%08llx",
+ src_ci->i_vino.ino, src_objnum);
+ ceph_oid_init(&dst_oid);
+ ceph_oid_printf(&dst_oid, "%llx.%08llx",
+ dst_ci->i_vino.ino, dst_objnum);
+ /* Do an object remote copy */
+ err = ceph_osdc_copy_from(
+ &ceph_inode_to_client(src_inode)->client->osdc,
+ src_ci->i_vino.snap, 0,
+ &src_oid, &src_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
+ &dst_oid, &dst_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0);
+ if (err) {
+ dout("ceph_osdc_copy_from returned %d\n", err);
+ if (!ret)
+ ret = err;
+ goto out_caps;
+ }
+ len -= object_size;
+ src_off += object_size;
+ dst_off += object_size;
+ ret += object_size;
+ }
+
+ if (len)
+ /* We still need one final local copy */
+ do_final_copy = true;
+
+ file_update_time(dst_file);
+ if (endoff > size) {
+ int caps_flags = 0;
+
+ /* Let the MDS know about dst file size change */
+ if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
+ caps_flags |= CHECK_CAPS_NODELAY;
+ if (ceph_inode_set_size(dst_inode, endoff))
+ caps_flags |= CHECK_CAPS_AUTHONLY;
+ if (caps_flags)
+ ceph_check_caps(dst_ci, caps_flags, NULL);
+ }
+ /* Mark Fw dirty */
+ spin_lock(&dst_ci->i_ceph_lock);
+ dst_ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&dst_ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(dst_inode, dirty);
+
+out_caps:
+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+
+ if (do_final_copy) {
+ err = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, len, flags);
+ if (err < 0) {
+ dout("do_splice_direct returned %d\n", err);
+ goto out;
+ }
+ len -= err;
+ ret += err;
+ }
+
+out:
+ ceph_free_cap_flush(prealloc_cf);
+
+ return ret;
+}
+
const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
@@ -1844,5 +2105,5 @@ const struct file_operations ceph_file_fops = {
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
.fallocate = ceph_fallocate,
+ .copy_file_range = ceph_copy_file_range,
};
-
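
With .copy_file_range wired up, a CephFS mount can exercise the remote object copies directly from userspace via copy_file_range(2); wherever the checks above bail out with -EOPNOTSUPP, the VFS transparently falls back to its generic implementation. A minimal standalone example (glibc 2.27+; the mount paths are placeholders):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
        int in = open("/mnt/cephfs/src", O_RDONLY);
        int out = open("/mnt/cephfs/dst", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        loff_t off_in = 0, off_out = 0;
        ssize_t n;

        if (in < 0 || out < 0) {
                perror("open");
                return 1;
        }
        /* Up to 64 MiB; object-aligned spans may become RADOS 'copy-from'
         * operations, the unaligned remainder is spliced locally. */
        n = copy_file_range(in, &off_in, out, &off_out, 64 << 20, 0);
        if (n < 0)
                perror("copy_file_range");
        else
                printf("copied %zd bytes\n", n);
        close(in);
        close(out);
        return n < 0;
}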
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ebc7bdaed2d0..79dd5e6ed755 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1132,8 +1132,12 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
if (IS_ERR(realdn)) {
pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
PTR_ERR(realdn), dn, in, ceph_vinop(in));
- dput(dn);
- dn = realdn; /* note realdn contains the error */
+ dn = realdn;
+ /*
+ * Caller should release 'dn' in the case of error.
+ * If 'req->r_dentry' is passed to this function,
+ * caller should leave 'req->r_dentry' untouched.
+ */
goto out;
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
@@ -1196,7 +1200,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct qstr dname;
struct dentry *dn, *parent;
@@ -1677,7 +1683,6 @@ retry_lookup:
if (IS_ERR(realdn)) {
err = PTR_ERR(realdn);
d_drop(dn);
- dn = NULL;
goto next_item;
}
dn = realdn;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bc43c822426a..67a9aeb2f4ec 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2071,7 +2071,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_old_dentry_drop)
len += req->r_old_dentry->d_name.len;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
+ msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
if (!msg) {
msg = ERR_PTR(-ENOMEM);
goto out_free2;
@@ -2136,7 +2136,6 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_pagelist) {
struct ceph_pagelist *pagelist = req->r_pagelist;
- refcount_inc(&pagelist->refcnt);
ceph_msg_data_add_pagelist(msg, pagelist);
msg->hdr.data_len = cpu_to_le32(pagelist->length);
} else {
@@ -3126,12 +3125,11 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
pr_info("mds%d reconnect start\n", mds);
- pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
goto fail_nopagelist;
- ceph_pagelist_init(pagelist);
- reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
+ reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
if (!reply)
goto fail_nomsg;
@@ -3241,6 +3239,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
up_read(&mdsc->snap_rwsem);
+ ceph_pagelist_release(pagelist);
return;
fail:
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index eab1359d0553..b5ecd6f50360 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -165,6 +165,8 @@ enum {
Opt_noacl,
Opt_quotadf,
Opt_noquotadf,
+ Opt_copyfrom,
+ Opt_nocopyfrom,
};
static match_table_t fsopt_tokens = {
@@ -203,6 +205,8 @@ static match_table_t fsopt_tokens = {
{Opt_noacl, "noacl"},
{Opt_quotadf, "quotadf"},
{Opt_noquotadf, "noquotadf"},
+ {Opt_copyfrom, "copyfrom"},
+ {Opt_nocopyfrom, "nocopyfrom"},
{-1, NULL}
};
@@ -355,6 +359,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_noquotadf:
fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
break;
+ case Opt_copyfrom:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM;
+ break;
+ case Opt_nocopyfrom:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM;
+ break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl:
fsopt->sb_flags |= SB_POSIXACL;
@@ -553,6 +563,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM)
+ seq_puts(m, ",nocopyfrom");
+
if (fsopt->mds_namespace)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
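
copyfrom/nocopyfrom follow the standard pattern of paired tokens that set and clear a single flag bit, with show_options printing only the non-default spelling. A toy parser showing that symmetry (the default chosen in this sketch is arbitrary):

#include <stdio.h>
#include <string.h>

#define OPT_NOCOPYFROM	(1u << 14)	/* mirrors CEPH_MOUNT_OPT_NOCOPYFROM */

static unsigned int parse_opts(char *opts)
{
	unsigned int flags = OPT_NOCOPYFROM;	/* sketch default only */

	for (char *tok = strtok(opts, ","); tok; tok = strtok(NULL, ","))
		if (!strcmp(tok, "copyfrom"))
			flags &= ~OPT_NOCOPYFROM;
		else if (!strcmp(tok, "nocopyfrom"))
			flags |= OPT_NOCOPYFROM;
	return flags;
}

int main(void)
{
	char opts[] = "rw,copyfrom,noatime";
	unsigned int flags = parse_opts(opts);

	/* show_options style: print the bit only when it is set */
	if (flags & OPT_NOCOPYFROM)
		puts(",nocopyfrom");
	return 0;
}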
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 582e28fd1b7b..c005a5400f2e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -40,6 +40,7 @@
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
+#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
@@ -1008,7 +1009,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page);
extern int ceph_try_get_caps(struct ceph_inode_info *ci,
- int need, int want, int *got);
+ int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 5cc8b94f8206..316f6ad10644 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -951,11 +951,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
if (size > 0) {
/* copy value into pagelist */
- pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
return -ENOMEM;
- ceph_pagelist_init(pagelist);
err = ceph_pagelist_append(pagelist, value, size);
if (err)
goto out;
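
The xattr path keeps the single-exit cleanup style: allocate the pagelist, then jump to one label on any failure so the resource is released exactly once. The same idiom in miniature (stand-in names, not the ceph code):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

static int set_value(const char *value, size_t size)
{
	int err = 0;
	char *buf = NULL;

	if (size > 0) {
		buf = malloc(size);	/* stand-in for ceph_pagelist_alloc() */
		if (!buf)
			return -ENOMEM;
		if (size > 4096) {	/* pretend the append can fail */
			err = -E2BIG;
			goto out;
		}
		memcpy(buf, value, size);
	}
	/* ... submit the request using buf ... */
out:
	free(buf);	/* the single exit point releases the resource */
	return err;
}

int main(void) { return set_value("hello", 5) ? EXIT_FAILURE : EXIT_SUCCESS; }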
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index ce2cc2169040..6e30949d9f77 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -64,11 +64,6 @@
#include <linux/hiddev.h>
-#define __DVB_CORE__
-#include <linux/dvb/audio.h>
-#include <linux/dvb/dmx.h>
-#include <linux/dvb/frontend.h>
-#include <linux/dvb/video.h>
#include <linux/sort.h>
@@ -95,71 +90,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return vfs_ioctl(file, cmd, arg);
}
-struct compat_video_event {
- int32_t type;
- compat_time_t timestamp;
- union {
- video_size_t size;
- unsigned int frame_rate;
- } u;
-};
-
-static int do_video_get_event(struct file *file,
- unsigned int cmd, struct compat_video_event __user *up)
-{
- struct video_event __user *kevent =
- compat_alloc_user_space(sizeof(*kevent));
- int err;
-
- if (kevent == NULL)
- return -EFAULT;
-
- err = do_ioctl(file, cmd, (unsigned long)kevent);
- if (!err) {
- err = convert_in_user(&kevent->type, &up->type);
- err |= convert_in_user(&kevent->timestamp, &up->timestamp);
- err |= convert_in_user(&kevent->u.size.w, &up->u.size.w);
- err |= convert_in_user(&kevent->u.size.h, &up->u.size.h);
- err |= convert_in_user(&kevent->u.size.aspect_ratio,
- &up->u.size.aspect_ratio);
- if (err)
- err = -EFAULT;
- }
-
- return err;
-}
-
-struct compat_video_still_picture {
- compat_uptr_t iFrame;
- int32_t size;
-};
-
-static int do_video_stillpicture(struct file *file,
- unsigned int cmd, struct compat_video_still_picture __user *up)
-{
- struct video_still_picture __user *up_native;
- compat_uptr_t fp;
- int32_t size;
- int err;
-
- err = get_user(fp, &up->iFrame);
- err |= get_user(size, &up->size);
- if (err)
- return -EFAULT;
-
- up_native =
- compat_alloc_user_space(sizeof(struct video_still_picture));
-
- err = put_user(compat_ptr(fp), &up_native->iFrame);
- err |= put_user(size, &up_native->size);
- if (err)
- return -EFAULT;
-
- err = do_ioctl(file, cmd, (unsigned long) up_native);
-
- return err;
-}
-
#ifdef CONFIG_BLOCK
typedef struct sg_io_hdr32 {
compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
@@ -958,61 +888,6 @@ COMPATIBLE_IOCTL(HIDIOCGFLAG)
COMPATIBLE_IOCTL(HIDIOCSFLAG)
COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
-/* dvb */
-COMPATIBLE_IOCTL(AUDIO_STOP)
-COMPATIBLE_IOCTL(AUDIO_PLAY)
-COMPATIBLE_IOCTL(AUDIO_PAUSE)
-COMPATIBLE_IOCTL(AUDIO_CONTINUE)
-COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE)
-COMPATIBLE_IOCTL(AUDIO_SET_MUTE)
-COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC)
-COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE)
-COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT)
-COMPATIBLE_IOCTL(AUDIO_GET_STATUS)
-COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES)
-COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER)
-COMPATIBLE_IOCTL(AUDIO_SET_ID)
-COMPATIBLE_IOCTL(AUDIO_SET_MIXER)
-COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE)
-COMPATIBLE_IOCTL(DMX_START)
-COMPATIBLE_IOCTL(DMX_STOP)
-COMPATIBLE_IOCTL(DMX_SET_FILTER)
-COMPATIBLE_IOCTL(DMX_SET_PES_FILTER)
-COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE)
-COMPATIBLE_IOCTL(DMX_GET_PES_PIDS)
-COMPATIBLE_IOCTL(DMX_GET_STC)
-COMPATIBLE_IOCTL(DMX_REQBUFS)
-COMPATIBLE_IOCTL(DMX_QUERYBUF)
-COMPATIBLE_IOCTL(DMX_EXPBUF)
-COMPATIBLE_IOCTL(DMX_QBUF)
-COMPATIBLE_IOCTL(DMX_DQBUF)
-COMPATIBLE_IOCTL(VIDEO_STOP)
-COMPATIBLE_IOCTL(VIDEO_PLAY)
-COMPATIBLE_IOCTL(VIDEO_FREEZE)
-COMPATIBLE_IOCTL(VIDEO_CONTINUE)
-COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE)
-COMPATIBLE_IOCTL(VIDEO_SET_BLANK)
-COMPATIBLE_IOCTL(VIDEO_GET_STATUS)
-COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT)
-COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD)
-COMPATIBLE_IOCTL(VIDEO_SLOWMOTION)
-COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES)
-COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER)
-COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE)
-COMPATIBLE_IOCTL(VIDEO_SET_FORMAT)
-COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
-/* cec */
-COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS)
-COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS)
-COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS)
-COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR)
-COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR)
-COMPATIBLE_IOCTL(CEC_G_MODE)
-COMPATIBLE_IOCTL(CEC_S_MODE)
-COMPATIBLE_IOCTL(CEC_TRANSMIT)
-COMPATIBLE_IOCTL(CEC_RECEIVE)
-COMPATIBLE_IOCTL(CEC_DQEVENT)
-
/* joystick */
COMPATIBLE_IOCTL(JSIOCGVERSION)
COMPATIBLE_IOCTL(JSIOCGAXES)
@@ -1080,12 +955,6 @@ static long do_ioctl_trans(unsigned int cmd,
case RTC_EPOCH_READ32:
case RTC_EPOCH_SET32:
return rtc_ioctl(file, cmd, argp);
-
- /* dvb */
- case VIDEO_GET_EVENT:
- return do_video_get_event(file, cmd, argp);
- case VIDEO_STILLPICTURE:
- return do_video_stillpicture(file, cmd, argp);
}
/*
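
These DVB translators existed because structures such as video_event embed a time_t and video_still_picture embeds a pointer, both of which change size and alignment between 32-bit and 64-bit ABIs; their removal here presumably pairs with the handling moving into the DVB code itself. A standalone demonstration of why a raw copy of the 32-bit image would misread the 64-bit layout (field types are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* What a 32-bit process lays out: 4-byte time and 4-byte pointer. */
struct event32 {
	int32_t type;
	int32_t timestamp;	/* compat_time_t */
	uint32_t iframe;	/* compat_uptr_t */
};

/* What a 64-bit kernel expects for the same logical data. */
struct event64 {
	int32_t type;
	time_t timestamp;	/* 8 bytes on 64-bit, padded before it */
	void *iframe;
};

int main(void)
{
	printf("32-bit layout: %zu bytes, 64-bit layout: %zu bytes\n",
	       sizeof(struct event32), sizeof(struct event64));
	/* Different sizes and offsets mean a raw copy of the 32-bit
	 * image would misread every field after 'type'. */
	return 0;
}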
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 0c35e62f108d..9352487bd0fc 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -202,7 +202,8 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
continue;
blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT;
blk_offset += offset;
- if (blk_offset + len > BUFFER_SIZE)
+ if (blk_offset > BUFFER_SIZE ||
+ blk_offset + len > BUFFER_SIZE)
continue;
return read_buffers[i] + blk_offset;
}
@@ -872,8 +873,8 @@ static int cramfs_readpage(struct file *file, struct page *page)
if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) {
/* See comments on earlier code. */
u32 prev_start = block_start;
- block_start = prev_start & ~CRAMFS_BLK_FLAGS;
- block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
+ block_start = prev_start & ~CRAMFS_BLK_FLAGS;
+ block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) {
block_start += PAGE_SIZE;
} else {
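
The added blk_offset > BUFFER_SIZE test matters because blk_offset + len is computed in a fixed-width unsigned type: with a huge blk_offset the sum wraps around and slips past the second comparison. The wraparound is easy to reproduce:

#include <limits.h>
#include <stdio.h>

#define BUFFER_SIZE 4096u

int main(void)
{
	unsigned int blk_offset = UINT_MAX - 8;	/* corrupt/hostile input */
	unsigned int len = 64;

	/* Naive check: blk_offset + len wraps to 55, which looks in bounds. */
	if (!(blk_offset + len > BUFFER_SIZE))
		puts("naive check wrongly accepts the range");

	/* Hardened check from the patch: reject the wrap explicitly. */
	if (blk_offset > BUFFER_SIZE || blk_offset + len > BUFFER_SIZE)
		puts("hardened check rejects it");
	return 0;
}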
diff --git a/fs/dax.c b/fs/dax.c
index 0fb270f0a0ef..616e36ea6aaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -38,6 +38,17 @@
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
+static inline unsigned int pe_order(enum page_entry_size pe_size)
+{
+ if (pe_size == PE_SIZE_PTE)
+ return PAGE_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PMD)
+ return PMD_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PUD)
+ return PUD_SHIFT - PAGE_SHIFT;
+ return ~0;
+}
+
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
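
pe_order() maps a fault granularity onto the base-2 log of the number of pages it covers. With typical x86_64 constants (PAGE_SHIFT 12, PMD_SHIFT 21, PUD_SHIFT 30; these are arch-dependent assumptions) the orders come out as 0, 9 and 18:

#include <stdio.h>

enum page_entry_size { PE_SIZE_PTE, PE_SIZE_PMD, PE_SIZE_PUD };

#define PAGE_SHIFT 12	/* assumed x86_64 values; arch-dependent */
#define PMD_SHIFT  21
#define PUD_SHIFT  30

static unsigned int pe_order(enum page_entry_size pe_size)
{
	if (pe_size == PE_SIZE_PTE)
		return PAGE_SHIFT - PAGE_SHIFT;	/* order 0: one page */
	if (pe_size == PE_SIZE_PMD)
		return PMD_SHIFT - PAGE_SHIFT;	/* order 9: 512 pages, 2 MiB */
	if (pe_size == PE_SIZE_PUD)
		return PUD_SHIFT - PAGE_SHIFT;	/* order 18: 1 GiB */
	return ~0u;
}

int main(void)
{
	for (int s = PE_SIZE_PTE; s <= PE_SIZE_PUD; s++)
		printf("order %u -> %lu bytes\n", pe_order(s),
		       (1ul << PAGE_SHIFT) << pe_order(s));
	return 0;
}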
@@ -46,6 +57,9 @@
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
+/* The order of a PMD entry */
+#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
+
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
@@ -59,63 +73,74 @@ static int __init init_dax_wait_table(void)
fs_initcall(init_dax_wait_table);
/*
- * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
- * an empty entry that is just used for locking. In total four special bits.
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
+ * for pages. We use one bit for locking, one bit for the entry size (PMD)
+ * and two more to tell us if the entry is a zero page or an empty entry that
+ * is just used for locking. In total four special bits.
*
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
* and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
* block allocation.
*/
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
-#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+#define DAX_SHIFT (4)
+#define DAX_LOCKED (1UL << 0)
+#define DAX_PMD (1UL << 1)
+#define DAX_ZERO_PAGE (1UL << 2)
+#define DAX_EMPTY (1UL << 3)
-static unsigned long dax_radix_pfn(void *entry)
+static unsigned long dax_to_pfn(void *entry)
{
- return (unsigned long)entry >> RADIX_DAX_SHIFT;
+ return xa_to_value(entry) >> DAX_SHIFT;
}
-static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
+static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
- return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
- (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
+ return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}
-static unsigned int dax_radix_order(void *entry)
+static void *dax_make_page_entry(struct page *page)
{
- if ((unsigned long)entry & RADIX_DAX_PMD)
- return PMD_SHIFT - PAGE_SHIFT;
+ pfn_t pfn = page_to_pfn_t(page);
+ return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0);
+}
+
+static bool dax_is_locked(void *entry)
+{
+ return xa_to_value(entry) & DAX_LOCKED;
+}
+
+static unsigned int dax_entry_order(void *entry)
+{
+ if (xa_to_value(entry) & DAX_PMD)
+ return PMD_ORDER;
return 0;
}
static int dax_is_pmd_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_PMD;
+ return xa_to_value(entry) & DAX_PMD;
}
static int dax_is_pte_entry(void *entry)
{
- return !((unsigned long)entry & RADIX_DAX_PMD);
+ return !(xa_to_value(entry) & DAX_PMD);
}
static int dax_is_zero_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
+ return xa_to_value(entry) & DAX_ZERO_PAGE;
}
static int dax_is_empty_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_EMPTY;
+ return xa_to_value(entry) & DAX_EMPTY;
}
/*
- * DAX radix tree locking
+ * DAX page cache entry locking
*/
struct exceptional_entry_key {
- struct address_space *mapping;
+ struct xarray *xa;
pgoff_t entry_start;
};
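
The new format packs the pfn above four flag bits inside a single XArray value entry. Setting aside the XArray's own value tagging (xa_mk_value() shifts the payload internally), the encode/decode round trip can be modelled with a plain unsigned long:

#include <assert.h>
#include <stdio.h>

#define DAX_SHIFT	4
#define DAX_LOCKED	(1ul << 0)
#define DAX_PMD		(1ul << 1)
#define DAX_ZERO_PAGE	(1ul << 2)
#define DAX_EMPTY	(1ul << 3)

static unsigned long make_entry(unsigned long pfn, unsigned long flags)
{
	return flags | (pfn << DAX_SHIFT);
}

static unsigned long entry_to_pfn(unsigned long entry)
{
	return entry >> DAX_SHIFT;
}

int main(void)
{
	unsigned long e = make_entry(0x12345, DAX_PMD | DAX_LOCKED);

	assert(entry_to_pfn(e) == 0x12345);
	assert(e & DAX_PMD);		/* PMD-sized entry */
	assert(e & DAX_LOCKED);		/* the lock travels in the low bit */
	e &= ~DAX_LOCKED;		/* what unlocking clears */
	printf("pfn %#lx, flags %#lx\n", entry_to_pfn(e), e & 0xful);
	return 0;
}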
@@ -124,10 +149,11 @@ struct wait_exceptional_entry_queue {
struct exceptional_entry_key key;
};
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
- pgoff_t index, void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
+ void *entry, struct exceptional_entry_key *key)
{
unsigned long hash;
+ unsigned long index = xas->xa_index;
/*
* If 'entry' is a PMD, align the 'index' that we use for the wait
@@ -136,22 +162,21 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
*/
if (dax_is_pmd_entry(entry))
index &= ~PG_PMD_COLOUR;
-
- key->mapping = mapping;
+ key->xa = xas->xa;
key->entry_start = index;
- hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
+ hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
return wait_table + hash;
}
-static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
- int sync, void *keyp)
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
+ unsigned int mode, int sync, void *keyp)
{
struct exceptional_entry_key *key = keyp;
struct wait_exceptional_entry_queue *ewait =
container_of(wait, struct wait_exceptional_entry_queue, wait);
- if (key->mapping != ewait->key.mapping ||
+ if (key->xa != ewait->key.xa ||
key->entry_start != ewait->key.entry_start)
return 0;
return autoremove_wake_function(wait, mode, sync, NULL);
@@ -162,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
* The important information it's conveying is whether the entry at
* this index used to be a PMD entry.
*/
-static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, void *entry, bool wake_all)
+static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
{
struct exceptional_entry_key key;
wait_queue_head_t *wq;
- wq = dax_entry_waitqueue(mapping, index, entry, &key);
+ wq = dax_entry_waitqueue(xas, entry, &key);
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -181,55 +205,16 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
}
/*
- * Check whether the given slot is locked. Must be called with the i_pages
- * lock held.
- */
-static inline int slot_locked(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
- return entry & RADIX_DAX_ENTRY_LOCK;
-}
-
-/*
- * Mark the given slot as locked. Must be called with the i_pages lock held.
- */
-static inline void *lock_slot(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
-}
-
-/*
- * Mark the given slot as unlocked. Must be called with the i_pages lock held.
- */
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
-}
-
-/*
- * Lookup entry in radix tree, wait for it to become unlocked if it is
- * exceptional entry and return it. The caller must call
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
- * put_locked_mapping_entry() when he locked the entry and now wants to
- * unlock it.
+ * Look up entry in page cache, wait for it to become unlocked if it
+ * is a DAX entry and return it. The caller must subsequently call
+ * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
+ * if it did.
*
* Must be called with the i_pages lock held.
*/
-static void *__get_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void ***slotp, bool (*wait_fn)(void))
+static void *get_unlocked_entry(struct xa_state *xas)
{
- void *entry, **slot;
+ void *entry;
struct wait_exceptional_entry_queue ewait;
wait_queue_head_t *wq;
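
Keying the wait table on the xarray pointer plus index, rather than the address_space, preserves the hashing scheme across the conversion: a fixed table of wait queues indexed by a multiplicative hash. A sketch assuming 64-bit longs (the constant is the kernel's 64-bit golden ratio):

#include <stdio.h>

#define WAIT_TABLE_BITS 12
#define WAIT_TABLE_ENTRIES (1u << WAIT_TABLE_BITS)

/* Multiplicative hash in the style of the kernel's hash_long(). */
static unsigned long hash_long(unsigned long val, unsigned int bits)
{
	return (val * 0x61C8864680B583EBul) >> (64 - bits);
}

static struct waitq { int dummy; } wait_table[WAIT_TABLE_ENTRIES];

static struct waitq *entry_waitqueue(void *xa, unsigned long index)
{
	/* XOR the array pointer with the index before hashing, so two
	 * files with the same index usually land on different queues. */
	unsigned long hash = hash_long((unsigned long)xa ^ index,
				       WAIT_TABLE_BITS);
	return wait_table + hash;
}

int main(void)
{
	int xa1, xa2;	/* stand-ins for two different xarrays */

	printf("slot a=%td slot b=%td\n",
	       entry_waitqueue(&xa1, 7) - wait_table,
	       entry_waitqueue(&xa2, 7) - wait_table);
	return 0;
}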
@@ -237,80 +222,54 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
- bool revalidate;
-
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
- &slot);
- if (!entry ||
- WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
- !slot_locked(mapping, slot)) {
- if (slotp)
- *slotp = slot;
+ entry = xas_load(xas);
+ if (!entry || xa_is_internal(entry) ||
+ WARN_ON_ONCE(!xa_is_value(entry)) ||
+ !dax_is_locked(entry))
return entry;
- }
- wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
- xa_unlock_irq(&mapping->i_pages);
- revalidate = wait_fn();
+ xas_unlock_irq(xas);
+ xas_reset(xas);
+ schedule();
finish_wait(wq, &ewait.wait);
- xa_lock_irq(&mapping->i_pages);
- if (revalidate)
- return ERR_PTR(-EAGAIN);
+ xas_lock_irq(xas);
}
}
-static bool entry_wait(void)
-{
- schedule();
- /*
- * Never return an ERR_PTR() from
- * __get_unlocked_mapping_entry(), just keep looping.
- */
- return false;
-}
-
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void ***slotp)
+static void put_unlocked_entry(struct xa_state *xas, void *entry)
{
- return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
-}
-
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- void *entry, **slot;
-
- xa_lock_irq(&mapping->i_pages);
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
- if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
- !slot_locked(mapping, slot))) {
- xa_unlock_irq(&mapping->i_pages);
- return;
- }
- unlock_slot(mapping, slot);
- xa_unlock_irq(&mapping->i_pages);
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+ /* If we were the only waiter woken, wake the next one */
+ if (entry)
+ dax_wake_entry(xas, entry, false);
}
-static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index)
+/*
+ * We used the xa_state to get the entry, but then we locked the entry and
+ * dropped the xa_lock, so we know the xa_state is stale and must be reset
+ * before use.
+ */
+static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
- unlock_mapping_entry(mapping, index);
+ void *old;
+
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ old = xas_store(xas, entry);
+ xas_unlock_irq(xas);
+ BUG_ON(!dax_is_locked(old));
+ dax_wake_entry(xas, entry, false);
}
/*
- * Called when we are done with radix tree entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ * Return: The entry stored at this location before it was locked.
*/
-static void put_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
- if (!entry)
- return;
-
- /* We have to wake up next waiter for the radix tree entry lock */
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+ unsigned long v = xa_to_value(entry);
+ return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}
static unsigned long dax_entry_size(void *entry)
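
dax_lock_entry() and dax_unlock_entry() implement a one-bit sleeping lock carried inside the stored value itself. A heavily reduced model using C11 atomics, spinning where the real code sleeps on the hashed wait queue (conceptual only; the kernel serializes these stores under xa_lock rather than with a CAS):

#include <stdatomic.h>
#include <stdio.h>

#define LOCKED 1ul

static _Atomic unsigned long slot;	/* one page-cache slot */

static unsigned long lock_entry(void)
{
	unsigned long old = atomic_load(&slot);

	for (;;) {
		while (old & LOCKED)	/* real code sleeps on a waitqueue */
			old = atomic_load(&slot);
		if (atomic_compare_exchange_weak(&slot, &old, old | LOCKED))
			return old;	/* the value as it was before locking */
	}
}

static void unlock_entry(unsigned long entry)
{
	atomic_store(&slot, entry & ~LOCKED);	/* then wake the next waiter */
}

int main(void)
{
	atomic_store(&slot, 0x12345ul << 4);	/* an unlocked entry */
	unsigned long e = lock_entry();

	printf("locked entry, pfn bits %#lx\n", e >> 4);
	unlock_entry(e);
	return 0;
}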
@@ -325,9 +284,9 @@ static unsigned long dax_entry_size(void *entry)
return PAGE_SIZE;
}
-static unsigned long dax_radix_end_pfn(void *entry)
+static unsigned long dax_end_pfn(void *entry)
{
- return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+ return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}
/*
@@ -335,8 +294,8 @@ static unsigned long dax_radix_end_pfn(void *entry)
* 'empty' and 'zero' entries.
*/
#define for_each_mapped_pfn(entry, pfn) \
- for (pfn = dax_radix_pfn(entry); \
- pfn < dax_radix_end_pfn(entry); pfn++)
+ for (pfn = dax_to_pfn(entry); \
+ pfn < dax_end_pfn(entry); pfn++)
/*
* TODO: for reflink+dax we need a way to associate a single page with
@@ -393,33 +352,16 @@ static struct page *dax_busy_page(void *entry)
return NULL;
}
-static bool entry_wait_revalidate(void)
-{
- rcu_read_unlock();
- schedule();
- rcu_read_lock();
-
- /*
- * Tell __get_unlocked_mapping_entry() to take a break, we need
- * to revalidate page->mapping after dropping locks
- */
- return true;
-}
-
bool dax_lock_mapping_entry(struct page *page)
{
- pgoff_t index;
- struct inode *inode;
- bool did_lock = false;
- void *entry = NULL, **slot;
- struct address_space *mapping;
+ XA_STATE(xas, NULL, 0);
+ void *entry;
- rcu_read_lock();
for (;;) {
- mapping = READ_ONCE(page->mapping);
+ struct address_space *mapping = READ_ONCE(page->mapping);
if (!dax_mapping(mapping))
- break;
+ return false;
/*
* In the device-dax case there's no need to lock, a
@@ -428,98 +370,94 @@ bool dax_lock_mapping_entry(struct page *page)
* otherwise we would not have a valid pfn_to_page()
* translation.
*/
- inode = mapping->host;
- if (S_ISCHR(inode->i_mode)) {
- did_lock = true;
- break;
- }
+ if (S_ISCHR(mapping->host->i_mode))
+ return true;
- xa_lock_irq(&mapping->i_pages);
+ xas.xa = &mapping->i_pages;
+ xas_lock_irq(&xas);
if (mapping != page->mapping) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
continue;
}
- index = page->index;
-
- entry = __get_unlocked_mapping_entry(mapping, index, &slot,
- entry_wait_revalidate);
- if (!entry) {
- xa_unlock_irq(&mapping->i_pages);
- break;
- } else if (IS_ERR(entry)) {
- xa_unlock_irq(&mapping->i_pages);
- WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
- continue;
+ xas_set(&xas, page->index);
+ entry = xas_load(&xas);
+ if (dax_is_locked(entry)) {
+ entry = get_unlocked_entry(&xas);
+ /* Did the page move while we slept? */
+ if (dax_to_pfn(entry) != page_to_pfn(page)) {
+ xas_unlock_irq(&xas);
+ continue;
+ }
}
- lock_slot(mapping, slot);
- did_lock = true;
- xa_unlock_irq(&mapping->i_pages);
- break;
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
+ return true;
}
- rcu_read_unlock();
-
- return did_lock;
}
void dax_unlock_mapping_entry(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
+ XA_STATE(xas, &mapping->i_pages, page->index);
- if (S_ISCHR(inode->i_mode))
+ if (S_ISCHR(mapping->host->i_mode))
return;
- unlock_mapping_entry(mapping, page->index);
+ dax_unlock_entry(&xas, dax_make_page_entry(page));
}
/*
- * Find radix tree entry at given index. If it points to an exceptional entry,
- * return it with the radix tree entry locked. If the radix tree doesn't
- * contain given index, create an empty exceptional entry for the index and
- * return with it locked.
+ * Find page cache entry at given index. If it is a DAX entry, return it
+ * with the entry locked. If the page cache doesn't contain an entry at
+ * that index, add a locked empty entry.
*
- * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries within the 2MiB range that we are
- * requesting.
+ * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
*
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
- * insertion will fail if it finds any 4k entries already in the tree, and a
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
- * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
- * well as 2MiB empty entries.
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries. This happens for both PMD zero pages as
+ * well as PMD empty entries.
*
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
- * real storage backing them. We will leave these real 2MiB DAX entries in
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them. We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
*
* Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
* persistent memory the benefit is doubtful. We can add that later if we can
* show it helps.
+ *
+ * On error, this function does not return an ERR_PTR. Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
+ * overlap with xarray value entries.
*/
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
- unsigned long size_flag)
+static void *grab_mapping_entry(struct xa_state *xas,
+ struct address_space *mapping, unsigned long size_flag)
{
- bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
- void *entry, **slot;
-
-restart:
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ unsigned long index = xas->xa_index;
+ bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+ void *entry;
- if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
- entry = ERR_PTR(-EIO);
- goto out_unlock;
- }
+retry:
+ xas_lock_irq(xas);
+ entry = get_unlocked_entry(xas);
+ if (xa_is_internal(entry))
+ goto fallback;
if (entry) {
- if (size_flag & RADIX_DAX_PMD) {
+ if (WARN_ON_ONCE(!xa_is_value(entry))) {
+ xas_set_err(xas, EIO);
+ goto out_unlock;
+ }
+
+ if (size_flag & DAX_PMD) {
if (dax_is_pte_entry(entry)) {
- put_unlocked_mapping_entry(mapping, index,
- entry);
- entry = ERR_PTR(-EEXIST);
- goto out_unlock;
+ put_unlocked_entry(xas, entry);
+ goto fallback;
}
} else { /* trying to grab a PTE entry */
if (dax_is_pmd_entry(entry) &&
@@ -530,87 +468,57 @@ restart:
}
}
- /* No entry for given index? Make sure radix tree is big enough. */
- if (!entry || pmd_downgrade) {
- int err;
-
- if (pmd_downgrade) {
- /*
- * Make sure 'entry' remains valid while we drop
- * the i_pages lock.
- */
- entry = lock_slot(mapping, slot);
- }
+ if (pmd_downgrade) {
+ /*
+ * Make sure 'entry' remains valid while we drop
+ * the i_pages lock.
+ */
+ dax_lock_entry(xas, entry);
- xa_unlock_irq(&mapping->i_pages);
/*
* Besides huge zero pages the only other thing that gets
* downgraded are empty entries which don't need to be
* unmapped.
*/
- if (pmd_downgrade && dax_is_zero_entry(entry))
- unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
-
- err = radix_tree_preload(
- mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
- if (err) {
- if (pmd_downgrade)
- put_locked_mapping_entry(mapping, index);
- return ERR_PTR(err);
- }
- xa_lock_irq(&mapping->i_pages);
-
- if (!entry) {
- /*
- * We needed to drop the i_pages lock while calling
- * radix_tree_preload() and we didn't have an entry to
- * lock. See if another thread inserted an entry at
- * our index during this time.
- */
- entry = __radix_tree_lookup(&mapping->i_pages, index,
- NULL, &slot);
- if (entry) {
- radix_tree_preload_end();
- xa_unlock_irq(&mapping->i_pages);
- goto restart;
- }
+ if (dax_is_zero_entry(entry)) {
+ xas_unlock_irq(xas);
+ unmap_mapping_pages(mapping,
+ xas->xa_index & ~PG_PMD_COLOUR,
+ PG_PMD_NR, false);
+ xas_reset(xas);
+ xas_lock_irq(xas);
}
- if (pmd_downgrade) {
- dax_disassociate_entry(entry, mapping, false);
- radix_tree_delete(&mapping->i_pages, index);
- mapping->nrexceptional--;
- dax_wake_mapping_entry_waiter(mapping, index, entry,
- true);
- }
+ dax_disassociate_entry(entry, mapping, false);
+ xas_store(xas, NULL); /* undo the PMD join */
+ dax_wake_entry(xas, entry, true);
+ mapping->nrexceptional--;
+ entry = NULL;
+ xas_set(xas, index);
+ }
- entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
-
- err = __radix_tree_insert(&mapping->i_pages, index,
- dax_radix_order(entry), entry);
- radix_tree_preload_end();
- if (err) {
- xa_unlock_irq(&mapping->i_pages);
- /*
- * Our insertion of a DAX entry failed, most likely
- * because we were inserting a PMD entry and it
- * collided with a PTE sized entry at a different
- * index in the PMD range. We haven't inserted
- * anything into the radix tree and have no waiters to
- * wake.
- */
- return ERR_PTR(err);
- }
- /* Good, we have inserted empty locked entry into the tree. */
+ if (entry) {
+ dax_lock_entry(xas, entry);
+ } else {
+ entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+ dax_lock_entry(xas, entry);
+ if (xas_error(xas))
+ goto out_unlock;
mapping->nrexceptional++;
- xa_unlock_irq(&mapping->i_pages);
- return entry;
}
- entry = lock_slot(mapping, slot);
- out_unlock:
- xa_unlock_irq(&mapping->i_pages);
+
+out_unlock:
+ xas_unlock_irq(xas);
+ if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+ goto retry;
+ if (xas->xa_node == XA_ERROR(-ENOMEM))
+ return xa_mk_internal(VM_FAULT_OOM);
+ if (xas_error(xas))
+ return xa_mk_internal(VM_FAULT_SIGBUS);
return entry;
+fallback:
+ xas_unlock_irq(xas);
+ return xa_mk_internal(VM_FAULT_FALLBACK);
}
/**
@@ -630,11 +538,10 @@ restart:
*/
struct page *dax_layout_busy_page(struct address_space *mapping)
{
- pgoff_t indices[PAGEVEC_SIZE];
+ XA_STATE(xas, &mapping->i_pages, 0);
+ void *entry;
+ unsigned int scanned = 0;
struct page *page = NULL;
- struct pagevec pvec;
- pgoff_t index, end;
- unsigned i;
/*
* In the 'limited' case get_user_pages() for dax is disabled.
@@ -645,13 +552,9 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
- pagevec_init(&pvec);
- index = 0;
- end = -1;
-
/*
* If we race get_user_pages_fast() here either we'll see the
- * elevated page count in the pagevec_lookup and wait, or
+ * elevated page count in the iteration and wait, or
* get_user_pages_fast() will see that the page it took a reference
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
@@ -663,94 +566,68 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
*/
unmap_mapping_range(mapping, 0, 0, 1);
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- indices)) {
- pgoff_t nr_pages = 1;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *pvec_ent = pvec.pages[i];
- void *entry;
-
- index = indices[i];
- if (index >= end)
- break;
-
- if (WARN_ON_ONCE(
- !radix_tree_exceptional_entry(pvec_ent)))
- continue;
-
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (entry) {
- page = dax_busy_page(entry);
- /*
- * Account for multi-order entries at
- * the end of the pagevec.
- */
- if (i + 1 >= pagevec_count(&pvec))
- nr_pages = 1UL << dax_radix_order(entry);
- }
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(&mapping->i_pages);
- if (page)
- break;
- }
-
- /*
- * We don't expect normal struct page entries to exist in our
- * tree, but we keep these pagevec calls so that this code is
- * consistent with the common pattern for handling pagevecs
- * throughout the kernel.
- */
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- index += nr_pages;
-
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ if (WARN_ON_ONCE(!xa_is_value(entry)))
+ continue;
+ if (unlikely(dax_is_locked(entry)))
+ entry = get_unlocked_entry(&xas);
+ if (entry)
+ page = dax_busy_page(entry);
+ put_unlocked_entry(&xas, entry);
if (page)
break;
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
+ xas_unlock_irq(&xas);
return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
-static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+static int __dax_invalidate_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
+ XA_STATE(xas, &mapping->i_pages, index);
int ret = 0;
void *entry;
- struct radix_tree_root *pages = &mapping->i_pages;
- xa_lock_irq(pages);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas);
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
goto out;
if (!trunc &&
- (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
+ (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
+ xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
goto out;
dax_disassociate_entry(entry, mapping, trunc);
- radix_tree_delete(pages, index);
+ xas_store(&xas, NULL);
mapping->nrexceptional--;
ret = 1;
out:
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(pages);
+ put_unlocked_entry(&xas, entry);
+ xas_unlock_irq(&xas);
return ret;
}
+
/*
- * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
- * entry to get unlocked before deleting it.
+ * Delete DAX entry at @index from @mapping. Wait for it
+ * to be unlocked before deleting it.
*/
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
- int ret = __dax_invalidate_mapping_entry(mapping, index, true);
+ int ret = __dax_invalidate_entry(mapping, index, true);
/*
* This gets called from truncate / punch_hole path. As such, the caller
* must hold locks protecting against concurrent modifications of the
- * radix tree (usually fs-private i_mmap_sem for writing). Since the
- * caller has seen exceptional entry for this index, we better find it
+ * page cache (usually fs-private i_mmap_sem for writing). Since the
+ * caller has seen a DAX entry for this index, we better find it
* at that index as well...
*/
WARN_ON_ONCE(!ret);
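
Both new iterators, dax_layout_busy_page() above and the writeback loop below, drop the lock and reschedule every XA_CHECK_SCHED entries so a huge mapping cannot monopolize the CPU under xa_lock, with xas_pause() letting the walk resume safely afterwards. The shape of that pattern, with a mutex standing in for the irq-disabled spinlock:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define CHECK_SCHED 4096	/* stand-in for XA_CHECK_SCHED */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void scan(unsigned long nr_entries)
{
	unsigned int scanned = 0;

	pthread_mutex_lock(&lock);
	for (unsigned long i = 0; i < nr_entries; i++) {
		/* ... inspect entry i under the lock ... */
		if (++scanned % CHECK_SCHED)
			continue;
		/* Periodically let others in: unlock, yield, relock.
		 * The kernel also pauses the xa_state here so the walk
		 * survives concurrent modification of the array. */
		pthread_mutex_unlock(&lock);
		sched_yield();
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	scan(100000);
	puts("scan done");
	return 0;
}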
@@ -758,12 +635,12 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
}
/*
- * Invalidate exceptional DAX entry if it is clean.
+ * Invalidate DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index)
{
- return __dax_invalidate_mapping_entry(mapping, index, false);
+ return __dax_invalidate_entry(mapping, index, false);
}
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
@@ -799,30 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
-static void *dax_insert_mapping_entry(struct address_space *mapping,
- struct vm_fault *vmf,
- void *entry, pfn_t pfn_t,
- unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas,
+ struct address_space *mapping, struct vm_fault *vmf,
+ void *entry, pfn_t pfn, unsigned long flags, bool dirty)
{
- struct radix_tree_root *pages = &mapping->i_pages;
- unsigned long pfn = pfn_t_to_pfn(pfn_t);
- pgoff_t index = vmf->pgoff;
- void *new_entry;
+ void *new_entry = dax_make_entry(pfn, flags);
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+ if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
+ PG_PMD_NR, false);
else /* pte entry */
- unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
+ unmap_mapping_pages(mapping, index, 1, false);
}
- xa_lock_irq(pages);
- new_entry = dax_radix_locked_entry(pfn, flags);
+ xas_reset(xas);
+ xas_lock_irq(xas);
if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
@@ -830,33 +704,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
- * Only swap our new entry into the radix tree if the current
+ * Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
- * PMD entry is already in the tree, we leave it alone. This
+ * PMD entry is already in the cache, we leave it alone. This
* means that if we are trying to insert a PTE and the
* existing entry is a PMD, we will just leave the PMD in the
* tree and dirty it if necessary.
*/
- struct radix_tree_node *node;
- void **slot;
- void *ret;
-
- ret = __radix_tree_lookup(pages, index, &node, &slot);
- WARN_ON_ONCE(ret != entry);
- __radix_tree_replace(pages, node, slot,
- new_entry, NULL);
+ void *old = dax_lock_entry(xas, new_entry);
+ WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
+ DAX_LOCKED));
entry = new_entry;
+ } else {
+ xas_load(xas); /* Walk the xa_state */
}
if (dirty)
- radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
+ xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(pages);
+ xas_unlock_irq(xas);
return entry;
}
-static inline unsigned long
-pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+static inline
+unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
unsigned long address;
@@ -866,8 +737,8 @@ pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
}
/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_mapping_entry_mkclean(struct address_space *mapping,
- pgoff_t index, unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
+ unsigned long pfn)
{
struct vm_area_struct *vma;
pte_t pte, *ptep = NULL;
@@ -937,11 +808,9 @@ unlock_pte:
i_mmap_unlock_read(mapping);
}
-static int dax_writeback_one(struct dax_device *dax_dev,
- struct address_space *mapping, pgoff_t index, void *entry)
+static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
+ struct address_space *mapping, void *entry)
{
- struct radix_tree_root *pages = &mapping->i_pages;
- void *entry2, **slot;
unsigned long pfn;
long ret = 0;
size_t size;
@@ -950,32 +819,38 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* A page got tagged dirty in DAX mapping? Something is seriously
* wrong.
*/
- if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+ if (WARN_ON(!xa_is_value(entry)))
return -EIO;
- xa_lock_irq(pages);
- entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
- /* Entry got punched out / reallocated? */
- if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
- goto put_unlocked;
- /*
- * Entry got reallocated elsewhere? No need to writeback. We have to
- * compare pfns as we must not bail out due to difference in lockbit
- * or entry type.
- */
- if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
- goto put_unlocked;
- if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
- dax_is_zero_entry(entry))) {
- ret = -EIO;
- goto put_unlocked;
+ if (unlikely(dax_is_locked(entry))) {
+ void *old_entry = entry;
+
+ entry = get_unlocked_entry(xas);
+
+ /* Entry got punched out / reallocated? */
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+ goto put_unlocked;
+ /*
+ * Entry got reallocated elsewhere? No need to writeback.
+ * We have to compare pfns as we must not bail out due to
+ * difference in lockbit or entry type.
+ */
+ if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
+ goto put_unlocked;
+ if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+ dax_is_zero_entry(entry))) {
+ ret = -EIO;
+ goto put_unlocked;
+ }
+
+ /* Another fsync thread may have already done this entry */
+ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
+ goto put_unlocked;
}
- /* Another fsync thread may have already written back this entry */
- if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
- goto put_unlocked;
/* Lock the entry to serialize with page faults */
- entry = lock_slot(mapping, slot);
+ dax_lock_entry(xas, entry);
+
/*
* We can clear the tag now but we have to be careful so that concurrent
* dax_writeback_one() calls for the same index cannot finish before we
@@ -983,8 +858,8 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* at the entry only under the i_pages lock and once they do that
* they will see the entry locked and wait for it to unlock.
*/
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
- xa_unlock_irq(pages);
+ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irq(xas);
/*
* Even if dax_writeback_mapping_range() was given a wbc->range_start
@@ -993,10 +868,10 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* This allows us to flush for PMD_SIZE and not have to worry about
* partial PMD writebacks.
*/
- pfn = dax_radix_pfn(entry);
- size = PAGE_SIZE << dax_radix_order(entry);
+ pfn = dax_to_pfn(entry);
+ size = PAGE_SIZE << dax_entry_order(entry);
- dax_mapping_entry_mkclean(mapping, index, pfn);
+ dax_entry_mkclean(mapping, xas->xa_index, pfn);
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
@@ -1004,16 +879,18 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* the pfn mappings are writeprotected and fault waits for mapping
* entry lock.
*/
- xa_lock_irq(pages);
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(pages);
- trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
- put_locked_mapping_entry(mapping, index);
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ xas_store(xas, entry);
+ xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
+ dax_wake_entry(xas, entry, false);
+
+ trace_dax_writeback_one(mapping->host, xas->xa_index,
+ size >> PAGE_SHIFT);
return ret;
put_unlocked:
- put_unlocked_mapping_entry(mapping, index, entry2);
- xa_unlock_irq(pages);
+ put_unlocked_entry(xas, entry);
return ret;
}
@@ -1025,13 +902,13 @@ static int dax_writeback_one(struct dax_device *dax_dev,
int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc)
{
+ XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
struct inode *inode = mapping->host;
- pgoff_t start_index, end_index;
- pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
struct dax_device *dax_dev;
- struct pagevec pvec;
- bool done = false;
- int i, ret = 0;
+ void *entry;
+ int ret = 0;
+ unsigned int scanned = 0;
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
@@ -1043,41 +920,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!dax_dev)
return -EIO;
- start_index = wbc->range_start >> PAGE_SHIFT;
- end_index = wbc->range_end >> PAGE_SHIFT;
-
- trace_dax_writeback_range(inode, start_index, end_index);
+ trace_dax_writeback_range(inode, xas.xa_index, end_index);
- tag_pages_for_writeback(mapping, start_index, end_index);
+ tag_pages_for_writeback(mapping, xas.xa_index, end_index);
- pagevec_init(&pvec);
- while (!done) {
- pvec.nr = find_get_entries_tag(mapping, start_index,
- PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
- pvec.pages, indices);
-
- if (pvec.nr == 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
+ ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
+ if (ret < 0) {
+ mapping_set_error(mapping, ret);
break;
-
- for (i = 0; i < pvec.nr; i++) {
- if (indices[i] > end_index) {
- done = true;
- break;
- }
-
- ret = dax_writeback_one(dax_dev, mapping, indices[i],
- pvec.pages[i]);
- if (ret < 0) {
- mapping_set_error(mapping, ret);
- goto out;
- }
}
- start_index = indices[pvec.nr - 1] + 1;
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
-out:
+ xas_unlock_irq(&xas);
put_dax(dax_dev);
- trace_dax_writeback_range_done(inode, start_index, end_index);
- return (ret < 0 ? ret : 0);
+ trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
+ return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -1125,16 +990,18 @@ out:
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
- struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas,
+ struct address_space *mapping, void **entry,
+ struct vm_fault *vmf)
{
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
- false);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_ZERO_PAGE, false);
+
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
return ret;
@@ -1342,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
+ XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1368,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (write && !vmf->cow_page)
flags |= IOMAP_WRITE;
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
- if (IS_ERR(entry)) {
- ret = dax_fault_return(PTR_ERR(entry));
+ entry = grab_mapping_entry(&xas, mapping, 0);
+ if (xa_is_internal(entry)) {
+ ret = xa_to_internal(entry);
goto out;
}
@@ -1443,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto error_finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
0, write && !sync);
/*
@@ -1471,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!write) {
- ret = dax_load_hole(mapping, entry, vmf);
+ ret = dax_load_hole(&xas, mapping, &entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
@@ -1498,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff);
+ dax_unlock_entry(&xas, entry);
out:
trace_dax_pte_fault_done(inode, vmf, ret);
return ret | major;
}
#ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
- void *entry)
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ struct iomap *iomap, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
struct inode *inode = mapping->host;
struct page *zero_page;
- void *ret = NULL;
spinlock_t *ptl;
pmd_t pmd_entry;
pfn_t pfn;
@@ -1523,8 +1390,8 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
goto fallback;
pfn = page_to_pfn_t(zero_page);
- ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE, false);
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1536,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
return VM_FAULT_NOPAGE;
fallback:
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
return VM_FAULT_FALLBACK;
}
@@ -1549,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
@@ -1556,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct inode *inode = mapping->host;
vm_fault_t result = VM_FAULT_FALLBACK;
struct iomap iomap = { 0 };
- pgoff_t max_pgoff, pgoff;
+ pgoff_t max_pgoff;
void *entry;
loff_t pos;
int error;
@@ -1567,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
- pgoff = linear_page_index(vma, pmd_addr);
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1576,7 +1443,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* Make sure that the faulting address's PMD offset (color) matches
* the PMD offset from the start of the file. This is necessary so
* that a PMD range in the page table overlaps exactly with a PMD
- * range in the radix tree.
+ * range in the page cache.
*/
if ((vmf->pgoff & PG_PMD_COLOUR) !=
((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
@@ -1592,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback;
- if (pgoff >= max_pgoff) {
+ if (xas.xa_index >= max_pgoff) {
result = VM_FAULT_SIGBUS;
goto out;
}
/* If the PMD would extend beyond the file size */
- if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
+ if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
goto fallback;
/*
- * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
- * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
- * is already in the tree, for instance), it will return -EEXIST and
- * we just fall back to 4k entries.
+ * grab_mapping_entry() will make sure we get an empty PMD entry,
+ * a zero PMD entry or a DAX PMD. If it can't (because a PTE
+ * entry is already in the array, for instance), it will return
+ * VM_FAULT_FALLBACK.
*/
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
- if (IS_ERR(entry))
+ entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+ if (xa_is_internal(entry)) {
+ result = xa_to_internal(entry);
goto fallback;
+ }
/*
* It is possible, particularly with mixed reads & writes to private
@@ -1628,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* setting up a mapping, so really we're using iomap_begin() as a way
* to look up our filesystem block.
*/
- pos = (loff_t)pgoff << PAGE_SHIFT;
+ pos = (loff_t)xas.xa_index << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
goto unlock_entry;
@@ -1644,8 +1513,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD, write && !sync);
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
+ DAX_PMD, write && !sync);
/*
* If we are doing synchronous page fault and inode needs fsync,
@@ -1669,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
break;
- result = dax_pmd_load_hole(vmf, &iomap, entry);
+ result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
break;
default:
WARN_ON_ONCE(1);
@@ -1692,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
&iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, pgoff);
+ dax_unlock_entry(&xas, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
@@ -1737,54 +1606,49 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
-/**
+/*
* dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
* @vmf: The description of the fault
- * @pe_size: Size of entry to be inserted
* @pfn: PFN to insert
+ * @order: Order of entry to insert.
*
- * This function inserts writeable PTE or PMD entry into page tables for mmaped
- * DAX file. It takes care of marking corresponding radix tree entry as dirty
- * as well.
+ * This function inserts a writeable PTE or PMD entry into the page tables
+ * for an mmaped DAX file. It also marks the page cache entry as dirty.
*/
-static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
- enum page_entry_size pe_size,
- pfn_t pfn)
+static vm_fault_t
+dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- void *entry, **slot;
- pgoff_t index = vmf->pgoff;
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+ void *entry;
vm_fault_t ret;
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas);
/* Did we race with someone splitting entry or so? */
if (!entry ||
- (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
- (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(&mapping->i_pages);
+ (order == 0 && !dax_is_pte_entry(entry)) ||
+ (order == PMD_ORDER && (xa_is_internal(entry) ||
+ !dax_is_pmd_entry(entry)))) {
+ put_unlocked_entry(&xas, entry);
+ xas_unlock_irq(&xas);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
- radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
- entry = lock_slot(mapping, slot);
- xa_unlock_irq(&mapping->i_pages);
- switch (pe_size) {
- case PE_SIZE_PTE:
+ xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
+ if (order == 0)
ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- break;
#ifdef CONFIG_FS_DAX_PMD
- case PE_SIZE_PMD:
+ else if (order == PMD_ORDER)
ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, true);
- break;
#endif
- default:
+ else
ret = VM_FAULT_FALLBACK;
- }
- put_locked_mapping_entry(mapping, index);
+ dax_unlock_entry(&xas, entry);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
return ret;
}
@@ -1804,17 +1668,12 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
- size_t len = 0;
+ unsigned int order = pe_order(pe_size);
+ size_t len = PAGE_SIZE << order;
- if (pe_size == PE_SIZE_PTE)
- len = PAGE_SIZE;
- else if (pe_size == PE_SIZE_PMD)
- len = PMD_SIZE;
- else
- WARN_ON_ONCE(1);
err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
if (err)
return VM_FAULT_SIGBUS;
- return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+ return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
diff --git a/fs/dcache.c b/fs/dcache.c
index c2e443fb76ae..2593153471cf 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -26,7 +26,7 @@
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 224c04abb2e5..cf4c77f8dd08 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -256,11 +256,15 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
if (default_acl) {
error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return error;
}
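
The ext2 change makes sure i_acl and i_default_acl end up NULL, meaning "no ACL", instead of remaining at the uncached sentinel when the new inode has none to inherit, sparing later lookups a pointless read. The sentinel-versus-NULL distinction in miniature:

#include <stdio.h>

#define ACL_NOT_CACHED ((void *)-1)	/* same trick as the kernel sentinel */

struct inode { void *i_acl; };

static const char *acl_state(const struct inode *inode)
{
	if (inode->i_acl == ACL_NOT_CACHED)
		return "unknown: must read from disk";
	if (!inode->i_acl)
		return "cached: no ACL";
	return "cached: ACL present";
}

int main(void)
{
	struct inode inode = { .i_acl = ACL_NOT_CACHED };

	printf("%s\n", acl_state(&inode));
	inode.i_acl = NULL;	/* what the patch now sets explicitly */
	printf("%s\n", acl_state(&inode));
	return 0;
}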
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 00e759f05161..e770cd100a6a 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -390,11 +390,7 @@ struct ext2_inode {
#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */
#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */
#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */
-#ifdef CONFIG_FS_DAX
#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */
-#else
-#define EXT2_MOUNT_DAX 0
-#endif
#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 73bd58fa13de..cb91baa4275d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -309,20 +309,17 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
-#if defined(CONFIG_QUOTA)
if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA)
seq_puts(seq, ",usrquota");
if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA)
seq_puts(seq, ",grpquota");
-#endif
-#ifdef CONFIG_FS_DAX
if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
seq_puts(seq, ",xip");
+
if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
seq_puts(seq, ",dax");
-#endif
if (!test_opt(sb, RESERVATION))
seq_puts(seq, ",noreservation");
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c3d9a42c561e..05f01fbd9c7f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2643,7 +2643,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
long left = mpd->wbc->nr_to_write;
pgoff_t index = mpd->first_page;
pgoff_t end = mpd->last_page;
- int tag;
+ xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 106f116466bf..b293cb3e27a2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2071,7 +2071,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
int nwritten = 0;
pagevec_init(&pvec);
@@ -2787,13 +2787,13 @@ const struct address_space_operations f2fs_dblock_aops = {
#endif
};
-void f2fs_clear_radix_tree_dirty_tag(struct page *page)
+void f2fs_clear_page_cache_dirty_tag(struct page *page)
{
struct address_space *mapping = page_mapping(page);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
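
__xa_clear_mark() is the caller-must-hold-the-lock variant, hence the explicit
xa_lock_irqsave()/xa_unlock_irqrestore() pair around it. When nothing else has
to change under the same lock, the self-locking form is equivalent; a minimal
sketch (clear_dirty_mark is a hypothetical name):

    #include <linux/fs.h>
    #include <linux/xarray.h>

    /* Equivalent using the self-locking API (assumes no other state
     * must be updated atomically with the mark). */
    static void clear_dirty_mark(struct address_space *mapping, pgoff_t index)
    {
        xa_clear_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
    }
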
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 2ef84b4590ea..bacc667950b6 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -726,7 +726,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
ClearPageUptodate(page);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 56204a8f8a12..1e031971a466 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3108,7 +3108,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode);
#endif
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
-void f2fs_clear_radix_tree_dirty_tag(struct page *page);
+void f2fs_clear_page_cache_dirty_tag(struct page *page);
/*
* gc.c
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index cb31a719b048..7b0cff7e6051 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -243,7 +243,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
kunmap_atomic(src_addr);
set_page_dirty(dn.inode_page);
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 2b34206486d8..d338740d0fda 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -101,7 +101,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
static void clear_node_page_dirty(struct page *page)
{
if (PageDirty(page)) {
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
}
@@ -1306,9 +1306,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
if (f2fs_check_nid_range(sbi, nid))
return;
- rcu_read_lock();
- apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
- rcu_read_unlock();
+ apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
if (apage)
return;
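
The rcu_read_lock()/rcu_read_unlock() pair can be dropped because xa_load()
enters its own RCU read-side critical section internally; and since apage is
only tested against NULL, never dereferenced, no reference on the page is
needed either. The same one-line pattern applies wherever a radix_tree_lookup()
result was used purely as a presence check:

    /* presence check only: the result is never dereferenced, so no
     * page reference and no caller-side locking are needed */
    if (xa_load(&mapping->i_pages, index))
        return;    /* already cached, nothing to do */
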
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 7f5f3699fc6c..c8366cb8eccd 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -369,7 +369,9 @@ static int fat_parse_short(struct super_block *sb,
}
memcpy(work, de->name, sizeof(work));
- /* see namei.c, msdos_format_name */
+ /* For an explanation of the special treatment of 0x05 in
+ * filenames, see msdos_format_name in namei_msdos.c
+ */
if (work[0] == 0x05)
work[0] = 0xE5;
@@ -1071,7 +1073,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
}
}
- dir->i_mtime = dir->i_atime = current_time(dir);
+ fat_truncate_time(dir, NULL, S_ATIME|S_MTIME);
if (IS_DIRSYNC(dir))
(void)fat_sync_inode(dir);
else
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 9d7d2d5da28b..4e1b2f6df5e6 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -416,6 +416,10 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 __time, __le16 __date, u8 time_cs);
extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 *time, __le16 *date, u8 *time_cs);
+extern int fat_truncate_time(struct inode *inode, struct timespec64 *now,
+ int flags);
+extern int fat_update_time(struct inode *inode, struct timespec64 *now,
+ int flags);
extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
int fat_cache_init(void);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 4f3d72fb1e60..13935ee99e1e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -227,7 +227,7 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
if (err)
goto out;
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
mark_inode_dirty(inode);
if (IS_SYNC(inode)) {
int err2;
@@ -330,7 +330,7 @@ static int fat_free(struct inode *inode, int skip)
MSDOS_I(inode)->i_logstart = 0;
}
MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
if (wait) {
err = fat_sync_inode(inode);
if (err) {
@@ -542,6 +542,18 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
up_write(&MSDOS_I(inode)->truncate_lock);
}
+ /*
+ * setattr_copy can't truncate these appropriately, so we'll
+ * copy them ourselves
+ */
+ if (attr->ia_valid & ATTR_ATIME)
+ fat_truncate_time(inode, &attr->ia_atime, S_ATIME);
+ if (attr->ia_valid & ATTR_CTIME)
+ fat_truncate_time(inode, &attr->ia_ctime, S_CTIME);
+ if (attr->ia_valid & ATTR_MTIME)
+ fat_truncate_time(inode, &attr->ia_mtime, S_MTIME);
+ attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);
+
setattr_copy(inode, attr);
mark_inode_dirty(inode);
out:
@@ -552,4 +564,5 @@ EXPORT_SYMBOL_GPL(fat_setattr);
const struct inode_operations fat_file_inode_operations = {
.setattr = fat_setattr,
.getattr = fat_getattr,
+ .update_time = fat_update_time,
};
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d6b81e31f9f5..c0b5b5c3373b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -244,7 +244,7 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
if (err < len)
fat_write_failed(mapping, pos + len);
if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
mark_inode_dirty(inode);
}
@@ -564,7 +564,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
de->cdate, de->ctime_cs);
fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
} else
- inode->i_ctime = inode->i_atime = inode->i_mtime;
+ fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME);
return 0;
}
@@ -1626,6 +1626,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sb->s_magic = MSDOS_SUPER_MAGIC;
sb->s_op = &fat_sops;
sb->s_export_op = &fat_export_ops;
+ /*
+ * FAT timestamps are complex and truncated by FAT itself, so set
+ * the finest granularity here and let FAT do its own truncation
+ */
+ sb->s_time_gran = 1;
mutex_init(&sbi->nfs_build_inode_lock);
ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
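
Why a granularity of 1 is safe here (standard VFS behaviour, stated as an
assumption): current_time() truncates to sb->s_time_gran, so with s_time_gran
of 1 the VFS passes timestamps through essentially untouched and the
FAT-specific rounding happens exactly once, in fat_truncate_time():

    /* VFS side (fs/inode.c, paraphrased):
     *   now = current_kernel_time64();
     *   return timespec64_trunc(now, inode->i_sb->s_time_gran);
     * With s_time_gran == 1 this is a no-op, leaving fat_truncate_time()
     * as the single place where FAT granularities are applied. */
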
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 573836dcaefc..fce0a76f3f1e 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -7,6 +7,7 @@
*/
#include "fat.h"
+#include <linux/iversion.h>
/*
 * fat_fs_error reports a file system problem that might indicate fs data
@@ -185,6 +186,13 @@ static long days_in_year[] = {
0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
};
+static inline int fat_tz_offset(struct msdos_sb_info *sbi)
+{
+ return (sbi->options.tz_set ?
+ -sbi->options.time_offset :
+ sys_tz.tz_minuteswest) * SECS_PER_MIN;
+}
+
/* Convert a FAT time/date pair to a UNIX date (seconds since 1970-01-01). */
void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 __time, __le16 __date, u8 time_cs)
@@ -210,10 +218,7 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
+ days_in_year[month] + day
+ DAYS_DELTA) * SECS_PER_DAY;
- if (!sbi->options.tz_set)
- second += sys_tz.tz_minuteswest * SECS_PER_MIN;
- else
- second -= sbi->options.time_offset * SECS_PER_MIN;
+ second += fat_tz_offset(sbi);
if (time_cs) {
ts->tv_sec = second + (time_cs / 100);
@@ -229,9 +234,7 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 *time, __le16 *date, u8 *time_cs)
{
struct tm tm;
- time64_to_tm(ts->tv_sec,
- (sbi->options.tz_set ? sbi->options.time_offset :
- -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm);
+ time64_to_tm(ts->tv_sec, -fat_tz_offset(sbi), &tm);
/* FAT can only support year between 1980 to 2107 */
if (tm.tm_year < 1980 - 1900) {
@@ -263,6 +266,80 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
}
EXPORT_SYMBOL_GPL(fat_time_unix2fat);
+static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts)
+{
+ return (struct timespec64){ ts.tv_sec & ~1ULL, 0 };
+}
+/*
+ * truncate the various times with appropriate granularity:
+ * root inode:
+ * all times always 0
+ * all other inodes:
+ * mtime - 2 seconds
+ * ctime
+ * msdos - 2 seconds
+ * vfat - 10 milliseconds
+ * atime - 24 hours (00:00:00 in local timezone)
+ */
+int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct timespec64 ts;
+
+ if (inode->i_ino == MSDOS_ROOT_INO)
+ return 0;
+
+ if (now == NULL) {
+ now = &ts;
+ ts = current_time(inode);
+ }
+
+ if (flags & S_ATIME) {
+ /* to localtime */
+ time64_t seconds = now->tv_sec - fat_tz_offset(sbi);
+ s32 remainder;
+
+ div_s64_rem(seconds, SECS_PER_DAY, &remainder);
+ /* to day boundary, and back to unix time */
+ seconds = seconds + fat_tz_offset(sbi) - remainder;
+
+ inode->i_atime = (struct timespec64){ seconds, 0 };
+ }
+ if (flags & S_CTIME) {
+ if (sbi->options.isvfat)
+ inode->i_ctime = timespec64_trunc(*now, 10000000);
+ else
+ inode->i_ctime = fat_timespec64_trunc_2secs(*now);
+ }
+ if (flags & S_MTIME)
+ inode->i_mtime = fat_timespec64_trunc_2secs(*now);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fat_truncate_time);
+
+int fat_update_time(struct inode *inode, struct timespec64 *now, int flags)
+{
+ int iflags = I_DIRTY_TIME;
+ bool dirty = false;
+
+ if (inode->i_ino == MSDOS_ROOT_INO)
+ return 0;
+
+ fat_truncate_time(inode, now, flags);
+ if (flags & S_VERSION)
+ dirty = inode_maybe_inc_iversion(inode, false);
+ if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
+ !(inode->i_sb->s_flags & SB_LAZYTIME))
+ dirty = true;
+
+ if (dirty)
+ iflags |= I_DIRTY_SYNC;
+ __mark_inode_dirty(inode, iflags);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fat_update_time);
+
int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
{
int i, err = 0;
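
A worked example of the granularities above (timestamp is hypothetical, and the
tz offset is taken as zero so local time equals UTC):

    time64_t now = 1540990333;    /* 2018-10-31 12:52:13 UTC */
    time64_t mtime, atime;
    s32 rem;

    /* S_MTIME (and msdos S_CTIME): clear the low bit -> 2s steps */
    mtime = now & ~1ULL;          /* 1540990332 = 12:52:12 */

    /* S_ATIME: round down to (local) midnight */
    div_s64_rem(now, 86400 /* SECS_PER_DAY */, &rem);
    atime = now - rem;            /* 1540944000 = 00:00:00 */

vfat additionally keeps ctime at 10 ms resolution via
timespec64_trunc(*now, 10000000), as shown in fat_truncate_time().
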
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index efb8c40c9d27..f2cd365a4e86 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -250,7 +250,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
if (err)
return err;
- dir->i_ctime = dir->i_mtime = *ts;
+ fat_truncate_time(dir, ts, S_CTIME|S_MTIME);
if (IS_DIRSYNC(dir))
(void)fat_sync_inode(dir);
else
@@ -294,7 +294,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
err = PTR_ERR(inode);
goto out;
}
- inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+ fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -327,7 +327,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_ctime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME);
fat_detach(inode);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -380,7 +380,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out;
}
set_nlink(inode, 2);
- inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+ fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -413,7 +413,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
clear_nlink(inode);
- inode->i_ctime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_CTIME);
fat_detach(inode);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
@@ -478,7 +478,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
mark_inode_dirty(old_inode);
inode_inc_iversion(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ fat_truncate_time(old_dir, NULL, S_CTIME|S_MTIME);
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
else
@@ -538,7 +538,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
if (err)
goto error_dotdot;
inode_inc_iversion(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = ts;
+ fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
else
@@ -548,7 +548,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
drop_nlink(new_inode);
if (is_dir)
drop_nlink(new_inode);
- new_inode->i_ctime = ts;
+ fat_truncate_time(new_inode, &ts, S_CTIME);
}
out:
brelse(sinfo.bh);
@@ -637,6 +637,7 @@ static const struct inode_operations msdos_dir_inode_operations = {
.rename = msdos_rename,
.setattr = fat_setattr,
.getattr = fat_getattr,
+ .update_time = fat_update_time,
};
static void setup(struct super_block *sb)
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 82cd1e69cbdf..996c8c25e9c6 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname,
goto cleanup;
/* update timestamp */
- dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
+ fat_truncate_time(dir, ts, S_CTIME|S_MTIME);
if (IS_DIRSYNC(dir))
(void)fat_sync_inode(dir);
else
@@ -779,7 +779,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
goto out;
}
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+ fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -810,7 +810,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_ATIME|S_MTIME);
fat_detach(inode);
vfat_d_version_set(dentry, inode_query_iversion(dir));
out:
@@ -836,7 +836,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ fat_truncate_time(inode, NULL, S_ATIME|S_MTIME);
fat_detach(inode);
vfat_d_version_set(dentry, inode_query_iversion(dir));
out:
@@ -876,7 +876,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
inode_inc_iversion(inode);
set_nlink(inode, 2);
- inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+ fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -969,7 +969,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto error_dotdot;
inode_inc_iversion(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = ts;
+ fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
if (IS_DIRSYNC(old_dir))
(void)fat_sync_inode(old_dir);
else
@@ -979,7 +979,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
drop_nlink(new_inode);
if (is_dir)
drop_nlink(new_inode);
- new_inode->i_ctime = ts;
+ fat_truncate_time(new_inode, &ts, S_CTIME);
}
out:
brelse(sinfo.bh);
@@ -1032,6 +1032,7 @@ static const struct inode_operations vfat_dir_inode_operations = {
.rename = vfat_rename,
.setattr = fat_setattr,
.getattr = fat_getattr,
+ .update_time = fat_update_time,
};
static void setup(struct super_block *sb)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 471d863958bc..b40168fcc94a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -339,9 +339,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
struct address_space *mapping = inode->i_mapping;
struct bdi_writeback *old_wb = inode->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
- struct radix_tree_iter iter;
+ XA_STATE(xas, &mapping->i_pages, 0);
+ struct page *page;
bool switched = false;
- void **slot;
/*
* By the time control reaches here, RCU grace period has passed
@@ -375,25 +375,18 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
* pages actually under writeback.
*/
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
- PAGECACHE_TAG_DIRTY) {
- struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (likely(page) && PageDirty(page)) {
+ xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
+ if (PageDirty(page)) {
dec_wb_stat(old_wb, WB_RECLAIMABLE);
inc_wb_stat(new_wb, WB_RECLAIMABLE);
}
}
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
- PAGECACHE_TAG_WRITEBACK) {
- struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (likely(page)) {
- WARN_ON_ONCE(!PageWriteback(page));
- dec_wb_stat(old_wb, WB_WRITEBACK);
- inc_wb_stat(new_wb, WB_WRITEBACK);
- }
+ xas_set(&xas, 0);
+ xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
+ WARN_ON_ONCE(!PageWriteback(page));
+ dec_wb_stat(old_wb, WB_WRITEBACK);
+ inc_wb_stat(new_wb, WB_WRITEBACK);
}
wb_get(new_wb);
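
The conversion recipe used here is mechanical; roughly (a general XArray API
mapping, not part of the patch itself):

    /* radix-tree tagged walk               ->  XArray marked walk
     *
     * radix_tree_for_each_tagged(slot, ..)     xas_for_each_marked(&xas, page,
     *                                              ULONG_MAX, TAG)
     * radix_tree_deref_slot_protected(..)      iterator yields the entry itself
     * restart a second walk at index 0         xas_set(&xas, 0)
     *
     * The likely(page) test disappears because the marked iterator
     * only visits present entries.
     */
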
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 60da84a86dab..f7b807bc1027 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,4 +5,4 @@
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
-fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o
+fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 0b694655d988..989df5accaee 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -107,7 +107,7 @@ static ssize_t fuse_conn_max_background_read(struct file *file,
if (!fc)
return 0;
- val = fc->max_background;
+ val = READ_ONCE(fc->max_background);
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -125,7 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
if (ret > 0) {
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
+ spin_lock(&fc->bg_lock);
fc->max_background = val;
+ fc->blocked = fc->num_background >= fc->max_background;
+ if (!fc->blocked)
+ wake_up(&fc->blocked_waitq);
+ spin_unlock(&fc->bg_lock);
fuse_conn_put(fc);
}
}
@@ -144,7 +149,7 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
if (!fc)
return 0;
- val = fc->congestion_threshold;
+ val = READ_ONCE(fc->congestion_threshold);
fuse_conn_put(fc);
return fuse_conn_limit_read(file, buf, len, ppos, val);
@@ -155,18 +160,31 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
size_t count, loff_t *ppos)
{
unsigned uninitialized_var(val);
+ struct fuse_conn *fc;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
max_user_congthresh);
- if (ret > 0) {
- struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
- if (fc) {
- fc->congestion_threshold = val;
- fuse_conn_put(fc);
+ if (ret <= 0)
+ goto out;
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ goto out;
+
+ spin_lock(&fc->bg_lock);
+ fc->congestion_threshold = val;
+ if (fc->sb) {
+ if (fc->num_background < fc->congestion_threshold) {
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ } else {
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
}
-
+ spin_unlock(&fc->bg_lock);
+ fuse_conn_put(fc);
+out:
return ret;
}
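
The reads stay lockless by design: max_background and congestion_threshold are
single words, so READ_ONCE() suffices for a consistent snapshot, while writers
take fc->bg_lock because they also derive fc->blocked and the bdi congestion
state from the new value. The resulting pattern, in outline:

    /* reader: lockless single-word read */
    val = READ_ONCE(fc->max_background);

    /* writer: locked read-modify-write of the derived state */
    spin_lock(&fc->bg_lock);
    fc->max_background = val;
    fc->blocked = fc->num_background >= fc->max_background;
    spin_unlock(&fc->bg_lock);
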
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 11ea2c4a38ab..ae813e609932 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,6 +25,10 @@
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
+/* Ordinary requests have even IDs, while interrupt IDs are odd */
+#define FUSE_INT_REQ_BIT (1ULL << 0)
+#define FUSE_REQ_ID_STEP (1ULL << 1)
+
static struct kmem_cache *fuse_req_cachep;
static struct fuse_dev *fuse_get_dev(struct file *file)
@@ -40,9 +44,6 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
struct fuse_page_desc *page_descs,
unsigned npages)
{
- memset(req, 0, sizeof(*req));
- memset(pages, 0, sizeof(*pages) * npages);
- memset(page_descs, 0, sizeof(*page_descs) * npages);
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
@@ -53,30 +54,36 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
__set_bit(FR_PENDING, &req->flags);
}
+static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags,
+ struct fuse_page_desc **desc)
+{
+ struct page **pages;
+
+ pages = kzalloc(npages * (sizeof(struct page *) +
+ sizeof(struct fuse_page_desc)), flags);
+ *desc = (void *) pages + npages * sizeof(struct page *);
+
+ return pages;
+}
+
static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
{
- struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
+ struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
if (req) {
- struct page **pages;
- struct fuse_page_desc *page_descs;
-
- if (npages <= FUSE_REQ_INLINE_PAGES) {
+ struct page **pages = NULL;
+ struct fuse_page_desc *page_descs = NULL;
+
+ WARN_ON(npages > FUSE_MAX_MAX_PAGES);
+ if (npages > FUSE_REQ_INLINE_PAGES) {
+ pages = fuse_req_pages_alloc(npages, flags,
+ &page_descs);
+ if (!pages) {
+ kmem_cache_free(fuse_req_cachep, req);
+ return NULL;
+ }
+ } else if (npages) {
pages = req->inline_pages;
page_descs = req->inline_page_descs;
- } else {
- pages = kmalloc_array(npages, sizeof(struct page *),
- flags);
- page_descs =
- kmalloc_array(npages,
- sizeof(struct fuse_page_desc),
- flags);
- }
-
- if (!pages || !page_descs) {
- kfree(pages);
- kfree(page_descs);
- kmem_cache_free(fuse_req_cachep, req);
- return NULL;
}
fuse_request_init(req, pages, page_descs, npages);
@@ -95,12 +102,41 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
return __fuse_request_alloc(npages, GFP_NOFS);
}
-void fuse_request_free(struct fuse_req *req)
+static void fuse_req_pages_free(struct fuse_req *req)
{
- if (req->pages != req->inline_pages) {
+ if (req->pages != req->inline_pages)
kfree(req->pages);
- kfree(req->page_descs);
- }
+}
+
+bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
+ gfp_t flags)
+{
+ struct page **pages;
+ struct fuse_page_desc *page_descs;
+ unsigned int npages = min_t(unsigned int,
+ max_t(unsigned int, req->max_pages * 2,
+ FUSE_DEFAULT_MAX_PAGES_PER_REQ),
+ fc->max_pages);
+ WARN_ON(npages <= req->max_pages);
+
+ pages = fuse_req_pages_alloc(npages, flags, &page_descs);
+ if (!pages)
+ return false;
+
+ memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages);
+ memcpy(page_descs, req->page_descs,
+ sizeof(struct fuse_page_desc) * req->max_pages);
+ fuse_req_pages_free(req);
+ req->pages = pages;
+ req->page_descs = page_descs;
+ req->max_pages = npages;
+
+ return true;
+}
+
+void fuse_request_free(struct fuse_req *req)
+{
+ fuse_req_pages_free(req);
kmem_cache_free(fuse_req_cachep, req);
}
@@ -235,8 +271,10 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
struct file *file = req->stolen_file;
struct fuse_file *ff = file->private_data;
+ WARN_ON(req->max_pages);
spin_lock(&fc->lock);
- fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
+ memset(req, 0, sizeof(*req));
+ fuse_request_init(req, NULL, NULL, 0);
BUG_ON(ff->reserved_req);
ff->reserved_req = req;
wake_up_all(&fc->reserved_req_waitq);
@@ -287,10 +325,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
* We get here in the unlikely case that a background
* request was allocated but not sent
*/
- spin_lock(&fc->lock);
+ spin_lock(&fc->bg_lock);
if (!fc->blocked)
wake_up(&fc->blocked_waitq);
- spin_unlock(&fc->lock);
+ spin_unlock(&fc->bg_lock);
}
if (test_bit(FR_WAITING, &req->flags)) {
@@ -319,7 +357,13 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
static u64 fuse_get_unique(struct fuse_iqueue *fiq)
{
- return ++fiq->reqctr;
+ fiq->reqctr += FUSE_REQ_ID_STEP;
+ return fiq->reqctr;
+}
+
+static unsigned int fuse_req_hash(u64 unique)
+{
+ return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
@@ -353,12 +397,13 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
static void flush_bg_queue(struct fuse_conn *fc)
{
+ struct fuse_iqueue *fiq = &fc->iq;
+
while (fc->active_background < fc->max_background &&
!list_empty(&fc->bg_queue)) {
struct fuse_req *req;
- struct fuse_iqueue *fiq = &fc->iq;
- req = list_entry(fc->bg_queue.next, struct fuse_req, list);
+ req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
list_del(&req->list);
fc->active_background++;
spin_lock(&fiq->waitq.lock);
@@ -389,14 +434,21 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
WARN_ON(test_bit(FR_PENDING, &req->flags));
WARN_ON(test_bit(FR_SENT, &req->flags));
if (test_bit(FR_BACKGROUND, &req->flags)) {
- spin_lock(&fc->lock);
+ spin_lock(&fc->bg_lock);
clear_bit(FR_BACKGROUND, &req->flags);
- if (fc->num_background == fc->max_background)
+ if (fc->num_background == fc->max_background) {
fc->blocked = 0;
-
- /* Wake up next waiter, if any */
- if (!fc->blocked && waitqueue_active(&fc->blocked_waitq))
wake_up(&fc->blocked_waitq);
+ } else if (!fc->blocked) {
+ /*
+ * Wake up next waiter, if any. It's okay to use
+ * waitqueue_active(), as we've already synced up
+ * fc->blocked with waiters with the wake_up() call
+ * above.
+ */
+ if (waitqueue_active(&fc->blocked_waitq))
+ wake_up(&fc->blocked_waitq);
+ }
if (fc->num_background == fc->congestion_threshold && fc->sb) {
clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
@@ -405,7 +457,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);
- spin_unlock(&fc->lock);
+ spin_unlock(&fc->bg_lock);
}
wake_up(&req->waitq);
if (req->end)
@@ -573,40 +625,38 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
return ret;
}
-/*
- * Called under fc->lock
- *
- * fc->connected must have been checked previously
- */
-void fuse_request_send_background_locked(struct fuse_conn *fc,
- struct fuse_req *req)
+bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req)
{
- BUG_ON(!test_bit(FR_BACKGROUND, &req->flags));
+ bool queued = false;
+
+ WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
if (!test_bit(FR_WAITING, &req->flags)) {
__set_bit(FR_WAITING, &req->flags);
atomic_inc(&fc->num_waiting);
}
__set_bit(FR_ISREPLY, &req->flags);
- fc->num_background++;
- if (fc->num_background == fc->max_background)
- fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fc->sb) {
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ spin_lock(&fc->bg_lock);
+ if (likely(fc->connected)) {
+ fc->num_background++;
+ if (fc->num_background == fc->max_background)
+ fc->blocked = 1;
+ if (fc->num_background == fc->congestion_threshold && fc->sb) {
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
+ }
+ list_add_tail(&req->list, &fc->bg_queue);
+ flush_bg_queue(fc);
+ queued = true;
}
- list_add_tail(&req->list, &fc->bg_queue);
- flush_bg_queue(fc);
+ spin_unlock(&fc->bg_lock);
+
+ return queued;
}
void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
{
- BUG_ON(!req->end);
- spin_lock(&fc->lock);
- if (fc->connected) {
- fuse_request_send_background_locked(fc, req);
- spin_unlock(&fc->lock);
- } else {
- spin_unlock(&fc->lock);
+ WARN_ON(!req->end);
+ if (!fuse_request_queue_background(fc, req)) {
req->out.h.error = -ENOTCONN;
req->end(fc, req);
fuse_put_request(fc, req);
@@ -1084,12 +1134,11 @@ __releases(fiq->waitq.lock)
int err;
list_del_init(&req->intr_entry);
- req->intr_unique = fuse_get_unique(fiq);
memset(&ih, 0, sizeof(ih));
memset(&arg, 0, sizeof(arg));
ih.len = reqsize;
ih.opcode = FUSE_INTERRUPT;
- ih.unique = req->intr_unique;
+ ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT);
arg.unique = req->in.h.unique;
spin_unlock(&fiq->waitq.lock);
@@ -1238,6 +1287,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
struct fuse_req *req;
struct fuse_in *in;
unsigned reqsize;
+ unsigned int hash;
restart:
spin_lock(&fiq->waitq.lock);
@@ -1310,13 +1360,16 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
err = reqsize;
goto out_end;
}
- list_move_tail(&req->list, &fpq->processing);
- spin_unlock(&fpq->lock);
+ hash = fuse_req_hash(req->in.h.unique);
+ list_move_tail(&req->list, &fpq->processing[hash]);
+ __fuse_get_request(req);
set_bit(FR_SENT, &req->flags);
+ spin_unlock(&fpq->lock);
/* matches barrier in request_wait_answer() */
smp_mb__after_atomic();
if (test_bit(FR_INTERRUPTED, &req->flags))
queue_interrupt(fiq, req);
+ fuse_put_request(fc, req);
return reqsize;
@@ -1663,7 +1716,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
unsigned int num;
unsigned int offset;
size_t total_len = 0;
- int num_pages;
+ unsigned int num_pages;
offset = outarg->offset & ~PAGE_MASK;
file_size = i_size_read(inode);
@@ -1675,7 +1728,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
num = file_size - outarg->offset;
num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
- num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
+ num_pages = min(num_pages, fc->max_pages);
req = fuse_get_req(fc, num_pages);
if (IS_ERR(req))
@@ -1792,10 +1845,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
/* Look up request on processing list by unique ID */
static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
{
+ unsigned int hash = fuse_req_hash(unique);
struct fuse_req *req;
- list_for_each_entry(req, &fpq->processing, list) {
- if (req->in.h.unique == unique || req->intr_unique == unique)
+ list_for_each_entry(req, &fpq->processing[hash], list) {
+ if (req->in.h.unique == unique)
return req;
}
return NULL;
@@ -1869,22 +1923,26 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
if (!fpq->connected)
goto err_unlock_pq;
- req = request_find(fpq, oh.unique);
+ req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
if (!req)
goto err_unlock_pq;
- /* Is it an interrupt reply? */
- if (req->intr_unique == oh.unique) {
+ /* Is it an interrupt reply ID? */
+ if (oh.unique & FUSE_INT_REQ_BIT) {
+ __fuse_get_request(req);
spin_unlock(&fpq->lock);
err = -EINVAL;
- if (nbytes != sizeof(struct fuse_out_header))
+ if (nbytes != sizeof(struct fuse_out_header)) {
+ fuse_put_request(fc, req);
goto err_finish;
+ }
if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
queue_interrupt(&fc->iq, req);
+ fuse_put_request(fc, req);
fuse_copy_finish(cs);
return nbytes;
@@ -2102,9 +2160,13 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
struct fuse_dev *fud;
struct fuse_req *req, *next;
LIST_HEAD(to_end);
+ unsigned int i;
+ /* Background queuing checks fc->connected under bg_lock */
+ spin_lock(&fc->bg_lock);
fc->connected = 0;
- fc->blocked = 0;
+ spin_unlock(&fc->bg_lock);
+
fc->aborted = is_abort;
fuse_set_initialized(fc);
list_for_each_entry(fud, &fc->devices, entry) {
@@ -2123,11 +2185,16 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
}
spin_unlock(&req->waitq.lock);
}
- list_splice_tail_init(&fpq->processing, &to_end);
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ list_splice_tail_init(&fpq->processing[i],
+ &to_end);
spin_unlock(&fpq->lock);
}
+ spin_lock(&fc->bg_lock);
+ fc->blocked = 0;
fc->max_background = UINT_MAX;
flush_bg_queue(fc);
+ spin_unlock(&fc->bg_lock);
spin_lock(&fiq->waitq.lock);
fiq->connected = 0;
@@ -2163,10 +2230,12 @@ int fuse_dev_release(struct inode *inode, struct file *file)
struct fuse_conn *fc = fud->fc;
struct fuse_pqueue *fpq = &fud->pq;
LIST_HEAD(to_end);
+ unsigned int i;
spin_lock(&fpq->lock);
WARN_ON(!list_empty(&fpq->io));
- list_splice_init(&fpq->processing, &to_end);
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ list_splice_init(&fpq->processing[i], &to_end);
spin_unlock(&fpq->lock);
end_requests(fc, &to_end);
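
Putting the new ID scheme together, a walk-through with made-up numbers:

    /* fuse_get_unique() steps by FUSE_REQ_ID_STEP (2), so ordinary
     * requests get IDs 2, 4, 6, ...  An interrupt against request 6
     * goes out as 6 | FUSE_INT_REQ_BIT == 7, replacing the old
     * separately allocated intr_unique.  On the reply path:
     *
     *   request_find(fpq, 7 & ~FUSE_INT_REQ_BIT)  ->  finds request 6
     *   oh.unique & FUSE_INT_REQ_BIT              ->  interrupt reply
     *
     * Both IDs land in the same processing bucket because
     * fuse_req_hash() masks the low bit before hashing.
     */
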
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0979609d6eba..47395b0c3b35 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,24 +14,9 @@
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/xattr.h>
+#include <linux/iversion.h>
#include <linux/posix_acl.h>
-static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
-{
- struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_inode *fi = get_fuse_inode(dir);
-
- if (!fc->do_readdirplus)
- return false;
- if (!fc->readdirplus_auto)
- return true;
- if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
- return true;
- if (ctx->pos == 0)
- return true;
- return false;
-}
-
static void fuse_advise_use_readdirplus(struct inode *dir)
{
struct fuse_inode *fi = get_fuse_inode(dir);
@@ -80,8 +65,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec)
* Set dentry and possibly attribute timeouts from the lookup/mk*
* replies
*/
-static void fuse_change_entry_timeout(struct dentry *entry,
- struct fuse_entry_out *o)
+void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o)
{
fuse_dentry_settime(entry,
time_to_jiffies(o->entry_valid, o->entry_valid_nsec));
@@ -92,18 +76,29 @@ static u64 attr_timeout(struct fuse_attr_out *o)
return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
}
-static u64 entry_attr_timeout(struct fuse_entry_out *o)
+u64 entry_attr_timeout(struct fuse_entry_out *o)
{
return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
}
+static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
+{
+ set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask);
+}
+
/*
* Mark the attributes as stale, so that at the next call to
* ->getattr() they will be fetched from userspace
*/
void fuse_invalidate_attr(struct inode *inode)
{
- get_fuse_inode(inode)->i_time = 0;
+ fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS);
+}
+
+static void fuse_dir_changed(struct inode *dir)
+{
+ fuse_invalidate_attr(dir);
+ inode_maybe_inc_iversion(dir, false);
}
/**
@@ -113,7 +108,7 @@ void fuse_invalidate_attr(struct inode *inode)
void fuse_invalidate_atime(struct inode *inode)
{
if (!IS_RDONLY(inode))
- fuse_invalidate_attr(inode);
+ fuse_invalidate_attr_mask(inode, STATX_ATIME);
}
/*
@@ -262,11 +257,6 @@ invalid:
goto out;
}
-static int invalid_nodeid(u64 nodeid)
-{
- return !nodeid || nodeid == FUSE_ROOT_ID;
-}
-
static int fuse_dentry_init(struct dentry *dentry)
{
dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
@@ -469,7 +459,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
kfree(forget);
d_instantiate(entry, inode);
fuse_change_entry_timeout(entry, &outentry);
- fuse_invalidate_attr(dir);
+ fuse_dir_changed(dir);
err = finish_open(file, entry, generic_file_open);
if (err) {
fuse_sync_release(ff, flags);
@@ -583,7 +573,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
} else {
fuse_change_entry_timeout(entry, &outarg);
}
- fuse_invalidate_attr(dir);
+ fuse_dir_changed(dir);
return 0;
out_put_forget_req:
@@ -693,7 +683,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
drop_nlink(inode);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
- fuse_invalidate_attr(dir);
+ fuse_dir_changed(dir);
fuse_invalidate_entry_cache(entry);
fuse_update_ctime(inode);
} else if (err == -EINTR)
@@ -715,7 +705,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
err = fuse_simple_request(fc, &args);
if (!err) {
clear_nlink(d_inode(entry));
- fuse_invalidate_attr(dir);
+ fuse_dir_changed(dir);
fuse_invalidate_entry_cache(entry);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
@@ -754,9 +744,9 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
fuse_update_ctime(d_inode(newent));
}
- fuse_invalidate_attr(olddir);
+ fuse_dir_changed(olddir);
if (olddir != newdir)
- fuse_invalidate_attr(newdir);
+ fuse_dir_changed(newdir);
/* newent will end up negative */
if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
@@ -932,7 +922,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
}
static int fuse_update_get_attr(struct inode *inode, struct file *file,
- struct kstat *stat, unsigned int flags)
+ struct kstat *stat, u32 request_mask,
+ unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
@@ -942,6 +933,8 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
sync = true;
else if (flags & AT_STATX_DONT_SYNC)
sync = false;
+ else if (request_mask & READ_ONCE(fi->inval_mask))
+ sync = true;
else
sync = time_before64(fi->i_time, get_jiffies_64());
@@ -959,7 +952,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
int fuse_update_attributes(struct inode *inode, struct file *file)
{
- return fuse_update_get_attr(inode, file, NULL, 0);
+ /* Do *not* need to get atime for internal purposes */
+ return fuse_update_get_attr(inode, file, NULL,
+ STATX_BASIC_STATS & ~STATX_ATIME, 0);
}
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
@@ -989,7 +984,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
if (!entry)
goto unlock;
- fuse_invalidate_attr(parent);
+ fuse_dir_changed(parent);
fuse_invalidate_entry(entry);
if (child_nodeid != 0 && d_really_is_positive(entry)) {
@@ -1165,271 +1160,78 @@ static int fuse_permission(struct inode *inode, int mask)
return err;
}
-static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
- struct dir_context *ctx)
-{
- while (nbytes >= FUSE_NAME_OFFSET) {
- struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
- size_t reclen = FUSE_DIRENT_SIZE(dirent);
- if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
- return -EIO;
- if (reclen > nbytes)
- break;
- if (memchr(dirent->name, '/', dirent->namelen) != NULL)
- return -EIO;
-
- if (!dir_emit(ctx, dirent->name, dirent->namelen,
- dirent->ino, dirent->type))
- break;
-
- buf += reclen;
- nbytes -= reclen;
- ctx->pos = dirent->off;
- }
-
- return 0;
-}
-
-static int fuse_direntplus_link(struct file *file,
- struct fuse_direntplus *direntplus,
- u64 attr_version)
-{
- struct fuse_entry_out *o = &direntplus->entry_out;
- struct fuse_dirent *dirent = &direntplus->dirent;
- struct dentry *parent = file->f_path.dentry;
- struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
- struct dentry *dentry;
- struct dentry *alias;
- struct inode *dir = d_inode(parent);
- struct fuse_conn *fc;
- struct inode *inode;
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
-
- if (!o->nodeid) {
- /*
- * Unlike in the case of fuse_lookup, zero nodeid does not mean
- * ENOENT. Instead, it only means the userspace filesystem did
- * not want to return attributes/handle for this entry.
- *
- * So do nothing.
- */
- return 0;
- }
-
- if (name.name[0] == '.') {
- /*
- * We could potentially refresh the attributes of the directory
- * and its parent?
- */
- if (name.len == 1)
- return 0;
- if (name.name[1] == '.' && name.len == 2)
- return 0;
- }
-
- if (invalid_nodeid(o->nodeid))
- return -EIO;
- if (!fuse_valid_type(o->attr.mode))
- return -EIO;
-
- fc = get_fuse_conn(dir);
-
- name.hash = full_name_hash(parent, name.name, name.len);
- dentry = d_lookup(parent, &name);
- if (!dentry) {
-retry:
- dentry = d_alloc_parallel(parent, &name, &wq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- }
- if (!d_in_lookup(dentry)) {
- struct fuse_inode *fi;
- inode = d_inode(dentry);
- if (!inode ||
- get_node_id(inode) != o->nodeid ||
- ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
- d_invalidate(dentry);
- dput(dentry);
- goto retry;
- }
- if (is_bad_inode(inode)) {
- dput(dentry);
- return -EIO;
- }
-
- fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->nlookup++;
- spin_unlock(&fc->lock);
-
- forget_all_cached_acls(inode);
- fuse_change_attributes(inode, &o->attr,
- entry_attr_timeout(o),
- attr_version);
- /*
- * The other branch comes via fuse_iget()
- * which bumps nlookup inside
- */
- } else {
- inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
- &o->attr, entry_attr_timeout(o),
- attr_version);
- if (!inode)
- inode = ERR_PTR(-ENOMEM);
-
- alias = d_splice_alias(inode, dentry);
- d_lookup_done(dentry);
- if (alias) {
- dput(dentry);
- dentry = alias;
- }
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- }
- if (fc->readdirplus_auto)
- set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
- fuse_change_entry_timeout(dentry, o);
-
- dput(dentry);
- return 0;
-}
-
-static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
- struct dir_context *ctx, u64 attr_version)
+static int fuse_readlink_page(struct inode *inode, struct page *page)
{
- struct fuse_direntplus *direntplus;
- struct fuse_dirent *dirent;
- size_t reclen;
- int over = 0;
- int ret;
-
- while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
- direntplus = (struct fuse_direntplus *) buf;
- dirent = &direntplus->dirent;
- reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
-
- if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
- return -EIO;
- if (reclen > nbytes)
- break;
- if (memchr(dirent->name, '/', dirent->namelen) != NULL)
- return -EIO;
-
- if (!over) {
- /* We fill entries into dstbuf only as much as
- it can hold. But we still continue iterating
- over remaining entries to link them. If not,
- we need to send a FORGET for each of those
- which we did not link.
- */
- over = !dir_emit(ctx, dirent->name, dirent->namelen,
- dirent->ino, dirent->type);
- if (!over)
- ctx->pos = dirent->off;
- }
-
- buf += reclen;
- nbytes -= reclen;
-
- ret = fuse_direntplus_link(file, direntplus, attr_version);
- if (ret)
- fuse_force_forget(file, direntplus->entry_out.nodeid);
- }
-
- return 0;
-}
-
-static int fuse_readdir(struct file *file, struct dir_context *ctx)
-{
- int plus, err;
- size_t nbytes;
- struct page *page;
- struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
- u64 attr_version = 0;
- bool locked;
-
- if (is_bad_inode(inode))
- return -EIO;
+ int err;
req = fuse_get_req(fc, 1);
if (IS_ERR(req))
return PTR_ERR(req);
- page = alloc_page(GFP_KERNEL);
- if (!page) {
- fuse_put_request(fc, req);
- return -ENOMEM;
- }
-
- plus = fuse_use_readdirplus(inode, ctx);
+ req->out.page_zeroing = 1;
req->out.argpages = 1;
req->num_pages = 1;
req->pages[0] = page;
- req->page_descs[0].length = PAGE_SIZE;
- if (plus) {
- attr_version = fuse_get_attr_version(fc);
- fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIRPLUS);
- } else {
- fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIR);
- }
- locked = fuse_lock_inode(inode);
+ req->page_descs[0].length = PAGE_SIZE - 1;
+ req->in.h.opcode = FUSE_READLINK;
+ req->in.h.nodeid = get_node_id(inode);
+ req->out.argvar = 1;
+ req->out.numargs = 1;
+ req->out.args[0].size = PAGE_SIZE - 1;
fuse_request_send(fc, req);
- fuse_unlock_inode(inode, locked);
- nbytes = req->out.args[0].size;
err = req->out.h.error;
- fuse_put_request(fc, req);
+
if (!err) {
- if (plus) {
- err = parse_dirplusfile(page_address(page), nbytes,
- file, ctx,
- attr_version);
- } else {
- err = parse_dirfile(page_address(page), nbytes, file,
- ctx);
- }
+ char *link = page_address(page);
+ size_t len = req->out.args[0].size;
+
+ BUG_ON(len >= PAGE_SIZE);
+ link[len] = '\0';
}
- __free_page(page);
+ fuse_put_request(fc, req);
fuse_invalidate_atime(inode);
+
return err;
}
-static const char *fuse_get_link(struct dentry *dentry,
- struct inode *inode,
- struct delayed_call *done)
+static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
{
struct fuse_conn *fc = get_fuse_conn(inode);
- FUSE_ARGS(args);
- char *link;
- ssize_t ret;
+ struct page *page;
+ int err;
+
+ err = -EIO;
+ if (is_bad_inode(inode))
+ goto out_err;
+ if (fc->cache_symlinks)
+ return page_get_link(dentry, inode, callback);
+
+ err = -ECHILD;
if (!dentry)
- return ERR_PTR(-ECHILD);
+ goto out_err;
- link = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!link)
- return ERR_PTR(-ENOMEM);
+ page = alloc_page(GFP_KERNEL);
+ err = -ENOMEM;
+ if (!page)
+ goto out_err;
- args.in.h.opcode = FUSE_READLINK;
- args.in.h.nodeid = get_node_id(inode);
- args.out.argvar = 1;
- args.out.numargs = 1;
- args.out.args[0].size = PAGE_SIZE - 1;
- args.out.args[0].value = link;
- ret = fuse_simple_request(fc, &args);
- if (ret < 0) {
- kfree(link);
- link = ERR_PTR(ret);
- } else {
- link[ret] = '\0';
- set_delayed_call(done, kfree_link, link);
+ err = fuse_readlink_page(inode, page);
+ if (err) {
+ __free_page(page);
+ goto out_err;
}
- fuse_invalidate_atime(inode);
- return link;
+
+ set_delayed_call(callback, page_put_link, page);
+
+ return page_address(page);
+
+out_err:
+ return ERR_PTR(err);
}
static int fuse_dir_open(struct inode *inode, struct file *file)
@@ -1662,8 +1464,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
file = NULL;
}
- if (attr->ia_valid & ATTR_SIZE)
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (WARN_ON(!S_ISREG(inode->i_mode)))
+ return -EIO;
is_truncate = true;
+ }
if (is_truncate) {
fuse_set_nowrite(inode);
@@ -1811,7 +1616,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat,
if (!fuse_allow_current_process(fc))
return -EACCES;
- return fuse_update_get_attr(inode, NULL, stat, flags);
+ return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
}
static const struct inode_operations fuse_dir_inode_operations = {
@@ -1867,11 +1672,37 @@ void fuse_init_common(struct inode *inode)
void fuse_init_dir(struct inode *inode)
{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
inode->i_op = &fuse_dir_inode_operations;
inode->i_fop = &fuse_dir_operations;
+
+ spin_lock_init(&fi->rdc.lock);
+ fi->rdc.cached = false;
+ fi->rdc.size = 0;
+ fi->rdc.pos = 0;
+ fi->rdc.version = 0;
}
+static int fuse_symlink_readpage(struct file *null, struct page *page)
+{
+ int err = fuse_readlink_page(page->mapping->host, page);
+
+ if (!err)
+ SetPageUptodate(page);
+
+ unlock_page(page);
+
+ return err;
+}
+
+static const struct address_space_operations fuse_symlink_aops = {
+ .readpage = fuse_symlink_readpage,
+};
+
void fuse_init_symlink(struct inode *inode)
{
inode->i_op = &fuse_symlink_inode_operations;
+ inode->i_data.a_ops = &fuse_symlink_aops;
+ inode_nohighmem(inode);
}
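
With fc->cache_symlinks set, the target is served from the page cache; the
assumed call flow through the generic helpers is:

    /*
     *   fuse_get_link()
     *     -> page_get_link()                generic VFS helper
     *       -> read_mapping_page()          on a cache miss
     *         -> fuse_symlink_readpage()
     *           -> fuse_readlink_page()     one FUSE_READLINK request
     *
     * Subsequent fuse_get_link() calls on the same inode are answered
     * from the cached page with no round trip to the server.
     */
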
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 32d0b883e74f..58dbc39fea63 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -59,6 +59,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
}
INIT_LIST_HEAD(&ff->write_entry);
+ mutex_init(&ff->readdir.lock);
refcount_set(&ff->count, 1);
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
@@ -73,6 +74,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
void fuse_file_free(struct fuse_file *ff)
{
fuse_request_free(ff->reserved_req);
+ mutex_destroy(&ff->readdir.lock);
kfree(ff);
}
@@ -848,11 +850,11 @@ static int fuse_readpages_fill(void *_data, struct page *page)
fuse_wait_on_page_writeback(inode, page->index);
if (req->num_pages &&
- (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+ (req->num_pages == fc->max_pages ||
(req->num_pages + 1) * PAGE_SIZE > fc->max_read ||
req->pages[req->num_pages - 1]->index + 1 != page->index)) {
- int nr_alloc = min_t(unsigned, data->nr_pages,
- FUSE_MAX_PAGES_PER_REQ);
+ unsigned int nr_alloc = min_t(unsigned int, data->nr_pages,
+ fc->max_pages);
fuse_send_readpages(req, data->file);
if (fc->async_read)
req = fuse_get_req_for_background(fc, nr_alloc);
@@ -887,7 +889,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_data data;
int err;
- int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
+ unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages);
err = -EIO;
if (is_bad_inode(inode))
@@ -1102,12 +1104,13 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
return count > 0 ? count : err;
}
-static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
+static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
+ unsigned int max_pages)
{
- return min_t(unsigned,
+ return min_t(unsigned int,
((pos + len - 1) >> PAGE_SHIFT) -
(pos >> PAGE_SHIFT) + 1,
- FUSE_MAX_PAGES_PER_REQ);
+ max_pages);
}
static ssize_t fuse_perform_write(struct kiocb *iocb,
@@ -1129,7 +1132,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
do {
struct fuse_req *req;
ssize_t count;
- unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
+ unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
+ fc->max_pages);
req = fuse_get_req(fc, nr_pages);
if (IS_ERR(req)) {
@@ -1319,11 +1323,6 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
return ret < 0 ? ret : 0;
}
-static inline int fuse_iter_npages(const struct iov_iter *ii_p)
-{
- return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
-}
-
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
loff_t *ppos, int flags)
{
@@ -1343,9 +1342,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
int err = 0;
if (io->async)
- req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
+ req = fuse_get_req_for_background(fc, iov_iter_npages(iter,
+ fc->max_pages));
else
- req = fuse_get_req(fc, fuse_iter_npages(iter));
+ req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages));
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1390,9 +1390,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
fuse_put_request(fc, req);
if (io->async)
req = fuse_get_req_for_background(fc,
- fuse_iter_npages(iter));
+ iov_iter_npages(iter, fc->max_pages));
else
- req = fuse_get_req(fc, fuse_iter_npages(iter));
+ req = fuse_get_req(fc, iov_iter_npages(iter,
+ fc->max_pages));
if (IS_ERR(req))
break;
}
@@ -1418,7 +1419,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
res = fuse_direct_io(io, iter, ppos, 0);
- fuse_invalidate_attr(inode);
+ fuse_invalidate_atime(inode);
return res;
}
@@ -1487,6 +1488,7 @@ __acquires(fc->lock)
struct fuse_inode *fi = get_fuse_inode(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
__u64 data_size = req->num_pages * PAGE_SIZE;
+ bool queued;
if (!fc->connected)
goto out_free;
@@ -1502,7 +1504,8 @@ __acquires(fc->lock)
req->in.args[1].size = inarg->size;
fi->writectr++;
- fuse_request_send_background_locked(fc, req);
+ queued = fuse_request_queue_background(fc, req);
+ WARN_ON(!queued);
return;
out_free:
@@ -1819,12 +1822,18 @@ static int fuse_writepages_fill(struct page *page,
is_writeback = fuse_page_is_writeback(inode, page->index);
if (req && req->num_pages &&
- (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+ (is_writeback || req->num_pages == fc->max_pages ||
(req->num_pages + 1) * PAGE_SIZE > fc->max_write ||
data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
fuse_writepages_send(data);
data->req = NULL;
+ } else if (req && req->num_pages == req->max_pages) {
+ if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) {
+ fuse_writepages_send(data);
+ req = data->req = NULL;
+ }
}
+
err = -ENOMEM;
tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!tmp_page)
@@ -1847,7 +1856,7 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_inode *fi = get_fuse_inode(inode);
err = -ENOMEM;
- req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+ req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES);
if (!req) {
__free_page(tmp_page);
goto out_unlock;
@@ -1904,6 +1913,7 @@ static int fuse_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_wb_data data;
int err;
@@ -1916,7 +1926,7 @@ static int fuse_writepages(struct address_space *mapping,
data.ff = NULL;
err = -ENOMEM;
- data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ,
+ data.orig_pages = kcalloc(fc->max_pages,
sizeof(struct page *),
GFP_NOFS);
if (!data.orig_pages)
@@ -2387,10 +2397,11 @@ static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
}
/* Make sure iov_length() won't overflow */
-static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
+static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
+ size_t count)
{
size_t n;
- u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+ u32 max = fc->max_pages << PAGE_SHIFT;
for (n = 0; n < count; n++, iov++) {
if (iov->iov_len > (size_t) max)
@@ -2514,7 +2525,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
err = -ENOMEM;
- pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
+ pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL);
iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
if (!pages || !iov_page)
goto out;
@@ -2553,7 +2564,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
/* make sure there are enough buffer pages and init request with them */
err = -ENOMEM;
- if (max_pages > FUSE_MAX_PAGES_PER_REQ)
+ if (max_pages > fc->max_pages)
goto out;
while (num_pages < max_pages) {
pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
@@ -2640,11 +2651,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
in_iov = iov_page;
out_iov = in_iov + in_iovs;
- err = fuse_verify_ioctl_iov(in_iov, in_iovs);
+ err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
if (err)
goto out;
- err = fuse_verify_ioctl_iov(out_iov, out_iovs);
+ err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
if (err)
goto out;
@@ -2835,9 +2846,9 @@ static void fuse_do_truncate(struct file *file)
fuse_do_setattr(file_dentry(file), &attr, file);
}
-static inline loff_t fuse_round_up(loff_t off)
+static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
{
- return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+ return round_up(off, fc->max_pages << PAGE_SHIFT);
}
static ssize_t
@@ -2866,7 +2877,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
if (offset >= i_size)
return 0;
- iov_iter_truncate(iter, fuse_round_up(i_size - offset));
+ iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
count = iov_iter_count(iter);
}
@@ -3011,6 +3022,82 @@ out:
return err;
}
+static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
+{
+ struct fuse_file *ff_in = file_in->private_data;
+ struct fuse_file *ff_out = file_out->private_data;
+ struct inode *inode_out = file_inode(file_out);
+ struct fuse_inode *fi_out = get_fuse_inode(inode_out);
+ struct fuse_conn *fc = ff_in->fc;
+ FUSE_ARGS(args);
+ struct fuse_copy_file_range_in inarg = {
+ .fh_in = ff_in->fh,
+ .off_in = pos_in,
+ .nodeid_out = ff_out->nodeid,
+ .fh_out = ff_out->fh,
+ .off_out = pos_out,
+ .len = len,
+ .flags = flags
+ };
+ struct fuse_write_out outarg;
+ ssize_t err;
+ /* mark unstable when write-back is not used, and file_out gets
+ * extended */
+ bool is_unstable = (!fc->writeback_cache) &&
+ ((pos_out + len) > inode_out->i_size);
+
+ if (fc->no_copy_file_range)
+ return -EOPNOTSUPP;
+
+ inode_lock(inode_out);
+
+ if (fc->writeback_cache) {
+ err = filemap_write_and_wait_range(inode_out->i_mapping,
+ pos_out, pos_out + len);
+ if (err)
+ goto out;
+
+ fuse_sync_writes(inode_out);
+ }
+
+ if (is_unstable)
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+
+ args.in.h.opcode = FUSE_COPY_FILE_RANGE;
+ args.in.h.nodeid = ff_in->nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS) {
+ fc->no_copy_file_range = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (err)
+ goto out;
+
+ if (fc->writeback_cache) {
+ fuse_write_update_size(inode_out, pos_out + outarg.size);
+ file_update_time(file_out);
+ }
+
+ fuse_invalidate_attr(inode_out);
+
+ err = outarg.size;
+out:
+ if (is_unstable)
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
+
+ inode_unlock(inode_out);
+
+ return err;
+}
+
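For context on the userspace half of this request: a passthrough-style server can satisfy FUSE_COPY_FILE_RANGE by delegating to the host kernel's copy_file_range(2) on its backing descriptors. Below is a minimal hedged sketch, assuming libfuse >= 3.3 and the common passthrough convention that fi->fh stores a backing file descriptor (as in libfuse's passthrough_fh example); it is not part of this patch.

#define _GNU_SOURCE
#define FUSE_USE_VERSION 31
#include <fuse.h>
#include <unistd.h>
#include <errno.h>

/* Hedged sketch, not from this patch: forward the copy to the host
 * kernel.  copy_file_range(2) updates off_in/off_out in place. */
static ssize_t xmp_copy_file_range(const char *path_in,
				   struct fuse_file_info *fi_in, off_t off_in,
				   const char *path_out,
				   struct fuse_file_info *fi_out, off_t off_out,
				   size_t len, int flags)
{
	ssize_t res;

	(void) path_in;
	(void) path_out;

	res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out,
			      len, flags);
	return res == -1 ? -errno : res;
}

If the server instead replies -ENOSYS, the kernel sets no_copy_file_range above and fails later calls with -EOPNOTSUPP without another round trip.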
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
.read_iter = fuse_file_read_iter,
@@ -3027,6 +3114,7 @@ static const struct file_operations fuse_file_operations = {
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
.fallocate = fuse_file_fallocate,
+ .copy_file_range = fuse_copy_file_range,
};
static const struct file_operations fuse_direct_io_file_operations = {
@@ -3062,6 +3150,14 @@ static const struct address_space_operations fuse_file_aops = {
void fuse_init_file_inode(struct inode *inode)
{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
inode->i_fop = &fuse_file_operations;
inode->i_data.a_ops = &fuse_file_aops;
+
+ INIT_LIST_HEAD(&fi->write_files);
+ INIT_LIST_HEAD(&fi->queued_writes);
+ fi->writectr = 0;
+ init_waitqueue_head(&fi->page_waitq);
+ INIT_LIST_HEAD(&fi->writepages);
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f78e9614bb5f..e9f712e81c7d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -28,8 +28,11 @@
#include <linux/refcount.h>
#include <linux/user_namespace.h>
-/** Max number of pages that can be used in a single read request */
-#define FUSE_MAX_PAGES_PER_REQ 32
+/** Default max number of pages that can be used in a single read request */
+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+/** Maximum of max_pages received in init_out */
+#define FUSE_MAX_MAX_PAGES 256
/** Bias for fi->writectr, meaning new writepages must not be sent */
#define FUSE_NOWRITE INT_MIN
@@ -77,6 +80,9 @@ struct fuse_inode {
/** Time in jiffies until the file attributes are valid */
u64 i_time;
+ /* Which attributes are invalid */
+ u32 inval_mask;
+
/** The sticky bit in inode->i_mode may have been removed, so
preserve the original mode */
umode_t orig_i_mode;
@@ -87,21 +93,51 @@ struct fuse_inode {
/** Version of last attribute change */
u64 attr_version;
- /** Files usable in writepage. Protected by fc->lock */
- struct list_head write_files;
+ union {
+ /* Write related fields (regular file only) */
+ struct {
+ /* Files usable in writepage. Protected by fc->lock */
+ struct list_head write_files;
+
+ /* Writepages pending on truncate or fsync */
+ struct list_head queued_writes;
- /** Writepages pending on truncate or fsync */
- struct list_head queued_writes;
+ /* Number of sent writes, a negative bias
+ * (FUSE_NOWRITE) means more writes are blocked */
+ int writectr;
+
+ /* Waitq for writepage completion */
+ wait_queue_head_t page_waitq;
+
+		/* List of writepage requests (pending or sent) */
+ struct list_head writepages;
+ };
+
+ /* readdir cache (directory only) */
+ struct {
+ /* true if fully cached */
+ bool cached;
- /** Number of sent writes, a negative bias (FUSE_NOWRITE)
- * means more writes are blocked */
- int writectr;
+ /* size of cache */
+ loff_t size;
- /** Waitq for writepage completion */
- wait_queue_head_t page_waitq;
+ /* position at end of cache (position of next entry) */
+ loff_t pos;
- /** List of writepage requestst (pending or sent) */
- struct list_head writepages;
+ /* version of the cache */
+ u64 version;
+
+ /* modification time of directory when cache was
+ * started */
+ struct timespec64 mtime;
+
+ /* iversion of directory when cache was started */
+ u64 iversion;
+
+ /* protects above fields */
+ spinlock_t lock;
+ } rdc;
+ };
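An inode is never both a regular file and a directory, so the writeback bookkeeping and the new readdir cache can overlay each other, and the struct grows only by the larger member. A standalone userspace model of the overlay (illustrative field types only, not kernel code):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Userspace model of the fuse_inode union above; the two states are
 * mutually exclusive, so they share storage. */
struct model_inode {
	uint64_t nodeid;
	union {
		struct {			/* regular file only */
			int writectr;
			void *write_files;
			void *queued_writes;
			void *writepages;
		};
		struct {			/* directory only */
			bool cached;
			int64_t size;
			int64_t pos;
			uint64_t version;
			uint64_t iversion;
		} rdc;
	};
};

int main(void)
{
	/* The union costs max(file state, dir state), not their sum. */
	printf("sizeof(struct model_inode) = %zu\n",
	       sizeof(struct model_inode));
	return 0;
}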
/** Miscellaneous bits describing inode state */
unsigned long state;
@@ -148,6 +184,25 @@ struct fuse_file {
/** Entry on inode's write_files list */
struct list_head write_entry;
+ /* Readdir related */
+ struct {
+ /*
+ * Protects below fields against (crazy) parallel readdir on
+ * same open file. Uncontended in the normal case.
+ */
+ struct mutex lock;
+
+ /* Dir stream position */
+ loff_t pos;
+
+ /* Offset in cache */
+ loff_t cache_off;
+
+ /* Version of cache we are reading */
+ u64 version;
+
+ } readdir;
+
/** RB node to be linked on fuse_conn->polled_files */
struct rb_node polled_node;
@@ -311,9 +366,6 @@ struct fuse_req {
/** refcount */
refcount_t count;
- /** Unique ID for the interrupt request */
- u64 intr_unique;
-
/* Request flags, updated with test/set/clear_bit() */
unsigned long flags;
@@ -411,6 +463,9 @@ struct fuse_iqueue {
struct fasync_struct *fasync;
};
+#define FUSE_PQ_HASH_BITS 8
+#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS)
+
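Replies are matched to in-flight requests by unique id, so replacing the single processing list with FUSE_PQ_HASH_SIZE buckets keeps that lookup cheap under load. A hedged sketch of the bucket selection (the exact hash helper in the final series may differ):

#include <linux/hash.h>
#include <linux/types.h>

/* Hedged sketch: pick a processing bucket from the request's unique
 * id; hash_long() spreads the monotonically increasing ids evenly
 * across the FUSE_PQ_HASH_SIZE buckets. */
static unsigned int fuse_req_hash(u64 unique)
{
	return hash_long(unique, FUSE_PQ_HASH_BITS);
}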
struct fuse_pqueue {
/** Connection established */
unsigned connected;
@@ -418,8 +473,8 @@ struct fuse_pqueue {
 	/** Lock protecting accesses to members of this structure */
spinlock_t lock;
- /** The list of requests being processed */
- struct list_head processing;
+ /** Hash table of requests being processed */
+ struct list_head *processing;
/** The list of requests under I/O */
struct list_head io;
@@ -476,6 +531,9 @@ struct fuse_conn {
/** Maximum write size */
unsigned max_write;
+	/** Maximum number of pages that can be used in a single request */
+ unsigned int max_pages;
+
/** Input queue */
struct fuse_iqueue iq;
@@ -500,6 +558,10 @@ struct fuse_conn {
/** The list of background requests set aside for later queuing */
struct list_head bg_queue;
+ /** Protects: max_background, congestion_threshold, num_background,
+ * active_background, bg_queue, blocked */
+ spinlock_t bg_lock;
+
/** Flag indicating that INIT reply has been received. Allocating
* any fuse request will be suspended until the flag is set */
int initialized;
@@ -551,6 +613,9 @@ struct fuse_conn {
/** handle fs handles killing suid/sgid/cap on write/chown/trunc */
unsigned handle_killpriv:1;
+ /** cache READLINK responses in page cache */
+ unsigned cache_symlinks:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -637,6 +702,9 @@ struct fuse_conn {
/** Allow other than the mounter user to access the filesystem ? */
unsigned allow_other:1;
+	/** Is copy_file_range not implemented by fs? */
+ unsigned no_copy_file_range:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -697,6 +765,11 @@ static inline u64 get_node_id(struct inode *inode)
return get_fuse_inode(inode)->nodeid;
}
+static inline int invalid_nodeid(u64 nodeid)
+{
+ return !nodeid || nodeid == FUSE_ROOT_ID;
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
@@ -812,6 +885,10 @@ struct fuse_req *fuse_request_alloc(unsigned npages);
struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
+bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
+ gfp_t flags);
+
/**
* Free a request
*/
@@ -856,9 +933,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
* Send a request in the background
*/
void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
-
-void fuse_request_send_background_locked(struct fuse_conn *fc,
- struct fuse_req *req);
+bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req);
/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
@@ -873,6 +948,9 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
void fuse_invalidate_atime(struct inode *inode);
+u64 entry_attr_timeout(struct fuse_entry_out *o);
+void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o);
+
/**
* Acquire reference to fuse_conn
*/
@@ -992,4 +1070,8 @@ struct posix_acl;
struct posix_acl *fuse_get_acl(struct inode *inode, int type);
int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+
+/* readdir.c */
+int fuse_readdir(struct file *file, struct dir_context *ctx);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index db9e60b7eb69..0b94b23b02d4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -90,16 +90,12 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi = get_fuse_inode(inode);
fi->i_time = 0;
+ fi->inval_mask = 0;
fi->nodeid = 0;
fi->nlookup = 0;
fi->attr_version = 0;
- fi->writectr = 0;
fi->orig_ino = 0;
fi->state = 0;
- INIT_LIST_HEAD(&fi->write_files);
- INIT_LIST_HEAD(&fi->queued_writes);
- INIT_LIST_HEAD(&fi->writepages);
- init_waitqueue_head(&fi->page_waitq);
mutex_init(&fi->mutex);
fi->forget = fuse_alloc_forget();
if (!fi->forget) {
@@ -119,8 +115,10 @@ static void fuse_i_callback(struct rcu_head *head)
static void fuse_destroy_inode(struct inode *inode)
{
struct fuse_inode *fi = get_fuse_inode(inode);
- BUG_ON(!list_empty(&fi->write_files));
- BUG_ON(!list_empty(&fi->queued_writes));
+ if (S_ISREG(inode->i_mode)) {
+ WARN_ON(!list_empty(&fi->write_files));
+ WARN_ON(!list_empty(&fi->queued_writes));
+ }
mutex_destroy(&fi->mutex);
kfree(fi->forget);
call_rcu(&inode->i_rcu, fuse_i_callback);
@@ -167,6 +165,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
fi->attr_version = ++fc->attr_version;
fi->i_time = attr_valid;
+ WRITE_ONCE(fi->inval_mask, 0);
inode->i_ino = fuse_squash_ino(attr->ino);
inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -594,9 +593,11 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq)
static void fuse_pqueue_init(struct fuse_pqueue *fpq)
{
- memset(fpq, 0, sizeof(struct fuse_pqueue));
+ unsigned int i;
+
spin_lock_init(&fpq->lock);
- INIT_LIST_HEAD(&fpq->processing);
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&fpq->processing[i]);
INIT_LIST_HEAD(&fpq->io);
fpq->connected = 1;
}
@@ -605,6 +606,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
{
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
+ spin_lock_init(&fc->bg_lock);
init_rwsem(&fc->killsb);
refcount_set(&fc->count, 1);
atomic_set(&fc->dev_count, 1);
@@ -852,6 +854,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
sanitize_global_limit(&max_user_bgreq);
sanitize_global_limit(&max_user_congthresh);
+ spin_lock(&fc->bg_lock);
if (arg->max_background) {
fc->max_background = arg->max_background;
@@ -865,6 +868,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
fc->congestion_threshold > max_user_congthresh)
fc->congestion_threshold = max_user_congthresh;
}
+ spin_unlock(&fc->bg_lock);
}
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
@@ -924,8 +928,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->posix_acl = 1;
fc->sb->s_xattr = fuse_acl_xattr_handlers;
}
+ if (arg->flags & FUSE_CACHE_SYMLINKS)
+ fc->cache_symlinks = 1;
if (arg->flags & FUSE_ABORT_ERROR)
fc->abort_err = 1;
+ if (arg->flags & FUSE_MAX_PAGES) {
+ fc->max_pages =
+ min_t(unsigned int, FUSE_MAX_MAX_PAGES,
+ max_t(unsigned int, arg->max_pages, 1));
+ }
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
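With FUSE_MAX_PAGES negotiated, a server can grow per-request payloads from the old 32-page limit (128 KiB with 4 KiB pages) up to FUSE_MAX_MAX_PAGES (256 pages, 1 MiB); the min_t()/max_t() pair above clamps whatever value the server advertises into [1, 256]. A hedged sketch of the server side of the INIT reply, assuming a uapi fuse.h that already defines the new field and flag (protocol 7.28+):

#include <linux/fuse.h>	/* struct fuse_init_out, FUSE_MAX_PAGES */

/* Hedged sketch, not from this patch: a raw-protocol server opting in
 * to larger requests. */
static void fill_init_reply(struct fuse_init_out *out)
{
	out->flags |= FUSE_MAX_PAGES;
	out->max_pages = 256;		/* kernel clamps to FUSE_MAX_MAX_PAGES */
	out->max_write = 256 * 4096;	/* keep the write size consistent */
}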
@@ -957,7 +968,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
- FUSE_ABORT_ERROR;
+ FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1022,17 +1033,26 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
{
struct fuse_dev *fud;
+ struct list_head *pq;
fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL);
- if (fud) {
- fud->fc = fuse_conn_get(fc);
- fuse_pqueue_init(&fud->pq);
+ if (!fud)
+ return NULL;
- spin_lock(&fc->lock);
- list_add_tail(&fud->entry, &fc->devices);
- spin_unlock(&fc->lock);
+ pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
+ if (!pq) {
+ kfree(fud);
+ return NULL;
}
+ fud->pq.processing = pq;
+ fud->fc = fuse_conn_get(fc);
+ fuse_pqueue_init(&fud->pq);
+
+ spin_lock(&fc->lock);
+ list_add_tail(&fud->entry, &fc->devices);
+ spin_unlock(&fc->lock);
+
return fud;
}
EXPORT_SYMBOL_GPL(fuse_dev_alloc);
@@ -1141,6 +1161,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fc->user_id = d.user_id;
fc->group_id = d.group_id;
fc->max_read = max_t(unsigned, 4096, d.max_read);
+ fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
/* Used by get_root_inode() */
sb->s_fs_info = fc;
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
new file mode 100644
index 000000000000..ab18b78f4755
--- /dev/null
+++ b/fs/fuse/readdir.c
@@ -0,0 +1,569 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2018 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+
+#include "fuse_i.h"
+#include <linux/iversion.h>
+#include <linux/posix_acl.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ if (!fc->do_readdirplus)
+ return false;
+ if (!fc->readdirplus_auto)
+ return true;
+ if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+ return true;
+ if (ctx->pos == 0)
+ return true;
+ return false;
+}
+
+static void fuse_add_dirent_to_cache(struct file *file,
+ struct fuse_dirent *dirent, loff_t pos)
+{
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
+ size_t reclen = FUSE_DIRENT_SIZE(dirent);
+ pgoff_t index;
+ struct page *page;
+ loff_t size;
+ u64 version;
+ unsigned int offset;
+ void *addr;
+
+ spin_lock(&fi->rdc.lock);
+ /*
+	 * Is the cache already complete?  Or does this entry not belong at
+	 * the current end of the cache?
+ */
+ if (fi->rdc.cached || pos != fi->rdc.pos) {
+ spin_unlock(&fi->rdc.lock);
+ return;
+ }
+ version = fi->rdc.version;
+ size = fi->rdc.size;
+ offset = size & ~PAGE_MASK;
+ index = size >> PAGE_SHIFT;
+ /* Dirent doesn't fit in current page? Jump to next page. */
+ if (offset + reclen > PAGE_SIZE) {
+ index++;
+ offset = 0;
+ }
+ spin_unlock(&fi->rdc.lock);
+
+ if (offset) {
+ page = find_lock_page(file->f_mapping, index);
+ } else {
+ page = find_or_create_page(file->f_mapping, index,
+ mapping_gfp_mask(file->f_mapping));
+ }
+ if (!page)
+ return;
+
+ spin_lock(&fi->rdc.lock);
+ /* Raced with another readdir */
+ if (fi->rdc.version != version || fi->rdc.size != size ||
+ WARN_ON(fi->rdc.pos != pos))
+ goto unlock;
+
+ addr = kmap_atomic(page);
+ if (!offset)
+ clear_page(addr);
+ memcpy(addr + offset, dirent, reclen);
+ kunmap_atomic(addr);
+ fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
+ fi->rdc.pos = dirent->off;
+unlock:
+ spin_unlock(&fi->rdc.lock);
+ unlock_page(page);
+ put_page(page);
+}
+
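The cache is simply the directory's page cache filled with packed fuse_dirent records, and an entry never straddles a page boundary, so rdc.size alone determines both the page index and the in-page offset for the next entry. A small userspace model of that placement rule (assumes 4096-byte pages, illustrative only):

#include <stdio.h>
#include <stdint.h>

#define MODEL_PAGE_SIZE 4096u

/* Userspace model of the placement rule above: entries are packed,
 * but an entry that would straddle a page boundary starts on the
 * next page instead. */
static void place(uint64_t size, unsigned int reclen,
		  uint64_t *index, unsigned int *offset)
{
	*offset = size % MODEL_PAGE_SIZE;
	*index = size / MODEL_PAGE_SIZE;
	if (*offset + reclen > MODEL_PAGE_SIZE) {
		(*index)++;
		*offset = 0;
	}
}

int main(void)
{
	uint64_t index;
	unsigned int offset;

	place(4090, 40, &index, &offset);	/* 40-byte dirent near a page end */
	printf("page %llu, offset %u\n", (unsigned long long)index, offset);
	return 0;
}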
+static void fuse_readdir_cache_end(struct file *file, loff_t pos)
+{
+ struct fuse_inode *fi = get_fuse_inode(file_inode(file));
+ loff_t end;
+
+ spin_lock(&fi->rdc.lock);
+ /* does cache end position match current position? */
+ if (fi->rdc.pos != pos) {
+ spin_unlock(&fi->rdc.lock);
+ return;
+ }
+
+ fi->rdc.cached = true;
+ end = ALIGN(fi->rdc.size, PAGE_SIZE);
+ spin_unlock(&fi->rdc.lock);
+
+ /* truncate unused tail of cache */
+ truncate_inode_pages(file->f_mapping, end);
+}
+
+static bool fuse_emit(struct file *file, struct dir_context *ctx,
+ struct fuse_dirent *dirent)
+{
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_CACHE_DIR)
+ fuse_add_dirent_to_cache(file, dirent, ctx->pos);
+
+ return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino,
+ dirent->type);
+}
+
+static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
+ struct dir_context *ctx)
+{
+ while (nbytes >= FUSE_NAME_OFFSET) {
+ struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
+ size_t reclen = FUSE_DIRENT_SIZE(dirent);
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
+
+ if (!fuse_emit(file, ctx, dirent))
+ break;
+
+ buf += reclen;
+ nbytes -= reclen;
+ ctx->pos = dirent->off;
+ }
+
+ return 0;
+}
+
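parse_dirfile() walks a byte stream of variable-length records from an untrusted server, which is why each iteration validates namelen, reclen, and the name before emitting anything. For reference, the record layout and size macros it relies on, paraphrased from uapi/linux/fuse.h (verify against your headers):

#include <stdint.h>
#include <stddef.h>

/* Paraphrased from uapi/linux/fuse.h: each readdir record is a
 * fuse_dirent, 8-byte aligned, with the name appended without a
 * terminating NUL. */
struct fuse_dirent {
	uint64_t ino;
	uint64_t off;		/* opaque cookie of the *next* entry */
	uint32_t namelen;
	uint32_t type;
	char name[];
};

#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
#define FUSE_DIRENT_ALIGN(x) \
	(((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
#define FUSE_DIRENT_SIZE(d) \
	FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)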
+static int fuse_direntplus_link(struct file *file,
+ struct fuse_direntplus *direntplus,
+ u64 attr_version)
+{
+ struct fuse_entry_out *o = &direntplus->entry_out;
+ struct fuse_dirent *dirent = &direntplus->dirent;
+ struct dentry *parent = file->f_path.dentry;
+ struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *dir = d_inode(parent);
+ struct fuse_conn *fc;
+ struct inode *inode;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+ if (!o->nodeid) {
+ /*
+ * Unlike in the case of fuse_lookup, zero nodeid does not mean
+ * ENOENT. Instead, it only means the userspace filesystem did
+ * not want to return attributes/handle for this entry.
+ *
+ * So do nothing.
+ */
+ return 0;
+ }
+
+ if (name.name[0] == '.') {
+ /*
+ * We could potentially refresh the attributes of the directory
+ * and its parent?
+ */
+ if (name.len == 1)
+ return 0;
+ if (name.name[1] == '.' && name.len == 2)
+ return 0;
+ }
+
+ if (invalid_nodeid(o->nodeid))
+ return -EIO;
+ if (!fuse_valid_type(o->attr.mode))
+ return -EIO;
+
+ fc = get_fuse_conn(dir);
+
+ name.hash = full_name_hash(parent, name.name, name.len);
+ dentry = d_lookup(parent, &name);
+ if (!dentry) {
+retry:
+ dentry = d_alloc_parallel(parent, &name, &wq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
+ if (!d_in_lookup(dentry)) {
+ struct fuse_inode *fi;
+ inode = d_inode(dentry);
+ if (!inode ||
+ get_node_id(inode) != o->nodeid ||
+ ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+ d_invalidate(dentry);
+ dput(dentry);
+ goto retry;
+ }
+ if (is_bad_inode(inode)) {
+ dput(dentry);
+ return -EIO;
+ }
+
+ fi = get_fuse_inode(inode);
+ spin_lock(&fc->lock);
+ fi->nlookup++;
+ spin_unlock(&fc->lock);
+
+ forget_all_cached_acls(inode);
+ fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+ /*
+ * The other branch comes via fuse_iget()
+ * which bumps nlookup inside
+ */
+ } else {
+ inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+ &o->attr, entry_attr_timeout(o),
+ attr_version);
+ if (!inode)
+ inode = ERR_PTR(-ENOMEM);
+
+ alias = d_splice_alias(inode, dentry);
+ d_lookup_done(dentry);
+ if (alias) {
+ dput(dentry);
+ dentry = alias;
+ }
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
+ if (fc->readdirplus_auto)
+ set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
+ fuse_change_entry_timeout(dentry, o);
+
+ dput(dentry);
+ return 0;
+}
+
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+ struct dir_context *ctx, u64 attr_version)
+{
+ struct fuse_direntplus *direntplus;
+ struct fuse_dirent *dirent;
+ size_t reclen;
+ int over = 0;
+ int ret;
+
+ while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+ direntplus = (struct fuse_direntplus *) buf;
+ dirent = &direntplus->dirent;
+ reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
+
+ if (!over) {
+			/*
+			 * We fill entries into dstbuf only as much as it can
+			 * hold.  But we still continue iterating over the
+			 * remaining entries to link them; otherwise we would
+			 * have to send a FORGET for each entry that we did
+			 * not link.
+			 */
+ over = !fuse_emit(file, ctx, dirent);
+ if (!over)
+ ctx->pos = dirent->off;
+ }
+
+ buf += reclen;
+ nbytes -= reclen;
+
+ ret = fuse_direntplus_link(file, direntplus, attr_version);
+ if (ret)
+ fuse_force_forget(file, direntplus->entry_out.nodeid);
+ }
+
+ return 0;
+}
+
+static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
+{
+ int plus, err;
+ size_t nbytes;
+ struct page *page;
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ u64 attr_version = 0;
+ bool locked;
+
+ req = fuse_get_req(fc, 1);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ fuse_put_request(fc, req);
+ return -ENOMEM;
+ }
+
+ plus = fuse_use_readdirplus(inode, ctx);
+ req->out.argpages = 1;
+ req->num_pages = 1;
+ req->pages[0] = page;
+ req->page_descs[0].length = PAGE_SIZE;
+ if (plus) {
+ attr_version = fuse_get_attr_version(fc);
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+ FUSE_READDIRPLUS);
+ } else {
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+ FUSE_READDIR);
+ }
+ locked = fuse_lock_inode(inode);
+ fuse_request_send(fc, req);
+ fuse_unlock_inode(inode, locked);
+ nbytes = req->out.args[0].size;
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+ if (!err) {
+ if (!nbytes) {
+ struct fuse_file *ff = file->private_data;
+
+ if (ff->open_flags & FOPEN_CACHE_DIR)
+ fuse_readdir_cache_end(file, ctx->pos);
+ } else if (plus) {
+ err = parse_dirplusfile(page_address(page), nbytes,
+ file, ctx, attr_version);
+ } else {
+ err = parse_dirfile(page_address(page), nbytes, file,
+ ctx);
+ }
+ }
+
+ __free_page(page);
+ fuse_invalidate_atime(inode);
+ return err;
+}
+
+enum fuse_parse_result {
+ FOUND_ERR = -1,
+ FOUND_NONE = 0,
+ FOUND_SOME,
+ FOUND_ALL,
+};
+
+static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
+ void *addr, unsigned int size,
+ struct dir_context *ctx)
+{
+ unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK;
+ enum fuse_parse_result res = FOUND_NONE;
+
+ WARN_ON(offset >= size);
+
+ for (;;) {
+ struct fuse_dirent *dirent = addr + offset;
+ unsigned int nbytes = size - offset;
+ size_t reclen = FUSE_DIRENT_SIZE(dirent);
+
+ if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen)
+ break;
+
+ if (WARN_ON(dirent->namelen > FUSE_NAME_MAX))
+ return FOUND_ERR;
+ if (WARN_ON(reclen > nbytes))
+ return FOUND_ERR;
+ if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL))
+ return FOUND_ERR;
+
+ if (ff->readdir.pos == ctx->pos) {
+ res = FOUND_SOME;
+ if (!dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->ino, dirent->type))
+ return FOUND_ALL;
+ ctx->pos = dirent->off;
+ }
+ ff->readdir.pos = dirent->off;
+ ff->readdir.cache_off += reclen;
+
+ offset += reclen;
+ }
+
+ return res;
+}
+
+static void fuse_rdc_reset(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ fi->rdc.cached = false;
+ fi->rdc.version++;
+ fi->rdc.size = 0;
+ fi->rdc.pos = 0;
+}
+
+#define UNCACHED 1
+
+static int fuse_readdir_cached(struct file *file, struct dir_context *ctx)
+{
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ enum fuse_parse_result res;
+ pgoff_t index;
+ unsigned int size;
+ struct page *page;
+ void *addr;
+
+	/* Has there been a seek?  If so, reset the cache stream. */
+ if (ff->readdir.pos != ctx->pos) {
+ ff->readdir.pos = 0;
+ ff->readdir.cache_off = 0;
+ }
+
+ /*
+	 * We're just about to start filling the cache or reading from it;
+	 * both cases require an up-to-date mtime value.
+ */
+ if (!ctx->pos && fc->auto_inval_data) {
+ int err = fuse_update_attributes(inode, file);
+
+ if (err)
+ return err;
+ }
+
+retry:
+ spin_lock(&fi->rdc.lock);
+retry_locked:
+ if (!fi->rdc.cached) {
+ /* Starting cache? Set cache mtime. */
+ if (!ctx->pos && !fi->rdc.size) {
+ fi->rdc.mtime = inode->i_mtime;
+ fi->rdc.iversion = inode_query_iversion(inode);
+ }
+ spin_unlock(&fi->rdc.lock);
+ return UNCACHED;
+ }
+ /*
+	 * When at the beginning of the directory (i.e. just after opendir(3)
+	 * or rewinddir(3)), we need to check whether directory contents have
+	 * changed, and reset the cache if so.
+ */
+ if (!ctx->pos) {
+ if (inode_peek_iversion(inode) != fi->rdc.iversion ||
+ !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) {
+ fuse_rdc_reset(inode);
+ goto retry_locked;
+ }
+ }
+
+ /*
+ * If cache version changed since the last getdents() call, then reset
+ * the cache stream.
+ */
+ if (ff->readdir.version != fi->rdc.version) {
+ ff->readdir.pos = 0;
+ ff->readdir.cache_off = 0;
+ }
+ /*
+	 * If at the beginning of the cache, then reset the version to the
+	 * current one.
+ */
+ if (ff->readdir.pos == 0)
+ ff->readdir.version = fi->rdc.version;
+
+ WARN_ON(fi->rdc.size < ff->readdir.cache_off);
+
+ index = ff->readdir.cache_off >> PAGE_SHIFT;
+
+ if (index == (fi->rdc.size >> PAGE_SHIFT))
+ size = fi->rdc.size & ~PAGE_MASK;
+ else
+ size = PAGE_SIZE;
+ spin_unlock(&fi->rdc.lock);
+
+ /* EOF? */
+ if ((ff->readdir.cache_off & ~PAGE_MASK) == size)
+ return 0;
+
+ page = find_get_page_flags(file->f_mapping, index,
+ FGP_ACCESSED | FGP_LOCK);
+ spin_lock(&fi->rdc.lock);
+ if (!page) {
+ /*
+ * Uh-oh: page gone missing, cache is useless
+ */
+ if (fi->rdc.version == ff->readdir.version)
+ fuse_rdc_reset(inode);
+ goto retry_locked;
+ }
+
+ /* Make sure it's still the same version after getting the page. */
+ if (ff->readdir.version != fi->rdc.version) {
+ spin_unlock(&fi->rdc.lock);
+ unlock_page(page);
+ put_page(page);
+ goto retry;
+ }
+ spin_unlock(&fi->rdc.lock);
+
+ /*
+ * Contents of the page are now protected against changing by holding
+ * the page lock.
+ */
+ addr = kmap(page);
+ res = fuse_parse_cache(ff, addr, size, ctx);
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+
+ if (res == FOUND_ERR)
+ return -EIO;
+
+ if (res == FOUND_ALL)
+ return 0;
+
+ if (size == PAGE_SIZE) {
+ /* We hit end of page: skip to next page. */
+ ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE);
+ goto retry;
+ }
+
+ /*
+	 * End of cache reached.  If we found the position, we are done;
+	 * otherwise fall back to uncached readdir, since the position we
+	 * were looking for wasn't in the cache.
+ */
+ return res == FOUND_SOME ? 0 : UNCACHED;
+}
+
+int fuse_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+ int err;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ mutex_lock(&ff->readdir.lock);
+
+ err = UNCACHED;
+ if (ff->open_flags & FOPEN_CACHE_DIR)
+ err = fuse_readdir_cached(file, ctx);
+ if (err == UNCACHED)
+ err = fuse_readdir_uncached(file, ctx);
+
+ mutex_unlock(&ff->readdir.lock);
+
+ return err;
+}
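Caching is opt-in per open: fuse_readdir() consults the cache only when the server set FOPEN_CACHE_DIR in its OPENDIR reply, and falls back to the uncached path otherwise. A hedged sketch of the server side, assuming a uapi fuse.h new enough to define the flag:

#include <stdint.h>
#include <linux/fuse.h>	/* struct fuse_open_out, FOPEN_CACHE_DIR */

/* Hedged sketch, not from this patch: an OPENDIR reply opting in to
 * the readdir cache.  Cache validity across opens is checked above
 * via mtime/i_version, not by this flag. */
static void fill_opendir_reply(struct fuse_open_out *out, uint64_t fh)
{
	out->fh = fh;
	out->open_flags |= FOPEN_CACHE_DIR;
}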
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 31e8270d0b26..8afbb35559b9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -366,7 +366,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9a8772465a90..896396554bcc 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -425,6 +425,10 @@ skip:
if (new_node) {
__be32 cnid;
+ if (!new_node->parent) {
+ hfs_btree_inc_height(tree);
+ new_node->parent = tree->root;
+ }
fd->bnode = hfs_bnode_find(tree, new_node->parent);
/* create index key and entry */
hfs_bnode_read_key(new_node, fd->search_key, 14);
@@ -441,6 +445,7 @@ skip:
/* restore search_key */
hfs_bnode_read_key(node, fd->search_key, 14);
}
+ new_node = NULL;
}
if (!rec && node->parent)
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 374b5688e29e..98b96ffb95ed 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -220,25 +220,17 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
return node;
}
-struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+/* Make sure @tree has enough space for @rsvd_nodes new nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
{
- struct hfs_bnode *node, *next_node;
- struct page **pagep;
- u32 nidx, idx;
- unsigned off;
- u16 off16;
- u16 len;
- u8 *data, byte, m;
- int i;
-
- while (!tree->free_nodes) {
- struct inode *inode = tree->inode;
- u32 count;
- int res;
+ struct inode *inode = tree->inode;
+ u32 count;
+ int res;
+ while (tree->free_nodes < rsvd_nodes) {
res = hfs_extend_file(inode);
if (res)
- return ERR_PTR(res);
+ return res;
HFS_I(inode)->phys_size = inode->i_size =
(loff_t)HFS_I(inode)->alloc_blocks *
HFS_SB(tree->sb)->alloc_blksz;
@@ -246,9 +238,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
tree->sb->s_blocksize_bits;
inode_set_bytes(inode, inode->i_size);
count = inode->i_size >> tree->node_size_shift;
- tree->free_nodes = count - tree->node_count;
+ tree->free_nodes += count - tree->node_count;
tree->node_count = count;
}
+ return 0;
+}
+
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+ struct hfs_bnode *node, *next_node;
+ struct page **pagep;
+ u32 nidx, idx;
+ unsigned off;
+ u16 off16;
+ u16 len;
+ u8 *data, byte, m;
+ int i, res;
+
+ res = hfs_bmap_reserve(tree, 1);
+ if (res)
+ return ERR_PTR(res);
nidx = 0;
node = hfs_bnode_find(tree, nidx);
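The pattern callers follow with the new helper: reserve the worst-case number of b-tree nodes before touching any records, so a multi-node update can no longer hit ENOSPC halfway through (the catalog, extent, and attribute changes below all do this). A hedged usage sketch, not taken from the patch (assumes fs/hfs's local btree.h):

/* Hedged usage sketch: extend the tree file up front so the record
 * operations below cannot fail with ENOSPC partway through. */
static int example_create(struct hfs_find_data *fd)
{
	int err;

	/* worst case: a split at every level, plus one root split */
	err = hfs_bmap_reserve(fd->tree, 2 * fd->tree->depth);
	if (err)
		return err;	/* fail early: nothing has been modified */

	/*
	 * ... hfs_brec_insert()/hfs_brec_remove() now draw nodes from the
	 * reserved pool via hfs_bmap_alloc() ...
	 */
	return 0;
}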
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index c8b252dbb26c..dcc2aab1b2c4 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -82,6 +82,7 @@ struct hfs_find_data {
extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp);
extern void hfs_btree_close(struct hfs_btree *);
extern void hfs_btree_write(struct hfs_btree *);
+extern int hfs_bmap_reserve(struct hfs_btree *, int);
extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *);
extern void hfs_bmap_free(struct hfs_bnode *node);
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 8a66405b0f8b..d365bf0b8c77 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -97,6 +97,14 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+ if (err)
+ goto err2;
+
hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
HFS_CDR_THD : HFS_CDR_FTH,
@@ -295,6 +303,14 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
return err;
dst_fd = src_fd;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(src_fd.tree, 2 * src_fd.tree->depth);
+ if (err)
+ goto out;
+
/* find the old dir entry and read the data */
hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
err = hfs_brec_find(&src_fd);
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 5d0182654580..263d5028d9d1 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -117,6 +117,10 @@ static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) {
if (res != -ENOENT)
return res;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+ if (res)
+ return res;
hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec));
HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
} else {
@@ -300,7 +304,7 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type)
return 0;
blocks = 0;
- for (i = 0; i < 3; extent++, i++)
+ for (i = 0; i < 3; i++)
blocks += be16_to_cpu(extent[i].count);
res = hfs_free_extents(sb, extent, blocks, blocks);
@@ -341,7 +345,9 @@ int hfs_get_block(struct inode *inode, sector_t block,
ablock = (u32)block / HFS_SB(sb)->fs_div;
if (block >= HFS_I(inode)->fs_blocks) {
- if (block > HFS_I(inode)->fs_blocks || !create)
+ if (!create)
+ return 0;
+ if (block > HFS_I(inode)->fs_blocks)
return -EIO;
if (ablock >= HFS_I(inode)->alloc_blocks) {
res = hfs_extend_file(inode);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a2dfa1b2a89c..da243c84e93b 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -642,6 +642,8 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
truncate_setsize(inode, attr->ia_size);
hfs_file_truncate(inode);
+ inode->i_atime = inode->i_mtime = inode->i_ctime =
+ current_time(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 2bab6b3cdba4..e6d554476db4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -217,6 +217,11 @@ int hfsplus_create_attr(struct inode *inode,
if (err)
goto failed_init_create_attr;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ err = hfs_bmap_reserve(fd.tree, fd.tree->depth + 1);
+ if (err)
+ goto failed_create_attr;
+
if (name) {
err = hfsplus_attr_build_key(sb, fd.search_key,
inode->i_ino, name);
@@ -313,6 +318,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name)
if (err)
return err;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ err = hfs_bmap_reserve(fd.tree, fd.tree->depth);
+ if (err)
+ goto out;
+
if (name) {
err = hfsplus_attr_build_key(sb, fd.search_key,
inode->i_ino, name);
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index ed8eacb34452..1918544a7871 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -429,6 +429,10 @@ skip:
if (new_node) {
__be32 cnid;
+ if (!new_node->parent) {
+ hfs_btree_inc_height(tree);
+ new_node->parent = tree->root;
+ }
fd->bnode = hfs_bnode_find(tree, new_node->parent);
/* create index key and entry */
hfs_bnode_read_key(new_node, fd->search_key, 14);
@@ -445,6 +449,7 @@ skip:
/* restore search_key */
hfs_bnode_read_key(node, fd->search_key, 14);
}
+ new_node = NULL;
}
if (!rec && node->parent)
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index de14b2b6881b..236efe51eca6 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -342,26 +342,21 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
return node;
}
-struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+/* Make sure @tree has enough space for @rsvd_nodes new nodes */
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes)
{
- struct hfs_bnode *node, *next_node;
- struct page **pagep;
- u32 nidx, idx;
- unsigned off;
- u16 off16;
- u16 len;
- u8 *data, byte, m;
- int i;
+ struct inode *inode = tree->inode;
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ u32 count;
+ int res;
- while (!tree->free_nodes) {
- struct inode *inode = tree->inode;
- struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
- u32 count;
- int res;
+ if (rsvd_nodes <= 0)
+ return 0;
+ while (tree->free_nodes < rsvd_nodes) {
res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree));
if (res)
- return ERR_PTR(res);
+ return res;
hip->phys_size = inode->i_size =
(loff_t)hip->alloc_blocks <<
HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
@@ -369,9 +364,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
inode_set_bytes(inode, inode->i_size);
count = inode->i_size >> tree->node_size_shift;
- tree->free_nodes = count - tree->node_count;
+ tree->free_nodes += count - tree->node_count;
tree->node_count = count;
}
+ return 0;
+}
+
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
+{
+ struct hfs_bnode *node, *next_node;
+ struct page **pagep;
+ u32 nidx, idx;
+ unsigned off;
+ u16 off16;
+ u16 len;
+ u8 *data, byte, m;
+ int i, res;
+
+ res = hfs_bmap_reserve(tree, 1);
+ if (res)
+ return ERR_PTR(res);
nidx = 0;
node = hfs_bnode_find(tree, nidx);
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index a196369ba779..35472cba750e 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -265,6 +265,14 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
+ if (err)
+ goto err2;
+
hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid);
entry_size = hfsplus_fill_cat_thread(sb, &entry,
S_ISDIR(inode->i_mode) ?
@@ -333,6 +341,14 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
if (err)
return err;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most once.
+ */
+ err = hfs_bmap_reserve(fd.tree, 2 * (int)fd.tree->depth - 2);
+ if (err)
+ goto out;
+
if (!str) {
int len;
@@ -433,6 +449,14 @@ int hfsplus_rename_cat(u32 cnid,
return err;
dst_fd = src_fd;
+ /*
+ * Fail early and avoid ENOSPC during the btree operations. We may
+ * have to split the root node at most twice.
+ */
+ err = hfs_bmap_reserve(src_fd.tree, 4 * (int)src_fd.tree->depth - 1);
+ if (err)
+ goto out;
+
/* find the old dir entry and read the data */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
src_dir->i_ino, src_name);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 8e0f59767694..a930ddd15681 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -100,6 +100,10 @@ static int __hfsplus_ext_write_extent(struct inode *inode,
if (hip->extent_state & HFSPLUS_EXT_NEW) {
if (res != -ENOENT)
return res;
+ /* Fail early and avoid ENOSPC during the btree operation */
+ res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1);
+ if (res)
+ return res;
hfs_brec_insert(fd, hip->cached_extents,
sizeof(hfsplus_extent_rec));
hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
@@ -233,7 +237,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
ablock = iblock >> sbi->fs_shift;
if (iblock >= hip->fs_blocks) {
- if (iblock > hip->fs_blocks || !create)
+ if (!create)
+ return 0;
+ if (iblock > hip->fs_blocks)
return -EIO;
if (ablock >= hip->alloc_blocks) {
res = hfsplus_file_extend(inode, false);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 8e039435958a..dd7ad9f13e3a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -311,6 +311,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
#define hfs_btree_open hfsplus_btree_open
#define hfs_btree_close hfsplus_btree_close
#define hfs_btree_write hfsplus_btree_write
+#define hfs_bmap_reserve hfsplus_bmap_reserve
#define hfs_bmap_alloc hfsplus_bmap_alloc
#define hfs_bmap_free hfsplus_bmap_free
#define hfs_bnode_read hfsplus_bnode_read
@@ -395,6 +396,7 @@ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors,
struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id);
void hfs_btree_close(struct hfs_btree *tree);
int hfs_btree_write(struct hfs_btree *tree);
+int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes);
struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree);
void hfs_bmap_free(struct hfs_bnode *node);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8e9427a42b81..d7ab9d8c4b67 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -261,6 +261,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
}
truncate_setsize(inode, attr->ia_size);
hfsplus_file_truncate(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/inode.c b/fs/inode.c
index 42f6d25f32a5..9e198f00b64c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -10,7 +10,7 @@
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
@@ -349,7 +349,7 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
- INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT);
+ xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 947ce22f5b3c..f0fe641893a5 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -46,7 +46,7 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod
return i;
}
-/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */
+/* Acorn extensions written by Matthew Wilcox <willy@infradead.org> 1998 */
int get_acorn_filename(struct iso_directory_record *de,
char *retname, struct inode *inode)
{
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index d35cd6be0675..93fb7cf0b92b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -341,7 +341,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
};
struct lockd_net *ln = net_generic(net, lockd_net_id);
- dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
+ dprintk("lockd: %s(host='%.*s', vers=%u, proto=%s)\n", __func__,
(int)hostname_len, hostname, rqstp->rq_vers,
(rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
diff --git a/fs/namespace.c b/fs/namespace.c
index d86830c86ce8..98d27da43304 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -23,7 +23,7 @@
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 06cb0c1d9aee..d3781cd983f6 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -896,7 +896,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (end != inode->i_mapping->nrpages) {
rcu_read_lock();
- end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
+ end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
}
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 060c658eab66..a7d3df85736d 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -65,6 +65,7 @@ struct nfs_dns_ent {
struct sockaddr_storage addr;
size_t addrlen;
+ struct rcu_head rcu_head;
};
@@ -101,15 +102,23 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
}
}
-static void nfs_dns_ent_put(struct kref *ref)
+static void nfs_dns_ent_free_rcu(struct rcu_head *head)
{
struct nfs_dns_ent *item;
- item = container_of(ref, struct nfs_dns_ent, h.ref);
+ item = container_of(head, struct nfs_dns_ent, rcu_head);
kfree(item->hostname);
kfree(item);
}
+static void nfs_dns_ent_put(struct kref *ref)
+{
+ struct nfs_dns_ent *item;
+
+ item = container_of(ref, struct nfs_dns_ent, h.ref);
+ call_rcu(&item->rcu_head, nfs_dns_ent_free_rcu);
+}
+
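The lookup side (sunrpc_cache_lookup_rcu below) now traverses the hash chain without holding the cache lock, so an entry must remain valid for a grace period after its last reference drops. nfs_dns_ent owns a separately allocated hostname, hence the explicit call_rcu() callback here rather than the kfree_rcu() shortcut used by the single-allocation entries elsewhere in this series. The pattern in isolation, as a minimal sketch:

#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Minimal sketch: an object with a second allocation cannot use
 * kfree_rcu() directly; a callback frees both parts after the grace
 * period has elapsed. */
struct two_part {
	char *name;			/* separate allocation */
	struct rcu_head rcu;
};

static void two_part_free_rcu(struct rcu_head *head)
{
	struct two_part *p = container_of(head, struct two_part, rcu);

	kfree(p->name);
	kfree(p);
}

/* release path: call_rcu(&p->rcu, two_part_free_rcu); */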
static struct cache_head *nfs_dns_ent_alloc(void)
{
struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
@@ -195,7 +204,7 @@ static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
{
struct cache_head *ch;
- ch = sunrpc_cache_lookup(cd,
+ ch = sunrpc_cache_lookup_rcu(cd,
&key->h,
nfs_dns_hash(key));
if (!ch)
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b7559c6f2b97..4a98537efb0f 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -19,18 +19,22 @@
* is much larger than a sockaddr_in6.
*/
struct svc_cacherep {
- struct list_head c_lru;
+ struct {
+ /* Keep often-read xid, csum in the same cache line: */
+ __be32 k_xid;
+ __wsum k_csum;
+ u32 k_proc;
+ u32 k_prot;
+ u32 k_vers;
+ unsigned int k_len;
+ struct sockaddr_in6 k_addr;
+ } c_key;
+ struct rb_node c_node;
+ struct list_head c_lru;
unsigned char c_state, /* unused, inprog, done */
c_type, /* status, buffer */
c_secure : 1; /* req came from port < 1024 */
- struct sockaddr_in6 c_addr;
- __be32 c_xid;
- u32 c_prot;
- u32 c_proc;
- u32 c_vers;
- unsigned int c_len;
- __wsum c_csum;
unsigned long c_timestamp;
union {
struct kvec u_vec;
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a1143f7c2201..802993d8912f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -46,7 +46,7 @@ static void expkey_put(struct kref *ref)
!test_bit(CACHE_NEGATIVE, &key->h.flags))
path_put(&key->ek_path);
auth_domain_put(key->ek_client);
- kfree(key);
+ kfree_rcu(key, ek_rcu);
}
static void expkey_request(struct cache_detail *cd,
@@ -265,7 +265,7 @@ svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item)
struct cache_head *ch;
int hash = svc_expkey_hash(item);
- ch = sunrpc_cache_lookup(cd, &item->h, hash);
+ ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash);
if (ch)
return container_of(ch, struct svc_expkey, h);
else
@@ -314,7 +314,7 @@ static void svc_export_put(struct kref *ref)
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
kfree(exp->ex_uuid);
- kfree(exp);
+ kfree_rcu(exp, ex_rcu);
}
static void svc_export_request(struct cache_detail *cd,
@@ -780,7 +780,7 @@ svc_export_lookup(struct svc_export *exp)
struct cache_head *ch;
int hash = svc_export_hash(exp);
- ch = sunrpc_cache_lookup(exp->cd, &exp->h, hash);
+ ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash);
if (ch)
return container_of(ch, struct svc_export, h);
else
@@ -1216,9 +1216,9 @@ static int e_show(struct seq_file *m, void *p)
}
const struct seq_operations nfs_exports_op = {
- .start = cache_seq_start,
- .next = cache_seq_next,
- .stop = cache_seq_stop,
+ .start = cache_seq_start_rcu,
+ .next = cache_seq_next_rcu,
+ .stop = cache_seq_stop_rcu,
.show = e_show,
};
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index c8b74126ddaa..e7daa1f246f0 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -61,6 +61,7 @@ struct svc_export {
u32 ex_layout_types;
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
+ struct rcu_head ex_rcu;
};
 /* an "export key" (expkey) maps a filehandle fragment to an
@@ -75,6 +76,7 @@ struct svc_expkey {
u32 ek_fsid[6];
struct path ek_path;
+ struct rcu_head ek_rcu;
};
#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC))
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 426f55005697..32cb8c027483 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -123,6 +123,14 @@ struct nfsd_net {
wait_queue_head_t ntf_wq;
atomic_t ntf_refcnt;
+
+ /*
+ * clientid and stateid data for construction of net unique COPY
+ * stateids.
+ */
+ u32 s2s_cp_cl_id;
+ struct idr s2s_cp_stateids;
+ spinlock_t s2s_cp_lock;
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 601bf33c26a0..25987bcdf96f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -39,6 +39,7 @@
#include "state.h"
#include "netns.h"
#include "xdr4cb.h"
+#include "xdr4.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -105,6 +106,7 @@ enum nfs_cb_opnum4 {
OP_CB_WANTS_CANCELLED = 12,
OP_CB_NOTIFY_LOCK = 13,
OP_CB_NOTIFY_DEVICEID = 14,
+ OP_CB_OFFLOAD = 15,
OP_CB_ILLEGAL = 10044
};
@@ -683,6 +685,101 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
}
/*
+ * struct write_response4 {
+ * stateid4 wr_callback_id<1>;
+ * length4 wr_count;
+ * stable_how4 wr_committed;
+ * verifier4 wr_writeverf;
+ * };
+ * union offload_info4 switch (nfsstat4 coa_status) {
+ * case NFS4_OK:
+ * write_response4 coa_resok4;
+ * default:
+ * length4 coa_bytes_copied;
+ * };
+ * struct CB_OFFLOAD4args {
+ * nfs_fh4 coa_fh;
+ * stateid4 coa_stateid;
+ * offload_info4 coa_offload_info;
+ * };
+ */
+static void encode_offload_info4(struct xdr_stream *xdr,
+ __be32 nfserr,
+ const struct nfsd4_copy *cp)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ *p++ = nfserr;
+ if (!nfserr) {
+ p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+ p = xdr_encode_empty_array(p);
+ p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written);
+ *p++ = cpu_to_be32(cp->cp_res.wr_stable_how);
+ p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data,
+ NFS4_VERIFIER_SIZE);
+ } else {
+ p = xdr_reserve_space(xdr, 8);
+ /* We always return success if bytes were written */
+ p = xdr_encode_hyper(p, 0);
+ }
+}
+
+static void encode_cb_offload4args(struct xdr_stream *xdr,
+ __be32 nfserr,
+ const struct knfsd_fh *fh,
+ const struct nfsd4_copy *cp,
+ struct nfs4_cb_compound_hdr *hdr)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ *p++ = cpu_to_be32(OP_CB_OFFLOAD);
+ encode_nfs_fh4(xdr, fh);
+ encode_stateid4(xdr, &cp->cp_res.cb_stateid);
+ encode_offload_info4(xdr, nfserr, cp);
+
+ hdr->nops++;
+}
+
+static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfsd4_callback *cb = data;
+ const struct nfsd4_copy *cp =
+ container_of(cb, struct nfsd4_copy, cp_cb);
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = 0,
+ .minorversion = cb->cb_clp->cl_minorversion,
+ };
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr);
+ encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfsd4_callback *cb = data;
+ struct nfs4_cb_compound_hdr hdr;
+ int status;
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ return status;
+
+ if (cb) {
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+ }
+ return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status);
+}
+/*
* RPC procedure tables
*/
#define PROC(proc, call, argtype, restype) \
@@ -703,6 +800,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
#endif
PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock),
+ PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload),
};
static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a5bb76593ce7..bf137fec33ff 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -65,6 +65,7 @@ struct ent {
u32 id;
char name[IDMAP_NAMESZ];
char authname[IDMAP_NAMESZ];
+ struct rcu_head rcu_head;
};
/* Common entry handling */
@@ -89,7 +90,7 @@ static void
ent_put(struct kref *ref)
{
struct ent *map = container_of(ref, struct ent, h.ref);
- kfree(map);
+ kfree_rcu(map, rcu_head);
}
static struct cache_head *
@@ -264,8 +265,8 @@ out:
static struct ent *
idtoname_lookup(struct cache_detail *cd, struct ent *item)
{
- struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
- idtoname_hash(item));
+ struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+ idtoname_hash(item));
if (ch)
return container_of(ch, struct ent, h);
else
@@ -422,8 +423,8 @@ out:
static struct ent *
nametoid_lookup(struct cache_detail *cd, struct ent *item)
{
- struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h,
- nametoid_hash(item));
+ struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h,
+ nametoid_hash(item));
if (ch)
return container_of(ch, struct ent, h);
else
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b7bc6e1a85ac..edff074d38c7 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -36,6 +36,7 @@
#include <linux/file.h>
#include <linux/falloc.h>
#include <linux/slab.h>
+#include <linux/kthread.h>
#include "idmap.h"
#include "cache.h"
@@ -1089,36 +1090,254 @@ out:
return status;
}
+void nfs4_put_copy(struct nfsd4_copy *copy)
+{
+ if (!refcount_dec_and_test(&copy->refcount))
+ return;
+ kfree(copy);
+}
+
+static bool
+check_and_set_stop_copy(struct nfsd4_copy *copy)
+{
+ bool value;
+
+ spin_lock(&copy->cp_clp->async_lock);
+ value = copy->stopped;
+ if (!copy->stopped)
+ copy->stopped = true;
+ spin_unlock(&copy->cp_clp->async_lock);
+ return value;
+}
+
+static void nfsd4_stop_copy(struct nfsd4_copy *copy)
+{
+ /* only 1 thread should stop the copy */
+ if (!check_and_set_stop_copy(copy))
+ kthread_stop(copy->copy_task);
+ nfs4_put_copy(copy);
+}
+
+static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp)
+{
+ struct nfsd4_copy *copy = NULL;
+
+ spin_lock(&clp->async_lock);
+ if (!list_empty(&clp->async_copies)) {
+ copy = list_first_entry(&clp->async_copies, struct nfsd4_copy,
+ copies);
+ refcount_inc(&copy->refcount);
+ }
+ spin_unlock(&clp->async_lock);
+ return copy;
+}
+
+void nfsd4_shutdown_copy(struct nfs4_client *clp)
+{
+ struct nfsd4_copy *copy;
+
+ while ((copy = nfsd4_get_copy(clp)) != NULL)
+ nfsd4_stop_copy(copy);
+}
+
+static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
+{
+ struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb);
+
+ nfs4_put_copy(copy);
+}
+
+static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
+ struct rpc_task *task)
+{
+ return 1;
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
+ .release = nfsd4_cb_offload_release,
+ .done = nfsd4_cb_offload_done
+};
+
+static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
+{
+ copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+ copy->cp_synchronous = sync;
+ gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
+}
+
+static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
+{
+ ssize_t bytes_copied = 0;
+ size_t bytes_total = copy->cp_count;
+ u64 src_pos = copy->cp_src_pos;
+ u64 dst_pos = copy->cp_dst_pos;
+
+ do {
+ if (kthread_should_stop())
+ break;
+ bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos,
+ copy->file_dst, dst_pos, bytes_total);
+ if (bytes_copied <= 0)
+ break;
+ bytes_total -= bytes_copied;
+ copy->cp_res.wr_bytes_written += bytes_copied;
+ src_pos += bytes_copied;
+ dst_pos += bytes_copied;
+ } while (bytes_total > 0 && !copy->cp_synchronous);
+ return bytes_copied;
+}
+
+static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
+{
+ __be32 status;
+ ssize_t bytes;
+
+ bytes = _nfsd_copy_file_range(copy);
+	/*
+	 * For an async copy, ignore a late error as long as some bytes were
+	 * written; the client can always retry to retrieve the error.
+	 */
+ if (bytes < 0 && !copy->cp_res.wr_bytes_written)
+ status = nfserrno(bytes);
+ else {
+ nfsd4_init_copy_res(copy, sync);
+ status = nfs_ok;
+ }
+
+ fput(copy->file_src);
+ fput(copy->file_dst);
+ return status;
+}
+
+static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
+{
+ dst->cp_src_pos = src->cp_src_pos;
+ dst->cp_dst_pos = src->cp_dst_pos;
+ dst->cp_count = src->cp_count;
+ dst->cp_synchronous = src->cp_synchronous;
+ memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
+ memcpy(&dst->fh, &src->fh, sizeof(src->fh));
+ dst->cp_clp = src->cp_clp;
+ dst->file_dst = get_file(src->file_dst);
+ dst->file_src = get_file(src->file_src);
+ memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
+}
+
+static void cleanup_async_copy(struct nfsd4_copy *copy)
+{
+ nfs4_free_cp_state(copy);
+ fput(copy->file_dst);
+ fput(copy->file_src);
+ spin_lock(&copy->cp_clp->async_lock);
+ list_del(&copy->copies);
+ spin_unlock(&copy->cp_clp->async_lock);
+ nfs4_put_copy(copy);
+}
+
+static int nfsd4_do_async_copy(void *data)
+{
+ struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
+ struct nfsd4_copy *cb_copy;
+
+ copy->nfserr = nfsd4_do_copy(copy, 0);
+ cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+ if (!cb_copy)
+ goto out;
+ memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
+ cb_copy->cp_clp = copy->cp_clp;
+ cb_copy->nfserr = copy->nfserr;
+ memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh));
+ nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp,
+ &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD);
+ nfsd4_run_cb(&cb_copy->cp_cb);
+out:
+ cleanup_async_copy(copy);
+ return 0;
+}
+
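Each asynchronous COPY runs in its own kernel thread: created stopped, published on the client's async_copies list, then woken, so that OFFLOAD_CANCEL or client shutdown can kthread_stop() it while the worker polls kthread_should_stop() between chunks. The lifecycle in isolation, as a hedged minimal sketch:

#include <linux/kthread.h>
#include <linux/err.h>

/* Hedged sketch of the kthread lifecycle used above; the real worker
 * copies one chunk per loop iteration and sends CB_OFFLOAD when done. */
static int copy_worker(void *data)
{
	while (!kthread_should_stop()) {
		/* copy the next chunk; break when all bytes are written */
		break;
	}
	return 0;
}

static int start_copy(void *copy)
{
	struct task_struct *task;

	task = kthread_create(copy_worker, copy, "copy thread");
	if (IS_ERR(task))
		return PTR_ERR(task);
	/* publish the copy (list_add under async_lock) before waking */
	wake_up_process(task);
	return 0;
}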
static __be32
nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_copy *copy = &u->copy;
- struct file *src, *dst;
__be32 status;
- ssize_t bytes;
+ struct nfsd4_copy *async_copy = NULL;
- status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src,
- &copy->cp_dst_stateid, &dst);
+ status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid,
+ &copy->file_src, &copy->cp_dst_stateid,
+ &copy->file_dst);
if (status)
goto out;
- bytes = nfsd_copy_file_range(src, copy->cp_src_pos,
- dst, copy->cp_dst_pos, copy->cp_count);
+ copy->cp_clp = cstate->clp;
+ memcpy(&copy->fh, &cstate->current_fh.fh_handle,
+ sizeof(struct knfsd_fh));
+ if (!copy->cp_synchronous) {
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- if (bytes < 0)
- status = nfserrno(bytes);
- else {
- copy->cp_res.wr_bytes_written = bytes;
- copy->cp_res.wr_stable_how = NFS_UNSTABLE;
- copy->cp_synchronous = 1;
- gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
+ status = nfserrno(-ENOMEM);
+ async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+ if (!async_copy)
+ goto out;
+ if (!nfs4_init_cp_state(nn, copy)) {
+ kfree(async_copy);
+ goto out;
+ }
+ refcount_set(&async_copy->refcount, 1);
+ memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid,
+ sizeof(copy->cp_stateid));
+ dup_copy_fields(copy, async_copy);
+ async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
+ async_copy, "%s", "copy thread");
+ if (IS_ERR(async_copy->copy_task))
+ goto out_err;
+ spin_lock(&async_copy->cp_clp->async_lock);
+ list_add(&async_copy->copies,
+ &async_copy->cp_clp->async_copies);
+ spin_unlock(&async_copy->cp_clp->async_lock);
+ wake_up_process(async_copy->copy_task);
status = nfs_ok;
+ } else
+ status = nfsd4_do_copy(copy, 1);
+out:
+ return status;
+out_err:
+ cleanup_async_copy(async_copy);
+ goto out;
+}
+
+struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
+{
+ struct nfsd4_copy *copy;
+
+ spin_lock(&clp->async_lock);
+ list_for_each_entry(copy, &clp->async_copies, copies) {
+ if (memcmp(&copy->cp_stateid, stateid, NFS4_STATEID_SIZE))
+ continue;
+ refcount_inc(&copy->refcount);
+ spin_unlock(&clp->async_lock);
+ return copy;
}
+ spin_unlock(&clp->async_lock);
+ return NULL;
+}
+
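find_async_copy() pins the copy with refcount_inc() before releasing async_lock, so a concurrent cleanup_async_copy() cannot free it under the caller. A generic sketch of this find-and-get idiom (struct item and its fields are illustrative only):

    struct item {
        int key;
        refcount_t refcount;
        struct list_head node;
    };

    static struct item *item_find_get(struct list_head *head,
                                      spinlock_t *lock, int key)
    {
        struct item *it;

        spin_lock(lock);
        list_for_each_entry(it, head, node) {
            if (it->key != key)
                continue;
            refcount_inc(&it->refcount);    /* pin before unlocking */
            spin_unlock(lock);
            return it;
        }
        spin_unlock(lock);
        return NULL;
    }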
+static __be32
+nfsd4_offload_cancel(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_offload_status *os = &u->offload_status;
+ __be32 status = 0;
+ struct nfsd4_copy *copy;
+ struct nfs4_client *clp = cstate->clp;
+
+ copy = find_async_copy(clp, &os->stateid);
+ if (copy)
+ nfsd4_stop_copy(copy);
+ else
+ status = nfserr_bad_stateid;
- fput(src);
- fput(dst);
-out:
return status;
}
@@ -1144,6 +1363,25 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fput(file);
return status;
}
+static __be32
+nfsd4_offload_status(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_offload_status *os = &u->offload_status;
+ __be32 status = 0;
+ struct nfsd4_copy *copy;
+ struct nfs4_client *clp = cstate->clp;
+
+ copy = find_async_copy(clp, &os->stateid);
+ if (copy) {
+ os->count = copy->cp_res.wr_bytes_written;
+ nfs4_put_copy(copy);
+ } else
+ status = nfserr_bad_stateid;
+
+ return status;
+}
static __be32
nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
@@ -2047,6 +2285,14 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1 /* cr_synchronous */) * sizeof(__be32);
}
+static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp,
+ struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 2 /* osr_count */ +
+ 1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32);
+}
+
#ifdef CONFIG_NFSD_PNFS
static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
@@ -2460,6 +2706,17 @@ static const struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_SEEK",
.op_rsize_bop = nfsd4_seek_rsize,
},
+ [OP_OFFLOAD_STATUS] = {
+ .op_func = nfsd4_offload_status,
+ .op_name = "OP_OFFLOAD_STATUS",
+ .op_rsize_bop = nfsd4_offload_status_rsize,
+ },
+ [OP_OFFLOAD_CANCEL] = {
+ .op_func = nfsd4_offload_cancel,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_OFFLOAD_CANCEL",
+ .op_rsize_bop = nfsd4_only_status_rsize,
+ },
};
/**
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b0ca0efd2875..f093fbe47133 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -713,6 +713,36 @@ out_free:
return NULL;
}
+/*
+ * Create a unique stateid_t to represent each COPY.
+ */
+int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy)
+{
+ int new_id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&nn->s2s_cp_lock);
+ new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, copy, 0, 0, GFP_NOWAIT);
+ spin_unlock(&nn->s2s_cp_lock);
+ idr_preload_end();
+ if (new_id < 0)
+ return 0;
+ copy->cp_stateid.si_opaque.so_id = new_id;
+ copy->cp_stateid.si_opaque.so_clid.cl_boot = nn->boot_time;
+ copy->cp_stateid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+ return 1;
+}
+
+void nfs4_free_cp_state(struct nfsd4_copy *copy)
+{
+ struct nfsd_net *nn;
+
+ nn = net_generic(copy->cp_clp->net, nfsd_net_id);
+ spin_lock(&nn->s2s_cp_lock);
+ idr_remove(&nn->s2s_cp_stateids, copy->cp_stateid.si_opaque.so_id);
+ spin_unlock(&nn->s2s_cp_lock);
+}
+
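nfs4_init_cp_state() uses the standard IDR preload idiom: idr_preload(GFP_KERNEL) fills a per-CPU node cache while sleeping is still allowed, so the allocation under the spinlock can use GFP_NOWAIT. A minimal sketch of the same pattern (assign_cyclic_id and its arguments are hypothetical):

    static int assign_cyclic_id(struct idr *my_idr, spinlock_t *lock, void *obj)
    {
        int id;

        idr_preload(GFP_KERNEL);    /* may sleep, fills per-CPU cache */
        spin_lock(lock);
        id = idr_alloc_cyclic(my_idr, obj, 0, 0, GFP_NOWAIT);
        spin_unlock(lock);
        idr_preload_end();

        return id;    /* >= 0 on success, -ENOMEM/-ENOSPC on failure */
    }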
static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
{
struct nfs4_stid *stid;
@@ -1827,6 +1857,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
#ifdef CONFIG_NFSD_PNFS
INIT_LIST_HEAD(&clp->cl_lo_states);
#endif
+ INIT_LIST_HEAD(&clp->async_copies);
+ spin_lock_init(&clp->async_lock);
spin_lock_init(&clp->cl_lock);
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
return clp;
@@ -1942,6 +1974,7 @@ __destroy_client(struct nfs4_client *clp)
}
}
nfsd4_return_all_client_layouts(clp);
+ nfsd4_shutdown_copy(clp);
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2475,7 +2508,8 @@ static bool client_has_state(struct nfs4_client *clp)
|| !list_empty(&clp->cl_lo_states)
#endif
|| !list_empty(&clp->cl_delegations)
- || !list_empty(&clp->cl_sessions);
+ || !list_empty(&clp->cl_sessions)
+ || !list_empty(&clp->async_copies);
}
__be32
@@ -4364,7 +4398,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
if (!fl)
- goto out_stid;
+ goto out_clnt_odstate;
status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL);
if (fl)
@@ -4389,7 +4423,6 @@ out_unlock:
vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp);
out_clnt_odstate:
put_clnt_odstate(dp->dl_clnt_odstate);
-out_stid:
nfs4_put_stid(&dp->dl_stid);
out_delegees:
put_deleg_file(fp);
@@ -7161,6 +7194,8 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->close_lru);
INIT_LIST_HEAD(&nn->del_recall_lru);
spin_lock_init(&nn->client_lock);
+ spin_lock_init(&nn->s2s_cp_lock);
+ idr_init(&nn->s2s_cp_stateids);
spin_lock_init(&nn->blocked_locks_lock);
INIT_LIST_HEAD(&nn->blocked_locks_lru);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 418fa9c78186..3de42a729093 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1768,6 +1768,13 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
}
static __be32
+nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
+ struct nfsd4_offload_status *os)
+{
+ return nfsd4_decode_stateid(argp, &os->stateid);
+}
+
+static __be32
nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
{
DECODE_HEAD;
@@ -1873,8 +1880,8 @@ static const nfsd4_dec nfsd4_dec_ops[] = {
[OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status,
+ [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status,
[OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
[OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -4224,15 +4231,27 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
#endif /* CONFIG_NFSD_PNFS */
static __be32
-nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
+ struct nfsd42_write_res *write, bool sync)
{
__be32 *p;
+ p = xdr_reserve_space(&resp->xdr, 4);
+ if (!p)
+ return nfserr_resource;
- p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+ if (sync)
+ *p++ = cpu_to_be32(0);
+ else {
+ __be32 nfserr;
+ *p++ = cpu_to_be32(1);
+ nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid);
+ if (nfserr)
+ return nfserr;
+ }
+ p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
if (!p)
return nfserr_resource;
- *p++ = cpu_to_be32(0);
p = xdr_encode_hyper(p, write->wr_bytes_written);
*p++ = cpu_to_be32(write->wr_stable_how);
p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
@@ -4246,7 +4265,8 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
{
__be32 *p;
- nfserr = nfsd42_encode_write_res(resp, &copy->cp_res);
+ nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
+ copy->cp_synchronous);
if (nfserr)
return nfserr;
@@ -4257,6 +4277,22 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
+nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_offload_status *os)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8 + 4);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_hyper(p, os->count);
+ *p++ = cpu_to_be32(0);
+
+ return nfserr;
+}
+
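The encoder above follows the usual xdr_stream pattern: reserve exactly the bytes needed, bail out with nfserr_resource if the stream is full, then fill the reserved slots. A minimal sketch of the same idiom (encode_u64_u32 is a made-up name):

    static __be32 encode_u64_u32(struct xdr_stream *xdr, u64 v64, u32 v32)
    {
        __be32 *p = xdr_reserve_space(xdr, 8 + 4);

        if (!p)
            return nfserr_resource;
        p = xdr_encode_hyper(p, v64);    /* 64-bit big-endian */
        *p = cpu_to_be32(v32);
        return nfs_ok;
    }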
+static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_seek *seek)
{
@@ -4359,7 +4395,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = {
[OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status,
[OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
[OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index dbdeb9d6af03..e2fe0e9ce0df 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -30,6 +30,7 @@
#define TARGET_BUCKET_SIZE 64
struct nfsd_drc_bucket {
+ struct rb_root rb_head;
struct list_head lru_head;
spinlock_t cache_lock;
};
@@ -121,7 +122,7 @@ nfsd_cache_hash(__be32 xid)
}
static struct svc_cacherep *
-nfsd_reply_cache_alloc(void)
+nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum)
{
struct svc_cacherep *rp;
@@ -129,21 +130,35 @@ nfsd_reply_cache_alloc(void)
if (rp) {
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
+ RB_CLEAR_NODE(&rp->c_node);
INIT_LIST_HEAD(&rp->c_lru);
+
+ memset(&rp->c_key, 0, sizeof(rp->c_key));
+ rp->c_key.k_xid = rqstp->rq_xid;
+ rp->c_key.k_proc = rqstp->rq_proc;
+ rpc_copy_addr((struct sockaddr *)&rp->c_key.k_addr, svc_addr(rqstp));
+ rpc_set_port((struct sockaddr *)&rp->c_key.k_addr, rpc_get_port(svc_addr(rqstp)));
+ rp->c_key.k_prot = rqstp->rq_prot;
+ rp->c_key.k_vers = rqstp->rq_vers;
+ rp->c_key.k_len = rqstp->rq_arg.len;
+ rp->c_key.k_csum = csum;
}
return rp;
}
static void
-nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
+nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
drc_mem_usage -= rp->c_replvec.iov_len;
kfree(rp->c_replvec.iov_base);
}
- list_del(&rp->c_lru);
- atomic_dec(&num_drc_entries);
- drc_mem_usage -= sizeof(*rp);
+ if (rp->c_state != RC_UNUSED) {
+ rb_erase(&rp->c_node, &b->rb_head);
+ list_del(&rp->c_lru);
+ atomic_dec(&num_drc_entries);
+ drc_mem_usage -= sizeof(*rp);
+ }
kmem_cache_free(drc_slab, rp);
}
@@ -151,7 +166,7 @@ static void
nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
spin_lock(&b->cache_lock);
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(b, rp);
spin_unlock(&b->cache_lock);
}
@@ -207,7 +222,7 @@ void nfsd_reply_cache_shutdown(void)
struct list_head *head = &drc_hashtbl[i].lru_head;
while (!list_empty(head)) {
rp = list_first_entry(head, struct svc_cacherep, c_lru);
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(&drc_hashtbl[i], rp);
}
}
@@ -246,7 +261,7 @@ prune_bucket(struct nfsd_drc_bucket *b)
if (atomic_read(&num_drc_entries) <= max_drc_entries &&
time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(b, rp);
freed++;
}
return freed;
@@ -318,51 +333,48 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
return csum;
}
-static bool
-nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
+static int
+nfsd_cache_key_cmp(const struct svc_cacherep *key, const struct svc_cacherep *rp)
{
- /* Check RPC XID first */
- if (rqstp->rq_xid != rp->c_xid)
- return false;
- /* compare checksum of NFS data */
- if (csum != rp->c_csum) {
+ if (key->c_key.k_xid == rp->c_key.k_xid &&
+ key->c_key.k_csum != rp->c_key.k_csum)
++payload_misses;
- return false;
- }
- /* Other discriminators */
- if (rqstp->rq_proc != rp->c_proc ||
- rqstp->rq_prot != rp->c_prot ||
- rqstp->rq_vers != rp->c_vers ||
- rqstp->rq_arg.len != rp->c_len ||
- !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
- rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
- return false;
-
- return true;
+ return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key));
}
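Note that nfsd_cache_key_cmp() orders entries with a raw memcmp() over the whole c_key struct, which only works because nfsd_reply_cache_alloc() zeroes the key first — memcmp() compares padding bytes too. A hedged illustration of the pitfall (struct key here is hypothetical):

    struct key {
        u32 a;
        u16 b;        /* followed by 2 bytes of padding */
        u64 c;
    };

    static void key_init(struct key *k, u32 a, u16 b, u64 c)
    {
        memset(k, 0, sizeof(*k));    /* zero padding so memcmp() is stable */
        k->a = a;
        k->b = b;
        k->c = c;
    }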
/*
* Search the request hash for an entry that matches the given rqstp.
* Must be called with cache_lock held. Returns the found entry or
- * NULL on failure.
+ * inserts the given key and returns it when no entry matches.
*/
static struct svc_cacherep *
-nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
- __wsum csum)
+nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key)
{
- struct svc_cacherep *rp, *ret = NULL;
- struct list_head *rh = &b->lru_head;
+ struct svc_cacherep *rp, *ret = key;
+ struct rb_node **p = &b->rb_head.rb_node,
+ *parent = NULL;
unsigned int entries = 0;
+ int cmp;
- list_for_each_entry(rp, rh, c_lru) {
+ while (*p != NULL) {
++entries;
- if (nfsd_cache_match(rqstp, csum, rp)) {
+ parent = *p;
+ rp = rb_entry(parent, struct svc_cacherep, c_node);
+
+ cmp = nfsd_cache_key_cmp(key, rp);
+ if (cmp < 0)
+ p = &parent->rb_left;
+ else if (cmp > 0)
+ p = &parent->rb_right;
+ else {
ret = rp;
- break;
+ goto out;
}
}
-
+ rb_link_node(&key->c_node, parent, p);
+ rb_insert_color(&key->c_node, &b->rb_head);
+out:
/* tally hash chain length stats */
if (entries > longest_chain) {
longest_chain = entries;
@@ -374,6 +386,7 @@ nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
atomic_read(&num_drc_entries));
}
+ lru_put_end(b, ret);
return ret;
}
@@ -389,9 +402,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
{
struct svc_cacherep *rp, *found;
__be32 xid = rqstp->rq_xid;
- u32 proto = rqstp->rq_prot,
- vers = rqstp->rq_vers,
- proc = rqstp->rq_proc;
__wsum csum;
u32 hash = nfsd_cache_hash(xid);
struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
@@ -410,60 +420,38 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
* Since the common case is a cache miss followed by an insert,
* preallocate an entry.
*/
- rp = nfsd_reply_cache_alloc();
- spin_lock(&b->cache_lock);
- if (likely(rp)) {
- atomic_inc(&num_drc_entries);
- drc_mem_usage += sizeof(*rp);
+ rp = nfsd_reply_cache_alloc(rqstp, csum);
+ if (!rp) {
+ dprintk("nfsd: unable to allocate DRC entry!\n");
+ return rtn;
}
- /* go ahead and prune the cache */
- prune_bucket(b);
-
- found = nfsd_cache_search(b, rqstp, csum);
- if (found) {
- if (likely(rp))
- nfsd_reply_cache_free_locked(rp);
+ spin_lock(&b->cache_lock);
+ found = nfsd_cache_insert(b, rp);
+ if (found != rp) {
+ nfsd_reply_cache_free_locked(NULL, rp);
rp = found;
goto found_entry;
}
- if (!rp) {
- dprintk("nfsd: unable to allocate DRC entry!\n");
- goto out;
- }
-
nfsdstats.rcmisses++;
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
- rp->c_xid = xid;
- rp->c_proc = proc;
- rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
- rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
- rp->c_prot = proto;
- rp->c_vers = vers;
- rp->c_len = rqstp->rq_arg.len;
- rp->c_csum = csum;
- lru_put_end(b, rp);
+ atomic_inc(&num_drc_entries);
+ drc_mem_usage += sizeof(*rp);
- /* release any buffer */
- if (rp->c_type == RC_REPLBUFF) {
- drc_mem_usage -= rp->c_replvec.iov_len;
- kfree(rp->c_replvec.iov_base);
- rp->c_replvec.iov_base = NULL;
- }
- rp->c_type = RC_NOCACHE;
+ /* go ahead and prune the cache */
+ prune_bucket(b);
out:
spin_unlock(&b->cache_lock);
return rtn;
found_entry:
- nfsdstats.rchits++;
/* We found a matching entry which is either in progress or done. */
- lru_put_end(b, rp);
-
+ nfsdstats.rchits++;
rtn = RC_DROPIT;
+
/* Request being processed */
if (rp->c_state == RC_INPROG)
goto out;
@@ -489,7 +477,7 @@ found_entry:
break;
default:
printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
- nfsd_reply_cache_free_locked(rp);
+ nfsd_reply_cache_free_locked(b, rp);
}
goto out;
@@ -524,7 +512,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
if (!rp)
return;
- hash = nfsd_cache_hash(rp->c_xid);
+ hash = nfsd_cache_hash(rp->c_key.k_xid);
b = &drc_hashtbl[hash];
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7fb9f7c667b1..6384c9b94898 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1242,6 +1242,7 @@ static __net_init int nfsd_init_net(struct net *net)
nn->somebody_reclaimed = false;
nn->clverifier_counter = prandom_u32();
nn->clientid_counter = prandom_u32();
+ nn->s2s_cp_cl_id = nn->clientid_counter++;
atomic_set(&nn->ntf_refcnt, 0);
init_waitqueue_head(&nn->ntf_wq);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 0b15dac7e609..6aacb325b6a0 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -355,6 +355,8 @@ struct nfs4_client {
struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
/* wait here for slots */
struct net *net;
+ struct list_head async_copies; /* list of async copies */
+ spinlock_t async_lock; /* lock for async copies */
};
/* struct nfs4_client_reset
@@ -573,6 +575,7 @@ enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_NULL = 0,
NFSPROC4_CLNT_CB_RECALL,
NFSPROC4_CLNT_CB_LAYOUT,
+ NFSPROC4_CLNT_CB_OFFLOAD,
NFSPROC4_CLNT_CB_SEQUENCE,
NFSPROC4_CLNT_CB_NOTIFY_LOCK,
};
@@ -599,6 +602,7 @@ struct nfsd4_blocked_lock {
struct nfsd4_compound_state;
struct nfsd_net;
+struct nfsd4_copy;
extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
@@ -608,6 +612,8 @@ __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_stid **s, struct nfsd_net *nn);
struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
void (*sc_free)(struct nfs4_stid *));
+int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy);
+void nfs4_free_cp_state(struct nfsd4_copy *copy);
void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
@@ -626,6 +632,7 @@ extern void nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
+extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
struct nfsd_net *nn);
@@ -633,6 +640,9 @@ extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
struct nfs4_file *find_file(struct knfsd_fh *fh);
void put_nfs4_file(struct nfs4_file *fi);
+extern void nfs4_put_copy(struct nfsd4_copy *copy);
+extern struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid);
static inline void get_nfs4_file(struct nfs4_file *fi)
{
refcount_inc(&fi->fi_ref);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b53e76391e52..2751976704e9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1276,7 +1276,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
int type, dev_t rdev, struct svc_fh *resfhp)
{
struct dentry *dentry, *dchild = NULL;
- struct inode *dirp;
__be32 err;
int host_err;
@@ -1288,7 +1287,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
return err;
dentry = fhp->fh_dentry;
- dirp = d_inode(dentry);
host_err = fh_want_write(fhp);
if (host_err)
@@ -1409,6 +1407,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*created = 1;
break;
}
+ /* fall through */
case NFS4_CREATE_EXCLUSIVE4_1:
if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
&& d_inode(dchild)->i_atime.tv_sec == v_atime
@@ -1417,7 +1416,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*created = 1;
goto set_attr;
}
- /* fallthru */
+ /* fall through */
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
}
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 17c453a7999c..feeb6d4bdffd 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -511,6 +511,7 @@ struct nfsd42_write_res {
u64 wr_bytes_written;
u32 wr_stable_how;
nfs4_verifier wr_verifier;
+ stateid_t cb_stateid;
};
struct nfsd4_copy {
@@ -526,6 +527,23 @@ struct nfsd4_copy {
/* response */
struct nfsd42_write_res cp_res;
+
+ /* for cb_offload */
+ struct nfsd4_callback cp_cb;
+ __be32 nfserr;
+ struct knfsd_fh fh;
+
+ struct nfs4_client *cp_clp;
+
+ struct file *file_src;
+ struct file *file_dst;
+
+ stateid_t cp_stateid;
+
+ struct list_head copies;
+ struct task_struct *copy_task;
+ refcount_t refcount;
+ bool stopped;
};
struct nfsd4_seek {
@@ -539,6 +557,15 @@ struct nfsd4_seek {
loff_t seek_pos;
};
+struct nfsd4_offload_status {
+ /* request */
+ stateid_t stateid;
+
+ /* response */
+ u64 count;
+ u32 status;
+};
+
struct nfsd4_op {
int opnum;
const struct nfsd4_operation * opdesc;
@@ -597,6 +624,7 @@ struct nfsd4_op {
struct nfsd4_fallocate deallocate;
struct nfsd4_clone clone;
struct nfsd4_copy copy;
+ struct nfsd4_offload_status offload_status;
struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 517239af0302..547cf07cf4e0 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -38,3 +38,13 @@
#define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+#define enc_cb_offload_info_sz (1 + 1 + 2 + 1 + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE))
+#define NFS4_enc_cb_offload_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ enc_nfs4_fh_sz + \
+ enc_stateid_sz + \
+ enc_cb_offload_info_sz)
+#define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
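For reference, with NFS4_VERIFIER_SIZE being 8 bytes, XDR_QUADLEN(NFS4_VERIFIER_SIZE) is 2, so enc_cb_offload_info_sz works out to 1 + 1 + 2 + 1 + 2 = 7 four-byte XDR words, on top of the compound header, sequence, filehandle and stateid sizes in NFS4_enc_cb_offload_sz.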
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index ebb24a314f43..de99db518571 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -168,24 +168,18 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
ctxt->newbh = NULL;
if (inode->i_blkbits == PAGE_SHIFT) {
- lock_page(obh->b_page);
- /*
- * We cannot call radix_tree_preload for the kernels older
- * than 2.6.23, because it is not exported for modules.
- */
+ struct page *opage = obh->b_page;
+ lock_page(opage);
retry:
- err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
- if (err)
- goto failed_unlock;
/* BUG_ON(oldkey != obh->b_page->index); */
- if (unlikely(oldkey != obh->b_page->index))
- NILFS_PAGE_BUG(obh->b_page,
+ if (unlikely(oldkey != opage->index))
+ NILFS_PAGE_BUG(opage,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
xa_lock_irq(&btnc->i_pages);
- err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page);
+ err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS);
xa_unlock_irq(&btnc->i_pages);
/*
* Note: page->index will not change to newkey until
@@ -193,7 +187,6 @@ retry:
* To protect the page in intermediate state, the page lock
* is held.
*/
- radix_tree_preload_end();
if (!err)
return 0;
else if (err != -EEXIST)
@@ -203,7 +196,7 @@ retry:
if (!err)
goto retry;
/* fallback to copy mode */
- unlock_page(obh->b_page);
+ unlock_page(opage);
}
nbh = nilfs_btnode_create_block(btnc, newkey);
@@ -243,9 +236,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
mark_buffer_dirty(obh);
xa_lock_irq(&btnc->i_pages);
- radix_tree_delete(&btnc->i_pages, oldkey);
- radix_tree_tag_set(&btnc->i_pages, newkey,
- PAGECACHE_TAG_DIRTY);
+ __xa_erase(&btnc->i_pages, oldkey);
+ __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&btnc->i_pages);
opage->index = obh->b_blocknr = newkey;
@@ -275,7 +267,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
if (nbh == NULL) { /* blocksize == pagesize */
xa_lock_irq(&btnc->i_pages);
- radix_tree_delete(&btnc->i_pages, newkey);
+ __xa_erase(&btnc->i_pages, newkey);
xa_unlock_irq(&btnc->i_pages);
unlock_page(ctxt->bh->b_page);
} else
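The btnode conversion replaces radix_tree_preload()+radix_tree_insert() with __xa_insert(), which allocates its internal nodes with the gfp mask passed at call time, so no separate preload step is needed. A minimal sketch of the idiom, assuming the -EEXIST return convention this patch tests for:

    static int cache_insert(struct address_space *mapping, pgoff_t index,
                            struct page *page)
    {
        int err;

        xa_lock_irq(&mapping->i_pages);
        err = __xa_insert(&mapping->i_pages, index, page, GFP_NOFS);
        xa_unlock_irq(&mapping->i_pages);

        return err;    /* -EEXIST if the slot was already occupied */
    }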
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 329a056b73b1..d7fc8d369d89 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -289,7 +289,7 @@ repeat:
* @dmap: destination page cache
* @smap: source page cache
*
- * No pages must no be added to the cache during this process.
+ * No pages must be added to the cache during this process.
* This must be ensured by the caller.
*/
void nilfs_copy_back_pages(struct address_space *dmap,
@@ -298,7 +298,6 @@ void nilfs_copy_back_pages(struct address_space *dmap,
struct pagevec pvec;
unsigned int i, n;
pgoff_t index = 0;
- int err;
pagevec_init(&pvec);
repeat:
@@ -313,35 +312,34 @@ repeat:
lock_page(page);
dpage = find_lock_page(dmap, offset);
if (dpage) {
- /* override existing page on the destination cache */
+ /* overwrite existing page in the destination cache */
WARN_ON(PageDirty(dpage));
nilfs_copy_page(dpage, page, 0);
unlock_page(dpage);
put_page(dpage);
} else {
- struct page *page2;
+ struct page *p;
/* move the page to the destination cache */
xa_lock_irq(&smap->i_pages);
- page2 = radix_tree_delete(&smap->i_pages, offset);
- WARN_ON(page2 != page);
-
+ p = __xa_erase(&smap->i_pages, offset);
+ WARN_ON(page != p);
smap->nrpages--;
xa_unlock_irq(&smap->i_pages);
xa_lock_irq(&dmap->i_pages);
- err = radix_tree_insert(&dmap->i_pages, offset, page);
- if (unlikely(err < 0)) {
- WARN_ON(err == -EEXIST);
+ p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS);
+ if (unlikely(p)) {
+ /* Probably -ENOMEM */
page->mapping = NULL;
- put_page(page); /* for cache */
+ put_page(page);
} else {
page->mapping = dmap;
dmap->nrpages++;
if (PageDirty(page))
- radix_tree_tag_set(&dmap->i_pages,
- offset,
- PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&dmap->i_pages, offset,
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irq(&dmap->i_pages);
}
@@ -467,8 +465,7 @@ int __nilfs_clear_page_dirty(struct page *page)
if (mapping) {
xa_lock_irq(&mapping->i_pages);
if (test_bit(PG_dirty, &page->flags)) {
- radix_tree_tag_clear(&mapping->i_pages,
- page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&mapping->i_pages);
return clear_page_dirty_for_io(page);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 94b52157bf8d..5769cf3ff035 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -25,7 +25,7 @@ static bool should_merge(struct fsnotify_event *old_fsn,
old = FANOTIFY_E(old_fsn);
new = FANOTIFY_E(new_fsn);
- if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
+ if (old_fsn->inode == new_fsn->inode && old->pid == new->pid &&
old->path.mnt == new->path.mnt &&
old->path.dentry == new->path.dentry)
return true;
@@ -131,8 +131,8 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info,
!(marks_mask & FS_ISDIR & ~marks_ignored_mask))
return false;
- if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask &
- ~marks_ignored_mask)
+ if (event_mask & FANOTIFY_OUTGOING_EVENTS &
+ marks_mask & ~marks_ignored_mask)
return true;
return false;
@@ -171,7 +171,10 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
goto out;
init: __maybe_unused
fsnotify_init_event(&event->fse, inode, mask);
- event->tgid = get_pid(task_tgid(current));
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
+ event->pid = get_pid(task_pid(current));
+ else
+ event->pid = get_pid(task_tgid(current));
if (path) {
event->path = *path;
path_get(&event->path);
@@ -205,6 +208,8 @@ static int fanotify_handle_event(struct fsnotify_group *group,
BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 10);
+
if (!fanotify_should_send_event(iter_info, mask, data, data_type))
return 0;
@@ -236,7 +241,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
if (ret) {
/* Permission events shouldn't be merged */
- BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
+ BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
/* Our event wasn't used in the end. Free it. */
fsnotify_destroy_event(group, fsn_event);
@@ -268,7 +273,7 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
event = FANOTIFY_E(fsn_event);
path_put(&event->path);
- put_pid(event->tgid);
+ put_pid(event->pid);
if (fanotify_is_perm_event(fsn_event->mask)) {
kmem_cache_free(fanotify_perm_event_cachep,
FANOTIFY_PE(fsn_event));
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 8609ba06f474..ea05b8a401e7 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -19,7 +19,7 @@ struct fanotify_event_info {
* during this object's lifetime
*/
struct path path;
- struct pid *tgid;
+ struct pid *pid;
};
/*
@@ -44,7 +44,7 @@ FANOTIFY_PE(struct fsnotify_event *fse)
static inline bool fanotify_is_perm_event(u32 mask)
{
return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) &&
- mask & FAN_ALL_PERM_EVENTS;
+ mask & FANOTIFY_PERM_EVENTS;
}
static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 69054886915b..e03be5071362 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -131,8 +131,8 @@ static int fill_event_metadata(struct fsnotify_group *group,
metadata->metadata_len = FAN_EVENT_METADATA_LEN;
metadata->vers = FANOTIFY_METADATA_VERSION;
metadata->reserved = 0;
- metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
- metadata->pid = pid_vnr(event->tgid);
+ metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS;
+ metadata->pid = pid_vnr(event->pid);
if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
metadata->fd = FAN_NOFD;
else {
@@ -191,7 +191,7 @@ static int process_access_response(struct fsnotify_group *group,
if (fd < 0)
return -EINVAL;
- if ((response & FAN_AUDIT) && !group->fanotify_data.audit)
+ if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
return -EINVAL;
event = dequeue_event(group, fd);
@@ -395,7 +395,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
*/
while (!fsnotify_notify_queue_is_empty(group)) {
fsn_event = fsnotify_remove_first_event(group);
- if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) {
+ if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) {
spin_unlock(&group->notification_lock);
fsnotify_destroy_event(group, fsn_event);
spin_lock(&group->notification_lock);
@@ -506,18 +506,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
spin_lock(&fsn_mark->lock);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- __u32 tmask = fsn_mark->mask & ~mask;
-
- if (flags & FAN_MARK_ONDIR)
- tmask &= ~FAN_ONDIR;
-
oldmask = fsn_mark->mask;
- fsn_mark->mask = tmask;
+ fsn_mark->mask &= ~mask;
} else {
- __u32 tmask = fsn_mark->ignored_mask & ~mask;
- if (flags & FAN_MARK_ONDIR)
- tmask &= ~FAN_ONDIR;
- fsn_mark->ignored_mask = tmask;
+ fsn_mark->ignored_mask &= ~mask;
}
*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
spin_unlock(&fsn_mark->lock);
@@ -563,6 +555,13 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
mask, flags);
}
+static int fanotify_remove_sb_mark(struct fsnotify_group *group,
+ struct super_block *sb, __u32 mask,
+ unsigned int flags)
+{
+ return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, flags);
+}
+
static int fanotify_remove_inode_mark(struct fsnotify_group *group,
struct inode *inode, __u32 mask,
unsigned int flags)
@@ -579,19 +578,10 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
spin_lock(&fsn_mark->lock);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- __u32 tmask = fsn_mark->mask | mask;
-
- if (flags & FAN_MARK_ONDIR)
- tmask |= FAN_ONDIR;
-
oldmask = fsn_mark->mask;
- fsn_mark->mask = tmask;
+ fsn_mark->mask |= mask;
} else {
- __u32 tmask = fsn_mark->ignored_mask | mask;
- if (flags & FAN_MARK_ONDIR)
- tmask |= FAN_ONDIR;
-
- fsn_mark->ignored_mask = tmask;
+ fsn_mark->ignored_mask |= mask;
if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
}
@@ -658,6 +648,14 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags);
}
+static int fanotify_add_sb_mark(struct fsnotify_group *group,
+ struct super_block *sb, __u32 mask,
+ unsigned int flags)
+{
+ return fanotify_add_mark(group, &sb->s_fsnotify_marks,
+ FSNOTIFY_OBJ_TYPE_SB, mask, flags);
+}
+
static int fanotify_add_inode_mark(struct fsnotify_group *group,
struct inode *inode, __u32 mask,
unsigned int flags)
@@ -686,16 +684,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
struct user_struct *user;
struct fanotify_event_info *oevent;
- pr_debug("%s: flags=%d event_f_flags=%d\n",
- __func__, flags, event_f_flags);
+ pr_debug("%s: flags=%x event_f_flags=%x\n",
+ __func__, flags, event_f_flags);
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
#ifdef CONFIG_AUDITSYSCALL
- if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT))
+ if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
- if (flags & ~FAN_ALL_INIT_FLAGS)
+ if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
return -EINVAL;
@@ -731,6 +729,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
}
group->fanotify_data.user = user;
+ group->fanotify_data.flags = flags;
atomic_inc(&user->fanotify_listeners);
group->memcg = get_mem_cgroup_from_mm(current->mm);
@@ -746,7 +745,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.f_flags = event_f_flags;
init_waitqueue_head(&group->fanotify_data.access_waitq);
INIT_LIST_HEAD(&group->fanotify_data.access_list);
- switch (flags & FAN_ALL_CLASS_BITS) {
+ switch (flags & FANOTIFY_CLASS_BITS) {
case FAN_CLASS_NOTIF:
group->priority = FS_PRIO_0;
break;
@@ -783,7 +782,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
fd = -EPERM;
if (!capable(CAP_AUDIT_WRITE))
goto out_destroy_group;
- group->fanotify_data.audit = true;
}
fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
@@ -805,7 +803,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct fsnotify_group *group;
struct fd f;
struct path path;
- u32 valid_mask = FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD;
+ u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
int ret;
pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
@@ -815,8 +814,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (mask & ((__u64)0xffffffff << 32))
return -EINVAL;
- if (flags & ~FAN_ALL_MARK_FLAGS)
+ if (flags & ~FANOTIFY_MARK_FLAGS)
+ return -EINVAL;
+
+ switch (mark_type) {
+ case FAN_MARK_INODE:
+ case FAN_MARK_MOUNT:
+ case FAN_MARK_FILESYSTEM:
+ break;
+ default:
return -EINVAL;
+ }
+
switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
case FAN_MARK_ADD: /* fallthrough */
case FAN_MARK_REMOVE:
@@ -824,20 +833,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EINVAL;
break;
case FAN_MARK_FLUSH:
- if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH))
+ if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
return -EINVAL;
break;
default:
return -EINVAL;
}
- if (mask & FAN_ONDIR) {
- flags |= FAN_MARK_ONDIR;
- mask &= ~FAN_ONDIR;
- }
-
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
- valid_mask |= FAN_ALL_PERM_EVENTS;
+ valid_mask |= FANOTIFY_PERM_EVENTS;
if (mask & ~valid_mask)
return -EINVAL;
@@ -857,14 +861,16 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* allowed to set permissions events.
*/
ret = -EINVAL;
- if (mask & FAN_ALL_PERM_EVENTS &&
+ if (mask & FANOTIFY_PERM_EVENTS &&
group->priority == FS_PRIO_0)
goto fput_and_out;
if (flags & FAN_MARK_FLUSH) {
ret = 0;
- if (flags & FAN_MARK_MOUNT)
+ if (mark_type == FAN_MARK_MOUNT)
fsnotify_clear_vfsmount_marks_by_group(group);
+ else if (mark_type == FAN_MARK_FILESYSTEM)
+ fsnotify_clear_sb_marks_by_group(group);
else
fsnotify_clear_inode_marks_by_group(group);
goto fput_and_out;
@@ -875,7 +881,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto fput_and_out;
/* inode held in place by reference to path; group by fget on fd */
- if (!(flags & FAN_MARK_MOUNT))
+ if (mark_type == FAN_MARK_INODE)
inode = path.dentry->d_inode;
else
mnt = path.mnt;
@@ -883,14 +889,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
/* create/update an inode mark */
switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
case FAN_MARK_ADD:
- if (flags & FAN_MARK_MOUNT)
+ if (mark_type == FAN_MARK_MOUNT)
ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+ else if (mark_type == FAN_MARK_FILESYSTEM)
+ ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags);
else
ret = fanotify_add_inode_mark(group, inode, mask, flags);
break;
case FAN_MARK_REMOVE:
- if (flags & FAN_MARK_MOUNT)
+ if (mark_type == FAN_MARK_MOUNT)
ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+ else if (mark_type == FAN_MARK_FILESYSTEM)
+ ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags);
else
ret = fanotify_remove_inode_mark(group, inode, mask, flags);
break;
@@ -934,6 +944,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
+
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
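From userspace, the new FAN_MARK_FILESYSTEM mark type (together with FAN_REPORT_TID from the earlier hunk) would be used roughly as below — a hedged sketch assuming the uapi constants land under these names:

    #include <fcntl.h>
    #include <sys/fanotify.h>

    static int watch_whole_fs(const char *path)
    {
        int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_TID, O_RDONLY);

        if (fd < 0)
            return -1;
        /* mark the superblock backing 'path', not just one inode/mount */
        if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
                          FAN_MODIFY | FAN_ONDIR, AT_FDCWD, path) < 0)
            return -1;
        return fd;
    }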
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 86fcf5814279..348a184bcdda 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -131,37 +131,20 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
+ } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) {
+ struct super_block *sb = fsnotify_conn_sb(mark->connector);
+
+ seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
+ sb->s_dev, mflags, mark->mask, mark->ignored_mask);
}
}
void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
{
struct fsnotify_group *group = f->private_data;
- unsigned int flags = 0;
-
- switch (group->priority) {
- case FS_PRIO_0:
- flags |= FAN_CLASS_NOTIF;
- break;
- case FS_PRIO_1:
- flags |= FAN_CLASS_CONTENT;
- break;
- case FS_PRIO_2:
- flags |= FAN_CLASS_PRE_CONTENT;
- break;
- }
-
- if (group->max_events == UINT_MAX)
- flags |= FAN_UNLIMITED_QUEUE;
-
- if (group->fanotify_data.max_marks == UINT_MAX)
- flags |= FAN_UNLIMITED_MARKS;
-
- if (group->fanotify_data.audit)
- flags |= FAN_ENABLE_AUDIT;
seq_printf(m, "fanotify flags:%x event-flags:%x\n",
- flags, group->fanotify_data.f_flags);
+ group->fanotify_data.flags, group->fanotify_data.f_flags);
show_fdinfo(m, f, fanotify_fdinfo);
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ababdbfab537..2172ba516c61 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -48,7 +48,7 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
* Called during unmount with no locks held, so needs to be safe against
* concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
*/
-void fsnotify_unmount_inodes(struct super_block *sb)
+static void fsnotify_unmount_inodes(struct super_block *sb)
{
struct inode *inode, *iput_inode = NULL;
@@ -96,6 +96,15 @@ void fsnotify_unmount_inodes(struct super_block *sb)
if (iput_inode)
iput(iput_inode);
+ /* Wait for outstanding inode references from connectors */
+ wait_var_event(&sb->s_fsnotify_inode_refs,
+ !atomic_long_read(&sb->s_fsnotify_inode_refs));
+}
+
+void fsnotify_sb_delete(struct super_block *sb)
+{
+ fsnotify_unmount_inodes(sb);
+ fsnotify_clear_marks_by_sb(sb);
}
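fsnotify_sb_delete() can only proceed once every connector has dropped its inode reference; the wait_var_event()/wake_up_var() pair (see fsnotify_drop_object() in mark.c below) keys both sides on the same address. Generic shape of the pattern, with an illustrative counter:

    static atomic_long_t pending;

    static void producer_done(void)
    {
        if (atomic_long_dec_and_test(&pending))
            wake_up_var(&pending);    /* wake waiters keyed on &pending */
    }

    static void drain_all(void)
    {
        wait_var_event(&pending, !atomic_long_read(&pending));
    }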
/*
@@ -190,7 +199,7 @@ static int send_to_group(struct inode *to_tell,
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *group = NULL;
- __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
+ __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
__u32 marks_mask = 0;
__u32 marks_ignored_mask = 0;
struct fsnotify_mark *mark;
@@ -319,15 +328,17 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
const unsigned char *file_name, u32 cookie)
{
struct fsnotify_iter_info iter_info = {};
- struct mount *mnt;
+ struct super_block *sb = NULL;
+ struct mount *mnt = NULL;
+ __u32 mnt_or_sb_mask = 0;
int ret = 0;
- /* global tests shouldn't care about events on child only the specific event */
- __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
+ __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
- if (data_is == FSNOTIFY_EVENT_PATH)
+ if (data_is == FSNOTIFY_EVENT_PATH) {
mnt = real_mount(((const struct path *)data)->mnt);
- else
- mnt = NULL;
+ sb = mnt->mnt.mnt_sb;
+ mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask;
+ }
/*
* Optimization: srcu_read_lock() has a memory barrier which can
@@ -337,16 +348,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
* need SRCU to keep them "alive".
*/
if (!to_tell->i_fsnotify_marks &&
- (!mnt || !mnt->mnt_fsnotify_marks))
+ (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks)))
return 0;
/*
* if this is a modify event we may need to clear the ignored masks
- * otherwise return if neither the inode nor the vfsmount care about
+ * otherwise return if neither the inode nor the vfsmount/sb care about
* this type of event.
*/
if (!(mask & FS_MODIFY) &&
- !(test_mask & to_tell->i_fsnotify_mask) &&
- !(mnt && test_mask & mnt->mnt_fsnotify_mask))
+ !(test_mask & (to_tell->i_fsnotify_mask | mnt_or_sb_mask)))
return 0;
iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
@@ -356,11 +366,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
if (mnt) {
iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
+ iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
+ fsnotify_first_mark(&sb->s_fsnotify_marks);
}
/*
- * We need to merge inode & vfsmount mark lists so that inode mark
- * ignore masks are properly reflected for mount mark notifications.
+ * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
+ * ignore masks are properly reflected for mount/sb mark notifications.
* That's why this traversal is so complicated...
*/
while (fsnotify_iter_select_report_types(&iter_info)) {
@@ -386,7 +398,7 @@ static __init int fsnotify_init(void)
{
int ret;
- BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 7902653dd577..5a00121fb219 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -21,6 +21,12 @@ static inline struct mount *fsnotify_conn_mount(
return container_of(conn->obj, struct mount, mnt_fsnotify_marks);
}
+static inline struct super_block *fsnotify_conn_sb(
+ struct fsnotify_mark_connector *conn)
+{
+ return container_of(conn->obj, struct super_block, s_fsnotify_marks);
+}
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
@@ -43,6 +49,11 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
}
+/* run the list of all marks associated with sb and destroy them */
+static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
+{
+ fsnotify_destroy_marks(&sb->s_fsnotify_marks);
+}
/* Wait until all marks queued for destruction are destroyed */
extern void fsnotify_wait_marks_destroyed(void);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ac6978d3208c..105576daca4a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -815,7 +815,7 @@ static int __init inotify_user_setup(void)
BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
- BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22);
+ BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22);
inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark,
SLAB_PANIC|SLAB_ACCOUNT);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 59cdb27826de..d2dd16cb5989 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -115,6 +115,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
return &fsnotify_conn_inode(conn)->i_fsnotify_mask;
else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT)
return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
+ else if (conn->type == FSNOTIFY_OBJ_TYPE_SB)
+ return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
return NULL;
}
@@ -179,19 +181,24 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
-static struct inode *fsnotify_detach_connector_from_object(
- struct fsnotify_mark_connector *conn)
+static void *fsnotify_detach_connector_from_object(
+ struct fsnotify_mark_connector *conn,
+ unsigned int *type)
{
struct inode *inode = NULL;
+ *type = conn->type;
if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED)
return NULL;
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
+ atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
+ } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
+ fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
}
rcu_assign_pointer(*(conn->obj), NULL);
@@ -211,10 +218,29 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
fsnotify_put_group(group);
}
+/* Drop object reference originally held by a connector */
+static void fsnotify_drop_object(unsigned int type, void *objp)
+{
+ struct inode *inode;
+ struct super_block *sb;
+
+ if (!objp)
+ return;
+ /* Currently only inode references are passed to be dropped */
+ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
+ return;
+ inode = objp;
+ sb = inode->i_sb;
+ iput(inode);
+ if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs))
+ wake_up_var(&sb->s_fsnotify_inode_refs);
+}
+
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
struct fsnotify_mark_connector *conn;
- struct inode *inode = NULL;
+ void *objp = NULL;
+ unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
bool free_conn = false;
/* Catch marks that were actually never attached to object */
@@ -234,7 +260,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
conn = mark->connector;
hlist_del_init_rcu(&mark->obj_list);
if (hlist_empty(&conn->list)) {
- inode = fsnotify_detach_connector_from_object(conn);
+ objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
__fsnotify_recalc_mask(conn);
@@ -242,7 +268,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
mark->connector = NULL;
spin_unlock(&conn->lock);
- iput(inode);
+ fsnotify_drop_object(type, objp);
if (free_conn) {
spin_lock(&destroy_lock);
@@ -709,7 +735,8 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp)
{
struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark, *old_mark = NULL;
- struct inode *inode;
+ void *objp;
+ unsigned int type;
conn = fsnotify_grab_connector(connp);
if (!conn)
@@ -735,11 +762,11 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp)
* mark references get dropped. It would lead to strange results such
* as delaying inode deletion or blocking unmount.
*/
- inode = fsnotify_detach_connector_from_object(conn);
+ objp = fsnotify_detach_connector_from_object(conn, &type);
spin_unlock(&conn->lock);
if (old_mark)
fsnotify_put_mark(old_mark);
- iput(inode);
+ fsnotify_drop_object(type, objp);
}
/*
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index d297fe4472a9..bbcc185062bb 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -22,7 +22,7 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 792c78a49174..6c517b11acf8 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/init.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a027473561c6..47c3764c469b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -521,7 +521,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
mss->swap += PAGE_SIZE;
else
put_page(page);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 91ae16fbd7d5..3fe90443c1bb 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -16,7 +16,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/printk.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/crash_dump.h>
#include <linux/list.h>
@@ -423,7 +423,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
if (rc < 0) {
unlock_page(page);
put_page(page);
- return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+ return vmf_error(rc);
}
SetPageUptodate(page);
}
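vmf_error() centralizes exactly the mapping the removed line open-coded; its behavior is equivalent to:

    static vm_fault_t my_vmf_error(int err)
    {
        if (err == -ENOMEM)
            return VM_FAULT_OOM;
        return VM_FAULT_SIGBUS;    /* any other errno */
    }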
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 503086f7f7c1..0d19d191ae70 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -141,7 +141,6 @@ config PSTORE_RAM
tristate "Log panic/oops to a RAM buffer"
depends on PSTORE
depends on HAS_IOMEM
- depends on HAVE_MEMBLOCK
select REED_SOLOMON
select REED_SOLOMON_ENC8
select REED_SOLOMON_DEC8
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index a39a562c1c10..bd29c58ccbd8 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -26,14 +26,5 @@ ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
reiserfs-objs += xattr_acl.o
endif
-# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline
-# functions are used. This causes the compiler to advance the stack
-# pointer out of the available stack space, corrupting kernel space,
-# and causing a panic. Since this behavior only affects ppc32, this ifeq
-# will work around it. If any other architecture displays this behavior,
-# add it here.
-ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1)
-
TAGS:
etags *.c
-
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 48cdfc81fe10..32d8986c26fb 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -185,6 +185,7 @@ struct reiserfs_dentry_buf {
struct dir_context ctx;
struct dentry *xadir;
int count;
+ int err;
struct dentry *dentries[8];
};
@@ -207,6 +208,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
dentry = lookup_one_len(name, dbuf->xadir, namelen);
if (IS_ERR(dentry)) {
+ dbuf->err = PTR_ERR(dentry);
return PTR_ERR(dentry);
} else if (d_really_is_negative(dentry)) {
/* A directory entry exists, but no file? */
@@ -215,6 +217,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
"not found for file %pd.\n",
dentry, dbuf->xadir);
dput(dentry);
+ dbuf->err = -EIO;
return -EIO;
}
@@ -262,6 +265,10 @@ static int reiserfs_for_each_xattr(struct inode *inode,
err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
if (err)
break;
+ if (buf.err) {
+ err = buf.err;
+ break;
+ }
if (!buf.count)
break;
for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
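The fix works because the readdir path may swallow the actor's return value, so the actor records the failure in the buffer embedding its dir_context and the caller checks it after reiserfs_readdir_inode() returns. A minimal sketch of the idiom (my_buf, my_actor and bad_entry are illustrative):

    struct my_buf {
        struct dir_context ctx;
        int err;
    };

    static int my_actor(struct dir_context *ctx, const char *name,
                        int namelen, loff_t offset, u64 ino,
                        unsigned int d_type)
    {
        struct my_buf *buf = container_of(ctx, struct my_buf, ctx);

        if (bad_entry(name, namelen)) {
            buf->err = -EIO;    /* survives even if readdir eats -EIO */
            return -EIO;
        }
        return 0;
    }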
diff --git a/fs/super.c b/fs/super.c
index f3a8c008e164..ca53a08497ed 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -442,7 +442,7 @@ void generic_shutdown_super(struct super_block *sb)
sync_filesystem(sb);
sb->s_flags &= ~SB_ACTIVE;
- fsnotify_unmount_inodes(sb);
+ fsnotify_sb_delete(sb);
cgroup_writeback_umount();
evict_inodes(sb);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index fcda0fc97b90..ec85aeaed54a 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -175,8 +175,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
{
struct udf_sb_info *sbi = UDF_SB(sb);
int alloc_count = 0;
- int bit, block, block_group, group_start;
- int nr_groups, bitmap_nr;
+ int bit, block, block_group;
+ int bitmap_nr;
struct buffer_head *bh;
__u32 part_len;
@@ -189,10 +189,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
block_count = part_len - first_block;
do {
- nr_groups = udf_compute_nr_groups(sb, partition);
block = first_block + (sizeof(struct spaceBitmapDesc) << 3);
block_group = block >> (sb->s_blocksize_bits + 3);
- group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc);
bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
if (bitmap_nr < 0)
@@ -652,12 +650,6 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode,
} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
udf_table_free_blocks(sb, map->s_uspace.s_table,
bloc, offset, count);
- } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
- udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap,
- bloc, offset, count);
- } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
- udf_table_free_blocks(sb, map->s_fspace.s_table,
- bloc, offset, count);
}
if (inode) {
@@ -684,16 +676,6 @@ inline int udf_prealloc_blocks(struct super_block *sb,
map->s_uspace.s_table,
partition, first_block,
block_count);
- else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- allocated = udf_bitmap_prealloc_blocks(sb,
- map->s_fspace.s_bitmap,
- partition, first_block,
- block_count);
- else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- allocated = udf_table_prealloc_blocks(sb,
- map->s_fspace.s_table,
- partition, first_block,
- block_count);
else
return 0;
@@ -717,14 +699,6 @@ inline udf_pblk_t udf_new_block(struct super_block *sb,
block = udf_table_new_block(sb,
map->s_uspace.s_table,
partition, goal, err);
- else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- block = udf_bitmap_new_block(sb,
- map->s_fspace.s_bitmap,
- partition, goal, err);
- else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- block = udf_table_new_block(sb,
- map->s_fspace.s_table,
- partition, goal, err);
else {
*err = -EIO;
return 0;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6f515651a2c2..8f2f56d9a1bb 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -290,12 +290,8 @@ static void udf_free_partition(struct udf_part_map *map)
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
iput(map->s_uspace.s_table);
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
- iput(map->s_fspace.s_table);
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
udf_sb_free_bitmap(map->s_uspace.s_bitmap);
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
- udf_sb_free_bitmap(map->s_fspace.s_bitmap);
if (map->s_partition_type == UDF_SPARABLE_MAP15)
for (i = 0; i < 4; i++)
brelse(map->s_type_specific.s_sparing.s_spar_map[i]);
@@ -613,14 +609,11 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
struct udf_options uopt;
struct udf_sb_info *sbi = UDF_SB(sb);
int error = 0;
- struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
+
+ if (!(*flags & SB_RDONLY) && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
+ return -EACCES;
sync_filesystem(sb);
- if (lvidiu) {
- int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
- if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY))
- return -EACCES;
- }
uopt.flags = sbi->s_flags;
uopt.uid = sbi->s_uid;
@@ -988,12 +981,62 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
return bitmap;
}
+static int check_partition_desc(struct super_block *sb,
+ struct partitionDesc *p,
+ struct udf_part_map *map)
+{
+ bool umap, utable, fmap, ftable;
+ struct partitionHeaderDesc *phd;
+
+ switch (le32_to_cpu(p->accessType)) {
+ case PD_ACCESS_TYPE_READ_ONLY:
+ case PD_ACCESS_TYPE_WRITE_ONCE:
+ case PD_ACCESS_TYPE_REWRITABLE:
+ case PD_ACCESS_TYPE_NONE:
+ goto force_ro;
+ }
+
+ /* No Partition Header Descriptor? */
+ if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
+ strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+ goto force_ro;
+
+ phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
+ utable = phd->unallocSpaceTable.extLength;
+ umap = phd->unallocSpaceBitmap.extLength;
+ ftable = phd->freedSpaceTable.extLength;
+ fmap = phd->freedSpaceBitmap.extLength;
+
+ /* No allocation info? */
+ if (!utable && !umap && !ftable && !fmap)
+ goto force_ro;
+
+ /* We don't support blocks that require erasing before overwrite */
+ if (ftable || fmap)
+ goto force_ro;
+ /* UDF 2.60: 2.3.3 - no mixing of tables & bitmaps, no VAT. */
+ if (utable && umap)
+ goto force_ro;
+
+ if (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
+ map->s_partition_type == UDF_VIRTUAL_MAP20)
+ goto force_ro;
+
+ return 0;
+force_ro:
+ if (!sb_rdonly(sb))
+ return -EACCES;
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+ return 0;
+}
+
static int udf_fill_partdesc_info(struct super_block *sb,
struct partitionDesc *p, int p_index)
{
struct udf_part_map *map;
struct udf_sb_info *sbi = UDF_SB(sb);
struct partitionHeaderDesc *phd;
+ int err;
map = &sbi->s_partmaps[p_index];
@@ -1013,8 +1056,16 @@ static int udf_fill_partdesc_info(struct super_block *sb,
p_index, map->s_partition_type,
map->s_partition_root, map->s_partition_len);
- if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
- strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+ err = check_partition_desc(sb, p, map);
+ if (err)
+ return err;
+
+ /*
+	 * Skip loading allocation info if we cannot ever write to the fs.
+	 * This matters for correctness: we may have forced a read-only mount
+	 * precisely to avoid allocation info we don't support.
+ */
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
return 0;
phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
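Condensed, udf_fill_partdesc_info() now follows a detect-then-skip pattern (sketch assembled from the two hunks above, details elided):

	err = check_partition_desc(sb, p, map);	/* may set UDF_FLAG_RW_INCOMPAT */
	if (err)
		return err;	/* rw mount of an unsupported layout */
	if (UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
		return 0;	/* ro mount: allocation info never needed */
	/* ...load unallocSpaceTable / unallocSpaceBitmap as before... */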
@@ -1050,40 +1101,6 @@ static int udf_fill_partdesc_info(struct super_block *sb,
p_index, bitmap->s_extPosition);
}
- if (phd->partitionIntegrityTable.extLength)
- udf_debug("partitionIntegrityTable (part %d)\n", p_index);
-
- if (phd->freedSpaceTable.extLength) {
- struct kernel_lb_addr loc = {
- .logicalBlockNum = le32_to_cpu(
- phd->freedSpaceTable.extPosition),
- .partitionReferenceNum = p_index,
- };
- struct inode *inode;
-
- inode = udf_iget_special(sb, &loc);
- if (IS_ERR(inode)) {
- udf_debug("cannot load freedSpaceTable (part %d)\n",
- p_index);
- return PTR_ERR(inode);
- }
- map->s_fspace.s_table = inode;
- map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
- udf_debug("freedSpaceTable (part %d) @ %lu\n",
- p_index, map->s_fspace.s_table->i_ino);
- }
-
- if (phd->freedSpaceBitmap.extLength) {
- struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
- if (!bitmap)
- return -ENOMEM;
- map->s_fspace.s_bitmap = bitmap;
- bitmap->s_extPosition = le32_to_cpu(
- phd->freedSpaceBitmap.extPosition);
- map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
- udf_debug("freedSpaceBitmap (part %d) @ %u\n",
- p_index, bitmap->s_extPosition);
- }
return 0;
}
@@ -1257,6 +1274,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
ret = -EACCES;
goto out_bh;
}
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
ret = udf_load_vat(sb, i, type1_idx);
if (ret < 0)
goto out_bh;
@@ -2155,10 +2173,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
UDF_MAX_READ_VERSION);
ret = -EINVAL;
goto error_out;
- } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION &&
- !sb_rdonly(sb)) {
- ret = -EACCES;
- goto error_out;
+ } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) {
+ if (!sb_rdonly(sb)) {
+ ret = -EACCES;
+ goto error_out;
+ }
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
}
sbi->s_udfrev = minUDFWriteRev;
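This fail-or-flag step now appears in several places (check_partition_desc(), the VAT hunk, and the two udf_fill_super() hunks). Its shared shape could be factored into a helper along these lines (hypothetical illustration only; udf_force_ro() is an invented name, not part of the patch):

	static int udf_force_ro(struct super_block *sb)
	{
		if (!sb_rdonly(sb))
			return -EACCES;	/* refuse the read-write mount */
		/* read-only mount: remember the rw-incompatible feature */
		UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
		return 0;
	}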
@@ -2176,10 +2196,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
}
if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
- UDF_PART_FLAG_READ_ONLY &&
- !sb_rdonly(sb)) {
- ret = -EACCES;
- goto error_out;
+ UDF_PART_FLAG_READ_ONLY) {
+ if (!sb_rdonly(sb)) {
+ ret = -EACCES;
+ goto error_out;
+ }
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
}
if (udf_find_fileset(sb, &fileset, &rootdir)) {
@@ -2433,10 +2455,6 @@ static unsigned int udf_count_free(struct super_block *sb)
accum += udf_count_free_bitmap(sb,
map->s_uspace.s_bitmap);
}
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
- accum += udf_count_free_bitmap(sb,
- map->s_fspace.s_bitmap);
- }
if (accum)
return accum;
@@ -2444,11 +2462,6 @@ static unsigned int udf_count_free(struct super_block *sb)
accum += udf_count_free_table(sb,
map->s_uspace.s_table);
}
- if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
- accum += udf_count_free_table(sb,
- map->s_fspace.s_table);
- }
-
return accum;
}
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 9424d7cab790..3d83be54c474 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,11 +30,11 @@
#define UDF_FLAG_LASTBLOCK_SET 16
#define UDF_FLAG_BLOCKSIZE_SET 17
#define UDF_FLAG_INCONSISTENT 18
+#define UDF_FLAG_RW_INCOMPAT 19	/* Set when we find an RW-incompatible
+					 * feature */
#define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001
#define UDF_PART_FLAG_UNALLOC_TABLE 0x0002
-#define UDF_PART_FLAG_FREED_BITMAP 0x0004
-#define UDF_PART_FLAG_FREED_TABLE 0x0008
#define UDF_PART_FLAG_READ_ONLY 0x0010
#define UDF_PART_FLAG_WRITE_ONCE 0x0020
#define UDF_PART_FLAG_REWRITABLE 0x0040
@@ -50,8 +50,6 @@
#define UDF_INVALID_MODE ((umode_t)-1)
-#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
-
#define MF_DUPLICATE_MD 0x01
#define MF_MIRROR_FE_LOADED 0x02
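Dropping the pragma matters because pack(1) applied to every structure declared after it in this header, including purely in-core ones. A small illustration of the effect (illustrative struct, not from the header):

	struct demo { __u8 a; __u32 b; };

	/* sizeof(struct demo) is 5 under #pragma pack(1) but 8 with natural
	 * alignment; the packed 'b' may sit unaligned, which costs extra
	 * instructions on some architectures for zero on-disk benefit. */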
@@ -93,10 +91,6 @@ struct udf_part_map {
struct udf_bitmap *s_bitmap;
struct inode *s_table;
} s_uspace;
- union {
- struct udf_bitmap *s_bitmap;
- struct inode *s_table;
- } s_fspace;
__u32 s_partition_root;
__u32 s_partition_len;
__u16 s_partition_type;
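With the s_fspace union gone, the in-core partition map carries a single space-management descriptor (abridged sketch of the resulting struct; only the fields visible in this hunk are shown):

	struct udf_part_map {
		union {
			struct udf_bitmap *s_bitmap;
			struct inode *s_table;
		} s_uspace;	/* unallocated space only */
		__u32 s_partition_root;
		__u32 s_partition_len;
		__u16 s_partition_type;
		/* ... remaining fields unchanged ... */
	};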