Diffstat (limited to 'fs')
110 files changed, 3318 insertions, 1865 deletions
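The first 9p hunks below add a locktimeout=%u mount option and plumb it into the blocking-lock retry loop in v9fs_file_do_lock(). As a rough userspace model of the validation and seconds-to-jiffies conversion they implement (HZ, the helper name, and the fallback behaviour are illustrative stand-ins, not the kernel's actual option-parsing API):

    #include <stdio.h>
    #include <stdlib.h>

    #define HZ 100                      /* assumed tick rate, for illustration */
    #define P9_LOCK_TIMEOUT (30 * HZ)   /* default retry interval (30s in 9p) */

    /* Model of the parsing added to v9fs_parse_options(): the value is
     * validated as a positive integer count of seconds, then converted
     * to jiffies once at mount time. */
    static long parse_locktimeout(const char *arg, long fallback)
    {
        char *end;
        long option = strtol(arg, &end, 10);

        if (*end != '\0' || option < 1)
            return fallback;    /* the kernel code returns -EINVAL here */
        return option * HZ;
    }

    int main(void)
    {
        printf("%ld\n", parse_locktimeout("15", P9_LOCK_TIMEOUT)); /* 1500 */
        printf("%ld\n", parse_locktimeout("0", P9_LOCK_TIMEOUT));  /* fallback */
        return 0;
    }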
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 082d227fa56b..6261719f6f2a 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, switch (handler->flags) { case ACL_TYPE_ACCESS: if (acl) { - struct iattr iattr; + struct iattr iattr = { 0 }; struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 89bac3d2f05b..619128b55837 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -61,6 +61,8 @@ enum { Opt_cache_loose, Opt_fscache, Opt_mmap, /* Access options */ Opt_access, Opt_posixacl, + /* Lock timeout option */ + Opt_locktimeout, /* Error token */ Opt_err }; @@ -80,6 +82,7 @@ static const match_table_t tokens = { {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, + {Opt_locktimeout, "locktimeout=%u"}, {Opt_err, NULL} }; @@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = NULL; #endif + v9ses->session_lock_timeout = P9_LOCK_TIMEOUT; if (!opts) return 0; @@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #endif break; + case Opt_locktimeout: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + if (option < 1) { + p9_debug(P9_DEBUG_ERROR, + "locktimeout must be a greater than zero integer.\n"); + ret = -EINVAL; + continue; + } + v9ses->session_lock_timeout = (long)option * HZ; + break; + default: continue; } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 982e017acadb..129e5243a6bf 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -116,6 +116,7 @@ struct v9fs_session_info { struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct rw_semaphore rename_sem; + long session_lock_timeout; /* retry interval for blocking locks */ }; /* cache_validity flags */ diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index b0405d6aac85..cb6c4031af55 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -76,15 +76,6 @@ static inline int dt_type(struct p9_wstat *mistat) return rettype; } -static void p9stat_init(struct p9_wstat *stbuf) -{ - stbuf->name = NULL; - stbuf->uid = NULL; - stbuf->gid = NULL; - stbuf->muid = NULL; - stbuf->extension = NULL; -} - /** * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir * @filp: opened file structure @@ -114,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) int err = 0; struct p9_fid *fid; int buflen; - int reclen = 0; struct p9_rdir *rdir; struct kvec kvec; @@ -145,15 +135,12 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) rdir->tail = n; } while (rdir->head < rdir->tail) { - p9stat_init(&st); err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); - if (err) { + if (err <= 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - p9stat_free(&st); return -EIO; } - reclen = st.size+2; over = !dir_emit(ctx, st.name, strlen(st.name), v9fs_qid2ino(&st.qid), dt_type(&st)); @@ -161,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) if (over) return 0; - rdir->head += reclen; - ctx->pos += reclen; + rdir->head += err; + ctx->pos += err; } } } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 5f2e48d41d72..a25efa782fcc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -154,6 +154,7 @@ 
static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) uint8_t status = P9_LOCK_ERROR; int res = 0; unsigned char fl_type; + struct v9fs_session_info *v9ses; fid = filp->private_data; BUG_ON(fid == NULL); @@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) if (IS_SETLKW(cmd)) flock.flags = P9_LOCK_FLAGS_BLOCK; + v9ses = v9fs_inode2v9ses(file_inode(filp)); + /* * if its a blocked request and we get P9_LOCK_BLOCKED as the status * for lock request, keep on trying @@ -202,8 +205,17 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; - if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + if (schedule_timeout_interruptible(v9ses->session_lock_timeout) + != 0) break; + /* + * p9_client_lock_dotl overwrites flock.client_id with the + * server message, free and reuse the client name + */ + if (flock.client_id != fid->clnt->name) { + kfree(flock.client_id); + flock.client_id = fid->clnt->name; + } } /* map 9p status to VFS status */ @@ -216,7 +228,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; default: WARN_ONCE(1, "unknown lock status code: %d\n", status); - /* fallthough */ + /* fall through */ case P9_LOCK_ERROR: case P9_LOCK_GRACE: res = -ENOLCK; @@ -235,6 +247,8 @@ out_unlock: locks_lock_file_wait(filp, fl); fl->fl_type = fl_type; } + if (flock.client_id != fid->clnt->name) + kfree(flock.client_id); out: return res; } @@ -269,7 +283,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) res = p9_client_getlock_dotl(fid, &glock); if (res < 0) - return res; + goto out; /* map 9p lock type to os lock type */ switch (glock.type) { case P9_LOCK_TYPE_RDLCK: @@ -290,7 +304,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = -glock.proc_id; } - kfree(glock.client_id); +out: + if (glock.client_id != fid->clnt->name) + kfree(glock.client_id); return res; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8703ce68fe9d..2955a4ea2fa8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -437,10 +437,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->i_pages, pg_index); - rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + page = xa_load(&mapping->i_pages, pg_index); + if (page && !xa_is_value(page)) { misses++; if (misses > 4) break; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2ee43b6a4f09..539901fb5165 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1014,9 +1014,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) parent_start = parent->start; + /* + * If we are COWing a node/leaf from the extent, chunk or device trees, + * make sure that we do not finish block group creation of pending block + * groups. We do this to avoid a deadlock. + * COWing can result in allocation of a new chunk, and flushing pending + * block groups (btrfs_create_pending_block_groups()) can be triggered + * when finishing allocation of a new chunk. 
Creation of a pending block + * group modifies the extent, chunk and device trees, therefore we could + * deadlock with ourselves since we are holding a lock on an extent + * buffer that btrfs_create_pending_block_groups() may try to COW later. + */ + if (root == fs_info->extent_root || + root == fs_info->chunk_root || + root == fs_info->dev_root) + trans->can_flush_pending_bgs = false; + cow = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, &disk_key, level, search_start, empty_size); + trans->can_flush_pending_bgs = true; if (IS_ERR(cow)) return PTR_ERR(cow); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 5149165b49a4..9301b3ad9217 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -164,14 +164,27 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, return NULL; } +static struct btrfs_delayed_ref_head *find_first_ref_head( + struct btrfs_delayed_ref_root *dr) +{ + struct rb_node *n; + struct btrfs_delayed_ref_head *entry; + + n = rb_first_cached(&dr->href_root); + if (!n) + return NULL; + + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); + + return entry; +} + /* - * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot. - * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. But if no bigger one is found then the first node of the - * ref head tree will be returned. + * Find a head entry based on bytenr. This returns the delayed ref head if it + * was able to find one, or NULL if nothing was in that spot. If return_bigger + * is given, the next bigger entry is returned if no exact match is found. */ -static struct btrfs_delayed_ref_head* find_ref_head( +static struct btrfs_delayed_ref_head *find_ref_head( struct btrfs_delayed_ref_root *dr, u64 bytenr, bool return_bigger) { @@ -195,10 +208,9 @@ static struct btrfs_delayed_ref_head* find_ref_head( if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) - n = rb_first_cached(&dr->href_root); + return NULL; entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - return entry; } return entry; } @@ -355,33 +367,25 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; - u64 start; - bool loop = false; again: - start = delayed_refs->run_delayed_start; - head = find_ref_head(delayed_refs, start, true); - if (!head && !loop) { + head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, + true); + if (!head && delayed_refs->run_delayed_start != 0) { delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; - head = find_ref_head(delayed_refs, start, true); - if (!head) - return NULL; - } else if (!head && loop) { - return NULL; + head = find_first_ref_head(delayed_refs); } + if (!head) + return NULL; while (head->processing) { struct rb_node *node; node = rb_next(&head->href_node); if (!node) { - if (loop) + if (delayed_refs->run_delayed_start == 0) return NULL; delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; goto again; } head = rb_entry(node, struct btrfs_delayed_ref_head, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a4cd0221bc8d..a1febf155747 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2366,6 +2366,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, insert_reserved); else BUG(); + if (ret && insert_reserved) + 
btrfs_pin_extent(trans->fs_info, node->bytenr, + node->num_bytes, 1); return ret; } @@ -2954,7 +2957,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; - bool can_flush_pending_bgs = trans->can_flush_pending_bgs; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2971,7 +2973,6 @@ again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, count); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -3002,7 +3003,6 @@ again: goto again; } out: - trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -4568,6 +4568,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, goto out; } else { ret = 1; + space_info->max_extent_size = 0; } space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; @@ -4589,11 +4590,9 @@ out: * the block groups that were made dirty during the lifetime of the * transaction. */ - if (trans->can_flush_pending_bgs && - trans->chunk_bytes_reserved >= (u64)SZ_2M) { + if (trans->chunk_bytes_reserved >= (u64)SZ_2M) btrfs_create_pending_block_groups(trans); - btrfs_trans_release_chunk_metadata(trans); - } + return ret; } @@ -6464,6 +6463,7 @@ static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; if (delalloc) cache->delalloc_bytes -= num_bytes; @@ -7260,6 +7260,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; u64 max_extent_size = 0; + u64 max_free_space = 0; u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; @@ -7555,8 +7556,8 @@ unclustered_alloc: spin_lock(&ctl->tree_lock); if (ctl->free_space < num_bytes + empty_cluster + empty_size) { - if (ctl->free_space > max_extent_size) - max_extent_size = ctl->free_space; + max_free_space = max(max_free_space, + ctl->free_space); spin_unlock(&ctl->tree_lock); goto loop; } @@ -7723,6 +7724,8 @@ loop: } out: if (ret == -ENOSPC) { + if (!max_extent_size) + max_extent_size = max_free_space; spin_lock(&space_info->lock); space_info->max_extent_size = max_extent_size; spin_unlock(&space_info->lock); @@ -8004,21 +8007,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); - if (!path) { - btrfs_free_and_pin_reserved_extent(fs_info, - extent_key.objectid, - fs_info->nodesize); + if (!path) return -ENOMEM; - } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, &extent_key, size); if (ret) { btrfs_free_path(path); - btrfs_free_and_pin_reserved_extent(fs_info, - extent_key.objectid, - fs_info->nodesize); return ret; } @@ -10132,9 +10128,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) struct btrfs_block_group_item item; struct btrfs_key key; int ret = 0; - bool can_flush_pending_bgs = trans->can_flush_pending_bgs; - trans->can_flush_pending_bgs = false; + if (!trans->can_flush_pending_bgs) + return; + while (!list_empty(&trans->new_bgs)) { block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group_cache, @@ -10159,7 +10156,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) next: list_del_init(&block_group->bg_list); 
} - trans->can_flush_pending_bgs = can_flush_pending_bgs; + btrfs_trans_release_chunk_metadata(trans); } int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6877a74c7469..d228f706ff3e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3784,7 +3784,7 @@ int btree_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { @@ -3909,7 +3909,7 @@ static int extent_write_cache_pages(struct address_space *mapping, pgoff_t done_index; int range_whole = 0; int scanned = 0; - int tag; + xa_mark_t tag; /* * We have to hold onto the inode so that ordered extents can do their @@ -5159,11 +5159,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) clear_page_dirty_for_io(page); xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->i_pages, - page_index(page), - PAGECACHE_TAG_DIRTY); - } + if (!PageDirty(page)) + __xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15b925142793..97c7a086f7bd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2078,6 +2078,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; inode_lock(inode); + + /* + * We take the dio_sem here because the tree log stuff can race with + * lockless dio writes and get an extent map logged for an extent we + * never waited on. We need it this high up for lockdep reasons. + */ + down_write(&BTRFS_I(inode)->dio_sem); + atomic_inc(&root->log_batch); /* @@ -2086,6 +2094,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2109,6 +2118,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * checked called fsync. */ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2127,6 +2137,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2148,6 +2159,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. 
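The btrfs_sync_file() hunks here move the BTRFS_I(inode)->dio_sem acquisition to the top of fsync, so every early-return path has to pair it with up_write() before unlocking the inode. A compilable model of that pairing discipline, using a pthread rwlock in place of a kernel rw_semaphore (the function and error value are illustrative only):

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t dio_sem = PTHREAD_RWLOCK_INITIALIZER;

    /* Model: take the write lock up front (as the patch does with dio_sem
     * "this high up for lockdep reasons") and release it on every exit
     * path, including early errors. */
    static int fsync_model(int fail_early)
    {
        int ret = 0;

        pthread_rwlock_wrlock(&dio_sem);

        if (fail_early) {
            ret = -EIO;
            goto out;           /* early error still releases the lock */
        }

        /* ... ordered-extent wait and tree-log work would go here ... */

    out:
        pthread_rwlock_unlock(&dio_sem);
        return ret;
    }

    int main(void)
    {
        printf("%d %d\n", fsync_model(0), fsync_model(1));
        return 0;
    }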
*/ + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); /* diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 67441219d6c9..4ba0aedc878b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1772,6 +1772,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, return -1; } +static inline u64 get_max_extent_size(struct btrfs_free_space *entry) +{ + if (entry->bitmap) + return entry->max_extent_size; + return entry->bytes; +} + /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, @@ -1793,8 +1800,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, for (node = &entry->offset_index; node; node = rb_next(node)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bytes < *bytes) { - if (entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); continue; } @@ -1812,8 +1819,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bytes < *bytes + align_off) { - if (entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); continue; } @@ -1825,8 +1832,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, *offset = tmp; *bytes = size; return entry; - } else if (size > *max_extent_size) { - *max_extent_size = size; + } else { + *max_extent_size = + max(get_max_extent_size(entry), + *max_extent_size); } continue; } @@ -2449,6 +2458,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, struct rb_node *n; int count = 0; + spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) @@ -2457,6 +2467,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } + spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? 
"no" : "yes"); btrfs_info(fs_info, @@ -2685,8 +2696,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { - if (search_bytes > *max_extent_size) - *max_extent_size = search_bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); return 0; } @@ -2723,8 +2734,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); while (1) { - if (entry->bytes < bytes && entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + if (entry->bytes < bytes) + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); if (entry->bytes < bytes || (!entry->bitmap && entry->offset < min_start)) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 181c58b23110..d3df5b52278c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -502,6 +502,7 @@ again: pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* just bail out to the uncompressed code */ + nr_pages = 0; goto cont; } @@ -2940,6 +2941,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) bool truncated = false; bool range_locked = false; bool clear_new_delalloc_bytes = false; + bool clear_reserved_extent = true; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && @@ -3043,10 +3045,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); - if (!ret) + if (!ret) { + clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, ordered_extent->start, ordered_extent->disk_len); + } } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, ordered_extent->len, @@ -3107,8 +3111,13 @@ out: * wrong we need to return the space for this ordered extent * back to the allocator. We only free the extent in the * truncated case if we didn't write out the extent at all. + * + * If we made it past insert_reserved_file_extent before we + * errored out then we don't need to do this as the accounting + * has already been done. */ if ((ret || !logical_len) && + clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) btrfs_free_reserved_extent(fs_info, @@ -5259,11 +5268,13 @@ static void evict_inode_truncate_pages(struct inode *inode) struct extent_state *cached_state = NULL; u64 start; u64 end; + unsigned state_flags; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); start = state->start; end = state->end; + state_flags = state->state; spin_unlock(&io_tree->lock); lock_extent_bits(io_tree, start, end, &cached_state); @@ -5276,7 +5287,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * * Note, end is the bytenr of last byte, so we need + 1 here. 
*/ - if (state->state & EXTENT_DELALLOC) + if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5686290a50e1..d1eeef9ec5da 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2283,15 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); - /* - * If fs has been frozen, we can not handle delayed iputs, otherwise - * it'll result in deadlock about SB_FREEZE_FS. - */ - if (current != fs_info->transaction_kthread && - current != fs_info->cleaner_kthread && - !test_bit(BTRFS_FS_FROZEN, &fs_info->flags)) - btrfs_run_delayed_iputs(fs_info); - return ret; scrub_continue: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0dba09334a16..e07f3376b7df 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4390,7 +4390,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); - down_write(&inode->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; logged_start = start; @@ -4456,7 +4455,6 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - up_write(&inode->dio_sem); btrfs_release_path(path); if (!ret) @@ -4652,7 +4650,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, ASSERT(len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE)); + BTRFS_COMPRESS_NONE) || + (len < i_size && i_size < fs_info->sectorsize)); return 0; } diff --git a/fs/buffer.c b/fs/buffer.c index 109f55196866..d60d61e8ed7d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -562,7 +562,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) EXPORT_SYMBOL(mark_buffer_dirty_inode); /* - * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * Mark the page dirty, and set it dirty in the page cache, and mark the inode * dirty. * * If warn is true, then emit a warning if the page is not uptodate and has @@ -579,8 +579,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping, if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); + __xa_set_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -1050,7 +1050,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page is tagged dirty in its radix tree. + * the page is tagged dirty in the page cache. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. 
If the page has buffers, the page dirty bit is @@ -1073,9 +1073,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * - * mark_buffer_dirty() will set the dirty bit against the buffer, then set its - * backing page dirty, then tag the page as dirty in its address_space's radix - * tree and then attach the address_space's inode to its superblock's dirty + * mark_buffer_dirty() will set the dirty bit against the buffer, then set + * its backing page dirty, then tag the page as dirty in the page cache + * and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 027408d55aee..5f0103f40079 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -104,6 +104,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct timespec64 old_ctime = inode->i_ctime; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto out; + } + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; @@ -138,11 +143,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) goto out_free; } - if (ceph_snap(inode) != CEPH_NOSNAP) { - ret = -EROFS; - goto out_free; - } - if (new_mode != old_mode) { newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; @@ -206,10 +206,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); if (!tmp_buf) goto out_err; - pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL); + pagelist = ceph_pagelist_alloc(GFP_KERNEL); if (!pagelist) goto out_err; - ceph_pagelist_init(pagelist); err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); if (err) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9c332a6f6667..8eade7a993c1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -322,7 +322,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got); + ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); if (ret < 0) { dout("start_read %p, error getting cap\n", inode); } else if (!(got & want)) { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index dd7dfdd2ba13..f3496db4bb3e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -519,9 +519,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, - struct ceph_inode_info *ci) + struct ceph_inode_info *ci, + bool set_timeout) { - __cap_set_timeouts(mdsc, ci); dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { @@ -531,6 +531,8 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, goto no_change; list_del_init(&ci->i_cap_delay_list); } + if (set_timeout) + __cap_set_timeouts(mdsc, ci); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); no_change: spin_unlock(&mdsc->cap_delay_lock); @@ -720,7 +722,7 @@ void ceph_add_cap(struct inode *inode, dout(" issued %s, mds wanted %s, actual %s, queueing\n", ceph_cap_string(issued), ceph_cap_string(wanted), ceph_cap_string(actual_wanted)); - 
__cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(mdsc, ci, true); } if (flags & CEPH_CAP_FLAG_AUTH) { @@ -1647,7 +1649,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; - __cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(mdsc, ci, true); return dirty; } @@ -2065,7 +2067,7 @@ ack: /* Reschedule delayed caps release if we delayed anything */ if (delayed) - __cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(mdsc, ci, false); spin_unlock(&ci->i_ceph_lock); @@ -2125,7 +2127,7 @@ retry: if (delayed) { spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(mdsc, ci, true); spin_unlock(&ci->i_ceph_lock); } } else { @@ -2671,17 +2673,18 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) +int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, + bool nonblock, int *got) { int ret, err = 0; BUG_ON(need & ~CEPH_CAP_FILE_RD); - BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); ret = ceph_pool_perm_check(ci, need); if (ret < 0) return ret; - ret = try_get_cap_refs(ci, need, want, 0, true, got, &err); + ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err); if (ret) { if (err == -EAGAIN) { ret = 0; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 92ab20433682..f788496fafcc 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> +#include <linux/ceph/striper.h> #include <linux/module.h> #include <linux/sched.h> @@ -557,90 +558,26 @@ enum { }; /* - * Read a range of bytes striped over one or more objects. Iterate over - * objects we stripe over. (That's not atomic, but good enough for now.) - * - * If we get a short result from the OSD, check against i_size; we need to - * only return a short read to the caller if we hit EOF. - */ -static int striped_read(struct inode *inode, - u64 pos, u64 len, - struct page **pages, int num_pages, - int page_align, int *checkeof) -{ - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_inode_info *ci = ceph_inode(inode); - u64 this_len; - loff_t i_size; - int page_idx; - int ret, read = 0; - bool hit_stripe, was_short; - - /* - * we may need to do multiple reads. not atomic, unfortunately. - */ -more: - this_len = len; - page_idx = (page_align + read) >> PAGE_SHIFT; - ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), - &ci->i_layout, pos, &this_len, - ci->i_truncate_seq, ci->i_truncate_size, - pages + page_idx, num_pages - page_idx, - ((page_align + read) & ~PAGE_MASK)); - if (ret == -ENOENT) - ret = 0; - hit_stripe = this_len < len; - was_short = ret >= 0 && ret < this_len; - dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read, - ret, hit_stripe ? " HITSTRIPE" : "", was_short ? 
" SHORT" : ""); - - i_size = i_size_read(inode); - if (ret >= 0) { - if (was_short && (pos + ret < i_size)) { - int zlen = min(this_len - ret, i_size - pos - ret); - int zoff = page_align + read + ret; - dout(" zero gap %llu to %llu\n", - pos + ret, pos + ret + zlen); - ceph_zero_page_vector_range(zoff, zlen, pages); - ret += zlen; - } - - read += ret; - pos += ret; - len -= ret; - - /* hit stripe and need continue*/ - if (len && hit_stripe && pos < i_size) - goto more; - } - - if (read > 0) { - ret = read; - /* did we bounce off eof? */ - if (pos + len > i_size) - *checkeof = CHECK_EOF; - } - - dout("striped_read returns %d\n", ret); - return ret; -} - -/* * Completely synchronous read and write methods. Direct from __user * buffer to osd, or directly to user pages (if O_DIRECT). * - * If the read spans object boundary, just do multiple reads. + * If the read spans object boundary, just do multiple reads. (That's not + * atomic, but good enough for now.) + * + * If we get a short result from the OSD, check against i_size; we need to + * only return a short read to the caller if we hit EOF. */ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, - int *checkeof) + int *retry_op) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct page **pages; - u64 off = iocb->ki_pos; - int num_pages; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_client *osdc = &fsc->client->osdc; ssize_t ret; - size_t len = iov_iter_count(to); + u64 off = iocb->ki_pos; + u64 len = iov_iter_count(to); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); @@ -653,61 +590,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, * but it will at least behave sensibly when they are * in sequence. 
*/ - ret = filemap_write_and_wait_range(inode->i_mapping, off, - off + len); + ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len); if (ret < 0) return ret; - if (unlikely(to->type & ITER_PIPE)) { + ret = 0; + while ((len = iov_iter_count(to)) > 0) { + struct ceph_osd_request *req; + struct page **pages; + int num_pages; size_t page_off; - ret = iov_iter_get_pages_alloc(to, &pages, len, - &page_off); - if (ret <= 0) - return -ENOMEM; - num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); + u64 i_size; + bool more; + + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, off, &len, 0, 1, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + more = len < iov_iter_count(to); - ret = striped_read(inode, off, ret, pages, num_pages, - page_off, checkeof); - if (ret > 0) { - iov_iter_advance(to, ret); - off += ret; + if (unlikely(to->type & ITER_PIPE)) { + ret = iov_iter_get_pages_alloc(to, &pages, len, + &page_off); + if (ret <= 0) { + ceph_osdc_put_request(req); + ret = -ENOMEM; + break; + } + num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); + if (ret < len) { + len = ret; + osd_req_op_extent_update(req, 0, len); + more = false; + } } else { - iov_iter_advance(to, 0); + num_pages = calc_pages_for(off, len); + page_off = off & ~PAGE_MASK; + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) { + ceph_osdc_put_request(req); + ret = PTR_ERR(pages); + break; + } } - ceph_put_page_vector(pages, num_pages, false); - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = striped_read(inode, off, len, pages, num_pages, - (off & ~PAGE_MASK), checkeof); - if (ret > 0) { - int l, k = 0; - size_t left = ret; - - while (left) { - size_t page_off = off & ~PAGE_MASK; - size_t copy = min_t(size_t, left, - PAGE_SIZE - page_off); - l = copy_page_to_iter(pages[k++], page_off, - copy, to); - off += l; - left -= l; - if (l < copy) + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, + false, false); + ret = ceph_osdc_start_request(osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(osdc, req); + ceph_osdc_put_request(req); + + i_size = i_size_read(inode); + dout("sync_read %llu~%llu got %zd i_size %llu%s\n", + off, len, ret, i_size, (more ? " MORE" : "")); + + if (ret == -ENOENT) + ret = 0; + if (ret >= 0 && ret < len && (off + ret < i_size)) { + int zlen = min(len - ret, i_size - off - ret); + int zoff = page_off + ret; + dout("sync_read zero gap %llu~%llu\n", + off + ret, off + ret + zlen); + ceph_zero_page_vector_range(zoff, zlen, pages); + ret += zlen; + } + + if (unlikely(to->type & ITER_PIPE)) { + if (ret > 0) { + iov_iter_advance(to, ret); + off += ret; + } else { + iov_iter_advance(to, 0); + } + ceph_put_page_vector(pages, num_pages, false); + } else { + int idx = 0; + size_t left = ret > 0 ? 
ret : 0; + while (left > 0) { + size_t len, copied; + page_off = off & ~PAGE_MASK; + len = min_t(size_t, left, PAGE_SIZE - page_off); + copied = copy_page_to_iter(pages[idx++], + page_off, len, to); + off += copied; + left -= copied; + if (copied < len) { + ret = -EFAULT; break; + } } + ceph_release_page_vector(pages, num_pages); } - ceph_release_page_vector(pages, num_pages); + + if (ret <= 0 || off >= i_size || !more) + break; } if (off > iocb->ki_pos) { + if (ret >= 0 && + iov_iter_count(to) > 0 && off >= i_size_read(inode)) + *retry_op = CHECK_EOF; ret = off - iocb->ki_pos; iocb->ki_pos = off; } - dout("sync_read result %zd\n", ret); + dout("sync_read result %zd retry_op %d\n", ret, *retry_op); return ret; } @@ -865,7 +859,7 @@ static void ceph_aio_retry_work(struct work_struct *work) } spin_unlock(&ci->i_ceph_lock); - req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1, false, GFP_NOFS); if (!req) { ret = -ENOMEM; @@ -877,6 +871,11 @@ static void ceph_aio_retry_work(struct work_struct *work) ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); + req->r_ops[0] = orig_req->r_ops[0]; + + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); if (ret) { ceph_osdc_put_request(req); @@ -884,11 +883,6 @@ static void ceph_aio_retry_work(struct work_struct *work) goto out; } - req->r_ops[0] = orig_req->r_ops[0]; - - req->r_mtime = aio_req->mtime; - req->r_data_offset = req->r_ops[0].extent.offset; - ceph_osdc_put_request(orig_req); req->r_callback = ceph_aio_complete_req; @@ -1735,7 +1729,6 @@ static long ceph_fallocate(struct file *file, int mode, struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_cap_flush *prealloc_cf; int want, got = 0; int dirty; @@ -1743,10 +1736,7 @@ static long ceph_fallocate(struct file *file, int mode, loff_t endoff = 0; loff_t size; - if ((offset + length) > max(i_size_read(inode), fsc->max_file_size)) - return -EFBIG; - - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (!S_ISREG(inode->i_mode)) @@ -1763,18 +1753,6 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) && - ceph_quota_is_max_bytes_exceeded(inode, offset + length)) { - ret = -EDQUOT; - goto unlock; - } - - if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL) && - !(mode & FALLOC_FL_PUNCH_HOLE)) { - ret = -ENOSPC; - goto unlock; - } - if (ci->i_inline_version != CEPH_INLINE_NONE) { ret = ceph_uninline_data(file, NULL); if (ret < 0) @@ -1782,12 +1760,12 @@ static long ceph_fallocate(struct file *file, int mode, } size = i_size_read(inode); - if (!(mode & FALLOC_FL_KEEP_SIZE)) { - endoff = offset + length; - ret = inode_newsize_ok(inode, endoff); - if (ret) - goto unlock; - } + + /* Are we punching a hole beyond EOF? 
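ceph_fallocate() is reduced in the hunks here to pure hole punching (the mode must be exactly FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE), and the punched range is clamped to the current file size. A sketch of that clamping logic as a standalone helper (the function name is assumed, not from the patch):

    #include <stdio.h>

    typedef long long loff_t;

    /* Clamp a punch-hole request to [0, size): punching beyond EOF is a
     * no-op, and a range crossing EOF is shortened, mirroring the hunk
     * that follows. Returns the adjusted length; 0 means nothing to do. */
    static loff_t clamp_punch(loff_t offset, loff_t length, loff_t size)
    {
        if (offset >= size)
            return 0;
        if (offset + length > size)
            length = size - offset;
        return length;
    }

    int main(void)
    {
        printf("%lld\n", clamp_punch(100, 50, 120));  /* 20 */
        printf("%lld\n", clamp_punch(200, 50, 120));  /* 0: beyond EOF */
        return 0;
    }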
*/ + if (offset >= size) + goto unlock; + if ((offset + length) > size) + length = size - offset; if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; @@ -1798,16 +1776,8 @@ static long ceph_fallocate(struct file *file, int mode, if (ret < 0) goto unlock; - if (mode & FALLOC_FL_PUNCH_HOLE) { - if (offset < size) - ceph_zero_pagecache_range(inode, offset, length); - ret = ceph_zero_objects(inode, offset, length); - } else if (endoff > size) { - truncate_pagecache_range(inode, size, -1); - if (ceph_inode_set_size(inode, endoff)) - ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, NULL); - } + ceph_zero_pagecache_range(inode, offset, length); + ret = ceph_zero_objects(inode, offset, length); if (!ret) { spin_lock(&ci->i_ceph_lock); @@ -1817,9 +1787,6 @@ static long ceph_fallocate(struct file *file, int mode, spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); - if ((endoff > size) && - ceph_quota_is_max_bytes_approaching(inode, endoff)) - ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); } ceph_put_cap_refs(ci, got); @@ -1829,6 +1796,300 @@ unlock: return ret; } +/* + * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for + * src_ci. Two attempts are made to obtain both caps, and an error is return if + * this fails; zero is returned on success. + */ +static int get_rd_wr_caps(struct ceph_inode_info *src_ci, + loff_t src_endoff, int *src_got, + struct ceph_inode_info *dst_ci, + loff_t dst_endoff, int *dst_got) +{ + int ret = 0; + bool retrying = false; + +retry_caps: + ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + dst_endoff, dst_got, NULL); + if (ret < 0) + return ret; + + /* + * Since we're already holding the FILE_WR capability for the dst file, + * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some + * retry dance instead to try to get both capabilities. + */ + ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + false, src_got); + if (ret <= 0) { + /* Start by dropping dst_ci caps and getting src_ci caps */ + ceph_put_cap_refs(dst_ci, *dst_got); + if (retrying) { + if (!ret) + /* ceph_try_get_caps masks EAGAIN */ + ret = -EAGAIN; + return ret; + } + ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, src_endoff, + src_got, NULL); + if (ret < 0) + return ret; + /*... drop src_ci caps too, and retry */ + ceph_put_cap_refs(src_ci, *src_got); + retrying = true; + goto retry_caps; + } + return ret; +} + +static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, + struct ceph_inode_info *dst_ci, int dst_got) +{ + ceph_put_cap_refs(src_ci, src_got); + ceph_put_cap_refs(dst_ci, dst_got); +} + +/* + * This function does several size-related checks, returning an error if: + * - source file is smaller than off+len + * - destination file size is not OK (inode_newsize_ok()) + * - max bytes quotas is exceeded + */ +static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, + loff_t src_off, loff_t dst_off, size_t len) +{ + loff_t size, endoff; + + size = i_size_read(src_inode); + /* + * Don't copy beyond source file EOF. Instead of simply setting length + * to (size - src_off), just drop to VFS default implementation, as the + * local i_size may be stale due to other clients writing to the source + * inode. 
+ */ + if (src_off + len > size) { + dout("Copy beyond EOF (%llu + %zu > %llu)\n", + src_off, len, size); + return -EOPNOTSUPP; + } + size = i_size_read(dst_inode); + + endoff = dst_off + len; + if (inode_newsize_ok(dst_inode, endoff)) + return -EOPNOTSUPP; + + if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) + return -EDQUOT; + + return 0; +} + +static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + struct ceph_inode_info *src_ci = ceph_inode(src_inode); + struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); + struct ceph_cap_flush *prealloc_cf; + struct ceph_object_locator src_oloc, dst_oloc; + struct ceph_object_id src_oid, dst_oid; + loff_t endoff = 0, size; + ssize_t ret = -EIO; + u64 src_objnum, dst_objnum, src_objoff, dst_objoff; + u32 src_objlen, dst_objlen, object_size; + int src_got = 0, dst_got = 0, err, dirty; + bool do_final_copy = false; + + if (src_inode == dst_inode) + return -EINVAL; + if (ceph_snap(dst_inode) != CEPH_NOSNAP) + return -EROFS; + + /* + * Some of the checks below will return -EOPNOTSUPP, which will force a + * fallback to the default VFS copy_file_range implementation. This is + * desirable in several cases (for ex, the 'len' is smaller than the + * size of the objects, or in cases where that would be more + * efficient). + */ + + if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) + return -EOPNOTSUPP; + + if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || + (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) || + (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) + return -EOPNOTSUPP; + + if (len < src_ci->i_layout.object_size) + return -EOPNOTSUPP; /* no remote copy will be done */ + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + /* Start by sync'ing the source file */ + ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); + if (ret < 0) + goto out; + + /* + * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other + * clients may have dirty data in their caches. And OSDs know nothing + * about caps, so they can't safely do the remote object copies. 
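The get_rd_wr_caps() helper above avoids an ABBA deadlock with a retry dance: take the dst caps blocking, try the src caps without blocking, and on failure back out completely, wait once for the src caps, then retry the pair at most one more time. A compilable model of that pattern with two pthread mutexes standing in for the two cap sets (all names here are illustrative):

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t dst_caps = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t src_caps = PTHREAD_MUTEX_INITIALIZER;

    /* Model: blocking acquire of the first resource, non-blocking try of
     * the second; on contention, drop everything, wait for the second
     * once, then retry the whole pair exactly once. */
    static int get_both(void)
    {
        int retrying = 0;

    retry:
        pthread_mutex_lock(&dst_caps);
        if (pthread_mutex_trylock(&src_caps) != 0) {
            pthread_mutex_unlock(&dst_caps);    /* back out fully */
            if (retrying)
                return -EAGAIN;
            pthread_mutex_lock(&src_caps);      /* wait until available... */
            pthread_mutex_unlock(&src_caps);    /* ...then drop and retry */
            retrying = 1;
            goto retry;
        }
        return 0;                               /* holds both on success */
    }

    int main(void)
    {
        if (get_both() == 0) {
            pthread_mutex_unlock(&src_caps);
            pthread_mutex_unlock(&dst_caps);
            puts("got both");
        }
        return 0;
    }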
+ */ + err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, + dst_ci, (dst_off + len), &dst_got); + if (err < 0) { + dout("get_rd_wr_caps returned %d\n", err); + ret = -EOPNOTSUPP; + goto out; + } + + ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); + if (ret < 0) + goto out_caps; + + size = i_size_read(dst_inode); + endoff = dst_off + len; + + /* Drop dst file cached pages */ + ret = invalidate_inode_pages2_range(dst_inode->i_mapping, + dst_off >> PAGE_SHIFT, + endoff >> PAGE_SHIFT); + if (ret < 0) { + dout("Failed to invalidate inode pages (%zd)\n", ret); + ret = 0; /* XXX */ + } + src_oloc.pool = src_ci->i_layout.pool_id; + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); + dst_oloc.pool = dst_ci->i_layout.pool_id; + dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); + + ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, + src_ci->i_layout.object_size, + &src_objnum, &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, + dst_ci->i_layout.object_size, + &dst_objnum, &dst_objoff, &dst_objlen); + /* object-level offsets need to the same */ + if (src_objoff != dst_objoff) { + ret = -EOPNOTSUPP; + goto out_caps; + } + + /* + * Do a manual copy if the object offset isn't object aligned. + * 'src_objlen' contains the bytes left until the end of the object, + * starting at the src_off + */ + if (src_objoff) { + /* + * we need to temporarily drop all caps as we'll be calling + * {read,write}_iter, which will get caps again. + */ + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + ret = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, src_objlen, flags); + if (ret < 0) { + dout("do_splice_direct returned %d\n", err); + goto out; + } + len -= ret; + err = get_rd_wr_caps(src_ci, (src_off + len), + &src_got, dst_ci, + (dst_off + len), &dst_got); + if (err < 0) + goto out; + err = is_file_size_ok(src_inode, dst_inode, + src_off, dst_off, len); + if (err < 0) + goto out_caps; + } + object_size = src_ci->i_layout.object_size; + while (len >= object_size) { + ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, + object_size, &src_objnum, + &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, + object_size, &dst_objnum, + &dst_objoff, &dst_objlen); + ceph_oid_init(&src_oid); + ceph_oid_printf(&src_oid, "%llx.%08llx", + src_ci->i_vino.ino, src_objnum); + ceph_oid_init(&dst_oid); + ceph_oid_printf(&dst_oid, "%llx.%08llx", + dst_ci->i_vino.ino, dst_objnum); + /* Do an object remote copy */ + err = ceph_osdc_copy_from( + &ceph_inode_to_client(src_inode)->client->osdc, + src_ci->i_vino.snap, 0, + &src_oid, &src_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, + &dst_oid, &dst_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0); + if (err) { + dout("ceph_osdc_copy_from returned %d\n", err); + if (!ret) + ret = err; + goto out_caps; + } + len -= object_size; + src_off += object_size; + dst_off += object_size; + ret += object_size; + } + + if (len) + /* We still need one final local copy */ + do_final_copy = true; + + file_update_time(dst_file); + if (endoff > size) { + int caps_flags = 0; + + /* Let the MDS know about dst file size change */ + if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff)) + caps_flags |= CHECK_CAPS_NODELAY; + if (ceph_inode_set_size(dst_inode, endoff)) + caps_flags |= CHECK_CAPS_AUTHONLY; + if (caps_flags) + ceph_check_caps(dst_ci, caps_flags, NULL); + } + /* Mark Fw 
dirty */ + spin_lock(&dst_ci->i_ceph_lock); + dst_ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&dst_ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(dst_inode, dirty); + +out_caps: + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + + if (do_final_copy) { + err = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, len, flags); + if (err < 0) { + dout("do_splice_direct returned %d\n", err); + goto out; + } + len -= err; + ret += err; + } + +out: + ceph_free_cap_flush(prealloc_cf); + + return ret; +} + const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, @@ -1844,5 +2105,5 @@ const struct file_operations ceph_file_fops = { .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, .fallocate = ceph_fallocate, + .copy_file_range = ceph_copy_file_range, }; - diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ebc7bdaed2d0..79dd5e6ed755 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1132,8 +1132,12 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) if (IS_ERR(realdn)) { pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", PTR_ERR(realdn), dn, in, ceph_vinop(in)); - dput(dn); - dn = realdn; /* note realdn contains the error */ + dn = realdn; + /* + * Caller should release 'dn' in the case of error. + * If 'req->r_dentry' is passed to this function, + * caller should leave 'req->r_dentry' untouched. + */ goto out; } else if (realdn) { dout("dn %p (%d) spliced with %p (%d) " @@ -1196,7 +1200,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) WARN_ON_ONCE(1); } - if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { struct qstr dname; struct dentry *dn, *parent; @@ -1677,7 +1683,6 @@ retry_lookup: if (IS_ERR(realdn)) { err = PTR_ERR(realdn); d_drop(dn); - dn = NULL; goto next_item; } dn = realdn; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bc43c822426a..67a9aeb2f4ec 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2071,7 +2071,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); + msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; @@ -2136,7 +2136,6 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_pagelist) { struct ceph_pagelist *pagelist = req->r_pagelist; - refcount_inc(&pagelist->refcnt); ceph_msg_data_add_pagelist(msg, pagelist); msg->hdr.data_len = cpu_to_le32(pagelist->length); } else { @@ -3126,12 +3125,11 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, pr_info("mds%d reconnect start\n", mds); - pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) goto fail_nopagelist; - ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); + reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); if (!reply) goto fail_nomsg; @@ -3241,6 +3239,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); up_read(&mdsc->snap_rwsem); + ceph_pagelist_release(pagelist); return; 
fail: diff --git a/fs/ceph/super.c b/fs/ceph/super.c index eab1359d0553..b5ecd6f50360 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -165,6 +165,8 @@ enum { Opt_noacl, Opt_quotadf, Opt_noquotadf, + Opt_copyfrom, + Opt_nocopyfrom, }; static match_table_t fsopt_tokens = { @@ -203,6 +205,8 @@ static match_table_t fsopt_tokens = { {Opt_noacl, "noacl"}, {Opt_quotadf, "quotadf"}, {Opt_noquotadf, "noquotadf"}, + {Opt_copyfrom, "copyfrom"}, + {Opt_nocopyfrom, "nocopyfrom"}, {-1, NULL} }; @@ -355,6 +359,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noquotadf: fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; break; + case Opt_copyfrom: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; + break; + case Opt_nocopyfrom: + fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; + break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: fsopt->sb_flags |= SB_POSIXACL; @@ -553,6 +563,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif + if (fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) + seq_puts(m, ",nocopyfrom"); + if (fsopt->mds_namespace) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 582e28fd1b7b..c005a5400f2e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -40,6 +40,7 @@ #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ +#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE @@ -1008,7 +1009,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page); extern int ceph_try_get_caps(struct ceph_inode_info *ci, - int need, int want, int *got); + int need, int want, bool nonblock, int *got); /* for counting open files by mode */ extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5cc8b94f8206..316f6ad10644 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -951,11 +951,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, if (size > 0) { /* copy value into pagelist */ - pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) return -ENOMEM; - ceph_pagelist_init(pagelist); err = ceph_pagelist_append(pagelist, value, size); if (err) goto out; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index ce2cc2169040..6e30949d9f77 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -64,11 +64,6 @@ #include <linux/hiddev.h> -#define __DVB_CORE__ -#include <linux/dvb/audio.h> -#include <linux/dvb/dmx.h> -#include <linux/dvb/frontend.h> -#include <linux/dvb/video.h> #include <linux/sort.h> @@ -95,71 +90,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return vfs_ioctl(file, cmd, arg); } -struct compat_video_event { - int32_t type; - compat_time_t timestamp; - union { - video_size_t size; - unsigned int frame_rate; - } u; -}; - -static int do_video_get_event(struct file *file, - unsigned int cmd, struct compat_video_event __user *up) -{ - struct video_event __user *kevent = - compat_alloc_user_space(sizeof(*kevent)); - int err; - - if (kevent == NULL) - return -EFAULT; - - err = do_ioctl(file, cmd, 
(unsigned long)kevent); - if (!err) { - err = convert_in_user(&kevent->type, &up->type); - err |= convert_in_user(&kevent->timestamp, &up->timestamp); - err |= convert_in_user(&kevent->u.size.w, &up->u.size.w); - err |= convert_in_user(&kevent->u.size.h, &up->u.size.h); - err |= convert_in_user(&kevent->u.size.aspect_ratio, - &up->u.size.aspect_ratio); - if (err) - err = -EFAULT; - } - - return err; -} - -struct compat_video_still_picture { - compat_uptr_t iFrame; - int32_t size; -}; - -static int do_video_stillpicture(struct file *file, - unsigned int cmd, struct compat_video_still_picture __user *up) -{ - struct video_still_picture __user *up_native; - compat_uptr_t fp; - int32_t size; - int err; - - err = get_user(fp, &up->iFrame); - err |= get_user(size, &up->size); - if (err) - return -EFAULT; - - up_native = - compat_alloc_user_space(sizeof(struct video_still_picture)); - - err = put_user(compat_ptr(fp), &up_native->iFrame); - err |= put_user(size, &up_native->size); - if (err) - return -EFAULT; - - err = do_ioctl(file, cmd, (unsigned long) up_native); - - return err; -} - #ifdef CONFIG_BLOCK typedef struct sg_io_hdr32 { compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ @@ -958,61 +888,6 @@ COMPATIBLE_IOCTL(HIDIOCGFLAG) COMPATIBLE_IOCTL(HIDIOCSFLAG) COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) -/* dvb */ -COMPATIBLE_IOCTL(AUDIO_STOP) -COMPATIBLE_IOCTL(AUDIO_PLAY) -COMPATIBLE_IOCTL(AUDIO_PAUSE) -COMPATIBLE_IOCTL(AUDIO_CONTINUE) -COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE) -COMPATIBLE_IOCTL(AUDIO_SET_MUTE) -COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC) -COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE) -COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT) -COMPATIBLE_IOCTL(AUDIO_GET_STATUS) -COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES) -COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER) -COMPATIBLE_IOCTL(AUDIO_SET_ID) -COMPATIBLE_IOCTL(AUDIO_SET_MIXER) -COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE) -COMPATIBLE_IOCTL(DMX_START) -COMPATIBLE_IOCTL(DMX_STOP) -COMPATIBLE_IOCTL(DMX_SET_FILTER) -COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) -COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) -COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) -COMPATIBLE_IOCTL(DMX_GET_STC) -COMPATIBLE_IOCTL(DMX_REQBUFS) -COMPATIBLE_IOCTL(DMX_QUERYBUF) -COMPATIBLE_IOCTL(DMX_EXPBUF) -COMPATIBLE_IOCTL(DMX_QBUF) -COMPATIBLE_IOCTL(DMX_DQBUF) -COMPATIBLE_IOCTL(VIDEO_STOP) -COMPATIBLE_IOCTL(VIDEO_PLAY) -COMPATIBLE_IOCTL(VIDEO_FREEZE) -COMPATIBLE_IOCTL(VIDEO_CONTINUE) -COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE) -COMPATIBLE_IOCTL(VIDEO_SET_BLANK) -COMPATIBLE_IOCTL(VIDEO_GET_STATUS) -COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT) -COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD) -COMPATIBLE_IOCTL(VIDEO_SLOWMOTION) -COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES) -COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER) -COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE) -COMPATIBLE_IOCTL(VIDEO_SET_FORMAT) -COMPATIBLE_IOCTL(VIDEO_GET_SIZE) -/* cec */ -COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS) -COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS) -COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS) -COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR) -COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR) -COMPATIBLE_IOCTL(CEC_G_MODE) -COMPATIBLE_IOCTL(CEC_S_MODE) -COMPATIBLE_IOCTL(CEC_TRANSMIT) -COMPATIBLE_IOCTL(CEC_RECEIVE) -COMPATIBLE_IOCTL(CEC_DQEVENT) - /* joystick */ COMPATIBLE_IOCTL(JSIOCGVERSION) COMPATIBLE_IOCTL(JSIOCGAXES) @@ -1080,12 +955,6 @@ static long do_ioctl_trans(unsigned int cmd, case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: return rtc_ioctl(file, cmd, argp); - - /* dvb */ - case VIDEO_GET_EVENT: - return do_video_get_event(file, cmd, argp); - case VIDEO_STILLPICTURE: - 
return do_video_stillpicture(file, cmd, argp); } /* diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 0c35e62f108d..9352487bd0fc 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -202,7 +202,8 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, continue; blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT; blk_offset += offset; - if (blk_offset + len > BUFFER_SIZE) + if (blk_offset > BUFFER_SIZE || + blk_offset + len > BUFFER_SIZE) continue; return read_buffers[i] + blk_offset; } @@ -872,8 +873,8 @@ static int cramfs_readpage(struct file *file, struct page *page) if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) { /* See comments on earlier code. */ u32 prev_start = block_start; - block_start = prev_start & ~CRAMFS_BLK_FLAGS; - block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; + block_start = prev_start & ~CRAMFS_BLK_FLAGS; + block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) { block_start += PAGE_SIZE; } else { diff --git a/fs/dax.c b/fs/dax.c --- a/fs/dax.c +++ b/fs/dax.c @@ -38,6 +38,17 @@ #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> +static inline unsigned int pe_order(enum page_entry_size pe_size) +{ + if (pe_size == PE_SIZE_PTE) + return PAGE_SHIFT - PAGE_SHIFT; + if (pe_size == PE_SIZE_PMD) + return PMD_SHIFT - PAGE_SHIFT; + if (pe_size == PE_SIZE_PUD) + return PUD_SHIFT - PAGE_SHIFT; + return ~0; +} + /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -46,6 +57,9 @@ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) +/* The order of a PMD entry */ +#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) + static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -59,63 +73,74 @@ static int __init init_dax_wait_table(void) fs_initcall(init_dax_wait_table); /* - * We use lowest available bit in exceptional entry for locking, one bit for - * the entry size (PMD) and two more to tell us if the entry is a zero page or - * an empty entry that is just used for locking. In total four special bits. + * DAX pagecache entries use XArray value entries so they can't be mistaken + * for pages. We use one bit for locking, one bit for the entry size (PMD) + * and two more to tell us if the entry is a zero page or an empty entry that + * is just used for locking. In total four special bits. * * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem * block allocation. 
*/ -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) +#define DAX_SHIFT (4) +#define DAX_LOCKED (1UL << 0) +#define DAX_PMD (1UL << 1) +#define DAX_ZERO_PAGE (1UL << 2) +#define DAX_EMPTY (1UL << 3) -static unsigned long dax_radix_pfn(void *entry) +static unsigned long dax_to_pfn(void *entry) { - return (unsigned long)entry >> RADIX_DAX_SHIFT; + return xa_to_value(entry) >> DAX_SHIFT; } -static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) +static void *dax_make_entry(pfn_t pfn, unsigned long flags) { - return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | - (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); + return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); } -static unsigned int dax_radix_order(void *entry) +static void *dax_make_page_entry(struct page *page) { - if ((unsigned long)entry & RADIX_DAX_PMD) - return PMD_SHIFT - PAGE_SHIFT; + pfn_t pfn = page_to_pfn_t(page); + return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0); +} + +static bool dax_is_locked(void *entry) +{ + return xa_to_value(entry) & DAX_LOCKED; +} + +static unsigned int dax_entry_order(void *entry) +{ + if (xa_to_value(entry) & DAX_PMD) + return PMD_ORDER; return 0; } static int dax_is_pmd_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_PMD; + return xa_to_value(entry) & DAX_PMD; } static int dax_is_pte_entry(void *entry) { - return !((unsigned long)entry & RADIX_DAX_PMD); + return !(xa_to_value(entry) & DAX_PMD); } static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; + return xa_to_value(entry) & DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_EMPTY; + return xa_to_value(entry) & DAX_EMPTY; } /* - * DAX radix tree locking + * DAX page cache entry locking */ struct exceptional_entry_key { - struct address_space *mapping; + struct xarray *xa; pgoff_t entry_start; }; @@ -124,10 +149,11 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, - pgoff_t index, void *entry, struct exceptional_entry_key *key) +static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, + void *entry, struct exceptional_entry_key *key) { unsigned long hash; + unsigned long index = xas->xa_index; /* * If 'entry' is a PMD, align the 'index' that we use for the wait @@ -136,22 +162,21 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, */ if (dax_is_pmd_entry(entry)) index &= ~PG_PMD_COLOUR; - - key->mapping = mapping; + key->xa = xas->xa; key->entry_start = index; - hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); + hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, - int sync, void *keyp) +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; struct wait_exceptional_entry_queue *ewait = container_of(wait, struct wait_exceptional_entry_queue, wait); - if (key->mapping != ewait->key.mapping || + if (key->xa != ewait->key.xa || 
key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); @@ -162,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ -static void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, void *entry, bool wake_all) +static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) { struct exceptional_entry_key key; wait_queue_head_t *wq; - wq = dax_entry_waitqueue(mapping, index, entry, &key); + wq = dax_entry_waitqueue(xas, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens @@ -181,55 +205,16 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, } /* - * Check whether the given slot is locked. Must be called with the i_pages - * lock held. - */ -static inline int slot_locked(struct address_space *mapping, void **slot) -{ - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - return entry & RADIX_DAX_ENTRY_LOCK; -} - -/* - * Mark the given slot as locked. Must be called with the i_pages lock held. - */ -static inline void *lock_slot(struct address_space *mapping, void **slot) -{ - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - - entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); - return (void *)entry; -} - -/* - * Mark the given slot as unlocked. Must be called with the i_pages lock held. - */ -static inline void *unlock_slot(struct address_space *mapping, void **slot) -{ - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - - entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); - return (void *)entry; -} - -/* - * Lookup entry in radix tree, wait for it to become unlocked if it is - * exceptional entry and return it. The caller must call - * put_unlocked_mapping_entry() when he decided not to lock the entry or - * put_locked_mapping_entry() when he locked the entry and now wants to - * unlock it. + * Look up entry in page cache, wait for it to become unlocked if it + * is a DAX entry and return it. The caller must subsequently call + * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() + * if it did. * * Must be called with the i_pages lock held. 
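To make the calling convention just described concrete, here is a short illustrative sketch of a hypothetical caller, not part of the patch: with_locked_entry() is invented for illustration, while get_unlocked_entry(), put_unlocked_entry(), dax_lock_entry() and dax_unlock_entry() are the patch-local helpers introduced in this file.

static void with_locked_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *entry;

	xas_lock_irq(&xas);
	/* May drop the xa_lock and sleep until the entry is unlocked. */
	entry = get_unlocked_entry(&xas);
	if (!entry || !xa_is_value(entry)) {
		/* Chose not to lock it: wake the next waiter, if any. */
		put_unlocked_entry(&xas, entry);
		xas_unlock_irq(&xas);
		return;
	}
	/* Stores the entry back with DAX_LOCKED set, lock still held. */
	dax_lock_entry(&xas, entry);
	xas_unlock_irq(&xas);

	/* ... work on the entry without the xa_lock held ... */

	/* Resets the stale xa_state, stores the unlocked value, wakes. */
	dax_unlock_entry(&xas, entry);
}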
*/ -static void *__get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp, bool (*wait_fn)(void)) +static void *get_unlocked_entry(struct xa_state *xas) { - void *entry, **slot; + void *entry; struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; @@ -237,80 +222,54 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - bool revalidate; - - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, - &slot); - if (!entry || - WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || - !slot_locked(mapping, slot)) { - if (slotp) - *slotp = slot; + entry = xas_load(xas); + if (!entry || xa_is_internal(entry) || + WARN_ON_ONCE(!xa_is_value(entry)) || + !dax_is_locked(entry)) return entry; - } - wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); + wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - xa_unlock_irq(&mapping->i_pages); - revalidate = wait_fn(); + xas_unlock_irq(xas); + xas_reset(xas); + schedule(); finish_wait(wq, &ewait.wait); - xa_lock_irq(&mapping->i_pages); - if (revalidate) - return ERR_PTR(-EAGAIN); + xas_lock_irq(xas); } } -static bool entry_wait(void) -{ - schedule(); - /* - * Never return an ERR_PTR() from - * __get_unlocked_mapping_entry(), just keep looping. - */ - return false; -} - -static void *get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp) +static void put_unlocked_entry(struct xa_state *xas, void *entry) { - return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait); -} - -static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - void *entry, **slot; - - xa_lock_irq(&mapping->i_pages); - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || - !slot_locked(mapping, slot))) { - xa_unlock_irq(&mapping->i_pages); - return; - } - unlock_slot(mapping, slot); - xa_unlock_irq(&mapping->i_pages); - dax_wake_mapping_entry_waiter(mapping, index, entry, false); + /* If we were the only waiter woken, wake the next one */ + if (entry) + dax_wake_entry(xas, entry, false); } -static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index) +/* + * We used the xa_state to get the entry, but then we locked the entry and + * dropped the xa_lock, so we know the xa_state is stale and must be reset + * before use. + */ +static void dax_unlock_entry(struct xa_state *xas, void *entry) { - unlock_mapping_entry(mapping, index); + void *old; + + xas_reset(xas); + xas_lock_irq(xas); + old = xas_store(xas, entry); + xas_unlock_irq(xas); + BUG_ON(!dax_is_locked(old)); + dax_wake_entry(xas, entry, false); } /* - * Called when we are done with radix tree entry we looked up via - * get_unlocked_mapping_entry() and which we didn't lock in the end. + * Return: The entry stored at this location before it was locked. 
*/ -static void put_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) +static void *dax_lock_entry(struct xa_state *xas, void *entry) { - if (!entry) - return; - - /* We have to wake up next waiter for the radix tree entry lock */ - dax_wake_mapping_entry_waiter(mapping, index, entry, false); + unsigned long v = xa_to_value(entry); + return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); } static unsigned long dax_entry_size(void *entry) @@ -325,9 +284,9 @@ static unsigned long dax_entry_size(void *entry) return PAGE_SIZE; } -static unsigned long dax_radix_end_pfn(void *entry) +static unsigned long dax_end_pfn(void *entry) { - return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; + return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; } /* @@ -335,8 +294,8 @@ static unsigned long dax_radix_end_pfn(void *entry) * 'empty' and 'zero' entries. */ #define for_each_mapped_pfn(entry, pfn) \ - for (pfn = dax_radix_pfn(entry); \ - pfn < dax_radix_end_pfn(entry); pfn++) + for (pfn = dax_to_pfn(entry); \ + pfn < dax_end_pfn(entry); pfn++) /* * TODO: for reflink+dax we need a way to associate a single page with @@ -393,33 +352,16 @@ static struct page *dax_busy_page(void *entry) return NULL; } -static bool entry_wait_revalidate(void) -{ - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - - /* - * Tell __get_unlocked_mapping_entry() to take a break, we need - * to revalidate page->mapping after dropping locks - */ - return true; -} - bool dax_lock_mapping_entry(struct page *page) { - pgoff_t index; - struct inode *inode; - bool did_lock = false; - void *entry = NULL, **slot; - struct address_space *mapping; + XA_STATE(xas, NULL, 0); + void *entry; - rcu_read_lock(); for (;;) { - mapping = READ_ONCE(page->mapping); + struct address_space *mapping = READ_ONCE(page->mapping); if (!dax_mapping(mapping)) - break; + return false; /* * In the device-dax case there's no need to lock, a @@ -428,98 +370,94 @@ bool dax_lock_mapping_entry(struct page *page) * otherwise we would not have a valid pfn_to_page() * translation. */ - inode = mapping->host; - if (S_ISCHR(inode->i_mode)) { - did_lock = true; - break; - } + if (S_ISCHR(mapping->host->i_mode)) + return true; - xa_lock_irq(&mapping->i_pages); + xas.xa = &mapping->i_pages; + xas_lock_irq(&xas); if (mapping != page->mapping) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); continue; } - index = page->index; - - entry = __get_unlocked_mapping_entry(mapping, index, &slot, - entry_wait_revalidate); - if (!entry) { - xa_unlock_irq(&mapping->i_pages); - break; - } else if (IS_ERR(entry)) { - xa_unlock_irq(&mapping->i_pages); - WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN); - continue; + xas_set(&xas, page->index); + entry = xas_load(&xas); + if (dax_is_locked(entry)) { + entry = get_unlocked_entry(&xas); + /* Did the page move while we slept? 
*/ + if (dax_to_pfn(entry) != page_to_pfn(page)) { + xas_unlock_irq(&xas); + continue; + } } - lock_slot(mapping, slot); - did_lock = true; - xa_unlock_irq(&mapping->i_pages); - break; + dax_lock_entry(&xas, entry); + xas_unlock_irq(&xas); + return true; } - rcu_read_unlock(); - - return did_lock; } void dax_unlock_mapping_entry(struct page *page) { struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; + XA_STATE(xas, &mapping->i_pages, page->index); - if (S_ISCHR(inode->i_mode)) + if (S_ISCHR(mapping->host->i_mode)) return; - unlock_mapping_entry(mapping, page->index); + dax_unlock_entry(&xas, dax_make_page_entry(page)); } /* - * Find radix tree entry at given index. If it points to an exceptional entry, - * return it with the radix tree entry locked. If the radix tree doesn't - * contain given index, create an empty exceptional entry for the index and - * return with it locked. + * Find page cache entry at given index. If it is a DAX entry, return it + * with the entry locked. If the page cache doesn't contain an entry at + * that index, add a locked empty entry. * - * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will - * either return that locked entry or will return an error. This error will - * happen if there are any 4k entries within the 2MiB range that we are - * requesting. + * When requesting an entry with size DAX_PMD, grab_mapping_entry() will + * either return that locked entry or will return VM_FAULT_FALLBACK. + * This will happen if there are any PTE entries within the PMD range + * that we are requesting. * - * We always favor 4k entries over 2MiB entries. There isn't a flow where we - * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB - * insertion will fail if it finds any 4k entries already in the tree, and a - * 4k insertion will cause an existing 2MiB entry to be unmapped and - * downgraded to 4k entries. This happens for both 2MiB huge zero pages as - * well as 2MiB empty entries. + * We always favor PTE entries over PMD entries. There isn't a flow where we + * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD + * insertion will fail if it finds any PTE entries already in the tree, and a + * PTE insertion will cause an existing PMD entry to be unmapped and + * downgraded to PTE entries. This happens for both PMD zero pages as + * well as PMD empty entries. * - * The exception to this downgrade path is for 2MiB DAX PMD entries that have - * real storage backing them. We will leave these real 2MiB DAX entries in - * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. + * The exception to this downgrade path is for PMD entries that have + * real storage backing them. We will leave these real PMD entries in + * the tree, and PTE writes will simply dirty the entire PMD entry. * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. + * + * On error, this function does not return an ERR_PTR. Instead it returns + * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values + * overlap with xarray value entries. */ -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, - unsigned long size_flag) +static void *grab_mapping_entry(struct xa_state *xas, + struct address_space *mapping, unsigned long size_flag) { - bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? 
*/ - void *entry, **slot; - -restart: - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, &slot); + unsigned long index = xas->xa_index; + bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ + void *entry; - if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { - entry = ERR_PTR(-EIO); - goto out_unlock; - } +retry: + xas_lock_irq(xas); + entry = get_unlocked_entry(xas); + if (xa_is_internal(entry)) + goto fallback; if (entry) { - if (size_flag & RADIX_DAX_PMD) { + if (WARN_ON_ONCE(!xa_is_value(entry))) { + xas_set_err(xas, EIO); + goto out_unlock; + } + + if (size_flag & DAX_PMD) { if (dax_is_pte_entry(entry)) { - put_unlocked_mapping_entry(mapping, index, - entry); - entry = ERR_PTR(-EEXIST); - goto out_unlock; + put_unlocked_entry(xas, entry); + goto fallback; } } else { /* trying to grab a PTE entry */ if (dax_is_pmd_entry(entry) && @@ -530,87 +468,57 @@ restart: } } - /* No entry for given index? Make sure radix tree is big enough. */ - if (!entry || pmd_downgrade) { - int err; - - if (pmd_downgrade) { - /* - * Make sure 'entry' remains valid while we drop - * the i_pages lock. - */ - entry = lock_slot(mapping, slot); - } + if (pmd_downgrade) { + /* + * Make sure 'entry' remains valid while we drop + * the i_pages lock. + */ + dax_lock_entry(xas, entry); - xa_unlock_irq(&mapping->i_pages); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be * unmapped. */ - if (pmd_downgrade && dax_is_zero_entry(entry)) - unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, - PG_PMD_NR, false); - - err = radix_tree_preload( - mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); - if (err) { - if (pmd_downgrade) - put_locked_mapping_entry(mapping, index); - return ERR_PTR(err); - } - xa_lock_irq(&mapping->i_pages); - - if (!entry) { - /* - * We needed to drop the i_pages lock while calling - * radix_tree_preload() and we didn't have an entry to - * lock. See if another thread inserted an entry at - * our index during this time. - */ - entry = __radix_tree_lookup(&mapping->i_pages, index, - NULL, &slot); - if (entry) { - radix_tree_preload_end(); - xa_unlock_irq(&mapping->i_pages); - goto restart; - } + if (dax_is_zero_entry(entry)) { + xas_unlock_irq(xas); + unmap_mapping_pages(mapping, + xas->xa_index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); + xas_reset(xas); + xas_lock_irq(xas); } - if (pmd_downgrade) { - dax_disassociate_entry(entry, mapping, false); - radix_tree_delete(&mapping->i_pages, index); - mapping->nrexceptional--; - dax_wake_mapping_entry_waiter(mapping, index, entry, - true); - } + dax_disassociate_entry(entry, mapping, false); + xas_store(xas, NULL); /* undo the PMD join */ + dax_wake_entry(xas, entry, true); + mapping->nrexceptional--; + entry = NULL; + xas_set(xas, index); + } - entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); - - err = __radix_tree_insert(&mapping->i_pages, index, - dax_radix_order(entry), entry); - radix_tree_preload_end(); - if (err) { - xa_unlock_irq(&mapping->i_pages); - /* - * Our insertion of a DAX entry failed, most likely - * because we were inserting a PMD entry and it - * collided with a PTE sized entry at a different - * index in the PMD range. We haven't inserted - * anything into the radix tree and have no waiters to - * wake. - */ - return ERR_PTR(err); - } - /* Good, we have inserted empty locked entry into the tree. 
*/ + if (entry) { + dax_lock_entry(xas, entry); + } else { + entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY); + dax_lock_entry(xas, entry); + if (xas_error(xas)) + goto out_unlock; mapping->nrexceptional++; - xa_unlock_irq(&mapping->i_pages); - return entry; } - entry = lock_slot(mapping, slot); - out_unlock: - xa_unlock_irq(&mapping->i_pages); + +out_unlock: + xas_unlock_irq(xas); + if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) + goto retry; + if (xas->xa_node == XA_ERROR(-ENOMEM)) + return xa_mk_internal(VM_FAULT_OOM); + if (xas_error(xas)) + return xa_mk_internal(VM_FAULT_SIGBUS); return entry; +fallback: + xas_unlock_irq(xas); + return xa_mk_internal(VM_FAULT_FALLBACK); } /** @@ -630,11 +538,10 @@ restart: */ struct page *dax_layout_busy_page(struct address_space *mapping) { - pgoff_t indices[PAGEVEC_SIZE]; + XA_STATE(xas, &mapping->i_pages, 0); + void *entry; + unsigned int scanned = 0; struct page *page = NULL; - struct pagevec pvec; - pgoff_t index, end; - unsigned i; /* * In the 'limited' case get_user_pages() for dax is disabled. @@ -645,13 +552,9 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (!dax_mapping(mapping) || !mapping_mapped(mapping)) return NULL; - pagevec_init(&pvec); - index = 0; - end = -1; - /* * If we race get_user_pages_fast() here either we'll see the - * elevated page count in the pagevec_lookup and wait, or + * elevated page count in the iteration and wait, or * get_user_pages_fast() will see that the page it took a reference * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by @@ -663,94 +566,68 @@ struct page *dax_layout_busy_page(struct address_space *mapping) */ unmap_mapping_range(mapping, 0, 0, 1); - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - indices)) { - pgoff_t nr_pages = 1; - - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *pvec_ent = pvec.pages[i]; - void *entry; - - index = indices[i]; - if (index >= end) - break; - - if (WARN_ON_ONCE( - !radix_tree_exceptional_entry(pvec_ent))) - continue; - - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (entry) { - page = dax_busy_page(entry); - /* - * Account for multi-order entries at - * the end of the pagevec. - */ - if (i + 1 >= pagevec_count(&pvec)) - nr_pages = 1UL << dax_radix_order(entry); - } - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(&mapping->i_pages); - if (page) - break; - } - - /* - * We don't expect normal struct page entries to exist in our - * tree, but we keep these pagevec calls so that this code is - * consistent with the common pattern for handling pagevecs - * throughout the kernel. 
- */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - index += nr_pages; - + xas_lock_irq(&xas); + xas_for_each(&xas, entry, ULONG_MAX) { + if (WARN_ON_ONCE(!xa_is_value(entry))) + continue; + if (unlikely(dax_is_locked(entry))) + entry = get_unlocked_entry(&xas); + if (entry) + page = dax_busy_page(entry); + put_unlocked_entry(&xas, entry); if (page) break; + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } + xas_unlock_irq(&xas); return page; } EXPORT_SYMBOL_GPL(dax_layout_busy_page); -static int __dax_invalidate_mapping_entry(struct address_space *mapping, +static int __dax_invalidate_entry(struct address_space *mapping, pgoff_t index, bool trunc) { + XA_STATE(xas, &mapping->i_pages, index); int ret = 0; void *entry; - struct radix_tree_root *pages = &mapping->i_pages; - xa_lock_irq(pages); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) + xas_lock_irq(&xas); + entry = get_unlocked_entry(&xas); + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && - (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) + (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || + xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); - radix_tree_delete(pages, index); + xas_store(&xas, NULL); mapping->nrexceptional--; ret = 1; out: - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(pages); + put_unlocked_entry(&xas, entry); + xas_unlock_irq(&xas); return ret; } + /* - * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree - * entry to get unlocked before deleting it. + * Delete DAX entry at @index from @mapping. Wait for it + * to be unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { - int ret = __dax_invalidate_mapping_entry(mapping, index, true); + int ret = __dax_invalidate_entry(mapping, index, true); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the - * radix tree (usually fs-private i_mmap_sem for writing). Since the - * caller has seen exceptional entry for this index, we better find it + * page cache (usually fs-private i_mmap_sem for writing). Since the + * caller has seen a DAX entry for this index, we better find it * at that index as well... */ WARN_ON_ONCE(!ret); @@ -758,12 +635,12 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) } /* - * Invalidate exceptional DAX entry if it is clean. + * Invalidate DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index) { - return __dax_invalidate_mapping_entry(mapping, index, false); + return __dax_invalidate_entry(mapping, index, false); } static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, @@ -799,30 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. 
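dax_insert_entry() below builds these value entries via dax_make_entry(), so a self-contained model of the encoding may be useful. This is an editor's illustration, not kernel code: xa_mk_value()/xa_to_value() are mimicked by their documented behaviour of shifting the payload left by one and tagging bit 0, and the pfn is an arbitrary sample value.

#include <assert.h>

#define DAX_SHIFT	4		/* low bits: LOCKED, PMD, ZERO_PAGE, EMPTY */
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)

/* Userspace stand-ins for xa_mk_value()/xa_to_value(): bit 0 stays set
 * so a value entry can never be mistaken for a pointer. */
static unsigned long mk_value(unsigned long v) { return (v << 1) | 1; }
static unsigned long to_value(unsigned long e) { return e >> 1; }

int main(void)
{
	unsigned long pfn = 0x1200;	/* arbitrary sample pfn */
	unsigned long e = mk_value((pfn << DAX_SHIFT) | DAX_PMD);

	assert(to_value(e) >> DAX_SHIFT == pfn);	/* dax_to_pfn() */
	assert(to_value(e) & DAX_PMD);			/* dax_is_pmd_entry() */
	assert(!(to_value(e) & DAX_LOCKED));		/* dax_is_locked() */

	e = mk_value(to_value(e) | DAX_LOCKED);		/* dax_lock_entry() */
	assert(to_value(e) & DAX_LOCKED);
	return 0;
}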
*/ -static void *dax_insert_mapping_entry(struct address_space *mapping, - struct vm_fault *vmf, - void *entry, pfn_t pfn_t, - unsigned long flags, bool dirty) +static void *dax_insert_entry(struct xa_state *xas, + struct address_space *mapping, struct vm_fault *vmf, + void *entry, pfn_t pfn, unsigned long flags, bool dirty) { - struct radix_tree_root *pages = &mapping->i_pages; - unsigned long pfn = pfn_t_to_pfn(pfn_t); - pgoff_t index = vmf->pgoff; - void *new_entry; + void *new_entry = dax_make_entry(pfn, flags); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { + unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, - PG_PMD_NR, false); + PG_PMD_NR, false); else /* pte entry */ - unmap_mapping_pages(mapping, vmf->pgoff, 1, false); + unmap_mapping_pages(mapping, index, 1, false); } - xa_lock_irq(pages); - new_entry = dax_radix_locked_entry(pfn, flags); + xas_reset(xas); + xas_lock_irq(xas); if (dax_entry_size(entry) != dax_entry_size(new_entry)) { dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); @@ -830,33 +704,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* - * Only swap our new entry into the radix tree if the current + * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or - * PMD entry is already in the tree, we leave it alone. This + * PMD entry is already in the cache, we leave it alone. This * means that if we are trying to insert a PTE and the * existing entry is a PMD, we will just leave the PMD in the * tree and dirty it if necessary. 
*/ - struct radix_tree_node *node; - void **slot; - void *ret; - - ret = __radix_tree_lookup(pages, index, &node, &slot); - WARN_ON_ONCE(ret != entry); - __radix_tree_replace(pages, node, slot, - new_entry, NULL); + void *old = dax_lock_entry(xas, new_entry); + WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | + DAX_LOCKED)); entry = new_entry; + } else { + xas_load(xas); /* Walk the xa_state */ } if (dirty) - radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); + xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - xa_unlock_irq(pages); + xas_unlock_irq(xas); return entry; } -static inline unsigned long -pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +static inline +unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) { unsigned long address; @@ -866,8 +737,8 @@ pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) } /* Walk all mappings of a given index of a file and writeprotect them */ -static void dax_mapping_entry_mkclean(struct address_space *mapping, - pgoff_t index, unsigned long pfn) +static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, + unsigned long pfn) { struct vm_area_struct *vma; pte_t pte, *ptep = NULL; @@ -937,11 +808,9 @@ unlock_pte: i_mmap_unlock_read(mapping); } -static int dax_writeback_one(struct dax_device *dax_dev, - struct address_space *mapping, pgoff_t index, void *entry) +static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, + struct address_space *mapping, void *entry) { - struct radix_tree_root *pages = &mapping->i_pages; - void *entry2, **slot; unsigned long pfn; long ret = 0; size_t size; @@ -950,32 +819,38 @@ static int dax_writeback_one(struct dax_device *dax_dev, * A page got tagged dirty in DAX mapping? Something is seriously * wrong. */ - if (WARN_ON(!radix_tree_exceptional_entry(entry))) + if (WARN_ON(!xa_is_value(entry))) return -EIO; - xa_lock_irq(pages); - entry2 = get_unlocked_mapping_entry(mapping, index, &slot); - /* Entry got punched out / reallocated? */ - if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) - goto put_unlocked; - /* - * Entry got reallocated elsewhere? No need to writeback. We have to - * compare pfns as we must not bail out due to difference in lockbit - * or entry type. - */ - if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) - goto put_unlocked; - if (WARN_ON_ONCE(dax_is_empty_entry(entry) || - dax_is_zero_entry(entry))) { - ret = -EIO; - goto put_unlocked; + if (unlikely(dax_is_locked(entry))) { + void *old_entry = entry; + + entry = get_unlocked_entry(xas); + + /* Entry got punched out / reallocated? */ + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. + * We have to compare pfns as we must not bail out due to + * difference in lockbit or entry type. 
+ */ + if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) + goto put_unlocked; + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || + dax_is_zero_entry(entry))) { + ret = -EIO; + goto put_unlocked; + } + + /* Another fsync thread may have already done this entry */ + if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; } - /* Another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) - goto put_unlocked; /* Lock the entry to serialize with page faults */ - entry = lock_slot(mapping, slot); + dax_lock_entry(xas, entry); + /* * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we @@ -983,8 +858,8 @@ static int dax_writeback_one(struct dax_device *dax_dev, * at the entry only under the i_pages lock and once they do that * they will see the entry locked and wait for it to unlock. */ - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); - xa_unlock_irq(pages); + xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irq(xas); /* * Even if dax_writeback_mapping_range() was given a wbc->range_start @@ -993,10 +868,10 @@ static int dax_writeback_one(struct dax_device *dax_dev, * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. */ - pfn = dax_radix_pfn(entry); - size = PAGE_SIZE << dax_radix_order(entry); + pfn = dax_to_pfn(entry); + size = PAGE_SIZE << dax_entry_order(entry); - dax_mapping_entry_mkclean(mapping, index, pfn); + dax_entry_mkclean(mapping, xas->xa_index, pfn); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); /* * After we have flushed the cache, we can clear the dirty tag. There @@ -1004,16 +879,18 @@ static int dax_writeback_one(struct dax_device *dax_dev, * the pfn mappings are writeprotected and fault waits for mapping * entry lock. 
*/ - xa_lock_irq(pages); - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); - xa_unlock_irq(pages); - trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); - put_locked_mapping_entry(mapping, index); + xas_reset(xas); + xas_lock_irq(xas); + xas_store(xas, entry); + xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); + dax_wake_entry(xas, entry, false); + + trace_dax_writeback_one(mapping->host, xas->xa_index, + size >> PAGE_SHIFT); return ret; put_unlocked: - put_unlocked_mapping_entry(mapping, index, entry2); - xa_unlock_irq(pages); + put_unlocked_entry(xas, entry); return ret; } @@ -1025,13 +902,13 @@ static int dax_writeback_one(struct dax_device *dax_dev, int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { + XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host; - pgoff_t start_index, end_index; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; struct dax_device *dax_dev; - struct pagevec pvec; - bool done = false; - int i, ret = 0; + void *entry; + int ret = 0; + unsigned int scanned = 0; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; @@ -1043,41 +920,29 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!dax_dev) return -EIO; - start_index = wbc->range_start >> PAGE_SHIFT; - end_index = wbc->range_end >> PAGE_SHIFT; - - trace_dax_writeback_range(inode, start_index, end_index); + trace_dax_writeback_range(inode, xas.xa_index, end_index); - tag_pages_for_writeback(mapping, start_index, end_index); + tag_pages_for_writeback(mapping, xas.xa_index, end_index); - pagevec_init(&pvec); - while (!done) { - pvec.nr = find_get_entries_tag(mapping, start_index, - PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, - pvec.pages, indices); - - if (pvec.nr == 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { + ret = dax_writeback_one(&xas, dax_dev, mapping, entry); + if (ret < 0) { + mapping_set_error(mapping, ret); break; - - for (i = 0; i < pvec.nr; i++) { - if (indices[i] > end_index) { - done = true; - break; - } - - ret = dax_writeback_one(dax_dev, mapping, indices[i], - pvec.pages[i]); - if (ret < 0) { - mapping_set_error(mapping, ret); - goto out; - } } - start_index = indices[pvec.nr - 1] + 1; + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } -out: + xas_unlock_irq(&xas); put_dax(dax_dev); - trace_dax_writeback_range_done(inode, start_index, end_index); - return (ret < 0 ? ret : 0); + trace_dax_writeback_range_done(inode, xas.xa_index, end_index); + return ret; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); @@ -1125,16 +990,18 @@ out: * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. 
*/ -static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, - struct vm_fault *vmf) +static vm_fault_t dax_load_hole(struct xa_state *xas, + struct address_space *mapping, void **entry, + struct vm_fault *vmf) { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; - dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, - false); + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, + DAX_ZERO_PAGE, false); + ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1342,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; + XA_STATE(xas, &mapping->i_pages, vmf->pgoff); struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; @@ -1368,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (write && !vmf->cow_page) flags |= IOMAP_WRITE; - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - ret = dax_fault_return(PTR_ERR(entry)); + entry = grab_mapping_entry(&xas, mapping, 0); + if (xa_is_internal(entry)) { + ret = xa_to_internal(entry); goto out; } @@ -1443,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto error_finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, 0, write && !sync); /* @@ -1471,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!write) { - ret = dax_load_hole(mapping, entry, vmf); + ret = dax_load_hole(&xas, mapping, &entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1498,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff); + dax_unlock_entry(&xas, entry); out: trace_dax_pte_fault_done(inode, vmf, ret); return ret | major; } #ifdef CONFIG_FS_DAX_PMD -static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void *entry) +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + struct iomap *iomap, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; struct inode *inode = mapping->host; struct page *zero_page; - void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; pfn_t pfn; @@ -1523,8 +1390,8 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; pfn = page_to_pfn_t(zero_page); - ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, + DAX_PMD | DAX_ZERO_PAGE, false); ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1536,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); return VM_FAULT_NOPAGE; fallback: - 
trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); return VM_FAULT_FALLBACK; } @@ -1549,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; @@ -1556,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct inode *inode = mapping->host; vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; - pgoff_t max_pgoff, pgoff; + pgoff_t max_pgoff; void *entry; loff_t pos; int error; @@ -1567,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ - pgoff = linear_page_index(vma, pmd_addr); max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); @@ -1576,7 +1443,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * Make sure that the faulting address's PMD offset (color) matches * the PMD offset from the start of the file. This is necessary so * that a PMD range in the page table overlaps exactly with a PMD - * range in the radix tree. + * range in the page cache. */ if ((vmf->pgoff & PG_PMD_COLOUR) != ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) @@ -1592,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff >= max_pgoff) { + if (xas.xa_index >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) + if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) goto fallback; /* - * grab_mapping_entry() will make sure we get a 2MiB empty entry, a - * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page - * is already in the tree, for instance), it will return -EEXIST and - * we just fall back to 4k entries. + * grab_mapping_entry() will make sure we get an empty PMD entry, + * a zero PMD entry or a DAX PMD. If it can't (because a PTE + * entry is already in the array, for instance), it will return + * VM_FAULT_FALLBACK. */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); - if (IS_ERR(entry)) + entry = grab_mapping_entry(&xas, mapping, DAX_PMD); + if (xa_is_internal(entry)) { + result = xa_to_internal(entry); goto fallback; + } /* * It is possible, particularly with mixed reads & writes to private @@ -1628,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. 
*/ - pos = (loff_t)pgoff << PAGE_SHIFT; + pos = (loff_t)xas.xa_index << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) goto unlock_entry; @@ -1644,8 +1513,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_PMD, write && !sync); + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, + DAX_PMD, write && !sync); /* * If we are doing synchronous page fault and inode needs fsync, @@ -1669,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, entry); + result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1692,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff); + dax_unlock_entry(&xas, entry); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); @@ -1737,54 +1606,49 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, } EXPORT_SYMBOL_GPL(dax_iomap_fault); -/** +/* * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables * @vmf: The description of the fault - * @pe_size: Size of entry to be inserted * @pfn: PFN to insert + * @order: Order of entry to insert. * - * This function inserts writeable PTE or PMD entry into page tables for mmaped - * DAX file. It takes care of marking corresponding radix tree entry as dirty - * as well. + * This function inserts a writeable PTE or PMD entry into the page tables + * for an mmaped DAX file. It also marks the page cache entry as dirty. */ -static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, - enum page_entry_size pe_size, - pfn_t pfn) +static vm_fault_t +dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; - void *entry, **slot; - pgoff_t index = vmf->pgoff; + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); + void *entry; vm_fault_t ret; - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, &slot); + xas_lock_irq(&xas); + entry = get_unlocked_entry(&xas); /* Did we race with someone splitting entry or so? 
*/ if (!entry || - (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || - (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(&mapping->i_pages); + (order == 0 && !dax_is_pte_entry(entry)) || + (order == PMD_ORDER && (xa_is_internal(entry) || + !dax_is_pmd_entry(entry)))) { + put_unlocked_entry(&xas, entry); + xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } - radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - xa_unlock_irq(&mapping->i_pages); - switch (pe_size) { - case PE_SIZE_PTE: + xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); + dax_lock_entry(&xas, entry); + xas_unlock_irq(&xas); + if (order == 0) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); - break; #ifdef CONFIG_FS_DAX_PMD - case PE_SIZE_PMD: + else if (order == PMD_ORDER) ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, true); - break; #endif - default: + else ret = VM_FAULT_FALLBACK; - } - put_locked_mapping_entry(mapping, index); + dax_unlock_entry(&xas, entry); trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); return ret; } @@ -1804,17 +1668,12 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; - size_t len = 0; + unsigned int order = pe_order(pe_size); + size_t len = PAGE_SIZE << order; - if (pe_size == PE_SIZE_PTE) - len = PAGE_SIZE; - else if (pe_size == PE_SIZE_PMD) - len = PMD_SIZE; - else - WARN_ON_ONCE(1); err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); if (err) return VM_FAULT_SIGBUS; - return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); + return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/fs/dcache.c b/fs/dcache.c index c2e443fb76ae..2593153471cf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -26,7 +26,7 @@ #include <linux/export.h> #include <linux/security.h> #include <linux/seqlock.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/list_lru.h> diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 224c04abb2e5..cf4c77f8dd08 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -256,11 +256,15 @@ ext2_init_acl(struct inode *inode, struct inode *dir) if (default_acl) { error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return error; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 00e759f05161..e770cd100a6a 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -390,11 +390,7 @@ struct ext2_inode { #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ -#ifdef CONFIG_FS_DAX #define EXT2_MOUNT_DAX 0x100000 /* Direct Access */ -#else -#define EXT2_MOUNT_DAX 0 -#endif #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 73bd58fa13de..cb91baa4275d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -309,20 +309,17 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); -#if defined(CONFIG_QUOTA) if (sbi->s_mount_opt & 
EXT2_MOUNT_USRQUOTA) seq_puts(seq, ",usrquota"); if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA) seq_puts(seq, ",grpquota"); -#endif -#ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) seq_puts(seq, ",dax"); -#endif if (!test_opt(sb, RESERVATION)) seq_puts(seq, ",noreservation"); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c3d9a42c561e..05f01fbd9c7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2643,7 +2643,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; - int tag; + xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 106f116466bf..b293cb3e27a2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2071,7 +2071,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; int nwritten = 0; pagevec_init(&pvec); @@ -2787,13 +2787,13 @@ const struct address_space_operations f2fs_dblock_aops = { #endif }; -void f2fs_clear_radix_tree_dirty_tag(struct page *page) +void f2fs_clear_page_cache_dirty_tag(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2ef84b4590ea..bacc667950b6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -726,7 +726,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, page->index, page->index + 1)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 56204a8f8a12..1e031971a466 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3108,7 +3108,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); -void f2fs_clear_radix_tree_dirty_tag(struct page *page); +void f2fs_clear_page_cache_dirty_tag(struct page *page); /* * gc.c diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index cb31a719b048..7b0cff7e6051 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -243,7 +243,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2b34206486d8..d338740d0fda 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -101,7 +101,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { if (PageDirty(page)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } @@ -1306,9 +1306,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if 
(f2fs_check_nid_range(sbi, nid)) return; - rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); - rcu_read_unlock(); + apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid); if (apage) return; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 7f5f3699fc6c..c8366cb8eccd 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -369,7 +369,9 @@ static int fat_parse_short(struct super_block *sb, } memcpy(work, de->name, sizeof(work)); - /* see namei.c, msdos_format_name */ + /* For an explanation of the special treatment of 0x05 in + * filenames, see msdos_format_name in namei_msdos.c + */ if (work[0] == 0x05) work[0] = 0xE5; @@ -1071,7 +1073,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) } } - dir->i_mtime = dir->i_atime = current_time(dir); + fat_truncate_time(dir, NULL, S_ATIME|S_MTIME); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 9d7d2d5da28b..4e1b2f6df5e6 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -416,6 +416,10 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); +extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, + int flags); +extern int fat_update_time(struct inode *inode, struct timespec64 *now, + int flags); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); int fat_cache_init(void); diff --git a/fs/fat/file.c b/fs/fat/file.c index 4f3d72fb1e60..13935ee99e1e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -227,7 +227,7 @@ static int fat_cont_expand(struct inode *inode, loff_t size) if (err) goto out; - inode->i_ctime = inode->i_mtime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); mark_inode_dirty(inode); if (IS_SYNC(inode)) { int err2; @@ -330,7 +330,7 @@ static int fat_free(struct inode *inode, int skip) MSDOS_I(inode)->i_logstart = 0; } MSDOS_I(inode)->i_attrs |= ATTR_ARCH; - inode->i_ctime = inode->i_mtime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); if (wait) { err = fat_sync_inode(inode); if (err) { @@ -542,6 +542,18 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) up_write(&MSDOS_I(inode)->truncate_lock); } + /* + * setattr_copy can't truncate these appropriately, so we'll + * copy them ourselves + */ + if (attr->ia_valid & ATTR_ATIME) + fat_truncate_time(inode, &attr->ia_atime, S_ATIME); + if (attr->ia_valid & ATTR_CTIME) + fat_truncate_time(inode, &attr->ia_ctime, S_CTIME); + if (attr->ia_valid & ATTR_MTIME) + fat_truncate_time(inode, &attr->ia_mtime, S_MTIME); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME); + setattr_copy(inode, attr); mark_inode_dirty(inode); out: @@ -552,4 +564,5 @@ EXPORT_SYMBOL_GPL(fat_setattr); const struct inode_operations fat_file_inode_operations = { .setattr = fat_setattr, .getattr = fat_getattr, + .update_time = fat_update_time, }; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index d6b81e31f9f5..c0b5b5c3373b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -244,7 +244,7 @@ static int fat_write_end(struct file *file, struct address_space *mapping, if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { - inode->i_mtime = inode->i_ctime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); MSDOS_I(inode)->i_attrs |= ATTR_ARCH; mark_inode_dirty(inode); } @@ -564,7 
+564,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) de->cdate, de->ctime_cs); fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); } else - inode->i_ctime = inode->i_atime = inode->i_mtime; + fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME); return 0; } @@ -1626,6 +1626,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; + /* + * fat timestamps are complex and truncated by fat itself, so + * we set 1 here to be fast + */ + sb->s_time_gran = 1; mutex_init(&sbi->nfs_build_inode_lock); ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 573836dcaefc..fce0a76f3f1e 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -7,6 +7,7 @@ */ #include "fat.h" +#include <linux/iversion.h> /* * fat_fs_error reports a file system problem that might indicate fa data @@ -185,6 +186,13 @@ static long days_in_year[] = { 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; +static inline int fat_tz_offset(struct msdos_sb_info *sbi) +{ + return (sbi->options.tz_set ? + -sbi->options.time_offset : + sys_tz.tz_minuteswest) * SECS_PER_MIN; +} + /* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs) @@ -210,10 +218,7 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, + days_in_year[month] + day + DAYS_DELTA) * SECS_PER_DAY; - if (!sbi->options.tz_set) - second += sys_tz.tz_minuteswest * SECS_PER_MIN; - else - second -= sbi->options.time_offset * SECS_PER_MIN; + second += fat_tz_offset(sbi); if (time_cs) { ts->tv_sec = second + (time_cs / 100); @@ -229,9 +234,7 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs) { struct tm tm; - time64_to_tm(ts->tv_sec, - (sbi->options.tz_set ? 
sbi->options.time_offset : - -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm); + time64_to_tm(ts->tv_sec, -fat_tz_offset(sbi), &tm); /* FAT can only support years between 1980 and 2107 */ if (tm.tm_year < 1980 - 1900) { @@ -263,6 +266,80 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, } EXPORT_SYMBOL_GPL(fat_time_unix2fat); +static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) +{ + return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; +} +/* + * truncate the various times with appropriate granularity: + * root inode: + * all times always 0 + * all other inodes: + * mtime - 2 seconds + * ctime + * msdos - 2 seconds + * vfat - 10 milliseconds + * atime - 24 hours (00:00:00 in local timezone) + */ +int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct timespec64 ts; + + if (inode->i_ino == MSDOS_ROOT_INO) + return 0; + + if (now == NULL) { + now = &ts; + ts = current_time(inode); + } + + if (flags & S_ATIME) { + /* to localtime */ + time64_t seconds = now->tv_sec - fat_tz_offset(sbi); + s32 remainder; + + div_s64_rem(seconds, SECS_PER_DAY, &remainder); + /* to day boundary, and back to unix time */ + seconds = seconds + fat_tz_offset(sbi) - remainder; + + inode->i_atime = (struct timespec64){ seconds, 0 }; + } + if (flags & S_CTIME) { + if (sbi->options.isvfat) + inode->i_ctime = timespec64_trunc(*now, 10000000); + else + inode->i_ctime = fat_timespec64_trunc_2secs(*now); + } + if (flags & S_MTIME) + inode->i_mtime = fat_timespec64_trunc_2secs(*now); + + return 0; +} +EXPORT_SYMBOL_GPL(fat_truncate_time); + +int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) +{ + int iflags = I_DIRTY_TIME; + bool dirty = false; + + if (inode->i_ino == MSDOS_ROOT_INO) + return 0; + + fat_truncate_time(inode, now, flags); + if (flags & S_VERSION) + dirty = inode_maybe_inc_iversion(inode, false); + if ((flags & (S_ATIME | S_CTIME | S_MTIME)) && + !(inode->i_sb->s_flags & SB_LAZYTIME)) + dirty = true; + + if (dirty) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); + return 0; +} +EXPORT_SYMBOL_GPL(fat_update_time); + int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { int i, err = 0;
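The truncation rules in fat_truncate_time() above mirror the on-disk format: FAT keeps mtime (and msdos ctime) at 2-second resolution, vfat records creation time with an extra 10 ms field, and the access date carries no time of day at all, so atime is rounded to midnight local time. A standalone sketch of the same arithmetic, with hypothetical types and helper names; it assumes post-epoch times, which FAT guarantees since it can only represent the years 1980-2107:

#include <stdint.h>

#define SECS_PER_DAY (24 * 60 * 60)

struct ts { int64_t tv_sec; long tv_nsec; }; /* stand-in for timespec64 */

/* mtime, and ctime on msdos: round down to an even second */
static struct ts trunc_2secs(struct ts t)
{
	return (struct ts){ t.tv_sec & ~1LL, 0 };
}

/*
 * atime: round down to 00:00:00 of the same local day, then express the
 * result in UTC again. tz_offset is seconds west of UTC, with the same
 * sign convention as fat_tz_offset() above.
 */
static struct ts trunc_day(struct ts t, long tz_offset)
{
	int64_t local = t.tv_sec - tz_offset;

	local -= local % SECS_PER_DAY; /* safe: FAT times are post-epoch */
	return (struct ts){ local + tz_offset, 0 };
}
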
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index efb8c40c9d27..f2cd365a4e86 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -250,7 +250,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, if (err) return err; - dir->i_ctime = dir->i_mtime = *ts; + fat_truncate_time(dir, ts, S_CTIME|S_MTIME); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -294,7 +294,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, err = PTR_ERR(inode); goto out; } - inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -327,7 +327,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_ctime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME); fat_detach(inode); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -380,7 +380,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; } set_nlink(inode, 2); - inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -413,7 +413,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; clear_nlink(inode); - inode->i_ctime = current_time(inode); + fat_truncate_time(inode, NULL, S_CTIME); fat_detach(inode); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -478,7 +478,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, mark_inode_dirty(old_inode); inode_inc_iversion(old_dir); - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + fat_truncate_time(old_dir, NULL, S_CTIME|S_MTIME); if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); else @@ -538,7 +538,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, if (err) goto error_dotdot; inode_inc_iversion(old_dir); - old_dir->i_ctime = old_dir->i_mtime = ts; + fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME); if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); else @@ -548,7 +548,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, drop_nlink(new_inode); if (is_dir) drop_nlink(new_inode); - new_inode->i_ctime = ts; + fat_truncate_time(new_inode, &ts, S_CTIME); } out: brelse(sinfo.bh); @@ -637,6 +637,7 @@ static const struct inode_operations msdos_dir_inode_operations = { .rename = msdos_rename, .setattr = fat_setattr, .getattr = fat_getattr, + .update_time = fat_update_time, }; static void setup(struct super_block *sb) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 82cd1e69cbdf..996c8c25e9c6 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname, goto cleanup; /* update timestamp */ - dir->i_ctime = dir->i_mtime = dir->i_atime = *ts; + fat_truncate_time(dir, ts, S_CTIME|S_MTIME); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -779,7 +779,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto out; } inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); /* timestamp is already written, so mark_inode_dirty() is unneeded.
*/ d_instantiate(dentry, inode); @@ -810,7 +810,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); + fat_truncate_time(inode, NULL, S_ATIME|S_MTIME); fat_detach(inode); vfat_d_version_set(dentry, inode_query_iversion(dir)); out: @@ -836,7 +836,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); + fat_truncate_time(inode, NULL, S_ATIME|S_MTIME); fat_detach(inode); vfat_d_version_set(dentry, inode_query_iversion(dir)); out: @@ -876,7 +876,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } inode_inc_iversion(inode); set_nlink(inode, 2); - inode->i_mtime = inode->i_atime = inode->i_ctime = ts; + fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -969,7 +969,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto error_dotdot; inode_inc_iversion(old_dir); - old_dir->i_ctime = old_dir->i_mtime = ts; + fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME); if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); else @@ -979,7 +979,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, drop_nlink(new_inode); if (is_dir) drop_nlink(new_inode); - new_inode->i_ctime = ts; + fat_truncate_time(new_inode, &ts, S_CTIME); } out: brelse(sinfo.bh); @@ -1032,6 +1032,7 @@ static const struct inode_operations vfat_dir_inode_operations = { .rename = vfat_rename, .setattr = fat_setattr, .getattr = fat_getattr, + .update_time = fat_update_time, }; static void setup(struct super_block *sb)
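The fs-writeback.c hunks below are part of the radix-tree to XArray conversion: xas_for_each_marked() returns only live entries that carry the requested mark, which is why the NULL checks and the radix_tree_deref_slot_protected() calls can simply be dropped. A minimal sketch of the iteration pattern, using a hypothetical counting helper; it assumes the xa_lock of mapping->i_pages is held by the caller, as it is in inode_switch_wbs_work_fn():

#include <linux/fs.h>
#include <linux/xarray.h>

/* Count the dirty and writeback pages of a mapping (illustration only). */
static void count_tagged_pages(struct address_space *mapping,
			       unsigned long *dirty, unsigned long *writeback)
{
	XA_STATE(xas, &mapping->i_pages, 0);
	struct page *page;

	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY)
		(*dirty)++;

	xas_set(&xas, 0); /* rewind before the second walk */
	xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK)
		(*writeback)++;
}
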
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 471d863958bc..b40168fcc94a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -339,9 +339,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; - struct radix_tree_iter iter; + XA_STATE(xas, &mapping->i_pages, 0); + struct page *page; bool switched = false; - void **slot; /* * By the time control reaches here, RCU grace period has passed @@ -375,25 +375,18 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to * pages actually under writeback. */ - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, - PAGECACHE_TAG_DIRTY) { - struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (likely(page) && PageDirty(page)) { + xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) { + if (PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, - PAGECACHE_TAG_WRITEBACK) { - struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (likely(page)) { - WARN_ON_ONCE(!PageWriteback(page)); - dec_wb_stat(old_wb, WB_WRITEBACK); - inc_wb_stat(new_wb, WB_WRITEBACK); - } + xas_set(&xas, 0); + xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { + WARN_ON_ONCE(!PageWriteback(page)); + dec_wb_stat(old_wb, WB_WRITEBACK); + inc_wb_stat(new_wb, WB_WRITEBACK); } wb_get(new_wb); diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 60da84a86dab..f7b807bc1027 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o -fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o +fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 0b694655d988..989df5accaee 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -107,7 +107,7 @@ static ssize_t fuse_conn_max_background_read(struct file *file, if (!fc) return 0; - val = fc->max_background; + val = READ_ONCE(fc->max_background); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); @@ -125,7 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file, if (ret > 0) { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { + spin_lock(&fc->bg_lock); fc->max_background = val; + fc->blocked = fc->num_background >= fc->max_background; + if (!fc->blocked) + wake_up(&fc->blocked_waitq); + spin_unlock(&fc->bg_lock); fuse_conn_put(fc); } } @@ -144,7 +149,7 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file, if (!fc) return 0; - val = fc->congestion_threshold; + val = READ_ONCE(fc->congestion_threshold); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); @@ -155,18 +160,31 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, size_t count, loff_t *ppos) { unsigned uninitialized_var(val); + struct fuse_conn *fc; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, max_user_congthresh); - if (ret > 0) { - struct fuse_conn *fc = fuse_ctl_file_conn_get(file); - if (fc) { - fc->congestion_threshold = val; - fuse_conn_put(fc); + if (ret <= 0) + goto out; + fc = fuse_ctl_file_conn_get(file); + if (!fc) + goto out; + + spin_lock(&fc->bg_lock); + fc->congestion_threshold = val; + if (fc->sb) { + if (fc->num_background < fc->congestion_threshold) { + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + } else { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } } - + spin_unlock(&fc->bg_lock); + fuse_conn_put(fc); +out: return ret; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 11ea2c4a38ab..ae813e609932 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -25,6 +25,10 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); +/* Ordinary requests have even IDs, while interrupt IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) +
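These two constants establish the request-ID convention used by the rest of the series: IDs advance in steps of two and therefore stay even, and an interrupt is addressed by its parent request's ID with the low bit set, replacing the separately allocated intr_unique field that the later hunks remove. Since both IDs select the same bucket once the bit is masked off, one hash lookup serves both kinds of reply in fuse_dev_do_write(). A condensed sketch of the arithmetic, reusing the two defines just added and mirroring fuse_get_unique() and fuse_req_hash() from the hunks below:

#include <linux/hash.h>

#define FUSE_PQ_HASH_BITS 8 /* matches the fuse_i.h definition below */

static u64 reqctr;

/* ordinary requests: 2, 4, 6, ... */
static u64 next_unique(void)
{
	reqctr += FUSE_REQ_ID_STEP;
	return reqctr;
}

/* the odd twin of a request ID */
static u64 interrupt_id(u64 unique)
{
	return unique | FUSE_INT_REQ_BIT;
}

/* both IDs land in the same processing-queue bucket */
static unsigned int pq_bucket(u64 unique)
{
	return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
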
static struct kmem_cache *fuse_req_cachep; static struct fuse_dev *fuse_get_dev(struct file *file) @@ -40,9 +44,6 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, struct fuse_page_desc *page_descs, unsigned npages) { - memset(req, 0, sizeof(*req)); - memset(pages, 0, sizeof(*pages) * npages); - memset(page_descs, 0, sizeof(*page_descs) * npages); INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); @@ -53,30 +54,36 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, __set_bit(FR_PENDING, &req->flags); } +static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) pages + npages * sizeof(struct page *); + + return pages; +} + static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); + struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) { - struct page **pages; - struct fuse_page_desc *page_descs; - - if (npages <= FUSE_REQ_INLINE_PAGES) { + struct page **pages = NULL; + struct fuse_page_desc *page_descs = NULL; + + WARN_ON(npages > FUSE_MAX_MAX_PAGES); + if (npages > FUSE_REQ_INLINE_PAGES) { + pages = fuse_req_pages_alloc(npages, flags, + &page_descs); + if (!pages) { + kmem_cache_free(fuse_req_cachep, req); + return NULL; + } + } else if (npages) { pages = req->inline_pages; page_descs = req->inline_page_descs; - } else { - pages = kmalloc_array(npages, sizeof(struct page *), - flags); - page_descs = - kmalloc_array(npages, - sizeof(struct fuse_page_desc), - flags); - } - - if (!pages || !page_descs) { - kfree(pages); - kfree(page_descs); - kmem_cache_free(fuse_req_cachep, req); - return NULL; } fuse_request_init(req, pages, page_descs, npages); @@ -95,12 +102,41 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages) return __fuse_request_alloc(npages, GFP_NOFS); } -void fuse_request_free(struct fuse_req *req) +static void fuse_req_pages_free(struct fuse_req *req) { - if (req->pages != req->inline_pages) { + if (req->pages != req->inline_pages) kfree(req->pages); - kfree(req->page_descs); - } +} + +bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, + gfp_t flags) +{ + struct page **pages; + struct fuse_page_desc *page_descs; + unsigned int npages = min_t(unsigned int, + max_t(unsigned int, req->max_pages * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + fc->max_pages); + WARN_ON(npages <= req->max_pages); + + pages = fuse_req_pages_alloc(npages, flags, &page_descs); + if (!pages) + return false; + + memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages); + memcpy(page_descs, req->page_descs, + sizeof(struct fuse_page_desc) * req->max_pages); + fuse_req_pages_free(req); + req->pages = pages; + req->page_descs = page_descs; + req->max_pages = npages; + + return true; +} + +void fuse_request_free(struct fuse_req *req) +{ + fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } @@ -235,8 +271,10 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) struct file *file = req->stolen_file; struct fuse_file *ff = file->private_data; + WARN_ON(req->max_pages); spin_lock(&fc->lock); - fuse_request_init(req, req->pages, req->page_descs, req->max_pages); + memset(req, 0, sizeof(*req)); + fuse_request_init(req, NULL,
NULL, 0); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); @@ -287,10 +325,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) * We get here in the unlikely case that a background * request was allocated but not sent */ - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); if (!fc->blocked) wake_up(&fc->blocked_waitq); - spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); } if (test_bit(FR_WAITING, &req->flags)) { @@ -319,7 +357,13 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) static u64 fuse_get_unique(struct fuse_iqueue *fiq) { - return ++fiq->reqctr; + fiq->reqctr += FUSE_REQ_ID_STEP; + return fiq->reqctr; +} + +static unsigned int fuse_req_hash(u64 unique) +{ + return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) @@ -353,12 +397,13 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, static void flush_bg_queue(struct fuse_conn *fc) { + struct fuse_iqueue *fiq = &fc->iq; + while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; - struct fuse_iqueue *fiq = &fc->iq; - req = list_entry(fc->bg_queue.next, struct fuse_req, list); + req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; spin_lock(&fiq->waitq.lock); @@ -389,14 +434,21 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); clear_bit(FR_BACKGROUND, &req->flags); - if (fc->num_background == fc->max_background) + if (fc->num_background == fc->max_background) { fc->blocked = 0; - - /* Wake up next waiter, if any */ - if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) wake_up(&fc->blocked_waitq); + } else if (!fc->blocked) { + /* + * Wake up next waiter, if any. It's okay to use + * waitqueue_active(), as we've already synced up + * fc->blocked with waiters with the wake_up() call + * above. 
+ */ + if (waitqueue_active(&fc->blocked_waitq)) + wake_up(&fc->blocked_waitq); + } if (fc->num_background == fc->congestion_threshold && fc->sb) { clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); @@ -405,7 +457,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) fc->num_background--; fc->active_background--; flush_bg_queue(fc); - spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); } wake_up(&req->waitq); if (req->end) @@ -573,40 +625,38 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -/* - * Called under fc->lock - * - * fc->connected must have been checked previously - */ -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req) +bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(!test_bit(FR_BACKGROUND, &req->flags)); + bool queued = false; + + WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); if (!test_bit(FR_WAITING, &req->flags)) { __set_bit(FR_WAITING, &req->flags); atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); - fc->num_background++; - if (fc->num_background == fc->max_background) - fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fc->sb) { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + spin_lock(&fc->bg_lock); + if (likely(fc->connected)) { + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + if (fc->num_background == fc->congestion_threshold && fc->sb) { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + } + list_add_tail(&req->list, &fc->bg_queue); + flush_bg_queue(fc); + queued = true; } - list_add_tail(&req->list, &fc->bg_queue); - flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); + + return queued; } void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(!req->end); - spin_lock(&fc->lock); - if (fc->connected) { - fuse_request_send_background_locked(fc, req); - spin_unlock(&fc->lock); - } else { - spin_unlock(&fc->lock); + WARN_ON(!req->end); + if (!fuse_request_queue_background(fc, req)) { req->out.h.error = -ENOTCONN; req->end(fc, req); fuse_put_request(fc, req); @@ -1084,12 +1134,11 @@ __releases(fiq->waitq.lock) int err; list_del_init(&req->intr_entry); - req->intr_unique = fuse_get_unique(fiq); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; ih.opcode = FUSE_INTERRUPT; - ih.unique = req->intr_unique; + ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); arg.unique = req->in.h.unique; spin_unlock(&fiq->waitq.lock); @@ -1238,6 +1287,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_req *req; struct fuse_in *in; unsigned reqsize; + unsigned int hash; restart: spin_lock(&fiq->waitq.lock); @@ -1310,13 +1360,16 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, err = reqsize; goto out_end; } - list_move_tail(&req->list, &fpq->processing); - spin_unlock(&fpq->lock); + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); + __fuse_get_request(req); set_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) queue_interrupt(fiq, req); + fuse_put_request(fc, req); return reqsize; @@ -1663,7 +1716,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int num; 
unsigned int offset; size_t total_len = 0; - int num_pages; + unsigned int num_pages; offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); @@ -1675,7 +1728,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num = file_size - outarg->offset; num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + num_pages = min(num_pages, fc->max_pages); req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) @@ -1792,10 +1845,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, /* Look up request on processing list by unique ID */ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) { + unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; - list_for_each_entry(req, &fpq->processing, list) { - if (req->in.h.unique == unique || req->intr_unique == unique) + list_for_each_entry(req, &fpq->processing[hash], list) { + if (req->in.h.unique == unique) return req; } return NULL; @@ -1869,22 +1923,26 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, if (!fpq->connected) goto err_unlock_pq; - req = request_find(fpq, oh.unique); + req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); if (!req) goto err_unlock_pq; - /* Is it an interrupt reply? */ - if (req->intr_unique == oh.unique) { + /* Is it an interrupt reply ID? */ + if (oh.unique & FUSE_INT_REQ_BIT) { + __fuse_get_request(req); spin_unlock(&fpq->lock); err = -EINVAL; - if (nbytes != sizeof(struct fuse_out_header)) + if (nbytes != sizeof(struct fuse_out_header)) { + fuse_put_request(fc, req); goto err_finish; + } if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) queue_interrupt(&fc->iq, req); + fuse_put_request(fc, req); fuse_copy_finish(cs); return nbytes; @@ -2102,9 +2160,13 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) struct fuse_dev *fud; struct fuse_req *req, *next; LIST_HEAD(to_end); + unsigned int i; + /* Background queuing checks fc->connected under bg_lock */ + spin_lock(&fc->bg_lock); fc->connected = 0; - fc->blocked = 0; + spin_unlock(&fc->bg_lock); + fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { @@ -2123,11 +2185,16 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) } spin_unlock(&req->waitq.lock); } - list_splice_tail_init(&fpq->processing, &to_end); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_tail_init(&fpq->processing[i], + &to_end); spin_unlock(&fpq->lock); } + spin_lock(&fc->bg_lock); + fc->blocked = 0; fc->max_background = UINT_MAX; flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); spin_lock(&fiq->waitq.lock); fiq->connected = 0; @@ -2163,10 +2230,12 @@ int fuse_dev_release(struct inode *inode, struct file *file) struct fuse_conn *fc = fud->fc; struct fuse_pqueue *fpq = &fud->pq; LIST_HEAD(to_end); + unsigned int i; spin_lock(&fpq->lock); WARN_ON(!list_empty(&fpq->io)); - list_splice_init(&fpq->processing, &to_end); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); end_requests(fc, &to_end); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0979609d6eba..47395b0c3b35 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,24 +14,9 @@ #include <linux/namei.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/iversion.h> #include <linux/posix_acl.h> -static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) -{ - struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_inode 
*fi = get_fuse_inode(dir); - - if (!fc->do_readdirplus) - return false; - if (!fc->readdirplus_auto) - return true; - if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) - return true; - if (ctx->pos == 0) - return true; - return false; -} - static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -80,8 +65,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) * Set dentry and possibly attribute timeouts from the lookup/mk* * replies */ -static void fuse_change_entry_timeout(struct dentry *entry, - struct fuse_entry_out *o) +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); @@ -92,18 +76,29 @@ static u64 attr_timeout(struct fuse_attr_out *o) return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } -static u64 entry_attr_timeout(struct fuse_entry_out *o) +u64 entry_attr_timeout(struct fuse_entry_out *o) { return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } +static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) +{ + set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); +} + /* * Mark the attributes as stale, so that at the next call to * ->getattr() they will be fetched from userspace */ void fuse_invalidate_attr(struct inode *inode) { - get_fuse_inode(inode)->i_time = 0; + fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS); +} + +static void fuse_dir_changed(struct inode *dir) +{ + fuse_invalidate_attr(dir); + inode_maybe_inc_iversion(dir, false); } /** @@ -113,7 +108,7 @@ void fuse_invalidate_attr(struct inode *inode) void fuse_invalidate_atime(struct inode *inode) { if (!IS_RDONLY(inode)) - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, STATX_ATIME); } /* @@ -262,11 +257,6 @@ invalid: goto out; } -static int invalid_nodeid(u64 nodeid) -{ - return !nodeid || nodeid == FUSE_ROOT_ID; -} - static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); @@ -469,7 +459,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); err = finish_open(file, entry, generic_file_open); if (err) { fuse_sync_release(ff, flags); @@ -583,7 +573,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, } else { fuse_change_entry_timeout(entry, &outarg); } - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); return 0; out_put_forget_req: @@ -693,7 +683,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) drop_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); fuse_update_ctime(inode); } else if (err == -EINTR) @@ -715,7 +705,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(d_inode(entry)); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); @@ -754,9 +744,9 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, fuse_update_ctime(d_inode(newent)); } - fuse_invalidate_attr(olddir); + fuse_dir_changed(olddir); if (olddir != newdir) - fuse_invalidate_attr(newdir); + fuse_dir_changed(newdir); /* newent will end up negative */ if (!(flags & 
RENAME_EXCHANGE) && d_really_is_positive(newent)) { @@ -932,7 +922,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, } static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat, unsigned int flags) + struct kstat *stat, u32 request_mask, + unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; @@ -942,6 +933,8 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; + else if (request_mask & READ_ONCE(fi->inval_mask)) + sync = true; else sync = time_before64(fi->i_time, get_jiffies_64()); @@ -959,7 +952,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file) { - return fuse_update_get_attr(inode, file, NULL, 0); + /* Do *not* need to get atime for internal purposes */ + return fuse_update_get_attr(inode, file, NULL, + STATX_BASIC_STATS & ~STATX_ATIME, 0); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, @@ -989,7 +984,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!entry) goto unlock; - fuse_invalidate_attr(parent); + fuse_dir_changed(parent); fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { @@ -1165,271 +1160,78 @@ static int fuse_permission(struct inode *inode, int mask) return err; } -static int parse_dirfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx) -{ - while (nbytes >= FUSE_NAME_OFFSET) { - struct fuse_dirent *dirent = (struct fuse_dirent *) buf; - size_t reclen = FUSE_DIRENT_SIZE(dirent); - if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) - return -EIO; - if (reclen > nbytes) - break; - if (memchr(dirent->name, '/', dirent->namelen) != NULL) - return -EIO; - - if (!dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type)) - break; - - buf += reclen; - nbytes -= reclen; - ctx->pos = dirent->off; - } - - return 0; -} - -static int fuse_direntplus_link(struct file *file, - struct fuse_direntplus *direntplus, - u64 attr_version) -{ - struct fuse_entry_out *o = &direntplus->entry_out; - struct fuse_dirent *dirent = &direntplus->dirent; - struct dentry *parent = file->f_path.dentry; - struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); - struct dentry *dentry; - struct dentry *alias; - struct inode *dir = d_inode(parent); - struct fuse_conn *fc; - struct inode *inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - - if (!o->nodeid) { - /* - * Unlike in the case of fuse_lookup, zero nodeid does not mean - * ENOENT. Instead, it only means the userspace filesystem did - * not want to return attributes/handle for this entry. - * - * So do nothing. - */ - return 0; - } - - if (name.name[0] == '.') { - /* - * We could potentially refresh the attributes of the directory - * and its parent? - */ - if (name.len == 1) - return 0; - if (name.name[1] == '.' 
&& name.len == 2) - return 0; - } - - if (invalid_nodeid(o->nodeid)) - return -EIO; - if (!fuse_valid_type(o->attr.mode)) - return -EIO; - - fc = get_fuse_conn(dir); - - name.hash = full_name_hash(parent, name.name, name.len); - dentry = d_lookup(parent, &name); - if (!dentry) { -retry: - dentry = d_alloc_parallel(parent, &name, &wq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - } - if (!d_in_lookup(dentry)) { - struct fuse_inode *fi; - inode = d_inode(dentry); - if (!inode || - get_node_id(inode) != o->nodeid || - ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { - d_invalidate(dentry); - dput(dentry); - goto retry; - } - if (is_bad_inode(inode)) { - dput(dentry); - return -EIO; - } - - fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->nlookup++; - spin_unlock(&fc->lock); - - forget_all_cached_acls(inode); - fuse_change_attributes(inode, &o->attr, - entry_attr_timeout(o), - attr_version); - /* - * The other branch comes via fuse_iget() - * which bumps nlookup inside - */ - } else { - inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, - &o->attr, entry_attr_timeout(o), - attr_version); - if (!inode) - inode = ERR_PTR(-ENOMEM); - - alias = d_splice_alias(inode, dentry); - d_lookup_done(dentry); - if (alias) { - dput(dentry); - dentry = alias; - } - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - } - if (fc->readdirplus_auto) - set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); - fuse_change_entry_timeout(dentry, o); - - dput(dentry); - return 0; -} - -static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx, u64 attr_version) +static int fuse_readlink_page(struct inode *inode, struct page *page) { - struct fuse_direntplus *direntplus; - struct fuse_dirent *dirent; - size_t reclen; - int over = 0; - int ret; - - while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { - direntplus = (struct fuse_direntplus *) buf; - dirent = &direntplus->dirent; - reclen = FUSE_DIRENTPLUS_SIZE(direntplus); - - if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) - return -EIO; - if (reclen > nbytes) - break; - if (memchr(dirent->name, '/', dirent->namelen) != NULL) - return -EIO; - - if (!over) { - /* We fill entries into dstbuf only as much as - it can hold. But we still continue iterating - over remaining entries to link them. If not, - we need to send a FORGET for each of those - which we did not link. 
- */ - over = !dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type); - if (!over) - ctx->pos = dirent->off; - } - - buf += reclen; - nbytes -= reclen; - - ret = fuse_direntplus_link(file, direntplus, attr_version); - if (ret) - fuse_force_forget(file, direntplus->entry_out.nodeid); - } - - return 0; -} - -static int fuse_readdir(struct file *file, struct dir_context *ctx) -{ - int plus, err; - size_t nbytes; - struct page *page; - struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; - u64 attr_version = 0; - bool locked; - - if (is_bad_inode(inode)) - return -EIO; + int err; req = fuse_get_req(fc, 1); if (IS_ERR(req)) return PTR_ERR(req); - page = alloc_page(GFP_KERNEL); - if (!page) { - fuse_put_request(fc, req); - return -ENOMEM; - } - - plus = fuse_use_readdirplus(inode, ctx); + req->out.page_zeroing = 1; req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE; - if (plus) { - attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); - } else { - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); - } - locked = fuse_lock_inode(inode); + req->page_descs[0].length = PAGE_SIZE - 1; + req->in.h.opcode = FUSE_READLINK; + req->in.h.nodeid = get_node_id(inode); + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = PAGE_SIZE - 1; fuse_request_send(fc, req); - fuse_unlock_inode(inode, locked); - nbytes = req->out.args[0].size; err = req->out.h.error; - fuse_put_request(fc, req); + if (!err) { - if (plus) { - err = parse_dirplusfile(page_address(page), nbytes, - file, ctx, - attr_version); - } else { - err = parse_dirfile(page_address(page), nbytes, file, - ctx); - } + char *link = page_address(page); + size_t len = req->out.args[0].size; + + BUG_ON(len >= PAGE_SIZE); + link[len] = '\0'; } - __free_page(page); + fuse_put_request(fc, req); fuse_invalidate_atime(inode); + return err; } -static const char *fuse_get_link(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) +static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - char *link; - ssize_t ret; + struct page *page; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out_err; + if (fc->cache_symlinks) + return page_get_link(dentry, inode, callback); + + err = -ECHILD; if (!dentry) - return ERR_PTR(-ECHILD); + goto out_err; - link = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!link) - return ERR_PTR(-ENOMEM); + page = alloc_page(GFP_KERNEL); + err = -ENOMEM; + if (!page) + goto out_err; - args.in.h.opcode = FUSE_READLINK; - args.in.h.nodeid = get_node_id(inode); - args.out.argvar = 1; - args.out.numargs = 1; - args.out.args[0].size = PAGE_SIZE - 1; - args.out.args[0].value = link; - ret = fuse_simple_request(fc, &args); - if (ret < 0) { - kfree(link); - link = ERR_PTR(ret); - } else { - link[ret] = '\0'; - set_delayed_call(done, kfree_link, link); + err = fuse_readlink_page(inode, page); + if (err) { + __free_page(page); + goto out_err; } - fuse_invalidate_atime(inode); - return link; + + set_delayed_call(callback, page_put_link, page); + + return page_address(page); + +out_err: + return ERR_PTR(err); } static int fuse_dir_open(struct inode *inode, struct file *file) @@ -1662,8 +1464,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, file = NULL; } - if (attr->ia_valid & 
ATTR_SIZE) + if (attr->ia_valid & ATTR_SIZE) { + if (WARN_ON(!S_ISREG(inode->i_mode))) + return -EIO; is_truncate = true; + } if (is_truncate) { fuse_set_nowrite(inode); @@ -1811,7 +1616,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_get_attr(inode, NULL, stat, flags); + return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { @@ -1867,11 +1672,37 @@ void fuse_init_common(struct inode *inode) void fuse_init_dir(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + inode->i_op = &fuse_dir_inode_operations; inode->i_fop = &fuse_dir_operations; + + spin_lock_init(&fi->rdc.lock); + fi->rdc.cached = false; + fi->rdc.size = 0; + fi->rdc.pos = 0; + fi->rdc.version = 0; } +static int fuse_symlink_readpage(struct file *null, struct page *page) +{ + int err = fuse_readlink_page(page->mapping->host, page); + + if (!err) + SetPageUptodate(page); + + unlock_page(page); + + return err; +} + +static const struct address_space_operations fuse_symlink_aops = { + .readpage = fuse_symlink_readpage, +}; + void fuse_init_symlink(struct inode *inode) { inode->i_op = &fuse_symlink_inode_operations; + inode->i_data.a_ops = &fuse_symlink_aops; + inode_nohighmem(inode); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 32d0b883e74f..58dbc39fea63 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -59,6 +59,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) } INIT_LIST_HEAD(&ff->write_entry); + mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -73,6 +74,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) void fuse_file_free(struct fuse_file *ff) { fuse_request_free(ff->reserved_req); + mutex_destroy(&ff->readdir.lock); kfree(ff); } @@ -848,11 +850,11 @@ static int fuse_readpages_fill(void *_data, struct page *page) fuse_wait_on_page_writeback(inode, page->index); if (req->num_pages && - (req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages == fc->max_pages || (req->num_pages + 1) * PAGE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { - int nr_alloc = min_t(unsigned, data->nr_pages, - FUSE_MAX_PAGES_PER_REQ); + unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, + fc->max_pages); fuse_send_readpages(req, data->file); if (fc->async_read) req = fuse_get_req_for_background(fc, nr_alloc); @@ -887,7 +889,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; - int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); + unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages); err = -EIO; if (is_bad_inode(inode)) @@ -1102,12 +1104,13 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, return count > 0 ? 
count : err; } -static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, + unsigned int max_pages) { - return min_t(unsigned, + return min_t(unsigned int, ((pos + len - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT) + 1, - FUSE_MAX_PAGES_PER_REQ); + max_pages); } static ssize_t fuse_perform_write(struct kiocb *iocb, @@ -1129,7 +1132,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, do { struct fuse_req *req; ssize_t count; - unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); + unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), + fc->max_pages); req = fuse_get_req(fc, nr_pages); if (IS_ERR(req)) { @@ -1319,11 +1323,6 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, return ret < 0 ? ret : 0; } -static inline int fuse_iter_npages(const struct iov_iter *ii_p) -{ - return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ); -} - ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { @@ -1343,9 +1342,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int err = 0; if (io->async) - req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); + req = fuse_get_req_for_background(fc, iov_iter_npages(iter, + fc->max_pages)); else - req = fuse_get_req(fc, fuse_iter_npages(iter)); + req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1390,9 +1390,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, fuse_put_request(fc, req); if (io->async) req = fuse_get_req_for_background(fc, - fuse_iter_npages(iter)); + iov_iter_npages(iter, fc->max_pages)); else - req = fuse_get_req(fc, fuse_iter_npages(iter)); + req = fuse_get_req(fc, iov_iter_npages(iter, + fc->max_pages)); if (IS_ERR(req)) break; } @@ -1418,7 +1419,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, res = fuse_direct_io(io, iter, ppos, 0); - fuse_invalidate_attr(inode); + fuse_invalidate_atime(inode); return res; } @@ -1487,6 +1488,7 @@ __acquires(fc->lock) struct fuse_inode *fi = get_fuse_inode(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; __u64 data_size = req->num_pages * PAGE_SIZE; + bool queued; if (!fc->connected) goto out_free; @@ -1502,7 +1504,8 @@ __acquires(fc->lock) req->in.args[1].size = inarg->size; fi->writectr++; - fuse_request_send_background_locked(fc, req); + queued = fuse_request_queue_background(fc, req); + WARN_ON(!queued); return; out_free: @@ -1819,12 +1822,18 @@ static int fuse_writepages_fill(struct page *page, is_writeback = fuse_page_is_writeback(inode, page->index); if (req && req->num_pages && - (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (is_writeback || req->num_pages == fc->max_pages || (req->num_pages + 1) * PAGE_SIZE > fc->max_write || data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); data->req = NULL; + } else if (req && req->num_pages == req->max_pages) { + if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { + fuse_writepages_send(data); + req = data->req = NULL; + } } + err = -ENOMEM; tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) @@ -1847,7 +1856,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ); + req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); if (!req) { __free_page(tmp_page); goto out_unlock; @@ -1904,6 +1913,7 @@ 
static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_wb_data data; int err; @@ -1916,7 +1926,7 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + data.orig_pages = kcalloc(fc->max_pages, sizeof(struct page *), GFP_NOFS); if (!data.orig_pages) @@ -2387,10 +2397,11 @@ static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, } /* Make sure iov_length() won't overflow */ -static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, + size_t count) { size_t n; - u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + u32 max = fc->max_pages << PAGE_SHIFT; for (n = 0; n < count; n++, iov++) { if (iov->iov_len > (size_t) max) @@ -2514,7 +2525,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); + pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!pages || !iov_page) goto out; @@ -2553,7 +2564,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* make sure there are enough buffer pages and init request with them */ err = -ENOMEM; - if (max_pages > FUSE_MAX_PAGES_PER_REQ) + if (max_pages > fc->max_pages) goto out; while (num_pages < max_pages) { pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); @@ -2640,11 +2651,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iov = iov_page; out_iov = in_iov + in_iovs; - err = fuse_verify_ioctl_iov(in_iov, in_iovs); + err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); if (err) goto out; - err = fuse_verify_ioctl_iov(out_iov, out_iovs); + err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); if (err) goto out; @@ -2835,9 +2846,9 @@ static void fuse_do_truncate(struct file *file) fuse_do_setattr(file_dentry(file), &attr, file); } -static inline loff_t fuse_round_up(loff_t off) +static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) { - return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + return round_up(off, fc->max_pages << PAGE_SHIFT); } static ssize_t @@ -2866,7 +2877,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; - iov_iter_truncate(iter, fuse_round_up(i_size - offset)); + iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); count = iov_iter_count(iter); } @@ -3011,6 +3022,82 @@ out: return err; } +static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + struct fuse_file *ff_in = file_in->private_data; + struct fuse_file *ff_out = file_out->private_data; + struct inode *inode_out = file_inode(file_out); + struct fuse_inode *fi_out = get_fuse_inode(inode_out); + struct fuse_conn *fc = ff_in->fc; + FUSE_ARGS(args); + struct fuse_copy_file_range_in inarg = { + .fh_in = ff_in->fh, + .off_in = pos_in, + .nodeid_out = ff_out->nodeid, + .fh_out = ff_out->fh, + .off_out = pos_out, + .len = len, + .flags = flags + }; + struct fuse_write_out outarg; + ssize_t err; + /* mark 
unstable when write-back is not used, and file_out gets + * extended */ + bool is_unstable = (!fc->writeback_cache) && + ((pos_out + len) > inode_out->i_size); + + if (fc->no_copy_file_range) + return -EOPNOTSUPP; + + inode_lock(inode_out); + + if (fc->writeback_cache) { + err = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + len); + if (err) + goto out; + + fuse_sync_writes(inode_out); + } + + if (is_unstable) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + args.in.h.opcode = FUSE_COPY_FILE_RANGE; + args.in.h.nodeid = ff_in->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } + if (err) + goto out; + + if (fc->writeback_cache) { + fuse_write_update_size(inode_out, pos_out + outarg.size); + file_update_time(file_out); + } + + fuse_invalidate_attr(inode_out); + + err = outarg.size; +out: + if (is_unstable) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + inode_unlock(inode_out); + + return err; +} + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read_iter = fuse_file_read_iter, @@ -3027,6 +3114,7 @@ static const struct file_operations fuse_file_operations = { .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, .fallocate = fuse_file_fallocate, + .copy_file_range = fuse_copy_file_range, }; static const struct file_operations fuse_direct_io_file_operations = { @@ -3062,6 +3150,14 @@ static const struct address_space_operations fuse_file_aops = { void fuse_init_file_inode(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + fi->writectr = 0; + init_waitqueue_head(&fi->page_waitq); + INIT_LIST_HEAD(&fi->writepages); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f78e9614bb5f..e9f712e81c7d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -28,8 +28,11 @@ #include <linux/refcount.h> #include <linux/user_namespace.h> -/** Max number of pages that can be used in a single read request */ -#define FUSE_MAX_PAGES_PER_REQ 32 +/** Default max number of pages that can be used in a single read request */ +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + +/** Maximum of max_pages received in init_out */ +#define FUSE_MAX_MAX_PAGES 256 /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN @@ -77,6 +80,9 @@ struct fuse_inode { /** Time in jiffies until the file attributes are valid */ u64 i_time; + /* Which attributes are invalid */ + u32 inval_mask; + /** The sticky bit in inode->i_mode may have been removed, so preserve the original mode */ umode_t orig_i_mode; @@ -87,21 +93,51 @@ struct fuse_inode { /** Version of last attribute change */ u64 attr_version; - /** Files usable in writepage. Protected by fc->lock */ - struct list_head write_files; + union { + /* Write related fields (regular file only) */ + struct { + /* Files usable in writepage. 
Protected by fc->lock */ + struct list_head write_files; + + /* Writepages pending on truncate or fsync */ + struct list_head queued_writes; - /** Writepages pending on truncate or fsync */ - struct list_head queued_writes; + /* Number of sent writes, a negative bias + * (FUSE_NOWRITE) means more writes are blocked */ + int writectr; + + /* Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /* List of writepage requests (pending or sent) */ + struct list_head writepages; + + /* readdir cache (directory only) */ + struct { + /* true if fully cached */ + bool cached; - /** Number of sent writes, a negative bias (FUSE_NOWRITE) - * means more writes are blocked */ - int writectr; + /* size of cache */ + loff_t size; - /** Waitq for writepage completion */ - wait_queue_head_t page_waitq; + /* position at end of cache (position of next entry) */ + loff_t pos; - /** List of writepage requestst (pending or sent) */ - struct list_head writepages; + /* version of the cache */ + u64 version; + + /* modification time of directory when cache was + * started */ + struct timespec64 mtime; + + /* iversion of directory when cache was started */ + u64 iversion; + + /* protects above fields */ + spinlock_t lock; + } rdc; + }; /** Miscellaneous bits describing inode state */ unsigned long state; @@ -148,6 +184,25 @@ struct fuse_file { /** Entry on inode's write_files list */ struct list_head write_entry; + /* Readdir related */ + struct { + /* + * Protects below fields against (crazy) parallel readdir on + * same open file. Uncontended in the normal case. + */ + struct mutex lock; + + /* Dir stream position */ + loff_t pos; + + /* Offset in cache */ + loff_t cache_off; + + /* Version of cache we are reading */ + u64 version; + + } readdir; + /** RB node to be linked on fuse_conn->polled_files */ struct rb_node polled_node; @@ -311,9 +366,6 @@ struct fuse_req { /** refcount */ refcount_t count; - /** Unique ID for the interrupt request */ - u64 intr_unique; - /* Request flags, updated with test/set/clear_bit() */ unsigned long flags; @@ -411,6 +463,9 @@ struct fuse_iqueue { struct fasync_struct *fasync; }; +#define FUSE_PQ_HASH_BITS 8 +#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS) + struct fuse_pqueue { /** Connection established */ unsigned connected; @@ -418,8 +473,8 @@ struct fuse_pqueue { /** Lock protecting accesses to members of this structure */ spinlock_t lock; - /** The list of requests being processed */ - struct list_head processing; + /** Hash table of requests being processed */ + struct list_head *processing; /** The list of requests under I/O */ struct list_head io; @@ -476,6 +531,9 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; + /** Maximum number of pages that can be used in a single request */ + unsigned int max_pages; + /** Input queue */ struct fuse_iqueue iq; @@ -500,6 +558,10 @@ struct fuse_conn { /** The list of background requests set aside for later queuing */ struct list_head bg_queue; + /** Protects: max_background, congestion_threshold, num_background, + * active_background, bg_queue, blocked */ + spinlock_t bg_lock; + /** Flag indicating that INIT reply has been received.
Allocating * any fuse request will be suspended until the flag is set */ int initialized; @@ -551,6 +613,9 @@ struct fuse_conn { /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ unsigned handle_killpriv:1; + /** cache READLINK responses in page cache */ + unsigned cache_symlinks:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -637,6 +702,9 @@ struct fuse_conn { /** Allow other than the mounter user to access the filesystem? */ unsigned allow_other:1; + /** Does the filesystem support copy_file_range? */ + unsigned no_copy_file_range:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -697,6 +765,11 @@ static inline u64 get_node_id(struct inode *inode) return get_fuse_inode(inode)->nodeid; } +static inline int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -812,6 +885,10 @@ struct fuse_req *fuse_request_alloc(unsigned npages); struct fuse_req *fuse_request_alloc_nofs(unsigned npages); +bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, + gfp_t flags); + + /** * Free a request */ @@ -856,9 +933,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); * Send a request in the background */ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); - -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req); +bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); @@ -873,6 +948,9 @@ void fuse_invalidate_entry_cache(struct dentry *entry); void fuse_invalidate_atime(struct inode *inode); +u64 entry_attr_timeout(struct fuse_entry_out *o); +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); + /** * Acquire reference to fuse_conn */ @@ -992,4 +1070,8 @@ struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type); int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + +/* readdir.c */ +int fuse_readdir(struct file *file, struct dir_context *ctx); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index db9e60b7eb69..0b94b23b02d4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -90,16 +90,12 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi = get_fuse_inode(inode); fi->i_time = 0; + fi->inval_mask = 0; fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; - fi->writectr = 0; fi->orig_ino = 0; fi->state = 0; - INIT_LIST_HEAD(&fi->write_files); - INIT_LIST_HEAD(&fi->queued_writes); - INIT_LIST_HEAD(&fi->writepages); - init_waitqueue_head(&fi->page_waitq); mutex_init(&fi->mutex); fi->forget = fuse_alloc_forget(); if (!fi->forget) { @@ -119,8 +115,10 @@ static void fuse_i_callback(struct rcu_head *head) static void fuse_destroy_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!list_empty(&fi->write_files)); - BUG_ON(!list_empty(&fi->queued_writes)); + if (S_ISREG(inode->i_mode)) { + WARN_ON(!list_empty(&fi->write_files)); + WARN_ON(!list_empty(&fi->queued_writes)); + } mutex_destroy(&fi->mutex); kfree(fi->forget); call_rcu(&inode->i_rcu, fuse_i_callback); @@ -167,6 +165,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->attr_version = ++fc->attr_version; fi->i_time =
attr_valid; + WRITE_ONCE(fi->inval_mask, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -594,9 +593,11 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq) static void fuse_pqueue_init(struct fuse_pqueue *fpq) { - memset(fpq, 0, sizeof(struct fuse_pqueue)); + unsigned int i; + spin_lock_init(&fpq->lock); - INIT_LIST_HEAD(&fpq->processing); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + INIT_LIST_HEAD(&fpq->processing[i]); INIT_LIST_HEAD(&fpq->io); fpq->connected = 1; } @@ -605,6 +606,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); + spin_lock_init(&fc->bg_lock); init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); @@ -852,6 +854,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); + spin_lock(&fc->bg_lock); if (arg->max_background) { fc->max_background = arg->max_background; @@ -865,6 +868,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) fc->congestion_threshold > max_user_congthresh) fc->congestion_threshold = max_user_congthresh; } + spin_unlock(&fc->bg_lock); } static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) @@ -924,8 +928,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->posix_acl = 1; fc->sb->s_xattr = fuse_acl_xattr_handlers; } + if (arg->flags & FUSE_CACHE_SYMLINKS) + fc->cache_symlinks = 1; if (arg->flags & FUSE_ABORT_ERROR) fc->abort_err = 1; + if (arg->flags & FUSE_MAX_PAGES) { + fc->max_pages = + min_t(unsigned int, FUSE_MAX_MAX_PAGES, + max_t(unsigned int, arg->max_pages, 1)); + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -957,7 +968,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1022,17 +1033,26 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) { struct fuse_dev *fud; + struct list_head *pq; fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); - if (fud) { - fud->fc = fuse_conn_get(fc); - fuse_pqueue_init(&fud->pq); + if (!fud) + return NULL; - spin_lock(&fc->lock); - list_add_tail(&fud->entry, &fc->devices); - spin_unlock(&fc->lock); + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(fud); + return NULL; } + fud->pq.processing = pq; + fud->fc = fuse_conn_get(fc); + fuse_pqueue_init(&fud->pq); + + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); + return fud; } EXPORT_SYMBOL_GPL(fuse_dev_alloc); @@ -1141,6 +1161,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); + fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; /* Used by get_root_inode() */ sb->s_fs_info = fc; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c new file mode 100644 index 000000000000..ab18b78f4755 --- /dev/null +++ b/fs/fuse/readdir.c 
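/*
 * Editor's aside -- an illustrative sketch, not part of the patch: with
 * fpq->processing turned into an array of FUSE_PQ_HASH_SIZE list heads
 * (allocated in fuse_dev_alloc() above), a reply is matched to its
 * pending request by hashing the request's unique id into a bucket
 * instead of scanning one long list.  Assuming id-based keying; the
 * helper name below is hypothetical:
 *
 *	static unsigned int fuse_req_hash_sketch(u64 unique)
 *	{
 *		return hash_long(unique, FUSE_PQ_HASH_BITS);
 *	}
 *
 *	hash = fuse_req_hash_sketch(oh.unique);
 *	list_for_each_entry(req, &fpq->processing[hash], list)
 *		if (req->in.h.unique == oh.unique)
 *			return req;
 */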
@@ -0,0 +1,569 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2018 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + + +#include "fuse_i.h" +#include <linux/iversion.h> +#include <linux/posix_acl.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> + +static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (ctx->pos == 0) + return true; + return false; +} + +static void fuse_add_dirent_to_cache(struct file *file, + struct fuse_dirent *dirent, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + size_t reclen = FUSE_DIRENT_SIZE(dirent); + pgoff_t index; + struct page *page; + loff_t size; + u64 version; + unsigned int offset; + void *addr; + + spin_lock(&fi->rdc.lock); + /* + * Is the cache already complete? Or does this entry not go at the end + * of the cache? + */ + if (fi->rdc.cached || pos != fi->rdc.pos) { + spin_unlock(&fi->rdc.lock); + return; + } + version = fi->rdc.version; + size = fi->rdc.size; + offset = size & ~PAGE_MASK; + index = size >> PAGE_SHIFT; + /* Dirent doesn't fit in current page? Jump to next page. */ + if (offset + reclen > PAGE_SIZE) { + index++; + offset = 0; + } + spin_unlock(&fi->rdc.lock); + + if (offset) { + page = find_lock_page(file->f_mapping, index); + } else { + page = find_or_create_page(file->f_mapping, index, + mapping_gfp_mask(file->f_mapping)); + } + if (!page) + return; + + spin_lock(&fi->rdc.lock); + /* Raced with another readdir */ + if (fi->rdc.version != version || fi->rdc.size != size || + WARN_ON(fi->rdc.pos != pos)) + goto unlock; + + addr = kmap_atomic(page); + if (!offset) + clear_page(addr); + memcpy(addr + offset, dirent, reclen); + kunmap_atomic(addr); + fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; + fi->rdc.pos = dirent->off; +unlock: + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); +} + +static void fuse_readdir_cache_end(struct file *file, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + loff_t end; + + spin_lock(&fi->rdc.lock); + /* does cache end position match current position?
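 * (Editor's note: the cache is a plain byte stream of fuse_dirent
 * records that fuse_add_dirent_to_cache() above packs into the
 * directory's page cache; a record never crosses a page boundary, so
 * rdc.size is the length of the stream and anything past it can be
 * truncated once the stream is complete.)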
*/ + if (fi->rdc.pos != pos) { + spin_unlock(&fi->rdc.lock); + return; + } + + fi->rdc.cached = true; + end = ALIGN(fi->rdc.size, PAGE_SIZE); + spin_unlock(&fi->rdc.lock); + + /* truncate unused tail of cache */ + truncate_inode_pages(file->f_mapping, end); +} + +static bool fuse_emit(struct file *file, struct dir_context *ctx, + struct fuse_dirent *dirent) +{ + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_add_dirent_to_cache(file, dirent, ctx->pos); + + return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, + dirent->type); +} + +static int parse_dirfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!fuse_emit(file, ctx, dirent)) + break; + + buf += reclen; + nbytes -= reclen; + ctx->pos = dirent->off; + } + + return 0; +} + +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) +{ + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' 
&& name.len == 2) + return 0; + } + + if (invalid_nodeid(o->nodeid)) + return -EIO; + if (!fuse_valid_type(o->attr.mode)) + return -EIO; + + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(parent, name.name, name.len); + dentry = d_lookup(parent, &name); + if (!dentry) { +retry: + dentry = d_alloc_parallel(parent, &name, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (!d_in_lookup(dentry)) { + struct fuse_inode *fi; + inode = d_inode(dentry); + if (!inode || + get_node_id(inode) != o->nodeid || + ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + d_invalidate(dentry); + dput(dentry); + goto retry; + } + if (is_bad_inode(inode)) { + dput(dentry); + return -EIO; + } + + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + forget_all_cached_acls(inode); + fuse_change_attributes(inode, &o->attr, + entry_attr_timeout(o), + attr_version); + /* + * The other branch comes via fuse_iget() + * which bumps nlookup inside + */ + } else { + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), + attr_version); + if (!inode) + inode = ERR_PTR(-ENOMEM); + + alias = d_splice_alias(inode, dentry); + d_lookup_done(dentry); + if (alias) { + dput(dentry); + dentry = alias; + } + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (fc->readdirplus_auto) + set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + fuse_change_entry_timeout(dentry, o); + + dput(dentry); + return 0; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. 
+ */ + over = !fuse_emit(file, ctx, dirent); + if (!over) + ctx->pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) +{ + int plus, err; + size_t nbytes; + struct page *page; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + u64 attr_version = 0; + bool locked; + + req = fuse_get_req(fc, 1); + if (IS_ERR(req)) + return PTR_ERR(req); + + page = alloc_page(GFP_KERNEL); + if (!page) { + fuse_put_request(fc, req); + return -ENOMEM; + } + + plus = fuse_use_readdirplus(inode, ctx); + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + req->page_descs[0].length = PAGE_SIZE; + if (plus) { + attr_version = fuse_get_attr_version(fc); + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); + } + locked = fuse_lock_inode(inode); + fuse_request_send(fc, req); + fuse_unlock_inode(inode, locked); + nbytes = req->out.args[0].size; + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + if (!nbytes) { + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_readdir_cache_end(file, ctx->pos); + } else if (plus) { + err = parse_dirplusfile(page_address(page), nbytes, + file, ctx, attr_version); + } else { + err = parse_dirfile(page_address(page), nbytes, file, + ctx); + } + } + + __free_page(page); + fuse_invalidate_atime(inode); + return err; +} + +enum fuse_parse_result { + FOUND_ERR = -1, + FOUND_NONE = 0, + FOUND_SOME, + FOUND_ALL, +}; + +static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, + void *addr, unsigned int size, + struct dir_context *ctx) +{ + unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK; + enum fuse_parse_result res = FOUND_NONE; + + WARN_ON(offset >= size); + + for (;;) { + struct fuse_dirent *dirent = addr + offset; + unsigned int nbytes = size - offset; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + + if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) + break; + + if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) + return FOUND_ERR; + if (WARN_ON(reclen > nbytes)) + return FOUND_ERR; + if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL)) + return FOUND_ERR; + + if (ff->readdir.pos == ctx->pos) { + res = FOUND_SOME; + if (!dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type)) + return FOUND_ALL; + ctx->pos = dirent->off; + } + ff->readdir.pos = dirent->off; + ff->readdir.cache_off += reclen; + + offset += reclen; + } + + return res; +} + +static void fuse_rdc_reset(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->rdc.cached = false; + fi->rdc.version++; + fi->rdc.size = 0; + fi->rdc.pos = 0; +} + +#define UNCACHED 1 + +static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + enum fuse_parse_result res; + pgoff_t index; + unsigned int size; + struct page *page; + void *addr; + + /* Seeked? 
If so, reset the cache stream */ + if (ff->readdir.pos != ctx->pos) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + + /* + * We're just about to start reading into the cache or reading the + * cache; both cases require an up-to-date mtime value. + */ + if (!ctx->pos && fc->auto_inval_data) { + int err = fuse_update_attributes(inode, file); + + if (err) + return err; + } + +retry: + spin_lock(&fi->rdc.lock); +retry_locked: + if (!fi->rdc.cached) { + /* Starting cache? Set cache mtime. */ + if (!ctx->pos && !fi->rdc.size) { + fi->rdc.mtime = inode->i_mtime; + fi->rdc.iversion = inode_query_iversion(inode); + } + spin_unlock(&fi->rdc.lock); + return UNCACHED; + } + /* + * When at the beginning of the directory (i.e. just after opendir(3) or + * rewinddir(3)), we need to check whether directory contents have + * changed, and reset the cache if so. + */ + if (!ctx->pos) { + if (inode_peek_iversion(inode) != fi->rdc.iversion || + !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { + fuse_rdc_reset(inode); + goto retry_locked; + } + } + + /* + * If cache version changed since the last getdents() call, then reset + * the cache stream. + */ + if (ff->readdir.version != fi->rdc.version) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + /* + * If at the beginning of the cache, then reset the version to + * current. + */ + if (ff->readdir.pos == 0) + ff->readdir.version = fi->rdc.version; + + WARN_ON(fi->rdc.size < ff->readdir.cache_off); + + index = ff->readdir.cache_off >> PAGE_SHIFT; + + if (index == (fi->rdc.size >> PAGE_SHIFT)) + size = fi->rdc.size & ~PAGE_MASK; + else + size = PAGE_SIZE; + spin_unlock(&fi->rdc.lock); + + /* EOF? */ + if ((ff->readdir.cache_off & ~PAGE_MASK) == size) + return 0; + + page = find_get_page_flags(file->f_mapping, index, + FGP_ACCESSED | FGP_LOCK); + spin_lock(&fi->rdc.lock); + if (!page) { + /* + * Uh-oh: page gone missing, cache is useless + */ + if (fi->rdc.version == ff->readdir.version) + fuse_rdc_reset(inode); + goto retry_locked; + } + + /* Make sure it's still the same version after getting the page. */ + if (ff->readdir.version != fi->rdc.version) { + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); + goto retry; + } + spin_unlock(&fi->rdc.lock); + + /* + * Contents of the page are now protected against changing by holding + * the page lock. + */ + addr = kmap(page); + res = fuse_parse_cache(ff, addr, size, ctx); + kunmap(page); + unlock_page(page); + put_page(page); + + if (res == FOUND_ERR) + return -EIO; + + if (res == FOUND_ALL) + return 0; + + if (size == PAGE_SIZE) { + /* We hit end of page: skip to next page. */ + ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE); + goto retry; + } + + /* + * End of cache reached. If we found the position, then we are done; + * otherwise we need to fall back to uncached, since the position we + * were looking for wasn't in the cache. + */ + return res == FOUND_SOME ?
0 : UNCACHED; +} + +int fuse_readdir(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + int err; + + if (is_bad_inode(inode)) + return -EIO; + + mutex_lock(&ff->readdir.lock); + + err = UNCACHED; + if (ff->open_flags & FOPEN_CACHE_DIR) + err = fuse_readdir_cached(file, ctx); + if (err == UNCACHED) + err = fuse_readdir_uncached(file, ctx); + + mutex_unlock(&ff->readdir.lock); + + return err; +} diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 31e8270d0b26..8afbb35559b9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -366,7 +366,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 9a8772465a90..896396554bcc 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -425,6 +425,10 @@ skip: if (new_node) { __be32 cnid; + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index key and entry */ hfs_bnode_read_key(new_node, fd->search_key, 14); @@ -441,6 +445,7 @@ skip: /* restore search_key */ hfs_bnode_read_key(node, fd->search_key, 14); } + new_node = NULL; } if (!rec && node->parent) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 374b5688e29e..98b96ffb95ed 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -220,25 +220,17 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) return node; } -struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +/* Make sure @tree has enough space for the @rsvd_nodes */ +int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes) { - struct hfs_bnode *node, *next_node; - struct page **pagep; - u32 nidx, idx; - unsigned off; - u16 off16; - u16 len; - u8 *data, byte, m; - int i; - - while (!tree->free_nodes) { - struct inode *inode = tree->inode; - u32 count; - int res; + struct inode *inode = tree->inode; + u32 count; + int res; + while (tree->free_nodes < rsvd_nodes) { res = hfs_extend_file(inode); if (res) - return ERR_PTR(res); + return res; HFS_I(inode)->phys_size = inode->i_size = (loff_t)HFS_I(inode)->alloc_blocks * HFS_SB(tree->sb)->alloc_blksz; @@ -246,9 +238,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) tree->sb->s_blocksize_bits; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; - tree->free_nodes = count - tree->node_count; + tree->free_nodes += count - tree->node_count; tree->node_count = count; } + return 0; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i, res; + + res = hfs_bmap_reserve(tree, 1); + if (res) + return ERR_PTR(res); nidx = 0; node = hfs_bnode_find(tree, nidx); diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index c8b252dbb26c..dcc2aab1b2c4 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -82,6 +82,7 @@ struct hfs_find_data { extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp); extern void hfs_btree_close(struct hfs_btree *); extern void hfs_btree_write(struct hfs_btree *); +extern int hfs_bmap_reserve(struct hfs_btree *, int); extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *); extern void hfs_bmap_free(struct hfs_bnode *node); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c 
index 8a66405b0f8b..d365bf0b8c77 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -97,6 +97,14 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i if (err) return err; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most once. + */ + err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth); + if (err) + goto err2; + hfs_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ? HFS_CDR_THD : HFS_CDR_FTH, @@ -295,6 +303,14 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, return err; dst_fd = src_fd; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most once. + */ + err = hfs_bmap_reserve(src_fd.tree, 2 * src_fd.tree->depth); + if (err) + goto out; + /* find the old dir entry and read the data */ hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); err = hfs_brec_find(&src_fd); diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 5d0182654580..263d5028d9d1 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -117,6 +117,10 @@ static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { if (res != -ENOENT) return res; + /* Fail early and avoid ENOSPC during the btree operation */ + res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1); + if (res) + return res; hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } else { @@ -300,7 +304,7 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) return 0; blocks = 0; - for (i = 0; i < 3; extent++, i++) + for (i = 0; i < 3; i++) blocks += be16_to_cpu(extent[i].count); res = hfs_free_extents(sb, extent, blocks, blocks); @@ -341,7 +345,9 @@ int hfs_get_block(struct inode *inode, sector_t block, ablock = (u32)block / HFS_SB(sb)->fs_div; if (block >= HFS_I(inode)->fs_blocks) { - if (block > HFS_I(inode)->fs_blocks || !create) + if (!create) + return 0; + if (block > HFS_I(inode)->fs_blocks) return -EIO; if (ablock >= HFS_I(inode)->alloc_blocks) { res = hfs_extend_file(inode); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a2dfa1b2a89c..da243c84e93b 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -642,6 +642,8 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) truncate_setsize(inode, attr->ia_size); hfs_file_truncate(inode); + inode->i_atime = inode->i_mtime = inode->i_ctime = + current_time(inode); } setattr_copy(inode, attr); diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 2bab6b3cdba4..e6d554476db4 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -217,6 +217,11 @@ int hfsplus_create_attr(struct inode *inode, if (err) goto failed_init_create_attr; + /* Fail early and avoid ENOSPC during the btree operation */ + err = hfs_bmap_reserve(fd.tree, fd.tree->depth + 1); + if (err) + goto failed_create_attr; + if (name) { err = hfsplus_attr_build_key(sb, fd.search_key, inode->i_ino, name); @@ -313,6 +318,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) if (err) return err; + /* Fail early and avoid ENOSPC during the btree operation */ + err = hfs_bmap_reserve(fd.tree, fd.tree->depth); + if (err) + goto out; + if (name) { err = hfsplus_attr_build_key(sb, fd.search_key, inode->i_ino, name); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 
ed8eacb34452..1918544a7871 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -429,6 +429,10 @@ skip: if (new_node) { __be32 cnid; + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index key and entry */ hfs_bnode_read_key(new_node, fd->search_key, 14); @@ -445,6 +449,7 @@ skip: /* restore search_key */ hfs_bnode_read_key(node, fd->search_key, 14); } + new_node = NULL; } if (!rec && node->parent) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index de14b2b6881b..236efe51eca6 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -342,26 +342,21 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) return node; } -struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +/* Make sure @tree has enough space for the @rsvd_nodes */ +int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes) { - struct hfs_bnode *node, *next_node; - struct page **pagep; - u32 nidx, idx; - unsigned off; - u16 off16; - u16 len; - u8 *data, byte, m; - int i; + struct inode *inode = tree->inode; + struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + u32 count; + int res; - while (!tree->free_nodes) { - struct inode *inode = tree->inode; - struct hfsplus_inode_info *hip = HFSPLUS_I(inode); - u32 count; - int res; + if (rsvd_nodes <= 0) + return 0; + while (tree->free_nodes < rsvd_nodes) { res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree)); if (res) - return ERR_PTR(res); + return res; hip->phys_size = inode->i_size = (loff_t)hip->alloc_blocks << HFSPLUS_SB(tree->sb)->alloc_blksz_shift; @@ -369,9 +364,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; - tree->free_nodes = count - tree->node_count; + tree->free_nodes += count - tree->node_count; tree->node_count = count; } + return 0; +} + +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) +{ + struct hfs_bnode *node, *next_node; + struct page **pagep; + u32 nidx, idx; + unsigned off; + u16 off16; + u16 len; + u8 *data, byte, m; + int i, res; + + res = hfs_bmap_reserve(tree, 1); + if (res) + return ERR_PTR(res); nidx = 0; node = hfs_bnode_find(tree, nidx); diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index a196369ba779..35472cba750e 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -265,6 +265,14 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, if (err) return err; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most once. + */ + err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth); + if (err) + goto err2; + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? @@ -333,6 +341,14 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) if (err) return err; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most once. + */ + err = hfs_bmap_reserve(fd.tree, 2 * (int)fd.tree->depth - 2); + if (err) + goto out; + if (!str) { int len; @@ -433,6 +449,14 @@ int hfsplus_rename_cat(u32 cnid, return err; dst_fd = src_fd; + /* + * Fail early and avoid ENOSPC during the btree operations. We may + * have to split the root node at most twice. 
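 * (Editor's aside, not part of the patch: a rough worst-case count.  A
 * rename is a delete plus an insert of catalog records, and each
 * insertion may split one node per tree level and grow the root, i.e.
 * about 2 * depth new nodes per operation -- which is where the
 * 2 * depth and 4 * depth - 1 reservations in this series come from.
 * The pattern is always reserve-then-operate, e.g.:
 *
 *	err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
 *	if (err)
 *		goto out;	// fail with ENOSPC before touching the tree
 *
 * so that hfs_bmap_alloc() never needs to extend the btree file in the
 * middle of an update.)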
+ */ + err = hfs_bmap_reserve(src_fd.tree, 4 * (int)src_fd.tree->depth - 1); + if (err) + goto out; + /* find the old dir entry and read the data */ err = hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 8e0f59767694..a930ddd15681 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -100,6 +100,10 @@ static int __hfsplus_ext_write_extent(struct inode *inode, if (hip->extent_state & HFSPLUS_EXT_NEW) { if (res != -ENOENT) return res; + /* Fail early and avoid ENOSPC during the btree operation */ + res = hfs_bmap_reserve(fd->tree, fd->tree->depth + 1); + if (res) + return res; hfs_brec_insert(fd, hip->cached_extents, sizeof(hfsplus_extent_rec)); hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); @@ -233,7 +237,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, ablock = iblock >> sbi->fs_shift; if (iblock >= hip->fs_blocks) { - if (iblock > hip->fs_blocks || !create) + if (!create) + return 0; + if (iblock > hip->fs_blocks) return -EIO; if (ablock >= hip->alloc_blocks) { res = hfsplus_file_extend(inode, false); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 8e039435958a..dd7ad9f13e3a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -311,6 +311,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) #define hfs_btree_open hfsplus_btree_open #define hfs_btree_close hfsplus_btree_close #define hfs_btree_write hfsplus_btree_write +#define hfs_bmap_reserve hfsplus_bmap_reserve #define hfs_bmap_alloc hfsplus_bmap_alloc #define hfs_bmap_free hfsplus_bmap_free #define hfs_bnode_read hfsplus_bnode_read @@ -395,6 +396,7 @@ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors, struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id); void hfs_btree_close(struct hfs_btree *tree); int hfs_btree_write(struct hfs_btree *tree); +int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes); struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree); void hfs_bmap_free(struct hfs_bnode *node); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 8e9427a42b81..d7ab9d8c4b67 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -261,6 +261,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) } truncate_setsize(inode, attr->ia_size); hfsplus_file_truncate(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); } setattr_copy(inode, attr); diff --git a/fs/inode.c b/fs/inode.c index 42f6d25f32a5..9e198f00b64c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -10,7 +10,7 @@ #include <linux/swap.h> #include <linux/security.h> #include <linux/cdev.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> @@ -349,7 +349,7 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { - INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); + xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 947ce22f5b3c..f0fe641893a5 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -46,7 +46,7 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod return i; } -/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */ +/* Acorn extensions written by 
Matthew Wilcox <willy@infradead.org> 1998 */ int get_acorn_filename(struct iso_directory_record *de, char *retname, struct inode *inode) { diff --git a/fs/lockd/host.c b/fs/lockd/host.c index d35cd6be0675..93fb7cf0b92b 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -341,7 +341,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, }; struct lockd_net *ln = net_generic(net, lockd_net_id); - dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + dprintk("lockd: %s(host='%.*s', vers=%u, proto=%s)\n", __func__, (int)hostname_len, hostname, rqstp->rq_vers, (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); diff --git a/fs/namespace.c b/fs/namespace.c index d86830c86ce8..98d27da43304 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -23,7 +23,7 @@ #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/task_work.h> #include <linux/sched/task.h> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 06cb0c1d9aee..d3781cd983f6 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -896,7 +896,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (end != inode->i_mapping->nrpages) { rcu_read_lock(); - end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); + end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX); rcu_read_unlock(); } diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 060c658eab66..a7d3df85736d 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -65,6 +65,7 @@ struct nfs_dns_ent { struct sockaddr_storage addr; size_t addrlen; + struct rcu_head rcu_head; }; @@ -101,15 +102,23 @@ static void nfs_dns_ent_init(struct cache_head *cnew, } } -static void nfs_dns_ent_put(struct kref *ref) +static void nfs_dns_ent_free_rcu(struct rcu_head *head) { struct nfs_dns_ent *item; - item = container_of(ref, struct nfs_dns_ent, h.ref); + item = container_of(head, struct nfs_dns_ent, rcu_head); kfree(item->hostname); kfree(item); } +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + call_rcu(&item->rcu_head, nfs_dns_ent_free_rcu); +} + static struct cache_head *nfs_dns_ent_alloc(void) { struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); @@ -195,7 +204,7 @@ static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, { struct cache_head *ch; - ch = sunrpc_cache_lookup(cd, + ch = sunrpc_cache_lookup_rcu(cd, &key->h, nfs_dns_hash(key)); if (!ch) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index b7559c6f2b97..4a98537efb0f 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -19,18 +19,22 @@ * is much larger than a sockaddr_in6. 
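 * (Editor's note: the fields below are being regrouped into a packed
 * c_key so that the entire lookup key can be compared with a single
 * memcmp() and the hot xid/checksum words share a cache line.)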
*/ struct svc_cacherep { - struct list_head c_lru; + struct { + /* Keep often-read xid, csum in the same cache line: */ + __be32 k_xid; + __wsum k_csum; + u32 k_proc; + u32 k_prot; + u32 k_vers; + unsigned int k_len; + struct sockaddr_in6 k_addr; + } c_key; + struct rb_node c_node; + struct list_head c_lru; unsigned char c_state, /* unused, inprog, done */ c_type, /* status, buffer */ c_secure : 1; /* req came from port < 1024 */ - struct sockaddr_in6 c_addr; - __be32 c_xid; - u32 c_prot; - u32 c_proc; - u32 c_vers; - unsigned int c_len; - __wsum c_csum; unsigned long c_timestamp; union { struct kvec u_vec; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index a1143f7c2201..802993d8912f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -46,7 +46,7 @@ static void expkey_put(struct kref *ref) !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); - kfree(key); + kfree_rcu(key, ek_rcu); } static void expkey_request(struct cache_detail *cd, @@ -265,7 +265,7 @@ svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item) struct cache_head *ch; int hash = svc_expkey_hash(item); - ch = sunrpc_cache_lookup(cd, &item->h, hash); + ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else @@ -314,7 +314,7 @@ static void svc_export_put(struct kref *ref) auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); kfree(exp->ex_uuid); - kfree(exp); + kfree_rcu(exp, ex_rcu); } static void svc_export_request(struct cache_detail *cd, @@ -780,7 +780,7 @@ svc_export_lookup(struct svc_export *exp) struct cache_head *ch; int hash = svc_export_hash(exp); - ch = sunrpc_cache_lookup(exp->cd, &exp->h, hash); + ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash); if (ch) return container_of(ch, struct svc_export, h); else @@ -1216,9 +1216,9 @@ static int e_show(struct seq_file *m, void *p) } const struct seq_operations nfs_exports_op = { - .start = cache_seq_start, - .next = cache_seq_next, - .stop = cache_seq_stop, + .start = cache_seq_start_rcu, + .next = cache_seq_next_rcu, + .stop = cache_seq_stop_rcu, .show = e_show, }; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index c8b74126ddaa..e7daa1f246f0 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -61,6 +61,7 @@ struct svc_export { u32 ex_layout_types; struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; + struct rcu_head ex_rcu; }; /* an "export key" (expkey) maps a filehandle fragment to an @@ -75,6 +76,7 @@ struct svc_expkey { u32 ek_fsid[6]; struct path ek_path; + struct rcu_head ek_rcu; }; #define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 426f55005697..32cb8c027483 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -123,6 +123,14 @@ struct nfsd_net { wait_queue_head_t ntf_wq; atomic_t ntf_refcnt; + + /* + * clientid and stateid data for construction of net unique COPY + * stateids.
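 * (Editor's note: a COPY stateid is assembled from the server boot
 * time, the per-net s2s_cp_cl_id, and an id allocated cyclically from
 * the s2s_cp_stateids IDR under s2s_cp_lock; see nfs4_init_cp_state()
 * in nfs4state.c further below.)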
+ */ + u32 s2s_cp_cl_id; + struct idr s2s_cp_stateids; + spinlock_t s2s_cp_lock; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 601bf33c26a0..25987bcdf96f 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -39,6 +39,7 @@ #include "state.h" #include "netns.h" #include "xdr4cb.h" +#include "xdr4.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -105,6 +106,7 @@ enum nfs_cb_opnum4 { OP_CB_WANTS_CANCELLED = 12, OP_CB_NOTIFY_LOCK = 13, OP_CB_NOTIFY_DEVICEID = 14, + OP_CB_OFFLOAD = 15, OP_CB_ILLEGAL = 10044 }; @@ -683,6 +685,101 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, } /* + * struct write_response4 { + * stateid4 wr_callback_id<1>; + * length4 wr_count; + * stable_how4 wr_committed; + * verifier4 wr_writeverf; + * }; + * union offload_info4 switch (nfsstat4 coa_status) { + * case NFS4_OK: + * write_response4 coa_resok4; + * default: + * length4 coa_bytes_copied; + * }; + * struct CB_OFFLOAD4args { + * nfs_fh4 coa_fh; + * stateid4 coa_stateid; + * offload_info4 coa_offload_info; + * }; + */ +static void encode_offload_info4(struct xdr_stream *xdr, + __be32 nfserr, + const struct nfsd4_copy *cp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p++ = nfserr; + if (!nfserr) { + p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); + p = xdr_encode_empty_array(p); + p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written); + *p++ = cpu_to_be32(cp->cp_res.wr_stable_how); + p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data, + NFS4_VERIFIER_SIZE); + } else { + p = xdr_reserve_space(xdr, 8); + /* We always return success if bytes were written */ + p = xdr_encode_hyper(p, 0); + } +} + +static void encode_cb_offload4args(struct xdr_stream *xdr, + __be32 nfserr, + const struct knfsd_fh *fh, + const struct nfsd4_copy *cp, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + *p++ = cpu_to_be32(OP_CB_OFFLOAD); + encode_nfs_fh4(xdr, fh); + encode_stateid4(xdr, &cp->cp_res.cb_stateid); + encode_offload_info4(xdr, nfserr, cp); + + hdr->nops++; +} + +static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfsd4_callback *cb = data; + const struct nfsd4_copy *cp = + container_of(cb, struct nfsd4_copy, cp_cb); + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr); + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + if (cb) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + } + return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status); +} +/* * RPC procedure tables */ #define PROC(proc, call, argtype, restype) \ @@ -703,6 +800,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), #endif PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), + PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; diff --git a/fs/nfsd/nfs4idmap.c 
b/fs/nfsd/nfs4idmap.c index a5bb76593ce7..bf137fec33ff 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -65,6 +65,7 @@ struct ent { u32 id; char name[IDMAP_NAMESZ]; char authname[IDMAP_NAMESZ]; + struct rcu_head rcu_head; }; /* Common entry handling */ @@ -89,7 +90,7 @@ static void ent_put(struct kref *ref) { struct ent *map = container_of(ref, struct ent, h.ref); - kfree(map); + kfree_rcu(map, rcu_head); } static struct cache_head * @@ -264,8 +265,8 @@ out: static struct ent * idtoname_lookup(struct cache_detail *cd, struct ent *item) { - struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h, - idtoname_hash(item)); + struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, + idtoname_hash(item)); if (ch) return container_of(ch, struct ent, h); else @@ -422,8 +423,8 @@ out: static struct ent * nametoid_lookup(struct cache_detail *cd, struct ent *item) { - struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h, - nametoid_hash(item)); + struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, + nametoid_hash(item)); if (ch) return container_of(ch, struct ent, h); else diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b7bc6e1a85ac..edff074d38c7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -36,6 +36,7 @@ #include <linux/file.h> #include <linux/falloc.h> #include <linux/slab.h> +#include <linux/kthread.h> #include "idmap.h" #include "cache.h" @@ -1089,36 +1090,254 @@ out: return status; } +void nfs4_put_copy(struct nfsd4_copy *copy) +{ + if (!refcount_dec_and_test(©->refcount)) + return; + kfree(copy); +} + +static bool +check_and_set_stop_copy(struct nfsd4_copy *copy) +{ + bool value; + + spin_lock(©->cp_clp->async_lock); + value = copy->stopped; + if (!copy->stopped) + copy->stopped = true; + spin_unlock(©->cp_clp->async_lock); + return value; +} + +static void nfsd4_stop_copy(struct nfsd4_copy *copy) +{ + /* only 1 thread should stop the copy */ + if (!check_and_set_stop_copy(copy)) + kthread_stop(copy->copy_task); + nfs4_put_copy(copy); +} + +static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp) +{ + struct nfsd4_copy *copy = NULL; + + spin_lock(&clp->async_lock); + if (!list_empty(&clp->async_copies)) { + copy = list_first_entry(&clp->async_copies, struct nfsd4_copy, + copies); + refcount_inc(©->refcount); + } + spin_unlock(&clp->async_lock); + return copy; +} + +void nfsd4_shutdown_copy(struct nfs4_client *clp) +{ + struct nfsd4_copy *copy; + + while ((copy = nfsd4_get_copy(clp)) != NULL) + nfsd4_stop_copy(copy); +} + +static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) +{ + struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb); + + nfs4_put_copy(copy); +} + +static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + return 1; +} + +static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = { + .release = nfsd4_cb_offload_release, + .done = nfsd4_cb_offload_done +}; + +static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) +{ + copy->cp_res.wr_stable_how = NFS_UNSTABLE; + copy->cp_synchronous = sync; + gen_boot_verifier(©->cp_res.wr_verifier, copy->cp_clp->net); +} + +static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) +{ + ssize_t bytes_copied = 0; + size_t bytes_total = copy->cp_count; + u64 src_pos = copy->cp_src_pos; + u64 dst_pos = copy->cp_dst_pos; + + do { + if (kthread_should_stop()) + break; + bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos, + copy->file_dst, dst_pos, bytes_total); + if (bytes_copied <= 0) + break; + 
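		/* (Editor's note: nfsd_copy_file_range() may copy fewer
		 * bytes than requested; the loop advances both offsets by
		 * the amount actually copied and retries, while a
		 * synchronous COPY makes a single pass.) */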
bytes_total -= bytes_copied; + copy->cp_res.wr_bytes_written += bytes_copied; + src_pos += bytes_copied; + dst_pos += bytes_copied; + } while (bytes_total > 0 && !copy->cp_synchronous); + return bytes_copied; +} + +static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) +{ + __be32 status; + ssize_t bytes; + + bytes = _nfsd_copy_file_range(copy); + /* for async copy, we ignore the error, client can always retry + * to get the error + */ + if (bytes < 0 && !copy->cp_res.wr_bytes_written) + status = nfserrno(bytes); + else { + nfsd4_init_copy_res(copy, sync); + status = nfs_ok; + } + + fput(copy->file_src); + fput(copy->file_dst); + return status; +} + +static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) +{ + dst->cp_src_pos = src->cp_src_pos; + dst->cp_dst_pos = src->cp_dst_pos; + dst->cp_count = src->cp_count; + dst->cp_synchronous = src->cp_synchronous; + memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res)); + memcpy(&dst->fh, &src->fh, sizeof(src->fh)); + dst->cp_clp = src->cp_clp; + dst->file_dst = get_file(src->file_dst); + dst->file_src = get_file(src->file_src); + memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid)); +} + +static void cleanup_async_copy(struct nfsd4_copy *copy) +{ + nfs4_free_cp_state(copy); + fput(copy->file_dst); + fput(copy->file_src); + spin_lock(©->cp_clp->async_lock); + list_del(©->copies); + spin_unlock(©->cp_clp->async_lock); + nfs4_put_copy(copy); +} + +static int nfsd4_do_async_copy(void *data) +{ + struct nfsd4_copy *copy = (struct nfsd4_copy *)data; + struct nfsd4_copy *cb_copy; + + copy->nfserr = nfsd4_do_copy(copy, 0); + cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); + if (!cb_copy) + goto out; + memcpy(&cb_copy->cp_res, ©->cp_res, sizeof(copy->cp_res)); + cb_copy->cp_clp = copy->cp_clp; + cb_copy->nfserr = copy->nfserr; + memcpy(&cb_copy->fh, ©->fh, sizeof(copy->fh)); + nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp, + &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); + nfsd4_run_cb(&cb_copy->cp_cb); +out: + cleanup_async_copy(copy); + return 0; +} + static __be32 nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_copy *copy = &u->copy; - struct file *src, *dst; __be32 status; - ssize_t bytes; + struct nfsd4_copy *async_copy = NULL; - status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, &src, - ©->cp_dst_stateid, &dst); + status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, + ©->file_src, ©->cp_dst_stateid, + ©->file_dst); if (status) goto out; - bytes = nfsd_copy_file_range(src, copy->cp_src_pos, - dst, copy->cp_dst_pos, copy->cp_count); + copy->cp_clp = cstate->clp; + memcpy(©->fh, &cstate->current_fh.fh_handle, + sizeof(struct knfsd_fh)); + if (!copy->cp_synchronous) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - if (bytes < 0) - status = nfserrno(bytes); - else { - copy->cp_res.wr_bytes_written = bytes; - copy->cp_res.wr_stable_how = NFS_UNSTABLE; - copy->cp_synchronous = 1; - gen_boot_verifier(©->cp_res.wr_verifier, SVC_NET(rqstp)); + status = nfserrno(-ENOMEM); + async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); + if (!async_copy) + goto out; + if (!nfs4_init_cp_state(nn, copy)) { + kfree(async_copy); + goto out; + } + refcount_set(&async_copy->refcount, 1); + memcpy(©->cp_res.cb_stateid, ©->cp_stateid, + sizeof(copy->cp_stateid)); + dup_copy_fields(copy, async_copy); + async_copy->copy_task = kthread_create(nfsd4_do_async_copy, + async_copy, "%s", "copy thread"); + if 
(IS_ERR(async_copy->copy_task)) + goto out_err; + spin_lock(&async_copy->cp_clp->async_lock); + list_add(&async_copy->copies, + &async_copy->cp_clp->async_copies); + spin_unlock(&async_copy->cp_clp->async_lock); + wake_up_process(async_copy->copy_task); status = nfs_ok; + } else + status = nfsd4_do_copy(copy, 1); +out: + return status; +out_err: + cleanup_async_copy(async_copy); + goto out; +} + +struct nfsd4_copy * +find_async_copy(struct nfs4_client *clp, stateid_t *stateid) +{ + struct nfsd4_copy *copy; + + spin_lock(&clp->async_lock); + list_for_each_entry(copy, &clp->async_copies, copies) { + if (memcmp(©->cp_stateid, stateid, NFS4_STATEID_SIZE)) + continue; + refcount_inc(©->refcount); + spin_unlock(&clp->async_lock); + return copy; } + spin_unlock(&clp->async_lock); + return NULL; +} + +static __be32 +nfsd4_offload_cancel(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_offload_status *os = &u->offload_status; + __be32 status = 0; + struct nfsd4_copy *copy; + struct nfs4_client *clp = cstate->clp; + + copy = find_async_copy(clp, &os->stateid); + if (copy) + nfsd4_stop_copy(copy); + else + status = nfserr_bad_stateid; - fput(src); - fput(dst); -out: return status; } @@ -1144,6 +1363,25 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fput(file); return status; } +static __be32 +nfsd4_offload_status(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_offload_status *os = &u->offload_status; + __be32 status = 0; + struct nfsd4_copy *copy; + struct nfs4_client *clp = cstate->clp; + + copy = find_async_copy(clp, &os->stateid); + if (copy) { + os->count = copy->cp_res.wr_bytes_written; + nfs4_put_copy(copy); + } else + status = nfserr_bad_stateid; + + return status; +} static __be32 nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -2047,6 +2285,14 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1 /* cr_synchronous */) * sizeof(__be32); } +static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 2 /* osr_count */ + + 1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32); +} + #ifdef CONFIG_NFSD_PNFS static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { @@ -2460,6 +2706,17 @@ static const struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_SEEK", .op_rsize_bop = nfsd4_seek_rsize, }, + [OP_OFFLOAD_STATUS] = { + .op_func = nfsd4_offload_status, + .op_name = "OP_OFFLOAD_STATUS", + .op_rsize_bop = nfsd4_offload_status_rsize, + }, + [OP_OFFLOAD_CANCEL] = { + .op_func = nfsd4_offload_cancel, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_OFFLOAD_CANCEL", + .op_rsize_bop = nfsd4_only_status_rsize, + }, }; /** diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b0ca0efd2875..f093fbe47133 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -713,6 +713,36 @@ out_free: return NULL; } +/* + * Create a unique stateid_t to represent each COPY. 
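 * (Editor's note: OFFLOAD_STATUS and OFFLOAD_CANCEL above map such a
 * stateid back to the in-flight copy via find_async_copy(), a memcmp()
 * scan of the client's async_copies list under async_lock.)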
+ */ +int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy) +{ + int new_id; + + idr_preload(GFP_KERNEL); + spin_lock(&nn->s2s_cp_lock); + new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, copy, 0, 0, GFP_NOWAIT); + spin_unlock(&nn->s2s_cp_lock); + idr_preload_end(); + if (new_id < 0) + return 0; + copy->cp_stateid.si_opaque.so_id = new_id; + copy->cp_stateid.si_opaque.so_clid.cl_boot = nn->boot_time; + copy->cp_stateid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; + return 1; +} + +void nfs4_free_cp_state(struct nfsd4_copy *copy) +{ + struct nfsd_net *nn; + + nn = net_generic(copy->cp_clp->net, nfsd_net_id); + spin_lock(&nn->s2s_cp_lock); + idr_remove(&nn->s2s_cp_stateids, copy->cp_stateid.si_opaque.so_id); + spin_unlock(&nn->s2s_cp_lock); +} + static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) { struct nfs4_stid *stid; @@ -1827,6 +1857,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&clp->cl_lo_states); #endif + INIT_LIST_HEAD(&clp->async_copies); + spin_lock_init(&clp->async_lock); spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; @@ -1942,6 +1974,7 @@ __destroy_client(struct nfs4_client *clp) } } nfsd4_return_all_client_layouts(clp); + nfsd4_shutdown_copy(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); @@ -2475,7 +2508,8 @@ static bool client_has_state(struct nfs4_client *clp) || !list_empty(&clp->cl_lo_states) #endif || !list_empty(&clp->cl_delegations) - || !list_empty(&clp->cl_sessions); + || !list_empty(&clp->cl_sessions) + || !list_empty(&clp->async_copies); } __be32 @@ -4364,7 +4398,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); if (!fl) - goto out_stid; + goto out_clnt_odstate; status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); if (fl) @@ -4389,7 +4423,6 @@ out_unlock: vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); -out_stid: nfs4_put_stid(&dp->dl_stid); out_delegees: put_deleg_file(fp); @@ -7161,6 +7194,8 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); spin_lock_init(&nn->client_lock); + spin_lock_init(&nn->s2s_cp_lock); + idr_init(&nn->s2s_cp_stateids); spin_lock_init(&nn->blocked_locks_lock); INIT_LIST_HEAD(&nn->blocked_locks_lru); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 418fa9c78186..3de42a729093 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1768,6 +1768,13 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) } static __be32 +nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, + struct nfsd4_offload_status *os) +{ + return nfsd4_decode_stateid(argp, &os->stateid); +} + +static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { DECODE_HEAD; @@ -1873,8 +1880,8 @@ static const nfsd4_dec nfsd4_dec_ops[] = { [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status, + [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status, [OP_READ_PLUS] = 
(nfsd4_dec)nfsd4_decode_notsupp, [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -4224,15 +4231,27 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, #endif /* CONFIG_NFSD_PNFS */ static __be32 -nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write) +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, + struct nfsd42_write_res *write, bool sync) { __be32 *p; + p = xdr_reserve_space(&resp->xdr, 4); + if (!p) + return nfserr_resource; - p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); + if (sync) + *p++ = cpu_to_be32(0); + else { + __be32 nfserr; + *p++ = cpu_to_be32(1); + nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid); + if (nfserr) + return nfserr; + } + p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); if (!p) return nfserr_resource; - *p++ = cpu_to_be32(0); p = xdr_encode_hyper(p, write->wr_bytes_written); *p++ = cpu_to_be32(write->wr_stable_how); p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, @@ -4246,7 +4265,8 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, { __be32 *p; - nfserr = nfsd42_encode_write_res(resp, ©->cp_res); + nfserr = nfsd42_encode_write_res(resp, ©->cp_res, + copy->cp_synchronous); if (nfserr) return nfserr; @@ -4257,6 +4277,22 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 +nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_offload_status *os) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, os->count); + *p++ = cpu_to_be32(0); + + return nfserr; +} + +static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) { @@ -4359,7 +4395,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = { [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status, [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index dbdeb9d6af03..e2fe0e9ce0df 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -30,6 +30,7 @@ #define TARGET_BUCKET_SIZE 64 struct nfsd_drc_bucket { + struct rb_root rb_head; struct list_head lru_head; spinlock_t cache_lock; }; @@ -121,7 +122,7 @@ nfsd_cache_hash(__be32 xid) } static struct svc_cacherep * -nfsd_reply_cache_alloc(void) +nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum) { struct svc_cacherep *rp; @@ -129,21 +130,35 @@ nfsd_reply_cache_alloc(void) if (rp) { rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; + RB_CLEAR_NODE(&rp->c_node); INIT_LIST_HEAD(&rp->c_lru); + + memset(&rp->c_key, 0, sizeof(rp->c_key)); + rp->c_key.k_xid = rqstp->rq_xid; + rp->c_key.k_proc = rqstp->rq_proc; + rpc_copy_addr((struct sockaddr *)&rp->c_key.k_addr, svc_addr(rqstp)); + rpc_set_port((struct sockaddr *)&rp->c_key.k_addr, rpc_get_port(svc_addr(rqstp))); + rp->c_key.k_prot = rqstp->rq_prot; + rp->c_key.k_vers = rqstp->rq_vers; + rp->c_key.k_len = rqstp->rq_arg.len; + rp->c_key.k_csum = csum; } return rp; } static void -nfsd_reply_cache_free_locked(struct svc_cacherep *rp) +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, 
struct svc_cacherep *rp) { if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); } - list_del(&rp->c_lru); - atomic_dec(&num_drc_entries); - drc_mem_usage -= sizeof(*rp); + if (rp->c_state != RC_UNUSED) { + rb_erase(&rp->c_node, &b->rb_head); + list_del(&rp->c_lru); + atomic_dec(&num_drc_entries); + drc_mem_usage -= sizeof(*rp); + } kmem_cache_free(drc_slab, rp); } @@ -151,7 +166,7 @@ static void nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { spin_lock(&b->cache_lock); - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(b, rp); spin_unlock(&b->cache_lock); } @@ -207,7 +222,7 @@ void nfsd_reply_cache_shutdown(void) struct list_head *head = &drc_hashtbl[i].lru_head; while (!list_empty(head)) { rp = list_first_entry(head, struct svc_cacherep, c_lru); - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(&drc_hashtbl[i], rp); } } @@ -246,7 +261,7 @@ prune_bucket(struct nfsd_drc_bucket *b) if (atomic_read(&num_drc_entries) <= max_drc_entries && time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) break; - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(b, rp); freed++; } return freed; @@ -318,51 +333,48 @@ nfsd_cache_csum(struct svc_rqst *rqstp) return csum; } -static bool -nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) +static int +nfsd_cache_key_cmp(const struct svc_cacherep *key, const struct svc_cacherep *rp) { - /* Check RPC XID first */ - if (rqstp->rq_xid != rp->c_xid) - return false; - /* compare checksum of NFS data */ - if (csum != rp->c_csum) { + if (key->c_key.k_xid == rp->c_key.k_xid && + key->c_key.k_csum != rp->c_key.k_csum) ++payload_misses; - return false; - } - /* Other discriminators */ - if (rqstp->rq_proc != rp->c_proc || - rqstp->rq_prot != rp->c_prot || - rqstp->rq_vers != rp->c_vers || - rqstp->rq_arg.len != rp->c_len || - !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || - rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) - return false; - - return true; + return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key)); } /* * Search the request hash for an entry that matches the given rqstp. * Must be called with cache_lock held. Returns the found entry or - * NULL on failure. + * inserts an empty key on failure. 
*/ static struct svc_cacherep * -nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp, - __wsum csum) +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key) { - struct svc_cacherep *rp, *ret = NULL; - struct list_head *rh = &b->lru_head; + struct svc_cacherep *rp, *ret = key; + struct rb_node **p = &b->rb_head.rb_node, + *parent = NULL; unsigned int entries = 0; + int cmp; - list_for_each_entry(rp, rh, c_lru) { + while (*p != NULL) { ++entries; - if (nfsd_cache_match(rqstp, csum, rp)) { + parent = *p; + rp = rb_entry(parent, struct svc_cacherep, c_node); + + cmp = nfsd_cache_key_cmp(key, rp); + if (cmp < 0) + p = &parent->rb_left; + else if (cmp > 0) + p = &parent->rb_right; + else { ret = rp; - break; + goto out; } } - + rb_link_node(&key->c_node, parent, p); + rb_insert_color(&key->c_node, &b->rb_head); +out: /* tally hash chain length stats */ if (entries > longest_chain) { longest_chain = entries; @@ -374,6 +386,7 @@ nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp, atomic_read(&num_drc_entries)); } + lru_put_end(b, ret); return ret; } @@ -389,9 +402,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) { struct svc_cacherep *rp, *found; __be32 xid = rqstp->rq_xid; - u32 proto = rqstp->rq_prot, - vers = rqstp->rq_vers, - proc = rqstp->rq_proc; __wsum csum; u32 hash = nfsd_cache_hash(xid); struct nfsd_drc_bucket *b = &drc_hashtbl[hash]; @@ -410,60 +420,38 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ - rp = nfsd_reply_cache_alloc(); - spin_lock(&b->cache_lock); - if (likely(rp)) { - atomic_inc(&num_drc_entries); - drc_mem_usage += sizeof(*rp); + rp = nfsd_reply_cache_alloc(rqstp, csum); + if (!rp) { + dprintk("nfsd: unable to allocate DRC entry!\n"); + return rtn; } - /* go ahead and prune the cache */ - prune_bucket(b); - - found = nfsd_cache_search(b, rqstp, csum); - if (found) { - if (likely(rp)) - nfsd_reply_cache_free_locked(rp); + spin_lock(&b->cache_lock); + found = nfsd_cache_insert(b, rp); + if (found != rp) { + nfsd_reply_cache_free_locked(NULL, rp); rp = found; goto found_entry; } - if (!rp) { - dprintk("nfsd: unable to allocate DRC entry!\n"); - goto out; - } - nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; - rp->c_xid = xid; - rp->c_proc = proc; - rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp)); - rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp))); - rp->c_prot = proto; - rp->c_vers = vers; - rp->c_len = rqstp->rq_arg.len; - rp->c_csum = csum; - lru_put_end(b, rp); + atomic_inc(&num_drc_entries); + drc_mem_usage += sizeof(*rp); - /* release any buffer */ - if (rp->c_type == RC_REPLBUFF) { - drc_mem_usage -= rp->c_replvec.iov_len; - kfree(rp->c_replvec.iov_base); - rp->c_replvec.iov_base = NULL; - } - rp->c_type = RC_NOCACHE; + /* go ahead and prune the cache */ + prune_bucket(b); out: spin_unlock(&b->cache_lock); return rtn; found_entry: - nfsdstats.rchits++; /* We found a matching entry which is either in progress or done. 
*/ - lru_put_end(b, rp); - + nfsdstats.rchits++; rtn = RC_DROPIT; + /* Request being processed */ if (rp->c_state == RC_INPROG) goto out; @@ -489,7 +477,7 @@ found_entry: break; default: printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type); - nfsd_reply_cache_free_locked(rp); + nfsd_reply_cache_free_locked(b, rp); } goto out; @@ -524,7 +512,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) if (!rp) return; - hash = nfsd_cache_hash(rp->c_xid); + hash = nfsd_cache_hash(rp->c_key.k_xid); b = &drc_hashtbl[hash]; len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7fb9f7c667b1..6384c9b94898 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1242,6 +1242,7 @@ static __net_init int nfsd_init_net(struct net *net) nn->somebody_reclaimed = false; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); + nn->s2s_cp_cl_id = nn->clientid_counter++; atomic_set(&nn->ntf_refcnt, 0); init_waitqueue_head(&nn->ntf_wq); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 0b15dac7e609..6aacb325b6a0 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -355,6 +355,8 @@ struct nfs4_client { struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ /* wait here for slots */ struct net *net; + struct list_head async_copies; /* list of async copies */ + spinlock_t async_lock; /* lock for async copies */ }; /* struct nfs4_client_reset @@ -573,6 +575,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_NULL = 0, NFSPROC4_CLNT_CB_RECALL, NFSPROC4_CLNT_CB_LAYOUT, + NFSPROC4_CLNT_CB_OFFLOAD, NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, }; @@ -599,6 +602,7 @@ struct nfsd4_blocked_lock { struct nfsd4_compound_state; struct nfsd_net; +struct nfsd4_copy; extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, @@ -608,6 +612,8 @@ __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid **s, struct nfsd_net *nn); struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, void (*sc_free)(struct nfs4_stid *)); +int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy); +void nfs4_free_cp_state(struct nfsd4_copy *copy); void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); @@ -626,6 +632,7 @@ extern void nfsd4_run_cb(struct nfsd4_callback *cb); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); +extern void nfsd4_shutdown_copy(struct nfs4_client *clp); extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn); @@ -633,6 +640,9 @@ extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); struct nfs4_file *find_file(struct knfsd_fh *fh); void put_nfs4_file(struct nfs4_file *fi); +extern void nfs4_put_copy(struct nfsd4_copy *copy); +extern struct nfsd4_copy * +find_async_copy(struct nfs4_client *clp, stateid_t *stateid); static inline void get_nfs4_file(struct nfs4_file *fi) { refcount_inc(&fi->fi_ref); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b53e76391e52..2751976704e9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1276,7 +1276,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dev_t rdev, struct svc_fh *resfhp) {
struct dentry *dentry, *dchild = NULL; - struct inode *dirp; __be32 err; int host_err; @@ -1288,7 +1287,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; dentry = fhp->fh_dentry; - dirp = d_inode(dentry); host_err = fh_want_write(fhp); if (host_err) @@ -1409,6 +1407,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = 1; break; } + /* fall through */ case NFS4_CREATE_EXCLUSIVE4_1: if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime && d_inode(dchild)->i_atime.tv_sec == v_atime @@ -1417,7 +1416,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, *created = 1; goto set_attr; } - /* fallthru */ + /* fall through */ case NFS3_CREATE_GUARDED: err = nfserr_exist; } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 17c453a7999c..feeb6d4bdffd 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -511,6 +511,7 @@ struct nfsd42_write_res { u64 wr_bytes_written; u32 wr_stable_how; nfs4_verifier wr_verifier; + stateid_t cb_stateid; }; struct nfsd4_copy { @@ -526,6 +527,23 @@ struct nfsd4_copy { /* response */ struct nfsd42_write_res cp_res; + + /* for cb_offload */ + struct nfsd4_callback cp_cb; + __be32 nfserr; + struct knfsd_fh fh; + + struct nfs4_client *cp_clp; + + struct file *file_src; + struct file *file_dst; + + stateid_t cp_stateid; + + struct list_head copies; + struct task_struct *copy_task; + refcount_t refcount; + bool stopped; }; struct nfsd4_seek { @@ -539,6 +557,15 @@ struct nfsd4_seek { loff_t seek_pos; }; +struct nfsd4_offload_status { + /* request */ + stateid_t stateid; + + /* response */ + u64 count; + u32 status; +}; + struct nfsd4_op { int opnum; const struct nfsd4_operation * opdesc; @@ -597,6 +624,7 @@ struct nfsd4_op { struct nfsd4_fallocate deallocate; struct nfsd4_clone clone; struct nfsd4_copy copy; + struct nfsd4_offload_status offload_status; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 517239af0302..547cf07cf4e0 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -38,3 +38,13 @@ #define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define enc_cb_offload_info_sz (1 + 1 + 2 + 1 + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) +#define NFS4_enc_cb_offload_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + enc_nfs4_fh_sz + \ + enc_stateid_sz + \ + enc_cb_offload_info_sz) +#define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index ebb24a314f43..de99db518571 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -168,24 +168,18 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, ctxt->newbh = NULL; if (inode->i_blkbits == PAGE_SHIFT) { - lock_page(obh->b_page); - /* - * We cannot call radix_tree_preload for the kernels older - * than 2.6.23, because it is not exported for modules. 
- */ + struct page *opage = obh->b_page; + lock_page(opage); retry: - err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (err) - goto failed_unlock; /* BUG_ON(oldkey != obh->b_page->index); */ - if (unlikely(oldkey != obh->b_page->index)) - NILFS_PAGE_BUG(obh->b_page, + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); xa_lock_irq(&btnc->i_pages); - err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); + err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS); xa_unlock_irq(&btnc->i_pages); /* * Note: page->index will not change to newkey until @@ -193,7 +187,6 @@ retry: * To protect the page in intermediate state, the page lock * is held. */ - radix_tree_preload_end(); if (!err) return 0; else if (err != -EEXIST) @@ -203,7 +196,7 @@ retry: if (!err) goto retry; /* fallback to copy mode */ - unlock_page(obh->b_page); + unlock_page(opage); } nbh = nilfs_btnode_create_block(btnc, newkey); @@ -243,9 +236,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, mark_buffer_dirty(obh); xa_lock_irq(&btnc->i_pages); - radix_tree_delete(&btnc->i_pages, oldkey); - radix_tree_tag_set(&btnc->i_pages, newkey, - PAGECACHE_TAG_DIRTY); + __xa_erase(&btnc->i_pages, oldkey); + __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&btnc->i_pages); opage->index = obh->b_blocknr = newkey; @@ -275,7 +267,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, if (nbh == NULL) { /* blocksize == pagesize */ xa_lock_irq(&btnc->i_pages); - radix_tree_delete(&btnc->i_pages, newkey); + __xa_erase(&btnc->i_pages, newkey); xa_unlock_irq(&btnc->i_pages); unlock_page(ctxt->bh->b_page); } else diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 329a056b73b1..d7fc8d369d89 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -289,7 +289,7 @@ repeat: * @dmap: destination page cache * @smap: source page cache * - * No pages must no be added to the cache during this process. + * No pages must be added to the cache during this process. * This must be ensured by the caller. */ void nilfs_copy_back_pages(struct address_space *dmap, @@ -298,7 +298,6 @@ void nilfs_copy_back_pages(struct address_space *dmap, struct pagevec pvec; unsigned int i, n; pgoff_t index = 0; - int err; pagevec_init(&pvec); repeat: @@ -313,35 +312,34 @@ repeat: lock_page(page); dpage = find_lock_page(dmap, offset); if (dpage) { - /* override existing page on the destination cache */ + /* overwrite existing page in the destination cache */ WARN_ON(PageDirty(dpage)); nilfs_copy_page(dpage, page, 0); unlock_page(dpage); put_page(dpage); + /* Do we not need to remove page from smap here? 
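
The nilfs2 hunks above and below translate raw radix-tree calls into their XArray equivalents: under the xa_lock, __xa_erase() stands in for radix_tree_delete(), __xa_store() replaces the radix_tree_preload()/radix_tree_insert() pair by allocating internally with the given gfp mask (the btnode hunk uses __xa_insert(), which additionally fails, with -EEXIST in this tree, when the slot is already occupied), and __xa_set_mark() replaces radix_tree_tag_set(). A condensed sketch of moving one entry between two mappings in that style; demo_move() is illustrative, not a helper in this patch:

#include <linux/errno.h>
#include <linux/xarray.h>

static int demo_move(struct xarray *src, struct xarray *dst,
		     unsigned long index)
{
	void *entry, *old;

	xa_lock_irq(src);
	entry = __xa_erase(src, index);
	xa_unlock_irq(src);
	if (!entry)
		return -ENOENT;

	xa_lock_irq(dst);
	/* Returns any displaced entry, or an xa_err() pointer on failure. */
	old = __xa_store(dst, index, entry, GFP_NOFS);
	if (!xa_is_err(old))
		__xa_set_mark(dst, index, XA_MARK_0);
	xa_unlock_irq(dst);

	return xa_is_err(old) ? xa_err(old) : 0;
}
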
*/ } else { - struct page *page2; + struct page *p; /* move the page to the destination cache */ xa_lock_irq(&smap->i_pages); - page2 = radix_tree_delete(&smap->i_pages, offset); - WARN_ON(page2 != page); - + p = __xa_erase(&smap->i_pages, offset); + WARN_ON(page != p); smap->nrpages--; xa_unlock_irq(&smap->i_pages); xa_lock_irq(&dmap->i_pages); - err = radix_tree_insert(&dmap->i_pages, offset, page); - if (unlikely(err < 0)) { - WARN_ON(err == -EEXIST); + p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS); + if (unlikely(p)) { + /* Probably -ENOMEM */ page->mapping = NULL; - put_page(page); /* for cache */ + put_page(page); } else { page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->i_pages, - offset, - PAGECACHE_TAG_DIRTY); + __xa_set_mark(&dmap->i_pages, offset, + PAGECACHE_TAG_DIRTY); } xa_unlock_irq(&dmap->i_pages); } @@ -467,8 +465,7 @@ int __nilfs_clear_page_dirty(struct page *page) if (mapping) { xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->i_pages, - page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 94b52157bf8d..5769cf3ff035 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -25,7 +25,7 @@ static bool should_merge(struct fsnotify_event *old_fsn, old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); - if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + if (old_fsn->inode == new_fsn->inode && old->pid == new->pid && old->path.mnt == new->path.mnt && old->path.dentry == new->path.dentry) return true; @@ -131,8 +131,8 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info, !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return false; - if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask & - ~marks_ignored_mask) + if (event_mask & FANOTIFY_OUTGOING_EVENTS & + marks_mask & ~marks_ignored_mask) return true; return false; @@ -171,7 +171,10 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, goto out; init: __maybe_unused fsnotify_init_event(&event->fse, inode, mask); - event->tgid = get_pid(task_tgid(current)); + if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) + event->pid = get_pid(task_pid(current)); + else + event->pid = get_pid(task_tgid(current)); if (path) { event->path = *path; path_get(&event->path); @@ -205,6 +208,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 10); + if (!fanotify_should_send_event(iter_info, mask, data, data_type)) return 0; @@ -236,7 +241,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, ret = fsnotify_add_event(group, fsn_event, fanotify_merge); if (ret) { /* Permission events shouldn't be merged */ - BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); + BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); /* Our event wasn't used in the end. Free it. 
*/ fsnotify_destroy_event(group, fsn_event); @@ -268,7 +273,7 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) event = FANOTIFY_E(fsn_event); path_put(&event->path); - put_pid(event->tgid); + put_pid(event->pid); if (fanotify_is_perm_event(fsn_event->mask)) { kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PE(fsn_event)); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 8609ba06f474..ea05b8a401e7 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -19,7 +19,7 @@ struct fanotify_event_info { * during this object's lifetime */ struct path path; - struct pid *tgid; + struct pid *pid; }; /* @@ -44,7 +44,7 @@ FANOTIFY_PE(struct fsnotify_event *fse) static inline bool fanotify_is_perm_event(u32 mask) { return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) && - mask & FAN_ALL_PERM_EVENTS; + mask & FANOTIFY_PERM_EVENTS; } static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 69054886915b..e03be5071362 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -131,8 +131,8 @@ static int fill_event_metadata(struct fsnotify_group *group, metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->reserved = 0; - metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; - metadata->pid = pid_vnr(event->tgid); + metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS; + metadata->pid = pid_vnr(event->pid); if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { @@ -191,7 +191,7 @@ static int process_access_response(struct fsnotify_group *group, if (fd < 0) return -EINVAL; - if ((response & FAN_AUDIT) && !group->fanotify_data.audit) + if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; event = dequeue_event(group, fd); @@ -395,7 +395,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); - if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) { + if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); spin_lock(&group->notification_lock); @@ -506,18 +506,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { - __u32 tmask = fsn_mark->mask & ~mask; - - if (flags & FAN_MARK_ONDIR) - tmask &= ~FAN_ONDIR; - oldmask = fsn_mark->mask; - fsn_mark->mask = tmask; + fsn_mark->mask &= ~mask; } else { - __u32 tmask = fsn_mark->ignored_mask & ~mask; - if (flags & FAN_MARK_ONDIR) - tmask &= ~FAN_ONDIR; - fsn_mark->ignored_mask = tmask; + fsn_mark->ignored_mask &= ~mask; } *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); @@ -563,6 +555,13 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, mask, flags); } +static int fanotify_remove_sb_mark(struct fsnotify_group *group, + struct super_block *sb, __u32 mask, + unsigned int flags) +{ + return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, flags); +} + static int fanotify_remove_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, unsigned int flags) @@ -579,19 +578,10 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, 
spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { - __u32 tmask = fsn_mark->mask | mask; - - if (flags & FAN_MARK_ONDIR) - tmask |= FAN_ONDIR; - oldmask = fsn_mark->mask; - fsn_mark->mask = tmask; + fsn_mark->mask |= mask; } else { - __u32 tmask = fsn_mark->ignored_mask | mask; - if (flags & FAN_MARK_ONDIR) - tmask |= FAN_ONDIR; - - fsn_mark->ignored_mask = tmask; + fsn_mark->ignored_mask |= mask; if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } @@ -658,6 +648,14 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags); } +static int fanotify_add_sb_mark(struct fsnotify_group *group, + struct super_block *sb, __u32 mask, + unsigned int flags) +{ + return fanotify_add_mark(group, &sb->s_fsnotify_marks, + FSNOTIFY_OBJ_TYPE_SB, mask, flags); +} + static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, unsigned int flags) @@ -686,16 +684,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct user_struct *user; struct fanotify_event_info *oevent; - pr_debug("%s: flags=%d event_f_flags=%d\n", - __func__, flags, event_f_flags); + pr_debug("%s: flags=%x event_f_flags=%x\n", + __func__, flags, event_f_flags); if (!capable(CAP_SYS_ADMIN)) return -EPERM; #ifdef CONFIG_AUDITSYSCALL - if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT)) + if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) #else - if (flags & ~FAN_ALL_INIT_FLAGS) + if (flags & ~FANOTIFY_INIT_FLAGS) #endif return -EINVAL; @@ -731,6 +729,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } group->fanotify_data.user = user; + group->fanotify_data.flags = flags; atomic_inc(&user->fanotify_listeners); group->memcg = get_mem_cgroup_from_mm(current->mm); @@ -746,7 +745,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - switch (flags & FAN_ALL_CLASS_BITS) { + switch (flags & FANOTIFY_CLASS_BITS) { case FAN_CLASS_NOTIF: group->priority = FS_PRIO_0; break; @@ -783,7 +782,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) fd = -EPERM; if (!capable(CAP_AUDIT_WRITE)) goto out_destroy_group; - group->fanotify_data.audit = true; } fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); @@ -805,7 +803,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct fd f; struct path path; - u32 valid_mask = FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD; + u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", @@ -815,8 +814,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ((__u64)0xffffffff << 32)) return -EINVAL; - if (flags & ~FAN_ALL_MARK_FLAGS) + if (flags & ~FANOTIFY_MARK_FLAGS) + return -EINVAL; + + switch (mark_type) { + case FAN_MARK_INODE: + case FAN_MARK_MOUNT: + case FAN_MARK_FILESYSTEM: + break; + default: return -EINVAL; + } + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { case FAN_MARK_ADD: /* fallthrough */ case FAN_MARK_REMOVE: @@ -824,20 +833,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int 
flags, __u64 mask, return -EINVAL; break; case FAN_MARK_FLUSH: - if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH)) + if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH)) return -EINVAL; break; default: return -EINVAL; } - if (mask & FAN_ONDIR) { - flags |= FAN_MARK_ONDIR; - mask &= ~FAN_ONDIR; - } - if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) - valid_mask |= FAN_ALL_PERM_EVENTS; + valid_mask |= FANOTIFY_PERM_EVENTS; if (mask & ~valid_mask) return -EINVAL; @@ -857,14 +861,16 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * allowed to set permissions events. */ ret = -EINVAL; - if (mask & FAN_ALL_PERM_EVENTS && + if (mask & FANOTIFY_PERM_EVENTS && group->priority == FS_PRIO_0) goto fput_and_out; if (flags & FAN_MARK_FLUSH) { ret = 0; - if (flags & FAN_MARK_MOUNT) + if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); + else if (mark_type == FAN_MARK_FILESYSTEM) + fsnotify_clear_sb_marks_by_group(group); else fsnotify_clear_inode_marks_by_group(group); goto fput_and_out; @@ -875,7 +881,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; /* inode held in place by reference to path; group by fget on fd */ - if (!(flags & FAN_MARK_MOUNT)) + if (mark_type == FAN_MARK_INODE) inode = path.dentry->d_inode; else mnt = path.mnt; @@ -883,14 +889,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, /* create/update an inode mark */ switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { case FAN_MARK_ADD: - if (flags & FAN_MARK_MOUNT) + if (mark_type == FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); + else if (mark_type == FAN_MARK_FILESYSTEM) + ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags); else ret = fanotify_add_inode_mark(group, inode, mask, flags); break; case FAN_MARK_REMOVE: - if (flags & FAN_MARK_MOUNT) + if (mark_type == FAN_MARK_MOUNT) ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags); + else if (mark_type == FAN_MARK_FILESYSTEM) + ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags); else ret = fanotify_remove_inode_mark(group, inode, mask, flags); break; @@ -934,6 +944,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 86fcf5814279..348a184bcdda 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -131,37 +131,20 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); + } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) { + struct super_block *sb = fsnotify_conn_sb(mark->connector); + + seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", + sb->s_dev, mflags, mark->mask, mark->ignored_mask); } } void fanotify_show_fdinfo(struct seq_file *m, struct file *f) { struct fsnotify_group *group = f->private_data; - unsigned int flags = 0; - - switch (group->priority) { - case FS_PRIO_0: - flags |= FAN_CLASS_NOTIF; - break; - case FS_PRIO_1: - flags |= FAN_CLASS_CONTENT; - break; - case FS_PRIO_2: - flags |= FAN_CLASS_PRE_CONTENT; - break; - } - - if (group->max_events == 
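
Taken together, the fanotify changes above are user-visible: FAN_REPORT_TID makes the pid field of each event carry the reporting thread rather than its thread-group leader, and FAN_MARK_FILESYSTEM attaches a mark to the whole super block instead of a single inode or mount. A minimal user-space sketch exercising both, assuming a kernel and libc headers that already export these flags; error handling is trimmed to the essentials:

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	struct fanotify_event_metadata ev;
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_TID, O_RDONLY);

	if (fd < 0)
		return 1;
	/* Any path on the filesystem selects its super block. */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
			  FAN_OPEN | FAN_CLOSE, AT_FDCWD, "/tmp") < 0)
		return 1;
	while (read(fd, &ev, sizeof(ev)) > 0) {
		printf("mask 0x%llx from tid %d\n",
		       (unsigned long long)ev.mask, ev.pid);
		if (ev.fd >= 0)
			close(ev.fd);
	}
	return 0;
}
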
UINT_MAX) - flags |= FAN_UNLIMITED_QUEUE; - - if (group->fanotify_data.max_marks == UINT_MAX) - flags |= FAN_UNLIMITED_MARKS; - - if (group->fanotify_data.audit) - flags |= FAN_ENABLE_AUDIT; seq_printf(m, "fanotify flags:%x event-flags:%x\n", - flags, group->fanotify_data.f_flags); + group->fanotify_data.flags, group->fanotify_data.f_flags); show_fdinfo(m, f, fanotify_fdinfo); } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ababdbfab537..2172ba516c61 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -48,7 +48,7 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) * Called during unmount with no locks held, so needs to be safe against * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ -void fsnotify_unmount_inodes(struct super_block *sb) +static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; @@ -96,6 +96,15 @@ void fsnotify_unmount_inodes(struct super_block *sb) if (iput_inode) iput(iput_inode); + /* Wait for outstanding inode references from connectors */ + wait_var_event(&sb->s_fsnotify_inode_refs, + !atomic_long_read(&sb->s_fsnotify_inode_refs)); +} + +void fsnotify_sb_delete(struct super_block *sb) +{ + fsnotify_unmount_inodes(sb); + fsnotify_clear_marks_by_sb(sb); } /* @@ -190,7 +199,7 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; - __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignored_mask = 0; struct fsnotify_mark *mark; @@ -319,15 +328,17 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { struct fsnotify_iter_info iter_info = {}; - struct mount *mnt; + struct super_block *sb = NULL; + struct mount *mnt = NULL; + __u32 mnt_or_sb_mask = 0; int ret = 0; - /* global tests shouldn't care about events on child only the specific event */ - __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); - if (data_is == FSNOTIFY_EVENT_PATH) + if (data_is == FSNOTIFY_EVENT_PATH) { mnt = real_mount(((const struct path *)data)->mnt); - else - mnt = NULL; + sb = mnt->mnt.mnt_sb; + mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask; + } /* * Optimization: srcu_read_lock() has a memory barrier which can @@ -337,16 +348,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * need SRCU to keep them "alive". */ if (!to_tell->i_fsnotify_marks && - (!mnt || !mnt->mnt_fsnotify_marks)) + (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks))) return 0; /* * if this is a modify event we may need to clear the ignored masks - * otherwise return if neither the inode nor the vfsmount care about + * otherwise return if neither the inode nor the vfsmount/sb care about * this type of event. 
*/ if (!(mask & FS_MODIFY) && - !(test_mask & to_tell->i_fsnotify_mask) && - !(mnt && test_mask & mnt->mnt_fsnotify_mask)) + !(test_mask & (to_tell->i_fsnotify_mask | mnt_or_sb_mask))) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); @@ -356,11 +366,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, if (mnt) { iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); + iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] = + fsnotify_first_mark(&sb->s_fsnotify_marks); } /* - * We need to merge inode & vfsmount mark lists so that inode mark - * ignore masks are properly reflected for mount mark notifications. + * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark + * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { @@ -386,7 +398,7 @@ static __init int fsnotify_init(void) { int ret; - BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 7902653dd577..5a00121fb219 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -21,6 +21,12 @@ static inline struct mount *fsnotify_conn_mount( return container_of(conn->obj, struct mount, mnt_fsnotify_marks); } +static inline struct super_block *fsnotify_conn_sb( + struct fsnotify_mark_connector *conn) +{ + return container_of(conn->obj, struct super_block, s_fsnotify_marks); +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -43,6 +49,11 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } +/* run the list of all marks associated with sb and destroy them */ +static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) +{ + fsnotify_destroy_marks(&sb->s_fsnotify_marks); +} /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ac6978d3208c..105576daca4a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -815,7 +815,7 @@ static int __init inotify_user_setup(void) BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); - BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22); + BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 59cdb27826de..d2dd16cb5989 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -115,6 +115,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; + else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) + return &fsnotify_conn_sb(conn)->s_fsnotify_mask; return NULL; } @@ -179,19 +181,24 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } -static struct inode *fsnotify_detach_connector_from_object( - struct fsnotify_mark_connector *conn) +static void *fsnotify_detach_connector_from_object( + struct fsnotify_mark_connector *conn, + unsigned int 
*type) { struct inode *inode = NULL; + *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; + atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs); } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; + } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { + fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } rcu_assign_pointer(*(conn->obj), NULL); @@ -211,10 +218,29 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) fsnotify_put_group(group); } +/* Drop object reference originally held by a connector */ +static void fsnotify_drop_object(unsigned int type, void *objp) +{ + struct inode *inode; + struct super_block *sb; + + if (!objp) + return; + /* Currently only inode references are passed to be dropped */ + if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) + return; + inode = objp; + sb = inode->i_sb; + iput(inode); + if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs)) + wake_up_var(&sb->s_fsnotify_inode_refs); +} + void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn; - struct inode *inode = NULL; + void *objp = NULL; + unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ @@ -234,7 +260,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) conn = mark->connector; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { - inode = fsnotify_detach_connector_from_object(conn); + objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { __fsnotify_recalc_mask(conn); @@ -242,7 +268,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) mark->connector = NULL; spin_unlock(&conn->lock); - iput(inode); + fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); @@ -709,7 +735,8 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; - struct inode *inode; + void *objp; + unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) @@ -735,11 +762,11 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp) * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. 
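
The s_fsnotify_inode_refs machinery above is a drain pattern: every connector that still pins an inode holds one count on the super block, the dropper wakes waiters when the count reaches zero, and unmount sleeps until it does. wait_var_event()/wake_up_var() key an internal waitqueue off the variable's address, so no dedicated waitqueue has to be declared. A stripped-down sketch with illustrative demo_* names:

#include <linux/atomic.h>
#include <linux/wait_bit.h>

static atomic_long_t demo_refs = ATOMIC_LONG_INIT(0);

static void demo_get(void)
{
	atomic_long_inc(&demo_refs);
}

static void demo_put(void)
{
	if (atomic_long_dec_and_test(&demo_refs))
		wake_up_var(&demo_refs);
}

/* Teardown: sleep until every outstanding reference is dropped. */
static void demo_drain(void)
{
	wait_var_event(&demo_refs, !atomic_long_read(&demo_refs));
}
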
*/ - inode = fsnotify_detach_connector_from_object(conn); + objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); - iput(inode); + fsnotify_drop_object(type, objp); } /* diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d297fe4472a9..bbcc185062bb 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -22,7 +22,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/printk.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/uaccess.h> diff --git a/fs/proc/page.c b/fs/proc/page.c index 792c78a49174..6c517b11acf8 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/init.h> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a027473561c6..47c3764c469b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -521,7 +521,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) mss->swap += PAGE_SIZE; else put_page(page); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 91ae16fbd7d5..3fe90443c1bb 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -16,7 +16,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/printk.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> @@ -423,7 +423,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) if (rc < 0) { unlock_page(page); put_page(page); - return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + return vmf_error(rc); } SetPageUptodate(page); } diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 503086f7f7c1..0d19d191ae70 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -141,7 +141,6 @@ config PSTORE_RAM tristate "Log panic/oops to a RAM buffer" depends on PSTORE depends on HAS_IOMEM - depends on HAVE_MEMBLOCK select REED_SOLOMON select REED_SOLOMON_ENC8 select REED_SOLOMON_DEC8 diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile index a39a562c1c10..bd29c58ccbd8 100644 --- a/fs/reiserfs/Makefile +++ b/fs/reiserfs/Makefile @@ -26,14 +26,5 @@ ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) reiserfs-objs += xattr_acl.o endif -# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline -# functions are used. This causes the compiler to advance the stack -# pointer out of the available stack space, corrupting kernel space, -# and causing a panic. Since this behavior only affects ppc32, this ifeq -# will work around it. If any other architecture displays this behavior, -# add it here. -ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1) - TAGS: etags *.c - diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 48cdfc81fe10..32d8986c26fb 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -185,6 +185,7 @@ struct reiserfs_dentry_buf { struct dir_context ctx; struct dentry *xadir; int count; + int err; struct dentry *dentries[8]; }; @@ -207,6 +208,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { + dbuf->err = PTR_ERR(dentry); return PTR_ERR(dentry); } else if (d_really_is_negative(dentry)) { /* A directory entry exists, but no file? 
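
The reiserfs change around this point works around iterate_dir() discarding the actor's return value: the actor records its error in the enclosing buffer, reached via container_of() on the embedded dir_context, and the caller checks that field after the walk. A condensed sketch of the pattern; the demo_* names and the failure condition are illustrative:

#include <linux/fs.h>

struct demo_buf {
	struct dir_context ctx;
	int err;
};

static int demo_actor(struct dir_context *ctx, const char *name,
		      int namelen, loff_t offset, u64 ino,
		      unsigned int d_type)
{
	struct demo_buf *buf = container_of(ctx, struct demo_buf, ctx);

	if (namelen > NAME_MAX) {	/* made-up failure condition */
		buf->err = -ENAMETOOLONG;
		return -ENAMETOOLONG;	/* stops iteration... */
	}
	return 0;			/* ...but is not propagated */
}

static int demo_walk(struct file *dir)
{
	struct demo_buf buf = { .ctx.actor = demo_actor, };
	int err = iterate_dir(dir, &buf.ctx);

	return err ? err : buf.err;	/* surface the actor's error */
}
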
*/ @@ -215,6 +217,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, "not found for file %pd.\n", dentry, dbuf->xadir); dput(dentry); + dbuf->err = -EIO; return -EIO; } @@ -262,6 +265,10 @@ static int reiserfs_for_each_xattr(struct inode *inode, err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); if (err) break; + if (buf.err) { + err = buf.err; + break; + } if (!buf.count) break; for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { diff --git a/fs/super.c b/fs/super.c index f3a8c008e164..ca53a08497ed 100644 --- a/fs/super.c +++ b/fs/super.c @@ -442,7 +442,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; - fsnotify_unmount_inodes(sb); + fsnotify_sb_delete(sb); cgroup_writeback_umount(); evict_inodes(sb); diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index fcda0fc97b90..ec85aeaed54a 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -175,8 +175,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, { struct udf_sb_info *sbi = UDF_SB(sb); int alloc_count = 0; - int bit, block, block_group, group_start; - int nr_groups, bitmap_nr; + int bit, block, block_group; + int bitmap_nr; struct buffer_head *bh; __u32 part_len; @@ -189,10 +189,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, block_count = part_len - first_block; do { - nr_groups = udf_compute_nr_groups(sb, partition); block = first_block + (sizeof(struct spaceBitmapDesc) << 3); block_group = block >> (sb->s_blocksize_bits + 3); - group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc); bitmap_nr = load_block_bitmap(sb, bitmap, block_group); if (bitmap_nr < 0) @@ -652,12 +650,6 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode, } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { udf_table_free_blocks(sb, map->s_uspace.s_table, bloc, offset, count); - } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { - udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap, - bloc, offset, count); - } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { - udf_table_free_blocks(sb, map->s_fspace.s_table, - bloc, offset, count); } if (inode) { @@ -684,16 +676,6 @@ inline int udf_prealloc_blocks(struct super_block *sb, map->s_uspace.s_table, partition, first_block, block_count); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - allocated = udf_bitmap_prealloc_blocks(sb, - map->s_fspace.s_bitmap, - partition, first_block, - block_count); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - allocated = udf_table_prealloc_blocks(sb, - map->s_fspace.s_table, - partition, first_block, - block_count); else return 0; @@ -717,14 +699,6 @@ inline udf_pblk_t udf_new_block(struct super_block *sb, block = udf_table_new_block(sb, map->s_uspace.s_table, partition, goal, err); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - block = udf_bitmap_new_block(sb, - map->s_fspace.s_bitmap, - partition, goal, err); - else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - block = udf_table_new_block(sb, - map->s_fspace.s_table, - partition, goal, err); else { *err = -EIO; return 0; diff --git a/fs/udf/super.c b/fs/udf/super.c index 6f515651a2c2..8f2f56d9a1bb 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -290,12 +290,8 @@ static void udf_free_partition(struct udf_part_map *map) if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) iput(map->s_uspace.s_table); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) - 
iput(map->s_fspace.s_table); if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) udf_sb_free_bitmap(map->s_uspace.s_bitmap); - if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) - udf_sb_free_bitmap(map->s_fspace.s_bitmap); if (map->s_partition_type == UDF_SPARABLE_MAP15) for (i = 0; i < 4; i++) brelse(map->s_type_specific.s_sparing.s_spar_map[i]); @@ -613,14 +609,11 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) struct udf_options uopt; struct udf_sb_info *sbi = UDF_SB(sb); int error = 0; - struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); + + if (!(*flags & SB_RDONLY) && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT)) + return -EACCES; sync_filesystem(sb); - if (lvidiu) { - int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); - if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY)) - return -EACCES; - } uopt.flags = sbi->s_flags; uopt.uid = sbi->s_uid; @@ -988,12 +981,62 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) return bitmap; } +static int check_partition_desc(struct super_block *sb, + struct partitionDesc *p, + struct udf_part_map *map) +{ + bool umap, utable, fmap, ftable; + struct partitionHeaderDesc *phd; + + switch (le32_to_cpu(p->accessType)) { + case PD_ACCESS_TYPE_READ_ONLY: + case PD_ACCESS_TYPE_WRITE_ONCE: + case PD_ACCESS_TYPE_REWRITABLE: + case PD_ACCESS_TYPE_NONE: + goto force_ro; + } + + /* No Partition Header Descriptor? */ + if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && + strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) + goto force_ro; + + phd = (struct partitionHeaderDesc *)p->partitionContentsUse; + utable = phd->unallocSpaceTable.extLength; + umap = phd->unallocSpaceBitmap.extLength; + ftable = phd->freedSpaceTable.extLength; + fmap = phd->freedSpaceBitmap.extLength; + + /* No allocation info? */ + if (!utable && !umap && !ftable && !fmap) + goto force_ro; + + /* We don't support blocks that require erasing before overwrite */ + if (ftable || fmap) + goto force_ro; + /* UDF 2.60: 2.3.3 - no mixing of tables & bitmaps, no VAT. */ + if (utable && umap) + goto force_ro; + + if (map->s_partition_type == UDF_VIRTUAL_MAP15 || + map->s_partition_type == UDF_VIRTUAL_MAP20) + goto force_ro; + + return 0; +force_ro: + if (!sb_rdonly(sb)) + return -EACCES; + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); + return 0; +} + static int udf_fill_partdesc_info(struct super_block *sb, struct partitionDesc *p, int p_index) { struct udf_part_map *map; struct udf_sb_info *sbi = UDF_SB(sb); struct partitionHeaderDesc *phd; + int err; map = &sbi->s_partmaps[p_index]; @@ -1013,8 +1056,16 @@ static int udf_fill_partdesc_info(struct super_block *sb, p_index, map->s_partition_type, map->s_partition_root, map->s_partition_len); - if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) && - strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03)) + err = check_partition_desc(sb, p, map); + if (err) + return err; + + /* + * Skip loading allocation info if we cannot ever write to the fs. + * This is a correctness thing as we may have decided to force ro mount + * to avoid allocation info we don't support.
+ */ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT)) return 0; phd = (struct partitionHeaderDesc *)p->partitionContentsUse; @@ -1050,40 +1101,6 @@ static int udf_fill_partdesc_info(struct super_block *sb, p_index, bitmap->s_extPosition); } - if (phd->partitionIntegrityTable.extLength) - udf_debug("partitionIntegrityTable (part %d)\n", p_index); - - if (phd->freedSpaceTable.extLength) { - struct kernel_lb_addr loc = { - .logicalBlockNum = le32_to_cpu( - phd->freedSpaceTable.extPosition), - .partitionReferenceNum = p_index, - }; - struct inode *inode; - - inode = udf_iget_special(sb, &loc); - if (IS_ERR(inode)) { - udf_debug("cannot load freedSpaceTable (part %d)\n", - p_index); - return PTR_ERR(inode); - } - map->s_fspace.s_table = inode; - map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; - udf_debug("freedSpaceTable (part %d) @ %lu\n", - p_index, map->s_fspace.s_table->i_ino); - } - - if (phd->freedSpaceBitmap.extLength) { - struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); - if (!bitmap) - return -ENOMEM; - map->s_fspace.s_bitmap = bitmap; - bitmap->s_extPosition = le32_to_cpu( - phd->freedSpaceBitmap.extPosition); - map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP; - udf_debug("freedSpaceBitmap (part %d) @ %u\n", - p_index, bitmap->s_extPosition); - } return 0; } @@ -1257,6 +1274,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) ret = -EACCES; goto out_bh; } + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); ret = udf_load_vat(sb, i, type1_idx); if (ret < 0) goto out_bh; @@ -2155,10 +2173,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) UDF_MAX_READ_VERSION); ret = -EINVAL; goto error_out; - } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION && - !sb_rdonly(sb)) { - ret = -EACCES; - goto error_out; + } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) { + if (!sb_rdonly(sb)) { + ret = -EACCES; + goto error_out; + } + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); } sbi->s_udfrev = minUDFWriteRev; @@ -2176,10 +2196,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & - UDF_PART_FLAG_READ_ONLY && - !sb_rdonly(sb)) { - ret = -EACCES; - goto error_out; + UDF_PART_FLAG_READ_ONLY) { + if (!sb_rdonly(sb)) { + ret = -EACCES; + goto error_out; + } + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); } if (udf_find_fileset(sb, &fileset, &rootdir)) { @@ -2433,10 +2455,6 @@ static unsigned int udf_count_free(struct super_block *sb) accum += udf_count_free_bitmap(sb, map->s_uspace.s_bitmap); } - if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { - accum += udf_count_free_bitmap(sb, - map->s_fspace.s_bitmap); - } if (accum) return accum; @@ -2444,11 +2462,6 @@ static unsigned int udf_count_free(struct super_block *sb) accum += udf_count_free_table(sb, map->s_uspace.s_table); } - if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { - accum += udf_count_free_table(sb, - map->s_fspace.s_table); - } - return accum; } diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 9424d7cab790..3d83be54c474 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -30,11 +30,11 @@ #define UDF_FLAG_LASTBLOCK_SET 16 #define UDF_FLAG_BLOCKSIZE_SET 17 #define UDF_FLAG_INCONSISTENT 18 +#define UDF_FLAG_RW_INCOMPAT 19 /* Set when we find RW incompatible + * feature */ #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 -#define UDF_PART_FLAG_FREED_BITMAP 0x0004 -#define UDF_PART_FLAG_FREED_TABLE 0x0008 #define UDF_PART_FLAG_READ_ONLY 0x0010 
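
The recurring shape in the udf hunks above: a feature the driver can read but must never write no longer aborts the mount outright. A read/write mount is refused with -EACCES, while a read-only mount proceeds and sets UDF_FLAG_RW_INCOMPAT so udf_remount_fs() rejects a later transition to read/write. A condensed restatement of that pattern, not a helper that exists in the tree:

#include <linux/fs.h>
#include "udf_sb.h"	/* UDF_SET_FLAG(), UDF_FLAG_RW_INCOMPAT */

static int demo_check_rw_incompat(struct super_block *sb, bool unsupported)
{
	if (!unsupported)
		return 0;
	if (!sb_rdonly(sb))
		return -EACCES;		/* r/w mount: refuse outright */
	UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
	return 0;			/* r/o mount: allow, but taint */
}
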
#define UDF_PART_FLAG_WRITE_ONCE 0x0020 #define UDF_PART_FLAG_REWRITABLE 0x0040 @@ -50,8 +50,6 @@ #define UDF_INVALID_MODE ((umode_t)-1) -#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ - #define MF_DUPLICATE_MD 0x01 #define MF_MIRROR_FE_LOADED 0x02 @@ -93,10 +91,6 @@ struct udf_part_map { struct udf_bitmap *s_bitmap; struct inode *s_table; } s_uspace; - union { - struct udf_bitmap *s_bitmap; - struct inode *s_table; - } s_fspace; __u32 s_partition_root; __u32 s_partition_len; __u16 s_partition_type;