diff options
Diffstat (limited to 'fs')
207 files changed, 2709 insertions, 1982 deletions
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index a183e213a4a5..21527189e430 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -55,12 +55,11 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) static int adfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, adfs_get_block, &ADFS_I(mapping->host)->mmu_private); if (unlikely(ret)) diff --git a/fs/affs/file.c b/fs/affs/file.c index 04c018e19602..a5a861dd5223 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -417,12 +417,11 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int affs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, affs_get_block, &AFFS_I(mapping->host)->mmu_private); if (unlikely(ret)) @@ -433,12 +432,12 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, static int affs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); /* Clear Archived bit on file writes, as AmigaOS would do */ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { @@ -648,7 +647,7 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio) static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct folio *folio; @@ -671,7 +670,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (folio_test_uptodate(folio)) return 0; @@ -687,9 +686,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping static int affs_write_end_ofs(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; @@ -882,14 +880,14 @@ affs_truncate(struct inode *inode) if (inode->i_size > AFFS_I(inode)->mmu_private) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; void *fsdata = NULL; loff_t isize = inode->i_size; int res; - res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata); + res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &folio, &fsdata); if (!res) - res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata); + res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, folio, fsdata); else inode->i_size = AFFS_I(inode)->mmu_private; mark_inode_dirty(inode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 3acf5e050072..a95e77670b49 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -695,13 +695,18 @@ static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; - loff_t i_size = op->setattr.old_i_size; + loff_t old = op->setattr.old_i_size; + + /* Note: inode->i_size was updated by afs_apply_status() inside + * the I/O and callback locks. + */ - if (size != i_size) { - truncate_setsize(&vnode->netfs.inode, size); + if (size != old) { + truncate_pagecache(inode, size); netfs_resize_file(&vnode->netfs, size, true); fscache_resize_cookie(afs_vnode_cache(vnode), size); } @@ -100,7 +100,7 @@ struct kioctx { unsigned long user_id; - struct __percpu kioctx_cpu *cpu; + struct kioctx_cpu __percpu *cpu; /* * For percpu reqs_available, number of slots we move to/from global diff --git a/fs/attr.c b/fs/attr.c index 825007d5cda4..c04d19b58f12 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -487,9 +487,17 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, error = security_inode_setattr(idmap, dentry, attr); if (error) return error; - error = try_break_deleg(inode, delegated_inode); - if (error) - return error; + + /* + * If ATTR_DELEG is set, then these attributes are being set on + * behalf of the holder of a write delegation. We want to avoid + * breaking the delegation in this case. + */ + if (!(ia_valid & ATTR_DELEG)) { + error = try_break_deleg(inode, delegated_inode); + if (error) + return error; + } if (inode->i_op->setattr) error = inode->i_op->setattr(idmap, dentry, attr); diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 8c1d587b3eef..77c7991d89aa 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -62,6 +62,7 @@ struct autofs_info { struct list_head expiring; struct autofs_sb_info *sbi; + unsigned long exp_timeout; unsigned long last_used; int count; @@ -81,6 +82,9 @@ struct autofs_info { */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ +#define AUTOFS_INF_EXPIRE_SET (1<<3) /* per-dentry expire timeout set for + this mount point. + */ struct autofs_wait_queue { wait_queue_head_t queue; struct autofs_wait_queue *next; diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 5bf781ea6d67..f011e026358e 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -128,7 +128,13 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) goto out; } + /* Setting the per-dentry expire timeout requires a trailing + * path component, ie. no '/', so invert the logic of the + * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD. + */ err = check_name(param->path); + if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) + err = err ? 0 : -EINVAL; if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", cmd); @@ -396,16 +402,97 @@ static int autofs_dev_ioctl_catatonic(struct file *fp, return 0; } -/* Set the autofs mount timeout */ +/* + * Set the autofs mount expire timeout. + * + * There are two places an expire timeout can be set, in the autofs + * super block info. (this is all that's needed for direct and offset + * mounts because there's a distinct mount corresponding to each of + * these) and per-dentry within within the dentry info. If a per-dentry + * timeout is set it will override the expire timeout set in the parent + * autofs super block info. + * + * If setting the autofs super block expire timeout the autofs_dev_ioctl + * size field will be equal to the autofs_dev_ioctl structure size. If + * setting the per-dentry expire timeout the mount point name is passed + * in the autofs_dev_ioctl path field and the size field updated to + * reflect this. + * + * Setting the autofs mount expire timeout sets the timeout in the super + * block info. struct. Setting the per-dentry timeout does a little more. + * If the timeout is equal to -1 the per-dentry timeout (and flag) is + * cleared which reverts to using the super block timeout, otherwise if + * timeout is 0 the timeout is set to this value and the flag is left + * set which disables expiration for the mount point, lastly the flag + * and the timeout are set enabling the dentry to use this timeout. + */ static int autofs_dev_ioctl_timeout(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - unsigned long timeout; + unsigned long timeout = param->timeout.timeout; + + /* If setting the expire timeout for an individual indirect + * mount point dentry the mount trailing component path is + * placed in param->path and param->size adjusted to account + * for it otherwise param->size it is set to the structure + * size. + */ + if (param->size == AUTOFS_DEV_IOCTL_SIZE) { + param->timeout.timeout = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + } else { + struct dentry *base = fp->f_path.dentry; + struct inode *inode = base->d_inode; + int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; + struct dentry *dentry; + struct autofs_info *ino; + + if (!autofs_type_indirect(sbi->type)) + return -EINVAL; + + /* An expire timeout greater than the superblock timeout + * could be a problem at shutdown but the super block + * timeout itself can change so all we can really do is + * warn the user. + */ + if (timeout >= sbi->exp_timeout) + pr_warn("per-mount expire timeout is greater than " + "the parent autofs mount timeout which could " + "prevent shutdown\n"); + + inode_lock_shared(inode); + dentry = try_lookup_one_len(param->path, base, path_len); + inode_unlock_shared(inode); + if (IS_ERR_OR_NULL(dentry)) + return dentry ? PTR_ERR(dentry) : -ENOENT; + ino = autofs_dentry_ino(dentry); + if (!ino) { + dput(dentry); + return -ENOENT; + } + + if (ino->exp_timeout && ino->flags & AUTOFS_INF_EXPIRE_SET) + param->timeout.timeout = ino->exp_timeout / HZ; + else + param->timeout.timeout = sbi->exp_timeout / HZ; + + if (timeout == -1) { + /* Revert to using the super block timeout */ + ino->flags &= ~AUTOFS_INF_EXPIRE_SET; + ino->exp_timeout = 0; + } else { + /* Set the dentry expire flag and timeout. + * + * If timeout is 0 it will prevent the expire + * of this particular automount. + */ + ino->flags |= AUTOFS_INF_EXPIRE_SET; + ino->exp_timeout = timeout * HZ; + } + dput(dentry); + } - timeout = param->timeout.timeout; - param->timeout.timeout = sbi->exp_timeout / HZ; - sbi->exp_timeout = timeout * HZ; return 0; } diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 39d8c84c16f4..5c2d459e1e48 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -429,8 +429,6 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, if (!root) return NULL; - timeout = sbi->exp_timeout; - dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { spin_lock(&sbi->fs_lock); @@ -441,6 +439,11 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, } spin_unlock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRE_SET) + timeout = ino->exp_timeout; + else + timeout = sbi->exp_timeout; + expired = should_expire(dentry, mnt, timeout, how); if (!expired) continue; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index cf792d4de4f1..ee2edccaef70 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -19,6 +19,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; ino->sbi = sbi; + ino->exp_timeout = -1; ino->count = 1; } return ino; @@ -28,6 +29,7 @@ void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; + ino->exp_timeout = -1; ino->last_used = jiffies; } @@ -172,8 +174,7 @@ static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi, ret = autofs_check_pipe(pipe); if (ret < 0) { errorf(fc, "Invalid/unusable pipe"); - if (param->type != fs_value_is_file) - fput(pipe); + fput(pipe); return -EBADF; } diff --git a/fs/backing-file.c b/fs/backing-file.c index afb557446c27..8860dac58c37 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -303,13 +303,16 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) return -EIO; + if (!out->f_op->splice_write) + return -EINVAL; + ret = file_remove_privs(ctx->user_file); if (ret) return ret; old_cred = override_creds(ctx->cred); file_start_write(out); - ret = iter_file_splice_write(pipe, out, ppos, len, flags); + ret = out->f_op->splice_write(pipe, out, ppos, len, flags); file_end_write(out); revert_creds(old_cred); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd3a2522bc3e..dc3a4024aab6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -240,71 +240,73 @@ fsck_err: int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + struct bch_alloc_v4 a; int ret = 0; - bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), + bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); + + bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), c, alloc_v4_val_size_bad, "bad val size (%u > %zu)", - alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); + alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); - bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), + bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && + BCH_ALLOC_V4_NR_BACKPOINTERS(&a), c, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); - bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, + bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, c, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", - a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + a.data_type, alloc_data_type(a, a.data_type)); for (unsigned i = 0; i < 2; i++) - bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, + bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, c, alloc_key_io_time_bad, "invalid io_time[%s]: %llu, max %llu", i == READ ? "read" : "write", - a.v->io_time[i], LRU_TIME_MAX); + a.io_time[i], LRU_TIME_MAX); - unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) > + unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > offsetof(struct bch_alloc_v4, stripe_sectors) - ? a.v->stripe_sectors + ? a.stripe_sectors : 0; - switch (a.v->data_type) { + switch (a.data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: bkey_fsck_err_on(stripe_sectors || - a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe, + a.dirty_sectors || + a.cached_sectors || + a.stripe, c, alloc_key_empty_but_have_data, "empty data type free but have data %u.%u.%u %u", stripe_sectors, - a.v->dirty_sectors, - a.v->cached_sectors, - a.v->stripe); + a.dirty_sectors, + a.cached_sectors, + a.stripe); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!a.v->dirty_sectors && + bkey_fsck_err_on(!a.dirty_sectors && !stripe_sectors, c, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_type_str(a.v->data_type)); + bch2_data_type_str(a.data_type)); break; case BCH_DATA_cached: - bkey_fsck_err_on(!a.v->cached_sectors || - a.v->dirty_sectors || + bkey_fsck_err_on(!a.cached_sectors || + a.dirty_sectors || stripe_sectors || - a.v->stripe, + a.stripe, c, alloc_key_cached_inconsistency, "data type inconsistency"); - bkey_fsck_err_on(!a.v->io_time[READ] && + bkey_fsck_err_on(!a.io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); @@ -556,7 +558,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) struct bpos pos = alloc_gens_pos(iter.pos, &offset); int ret2 = 0; - if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { + if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret2) @@ -829,7 +831,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (likely(new.k->type == KEY_TYPE_alloc_v4)) { new_a = bkey_s_to_alloc_v4(new).v; } else { - BUG_ON(!(flags & BTREE_TRIGGER_gc)); + BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair))); struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); ret = PTR_ERR_OR_ZERO(new_ka); @@ -1872,26 +1874,26 @@ static void bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_dev_do_discards(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_write_ref; if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_discard); -put_ioref: percpu_ref_put(&ca->io_ref); +put_write_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) @@ -1966,8 +1968,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) @@ -1977,18 +1979,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (discard_in_flight_add(ca, bucket, false)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_ref; if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); -put_ioref: percpu_ref_put(&ca->io_ref); +put_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -2130,26 +2132,26 @@ static void bch2_do_invalidates_work(struct work_struct *work) bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_dev_do_invalidates(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_ref; if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); -put_ioref: percpu_ref_put(&ca->io_ref); +put_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index 47d9d006502c..f754a2951d8a 100644 --- a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -69,6 +69,7 @@ struct bch_alloc_v4 { __u64 io_time[2]; __u32 stripe; __u32 nr_external_backpointers; + /* end of fields in original version of alloc_v4 */ __u64 fragmentation_lru; __u32 stripe_sectors; __u32 pad; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c75f2e0f32bb..14ce726bf5a3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -677,7 +677,8 @@ struct bch_sb_field_ext { x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ x(disk_accounting_v2, BCH_VERSION(1, 9)) \ x(disk_accounting_v3, BCH_VERSION(1, 10)) \ - x(disk_accounting_inum, BCH_VERSION(1, 11)) + x(disk_accounting_inum, BCH_VERSION(1, 11)) \ + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index f5d85b50b6f2..e52a06d3418c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return b; } +void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) +{ + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + /* Btree in memory cache - hash table */ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) @@ -736,6 +746,13 @@ out: start_time); memalloc_nofs_restore(flags); + + int ret = bch2_trans_relock(trans); + if (unlikely(ret)) { + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); + } + return b; err: mutex_lock(&bc->lock); @@ -856,6 +873,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, bch2_btree_node_read(trans, b, sync); + int ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + if (!sync) return NULL; @@ -974,6 +995,10 @@ retry: bch2_btree_node_wait_on_read(b); + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + /* * should_be_locked is not set on this path yet, so we need to * relock it specifically: diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index c0eb87a057cc..f82064007127 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -12,6 +12,8 @@ struct btree_iter; void bch2_recalc_btree_reserve(struct bch_fs *); +void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); + void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index dca62375d7d3..222b7ce8a901 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -569,6 +569,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ _btree_id, _pos, _flags, KEY_TYPE_##_type)) +#define bkey_val_copy(_dst_v, _src_k) \ +do { \ + unsigned b = min_t(unsigned, sizeof(*_dst_v), \ + bkey_val_bytes(_src_k.k)); \ + memcpy(_dst_v, _src_k.v, b); \ + if (b < sizeof(*_dst_v)) \ + memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \ +} while (0) + static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags, unsigned type, diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 74933490aaba..c1657182c275 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -530,6 +530,8 @@ static void __journal_keys_sort(struct journal_keys *keys) { sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); + cond_resched(); + struct journal_key *dst = keys->data; darray_for_each(*keys, src) { diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 79954490627c..fda7998734cb 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -726,6 +726,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + path->should_be_locked = false; } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, @@ -777,6 +778,20 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + + /* + * Scanning is expensive while a rehash is in progress - most elements + * will be on the new hashtable, if it's in progress + * + * A rehash could still start while we're scanning - that's ok, we'll + * still see most elements. + */ + if (unlikely(tbl->nest)) { + rcu_read_unlock(); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + return SHRINK_STOP; + } + if (bc->shrink_iter >= tbl->size) bc->shrink_iter = 0; start = bc->shrink_iter; @@ -784,7 +799,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, do { struct rhash_head *pos, *next; - pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]); while (!rht_is_a_nulls(pos)) { next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); @@ -865,12 +880,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) while (atomic_long_read(&bc->nr_keys)) { rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) + if (tbl) { + if (tbl->nest) { + /* wait for in progress rehash */ + rcu_read_unlock(); + mutex_lock(&bc->table.mutex); + mutex_unlock(&bc->table.mutex); + rcu_read_lock(); + continue; + } for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { + ck = container_of(pos, struct bkey_cached, hash); bkey_cached_evict(bc, ck); list_add(&ck->list, &items); } + } rcu_read_unlock(); } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b3454d4619e8..8fd112026e7a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, : 0; int ret; + b = bch2_btree_node_mem_alloc(trans, interior_node); + if (IS_ERR(b)) + return b; + + BUG_ON(b->ob.nr); + mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr > nr_reserve) { struct btree_alloc *a = @@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, obs = a->ob; bkey_copy(&tmp.k, &a->k); mutex_unlock(&c->btree_reserve_cache_lock); - goto mem_alloc; + goto out; } mutex_unlock(&c->btree_reserve_cache_lock); - retry: ret = bch2_alloc_sectors_start_trans(trans, c->opts.metadata_target ?: @@ -341,7 +346,7 @@ retry: c->opts.metadata_replicas_required), watermark, 0, cl, &wp); if (unlikely(ret)) - return ERR_PTR(ret); + goto err; if (wp->sectors_free < btree_sectors(c)) { struct open_bucket *ob; @@ -360,19 +365,16 @@ retry: bch2_open_bucket_get(c, wp, &obs); bch2_alloc_sectors_done(c, wp); -mem_alloc: - b = bch2_btree_node_mem_alloc(trans, interior_node); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - /* we hold cannibalize_lock: */ - BUG_ON(IS_ERR(b)); - BUG_ON(b->ob.nr); - +out: bkey_copy(&b->key, &tmp.k); b->ob = obs; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); return b; +err: + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); } static struct btree *bch2_btree_node_alloc(struct btree_update *as, @@ -2439,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite } new_hash = bch2_btree_node_mem_alloc(trans, false); + ret = PTR_ERR_OR_ZERO(new_hash); + if (ret) + goto err; } path->intent_ref++; @@ -2446,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite commit_flags, skip_triggers); --path->intent_ref; - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - list_move(&new_hash->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&new_hash->c.lock); - six_unlock_intent(&new_hash->c.lock); - } + if (new_hash) + bch2_btree_node_to_freelist(c, new_hash); +err: closure_sync(&cl); bch2_btree_cache_cannibalize_unlock(trans); return ret; @@ -2522,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id b = bch2_btree_node_mem_alloc(trans, false); bch2_btree_cache_cannibalize_unlock(trans); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + set_btree_node_fake(b); set_btree_node_need_rewrite(b); b->c.level = level; @@ -2553,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level)); + bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level))); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index be2bbd248631..721bbe1dffc1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -100,12 +100,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (!ca) { - if (fsck_err(trans, ptr_to_invalid_device, - "pointer to missing device %u\n" - "while marking %s", - p.ptr.dev, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, + trans, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) *do_update = true; return 0; } @@ -562,7 +563,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { - if (insert) + if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) ret = -EIO; goto err; } @@ -699,7 +700,8 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) + enum btree_iter_update_trigger_flags flags, + s64 *replicas_sectors) { bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -708,7 +710,6 @@ static int __trigger_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? BCH_DATA_btree : BCH_DATA_user; - s64 replicas_sectors = 0; int ret = 0; struct disk_accounting_pos acc_replicas_key = { @@ -739,7 +740,7 @@ static int __trigger_extent(struct btree_trans *trans, if (ret) return ret; } else if (!p.has_ec) { - replicas_sectors += disk_sectors; + *replicas_sectors += disk_sectors; acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -777,7 +778,7 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); if (ret) return ret; } @@ -787,7 +788,7 @@ static int __trigger_extent(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_snapshot, .snapshot.id = k.k->p.snapshot, }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); if (ret) return ret; } @@ -807,7 +808,7 @@ static int __trigger_extent(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_btree, .btree.id = btree_id, }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); if (ret) return ret; } else { @@ -819,22 +820,13 @@ static int __trigger_extent(struct btree_trans *trans, s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), - replicas_sectors, + *replicas_sectors, }; ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); if (ret) return ret; } - if (bch2_bkey_rebalance_opts(k)) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc); - if (ret) - return ret; - } - return 0; } @@ -843,6 +835,7 @@ int bch2_trigger_extent(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; @@ -858,21 +851,53 @@ int bch2_trigger_extent(struct btree_trans *trans, new_ptrs_bytes)) return 0; - if (flags & BTREE_TRIGGER_transactional) { - struct bch_fs *c = trans->c; - int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - - (int) bch2_bkey_needs_rebalance(c, old); + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { + s64 old_replicas_sectors = 0, new_replicas_sectors = 0; + + if (old.k->type) { + int ret = __trigger_extent(trans, btree, level, old, + flags & ~BTREE_TRIGGER_insert, + &old_replicas_sectors); + if (ret) + return ret; + } + + if (new.k->type) { + int ret = __trigger_extent(trans, btree, level, new.s_c, + flags & ~BTREE_TRIGGER_overwrite, + &new_replicas_sectors); + if (ret) + return ret; + } - if (mod) { + int need_rebalance_delta = 0; + s64 need_rebalance_sectors_delta = 0; + + s64 s = bch2_bkey_sectors_need_rebalance(c, old); + need_rebalance_delta -= s != 0; + need_rebalance_sectors_delta -= s; + + s = bch2_bkey_sectors_need_rebalance(c, new.s_c); + need_rebalance_delta += s != 0; + need_rebalance_sectors_delta += s; + + if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, mod > 0); + new.k->p, need_rebalance_delta > 0); if (ret) return ret; } - } - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) - return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags); + if (need_rebalance_sectors_delta) { + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_rebalance_work, + }; + int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, + flags & BTREE_TRIGGER_gc); + if (ret) + return ret; + } + } return 0; } diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index f70eb2127d32..f9fb150eda70 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -107,7 +107,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, nr_elements += t->d[i].journal_seq > flushed_seq; new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); - +realloc: n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; @@ -118,6 +118,8 @@ retry_rehash: if (nr_rehashes_this_size == 3) { new_bits++; nr_rehashes_this_size = 0; + kvfree(n); + goto realloc; } nr_rehashes++; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 6a854c918496..004894ad4147 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -20,6 +20,76 @@ #include "subvolume.h" #include "trace.h" +static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) + bch2_dev_put(bch2_dev_have_ref(c, ptr->dev)); +} + +static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + if (!bch2_dev_tryget(c, ptr->dev)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); + } + return false; + } + } + return true; +} + +static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } +} + +static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + if (ctxt) { + bool locked; + + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || + list_empty(&ctxt->ios)); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + + bucket = PTR_BUCKET_POS(ca, ptr2); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } + return false; + } + } + } + return true; +} + static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) { if (trace_move_extent_finish_enabled()) { @@ -267,6 +337,7 @@ restart_drop_extra_replicas: printbuf_exit(&buf); bch2_fatal_error(c); + ret = -EIO; goto out; } @@ -355,17 +426,11 @@ void bch2_data_update_read_done(struct data_update *m, void bch2_data_update_exit(struct data_update *update) { struct bch_fs *c = update->op.c; - struct bkey_ptrs_c ptrs = - bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - if (c->opts.nocow_enabled) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(ca, ptr), 0); - bch2_dev_put(ca); - } + struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); + bkey_put_dev_refs(c, k); bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); bch2_bio_free_pages_pool(c, &update->op.wbio.bio); @@ -475,6 +540,9 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, bch2_compression_opt_to_text(out, background_compression(*io_opts)); prt_newline(out); + prt_str(out, "opts.replicas:\t"); + prt_u64(out, io_opts->data_replicas); + prt_str(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); } @@ -543,7 +611,6 @@ int bch2_data_update_init(struct btree_trans *trans, const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; - unsigned ptrs_locked = 0; int ret = 0; /* @@ -554,6 +621,15 @@ int bch2_data_update_init(struct btree_trans *trans, if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) return -BCH_ERR_data_update_done; + if (!bkey_get_dev_refs(c, k)) + return -BCH_ERR_data_update_done; + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + bkey_put_dev_refs(c, k); + return -BCH_ERR_nocow_lock_blocked; + } + bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; @@ -575,40 +651,24 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - bkey_for_each_ptr(ptrs, ptr) { - if (!bch2_dev_tryget(c, ptr->dev)) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); - } - return -BCH_ERR_data_update_done; - } - } - unsigned durability_have = 0, durability_removing = 0; i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - bool locked; - - rcu_read_lock(); - if (((1U << i) & m->data_opts.rewrite_ptrs)) { - BUG_ON(p.ptr.cached); - - if (crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; - - m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!p.ptr.cached && - !((1U << i) & m->data_opts.kill_ptrs)) { - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - durability_have += bch2_extent_ptr_durability(c, &p); + if (!p.ptr.cached) { + rcu_read_lock(); + if (BIT(i) & m->data_opts.rewrite_ptrs) { + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); + durability_removing += bch2_extent_ptr_desired_durability(c, &p); + } else if (!(BIT(i) & m->data_opts.kill_ptrs)) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); + } + rcu_read_unlock(); } - rcu_read_unlock(); /* * op->csum_type is normally initialized from the fs/file's @@ -623,24 +683,6 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - if (c->opts.nocow_enabled) { - if (ctxt) { - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - bucket, 0)) || - list_empty(&ctxt->ios)); - - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { - ret = -BCH_ERR_nocow_lock_blocked; - goto err; - } - } - ptrs_locked |= (1U << i); - } - i++; } @@ -654,16 +696,6 @@ int bch2_data_update_init(struct btree_trans *trans, * Increasing replication is an explicit operation triggered by * rereplicate, currently, so that users don't get an unexpected -ENOSPC */ - if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && - !durability_required) { - m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; - m->data_opts.rewrite_ptrs = 0; - /* if iter == NULL, it's just a promote */ - if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); - goto done; - } - m->op.nr_replicas = min(durability_removing, durability_required) + m->data_opts.extra_replicas; @@ -675,48 +707,38 @@ int bch2_data_update_init(struct btree_trans *trans, if (!(durability_have + durability_removing)) m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); - if (!m->op.nr_replicas) { - struct printbuf buf = PRINTBUF; + m->op.nr_replicas_required = m->op.nr_replicas; - bch2_data_update_to_text(&buf, m); - WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_data_update_done; - goto done; + /* + * It might turn out that we don't need any new replicas, if the + * replicas or durability settings have been changed since the extent + * was written: + */ + if (!m->op.nr_replicas) { + m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; + m->data_opts.rewrite_ptrs = 0; + /* if iter == NULL, it's just a promote */ + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); + goto out; } - m->op.nr_replicas_required = m->op.nr_replicas; - if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas ? 0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto err; + goto out; } if (bkey_extent_is_unwritten(k)) { bch2_update_unwritten_extent(trans, m); - goto done; + goto out; } return 0; -err: - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - if ((1U << i) & ptrs_locked) - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - bch2_dev_put(ca); - i++; - } - - bch2_bkey_buf_exit(&m->k, c); - bch2_bio_free_pages_pool(c, &m->op.wbio.bio); - return ret; -done: +out: bch2_data_update_exit(m); return ret ?: -BCH_ERR_data_update_done; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 90962b3c0130..9baf3411a8f9 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -97,7 +97,9 @@ static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe const struct bch_extent_ptr *data_ptr, unsigned sectors) { - return data_ptr->dev == stripe_ptr->dev && + return (data_ptr->dev == stripe_ptr->dev || + data_ptr->dev == BCH_SB_MEMBER_INVALID || + stripe_ptr->dev == BCH_SB_MEMBER_INVALID) && data_ptr->gen == stripe_ptr->gen && data_ptr->offset >= stripe_ptr->offset && data_ptr->offset < stripe_ptr->offset + sectors; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index ab5a7adece10..742dcdd3e5d7 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -257,7 +257,6 @@ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, need_inode_lock) \ x(0, invalid_snapshot_node) \ x(0, option_needs_open_fs) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4419ad3e454e..324303bf4353 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -781,14 +781,17 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, /* * Returns pointer to the next entry after the one being dropped: */ -union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; - union bch_extent_entry *ret = entry; bool drop_crc = true; + if (k.k->type == KEY_TYPE_stripe) { + ptr->dev = BCH_SB_MEMBER_INVALID; + return; + } + EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); @@ -811,21 +814,16 @@ union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, break; if ((extent_entry_is_crc(entry) && drop_crc) || - extent_entry_is_stripe_ptr(entry)) { - ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_is_stripe_ptr(entry)) extent_entry_drop(k, entry); - } } - - return ret; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) { bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; - union bch_extent_entry *ret = - bch2_bkey_drop_ptr_noerror(k, ptr); + + bch2_bkey_drop_ptr_noerror(k, ptr); /* * If we deleted all the dirty pointers and there's still cached @@ -837,14 +835,10 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, !bch2_bkey_dirty_devs(k.s_c).nr) { k.k->type = KEY_TYPE_error; set_bkey_val_u64s(k.k, 0); - ret = NULL; } else if (!bch2_bkey_nr_ptrs(k.s_c)) { k.k->type = KEY_TYPE_deleted; set_bkey_val_u64s(k.k, 0); - ret = NULL; } - - return ret; } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) @@ -929,8 +923,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && + + /* + * This checks that the two pointers point + * to the same region on disk - adjusting + * for the difference in where the extents + * start, since one may have been trimmed: + */ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && + + /* + * This additionally checks that the + * extents overlap on disk, since the + * previous check may trigger spuriously + * when one extent is immediately partially + * overwritten with another extent (so that + * on disk they are adjacent) and + * compression is in use: + */ + ((p1.ptr.offset >= p2.ptr.offset && + p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || + (p2.ptr.offset >= p1.ptr.offset && + p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) return true; return false; @@ -1017,6 +1032,8 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_printf(out, "ptr: %u:%llu:%u gen %u", ptr->dev, b, offset, ptr->gen); + if (ca->mi.durability != 1) + prt_printf(out, " d=%u", ca->mi.durability); if (ptr->cached) prt_str(out, " cached"); if (ptr->unwritten) @@ -1377,6 +1394,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) return r != NULL; } +static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, + unsigned target, unsigned compression) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 sectors = 0; + + if (compression) { + unsigned compression_type = bch2_compression_opt_to_type(compression); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + sectors = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + sectors += p.crc.compressed_size; + } + } +incompressible: + if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target)) + sectors += p.crc.compressed_size; + } + + return sectors; +} + +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + + return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0; +} + int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, struct bch_io_opts *opts) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 1a6ddee48041..42a7c6d820a0 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -649,26 +649,21 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); -union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, - struct bch_extent_ptr *); -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, - struct bch_extent_ptr *); +void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *); +void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); #define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ do { \ - struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ + __label__ _again; \ + struct bkey_ptrs _ptrs; \ +_again: \ + _ptrs = bch2_bkey_ptrs(_k); \ \ - struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \ - \ - while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ + bkey_for_each_ptr(_ptrs, _ptr) \ if (_cond) { \ - _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ - _ptrs = bch2_bkey_ptrs(_k); \ - continue; \ + bch2_bkey_drop_ptr(_k, _ptr); \ + goto _again; \ } \ - \ - (_ptr)++; \ - } \ } while (0) bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, @@ -692,6 +687,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, struct bch_io_opts *); diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index cc33d763f722..ff60c041abe5 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -534,7 +534,7 @@ do_io: if (f_sectors > w->tmp_sectors) { kfree(w->tmp); - w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); w->tmp_sectors = f_sectors; } @@ -659,7 +659,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc int bch2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -728,12 +728,11 @@ out: goto err; } - *pagep = &folio->page; + *foliop = folio; return 0; err: folio_unlock(folio); folio_put(folio); - *pagep = NULL; err_unlock: bch2_pagecache_add_put(inode); kfree(res); @@ -743,12 +742,11 @@ err_unlock: int bch2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation *res = fsdata; - struct folio *folio = page_folio(page); unsigned offset = pos - folio_pos(folio); lockdep_assert_held(&inode->v.i_rwsem); @@ -802,8 +800,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len, - bool inode_locked) + loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -827,15 +824,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); - /* - * If we're not using the inode lock, we need to lock all the folios for - * atomiticity of writes vs. other writes: - */ - if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { - ret = -BCH_ERR_need_inode_lock; - goto out; - } - f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -932,10 +920,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) { - BUG_ON(!inode_locked); + if (end > inode->v.i_size) i_size_write(&inode->v, end); - } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -979,68 +965,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos; - bool inode_locked = false; - ssize_t written = 0, written2 = 0, ret = 0; - - /* - * We don't take the inode lock unless i_size will be changing. Folio - * locks provide exclusion with other writes, and the pagecache add lock - * provides exclusion with truncate and hole punching. - * - * There is one nasty corner case where atomicity would be broken - * without great care: when copying data from userspace to the page - * cache, we do that with faults disable - a page fault would recurse - * back into the filesystem, taking filesystem locks again, and - * deadlock; so it's done with faults disabled, and we fault in the user - * buffer when we aren't holding locks. - * - * If we do part of the write, but we then race and in the userspace - * buffer have been evicted and are no longer resident, then we have to - * drop our folio locks to re-fault them in, breaking write atomicity. - * - * To fix this, we restart the write from the start, if we weren't - * holding the inode lock. - * - * There is another wrinkle after that; if we restart the write from the - * start, and then get an unrecoverable error, we _cannot_ claim to - * userspace that we did not write data we actually did - so we must - * track (written2) the most we ever wrote. - */ - - if ((iocb->ki_flags & IOCB_APPEND) || - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { - inode_lock(&inode->v); - inode_locked = true; - } - - ret = generic_write_checks(iocb, iter); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); - if (ret) { - if (!inode_locked) { - inode_lock(&inode->v); - inode_locked = true; - ret = file_remove_privs_flags(file, 0); - } - if (ret) - goto unlock; - } - - ret = file_update_time(file); - if (ret) - goto unlock; - - pos = iocb->ki_pos; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; bch2_pagecache_add_get(inode); - if (!inode_locked && - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) - goto get_inode_lock; - do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1065,17 +995,12 @@ again: } } - if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) - goto get_inode_lock; - if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); - if (ret == -BCH_ERR_need_inode_lock) - goto get_inode_lock; + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); if (unlikely(ret < 0)) break; @@ -1096,46 +1021,50 @@ again: } pos += ret; written += ret; - written2 = max(written, written2); - - if (ret != bytes && !inode_locked) - goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - - if (0) { -get_inode_lock: - bch2_pagecache_add_put(inode); - inode_lock(&inode->v); - inode_locked = true; - bch2_pagecache_add_get(inode); - - iov_iter_revert(iter, written); - pos -= written; - written = 0; - ret = 0; - } } while (iov_iter_count(iter)); - bch2_pagecache_add_put(inode); -unlock: - if (inode_locked) - inode_unlock(&inode->v); - iocb->ki_pos += written; + bch2_pagecache_add_put(inode); - ret = max(written, written2) ?: ret; - if (ret > 0) - ret = generic_write_sync(iocb, ret); - return ret; + return written ? written : ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { - ssize_t ret = iocb->ki_flags & IOCB_DIRECT - ? bch2_direct_write(iocb, iter) - : bch2_buffered_write(iocb, iter); + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h index a6126ff790e6..3207ebbb4ab4 100644 --- a/fs/bcachefs/fs-io-buffered.h +++ b/fs/bcachefs/fs-io-buffered.h @@ -10,10 +10,10 @@ int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); void bch2_readahead(struct readahead_control *); -int bch2_write_begin(struct file *, struct address_space *, loff_t, - unsigned, struct page **, void **); +int bch2_write_begin(struct file *, struct address_space *, loff_t pos, + unsigned len, struct folio **, void **); int bch2_write_end(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page *, void *); + unsigned len, unsigned copied, struct folio *, void *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index aea8132d2c40..99c7fe987c74 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c, mutex_lock(&c->sb_lock); strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); - mutex_unlock(&c->sb_lock); - ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); mnt_drop_write_file(file); return ret; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 94c392abef65..011817afc3ad 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -177,6 +177,14 @@ static unsigned bch2_inode_hash(subvol_inum inum) return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); } +struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +{ + return to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); +} + static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) { subvol_inum inum = inode_inum(inode); @@ -1644,14 +1652,16 @@ again: break; } } else if (clean_pass && this_pass_clean) { - wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); + prepare_to_wait_event(wq_head, &wqe.wq_entry, + TASK_UNINTERRUPTIBLE); mutex_unlock(&c->vfs_inodes_lock); schedule(); - finish_wait(wq, &wait.wq_entry); + finish_wait(wq_head, &wqe.wq_entry); goto again; } } diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index c3af7225ff69..990ec43e0365 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -56,6 +56,8 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) }; } +struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); + /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: @@ -194,6 +196,11 @@ int bch2_vfs_init(void); #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) +static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +{ + return NULL; +} + static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9138944c5ae6..9b3470a97546 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -8,6 +8,7 @@ #include "darray.h" #include "dirent.h" #include "error.h" +#include "fs.h" #include "fs-common.h" #include "fsck.h" #include "inode.h" @@ -962,6 +963,22 @@ fsck_err: return ret; } +static bool bch2_inode_open(struct bch_fs *c, struct bpos p) +{ + subvol_inum inum = { + .subvol = snapshot_t(c, p.snapshot)->subvol, + .inum = p.offset, + }; + + /* snapshot tree corruption, can't safely delete */ + if (!inum.subvol) { + bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot); + return true; + } + + return __bch2_inode_hash_find(c, inum) != NULL; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1040,6 +1057,7 @@ static int check_inode(struct btree_trans *trans, } if (u.bi_flags & BCH_INODE_unlinked && + !bch2_inode_open(c, k.k->p) && (!c->sb.clean || fsck_err(trans, inode_unlinked_but_clean, "filesystem marked clean, but inode %llu unlinked", @@ -2006,7 +2024,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret) { bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); ret = -BCH_ERR_fsck_repair_unimplemented; - ret = 0; goto err; } @@ -2216,6 +2233,8 @@ int bch2_check_xattrs(struct bch_fs *c) NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_xattr(trans, &iter, k, &hash_info, &inode))); + + inode_walker_exit(&inode); bch_err_fn(c, ret); return ret; } @@ -2469,8 +2488,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino : bch2_inode_unpack(inode_k, &inode); if (ret) { /* Should have been caught in dirents pass */ - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error looking up parent directory: %i", ret); + bch_err_msg(c, ret, "error looking up parent directory"); break; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 649e3a01608a..f5f7db50ca31 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) } if (!had_entries) - j->last_empty_seq = cur_seq; + j->last_empty_seq = cur_seq - 1; /* to match j->seq */ spin_lock(&j->lock); diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index db80e506e3ab..62b910f2fb27 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); int ret = -BCH_ERR_invalid_sb_journal; + u64 sum = 0; unsigned nr; unsigned i; struct u64_range *b; @@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f for (i = 0; i < nr; i++) { b[i].start = le64_to_cpu(journal->d[i].start); b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + + if (b[i].end <= b[i].start) { + prt_printf(err, "journal buckets entry with bad nr: %llu+%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].nr)); + goto err; + } + + sum += le64_to_cpu(journal->d[i].nr); } sort(b, nr, sizeof(*b), u64_range_cmp, NULL); @@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f } } + if (sum > UINT_MAX) { + prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX); + goto err; + } + ret = 0; err: kfree(b); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index deef4f024d20..d86565bf07c8 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg) if (min_member_capacity == U64_MAX) min_member_capacity = 128 * 2048; - bch2_trans_unlock_long(ctxt.trans); + move_buckets_wait(&ctxt, buckets, true); bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), MAX_SCHEDULE_TIMEOUT); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d89eb43c5ce9..36de1c6fe8c3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) const struct journal_key *l = *((const struct journal_key **)_l); const struct journal_key *r = *((const struct journal_key **)_r); - return cmp_int(l->journal_seq, r->journal_seq); + /* + * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last + * + * journal_seq == 0 means that the key comes from early repair, and + * should be inserted last so as to avoid overflowing the journal + */ + return cmp_int(l->journal_seq - 1, r->journal_seq - 1); } int bch2_journal_replay(struct bch_fs *c) @@ -322,6 +328,7 @@ int bch2_journal_replay(struct bch_fs *c) } } + bch2_trans_unlock_long(trans); /* * Now, replay any remaining keys in the order in which they appear in * the journal, unpinning those journal entries as we go: diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 1223b710755d..1f34c92a6d11 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -82,7 +82,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, } for (unsigned i = 0; i < r->nr_devs; i++) - if (!bch2_member_exists(sb, r->devs[i])) { + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_member_exists(sb, r->devs[i])) { prt_printf(err, "invalid device %u in entry ", r->devs[i]); goto bad; } @@ -451,7 +452,8 @@ retry: .type = BCH_DISK_ACCOUNTING_replicas, }; - memcpy(&k.replicas, e, replicas_entry_bytes(e)); + unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), + "embedded variable length struct"); struct bpos p = disk_accounting_pos_to_bpos(&k); @@ -794,7 +796,7 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, nr_online += test_bit(e->devs[i], devs.d); struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]); - nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed; + nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; } rcu_read_unlock(); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 650a1f77ca40..c7e4cdd3f6a5 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -75,6 +75,9 @@ BCH_FSCK_ERR_accounting_key_junk_at_end) \ x(disk_accounting_inum, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) \ + x(rebalance_work_acct_fix, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch) #define DOWNGRADE_TABLE() \ @@ -108,7 +111,10 @@ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_replicas_wrong, \ BCH_FSCK_ERR_accounting_replicas_not_marked, \ - BCH_FSCK_ERR_bkey_version_in_future) + BCH_FSCK_ERR_bkey_version_in_future) \ + x(rebalance_work_acct_fix, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) struct upgrade_downgrade_entry { u64 recovery_passes; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d3a498617303..f0c14702f9e6 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -23,7 +23,7 @@ enum bch_fsck_flags { x(jset_past_bucket_end, 9, 0) \ x(jset_seq_blacklisted, 10, 0) \ x(journal_entries_missing, 11, 0) \ - x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ x(journal_entry_past_jset_end, 13, 0) \ x(journal_entry_replicas_data_mismatch, 14, 0) \ x(journal_entry_bkey_u64s_0, 15, 0) \ @@ -288,10 +288,10 @@ enum bch_fsck_flags { x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ - x(accounting_key_junk_at_end, 277, 0) \ - x(accounting_key_replicas_nr_devs_0, 278, 0) \ - x(accounting_key_replicas_nr_required_bad, 279, 0) \ - x(accounting_key_replicas_devs_unsorted, 280, 0) \ + x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ + x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 39196f2a4197..4b765422dd77 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -11,7 +11,8 @@ void bch2_dev_missing(struct bch_fs *c, unsigned dev) { - bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); + if (dev != BCH_SB_MEMBER_INVALID) + bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); } void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index e2630548c0f6..d727d2dfda08 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -8,6 +8,11 @@ */ #define BCH_SB_MEMBERS_MAX 64 +/* + * Sentinal value - indicates a device that does not exist + */ +#define BCH_SB_MEMBER_INVALID 255 + #define BCH_MIN_NR_NBUCKETS (1 << 6) #define BCH_IOPS_MEASUREMENTS() \ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index f393023a3ae2..33f2a64c14c9 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -461,7 +461,7 @@ STORE(bch2_fs) sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); + c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); } if (attr == &sysfs_trigger_gc) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 138320eaa2ad..1b8554460af4 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, TABSTOP_SIZE + 2); prt_printf(out, "\tsince mount\r\trecent\r\n"); - prt_printf(out, "recent"); printbuf_tabstops_reset(out); printbuf_tabstop_push(out, out->indent + 20); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f2b4c17a0307..331f944d73dc 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -612,10 +612,20 @@ static int bch2_xattr_bcachefs_get_effective( name, buffer, size, true); } +/* Noop - xattrs in the bcachefs_effective namespace are inherited */ +static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + return 0; +} + static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { .prefix = "bcachefs_effective.", .get = bch2_xattr_bcachefs_get_effective, - .set = bch2_xattr_bcachefs_set, + .set = bch2_xattr_bcachefs_set_effective, }; #endif /* NO_BCACHEFS_FS */ diff --git a/fs/bfs/file.c b/fs/bfs/file.c index a778411574a9..fa66a09e496a 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -172,11 +172,11 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) static int bfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, bfs_get_block); + ret = block_write_begin(mapping, pos, len, foliop, bfs_get_block); if (unlikely(ret)) bfs_write_failed(mapping, pos + len); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 28a3439f163a..4fe5bb9f1b1f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -589,6 +589,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, if (bprm->have_execfd) nitems++; +#ifdef ELF_HWCAP2 + nitems++; +#endif csp = sp; sp -= nitems * 2 * sizeof(unsigned long); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f04d93109960..b4e31ae17cd9 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -668,7 +668,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = bbio->fs_info; - struct btrfs_bio *orig_bbio = bbio; struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; @@ -706,7 +705,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) - goto fail_put_bio; + goto fail; } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { @@ -740,13 +739,13 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) - goto fail_put_bio; + goto fail; } else if (use_append || (btrfs_is_zoned(fs_info) && inode && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); if (ret) - goto fail_put_bio; + goto fail; } } @@ -754,12 +753,23 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) done: return map_length == length; -fail_put_bio: - if (map_length < length) - btrfs_cleanup_bio(bbio); fail: btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(orig_bbio, ret); + /* + * We have split the original bbio, now we have to end both the current + * @bbio and remaining one, as the remaining one will never be submitted. + */ + if (map_length < length) { + struct btrfs_bio *remaining = bbio->private; + + ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); + ASSERT(remaining); + + remaining->bio.bi_status = ret; + btrfs_orig_bbio_end_io(remaining); + } + bbio->bio.bi_status = ret; + btrfs_orig_bbio_end_io(bbio); /* Do not submit another chunk */ return true; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 75fa563e4cac..c8568b1a61c4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -459,7 +459,6 @@ struct btrfs_file_private { void *filldir_buf; u64 last_index; struct extent_state *llseek_cached_state; - bool fsync_skip_inode_lock; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 67adbe9d294a..364bce34f034 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -864,13 +864,6 @@ again: if (IS_ERR_OR_NULL(dio)) { ret = PTR_ERR_OR_ZERO(dio); } else { - struct btrfs_file_private stack_private = { 0 }; - struct btrfs_file_private *private; - const bool have_private = (file->private_data != NULL); - - if (!have_private) - file->private_data = &stack_private; - /* * If we have a synchronous write, we must make sure the fsync * triggered by the iomap_dio_complete() call below doesn't @@ -879,13 +872,10 @@ again: * partial writes due to the input buffer (or parts of it) not * being already faulted in. */ - private = file->private_data; - private->fsync_skip_inode_lock = true; + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB; ret = iomap_dio_complete(dio); - private->fsync_skip_inode_lock = false; - - if (!have_private) - file->private_data = NULL; + current->journal_info = NULL; } /* No increment (+=) because iomap returns a cumulative value. */ diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 8f95f3e44e99..df7f09f3b02e 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -637,7 +637,7 @@ static int extent_fiemap(struct btrfs_inode *inode, struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; - u64 last_extent_end; + u64 last_extent_end = 0; u64 prev_extent_end; u64 range_start; u64 range_end; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9914419f3b7d..2aeb8116549c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1603,7 +1603,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct btrfs_file_private *private = file->private_data; struct dentry *dentry = file_dentry(file); struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; @@ -1613,7 +1612,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; u64 len; bool full_sync; - const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false); + bool skip_ilock = false; + + if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { + skip_ilock = true; + current->journal_info = NULL; + lockdep_assert_held(&inode->vfs_inode.i_rwsem); + } trace_btrfs_sync_file(file, datasync); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5d57a285d59b..feb8f9f2f358 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4185,6 +4185,8 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; @@ -4344,10 +4346,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { - extent_changeset_init(&changeset); return clear_record_extent_bits(&inode->io_tree, start, start + len - 1, - EXTENT_QGROUP_RESERVED, &changeset); + EXTENT_QGROUP_RESERVED, NULL); } /* In release case, we shouldn't have @reserved */ diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 68e14fd48638..c691784b4660 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1985,8 +1985,8 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, int raid) +static void do_reclaim_sweep(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; @@ -2031,7 +2031,6 @@ again: } up_read(&space_info->groups_sem); - return 0; } void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) @@ -2074,21 +2073,15 @@ bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) return ret; } -int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) +void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) { - int ret; int raid; struct btrfs_space_info *space_info; list_for_each_entry(space_info, &fs_info->space_info, list) { if (!btrfs_should_periodic_reclaim(space_info)) continue; - for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { - ret = do_reclaim_sweep(fs_info, space_info, raid); - if (ret) - return ret; - } + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) + do_reclaim_sweep(fs_info, space_info, raid); } - - return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 88b44221ce97..5602026c5e14 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -294,6 +294,6 @@ void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s6 void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info); -int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); +void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 98c03ddc760b..dd9ce9b9f69e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -27,6 +27,12 @@ struct btrfs_root_item; struct btrfs_root; struct btrfs_path; +/* + * Signal that a direct IO write is in progress, to avoid deadlock for sync + * direct IO writes when fsync is called during the direct IO write path. + */ +#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) + /* Radix-tree tag for roots that are part of the trasaction. */ #define BTRFS_ROOT_TRANS_TAG 0 diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 66f63e82af79..047e3337852e 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1406,6 +1406,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, return -EINVAL; } + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + if (zone_info[0].alloc_offset == WP_MISSING_DEV) { btrfs_err(bg->fs_info, "zoned: cannot recover write pointer for zone %llu", @@ -1432,7 +1434,6 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, } bg->alloc_offset = zone_info[0].alloc_offset; - bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity); return 0; } @@ -1450,6 +1451,9 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, return -EINVAL; } + /* In case a device is missing we have a cap of 0, so don't use it. */ + bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + for (i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV || zone_info[i].alloc_offset == WP_CONVENTIONAL) @@ -1471,9 +1475,6 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, if (test_bit(0, active)) set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } - /* In case a device is missing we have a cap of 0, so don't use it. */ - bg->zone_capacity = min_not_zero(zone_info[0].capacity, - zone_info[1].capacity); } if (zone_info[0].alloc_offset != WP_MISSING_DEV) @@ -1563,6 +1564,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; + u64 profile; if (!btrfs_is_zoned(fs_info)) return 0; @@ -1623,7 +1625,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } } - switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + switch (profile) { case 0: /* single */ ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; @@ -1650,6 +1653,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && + profile != BTRFS_BLOCK_GROUP_RAID10) { + /* + * Detected broken write pointer. Make this block group + * unallocatable by setting the allocation pointer at the end of + * allocatable region. Relocating this block group will fix the + * mismatch. + * + * Currently, we cannot handle RAID0 or RAID10 case like this + * because we don't have a proper zone_capacity value. But, + * reading from this block group won't work anyway by a missing + * stripe. + */ + cache->alloc_offset = cache->zone_capacity; + ret = 0; + } + out: /* Reject non SINGLE data profiles without RST */ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && diff --git a/fs/buffer.c b/fs/buffer.c index e55ad471c530..1fc9a50def0b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -774,12 +774,11 @@ EXPORT_SYMBOL(block_dirty_folio); static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; - struct list_head tmp; struct address_space *mapping; int err = 0, err2; struct blk_plug plug; + LIST_HEAD(tmp); - INIT_LIST_HEAD(&tmp); blk_start_plug(&plug); spin_lock(lock); @@ -958,12 +957,9 @@ no_grow: } EXPORT_SYMBOL_GPL(folio_alloc_buffers); -struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - bool retry) +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size) { gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; - if (retry) - gfp |= __GFP_NOFAIL; return folio_alloc_buffers(page_folio(page), size, gfp); } @@ -2168,11 +2164,10 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, return err; } -int __block_write_begin(struct page *page, loff_t pos, unsigned len, +int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { - return __block_write_begin_int(page_folio(page), pos, len, get_block, - NULL); + return __block_write_begin_int(folio, pos, len, get_block, NULL); } EXPORT_SYMBOL(__block_write_begin); @@ -2222,33 +2217,33 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to) * The filesystem needs to handle block truncation upon failure. */ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, get_block_t *get_block) + struct folio **foliop, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; + struct folio *folio; int status; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - status = __block_write_begin(page, pos, len, get_block); + status = __block_write_begin_int(folio, pos, len, get_block, NULL); if (unlikely(status)) { - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - *pagep = page; + *foliop = folio; return status; } EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { @@ -2280,19 +2275,19 @@ EXPORT_SYMBOL(block_write_end); int generic_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool i_size_changed = false; - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * No need to use i_size_read() here, the i_size cannot change under us * because we hold i_rwsem. * - * But it's important to update i_size while still holding page lock: + * But it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. */ if (pos + copied > inode->i_size) { @@ -2300,8 +2295,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, i_size_changed = true; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); @@ -2467,7 +2462,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; - struct page *page; + struct folio *folio; void *fsdata = NULL; int err; @@ -2475,11 +2470,11 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) if (err) goto out; - err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata); + err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata); if (err) goto out; - err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata); + err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata); BUG_ON(err > 0); out: @@ -2493,7 +2488,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); - struct page *page; + struct folio *folio; void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; @@ -2512,12 +2507,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, len = PAGE_SIZE - zerofrom; err = aops->write_begin(file, mapping, curpos, len, - &page, &fsdata); + &folio, &fsdata); if (err) goto out; - zero_user(page, zerofrom, len); + folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(file, mapping, curpos, len, len, - page, fsdata); + folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); @@ -2545,12 +2540,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, len = offset - zerofrom; err = aops->write_begin(file, mapping, curpos, len, - &page, &fsdata); + &folio, &fsdata); if (err) goto out; - zero_user(page, zerofrom, len); + folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(file, mapping, curpos, len, len, - page, fsdata); + folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); @@ -2566,7 +2561,7 @@ out: */ int cont_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata, + struct folio **foliop, void **fsdata, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; @@ -2584,7 +2579,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping, (*bytes)++; } - return block_write_begin(mapping, pos, len, pagep, get_block); + return block_write_begin(mapping, pos, len, foliop, get_block); } EXPORT_SYMBOL(cont_write_begin); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c4744a02db75..a5f848c167fa 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1508,20 +1508,18 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned */ static int ceph_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct folio *folio = NULL; int r; - r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); + r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, foliop, NULL); if (r < 0) return r; - folio_wait_private_2(folio); /* [DEPRECATED] */ - WARN_ON_ONCE(!folio_test_locked(folio)); - *pagep = &folio->page; + folio_wait_private_2(*foliop); /* [DEPRECATED] */ + WARN_ON_ONCE(!folio_test_locked(*foliop)); return 0; } @@ -1531,9 +1529,8 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *subpage, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(subpage); struct inode *inode = file_inode(file); struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 71cd70514efa..4a8eec46254b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -695,6 +695,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6898dc621011..6896fce122e1 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -119,31 +119,43 @@ static const struct fs_parameter_spec coda_param_specs[] = { {} }; -static int coda_parse_fd(struct fs_context *fc, int fd) +static int coda_set_idx(struct fs_context *fc, struct file *file) { struct coda_fs_context *ctx = fc->fs_private; - struct fd f; struct inode *inode; int idx; - f = fdget(fd); - if (!f.file) - return -EBADF; - inode = file_inode(f.file); + inode = file_inode(file); if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { - fdput(f); - return invalf(fc, "code: Not coda psdev"); + return invalf(fc, "coda: Not coda psdev"); } - idx = iminor(inode); - fdput(f); - if (idx < 0 || idx >= MAX_CODADEVS) return invalf(fc, "coda: Bad minor number"); ctx->idx = idx; return 0; } +static int coda_parse_fd(struct fs_context *fc, struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct file *file; + int err; + + if (param->type == fs_value_is_file) { + file = param->file; + param->file = NULL; + } else { + file = fget(result->uint_32); + } + if (!file) + return -EBADF; + + err = coda_set_idx(fc, file); + fput(file); + return err; +} + static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; @@ -155,7 +167,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (opt) { case Opt_fd: - return coda_parse_fd(fc, result.uint_32); + return coda_parse_fd(fc, param, &result); } return 0; @@ -167,6 +179,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) */ static int coda_parse_monolithic(struct fs_context *fc, void *_data) { + struct file *file; struct coda_mount_data *data = _data; if (!data) @@ -175,7 +188,11 @@ static int coda_parse_monolithic(struct fs_context *fc, void *_data) if (data->version != CODA_MOUNT_VERSION) return invalf(fc, "coda: Bad mount version"); - coda_parse_fd(fc, data->fd); + file = fget(data->fd); + if (file) { + coda_set_idx(fc, file); + fput(file); + } return 0; } diff --git a/fs/dcache.c b/fs/dcache.c index 3d8daaecb6d1..0f6b16ba30d0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -96,11 +96,16 @@ EXPORT_SYMBOL(dotdot_name); * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. + * + * Marking the variables "used" ensures that the compiler doesn't + * optimize them away completely on architectures with runtime + * constant infrastructure, this allows debuggers to see their + * values. But updating these values has no effect on those arches. */ -static unsigned int d_hash_shift __ro_after_init; +static unsigned int d_hash_shift __ro_after_init __used; -static struct hlist_bl_head *dentry_hashtable __ro_after_init; +static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { @@ -1908,8 +1913,13 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); + inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); @@ -2163,9 +2173,6 @@ seqretry: * without taking d_lock and checking d_seq sequence count against @seq * returned here. * - * A refcount may be taken on the found dentry with the d_rcu_to_refcount - * function. - * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 91521576f500..66d9b3b4c588 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -89,12 +89,14 @@ enum { Opt_uid, Opt_gid, Opt_mode, + Opt_source, }; static const struct fs_parameter_spec debugfs_param_specs[] = { fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), fsparam_uid ("uid", Opt_uid), + fsparam_string ("source", Opt_source), {} }; @@ -126,6 +128,12 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; break; + case Opt_source: + if (fc->source) + return invalfc(fc, "Multiple sources specified"); + fc->source = param->string; + param->string = NULL; + break; /* * We might like to report bad mount options here; * but traditionally debugfs has ignored all mount options diff --git a/fs/direct-io.c b/fs/direct-io.c index b0aafe640fa4..bbd05f1a2145 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,7 +37,6 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <linux/atomic.h> -#include <linux/prefetch.h> #include "internal.h" @@ -1121,11 +1120,6 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); - /* - * Avoid references to bdev if not absolutely needed to give - * the early prefetch in the caller enough time. - */ - /* watch out for a 0 len io from a tricksy fs */ if (iov_iter_rw(iter) == READ && !count) return 0; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index e2483acc4366..287e5d407f08 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -234,17 +234,17 @@ out: /* * Called with lower inode mutex held. */ -static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) +static int fill_zeros_to_end_of_page(struct folio *folio, unsigned int to) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int end_byte_in_page; - if ((i_size_read(inode) / PAGE_SIZE) != page->index) + if ((i_size_read(inode) / PAGE_SIZE) != folio->index) goto out; end_byte_in_page = i_size_read(inode) % PAGE_SIZE; if (to > end_byte_in_page) end_byte_in_page = to; - zero_user_segment(page, end_byte_in_page, PAGE_SIZE); + folio_zero_segment(folio, end_byte_in_page, PAGE_SIZE); out: return 0; } @@ -255,7 +255,7 @@ out: * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write - * @pagep: Pointer to return the page + * @foliop: Pointer to return the folio * @fsdata: Pointer to return fs data (unused) * * This function must zero any hole we create @@ -265,38 +265,39 @@ out: static int ecryptfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; + struct folio *folio; loff_t prev_page_end_size; int rc = 0; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *foliop = folio; prev_page_end_size = ((loff_t)index << PAGE_SHIFT); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(mapping->host)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_read_lower_page_segment( - page, index, 0, PAGE_SIZE, mapping->host); + &folio->page, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error attempting to read " "lower page segment; rc = [%d]\n", __func__, rc); - ClearPageUptodate(page); + folio_clear_uptodate(folio); goto out; } else - SetPageUptodate(page); + folio_mark_uptodate(folio); } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { rc = ecryptfs_copy_up_encrypted_with_header( - page, crypt_stat); + &folio->page, crypt_stat); if (rc) { printk(KERN_ERR "%s: Error attempting " "to copy the encrypted content " @@ -304,46 +305,46 @@ static int ecryptfs_write_begin(struct file *file, "inserting the metadata from " "the xattr into the header; rc " "= [%d]\n", __func__, rc); - ClearPageUptodate(page); + folio_clear_uptodate(folio); goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } else { rc = ecryptfs_read_lower_page_segment( - page, index, 0, PAGE_SIZE, + &folio->page, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error reading " "page; rc = [%d]\n", __func__, rc); - ClearPageUptodate(page); + folio_clear_uptodate(folio); goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } } else { if (prev_page_end_size - >= i_size_read(page->mapping->host)) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); + >= i_size_read(mapping->host)) { + folio_zero_range(folio, 0, PAGE_SIZE); + folio_mark_uptodate(folio); } else if (len < PAGE_SIZE) { - rc = ecryptfs_decrypt_page(page); + rc = ecryptfs_decrypt_page(&folio->page); if (rc) { printk(KERN_ERR "%s: Error decrypting " "page at index [%ld]; " "rc = [%d]\n", - __func__, page->index, rc); - ClearPageUptodate(page); + __func__, folio->index, rc); + folio_clear_uptodate(folio); goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } } } /* If creating a page or more of holes, zero them out via truncate. * Note, this will increase i_size. */ if (index != 0) { - if (prev_page_end_size > i_size_read(page->mapping->host)) { + if (prev_page_end_size > i_size_read(mapping->host)) { rc = ecryptfs_truncate(file->f_path.dentry, prev_page_end_size); if (rc) { @@ -359,12 +360,11 @@ static int ecryptfs_write_begin(struct file *file, * of page? Zero it out. */ if ((i_size_read(mapping->host) == prev_page_end_size) && (pos != 0)) - zero_user(page, 0, PAGE_SIZE); + folio_zero_range(folio, 0, PAGE_SIZE); out: if (unlikely(rc)) { - unlock_page(page); - put_page(page); - *pagep = NULL; + folio_unlock(folio); + folio_put(folio); } return rc; } @@ -457,13 +457,13 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) * @pos: The file position * @len: The length of the data (unused) * @copied: The amount of data copied - * @page: The eCryptfs page + * @folio: The eCryptfs folio * @fsdata: The fsdata (unused) */ static int ecryptfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { pgoff_t index = pos >> PAGE_SHIFT; unsigned from = pos & (PAGE_SIZE - 1); @@ -476,8 +476,8 @@ static int ecryptfs_write_end(struct file *file, ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, - to); + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, + &folio->page, 0, to); if (!rc) { rc = copied; fsstack_copy_inode_size(ecryptfs_inode, @@ -485,21 +485,21 @@ static int ecryptfs_write_end(struct file *file, } goto out; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (copied < PAGE_SIZE) { rc = 0; goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* Fills in zeros if 'to' goes beyond inode size */ - rc = fill_zeros_to_end_of_page(page, to); + rc = fill_zeros_to_end_of_page(folio, to); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " "zeros in page with index = [0x%.16lx]\n", index); goto out; } - rc = ecryptfs_encrypt_page(page); + rc = ecryptfs_encrypt_page(&folio->page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " "index [0x%.16lx])\n", index); @@ -518,8 +518,8 @@ static int ecryptfs_write_end(struct file *file, else rc = copied; out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return rc; } diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2193a6710c8f..c3b90abdee37 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -8,19 +8,15 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, - unsigned int nameoff, unsigned int maxsize) + unsigned int nameoff0, unsigned int maxsize) { - const struct erofs_dirent *end = dentry_blk + nameoff; + const struct erofs_dirent *end = dentry_blk + nameoff0; while (de < end) { - const char *de_name; + unsigned char d_type = fs_ftype_to_dtype(de->file_type); + unsigned int nameoff = le16_to_cpu(de->nameoff); + const char *de_name = (char *)dentry_blk + nameoff; unsigned int de_namelen; - unsigned char d_type; - - d_type = fs_ftype_to_dtype(de->file_type); - - nameoff = le16_to_cpu(de->nameoff); - de_name = (char *)dentry_blk + nameoff; /* the last dirent in the block? */ if (de + 1 >= end) @@ -52,21 +48,20 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = dir->i_sb; unsigned long bsz = sb->s_blocksize; - const size_t dirsize = i_size_read(dir); - unsigned int i = erofs_blknr(sb, ctx->pos); unsigned int ofs = erofs_blkoff(sb, ctx->pos); int err = 0; bool initial = true; buf.mapping = dir->i_mapping; - while (ctx->pos < dirsize) { + while (ctx->pos < dir->i_size) { + erofs_off_t dbstart = ctx->pos - ofs; struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP); + de = erofs_bread(&buf, dbstart, EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", - i, EROFS_I(dir)->nid); + erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; } @@ -79,25 +74,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) break; } - maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz); - + maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz); /* search dirents at the arbitrary position */ if (initial) { initial = false; - ofs = roundup(ofs, sizeof(struct erofs_dirent)); - ctx->pos = erofs_pos(sb, i) + ofs; - if (ofs >= nameoff) - goto skip_this; + ctx->pos = dbstart + ofs; } err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); if (err) break; -skip_this: - ctx->pos = erofs_pos(sb, i) + maxsize; - ++i; + ctx->pos = dbstart + maxsize; ofs = 0; } erofs_put_metabuf(&buf); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 43c09aae2afc..419432be3223 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -257,25 +257,23 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } + mapping_set_large_folios(inode->i_mapping); if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, erofs_info, inode->i_sb, "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); inode->i_mapping->a_ops = &z_erofs_aops; - err = 0; - goto out_unlock; -#endif +#else err = -EOPNOTSUPP; - goto out_unlock; - } - inode->i_mapping->a_ops = &erofs_raw_access_aops; - mapping_set_large_folios(inode->i_mapping); +#endif + } else { + inode->i_mapping->a_ops = &erofs_raw_access_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND - if (erofs_is_fscache_mode(inode->i_sb)) - inode->i_mapping->a_ops = &erofs_fscache_access_aops; + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; #endif - + } out_unlock: erofs_put_metabuf(&buf); return err; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 736607675396..45dc15ebd870 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -220,7 +220,7 @@ struct erofs_buf { }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) +#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits)) #define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 32ce5b35e1df..6cb5c8916174 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -108,22 +108,6 @@ static void erofs_free_inode(struct inode *inode) kmem_cache_free(erofs_inode_cachep, vi); } -static bool check_layout_compatibility(struct super_block *sb, - struct erofs_super_block *dsb) -{ - const unsigned int feature = le32_to_cpu(dsb->feature_incompat); - - EROFS_SB(sb)->feature_incompat = feature; - - /* check if current kernel meets all mandatory requirements */ - if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", - feature & ~EROFS_ALL_FEATURE_INCOMPAT); - return false; - } - return true; -} - /* read variable-sized metadata, offset will be aligned by 4-byte */ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) @@ -279,7 +263,7 @@ static int erofs_scan_devices(struct super_block *sb, static int erofs_read_superblock(struct super_block *sb) { - struct erofs_sb_info *sbi; + struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; void *data; @@ -291,9 +275,7 @@ static int erofs_read_superblock(struct super_block *sb) return PTR_ERR(data); } - sbi = EROFS_SB(sb); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); - ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { erofs_err(sb, "cannot find valid erofs superblock"); @@ -318,8 +300,12 @@ static int erofs_read_superblock(struct super_block *sb) } ret = -EINVAL; - if (!check_layout_compatibility(sb, dsb)) + sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat); + if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) { + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", + sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT); goto out; + } sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 9b53883e5caf..37afe2024840 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -111,7 +111,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) out: if (i < z_erofs_gbuf_count && tmp_pages) { for (j = 0; j < nrpages; ++j) - if (tmp_pages[j] && tmp_pages[j] != gbuf->pages[j]) + if (tmp_pages[j] && (j >= gbuf->nrpages || + tmp_pages[j] != gbuf->pages[j])) __free_page(tmp_pages[j]); kfree(tmp_pages); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f53ca4f7fced..145f5349c612 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -420,7 +420,7 @@ static bool busy_loop_ep_timeout(unsigned long start_time, static bool ep_busy_loop_on(struct eventpoll *ep) { - return !!ep->busy_poll_usecs || net_busy_loop_on(); + return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on(); } static bool ep_busy_loop_end(void *p, unsigned long start_time) @@ -2200,11 +2200,6 @@ static int do_epoll_create(int flags) error = PTR_ERR(file); goto out_free_fd; } -#ifdef CONFIG_NET_RX_BUSY_POLL - ep->busy_poll_usecs = 0; - ep->busy_poll_budget = 0; - ep->prefer_busy_poll = false; -#endif ep->file = file; fd_install(fd, file); return fd; diff --git a/fs/exec.c b/fs/exec.c index 50e76cc633c4..caae051c5a95 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -145,13 +145,11 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) goto out; /* - * may_open() has already checked for this, so it should be - * impossible to trip now. But we need to be extra cautious - * and check again at the very end too. + * Check do_open_execat() for an explanation. */ error = -EACCES; - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || - path_noexec(&file->f_path))) + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || + path_noexec(&file->f_path)) goto exit; error = -ENOEXEC; @@ -954,7 +952,6 @@ EXPORT_SYMBOL(transfer_args_to_stack); static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; - int err; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, @@ -971,24 +968,20 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) - goto out; + return file; /* - * may_open() has already checked for this, so it should be - * impossible to trip now. But we need to be extra cautious - * and check again at the very end too. + * In the past the regular type check was here. It moved to may_open() in + * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is + * an invariant that all non-regular files error out before we get here. */ - err = -EACCES; - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || - path_noexec(&file->f_path))) - goto exit; + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || + path_noexec(&file->f_path)) { + fput(file); + return ERR_PTR(-EACCES); + } -out: return file; - -exit: - fput(file); - return ERR_PTR(err); } /** diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 64c31867bc76..e19469e88000 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -535,20 +535,20 @@ static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end) while (start < end) { u32 zerofrom, len; - struct page *page = NULL; + struct folio *folio; zerofrom = start & (PAGE_SIZE - 1); len = PAGE_SIZE - zerofrom; if (start + len > end) len = end - start; - err = ops->write_begin(file, mapping, start, len, &page, NULL); + err = ops->write_begin(file, mapping, start, len, &folio, NULL); if (err) goto out; - zero_user_segment(page, zerofrom, zerofrom + len); + folio_zero_range(folio, offset_in_folio(folio, start), len); - err = ops->write_end(file, mapping, start, len, len, page, NULL); + err = ops->write_end(file, mapping, start, len, len, folio, NULL); if (err < 0) goto out; start += len; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index dd894e558c91..05f0e07b01d0 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -448,12 +448,11 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) static int exfat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = block_write_begin(mapping, pos, len, pagep, exfat_get_block); + ret = block_write_begin(mapping, pos, len, foliop, exfat_get_block); if (ret < 0) exfat_write_failed(mapping, pos+len); @@ -463,13 +462,13 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping, static int exfat_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *pagep, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); int err; - err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ei->i_size_aligned < i_size_read(inode)) { exfat_fs_error(inode->i_sb, diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 6622c582f550..402fecf90a44 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); @@ -434,7 +434,7 @@ int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino) static int ext2_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(&folio->page, pos, len, ext2_get_block); + return __block_write_begin(folio, pos, len, ext2_get_block); } static int ext2_handle_dirsync(struct inode *dir) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0caa1650cee8..30f8201c155f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -916,11 +916,11 @@ static void ext2_readahead(struct readahead_control *rac) static int ext2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, ext2_get_block); + ret = block_write_begin(mapping, pos, len, foliop, ext2_get_block); if (ret < 0) ext2_write_failed(mapping, pos + len); return ret; @@ -928,11 +928,11 @@ ext2_write_begin(struct file *file, struct address_space *mapping, static int ext2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ret < len) ext2_write_failed(mapping, pos + len); return ret; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d62a4b9b26ce..ecc15e5f1eba 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3565,13 +3565,13 @@ int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep); + struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep, + struct folio **foliop, void **fsdata); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 4282e12dc405..edf4aa99a974 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -601,10 +601,10 @@ retry: goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(&folio->page, from, to, + ret = __block_write_begin(folio, from, to, ext4_get_block_unwritten); } else - ret = __block_write_begin(&folio->page, from, to, ext4_get_block); + ret = __block_write_begin(folio, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -660,7 +660,7 @@ out_nofolio: int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep) + struct folio **foliop) { int ret; handle_t *handle; @@ -708,7 +708,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, goto out; } - *pagep = &folio->page; + *foliop = folio; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; @@ -856,7 +856,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - ret = __block_write_begin(&folio->page, 0, inline_size, + ret = __block_write_begin(folio, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); @@ -891,7 +891,7 @@ out: int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep, + struct folio **foliop, void **fsdata) { int ret; @@ -954,7 +954,7 @@ retry_journal: goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); - *pagep = &folio->page; + *foliop = folio; brelse(iloc.bh); return 1; out_release_page: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 941c1c0d5c6e..03374dc215d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1145,7 +1145,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; @@ -1170,7 +1170,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, - pagep); + foliop); if (ret < 0) return ret; if (ret == 1) @@ -1224,10 +1224,10 @@ retry_journal: ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(&folio->page, pos, len, + ret = __block_write_begin(folio, pos, len, ext4_get_block_unwritten); else - ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); + ret = __block_write_begin(folio, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -1270,7 +1270,7 @@ retry_journal: folio_put(folio); return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -1298,9 +1298,8 @@ static int write_end_fn(handle_t *handle, struct inode *inode, static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1315,7 +1314,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1402,9 +1401,8 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -2926,7 +2924,7 @@ static int ext4_nonda_switch(struct super_block *sb) static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; @@ -2941,14 +2939,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, - len, pagep, fsdata); + len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - pagep, fsdata); + foliop, fsdata); if (ret < 0) return ret; if (ret == 1) @@ -2964,7 +2962,7 @@ retry: #ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else - ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); + ret = __block_write_begin(folio, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) { folio_unlock(folio); @@ -2983,7 +2981,7 @@ retry: return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -3029,7 +3027,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, - &folio->page, NULL); + folio, NULL); new_i_size = pos + copied; /* @@ -3080,15 +3078,14 @@ static int ext4_da_do_write_end(struct address_space *mapping, static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; - struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, - len, copied, &folio->page, fsdata); + len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); @@ -6219,7 +6216,7 @@ retry_alloc: if (folio_pos(folio) + len > size) len = size - folio_pos(folio); - err = __block_write_begin(&folio->page, 0, len, ext4_get_block); + err = __block_write_begin(folio, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 2f37e1ea3955..d9203228ce97 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -76,17 +76,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, while (count) { size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; void *fsdata = NULL; int res; - res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); + res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); if (res) return res; - memcpy_to_page(page, offset_in_page(pos), buf, n); + memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); - res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); if (res < 0) return res; if (res != n) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6457e5bca9c9..5dfa0207ad8f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3552,12 +3552,12 @@ reserve_block: } static int f2fs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *page = NULL; - pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + struct folio *folio; + pgoff_t index = pos >> PAGE_SHIFT; bool need_balance = false; bool use_cow = false; block_t blkaddr = NULL_ADDR; @@ -3573,7 +3573,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: - * lock_page(page #0) -> lock_page(inode_page) + * folio_lock(folio #0) -> folio_lock(inode_page) */ if (index != 0) { err = f2fs_convert_inline_inode(inode); @@ -3584,18 +3584,20 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret; + struct page *page; *fsdata = NULL; if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) goto repeat; - ret = f2fs_prepare_compress_overwrite(inode, pagep, + ret = f2fs_prepare_compress_overwrite(inode, &page, index, fsdata); if (ret < 0) { err = ret; goto fail; } else if (ret) { + *foliop = page_folio(page); return 0; } } @@ -3603,81 +3605,85 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, repeat: /* - * Do not use grab_cache_page_write_begin() to avoid deadlock due to - * wait_for_stable_page. Will wait that below with our IO control. + * Do not use FGP_STABLE to avoid deadlock. + * Will wait that below with our IO control. */ - page = f2fs_pagecache_get_page(mapping, index, + folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); - if (!page) { - err = -ENOMEM; + if (IS_ERR(folio)) { + err = PTR_ERR(folio); goto fail; } /* TODO: cluster can be compressed due to race with .writepage */ - *pagep = page; + *foliop = folio; if (f2fs_is_atomic_file(inode)) - err = prepare_atomic_write_begin(sbi, page, pos, len, + err = prepare_atomic_write_begin(sbi, &folio->page, pos, len, &blkaddr, &need_balance, &use_cow); else - err = prepare_write_begin(sbi, page, pos, len, + err = prepare_write_begin(sbi, &folio->page, pos, len, &blkaddr, &need_balance); if (err) - goto fail; + goto put_folio; if (need_balance && !IS_NOQUOTA(inode) && has_not_enough_free_secs(sbi, 0, 0)) { - unlock_page(page); + folio_unlock(folio); f2fs_balance_fs(sbi, true); - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - f2fs_put_page(page, 1); + folio_lock(folio); + if (folio->mapping != mapping) { + /* The folio got truncated from under us */ + folio_unlock(folio); + folio_put(folio); goto repeat; } } - f2fs_wait_on_page_writeback(page, DATA, false, true); + f2fs_wait_on_page_writeback(&folio->page, DATA, false, true); - if (len == PAGE_SIZE || PageUptodate(page)) + if (len == folio_size(folio) || folio_test_uptodate(folio)) return 0; if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && !f2fs_verity_in_progress(inode)) { - zero_user_segment(page, len, PAGE_SIZE); + folio_zero_segment(folio, len, PAGE_SIZE); return 0; } if (blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); } else { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; - goto fail; + goto put_folio; } err = f2fs_submit_page_read(use_cow ? - F2FS_I(inode)->cow_inode : inode, page, + F2FS_I(inode)->cow_inode : inode, &folio->page, blkaddr, 0, true); if (err) - goto fail; + goto put_folio; - lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); goto repeat; } - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; - goto fail; + goto put_folio; } } return 0; +put_folio: + folio_unlock(folio); + folio_put(folio); fail: - f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); return err; } @@ -3685,9 +3691,9 @@ fail: static int f2fs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; trace_f2fs_write_end(inode, pos, len, copied); @@ -3696,17 +3702,17 @@ static int f2fs_write_end(struct file *file, * should be PAGE_SIZE. Otherwise, we treat it with zero copied and * let generic_perform_write() try to copy data again through copied=0. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (unlikely(copied != len)) copied = 0; else - SetPageUptodate(page); + folio_mark_uptodate(folio); } #ifdef CONFIG_F2FS_FS_COMPRESSION /* overwrite compressed file */ if (f2fs_compressed_file(inode) && fsdata) { - f2fs_compress_write_end(inode, fsdata, page->index, copied); + f2fs_compress_write_end(inode, fsdata, folio->index, copied); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (pos + copied > i_size_read(inode) && @@ -3719,7 +3725,7 @@ static int f2fs_write_end(struct file *file, if (!copied) goto unlock_out; - set_page_dirty(page); + folio_mark_dirty(folio); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) { @@ -3729,7 +3735,8 @@ static int f2fs_write_end(struct file *file, pos + copied); } unlock_out: - f2fs_put_page(page, 1); + folio_unlock(folio); + folio_put(folio); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3959fd137cc9..176b5177c89d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2677,7 +2677,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, const struct address_space_operations *a_ops = mapping->a_ops; int offset = off & (sb->s_blocksize - 1); size_t towrite = len; - struct page *page; + struct folio *folio; void *fsdata = NULL; int err = 0; int tocopy; @@ -2687,7 +2687,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, towrite); retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, - &page, &fsdata); + &folio, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); @@ -2697,10 +2697,10 @@ retry: break; } - memcpy_to_page(page, offset, data, tocopy); + memcpy_to_folio(folio, offset_in_folio(folio, off), data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, - page, fsdata); + folio, fsdata); offset = 0; towrite -= tocopy; off += tocopy; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index f7bb0c54502c..84a33fe49bed 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -80,17 +80,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, while (count) { size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; void *fsdata = NULL; int res; - res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); + res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); if (res) return res; - memcpy_to_page(page, offset_in_page(pos), buf, n); + memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); - res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); if (res < 0) return res; if (res != n) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 19115fd2d2a4..75722bbd6b5f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -221,13 +221,12 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) static int fat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int err; - *pagep = NULL; err = cont_write_begin(file, mapping, pos, len, - pagep, fsdata, fat_get_block, + foliop, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); if (err < 0) fat_write_failed(mapping, pos + len); @@ -236,11 +235,11 @@ static int fat_write_begin(struct file *file, struct address_space *mapping, static int fat_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pagep, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 0587a0e221a6..f6fde75a3bd5 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -414,6 +414,12 @@ static long f_dupfd_query(int fd, struct file *filp) return f.file == filp; } +/* Let the caller figure out whether a given file was just created. */ +static long f_created_query(const struct file *filp) +{ + return !!(filp->f_mode & FMODE_CREATED); +} + static int f_owner_sig(struct file *filp, int signum, bool setsig) { int ret = 0; @@ -447,6 +453,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, long err = -EINVAL; switch (cmd) { + case F_CREATED_QUERY: + err = f_created_query(filp); + break; case F_DUPFD: err = f_dupfd(argi, filp, 0); break; @@ -553,6 +562,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, static int check_fcntl_cmd(unsigned cmd) { switch (cmd) { + case F_CREATED_QUERY: case F_DUPFD: case F_DUPFD_CLOEXEC: case F_DUPFD_QUERY: diff --git a/fs/fhandle.c b/fs/fhandle.c index 6e8cea16790e..8cb665629f4a 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -16,7 +16,8 @@ static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, - int __user *mnt_id, int fh_flags) + void __user *mnt_id, bool unique_mntid, + int fh_flags) { long retval; struct file_handle f_handle; @@ -69,9 +70,19 @@ static long do_sys_name_to_handle(const struct path *path, } else retval = 0; /* copy the mount id */ - if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || - copy_to_user(ufh, handle, - struct_size(handle, f_handle, handle_bytes))) + if (unique_mntid) { + if (put_user(real_mount(path->mnt)->mnt_id_unique, + (u64 __user *) mnt_id)) + retval = -EFAULT; + } else { + if (put_user(real_mount(path->mnt)->mnt_id, + (int __user *) mnt_id)) + retval = -EFAULT; + } + /* copy the handle */ + if (retval != -EFAULT && + copy_to_user(ufh, handle, + struct_size(handle, f_handle, handle_bytes))) retval = -EFAULT; kfree(handle); return retval; @@ -83,6 +94,7 @@ static long do_sys_name_to_handle(const struct path *path, * @name: name that should be converted to handle. * @handle: resulting file handle * @mnt_id: mount id of the file system containing the file + * (u64 if AT_HANDLE_MNT_ID_UNIQUE, otherwise int) * @flag: flag value to indicate whether to follow symlink or not * and whether a decodable file handle is required. * @@ -92,7 +104,7 @@ static long do_sys_name_to_handle(const struct path *path, * value required. */ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, - struct file_handle __user *, handle, int __user *, mnt_id, + struct file_handle __user *, handle, void __user *, mnt_id, int, flag) { struct path path; @@ -100,7 +112,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, int fh_flags; int err; - if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID)) + if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID | + AT_HANDLE_MNT_ID_UNIQUE)) return -EINVAL; lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; @@ -109,7 +122,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); if (!err) { - err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags); + err = do_sys_name_to_handle(&path, handle, mnt_id, + flag & AT_HANDLE_MNT_ID_UNIQUE, + fh_flags); path_put(&path); } return err; diff --git a/fs/file.c b/fs/file.c index 655338effe9c..976ecd4ce2c6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -672,7 +672,7 @@ int close_fd(unsigned fd) return filp_close(file, files); } -EXPORT_SYMBOL(close_fd); /* for ksys_close() */ +EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table diff --git a/fs/file_table.c b/fs/file_table.c index 7ce4d5dac080..3cac9c34264f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -136,6 +136,7 @@ static int __init init_fs_stat_sysctls(void) register_sysctl_init("fs", fs_stat_sysctls); if (IS_ENABLED(CONFIG_BINFMT_MISC)) { struct ctl_table_header *hdr; + hdr = register_sysctl_mount_point("fs/binfmt_misc"); kmemleak_not_leak(hdr); } @@ -389,7 +390,9 @@ EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { - struct file *f = alloc_file(&base->f_path, flags, fops); + struct file *f; + + f = alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b865a3fa52f3..d8bec3c1bb1f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1132,6 +1132,7 @@ out_bdi_put: /** * cgroup_writeback_umount - flush inode wb switches for umount + * @sb: target super_block * * This function is called when a super_block is about to be destroyed and * flushes in-flight inode wb switches. An inode wb switch goes through @@ -1140,8 +1141,12 @@ out_bdi_put: * rare occurrences and synchronize_rcu() can take a while, perform * flushing iff wb switches are in flight. */ -void cgroup_writeback_umount(void) +void cgroup_writeback_umount(struct super_block *sb) { + + if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK)) + return; + /* * SB_ACTIVE should be reliably cleared before checking * isw_nr_in_flight, see generic_shutdown_super(). @@ -1381,12 +1386,13 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) static void inode_sync_complete(struct inode *inode) { + assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC; /* If inode is clean an unused, put it into LRU now... */ inode_add_lru(inode); - /* Waiters must see I_SYNC cleared before being woken up */ - smp_mb(); - wake_up_bit(&inode->i_state, __I_SYNC); + /* Called with inode->i_lock which ensures memory ordering. */ + inode_wake_up_bit(inode, __I_SYNC); } static bool inode_dirtied_after(struct inode *inode, unsigned long t) @@ -1505,30 +1511,27 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) * Wait for writeback on an inode to complete. Called with i_lock held. * Caller must make sure inode cannot go away when we drop i_lock. */ -static void __inode_wait_for_writeback(struct inode *inode) - __releases(inode->i_lock) - __acquires(inode->i_lock) +void inode_wait_for_writeback(struct inode *inode) { - DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); - wait_queue_head_t *wqh; + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + + assert_spin_locked(&inode->i_lock); - wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - while (inode->i_state & I_SYNC) { + if (!(inode->i_state & I_SYNC)) + return; + + wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); + for (;;) { + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ + if (!(inode->i_state & I_SYNC)) + break; spin_unlock(&inode->i_lock); - __wait_on_bit(wqh, &wq, bit_wait, - TASK_UNINTERRUPTIBLE); + schedule(); spin_lock(&inode->i_lock); } -} - -/* - * Wait for writeback on an inode to complete. Caller must have inode pinned. - */ -void inode_wait_for_writeback(struct inode *inode) -{ - spin_lock(&inode->i_lock); - __inode_wait_for_writeback(inode); - spin_unlock(&inode->i_lock); + finish_wait(wq_head, &wqe.wq_entry); } /* @@ -1539,16 +1542,20 @@ void inode_wait_for_writeback(struct inode *inode) static void inode_sleep_on_writeback(struct inode *inode) __releases(inode->i_lock) { - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - int sleep; + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + bool sleep; + + assert_spin_locked(&inode->i_lock); - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - sleep = inode->i_state & I_SYNC; + wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ + sleep = !!(inode->i_state & I_SYNC); spin_unlock(&inode->i_lock); if (sleep) schedule(); - finish_wait(wqh, &wait); + finish_wait(wq_head, &wqe.wq_entry); } /* @@ -1752,7 +1759,7 @@ static int writeback_single_inode(struct inode *inode, */ if (wbc->sync_mode != WB_SYNC_ALL) goto out; - __inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 7146038b2fe7..f0c9cd1a0b39 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -31,6 +31,8 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; +static void end_requests(struct list_head *head); + static struct fuse_dev *fuse_get_dev(struct file *file) { /* @@ -773,7 +775,6 @@ static int fuse_check_folio(struct folio *folio) (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | - 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | 1 << PG_workingset | @@ -818,9 +819,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newfolio = page_folio(buf->page); - if (!folio_test_uptodate(newfolio)) - folio_mark_uptodate(newfolio); - + folio_clear_uptodate(newfolio); folio_clear_mappedtodisk(newfolio); if (fuse_check_folio(newfolio) != 0) @@ -1822,6 +1821,13 @@ static void fuse_resend(struct fuse_conn *fc) } spin_lock(&fiq->lock); + if (!fiq->connected) { + spin_unlock(&fiq->lock); + list_for_each_entry(req, &to_queue, list) + clear_bit(FR_PENDING, &req->flags); + end_requests(&to_queue); + return; + } /* iq and pq requests are both oldest to newest */ list_splice(&to_queue, &fiq->pending); fiq->ops->wake_pending_and_unlock(fiq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2b0d4781f394..8e96df9fd76c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -670,7 +670,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, err = get_create_ext(&args, dir, entry, mode); if (err) - goto out_put_forget_req; + goto out_free_ff; err = fuse_simple_request(fm, &args); free_ext_value(&args); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f39456c65ed7..ba6df52a823e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1832,10 +1832,16 @@ __acquires(fi->lock) fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); - /* After fuse_writepage_finish() aux request list is private */ + /* After rb_erase() aux request list is private */ for (aux = wpa->next; aux; aux = next) { + struct backing_dev_info *bdi = inode_to_bdi(aux->inode); + next = aux->next; aux->next = NULL; + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); fuse_writepage_free(aux); } @@ -2387,76 +2393,77 @@ out: * but how to implement it without killing performance need more thinking. */ static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); - struct page *page; + struct folio *folio; loff_t fsize; int err = -ENOMEM; WARN_ON(!fc->writeback_cache); - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, page->index); + fuse_wait_on_page_writeback(mapping->host, folio->index); - if (PageUptodate(page) || len == PAGE_SIZE) + if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* - * Check if the start this page comes after the end of file, in which - * case the readpage can be optimized away. + * Check if the start of this folio comes after the end of file, + * in which case the readpage can be optimized away. */ fsize = i_size_read(mapping->host); - if (fsize <= (pos & PAGE_MASK)) { - size_t off = pos & ~PAGE_MASK; + if (fsize <= folio_pos(folio)) { + size_t off = offset_in_folio(folio, pos); if (off) - zero_user_segment(page, 0, off); + folio_zero_segment(folio, 0, off); goto success; } - err = fuse_do_readpage(file, page); + err = fuse_do_readpage(file, &folio->page); if (err) goto cleanup; success: - *pagep = page; + *foliop = folio; return 0; cleanup: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); error: return err; } static int fuse_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ if (!copied) goto unlock; pos += copied; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* Zero any unwritten bytes at the end of the page */ size_t endoff = pos & ~PAGE_MASK; if (endoff) - zero_user_segment(page, endoff, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, endoff, PAGE_SIZE); + folio_mark_uptodate(folio); } if (pos > inode->i_size) i_size_write(inode, pos); - set_page_dirty(page); + folio_mark_dirty(folio); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d8ab4e93916f..bebd89002328 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1332,11 +1332,16 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, * on a stacked fs (e.g. overlayfs) themselves and with * max_stack_depth == 1, FUSE fs can be stacked as the * underlying fs of a stacked fs (e.g. overlayfs). + * + * Also don't allow the combination of FUSE_PASSTHROUGH + * and FUSE_WRITEBACK_CACHE, current design doesn't handle + * them together. */ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && (flags & FUSE_PASSTHROUGH) && arg->max_stack_depth > 0 && - arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) { + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH && + !(flags & FUSE_WRITEBACK_CACHE)) { fc->passthrough = 1; fc->max_stack_depth = arg->max_stack_depth; fm->sb->s_stack_depth = arg->max_stack_depth; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 5b423fdbb13f..9f568d345c51 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -81,7 +81,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); + ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; @@ -143,7 +143,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) } ret = fuse_simple_request(fm, &args); if (!ret && !size) - ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); + ret = min_t(size_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 6d1878b99b30..4a0ce131e233 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -487,15 +487,15 @@ void hfs_file_truncate(struct inode *inode) if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; void *fsdata = NULL; - struct page *page; + struct folio *folio; /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; - res = hfs_write_begin(NULL, mapping, size + 1, 0, &page, + res = hfs_write_begin(NULL, mapping, size + 1, 0, &folio, &fsdata); if (!res) { res = generic_write_end(NULL, mapping, size + 1, 0, 0, - page, fsdata); + folio, fsdata); } if (res) inode->i_size = HFS_I(inode)->phys_size; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index b5a6ad5df357..a0c7cb0f79fc 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -202,7 +202,7 @@ extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; int hfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata); + loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 744e10b46904..a81ce7a740b9 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -45,12 +45,11 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) } int hfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hfs_get_block, &HFS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 9c51867dddc5..a6d61685ae79 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -554,16 +554,16 @@ void hfsplus_file_truncate(struct inode *inode) if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; void *fsdata = NULL; loff_t size = inode->i_size; res = hfsplus_write_begin(NULL, mapping, size, 0, - &page, &fsdata); + &folio, &fsdata); if (res) return; res = generic_write_end(NULL, mapping, size, 0, 0, - page, fsdata); + folio, fsdata); if (res < 0) return; mark_inode_dirty(inode); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9e78f181c24f..59ce81dca73f 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -472,7 +472,7 @@ extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata); + loff_t pos, unsigned len, struct folio **foliop, void **fsdata); struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, umode_t mode); void hfsplus_delete_inode(struct inode *inode); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 3d326926c195..f331e9574217 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -39,12 +39,11 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) } int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hfsplus_get_block, &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 22df574ca99e..6d1cf2436ead 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -465,31 +465,32 @@ static int hostfs_read_folio(struct file *file, struct folio *folio) static int hostfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; - *pagep = grab_cache_page_write_begin(mapping, index); - if (!*pagep) + *foliop = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (!*foliop) return -ENOMEM; return 0; } static int hostfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; void *buffer; - unsigned from = pos & (PAGE_SIZE - 1); + size_t from = offset_in_folio(folio, pos); int err; - buffer = kmap_local_page(page); - err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied); + buffer = kmap_local_folio(folio, from); + err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied); kunmap_local(buffer); - if (!PageUptodate(page) && err == PAGE_SIZE) - SetPageUptodate(page); + if (!folio_test_uptodate(folio) && err == folio_size(folio)) + folio_mark_uptodate(folio); /* * If err > 0, write_file has added err to pos, so we are comparing @@ -497,8 +498,8 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, */ if (err > 0 && (pos > inode->i_size)) inode->i_size = pos; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 1bb8d97cd9ae..449a3fc1b8d9 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -190,12 +190,11 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) static int hpfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hpfs_get_block, &hpfs_i(mapping->host)->mmu_private); if (unlikely(ret)) @@ -206,11 +205,11 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, static int hpfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pagep, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (err < len) hpfs_write_failed(mapping, pos + len); if (!(err < 0)) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9f6cff356796..5cf327337e22 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -388,14 +388,14 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) static int hugetlbfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { return -EINVAL; } static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { BUG(); return -EINVAL; diff --git a/fs/inode.c b/fs/inode.c index 10c4619faeef..af78f515403f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -472,6 +472,17 @@ static void __inode_add_lru(struct inode *inode, bool rotate) inode->i_state |= I_REFERENCED; } +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, + struct inode *inode, u32 bit) +{ + void *bit_address; + + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); +} +EXPORT_SYMBOL(inode_bit_waitqueue); + /* * Add inode to LRU if needed (inode is unused and clean). * @@ -500,25 +511,35 @@ static void inode_unpin_lru_isolating(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); inode->i_state &= ~I_LRU_ISOLATING; - smp_mb(); - wake_up_bit(&inode->i_state, __I_LRU_ISOLATING); + /* Called with inode->i_lock which ensures memory ordering. */ + inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); } static void inode_wait_for_lru_isolating(struct inode *inode) { - spin_lock(&inode->i_lock); - if (inode->i_state & I_LRU_ISOLATING) { - DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING); - wait_queue_head_t *wqh; + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + + lockdep_assert_held(&inode->i_lock); + if (!(inode->i_state & I_LRU_ISOLATING)) + return; - wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING); + wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); + for (;;) { + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + /* + * Checking I_LRU_ISOLATING with inode->i_lock guarantees + * memory ordering. + */ + if (!(inode->i_state & I_LRU_ISOLATING)) + break; spin_unlock(&inode->i_lock); - __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); + schedule(); spin_lock(&inode->i_lock); - WARN_ON(inode->i_state & I_LRU_ISOLATING); } - spin_unlock(&inode->i_lock); + finish_wait(wq_head, &wqe.wq_entry); + WARN_ON(inode->i_state & I_LRU_ISOLATING); } /** @@ -595,6 +616,7 @@ void dump_mapping(const struct address_space *mapping) struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; + char fname[64] = {}; unsigned long ino; /* @@ -631,11 +653,14 @@ void dump_mapping(const struct address_space *mapping) return; } + if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0) + strscpy(fname, "<invalid>"); /* - * if dentry is corrupted, the %pd handler may still crash, - * but it's unlikely that we reach here with a corrupt mapping + * Even if strncpy_from_kernel_nofault() succeeded, + * the fname could be unreliable */ - pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry); + pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n", + a_ops, ino, fname); } void clear_inode(struct inode *inode) @@ -690,6 +715,7 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + spin_lock(&inode->i_lock); inode_wait_for_lru_isolating(inode); /* @@ -699,6 +725,7 @@ static void evict(struct inode *inode) * the inode. We just have to wait for running writeback to finish. */ inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); if (op->evict_inode) { op->evict_inode(inode); @@ -722,7 +749,13 @@ static void evict(struct inode *inode) * used as an indicator whether blocking on it is safe. */ spin_lock(&inode->i_lock); - wake_up_bit(&inode->i_state, __I_NEW); + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ + smp_mb(); + inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); @@ -770,6 +803,10 @@ again: continue; spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; @@ -1130,8 +1167,13 @@ void unlock_new_inode(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); + inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); @@ -1142,8 +1184,13 @@ void discard_new_inode(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); + inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); } @@ -1570,9 +1617,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino, true); - spin_unlock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) @@ -2334,8 +2379,8 @@ EXPORT_SYMBOL(inode_needs_sync); */ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) { - wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; /* * Handle racing against evict(), see that routine for more details. @@ -2346,14 +2391,14 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock return; } - wq = bit_waitqueue(&inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); rcu_read_unlock(); if (is_inode_hash_locked) spin_unlock(&inode_hash_lock); schedule(); - finish_wait(wq, &wait.wq_entry); + finish_wait(wq_head, &wqe.wq_entry); if (is_inode_hash_locked) spin_lock(&inode_hash_lock); rcu_read_lock(); @@ -2502,18 +2547,11 @@ EXPORT_SYMBOL(inode_owner_or_capable); /* * Direct i/o helper functions */ -static void __inode_dio_wait(struct inode *inode) +bool inode_dio_finished(const struct inode *inode) { - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); - DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); - - do { - prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE); - if (atomic_read(&inode->i_dio_count)) - schedule(); - } while (atomic_read(&inode->i_dio_count)); - finish_wait(wq, &q.wq_entry); + return atomic_read(&inode->i_dio_count) == 0; } +EXPORT_SYMBOL(inode_dio_finished); /** * inode_dio_wait - wait for outstanding DIO requests to finish @@ -2527,11 +2565,17 @@ static void __inode_dio_wait(struct inode *inode) */ void inode_dio_wait(struct inode *inode) { - if (atomic_read(&inode->i_dio_count)) - __inode_dio_wait(inode); + wait_var_event(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait); +void inode_dio_wait_interruptible(struct inode *inode) +{ + wait_var_event_interruptible(&inode->i_dio_count, + inode_dio_finished(inode)); +} +EXPORT_SYMBOL(inode_dio_wait_interruptible); + /* * inode_set_flags - atomically set some inode flags * diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f420c53d86ac..9b4ca3811a24 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -900,7 +900,7 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t bh_written; bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, - len, copied, &folio->page, NULL); + len, copied, folio, NULL); WARN_ON_ONCE(bh_written != copied && bh_written != 0); return bh_written == copied; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index e12cb145147e..13c18ccc13b0 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -23,10 +23,10 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pg, void *fsdata); + struct folio *folio, void *fsdata); static int jffs2_write_begin(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); + struct folio **foliop, void **fsdata); static int jffs2_read_folio(struct file *filp, struct folio *folio); int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) @@ -77,29 +77,27 @@ const struct address_space_operations jffs2_file_address_operations = .write_end = jffs2_write_end, }; -static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) +static int jffs2_do_readpage_nolock(struct inode *inode, struct folio *folio) { struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); - unsigned char *pg_buf; + unsigned char *kaddr; int ret; jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n", - __func__, inode->i_ino, pg->index << PAGE_SHIFT); + __func__, inode->i_ino, folio->index << PAGE_SHIFT); - BUG_ON(!PageLocked(pg)); + BUG_ON(!folio_test_locked(folio)); - pg_buf = kmap(pg); - /* FIXME: Can kmap fail? */ - - ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT, + kaddr = kmap_local_folio(folio, 0); + ret = jffs2_read_inode_range(c, f, kaddr, folio->index << PAGE_SHIFT, PAGE_SIZE); + kunmap_local(kaddr); if (!ret) - SetPageUptodate(pg); + folio_mark_uptodate(folio); - flush_dcache_page(pg); - kunmap(pg); + flush_dcache_folio(folio); jffs2_dbg(2, "readpage finished\n"); return ret; @@ -107,7 +105,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) int __jffs2_read_folio(struct file *file, struct folio *folio) { - int ret = jffs2_do_readpage_nolock(folio->mapping->host, &folio->page); + int ret = jffs2_do_readpage_nolock(folio->mapping->host, folio); folio_unlock(folio); return ret; } @@ -125,9 +123,9 @@ static int jffs2_read_folio(struct file *file, struct folio *folio) static int jffs2_write_begin(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { - struct page *pg; + struct folio *folio; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); @@ -206,29 +204,30 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * page in read_cache_page(), which causes a deadlock. */ mutex_lock(&c->alloc_sem); - pg = grab_cache_page_write_begin(mapping, index); - if (!pg) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto release_sem; } - *pagep = pg; + *foliop = folio; /* - * Read in the page if it wasn't already present. Cannot optimize away - * the whole page write case until jffs2_write_end can handle the + * Read in the folio if it wasn't already present. Cannot optimize away + * the whole folio write case until jffs2_write_end can handle the * case of a short-copy. */ - if (!PageUptodate(pg)) { + if (!folio_test_uptodate(folio)) { mutex_lock(&f->sem); - ret = jffs2_do_readpage_nolock(inode, pg); + ret = jffs2_do_readpage_nolock(inode, folio); mutex_unlock(&f->sem); if (ret) { - unlock_page(pg); - put_page(pg); + folio_unlock(folio); + folio_put(folio); goto release_sem; } } - jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); + jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags); release_sem: mutex_unlock(&c->alloc_sem); @@ -238,7 +237,7 @@ out_err: static int jffs2_write_end(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pg, void *fsdata) + struct folio *folio, void *fsdata) { /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. It sucks, but it's simple @@ -252,16 +251,17 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, unsigned aligned_start = start & ~3; int ret = 0; uint32_t writtenlen = 0; + void *buf; - jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n", - __func__, inode->i_ino, pg->index << PAGE_SHIFT, - start, end, pg->flags); + jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n", + __func__, inode->i_ino, folio_pos(folio), + start, end, folio->flags); /* We need to avoid deadlock with page_cache_read() in - jffs2_garbage_collect_pass(). So the page must be + jffs2_garbage_collect_pass(). So the folio must be up to date to prevent page_cache_read() from trying to re-lock it. */ - BUG_ON(!PageUptodate(pg)); + BUG_ON(!folio_test_uptodate(folio)); if (end == PAGE_SIZE) { /* When writing out the end of a page, write out the @@ -276,8 +276,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, if (!ri) { jffs2_dbg(1, "%s(): Allocation of raw inode failed\n", __func__); - unlock_page(pg); - put_page(pg); + folio_unlock(folio); + folio_put(folio); return -ENOMEM; } @@ -289,15 +289,11 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, ri->isize = cpu_to_je32((uint32_t)inode->i_size); ri->atime = ri->ctime = ri->mtime = cpu_to_je32(JFFS2_NOW()); - /* In 2.4, it was already kmapped by generic_file_write(). Doesn't - hurt to do it again. The alternative is ifdefs, which are ugly. */ - kmap(pg); - - ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start, - (pg->index << PAGE_SHIFT) + aligned_start, + buf = kmap_local_folio(folio, aligned_start); + ret = jffs2_write_inode_range(c, f, ri, buf, + folio_pos(folio) + aligned_start, end - aligned_start, &writtenlen); - - kunmap(pg); + kunmap_local(buf); if (ret) mapping_set_error(mapping, ret); @@ -323,12 +319,12 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, it gets reread */ jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n", __func__); - ClearPageUptodate(pg); + folio_clear_uptodate(folio); } jffs2_dbg(1, "%s() returning %d\n", __func__, writtenlen > 0 ? writtenlen : ret); - unlock_page(pg); - put_page(pg); + folio_unlock(folio); + folio_put(folio); return writtenlen > 0 ? writtenlen : ret; } diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 5c6602f3c189..822949d0eb00 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -1171,7 +1171,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era uint32_t alloclen, offset, orig_end, orig_start; int ret = 0; unsigned char *comprbuf = NULL, *writebuf; - struct page *page; + struct folio *folio; unsigned char *pg_ptr; memset(&ri, 0, sizeof(ri)); @@ -1317,25 +1317,25 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era BUG_ON(start > orig_start); } - /* The rules state that we must obtain the page lock *before* f->sem, so + /* The rules state that we must obtain the folio lock *before* f->sem, so * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's * actually going to *change* so we're safe; we only allow reading. * * It is important to note that jffs2_write_begin() will ensure that its - * page is marked Uptodate before allocating space. That means that if we - * end up here trying to GC the *same* page that jffs2_write_begin() is - * trying to write out, read_cache_page() will not deadlock. */ + * folio is marked uptodate before allocating space. That means that if we + * end up here trying to GC the *same* folio that jffs2_write_begin() is + * trying to write out, read_cache_folio() will not deadlock. */ mutex_unlock(&f->sem); - page = read_cache_page(inode->i_mapping, start >> PAGE_SHIFT, + folio = read_cache_folio(inode->i_mapping, start >> PAGE_SHIFT, __jffs2_read_folio, NULL); - if (IS_ERR(page)) { - pr_warn("read_cache_page() returned error: %ld\n", - PTR_ERR(page)); + if (IS_ERR(folio)) { + pr_warn("read_cache_folio() returned error: %ld\n", + PTR_ERR(folio)); mutex_lock(&f->sem); - return PTR_ERR(page); + return PTR_ERR(folio); } - pg_ptr = kmap(page); + pg_ptr = kmap_local_folio(folio, 0); mutex_lock(&f->sem); offset = start; @@ -1400,7 +1400,6 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era } } - kunmap(page); - put_page(page); + folio_release_kmap(folio, pg_ptr); return ret; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 1a6b5921d17a..07cfdc440596 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -292,11 +292,11 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) static int jfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, jfs_get_block); + ret = block_write_begin(mapping, pos, len, foliop, jfs_get_block); if (unlikely(ret)) jfs_write_failed(mapping, pos + len); @@ -304,12 +304,12 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, } static int jfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, + loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ret < len) jfs_write_failed(mapping, pos + len); return ret; diff --git a/fs/libfs.c b/fs/libfs.c index 02602d00939e..46966fd8bcf9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -914,7 +914,7 @@ static int simple_read_folio(struct file *file, struct folio *folio) int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct folio *folio; @@ -923,7 +923,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { size_t from = offset_in_folio(folio, pos); @@ -942,11 +942,11 @@ EXPORT_SYMBOL(simple_write_begin); * @pos: " * @len: " * @copied: " - * @page: " + * @folio: " * @fsdata: " * - * simple_write_end does the minimum needed for updating a page after writing is - * done. It has the same API signature as the .write_end of + * simple_write_end does the minimum needed for updating a folio after + * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for * FSes that don't need any other processing. i_mutex is assumed to be held. * Block based filesystems should use generic_write_end(). @@ -959,9 +959,8 @@ EXPORT_SYMBOL(simple_write_begin); */ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; @@ -2003,13 +2002,19 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force) * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * - * Here, we add full memory barriers to ensure that any de-facto - * ordering with other info is preserved. + * We add a full memory barrier to ensure that any de facto ordering + * with other state is preserved (either implicitly coming from cmpxchg + * or explicitly from smp_mb if we don't know upfront if we will execute + * the former). * - * This barrier pairs with the barrier in inode_query_iversion() + * These barriers pair with inode_query_iversion(). */ - smp_mb(); cur = inode_peek_iversion_raw(inode); + if (!force && !(cur & I_VERSION_QUERIED)) { + smp_mb(); + cur = inode_peek_iversion_raw(inode); + } + do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) @@ -2038,20 +2043,22 @@ EXPORT_SYMBOL(inode_maybe_inc_iversion); u64 inode_query_iversion(struct inode *inode) { u64 cur, new; + bool fenced = false; + /* + * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with + * inode_maybe_inc_iversion(), see that routine for more details. + */ cur = inode_peek_iversion_raw(inode); do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { - /* - * This barrier (and the implicit barrier in the - * cmpxchg below) pairs with the barrier in - * inode_maybe_inc_iversion(). - */ - smp_mb(); + if (!fenced) + smp_mb(); break; } + fenced = true; new = cur | I_VERSION_QUERIED; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; @@ -2117,12 +2124,12 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) } EXPORT_SYMBOL(simple_inode_init_ts); -static inline struct dentry *get_stashed_dentry(struct dentry *stashed) +static inline struct dentry *get_stashed_dentry(struct dentry **stashed) { struct dentry *dentry; guard(rcu)(); - dentry = READ_ONCE(stashed); + dentry = rcu_dereference(*stashed); if (!dentry) return NULL; if (!lockref_get_not_dead(&dentry->d_lockref)) @@ -2219,7 +2226,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ - path->dentry = get_stashed_dentry(*stashed); + path->dentry = get_stashed_dentry(stashed); if (path->dentry) { sops->put_data(data); goto out_path; diff --git a/fs/minix/dir.c b/fs/minix/dir.c index a224cf222570..dd2a425b41f0 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -40,18 +40,18 @@ minix_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - unlock_page(page); + folio_unlock(folio); } static int minix_handle_dirsync(struct inode *dir) @@ -64,14 +64,15 @@ static int minix_handle_dirsync(struct inode *dir) return err; } -static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p) +static void *dir_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); - *p = page; - return kmap_local_page(page); + struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); + + if (IS_ERR(folio)) + return ERR_CAST(folio); + *foliop = folio; + return kmap_local_folio(folio, 0); } static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) @@ -99,9 +100,9 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; - struct page *page; + struct folio *folio; - kaddr = dir_get_page(inode, n, &page); + kaddr = dir_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) continue; p = kaddr+offset; @@ -122,13 +123,13 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) unsigned l = strnlen(name, sbi->s_namelen); if (!dir_emit(ctx, name, l, inumber, DT_UNKNOWN)) { - unmap_and_put_page(page, p); + folio_release_kmap(folio, p); return 0; } } ctx->pos += chunk_size; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 0; } @@ -144,12 +145,13 @@ static inline int namecompare(int len, int maxlen, /* * minix_find_entry() * - * finds an entry in the specified directory with the wanted name. It - * returns the cache buffer in which the entry was found, and the entry - * itself (as a parameter - res_dir). It does NOT read the inode of the + * finds an entry in the specified directory with the wanted name. + * It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. + * + * On Success folio_release_kmap() should be called on *foliop. */ -minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) +minix_dirent *minix_find_entry(struct dentry *dentry, struct folio **foliop) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; @@ -158,17 +160,15 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) struct minix_sb_info * sbi = minix_sb(sb); unsigned long n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; char *p; char *namx; __u32 inumber; - *res_page = NULL; for (n = 0; n < npages; n++) { char *kaddr, *limit; - kaddr = dir_get_page(dir, n, &page); + kaddr = dir_get_folio(dir, n, foliop); if (IS_ERR(kaddr)) continue; @@ -188,12 +188,11 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) if (namecompare(namelen, sbi->s_namelen, name, namx)) goto found; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(*foliop, kaddr); } return NULL; found: - *res_page = page; return (minix_dirent *)p; } @@ -204,7 +203,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) int namelen = dentry->d_name.len; struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); - struct page *page = NULL; + struct folio *folio = NULL; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr, *p; @@ -223,10 +222,10 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *limit, *dir_end; - kaddr = dir_get_page(dir, n, &page); + kaddr = dir_get_folio(dir, n, &folio); if (IS_ERR(kaddr)) return PTR_ERR(kaddr); - lock_page(page); + folio_lock(folio); dir_end = kaddr + minix_last_byte(dir, n); limit = kaddr + PAGE_SIZE - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { @@ -253,15 +252,15 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) if (namecompare(namelen, sbi->s_namelen, name, namx)) goto out_unlock; } - unlock_page(page); - unmap_and_put_page(page, kaddr); + folio_unlock(folio); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + offset_in_page(p); - err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + pos = folio_pos(folio) + offset_in_folio(folio, p); + err = minix_prepare_chunk(folio, pos, sbi->s_dirsize); if (err) goto out_unlock; memcpy (namx, name, namelen); @@ -272,37 +271,37 @@ got_it: memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); de->inode = inode->i_ino; } - dir_commit_chunk(page, pos, sbi->s_dirsize); + dir_commit_chunk(folio, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return err; out_unlock: - unlock_page(page); + folio_unlock(folio); goto out_put; } -int minix_delete_entry(struct minix_dir_entry *de, struct page *page) +int minix_delete_entry(struct minix_dir_entry *de, struct folio *folio) { - struct inode *inode = page->mapping->host; - loff_t pos = page_offset(page) + offset_in_page(de); + struct inode *inode = folio->mapping->host; + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); struct minix_sb_info *sbi = minix_sb(inode->i_sb); unsigned len = sbi->s_dirsize; int err; - lock_page(page); - err = minix_prepare_chunk(page, pos, len); + folio_lock(folio); + err = minix_prepare_chunk(folio, pos, len); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = 0; else de->inode = 0; - dir_commit_chunk(page, pos, len); + dir_commit_chunk(folio, pos, len); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return minix_handle_dirsync(inode); @@ -310,21 +309,21 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page) int minix_make_empty(struct inode *inode, struct inode *dir) { - struct page *page = grab_cache_page(inode->i_mapping, 0); + struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *kaddr; int err; - if (!page) - return -ENOMEM; - err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize); + if (IS_ERR(folio)) + return PTR_ERR(folio); + err = minix_prepare_chunk(folio, 0, 2 * sbi->s_dirsize); if (err) { - unlock_page(page); + folio_unlock(folio); goto fail; } - kaddr = kmap_local_page(page); - memset(kaddr, 0, PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); + memset(kaddr, 0, folio_size(folio)); if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)kaddr; @@ -345,10 +344,10 @@ int minix_make_empty(struct inode *inode, struct inode *dir) } kunmap_local(kaddr); - dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); + dir_commit_chunk(folio, 0, 2 * sbi->s_dirsize); err = minix_handle_dirsync(inode); fail: - put_page(page); + folio_put(folio); return err; } @@ -357,7 +356,7 @@ fail: */ int minix_empty_dir(struct inode * inode) { - struct page *page = NULL; + struct folio *folio = NULL; unsigned long i, npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *name, *kaddr; @@ -366,7 +365,7 @@ int minix_empty_dir(struct inode * inode) for (i = 0; i < npages; i++) { char *p, *limit; - kaddr = dir_get_page(inode, i, &page); + kaddr = dir_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) continue; @@ -395,44 +394,44 @@ int minix_empty_dir(struct inode * inode) goto not_empty; } } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } /* Releases the page */ -int minix_set_link(struct minix_dir_entry *de, struct page *page, +int minix_set_link(struct minix_dir_entry *de, struct folio *folio, struct inode *inode) { - struct inode *dir = page->mapping->host; + struct inode *dir = folio->mapping->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); - loff_t pos = page_offset(page) + offset_in_page(de); + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; - lock_page(page); - err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + folio_lock(folio); + err = minix_prepare_chunk(folio, pos, sbi->s_dirsize); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = inode->i_ino; else de->inode = inode->i_ino; - dir_commit_chunk(page, pos, sbi->s_dirsize); + dir_commit_chunk(folio, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return minix_handle_dirsync(dir); } -struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) +struct minix_dir_entry *minix_dotdot(struct inode *dir, struct folio **foliop) { struct minix_sb_info *sbi = minix_sb(dir->i_sb); - struct minix_dir_entry *de = dir_get_page(dir, 0, p); + struct minix_dir_entry *de = dir_get_folio(dir, 0, foliop); if (!IS_ERR(de)) return minix_next_entry(de, sbi); @@ -441,20 +440,19 @@ struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) ino_t minix_inode_by_name(struct dentry *dentry) { - struct page *page; - struct minix_dir_entry *de = minix_find_entry(dentry, &page); + struct folio *folio; + struct minix_dir_entry *de = minix_find_entry(dentry, &folio); ino_t res = 0; if (de) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; + struct inode *inode = folio->mapping->host; struct minix_sb_info *sbi = minix_sb(inode->i_sb); if (sbi->s_version == MINIX_V3) res = ((minix3_dirent *) de)->inode; else res = de->inode; - unmap_and_put_page(page, de); + folio_release_kmap(folio, de); } return res; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 1c3df63162ef..f007e389d5d2 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -427,9 +427,9 @@ static int minix_read_folio(struct file *file, struct folio *folio) return block_read_full_folio(folio, minix_get_block); } -int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len) +int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(page, pos, len, minix_get_block); + return __block_write_begin(folio, pos, len, minix_get_block); } static void minix_write_failed(struct address_space *mapping, loff_t to) @@ -444,11 +444,11 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) static int minix_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, minix_get_block); + ret = block_write_begin(mapping, pos, len, foliop, minix_get_block); if (unlikely(ret)) minix_write_failed(mapping, pos + len); diff --git a/fs/minix/minix.h b/fs/minix/minix.h index d493507c064f..d54273c3c9ff 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -42,18 +42,18 @@ struct minix_sb_info { unsigned short s_version; }; -extern struct inode *minix_iget(struct super_block *, unsigned long); -extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct inode * minix_new_inode(const struct inode *, umode_t); -extern void minix_free_inode(struct inode * inode); -extern unsigned long minix_count_free_inodes(struct super_block *sb); -extern int minix_new_block(struct inode * inode); -extern void minix_free_block(struct inode *inode, unsigned long block); -extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(struct mnt_idmap *, const struct path *, - struct kstat *, u32, unsigned int); -extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); +struct inode *minix_iget(struct super_block *, unsigned long); +struct minix_inode *minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); +struct minix2_inode *minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); +struct inode *minix_new_inode(const struct inode *, umode_t); +void minix_free_inode(struct inode *inode); +unsigned long minix_count_free_inodes(struct super_block *sb); +int minix_new_block(struct inode *inode); +void minix_free_block(struct inode *inode, unsigned long block); +unsigned long minix_count_free_blocks(struct super_block *sb); +int minix_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); extern void V1_minix_truncate(struct inode *); extern void V2_minix_truncate(struct inode *); @@ -64,15 +64,15 @@ extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); extern unsigned V1_minix_blocks(loff_t, struct super_block *); extern unsigned V2_minix_blocks(loff_t, struct super_block *); -extern struct minix_dir_entry *minix_find_entry(struct dentry*, struct page**); -extern int minix_add_link(struct dentry*, struct inode*); -extern int minix_delete_entry(struct minix_dir_entry*, struct page*); -extern int minix_make_empty(struct inode*, struct inode*); -extern int minix_empty_dir(struct inode*); -int minix_set_link(struct minix_dir_entry *de, struct page *page, +struct minix_dir_entry *minix_find_entry(struct dentry *, struct folio **); +int minix_add_link(struct dentry*, struct inode*); +int minix_delete_entry(struct minix_dir_entry *, struct folio *); +int minix_make_empty(struct inode*, struct inode*); +int minix_empty_dir(struct inode*); +int minix_set_link(struct minix_dir_entry *de, struct folio *folio, struct inode *inode); -extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); -extern ino_t minix_inode_by_name(struct dentry*); +struct minix_dir_entry *minix_dotdot(struct inode*, struct folio **); +ino_t minix_inode_by_name(struct dentry*); extern const struct inode_operations minix_file_inode_operations; extern const struct inode_operations minix_dir_inode_operations; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index a944a0f17b53..5d9c1406fe27 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -141,15 +141,15 @@ out_fail: static int minix_unlink(struct inode * dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); - struct page * page; + struct folio *folio; struct minix_dir_entry * de; int err; - de = minix_find_entry(dentry, &page); + de = minix_find_entry(dentry, &folio); if (!de) return -ENOENT; - err = minix_delete_entry(de, page); - unmap_and_put_page(page, de); + err = minix_delete_entry(de, folio); + folio_release_kmap(folio, de); if (err) return err; @@ -180,28 +180,28 @@ static int minix_rename(struct mnt_idmap *idmap, { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); - struct page * dir_page = NULL; + struct folio * dir_folio = NULL; struct minix_dir_entry * dir_de = NULL; - struct page * old_page; + struct folio *old_folio; struct minix_dir_entry * old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; - old_de = minix_find_entry(old_dentry, &old_page); + old_de = minix_find_entry(old_dentry, &old_folio); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = minix_dotdot(old_inode, &dir_page); + dir_de = minix_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } if (new_inode) { - struct page * new_page; + struct folio *new_folio; struct minix_dir_entry * new_de; err = -ENOTEMPTY; @@ -209,11 +209,11 @@ static int minix_rename(struct mnt_idmap *idmap, goto out_dir; err = -ENOENT; - new_de = minix_find_entry(new_dentry, &new_page); + new_de = minix_find_entry(new_dentry, &new_folio); if (!new_de) goto out_dir; - err = minix_set_link(new_de, new_page, old_inode); - unmap_and_put_page(new_page, new_de); + err = minix_set_link(new_de, new_folio, old_inode); + folio_release_kmap(new_folio, new_de); if (err) goto out_dir; inode_set_ctime_current(new_inode); @@ -228,22 +228,22 @@ static int minix_rename(struct mnt_idmap *idmap, inode_inc_link_count(new_dir); } - err = minix_delete_entry(old_de, old_page); + err = minix_delete_entry(old_de, old_folio); if (err) goto out_dir; mark_inode_dirty(old_inode); if (dir_de) { - err = minix_set_link(dir_de, dir_page, new_dir); + err = minix_set_link(dir_de, dir_folio, new_dir); if (!err) inode_dec_link_count(old_dir); } out_dir: if (dir_de) - unmap_and_put_page(dir_page, dir_de); + folio_release_kmap(dir_folio, dir_de); out_old: - unmap_and_put_page(old_page, old_de); + folio_release_kmap(old_folio, old_de); out: return err; } diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 3c60f1eaca61..79491663dbc0 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -228,15 +228,15 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from, return 0; } - forward = kmemdup(map_from->forward, - nr_extents * sizeof(struct uid_gid_extent), - GFP_KERNEL_ACCOUNT); + forward = kmemdup_array(map_from->forward, nr_extents, + sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); if (!forward) return -ENOMEM; - reverse = kmemdup(map_from->reverse, - nr_extents * sizeof(struct uid_gid_extent), - GFP_KERNEL_ACCOUNT); + reverse = kmemdup_array(map_from->reverse, nr_extents, + sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); if (!reverse) { kfree(forward); return -ENOMEM; diff --git a/fs/mount.h b/fs/mount.h index ad4b1ddebb54..0a78f85cf737 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -153,5 +153,4 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) list_add_tail(&mnt->mnt_list, dt_list); } -extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); bool has_locked_children(struct mount *mnt, struct dentry *dentry); diff --git a/fs/namei.c b/fs/namei.c index 5512cb10fa89..891b169e38c9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1639,6 +1639,20 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, } EXPORT_SYMBOL(lookup_one_qstr_excl); +/** + * lookup_fast - do fast lockless (but racy) lookup of a dentry + * @nd: current nameidata + * + * Do a fast, but racy lookup in the dcache for the given dentry, and + * revalidate it. Returns a valid dentry pointer or NULL if one wasn't + * found. On error, an ERR_PTR will be returned. + * + * If this function returns a valid dentry and the walk is no longer + * lazy, the dentry will carry a reference that must later be put. If + * RCU mode is still in force, then this is not the case and the dentry + * must be legitimized before use. If this returns NULL, then the walk + * will no longer be in RCU mode. + */ static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; @@ -3521,6 +3535,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, return dentry; } + if (open_flag & O_CREAT) + audit_inode(nd->name, dir, AUDIT_INODE_PARENT); + /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the @@ -3591,6 +3608,42 @@ out_dput: return ERR_PTR(error); } +static inline bool trailing_slashes(struct nameidata *nd) +{ + return (bool)nd->last.name[nd->last.len]; +} + +static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) +{ + struct dentry *dentry; + + if (open_flag & O_CREAT) { + if (trailing_slashes(nd)) + return ERR_PTR(-EISDIR); + + /* Don't bother on an O_EXCL create */ + if (open_flag & O_EXCL) + return NULL; + } + + if (trailing_slashes(nd)) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + + dentry = lookup_fast(nd); + if (IS_ERR_OR_NULL(dentry)) + return dentry; + + if (open_flag & O_CREAT) { + /* Discard negative dentries. Need inode_lock to do the create */ + if (!dentry->d_inode) { + if (!(nd->flags & LOOKUP_RCU)) + dput(dentry); + dentry = NULL; + } + } + return dentry; +} + static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { @@ -3608,28 +3661,22 @@ static const char *open_last_lookups(struct nameidata *nd, return handle_dots(nd, nd->last_type); } - if (!(open_flag & O_CREAT)) { - if (nd->last.name[nd->last.len]) - nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - /* we _can_ be in RCU mode here */ - dentry = lookup_fast(nd); - if (IS_ERR(dentry)) - return ERR_CAST(dentry); - if (likely(dentry)) - goto finish_lookup; + /* We _can_ be in RCU mode here */ + dentry = lookup_fast_for_open(nd, open_flag); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + if (likely(dentry)) + goto finish_lookup; + + if (!(open_flag & O_CREAT)) { if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { - /* create side of things */ if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } - audit_inode(nd->name, dir, AUDIT_INODE_PARENT); - /* trailing slashes? */ - if (unlikely(nd->last.name[nd->last.len])) - return ERR_PTR(-EISDIR); } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { @@ -5304,7 +5351,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); - struct page *page; + struct folio *folio; void *fsdata = NULL; int err; unsigned int flags; @@ -5312,16 +5359,16 @@ int page_symlink(struct inode *inode, const char *symname, int len) retry: if (nofs) flags = memalloc_nofs_save(); - err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata); + err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; - memcpy(page_address(page), symname, len-1); + memcpy(folio_address(folio), symname, len - 1); - err = aops->write_end(NULL, mapping, 0, len-1, len-1, - page, fsdata); + err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, + folio, fsdata); if (err < 0) goto fail; if (err < len-1) diff --git a/fs/namespace.c b/fs/namespace.c index 328087a4df8a..5f2dddee0074 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1774,7 +1774,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) list_del_init(&p->mnt_child); } - /* Add propogated mounts to the tmp_list */ + /* Add propagated mounts to the tmp_list */ if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); @@ -2921,8 +2921,15 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * if (!__mnt_is_readonly(mnt) && (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { - char *buf = (char *)__get_free_page(GFP_KERNEL); - char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); + char *buf, *mntpath; + + buf = (char *)__get_free_page(GFP_KERNEL); + if (buf) + mntpath = d_path(mountpoint, buf, PAGE_SIZE); + else + mntpath = ERR_PTR(-ENOMEM); + if (IS_ERR(mntpath)) + mntpath = "(unknown)"; pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, @@ -2930,8 +2937,9 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * mntpath, &sb->s_time_max, (unsigned long long)sb->s_time_max); - free_page((unsigned long)buf); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; + if (buf) + free_page((unsigned long)buf); } } @@ -5605,7 +5613,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, /* Only worry about locked mounts */ if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; - /* Is the directory permanetly empty? */ + /* Is the directory permanently empty? */ if (!is_empty_dir_inode(inode)) goto next; } diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c index 42e98bb523e3..49849005eb7c 100644 --- a/fs/netfs/fscache_main.c +++ b/fs/netfs/fscache_main.c @@ -103,6 +103,7 @@ void __exit fscache_exit(void) kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); + timer_shutdown_sync(&fscache_cookie_lru_timer); destroy_workqueue(fscache_wq); pr_notice("FS-Cache unloaded\n"); } diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 5367caf3fa28..d6ada4eba744 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -270,7 +270,7 @@ static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, if (count == remaining) return; - _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", + _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x", rreq->debug_id, subreq->debug_index, iov_iter_count(&subreq->io_iter), subreq->transferred, subreq->len, rreq->i_size, @@ -306,6 +306,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) break; subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->error = 0; + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_stat(&netfs_n_rh_download_instead); trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); @@ -313,6 +314,8 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + netfs_reset_subreq_iter(rreq, subreq); netfs_rreq_short_read(rreq, subreq); } } @@ -365,7 +368,8 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) if (subreq->error || subreq->transferred == 0) break; transferred += subreq->transferred; - if (subreq->transferred < subreq->len) + if (subreq->transferred < subreq->len || + test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) break; } @@ -500,7 +504,8 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, subreq->error = 0; subreq->transferred += transferred_or_error; - if (subreq->transferred < subreq->len) + if (subreq->transferred < subreq->len && + !test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) goto incomplete; complete: @@ -779,10 +784,13 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) TASK_UNINTERRUPTIBLE); ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len && - rreq->origin != NETFS_DIO_READ) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; + if (ret == 0) { + if (rreq->origin == NETFS_DIO_READ) { + ret = rreq->transferred; + } else if (rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } } } else { /* If we decrement nr_outstanding to 0, the ref belongs to us. */ diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c index 75dc52a49b3a..21eab56ee2f9 100644 --- a/fs/netfs/locking.c +++ b/fs/netfs/locking.c @@ -19,25 +19,13 @@ * Must be called under a lock that serializes taking new references * to i_dio_count, usually by inode->i_mutex. */ -static int inode_dio_wait_interruptible(struct inode *inode) +static int netfs_inode_dio_wait_interruptible(struct inode *inode) { - if (!atomic_read(&inode->i_dio_count)) + if (inode_dio_finished(inode)) return 0; - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); - DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); - - for (;;) { - prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE); - if (!atomic_read(&inode->i_dio_count)) - break; - if (signal_pending(current)) - break; - schedule(); - } - finish_wait(wq, &q.wq_entry); - - return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0; + inode_dio_wait_interruptible(inode); + return !inode_dio_finished(inode) ? -ERESTARTSYS : 0; } /* Call with exclusively locked inode->i_rwsem */ @@ -46,7 +34,7 @@ static int netfs_block_o_direct(struct netfs_inode *ictx) if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) return 0; clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); - return inode_dio_wait_interruptible(&ictx->inode); + return netfs_inode_dio_wait_interruptible(&ictx->inode); } /** diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 5f0f438e5d21..9d6b49dc6694 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -142,7 +142,7 @@ static int __init netfs_init(void) error_fscache: error_procfile: - remove_proc_entry("fs/netfs", NULL); + remove_proc_subtree("fs/netfs", NULL); error_proc: mempool_exit(&netfs_subrequest_pool); error_subreqpool: @@ -159,7 +159,7 @@ fs_initcall(netfs_init); static void __exit netfs_exit(void) { fscache_exit(); - remove_proc_entry("fs/netfs", NULL); + remove_proc_subtree("fs/netfs", NULL); mempool_exit(&netfs_subrequest_pool); kmem_cache_destroy(netfs_subrequest_slab); mempool_exit(&netfs_request_pool); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 83e644bd518f..c1f321cf5999 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -97,10 +97,22 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct netfs_folio *finfo; + struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); size_t flen = folio_size(folio); _enter("{%lx},%zx,%zx", folio->index, offset, length); + if (offset == 0 && length == flen) { + unsigned long long i_size = i_size_read(&ctx->inode); + unsigned long long fpos = folio_pos(folio), end; + + end = umin(fpos + flen, i_size); + if (fpos < i_size && end > ctx->zero_point) + ctx->zero_point = end; + } + + folio_wait_private_2(folio); /* [DEPRECATED] */ + if (!folio_test_private(folio)) return; @@ -113,18 +125,34 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) /* We have a partially uptodate page from a streaming write. */ unsigned int fstart = finfo->dirty_offset; unsigned int fend = fstart + finfo->dirty_len; - unsigned int end = offset + length; + unsigned int iend = offset + length; if (offset >= fend) return; - if (end <= fstart) + if (iend <= fstart) + return; + + /* The invalidation region overlaps the data. If the region + * covers the start of the data, we either move along the start + * or just erase the data entirely. + */ + if (offset <= fstart) { + if (iend >= fend) + goto erase_completely; + /* Move the start of the data. */ + finfo->dirty_len = fend - iend; + finfo->dirty_offset = offset; + return; + } + + /* Reduce the length of the data if the invalidation region + * covers the tail part. + */ + if (iend >= fend) { + finfo->dirty_len = offset - fstart; return; - if (offset <= fstart && end >= fend) - goto erase_completely; - if (offset <= fstart && end > fstart) - goto reduce_len; - if (offset > fstart && end >= fend) - goto move_start; + } + /* A partial write was split. The caller has already zeroed * it, so just absorb the hole. */ @@ -137,12 +165,6 @@ erase_completely: folio_clear_uptodate(folio); kfree(finfo); return; -reduce_len: - finfo->dirty_len = offset + length - finfo->dirty_offset; - return; -move_start: - finfo->dirty_len -= offset - finfo->dirty_offset; - finfo->dirty_offset = offset; } EXPORT_SYMBOL(netfs_invalidate_folio); @@ -159,12 +181,20 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); unsigned long long end; - end = folio_pos(folio) + folio_size(folio); + if (folio_test_dirty(folio)) + return false; + + end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode)); if (end > ctx->zero_point) ctx->zero_point = end; if (folio_test_private(folio)) return false; + if (unlikely(folio_test_private_2(folio))) { /* [DEPRECATED] */ + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_private_2(folio); + } fscache_note_page_release(netfs_i_cookie(ctx)); return true; } diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 426cf87aaf2e..ae7a2043f670 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -33,6 +33,7 @@ int netfs_folio_written_back(struct folio *folio) { enum netfs_folio_trace why = netfs_folio_trace_clear; + struct netfs_inode *ictx = netfs_inode(folio->mapping->host); struct netfs_folio *finfo; struct netfs_group *group = NULL; int gcount = 0; @@ -41,6 +42,12 @@ int netfs_folio_written_back(struct folio *folio) /* Streaming writes cannot be redirtied whilst under writeback, * so discard the streaming record. */ + unsigned long long fend; + + fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len; + if (fend > ictx->zero_point) + ictx->zero_point = fend; + folio_detach_private(folio); group = finfo->netfs_group; gcount++; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 29c49a7e5fe1..6df77f008d3f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -118,7 +118,9 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) if (likely(attrlen > 0)) bitmap[0] = ntohl(*p++); if (attrlen > 1) - bitmap[1] = ntohl(*p); + bitmap[1] = ntohl(*p++); + if (attrlen > 2) + bitmap[2] = ntohl(*p); return 0; } @@ -446,7 +448,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, void *argp) { struct cb_recallanyargs *args = argp; - uint32_t bitmap[2]; + uint32_t bitmap[3]; __be32 *p, status; p = xdr_inline_decode(xdr, 4); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d5edb3b3eeef..20cb2008f9e4 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -647,6 +647,9 @@ restart: prev = delegation; continue; } + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; if (prev) { struct inode *tmp = nfs_delegation_grab_inode(prev); @@ -657,12 +660,6 @@ restart: } } - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - iput(to_put); - goto restart; - } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); @@ -1184,7 +1181,6 @@ static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server, struct inode *inode; restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || @@ -1195,7 +1191,7 @@ restart_locked: continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); if (delegation != NULL) { @@ -1318,7 +1314,6 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server, restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || @@ -1330,7 +1325,7 @@ restart_locked: continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; spin_lock(&delegation->lock); cred = get_cred_rcu(delegation->cred); nfs4_stateid_copy(&stateid, &delegation->stateid); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 61a8cdb9f1e1..6800ee92d742 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -336,7 +336,7 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, * increment the page use counts until he is done with the page. */ static int nfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { fgf_t fgp = FGP_WRITEBEGIN; @@ -353,7 +353,7 @@ start: mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; ret = nfs_flush_incompatible(file, folio); if (ret) { @@ -372,10 +372,9 @@ start: static int nfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct folio *folio = page_folio(page); unsigned offset = offset_in_folio(folio, pos); int status; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8883016c551c..b8ffbe52ba15 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3931,7 +3931,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_INSENSITIVE | FATTR4_WORD0_CASE_PRESERVING; if (minorversion) - bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT | + FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { @@ -9997,6 +9998,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) fallthrough; default: task->tk_status = 0; + lrp->res.lrs_present = 0; fallthrough; case 0: break; @@ -10010,9 +10012,11 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) task->tk_status = 0; break; case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) - break; - goto out_restart; + if (nfs4_async_handle_error(task, server, NULL, NULL) == + -EAGAIN) + goto out_restart; + lrp->res.lrs_present = 0; + break; } return; out_restart: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index aa698481bec8..0d16b383a452 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1284,10 +1284,9 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, LIST_HEAD(freeme); spin_lock(&inode->i_lock); - if (!pnfs_layout_is_valid(lo) || - !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + if (!nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) goto out_unlock; - if (stateid) { + if (stateid && pnfs_layout_is_valid(lo)) { u32 seq = be32_to_cpu(arg_stateid->seqid); pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index cbbd4866b0b7..97b386032b71 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -47,6 +47,7 @@ #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> +#include <linux/sched.h> #include <linux/slab.h> #include <net/ipv6.h> #include <linux/netdevice.h> @@ -228,6 +229,7 @@ static int __nfs_list_for_each_server(struct list_head *head, ret = fn(server, data); if (ret) goto out; + cond_resched(); rcu_read_lock(); } rcu_read_unlock(); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a20c2c9d7d45..a366fb1c1b9b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2789,15 +2789,18 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (file) { - nfs4_show_superblock(s, file); - seq_puts(s, ", "); - nfs4_show_fname(s, file); - seq_puts(s, ", "); - } - spin_unlock(&nf->fi_lock); + if (nf) { + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } + spin_unlock(&nf->fi_lock); + } else + seq_puts(s, "closed, "); nfs4_show_owner(s, oo); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); @@ -3075,9 +3078,9 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb) struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); - nfs4_put_stid(&dp->dl_stid); clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY); + nfs4_put_stid(&dp->dl_stid); } static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { @@ -8812,7 +8815,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, /** * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context - * @inode: file to be checked for a conflict + * @dentry: dentry of inode to be checked for a conflict * @modified: return true if file was modified * @size: new size of file if modified is true * @@ -8827,16 +8830,16 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * code is returned. */ __be32 -nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, bool *modified, u64 *size) { __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct file_lease *fl; - struct nfs4_delegation *dp; struct iattr attrs; struct nfs4_cb_fattr *ncf; + struct inode *inode = d_inode(dentry); *modified = false; ctx = locks_inode_context(inode); @@ -8856,17 +8859,26 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, */ if (type == F_RDLCK) break; - goto break_lease; + + nfsd_stats_wdeleg_getattr_inc(nn); + spin_unlock(&ctx->flc_lock); + + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + return 0; } if (type == F_WRLCK) { - dp = fl->c.flc_owner; + struct nfs4_delegation *dp = fl->c.flc_owner; + if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { spin_unlock(&ctx->flc_lock); return 0; } -break_lease: nfsd_stats_wdeleg_getattr_inc(nn); dp = fl->c.flc_owner; + refcount_inc(&dp->dl_stid.sc_count); ncf = &dp->dl_cb_fattr; nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); @@ -8876,27 +8888,37 @@ break_lease: /* Recall delegation only if client didn't respond */ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || - !nfsd_wait_for_delegreturn(rqstp, inode)) + !nfsd_wait_for_delegreturn(rqstp, inode)) { + nfs4_put_stid(&dp->dl_stid); return status; + } } if (!ncf->ncf_file_modified && (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { + int err; + /* * Per section 10.4.3 of RFC 8881, the server would * not update the file's metadata with the client's * modified size */ attrs.ia_mtime = attrs.ia_ctime = current_time(inode); - attrs.ia_valid = ATTR_MTIME | ATTR_CTIME; - setattr_copy(&nop_mnt_idmap, inode, &attrs); - mark_inode_dirty(inode); + attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG; + inode_lock(inode); + err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); + inode_unlock(inode); + if (err) { + nfs4_put_stid(&dp->dl_stid); + return nfserrno(err); + } ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; *size = ncf->ncf_cur_fsize; *modified = true; } + nfs4_put_stid(&dp->dl_stid); return 0; } break; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 42b41d55d4ed..97f583777972 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3545,6 +3545,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.dentry = dentry; args.ignore_crossmnt = (ignore_crossmnt != 0); args.acl = NULL; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + args.context = NULL; +#endif /* * Make a local copy of the attribute bitmap that can be modified. @@ -3562,7 +3565,7 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, } args.size = 0; if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), + status = nfsd4_deleg_getattr_conflict(rqstp, dentry, &file_modified, &size); if (status) goto out; @@ -3617,7 +3620,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.contextsupport = false; #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - args.context = NULL; if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ffc217099d19..ec4559ecd193 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -781,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct inode *inode, bool *file_modified, u64 *size); + struct dentry *dentry, bool *file_modified, u64 *size); #endif /* NFSD4_STATE_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 4a29b0138d75..4b3e19d74925 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -83,7 +83,7 @@ static int nilfs_prepare_chunk(struct folio *folio, unsigned int from, { loff_t pos = folio_pos(folio) + from; - return __block_write_begin(&folio->page, pos, to - from, nilfs_get_block); + return __block_write_begin(folio, pos, to - from, nilfs_get_block); } static void nilfs_commit_chunk(struct folio *folio, @@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio, int err; nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to); - copied = block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); + copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7340a01d80e1..8661f452dba6 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -250,7 +250,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) static int nilfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; @@ -259,7 +259,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(err)) return err; - err = block_write_begin(mapping, pos, len, pagep, nilfs_get_block); + err = block_write_begin(mapping, pos, len, foliop, nilfs_get_block); if (unlikely(err)) { nilfs_write_failed(mapping, pos + len); nilfs_transaction_abort(inode->i_sb); @@ -269,16 +269,16 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, static int nilfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; unsigned int start = pos & (PAGE_SIZE - 1); unsigned int nr_dirty; int err; - nr_dirty = nilfs_page_count_clean_buffers(page, start, + nr_dirty = nilfs_page_count_clean_buffers(&folio->page, start, start + copied); - copied = generic_write_end(file, mapping, pos, len, copied, page, + copied = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); nilfs_set_file_dirty(inode, nr_dirty); err = nilfs_transaction_commit(inode->i_sb); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index b638dc06df2f..ec61ce9f29a2 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -498,7 +498,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct inode *inode; struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; - struct page *page; + struct folio *folio; loff_t pos; int err = 0, err2 = 0; @@ -512,7 +512,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, pos = rb->blkoff << inode->i_blkbits; err = block_write_begin(inode->i_mapping, pos, blocksize, - &page, nilfs_get_block); + &folio, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; @@ -522,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - err = nilfs_recovery_copy_block(nilfs, rb, pos, page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, &folio->page); if (unlikely(err)) goto failed_page; @@ -531,17 +531,17 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_page; block_write_end(NULL, inode->i_mapping, pos, blocksize, - blocksize, page, NULL); + blocksize, folio, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); (*nr_salvaged_blocks)++; goto next; failed_page: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); failed_inode: nilfs_warn(sb, @@ -716,6 +716,33 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, } /** + * nilfs_abort_roll_forward - cleaning up after a failed rollforward recovery + * @nilfs: nilfs object + */ +static void nilfs_abort_roll_forward(struct the_nilfs *nilfs) +{ + struct nilfs_inode_info *ii, *n; + LIST_HEAD(head); + + /* Abandon inodes that have read recovery data */ + spin_lock(&nilfs->ns_inode_lock); + list_splice_init(&nilfs->ns_dirty_files, &head); + spin_unlock(&nilfs->ns_inode_lock); + if (list_empty(&head)) + return; + + set_nilfs_purging(nilfs); + list_for_each_entry_safe(ii, n, &head, i_dirty) { + spin_lock(&nilfs->ns_inode_lock); + list_del_init(&ii->i_dirty); + spin_unlock(&nilfs->ns_inode_lock); + + iput(&ii->vfs_inode); + } + clear_nilfs_purging(nilfs); +} + +/** * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint * @nilfs: nilfs object * @sb: super block instance @@ -773,15 +800,19 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, if (unlikely(err)) { nilfs_err(sb, "error %d writing segment for recovery", err); - goto failed; + goto put_root; } nilfs_finish_roll_forward(nilfs, ri); } - failed: +put_root: nilfs_put_root(root); return err; + +failed: + nilfs_abort_roll_forward(nilfs); + goto put_root; } /** diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0ca3110d6386..871ec35ea8e8 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1812,6 +1812,9 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, nilfs_abort_logs(&logs, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); + if (list_empty(&logs)) + return; /* if the first segment buffer preparation failed */ + nilfs_cancel_segusage(&logs, nilfs->ns_sufile); nilfs_free_incomplete_logs(&logs, nilfs); @@ -2056,7 +2059,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) err = nilfs_segctor_begin_construction(sci, nilfs); if (unlikely(err)) - goto out; + goto failed; /* Update time stamp */ sci->sc_seg_ctime = ktime_get_real_seconds(); @@ -2120,10 +2123,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) return err; failed_to_write: - if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) - nilfs_redirty_inodes(&sci->sc_dirty_files); - failed: + if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_IFILE) + nilfs_redirty_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) nilfs_redirty_inodes(&sci->sc_gc_inodes); nilfs_segctor_abort_construction(sci, nilfs, err); diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index a5569b7f47a3..14868a3dd592 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -836,9 +836,15 @@ ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u32 major = le32_to_cpu(sbp[0]->s_rev_level); - u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level); + struct nilfs_super_block *raw_sb; + u32 major; + u16 minor; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + major = le32_to_cpu(raw_sb->s_rev_level); + minor = le16_to_cpu(raw_sb->s_minor_rev_level); + up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%d.%d\n", major, minor); } @@ -856,8 +862,13 @@ ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size); + struct nilfs_super_block *raw_sb; + u64 dev_size; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + dev_size = le64_to_cpu(raw_sb->s_dev_size); + up_read(&nilfs->ns_sem); return sysfs_emit(buf, "%llu\n", dev_size); } @@ -879,9 +890,15 @@ ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_super_block *raw_sb; + ssize_t len; - return sysfs_emit(buf, "%pUb\n", sbp[0]->s_uuid); + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + len = sysfs_emit(buf, "%pUb\n", raw_sb->s_uuid); + up_read(&nilfs->ns_sem); + + return len; } static @@ -889,10 +906,16 @@ ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct nilfs_super_block *raw_sb; + ssize_t len; + + down_read(&nilfs->ns_sem); + raw_sb = nilfs->ns_sbp[0]; + len = scnprintf(buf, sizeof(raw_sb->s_volume_name), "%s\n", + raw_sb->s_volume_name); + up_read(&nilfs->ns_sem); - return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n", - sbp[0]->s_volume_name); + return len; } static const char dev_readme_str[] = diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index ca1ddc46bd86..6202895a4542 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -182,7 +182,7 @@ static int ntfs_extend_initialized_size(struct file *file, for (;;) { u32 zerofrom, len; - struct page *page; + struct folio *folio; u8 bits; CLST vcn, lcn, clen; @@ -208,14 +208,13 @@ static int ntfs_extend_initialized_size(struct file *file, if (pos + len > new_valid) len = new_valid - pos; - err = ntfs_write_begin(file, mapping, pos, len, &page, NULL); + err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL); if (err) goto out; - zero_user_segment(page, zerofrom, PAGE_SIZE); + folio_zero_range(folio, zerofrom, folio_size(folio)); - /* This function in any case puts page. */ - err = ntfs_write_end(file, mapping, pos, len, len, page, NULL); + err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 6b0bdc474e76..f672072e6bd4 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -901,7 +901,7 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, } int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, struct page **pagep, void **fsdata) + loff_t pos, u32 len, struct folio **foliop, void **fsdata) { int err; struct inode *inode = mapping->host; @@ -910,7 +910,6 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; - *pagep = NULL; if (is_resident(ni)) { struct folio *folio = __filemap_get_folio( mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, @@ -926,7 +925,7 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, ni_unlock(ni); if (!err) { - *pagep = &folio->page; + *foliop = folio; goto out; } folio_unlock(folio); @@ -936,7 +935,7 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, goto out; } - err = block_write_begin(mapping, pos, len, pagep, + err = block_write_begin(mapping, pos, len, foliop, ntfs_get_block_write_begin); out: @@ -947,9 +946,8 @@ out: * ntfs_write_end - Address_space_operations::write_end. */ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, - u32 len, u32 copied, struct page *page, void *fsdata) + u32 len, u32 copied, struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; @@ -979,7 +977,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, folio_unlock(folio); folio_put(folio); } else { - err = generic_write_end(file, mapping, pos, len, copied, page, + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); } @@ -1008,45 +1006,6 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, return err; } -int reset_log_file(struct inode *inode) -{ - int err; - loff_t pos = 0; - u32 log_size = inode->i_size; - struct address_space *mapping = inode->i_mapping; - - for (;;) { - u32 len; - void *kaddr; - struct page *page; - - len = pos + PAGE_SIZE > log_size ? (log_size - pos) : PAGE_SIZE; - - err = block_write_begin(mapping, pos, len, &page, - ntfs_get_block_write_begin); - if (err) - goto out; - - kaddr = kmap_atomic(page); - memset(kaddr, -1, len); - kunmap_atomic(kaddr); - flush_dcache_page(page); - - err = block_write_end(NULL, mapping, pos, len, len, page, NULL); - if (err < 0) - goto out; - pos += len; - - if (pos >= log_size) - break; - balance_dirty_pages_ratelimited(mapping); - } -out: - mark_inode_dirty_sync(inode); - - return err; -} - int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc) { return _ni_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index e5255a251929..584f814715f4 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -708,13 +708,12 @@ int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, const struct cpu_str *name); int ntfs_set_size(struct inode *inode, u64 new_size); -int reset_log_file(struct inode *inode); int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, struct page **pagep, void **fsdata); + loff_t pos, u32 len, struct folio **foliop, void **fsdata); int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, - u32 len, u32 copied, struct page *page, void *fsdata); + u32 len, u32 copied, struct folio *folio, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 6be175a1ab3c..d6c985cc6353 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1643,7 +1643,7 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, ocfs2_write_type_t type, - struct page **pagep, void **fsdata, + struct folio **foliop, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; @@ -1826,8 +1826,8 @@ try_again: ocfs2_free_alloc_context(meta_ac); success: - if (pagep) - *pagep = wc->w_target_page; + if (foliop) + *foliop = page_folio(wc->w_target_page); *fsdata = wc; return 0; out_quota: @@ -1879,7 +1879,7 @@ out: static int ocfs2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; struct buffer_head *di_bh = NULL; @@ -1901,7 +1901,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, - pagep, fsdata, di_bh, NULL); + foliop, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); goto out_fail; @@ -2076,7 +2076,7 @@ out: static int ocfs2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; struct inode *inode = mapping->host; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 3a520117fa59..45db1781ea73 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -38,7 +38,7 @@ typedef enum { int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, ocfs2_write_type_t type, - struct page **pagep, void **fsdata, + struct folio **foliop, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); int ocfs2_read_inline_data(struct inode *inode, struct page *page, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 115ab2172820..ad131a2fc58e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -755,7 +755,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, u64 abs_to, struct buffer_head *di_bh) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; unsigned long index = abs_from >> PAGE_SHIFT; handle_t *handle; int ret = 0; @@ -774,9 +774,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, goto out; } - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); mlog_errno(ret); goto out_commit_trans; } @@ -803,7 +804,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, * __block_write_begin and block_commit_write to zero the * whole block. */ - ret = __block_write_begin(page, block_start + 1, 0, + ret = __block_write_begin(folio, block_start + 1, 0, ocfs2_get_block); if (ret < 0) { mlog_errno(ret); @@ -812,7 +813,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! */ - block_commit_write(page, block_start + 1, block_start + 1); + block_commit_write(&folio->page, block_start + 1, block_start + 1); } /* @@ -833,8 +834,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_commit_trans: if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 1834f26522ed..6ef4cb045ccd 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -53,7 +53,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, loff_t pos = page_offset(page); unsigned int len = PAGE_SIZE; pgoff_t last_index; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; void *fsdata; loff_t size = i_size_read(inode); @@ -91,7 +91,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, len = ((size - 1) & ~PAGE_MASK) + 1; err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, - &locked_page, &fsdata, di_bh, page); + &locked_folio, &fsdata, di_bh, page); if (err) { if (err != -ENOSPC) mlog_errno(err); @@ -99,7 +99,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, goto out; } - if (!locked_page) { + if (!locked_folio) { ret = VM_FAULT_NOPAGE; goto out; } diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 6b580b9da8e3..98358d405b6a 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -312,11 +312,11 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) static int omfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, omfs_get_block); + ret = block_write_begin(mapping, pos, len, foliop, omfs_get_block); if (unlikely(ret)) omfs_write_failed(mapping, pos + len); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index fdb9b65db1de..aae6d2b8767d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -309,22 +309,18 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) static int orangefs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct orangefs_write_range *wr; struct folio *folio; - struct page *page; - pgoff_t index; int ret; - index = pos >> PAGE_SHIFT; + folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - - *pagep = page; - folio = page_folio(page); + *foliop = folio; if (folio_test_dirty(folio) && !folio_test_private(folio)) { /* @@ -365,9 +361,10 @@ okay: } static int orangefs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, struct folio *folio, + void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; /* @@ -377,23 +374,23 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping, if (last_pos > inode->i_size) i_size_write(inode, last_pos); - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page)) { + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio)) { unsigned from = pos & (PAGE_SIZE - 1); if (copied < len) { - zero_user(page, from + copied, len - copied); + folio_zero_range(folio, from + copied, len - copied); } /* Set fully written pages uptodate. */ - if (pos == page_offset(page) && + if (pos == folio_pos(folio) && (len == PAGE_SIZE || pos + len == inode->i_size)) { - zero_user_segment(page, from + copied, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, from + copied, PAGE_SIZE); + folio_mark_uptodate(folio); } } - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); mark_inode_dirty_sync(file_inode(file)); return copied; diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 4860fcc4611b..d0568c091341 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -353,6 +353,8 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, case Opt_datadir_add: ctx->nr_data++; fallthrough; + case Opt_lowerdir: + fallthrough; case Opt_lowerdir_add: WARN_ON(ctx->nr >= ctx->capacity); l = &ctx->lower[ctx->nr++]; @@ -365,10 +367,9 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, } } -static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, - enum ovl_opt layer) +static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer) { - char *name = kstrdup(param->string, GFP_KERNEL); + char *name = kstrdup(layer_name, GFP_KERNEL); bool upper = (layer == Opt_upperdir || layer == Opt_workdir); struct path path; int err; @@ -376,7 +377,7 @@ static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, if (!name) return -ENOMEM; - if (upper) + if (upper || layer == Opt_lowerdir) err = ovl_mount_dir(name, &path); else err = ovl_mount_dir_noesc(name, &path); @@ -432,7 +433,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) { int err; struct ovl_fs_context *ctx = fc->fs_private; - struct ovl_fs_context_layer *l; char *dup = NULL, *iter; ssize_t nr_lower, nr; bool data_layer = false; @@ -449,7 +449,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) return 0; if (*name == ':') { - pr_err("cannot append lower layer"); + pr_err("cannot append lower layer\n"); return -EINVAL; } @@ -472,35 +472,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) goto out_err; } - if (nr_lower > ctx->capacity) { - err = -ENOMEM; - l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower), - GFP_KERNEL_ACCOUNT); - if (!l) - goto out_err; - - ctx->lower = l; - ctx->capacity = nr_lower; - } - iter = dup; - l = ctx->lower; - for (nr = 0; nr < nr_lower; nr++, l++) { - ctx->nr++; - memset(l, 0, sizeof(*l)); - - err = ovl_mount_dir(iter, &l->path); + for (nr = 0; nr < nr_lower; nr++) { + err = ovl_parse_layer(fc, iter, Opt_lowerdir); if (err) - goto out_put; - - err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false); - if (err) - goto out_put; - - err = -ENOMEM; - l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT); - if (!l->name) - goto out_put; + goto out_err; if (data_layer) ctx->nr_data++; @@ -517,8 +493,8 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) * there are no data layers. */ if (ctx->nr_data > 0) { - pr_err("regular lower layers cannot follow data lower layers"); - goto out_put; + pr_err("regular lower layers cannot follow data lower layers\n"); + goto out_err; } data_layer = false; @@ -532,9 +508,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) kfree(dup); return 0; -out_put: - ovl_reset_lowerdirs(ctx); - out_err: kfree(dup); @@ -582,7 +555,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_datadir_add: case Opt_upperdir: case Opt_workdir: - err = ovl_parse_layer(fc, param, opt); + err = ovl_parse_layer(fc, param->string, opt); break; case Opt_default_permissions: config->default_permissions = true; diff --git a/fs/pipe.c b/fs/pipe.c index d3735f1d6f00..4083ba492cb6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1429,7 +1429,7 @@ static const struct super_operations pipefs_ops = { /* * pipefs should _never_ be mounted by userland - too much of security hassle, - * no real gain from having the whole whorehouse mounted. So we don't need + * no real gain from having the whole file system mounted. So we don't need * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. */ diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 3f87297dbfdb..6c66a37522d0 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -715,8 +715,8 @@ int posix_acl_update_mode(struct mnt_idmap *idmap, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && - !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) + if (!in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; *mode_p = mode; return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 988f6bbfc4bd..7f3abc3de49f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -827,12 +827,9 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) static int mem_open(struct inode *inode, struct file *file) { - int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); - - /* OK to pass negative loff_t, we can catch out-of-range */ - file->f_mode |= FMODE_UNSIGNED_OFFSET; - - return ret; + if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) + return -EINVAL; + return __mem_open(inode, file, PTRACE_MODE_ATTACH); } static ssize_t mem_rw(struct file *file, char __user *buf, @@ -932,6 +929,7 @@ static const struct file_operations proc_mem_operations = { .write = mem_write, .open = mem_open, .release = mem_release, + .fop_flags = FOP_UNSIGNED_OFFSET, }; static int environ_open(struct inode *inode, struct file *file) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 586bbc84ca04..7baafb1eba13 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -59,7 +59,7 @@ static int seq_show(struct seq_file *m, void *v) real_mount(file->f_path.mnt)->mnt_id, file_inode(file)->i_ino); - /* show_fd_locks() never deferences files so a stale value is safe */ + /* show_fd_locks() never dereferences files, so a stale value is safe */ show_fd_locks(m, file, files); if (seq_has_overflowed(m)) goto out; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8e08a9a1b7ed..7d0acdad74e2 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -235,7 +235,7 @@ static int kcore_ram_list(struct list_head *list) int nid, ret; unsigned long end_pfn; - /* Not inialized....update now */ + /* Not initialized....update now */ /* find out "max pfn" */ end_pfn = 0; for_each_node_state(nid, N_MEMORY) { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5f171ad7b436..2c5f4814aef9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -976,7 +976,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", +#if VM_PKEY_BIT3 [ilog2(VM_PKEY_BIT3)] = "", +#endif #if VM_PKEY_BIT4 [ilog2(VM_PKEY_BIT4)] = "", #endif diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index c1cfb8a19e9d..b4d10e45f2e4 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -24,13 +24,15 @@ static unsigned qnx6_lfile_checksum(char *name, unsigned size) return crc; } -static struct page *qnx6_get_page(struct inode *dir, unsigned long n) +static void *qnx6_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) - kmap(page); - return page; + struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); + + if (IS_ERR(folio)) + return folio; + *foliop = folio; + return kmap_local_folio(folio, 0); } static unsigned last_entry(struct inode *inode, unsigned long page_nr) @@ -44,19 +46,20 @@ static unsigned last_entry(struct inode *inode, unsigned long page_nr) static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, struct qnx6_long_dir_entry *de, - struct page **p) + struct folio **foliop) { struct qnx6_sb_info *sbi = QNX6_SB(sb); u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */ u32 n = s >> (PAGE_SHIFT - sb->s_blocksize_bits); /* in pages */ - /* within page */ - u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_MASK; + u32 offs; struct address_space *mapping = sbi->longfile->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); - kmap(*p = page); - return (struct qnx6_long_filename *)(page_address(page) + offs); + struct folio *folio = read_mapping_folio(mapping, n, NULL); + + if (IS_ERR(folio)) + return ERR_CAST(folio); + offs = offset_in_folio(folio, s << sb->s_blocksize_bits); + *foliop = folio; + return kmap_local_folio(folio, offs); } static int qnx6_dir_longfilename(struct inode *inode, @@ -67,7 +70,7 @@ static int qnx6_dir_longfilename(struct inode *inode, struct qnx6_long_filename *lf; struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - struct page *page; + struct folio *folio; int lf_size; if (de->de_size != 0xff) { @@ -76,7 +79,7 @@ static int qnx6_dir_longfilename(struct inode *inode, pr_err("invalid direntry size (%i).\n", de->de_size); return 0; } - lf = qnx6_longname(s, de, &page); + lf = qnx6_longname(s, de, &folio); if (IS_ERR(lf)) { pr_err("Error reading longname\n"); return 0; @@ -87,7 +90,7 @@ static int qnx6_dir_longfilename(struct inode *inode, if (lf_size > QNX6_LONG_NAME_MAX) { pr_debug("file %s\n", lf->lf_fname); pr_err("Filename too long (%i)\n", lf_size); - qnx6_put_page(page); + folio_release_kmap(folio, lf); return 0; } @@ -100,11 +103,11 @@ static int qnx6_dir_longfilename(struct inode *inode, pr_debug("qnx6_readdir:%.*s inode:%u\n", lf_size, lf->lf_fname, de_inode); if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { - qnx6_put_page(page); + folio_release_kmap(folio, lf); return 0; } - qnx6_put_page(page); + folio_release_kmap(folio, lf); /* success */ return 1; } @@ -117,26 +120,27 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1); unsigned long npages = dir_pages(inode); unsigned long n = pos >> PAGE_SHIFT; - unsigned start = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE; + unsigned offset = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE; bool done = false; ctx->pos = pos; if (ctx->pos >= inode->i_size) return 0; - for ( ; !done && n < npages; n++, start = 0) { - struct page *page = qnx6_get_page(inode, n); - int limit = last_entry(inode, n); + for ( ; !done && n < npages; n++, offset = 0) { struct qnx6_dir_entry *de; - int i = start; + struct folio *folio; + char *kaddr = qnx6_get_folio(inode, n, &folio); + char *limit; - if (IS_ERR(page)) { + if (IS_ERR(kaddr)) { pr_err("%s(): read failed\n", __func__); ctx->pos = (n + 1) << PAGE_SHIFT; - return PTR_ERR(page); + return PTR_ERR(kaddr); } - de = ((struct qnx6_dir_entry *)page_address(page)) + start; - for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { + de = (struct qnx6_dir_entry *)(kaddr + offset); + limit = kaddr + last_entry(inode, n); + for (; (char *)de < limit; de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { int size = de->de_size; u32 no_inode = fs32_to_cpu(sbi, de->de_inode); @@ -164,7 +168,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) } } } - qnx6_put_page(page); + folio_release_kmap(folio, kaddr); } return 0; } @@ -177,23 +181,23 @@ static unsigned qnx6_long_match(int len, const char *name, { struct super_block *s = dir->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - struct page *page; + struct folio *folio; int thislen; - struct qnx6_long_filename *lf = qnx6_longname(s, de, &page); + struct qnx6_long_filename *lf = qnx6_longname(s, de, &folio); if (IS_ERR(lf)) return 0; thislen = fs16_to_cpu(sbi, lf->lf_size); if (len != thislen) { - qnx6_put_page(page); + folio_release_kmap(folio, lf); return 0; } if (memcmp(name, lf->lf_fname, len) == 0) { - qnx6_put_page(page); + folio_release_kmap(folio, lf); return fs32_to_cpu(sbi, de->de_inode); } - qnx6_put_page(page); + folio_release_kmap(folio, lf); return 0; } @@ -210,20 +214,17 @@ static unsigned qnx6_match(struct super_block *s, int len, const char *name, } -unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, - struct page **res_page) +unsigned qnx6_find_ino(int len, struct inode *dir, const char *name) { struct super_block *s = dir->i_sb; struct qnx6_inode_info *ei = QNX6_I(dir); - struct page *page = NULL; + struct folio *folio; unsigned long start, n; unsigned long npages = dir_pages(dir); unsigned ino; struct qnx6_dir_entry *de; struct qnx6_long_dir_entry *lde; - *res_page = NULL; - if (npages == 0) return 0; start = ei->i_dir_start_lookup; @@ -232,12 +233,11 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, n = start; do { - page = qnx6_get_page(dir, n); - if (!IS_ERR(page)) { + de = qnx6_get_folio(dir, n, &folio); + if (!IS_ERR(de)) { int limit = last_entry(dir, n); int i; - de = (struct qnx6_dir_entry *)page_address(page); for (i = 0; i < limit; i++, de++) { if (len <= QNX6_SHORT_NAME_MAX) { /* short filename */ @@ -256,7 +256,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, } else pr_err("undefined filename size in inode.\n"); } - qnx6_put_page(page); + folio_release_kmap(folio, de - i); } if (++n >= npages) @@ -265,8 +265,8 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, return 0; found: - *res_page = page; ei->i_dir_start_lookup = n; + folio_release_kmap(folio, de); return ino; } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 4f1735b882b1..85925ec0051a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -184,17 +184,17 @@ static const char *qnx6_checkroot(struct super_block *s) struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; - struct page *page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) + struct folio *folio = read_mapping_folio(mapping, 0, NULL); + + if (IS_ERR(folio)) return "error reading root directory"; - kmap(page); - dir_entry = page_address(page); + dir_entry = kmap_local_folio(folio, 0); for (i = 0; i < 2; i++) { /* maximum 3 bytes - due to match_root limitation */ if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) error = 1; } - qnx6_put_page(page); + folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; return NULL; @@ -518,7 +518,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) struct inode *inode; struct qnx6_inode_info *ei; struct address_space *mapping; - struct page *page; + struct folio *folio; u32 n, offs; inode = iget_locked(sb, ino); @@ -538,17 +538,16 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) return ERR_PTR(-EIO); } n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS); - offs = (ino - 1) & (~PAGE_MASK >> QNX6_INODE_SIZE_BITS); mapping = sbi->inodes->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) { + folio = read_mapping_folio(mapping, n, NULL); + if (IS_ERR(folio)) { pr_err("major problem: unable to read inode from dev %s\n", sb->s_id); iget_failed(inode); - return ERR_CAST(page); + return ERR_CAST(folio); } - kmap(page); - raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs; + offs = offset_in_folio(folio, (ino - 1) << QNX6_INODE_SIZE_BITS); + raw_inode = kmap_local_folio(folio, offs); inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode); i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); @@ -578,7 +577,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mapping->a_ops = &qnx6_aops; } else init_special_inode(inode, inode->i_mode, 0); - qnx6_put_page(page); + folio_release_kmap(folio, raw_inode); unlock_new_inode(inode); return inode; } diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index e2e98e653b8d..0f0755a9ecb5 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -17,7 +17,6 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned ino; - struct page *page; struct inode *foundinode = NULL; const char *name = dentry->d_name.name; int len = dentry->d_name.len; @@ -25,10 +24,9 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, if (len > QNX6_LONG_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - ino = qnx6_find_entry(len, dir, name, &page); + ino = qnx6_find_ino(len, dir, name); if (ino) { foundinode = qnx6_iget(dir->i_sb, ino); - qnx6_put_page(page); if (IS_ERR(foundinode)) pr_debug("lookup->iget -> error %ld\n", PTR_ERR(foundinode)); diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index 34a6b126a3a9..56ed1367499e 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -126,11 +126,4 @@ static inline __fs16 cpu_to_fs16(struct qnx6_sb_info *sbi, __u16 n) extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent); -static inline void qnx6_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - -extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, - struct page **res_page); +unsigned qnx6_find_ino(int len, struct inode *dir, const char *name); diff --git a/fs/read_write.c b/fs/read_write.c index b19cce8e55b9..070a7c33b9dd 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { - return file->f_mode & FMODE_UNSIGNED_OFFSET; + return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET; } /** diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9b43a81a6488..72c53129c952 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2178,7 +2178,7 @@ static int grab_tail_page(struct inode *inode, unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1); struct buffer_head *bh; struct buffer_head *head; - struct page *page; + struct folio *folio; int error; /* @@ -2190,20 +2190,20 @@ static int grab_tail_page(struct inode *inode, if ((offset & (blocksize - 1)) == 0) { return -ENOENT; } - page = grab_cache_page(inode->i_mapping, index); - error = -ENOMEM; - if (!page) { - goto out; - } + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); /* start within the page of the last block in the file */ start = (offset / blocksize) * blocksize; - error = __block_write_begin(page, start, offset - start, + error = __block_write_begin(folio, start, offset - start, reiserfs_get_block_create_0); if (error) goto unlock; - head = page_buffers(page); + head = folio_buffers(folio); bh = head; do { if (pos >= start) { @@ -2226,14 +2226,13 @@ static int grab_tail_page(struct inode *inode, goto unlock; } *bh_result = bh; - *page_result = page; + *page_result = &folio->page; -out: return error; unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return error; } @@ -2736,23 +2735,24 @@ static void reiserfs_truncate_failed_write(struct inode *inode) static int reiserfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode; - struct page *page; + struct folio *folio; pgoff_t index; int ret; int old_ref = 0; inode = mapping->host; index = pos >> PAGE_SHIFT; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *foliop = folio; reiserfs_wait_on_write_block(inode->i_sb); - fix_tail_page_for_writing(page); + fix_tail_page_for_writing(&folio->page); if (reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th; th = (struct reiserfs_transaction_handle *)current-> @@ -2762,7 +2762,7 @@ static int reiserfs_write_begin(struct file *file, old_ref = th->t_refcount; th->t_refcount++; } - ret = __block_write_begin(page, pos, len, reiserfs_get_block); + ret = __block_write_begin(folio, pos, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* @@ -2792,8 +2792,8 @@ static int reiserfs_write_begin(struct file *file, } } if (ret) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* Truncate allocated blocks */ reiserfs_truncate_failed_write(inode); } @@ -2822,7 +2822,7 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) th->t_refcount++; } - ret = __block_write_begin(page, from, len, reiserfs_get_block); + ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* @@ -2862,10 +2862,9 @@ static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) static int reiserfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int ret = 0; int update_sd = 0; struct reiserfs_transaction_handle *th; @@ -2887,7 +2886,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, } flush_dcache_folio(folio); - reiserfs_commit_page(inode, page, start, start + copied); + reiserfs_commit_page(inode, &folio->page, start, start + copied); /* * generic_commit_write does this for us, but does not update the @@ -2942,8 +2941,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, out: if (locked) reiserfs_write_unlock(inode->i_sb); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (pos + len > inode->i_size) reiserfs_truncate_failed_write(inode); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 68758b6fed94..0addcc849ff2 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -126,7 +126,7 @@ static int romfs_read_folio(struct file *file, struct folio *folio) } } - buf = folio_zero_tail(folio, fillsize, buf); + buf = folio_zero_tail(folio, fillsize, buf + fillsize); kunmap_local(buf); folio_end_read(folio, ret == 0); return ret; diff --git a/fs/select.c b/fs/select.c index 9515c3fa1a03..1a4849e2afb9 100644 --- a/fs/select.c +++ b/fs/select.c @@ -840,7 +840,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; unsigned int len; - struct pollfd entries[]; + struct pollfd entries[] __counted_by(len); }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 6322f0f68a17..b0473c2567fe 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -129,7 +129,7 @@ static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize, for (j = foffset / PAGE_SIZE; j < npages; j++) { len = min_t(size_t, maxsize, PAGE_SIZE - offset); p = kmap_local_page(folio_page(folio, j)); - ret = crypto_shash_update(shash, p, len); + ret = crypto_shash_update(shash, p + offset, len); kunmap_local(p); if (ret < 0) return ret; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 2c4b357d85e2..2a2523c93944 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -75,9 +75,9 @@ unsigned int sign_CIFS_PDUs = 1; /* * Global transaction id (XID) information */ -unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ -unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */ +unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */ +unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */ spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ /* @@ -1341,7 +1341,6 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; struct cifs_tcon *target_tcon; - unsigned long long destend, fstart, fend; ssize_t rc; cifs_dbg(FYI, "copychunk range\n"); @@ -1391,25 +1390,13 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, goto unlock; } - destend = destoff + len - 1; - - /* Flush the folios at either end of the destination range to prevent - * accidental loss of dirty data outside of the range. + /* Flush and invalidate all the folios in the destination region. If + * the copy was successful, then some of the flush is extra overhead, + * but we need to allow for the copy failing in some way (eg. ENOSPC). */ - fstart = destoff; - fend = destend; - - rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true); + rc = filemap_invalidate_inode(target_inode, true, destoff, destoff + len - 1); if (rc) goto unlock; - rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false); - if (rc) - goto unlock; - if (fend > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = fend + 1; - - /* Discard all the folios that overlap the destination region. */ - truncate_inode_pages_range(&target_inode->i_data, fstart, fend); fscache_invalidate(cifs_inode_cookie(target_inode), NULL, i_size_read(target_inode), 0); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 5c9b3e6cd95f..9eae8649f90c 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -254,7 +254,6 @@ struct cifs_open_info_data { struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in array */ - size_t rq_iter_size; /* Amount of data in ->rq_iter */ struct iov_iter rq_iter; /* Data iterator */ struct xarray rq_buffer; /* Page buffer for encryption */ }; @@ -1486,6 +1485,7 @@ struct cifs_io_subrequest { struct cifs_io_request *req; }; ssize_t got_bytes; + size_t actual_len; unsigned int xid; int result; bool have_xid; @@ -2017,9 +2017,9 @@ extern spinlock_t cifs_tcp_ses_lock; /* * Global transaction id (XID) information */ -extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ -extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */ +extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */ +extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */ extern spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ /* diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 595c4b673707..cfae2e918209 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1261,16 +1261,32 @@ openRetry: return rc; } +static void cifs_readv_worker(struct work_struct *work) +{ + struct cifs_io_subrequest *rdata = + container_of(work, struct cifs_io_subrequest, subreq.work); + + netfs_subreq_terminated(&rdata->subreq, + (rdata->result == 0 || rdata->result == -EAGAIN) ? + rdata->got_bytes : rdata->result, true); +} + static void cifs_readv_callback(struct mid_q_entry *mid) { struct cifs_io_subrequest *rdata = mid->callback_data; + struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_iter = rdata->subreq.io_iter }; - struct cifs_credits credits = { .value = 1, .instance = 0 }; + struct cifs_credits credits = { + .value = 1, + .instance = 0, + .rreq_debug_id = rdata->rreq->debug_id, + .rreq_debug_index = rdata->subreq.debug_index, + }; cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n", __func__, mid->mid, mid->mid_state, rdata->result, @@ -1282,6 +1298,7 @@ cifs_readv_callback(struct mid_q_entry *mid) if (server->sign) { int rc = 0; + iov_iter_truncate(&rqst.rq_iter, rdata->got_bytes); rc = cifs_verify_signature(&rqst, server, mid->sequence_number); if (rc) @@ -1306,13 +1323,21 @@ cifs_readv_callback(struct mid_q_entry *mid) rdata->result = -EIO; } - if (rdata->result == 0 || rdata->result == -EAGAIN) - iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes); + if (rdata->result == -ENODATA) { + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } else { + if (rdata->got_bytes < rdata->actual_len && + rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes == + ictx->remote_i_size) { + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } + } + rdata->credits.value = 0; - netfs_subreq_terminated(&rdata->subreq, - (rdata->result == 0 || rdata->result == -EAGAIN) ? - rdata->got_bytes : rdata->result, - false); + INIT_WORK(&rdata->subreq.work, cifs_readv_worker); + queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); add_credits(server, &credits, 0); } @@ -1619,9 +1644,15 @@ static void cifs_writev_callback(struct mid_q_entry *mid) { struct cifs_io_subrequest *wdata = mid->callback_data; + struct TCP_Server_Info *server = wdata->server; struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink); WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; - struct cifs_credits credits = { .value = 1, .instance = 0 }; + struct cifs_credits credits = { + .value = 1, + .instance = 0, + .rreq_debug_id = wdata->rreq->debug_id, + .rreq_debug_index = wdata->subreq.debug_index, + }; ssize_t result; size_t written; @@ -1657,9 +1688,16 @@ cifs_writev_callback(struct mid_q_entry *mid) break; } + trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, + wdata->credits.value, + server->credits, server->in_flight, + 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; cifs_write_subrequest_terminated(wdata, result, true); release_mid(mid); + trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, 0, + server->credits, server->in_flight, + credits.value, cifs_trace_rw_credits_write_response_add); add_credits(tcon->ses->server, &credits, 0); } @@ -1713,7 +1751,6 @@ cifs_async_writev(struct cifs_io_subrequest *wdata) rqst.rq_iov = iov; rqst.rq_nvec = 2; rqst.rq_iter = wdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&wdata->subreq.io_iter); cifs_dbg(FYI, "async write at %llu %zu bytes\n", wdata->subreq.start, wdata->subreq.len); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index d2307162a2de..5375b0c1dfb9 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -657,6 +657,19 @@ static bool server_unresponsive(struct TCP_Server_Info *server) { /* + * If we're in the process of mounting a share or reconnecting a session + * and the server abruptly shut down (e.g. socket wasn't closed, packet + * had been ACK'ed but no SMB response), don't wait longer than 20s to + * negotiate protocol. + */ + spin_lock(&server->srv_lock); + if (server->tcpStatus == CifsInNegotiate && + time_after(jiffies, server->lstrp + 20 * HZ)) { + spin_unlock(&server->srv_lock); + cifs_reconnect(server, false); + return true; + } + /* * We need to wait 3 echo intervals to make sure we handle such * situations right: * 1s client sends a normal SMB request @@ -667,7 +680,6 @@ server_unresponsive(struct TCP_Server_Info *server) * 65s kernel_recvmsg times out, and we see that we haven't gotten * a response in >60s. */ - spin_lock(&server->srv_lock); if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && (!server->ops->can_echo || server->ops->can_echo(server)) && @@ -4194,6 +4206,9 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) * * If one doesn't exist then insert a new tcon_link struct into the tree and * try to construct a new one. + * + * REMEMBER to call cifs_put_tlink() after successful calls to cifs_sb_tlink, + * to avoid refcount issues */ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 1fc66bcf49eb..2d387485f05b 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -111,6 +111,7 @@ static void cifs_issue_write(struct netfs_io_subrequest *subreq) goto fail; } + wdata->actual_len = wdata->subreq.len; rc = adjust_credits(wdata->server, wdata, cifs_trace_rw_credits_issue_write_adjust); if (rc) goto fail; @@ -153,7 +154,7 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); struct TCP_Server_Info *server = req->server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); - size_t rsize = 0; + size_t rsize; int rc; rdata->xid = get_xid(); @@ -166,8 +167,8 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) cifs_sb->ctx); - rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, - &rdata->credits); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, + &rsize, &rdata->credits); if (rc) { subreq->error = rc; return false; @@ -183,7 +184,8 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) server->credits, server->in_flight, 0, cifs_trace_rw_credits_read_submit); - subreq->len = min_t(size_t, subreq->len, rsize); + subreq->len = umin(subreq->len, rsize); + rdata->actual_len = subreq->len; #ifdef CONFIG_CIFS_SMB_DIRECT if (server->smbd_conn) @@ -203,12 +205,39 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct TCP_Server_Info *server = req->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, subreq->transferred, subreq->len); + if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + /* + * As we're issuing a retry, we need to negotiate some new + * credits otherwise the server may reject the op with + * INVALID_PARAMETER. Note, however, we may get back less + * credit than we need to complete the op, in which case, we + * shorten the op and rely on additional rounds of retry. + */ + size_t rsize = umin(subreq->len - subreq->transferred, + cifs_sb->ctx->rsize); + + rc = server->ops->wait_mtu_credits(server, rsize, &rdata->actual_len, + &rdata->credits); + if (rc) + goto out; + + rdata->credits.in_flight_check = 1; + + trace_smb3_rw_credits(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->credits.value, + server->credits, server->in_flight, 0, + cifs_trace_rw_credits_read_resubmit); + } + if (req->cfile->invalidHandle) { do { rc = cifs_reopen_file(req->cfile, true); @@ -2912,9 +2941,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) if (!CIFS_CACHE_READ(cinode)) return netfs_unbuffered_read_iter(iocb, to); - if (cap_unix(tcon->ses) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) { if (iocb->ki_flags & IOCB_DIRECT) return netfs_unbuffered_read_iter(iocb, to); return netfs_buffered_read_iter(iocb, to); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index dd0afa23734c..73e2e6c230b7 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -172,6 +172,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, CIFS_I(inode)->time = 0; /* force reval */ return -ESTALE; } + if (inode->i_state & I_NEW) + CIFS_I(inode)->netfs.zero_point = fattr->cf_eof; cifs_revalidate_cache(inode, fattr); diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 44dbaf9929a4..9bb5c869f4db 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -229,9 +229,11 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) shutdown_good: trace_smb3_shutdown_done(flags, tcon->tid); + cifs_put_tlink(tlink); return 0; shutdown_out_err: trace_smb3_shutdown_err(rc, flags, tcon->tid); + cifs_put_tlink(tlink); return rc; } diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index d86da949a919..80099bbb333b 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -588,6 +588,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { rc = PTR_ERR(tlink); + /* BB could be clearer if skipped put_tlink on error here, but harmless */ goto symlink_exit; } pTcon = tlink_tcon(tlink); diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 689d8a506d45..48c27581ec51 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -378,6 +378,8 @@ int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, bool unicode, struct cifs_open_info_data *data) { + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + data->reparse.buf = buf; /* See MS-FSCC 2.1.2 */ @@ -394,12 +396,13 @@ int parse_reparse_point(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_LX_CHR: case IO_REPARSE_TAG_LX_BLK: - return 0; + break; default: - cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n", - __func__, le32_to_cpu(buf->ReparseTag)); - return -EOPNOTSUPP; + cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", + le32_to_cpu(buf->ReparseTag)); + break; } + return 0; } int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 9f5bc41433c1..11a1c53c64e0 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1106,6 +1106,8 @@ int smb2_rename_path(const unsigned int xid, co, DELETE, SMB2_OP_RENAME, cfile, source_dentry); if (rc == -EINVAL) { cifs_dbg(FYI, "invalid lease key, resending request without lease"); + cifs_get_writable_path(tcon, from_name, + FIND_WR_WITH_DELETE, &cfile); rc = smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, co, DELETE, SMB2_OP_RENAME, cfile, NULL); } @@ -1149,6 +1151,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, cfile, NULL, NULL, dentry); if (rc == -EINVAL) { cifs_dbg(FYI, "invalid lease key, resending request without lease"); + cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, &in_iov, &(int){SMB2_OP_SET_EOF}, 1, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 322cabc69c6f..e6540072ffb0 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -301,7 +301,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, unsigned int /*enum smb3_rw_credits_trace*/ trace) { struct cifs_credits *credits = &subreq->credits; - int new_val = DIV_ROUND_UP(subreq->subreq.len, SMB2_MAX_BUFFER_SIZE); + int new_val = DIV_ROUND_UP(subreq->actual_len, SMB2_MAX_BUFFER_SIZE); int scredits, in_flight; if (!credits->value || credits->value == new_val) @@ -316,7 +316,8 @@ smb2_adjust_credits(struct TCP_Server_Info *server, cifs_trace_rw_credits_no_adjust_up); trace_smb3_too_many_credits(server->CurrentMid, server->conn_id, server->hostname, 0, credits->value - new_val, 0); - cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)", + cifs_server_dbg(VFS, "R=%x[%x] request has less credits (%d) than required (%d)", + subreq->rreq->debug_id, subreq->subreq.debug_index, credits->value, new_val); return -EOPNOTSUPP; @@ -338,8 +339,9 @@ smb2_adjust_credits(struct TCP_Server_Info *server, trace_smb3_reconnect_detected(server->CurrentMid, server->conn_id, server->hostname, scredits, credits->value - new_val, in_flight); - cifs_server_dbg(VFS, "trying to return %d credits to old session\n", - credits->value - new_val); + cifs_server_dbg(VFS, "R=%x[%x] trying to return %d credits to old session\n", + subreq->rreq->debug_id, subreq->subreq.debug_index, + credits->value - new_val); return -EAGAIN; } @@ -3237,13 +3239,15 @@ static long smb3_zero_data(struct file *file, struct cifs_tcon *tcon, } static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, - loff_t offset, loff_t len, bool keep_size) + unsigned long long offset, unsigned long long len, + bool keep_size) { struct cifs_ses *ses = tcon->ses; struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - unsigned long long new_size; + struct netfs_inode *ictx = netfs_inode(inode); + unsigned long long i_size, new_size, remote_size; long rc; unsigned int xid; @@ -3255,6 +3259,16 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); + i_size = i_size_read(inode); + remote_size = ictx->remote_i_size; + if (offset + len >= remote_size && offset < i_size) { + unsigned long long top = umin(offset + len, i_size); + + rc = filemap_write_and_wait_range(inode->i_mapping, offset, top - 1); + if (rc < 0) + goto zero_range_exit; + } + /* * We zero the range through ioctl, so we need remove the page caches * first, otherwise the data may be inconsistent with the server. @@ -3305,6 +3319,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; + unsigned long long end = offset + len, i_size, remote_i_size; long rc; unsigned int xid; __u8 set_sparse = 1; @@ -3336,6 +3351,27 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, (char *)&fsctl_buf, sizeof(struct file_zero_data_information), CIFSMaxBufSize, NULL, NULL); + + if (rc) + goto unlock; + + /* If there's dirty data in the buffer that would extend the EOF if it + * were written, then we need to move the EOF marker over to the lower + * of the high end of the hole and the proposed EOF. The problem is + * that we locally hole-punch the tail of the dirty data, the proposed + * EOF update will end up in the wrong place. + */ + i_size = i_size_read(inode); + remote_i_size = netfs_inode(inode)->remote_i_size; + if (end > remote_i_size && i_size > remote_i_size) { + unsigned long long extend_to = umin(end, i_size); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, extend_to); + if (rc >= 0) + netfs_inode(inode)->remote_i_size = extend_to; + } + +unlock: filemap_invalidate_unlock(inode->i_mapping); out: inode_unlock(inode); @@ -4446,7 +4482,6 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, } iov_iter_xarray(&new->rq_iter, ITER_SOURCE, buffer, 0, size); - new->rq_iter_size = size; } } @@ -4492,7 +4527,6 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, rqst.rq_nvec = 2; if (iter) { rqst.rq_iter = *iter; - rqst.rq_iter_size = iov_iter_count(iter); iter_size = iov_iter_count(iter); } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 83facb54276a..88dc49d67037 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4441,7 +4441,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (smb3_use_rdma_offload(io_parms)) { + if (rdata && smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; bool need_invalidate = server->dialect == SMB30_PROT_ID; @@ -4507,6 +4507,7 @@ static void smb2_readv_callback(struct mid_q_entry *mid) { struct cifs_io_subrequest *rdata = mid->callback_data; + struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); struct TCP_Server_Info *server = rdata->server; struct smb2_hdr *shdr = @@ -4523,16 +4524,15 @@ smb2_readv_callback(struct mid_q_entry *mid) if (rdata->got_bytes) { rqst.rq_iter = rdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&rdata->subreq.io_iter); } WARN_ONCE(rdata->server != mid->server, "rdata server %p != mid server %p", rdata->server, mid->server); - cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n", + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu/%zu\n", __func__, mid->mid, mid->mid_state, rdata->result, - rdata->subreq.len); + rdata->actual_len, rdata->subreq.len - rdata->subreq.transferred); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -4586,22 +4586,29 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->subreq.debug_index, rdata->xid, rdata->req->cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, rdata->subreq.start, - rdata->subreq.len, rdata->result); + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->actual_len, + rdata->result); } else trace_smb3_read_done(rdata->rreq->debug_id, rdata->subreq.debug_index, rdata->xid, rdata->req->cfile->fid.persistent_fid, tcon->tid, tcon->ses->Suid, - rdata->subreq.start, rdata->got_bytes); + rdata->subreq.start + rdata->subreq.transferred, + rdata->got_bytes); if (rdata->result == -ENODATA) { - /* We may have got an EOF error because fallocate - * failed to enlarge the file. - */ - if (rdata->subreq.start < rdata->subreq.rreq->i_size) + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } else { + if (rdata->got_bytes < rdata->actual_len && + rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes == + ictx->remote_i_size) { + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + } } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, @@ -4622,6 +4629,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) { int rc, flags = 0; char *buf; + struct netfs_io_subrequest *subreq = &rdata->subreq; struct smb2_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = rdata->iov, @@ -4632,15 +4640,15 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) int credit_request; cifs_dbg(FYI, "%s: offset=%llu bytes=%zu\n", - __func__, rdata->subreq.start, rdata->subreq.len); + __func__, subreq->start, subreq->len); if (!rdata->server) rdata->server = cifs_pick_channel(tcon->ses); io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink); io_parms.server = server = rdata->server; - io_parms.offset = rdata->subreq.start; - io_parms.length = rdata->subreq.len; + io_parms.offset = subreq->start + subreq->transferred; + io_parms.length = rdata->actual_len; io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid; io_parms.pid = rdata->req->pid; @@ -4655,11 +4663,13 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) rdata->iov[0].iov_base = buf; rdata->iov[0].iov_len = total_len; + rdata->got_bytes = 0; + rdata->result = 0; shdr = (struct smb2_hdr *)buf; if (rdata->credits.value > 0) { - shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->subreq.len, + shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->actual_len, SMB2_MAX_BUFFER_SIZE)); credit_request = le16_to_cpu(shdr->CreditCharge) + 8; if (server->credits >= server->max_credits) @@ -4683,11 +4693,11 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) if (rc) { cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); trace_smb3_read_err(rdata->rreq->debug_id, - rdata->subreq.debug_index, + subreq->debug_index, rdata->xid, io_parms.persistent_fid, io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length, rc); + io_parms.offset, rdata->actual_len, rc); } async_readv_out: @@ -4914,6 +4924,13 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) if (rc) goto out; + rqst.rq_iov = iov; + rqst.rq_iter = wdata->subreq.io_iter; + + rqst.rq_iov[0].iov_len = total_len - 1; + rqst.rq_iov[0].iov_base = (char *)req; + rqst.rq_nvec += 1; + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -4925,6 +4942,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = SMB2_CHANNEL_NONE; + req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); req->DataOffset = cpu_to_le16( offsetof(struct smb2_write_req, Buffer)); @@ -4944,7 +4962,6 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) */ if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; - size_t data_size = iov_iter_count(&wdata->subreq.io_iter); bool need_invalidate = server->dialect == SMB30_PROT_ID; wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter, @@ -4953,9 +4970,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) rc = -EAGAIN; goto async_writev_out; } + /* For RDMA read, I/O size is in RemainingBytes not in Length */ + req->RemainingBytes = req->Length; req->Length = 0; req->DataOffset = 0; - req->RemainingBytes = cpu_to_le32(data_size); req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; @@ -4967,31 +4985,22 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) v1->offset = cpu_to_le64(wdata->mr->mr->iova); v1->token = cpu_to_le32(wdata->mr->mr->rkey); v1->length = cpu_to_le32(wdata->mr->mr->length); + + rqst.rq_iov[0].iov_len += sizeof(*v1); + + /* + * We keep wdata->subreq.io_iter, + * but we have to truncate rqst.rq_iter + */ + iov_iter_truncate(&rqst.rq_iter, 0); } #endif - iov[0].iov_len = total_len - 1; - iov[0].iov_base = (char *)req; - rqst.rq_iov = iov; - rqst.rq_nvec = 1; - rqst.rq_iter = wdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) smb2_set_replay(server, &rqst); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (wdata->mr) - iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); -#endif - cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", - io_parms->offset, io_parms->length, iov_iter_count(&rqst.rq_iter)); -#ifdef CONFIG_CIFS_SMB_DIRECT - /* For RDMA read, I/O size is in RemainingBytes not in Length */ - if (!wdata->mr) - req->Length = cpu_to_le32(io_parms->length); -#else - req->Length = cpu_to_le32(io_parms->length); -#endif + cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", + io_parms->offset, io_parms->length, iov_iter_count(&wdata->subreq.io_iter)); if (wdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->subreq.len, diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 0f0c10c7ada7..8e9964001e2a 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -30,6 +30,7 @@ EM(cifs_trace_rw_credits_old_session, "old-session") \ EM(cifs_trace_rw_credits_read_response_add, "rd-resp-add") \ EM(cifs_trace_rw_credits_read_response_clear, "rd-resp-clr") \ + EM(cifs_trace_rw_credits_read_resubmit, "rd-resubmit") \ EM(cifs_trace_rw_credits_read_submit, "rd-submit ") \ EM(cifs_trace_rw_credits_write_prepare, "wr-prepare ") \ EM(cifs_trace_rw_credits_write_response_add, "wr-resp-add") \ diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 09e1e7771592..7889df8112b4 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -165,11 +165,43 @@ void ksmbd_all_conn_set_status(u64 sess_id, u32 status) up_read(&conn_list_lock); } -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id) +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn) { wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2); } +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id) +{ + struct ksmbd_conn *conn; + int rc, retry_count = 0, max_timeout = 120; + int rcount = 1; + +retry_idle: + if (retry_count >= max_timeout) + return -EIO; + + down_read(&conn_list_lock); + list_for_each_entry(conn, &conn_list, conns_list) { + if (conn->binding || xa_load(&conn->sessions, sess_id)) { + if (conn == curr_conn) + rcount = 2; + if (atomic_read(&conn->req_running) >= rcount) { + rc = wait_event_timeout(conn->req_running_q, + atomic_read(&conn->req_running) < rcount, + HZ); + if (!rc) { + up_read(&conn_list_lock); + retry_count++; + goto retry_idle; + } + } + } + } + up_read(&conn_list_lock); + + return 0; +} + int ksmbd_conn_write(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 5c2845e47cf2..5b947175c048 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -145,7 +145,8 @@ extern struct list_head conn_list; extern struct rw_semaphore conn_list_lock; bool ksmbd_conn_alive(struct ksmbd_conn *conn); -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id); +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id); struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 162a12685d2c..99416ce9f501 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -311,6 +311,7 @@ void destroy_previous_session(struct ksmbd_conn *conn, { struct ksmbd_session *prev_sess; struct ksmbd_user *prev_user; + int err; down_write(&sessions_table_lock); down_write(&conn->session_lock); @@ -325,8 +326,16 @@ void destroy_previous_session(struct ksmbd_conn *conn, memcmp(user->passkey, prev_user->passkey, user->passkey_sz)) goto out; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT); + err = ksmbd_conn_wait_idle_sess_id(conn, id); + if (err) { + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); + goto out; + } + ksmbd_destroy_file_table(&prev_sess->file_table); prev_sess->state = SMB2_SESSION_EXPIRED; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); ksmbd_launch_ksmbd_durable_scavenger(); out: up_write(&conn->session_lock); diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index a8f52c4ebbda..e546ffa57b55 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1510,7 +1510,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) * parse_lease_state() - parse lease context containted in file open request * @open_req: buffer containing smb2 file open(create) request * - * Return: oplock state, -ENOENT if create lease context not found + * Return: allocated lease context object on success, otherwise NULL */ struct lease_ctx_info *parse_lease_state(void *open_req) { diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 2df1354288e6..8bdc59251418 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -519,7 +519,7 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work) * smb2_allocate_rsp_buf() - allocate smb2 response buffer * @work: smb work containing smb request buffer * - * Return: 0 on success, otherwise -ENOMEM + * Return: 0 on success, otherwise error */ int smb2_allocate_rsp_buf(struct ksmbd_work *work) { @@ -1370,7 +1370,8 @@ static int ntlm_negotiate(struct ksmbd_work *work, } sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); out: @@ -1453,7 +1454,9 @@ static int ntlm_authenticate(struct ksmbd_work *work, return -ENOMEM; sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, + spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); kfree(spnego_blob); } @@ -1687,6 +1690,8 @@ int smb2_sess_setup(struct ksmbd_work *work) rc = ksmbd_session_register(conn, sess); if (rc) goto out_err; + + conn->binding = false; } else if (conn->dialect >= SMB30_PROT_ID && (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && req->Flags & SMB2_SESSION_REQ_FLAG_BINDING) { @@ -1765,6 +1770,8 @@ int smb2_sess_setup(struct ksmbd_work *work) sess = NULL; goto out_err; } + + conn->binding = false; } work->sess = sess; @@ -2210,7 +2217,7 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_conn_unlock(conn); ksmbd_close_session_fds(work); - ksmbd_conn_wait_idle(conn, sess_id); + ksmbd_conn_wait_idle(conn); /* * Re-lookup session to validate if session is deleted @@ -2767,8 +2774,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work, } } - if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || - req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) { + if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || + req_op_level == SMB2_OPLOCK_LEVEL_BATCH) { dh_info->CreateGuid = durable_v2_blob->CreateGuid; dh_info->persistent = @@ -2788,8 +2795,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } - if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || - req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) { + if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || + req_op_level == SMB2_OPLOCK_LEVEL_BATCH) { ksmbd_debug(SMB, "Request for durable open\n"); dh_info->type = dh_idx; } @@ -3093,7 +3100,6 @@ int smb2_open(struct ksmbd_work *work) goto err_out; } - file_present = true; idmap = mnt_idmap(path.mnt); } else { if (rc != -ENOENT) @@ -3411,7 +3417,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } } else { - if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) { + if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE && lc) { if (S_ISDIR(file_inode(filp)->i_mode)) { lc->req_state &= ~SMB2_LEASE_WRITE_CACHING_LE; lc->is_dir = true; @@ -3710,7 +3716,7 @@ err_out2: kfree(name); kfree(lc); - return 0; + return rc; } static int readdir_info_level_struct_sz(int info_level) @@ -4406,7 +4412,8 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->OutputBufferLength = cpu_to_le32(0); rsp->Buffer[0] = 0; rc = ksmbd_iov_pin_rsp(work, (void *)rsp, - sizeof(struct smb2_query_directory_rsp)); + offsetof(struct smb2_query_directory_rsp, Buffer) + + 1); if (rc) goto err_out; } else { @@ -5357,7 +5364,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, "NTFS", PATH_MAX, conn->local_nls, 0); len = len * 2; info->FileSystemNameLen = cpu_to_le32(len); - sz = sizeof(struct filesystem_attribute_info) - 2 + len; + sz = sizeof(struct filesystem_attribute_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } @@ -5383,7 +5390,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, len = len * 2; info->VolumeLabelSize = cpu_to_le32(len); info->Reserved = 0; - sz = sizeof(struct filesystem_vol_info) - 2 + len; + sz = sizeof(struct filesystem_vol_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index 4a3148b0167f..cc1d6dfe29d5 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -213,7 +213,7 @@ struct filesystem_attribute_info { __le32 Attributes; __le32 MaxPathNameComponentLength; __le32 FileSystemNameLen; - __le16 FileSystemName[1]; /* do not have to save this - get subset? */ + __le16 FileSystemName[]; /* do not have to save this - get subset? */ } __packed; struct filesystem_device_info { @@ -226,7 +226,7 @@ struct filesystem_vol_info { __le32 SerialNumber; __le32 VolumeLabelSize; __le16 Reserved; - __le16 VolumeLabel[1]; + __le16 VolumeLabel[]; } __packed; struct filesystem_info { diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index a84788396daa..aaed9e293b2e 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -624,8 +624,10 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz) for_each_netdev(&init_net, netdev) { if (netif_is_bridge_port(netdev)) continue; - if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) + if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) { + rtnl_unlock(); return -ENOMEM; + } } rtnl_unlock(); bind_additional_ifaces = 1; diff --git a/fs/smb/server/xattr.h b/fs/smb/server/xattr.h index 16499ca5c82d..fa3e27d6971b 100644 --- a/fs/smb/server/xattr.h +++ b/fs/smb/server/xattr.h @@ -76,7 +76,7 @@ struct xattr_acl_entry { struct xattr_smb_acl { int count; int next; - struct xattr_acl_entry entries[]; + struct xattr_acl_entry entries[] __counted_by(count); }; /* 64bytes hash in xattr_ntacl is computed with sha256 */ diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index a8c1e7f9a609..21aaa96856c1 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -494,39 +494,73 @@ out: } static int squashfs_readahead_fragment(struct page **page, - unsigned int pages, unsigned int expected) + unsigned int pages, unsigned int expected, loff_t start) { struct inode *inode = page[0]->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - int error = buffer->error; + int i, bytes, copied; + struct squashfs_page_actor *actor; + unsigned int offset; + void *addr; + struct page *last_page; + + if (buffer->error) + goto out; - if (error) + actor = squashfs_page_actor_init_special(msblk, page, pages, + expected, start); + if (!actor) goto out; - expected += squashfs_i(inode)->fragment_offset; + squashfs_actor_nobuff(actor); + addr = squashfs_first_page(actor); + + for (copied = offset = 0; offset < expected; offset += PAGE_SIZE) { + int avail = min_t(int, expected - offset, PAGE_SIZE); + + if (!IS_ERR(addr)) { + bytes = squashfs_copy_data(addr, buffer, offset + + squashfs_i(inode)->fragment_offset, avail); + + if (bytes != avail) + goto failed; + } + + copied += avail; + addr = squashfs_next_page(actor); + } - for (n = 0; n < pages; n++) { - unsigned int base = (page[n]->index & mask) << PAGE_SHIFT; - unsigned int offset = base + squashfs_i(inode)->fragment_offset; + last_page = squashfs_page_actor_free(actor); - if (expected > offset) { - unsigned int avail = min_t(unsigned int, expected - - offset, PAGE_SIZE); + if (copied == expected && !IS_ERR(last_page)) { + /* Last page (if present) may have trailing bytes not filled */ + bytes = copied % PAGE_SIZE; + if (bytes && last_page) + memzero_page(last_page, bytes, PAGE_SIZE - bytes); - squashfs_fill_page(page[n], buffer, offset, avail); + for (i = 0; i < pages; i++) { + flush_dcache_page(page[i]); + SetPageUptodate(page[i]); } + } - unlock_page(page[n]); - put_page(page[n]); + for (i = 0; i < pages; i++) { + unlock_page(page[i]); + put_page(page[i]); } + squashfs_cache_put(buffer); + return 0; + +failed: + squashfs_page_actor_free(actor); + out: squashfs_cache_put(buffer); - return error; + return 1; } static void squashfs_readahead(struct readahead_control *ractl) @@ -551,7 +585,6 @@ static void squashfs_readahead(struct readahead_control *ractl) return; for (;;) { - pgoff_t index; int res, bsize; u64 block = 0; unsigned int expected; @@ -570,26 +603,21 @@ static void squashfs_readahead(struct readahead_control *ractl) if (readahead_pos(ractl) >= i_size_read(inode)) goto skip_pages; - index = pages[0]->index >> shift; - - if ((pages[nr_pages - 1]->index >> shift) != index) - goto skip_pages; - - if (index == file_end && squashfs_i(inode)->fragment_block != - SQUASHFS_INVALID_BLK) { + if (start >> msblk->block_log == file_end && + squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { res = squashfs_readahead_fragment(pages, nr_pages, - expected); + expected, start); if (res) goto skip_pages; continue; } - bsize = read_blocklist(inode, index, &block); + bsize = read_blocklist(inode, start >> msblk->block_log, &block); if (bsize == 0) goto skip_pages; actor = squashfs_page_actor_init_special(msblk, pages, nr_pages, - expected); + expected, start); if (!actor) goto skip_pages; @@ -597,12 +625,12 @@ static void squashfs_readahead(struct readahead_control *ractl) last_page = squashfs_page_actor_free(actor); - if (res == expected) { + if (res == expected && !IS_ERR(last_page)) { int bytes; /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (index == file_end && bytes && last_page) + if (start >> msblk->block_log == file_end && bytes && last_page) memzero_page(last_page, bytes, PAGE_SIZE - bytes); @@ -616,6 +644,8 @@ static void squashfs_readahead(struct readahead_control *ractl) unlock_page(pages[i]); put_page(pages[i]); } + + start += readahead_batch_length(ractl); } kfree(pages); diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 2a689ce71de9..22251743fadf 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -23,15 +23,15 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int expected) { + struct folio *folio = page_folio(target_page); struct inode *inode = target_page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - loff_t start_index = target_page->index & ~mask; + loff_t start_index = folio->index & ~mask; loff_t end_index = start_index | mask; int i, n, pages, bytes, res = -ENOMEM; - struct page **page; + struct page **page, *last_page; struct squashfs_page_actor *actor; void *pageaddr; @@ -46,7 +46,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, /* Try to grab all the pages covered by the Squashfs block */ for (i = 0, n = start_index; n <= end_index; n++) { - page[i] = (n == target_page->index) ? target_page : + page[i] = (n == folio->index) ? target_page : grab_cache_page_nowait(target_page->mapping, n); if (page[i] == NULL) @@ -67,27 +67,28 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, * Create a "page actor" which will kmap and kunmap the * page cache pages appropriately within the decompressor */ - actor = squashfs_page_actor_init_special(msblk, page, pages, expected); + actor = squashfs_page_actor_init_special(msblk, page, pages, expected, + start_index << PAGE_SHIFT); if (actor == NULL) goto out; /* Decompress directly into the page cache buffers */ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - squashfs_page_actor_free(actor); + last_page = squashfs_page_actor_free(actor); if (res < 0) goto mark_errored; - if (res != expected) { + if (res != expected || IS_ERR(last_page)) { res = -EIO; goto mark_errored; } /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (page[pages - 1]->index == end_index && bytes) { - pageaddr = kmap_local_page(page[pages - 1]); + if (end_index == file_end && last_page && bytes) { + pageaddr = kmap_local_page(last_page); memset(pageaddr + bytes, 0, PAGE_SIZE - bytes); kunmap_local(pageaddr); } diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 81af6c4ca115..2b3e807d4dea 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -60,6 +60,11 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, } /* Implementation of page_actor for decompressing directly into page cache. */ +static loff_t page_next_index(struct squashfs_page_actor *actor) +{ + return page_folio(actor->page[actor->next_page])->index; +} + static void *handle_next_page(struct squashfs_page_actor *actor) { int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -68,7 +73,7 @@ static void *handle_next_page(struct squashfs_page_actor *actor) return NULL; if ((actor->next_page == actor->pages) || - (actor->next_index != actor->page[actor->next_page]->index)) { + (actor->next_index != page_next_index(actor))) { actor->next_index++; actor->returned_pages++; actor->last_page = NULL; @@ -103,7 +108,7 @@ static void direct_finish_page(struct squashfs_page_actor *actor) } struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk, - struct page **page, int pages, int length) + struct page **page, int pages, int length, loff_t start_index) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); @@ -125,7 +130,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ actor->pages = pages; actor->next_page = 0; actor->returned_pages = 0; - actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); + actor->next_index = start_index >> PAGE_SHIFT; actor->pageaddr = NULL; actor->last_page = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 97d4983559b1..ffe25eb77c32 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -29,13 +29,15 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, int pages, int length); extern struct squashfs_page_actor *squashfs_page_actor_init_special( struct squashfs_sb_info *msblk, - struct page **page, int pages, int length); + struct page **page, int pages, int length, + loff_t start_index); static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor) { - struct page *last_page = actor->last_page; + struct page *last_page = actor->next_page == actor->pages ? actor->last_page : ERR_PTR(-EIO); kfree(actor->tmp_buffer); kfree(actor); + return last_page; } static inline void *squashfs_first_page(struct squashfs_page_actor *actor) diff --git a/fs/super.c b/fs/super.c index 38d72a3cf6fc..1db230432960 100644 --- a/fs/super.c +++ b/fs/super.c @@ -621,7 +621,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; - cgroup_writeback_umount(); + cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. */ evict_inodes(sb); @@ -1802,8 +1802,8 @@ int vfs_get_tree(struct fs_context *fc) return error; if (!fc->root) { - pr_err("Filesystem %s get_tree() didn't set fc->root\n", - fc->fs_type->name); + pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n", + fc->fs_type->name, error); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ @@ -1905,7 +1905,7 @@ static void lockdep_sb_freeze_release(struct super_block *sb) int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) - percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_); + percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_); } /* diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 2e126d72d619..639307e2ff8c 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -28,17 +28,17 @@ const struct file_operations sysv_dir_operations = { .fsync = generic_file_fsync, }; -static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - unlock_page(page); + folio_unlock(folio); } static int sysv_handle_dirsync(struct inode *dir) @@ -52,20 +52,21 @@ static int sysv_handle_dirsync(struct inode *dir) } /* - * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the + * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the * rules documented in mm/highmem.rst. * - * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page() + * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio() * and must be treated accordingly for nesting purposes. */ -static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p) +static void *dir_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); - *p = page; - return kmap_local_page(page); + struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); + + if (IS_ERR(folio)) + return ERR_CAST(folio); + *foliop = folio; + return kmap_local_folio(folio, 0); } static int sysv_readdir(struct file *file, struct dir_context *ctx) @@ -87,9 +88,9 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct sysv_dir_entry *de; - struct page *page; + struct folio *folio; - kaddr = dir_get_page(inode, n, &page); + kaddr = dir_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) continue; de = (struct sysv_dir_entry *)(kaddr+offset); @@ -103,11 +104,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN)) { - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 0; } @@ -126,39 +127,35 @@ static inline int namecompare(int len, int maxlen, /* * sysv_find_entry() * - * finds an entry in the specified directory with the wanted name. It - * returns the cache buffer in which the entry was found, and the entry - * itself (as a parameter - res_dir). It does NOT read the inode of the + * finds an entry in the specified directory with the wanted name. + * It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * - * On Success unmap_and_put_page() should be called on *res_page. + * On Success folio_release_kmap() should be called on *foliop. * - * sysv_find_entry() acts as a call to dir_get_page() and must be treated + * sysv_find_entry() acts as a call to dir_get_folio() and must be treated * accordingly for nesting purposes. */ -struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page) +struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct inode * dir = d_inode(dentry->d_parent); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; struct sysv_dir_entry *de; - *res_page = NULL; - start = SYSV_I(dir)->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { - char *kaddr = dir_get_page(dir, n, &page); + char *kaddr = dir_get_folio(dir, n, foliop); if (!IS_ERR(kaddr)) { de = (struct sysv_dir_entry *)kaddr; - kaddr += PAGE_SIZE - SYSV_DIRSIZE; + kaddr += folio_size(*foliop) - SYSV_DIRSIZE; for ( ; (char *) de <= kaddr ; de++) { if (!de->inode) continue; @@ -166,7 +163,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(*foliop, kaddr); } if (++n >= npages) @@ -177,7 +174,6 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ found: SYSV_I(dir)->i_dir_start_lookup = n; - *res_page = page; return de; } @@ -186,7 +182,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; - struct page *page = NULL; + struct folio *folio = NULL; struct sysv_dir_entry * de; unsigned long npages = dir_pages(dir); unsigned long n; @@ -196,7 +192,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) /* We take care of directory expansion in the same loop */ for (n = 0; n <= npages; n++) { - kaddr = dir_get_page(dir, n, &page); + kaddr = dir_get_folio(dir, n, &folio); if (IS_ERR(kaddr)) return PTR_ERR(kaddr); de = (struct sysv_dir_entry *)kaddr; @@ -206,49 +202,49 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) goto got_it; err = -EEXIST; if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) - goto out_page; + goto out_folio; de++; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + offset_in_page(de); - lock_page(page); - err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + pos = folio_pos(folio) + offset_in_folio(folio, de); + folio_lock(folio); + err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); if (err) goto out_unlock; memcpy (de->name, name, namelen); memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_commit_chunk(folio, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); -out_page: - unmap_and_put_page(page, kaddr); +out_folio: + folio_release_kmap(folio, kaddr); return err; out_unlock: - unlock_page(page); - goto out_page; + folio_unlock(folio); + goto out_folio; } -int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) +int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio) { - struct inode *inode = page->mapping->host; - loff_t pos = page_offset(page) + offset_in_page(de); + struct inode *inode = folio->mapping->host; + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; - lock_page(page); - err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + folio_lock(folio); + err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } de->inode = 0; - dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_commit_chunk(folio, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return sysv_handle_dirsync(inode); @@ -256,33 +252,33 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) int sysv_make_empty(struct inode *inode, struct inode *dir) { - struct page *page = grab_cache_page(inode->i_mapping, 0); + struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); struct sysv_dir_entry * de; - char *base; + char *kaddr; int err; - if (!page) - return -ENOMEM; - err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE); + if (IS_ERR(folio)) + return PTR_ERR(folio); + err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE); if (err) { - unlock_page(page); + folio_unlock(folio); goto fail; } - base = kmap_local_page(page); - memset(base, 0, PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); + memset(kaddr, 0, folio_size(folio)); - de = (struct sysv_dir_entry *) base; + de = (struct sysv_dir_entry *)kaddr; de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); strcpy(de->name,"."); de++; de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); strcpy(de->name,".."); - kunmap_local(base); - dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE); + kunmap_local(kaddr); + dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE); err = sysv_handle_dirsync(inode); fail: - put_page(page); + folio_put(folio); return err; } @@ -292,19 +288,19 @@ fail: int sysv_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; - struct page *page = NULL; + struct folio *folio = NULL; unsigned long i, npages = dir_pages(inode); char *kaddr; for (i = 0; i < npages; i++) { struct sysv_dir_entry *de; - kaddr = dir_get_page(inode, i, &page); + kaddr = dir_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) continue; de = (struct sysv_dir_entry *)kaddr; - kaddr += PAGE_SIZE-SYSV_DIRSIZE; + kaddr += folio_size(folio) - SYSV_DIRSIZE; for ( ;(char *)de <= kaddr; de++) { if (!de->inode) @@ -321,46 +317,46 @@ int sysv_empty_dir(struct inode * inode) if (de->name[1] != '.' || de->name[2]) goto not_empty; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } /* Releases the page */ -int sysv_set_link(struct sysv_dir_entry *de, struct page *page, - struct inode *inode) +int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio, + struct inode *inode) { - struct inode *dir = page->mapping->host; - loff_t pos = page_offset(page) + offset_in_page(de); + struct inode *dir = folio->mapping->host; + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; - lock_page(page); - err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + folio_lock(folio); + err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_commit_chunk(folio, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return sysv_handle_dirsync(inode); } /* - * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the + * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the * rules documented in mm/highmem.rst. * - * sysv_dotdot() acts as a call to dir_get_page() and must be treated + * sysv_dotdot() acts as a call to dir_get_folio() and must be treated * accordingly for nesting purposes. */ -struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct page **p) +struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop) { - struct sysv_dir_entry *de = dir_get_page(dir, 0, p); + struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop); if (IS_ERR(de)) return NULL; @@ -370,13 +366,13 @@ struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct page **p) ino_t sysv_inode_by_name(struct dentry *dentry) { - struct page *page; - struct sysv_dir_entry *de = sysv_find_entry (dentry, &page); + struct folio *folio; + struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio); ino_t res = 0; if (de) { res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - unmap_and_put_page(page, de); + folio_release_kmap(folio, de); } return res; } diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 19bcb51a2203..451e95f474fa 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -466,9 +466,9 @@ static int sysv_read_folio(struct file *file, struct folio *folio) return block_read_full_folio(folio, get_block); } -int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len) +int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(page, pos, len, get_block); + return __block_write_begin(folio, pos, len, get_block); } static void sysv_write_failed(struct address_space *mapping, loff_t to) @@ -483,11 +483,11 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to) static int sysv_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, get_block); + ret = block_write_begin(mapping, pos, len, foliop, get_block); if (unlikely(ret)) sysv_write_failed(mapping, pos + len); diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index d6b73798071b..fb8bd8437872 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -151,20 +151,20 @@ out_dir: static int sysv_unlink(struct inode * dir, struct dentry * dentry) { struct inode * inode = d_inode(dentry); - struct page * page; + struct folio *folio; struct sysv_dir_entry * de; int err; - de = sysv_find_entry(dentry, &page); + de = sysv_find_entry(dentry, &folio); if (!de) return -ENOENT; - err = sysv_delete_entry(de, page); + err = sysv_delete_entry(de, folio); if (!err) { inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); } - unmap_and_put_page(page, de); + folio_release_kmap(folio, de); return err; } @@ -194,28 +194,28 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); - struct page * dir_page = NULL; + struct folio *dir_folio; struct sysv_dir_entry * dir_de = NULL; - struct page * old_page; + struct folio *old_folio; struct sysv_dir_entry * old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; - old_de = sysv_find_entry(old_dentry, &old_page); + old_de = sysv_find_entry(old_dentry, &old_folio); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = sysv_dotdot(old_inode, &dir_page); + dir_de = sysv_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } if (new_inode) { - struct page * new_page; + struct folio *new_folio; struct sysv_dir_entry * new_de; err = -ENOTEMPTY; @@ -223,11 +223,11 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_dir; err = -ENOENT; - new_de = sysv_find_entry(new_dentry, &new_page); + new_de = sysv_find_entry(new_dentry, &new_folio); if (!new_de) goto out_dir; - err = sysv_set_link(new_de, new_page, old_inode); - unmap_and_put_page(new_page, new_de); + err = sysv_set_link(new_de, new_folio, old_inode); + folio_release_kmap(new_folio, new_de); if (err) goto out_dir; inode_set_ctime_current(new_inode); @@ -242,23 +242,23 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, inode_inc_link_count(new_dir); } - err = sysv_delete_entry(old_de, old_page); + err = sysv_delete_entry(old_de, old_folio); if (err) goto out_dir; mark_inode_dirty(old_inode); if (dir_de) { - err = sysv_set_link(dir_de, dir_page, new_dir); + err = sysv_set_link(dir_de, dir_folio, new_dir); if (!err) inode_dec_link_count(old_dir); } out_dir: if (dir_de) - unmap_and_put_page(dir_page, dir_de); + folio_release_kmap(dir_folio, dir_de); out_old: - unmap_and_put_page(old_page, old_de); + folio_release_kmap(old_folio, old_de); out: return err; } diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index e3f988b469ee..0a48b2e7edb1 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -133,8 +133,8 @@ extern void sysv_free_block(struct super_block *, sysv_zone_t); extern unsigned long sysv_count_free_blocks(struct super_block *); /* itree.c */ -extern void sysv_truncate(struct inode *); -extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len); +void sysv_truncate(struct inode *); +int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); /* inode.c */ extern struct inode *sysv_iget(struct super_block *, unsigned int); @@ -148,15 +148,15 @@ extern void sysv_destroy_icache(void); /* dir.c */ -extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **); -extern int sysv_add_link(struct dentry *, struct inode *); -extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *); -extern int sysv_make_empty(struct inode *, struct inode *); -extern int sysv_empty_dir(struct inode *); -extern int sysv_set_link(struct sysv_dir_entry *, struct page *, +struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **); +int sysv_add_link(struct dentry *, struct inode *); +int sysv_delete_entry(struct sysv_dir_entry *, struct folio *); +int sysv_make_empty(struct inode *, struct inode *); +int sysv_empty_dir(struct inode *); +int sysv_set_link(struct sysv_dir_entry *, struct folio *, struct inode *); -extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); -extern ino_t sysv_inode_by_name(struct dentry *); +struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **); +ino_t sysv_inode_by_name(struct dentry *); extern const struct inode_operations sysv_file_inode_operations; diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 01e99e98457d..8705c77a9e75 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -862,7 +862,7 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) list_for_each_entry(ei_child, &ei->children, list) eventfs_remove_rec(ei_child, level + 1); - list_del(&ei->list); + list_del_rcu(&ei->list); free_ei(ei); } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 68e104423a48..5130123005e4 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -211,7 +211,7 @@ static void release_existing_page_budget(struct ubifs_info *c) } static int write_begin_slow(struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep) + loff_t pos, unsigned len, struct folio **foliop) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -298,7 +298,7 @@ static int write_begin_slow(struct address_space *mapping, ubifs_release_dirty_inode_budget(c, ui); } - *pagep = &folio->page; + *foliop = folio; return 0; } @@ -414,7 +414,7 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio, */ static int ubifs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -483,7 +483,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, folio_unlock(folio); folio_put(folio); - return write_begin_slow(mapping, pos, len, pagep); + return write_begin_slow(mapping, pos, len, foliop); } /* @@ -492,7 +492,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * with @ui->ui_mutex locked if we are appending pages, and unlocked * otherwise. This is an optimization (slightly hacky though). */ - *pagep = &folio->page; + *foliop = folio; return 0; } @@ -524,9 +524,8 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio, static int ubifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; diff --git a/fs/udf/file.c b/fs/udf/file.c index 3a4179de316b..412fe7c4d348 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -62,7 +62,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) end = size & ~PAGE_MASK; else end = PAGE_SIZE; - err = __block_write_begin(&folio->page, 0, end, udf_get_block); + err = __block_write_begin(folio, 0, end, udf_get_block); if (err) { folio_unlock(folio); ret = vmf_fs_error(err); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4726a4d014b6..eaee57b91c6c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -246,14 +246,14 @@ static void udf_readahead(struct readahead_control *rac) static int udf_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); struct folio *folio; int ret; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - ret = block_write_begin(mapping, pos, len, pagep, + ret = block_write_begin(mapping, pos, len, foliop, udf_get_block); if (unlikely(ret)) udf_write_failed(mapping, pos + len); @@ -265,7 +265,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (!folio_test_uptodate(folio)) udf_adinicb_read_folio(folio); return 0; @@ -273,16 +273,14 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, static int udf_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = file_inode(file); - struct folio *folio; loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) - return generic_write_end(file, mapping, pos, len, copied, page, + return generic_write_end(file, mapping, pos, len, copied, folio, fsdata); - folio = page_folio(page); last_pos = pos + copied; if (last_pos > inode->i_size) i_size_write(inode, last_pos); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 335f0ae529b4..d6e6a2198971 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -42,18 +42,18 @@ static inline int ufs_match(struct super_block *sb, int len, return !memcmp(name, de->d_name, len); } -static void ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - unlock_page(page); + folio_unlock(folio); } static int ufs_handle_dirsync(struct inode *dir) @@ -66,22 +66,16 @@ static int ufs_handle_dirsync(struct inode *dir) return err; } -static inline void ufs_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; struct ufs_dir_entry *de; - struct page *page; + struct folio *folio; - de = ufs_find_entry(dir, qstr, &page); + de = ufs_find_entry(dir, qstr, &folio); if (de) { res = fs32_to_cpu(dir->i_sb, de->d_ino); - ufs_put_page(page); + folio_release_kmap(folio, de); } return res; } @@ -89,43 +83,40 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) /* Releases the page */ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode, + struct folio *folio, struct inode *inode, bool update_times) { - loff_t pos = page_offset(page) + - (char *) de - (char *) page_address(page); + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen); int err; - lock_page(page); - err = ufs_prepare_chunk(page, pos, len); + folio_lock(folio); + err = ufs_prepare_chunk(folio, pos, len); BUG_ON(err); de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); ufs_set_de_type(dir->i_sb, de, inode->i_mode); - ufs_commit_chunk(page, pos, len); - ufs_put_page(page); + ufs_commit_chunk(folio, pos, len); + folio_release_kmap(folio, de); if (update_times) inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ufs_handle_dirsync(dir); } - -static bool ufs_check_page(struct page *page) +static bool ufs_check_folio(struct folio *folio, char *kaddr) { - struct inode *dir = page->mapping->host; + struct inode *dir = folio->mapping->host; struct super_block *sb = dir->i_sb; - char *kaddr = page_address(page); unsigned offs, rec_len; - unsigned limit = PAGE_SIZE; + unsigned limit = folio_size(folio); const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1; struct ufs_dir_entry *p; char *error; - if ((dir->i_size >> PAGE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_MASK; + if (dir->i_size < folio_pos(folio) + limit) { + limit = offset_in_folio(folio, dir->i_size); if (limit & chunk_mask) goto Ebadsize; if (!limit) @@ -150,13 +141,13 @@ static bool ufs_check_page(struct page *page) if (offs != limit) goto Eend; out: - SetPageChecked(page); + folio_set_checked(folio); return true; /* Too bad, we had an error */ Ebadsize: - ufs_error(sb, "ufs_check_page", + ufs_error(sb, __func__, "size of directory #%lu is not a multiple of chunk size", dir->i_ino ); @@ -176,36 +167,40 @@ Espan: Einumber: error = "inode out of bounds"; bad_entry: - ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - " - "offset=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, + ufs_error(sb, __func__, "bad entry in directory #%lu: %s - " + "offset=%llu, rec_len=%d, name_len=%d", + dir->i_ino, error, folio_pos(folio) + offs, rec_len, ufs_get_de_namlen(sb, p)); goto fail; Eend: p = (struct ufs_dir_entry *)(kaddr + offs); ufs_error(sb, __func__, "entry in directory #%lu spans the page boundary" - "offset=%lu", - dir->i_ino, (page->index<<PAGE_SHIFT)+offs); + "offset=%llu", + dir->i_ino, folio_pos(folio) + offs); fail: return false; } -static struct page *ufs_get_page(struct inode *dir, unsigned long n) +static void *ufs_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) { - kmap(page); - if (unlikely(!PageChecked(page))) { - if (!ufs_check_page(page)) - goto fail; - } + struct folio *folio = read_mapping_folio(mapping, n, NULL); + void *kaddr; + + if (IS_ERR(folio)) + return ERR_CAST(folio); + kaddr = kmap_local_folio(folio, 0); + if (unlikely(!folio_test_checked(folio))) { + if (!ufs_check_folio(folio, kaddr)) + goto fail; } - return page; + *foliop = folio; + return kaddr; fail: - ufs_put_page(page); + folio_release_kmap(folio, kaddr); return ERR_PTR(-EIO); } @@ -231,17 +226,14 @@ ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p) fs16_to_cpu(sb, p->d_reclen)); } -struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) +struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct folio **foliop) { - struct page *page = ufs_get_page(dir, 0); - struct ufs_dir_entry *de = NULL; + struct ufs_dir_entry *de = ufs_get_folio(dir, 0, foliop); - if (!IS_ERR(page)) { - de = ufs_next_entry(dir->i_sb, - (struct ufs_dir_entry *)page_address(page)); - *p = page; - } - return de; + if (!IS_ERR(de)) + return ufs_next_entry(dir->i_sb, de); + + return NULL; } /* @@ -253,7 +245,7 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) * Entry is guaranteed to be valid. */ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, - struct page **res_page) + struct folio **foliop) { struct super_block *sb = dir->i_sb; const unsigned char *name = qstr->name; @@ -261,7 +253,6 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; @@ -270,27 +261,23 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, if (npages == 0 || namelen > UFS_MAXNAMLEN) goto out; - /* OFFSET_CACHE */ - *res_page = NULL; - start = ui->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { - char *kaddr; - page = ufs_get_page(dir, n); - if (!IS_ERR(page)) { - kaddr = page_address(page); - de = (struct ufs_dir_entry *) kaddr; + char *kaddr = ufs_get_folio(dir, n, foliop); + + if (!IS_ERR(kaddr)) { + de = (struct ufs_dir_entry *)kaddr; kaddr += ufs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (ufs_match(sb, namelen, name, de)) goto found; de = ufs_next_entry(sb, de); } - ufs_put_page(page); + folio_release_kmap(*foliop, kaddr); } if (++n >= npages) n = 0; @@ -299,7 +286,6 @@ out: return NULL; found: - *res_page = page; ui->i_dir_start_lookup = n; return de; } @@ -316,11 +302,10 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) unsigned reclen = UFS_DIR_REC_LEN(namelen); const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; unsigned short rec_len, name_len; - struct page *page = NULL; + struct folio *folio = NULL; struct ufs_dir_entry *de; unsigned long npages = dir_pages(dir); unsigned long n; - char *kaddr; loff_t pos; int err; @@ -328,21 +313,19 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) /* * We take care of directory expansion in the same loop. - * This code plays outside i_size, so it locks the page + * This code plays outside i_size, so it locks the folio * to protect that region. */ for (n = 0; n <= npages; n++) { + char *kaddr = ufs_get_folio(dir, n, &folio); char *dir_end; - page = ufs_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; - lock_page(page); - kaddr = page_address(page); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); + folio_lock(folio); dir_end = kaddr + ufs_last_byte(dir, n); de = (struct ufs_dir_entry *)kaddr; - kaddr += PAGE_SIZE - reclen; + kaddr += folio_size(folio) - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ @@ -369,16 +352,15 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) goto got_it; de = (struct ufs_dir_entry *) ((char *) de + rec_len); } - unlock_page(page); - ufs_put_page(page); + folio_unlock(folio); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + - (char*)de - (char*)page_address(page); - err = ufs_prepare_chunk(page, pos, rec_len); + pos = folio_pos(folio) + offset_in_folio(folio, de); + err = ufs_prepare_chunk(folio, pos, rec_len); if (err) goto out_unlock; if (de->d_ino) { @@ -395,18 +377,17 @@ got_it: de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); - ufs_commit_chunk(page, pos, rec_len); + ufs_commit_chunk(folio, pos, rec_len); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = ufs_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: - ufs_put_page(page); -out: + folio_release_kmap(folio, de); return err; out_unlock: - unlock_page(page); + folio_unlock(folio); goto out_put; } @@ -444,19 +425,18 @@ ufs_readdir(struct file *file, struct dir_context *ctx) return 0; for ( ; n < npages; n++, offset = 0) { - char *kaddr, *limit; struct ufs_dir_entry *de; + struct folio *folio; + char *kaddr = ufs_get_folio(inode, n, &folio); + char *limit; - struct page *page = ufs_get_page(inode, n); - - if (IS_ERR(page)) { + if (IS_ERR(kaddr)) { ufs_error(sb, __func__, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; - return -EIO; + return PTR_ERR(kaddr); } - kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); @@ -482,13 +462,13 @@ ufs_readdir(struct file *file, struct dir_context *ctx) ufs_get_de_namlen(sb, de), fs32_to_cpu(sb, de->d_ino), d_type)) { - ufs_put_page(page); + folio_release_kmap(folio, de); return 0; } } ctx->pos += fs16_to_cpu(sb, de->d_reclen); } - ufs_put_page(page); + folio_release_kmap(folio, kaddr); } return 0; } @@ -499,19 +479,23 @@ ufs_readdir(struct file *file, struct dir_context *ctx) * previous entry. */ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, - struct page * page) + struct folio *folio) { struct super_block *sb = inode->i_sb; - char *kaddr = page_address(page); - unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); - unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); + size_t from, to; + char *kaddr; loff_t pos; - struct ufs_dir_entry *pde = NULL; - struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from); + struct ufs_dir_entry *de, *pde = NULL; int err; UFSD("ENTER\n"); + from = offset_in_folio(folio, dir); + to = from + fs16_to_cpu(sb, dir->d_reclen); + kaddr = (char *)dir - from; + from &= ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); + de = (struct ufs_dir_entry *) (kaddr + from); + UFSD("ino %u, reclen %u, namlen %u, name %s\n", fs32_to_cpu(sb, de->d_ino), fs16_to_cpu(sb, de->d_reclen), @@ -528,21 +512,20 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, de = ufs_next_entry(sb, de); } if (pde) - from = (char*)pde - (char*)page_address(page); - - pos = page_offset(page) + from; - lock_page(page); - err = ufs_prepare_chunk(page, pos, to - from); + from = offset_in_folio(folio, pde); + pos = folio_pos(folio) + from; + folio_lock(folio); + err = ufs_prepare_chunk(folio, pos, to - from); BUG_ON(err); if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; - ufs_commit_chunk(page, pos, to - from); + ufs_commit_chunk(folio, pos, to - from); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); err = ufs_handle_dirsync(inode); out: - ufs_put_page(page); + folio_release_kmap(folio, kaddr); UFSD("EXIT\n"); return err; } @@ -551,26 +534,25 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) { struct super_block * sb = dir->i_sb; struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct folio *folio = filemap_grab_folio(mapping, 0); const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; struct ufs_dir_entry * de; - char *base; int err; + char *kaddr; - if (!page) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); - err = ufs_prepare_chunk(page, 0, chunk_size); + err = ufs_prepare_chunk(folio, 0, chunk_size); if (err) { - unlock_page(page); + folio_unlock(folio); goto fail; } - kmap(page); - base = (char*)page_address(page); - memset(base, 0, PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); + memset(kaddr, 0, folio_size(folio)); - de = (struct ufs_dir_entry *) base; + de = (struct ufs_dir_entry *)kaddr; de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); @@ -584,12 +566,12 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1)); ufs_set_de_namlen(sb, de, 2); strcpy (de->d_name, ".."); - kunmap(page); + kunmap_local(kaddr); - ufs_commit_chunk(page, 0, chunk_size); + ufs_commit_chunk(folio, 0, chunk_size); err = ufs_handle_dirsync(inode); fail: - put_page(page); + folio_put(folio); return err; } @@ -599,18 +581,17 @@ fail: int ufs_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; - struct page *page = NULL; + struct folio *folio; + char *kaddr; unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { - char *kaddr; struct ufs_dir_entry *de; - page = ufs_get_page(inode, i); - if (IS_ERR(page)) + kaddr = ufs_get_folio(inode, i, &folio); + if (IS_ERR(kaddr)) continue; - kaddr = page_address(page); de = (struct ufs_dir_entry *)kaddr; kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1); @@ -637,12 +618,12 @@ int ufs_empty_dir(struct inode * inode) } de = ufs_next_entry(sb, de); } - ufs_put_page(page); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - ufs_put_page(page); + folio_release_kmap(folio, kaddr); return 0; } diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a7bb2e63cdde..5331ae7ebf3e 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -479,9 +479,9 @@ static int ufs_read_folio(struct file *file, struct folio *folio) return block_read_full_folio(folio, ufs_getfrag_block); } -int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) +int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(page, pos, len, ufs_getfrag_block); + return __block_write_begin(folio, pos, len, ufs_getfrag_block); } static void ufs_truncate_blocks(struct inode *); @@ -498,11 +498,11 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) static int ufs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, ufs_getfrag_block); + ret = block_write_begin(mapping, pos, len, foliop, ufs_getfrag_block); if (unlikely(ret)) ufs_write_failed(mapping, pos + len); @@ -511,11 +511,11 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, static int ufs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ret < len) ufs_write_failed(mapping, pos + len); return ret; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 9cad29463791..24bd12186647 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -209,14 +209,14 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); struct ufs_dir_entry *de; - struct page *page; + struct folio *folio; int err = -ENOENT; - de = ufs_find_entry(dir, &dentry->d_name, &page); + de = ufs_find_entry(dir, &dentry->d_name, &folio); if (!de) goto out; - err = ufs_delete_entry(dir, de, page); + err = ufs_delete_entry(dir, de, folio); if (err) goto out; @@ -249,28 +249,28 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct page *dir_page = NULL; + struct folio *dir_folio = NULL; struct ufs_dir_entry * dir_de = NULL; - struct page *old_page; + struct folio *old_folio; struct ufs_dir_entry *old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; - old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = ufs_dotdot(old_inode, &dir_page); + dir_de = ufs_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } if (new_inode) { - struct page *new_page; + struct folio *new_folio; struct ufs_dir_entry *new_de; err = -ENOTEMPTY; @@ -278,10 +278,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_dir; err = -ENOENT; - new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); + new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_page, old_inode, 1); + ufs_set_link(new_dir, new_de, new_folio, old_inode, 1); inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); @@ -300,29 +300,24 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, */ inode_set_ctime_current(old_inode); - ufs_delete_entry(old_dir, old_de, old_page); + ufs_delete_entry(old_dir, old_de, old_folio); mark_inode_dirty(old_inode); if (dir_de) { if (old_dir != new_dir) - ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0); - else { - kunmap(dir_page); - put_page(dir_page); - } + ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0); + else + folio_release_kmap(dir_folio, new_dir); inode_dec_link_count(old_dir); } return 0; out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } + if (dir_de) + folio_release_kmap(dir_folio, dir_de); out_old: - kunmap(old_page); - put_page(old_page); + folio_release_kmap(old_folio, old_de); out: return err; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 6b499180643b..a2c762cb65a0 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -99,15 +99,17 @@ extern void ufs_put_cylinder (struct super_block *, unsigned); /* dir.c */ extern const struct inode_operations ufs_dir_inode_operations; -extern int ufs_add_link (struct dentry *, struct inode *); -extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *); -extern int ufs_make_empty(struct inode *, struct inode *); -extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **); -extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); -extern int ufs_empty_dir (struct inode *); -extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); -extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode, bool update_times); + +int ufs_add_link(struct dentry *, struct inode *); +ino_t ufs_inode_by_name(struct inode *, const struct qstr *); +int ufs_make_empty(struct inode *, struct inode *); +struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, + struct folio **); +int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct folio *); +int ufs_empty_dir(struct inode *); +struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **); +void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct folio *folio, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 0ecd2ed792f5..bf708b68f150 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -250,9 +250,9 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value) } } -extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); -extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); -extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len); +dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); +void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); +int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); /* * These functions manipulate ufs buffers diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index fdb4da24d662..b780deb81b02 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -300,23 +300,23 @@ static int vboxsf_writepage(struct page *page, struct writeback_control *wbc) static int vboxsf_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct vboxsf_handle *sf_handle = file->private_data; - unsigned int from = pos & ~PAGE_MASK; + size_t from = offset_in_folio(folio, pos); u32 nwritten = len; u8 *buf; int err; - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page) && copied < len) - zero_user(page, from + copied, len - copied); + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio) && copied < len) + folio_zero_range(folio, from + copied, len - copied); - buf = kmap(page); + buf = kmap(&folio->page); err = vboxsf_write(sf_handle->root, sf_handle->handle, pos, &nwritten, buf + from); - kunmap(page); + kunmap(&folio->page); if (err) { nwritten = 0; @@ -326,16 +326,16 @@ static int vboxsf_write_end(struct file *file, struct address_space *mapping, /* mtime changed */ VBOXSF_I(inode)->force_restat = 1; - if (!PageUptodate(page) && nwritten == PAGE_SIZE) - SetPageUptodate(page); + if (!folio_test_uptodate(folio) && nwritten == folio_size(folio)) + folio_mark_uptodate(folio); pos += nwritten; if (pos > inode->i_size) i_size_write(inode, pos); out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return nwritten; } @@ -343,7 +343,7 @@ out: /* * Note simple_write_begin does not read the page from disk on partial writes * this is ok since vboxsf_write_end only writes the written parts of the - * page and it does not call SetPageUptodate for partial writes. + * page and it does not call folio_mark_uptodate for partial writes. */ const struct address_space_operations vboxsf_reg_aops = { .read_folio = vboxsf_read_folio, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 496e2f72a85b..797d5b5f7b72 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -749,7 +749,7 @@ xfs_finobt_count_blocks( if (error) return error; - cur = xfs_inobt_init_cursor(pag, tp, agbp); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_btree_count_blocks(cur, tree_blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 513b50da6215..79babeac9d75 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -514,12 +514,18 @@ xfs_dinode_verify( return __this_address; } - if (dip->di_version > 1) { + /* + * Historical note: xfsprogs in the 3.2 era set up its incore inodes to + * have di_nlink track the link count, even if the actual filesystem + * only supported V1 inodes (i.e. di_onlink). When writing out the + * ondisk inode, it would set both the ondisk di_nlink and di_onlink to + * the the incore di_nlink value, which is why we cannot check for + * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with + * di_onlink==0, so we can check that. + */ + if (dip->di_version >= 2) { if (dip->di_onlink) return __this_address; - } else { - if (dip->di_nlink) - return __this_address; } /* don't allow invalid i_size */ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index d848222f802b..9b5d98fe1f8a 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -293,7 +293,7 @@ xfile_get_folio( * (potentially last) reference in xfile_put_folio. */ if (flags & XFILE_ALLOC) - folio_set_dirty(folio); + folio_mark_dirty(folio); return folio; } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 6f0fc7fe1f2b..25f5dffeab2a 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -158,8 +158,7 @@ static int xfs_trim_gather_extents( struct xfs_perag *pag, struct xfs_trim_cur *tcur, - struct xfs_busy_extents *extents, - uint64_t *blocks_trimmed) + struct xfs_busy_extents *extents) { struct xfs_mount *mp = pag->pag_mount; struct xfs_trans *tp; @@ -280,7 +279,6 @@ xfs_trim_gather_extents( xfs_extent_busy_insert_discard(pag, fbno, flen, &extents->extent_list); - *blocks_trimmed += flen; next_extent: if (tcur->by_bno) error = xfs_btree_increment(cur, 0, &i); @@ -327,8 +325,7 @@ xfs_trim_perag_extents( struct xfs_perag *pag, xfs_agblock_t start, xfs_agblock_t end, - xfs_extlen_t minlen, - uint64_t *blocks_trimmed) + xfs_extlen_t minlen) { struct xfs_trim_cur tcur = { .start = start, @@ -354,8 +351,7 @@ xfs_trim_perag_extents( extents->owner = extents; INIT_LIST_HEAD(&extents->extent_list); - error = xfs_trim_gather_extents(pag, &tcur, extents, - blocks_trimmed); + error = xfs_trim_gather_extents(pag, &tcur, extents); if (error) { kfree(extents); break; @@ -389,8 +385,7 @@ xfs_trim_datadev_extents( struct xfs_mount *mp, xfs_daddr_t start, xfs_daddr_t end, - xfs_extlen_t minlen, - uint64_t *blocks_trimmed) + xfs_extlen_t minlen) { xfs_agnumber_t start_agno, end_agno; xfs_agblock_t start_agbno, end_agbno; @@ -411,8 +406,7 @@ xfs_trim_datadev_extents( if (start_agno == end_agno) agend = end_agbno; - error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen, - blocks_trimmed); + error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen); if (error) last_error = error; @@ -431,9 +425,6 @@ struct xfs_trim_rtdev { /* list of rt extents to free */ struct list_head extent_list; - /* pointer to count of blocks trimmed */ - uint64_t *blocks_trimmed; - /* minimum length that caller allows us to trim */ xfs_rtblock_t minlen_fsb; @@ -551,7 +542,6 @@ xfs_trim_gather_rtextent( busyp->length = rlen; INIT_LIST_HEAD(&busyp->list); list_add_tail(&busyp->list, &tr->extent_list); - *tr->blocks_trimmed += rlen; tr->restart_rtx = rec->ar_startext + rec->ar_extcount; return 0; @@ -562,13 +552,11 @@ xfs_trim_rtdev_extents( struct xfs_mount *mp, xfs_daddr_t start, xfs_daddr_t end, - xfs_daddr_t minlen, - uint64_t *blocks_trimmed) + xfs_daddr_t minlen) { struct xfs_rtalloc_rec low = { }; struct xfs_rtalloc_rec high = { }; struct xfs_trim_rtdev tr = { - .blocks_trimmed = blocks_trimmed, .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), }; struct xfs_trans *tp; @@ -634,7 +622,7 @@ xfs_trim_rtdev_extents( return error; } #else -# define xfs_trim_rtdev_extents(m,s,e,n,b) (-EOPNOTSUPP) +# define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP) #endif /* CONFIG_XFS_RT */ /* @@ -661,7 +649,6 @@ xfs_ioc_trim( xfs_daddr_t start, end; xfs_extlen_t minlen; xfs_rfsblock_t max_blocks; - uint64_t blocks_trimmed = 0; int error, last_error = 0; if (!capable(CAP_SYS_ADMIN)) @@ -706,15 +693,13 @@ xfs_ioc_trim( end = start + BTOBBT(range.len) - 1; if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) { - error = xfs_trim_datadev_extents(mp, start, end, minlen, - &blocks_trimmed); + error = xfs_trim_datadev_extents(mp, start, end, minlen); if (error) last_error = error; } if (rt_bdev && !xfs_trim_should_stop()) { - error = xfs_trim_rtdev_extents(mp, start, end, minlen, - &blocks_trimmed); + error = xfs_trim_rtdev_extents(mp, start, end, minlen); if (error) last_error = error; } @@ -722,7 +707,8 @@ xfs_ioc_trim( if (last_error) return last_error; - range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + range.len = min_t(unsigned long long, range.len, + XFS_FSB_TO_B(mp, max_blocks)); if (copy_to_user(urange, &range, sizeof(range))) return -EFAULT; return 0; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 85dbb46452ca..71f32354944e 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -71,7 +71,7 @@ xfs_fsmap_owner_to_rmap( switch (src->fmr_owner) { case 0: /* "lowest owner id possible" */ case -1ULL: /* "highest owner id possible" */ - dest->rm_owner = 0; + dest->rm_owner = src->fmr_owner; break; case XFS_FMR_OWN_FREE: dest->rm_owner = XFS_RMAP_OWN_NULL; @@ -162,6 +162,7 @@ struct xfs_getfsmap_info { xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; + xfs_daddr_t end_daddr; /* daddr of high fsmap key */ u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* @@ -182,6 +183,7 @@ struct xfs_getfsmap_dev { int (*fn)(struct xfs_trans *tp, const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info); + sector_t nr_sectors; }; /* Compare two getfsmap device handlers. */ @@ -252,7 +254,7 @@ xfs_getfsmap_rec_before_start( const struct xfs_rmap_irec *rec, xfs_daddr_t rec_daddr) { - if (info->low_daddr != -1ULL) + if (info->low_daddr != XFS_BUF_DADDR_NULL) return rec_daddr < info->low_daddr; if (info->low.rm_blockcount) return xfs_rmap_compare(rec, &info->low) < 0; @@ -294,6 +296,18 @@ xfs_getfsmap_helper( return 0; } + /* + * For an info->last query, we're looking for a gap between the last + * mapping emitted and the high key specified by userspace. If the + * user's query spans less than 1 fsblock, then info->high and + * info->low will have the same rm_startblock, which causes rec_daddr + * and next_daddr to be the same. Therefore, use the end_daddr that + * we calculated from userspace's high key to synthesize the record. + * Note that if the btree query found a mapping, there won't be a gap. + */ + if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) + rec_daddr = info->end_daddr; + /* Are we just counting mappings? */ if (info->head->fmh_count == 0) { if (info->head->fmh_entries == UINT_MAX) @@ -904,17 +918,21 @@ xfs_getfsmap( /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); + handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else handlers[0].fn = xfs_getfsmap_datadev_bnobt; if (mp->m_logdev_targp != mp->m_ddev_targp) { + handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, + mp->m_sb.sb_logblocks); handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT if (mp->m_rtdev_targp) { + handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; } @@ -946,6 +964,7 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; + info.end_daddr = XFS_BUF_DADDR_NULL; info.fsmap_recs = fsmap_recs; info.head = head; @@ -966,8 +985,11 @@ xfs_getfsmap( * low key, zero out the low key so that we get * everything from the beginning. */ - if (handlers[i].dev == head->fmh_keys[1].fmr_device) + if (handlers[i].dev == head->fmh_keys[1].fmr_device) { dkeys[1] = head->fmh_keys[1]; + info.end_daddr = min(handlers[i].nr_sectors - 1, + dkeys[1].fmr_physical); + } if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); @@ -983,7 +1005,7 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; info.pag = NULL; - info.low_daddr = -1ULL; + info.low_daddr = XFS_BUF_DADDR_NULL; info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); if (error) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 0c3e96c621a6..ebeab8e4dab1 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -785,6 +785,39 @@ xfs_alloc_rsum_cache( } /* + * If we changed the rt extent size (meaning there was no rt volume previously) + * and the root directory had EXTSZINHERIT and RTINHERIT set, it's possible + * that the extent size hint on the root directory is no longer congruent with + * the new rt extent size. Log the rootdir inode to fix this. + */ +static int +xfs_growfs_rt_fixup_extsize( + struct xfs_mount *mp) +{ + struct xfs_inode *ip = mp->m_rootip; + struct xfs_trans *tp; + int error = 0; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (!(ip->i_diflags & XFS_DIFLAG_RTINHERIT) || + !(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)) + goto out_iolock; + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange, 0, 0, false, + &tp); + if (error) + goto out_iolock; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + +out_iolock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + +/* * Visible (exported) functions. */ @@ -812,6 +845,7 @@ xfs_growfs_rt( xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ uint8_t *rsum_cache; /* old summary cache */ + xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; sbp = &mp->m_sb; @@ -821,34 +855,39 @@ xfs_growfs_rt( /* Needs to have been mounted with an rt device. */ if (!XFS_IS_REALTIME_MOUNT(mp)) return -EINVAL; + + if (!mutex_trylock(&mp->m_growlock)) + return -EWOULDBLOCK; /* * Mount should fail if the rt bitmap/summary files don't load, but * we'll check anyway. */ + error = -EINVAL; if (!mp->m_rbmip || !mp->m_rsumip) - return -EINVAL; + goto out_unlock; /* Shrink not supported. */ if (in->newblocks <= sbp->sb_rblocks) - return -EINVAL; + goto out_unlock; /* Can only change rt extent size when adding rt volume. */ if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize) - return -EINVAL; + goto out_unlock; /* Range check the extent size. */ if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE || XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE) - return -EINVAL; + goto out_unlock; /* Unsupported realtime features. */ + error = -EOPNOTSUPP; if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) - return -EOPNOTSUPP; + goto out_unlock; nrblocks = in->newblocks; error = xfs_sb_validate_fsb_count(sbp, nrblocks); if (error) - return error; + goto out_unlock; /* * Read in the last block of the device, make sure it exists. */ @@ -856,7 +895,7 @@ xfs_growfs_rt( XFS_FSB_TO_BB(mp, nrblocks - 1), XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); if (error) - return error; + goto out_unlock; xfs_buf_relse(bp); /* @@ -864,8 +903,10 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); - if (!xfs_validate_rtextents(nrextents)) - return -EINVAL; + if (!xfs_validate_rtextents(nrextents)) { + error = -EINVAL; + goto out_unlock; + } nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; @@ -876,8 +917,11 @@ xfs_growfs_rt( * the log. This prevents us from getting a log overflow, * since we'll log basically the whole summary file at once. */ - if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) - return -EINVAL; + if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) { + error = -EINVAL; + goto out_unlock; + } + /* * Get the old block counts for bitmap and summary inodes. * These can't change since other growfs callers are locked out. @@ -889,10 +933,10 @@ xfs_growfs_rt( */ error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); if (error) - return error; + goto out_unlock; error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); if (error) - return error; + goto out_unlock; rsum_cache = mp->m_rsum_cache; if (nrbmblocks != sbp->sb_rbmblocks) @@ -1036,6 +1080,12 @@ error_cancel: if (error) goto out_free; + if (old_rextsize != in->extsize) { + error = xfs_growfs_rt_fixup_extsize(mp); + if (error) + goto out_free; + } + /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); @@ -1059,6 +1109,8 @@ out_free: } } +out_unlock: + mutex_unlock(&mp->m_growlock); return error; } |