diff options
Diffstat (limited to 'fs')
460 files changed, 11471 insertions, 5523 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 8482f2d11606..31c010372660 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -247,7 +247,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) return v9fs_remote_get_acl(dentry, name, buffer, size, type); - acl = v9fs_get_cached_acl(dentry->d_inode, type); + acl = v9fs_get_cached_acl(d_inode(dentry), type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -285,7 +285,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, int retval; struct posix_acl *acl; struct v9fs_session_info *v9ses; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (strcmp(name, "") != 0) return -EINVAL; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 099c7712631c..fb9ffcb43277 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -78,7 +78,6 @@ enum p9_cache_modes { * @cache: cache mode of type &p9_cache_modes * @cachetag: the tag of the cache associated with this session * @fscache: session cookie associated with FS-Cache - * @options: copy of options string given by user * @uname: string user name to mount hierarchy as * @aname: mount specifier for remote hierarchy * @maxdata: maximum data to be sent/recvd per protocol message diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index be35d05a4d0e..e9e04376c52c 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -231,9 +231,7 @@ static int v9fs_launder_page(struct page *page) /** * v9fs_direct_IO - 9P address space operation for direct I/O * @iocb: target I/O control block - * @iov: array of vectors that define I/O buffer * @pos: offset in file to begin the operation - * @nr_segs: size of iovec array * * The presence of v9fs_direct_IO() in the address space ops vector * allowes open() O_DIRECT flags which would have failed otherwise. diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index a345b2d659cc..bd456c668d39 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -53,7 +53,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) dentry, dentry); /* Don't cache negative dentries */ - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return 1; return 0; } @@ -83,7 +83,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) goto out_valid; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 76c3b1ab6361..5cc00e56206e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -138,6 +138,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) &err); if (err) return err; + if (n == 0) + return 0; rdir->head = 0; rdir->tail = n; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 2a9dd37dc426..1ef16bd8280b 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -151,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) { struct p9_flock flock; struct p9_fid *fid; - uint8_t status; + uint8_t status = P9_LOCK_ERROR; int res = 0; unsigned char fl_type; @@ -196,7 +196,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) for (;;) { res = p9_client_lock_dotl(fid, &flock, &status); if (res < 0) - break; + goto out_unlock; if (status != P9_LOCK_BLOCKED) break; @@ -214,14 +214,16 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) case P9_LOCK_BLOCKED: res = -EAGAIN; break; + default: + WARN_ONCE(1, "unknown lock status code: %d\n", status); + /* fallthough */ case P9_LOCK_ERROR: case P9_LOCK_GRACE: res = -ENOLCK; break; - default: - BUG(); } +out_unlock: /* * incase server returned error for lock request, revert * it locally diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 3662f1d1d9cf..703342e309f5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -595,7 +595,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) dir, dentry, flags); v9ses = v9fs_inode2v9ses(dir); - inode = dentry->d_inode; + inode = d_inode(dentry); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { retval = PTR_ERR(dfid); @@ -864,7 +864,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) + if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); err = 0; @@ -881,7 +881,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, } v9fs_invalidate_inode_attr(dir); - v9inode = V9FS_I(dentry->d_inode); + v9inode = V9FS_I(d_inode(dentry)); mutex_lock(&v9inode->v_mutex); if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && !v9inode->writeback_fid && @@ -908,7 +908,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, file->private_data = fid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(dentry->d_inode, file); + v9fs_cache_inode_set_cookie(d_inode(dentry), file); *opened |= FILE_CREATED; out: @@ -969,8 +969,8 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, p9_debug(P9_DEBUG_VFS, "\n"); retval = 0; - old_inode = old_dentry->d_inode; - new_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); v9ses = v9fs_inode2v9ses(old_inode); oldfid = v9fs_fid_lookup(old_dentry); if (IS_ERR(oldfid)) @@ -1061,7 +1061,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -1072,8 +1072,8 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); - generic_fillattr(dentry->d_inode, stat); + v9fs_stat2inode(st, d_inode(dentry), d_inode(dentry)->i_sb); + generic_fillattr(d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1095,7 +1095,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = inode_change_ok(dentry->d_inode, iattr); + retval = inode_change_ok(d_inode(dentry), iattr); if (retval) return retval; @@ -1128,20 +1128,20 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) /* Write all dirty data */ if (d_is_reg(dentry)) - filemap_write_and_wait(dentry->d_inode->i_mapping); + filemap_write_and_wait(d_inode(dentry)->i_mapping); retval = p9_client_wstat(fid, &wstat); if (retval < 0) return retval; if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(dentry->d_inode)) - truncate_setsize(dentry->d_inode, iattr->ia_size); + iattr->ia_size != i_size_read(d_inode(dentry))) + truncate_setsize(d_inode(dentry), iattr->ia_size); - v9fs_invalidate_inode_attr(dentry->d_inode); + v9fs_invalidate_inode_attr(d_inode(dentry)); - setattr_copy(dentry->d_inode, iattr); - mark_inode_dirty(dentry->d_inode); + setattr_copy(d_inode(dentry), iattr); + mark_inode_dirty(d_inode(dentry)); return 0; } @@ -1403,7 +1403,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); __putname(name); if (!retval) { - v9fs_refresh_inode(oldfid, old_dentry->d_inode); + v9fs_refresh_inode(oldfid, d_inode(old_dentry)); v9fs_invalidate_inode_attr(dir); } clunk_fid: diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 6054c16b8fae..9861c7c951a6 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -265,7 +265,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) + if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); v9ses = v9fs_inode2v9ses(dir); @@ -481,7 +481,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -496,8 +496,8 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode_dotl(st, dentry->d_inode); - generic_fillattr(dentry->d_inode, stat); + v9fs_stat2inode_dotl(st, d_inode(dentry)); + generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -557,7 +557,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) int retval; struct p9_fid *fid; struct p9_iattr_dotl p9attr; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); p9_debug(P9_DEBUG_VFS, "\n"); @@ -795,10 +795,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(fid)) return PTR_ERR(fid); - v9fs_refresh_inode_dotl(fid, old_dentry->d_inode); + v9fs_refresh_inode_dotl(fid, d_inode(old_dentry)); } - ihold(old_dentry->d_inode); - d_instantiate(dentry, old_dentry->d_inode); + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); return err; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 0afd0382822b..e99a338a4638 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -168,8 +168,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(st); goto release_sb; } - root->d_inode->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, root->d_inode); + d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode_dotl(st, d_inode(root)); kfree(st); } else { struct p9_wstat *st = NULL; @@ -179,8 +179,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } - root->d_inode->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, root->d_inode, sb); + d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, d_inode(root), sb); p9stat_free(st); kfree(st); diff --git a/fs/Kconfig b/fs/Kconfig index ec35851e5b71..011f43365d7b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -32,6 +32,7 @@ source "fs/gfs2/Kconfig" source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" +source "fs/f2fs/Kconfig" config FS_DAX bool "Direct Access (DAX) support" @@ -217,7 +218,6 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" -source "fs/f2fs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index b9acadafa4a1..335055d828e4 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -298,7 +298,7 @@ out: int adfs_notify_change(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; unsigned int ia_valid = attr->ia_valid; int error; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 5022ac96aa40..a8f463c028ce 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -138,7 +138,7 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino) static int affs_remove_link(struct dentry *dentry) { - struct inode *dir, *inode = dentry->d_inode; + struct inode *dir, *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL, *link_bh = NULL; u32 link_ino, ino; @@ -268,11 +268,11 @@ affs_remove_header(struct dentry *dentry) struct buffer_head *bh = NULL; int retval; - dir = dentry->d_parent->d_inode; + dir = d_inode(dentry->d_parent); sb = dir->i_sb; retval = -ENOENT; - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) goto done; @@ -471,10 +471,9 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) bool affs_nofilenametruncate(const struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); return affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_NO_TRUNCATE); - } /* Check if the name is valid for a affs object. */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 9628003ccd2f..a022f4accd76 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -213,7 +213,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) int affs_notify_change(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index ec8ca0efb960..181e05b46e72 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -251,7 +251,7 @@ int affs_unlink(struct inode *dir, struct dentry *dentry) { pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, - dentry->d_inode->i_ino, dentry); + d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); } @@ -320,7 +320,7 @@ int affs_rmdir(struct inode *dir, struct dentry *dentry) { pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, - dentry->d_inode->i_ino, dentry); + d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); } @@ -403,7 +403,7 @@ err: int affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, dentry); @@ -430,13 +430,13 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, return retval; /* Unlink destination if it already exists */ - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { retval = affs_remove_header(new_dentry); if (retval) return retval; } - bh = affs_bread(sb, old_dentry->d_inode->i_ino); + bh = affs_bread(sb, d_inode(old_dentry)->i_ino); if (!bh) return -EIO; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 4ec35e9130e1..e10e17788f06 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -505,7 +505,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, _enter("{%x:%u},%p{%pd},", vnode->fid.vid, vnode->fid.vnode, dentry, dentry); - ASSERTCMP(dentry->d_inode, ==, NULL); + ASSERTCMP(d_inode(dentry), ==, NULL); if (dentry->d_name.len >= AFSNAMEMAX) { _leave(" = -ENAMETOOLONG"); @@ -563,8 +563,8 @@ success: _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", fid.vnode, fid.unique, - dentry->d_inode->i_ino, - dentry->d_inode->i_generation); + d_inode(dentry)->i_ino, + d_inode(dentry)->i_generation); return NULL; } @@ -586,9 +586,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - vnode = AFS_FS_I(dentry->d_inode); + vnode = AFS_FS_I(d_inode(dentry)); - if (dentry->d_inode) + if (d_really_is_positive(dentry)) _enter("{v={%x:%u} n=%pd fl=%lx},", vnode->fid.vid, vnode->fid.vnode, dentry, vnode->flags); @@ -601,7 +601,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) /* lock down the parent dentry so we can peer at it */ parent = dget_parent(dentry); - dir = AFS_FS_I(parent->d_inode); + dir = AFS_FS_I(d_inode(parent)); /* validate the parent directory */ if (test_bit(AFS_VNODE_MODIFIED, &dir->flags)) @@ -623,9 +623,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) switch (ret) { case 0: /* the filename maps to something */ - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out_bad; - if (is_bad_inode(dentry->d_inode)) { + if (is_bad_inode(d_inode(dentry))) { printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n", dentry); goto out_bad; @@ -647,7 +647,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _debug("%pd: file deleted (uq %u -> %u I:%u)", dentry, fid.unique, vnode->fid.unique, - dentry->d_inode->i_generation); + d_inode(dentry)->i_generation); spin_lock(&vnode->lock); set_bit(AFS_VNODE_DELETED, &vnode->flags); spin_unlock(&vnode->lock); @@ -658,7 +658,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) case -ENOENT: /* the filename is unknown */ _debug("%pd: dirent not found", dentry); - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto not_found; goto out_valid; @@ -703,9 +703,9 @@ static int afs_d_delete(const struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto zap; - if (dentry->d_inode && - (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) || - test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags))) + if (d_really_is_positive(dentry) && + (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(d_inode(dentry))->flags) || + test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(d_inode(dentry))->flags))) goto zap; _leave(" = 0 [keep]"); @@ -814,8 +814,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) if (ret < 0) goto rmdir_error; - if (dentry->d_inode) { - vnode = AFS_FS_I(dentry->d_inode); + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); clear_nlink(&vnode->vfs_inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); afs_discard_callback_on_delete(vnode); @@ -856,8 +856,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) goto error; } - if (dentry->d_inode) { - vnode = AFS_FS_I(dentry->d_inode); + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); /* make sure we have a callback promise on the victim */ ret = afs_validate(vnode, key); @@ -869,7 +869,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (ret < 0) goto remove_error; - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { /* if the file wasn't deleted due to excess hard links, the * fileserver will break the callback promise on the file - if * it had one - before it returns to us, and if it was deleted, @@ -879,7 +879,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) * or it was outstanding on a different server, then it won't * break it either... */ - vnode = AFS_FS_I(dentry->d_inode); + vnode = AFS_FS_I(d_inode(dentry)); if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) _debug("AFS_VNODE_DELETED"); if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) @@ -977,7 +977,7 @@ static int afs_link(struct dentry *from, struct inode *dir, struct key *key; int ret; - vnode = AFS_FS_I(from->d_inode); + vnode = AFS_FS_I(d_inode(from)); dvnode = AFS_FS_I(dir); _enter("{%x:%u},{%x:%u},{%pd}", @@ -1089,7 +1089,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct key *key; int ret; - vnode = AFS_FS_I(old_dentry->d_inode); + vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 8a1d38ef0fc2..e06f5a23352a 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -379,7 +379,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, { struct inode *inode; - inode = dentry->d_inode; + inode = d_inode(dentry); _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); @@ -458,7 +458,7 @@ void afs_evict_inode(struct inode *inode) */ int afs_setattr(struct dentry *dentry, struct iattr *attr) { - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; int ret; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 938c5ab06d5a..ccd0b212e82a 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -134,7 +134,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) _enter("{%pd}", mntpt); - BUG_ON(!mntpt->d_inode); + BUG_ON(!d_inode(mntpt)); ret = -ENOMEM; devname = (char *) get_zeroed_page(GFP_KERNEL); @@ -145,7 +145,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) if (!options) goto error_no_options; - vnode = AFS_FS_I(mntpt->d_inode); + vnode = AFS_FS_I(d_inode(mntpt)); if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { /* if the directory is a pseudo directory, use the d_name */ static const char afs_root_cell[] = ":root.cell."; @@ -169,14 +169,14 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) } } else { /* read the contents of the AFS special symlink */ - loff_t size = i_size_read(mntpt->d_inode); + loff_t size = i_size_read(d_inode(mntpt)); char *buf; ret = -EINVAL; if (size > PAGE_SIZE - 1) goto error_no_page; - page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); + page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); if (IS_ERR(page)) { ret = PTR_ERR(page); goto error_no_page; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 3a57a1b0fb51..b50642870a43 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -85,7 +85,7 @@ int afs_open_socket(void) return -ENOMEM; } - ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); + ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); if (ret < 0) { destroy_workqueue(afs_async_calls); _leave(" = %d [socket]", ret); diff --git a/fs/afs/super.c b/fs/afs/super.c index c4861557e385..1fb4a5129f7d 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -529,7 +529,7 @@ static void afs_destroy_inode(struct inode *inode) static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct afs_volume_status vs; - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; int ret; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index d10e619632ab..5b700ef1e59d 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -235,12 +235,12 @@ static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) { - return sbi->sb->s_root->d_inode->i_ino; + return d_inode(sbi->sb->s_root)->i_ino; } static inline int simple_positive(struct dentry *dentry) { - return dentry->d_inode && !d_unhashed(dentry); + return d_really_is_positive(dentry) && !d_unhashed(dentry); } static inline void __autofs4_add_expiring(struct dentry *dentry) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 11dd118f75e2..1cebc3c52fa5 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -374,7 +374,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (dentry->d_inode && d_is_symlink(dentry)) { + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { DPRINTK("checking symlink %p %pd", dentry, dentry); /* * A symlink can't be "busy" in the usual sense so diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 1c55388ae633..a3ae0b2aeb5a 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -71,7 +71,7 @@ void autofs4_kill_sb(struct super_block *sb) static int autofs4_show_options(struct seq_file *m, struct dentry *root) { struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); - struct inode *root_inode = root->d_sb->s_root->d_inode; + struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) return 0; @@ -352,8 +352,8 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) inode->i_mode = mode; if (sb->s_root) { - inode->i_uid = sb->s_root->d_inode->i_uid; - inode->i_gid = sb->s_root->d_inode->i_gid; + inode->i_uid = d_inode(sb->s_root)->i_uid; + inode->i_gid = d_inode(sb->s_root)->i_gid; } inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_ino = get_next_ino(); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 7e44fdd03e2d..c6d7d3dbd52a 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -240,7 +240,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, spin_lock(&expiring->d_lock); /* We've already been dentry_iput or unlinked */ - if (!expiring->d_inode) + if (d_really_is_negative(expiring)) goto next; qstr = &expiring->d_name; @@ -371,7 +371,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * having d_mountpoint() true, so there's no need to call back * to the daemon. */ - if (dentry->d_inode && d_is_symlink(dentry)) { + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -459,7 +459,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) return 0; if (d_mountpoint(dentry)) return 0; - inode = ACCESS_ONCE(dentry->d_inode); + inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) return -EISDIR; if (list_empty(&dentry->d_subdirs)) @@ -485,7 +485,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * an incorrect ELOOP error return. */ if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || - (dentry->d_inode && d_is_symlink(dentry))) + (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } spin_unlock(&sbi->fs_lock); @@ -625,8 +625,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) } dput(ino->dentry); - dentry->d_inode->i_size = 0; - clear_nlink(dentry->d_inode); + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); dir->i_mtime = CURRENT_TIME; @@ -719,8 +719,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) atomic_dec(&p_ino->count); } dput(ino->dentry); - dentry->d_inode->i_size = 0; - clear_nlink(dentry->d_inode); + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); if (dir->i_nlink) drop_nlink(dir); @@ -839,7 +839,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) */ int is_autofs4_dentry(struct dentry *dentry) { - return dentry && dentry->d_inode && + return dentry && d_really_is_positive(dentry) && dentry->d_op == &autofs4_dentry_operations && dentry->d_fsdata != NULL; } diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index 1e8ea192be2b..de58cc7b8076 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -18,7 +18,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) struct autofs_info *ino = autofs4_dentry_ino(dentry); if (ino && !autofs4_oz_mode(sbi)) ino->last_used = jiffies; - nd_set_link(nd, dentry->d_inode->i_private); + nd_set_link(nd, d_inode(dentry)->i_private); return NULL; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 2ad05ab93db8..35b755e79c2d 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -322,7 +322,7 @@ static int validate_request(struct autofs_wait_queue **wait, * continue on and create a new request. */ if (!IS_ROOT(dentry)) { - if (dentry->d_inode && d_unhashed(dentry)) { + if (d_really_is_positive(dentry) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; new = d_lookup(parent, &dentry->d_name); if (new) @@ -364,7 +364,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (pid == 0 || tgid == 0) return -ENOENT; - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { /* * A wait for a negative dentry is invalid for certain * cases. A direct or offset mount "always" has its mount diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 16e0a48bfccd..7943533c3868 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -471,7 +471,7 @@ static void * befs_follow_link(struct dentry *dentry, struct nameidata *nd) { struct super_block *sb = dentry->d_sb; - struct befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; char *link; @@ -501,7 +501,7 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) static void * befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); nd_set_link(nd, befs_ino->i_data.symlink); return NULL; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 7a8182770649..3ec6113146c0 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -153,7 +153,7 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, static int bfs_link(struct dentry *old, struct inode *dir, struct dentry *new) { - struct inode *inode = old->d_inode; + struct inode *inode = d_inode(old); struct bfs_sb_info *info = BFS_SB(inode->i_sb); int err; @@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, static int bfs_unlink(struct inode *dir, struct dentry *dentry) { int error = -ENOENT; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(inode->i_sb); @@ -216,7 +216,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, int error = -ENOENT; old_bh = new_bh = NULL; - old_inode = old_dentry->d_inode; + old_inode = d_inode(old_dentry); if (S_ISDIR(old_inode->i_mode)) return -EINVAL; @@ -231,7 +231,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; error = -EPERM; - new_inode = new_dentry->d_inode; + new_inode = d_inode(new_dentry); new_bh = bfs_find_entry(new_dir, new_dentry->d_name.name, new_dentry->d_name.len, &new_de); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 9dcb05409ba7..78f005f37847 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -591,7 +591,7 @@ static void kill_node(Node *e) write_unlock(&entries_lock); if (dentry) { - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); d_drop(dentry); dput(dentry); simple_release_fs(&bm_mnt, &entry_count); @@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); kill_node(e); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); break; default: @@ -675,14 +675,14 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out; err = -EEXIST; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out2; inode = bm_get_inode(sb, S_IFREG | 0644); @@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); if (err) { @@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); break; default: diff --git a/fs/block_dev.c b/fs/block_dev.c index 897ee0503932..c7e4163ede87 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -152,7 +152,8 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) struct inode *inode = file->f_mapping->host; return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, - blkdev_get_block, NULL, NULL, 0); + blkdev_get_block, NULL, NULL, + DIO_SKIP_DIO_COUNT); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -1716,7 +1717,7 @@ struct block_device *lookup_bdev(const char *pathname) if (error) return ERR_PTR(error); - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto fail; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 4dabeb893b7c..df9932b00d08 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -87,7 +87,7 @@ BTRFS_WORK_HELPER(scrubwrc_helper); BTRFS_WORK_HELPER(scrubnc_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, int flags, int max_active, +__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -132,7 +132,7 @@ static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, - int flags, + unsigned int flags, int max_active, int thresh) { diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index e386c29ef1f6..ec2ee477f8ba 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -66,7 +66,7 @@ BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); BTRFS_WORK_HELPER_PROTO(scrubnc_helper); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, - int flags, + unsigned int flags, int max_active, int thresh); void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f55721ff9385..9de772ee0031 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1206,7 +1206,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, struct ulist *roots = NULL; struct ulist_iterator uiter; struct ulist_node *node; - struct seq_list elem = {}; + struct seq_list elem = SEQ_LIST_INIT(elem); int ret = 0; tmp = ulist_alloc(GFP_NOFS); @@ -1610,7 +1610,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index de5e4f2adfea..0ef5cc13fae2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,7 +66,11 @@ struct btrfs_inode { */ struct btrfs_key location; - /* Lock for counters */ + /* + * Lock for counters and all fields used to determine if the inode is in + * the log or not (last_trans, last_sub_trans, last_log_commit, + * logged_trans). + */ spinlock_t lock; /* the extent_tree has caches of all the extent mappings to disk */ @@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode) static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) { + int ret = 0; + + spin_lock(&BTRFS_I(inode)->lock); if (BTRFS_I(inode)->logged_trans == generation && BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit && @@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) */ smp_mb(); if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) - return 1; + ret = 1; } - return 0; + spin_unlock(&BTRFS_I(inode)->lock); + return ret; } #define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index d897ef803b3b..ce7dec88f4b8 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2990,8 +2990,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, - GFP_NOFS); + mapped_datav = kmalloc_array(bio->bi_vcnt, + sizeof(*mapped_datav), GFP_NOFS); if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; @@ -3241,8 +3241,5 @@ void btrfsic_unmount(struct btrfs_root *root, mutex_unlock(&btrfsic_mutex); - if (is_vmalloc_addr(state)) - vfree(state); - else - kfree(state); + kvfree(state); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e9df8862012c..ce62324c78e7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -622,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->orig_bio = bio; nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE); - cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, + cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) goto fail1; @@ -750,7 +750,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; -static struct btrfs_compress_op *btrfs_compress_op[] = { +static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, }; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d181f70caae0..13a4dc0436c9 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -77,7 +77,7 @@ struct btrfs_compress_op { size_t srclen, size_t destlen); }; -extern struct btrfs_compress_op btrfs_zlib_compress; -extern struct btrfs_compress_op btrfs_lzo_compress; +extern const struct btrfs_compress_op btrfs_zlib_compress; +extern const struct btrfs_compress_op btrfs_lzo_compress; #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d67f32e648d..0f11ebc92f02 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -578,7 +578,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (!tree_mod_need_log(fs_info, eb)) return 0; - tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); if (!tm_list) return -ENOMEM; @@ -677,7 +677,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal && btrfs_header_level(old_root) > 0) { nritems = btrfs_header_nritems(old_root); - tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), flags); if (!tm_list) { ret = -ENOMEM; @@ -814,7 +814,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) return 0; - tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; @@ -905,8 +905,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) return 0; nritems = btrfs_header_nritems(eb); - tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), - GFP_NOFS); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; @@ -1073,7 +1072,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); *last_ref = 1; } return 0; @@ -1678,7 +1677,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = btrfs_find_tree_block(root, blocknr); + cur = btrfs_find_tree_block(root->fs_info, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else @@ -1943,7 +1942,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1997,7 +1996,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -2041,7 +2040,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -2259,7 +2258,7 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, slot); blocksize = root->nodesize; - eb = btrfs_find_tree_block(root, search); + eb = btrfs_find_tree_block(root->fs_info, search); if (eb) { free_extent_buffer(eb); return; @@ -2319,7 +2318,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1); + eb = btrfs_find_tree_block(root->fs_info, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. That will loop @@ -2332,7 +2331,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2); + eb = btrfs_find_tree_block(root->fs_info, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); @@ -2450,7 +2449,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - tmp = btrfs_find_tree_block(root, blocknr); + tmp = btrfs_find_tree_block(root->fs_info, blocknr); if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { @@ -3126,7 +3125,8 @@ again: * higher levels * */ -static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -3137,7 +3137,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); + tree_mod_log_set_node_key(fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -3151,7 +3151,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; @@ -3173,7 +3174,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_item_key(eb, &disk_key, slot); btrfs_mark_buffer_dirty(eb); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } /* @@ -3692,7 +3693,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (left_nritems) btrfs_mark_buffer_dirty(left); else - clean_tree_block(trans, root, left); + clean_tree_block(trans, root->fs_info, left); btrfs_mark_buffer_dirty(right); @@ -3704,7 +3705,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); + clean_tree_block(trans, root->fs_info, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3928,10 +3929,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); else - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -4168,6 +4169,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int mid; int slot; struct extent_buffer *right; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int wret; int split; @@ -4271,10 +4273,10 @@ again: btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); - write_extent_buffer(right, root->fs_info->fsid, + write_extent_buffer(right, fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); - write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + write_extent_buffer(right, fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); @@ -4297,7 +4299,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; @@ -4615,7 +4617,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } item = btrfs_item_nr(slot); @@ -4716,7 +4718,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -4888,7 +4890,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, level + 1); + fixup_low_keys(root->fs_info, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); } @@ -4981,7 +4983,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_set_path_blocking(path); - clean_tree_block(trans, root, leaf); + clean_tree_block(trans, root->fs_info, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { @@ -4990,7 +4992,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9c89cae39ee..6f364e1d8d3d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1061,6 +1061,12 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +#define BTRFS_QGROUP_LEVEL_SHIFT 48 +static inline u64 btrfs_qgroup_level(u64 qgroupid) +{ + return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; +} + /* * is subvolume quota turned on? */ @@ -1256,6 +1262,20 @@ struct btrfs_caching_control { atomic_t count; }; +struct btrfs_io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + struct inode *inode; + unsigned long size; + int index; + int num_pages; + int entries; + int bitmaps; + unsigned check_crcs:1; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -1321,6 +1341,9 @@ struct btrfs_block_group_cache { /* For dirty block groups */ struct list_head dirty_list; + struct list_head io_list; + + struct btrfs_io_ctl io_ctl; }; /* delayed seq elem */ @@ -1329,6 +1352,8 @@ struct seq_list { u64 seq; }; +#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } + enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -1472,6 +1497,12 @@ struct btrfs_fs_info { struct mutex chunk_mutex; struct mutex volume_mutex; + /* + * this is taken to make sure we don't set block groups ro after + * the free space cache has been allocated on them + */ + struct mutex ro_block_group_mutex; + /* this is used during read/modify/write to make sure * no two ios are trying to mod the same stripe at the same * time @@ -1513,6 +1544,7 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; + struct rw_semaphore delayed_iput_sem; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -3295,6 +3327,9 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) } /* extent-tree.c */ + +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes); + static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) { @@ -3385,6 +3420,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int no_quota); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, @@ -3417,7 +3454,7 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; -int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3440,6 +3477,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); @@ -3486,7 +3524,8 @@ int btrfs_previous_item(struct btrfs_root *root, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); @@ -4180,7 +4219,8 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || - (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && + !btrfs_qgroup_level(rootid))) return 1; return 0; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 82f0c7c95474..a2ae42720a6a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1383,7 +1383,7 @@ out: static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int nr) + struct btrfs_fs_info *fs_info, int nr) { struct btrfs_async_delayed_work *async_work; @@ -1399,7 +1399,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, btrfs_async_run_delayed_root, NULL, NULL); async_work->nr = nr; - btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); + btrfs_queue_work(fs_info->delayed_workers, &async_work->work); return 0; } @@ -1426,6 +1426,7 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; + struct btrfs_fs_info *fs_info = root->fs_info; delayed_root = btrfs_get_delayed_root(root); @@ -1438,7 +1439,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) seq = atomic_read(&delayed_root->items_seq); - ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); + ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0); if (ret) return; @@ -1447,7 +1448,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) return; } - btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); + btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); } /* Will return 0 or -ENOMEM */ @@ -1801,6 +1802,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); + inode->i_version = btrfs_stack_inode_sequence(inode_item); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6d16bea94e1c..8f8ed7d20bac 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -489,11 +489,13 @@ update_existing_ref(struct btrfs_trans_handle *trans, * existing and update must have the same bytenr */ static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_node *existing, +update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { struct btrfs_delayed_ref_head *existing_ref; struct btrfs_delayed_ref_head *ref; + int old_ref_mod; existing_ref = btrfs_delayed_node_to_head(existing); ref = btrfs_delayed_node_to_head(update); @@ -541,7 +543,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ + old_ref_mod = existing_ref->total_ref_mod; existing->ref_mod += update->ref_mod; + existing_ref->total_ref_mod += update->ref_mod; + + /* + * If we are going to from a positive ref mod to a negative or vice + * versa we need to make sure to adjust pending_csums accordingly. + */ + if (existing_ref->is_data) { + if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + delayed_refs->pending_csums -= existing->num_bytes; + if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + delayed_refs->pending_csums += existing->num_bytes; + } spin_unlock(&existing_ref->lock); } @@ -605,6 +620,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref->is_data = is_data; head_ref->ref_root = RB_ROOT; head_ref->processing = 0; + head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -614,7 +630,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - update_existing_head_ref(&existing->node, ref); + update_existing_head_ref(delayed_refs, &existing->node, ref); /* * we've updated the existing ref, free the newly * allocated ref @@ -622,6 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + if (is_data && count_mod < 0) + delayed_refs->pending_csums += num_bytes; delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a764e2340d48..5eb0892396d0 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -88,6 +88,14 @@ struct btrfs_delayed_ref_head { struct rb_node href_node; struct btrfs_delayed_extent_op *extent_op; + + /* + * This is used to track the final ref_mod from all the refs associated + * with this head ref, this is not adjusted as delayed refs are run, + * this is meant to track if we need to do the csum accounting or not. + */ + int total_ref_mod; + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree @@ -138,6 +146,8 @@ struct btrfs_delayed_ref_root { /* total number of head nodes ready for processing */ unsigned long num_heads_ready; + u64 pending_csums; + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5ec03d999c37..0573848c7333 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -670,8 +670,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: srcdev = dev_replace->srcdev; - args->status.progress_1000 = div64_u64(dev_replace->cursor_left, - div64_u64(btrfs_device_get_total_bytes(srcdev), 1000)); + args->status.progress_1000 = div_u64(dev_replace->cursor_left, + div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } btrfs_dev_replace_unlock(dev_replace); @@ -806,7 +806,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_dev_replace_status(fs_info, status_args); progress = status_args->status.progress_1000; kfree(status_args); - do_div(progress, 10); + progress = div_u64(progress, 10); printk_in_rcu(KERN_INFO "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", dev_replace->srcdev->missing ? "<missing disk>" : diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 639f2663ed3f..2ef9a4b72d06 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -54,7 +54,7 @@ #include <asm/cpufeature.h> #endif -static struct extent_io_ops btree_extent_io_ops; +static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, @@ -274,10 +274,11 @@ void btrfs_csum_final(u32 crc, char *result) * compute the csum for a btree block, and either verify it or write it * into the csum field of the block. */ -static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, +static int csum_tree_block(struct btrfs_fs_info *fs_info, + struct extent_buffer *buf, int verify) { - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -302,7 +303,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, offset += cur_len; } if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + result = kzalloc(csum_size, GFP_NOFS); if (!result) return 1; } else { @@ -321,7 +322,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, printk_ratelimited(KERN_WARNING "BTRFS: %s checksum verify failed on %llu wanted %X found %X " "level %d\n", - root->fs_info->sb->s_id, buf->start, + fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); @@ -418,12 +419,6 @@ static int btrfs_check_super_csum(char *raw_disk_sb) if (memcmp(raw_disk_sb, result, csum_size)) ret = 1; - - if (ret && btrfs_super_generation(disk_sb) < 10) { - printk(KERN_WARNING - "BTRFS: super block crcs don't match, older mkfs detected\n"); - ret = 0; - } } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { @@ -501,7 +496,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, * we only fill in the checksum field in the first page of a multi-page block */ -static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) { u64 start = page_offset(page); u64 found_start; @@ -513,14 +508,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) found_start = btrfs_header_bytenr(eb); if (WARN_ON(found_start != start || !PageUptodate(page))) return 0; - csum_tree_block(root, eb, 0); + csum_tree_block(fs_info, eb, 0); return 0; } -static int check_tree_block_fsid(struct btrfs_root *root, +static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u8 fsid[BTRFS_UUID_SIZE]; int ret = 1; @@ -640,7 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; goto err; } - if (check_tree_block_fsid(root, eb)) { + if (check_tree_block_fsid(root->fs_info, eb)) { printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", eb->fs_info->sb->s_id, eb->start); ret = -EIO; @@ -657,7 +652,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(root, eb, 1); + ret = csum_tree_block(root->fs_info, eb, 1); if (ret) { ret = -EIO; goto err; @@ -882,7 +877,7 @@ static int btree_csum_one_bio(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root, bvec->bv_page); + ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); if (ret) break; } @@ -1119,10 +1114,10 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, return 0; } -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { - return find_extent_buffer(root->fs_info, bytenr); + return find_extent_buffer(fs_info, bytenr); } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, @@ -1165,11 +1160,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, } -void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, struct extent_buffer *buf) { - struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -2146,6 +2140,267 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) } } +static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) +{ + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->scrub_pause_wait); + fs_info->scrub_workers_refcnt = 0; +} + +static void btrfs_init_balance(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); +} + +static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + set_nlink(fs_info->btree_inode, 1); + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + set_bit(BTRFS_INODE_DUMMY, + &BTRFS_I(fs_info->btree_inode)->runtime_flags); + btrfs_insert_inode_hash(fs_info->btree_inode); +} + +static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) +{ + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); + init_waitqueue_head(&fs_info->replace_wait); +} + +static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->qgroup_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + fs_info->qgroup_ulist = NULL; + mutex_init(&fs_info->qgroup_rescan_lock); +} + +static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int max_active = fs_info->thread_pool_size; + unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + + fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); + + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); + + /* + * a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); + + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->endio_repair_workers = + btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + fs_info->readahead_workers = + btrfs_alloc_workqueue("readahead", flags, max_active, 2); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); + + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_repair_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->extent_workers && + fs_info->qgroup_rescan_workers)) { + return -ENOMEM; + } + + return 0; +} + +static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *log_tree_root; + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "BTRFS: log replay required " + "on RO media\n"); + return -EIO; + } + + log_tree_root = btrfs_alloc_root(fs_info); + if (!log_tree_root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, log_tree_root, fs_info, + BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + fs_info->generation + 1); + if (!log_tree_root->node || + !extent_buffer_uptodate(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return -EIO; + } + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); + if (ret) { + btrfs_error(tree_root->fs_info, ret, + "Failed to recover log tree"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return ret; + } + + if (fs_info->sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + if (ret) + return ret; + } + + return 0; +} + +static int btrfs_read_roots(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + struct btrfs_root *root; + struct btrfs_key location; + int ret; + + location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->extent_root = root; + + location.objectid = BTRFS_DEV_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + btrfs_init_devices_late(fs_info); + + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; + + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(root)) { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->quota_enabled = 1; + fs_info->pending_quota_state = 1; + fs_info->quota_root = root; + } + + location.objectid = BTRFS_UUID_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + return ret; + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->uuid_root = root; + } + + return 0; +} + int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -2160,21 +2415,12 @@ int open_ctree(struct super_block *sb, struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; - struct btrfs_root *extent_root; - struct btrfs_root *csum_root; struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *quota_root; - struct btrfs_root *uuid_root; - struct btrfs_root *log_tree_root; int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; int max_active; - int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; - bool create_uuid_tree; - bool check_uuid_tree; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); @@ -2241,11 +2487,12 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); - mutex_init(&fs_info->unused_bg_unpin_mutex); rwlock_init(&fs_info->tree_mod_log_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); + init_rwsem(&fs_info->delayed_iput_sem); init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2276,7 +2523,7 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); + fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); @@ -2294,55 +2541,18 @@ int open_ctree(struct super_block *sb, } btrfs_init_delayed_root(fs_info->delayed_root); - mutex_init(&fs_info->scrub_lock); - atomic_set(&fs_info->scrubs_running, 0); - atomic_set(&fs_info->scrub_pause_req, 0); - atomic_set(&fs_info->scrubs_paused, 0); - atomic_set(&fs_info->scrub_cancel_req, 0); - init_waitqueue_head(&fs_info->replace_wait); - init_waitqueue_head(&fs_info->scrub_pause_wait); - fs_info->scrub_workers_refcnt = 0; + btrfs_init_scrub(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; #endif - - spin_lock_init(&fs_info->balance_lock); - mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); - atomic_set(&fs_info->balance_pause_req, 0); - atomic_set(&fs_info->balance_cancel_req, 0); - fs_info->balance_ctl = NULL; - init_waitqueue_head(&fs_info->balance_wait_q); + btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); sb->s_bdi = &fs_info->bdi; - fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - set_nlink(fs_info->btree_inode, 1); - /* - * we set the i_size on the btree inode to the max possible int. - * the real end of the address space is determined by all of - * the devices in the system - */ - fs_info->btree_inode->i_size = OFFSET_MAX; - fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - - RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); - extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, - fs_info->btree_inode->i_mapping); - BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; - extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); - - BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; - - BTRFS_I(fs_info->btree_inode)->root = tree_root; - memset(&BTRFS_I(fs_info->btree_inode)->location, 0, - sizeof(struct btrfs_key)); - set_bit(BTRFS_INODE_DUMMY, - &BTRFS_I(fs_info->btree_inode)->runtime_flags); - btrfs_insert_inode_hash(fs_info->btree_inode); + btrfs_init_btree_inode(fs_info, tree_root); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2363,26 +2573,14 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); + mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); - fs_info->dev_replace.lock_owner = 0; - atomic_set(&fs_info->dev_replace.nesting_level, 0); - mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - mutex_init(&fs_info->dev_replace.lock_management_lock); - mutex_init(&fs_info->dev_replace.lock); - spin_lock_init(&fs_info->qgroup_lock); - mutex_init(&fs_info->qgroup_ioctl_lock); - fs_info->qgroup_tree = RB_ROOT; - fs_info->qgroup_op_tree = RB_ROOT; - INIT_LIST_HEAD(&fs_info->dirty_qgroups); - fs_info->qgroup_seq = 1; - fs_info->quota_enabled = 0; - fs_info->pending_quota_state = 0; - fs_info->qgroup_ulist = NULL; - mutex_init(&fs_info->qgroup_rescan_lock); + btrfs_init_dev_replace_locks(fs_info); + btrfs_init_qgroup(fs_info); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -2554,75 +2752,9 @@ int open_ctree(struct super_block *sb, max_active = fs_info->thread_pool_size; - fs_info->workers = - btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); - - fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); - - fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); - - fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); - - /* - * a higher idle thresh on the submit workers makes it much more - * likely that bios will be send down in a sane order to the - * devices - */ - fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, - min_t(u64, fs_devices->num_devices, - max_active), 64); - - fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); - - /* - * endios are largely parallel and should have a very - * low idle thresh - */ - fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); - fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); - fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); - fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); - fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); - fs_info->rmw_workers = - btrfs_alloc_workqueue("rmw", flags, max_active, 2); - fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); - fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); - fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); - fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); - fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); - fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, - min_t(u64, fs_devices->num_devices, - max_active), 8); - - if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->submit_workers && fs_info->flush_workers && - fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->endio_meta_write_workers && - fs_info->endio_repair_workers && - fs_info->endio_write_workers && fs_info->endio_raid56_workers && - fs_info->endio_freespace_worker && fs_info->rmw_workers && - fs_info->caching_workers && fs_info->readahead_workers && - fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->extent_workers && - fs_info->qgroup_rescan_workers)) { - err = -ENOMEM; + ret = btrfs_init_workqueues(fs_info, fs_devices); + if (ret) { + err = ret; goto fail_sb_buffer; } @@ -2688,7 +2820,7 @@ int open_ctree(struct super_block *sb, * keep the device that is marked to be the target device for the * dev_replace procedure */ - btrfs_close_extra_devices(fs_info, fs_devices, 0); + btrfs_close_extra_devices(fs_devices, 0); if (!fs_devices->latest_bdev) { printk(KERN_ERR "BTRFS: failed to read devices on %s\n", @@ -2714,61 +2846,9 @@ retry_root_backup: tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); - location.objectid = BTRFS_EXTENT_TREE_OBJECTID; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = 0; - - extent_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(extent_root)) { - ret = PTR_ERR(extent_root); - goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state); - fs_info->extent_root = extent_root; - - location.objectid = BTRFS_DEV_TREE_OBJECTID; - dev_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(dev_root)) { - ret = PTR_ERR(dev_root); - goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state); - fs_info->dev_root = dev_root; - btrfs_init_devices_late(fs_info); - - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - csum_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(csum_root)) { - ret = PTR_ERR(csum_root); + ret = btrfs_read_roots(fs_info, tree_root); + if (ret) goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state); - fs_info->csum_root = csum_root; - - location.objectid = BTRFS_QUOTA_TREE_OBJECTID; - quota_root = btrfs_read_tree_root(tree_root, &location); - if (!IS_ERR(quota_root)) { - set_bit(BTRFS_ROOT_TRACK_DIRTY, "a_root->state); - fs_info->quota_enabled = 1; - fs_info->pending_quota_state = 1; - fs_info->quota_root = quota_root; - } - - location.objectid = BTRFS_UUID_TREE_OBJECTID; - uuid_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - if (ret != -ENOENT) - goto recovery_tree_root; - create_uuid_tree = true; - check_uuid_tree = false; - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state); - fs_info->uuid_root = uuid_root; - create_uuid_tree = false; - check_uuid_tree = - generation != btrfs_super_uuid_tree_generation(disk_super); - } fs_info->generation = generation; fs_info->last_trans_committed = generation; @@ -2792,7 +2872,7 @@ retry_root_backup: goto fail_block_groups; } - btrfs_close_extra_devices(fs_info, fs_devices, 1); + btrfs_close_extra_devices(fs_devices, 1); ret = btrfs_sysfs_add_one(fs_info); if (ret) { @@ -2806,7 +2886,7 @@ retry_root_backup: goto fail_sysfs; } - ret = btrfs_read_block_groups(extent_root); + ret = btrfs_read_block_groups(fs_info->extent_root); if (ret) { printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); goto fail_sysfs; @@ -2864,48 +2944,11 @@ retry_root_backup: /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0) { - u64 bytenr = btrfs_super_log_root(disk_super); - - if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "BTRFS: log replay required " - "on RO media\n"); - err = -EIO; - goto fail_qgroup; - } - - log_tree_root = btrfs_alloc_root(fs_info); - if (!log_tree_root) { - err = -ENOMEM; - goto fail_qgroup; - } - - __setup_root(nodesize, sectorsize, stripesize, - log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); - - log_tree_root->node = read_tree_block(tree_root, bytenr, - generation + 1); - if (!log_tree_root->node || - !extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); - goto fail_qgroup; - } - /* returns with log_tree_root freed on success */ - ret = btrfs_recover_log_trees(log_tree_root); + ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { - btrfs_error(tree_root->fs_info, ret, - "Failed to recover log tree"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + err = ret; goto fail_qgroup; } - - if (sb->s_flags & MS_RDONLY) { - ret = btrfs_commit_super(tree_root); - if (ret) - goto fail_qgroup; - } } ret = btrfs_find_orphan_roots(tree_root); @@ -2966,7 +3009,7 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (create_uuid_tree) { + if (!fs_info->uuid_root) { pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { @@ -2975,8 +3018,9 @@ retry_root_backup: close_ctree(tree_root); return ret; } - } else if (check_uuid_tree || - btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { + } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + fs_info->generation != + btrfs_super_uuid_tree_generation(disk_super)) { pr_info("BTRFS: checking UUID tree\n"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { @@ -3668,7 +3712,7 @@ void close_ctree(struct btrfs_root *root) if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) - btrfs_err(root->fs_info, "commit super ret %d", ret); + btrfs_err(fs_info, "commit super ret %d", ret); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) @@ -3680,10 +3724,10 @@ void close_ctree(struct btrfs_root *root) fs_info->closing = 2; smp_mb(); - btrfs_free_qgroup_config(root->fs_info); + btrfs_free_qgroup_config(fs_info); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { - btrfs_info(root->fs_info, "at unmount delalloc count %lld", + btrfs_info(fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } @@ -3723,7 +3767,7 @@ void close_ctree(struct btrfs_root *root) btrfs_free_stripe_hash_table(fs_info); - btrfs_free_block_rsv(root, root->orphan_block_rsv); + __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; lock_chunks(root); @@ -4134,7 +4178,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { - eb = btrfs_find_tree_block(root, start); + eb = btrfs_find_tree_block(root->fs_info, start); start += root->nodesize; if (!eb) continue; @@ -4285,7 +4329,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } -static struct extent_io_ops btree_extent_io_ops = { +static const struct extent_io_ops btree_extent_io_ops = { .readpage_end_io_hook = btree_readpage_end_io_hook, .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 27d44c0fd236..d4cbfeeeedd4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,7 +52,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr); void clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf); + struct btrfs_fs_info *fs_info, struct extent_buffer *buf); int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); @@ -61,7 +61,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 37d164540c3a..8d052209f473 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -152,7 +152,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -220,8 +220,8 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = child->d_inode; - struct inode *dir = parent->d_inode; + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode_ref *iref; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8b353ad02f03..7effed6f2fa6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2538,6 +2538,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * list before we release it. */ if (btrfs_delayed_ref_is_head(ref)) { + if (locked_ref->is_data && + locked_ref->total_ref_mod < 0) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= ref->num_bytes; + spin_unlock(&delayed_refs->lock); + } btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; } @@ -2561,8 +2567,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, */ spin_lock(&delayed_refs->lock); avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; - avg = div64_u64(avg, 4); - fs_info->avg_delayed_ref_runtime = avg; + fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ spin_unlock(&delayed_refs->lock); } return 0; @@ -2624,7 +2629,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) * We don't ever fill up leaves all the way so multiply by 2 just to be * closer to what we're really going to want to ouse. */ - return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); + return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +/* + * Takes the number of bytes to be csumm'ed and figures out how many leaves it + * would require to store the csums for that many bytes. + */ +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) +{ + u64 csum_size; + u64 num_csums_per_leaf; + u64 num_csums; + + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = div64_u64(csum_size, + (u64)btrfs_super_csum_size(root->fs_info->super_copy)); + num_csums = div64_u64(csum_bytes, root->sectorsize); + num_csums += num_csums_per_leaf - 1; + num_csums = div64_u64(num_csums, num_csums_per_leaf); + return num_csums; } int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, @@ -2632,7 +2656,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, { struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; - u64 num_bytes; + u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; + u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; + u64 num_bytes, num_dirty_bgs_bytes; int ret = 0; num_bytes = btrfs_calc_trans_metadata_size(root, 1); @@ -2640,17 +2666,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, if (num_heads > 1) num_bytes += (num_heads - 1) * root->nodesize; num_bytes <<= 1; + num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize; + num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root, + num_dirty_bgs); global_rsv = &root->fs_info->global_block_rsv; /* * If we can't allocate any more chunks lets make sure we have _lots_ of * wiggle room since running delayed refs can create more delayed refs. */ - if (global_rsv->space_info->full) + if (global_rsv->space_info->full) { + num_dirty_bgs_bytes <<= 1; num_bytes <<= 1; + } spin_lock(&global_rsv->lock); - if (global_rsv->reserved <= num_bytes) + if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) ret = 1; spin_unlock(&global_rsv->lock); return ret; @@ -3147,10 +3178,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); fail: - if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_release_path(path); return ret; } @@ -3193,7 +3222,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct inode *inode = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; - int num_pages = 0; + u64 num_pages = 0; int retries = 0; int ret = 0; @@ -3267,15 +3296,14 @@ again: if (ret) goto out_put; - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) goto out_put; } spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE) || - block_group->delalloc_bytes) { + !btrfs_test_opt(root, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3293,14 +3321,14 @@ again: * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); if (!num_pages) num_pages = 1; num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages); + ret = btrfs_check_data_free_space(inode, num_pages, num_pages); if (ret) goto out_put; @@ -3351,16 +3379,188 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, return 0; } +/* + * transaction commit does final block group cache writeback during a + * critical section where nothing is allowed to change the FS. This is + * required in order for the cache to actually match the block group, + * but can introduce a lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group + * cache IO. There's a chance we'll have to redo some of it if the + * block group changes again during the commit, but it greatly reduces + * the commit latency by getting rid of the easy block groups while + * we're still allowing others to join the commit. + */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; + } + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + +again: + /* + * make sure all the block groups on our dirty list actually + * exist + */ + btrfs_create_pending_block_groups(trans, root); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + while (!list_empty(&dirty)) { + cache = list_first_entry(&dirty, + struct btrfs_block_group_cache, + dirty_list); + /* + * this can happen if something re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide + * if it should update the cache_state. Don't delete + * until after we wait. + * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + + /* + * the cache_write_mutex is protecting + * the io_list + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, root, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, root, ret); + } + } + + /* if its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + + if (ret) + break; + + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. + */ + mutex_unlock(&trans->transaction->cache_write_mutex); + mutex_lock(&trans->transaction->cache_write_mutex); + } + mutex_unlock(&trans->transaction->cache_write_mutex); + + /* + * go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); + /* + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } + + btrfs_free_path(path); + return ret; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_group_cache *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; + int should_put; struct btrfs_path *path; - - if (list_empty(&cur_trans->dirty_bgs)) - return 0; + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3376,16 +3576,64 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, dirty_list); + + /* + * this can happen if cache_save_setup re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + /* + * don't remove from the dirty list until after we've waited + * on any pending IO + */ list_del_init(&cache->dirty_list); - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - cache_save_setup(cache, trans, path); - if (!ret) - ret = btrfs_run_delayed_refs(trans, root, - (unsigned long) -1); - if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) - btrfs_write_out_cache(root, trans, cache, path); + should_put = 1; + + cache_save_setup(cache, trans, path); + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + if (ret) + btrfs_abort_transaction(trans, root, ret); + } + + /* if its not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + } + + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group_cache, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); } @@ -3635,19 +3883,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) * This will check the space that the inode allocates from to make sure we have * enough space for bytes. */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes) +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 used; - int ret = 0, committed = 0, alloc_chunk = 1; + int ret = 0; + int need_commit = 2; + int have_pinned_space; /* make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, root->sectorsize); if (btrfs_is_free_space_inode(inode)) { - committed = 1; + need_commit = 0; ASSERT(current->journal_info); } @@ -3669,7 +3919,7 @@ again: * if we don't have enough free bytes in this space then we need * to alloc a new chunk. */ - if (!data_sinfo->full && alloc_chunk) { + if (!data_sinfo->full) { u64 alloc_target; data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; @@ -3697,8 +3947,10 @@ alloc: if (ret < 0) { if (ret != -ENOSPC) return ret; - else + else { + have_pinned_space = 1; goto commit_trans; + } } if (!data_sinfo) @@ -3709,26 +3961,39 @@ alloc: /* * If we don't have enough pinned space to deal with this - * allocation don't bother committing the transaction. + * allocation, and no removed chunk in current transaction, + * don't bother committing the transaction. */ - if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, - bytes) < 0) - committed = 1; + have_pinned_space = percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes); spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ commit_trans: - if (!committed && + if (need_commit && !atomic_read(&root->fs_info->open_ioctl_trans)) { - committed = 1; + need_commit--; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - goto again; + if (have_pinned_space >= 0 || + trans->transaction->have_free_bgs || + need_commit > 0) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + /* + * make sure that all running delayed iput are + * done + */ + down_write(&root->fs_info->delayed_iput_sem); + up_write(&root->fs_info->delayed_iput_sem); + goto again; + } else { + btrfs_end_transaction(trans, root); + } } trace_btrfs_space_reservation(root->fs_info, @@ -3736,12 +4001,16 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } + ret = btrfs_qgroup_reserve(root, write_bytes); + if (ret) + goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); +out: spin_unlock(&data_sinfo->lock); - return 0; + return ret; } /* @@ -4298,8 +4567,13 @@ out: static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, struct btrfs_fs_info *fs_info, u64 used) { - return (used >= div_factor_fine(space_info->total_bytes, 98) && - !btrfs_fs_closing(fs_info) && + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. */ + if (space_info->bytes_used >= thresh) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } @@ -4354,10 +4628,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (!btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) return; - } while (flush_state <= COMMIT_TRANS); - - if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) - queue_work(system_unbound_wq, work); + } while (flush_state < COMMIT_TRANS); } void btrfs_init_async_reclaim_work(struct work_struct *work) @@ -4700,6 +4971,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) +{ + kfree(rsv); +} + int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) @@ -4812,10 +5088,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * csum_size * 2; - num_bytes += div64_u64(data_used + meta_used, 50); + num_bytes += div_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); + num_bytes = div_u64(meta_used, 3); return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); } @@ -4998,8 +5274,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, u64 qgroup_reserved) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); } /** @@ -5066,30 +5340,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, int reserve) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 csum_size; - int num_csums_per_leaf; - int num_csums; - int old_csums; + u64 old_csums, num_csums; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && BTRFS_I(inode)->csum_bytes == 0) return 0; - old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); if (reserve) BTRFS_I(inode)->csum_bytes += num_bytes; else BTRFS_I(inode)->csum_bytes -= num_bytes; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); - num_csums_per_leaf = (int)div64_u64(csum_size, - sizeof(struct btrfs_csum_item) + - sizeof(struct btrfs_disk_key)); - num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); - num_csums = num_csums + num_csums_per_leaf - 1; - num_csums = num_csums / num_csums_per_leaf; - - old_csums = old_csums + num_csums_per_leaf - 1; - old_csums = old_csums / num_csums_per_leaf; + num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); /* No change, no need to reserve more */ if (old_csums == num_csums) @@ -5163,8 +5425,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, num_bytes + - nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); if (ret) goto out_fail; } @@ -5172,8 +5433,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, num_bytes + - nr_extents * root->nodesize); + btrfs_qgroup_free(root, nr_extents * root->nodesize); goto out_fail; } @@ -5290,10 +5550,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); - if (root->fs_info->quota_enabled) { - btrfs_qgroup_free(root, num_bytes + - dropped * root->nodesize); - } btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); @@ -5318,7 +5574,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); if (ret) return ret; @@ -5390,14 +5646,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); - byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5446,6 +5694,16 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->unused_bgs_lock); } } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->transaction->num_dirty_bgs++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; @@ -6956,15 +7214,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, return -ENOSPC; } - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_discard_extent(root, start, len, NULL); - if (pin) pin_down_extent(root, cache, start, len, 1); else { + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); } + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -7095,9 +7353,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); if (ret) { + btrfs_free_path(path); btrfs_free_and_pin_reserved_extent(root, ins->objectid, root->nodesize); - btrfs_free_path(path); return ret; } @@ -7217,7 +7475,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); @@ -7311,7 +7569,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, * returns the key for the extent through ins, and a tree buffer for * the first block of the extent through buf. * - * returns the tree buffer or NULL. + * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7322,6 +7580,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + struct btrfs_delayed_extent_op *extent_op; u64 flags = 0; int ret; u32 blocksize = root->nodesize; @@ -7342,13 +7601,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(root, blocksize, blocksize, empty_size, hint, &ins, 0, 0); - if (ret) { - unuse_block_rsv(root->fs_info, block_rsv, blocksize); - return ERR_PTR(ret); - } + if (ret) + goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); - BUG_ON(IS_ERR(buf)); /* -ENOMEM */ + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_free_reserved; + } if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -7358,9 +7618,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; extent_op = btrfs_alloc_delayed_extent_op(); - BUG_ON(!extent_op); /* -ENOMEM */ + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else @@ -7375,13 +7637,24 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - ins.objectid, - ins.offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); - BUG_ON(ret); /* -ENOMEM */ + ins.objectid, ins.offset, + parent, root_objectid, level, + BTRFS_ADD_DELAYED_EXTENT, + extent_op, 0); + if (ret) + goto out_free_delayed; } return buf; + +out_free_delayed: + btrfs_free_delayed_extent_op(extent_op); +out_free_buf: + free_extent_buffer(buf); +out_free_reserved: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); +out_unuse: + unuse_block_rsv(root->fs_info, block_rsv, blocksize); + return ERR_PTR(ret); } struct walk_control { @@ -7815,7 +8088,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); blocksize = root->nodesize; - next = btrfs_find_tree_block(root, bytenr); + next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); if (!next) @@ -8016,7 +8289,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(trans, root, eb); + clean_tree_block(trans, root->fs_info, eb); } if (eb == root->node) { @@ -8533,10 +8806,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); +again: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); + /* + * we're not allowed to set block groups readonly after the dirty + * block groups cache has started writing. If it already started, + * back off and let this transaction commit + */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (trans->transaction->dirty_bg_run) { + u64 transid = trans->transid; + + mutex_unlock(&root->fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans, root); + + ret = btrfs_wait_for_commit(root, transid); + if (ret) + return ret; + goto again; + } + + ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8551,6 +8844,7 @@ out: alloc_flags = update_block_group_flags(root, cache->flags); check_system_chunk(trans, root, alloc_flags); } + mutex_unlock(&root->fs_info->ro_block_group_mutex); btrfs_end_transaction(trans, root); return ret; @@ -8720,7 +9014,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) min_free <<= 1; } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; - do_div(min_free, dev_min); + min_free = div64_u64(min_free, dev_min); } /* We need to do this so that we can look at pending chunks */ @@ -8992,6 +9286,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); @@ -9355,7 +9650,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ inode = lookup_free_space_inode(tree_root, block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * make sure our free spache cache IO is done before remove the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); if (ret) { @@ -9448,18 +9774,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&trans->transaction->dirty_bgs_lock); if (!list_empty(&block_group->dirty_list)) { - list_del_init(&block_group->dirty_list); - btrfs_put_block_group(block_group); + WARN_ON(1); + } + if (!list_empty(&block_group->io_list)) { + WARN_ON(1); } spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + WARN_ON(block_group->space_info->total_bytes + < block_group->key.offset); + WARN_ON(block_group->space_info->bytes_readonly + < block_group->key.offset); + WARN_ON(block_group->space_info->disk_total + < block_group->key.offset * factor); + } block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; block_group->space_info->disk_total -= block_group->key.offset * factor; + spin_unlock(&block_group->space_info->lock); memcpy(&key, &block_group->key, sizeof(key)); @@ -9647,8 +9984,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + space_info->bytes_pinned -= block_group->pinned; + space_info->bytes_readonly += block_group->pinned; + percpu_counter_add(&space_info->total_bytes_pinned, + -block_group->pinned); block_group->pinned = 0; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + /* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d688cfe5d496..c32d226bfecc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4514,8 +4514,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } ret = fiemap_fill_next_extent(fieinfo, em_start, disko, em_len, flags); - if (ret) + if (ret) { + if (ret == 1) + ret = 0; goto out_free; + } } out_free: free_extent_map(em); @@ -4557,36 +4560,37 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) do { index--; page = eb->pages[index]; - if (page && mapped) { + if (!page) + continue; + if (mapped) spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to - * this eb. + * We need to make sure we haven't be attached + * to a new eb. */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { - BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - page_cache_release(page); - } - spin_unlock(&page->mapping->private_lock); - - } - if (page) { - /* One for when we alloced the page */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ page_cache_release(page); } + + if (mapped) + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ + page_cache_release(page); } while (index != 0); } @@ -4768,6 +4772,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); + /* + * Lock our eb's refs_lock to avoid races with + * free_extent_buffer. When we get our eb it might be flagged + * with EXTENT_BUFFER_STALE and another task running + * free_extent_buffer might have seen that flag set, + * eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process + * of decrementing the extent buffer's reference count twice. + * So here we could race and increment the eb's reference count, + * clear its stale flag, mark it as dirty and drop our reference + * before the other task finishes executing free_extent_buffer, + * which would later result in an attempt to free an extent + * buffer that is dirty. + */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); + } mark_extent_buffer_accessed(eb, NULL); return eb; } @@ -4867,6 +4890,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, mark_extent_buffer_accessed(exists, p); goto free_eb; } + exists = NULL; /* * Do this so attach doesn't complain and we need to @@ -4930,12 +4954,12 @@ again: return eb; free_eb: + WARN_ON(!atomic_dec_and_test(&eb->refs)); for (i = 0; i < num_pages; i++) { if (eb->pages[i]) unlock_page(eb->pages[i]); } - WARN_ON(!atomic_dec_and_test(&eb->refs)); btrfs_release_extent_buffer(eb); return exists; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 695b0ccfb755..c668f36898d3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -97,7 +97,7 @@ struct extent_io_tree { u64 dirty_bytes; int track_uptodate; spinlock_t lock; - struct extent_io_ops *ops; + const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 84a2d1868271..58ece6558430 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -185,8 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, - GFP_NOFS); + btrfs_bio->csum_allocated = kmalloc_array(nblocks, + csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); return -ENOMEM; @@ -553,7 +553,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root, btrfs_truncate_item(root, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(root, path, key); + btrfs_set_item_key_safe(root->fs_info, path, key); } else { BUG(); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index faa7d390841b..b072e17479aa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -273,11 +273,7 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) defrag = rb_entry(node, struct inode_defrag, rb_node); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - if (need_resched()) { - spin_unlock(&fs_info->defrag_inodes_lock); - cond_resched(); - spin_lock(&fs_info->defrag_inodes_lock); - } + cond_resched_lock(&fs_info->defrag_inodes_lock); node = rb_first(&fs_info->defrag_inodes); } @@ -868,7 +864,7 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); extent_offset += end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); @@ -1126,7 +1122,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -1160,7 +1156,7 @@ again: trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1485,7 +1481,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, PAGE_CACHE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); - pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -1514,7 +1510,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = btrfs_check_data_free_space(inode, reserve_bytes); + ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); if (ret == -ENOSPC && (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) { @@ -1635,8 +1631,8 @@ again: btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { - u64 lockstart = round_down(pos, root->sectorsize); - u64 lockend = lockstart + + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + (dirty_pages << PAGE_CACHE_SHIFT) - 1; set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, @@ -1809,7 +1805,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occured. */ + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; + spin_unlock(&BTRFS_I(inode)->lock); if (num_written > 0) { err = generic_write_sync(file, pos, num_written); if (err < 0) @@ -1864,7 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; @@ -2162,7 +2160,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, u64 num_bytes; key.offset = offset; - btrfs_set_item_key_safe(root, path, &key); + btrfs_set_item_key_safe(root->fs_info, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -2545,7 +2543,6 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2570,14 +2567,9 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); if (ret) return ret; - if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); - if (ret) - goto out_reserve_fail; - } mutex_lock(&inode->i_mutex); ret = inode_newsize_ok(inode, alloc_end); @@ -2667,23 +2659,35 @@ static long btrfs_fallocate(struct file *file, int mode, 1 << inode->i_blkbits, offset + len, &alloc_hint); - - if (ret < 0) { - free_extent_map(em); - break; - } } else if (actual_end > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + /* * We didn't need to allocate any more space, but we * still extended the size of the file so we need to - * update i_size. + * update i_size and the inode item. */ - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, + NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, + root); + } } free_extent_map(em); + if (ret < 0) + break; cur_offset = last_byte; if (cur_offset >= alloc_end) { @@ -2695,9 +2699,6 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, alloc_end - alloc_start); -out_reserve_fail: /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a71978578fa7..9dbe5b548fa6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, } mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_mask(inode->i_mapping) & + ~(__GFP_FS | __GFP_HIGHMEM)); return inode; } @@ -170,13 +171,13 @@ static int __create_free_space_inode(struct btrfs_root *root, key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { btrfs_release_path(path); return ret; } + leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); @@ -225,9 +226,37 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, struct inode *inode) { int ret = 0; + struct btrfs_path *path = btrfs_alloc_path(); + + if (!path) { + ret = -ENOMEM; + goto fail; + } + + if (block_group) { + mutex_lock(&trans->transaction->cache_write_mutex); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + } + + /* + * now that we've truncated the cache away, its no longer + * setup or written + */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + } + btrfs_free_path(path); btrfs_i_size_write(inode, 0); truncate_pagecache(inode, 0); @@ -235,15 +264,23 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, /* * We don't need an orphan item because truncating the free space cache * will never be split across transactions. + * We don't need to check for -EAGAIN because we're a free space + * cache inode */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); if (ret) { + mutex_unlock(&trans->transaction->cache_write_mutex); btrfs_abort_transaction(trans, root, ret); return ret; } ret = btrfs_update_inode(trans, root, inode); + + if (block_group) + mutex_unlock(&trans->transaction->cache_write_mutex); + +fail: if (ret) btrfs_abort_transaction(trans, root, ret); @@ -269,18 +306,7 @@ static int readahead_cache(struct inode *inode) return 0; } -struct io_ctl { - void *cur, *orig; - struct page *page; - struct page **pages; - struct btrfs_root *root; - unsigned long size; - int index; - int num_pages; - unsigned check_crcs:1; -}; - -static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, +static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, struct btrfs_root *root, int write) { int num_pages; @@ -296,45 +322,46 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) return -ENOSPC; - memset(io_ctl, 0, sizeof(struct io_ctl)); + memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); - io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); + io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); if (!io_ctl->pages) return -ENOMEM; io_ctl->num_pages = num_pages; io_ctl->root = root; io_ctl->check_crcs = check_crcs; + io_ctl->inode = inode; return 0; } -static void io_ctl_free(struct io_ctl *io_ctl) +static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { kfree(io_ctl->pages); + io_ctl->pages = NULL; } -static void io_ctl_unmap_page(struct io_ctl *io_ctl) +static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) { if (io_ctl->cur) { - kunmap(io_ctl->page); io_ctl->cur = NULL; io_ctl->orig = NULL; } } -static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) { ASSERT(io_ctl->index < io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; - io_ctl->cur = kmap(io_ctl->page); + io_ctl->cur = page_address(io_ctl->page); io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_CACHE_SIZE; if (clear) memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); } -static void io_ctl_drop_pages(struct io_ctl *io_ctl) +static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) { int i; @@ -349,7 +376,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) } } -static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, int uptodate) { struct page *page; @@ -383,7 +410,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, return 0; } -static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { __le64 *val; @@ -406,7 +433,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) io_ctl->cur += sizeof(u64); } -static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { __le64 *gen; @@ -435,7 +462,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) return 0; } -static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp; u32 crc = ~(u32)0; @@ -453,13 +480,12 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); io_ctl_unmap_page(io_ctl); - tmp = kmap(io_ctl->pages[0]); + tmp = page_address(io_ctl->pages[0]); tmp += index; *tmp = crc; - kunmap(io_ctl->pages[0]); } -static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp, val; u32 crc = ~(u32)0; @@ -473,10 +499,9 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - tmp = kmap(io_ctl->pages[0]); + tmp = page_address(io_ctl->pages[0]); tmp += index; val = *tmp; - kunmap(io_ctl->pages[0]); io_ctl_map_page(io_ctl, 0); crc = btrfs_csum_data(io_ctl->orig + offset, crc, @@ -492,7 +517,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) return 0; } -static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, +static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, void *bitmap) { struct btrfs_free_space_entry *entry; @@ -522,7 +547,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, return 0; } -static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) { if (!io_ctl->cur) return -ENOSPC; @@ -545,7 +570,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) return 0; } -static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) { /* * If we're not on the boundary we know we've modified the page and we @@ -562,7 +587,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) } } -static int io_ctl_read_entry(struct io_ctl *io_ctl, +static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; @@ -589,7 +614,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, return 0; } -static int io_ctl_read_bitmap(struct io_ctl *io_ctl, +static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry) { int ret; @@ -648,7 +673,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, { struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct io_ctl io_ctl; + struct btrfs_io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; LIST_HEAD(bitmaps); @@ -877,7 +902,7 @@ out: } static noinline_for_stack -int write_cache_extent_entries(struct io_ctl *io_ctl, +int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, int *entries, int *bitmaps, @@ -885,6 +910,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, { int ret; struct btrfs_free_cluster *cluster = NULL; + struct btrfs_free_cluster *cluster_locked = NULL; struct rb_node *node = rb_first(&ctl->free_space_offset); struct btrfs_trim_range *trim_entry; @@ -896,6 +922,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, } if (!node && cluster) { + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); node = rb_first(&cluster->root); cluster = NULL; } @@ -919,9 +947,15 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, node = rb_next(node); if (!node && cluster) { node = rb_first(&cluster->root); + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); cluster = NULL; } } + if (cluster_locked) { + spin_unlock(&cluster_locked->lock); + cluster_locked = NULL; + } /* * Make sure we don't miss any range that was removed from our rbtree @@ -939,6 +973,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, return 0; fail: + if (cluster_locked) + spin_unlock(&cluster_locked->lock); return -ENOSPC; } @@ -1000,7 +1036,7 @@ fail: static noinline_for_stack int write_pinned_extent_entries(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, - struct io_ctl *io_ctl, + struct btrfs_io_ctl *io_ctl, int *entries) { u64 start, extent_start, extent_end, len; @@ -1050,7 +1086,7 @@ write_pinned_extent_entries(struct btrfs_root *root, } static noinline_for_stack int -write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) +write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { struct list_head *pos, *n; int ret; @@ -1083,10 +1119,7 @@ static int flush_dirty_cache(struct inode *inode) } static void noinline_for_stack -cleanup_write_cache_enospc(struct inode *inode, - struct io_ctl *io_ctl, - struct extent_state **cached_state, - struct list_head *bitmap_list) +cleanup_bitmap_list(struct list_head *bitmap_list) { struct list_head *pos, *n; @@ -1095,12 +1128,85 @@ cleanup_write_cache_enospc(struct inode *inode, list_entry(pos, struct btrfs_free_space, list); list_del_init(&entry->list); } +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct btrfs_io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, cached_state, GFP_NOFS); } +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset) +{ + int ret; + struct inode *inode = io_ctl->inode; + + if (!inode) + return 0; + + if (block_group) + root = root->fs_info->tree_root; + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; + + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + io_ctl->entries, io_ctl->bitmaps); +out: + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + if (block_group) { +#ifdef DEBUG + btrfs_err(root->fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); +#endif + } + } + btrfs_update_inode(trans, root, inode); + + if (block_group) { + /* the dirty list is protected by the dirty_bgs_lock */ + spin_lock(&trans->transaction->dirty_bgs_lock); + + /* the disk_cache_state is protected by the block group lock */ + spin_lock(&block_group->lock); + + /* + * only mark this as written if we didn't get put back on + * the dirty list while waiting for IO. Otherwise our + * cache state won't be right, and we won't get written again + */ + if (!ret && list_empty(&block_group->dirty_list)) + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + else if (ret) + block_group->disk_cache_state = BTRFS_DC_ERROR; + + spin_unlock(&block_group->lock); + spin_unlock(&trans->transaction->dirty_bgs_lock); + io_ctl->inode = NULL; + iput(inode); + } + + return ret; + +} + /** * __btrfs_write_out_cache - write out cached info to an inode * @root - the root the inode belongs to @@ -1112,27 +1218,29 @@ cleanup_write_cache_enospc(struct inode *inode, * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. + * or an errno if it was not. */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 offset) { struct extent_state *cached_state = NULL; - struct io_ctl io_ctl; LIST_HEAD(bitmap_list); int entries = 0; int bitmaps = 0; int ret; + int must_iput = 0; if (!i_size_read(inode)) - return -1; + return -EIO; - ret = io_ctl_init(&io_ctl, inode, root, 1); + WARN_ON(io_ctl->pages); + ret = io_ctl_init(io_ctl, inode, root, 1); if (ret) - return -1; + return ret; if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { down_write(&block_group->data_rwsem); @@ -1143,55 +1251,59 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, up_write(&block_group->data_rwsem); BTRFS_I(inode)->generation = 0; ret = 0; + must_iput = 1; goto out; } spin_unlock(&block_group->lock); } /* Lock all pages first so we can lock the extent safely. */ - io_ctl_prepare_pages(&io_ctl, inode, 0); + ret = io_ctl_prepare_pages(io_ctl, inode, 0); + if (ret) + goto out; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state); - io_ctl_set_generation(&io_ctl, trans->transid); + io_ctl_set_generation(io_ctl, trans->transid); mutex_lock(&ctl->cache_writeout_mutex); /* Write out the extent entries in the free space cache */ - ret = write_cache_extent_entries(&io_ctl, ctl, + spin_lock(&ctl->tree_lock); + ret = write_cache_extent_entries(io_ctl, ctl, block_group, &entries, &bitmaps, &bitmap_list); - if (ret) { - mutex_unlock(&ctl->cache_writeout_mutex); - goto out_nospc; - } + if (ret) + goto out_nospc_locked; /* * Some spaces that are freed in the current transaction are pinned, * they will be added into free space cache after the transaction is * committed, we shouldn't lose them. + * + * If this changes while we are working we'll get added back to + * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); - if (ret) { - mutex_unlock(&ctl->cache_writeout_mutex); - goto out_nospc; - } + ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries); + if (ret) + goto out_nospc_locked; /* * At last, we write out all the bitmaps and keep cache_writeout_mutex * locked while doing it because a concurrent trim can be manipulating * or freeing the bitmap. */ - ret = write_bitmap_entries(&io_ctl, &bitmap_list); + ret = write_bitmap_entries(io_ctl, &bitmap_list); + spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); if (ret) goto out_nospc; /* Zero out the rest of the pages just to make sure */ - io_ctl_zero_remaining_pages(&io_ctl); + io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages, 0, i_size_read(inode), &cached_state); if (ret) goto out_nospc; @@ -1202,30 +1314,44 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * Release the pages and unlock the extent, we will flush * them out later */ - io_ctl_drop_pages(&io_ctl); + io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - /* Flush the dirty pages in the cache file. */ - ret = flush_dirty_cache(inode); + /* + * at this point the pages are under IO and we're happy, + * The caller is responsible for waiting on them and updating the + * the cache and the inode + */ + io_ctl->entries = entries; + io_ctl->bitmaps = bitmaps; + + ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); if (ret) goto out; - /* Update the cache item to tell everyone this cache file is valid. */ - ret = update_cache_item(trans, root, inode, path, offset, - entries, bitmaps); + return 0; + out: - io_ctl_free(&io_ctl); + io_ctl->inode = NULL; + io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); + if (must_iput) + iput(inode); return ret; +out_nospc_locked: + cleanup_bitmap_list(&bitmap_list); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + out_nospc: - cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); + cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list); if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -1241,7 +1367,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; - enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; root = root->fs_info->tree_root; @@ -1250,34 +1375,34 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); return 0; } - - if (block_group->delalloc_bytes) { - block_group->disk_cache_state = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - return 0; - } spin_unlock(&block_group->lock); inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, + ret = __btrfs_write_out_cache(root, inode, ctl, block_group, + &block_group->io_ctl, trans, path, block_group->key.objectid); if (ret) { - dcs = BTRFS_DC_ERROR; - ret = 0; #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", block_group->key.objectid); #endif + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + + block_group->io_ctl.inode = NULL; + iput(inode); } - spin_lock(&block_group->lock); - block_group->disk_cache_state = dcs; - spin_unlock(&block_group->lock); - iput(inode); + /* + * if ret == 0 the caller is expected to call btrfs_wait_cache_io + * to wait for IO and put the inode + */ + return ret; } @@ -1298,11 +1423,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; - u64 bytes_per_bitmap; + u32 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; - bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; @@ -1521,10 +1646,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); - max_bitmaps = max(max_bitmaps, 1); + max_bitmaps = max_t(u32, max_bitmaps, 1); ASSERT(ctl->total_bitmaps <= max_bitmaps); @@ -1537,7 +1662,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) max_bytes = MAX_CACHE_BYTES_PER_GIG; else max_bytes = MAX_CACHE_BYTES_PER_GIG * - div64_u64(size, 1024 * 1024 * 1024); + div_u64(size, 1024 * 1024 * 1024); /* * we want to account for 1 more bitmap than what we have so we can make @@ -1552,14 +1677,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) } /* - * we want the extent entry threshold to always be at most 1/2 the maxw + * we want the extent entry threshold to always be at most 1/2 the max * bytes we can have, or whatever is less than that. */ extent_bytes = max_bytes - bitmap_bytes; - extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); ctl->extents_thresh = - div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); + div_u64(extent_bytes, sizeof(struct btrfs_free_space)); } static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, @@ -1673,7 +1798,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, */ if (*bytes >= align) { tmp = entry->offset - ctl->start + align - 1; - do_div(tmp, align); + tmp = div64_u64(tmp, align); tmp = tmp * align + ctl->start; align_off = tmp - entry->offset; } else { @@ -2402,11 +2527,8 @@ static void __btrfs_remove_free_space_cache_locked( } else { free_bitmap(ctl, info); } - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } + + cond_resched_lock(&ctl->tree_lock); } } @@ -2431,11 +2553,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } + + cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache_locked(ctl); spin_unlock(&ctl->tree_lock); @@ -3346,13 +3465,29 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; int ret; + struct btrfs_io_ctl io_ctl; + bool release_metadata = true; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); + memset(&io_ctl, 0, sizeof(io_ctl)); + ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, + trans, path, 0); + if (!ret) { + /* + * At this point writepages() didn't error out, so our metadata + * reservation is released when the writeback finishes, at + * inode.c:btrfs_finish_ordered_io(), regardless of it finishing + * with or without an error. + */ + release_metadata = false; + ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); + } + if (ret) { - btrfs_delalloc_release_metadata(inode, inode->i_size); + if (release_metadata) + btrfs_delalloc_release_metadata(inode, inode->i_size); #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free ino cache for root %llu", diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 88b2238a0aed..a16a029ad3b1 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -48,6 +48,8 @@ struct btrfs_free_space_op { struct btrfs_free_space *info); }; +struct btrfs_io_ctl; + struct inode *lookup_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); @@ -60,14 +62,19 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, struct inode *inode); int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset); int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); - struct inode *lookup_free_ino_inode(struct btrfs_root *root, struct btrfs_path *path); int create_free_ino_inode(struct btrfs_root *root, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 74faea3a516e..f6a596d5a637 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -456,7 +456,7 @@ again: } if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) { if (ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 43192e10cc43..8bb013672aee 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -59,6 +59,7 @@ #include "backref.h" #include "hash.h" #include "props.h" +#include "qgroup.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -470,7 +471,7 @@ again: */ if (inode_need_compress(inode)) { WARN_ON(pages); - pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* just bail out to the uncompressed code */ goto cont; @@ -752,7 +753,6 @@ retry: } goto out_free; } - /* * here we're doing allocation and writeback of the * compressed pages @@ -3110,6 +3110,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) if (empty) return; + down_read(&fs_info->delayed_iput_sem); + spin_lock(&fs_info->delayed_iput_lock); list_splice_init(&fs_info->delayed_iputs, &list); spin_unlock(&fs_info->delayed_iput_lock); @@ -3120,6 +3122,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) iput(delayed->inode); kfree(delayed); } + + up_read(&root->fs_info->delayed_iput_sem); } /* @@ -3628,25 +3632,28 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode->i_version = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any * idea about which extents were modified before we were evicted from * cache. + * + * This is required for both inode re-read from disk and delayed inode + * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == root->fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - inode->i_version = btrfs_inode_sequence(leaf, inode_item); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; - rdev = btrfs_inode_rdev(leaf, inode_item); - - BTRFS_I(inode)->index_cnt = (u64)-1; - BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - -cache_index: path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -4016,16 +4023,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int ret; trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0); - ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); if (ret) goto out; @@ -4124,7 +4131,7 @@ out: static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; @@ -4151,7 +4158,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto out; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); if (!err) btrfs_i_size_write(inode, 0); @@ -4162,6 +4169,21 @@ out: return err; } +static int truncate_space_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytes_deleted) +{ + int ret; + + bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, + bytes_deleted, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->bytes_reserved += bytes_deleted; + return ret; + +} + /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -4197,9 +4219,21 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int ret; int err = 0; u64 ino = btrfs_ino(inode); + u64 bytes_deleted = 0; + bool be_nice = 0; + bool should_throttle = 0; + bool should_end = 0; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + /* + * for non-free space inodes and ref cows, we want to back off from + * time to time + */ + if (!btrfs_is_free_space_inode(inode) && + test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + be_nice = 1; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4229,6 +4263,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: + /* + * with a 16K leaf size and 128MB extents, you can actually queue + * up a huge file in a single leaf. Most of the time that + * bytes_deleted is > 0, it will be huge by the time we get here + */ + if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (btrfs_should_end_transaction(trans, root)) { + err = -EAGAIN; + goto error; + } + } + + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) { @@ -4371,22 +4418,39 @@ delete: } else { break; } + should_throttle = 0; + if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); + bytes_deleted += extent_num_bytes; ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset, 0); BUG_ON(ret); + if (btrfs_should_throttle_delayed_refs(trans, root)) + btrfs_async_run_delayed_refs(root, + trans->delayed_ref_updates * 2, 0); + if (be_nice) { + if (truncate_space_check(trans, root, + extent_num_bytes)) { + should_end = 1; + } + if (btrfs_should_throttle_delayed_refs(trans, + root)) { + should_throttle = 1; + } + } } if (found_type == BTRFS_INODE_ITEM_KEY) break; if (path->slots[0] == 0 || - path->slots[0] != pending_del_slot) { + path->slots[0] != pending_del_slot || + should_throttle || should_end) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -4399,6 +4463,23 @@ delete: pending_del_nr = 0; } btrfs_release_path(path); + if (should_throttle) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } + /* + * if we failed to refill our space rsv, bail out + * and let the transaction restart + */ + if (should_end) { + err = -EAGAIN; + goto error; + } goto search_again; } else { path->slots[0]--; @@ -4415,7 +4496,18 @@ error: if (last_size != (u64)-1 && root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); + btrfs_free_path(path); + + if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } return err; } @@ -4826,7 +4918,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; int err; @@ -4924,6 +5016,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; + int steal_from_global = 0; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); int ret; @@ -4991,9 +5084,20 @@ void btrfs_evict_inode(struct inode *inode) * hard as possible to get this to work. */ if (ret) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); + steal_from_global++; + else + steal_from_global = 0; + ret = 0; - if (ret) { + /* + * steal_from_global == 0: we reserved stuff, hooray! + * steal_from_global == 1: we didn't reserve stuff, boo! + * steal_from_global == 2: we've committed, still not a lot of + * room but maybe we'll have room in the global reserve this + * time. + * steal_from_global == 3: abandon all hope! + */ + if (steal_from_global > 2) { btrfs_warn(root->fs_info, "Could not get space for a delete, will truncate on mount %d", ret); @@ -5009,10 +5113,40 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + /* + * We can't just steal from the global reserve, we need tomake + * sure there is room to do it, if not we need to commit and try + * again. + */ + if (steal_from_global) { + if (!btrfs_check_space_for_delayed_refs(trans, root)) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, + min_size); + else + ret = -ENOSPC; + } + + /* + * Couldn't steal from the global reserve, we have too much + * pending stuff built up, commit the transaction and try it + * again. + */ + if (ret) { + ret = btrfs_commit_transaction(trans, root); + if (ret) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + continue; + } else { + steal_from_global = 0; + } + trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - if (ret != -ENOSPC) + if (ret != -ENOSPC && ret != -EAGAIN) break; trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -5416,10 +5550,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!inode && !IS_ROOT(dentry)) - inode = dentry->d_parent->d_inode; + inode = d_inode(dentry->d_parent); if (inode) { root = BTRFS_I(inode)->root; @@ -6226,7 +6360,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); u64 index; int err; int drop_inode = 0; @@ -8129,7 +8263,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset)) return 0; - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); smp_mb__after_atomic(); /* @@ -8169,7 +8303,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, current->journal_info = &outstanding_extents; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { - inode_dio_done(inode); + inode_dio_end(inode); flags = DIO_LOCKING | DIO_SKIP_HOLES; wakeup = false; } @@ -8188,7 +8322,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } out: if (wakeup) - inode_dio_done(inode); + inode_dio_end(inode); if (relock) mutex_lock(&inode->i_mutex); @@ -8581,7 +8715,7 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -ENOSPC) { + if (ret != -ENOSPC && ret != -EAGAIN) { err = ret; break; } @@ -8875,7 +9009,7 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { u64 delalloc_bytes; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); @@ -8896,8 +9030,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; - struct inode *new_inode = new_dentry->d_inode; - struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); struct timespec ctime = CURRENT_TIME; u64 index = 0; u64 root_objectid; @@ -9009,7 +9143,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.len); } else { ret = __btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, + d_inode(old_dentry), old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) @@ -9033,12 +9167,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, dest, new_dir, - new_dentry->d_inode, + d_inode(new_dentry), new_dentry->d_name.name, new_dentry->d_name.len); } if (!ret && new_inode->i_nlink == 0) - ret = btrfs_orphan_add(trans, new_dentry->d_inode); + ret = btrfs_orphan_add(trans, d_inode(new_dentry)); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_fail; @@ -9451,6 +9585,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 74609b931ba5..1c22c6518504 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -456,6 +456,13 @@ static noinline int create_subvol(struct inode *dir, if (ret) return ret; + /* + * Don't create subvolume whose level is not zero. Or qgroup will be + * screwed up since it assume subvolme qgroup's level to be 0. + */ + if (btrfs_qgroup_level(objectid)) + return -ENOSPC; + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * The same as the snapshot creation, please see the comment @@ -717,7 +724,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; @@ -761,10 +768,10 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) { int error; - if (!victim->d_inode) + if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(victim->d_parent->d_inode != dir); + BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); @@ -772,8 +779,8 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || + IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -792,7 +799,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) { - if (child->d_inode) + if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; @@ -810,7 +817,7 @@ static noinline int btrfs_mksubvol(struct path *parent, u64 *async_transid, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct inode *dir = parent->dentry->d_inode; + struct inode *dir = d_inode(parent->dentry); struct dentry *dentry; int error; @@ -824,7 +831,7 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_unlock; error = -EEXIST; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out_dput; error = btrfs_may_create(dir, dentry); @@ -1564,7 +1571,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - do_div(new_size, root->sectorsize); + new_size = div_u64(new_size, root->sectorsize); new_size *= root->sectorsize; printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", @@ -2294,7 +2301,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, { struct dentry *parent = file->f_path.dentry; struct dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; @@ -2333,12 +2340,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_unlock_dir; } - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { err = -ENOENT; goto out_dput; } - inode = dentry->d_inode; + inode = d_inode(dentry); dest = BTRFS_I(inode)->root; if (!capable(CAP_SYS_ADMIN)) { /* @@ -2403,7 +2410,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, "Attempt to delete subvolume %llu during send", dest->root_key.objectid); err = -EPERM; - goto out_dput; + goto out_unlock_inode; } d_invalidate(dentry); @@ -2498,6 +2505,7 @@ out_up_write: root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); } +out_unlock_inode: mutex_unlock(&inode->i_mutex); if (!err) { shrink_dcache_sb(root->fs_info->sb); @@ -2897,6 +2905,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, if (src == dst) return -EINVAL; + if (len == 0) + return 0; + btrfs_double_lock(src, loff, dst, dst_loff, len); ret = extent_same_check_offsets(src, loff, len); @@ -3039,7 +3050,7 @@ out: static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 disko) { - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); struct ulist *roots; struct ulist_iterator uiter; struct ulist_node *root_node = NULL; @@ -3202,6 +3213,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { + u64 next_key_min_offset = key.offset + 1; + /* * note the key will change type as we walk through the * tree. @@ -3282,7 +3295,7 @@ process_slot: } else if (key.offset >= off + len) { break; } - + next_key_min_offset = key.offset + datal; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), @@ -3497,7 +3510,7 @@ process_slot: break; } btrfs_release_path(path); - key.offset++; + key.offset = next_key_min_offset; } ret = 0; @@ -3626,6 +3639,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; + if (len == 0) { + ret = 0; + goto out_unlock; + } + /* verify the end result is block aligned */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || !IS_ALIGNED(destoff, bs)) @@ -4624,6 +4642,11 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) sa->src, sa->dst); } + /* update qgroup status and info */ + err = btrfs_run_qgroups(trans, root->fs_info); + if (err < 0) + btrfs_error(root->fs_info, ret, + "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) ret = err; @@ -4669,8 +4692,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) /* FIXME: check if the IDs really exist */ if (sa->create) { - ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid, - NULL); + ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid); } else { ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 617553cdb7d3..a2f051347731 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -434,7 +434,7 @@ out: return ret; } -struct btrfs_compress_op btrfs_lzo_compress = { +const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h index b7816cefbd13..1b10a3cd1195 100644 --- a/fs/btrfs/math.h +++ b/fs/btrfs/math.h @@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor) if (factor == 10) return num; num *= factor; - do_div(num, 10); - return num; + return div_u64(num, 10); } static inline u64 div_factor_fine(u64 num, int factor) @@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor) if (factor == 100) return num; num *= factor; - do_div(num, 100); - return num; + return div_u64(num, 100); } #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 157cc54fc634..760c4a5e096b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -722,6 +722,7 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { int ret = 0; + int ret_wb = 0; u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; @@ -741,9 +742,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (ret) return ret; - ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); - if (ret) - return ret; + /* + * If we have a writeback error don't return immediately. Wait first + * for any ordered extents that haven't completed yet. This is to make + * sure no one can dirty the same page ranges and call writepages() + * before the ordered extents complete - to avoid failures (-EEXIST) + * when adding the new ordered extents to the ordered tree. + */ + ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; while (1) { @@ -767,7 +773,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) break; end--; } - return ret; + return ret_wb ? ret_wb : ret; } /* diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 129b1dd28527..dca137b04095 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -425,3 +425,5 @@ static const char *prop_compression_extract(struct inode *inode) return NULL; } + + diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 058c79eecbfb..3d6546581bb9 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -644,9 +644,8 @@ out: } static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 qgroupid, - u64 flags, u64 max_rfer, u64 max_excl, - u64 rsv_rfer, u64 rsv_excl) + struct btrfs_root *root, + struct btrfs_qgroup *qgroup) { struct btrfs_path *path; struct btrfs_key key; @@ -657,7 +656,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, key.objectid = 0; key.type = BTRFS_QGROUP_LIMIT_KEY; - key.offset = qgroupid; + key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); if (!path) @@ -673,11 +672,11 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); - btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); - btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); - btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); - btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer); - btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl); + btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); + btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); + btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); + btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); + btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); btrfs_mark_buffer_dirty(l); @@ -967,6 +966,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; fs_info->quota_root = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -982,7 +982,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, list_del("a_root->dirty_list); btrfs_tree_lock(quota_root->node); - clean_tree_block(trans, tree_root, quota_root->node); + clean_tree_block(trans, tree_root->fs_info, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); @@ -1001,6 +1001,110 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. + * + * Caller should hold fs_info->qgroup_lock. + */ +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 ref_root, + u64 num_bytes, int sign) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + return ret; +} + + +/* + * Quick path for updating qgroup with only excl refs. + * + * In that case, just update all parent will be enough. + * Or we needs to do a full rescan. + * Caller should also hold fs_info->qgroup_lock. + * + * Return 0 for quick update, return >0 for need to full rescan + * and mark INCONSISTENT flag. + * Return < 0 for other error. + */ +static int quick_update_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 src, u64 dst, + int sign) +{ + struct btrfs_qgroup *qgroup; + int ret = 1; + int err = 0; + + qgroup = find_qgroup_rb(fs_info, src); + if (!qgroup) + goto out; + if (qgroup->excl == qgroup->rfer) { + ret = 0; + err = __qgroup_excl_accounting(fs_info, tmp, dst, + qgroup->excl, sign); + if (err < 0) { + ret = err; + goto out; + } + } +out: + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { @@ -1008,8 +1112,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; + struct ulist *tmp; int ret = 0; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + /* Check the level of src and dst first */ + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + return -EINVAL; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; if (!quota_root) { @@ -1043,23 +1156,33 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, spin_lock(&fs_info->qgroup_lock); ret = add_relation_rb(quota_root->fs_info, src, dst); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + goto out; + } + ret = quick_update_accounting(fs_info, tmp, src, dst, 1); spin_unlock(&fs_info->qgroup_lock); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); + ulist_free(tmp); return ret; } -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, +int __del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; + struct ulist *tmp; int ret = 0; int err; - mutex_lock(&fs_info->qgroup_ioctl_lock); + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; @@ -1088,14 +1211,27 @@ exist: spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); + ret = quick_update_accounting(fs_info, tmp, src, dst, -1); spin_unlock(&fs_info->qgroup_lock); out: + ulist_free(tmp); + return ret; +} + +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + ret = __del_qgroup_relation(trans, fs_info, src, dst); mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; } int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, char *name) + struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -1133,6 +1269,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); @@ -1147,15 +1284,24 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } else { - /* check if there are no relations to this qgroup */ - if (!list_empty(&qgroup->groups) || - !list_empty(&qgroup->members)) { + /* check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { ret = -EBUSY; goto out; } } ret = del_qgroup_item(trans, quota_root, qgroupid); + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + ret = __del_qgroup_relation(trans, fs_info, + qgroupid, + list->group->qgroupid); + if (ret) + goto out; + } + spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); @@ -1184,23 +1330,27 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } - ret = update_qgroup_limit_item(trans, quota_root, qgroupid, - limit->flags, limit->max_rfer, - limit->max_excl, limit->rsv_rfer, - limit->rsv_excl); + + spin_lock(&fs_info->qgroup_lock); + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) + qgroup->max_rfer = limit->max_rfer; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) + qgroup->max_excl = limit->max_excl; + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) + qgroup->rsv_rfer = limit->rsv_rfer; + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) + qgroup->rsv_excl = limit->rsv_excl; + qgroup->lim_flags |= limit->flags; + + spin_unlock(&fs_info->qgroup_lock); + + ret = update_qgroup_limit_item(trans, quota_root, qgroup); if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } - spin_lock(&fs_info->qgroup_lock); - qgroup->lim_flags = limit->flags; - qgroup->max_rfer = limit->max_rfer; - qgroup->max_excl = limit->max_excl; - qgroup->rsv_rfer = limit->rsv_rfer; - qgroup->rsv_excl = limit->rsv_excl; - spin_unlock(&fs_info->qgroup_lock); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -1256,14 +1406,14 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1, return -1; if (oper1->bytenr > oper2->bytenr) return 1; - if (oper1->seq < oper2->seq) - return -1; - if (oper1->seq > oper2->seq) - return 1; if (oper1->ref_root < oper2->ref_root) return -1; if (oper1->ref_root > oper2->ref_root) return 1; + if (oper1->seq < oper2->seq) + return -1; + if (oper1->seq > oper2->seq) + return 1; if (oper1->type < oper2->type) return -1; if (oper1->type > oper2->type) @@ -1372,19 +1522,10 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, return 0; } -/* - * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their refrence and - * exclusive counts adjusted. - */ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_operation *oper) { - struct btrfs_qgroup *qgroup; struct ulist *tmp; - struct btrfs_qgroup_list *glist; - struct ulist_node *unode; - struct ulist_iterator uiter; int sign = 0; int ret = 0; @@ -1395,9 +1536,7 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->qgroup_lock); if (!fs_info->quota_root) goto out; - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; + switch (oper->type) { case BTRFS_QGROUP_OPER_ADD_EXCL: sign = 1; @@ -1408,43 +1547,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, default: ASSERT(0); } - qgroup->rfer += sign * oper->num_bytes; - qgroup->rfer_cmpr += sign * oper->num_bytes; - - WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); - qgroup->excl += sign * oper->num_bytes; - qgroup->excl_cmpr += sign * oper->num_bytes; - - qgroup_dirty(fs_info, qgroup); - - /* Get all of the parent groups that contain this qgroup */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } - - /* Iterate all of the parents and adjust their reference counts */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - qgroup = u64_to_ptr(unode->aux); - qgroup->rfer += sign * oper->num_bytes; - qgroup->rfer_cmpr += sign * oper->num_bytes; - WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); - qgroup->excl += sign * oper->num_bytes; - qgroup->excl_cmpr += sign * oper->num_bytes; - qgroup_dirty(fs_info, qgroup); - - /* Add any parents of the parents */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } - } - ret = 0; + ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root, + oper->num_bytes, sign); out: spin_unlock(&fs_info->qgroup_lock); ulist_free(tmp); @@ -1845,7 +1949,7 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, struct ulist *roots = NULL; struct ulist *qgroups, *tmp; struct btrfs_qgroup *qgroup; - struct seq_list elem = {}; + struct seq_list elem = SEQ_LIST_INIT(elem); u64 seq; int old_roots = 0; int new_roots = 0; @@ -1967,7 +2071,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, int err; struct btrfs_qgroup *qg; u64 root_obj = 0; - struct seq_list elem = {}; + struct seq_list elem = SEQ_LIST_INIT(elem); parents = ulist_alloc(GFP_NOFS); if (!parents) @@ -2156,6 +2260,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + ret = update_qgroup_limit_item(trans, quota_root, qgroup); + if (ret) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; spin_lock(&fs_info->qgroup_lock); } if (fs_info->quota_enabled) @@ -2219,6 +2327,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, ret = -EINVAL; goto out; } + + if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { + ret = -EINVAL; + goto out; + } ++i_qgroups; } } @@ -2230,17 +2343,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (ret) goto out; - if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { - ret = update_qgroup_limit_item(trans, quota_root, objectid, - inherit->lim.flags, - inherit->lim.max_rfer, - inherit->lim.max_excl, - inherit->lim.rsv_rfer, - inherit->lim.rsv_excl); - if (ret) - goto out; - } - if (srcid) { struct btrfs_root *srcroot; struct btrfs_key srckey; @@ -2286,6 +2388,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, goto unlock; } + if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { + dstgroup->lim_flags = inherit->lim.flags; + dstgroup->max_rfer = inherit->lim.max_rfer; + dstgroup->max_excl = inherit->lim.max_excl; + dstgroup->rsv_rfer = inherit->lim.rsv_rfer; + dstgroup->rsv_excl = inherit->lim.rsv_excl; + + ret = update_qgroup_limit_item(trans, quota_root, dstgroup); + if (ret) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_info(fs_info, "unable to update quota limit for %llu", + dstgroup->qgroupid); + goto unlock; + } + } + if (srcid) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) @@ -2302,6 +2420,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; + + /* inherit the limit info */ + dstgroup->lim_flags = srcgroup->lim_flags; + dstgroup->max_rfer = srcgroup->max_rfer; + dstgroup->max_excl = srcgroup->max_excl; + dstgroup->rsv_rfer = srcgroup->rsv_rfer; + dstgroup->rsv_excl = srcgroup->rsv_excl; + qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); } @@ -2358,12 +2484,6 @@ out: return ret; } -/* - * reserve some space for a qgroup and all its parents. The reservation takes - * place with start_transaction or dealloc_reserve, similar to ENOSPC - * accounting. If not enough space is available, EDQUOT is returned. - * We assume that the requested space is new for all qgroups. - */ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) { struct btrfs_root *quota_root; @@ -2513,7 +2633,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) /* * returns < 0 on error, 0 when more leafs are to be scanned. - * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. + * returns 1 when done. */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, @@ -2522,7 +2642,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, { struct btrfs_key found; struct ulist *roots = NULL; - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; u64 seq; int new_roots; @@ -2618,6 +2738,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; + int ret = 0; path = btrfs_alloc_path(); if (!path) @@ -2660,7 +2781,7 @@ out: mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err == 2 && + if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } else if (err < 0) { @@ -2668,13 +2789,33 @@ out: } mutex_unlock(&fs_info->qgroup_rescan_lock); + /* + * only update status, since the previous part has alreay updated the + * qgroup info. + */ + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_err(fs_info, + "fail to start transaction for status update: %d\n", + err); + goto done; + } + ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d\n", err); + } + btrfs_end_transaction(trans, fs_info->quota_root); + if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", - err == 2 ? " (inconsistency flag cleared)" : ""); + err > 0 ? " (inconsistency flag cleared)" : ""); } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } +done: complete_all(&fs_info->qgroup_rescan_completion); } @@ -2709,7 +2850,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, mutex_unlock(&fs_info->qgroup_rescan_lock); goto err; } - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 18cc68ca3090..c5242aa9a4b2 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -70,8 +70,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - char *name); + struct btrfs_fs_info *fs_info, u64 qgroupid); int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid); int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5264858ed768..fa72068bd256 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -237,12 +237,8 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) } x = cmpxchg(&info->stripe_hash_table, NULL, table); - if (x) { - if (is_vmalloc_addr(x)) - vfree(x); - else - kfree(x); - } + if (x) + kvfree(x); return 0; } @@ -453,10 +449,7 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) if (!info->stripe_hash_table) return; btrfs_clear_rbio_cache(info); - if (is_vmalloc_addr(info->stripe_hash_table)) - vfree(info->stripe_hash_table); - else - kfree(info->stripe_hash_table); + kvfree(info->stripe_hash_table); info->stripe_hash_table = NULL; } @@ -1807,8 +1800,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int err; int i; - pointers = kzalloc(rbio->real_stripes * sizeof(void *), - GFP_NOFS); + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = -ENOMEM; goto cleanup_io; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d83085381bcc..74b24b01d574 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3027,7 +3027,7 @@ int prealloc_file_extent_cluster(struct inode *inode, mutex_lock(&inode->i_mutex); ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start); + 1 - cluster->start, 0); if (ret) goto out; @@ -3430,7 +3430,9 @@ static int block_use_full_backref(struct reloc_control *rc, } static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct inode *inode, u64 ino) + struct btrfs_block_group_cache *block_group, + struct inode *inode, + u64 ino) { struct btrfs_key key; struct btrfs_root *root = fs_info->tree_root; @@ -3463,7 +3465,7 @@ truncate: goto out; } - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode); btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); @@ -3509,6 +3511,7 @@ static int find_data_references(struct reloc_control *rc, */ if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { ret = delete_block_group_cache(rc->extent_root->fs_info, + rc->block_group, NULL, ref_objectid); if (ret != -ENOENT) return ret; @@ -4223,7 +4226,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, inode, 0); + ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); else ret = PTR_ERR(inode); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ec57687c9a4d..ab5811545a98 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -964,9 +964,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * the statistics. */ - sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * - sizeof(*sblocks_for_recheck), - GFP_NOFS); + sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, + sizeof(*sblocks_for_recheck), GFP_NOFS); if (!sblocks_for_recheck) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2319,7 +2318,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, u64 start, u64 len) { - int offset; + u32 offset; int nsectors; int sectorsize = sparity->sctx->dev_root->sectorsize; @@ -2329,7 +2328,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } start -= sparity->logic_start; - offset = (int)do_div(start, sparity->stripe_len); + start = div_u64_rem(start, sparity->stripe_len, &offset); offset /= sectorsize; nsectors = (int)len / sectorsize; @@ -2612,8 +2611,8 @@ static int get_raid56_logic_offset(u64 physical, int num, int j = 0; u64 stripe_nr; u64 last_offset; - int stripe_index; - int rot; + u32 stripe_index; + u32 rot; last_offset = (physical - map->stripes[num].physical) * nr_data_stripes(map); @@ -2624,12 +2623,11 @@ static int get_raid56_logic_offset(u64 physical, int num, for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; - stripe_nr = *offset; - do_div(stripe_nr, map->stripe_len); - do_div(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); /* Work out the disk rotation on this stripe-set */ - rot = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; @@ -2995,10 +2993,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nstripes = length; physical = map->stripes[num].physical; offset = 0; - do_div(nstripes, map->stripe_len); + nstripes = div_u64(length, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; @@ -3563,7 +3560,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { int ret = 0; - int flags = WQ_FREEZABLE | WQ_UNBOUND; + unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; if (fs_info->scrub_workers_refcnt == 0) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d6033f540cc7..a1216f9b4917 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3067,48 +3067,6 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } -static int path_loop(struct send_ctx *sctx, struct fs_path *name, - u64 ino, u64 gen, u64 *ancestor_ino) -{ - int ret = 0; - u64 parent_inode = 0; - u64 parent_gen = 0; - u64 start_ino = ino; - - *ancestor_ino = 0; - while (ino != BTRFS_FIRST_FREE_OBJECTID) { - fs_path_reset(name); - - if (is_waiting_for_rm(sctx, ino)) - break; - if (is_waiting_for_move(sctx, ino)) { - if (*ancestor_ino == 0) - *ancestor_ino = ino; - ret = get_first_ref(sctx->parent_root, ino, - &parent_inode, &parent_gen, name); - } else { - ret = __get_cur_name_and_parent(sctx, ino, gen, - &parent_inode, - &parent_gen, name); - if (ret > 0) { - ret = 0; - break; - } - } - if (ret < 0) - break; - if (parent_inode == start_ino) { - ret = 1; - if (*ancestor_ino == 0) - *ancestor_ino = ino; - break; - } - ino = parent_inode; - gen = parent_gen; - } - return ret; -} - static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3120,7 +3078,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; int ret; - u64 ancestor = 0; name = fs_path_alloc(); from_path = fs_path_alloc(); @@ -3152,22 +3109,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) goto out; sctx->send_progress = sctx->cur_ino + 1; - ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); - if (ret) { - LIST_HEAD(deleted_refs); - ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); - ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, - &pm->update_refs, &deleted_refs, - pm->is_orphan); - if (ret < 0) - goto out; - if (rmdir_ino) { - dm = get_waiting_dir_move(sctx, pm->ino); - ASSERT(dm); - dm->rmdir_ino = rmdir_ino; - } - goto out; - } fs_path_reset(name); to_path = name; name = NULL; @@ -3610,10 +3551,27 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { + struct name_cache_entry *nce; + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) goto out; + /* + * Make sure we clear our orphanized inode's + * name from the name cache. This is because the + * inode ow_inode might be an ancestor of some + * other inode that will be orphanized as well + * later and has an inode number greater than + * sctx->send_progress. We need to prevent + * future name lookups from using the old name + * and get instead the orphan name. + */ + nce = name_cache_search(sctx, ow_inode, ow_gen); + if (nce) { + name_cache_delete(sctx, nce); + kfree(nce); + } } else { ret = send_unlink(sctx, cur->full_path); if (ret < 0) @@ -5852,19 +5810,20 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) ret = PTR_ERR(clone_root); goto out; } - clone_sources_to_rollback = i + 1; spin_lock(&clone_root->root_item_lock); - clone_root->send_in_progress++; - if (!btrfs_root_readonly(clone_root)) { + if (!btrfs_root_readonly(clone_root) || + btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } + clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); sctx->clone_roots[i].root = clone_root; + clone_sources_to_rollback = i + 1; } vfree(clone_sources_tmp); clone_sources_tmp = NULL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 05fef198ff94..9e66f5e724db 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -901,6 +901,15 @@ find_root: if (IS_ERR(new_root)) return ERR_CAST(new_root); + if (!(sb->s_flags & MS_RDONLY)) { + int ret; + down_read(&fs_info->cleanup_work_sem); + ret = btrfs_orphan_cleanup(new_root); + up_read(&fs_info->cleanup_work_sem); + if (ret) + return ERR_PTR(ret); + } + dir_id = btrfs_root_dirid(&new_root->root_item); setup_root: location.objectid = dir_id; @@ -916,7 +925,7 @@ setup_root: * a reference to the dentry. We will have already gotten a reference * to the inode in btrfs_fill_super so we're good to go. */ - if (!new && sb->s_root->d_inode == inode) { + if (!new && d_inode(sb->s_root) == inode) { iput(inode); return dget(sb->s_root); } @@ -1221,7 +1230,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, root = mount_subtree(mnt, subvol_name); - if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) { struct super_block *s = root->d_sb; dput(root); root = ERR_PTR(-EINVAL); @@ -1714,7 +1723,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ - do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN); avail_space *= BTRFS_STRIPE_LEN; /* @@ -1886,8 +1895,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; - buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid; return 0; } @@ -1908,6 +1917,17 @@ static struct file_system_type btrfs_fs_type = { }; MODULE_ALIAS_FS("btrfs"); +static int btrfs_control_open(struct inode *inode, struct file *file) +{ + /* + * The control file's private_data is used to hold the + * transaction when it is started and is used to keep + * track of whether a transaction is already in progress. + */ + file->private_data = NULL; + return 0; +} + /* * used by btrfsctl to scan devices when no FS is mounted */ @@ -2009,6 +2029,7 @@ static const struct super_operations btrfs_super_ops = { }; static const struct file_operations btrfs_ctl_fops = { + .open = btrfs_control_open, .unlocked_ioctl = btrfs_control_ioctl, .compat_ioctl = btrfs_control_ioctl, .owner = THIS_MODULE, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 94edb0a2a026..e8a4c86d274d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -459,7 +459,7 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; -static u64 supported_feature_masks[3] = { +static const u64 supported_feature_masks[3] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index f7dd298b3cf6..3a4bbed723fd 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -61,11 +61,23 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) /* convert from attribute */ -#define to_btrfs_feature_attr(a) \ - container_of(a, struct btrfs_feature_attr, kobj_attr) -#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) -#define attr_to_btrfs_feature_attr(a) \ - to_btrfs_feature_attr(attr_to_btrfs_attr(a)) +static inline struct btrfs_feature_attr * +to_btrfs_feature_attr(struct kobj_attribute *a) +{ + return container_of(a, struct btrfs_feature_attr, kobj_attr); +} + +static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) +{ + return container_of(attr, struct kobj_attribute, attr); +} + +static inline struct btrfs_feature_attr * +attr_to_btrfs_feature_attr(struct attribute *attr) +{ + return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); +} + char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 73f299ebdabb..c32a7ba76bca 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -232,7 +232,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root) init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); - ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); + ret = btrfs_create_qgroup(NULL, fs_info, 5); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; @@ -301,7 +301,7 @@ static int test_multiple_refs(struct btrfs_root *root) test_msg("Qgroup multiple refs test\n"); /* We have 5 created already from the previous test */ - ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); + ret = btrfs_create_qgroup(NULL, fs_info, 256); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8be4278e25e8..5628e25250c0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -35,7 +35,7 @@ #define BTRFS_ROOT_TRANS_TAG 0 -static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { +static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | __TRANS_START), @@ -64,6 +64,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); + if (transaction->delayed_refs.pending_csums) + printk(KERN_ERR "pending csums is %llu\n", + transaction->delayed_refs.pending_csums); while (!list_empty(&transaction->pending_chunks)) { struct extent_map *em; @@ -93,11 +96,8 @@ static void clear_btree_io_tree(struct extent_io_tree *tree) */ ASSERT(!waitqueue_active(&state->wq)); free_extent_state(state); - if (need_resched()) { - spin_unlock(&tree->lock); - cond_resched(); - spin_lock(&tree->lock); - } + + cond_resched_lock(&tree->lock); } spin_unlock(&tree->lock); } @@ -222,10 +222,12 @@ loop: atomic_set(&cur_trans->use_count, 2); cur_trans->have_free_bgs = 0; cur_trans->start_time = get_seconds(); + cur_trans->dirty_bg_run = 0; cur_trans->delayed_refs.href_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.pending_csums = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; @@ -250,6 +252,9 @@ loop: INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); + INIT_LIST_HEAD(&cur_trans->io_bgs); + mutex_init(&cur_trans->cache_write_mutex); + cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -721,7 +726,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) { - err = btrfs_run_delayed_refs(trans, root, updates); + err = btrfs_run_delayed_refs(trans, root, updates * 2); if (err) /* Error code will also eval true */ return err; } @@ -1057,6 +1062,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; + struct list_head *io_bgs = &trans->transaction->io_bgs; struct list_head *next; struct extent_buffer *eb; int ret; @@ -1110,7 +1116,7 @@ again: return ret; } - while (!list_empty(dirty_bgs)) { + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans, root); if (ret) return ret; @@ -1810,6 +1816,37 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } + if (!cur_trans->dirty_bg_run) { + int run_it = 0; + + /* this mutex is also taken before trying to set + * block groups readonly. We need to make sure + * that nobody has set a block group readonly + * after a extents from that block group have been + * allocated for cache files. btrfs_set_block_group_ro + * will wait for the transaction to commit if it + * finds dirty_bg_run = 1 + * + * The dirty_bg_run flag is also used to make sure only + * one process starts all the block group IO. It wouldn't + * hurt to have more than one go through, but there's no + * real advantage to it either. + */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (!cur_trans->dirty_bg_run) { + run_it = 1; + cur_trans->dirty_bg_run = 1; + } + mutex_unlock(&root->fs_info->ro_block_group_mutex); + + if (run_it) + ret = btrfs_start_dirty_block_groups(trans, root); + } + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + spin_lock(&root->fs_info->trans_lock); list_splice(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { @@ -2003,6 +2040,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); + ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(root); btrfs_set_super_log_root(root->fs_info->super_copy, 0); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 937050a2b68e..0b24755596ba 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -64,9 +64,19 @@ struct btrfs_transaction { struct list_head pending_ordered; struct list_head switch_commits; struct list_head dirty_bgs; + struct list_head io_bgs; + u64 num_dirty_bgs; + + /* + * we need to make sure block group deletion doesn't race with + * free space cache writeout. This mutex keeps them from stomping + * on each other + */ + struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; + int dirty_bg_run; }; #define __TRANS_FREEZABLE (1U << 0) @@ -136,9 +146,11 @@ struct btrfs_pending_snapshot { static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_trans = trans->transaction->transid; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + spin_unlock(&BTRFS_I(inode)->lock); } int btrfs_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c5b8ba37f88e..d04968374e9d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -492,11 +492,19 @@ insert: if (btrfs_inode_generation(eb, src_item) == 0) { struct extent_buffer *dst_eb = path->nodes[0]; + const u64 ino_size = btrfs_inode_size(eb, src_item); + /* + * For regular files an ino_size == 0 is used only when + * logging that an inode exists, as part of a directory + * fsync, and the inode wasn't fsynced before. In this + * case don't set the size of the inode in the fs/subvol + * tree, otherwise we would be throwing valid data away. + */ if (S_ISREG(btrfs_inode_mode(eb, src_item)) && - S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { + S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && + ino_size != 0) { struct btrfs_map_token token; - u64 ino_size = btrfs_inode_size(eb, src_item); btrfs_init_map_token(&token); btrfs_set_token_inode_size(dst_eb, dst_item, @@ -1951,6 +1959,104 @@ out: return ret; } +static int replay_xattr_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + const u64 ino) +{ + struct btrfs_key search_key; + struct btrfs_path *log_path; + int i; + int nritems; + int ret; + + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_XATTR_ITEM_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; +process_leaf: + nritems = btrfs_header_nritems(path->nodes[0]); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + u32 total_size; + u32 cur; + + btrfs_item_key_to_cpu(path->nodes[0], &key, i); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size_nr(path->nodes[0], i); + cur = 0; + while (cur < total_size) { + u16 name_len = btrfs_dir_name_len(path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u32 this_len = sizeof(*di) + name_len + data_len; + char *name; + + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(path->nodes[0], name, + (unsigned long)(di + 1), name_len); + + log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, + name, name_len, 0); + btrfs_release_path(log_path); + if (!log_di) { + /* Doesn't exist in log tree, so delete it. */ + btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, ino, + name, name_len, -1); + kfree(name); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + ASSERT(di); + ret = btrfs_delete_one_dir_name(trans, root, + path, di); + if (ret) + goto out; + btrfs_release_path(path); + search_key = key; + goto again; + } + kfree(name); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } + cur += this_len; + di = (struct btrfs_dir_item *)((char *)di + this_len); + } + } + ret = btrfs_next_leaf(root, path); + if (ret > 0) + ret = 0; + else if (ret == 0) + goto process_leaf; +out: + btrfs_free_path(log_path); + btrfs_release_path(path); + return ret; +} + + /* * deletion replay happens before we copy any new directory items * out of the log or out of backreferences from inodes. It @@ -2104,6 +2210,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + ret = replay_xattr_deletes(wc->trans, root, log, + path, key.objectid); + if (ret) + break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, @@ -2230,7 +2340,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); + clean_tree_block(trans, root->fs_info, + next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2308,7 +2419,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); + clean_tree_block(trans, root->fs_info, + next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2384,7 +2496,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, log, next); + clean_tree_block(trans, log->fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -3020,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, + struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; @@ -3104,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, src = path->nodes[0]; nritems = btrfs_header_nritems(src); for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + btrfs_item_key_to_cpu(src, &min_key, i); if (min_key.objectid != ino || min_key.type != key_type) @@ -3114,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, err = ret; goto done; } + + /* + * We must make sure that when we log a directory entry, + * the corresponding inode, after log replay, has a + * matching link count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * <crash> + * <mount fs and log replay> + * + * Would result in a fsync log that when replayed, our + * file inode would have a link count of 1, but we get + * two directory entries pointing to the same inode. + * After removing one of the names, it would not be + * possible to remove the other name, which resulted + * always in stale file handle errors, and would not + * be possible to rmdir the parent directory, since + * its i_size could never decrement to the value + * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. + */ + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &tmp); + if (ctx && + (btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + tmp.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; } path->slots[0] = nritems; @@ -3175,7 +3321,8 @@ done: static noinline int log_directory_changes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { u64 min_key; u64 max_key; @@ -3187,7 +3334,7 @@ again: max_key = 0; while (1) { ret = log_dir_items(trans, root, inode, path, - dst_path, key_type, min_key, + dst_path, key_type, ctx, min_key, &max_key); if (ret) return ret; @@ -3963,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, if (ret < 0) { return ret; } else if (ret > 0) { - *size_ret = i_size_read(inode); + *size_ret = 0; } else { struct btrfs_inode_item *item; @@ -4070,10 +4217,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - if (inode_only == LOG_INODE_EXISTS) { - max_key_type = BTRFS_INODE_EXTREF_KEY; - max_key.type = max_key_type; - } + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { if (inode_only == LOG_INODE_EXISTS) { @@ -4098,7 +4243,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { - max_key.type = BTRFS_INODE_EXTREF_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { @@ -4106,20 +4251,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &BTRFS_I(inode)->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - ret = btrfs_truncate_inode_items(trans, log, - inode, 0, 0); + while(1) { + ret = btrfs_truncate_inode_items(trans, + log, inode, 0, 0); + if (ret != -EAGAIN) + break; + } } - } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags) || + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || inode_only == LOG_INODE_EXISTS) { - if (inode_only == LOG_INODE_ALL) { - clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); + if (inode_only == LOG_INODE_ALL) fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; - } else { - max_key.type = BTRFS_INODE_EXTREF_KEY; - } + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { @@ -4277,15 +4421,18 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - ret = log_directory_changes(trans, root, inode, path, dst_path); + ret = log_directory_changes(trans, root, inode, path, dst_path, + ctx); if (ret) { err = ret; goto out_unlock; } } + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; + spin_unlock(&BTRFS_I(inode)->lock); out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); @@ -4327,9 +4474,9 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, goto out; if (!S_ISDIR(inode->i_mode)) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) goto out; - inode = parent->d_inode; + inode = d_inode(parent); } while (1) { @@ -4355,7 +4502,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, break; } - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; if (IS_ROOT(parent)) @@ -4364,7 +4511,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, parent = dget_parent(parent); dput(old_parent); old_parent = parent; - inode = parent->d_inode; + inode = d_inode(parent); } dput(old_parent); @@ -4372,6 +4519,181 @@ out: return ret; } +struct btrfs_dir_list { + u64 ino; + struct list_head list; +}; + +/* + * Log the inodes of the new dentries of a directory. See log_dir_items() for + * details about the why it is needed. + * This is a recursive operation - if an existing dentry corresponds to a + * directory, that directory's new entries are logged too (same behaviour as + * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * the dentries point to we do not lock their i_mutex, otherwise lockdep + * complains about the following circular lock dependency / possible deadlock: + * + * CPU0 CPU1 + * ---- ---- + * lock(&type->i_mutex_dir_key#3/2); + * lock(sb_internal#2); + * lock(&type->i_mutex_dir_key#3/2); + * lock(&sb->s_type->i_mutex_key#14); + * + * Where sb_internal is the lock (a counter that works as a lock) acquired by + * sb_start_intwrite() in btrfs_start_transaction(). + * Not locking i_mutex of the inodes is still safe because: + * + * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible + * that while logging the inode new references (names) are added or removed + * from the inode, leaving the logged inode item with a link count that does + * not match the number of logged inode reference items. This is fine because + * at log replay time we compute the real number of links and correct the + * link count in the inode item (see replay_one_buffer() and + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that + * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and + * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged + * names. This does not result in a problem because if a dir_item key is + * logged but its matching dir_index key is not logged, at log replay time we + * don't use it to replay the respective name (see replay_one_name()). On the + * other hand if only the dir_index key ends up being logged, the respective + * name is added to the fs/subvol tree with both the dir_item and dir_index + * keys created (see replay_one_name()). + * The directory's inode item with a wrong i_size is not a problem as well, + * since we don't use it at log replay time to set the i_size in the inode + * item of the fs/subvol tree (see overwrite_item()). + */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + btrfs_free_path(path); + return -ENOMEM; + } + dir_elem->ino = btrfs_ino(start_inode); + list_add_tail(&dir_elem->list, &dir_list); + + while (!list_empty(&dir_list)) { + struct extent_buffer *leaf; + struct btrfs_key min_key; + int nritems; + int i; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, + list); + if (ret) + goto next_dir_inode; + + min_key.objectid = dir_elem->ino; + min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.offset = 0; +again: + btrfs_release_path(path); + ret = btrfs_search_forward(log, &min_key, path, trans->transid); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + +process_leaf: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + struct btrfs_dir_list *new_dir_elem; + int log_mode = LOG_INODE_EXISTS; + int type; + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != dir_elem->ino || + min_key.type != BTRFS_DIR_ITEM_KEY) + goto next_dir_inode; + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + type = btrfs_dir_type(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid && + type != BTRFS_FT_DIR) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(root->fs_info->sb, &di_key, + root, NULL); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto next_dir_inode; + } + + if (btrfs_inode_in_log(di_inode, trans->transid)) { + iput(di_inode); + continue; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + btrfs_release_path(path); + ret = btrfs_log_inode(trans, root, di_inode, + log_mode, 0, LLONG_MAX, ctx); + iput(di_inode); + if (ret) + goto next_dir_inode; + if (ctx->log_new_dentries) { + new_dir_elem = kmalloc(sizeof(*new_dir_elem), + GFP_NOFS); + if (!new_dir_elem) { + ret = -ENOMEM; + goto next_dir_inode; + } + new_dir_elem->ino = di_key.objectid; + list_add_tail(&new_dir_elem->list, &dir_list); + } + break; + } + if (i == nritems) { + ret = btrfs_next_leaf(log, path); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + goto process_leaf; + } + if (min_key.offset < (u64)-1) { + min_key.offset++; + goto again; + } +next_dir_inode: + list_del(&dir_elem->list); + kfree(dir_elem); + } + + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -4394,6 +4716,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, const struct dentry * const first_parent = parent; const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > last_committed); + bool log_dentries = false; + struct inode *orig_inode = inode; sb = inode->i_sb; @@ -4449,11 +4773,14 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) + log_dentries = true; + while (1) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; - inode = parent->d_inode; + inode = d_inode(parent); if (root != BTRFS_I(inode)->root) break; @@ -4485,7 +4812,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, dput(old_parent); old_parent = parent; } - ret = 0; + if (log_dentries) + ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + else + ret = 0; end_trans: dput(old_parent); if (ret < 0) { @@ -4515,7 +4845,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, start, end, 0, ctx); dput(parent); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 154990c26dcb..6916a781ea02 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -29,6 +29,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; int io_err; + bool log_new_dentries; struct list_head list; }; @@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; + ctx->log_new_dentries = false; INIT_LIST_HEAD(&ctx->list); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8222f6f74147..96aebf3bcd5b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -366,8 +366,8 @@ loop_lock: btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; - if (need_resched()) - cond_resched(); + + cond_resched(); /* * we made progress, there is more work to do and the bdi @@ -400,8 +400,7 @@ loop_lock: * against it before looping */ last_waited = ioc->last_waited; - if (need_resched()) - cond_resched(); + cond_resched(); continue; } spin_lock(&device->io_lock); @@ -609,8 +608,7 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, int step) +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; struct btrfs_device *latest_dev = NULL; @@ -1060,6 +1058,7 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, struct extent_map *em; struct list_head *search_list = &trans->transaction->pending_chunks; int ret = 0; + u64 physical_start = *start; again: list_for_each_entry(em, search_list, list) { @@ -1070,9 +1069,9 @@ again: for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev != device) continue; - if (map->stripes[i].physical >= *start + len || + if (map->stripes[i].physical >= physical_start + len || map->stripes[i].physical + em->orig_block_len <= - *start) + physical_start) continue; *start = map->stripes[i].physical + em->orig_block_len; @@ -1136,11 +1135,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; -again: + max_hole_start = search_start; max_hole_size = 0; - hole_size = 0; +again: if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; goto out; @@ -1195,8 +1194,14 @@ again: */ if (contains_pending_extent(trans, device, &search_start, - hole_size)) - hole_size = 0; + hole_size)) { + if (key.offset >= search_start) { + hole_size = key.offset - search_start; + } else { + WARN_ON_ONCE(1); + hole_size = 0; + } + } if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -1233,21 +1238,23 @@ next: * allocated dev extents, and when shrinking the device, * search_end may be smaller than search_start. */ - if (search_end > search_start) + if (search_end > search_start) { hole_size = search_end - search_start; - if (hole_size > max_hole_size) { - max_hole_start = search_start; - max_hole_size = hole_size; - } + if (contains_pending_extent(trans, device, &search_start, + hole_size)) { + btrfs_release_path(path); + goto again; + } - if (contains_pending_extent(trans, device, &search_start, hole_size)) { - btrfs_release_path(path); - goto again; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } } /* See above. */ - if (hole_size < num_bytes) + if (max_hole_size < num_bytes) ret = -ENOSPC; else ret = 0; @@ -2487,8 +2494,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, + struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { int ret; @@ -2580,7 +2586,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct map_lookup *map; u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - u64 chunk_tree = root->fs_info->chunk_root->objectid; int i, ret = 0; /* Just in case */ @@ -2634,8 +2639,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } } - ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, - chunk_offset); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; @@ -2664,8 +2668,8 @@ out: } static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset) + u64 chunk_objectid, + u64 chunk_offset) { struct btrfs_root *extent_root; struct btrfs_trans_handle *trans; @@ -2707,7 +2711,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) struct btrfs_chunk *chunk; struct btrfs_key key; struct btrfs_key found_key; - u64 chunk_tree = chunk_root->root_key.objectid; u64 chunk_type; bool retried = false; int failed = 0; @@ -2744,7 +2747,7 @@ again: btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + ret = btrfs_relocate_chunk(chunk_root, found_key.objectid, found_key.offset); if (ret == -ENOSPC) @@ -3022,7 +3025,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf, stripe_offset = btrfs_stripe_offset(leaf, stripe); stripe_length = btrfs_chunk_length(leaf, chunk); - do_div(stripe_length, factor); + stripe_length = div_u64(stripe_length, factor); if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) @@ -3255,7 +3258,6 @@ again: } ret = btrfs_relocate_chunk(chunk_root, - chunk_root->root_key.objectid, found_key.objectid, found_key.offset); if (ret && ret != -ENOSPC) @@ -3957,7 +3959,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 length; - u64 chunk_tree; u64 chunk_objectid; u64 chunk_offset; int ret; @@ -4027,13 +4028,11 @@ again: break; } - chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); - ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, - chunk_offset); + ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); if (ret && ret != -ENOSPC) goto done; if (ret == -ENOSPC) @@ -4131,7 +4130,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { +static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, @@ -4289,7 +4288,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); - devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), GFP_NOFS); if (!devices_info) return -ENOMEM; @@ -4400,8 +4399,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, */ if (stripe_size * data_stripes > max_chunk_size) { u64 mask = (1ULL << 24) - 1; - stripe_size = max_chunk_size; - do_div(stripe_size, data_stripes); + + stripe_size = div_u64(max_chunk_size, data_stripes); /* bump the answer up to a 16MB boundary */ stripe_size = (stripe_size + mask) & ~mask; @@ -4413,10 +4412,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; } - do_div(stripe_size, dev_stripes); + stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - do_div(stripe_size, raid_stripe_len); + stripe_size = div_u64(stripe_size, raid_stripe_len); stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -4954,7 +4953,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr_orig; u64 stripe_nr_end; u64 stripe_len; - int stripe_index; + u32 stripe_index; int i; int ret = 0; int num_stripes; @@ -4995,7 +4994,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - do_div(stripe_nr, stripe_len); + stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; BUG_ON(offset < stripe_offset); @@ -5011,7 +5010,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* allow a write of a full stripe, but make sure we don't * allow straddling of stripes */ - do_div(raid56_full_stripe_start, full_stripe_len); + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); raid56_full_stripe_start *= full_stripe_len; } @@ -5136,7 +5136,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = 0; stripe_nr_orig = stripe_nr; stripe_nr_end = ALIGN(offset + *length, map->stripe_len); - do_div(stripe_nr_end, map->stripe_len); + stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); stripe_end_offset = stripe_nr_end * map->stripe_len - (offset + *length); @@ -5144,7 +5144,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); - stripe_index = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { @@ -5170,9 +5171,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; + u32 factor = map->num_stripes / map->sub_stripes; - stripe_index = do_div(stripe_nr, factor); + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) @@ -5198,8 +5199,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ - stripe_nr = raid56_full_stripe_start; - do_div(stripe_nr, stripe_len * nr_data_stripes(map)); + stripe_nr = div_u64(raid56_full_stripe_start, + stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -5209,32 +5210,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = 0; stripe_offset = 0; } else { - u64 tmp; - /* * Mirror #0 or #1 means the original data block. * Mirror #2 is RAID5 parity block. * Mirror #3 is RAID6 Q block. */ - stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64_rem(stripe_nr, + nr_data_stripes(map), &stripe_index); if (mirror_num > 1) stripe_index = nr_data_stripes(map) + mirror_num - 2; /* We distribute the parity blocks across stripes */ - tmp = stripe_nr + stripe_index; - stripe_index = do_div(tmp, map->num_stripes); + div_u64_rem(stripe_nr + stripe_index, map->num_stripes, + &stripe_index); if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && mirror_num <= 1) mirror_num = 1; } } else { /* - * after this do_div call, stripe_nr is the number of stripes - * on this device we have to walk to find the data, and - * stripe_index is the number of our device in the stripe array + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array */ - stripe_index = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); @@ -5261,7 +5262,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { u64 tmp; - int i, rot; + unsigned rot; bbio->raid_map = (u64 *)((void *)bbio->stripes + sizeof(struct btrfs_bio_stripe) * @@ -5269,8 +5270,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, sizeof(int) * tgtdev_indexes); /* Work out the disk rotation on this stripe-set */ - tmp = stripe_nr; - rot = do_div(tmp, num_stripes); + div_u64_rem(stripe_nr, num_stripes, &rot); /* Fill in the logical address of each stripe */ tmp = stripe_nr * nr_data_stripes(map); @@ -5285,8 +5285,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } if (rw & REQ_DISCARD) { - int factor = 0; - int sub_stripes = 0; + u32 factor = 0; + u32 sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; @@ -5437,9 +5437,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - u64 length = map->stripe_len; - - if (physical_of_found + length <= + if (physical_of_found + map->stripe_len <= dev_replace->cursor_left) { struct btrfs_bio_stripe *tgtdev_stripe = bbio->stripes + num_stripes; @@ -5535,15 +5533,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, rmap_len = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID10) - do_div(length, map->num_stripes / map->sub_stripes); + length = div_u64(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) - do_div(length, map->num_stripes); + length = div_u64(length, map->num_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - do_div(length, nr_data_stripes(map)); + length = div_u64(length, nr_data_stripes(map)); rmap_len = map->stripe_len * nr_data_stripes(map); } - buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { @@ -5554,11 +5552,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, continue; stripe_nr = physical - map->stripes[i].physical; - do_div(stripe_nr, map->stripe_len); + stripe_nr = div_u64(stripe_nr, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; - do_div(stripe_nr, map->sub_stripes); + stripe_nr = div_u64(stripe_nr, map->sub_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; } /* else if RAID[56], multiply by nr_data_stripes(). @@ -5835,8 +5833,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 length = 0; u64 map_length; int ret; - int dev_nr = 0; - int total_devs = 1; + int dev_nr; + int total_devs; struct btrfs_bio *bbio = NULL; length = bio->bi_iter.bi_size; @@ -5877,11 +5875,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG(); } - while (dev_nr < total_devs) { + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); - dev_nr++; continue; } @@ -5894,7 +5891,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, ret = breakup_stripe_bio(root, bbio, first_bio, dev, dev_nr, rw, async_submit); BUG_ON(ret); - dev_nr++; continue; } @@ -5909,7 +5905,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, submit_stripe_bio(root, bbio, bio, bbio->stripes[dev_nr].physical, dev_nr, rw, async_submit); - dev_nr++; } btrfs_bio_counter_dec(root->fs_info); return 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 83069dec6898..ebc31331a837 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -421,8 +421,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, int step); +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 883b93623bc5..6f518c90e1c1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -261,7 +261,7 @@ out: ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key key, found_key; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_handlers[] = { /* * Check if the attribute is in a supported namespace. * - * This applied after the check for the synthetic attributes in the system + * This is applied after the check for the synthetic attributes in the system * namespace. */ -static bool btrfs_is_valid_xattr(const char *name) +static int btrfs_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN) || - !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || - !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || - !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); + int len = strlen(name); + int prefixlen = 0; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) + prefixlen = XATTR_SECURITY_PREFIX_LEN; + else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + prefixlen = XATTR_SYSTEM_PREFIX_LEN; + else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + prefixlen = XATTR_TRUSTED_PREFIX_LEN; + else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + prefixlen = XATTR_USER_PREFIX_LEN; + else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + prefixlen = XATTR_BTRFS_PREFIX_LEN; + else + return -EOPNOTSUPP; + + /* + * The name cannot consist of just prefix + */ + if (len <= prefixlen) + return -EINVAL; + + return 0; } ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { + int ret; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -388,15 +408,17 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_getxattr(dentry, name, buffer, size); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; - return __btrfs_getxattr(dentry->d_inode, name, buffer, size); + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; + return __btrfs_getxattr(d_inode(dentry), name, buffer, size); } int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; /* * The permission on security.* and system.* is not checked @@ -413,23 +435,25 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_setxattr(dentry, name, value, size, flags); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(dentry->d_inode, name, + return btrfs_set_prop(d_inode(dentry), name, value, size, flags); if (size == 0) value = ""; /* empty EA, do not remove */ - return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size, flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) { - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; /* * The permission on security.* and system.* is not checked @@ -446,14 +470,15 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_removexattr(dentry, name); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(dentry->d_inode, name, + return btrfs_set_prop(d_inode(dentry), name, NULL, 0, XATTR_REPLACE); - return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0, XATTR_REPLACE); } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index fb22fd8d8fb8..82990b8f872b 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -403,7 +403,7 @@ next: return ret; } -struct btrfs_compress_op btrfs_zlib_compress = { +const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index fbb08e97438d..6af790fc3df8 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -123,11 +123,11 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) /* check parameters */ ret = -EOPNOTSUPP; - if (!root->d_inode || - !root->d_inode->i_op->lookup || - !root->d_inode->i_op->mkdir || - !root->d_inode->i_op->setxattr || - !root->d_inode->i_op->getxattr || + if (d_is_negative(root) || + !d_backing_inode(root)->i_op->lookup || + !d_backing_inode(root)->i_op->mkdir || + !d_backing_inode(root)->i_op->setxattr || + !d_backing_inode(root)->i_op->getxattr || !root->d_sb->s_op->statfs || !root->d_sb->s_op->sync_fs) goto error_unsupported; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 232426214fdd..afa023dded5b 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -441,12 +441,12 @@ static int cachefiles_attr_changed(struct fscache_object *_object) fscache_set_store_limit(&object->fscache, ni_size); - oi_size = i_size_read(object->backer->d_inode); + oi_size = i_size_read(d_backing_inode(object->backer)); if (oi_size == ni_size) return 0; cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&object->backer->d_inode->i_mutex); + mutex_lock(&d_inode(object->backer)->i_mutex); /* if there's an extension to a partial page at the end of the backing * file, we need to discard the partial page so that we pick up new @@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: - mutex_unlock(&object->backer->d_inode->i_mutex); + mutex_unlock(&d_inode(object->backer)->i_mutex); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 1e51714eb33e..ab857ab9f40d 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -286,13 +286,13 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); } else { - ret = vfs_unlink(dir->d_inode, rep, NULL); + ret = vfs_unlink(d_inode(dir), rep, NULL); if (preemptive) cachefiles_mark_object_buried(cache, rep); } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); @@ -303,7 +303,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -355,7 +355,7 @@ try_again: return -EIO; } - if (grave->d_inode) { + if (d_is_positive(grave)) { unlock_rename(cache->graveyard, dir); dput(grave); grave = NULL; @@ -387,8 +387,8 @@ try_again: if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { - ret = vfs_rename(dir->d_inode, rep, - cache->graveyard->d_inode, grave, NULL, 0); + ret = vfs_rename(d_inode(dir), rep, + d_inode(cache->graveyard), grave, NULL, 0); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); @@ -415,18 +415,18 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); ASSERT(object->dentry); - ASSERT(object->dentry->d_inode); + ASSERT(d_backing_inode(object->dentry)); ASSERT(object->dentry->d_parent); dir = dget_parent(object->dentry); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = 0; } else { /* we need to check that our parent is _still_ our parent - it @@ -438,7 +438,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore * it */ - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = 0; } } @@ -473,7 +473,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, path.mnt = cache->mnt; ASSERT(parent->dentry); - ASSERT(parent->dentry->d_inode); + ASSERT(d_backing_inode(parent->dentry)); if (!(d_is_dir(parent->dentry))) { // TODO: convert file to dir @@ -497,7 +497,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); @@ -505,21 +505,21 @@ lookup_again: if (IS_ERR(next)) goto lookup_error; - _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative"); + _debug("next -> %p %s", next, d_backing_inode(next) ? "positive" : "negative"); if (!key) - object->new = !next->d_inode; + object->new = !d_backing_inode(next); /* if this element of the path doesn't exist, then the lookup phase * failed, and we can release any readers in the certain knowledge that * there's nothing for them to actually read */ - if (!next->d_inode) + if (d_is_negative(next)) fscache_object_lookup_negative(&object->fscache); /* we need to create the object if it's negative */ if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { /* index objects and intervening tree levels must be subdirs */ - if (!next->d_inode) { + if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) goto create_error; @@ -529,26 +529,26 @@ lookup_again: if (ret < 0) goto create_error; start = jiffies; - ret = vfs_mkdir(dir->d_inode, next, 0); + ret = vfs_mkdir(d_inode(dir), next, 0); cachefiles_hist(cachefiles_mkdir_histogram, start); if (ret < 0) goto create_error; - ASSERT(next->d_inode); + ASSERT(d_backing_inode(next)); _debug("mkdir -> %p{%p{ino=%lu}}", - next, next->d_inode, next->d_inode->i_ino); + next, d_backing_inode(next), d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next)) { pr_err("inode %lu is not a directory\n", - next->d_inode->i_ino); + d_backing_inode(next)->i_ino); ret = -ENOBUFS; goto error; } } else { /* non-index objects start out life as files */ - if (!next->d_inode) { + if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) goto create_error; @@ -558,21 +558,21 @@ lookup_again: if (ret < 0) goto create_error; start = jiffies; - ret = vfs_create(dir->d_inode, next, S_IFREG, true); + ret = vfs_create(d_inode(dir), next, S_IFREG, true); cachefiles_hist(cachefiles_create_histogram, start); if (ret < 0) goto create_error; - ASSERT(next->d_inode); + ASSERT(d_backing_inode(next)); _debug("create -> %p{%p{ino=%lu}}", - next, next->d_inode, next->d_inode->i_ino); + next, d_backing_inode(next), d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next) && !d_is_reg(next) ) { pr_err("inode %lu is not a file or directory\n", - next->d_inode->i_ino); + d_backing_inode(next)->i_ino); ret = -ENOBUFS; goto error; } @@ -581,7 +581,7 @@ lookup_again: /* process the next component */ if (key) { _debug("advance"); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); dir = next; next = NULL; @@ -617,7 +617,7 @@ lookup_again: /* note that we're now using this object */ ret = cachefiles_mark_object_active(cache, object); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); dir = NULL; @@ -646,7 +646,7 @@ lookup_again: const struct address_space_operations *aops; ret = -EPERM; - aops = object->dentry->d_inode->i_mapping->a_ops; + aops = d_backing_inode(object->dentry)->i_mapping->a_ops; if (!aops->bmap) goto check_error; @@ -659,7 +659,7 @@ lookup_again: object->new = 0; fscache_obtained_object(&object->fscache); - _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); + _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino); return 0; create_error: @@ -695,7 +695,7 @@ lookup_error: cachefiles_io_error(cache, "Lookup failed"); next = NULL; error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(next); error_out2: dput(dir); @@ -719,7 +719,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); @@ -731,10 +731,10 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } _debug("subdir -> %p %s", - subdir, subdir->d_inode ? "positive" : "negative"); + subdir, d_backing_inode(subdir) ? "positive" : "negative"); /* we need to create the subdir if it doesn't exist yet */ - if (!subdir->d_inode) { + if (d_is_negative(subdir)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) goto mkdir_error; @@ -746,22 +746,22 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; - ret = vfs_mkdir(dir->d_inode, subdir, 0700); + ret = vfs_mkdir(d_inode(dir), subdir, 0700); if (ret < 0) goto mkdir_error; - ASSERT(subdir->d_inode); + ASSERT(d_backing_inode(subdir)); _debug("mkdir -> %p{%p{ino=%lu}}", subdir, - subdir->d_inode, - subdir->d_inode->i_ino); + d_backing_inode(subdir), + d_backing_inode(subdir)->i_ino); } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); /* we need to make sure the subdir is a directory */ - ASSERT(subdir->d_inode); + ASSERT(d_backing_inode(subdir)); if (!d_can_lookup(subdir)) { pr_err("%s is not a directory\n", dirname); @@ -770,18 +770,18 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } ret = -EPERM; - if (!subdir->d_inode->i_op->setxattr || - !subdir->d_inode->i_op->getxattr || - !subdir->d_inode->i_op->lookup || - !subdir->d_inode->i_op->mkdir || - !subdir->d_inode->i_op->create || - (!subdir->d_inode->i_op->rename && - !subdir->d_inode->i_op->rename2) || - !subdir->d_inode->i_op->rmdir || - !subdir->d_inode->i_op->unlink) + if (!d_backing_inode(subdir)->i_op->setxattr || + !d_backing_inode(subdir)->i_op->getxattr || + !d_backing_inode(subdir)->i_op->lookup || + !d_backing_inode(subdir)->i_op->mkdir || + !d_backing_inode(subdir)->i_op->create || + (!d_backing_inode(subdir)->i_op->rename && + !d_backing_inode(subdir)->i_op->rename2) || + !d_backing_inode(subdir)->i_op->rmdir || + !d_backing_inode(subdir)->i_op->unlink) goto check_error; - _leave(" = [%lu]", subdir->d_inode->i_ino); + _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); return subdir; check_error: @@ -790,19 +790,19 @@ check_error: return ERR_PTR(ret); mkdir_error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -827,7 +827,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir, filename); /* look up the victim */ - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); @@ -836,13 +836,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, goto lookup_error; //_debug("victim -> %p %s", - // victim, victim->d_inode ? "positive" : "negative"); + // victim, d_backing_inode(victim) ? "positive" : "negative"); /* if the object is no longer there then we probably retired the object * at the netfs's request whilst the cull was in progress */ - if (!victim->d_inode) { - mutex_unlock(&dir->d_inode->i_mutex); + if (d_is_negative(victim)) { + mutex_unlock(&d_inode(dir)->i_mutex); dput(victim); _leave(" = -ENOENT [absent]"); return ERR_PTR(-ENOENT); @@ -871,13 +871,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, object_in_use: read_unlock(&cache->active_lock); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(victim); //_leave(" = -EBUSY [in use]"); return ERR_PTR(-EBUSY); lookup_error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = PTR_ERR(victim); if (ret == -ENOENT) { /* file or dir now absent - probably retired by netfs */ @@ -913,7 +913,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return PTR_ERR(victim); _debug("victim -> %p %s", - victim, victim->d_inode ? "positive" : "negative"); + victim, d_backing_inode(victim) ? "positive" : "negative"); /* okay... the victim is not being used so we can cull it * - start by marking it as stale @@ -936,7 +936,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return 0; error_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); error: dput(victim); if (ret == -ENOENT) { @@ -971,7 +971,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(victim); //_leave(" = 0"); return 0; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index c6cd8d7a4eef..3cbb0e834694 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -74,12 +74,12 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, static int cachefiles_read_reissue(struct cachefiles_object *object, struct cachefiles_one_read *monitor) { - struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; struct page *backpage = monitor->back_page, *backpage2; int ret; _enter("{ino=%lx},{%lx,%lx}", - object->backer->d_inode->i_ino, + d_backing_inode(object->backer)->i_ino, backpage->index, backpage->flags); /* skip if the page was truncated away completely */ @@ -157,7 +157,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) object = container_of(op->op.object, struct cachefiles_object, fscache); - _enter("{ino=%lu}", object->backer->d_inode->i_ino); + _enter("{ino=%lu}", d_backing_inode(object->backer)->i_ino); max = 8; spin_lock_irq(&object->work_lock); @@ -247,7 +247,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); /* attempt to get hold of the backing page */ - bmapping = object->backer->d_inode->i_mapping; + bmapping = d_backing_inode(object->backer)->i_mapping; newpage = NULL; for (;;) { @@ -408,7 +408,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, if (!object->backer) goto enobufs; - inode = object->backer->d_inode; + inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); ASSERT(inode->i_mapping->a_ops->bmap); ASSERT(inode->i_mapping->a_ops->readpages); @@ -468,7 +468,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, struct list_head *list) { struct cachefiles_one_read *monitor = NULL; - struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; int ret = 0; @@ -705,7 +705,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, if (cachefiles_has_space(cache, 0, *nr_pages) < 0) space = 0; - inode = object->backer->d_inode; + inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); ASSERT(inode->i_mapping->a_ops->bmap); ASSERT(inode->i_mapping->a_ops->readpages); diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index 396c18ea2764..31bbc0528b11 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -55,14 +55,14 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, { int ret; - ret = security_inode_mkdir(root->d_inode, root, 0); + ret = security_inode_mkdir(d_backing_inode(root), root, 0); if (ret < 0) { pr_err("Security denies permission to make dirs: error %d", ret); return ret; } - ret = security_inode_create(root->d_inode, root, 0); + ret = security_inode_create(d_backing_inode(root), root, 0); if (ret < 0) pr_err("Security denies permission to create files: error %d", ret); @@ -95,7 +95,7 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache, /* use the cache root dir's security context as the basis with * which create files */ - ret = set_create_files_as(new, root->d_inode); + ret = set_create_files_as(new, d_backing_inode(root)); if (ret < 0) { abort_creds(new); cachefiles_begin_secure(cache, _saved_cred); diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index a8a68745e11d..d31c1a72d8a5 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -33,7 +33,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) int ret; ASSERT(dentry); - ASSERT(dentry->d_inode); + ASSERT(d_backing_inode(dentry)); if (!object->fscache.cookie) strcpy(type, "C3"); @@ -52,7 +52,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) if (ret != -EEXIST) { pr_err("Can't set xattr on %pd [%lu] (err %d)\n", - dentry, dentry->d_inode->i_ino, + dentry, d_backing_inode(dentry)->i_ino, -ret); goto error; } @@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) goto bad_type_length; pr_err("Can't read xattr on %pd [%lu] (err %d)\n", - dentry, dentry->d_inode->i_ino, + dentry, d_backing_inode(dentry)->i_ino, -ret); goto error; } @@ -84,14 +84,14 @@ error: bad_type_length: pr_err("Cache object %lu type xattr length incorrect\n", - dentry->d_inode->i_ino); + d_backing_inode(dentry)->i_ino); ret = -EIO; goto error; bad_type: xtype[2] = 0; pr_err("Cache object %pd [%lu] type %s not %s\n", - dentry, dentry->d_inode->i_ino, + dentry, d_backing_inode(dentry)->i_ino, xtype, type); ret = -EIO; goto error; @@ -165,7 +165,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object) int ret; ASSERT(dentry); - ASSERT(dentry->d_inode); + ASSERT(d_backing_inode(dentry)); ASSERT(object->fscache.cookie->def->check_aux); auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); @@ -204,7 +204,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, _enter("%p,#%d", object, auxdata->len); ASSERT(dentry); - ASSERT(dentry->d_inode); + ASSERT(d_backing_inode(dentry)); auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); if (!auxbuf) { @@ -225,7 +225,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, cachefiles_io_error_obj(object, "Can't read xattr on %lu (err %d)", - dentry->d_inode->i_ino, -ret); + d_backing_inode(dentry)->i_ino, -ret); goto error; } @@ -276,7 +276,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, cachefiles_io_error_obj(object, "Can't update xattr on %lu" " (error %d)", - dentry->d_inode->i_ino, -ret); + d_backing_inode(dentry)->i_ino, -ret); goto error; } } @@ -291,7 +291,7 @@ error: bad_type_length: pr_err("Cache object %lu xattr length incorrect\n", - dentry->d_inode->i_ino); + d_backing_inode(dentry)->i_ino); ret = -EIO; goto error; @@ -316,7 +316,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, cachefiles_io_error(cache, "Can't remove xattr from %lu" " (error %d)", - dentry->d_inode->i_ino, -ret); + d_backing_inode(dentry)->i_ino, -ret); } _leave(" = %d", ret); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 155ab9c0246b..e162bcd105ee 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1146,6 +1146,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); + if (r < 0) + page_cache_release(page); + else + *pagep = page; } while (r == -EAGAIN); return r; @@ -1534,19 +1538,27 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); - err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, - "inline_version", &inline_version, - sizeof(inline_version), - CEPH_OSD_CMPXATTR_OP_GT, - CEPH_OSD_CMPXATTR_MODE_U64); - if (err) - goto out_put; - - err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, - "inline_version", &inline_version, - sizeof(inline_version), 0, 0); - if (err) - goto out_put; + { + __le64 xattr_buf = cpu_to_le64(inline_version); + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &xattr_buf, + sizeof(xattr_buf), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put; + } + + { + char xattr_buf[32]; + int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf), + "%llu", inline_version); + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", + xattr_buf, xattr_len, 0, 0); + if (err) + goto out_put; + } ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, req, false); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8172775428a0..be5ea6af8366 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -896,6 +896,18 @@ int ceph_is_any_caps(struct inode *inode) return ret; } +static void drop_inode_snap_realm(struct ceph_inode_info *ci) +{ + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, + realm); +} + /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * @@ -946,15 +958,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) if (removed) ceph_put_cap(mdsc, cap); - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { - struct ceph_snap_realm *realm = ci->i_snap_realm; - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm_counter++; - ci->i_snap_realm = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); - } + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); + if (!__ceph_is_any_real_caps(ci)) __cap_delay_cancel(mdsc, ci); } @@ -1394,6 +1404,13 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) int was = ci->i_dirty_caps; int dirty = 0; + if (!ci->i_auth_cap) { + pr_warn("__mark_dirty_caps %p %llx mask %s, " + "but no auth cap (session was closed?)\n", + inode, ceph_ino(inode), ceph_cap_string(mask)); + return 0; + } + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); @@ -1404,7 +1421,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ci->i_snap_realm->cached_context); dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); - WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); @@ -1545,7 +1561,19 @@ retry_locked: if (!mdsc->stopping && inode->i_nlink > 0) { if (want) { retain |= CEPH_CAP_ANY; /* be greedy */ + } else if (S_ISDIR(inode->i_mode) && + (issued & CEPH_CAP_FILE_SHARED) && + __ceph_dir_is_complete(ci)) { + /* + * If a directory is complete, we want to keep + * the exclusive cap. So that MDS does not end up + * revoking the shared cap on every create/unlink + * operation. + */ + want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + retain |= want; } else { + retain |= CEPH_CAP_ANY_SHARED; /* * keep RD only if we didn't have the file open RW, @@ -2309,6 +2337,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) wake = 1; } } + /* see comment in __ceph_remove_cap() */ + if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) + drop_inode_snap_realm(ci); } spin_unlock(&ci->i_ceph_lock); @@ -3391,7 +3422,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, int ceph_encode_dentry_release(void **p, struct dentry *dentry, int mds, int drop, int unless) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); int force = 0; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 1b2355109b9f..31f831471ed2 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -84,7 +84,7 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_dentry->d_lock); seq_printf(s, " #%llx/%pd (%s)", - ceph_ino(req->r_dentry->d_parent->d_inode), + ceph_ino(d_inode(req->r_dentry->d_parent)), req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 83e9976f7189..4248307fea90 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -49,9 +49,9 @@ int ceph_init_dentry(struct dentry *dentry) goto out_unlock; } - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) d_set_d_op(dentry, &ceph_snapdir_dentry_ops); else d_set_d_op(dentry, &ceph_snap_dentry_ops); @@ -77,7 +77,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) spin_lock(&dentry->d_lock); if (!IS_ROOT(dentry)) { - inode = dentry->d_parent->d_inode; + inode = d_inode(dentry->d_parent); ihold(inode); } spin_unlock(&dentry->d_lock); @@ -122,7 +122,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, { struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_path.dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct list_head *p; struct dentry *dentry, *last; struct ceph_dentry_info *di; @@ -161,15 +161,15 @@ more: } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (di->lease_shared_gen == shared_gen && - !d_unhashed(dentry) && dentry->d_inode && - ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && - ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && + !d_unhashed(dentry) && d_really_is_positive(dentry) && + ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && + ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) break; dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, dentry, di->offset, ctx->pos, d_unhashed(dentry) ? " unhashed" : "", - !dentry->d_inode ? " null" : ""); + !d_inode(dentry) ? " null" : ""); spin_unlock(&dentry->d_lock); p = p->prev; dentry = list_entry(p, struct dentry, d_child); @@ -189,11 +189,11 @@ more: } dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, - dentry, dentry, dentry->d_inode); + dentry, dentry, d_inode(dentry)); if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, - ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), - dentry->d_inode->i_mode >> 12)) { + ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino), + d_inode(dentry)->i_mode >> 12)) { if (last) { /* remember our position */ fi->dentry = last; @@ -281,6 +281,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); if ((ctx->pos == 2 || fi->dentry) && + ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && @@ -336,16 +337,23 @@ more: ceph_mdsc_put_request(req); return err; } - req->r_inode = inode; - ihold(inode); - req->r_dentry = dget(file->f_path.dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); req->r_direct_is_hash = true; - req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + if (fi->last_name) { + req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + return -ENOMEM; + } + } req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(file->f_path.dentry); err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -535,7 +543,7 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ /* .snap dir? */ if (err == -ENOENT && @@ -571,8 +579,8 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, err = 0; if (!req->r_reply_info.head->is_dentry) { dout("ENOENT and no trace, dentry %p inode %p\n", - dentry, dentry->d_inode); - if (dentry->d_inode) { + dentry, d_inode(dentry)); + if (d_really_is_positive(dentry)) { d_drop(dentry); err = -ENOENT; } else { @@ -619,7 +627,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(err); /* can we conclude ENOENT locally? */ - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); @@ -629,6 +637,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && + ceph_test_mount_opt(fsc, DCACHE) && __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); @@ -725,7 +734,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, ceph_mdsc_put_request(req); out: if (!err) - ceph_init_inode_acls(dentry->d_inode, &acls); + ceph_init_inode_acls(d_inode(dentry), &acls); else d_drop(dentry); ceph_release_acls_info(&acls); @@ -755,10 +764,15 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, err = PTR_ERR(req); goto out; } - req->r_dentry = dget(dentry); - req->r_num_caps = 2; req->r_path2 = kstrdup(dest, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + ceph_mdsc_put_request(req); + goto out; + } req->r_locked_dir = dir; + req->r_dentry = dget(dentry); + req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); @@ -821,7 +835,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ceph_mdsc_put_request(req); out: if (!err) - ceph_init_inode_acls(dentry->d_inode, &acls); + ceph_init_inode_acls(d_inode(dentry), &acls); else d_drop(dentry); ceph_release_acls_info(&acls); @@ -858,8 +872,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, if (err) { d_drop(dentry); } else if (!req->r_reply_info.head->is_dentry) { - ihold(old_dentry->d_inode); - d_instantiate(dentry, old_dentry->d_inode); + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); } ceph_mdsc_put_request(req); return err; @@ -892,7 +906,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; int err = -EROFS; int op; @@ -933,16 +947,20 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + int op = CEPH_MDS_OP_RENAME; int err; if (ceph_snap(old_dir) != ceph_snap(new_dir)) return -EXDEV; - if (ceph_snap(old_dir) != CEPH_NOSNAP || - ceph_snap(new_dir) != CEPH_NOSNAP) - return -EROFS; + if (ceph_snap(old_dir) != CEPH_NOSNAP) { + if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; + else + return -EROFS; + } dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); ihold(old_dir); @@ -957,8 +975,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_RDCACHE on source inode (mds will lock it) */ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; - if (new_dentry->d_inode) - req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); + if (d_really_is_positive(new_dentry)) + req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry)); err = ceph_mdsc_do_request(mdsc, old_dir, req); if (!err && !req->r_reply_info.head->is_dentry) { /* @@ -1024,7 +1042,7 @@ static int dentry_lease_is_valid(struct dentry *dentry) if (di->lease_renew_after && time_after(jiffies, di->lease_renew_after)) { /* we should renew */ - dir = dentry->d_parent->d_inode; + dir = d_inode(dentry->d_parent); session = ceph_get_mds_session(s); seq = di->lease_seq; di->lease_renew_after = 0; @@ -1074,22 +1092,22 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, - dentry, dentry->d_inode, ceph_dentry(dentry)->offset); + dentry, d_inode(dentry), ceph_dentry(dentry)->offset); dir = ceph_get_dentry_parent_inode(dentry); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, - dentry, dentry->d_inode); + dentry, d_inode(dentry)); valid = 1; - } else if (dentry->d_inode && - ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { + } else if (d_really_is_positive(dentry) && + ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { valid = 1; } else if (dentry_lease_is_valid(dentry) || dir_lease_is_valid(dir, dentry)) { - if (dentry->d_inode) - valid = ceph_is_any_caps(dentry->d_inode); + if (d_really_is_positive(dentry)) + valid = ceph_is_any_caps(d_inode(dentry)); else valid = 1; } @@ -1151,7 +1169,7 @@ static void ceph_d_prune(struct dentry *dentry) * we hold d_lock, so d_parent is stable, and d_fsdata is never * cleared until d_release */ - ceph_dir_clear_complete(dentry->d_parent->d_inode); + ceph_dir_clear_complete(d_inode(dentry->d_parent)); } /* @@ -1240,11 +1258,12 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, dout("dir_fsync %p wait on tid %llu (until %llu)\n", inode, req->r_tid, last_tid); if (req->r_timeout) { - ret = wait_for_completion_timeout( - &req->r_safe_completion, req->r_timeout); - if (ret > 0) + unsigned long time_left = wait_for_completion_timeout( + &req->r_safe_completion, + req->r_timeout); + if (time_left > 0) ret = 0; - else if (ret == 0) + else ret = -EIO; /* timed out */ } else { wait_for_completion(&req->r_safe_completion); @@ -1372,6 +1391,7 @@ const struct inode_operations ceph_snapdir_iops = { .getattr = ceph_getattr, .mkdir = ceph_mkdir, .rmdir = ceph_unlink, + .rename = ceph_rename, }; const struct dentry_operations ceph_dentry_ops = { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 8d7d782f4382..fe02ae7f056a 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -136,8 +136,8 @@ static struct dentry *__get_parent(struct super_block *sb, return ERR_CAST(req); if (child) { - req->r_inode = child->d_inode; - ihold(child->d_inode); + req->r_inode = d_inode(child); + ihold(d_inode(child)); } else { req->r_ino1 = (struct ceph_vino) { .ino = ino, @@ -164,7 +164,7 @@ static struct dentry *__get_parent(struct super_block *sb, return ERR_PTR(err); } dout("__get_parent ino %llx parent %p ino %llx.%llx\n", - child ? ceph_ino(child->d_inode) : ino, + child ? ceph_ino(d_inode(child)) : ino, dentry, ceph_vinop(inode)); return dentry; } @@ -172,11 +172,11 @@ static struct dentry *__get_parent(struct super_block *sb, static struct dentry *ceph_get_parent(struct dentry *child) { /* don't re-export snaps */ - if (ceph_snap(child->d_inode) != CEPH_NOSNAP) + if (ceph_snap(d_inode(child)) != CEPH_NOSNAP) return ERR_PTR(-EINVAL); dout("get_parent %p ino %llx.%llx\n", - child, ceph_vinop(child->d_inode)); + child, ceph_vinop(d_inode(child))); return __get_parent(child->d_sb, child, 0); } @@ -209,32 +209,32 @@ static int ceph_get_name(struct dentry *parent, char *name, struct ceph_mds_request *req; int err; - mdsc = ceph_inode_to_client(child->d_inode)->mdsc; + mdsc = ceph_inode_to_client(d_inode(child))->mdsc; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, USE_ANY_MDS); if (IS_ERR(req)) return PTR_ERR(req); - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); - req->r_inode = child->d_inode; - ihold(child->d_inode); - req->r_ino2 = ceph_vino(parent->d_inode); - req->r_locked_dir = parent->d_inode; + req->r_inode = d_inode(child); + ihold(d_inode(child)); + req->r_ino2 = ceph_vino(d_inode(parent)); + req->r_locked_dir = d_inode(parent); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); if (!err) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; memcpy(name, rinfo->dname, rinfo->dname_len); name[rinfo->dname_len] = 0; dout("get_name %p ino %llx.%llx name %s\n", - child, ceph_vinop(child->d_inode), name); + child, ceph_vinop(d_inode(child)), name); } else { dout("get_name %p ino %llx.%llx err %d\n", - child, ceph_vinop(child->d_inode), err); + child, ceph_vinop(d_inode(child)), err); } ceph_mdsc_put_request(req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b9b8eb225f66..3b6b522b4b31 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -291,14 +291,14 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } if (err) goto out_req; - if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) { + if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { /* make vfs retry on splice, ENOENT, or symlink */ dout("atomic_open finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { - ceph_init_inode_acls(dentry->d_inode, &acls); + ceph_init_inode_acls(d_inode(dentry), &acls); *opened |= FILE_CREATED; } err = finish_open(file, dentry, ceph_open, opened); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 119c43c80638..e876e1944519 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -940,7 +940,7 @@ static void update_dentry_lease(struct dentry *dentry, dentry, duration, ttl); /* make lease_rdcache_gen match directory */ - dir = dentry->d_parent->d_inode; + dir = d_inode(dentry->d_parent); di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; if (duration == 0) @@ -980,7 +980,7 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, { struct dentry *realdn; - BUG_ON(dn->d_inode); + BUG_ON(d_inode(dn)); /* dn must be unhashed */ if (!d_unhashed(dn)) @@ -998,13 +998,13 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, "inode %p ino %llx.%llx\n", dn, d_count(dn), realdn, d_count(realdn), - realdn->d_inode, ceph_vinop(realdn->d_inode)); + d_inode(realdn), ceph_vinop(d_inode(realdn))); dput(dn); dn = realdn; } else { BUG_ON(!ceph_dentry(dn)); dout("dn %p attached to %p ino %llx.%llx\n", - dn, dn->d_inode, ceph_vinop(dn->d_inode)); + dn, d_inode(dn), ceph_vinop(d_inode(dn))); } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); @@ -1125,11 +1125,11 @@ retry_lookup: dput(parent); goto done; } - } else if (dn->d_inode && - (ceph_ino(dn->d_inode) != vino.ino || - ceph_snap(dn->d_inode) != vino.snap)) { + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != vino.ino || + ceph_snap(d_inode(dn)) != vino.snap)) { dout(" dn %p points to wrong inode %p\n", - dn, dn->d_inode); + dn, d_inode(dn)); d_delete(dn); dput(dn); goto retry_lookup; @@ -1183,7 +1183,7 @@ retry_lookup: BUG_ON(!dn); BUG_ON(!dir); - BUG_ON(dn->d_parent->d_inode != dir); + BUG_ON(d_inode(dn->d_parent) != dir); BUG_ON(ceph_ino(dir) != le64_to_cpu(rinfo->diri.in->ino)); BUG_ON(ceph_snap(dir) != @@ -1235,7 +1235,7 @@ retry_lookup: /* null dentry? */ if (!rinfo->head->is_target) { dout("fill_trace null dentry\n"); - if (dn->d_inode) { + if (d_really_is_positive(dn)) { ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); @@ -1252,7 +1252,7 @@ retry_lookup: } /* attach proper inode */ - if (!dn->d_inode) { + if (d_really_is_negative(dn)) { ceph_dir_clear_ordered(dir); ihold(in); dn = splice_dentry(dn, in, &have_lease); @@ -1261,9 +1261,9 @@ retry_lookup: goto done; } req->r_dentry = dn; /* may have spliced */ - } else if (dn->d_inode && dn->d_inode != in) { + } else if (d_really_is_positive(dn) && d_inode(dn) != in) { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, dn->d_inode, ceph_vinop(dn->d_inode), + dn, d_inode(dn), ceph_vinop(d_inode(dn)), ceph_vinop(in)); have_lease = false; } @@ -1363,7 +1363,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, return readdir_prepopulate_inodes_only(req, session); if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { - snapdir = ceph_get_snapdir(parent->d_inode); + snapdir = ceph_get_snapdir(d_inode(parent)); parent = d_find_alias(snapdir); dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", rinfo->dir_nr, parent); @@ -1371,7 +1371,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, dout("readdir_prepopulate %d items under dn %p\n", rinfo->dir_nr, parent); if (rinfo->dir_dir) - ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); + ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); } /* FIXME: release caps/leases if error occurs */ @@ -1405,11 +1405,11 @@ retry_lookup: err = ret; goto out; } - } else if (dn->d_inode && - (ceph_ino(dn->d_inode) != vino.ino || - ceph_snap(dn->d_inode) != vino.snap)) { + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != vino.ino || + ceph_snap(d_inode(dn)) != vino.snap)) { dout(" dn %p points to wrong inode %p\n", - dn, dn->d_inode); + dn, d_inode(dn)); d_delete(dn); dput(dn); goto retry_lookup; @@ -1423,8 +1423,8 @@ retry_lookup: } /* inode */ - if (dn->d_inode) { - in = dn->d_inode; + if (d_really_is_positive(dn)) { + in = d_inode(dn); } else { in = ceph_get_inode(parent->d_sb, vino); if (IS_ERR(in)) { @@ -1440,13 +1440,13 @@ retry_lookup: req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); - if (!dn->d_inode) + if (d_really_is_negative(dn)) iput(in); d_drop(dn); goto next_item; } - if (!dn->d_inode) { + if (d_really_is_negative(dn)) { struct dentry *realdn = splice_dentry(dn, in, NULL); if (IS_ERR(realdn)) { err = PTR_ERR(realdn); @@ -1693,7 +1693,7 @@ retry: */ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ceph_inode_info *ci = ceph_inode(dentry->d_inode); + struct ceph_inode_info *ci = ceph_inode(d_inode(dentry)); nd_set_link(nd, ci->i_symlink); return NULL; } @@ -1714,7 +1714,7 @@ static const struct inode_operations ceph_symlink_iops = { */ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; @@ -1990,7 +1990,7 @@ int ceph_permission(struct inode *inode, int mask) int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); int err; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 71c073f38e54..84f37f34f9aa 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -679,7 +679,7 @@ static struct dentry *get_nonsnap_parent(struct dentry *dentry) * except to resplice to another snapdir, and either the old or new * result is a valid result. */ - while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) dentry = dentry->d_parent; return dentry; } @@ -716,20 +716,20 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent = req->r_dentry->d_parent; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! */ - inode = req->r_dentry->d_inode; + inode = d_inode(req->r_dentry); } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ struct dentry *dn = get_nonsnap_parent(parent); - inode = dn->d_inode; + inode = d_inode(dn); dout("__choose_mds using nonsnap parent %p\n", inode); } else { /* dentry target */ - inode = req->r_dentry->d_inode; + inode = d_inode(req->r_dentry); if (!inode || mode == USE_AUTH_MDS) { /* dir + name */ inode = dir; @@ -1021,6 +1021,33 @@ static void cleanup_cap_releases(struct ceph_mds_session *session) spin_unlock(&session->s_cap_lock); } +static void cleanup_session_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req; + struct rb_node *p; + + dout("cleanup_session_requests mds%d\n", session->s_mds); + mutex_lock(&mdsc->mutex); + while (!list_empty(&session->s_unsafe)) { + req = list_first_entry(&session->s_unsafe, + struct ceph_mds_request, r_unsafe_item); + list_del_init(&req->r_unsafe_item); + pr_info(" dropping unsafe request %llu\n", req->r_tid); + __unregister_request(mdsc, req); + } + /* zero r_attempts, so kick_requests() will re-send requests */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_session && + req->r_session->s_mds == session->s_mds) + req->r_attempts = 0; + } + mutex_unlock(&mdsc->mutex); +} + /* * Helper to safely iterate over all caps associated with a session, with * special care taken to handle a racing __ceph_remove_cap(). @@ -1098,7 +1125,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); __ceph_remove_cap(cap, false); - if (!__ceph_is_any_real_caps(ci)) { + if (!ci->i_auth_cap) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -1120,13 +1147,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, mdsc->num_cap_flushing--; drop = 1; } - if (drop && ci->i_wrbuffer_ref) { - pr_info(" dropping dirty data for %p %lld\n", - inode, ceph_ino(inode)); - ci->i_wrbuffer_ref = 0; - ci->i_wrbuffer_ref_head = 0; - drop++; - } spin_unlock(&mdsc->cap_dirty_lock); } spin_unlock(&ci->i_ceph_lock); @@ -1712,7 +1732,7 @@ retry: seq = read_seqbegin(&rename_lock); rcu_read_lock(); for (temp = dentry; !IS_ROOT(temp);) { - struct inode *inode = temp->d_inode; + struct inode *inode = d_inode(temp); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) len++; /* slash only */ else if (stop_on_nosnap && inode && @@ -1736,7 +1756,7 @@ retry: struct inode *inode; spin_lock(&temp->d_lock); - inode = temp->d_inode; + inode = d_inode(temp); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { dout("build_path path+%d: %p SNAPDIR\n", pos, temp); @@ -1770,7 +1790,7 @@ retry: goto retry; } - *base = ceph_ino(temp->d_inode); + *base = ceph_ino(d_inode(temp)); *plen = len; dout("build_path on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), *base, len, path); @@ -1783,8 +1803,8 @@ static int build_dentry_path(struct dentry *dentry, { char *path; - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) { - *pino = ceph_ino(dentry->d_parent->d_inode); + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { + *pino = ceph_ino(d_inode(dentry->d_parent)); *ppath = dentry->d_name.name; *ppathlen = dentry->d_name.len; return 0; @@ -1853,7 +1873,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, */ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, - int mds) + int mds, bool drop_cap_releases) { struct ceph_msg *msg; struct ceph_mds_request_head *head; @@ -1925,7 +1945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, releases = 0; if (req->r_inode_drop) releases += ceph_encode_inode_release(&p, - req->r_inode ? req->r_inode : req->r_dentry->d_inode, + req->r_inode ? req->r_inode : d_inode(req->r_dentry), mds, req->r_inode_drop, req->r_inode_unless, 0); if (req->r_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_dentry, @@ -1935,8 +1955,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_old_dentry_drop, req->r_old_dentry_unless); if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, - req->r_old_dentry->d_inode, + d_inode(req->r_old_dentry), mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); + + if (drop_cap_releases) { + releases = 0; + p = msg->front.iov_base + req->r_request_release_offset; + } + head->num_releases = cpu_to_le16(releases); /* time stamp */ @@ -1989,7 +2015,7 @@ static void complete_request(struct ceph_mds_client *mdsc, */ static int __prepare_send_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, - int mds) + int mds, bool drop_cap_releases) { struct ceph_mds_request_head *rhead; struct ceph_msg *msg; @@ -2048,7 +2074,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, ceph_msg_put(req->r_request); req->r_request = NULL; } - msg = create_request_message(mdsc, req, mds); + msg = create_request_message(mdsc, req, mds, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); complete_request(mdsc, req); @@ -2132,7 +2158,7 @@ static int __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; - err = __prepare_send_request(mdsc, req, mds); + err = __prepare_send_request(mdsc, req, mds, false); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2590,6 +2616,7 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_CLOSE: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect denied\n", session->s_mds); + cleanup_session_requests(mdsc, session); remove_session_caps(session); wake = 2; /* for good measure */ wake_up_all(&mdsc->session_close_wq); @@ -2658,7 +2685,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { - err = __prepare_send_request(mdsc, req, session->s_mds); + err = __prepare_send_request(mdsc, req, session->s_mds, true); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2679,7 +2706,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, continue; /* only old requests */ if (req->r_session && req->r_session->s_mds == session->s_mds) { - err = __prepare_send_request(mdsc, req, session->s_mds); + err = __prepare_send_request(mdsc, req, + session->s_mds, true); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2864,7 +2892,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, spin_unlock(&session->s_cap_lock); /* trim unused caps to reduce MDS's cache rejoin time */ - shrink_dcache_parent(mdsc->fsc->sb->s_root); + if (mdsc->fsc->sb->s_root) + shrink_dcache_parent(mdsc->fsc->sb->s_root); ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, @@ -3133,7 +3162,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, di->lease_renew_from && di->lease_renew_after == 0) { unsigned long duration = - le32_to_cpu(h->duration_ms) * HZ / 1000; + msecs_to_jiffies(le32_to_cpu(h->duration_ms)); di->lease_seq = seq; dentry->d_time = di->lease_renew_from + duration; diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 51cc23e48111..89e6bc321df3 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -75,6 +75,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LSSNAP: return "lssnap"; case CEPH_MDS_OP_MKSNAP: return "mksnap"; case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_RENAMESNAP: return "renamesnap"; case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a63997b8bcff..4e9905374078 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -44,7 +44,7 @@ static void ceph_put_super(struct super_block *s) static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry)); struct ceph_monmap *monmap = fsc->client->monc.monmap; struct ceph_statfs st; u64 fsid; @@ -345,6 +345,11 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->rsize = CEPH_RSIZE_DEFAULT; fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) { + err = -ENOMEM; + goto out; + } + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; @@ -406,31 +411,20 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) { struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); struct ceph_mount_options *fsopt = fsc->mount_options; - struct ceph_options *opt = fsc->client->options; - - if (opt->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsid=%pU", &opt->fsid); - if (opt->flags & CEPH_OPT_NOSHARE) - seq_puts(m, ",noshare"); - if (opt->flags & CEPH_OPT_NOCRC) - seq_puts(m, ",nocrc"); - if (opt->flags & CEPH_OPT_NOMSGAUTH) - seq_puts(m, ",nocephx_require_signatures"); - if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) - seq_puts(m, ",notcp_nodelay"); - - if (opt->name) - seq_printf(m, ",name=%s", opt->name); - if (opt->key) - seq_puts(m, ",secret=<hidden>"); - - if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); - if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); - if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) - seq_printf(m, ",osdkeepalivetimeout=%d", - opt->osd_keepalive_timeout); + size_t pos; + int ret; + + /* a comma between MNT/MS and client options */ + seq_putc(m, ','); + pos = m->count; + + ret = ceph_print_client_options(m, fsc->client); + if (ret) + return ret; + + /* retract our comma if no client options */ + if (m->count == pos) + m->count--; if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) seq_puts(m, ",dirstat"); @@ -438,14 +432,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",norbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); - if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) - seq_puts(m, ",dcache"); - else + if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) seq_puts(m, ",nodcache"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) seq_puts(m, ",fsc"); - else - seq_puts(m, ",nofsc"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (fsopt->sb_flags & MS_POSIXACL) @@ -477,6 +467,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + return 0; } @@ -730,6 +721,11 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, if (IS_ERR(req)) return ERR_CAST(req); req->r_path1 = kstrdup(path, GFP_NOFS); + if (!req->r_path1) { + root = ERR_PTR(-ENOMEM); + goto out; + } + req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; @@ -976,7 +972,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, - res->d_inode, ceph_vinop(res->d_inode)); + d_inode(res), ceph_vinop(d_inode(res))); return res; out_splat: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 04c8124ed30e..fa20e1318939 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,7 +36,8 @@ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ -#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ + CEPH_MOUNT_OPT_DCACHE) #define ceph_set_mount_opt(fsc, opt) \ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; @@ -881,7 +882,6 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; -extern const struct address_space_operations ceph_aops; extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5a492caf34cb..cd7ffad4041d 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -776,12 +776,12 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_getxattr(dentry, name, value, size); - return __ceph_getxattr(dentry->d_inode, name, value, size); + return __ceph_getxattr(d_inode(dentry), name, value, size); } ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); u32 vir_namelen = 0; @@ -847,7 +847,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; @@ -877,16 +877,23 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = PTR_ERR(req); goto out; } - req->r_inode = inode; - ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; - req->r_num_caps = 1; + req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + err = -ENOMEM; + goto out; + } req->r_pagelist = pagelist; pagelist = NULL; + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); @@ -901,7 +908,7 @@ out: int __ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); int issued; @@ -995,7 +1002,7 @@ out: int ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) return -EROFS; if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) @@ -1011,7 +1018,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; int err; @@ -1019,12 +1026,14 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) + return -ENOMEM; + req->r_inode = inode; ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; - req->r_path2 = kstrdup(name, GFP_NOFS); - + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; @@ -1032,7 +1041,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) int __ceph_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); int issued; @@ -1098,7 +1107,7 @@ out: int ceph_removexattr(struct dentry *dentry, const char *name) { - if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) return -EROFS; if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index b8602f199815..430e0348c99e 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -301,7 +301,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) if (full_path == NULL) goto cdda_exit; - cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); + cifs_sb = CIFS_SB(d_inode(mntpt)->i_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { mnt = ERR_CAST(tlink); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index eaab4b2a0595..f5089bde3635 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -607,7 +607,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) p = s = full_path; do { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct dentry *child; if (!dir) { diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index fa13d5e79f64..84650a51c7c4 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1898,7 +1898,7 @@ static void cifs_writev_requeue(struct cifs_writedata *wdata) { int i, rc = 0; - struct inode *inode = wdata->cfile->dentry->d_inode; + struct inode *inode = d_inode(wdata->cfile->dentry); struct TCP_Server_Info *server; unsigned int rest_len; @@ -1981,7 +1981,7 @@ cifs_writev_complete(struct work_struct *work) { struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); - struct inode *inode = wdata->cfile->dentry->d_inode; + struct inode *inode = d_inode(wdata->cfile->dentry); int i = 0; if (wdata->result == 0) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index b72bc29cba23..338d56936f6a 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -745,13 +745,13 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, goto lookup_out; } - if (direntry->d_inode != NULL) { + if (d_really_is_positive(direntry)) { cifs_dbg(FYI, "non-NULL inode in lookup\n"); } else { cifs_dbg(FYI, "NULL inode in lookup\n"); } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", - full_path, direntry->d_inode); + full_path, d_inode(direntry)); if (pTcon->unix_ext) { rc = cifs_get_inode_info_unix(&newInode, full_path, @@ -792,7 +792,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (direntry->d_inode) { + if (d_really_is_positive(direntry)) { if (cifs_revalidate_dentry(direntry)) return 0; else { @@ -803,7 +803,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) * attributes will have been updated by * cifs_revalidate_dentry(). */ - if (IS_AUTOMOUNT(direntry->d_inode) && + if (IS_AUTOMOUNT(d_inode(direntry)) && !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) { spin_lock(&direntry->d_lock); direntry->d_flags |= DCACHE_NEED_AUTOMOUNT; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ca2bc5406306..cafbf10521d5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -273,7 +273,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifsFileInfo *cfile; struct cifs_fid_locks *fdlocks; @@ -357,7 +357,7 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { - struct inode *inode = cifs_file->dentry->d_inode; + struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); @@ -386,7 +386,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cifs_dbg(FYI, "closing last open instance for inode %p\n", - cifs_file->dentry->d_inode); + d_inode(cifs_file->dentry)); /* * In strict cache mode we need invalidate mapping on the last * close because it may cause a error when we open this file @@ -572,7 +572,7 @@ static int cifs_relock_file(struct cifsFileInfo *cfile) { struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; @@ -620,7 +620,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) return rc; } - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifs_sb = CIFS_SB(inode->i_sb); tcon = tlink_tcon(cfile->tlink); server = tcon->ses->server; @@ -874,7 +874,7 @@ cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, { bool rc = false; struct cifs_fid_locks *cur; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); list_for_each_entry(cur, &cinode->llist, llist) { rc = cifs_find_fid_lock_conflict(cur, offset, length, type, @@ -899,7 +899,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, { int rc = 0; struct cifsLockInfo *conf_lock; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; bool exist; @@ -927,7 +927,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, static void cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); down_write(&cinode->lock_sem); list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); @@ -944,7 +944,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock, bool wait) { struct cifsLockInfo *conf_lock; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); bool exist; int rc = 0; @@ -1125,7 +1125,7 @@ struct lock_to_push { static int cifs_push_posix_locks(struct cifsFileInfo *cfile) { - struct inode *inode = cfile->dentry->d_inode; + struct inode *inode = d_inode(cfile->dentry); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; struct file_lock_context *flctx = inode->i_flctx; @@ -1214,7 +1214,7 @@ static int cifs_push_locks(struct cifsFileInfo *cfile) { struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; @@ -1382,7 +1382,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int max_num, num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; __u64 length = 1 + flock->fl_end - flock->fl_start; struct list_head tmp_llist; @@ -1488,7 +1488,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - struct inode *inode = cfile->dentry->d_inode; + struct inode *inode = d_inode(cfile->dentry); if (posix_lck) { int posix_lock_type; @@ -1643,7 +1643,7 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, struct TCP_Server_Info *server; unsigned int xid; struct dentry *dentry = open_file->dentry; - struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry)); struct cifs_io_parms io_parms; cifs_sb = CIFS_SB(dentry->d_sb); @@ -1676,7 +1676,7 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, break; } - len = min(server->ops->wp_retry_size(dentry->d_inode), + len = min(server->ops->wp_retry_size(d_inode(dentry)), (unsigned int)write_size - total_written); /* iov[0] is reserved for smb header */ iov[1].iov_base = (char *)write_data + total_written; @@ -1696,9 +1696,9 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, return rc; } } else { - spin_lock(&dentry->d_inode->i_lock); + spin_lock(&d_inode(dentry)->i_lock); cifs_update_eof(cifsi, *offset, bytes_written); - spin_unlock(&dentry->d_inode->i_lock); + spin_unlock(&d_inode(dentry)->i_lock); *offset += bytes_written; } } @@ -1706,12 +1706,12 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, cifs_stats_bytes_written(tcon, total_written); if (total_written > 0) { - spin_lock(&dentry->d_inode->i_lock); - if (*offset > dentry->d_inode->i_size) - i_size_write(dentry->d_inode, *offset); - spin_unlock(&dentry->d_inode->i_lock); + spin_lock(&d_inode(dentry)->i_lock); + if (*offset > d_inode(dentry)->i_size) + i_size_write(d_inode(dentry), *offset); + spin_unlock(&d_inode(dentry)->i_lock); } - mark_inode_dirty_sync(dentry->d_inode); + mark_inode_dirty_sync(d_inode(dentry)); free_xid(xid); return total_written; } @@ -2406,7 +2406,7 @@ cifs_uncached_writev_complete(struct work_struct *work) { struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); - struct inode *inode = wdata->cfile->dentry->d_inode; + struct inode *inode = d_inode(wdata->cfile->dentry); struct cifsInodeInfo *cifsi = CIFS_I(inode); spin_lock(&inode->i_lock); @@ -3794,7 +3794,7 @@ void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, oplock_break); - struct inode *inode = cfile->dentry->d_inode; + struct inode *inode = d_inode(cfile->dentry); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 3e126d7bb2ea..55b58112d122 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1067,7 +1067,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, int rc; struct cifs_fid fid; struct cifs_open_parms oparms; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; @@ -1196,7 +1196,7 @@ cifs_drop_nlink(struct inode *inode) } /* - * If dentry->d_inode is null (usually meaning the cached dentry + * If d_inode(dentry) is null (usually meaning the cached dentry * is a negative dentry) then we would attempt a standard SMB delete, but * if that fails we can not attempt the fall back mechanisms on EACCESS * but will return the EACCESS to the caller. Note that the VFS does not call @@ -1207,7 +1207,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) int rc = 0; unsigned int xid; char *full_path = NULL; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_inode; struct super_block *sb = dir->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -1551,13 +1551,13 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifs_put_tlink(tlink); if (!rc) { - spin_lock(&direntry->d_inode->i_lock); - i_size_write(direntry->d_inode, 0); - clear_nlink(direntry->d_inode); - spin_unlock(&direntry->d_inode->i_lock); + spin_lock(&d_inode(direntry)->i_lock); + i_size_write(d_inode(direntry), 0); + clear_nlink(d_inode(direntry)); + spin_unlock(&d_inode(direntry)->i_lock); } - cifsInode = CIFS_I(direntry->d_inode); + cifsInode = CIFS_I(d_inode(direntry)); /* force revalidate to go get info when needed */ cifsInode->time = 0; @@ -1568,7 +1568,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) */ cifsInode->time = 0; - direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = + d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); rmdir_exit: @@ -1727,7 +1727,7 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, unlink_target: /* Try unlinking the target dentry if it's not negative */ - if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { + if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { if (d_is_dir(target_dentry)) tmprc = cifs_rmdir(target_dir, target_dentry); else @@ -1867,7 +1867,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) { unsigned int xid; int rc = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = dentry->d_sb; char *full_path = NULL; @@ -1919,7 +1919,7 @@ int cifs_revalidate_file(struct file *filp) int cifs_revalidate_dentry(struct dentry *dentry) { int rc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); rc = cifs_revalidate_dentry_attr(dentry); if (rc) @@ -1933,7 +1933,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, { struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; /* @@ -2110,7 +2110,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) int rc; unsigned int xid; char *full_path = NULL; - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; @@ -2251,7 +2251,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) unsigned int xid; kuid_t uid = INVALID_UID; kgid_t gid = INVALID_GID; - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); char *full_path = NULL; @@ -2409,7 +2409,7 @@ cifs_setattr_exit: int cifs_setattr(struct dentry *direntry, struct iattr *attrs) { - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 2ec6037f61c7..252e672d5604 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -586,12 +586,12 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, * if source file is cached (oplocked) revalidate will not go to server * until the file is closed or oplock broken so update nlinks locally */ - if (old_file->d_inode) { - cifsInode = CIFS_I(old_file->d_inode); + if (d_really_is_positive(old_file)) { + cifsInode = CIFS_I(d_inode(old_file)); if (rc == 0) { - spin_lock(&old_file->d_inode->i_lock); - inc_nlink(old_file->d_inode); - spin_unlock(&old_file->d_inode->i_lock); + spin_lock(&d_inode(old_file)->i_lock); + inc_nlink(d_inode(old_file)); + spin_unlock(&d_inode(old_file)->i_lock); /* * parent dir timestamps will update from srv within a @@ -629,7 +629,7 @@ cifs_hl_exit: void * cifs_follow_link(struct dentry *direntry, struct nameidata *nd) { - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); int rc = -ENOMEM; unsigned int xid; char *full_path = NULL; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 337946355b29..8442b8b8e0be 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -473,7 +473,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) continue; cifs_dbg(FYI, "file id match, oplock break\n"); - pCifsInode = CIFS_I(netfile->dentry->d_inode); + pCifsInode = CIFS_I(d_inode(netfile->dentry)); set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &pCifsInode->flags); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index c295338e0a98..b4a47237486b 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -78,7 +78,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, { struct dentry *dentry, *alias; struct inode *inode; - struct super_block *sb = parent->d_inode->i_sb; + struct super_block *sb = d_inode(parent)->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); @@ -88,7 +88,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, return; if (dentry) { - inode = dentry->d_inode; + inode = d_inode(dentry); if (inode) { /* * If we're generating inode numbers, then we don't diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index d2979036a4c7..7bfdd6066276 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -722,7 +722,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, static void cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); cfile->fid.netfid = fid->netfid; cifs_set_oplock_level(cinode, oplock); cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 7198eac5dddd..2ab297dae5a7 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -95,7 +95,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int max_num, num = 0, max_buf; struct smb2_lock_element *buf, *cur; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; __u64 length = 1 + flock->fl_end - flock->fl_start; struct list_head tmp_llist; @@ -231,7 +231,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) unsigned int xid; unsigned int max_num, max_buf; struct smb2_lock_element *buf; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_fid_locks *fdlocks; xid = get_xid(); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 22dfdf17d065..1c5907019045 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -453,7 +453,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, list_for_each(tmp, &tcon->openFileList) { cfile = list_entry(tmp, struct cifsFileInfo, tlist); - cinode = CIFS_I(cfile->dentry->d_inode); + cinode = CIFS_I(d_inode(cfile->dentry)); if (memcmp(cinode->lease_key, rsp->LeaseKey, SMB2_LEASE_KEY_SIZE)) @@ -590,7 +590,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) continue; cifs_dbg(FYI, "file id match, oplock break\n"); - cinode = CIFS_I(cfile->dentry->d_inode); + cinode = CIFS_I(d_inode(cfile->dentry)); if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index eab05e1aa587..54daee5ad4c1 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -524,7 +524,7 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) static void smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; cfile->fid.persistent_fid = fid->persistent_fid; @@ -793,7 +793,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, * If extending file more than one page make sparse. Many Linux fs * make files sparse by default when extending via ftruncate */ - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); if (!set_alloc && (size > inode->i_size + 8192)) { __u8 set_sparse = 1; @@ -1032,7 +1032,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); /* if file not oplocked can't be sure whether asking to extend size */ @@ -1083,7 +1083,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); /* Need to make file sparse, if not already, before freeing range. */ @@ -1115,7 +1115,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); /* if file not oplocked can't be sure whether asking to extend size */ diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 72a4d10653d6..ff9e1f8b16a4 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -50,9 +50,9 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; @@ -111,9 +111,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; @@ -177,12 +177,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, memcpy(pacl, ea_value, value_size); if (pTcon->ses->server->ops->set_acl) rc = pTcon->ses->server->ops->set_acl(pacl, - value_size, direntry->d_inode, + value_size, d_inode(direntry), full_path, CIFS_ACL_DACL); else rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ - CIFS_I(direntry->d_inode)->time = 0; + CIFS_I(d_inode(direntry))->time = 0; kfree(pacl); } #else @@ -246,9 +246,9 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; @@ -324,7 +324,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, goto get_ea_exit; /* rc already EOPNOTSUPP */ pacl = pTcon->ses->server->ops->get_acl(cifs_sb, - direntry->d_inode, full_path, &acllen); + d_inode(direntry), full_path, &acllen); if (IS_ERR(pacl)) { rc = PTR_ERR(pacl); cifs_dbg(VFS, "%s: error %zd getting sec desc\n", @@ -382,9 +382,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 46ee6f238985..5bb630a769e0 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -94,8 +94,8 @@ static void coda_flag_children(struct dentry *parent, int flag) spin_lock(&parent->d_lock); list_for_each_entry(de, &parent->d_subdirs, d_child) { /* don't know what to do with negative dentries */ - if (de->d_inode ) - coda_flag_inode(de->d_inode, flag); + if (d_inode(de) ) + coda_flag_inode(d_inode(de), flag); } spin_unlock(&parent->d_lock); return; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 60cb88c1dd2b..fda9f4311212 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -201,7 +201,7 @@ err_out: static int coda_link(struct dentry *source_de, struct inode *dir_inode, struct dentry *de) { - struct inode *inode = source_de->d_inode; + struct inode *inode = d_inode(source_de); const char * name = de->d_name.name; int len = de->d_name.len; int error; @@ -266,7 +266,7 @@ static int coda_unlink(struct inode *dir, struct dentry *de) return error; coda_dir_update_mtime(dir); - drop_nlink(de->d_inode); + drop_nlink(d_inode(de)); return 0; } @@ -279,8 +279,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); if (!error) { /* VFS may delete the child */ - if (de->d_inode) - clear_nlink(de->d_inode); + if (d_really_is_positive(de)) + clear_nlink(d_inode(de)); /* fix the link count of the parent */ coda_dir_drop_nlink(dir); @@ -303,14 +303,14 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, coda_i2f(new_dir), old_length, new_length, (const char *) old_name, (const char *)new_name); if (!error) { - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { if (d_is_dir(new_dentry)) { coda_dir_drop_nlink(old_dir); coda_dir_inc_nlink(new_dir); } coda_dir_update_mtime(old_dir); coda_dir_update_mtime(new_dir); - coda_flag_inode(new_dentry->d_inode, C_VATTR); + coda_flag_inode(d_inode(new_dentry), C_VATTR); } else { coda_flag_inode(old_dir, C_VATTR); coda_flag_inode(new_dir, C_VATTR); @@ -449,13 +449,13 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = de->d_inode; + inode = d_inode(de); if (!inode || is_root_inode(inode)) goto out; if (is_bad_inode(inode)) goto bad; - cii = ITOC(de->d_inode); + cii = ITOC(d_inode(de)); if (!(cii->c_flags & (C_PURGE | C_FLUSH))) goto out; @@ -487,11 +487,11 @@ static int coda_dentry_delete(const struct dentry * dentry) { int flags; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return 0; - flags = (ITOC(dentry->d_inode)->c_flags) & C_PURGE; - if (is_bad_inode(dentry->d_inode) || flags) { + flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE; + if (is_bad_inode(d_inode(dentry)) || flags) { return 1; } return 0; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 82ec68b59208..cac1390b87a3 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -257,15 +257,15 @@ static void coda_evict_inode(struct inode *inode) int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - int err = coda_revalidate_inode(dentry->d_inode); + int err = coda_revalidate_inode(d_inode(dentry)); if (!err) - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return err; } int coda_setattr(struct dentry *de, struct iattr *iattr) { - struct inode *inode = de->d_inode; + struct inode *inode = d_inode(de); struct coda_vattr vattr; int error; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 4326d172fc27..f36a4040afb8 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -72,7 +72,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd, if (error) return error; - target_inode = path.dentry->d_inode; + target_inode = d_inode(path.dentry); /* return if it is not a Coda inode */ if (target_inode->i_sb != inode->i_sb) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 5bb6e27298a4..9b1ffaa0572e 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -820,8 +820,8 @@ int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out) case CODA_FLUSH: coda_cache_clear_all(sb); shrink_dcache_sb(sb); - if (sb->s_root->d_inode) - coda_flag_inode(sb->s_root->d_inode, C_FLUSH); + if (d_really_is_positive(sb->s_root)) + coda_flag_inode(d_inode(sb->s_root), C_FLUSH); break; case CODA_PURGEUSER: diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index acb3d63bc9dc..c81ce7f200a6 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -289,7 +289,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry) configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata); error = configfs_create(dentry, mode, init_dir); if (!error) { - inc_nlink(p->d_inode); + inc_nlink(d_inode(p)); item->ci_dentry = dentry; } else { struct configfs_dirent *sd = dentry->d_fsdata; @@ -375,8 +375,8 @@ static void remove_dir(struct dentry * d) list_del_init(&sd->s_sibling); spin_unlock(&configfs_dirent_lock); configfs_put(sd); - if (d->d_inode) - simple_rmdir(parent->d_inode,d); + if (d_really_is_positive(d)) + simple_rmdir(d_inode(parent),d); pr_debug(" o %pd removing done (%d)\n", d, d_count(d)); @@ -513,7 +513,7 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex /* Abort if racing with mkdir() */ if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { if (wait_mutex) - *wait_mutex = &sd->s_dentry->d_inode->i_mutex; + *wait_mutex = &d_inode(sd->s_dentry)->i_mutex; return -EAGAIN; } @@ -624,13 +624,13 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; - mutex_lock(&child->d_inode->i_mutex); + mutex_lock(&d_inode(child)->i_mutex); configfs_detach_group(sd->s_element); - child->d_inode->i_flags |= S_DEAD; + d_inode(child)->i_flags |= S_DEAD; dont_mount(child); - mutex_unlock(&child->d_inode->i_mutex); + mutex_unlock(&d_inode(child)->i_mutex); d_delete(child); dput(child); @@ -672,7 +672,7 @@ static int create_default_group(struct config_group *parent_group, sd = child->d_fsdata; sd->s_type |= CONFIGFS_USET_DEFAULT; } else { - BUG_ON(child->d_inode); + BUG_ON(d_inode(child)); d_drop(child); dput(child); } @@ -818,11 +818,11 @@ static int configfs_attach_item(struct config_item *parent_item, * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); configfs_remove_dir(item); - dentry->d_inode->i_flags |= S_DEAD; + d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); d_delete(dentry); } } @@ -858,16 +858,16 @@ static int configfs_attach_group(struct config_item *parent_item, * We must also lock the inode to remove it safely in case of * error, as rmdir() would. */ - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item)); if (ret) { configfs_detach_item(item); - dentry->d_inode->i_flags |= S_DEAD; + d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); if (ret) d_delete(dentry); } @@ -1075,7 +1075,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); root_sd = root->d_fsdata; @@ -1111,7 +1111,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, out_unlock_dirent_lock: spin_unlock(&configfs_dirent_lock); out_unlock_fs: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); /* * If we succeeded, the fs is pinned via other methods. If not, @@ -1453,11 +1453,11 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { - if (!new_dentry->d_inode) { + if (d_really_is_negative(new_dentry)) { error = config_item_set_name(item, "%s", new_name); if (!error) { d_add(new_dentry, NULL); @@ -1469,7 +1469,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); up_write(&configfs_rename_sem); return error; @@ -1482,7 +1482,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached @@ -1495,7 +1495,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) else err = 0; } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return err; } @@ -1505,11 +1505,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); release_configfs_dirent(cursor); @@ -1567,7 +1567,7 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx) spin_lock(&configfs_dirent_lock); dentry = next->s_dentry; if (dentry) - inode = dentry->d_inode; + inode = d_inode(dentry); if (inode) ino = inode->i_ino; spin_unlock(&configfs_dirent_lock); @@ -1590,7 +1590,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); switch (whence) { case 1: offset += file->f_pos; @@ -1598,7 +1598,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -1624,7 +1624,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return offset; } @@ -1654,7 +1654,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); @@ -1664,7 +1664,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) err = configfs_attach_group(sd->s_element, &group->cg_item, dentry); if (err) { - BUG_ON(dentry->d_inode); + BUG_ON(d_inode(dentry)); d_drop(dentry); dput(dentry); } else { @@ -1674,7 +1674,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) } } - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); if (err) { unlink_group(group); @@ -1695,9 +1695,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - mutex_lock_nested(&root->d_inode->i_mutex, + mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { @@ -1706,13 +1706,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) spin_unlock(&configfs_dirent_lock); mutex_unlock(&configfs_symlink_mutex); configfs_detach_group(&group->cg_item); - dentry->d_inode->i_flags |= S_DEAD; + d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); d_delete(dentry); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 56d2cdc9ae0a..403269ffcdf3 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -326,10 +326,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, CONFIGFS_ITEM_ATTR); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 5423a6a6ecc8..8d89f5fd0331 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -56,7 +56,7 @@ static const struct inode_operations configfs_inode_operations ={ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct configfs_dirent * sd = dentry->d_fsdata; struct iattr * sd_iattr; unsigned int ia_valid = iattr->ia_valid; @@ -186,7 +186,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in if (!dentry) return -ENOENT; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return -EEXIST; sd = dentry->d_fsdata; @@ -194,7 +194,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in if (!inode) return -ENOMEM; - p_inode = dentry->d_parent->d_inode; + p_inode = d_inode(dentry->d_parent); p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; configfs_set_inode_lock_class(sd, inode); @@ -236,11 +236,11 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dentry->d_lock); - if (!d_unhashed(dentry) && dentry->d_inode) { + if (!d_unhashed(dentry) && d_really_is_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - simple_unlink(parent->d_inode, dentry); + simple_unlink(d_inode(parent), dentry); } else spin_unlock(&dentry->d_lock); } @@ -251,11 +251,11 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) struct configfs_dirent * sd; struct configfs_dirent * parent_sd = dir->d_fsdata; - if (dir->d_inode == NULL) + if (d_really_is_negative(dir)) /* no inode means this hasn't been made visible yet */ return; - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) break; } } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index da94e41bdbf6..537356742091 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -173,5 +173,5 @@ MODULE_LICENSE("GPL"); MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); -module_init(configfs_init); +core_initcall(configfs_init); module_exit(configfs_exit); @@ -209,7 +209,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, } /* Protects against truncate */ - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); retval = dax_io(inode, iter, pos, end, get_block, &bh); @@ -219,7 +219,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); - inode_dio_done(inode); + inode_dio_end(inode); out: return retval; } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 517e64938438..830a7e76f5c6 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -45,7 +45,7 @@ const struct file_operations debugfs_file_operations = { static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, dentry->d_inode->i_private); + nd_set_link(nd, d_inode(dentry)->i_private); return NULL; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c9ee0dfe90b5..c1e7ffb0dab6 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -46,7 +46,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb) static inline int debugfs_positive(struct dentry *dentry) { - return dentry->d_inode && !d_unhashed(dentry); + return d_really_is_positive(dentry) && !d_unhashed(dentry); } struct debugfs_mount_opts { @@ -124,7 +124,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) static int debugfs_apply_options(struct super_block *sb) { struct debugfs_fs_info *fsi = sb->s_fs_info; - struct inode *inode = sb->s_root->d_inode; + struct inode *inode = d_inode(sb->s_root); struct debugfs_mount_opts *opts = &fsi->mount_opts; inode->i_mode &= ~S_IALLUGO; @@ -188,7 +188,7 @@ static struct vfsmount *debugfs_automount(struct path *path) { struct vfsmount *(*f)(void *); f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; - return f(path->dentry->d_inode->i_private); + return f(d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -270,20 +270,20 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry) && dentry->d_inode) { + if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); return dentry; } static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return NULL; @@ -291,7 +291,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); return dentry; } @@ -344,7 +344,7 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, inode->i_fop = fops ? fops : &debugfs_file_operations; inode->i_private = data; d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); + fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_file); @@ -384,7 +384,7 @@ struct dentry *debugfs_create_file_size(const char *name, umode_t mode, struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (de) - de->d_inode->i_size = file_size; + d_inode(de)->i_size = file_size; return de; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); @@ -426,8 +426,8 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + inc_nlink(d_inode(dentry->d_parent)); + fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); @@ -525,9 +525,9 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) if (debugfs_positive(dentry)) { dget(dentry); if (d_is_dir(dentry)) - ret = simple_rmdir(parent->d_inode, dentry); + ret = simple_rmdir(d_inode(parent), dentry); else - simple_unlink(parent->d_inode, dentry); + simple_unlink(d_inode(parent), dentry); if (!ret) d_delete(dentry); dput(dentry); @@ -557,12 +557,12 @@ void debugfs_remove(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || !parent->d_inode) + if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); ret = __debugfs_remove(dentry, parent); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -588,12 +588,12 @@ void debugfs_remove_recursive(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || !parent->d_inode) + if (!parent || d_really_is_negative(parent)) return; parent = dentry; down: - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -608,7 +608,7 @@ void debugfs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); parent = child; goto down; } @@ -629,10 +629,10 @@ void debugfs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); child = parent; parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); if (child != dentry) /* go up */ @@ -640,7 +640,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); @@ -672,27 +672,27 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ - if (!old_dir->d_inode || !new_dir->d_inode) + if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) goto exit; /* Source does not exist, cyclic rename, or mountpoint? */ - if (!old_dentry->d_inode || old_dentry == trap || + if (d_really_is_negative(old_dentry) || old_dentry == trap || d_mountpoint(old_dentry)) goto exit; dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); /* Lookup failed, cyclic rename or target exists? */ - if (IS_ERR(dentry) || dentry == trap || dentry->d_inode) + if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) goto exit; old_name = fsnotify_oldname_init(old_dentry->d_name.name); - error = simple_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, + error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), dentry); if (error) { fsnotify_oldname_free(old_name); goto exit; } d_move(old_dentry, dentry); - fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, + fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name, d_is_dir(old_dentry), NULL, old_dentry); fsnotify_oldname_free(old_name); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index cfe8466f7fef..add566303c68 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -253,7 +253,7 @@ static int mknod_ptmx(struct super_block *sb) if (!uid_valid(root_uid) || !gid_valid(root_gid)) return -EINVAL; - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { @@ -290,7 +290,7 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; out: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return rc; } @@ -298,7 +298,7 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) { struct inode *inode; if (fsi->ptmx_dentry) { - inode = fsi->ptmx_dentry->d_inode; + inode = d_inode(fsi->ptmx_dentry); inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } } @@ -602,18 +602,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, sprintf(s, "%d", index); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_alloc_name(root, s); if (dentry) { d_add(dentry, inode); - fsnotify_create(root->d_inode, dentry); + fsnotify_create(d_inode(root), dentry); } else { iput(inode); inode = ERR_PTR(-ENOMEM); } - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return inode; } @@ -658,7 +658,7 @@ void devpts_pty_kill(struct inode *inode) BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_find_alias(inode); @@ -667,7 +667,7 @@ void devpts_pty_kill(struct inode *inode) dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index c3b560b24a46..745d2342651a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -253,7 +253,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, if (dio->end_io && dio->result) dio->end_io(dio->iocb, offset, transferred, dio->private); - inode_dio_done(dio->inode); + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(dio->inode); + if (is_async) { if (dio->rw & WRITE) { int err; @@ -1195,7 +1197,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* * Will be decremented at I/O completion time. */ - atomic_inc(&inode->i_dio_count); + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); retval = 0; sdio.blkbits = blkbits; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d08e079ea5d3..754fd6c0b747 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -921,8 +921,8 @@ static int tcp_accept_from_sock(struct connection *con) mutex_unlock(&connections_lock); memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &newsock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &newsock); if (result < 0) return -ENOMEM; @@ -1173,8 +1173,8 @@ static void tcp_connect_to_sock(struct connection *con) goto out; /* Create a socket to communicate with */ - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (result < 0) goto out_err; @@ -1258,8 +1258,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, addr_len = sizeof(struct sockaddr_in6); /* Create a socket to communicate with */ - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (result < 0) { log_print("Can't create listening comms socket"); goto create_out; @@ -1365,8 +1365,8 @@ static int sctp_listen_for_all(void) log_print("Using SCTP for communications"); - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET, - IPPROTO_SCTP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_SEQPACKET, IPPROTO_SCTP, &sock); if (result < 0) { log_print("Can't create comms socket, check SCTP is loaded"); goto out; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 719e1ce1c609..97315f2f6816 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1326,7 +1326,7 @@ static int ecryptfs_read_headers_virt(char *page_virt, if (rc) goto out; if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) - ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); + ecryptfs_i_size_init(page_virt, d_inode(ecryptfs_dentry)); offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset), &bytes_read); @@ -1425,7 +1425,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) { int rc; char *page_virt; - struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; + struct inode *ecryptfs_inode = d_inode(ecryptfs_dentry); struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 4000f6b3a750..8db0b464483f 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -54,11 +54,11 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { struct inode *lower_inode = - ecryptfs_inode_to_lower(dentry->d_inode); + ecryptfs_inode_to_lower(d_inode(dentry)); - fsstack_copy_attr_all(dentry->d_inode, lower_inode); + fsstack_copy_attr_all(d_inode(dentry), lower_inode); } return rc; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index a65786e26b05..72afcc629d7b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -130,7 +130,7 @@ struct kmem_cache *ecryptfs_file_info_cache; static int read_or_initialize_metadata(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_crypt_stat *crypt_stat; int rc; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index b08b5187f662..fc850b55db67 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry) struct dentry *dir; dir = dget_parent(dentry); - mutex_lock_nested(&(dir->d_inode->i_mutex), I_MUTEX_PARENT); + mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); return dir; } static void unlock_dir(struct dentry *dir) { - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); } @@ -131,7 +131,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode, static int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, struct super_block *sb) { - struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb); + struct inode *inode = ecryptfs_get_inode(d_inode(lower_dentry), sb); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -189,21 +189,21 @@ ecryptfs_do_create(struct inode *directory_inode, lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true); + rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); inode = ERR_PTR(rc); goto out_lock; } - inode = __ecryptfs_get_inode(lower_dentry->d_inode, + inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(lower_dir_dentry->d_inode, lower_dentry, NULL); + vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL); goto out_lock; } - fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry)); out_lock: unlock_dir(lower_dir_dentry); return inode; @@ -332,7 +332,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry, struct inode *dir_inode) { - struct inode *inode, *lower_inode = lower_dentry->d_inode; + struct inode *inode, *lower_inode = d_inode(lower_dentry); struct ecryptfs_dentry_info *dentry_info; struct vfsmount *lower_mnt; int rc = 0; @@ -347,14 +347,14 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, } lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); + fsstack_copy_attr_atime(dir_inode, d_inode(lower_dentry->d_parent)); BUG_ON(!d_count(lower_dentry)); ecryptfs_set_dentry_private(dentry, dentry_info); dentry_info->lower_path.mnt = lower_mnt; dentry_info->lower_path.dentry = lower_dentry; - if (!lower_dentry->d_inode) { + if (d_really_is_negative(lower_dentry)) { /* We want to add because we couldn't find in lower */ d_add(dentry, NULL); return 0; @@ -400,11 +400,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -412,7 +412,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, ecryptfs_dentry); goto out; } - if (lower_dentry->d_inode) + if (d_really_is_positive(lower_dentry)) goto interpose; mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; @@ -429,11 +429,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out; } - mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -458,24 +458,24 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, u64 file_size_save; int rc; - file_size_save = i_size_read(old_dentry->d_inode); + file_size_save = i_size_read(d_inode(old_dentry)); lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); dget(lower_old_dentry); dget(lower_new_dentry); lower_dir_dentry = lock_parent(lower_new_dentry); - rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, + rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry), lower_new_dentry, NULL); - if (rc || !lower_new_dentry->d_inode) + if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - set_nlink(old_dentry->d_inode, - ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink); - i_size_write(new_dentry->d_inode, file_size_save); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + set_nlink(d_inode(old_dentry), + ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); + i_size_write(d_inode(new_dentry), file_size_save); out_lock: unlock_dir(lower_dir_dentry); dput(lower_new_dentry); @@ -485,7 +485,7 @@ out_lock: static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) { - return ecryptfs_do_unlink(dir, dentry, dentry->d_inode); + return ecryptfs_do_unlink(dir, dentry, d_inode(dentry)); } static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, @@ -510,20 +510,20 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, + rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry, encoded_symname); kfree(encoded_symname); - if (rc || !lower_dentry->d_inode) + if (rc || d_really_is_negative(lower_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); out_lock: unlock_dir(lower_dir_dentry); dput(lower_dentry); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) d_drop(dentry); return rc; } @@ -536,18 +536,18 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); - if (rc || !lower_dentry->d_inode) + rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode); + if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); out: unlock_dir(lower_dir_dentry); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) d_drop(dentry); return rc; } @@ -562,12 +562,12 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) dget(dentry); lower_dir_dentry = lock_parent(lower_dentry); dget(lower_dentry); - rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); + rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); dput(lower_dentry); - if (!rc && dentry->d_inode) - clear_nlink(dentry->d_inode); - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); + if (!rc && d_really_is_positive(dentry)) + clear_nlink(d_inode(dentry)); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); unlock_dir(lower_dir_dentry); if (!rc) d_drop(dentry); @@ -584,17 +584,17 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); - if (rc || !lower_dentry->d_inode) + rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev); + if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); out: unlock_dir(lower_dir_dentry); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) d_drop(dentry); return rc; } @@ -617,7 +617,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, dget(lower_new_dentry); lower_old_dir_dentry = dget_parent(lower_old_dentry); lower_new_dir_dentry = dget_parent(lower_new_dentry); - target_inode = new_dentry->d_inode; + target_inode = d_inode(new_dentry); trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); /* source should not be ancestor of target */ if (trap == lower_old_dentry) { @@ -629,17 +629,17 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = -ENOTEMPTY; goto out_lock; } - rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, - lower_new_dir_dentry->d_inode, lower_new_dentry, + rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry, + d_inode(lower_new_dir_dentry), lower_new_dentry, NULL, 0); if (rc) goto out_lock; if (target_inode) fsstack_copy_attr_all(target_inode, ecryptfs_inode_to_lower(target_inode)); - fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); + fsstack_copy_attr_all(new_dir, d_inode(lower_new_dir_dentry)); if (new_dir != old_dir) - fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); + fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); out_lock: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_new_dir_dentry); @@ -662,7 +662,7 @@ static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) return ERR_PTR(-ENOMEM); old_fs = get_fs(); set_fs(get_ds()); - rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, + rc = d_inode(lower_dentry)->i_op->readlink(lower_dentry, (char __user *)lower_buf, PATH_MAX); set_fs(old_fs); @@ -681,8 +681,8 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) char *buf = ecryptfs_readlink_lower(dentry, &len); if (IS_ERR(buf)) goto out; - fsstack_copy_attr_atime(dentry->d_inode, - ecryptfs_dentry_to_lower(dentry)->d_inode); + fsstack_copy_attr_atime(d_inode(dentry), + d_inode(ecryptfs_dentry_to_lower(dentry))); buf[len] = '\0'; out: nd_set_link(nd, buf); @@ -738,7 +738,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, struct iattr *lower_ia) { int rc = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ecryptfs_crypt_stat *crypt_stat; loff_t i_size = i_size_read(inode); loff_t lower_size_before_truncate; @@ -751,7 +751,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, rc = ecryptfs_get_lower_file(dentry, inode); if (rc) return rc; - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; /* Switch on growing or shrinking file */ if (ia->ia_size > i_size) { char zero[] = { 0x00 }; @@ -858,7 +858,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct iattr lower_ia = { .ia_valid = 0 }; int rc; - rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length); + rc = ecryptfs_inode_newsize_ok(d_inode(dentry), new_length); if (rc) return rc; @@ -866,9 +866,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - mutex_lock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); } return rc; } @@ -900,10 +900,10 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) struct inode *lower_inode; struct ecryptfs_crypt_stat *crypt_stat; - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) ecryptfs_init_crypt_stat(crypt_stat); - inode = dentry->d_inode; + inode = d_inode(dentry); lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); mutex_lock(&crypt_stat->cs_mutex); @@ -967,9 +967,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) lower_ia.ia_valid &= ~ATTR_MODE; - mutex_lock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: fsstack_copy_attr_all(inode, lower_inode); return rc; @@ -983,7 +983,7 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -1007,9 +1007,9 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat); if (!rc) { - fsstack_copy_attr_all(dentry->d_inode, - ecryptfs_inode_to_lower(dentry->d_inode)); - generic_fillattr(dentry->d_inode, stat); + fsstack_copy_attr_all(d_inode(dentry), + ecryptfs_inode_to_lower(d_inode(dentry))); + generic_fillattr(d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; @@ -1023,14 +1023,14 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->setxattr) { + if (!d_inode(lower_dentry)->i_op->setxattr) { rc = -EOPNOTSUPP; goto out; } rc = vfs_setxattr(lower_dentry, name, value, size, flags); - if (!rc && dentry->d_inode) - fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); + if (!rc && d_really_is_positive(dentry)) + fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); out: return rc; } @@ -1041,14 +1041,14 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, { int rc = 0; - if (!lower_dentry->d_inode->i_op->getxattr) { + if (!d_inode(lower_dentry)->i_op->getxattr) { rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value, + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, size); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: return rc; } @@ -1068,13 +1068,13 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->listxattr) { + if (!d_inode(lower_dentry)->i_op->listxattr) { rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: return rc; } @@ -1085,13 +1085,13 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->removexattr) { + if (!d_inode(lower_dentry)->i_op->removexattr) { rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: return rc; } diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index f1ea610362c6..866bb18efefe 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -144,7 +144,7 @@ int ecryptfs_privileged_open(struct file **lower_file, /* Corresponding dput() and mntput() are done when the * lower file is fput() when all eCryptfs files for the inode are * released. */ - flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR; + flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) goto out; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index c095d3264259..4f4d0474bee9 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -546,11 +546,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - if (check_ruid && !uid_eq(path.dentry->d_inode->i_uid, current_uid())) { + if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " "requested user (uid: %d)\n", - i_uid_read(path.dentry->d_inode), + i_uid_read(d_inode(path.dentry)), from_kuid(&init_user_ns, current_uid())); goto out_free; } @@ -584,7 +584,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - inode = ecryptfs_get_inode(path.dentry->d_inode, s); + inode = ecryptfs_get_inode(d_inode(path.dentry), s); rc = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 4626976794e7..cf208522998e 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -420,7 +420,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) void *xattr_virt; struct dentry *lower_dentry = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry; - struct inode *lower_inode = lower_dentry->d_inode; + struct inode *lower_inode = d_inode(lower_dentry); int rc; if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 07ab49745e31..3381b9da9ee6 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -145,12 +145,12 @@ out: static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) { - struct efivar_entry *var = dentry->d_inode->i_private; + struct efivar_entry *var = d_inode(dentry)->i_private; if (efivar_entry_delete(var)) return -EINVAL; - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); dput(dentry); return 0; }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index ddbce42548c9..86a2121828c3 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -121,7 +121,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, int len, i; int err = -ENOMEM; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return err; @@ -144,7 +144,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; - inode = efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0); + inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0); if (!inode) goto fail_name; diff --git a/fs/efs/namei.c b/fs/efs/namei.c index bbee8f063dfa..40ba9cc41bf7 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -111,9 +111,9 @@ struct dentry *efs_get_parent(struct dentry *child) struct dentry *parent = ERR_PTR(-ENOENT); efs_ino_t ino; - ino = efs_find_entry(child->d_inode, "..", 2); + ino = efs_find_entry(d_inode(child), "..", 2); if (ino) - parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino)); + parent = d_obtain_alias(efs_iget(d_inode(child)->i_sb, ino)); return parent; } diff --git a/fs/exec.c b/fs/exec.c index 02bfd980a40c..1977c2a553ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -659,6 +659,9 @@ int setup_arg_pages(struct linux_binprm *bprm, if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; + /* Add space for stack randomization. */ + stack_base += (STACK_RND_MASK << PAGE_SHIFT); + /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; @@ -1275,6 +1278,53 @@ static void check_unsafe_exec(struct linux_binprm *bprm) spin_unlock(&p->fs->lock); } +static void bprm_fill_uid(struct linux_binprm *bprm) +{ + struct inode *inode; + unsigned int mode; + kuid_t uid; + kgid_t gid; + + /* clear any previous set[ug]id data from a previous binary */ + bprm->cred->euid = current_euid(); + bprm->cred->egid = current_egid(); + + if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) + return; + + if (task_no_new_privs(current)) + return; + + inode = file_inode(bprm->file); + mode = READ_ONCE(inode->i_mode); + if (!(mode & (S_ISUID|S_ISGID))) + return; + + /* Be careful if suid/sgid is set */ + mutex_lock(&inode->i_mutex); + + /* reload atomically mode/uid/gid now that lock held */ + mode = inode->i_mode; + uid = inode->i_uid; + gid = inode->i_gid; + mutex_unlock(&inode->i_mutex); + + /* We ignore suid/sgid if there are no mappings for them in the ns */ + if (!kuid_has_mapping(bprm->cred->user_ns, uid) || + !kgid_has_mapping(bprm->cred->user_ns, gid)) + return; + + if (mode & S_ISUID) { + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->euid = uid; + } + + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->egid = gid; + } +} + /* * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes @@ -1283,36 +1333,9 @@ static void check_unsafe_exec(struct linux_binprm *bprm) */ int prepare_binprm(struct linux_binprm *bprm) { - struct inode *inode = file_inode(bprm->file); - umode_t mode = inode->i_mode; int retval; - - /* clear any previous set[ug]id data from a previous binary */ - bprm->cred->euid = current_euid(); - bprm->cred->egid = current_egid(); - - if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !task_no_new_privs(current) && - kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && - kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { - /* Set-uid? */ - if (mode & S_ISUID) { - bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->euid = inode->i_uid; - } - - /* Set-gid? */ - /* - * If setgid is set but no group execute bit then this - * is a candidate for mandatory locking, not a setgid - * executable. - */ - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->egid = inode->i_gid; - } - } + bprm_fill_uid(bprm); /* fill in binprm security blob */ retval = security_bprm_set_creds(bprm); diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index d7defd557601..4deb0b05b011 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -379,7 +379,7 @@ ino_t exofs_parent_ino(struct dentry *child) struct exofs_dir_entry *de; ino_t ino; - de = exofs_dotdot(child->d_inode, &page); + de = exofs_dotdot(d_inode(child), &page); if (!de) return 0; @@ -429,7 +429,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, int exofs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = exofs_chunk_size(dir); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 35073aaec6e0..786e4cc8c889 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1028,7 +1028,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) */ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; /* if we are about to modify an object, and it hasn't been diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 28907460e8fa..5ae25e431191 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -141,7 +141,7 @@ out_fail: static int exofs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); @@ -191,7 +191,7 @@ out_dir: static int exofs_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct exofs_dir_entry *de; struct page *page; int err = -ENOENT; @@ -213,7 +213,7 @@ out: static int exofs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = -ENOTEMPTY; if (exofs_empty_dir(inode)) { @@ -230,8 +230,8 @@ static int exofs_rmdir(struct inode *dir, struct dentry *dentry) static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct exofs_dir_entry *dir_de = NULL; struct page *old_page; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index fcc2e565f540..b795c567b5e1 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -958,7 +958,7 @@ static struct dentry *exofs_get_parent(struct dentry *child) if (!ino) return ERR_PTR(-ESTALE); - return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(exofs_iget(d_inode(child)->i_sb, ino)); } static struct inode *exofs_nfs_get_inode(struct super_block *sb, diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c index 832e2624b80b..6f6f3a4c1365 100644 --- a/fs/exofs/symlink.c +++ b/fs/exofs/symlink.c @@ -37,7 +37,7 @@ static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct exofs_i_info *oi = exofs_i(dentry->d_inode); + struct exofs_i_info *oi = exofs_i(d_inode(dentry)); nd_set_link(nd, (char *)oi->i_data); return NULL; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 6e1d4ab09d72..796b491e6978 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -486,7 +486,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, */ int ext2_add_link (struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = ext2_chunk_size(dir); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 6c14bb8322fa..5c04a0ddea80 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -278,7 +278,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) avefreeb = free_blocks / ngroups; ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || + if ((parent == d_inode(sb->s_root)) || (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) { struct ext2_group_desc *best_desc = NULL; int best_ndir = inodes_per_group; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 5d9213963fae..f460ae36d5b7 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1544,7 +1544,7 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) int ext2_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, iattr); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index ce422931f411..3e074a9ccbe6 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -79,10 +79,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, uns struct dentry *ext2_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot); + unsigned long ino = ext2_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext2_iget(d_inode(child)->i_sb, ino)); } /* @@ -208,7 +208,7 @@ out_fail: static int ext2_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int err; dquot_initialize(dir); @@ -275,7 +275,7 @@ out_dir: static int ext2_unlink(struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct ext2_dir_entry_2 * de; struct page * page; int err = -ENOENT; @@ -299,7 +299,7 @@ out: static int ext2_rmdir (struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); int err = -ENOTEMPTY; if (ext2_empty_dir(inode)) { @@ -316,8 +316,8 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry) static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry ) { - struct inode * old_inode = old_dentry->d_inode; - struct inode * new_inode = new_dentry->d_inode; + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct ext2_dir_entry_2 * dir_de = NULL; struct page * old_page; diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 565cf817bbf1..20608f17c2e5 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -23,7 +23,7 @@ static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ext2_inode_info *ei = EXT2_I(dentry->d_inode); + struct ext2_inode_info *ei = EXT2_I(d_inode(dentry)); nd_set_link(nd, (char *)ei->i_data); return NULL; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 91426141c33a..0b6bfd3a398b 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -243,7 +243,7 @@ cleanup: static int ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; struct ext2_xattr_entry *entry; char *end; @@ -319,7 +319,7 @@ cleanup: /* * Inode operation listxattr() * - * dentry->d_inode->i_mutex: don't care + * d_inode(dentry)->i_mutex: don't care */ ssize_t ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index c0ebc4db8849..702fc6840246 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -28,7 +28,7 @@ ext2_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -38,7 +38,7 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 7e192574c001..42b6e9874bcc 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -32,7 +32,7 @@ ext2_xattr_trusted_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -42,7 +42,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index f470e44c4b8d..ecdc4605192c 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -36,7 +36,7 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER, + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER, name, buffer, size); } @@ -49,7 +49,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name, if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER, + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index a1b810230cc5..3ad242e5840e 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -210,7 +210,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) avefreeb = freeb / ngroups; ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || + if ((parent == d_inode(sb->s_root)) || (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) { int best_ndir = inodes_per_group; int best_group = -1; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 13c0868c7160..2ee2dc4351d1 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3240,7 +3240,7 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) */ int ext3_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error, rc = 0; const unsigned int ia_valid = attr->ia_valid; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index f197736dccfa..4264b9bd0002 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1049,19 +1049,19 @@ struct dentry *ext3_get_parent(struct dentry *child) struct ext3_dir_entry_2 * de; struct buffer_head *bh; - bh = ext3_find_entry(child->d_inode, &dotdot, &de); + bh = ext3_find_entry(d_inode(child), &dotdot, &de); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); - if (!ext3_valid_inum(child->d_inode->i_sb, ino)) { - ext3_error(child->d_inode->i_sb, "ext3_get_parent", + if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) { + ext3_error(d_inode(child)->i_sb, "ext3_get_parent", "bad inode number: %lu", ino); return ERR_PTR(-EIO); } - return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino)); } #define S_SHIFT 12 @@ -1243,7 +1243,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext3_dir_entry_2 *de, struct buffer_head * bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned long offset = 0; @@ -1330,7 +1330,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct buffer_head *bh2; @@ -1435,7 +1435,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, static int ext3_add_entry (handle_t *handle, struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct buffer_head * bh; struct ext3_dir_entry_2 *de; struct super_block * sb; @@ -1489,7 +1489,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct dx_entry *entries, *at; struct dx_hash_info hinfo; struct buffer_head * bh; - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct super_block * sb = dir->i_sb; struct ext3_dir_entry_2 *de; int err; @@ -2111,7 +2111,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2125,7 +2125,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -2173,7 +2173,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2187,7 +2187,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) if (!bh) goto end_unlink; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -2328,7 +2328,7 @@ static int ext3_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int err, retries = 0; if (inode->i_nlink >= EXT3_LINK_MAX) @@ -2391,8 +2391,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, /* Initialize quotas before so that eventual writes go * in separate transaction */ - if (new_dentry->d_inode) - dquot_initialize(new_dentry->d_inode); + if (d_really_is_positive(new_dentry)) + dquot_initialize(d_inode(new_dentry)); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); @@ -2409,12 +2409,12 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ - old_inode = old_dentry->d_inode; + old_inode = d_inode(old_dentry); retval = -ENOENT; if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) goto end_rename; - new_inode = new_dentry->d_inode; + new_inode = d_inode(new_dentry); new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh) { if (!new_inode) { diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f037b4b27300..a9312f0a54e5 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1170,7 +1170,7 @@ static int parse_options (char *options, struct super_block *sb, return 0; } - journal_inode = path.dentry->d_inode; + journal_inode = d_inode(path.dentry); if (!S_ISBLK(journal_inode->i_mode)) { ext3_msg(sb, KERN_ERR, "error: journal path %s " "is not a block device", journal_path); @@ -2947,7 +2947,7 @@ static int ext3_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext3_journal_start(sb->s_root->d_inode, 2); + handle = ext3_journal_start(d_inode(sb->s_root), 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -2994,7 +2994,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext3_should_journal_data(path->dentry->d_inode)) { + if (ext3_should_journal_data(d_inode(path->dentry))) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index 6b01c3eab1f3..ea96df3c58db 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -23,7 +23,7 @@ static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); + struct ext3_inode_info *ei = EXT3_I(d_inode(dentry)); nd_set_link(nd, (char*)ei->i_data); return NULL; } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 24215dc09a18..7cf36501ccf4 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -137,7 +137,7 @@ ext3_xattr_handler(int name_index) /* * Inode operation listxattr() * - * dentry->d_inode->i_mutex: don't care + * d_inode(dentry)->i_mutex: don't care */ ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -355,7 +355,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, static int ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; @@ -391,7 +391,7 @@ cleanup: static int ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ext3_xattr_ibody_header *header; struct ext3_inode *raw_inode; struct ext3_iloc iloc; @@ -432,7 +432,7 @@ ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT3_I(dentry->d_inode)->xattr_sem); + down_read(&EXT3_I(d_inode(dentry))->xattr_sem); i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; @@ -445,7 +445,7 @@ ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (b_error < 0) i_error = 0; } - up_read(&EXT3_I(dentry->d_inode)->xattr_sem); + up_read(&EXT3_I(d_inode(dentry))->xattr_sem); return i_error + b_error; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 722c2bf9645d..c9506d5e3b13 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -29,7 +29,7 @@ ext3_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -39,7 +39,7 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index d75727cc67fa..206cc66dc285 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -32,7 +32,7 @@ ext3_xattr_trusted_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -42,7 +42,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name, + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 5612af3567e0..021508ad1616 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -34,7 +34,7 @@ ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER, + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER, name, buffer, size); } @@ -46,7 +46,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER, + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index efea5d5c44ce..024f2284d3f6 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -64,6 +64,28 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. +config EXT4_ENCRYPTION + tristate "Ext4 Encryption" + depends on EXT4_FS + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of ext4 files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. + +config EXT4_FS_ENCRYPTION + bool + default y + depends on EXT4_ENCRYPTION + config EXT4_DEBUG bool "EXT4 debugging support" depends on EXT4_FS diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 0310fec2ee3d..75285ea9aa05 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -8,7 +8,9 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o + xattr_trusted.o inline.o readpage.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \ + crypto_key.o crypto_fname.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index d40c8dbbb0d6..69b1e73026a5 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -4,11 +4,6 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/capability.h> -#include <linux/fs.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 83a6f497c4e0..955bf49a7945 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -14,7 +14,6 @@ #include <linux/time.h> #include <linux/capability.h> #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" @@ -641,8 +640,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, * fail EDQUOT for metdata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index b610779a958c..4a606afb171f 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -8,7 +8,6 @@ */ #include <linux/buffer_head.h> -#include <linux/jbd2.h> #include "ext4.h" unsigned int ext4_count_free(char *bitmap, unsigned int numchars) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 41eb9dcfac7e..3522340c7a99 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -16,7 +16,6 @@ #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/blkdev.h> -#include <linux/mutex.h> #include <linux/slab.h> #include "ext4.h" diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c new file mode 100644 index 000000000000..8ff15273ab0c --- /dev/null +++ b/fs/ext4/crypto.c @@ -0,0 +1,558 @@ +/* + * linux/fs/ext4/crypto.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption functions for ext4 + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <keys/user-type.h> +#include <keys/encrypted-type.h> +#include <linux/crypto.h> +#include <linux/ecryptfs.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/key.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <linux/spinlock_types.h> + +#include "ext4_extents.h" +#include "xattr.h" + +/* Encryption added and removed here! (L: */ + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *ext4_bounce_page_pool; + +static LIST_HEAD(ext4_free_crypto_ctxs); +static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); + +/** + * ext4_release_crypto_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. + */ +void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) +{ + unsigned long flags; + + if (ctx->bounce_page) { + if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) + __free_page(ctx->bounce_page); + else + mempool_free(ctx->bounce_page, ext4_bounce_page_pool); + ctx->bounce_page = NULL; + } + ctx->control_page = NULL; + if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { + if (ctx->tfm) + crypto_free_tfm(ctx->tfm); + kfree(ctx); + } else { + spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); + list_add(&ctx->free_list, &ext4_free_crypto_ctxs); + spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); + } +} + +/** + * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context + * @mask: The allocation mask. + * + * Return: An allocated and initialized encryption context on success. An error + * value or NULL otherwise. + */ +static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask) +{ + struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx), + mask); + + if (!ctx) + return ERR_PTR(-ENOMEM); + return ctx; +} + +/** + * ext4_get_crypto_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) +{ + struct ext4_crypto_ctx *ctx = NULL; + int res = 0; + unsigned long flags; + struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key; + + if (!ext4_read_workqueue) + ext4_init_crypto(); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. + */ + spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); + ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs, + struct ext4_crypto_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); + if (!ctx) { + ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + + /* Allocate a new Crypto API context if we don't already have + * one or if it isn't the right mode. */ + BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID); + if (ctx->tfm && (ctx->mode != key->mode)) { + crypto_free_tfm(ctx->tfm); + ctx->tfm = NULL; + ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; + } + if (!ctx->tfm) { + switch (key->mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + ctx->tfm = crypto_ablkcipher_tfm( + crypto_alloc_ablkcipher("xts(aes)", 0, 0)); + break; + case EXT4_ENCRYPTION_MODE_AES_256_GCM: + /* TODO(mhalcrow): AEAD w/ gcm(aes); + * crypto_aead_setauthsize() */ + ctx->tfm = ERR_PTR(-ENOTSUPP); + break; + default: + BUG(); + } + if (IS_ERR_OR_NULL(ctx->tfm)) { + res = PTR_ERR(ctx->tfm); + ctx->tfm = NULL; + goto out; + } + ctx->mode = key->mode; + } + BUG_ON(key->size != ext4_encryption_key_size(key->mode)); + + /* There shouldn't be a bounce page attached to the crypto + * context at this point. */ + BUG_ON(ctx->bounce_page); + +out: + if (res) { + if (!IS_ERR_OR_NULL(ctx)) + ext4_release_crypto_ctx(ctx); + ctx = ERR_PTR(res); + } + return ctx; +} + +struct workqueue_struct *ext4_read_workqueue; +static DEFINE_MUTEX(crypto_init); + +/** + * ext4_exit_crypto() - Shutdown the ext4 encryption system + */ +void ext4_exit_crypto(void) +{ + struct ext4_crypto_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) { + if (pos->bounce_page) { + if (pos->flags & + EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) { + __free_page(pos->bounce_page); + } else { + mempool_free(pos->bounce_page, + ext4_bounce_page_pool); + } + } + if (pos->tfm) + crypto_free_tfm(pos->tfm); + kfree(pos); + } + INIT_LIST_HEAD(&ext4_free_crypto_ctxs); + if (ext4_bounce_page_pool) + mempool_destroy(ext4_bounce_page_pool); + ext4_bounce_page_pool = NULL; + if (ext4_read_workqueue) + destroy_workqueue(ext4_read_workqueue); + ext4_read_workqueue = NULL; +} + +/** + * ext4_init_crypto() - Set up for ext4 encryption. + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int ext4_init_crypto(void) +{ + int i, res; + + mutex_lock(&crypto_init); + if (ext4_read_workqueue) + goto already_initialized; + ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); + if (!ext4_read_workqueue) { + res = -ENOMEM; + goto fail; + } + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct ext4_crypto_ctx *ctx; + + ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto fail; + } + list_add(&ctx->free_list, &ext4_free_crypto_ctxs); + } + + ext4_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!ext4_bounce_page_pool) { + res = -ENOMEM; + goto fail; + } +already_initialized: + mutex_unlock(&crypto_init); + return 0; +fail: + ext4_exit_crypto(); + mutex_unlock(&crypto_init); + return res; +} + +void ext4_restore_control_page(struct page *data_page) +{ + struct ext4_crypto_ctx *ctx = + (struct ext4_crypto_ctx *)page_private(data_page); + + set_page_private(data_page, (unsigned long)NULL); + ClearPagePrivate(data_page); + unlock_page(data_page); + ext4_release_crypto_ctx(ctx); +} + +/** + * ext4_crypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void ext4_crypt_complete(struct crypto_async_request *req, int res) +{ + struct ext4_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + EXT4_DECRYPT = 0, + EXT4_ENCRYPT, +} ext4_direction_t; + +static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, + struct inode *inode, + ext4_direction_t rw, + pgoff_t index, + struct page *src_page, + struct page *dest_page) + +{ + u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct ext4_inode_info *ei = EXT4_I(inode); + struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm); + int res = 0; + + BUG_ON(!ctx->tfm); + BUG_ON(ctx->mode != ei->i_encryption_key.mode); + + if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { + printk_ratelimited(KERN_ERR + "%s: unsupported crypto algorithm: %d\n", + __func__, ctx->mode); + return -ENOTSUPP; + } + + crypto_ablkcipher_clear_flags(atfm, ~0); + crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); + + res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw, + ei->i_encryption_key.size); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_ablkcipher_setkey() failed\n", + __func__); + return res; + } + req = ablkcipher_request_alloc(atfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_crypt_complete, &ecr); + + BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + EXT4_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); + ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + xts_tweak); + if (rw == EXT4_DECRYPT) + res = crypto_ablkcipher_decrypt(req); + else + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + ablkcipher_request_free(req); + if (res) { + printk_ratelimited( + KERN_ERR + "%s: crypto_ablkcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +/** + * ext4_encrypt() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * ext4_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. + */ +struct page *ext4_encrypt(struct inode *inode, + struct page *plaintext_page) +{ + struct ext4_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return (struct page *) ctx; + + /* The encryption operation will require a bounce page. */ + ciphertext_page = alloc_page(GFP_NOFS); + if (!ciphertext_page) { + /* This is a potential bottleneck, but at least we'll have + * forward progress. */ + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS); + if (WARN_ON_ONCE(!ciphertext_page)) { + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS | __GFP_WAIT); + } + ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->bounce_page = ciphertext_page; + ctx->control_page = plaintext_page; + err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page); + if (err) { + ext4_release_crypto_ctx(ctx); + return ERR_PTR(err); + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; +} + +/** + * ext4_decrypt() - Decrypts a page in-place + * @ctx: The encryption context. + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return ext4_page_crypto(ctx, page->mapping->host, + EXT4_DECRYPT, page->index, page, page); +} + +/* + * Convenience function which takes care of allocating and + * deallocating the encryption context + */ +int ext4_decrypt_one(struct inode *inode, struct page *page) +{ + int ret; + + struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); + + if (!ctx) + return -ENOMEM; + ret = ext4_decrypt(ctx, page); + ext4_release_crypto_ctx(ctx); + return ret; +} + +int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + struct ext4_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + ext4_lblk_t lblk = ex->ee_block; + ext4_fsblk_t pblk = ext4_ext_pblock(ex); + unsigned int len = ext4_ext_get_actual_len(ex); + int err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); + + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = alloc_page(GFP_NOFS); + if (!ciphertext_page) { + /* This is a potential bottleneck, but at least we'll have + * forward progress. */ + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS); + if (WARN_ON_ONCE(!ciphertext_page)) { + ciphertext_page = mempool_alloc(ext4_bounce_page_pool, + GFP_NOFS | __GFP_WAIT); + } + ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->bounce_page = ciphertext_page; + + while (len--) { + err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page); + if (err) + goto errout; + + bio = bio_alloc(GFP_KERNEL, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = pblk; + err = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (err) { + bio_put(bio); + goto errout; + } + err = submit_bio_wait(WRITE, bio); + if (err) + goto errout; + } + err = 0; +errout: + ext4_release_crypto_ctx(ctx); + return err; +} + +bool ext4_valid_contents_enc_mode(uint32_t mode) +{ + return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS); +} + +/** + * ext4_validate_encryption_key_size() - Validate the encryption key size + * @mode: The key mode. + * @size: The key size to validate. + * + * Return: The validated key size for @mode. Zero if invalid. + */ +uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) +{ + if (size == ext4_encryption_key_size(mode)) + return size; + return 0; +} diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c new file mode 100644 index 000000000000..fded02f72299 --- /dev/null +++ b/fs/ext4/crypto_fname.c @@ -0,0 +1,719 @@ +/* + * linux/fs/ext4/crypto_fname.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains functions for filename crypto management in ext4 + * + * Written by Uday Savagaonkar, 2014. + * + * This has not yet undergone a rigorous security audit. + * + */ + +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <keys/encrypted-type.h> +#include <keys/user-type.h> +#include <linux/crypto.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/key.h> +#include <linux/key.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <linux/spinlock_types.h> + +#include "ext4.h" +#include "ext4_crypto.h" +#include "xattr.h" + +/** + * ext4_dir_crypt_complete() - + */ +static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res) +{ + struct ext4_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +bool ext4_valid_filenames_enc_mode(uint32_t mode) +{ + return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); +} + +/** + * ext4_fname_encrypt() - + * + * This function encrypts the input filename, and returns the length of the + * ciphertext. Errors are returned as negative numbers. We trust the caller to + * allocate sufficient memory to oname string. + */ +static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct ext4_str *oname) +{ + u32 ciphertext_len; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct crypto_ablkcipher *tfm = ctx->ctfm; + int res = 0; + char iv[EXT4_CRYPTO_BLOCK_SIZE]; + struct scatterlist sg[1]; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + char *workbuf; + + if (iname->len <= 0 || iname->len > ctx->lim) + return -EIO; + + ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? + EXT4_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (ciphertext_len > ctx->lim) + ? ctx->lim : ciphertext_len; + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited( + KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_dir_crypt_complete, &ecr); + + /* Map the workpage */ + workbuf = kmap(ctx->workpage); + + /* Copy the input */ + memcpy(workbuf, iname->name, iname->len); + if (iname->len < ciphertext_len) + memset(workbuf + iname->len, 0, ciphertext_len - iname->len); + + /* Initialize IV */ + memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); + + /* Create encryption request */ + sg_init_table(sg, 1); + sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); + ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + if (res >= 0) { + /* Copy the result to output */ + memcpy(oname->name, workbuf, ciphertext_len); + res = ciphertext_len; + } + kunmap(ctx->workpage); + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited( + KERN_ERR "%s: Error (error code %d)\n", __func__, res); + } + oname->len = ciphertext_len; + return res; +} + +/* + * ext4_fname_decrypt() + * This function decrypts the input filename, and returns + * the length of the plaintext. + * Errors are returned as negative numbers. + * We trust the caller to allocate sufficient memory to oname string. + */ +static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, + const struct ext4_str *iname, + struct ext4_str *oname) +{ + struct ext4_str tmp_in[2], tmp_out[1]; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist sg[1]; + struct crypto_ablkcipher *tfm = ctx->ctfm; + int res = 0; + char iv[EXT4_CRYPTO_BLOCK_SIZE]; + char *workbuf; + + if (iname->len <= 0 || iname->len > ctx->lim) + return -EIO; + + tmp_in[0].name = iname->name; + tmp_in[0].len = iname->len; + tmp_out[0].name = oname->name; + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited( + KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_dir_crypt_complete, &ecr); + + /* Map the workpage */ + workbuf = kmap(ctx->workpage); + + /* Copy the input */ + memcpy(workbuf, iname->name, iname->len); + + /* Initialize IV */ + memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); + + /* Create encryption request */ + sg_init_table(sg, 1); + sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); + ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + res = crypto_ablkcipher_decrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + if (res >= 0) { + /* Copy the result to output */ + memcpy(oname->name, workbuf, iname->len); + res = iname->len; + } + kunmap(ctx->workpage); + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited( + KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n", + __func__, res); + return res; + } + + oname->len = strnlen(oname->name, iname->len); + return oname->len; +} + +static const char *lookup_table = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +/** + * ext4_fname_encode_digest() - + * + * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. + * The encoded string is roughly 4/3 times the size of the input string. + */ +static int digest_encode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + char *cp = dst; + + while (i < len) { + ac += (((unsigned char) src[i]) << bits); + bits += 8; + do { + *cp++ = lookup_table[ac & 0x3f]; + ac >>= 6; + bits -= 6; + } while (bits >= 6); + i++; + } + if (bits) + *cp++ = lookup_table[ac & 0x3f]; + return cp - dst; +} + +static int digest_decode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + const char *p; + char *cp = dst; + + while (i < len) { + p = strchr(lookup_table, src[i]); + if (p == NULL || src[i] == 0) + return -2; + ac += (p - lookup_table) << bits; + bits += 6; + if (bits >= 8) { + *cp++ = ac & 0xff; + ac >>= 8; + bits -= 8; + } + i++; + } + if (ac) + return -1; + return cp - dst; +} + +/** + * ext4_free_fname_crypto_ctx() - + * + * Frees up a crypto context. + */ +void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx) +{ + if (ctx == NULL || IS_ERR(ctx)) + return; + + if (ctx->ctfm && !IS_ERR(ctx->ctfm)) + crypto_free_ablkcipher(ctx->ctfm); + if (ctx->htfm && !IS_ERR(ctx->htfm)) + crypto_free_hash(ctx->htfm); + if (ctx->workpage && !IS_ERR(ctx->workpage)) + __free_page(ctx->workpage); + kfree(ctx); +} + +/** + * ext4_put_fname_crypto_ctx() - + * + * Return: The crypto context onto free list. If the free list is above a + * threshold, completely frees up the context, and returns the memory. + * + * TODO: Currently we directly free the crypto context. Eventually we should + * add code it to return to free list. Such an approach will increase + * efficiency of directory lookup. + */ +void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) +{ + if (*ctx == NULL || IS_ERR(*ctx)) + return; + ext4_free_fname_crypto_ctx(*ctx); + *ctx = NULL; +} + +/** + * ext4_search_fname_crypto_ctx() - + */ +static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx( + const struct ext4_encryption_key *key) +{ + return NULL; +} + +/** + * ext4_alloc_fname_crypto_ctx() - + */ +struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( + const struct ext4_encryption_key *key) +{ + struct ext4_fname_crypto_ctx *ctx; + + ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS); + if (ctx == NULL) + return ERR_PTR(-ENOMEM); + if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) { + /* This will automatically set key mode to invalid + * As enum for ENCRYPTION_MODE_INVALID is zero */ + memset(&ctx->key, 0, sizeof(ctx->key)); + } else { + memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key)); + } + ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode) + ? 0 : 1; + ctx->ctfm_key_is_ready = 0; + ctx->ctfm = NULL; + ctx->htfm = NULL; + ctx->workpage = NULL; + return ctx; +} + +/** + * ext4_get_fname_crypto_ctx() - + * + * Allocates a free crypto context and initializes it to hold + * the crypto material for the inode. + * + * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise. + */ +struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( + struct inode *inode, u32 max_ciphertext_len) +{ + struct ext4_fname_crypto_ctx *ctx; + struct ext4_inode_info *ei = EXT4_I(inode); + int res; + + /* Check if the crypto policy is set on the inode */ + res = ext4_encrypted_inode(inode); + if (res == 0) + return NULL; + + if (!ext4_has_encryption_key(inode)) + ext4_generate_encryption_key(inode); + + /* Get a crypto context based on the key. + * A new context is allocated if no context matches the requested key. + */ + ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key)); + if (ctx == NULL) + ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key)); + if (IS_ERR(ctx)) + return ctx; + + ctx->flags = ei->i_crypt_policy_flags; + if (ctx->has_valid_key) { + if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { + printk_once(KERN_WARNING + "ext4: unsupported key mode %d\n", + ctx->key.mode); + return ERR_PTR(-ENOKEY); + } + + /* As a first cut, we will allocate new tfm in every call. + * later, we will keep the tfm around, in case the key gets + * re-used */ + if (ctx->ctfm == NULL) { + ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", + 0, 0); + } + if (IS_ERR(ctx->ctfm)) { + res = PTR_ERR(ctx->ctfm); + printk( + KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", + __func__, res); + ctx->ctfm = NULL; + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(res); + } + if (ctx->ctfm == NULL) { + printk( + KERN_DEBUG "%s: could not allocate crypto tfm\n", + __func__); + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-ENOMEM); + } + if (ctx->workpage == NULL) + ctx->workpage = alloc_page(GFP_NOFS); + if (IS_ERR(ctx->workpage)) { + res = PTR_ERR(ctx->workpage); + printk( + KERN_DEBUG "%s: error (%d) allocating work page\n", + __func__, res); + ctx->workpage = NULL; + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(res); + } + if (ctx->workpage == NULL) { + printk( + KERN_DEBUG "%s: could not allocate work page\n", + __func__); + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-ENOMEM); + } + ctx->lim = max_ciphertext_len; + crypto_ablkcipher_clear_flags(ctx->ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + + /* If we are lucky, we will get a context that is already + * set up with the right key. Else, we will have to + * set the key */ + if (!ctx->ctfm_key_is_ready) { + /* Since our crypto objectives for filename encryption + * are pretty weak, + * we directly use the inode master key */ + res = crypto_ablkcipher_setkey(ctx->ctfm, + ctx->key.raw, ctx->key.size); + if (res) { + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-EIO); + } + ctx->ctfm_key_is_ready = 1; + } else { + /* In the current implementation, key should never be + * marked "ready" for a context that has just been + * allocated. So we should never reach here */ + BUG(); + } + } + if (ctx->htfm == NULL) + ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(ctx->htfm)) { + res = PTR_ERR(ctx->htfm); + printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n", + __func__, res); + ctx->htfm = NULL; + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(res); + } + if (ctx->htfm == NULL) { + printk(KERN_DEBUG "%s: could not allocate hash tfm\n", + __func__); + ext4_put_fname_crypto_ctx(&ctx); + return ERR_PTR(-ENOMEM); + } + + return ctx; +} + +/** + * ext4_fname_crypto_round_up() - + * + * Return: The next multiple of block size + */ +u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) +{ + return ((size+blksize-1)/blksize)*blksize; +} + +/** + * ext4_fname_crypto_namelen_on_disk() - + */ +int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, + u32 namelen) +{ + u32 ciphertext_len; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + + if (ctx == NULL) + return -EIO; + if (!(ctx->has_valid_key)) + return -EACCES; + ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? + EXT4_CRYPTO_BLOCK_SIZE : namelen; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (ciphertext_len > ctx->lim) + ? ctx->lim : ciphertext_len; + return (int) ciphertext_len; +} + +/** + * ext4_fname_crypto_alloc_obuff() - + * + * Allocates an output buffer that is sufficient for the crypto operation + * specified by the context and the direction. + */ +int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, + u32 ilen, struct ext4_str *crypto_str) +{ + unsigned int olen; + int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + + if (!ctx) + return -EIO; + if (padding < EXT4_CRYPTO_BLOCK_SIZE) + padding = EXT4_CRYPTO_BLOCK_SIZE; + olen = ext4_fname_crypto_round_up(ilen, padding); + crypto_str->len = olen; + if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) + olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; + /* Allocated buffer can hold one more character to null-terminate the + * string */ + crypto_str->name = kmalloc(olen+1, GFP_NOFS); + if (!(crypto_str->name)) + return -ENOMEM; + return 0; +} + +/** + * ext4_fname_crypto_free_buffer() - + * + * Frees the buffer allocated for crypto operation. + */ +void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) +{ + if (!crypto_str) + return; + kfree(crypto_str->name); + crypto_str->name = NULL; +} + +/** + * ext4_fname_disk_to_usr() - converts a filename from disk space to user space + */ +int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, + const struct ext4_str *iname, + struct ext4_str *oname) +{ + char buf[24]; + int ret; + + if (ctx == NULL) + return -EIO; + if (iname->len < 3) { + /*Check for . and .. */ + if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') { + oname->name[0] = '.'; + oname->name[iname->len-1] = '.'; + oname->len = iname->len; + return oname->len; + } + } + if (ctx->has_valid_key) + return ext4_fname_decrypt(ctx, iname, oname); + + if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { + ret = digest_encode(iname->name, iname->len, oname->name); + oname->len = ret; + return ret; + } + if (hinfo) { + memcpy(buf, &hinfo->hash, 4); + memcpy(buf+4, &hinfo->minor_hash, 4); + } else + memset(buf, 0, 8); + memcpy(buf + 8, iname->name + iname->len - 16, 16); + oname->name[0] = '_'; + ret = digest_encode(buf, 24, oname->name+1); + oname->len = ret + 1; + return ret + 1; +} + +int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, + const struct ext4_dir_entry_2 *de, + struct ext4_str *oname) +{ + struct ext4_str iname = {.name = (unsigned char *) de->name, + .len = de->name_len }; + + return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname); +} + + +/** + * ext4_fname_usr_to_disk() - converts a filename from user space to disk space + */ +int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct ext4_str *oname) +{ + int res; + + if (ctx == NULL) + return -EIO; + if (iname->len < 3) { + /*Check for . and .. */ + if (iname->name[0] == '.' && + iname->name[iname->len-1] == '.') { + oname->name[0] = '.'; + oname->name[iname->len-1] = '.'; + oname->len = iname->len; + return oname->len; + } + } + if (ctx->has_valid_key) { + res = ext4_fname_encrypt(ctx, iname, oname); + return res; + } + /* Without a proper key, a user is not allowed to modify the filenames + * in a directory. Consequently, a user space name cannot be mapped to + * a disk-space name */ + return -EACCES; +} + +/* + * Calculate the htree hash from a filename from user space + */ +int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct dx_hash_info *hinfo) +{ + struct ext4_str tmp; + int ret = 0; + char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1]; + + if (!ctx || + ((iname->name[0] == '.') && + ((iname->len == 1) || + ((iname->name[1] == '.') && (iname->len == 2))))) { + ext4fs_dirhash(iname->name, iname->len, hinfo); + return 0; + } + + if (!ctx->has_valid_key && iname->name[0] == '_') { + if (iname->len != 33) + return -ENOENT; + ret = digest_decode(iname->name+1, iname->len, buf); + if (ret != 24) + return -ENOENT; + memcpy(&hinfo->hash, buf, 4); + memcpy(&hinfo->minor_hash, buf + 4, 4); + return 0; + } + + if (!ctx->has_valid_key && iname->name[0] != '_') { + if (iname->len > 43) + return -ENOENT; + ret = digest_decode(iname->name, iname->len, buf); + ext4fs_dirhash(buf, ret, hinfo); + return 0; + } + + /* First encrypt the plaintext name */ + ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); + if (ret < 0) + return ret; + + ret = ext4_fname_encrypt(ctx, iname, &tmp); + if (ret >= 0) { + ext4fs_dirhash(tmp.name, tmp.len, hinfo); + ret = 0; + } + + ext4_fname_crypto_free_buffer(&tmp); + return ret; +} + +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de) +{ + int ret = -ENOENT; + int bigname = (*name == '_'); + + if (ctx->has_valid_key) { + if (cstr->name == NULL) { + struct qstr istr; + + ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr); + if (ret < 0) + goto errout; + istr.name = name; + istr.len = len; + ret = ext4_fname_encrypt(ctx, &istr, cstr); + if (ret < 0) + goto errout; + } + } else { + if (cstr->name == NULL) { + cstr->name = kmalloc(32, GFP_KERNEL); + if (cstr->name == NULL) + return -ENOMEM; + if ((bigname && (len != 33)) || + (!bigname && (len > 43))) + goto errout; + ret = digest_decode(name+bigname, len-bigname, + cstr->name); + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + cstr->len = ret; + } + if (bigname) { + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + cstr->name + 8, 16); + return (ret == 0) ? 1 : 0; + } + } + if (de->name_len != cstr->len) + return 0; + ret = memcmp(de->name, cstr->name, cstr->len); + return (ret == 0) ? 1 : 0; +errout: + kfree(cstr->name); + cstr->name = NULL; + return ret; +} diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c new file mode 100644 index 000000000000..52170d0b7c40 --- /dev/null +++ b/fs/ext4/crypto_key.c @@ -0,0 +1,166 @@ +/* + * linux/fs/ext4/crypto_key.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions for ext4 + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#include <keys/encrypted-type.h> +#include <keys/user-type.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <uapi/linux/keyctl.h> + +#include "ext4.h" +#include "xattr.h" + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct ext4_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * ext4_derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivatio. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], + char source_key[EXT4_AES_256_XTS_KEY_SIZE], + char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, + 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_ablkcipher_setkey(tfm, deriving_key, + EXT4_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, + EXT4_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + +out: + if (req) + ablkcipher_request_free(req); + if (tfm) + crypto_free_ablkcipher(tfm); + return res; +} + +/** + * ext4_generate_encryption_key() - generates an encryption key + * @inode: The inode to generate the encryption key for. + */ +int ext4_generate_encryption_key(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; + struct key *keyring_key = NULL; + struct ext4_encryption_key *master_key; + struct ext4_encryption_context ctx; + struct user_key_payload *ukp; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + + if (res != sizeof(ctx)) { + if (res > 0) + res = -EINVAL; + goto out; + } + res = 0; + + ei->i_crypt_policy_flags = ctx.flags; + if (S_ISREG(inode->i_mode)) + crypt_key->mode = ctx.contents_encryption_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + crypt_key->mode = ctx.filenames_encryption_mode; + else { + printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); + BUG(); + } + crypt_key->size = ext4_encryption_key_size(crypt_key->mode); + BUG_ON(!crypt_key->size); + if (DUMMY_ENCRYPTION_ENABLED(sbi)) { + memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + goto out; + } + memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, + EXT4_KEY_DESC_PREFIX_SIZE); + sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE, + "%*phN", EXT4_KEY_DESCRIPTOR_SIZE, + ctx.master_key_descriptor); + full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + + (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + if (IS_ERR(keyring_key)) { + res = PTR_ERR(keyring_key); + keyring_key = NULL; + goto out; + } + BUG_ON(keyring_key->type != &key_type_logon); + ukp = ((struct user_key_payload *)keyring_key->payload.data); + if (ukp->datalen != sizeof(struct ext4_encryption_key)) { + res = -EINVAL; + goto out; + } + master_key = (struct ext4_encryption_key *)ukp->data; + BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != + EXT4_KEY_DERIVATION_NONCE_SIZE); + BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); + res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw); +out: + if (keyring_key) + key_put(keyring_key); + if (res < 0) + crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID; + return res; +} + +int ext4_has_encryption_key(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + + return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID); +} diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c new file mode 100644 index 000000000000..a6d6291aea16 --- /dev/null +++ b/fs/ext4/crypto_policy.c @@ -0,0 +1,198 @@ +/* + * linux/fs/ext4/crypto_policy.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption policy functions for ext4 + * + * Written by Michael Halcrow, 2015. + */ + +#include <linux/random.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "ext4.h" +#include "xattr.h" + +static int ext4_inode_has_encryption_context(struct inode *inode) +{ + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0); + return (res > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int ext4_is_encryption_context_consistent_with_policy( + struct inode *inode, const struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx)); + if (res != sizeof(ctx)) + return 0; + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == + policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int ext4_create_encryption_context_from_policy( + struct inode *inode, const struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + int res = 0; + + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE); + if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return -EINVAL; + } + if (policy->flags & ~EXT4_POLICY_FLAGS_VALID) + return -EINVAL; + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + + res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), 0); + if (!res) + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + return res; +} + +int ext4_process_policy(const struct ext4_encryption_policy *policy, + struct inode *inode) +{ + if (policy->version != 0) + return -EINVAL; + + if (!ext4_inode_has_encryption_context(inode)) { + if (!ext4_empty_dir(inode)) + return -ENOTEMPTY; + return ext4_create_encryption_context_from_policy(inode, + policy); + } + + if (ext4_is_encryption_context_consistent_with_policy(inode, policy)) + return 0; + + printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", + __func__); + return -EINVAL; +} + +int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return -ENOENT; + if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE); + return 0; +} + +int ext4_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child) +{ + struct ext4_encryption_context parent_ctx, child_ctx; + int res; + + if ((parent == NULL) || (child == NULL)) { + pr_err("parent %p child %p\n", parent, child); + BUG_ON(1); + } + /* no restrictions if the parent directory is not encrypted */ + if (!ext4_encrypted_inode(parent)) + return 1; + res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) + return 0; + /* if the child directory is not encrypted, this is always a problem */ + if (!ext4_encrypted_inode(child)) + return 0; + res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + return (memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode)); +} + +/** + * ext4_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * + * Return: Zero on success, non-zero otherwise + */ +int ext4_inherit_context(struct inode *parent, struct inode *child) +{ + struct ext4_encryption_context ctx; + int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + + if (res != sizeof(ctx)) { + if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + ctx.contents_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, + EXT4_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + goto out; + } + } + get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), 0); +out: + if (!res) + ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); + return res; +} diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index c24143ea9c08..5665d82d2332 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -22,10 +22,8 @@ */ #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/buffer_head.h> #include <linux/slab.h> -#include <linux/rbtree.h> #include "ext4.h" #include "xattr.h" @@ -110,7 +108,10 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int err; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL; int dir_has_error = 0; + struct ext4_fname_crypto_ctx *enc_ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); @@ -127,17 +128,28 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) if (ext4_has_inline_data(inode)) { int has_inline_data = 1; - int ret = ext4_read_inline_dir(file, ctx, + err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) - return ret; + return err; + } + + enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN); + if (IS_ERR(enc_ctx)) + return PTR_ERR(enc_ctx); + if (enc_ctx) { + err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (err < 0) { + ext4_put_fname_crypto_ctx(&enc_ctx); + return err; + } } offset = ctx->pos & (sb->s_blocksize - 1); while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; - struct buffer_head *bh = NULL; map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; @@ -180,6 +192,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); + bh = NULL; continue; } set_buffer_verified(bh); @@ -226,25 +239,44 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - if (!dir_emit(ctx, de->name, - de->name_len, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type))) { - brelse(bh); - return 0; + if (enc_ctx == NULL) { + /* Directory is not encrypted */ + if (!dir_emit(ctx, de->name, + de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; + } else { + /* Directory is encrypted */ + err = ext4_fname_disk_to_usr(enc_ctx, + NULL, de, &fname_crypto_str); + if (err < 0) + goto errout; + if (!dir_emit(ctx, + fname_crypto_str.name, err, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } - offset = 0; + if ((ctx->pos < inode->i_size) && !dir_relax(inode)) + goto done; brelse(bh); - if (ctx->pos < inode->i_size) { - if (!dir_relax(inode)) - return 0; - } + bh = NULL; + offset = 0; } - return 0; +done: + err = 0; +errout: +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ext4_put_fname_crypto_ctx(&enc_ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); +#endif + brelse(bh); + return err; } static inline int is_32bit_api(void) @@ -384,10 +416,15 @@ void ext4_htree_free_dir_info(struct dir_private_info *p) /* * Given a directory entry, enter it into the fname rb tree. + * + * When filename encryption is enabled, the dirent will hold the + * encrypted filename, while the htree will hold decrypted filename. + * The decrypted filename is passed in via ent_name. parameter. */ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, - struct ext4_dir_entry_2 *dirent) + struct ext4_dir_entry_2 *dirent, + struct ext4_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; @@ -398,17 +435,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, p = &info->root.rb_node; /* Create and allocate the fname structure */ - len = sizeof(struct fname) + dirent->name_len + 1; + len = sizeof(struct fname) + ent_name->len + 1; new_fn = kzalloc(len, GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); - new_fn->name_len = dirent->name_len; + new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; - memcpy(new_fn->name, dirent->name, dirent->name_len); - new_fn->name[dirent->name_len] = 0; + memcpy(new_fn->name, ent_name->name, ent_name->len); + new_fn->name[ent_name->len] = 0; while (*p) { parent = *p; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c8eb32eefc3c..9a83f149ac85 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -422,7 +422,7 @@ enum { EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ - EXT4_INODE_ENCRYPT = 11, /* Compression error */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ @@ -582,6 +582,15 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +/* Encryption algorithms */ +#define EXT4_ENCRYPTION_MODE_INVALID 0 +#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 +#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 +#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 +#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4 + +#include "ext4_crypto.h" + /* * ioctl commands */ @@ -603,6 +612,9 @@ enum { #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy) +#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -899,6 +911,7 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; + char i_crypt_policy_flags; /* Indicate the inline data space. */ u16 i_inline_off; @@ -939,6 +952,11 @@ struct ext4_inode_info { /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Encryption params */ + struct ext4_encryption_key i_encryption_key; +#endif }; /* @@ -1049,12 +1067,6 @@ extern void ext4_set_bits(void *bm, int cur, int len); /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 -/* Encryption algorithms */ -#define EXT4_ENCRYPTION_MODE_INVALID 0 -#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 -#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 -#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 - /* * Structure of the super block */ @@ -1142,7 +1154,8 @@ struct ext4_super_block { __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_checksum_type; /* metadata checksum algorithm used */ - __le16 s_reserved_pad; + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ __le32 s_snapshot_id; /* sequential ID of active snapshot */ @@ -1169,7 +1182,9 @@ struct ext4_super_block { __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ - __le32 s_reserved[105]; /* Padding to the end of the block */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_reserved[100]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1180,8 +1195,16 @@ struct ext4_super_block { /* * run-time mount flags */ -#define EXT4_MF_MNTDIR_SAMPLED 0x0001 -#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ + EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif /* Number of quota types we support */ #define EXT4_MAXQUOTAS 2 @@ -1351,6 +1374,12 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Encryption */ + uint32_t s_file_encryption_mode; + uint32_t s_dir_encryption_mode; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1466,6 +1495,18 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_SB(sb) (sb) #endif +/* + * Returns true if the inode is inode is encrypted + */ +static inline int ext4_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); +#else + return 0; +#endif +} + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* @@ -1575,8 +1616,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP | \ - EXT4_FEATURE_INCOMPAT_INLINE_DATA) + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -2001,6 +2043,102 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); +/* crypto_policy.c */ +int ext4_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child); +int ext4_inherit_context(struct inode *parent, struct inode *child); +void ext4_to_hex(char *dst, char *src, size_t src_size); +int ext4_process_policy(const struct ext4_encryption_policy *policy, + struct inode *inode); +int ext4_get_policy(struct inode *inode, + struct ext4_encryption_policy *policy); + +/* crypto.c */ +bool ext4_valid_contents_enc_mode(uint32_t mode); +uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); +extern struct workqueue_struct *ext4_read_workqueue; +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode); +void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); +void ext4_restore_control_page(struct page *data_page); +struct page *ext4_encrypt(struct inode *inode, + struct page *plaintext_page); +int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page); +int ext4_decrypt_one(struct inode *inode, struct page *page); +int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +int ext4_init_crypto(void); +void ext4_exit_crypto(void); +static inline int ext4_sb_has_crypto(struct super_block *sb) +{ + return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); +} +#else +static inline int ext4_init_crypto(void) { return 0; } +static inline void ext4_exit_crypto(void) { } +static inline int ext4_sb_has_crypto(struct super_block *sb) +{ + return 0; +} +#endif + +/* crypto_fname.c */ +bool ext4_valid_filenames_enc_mode(uint32_t mode); +u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); +int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, + u32 ilen, struct ext4_str *crypto_str); +int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, + const struct ext4_str *iname, + struct ext4_str *oname); +int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, + struct dx_hash_info *hinfo, + const struct ext4_dir_entry_2 *de, + struct ext4_str *oname); +int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct ext4_str *oname); +int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, + const struct qstr *iname, + struct dx_hash_info *hinfo); +int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, + u32 namelen); +int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, + int len, const char * const name, + struct ext4_dir_entry_2 *de); + + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); +struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, + u32 max_len); +void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); +#else +static inline +void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { } +static inline +struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, + u32 max_len) +{ + return NULL; +} +static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } +#endif + + +/* crypto_key.c */ +int ext4_generate_encryption_key(struct inode *inode); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +int ext4_has_encryption_key(struct inode *inode); +#else +static inline int ext4_has_encryption_key(struct inode *inode) +{ + return 0; +} +#endif + + /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, @@ -2011,17 +2149,20 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, - __u32 minor_hash, - struct ext4_dir_entry_2 *dirent); + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct ext4_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, const char *name, int namelen, struct ext4_dir_entry_2 **dest_de); -void ext4_insert_dentry(struct inode *inode, +int ext4_insert_dentry(struct inode *dir, + struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, + const struct qstr *iname, const char *name, int namelen); static inline void ext4_update_dx_flag(struct inode *inode) { @@ -2099,6 +2240,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ +int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, @@ -2189,6 +2331,7 @@ extern int ext4_generic_delete_entry(handle_t *handle, void *entry_buf, int buf_size, int csum_size); +extern int ext4_empty_dir(struct inode *inode); /* resize.c */ extern int ext4_group_add(struct super_block *sb, @@ -2698,6 +2841,10 @@ static inline void ext4_set_de_type(struct super_block *sb, de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } +/* readpages.c */ +extern int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; @@ -2742,7 +2889,6 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h new file mode 100644 index 000000000000..d75159c101ce --- /dev/null +++ b/fs/ext4/ext4_crypto.h @@ -0,0 +1,156 @@ +/* + * linux/fs/ext4/ext4_crypto.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption header content for ext4 + * + * Written by Michael Halcrow, 2015. + */ + +#ifndef _EXT4_CRYPTO_H +#define _EXT4_CRYPTO_H + +#include <linux/fs.h> + +#define EXT4_KEY_DESCRIPTOR_SIZE 8 + +/* Policy provided via an ioctl on the topmost directory */ +struct ext4_encryption_policy { + char version; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; +} __attribute__((__packed__)); + +#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1 +#define EXT4_KEY_DERIVATION_NONCE_SIZE 16 + +#define EXT4_POLICY_FLAGS_PAD_4 0x00 +#define EXT4_POLICY_FLAGS_PAD_8 0x01 +#define EXT4_POLICY_FLAGS_PAD_16 0x02 +#define EXT4_POLICY_FLAGS_PAD_32 0x03 +#define EXT4_POLICY_FLAGS_PAD_MASK 0x03 +#define EXT4_POLICY_FLAGS_VALID 0x03 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Reserved + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct ext4_encryption_context { + char format; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; + char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; +} __attribute__((__packed__)); + +/* Encryption parameters */ +#define EXT4_XTS_TWEAK_SIZE 16 +#define EXT4_AES_128_ECB_KEY_SIZE 16 +#define EXT4_AES_256_GCM_KEY_SIZE 32 +#define EXT4_AES_256_CBC_KEY_SIZE 32 +#define EXT4_AES_256_CTS_KEY_SIZE 32 +#define EXT4_AES_256_XTS_KEY_SIZE 64 +#define EXT4_MAX_KEY_SIZE 64 + +#define EXT4_KEY_DESC_PREFIX "ext4:" +#define EXT4_KEY_DESC_PREFIX_SIZE 5 + +struct ext4_encryption_key { + uint32_t mode; + char raw[EXT4_MAX_KEY_SIZE]; + uint32_t size; +}; + +#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002 + +struct ext4_crypto_ctx { + struct crypto_tfm *tfm; /* Crypto API context */ + struct page *bounce_page; /* Ciphertext page on write path */ + struct page *control_page; /* Original page on write path */ + struct bio *bio; /* The bio for this context */ + struct work_struct work; /* Work queue for read complete path */ + struct list_head free_list; /* Free list */ + int flags; /* Flags */ + int mode; /* Encryption mode for tfm */ +}; + +struct ext4_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \ + struct ext4_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int ext4_encryption_key_size(int mode) +{ + switch (mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + return EXT4_AES_256_XTS_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_GCM: + return EXT4_AES_256_GCM_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_CBC: + return EXT4_AES_256_CBC_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_CTS: + return EXT4_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4 +#define EXT4_CRYPTO_BLOCK_SIZE 16 +#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32 + +struct ext4_str { + unsigned char *name; + u32 len; +}; + +struct ext4_fname_crypto_ctx { + u32 lim; + char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE]; + struct crypto_ablkcipher *ctfm; + struct crypto_hash *htfm; + struct page *workpage; + struct ext4_encryption_key key; + unsigned flags : 8; + unsigned has_valid_key : 1; + unsigned ctfm_key_is_ready : 1; +}; + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct ext4_encrypted_symlink_data { + __le16 len; + char encrypted_path[1]; +} __attribute__((__packed__)); + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. + */ +static inline u32 encrypted_symlink_data_len(u32 l) +{ + if (l < EXT4_CRYPTO_BLOCK_SIZE) + l = EXT4_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct ext4_encrypted_symlink_data) - 1); +} + +#endif /* _EXT4_CRYPTO_H */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 3445035c7e01..d41843181818 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -87,6 +87,12 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) ext4_put_nojournal(handle); return 0; } + + if (!handle->h_transaction) { + err = jbd2_journal_stop(handle); + return handle->h_err ? handle->h_err : err; + } + sb = handle->h_transaction->t_journal->j_private; err = handle->h_err; rc = jbd2_journal_stop(handle); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bed43081720f..e003a1e81dc3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -377,7 +377,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); ext4_lblk_t last = lblock + len - 1; - if (lblock > last) + if (len == 0 || lblock > last) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -1717,12 +1717,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, { unsigned short ext1_ee_len, ext2_ee_len; - /* - * Make sure that both extents are initialized. We don't merge - * unwritten extents so that we can be sure that end_io code has - * the extent that was written properly split out and conversion to - * initialized is trivial. - */ if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; @@ -3128,6 +3122,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, ex); + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); if (ret > 0) ret = 0; @@ -4535,19 +4532,7 @@ got_allocated_blocks: */ reserved_clusters = get_reserved_cluster_alloc(inode, map->m_lblk, allocated); - if (map_from_cluster) { - if (reserved_clusters) { - /* - * We have clusters reserved for this range. - * But since we are not doing actual allocation - * and are simply using blocks from previously - * allocated cluster, we should release the - * reservation and not claim quota. - */ - ext4_da_update_reserve_space(inode, - reserved_clusters, 0); - } - } else { + if (!map_from_cluster) { BUG_ON(allocated_clusters < reserved_clusters); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); @@ -4803,12 +4788,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | - EXT4_EX_NOCACHE; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - mutex_lock(&inode->i_mutex); /* @@ -4825,15 +4804,28 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = inode_newsize_ok(inode, new_size); if (ret) goto out_mutex; - /* - * If we have a partial block after EOF we have to allocate - * the entire block. - */ - if (partial_end) - max_blocks += 1; } + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + /* Preallocate the range including the unaligned edges */ + if (partial_begin || partial_end) { + ret = ext4_alloc_file_blocks(file, + round_down(offset, 1 << blkbits) >> blkbits, + (round_up((offset + len), 1 << blkbits) - + round_down(offset, 1 << blkbits)) >> blkbits, + new_size, flags, mode); + if (ret) + goto out_mutex; + + } + + /* Zero range excluding the unaligned edges */ if (max_blocks > 0) { + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); /* Now release the pages and zero block aligned part of pages*/ truncate_pagecache_range(inode, start, end - 1); @@ -4847,19 +4839,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags, mode); if (ret) goto out_dio; - /* - * Remove entire range from the extent status tree. - * - * ext4_es_remove_extent(inode, lblk, max_blocks) is - * NOT sufficient. I'm not sure why this is the case, - * but let's be conservative and remove the extent - * status tree for the entire inode. There should be - * no outstanding delalloc extents thanks to the - * filemap_write_and_wait_range() call above. - */ - ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); - if (ret) - goto out_dio; } if (!partial_begin && !partial_end) goto out_dio; @@ -4922,6 +4901,20 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; + /* + * Encrypted inodes can't handle collapse range or insert + * range since we would need to re-encrypt blocks with a + * different IV or XTS tweak (which are based on the logical + * block number). + * + * XXX It's not clear why zero range isn't working, but we'll + * leave it disabled for encrypted inodes for now. This is a + * bug we should fix.... + */ + if (ext4_encrypted_inode(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))) + return -EOPNOTSUPP; + /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) @@ -4934,13 +4927,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; - /* - * currently supporting (pre)allocate mode for extent-based - * files _only_ - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EOPNOTSUPP; - if (mode & FALLOC_FL_COLLAPSE_RANGE) return ext4_collapse_range(inode, offset, len); @@ -4962,6 +4948,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) mutex_lock(&inode->i_mutex); + /* + * We only support preallocation for extent-based files only + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; @@ -5402,6 +5396,14 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size, ioffset; int ret; + /* + * We need to test this early because xfstests assumes that a + * collapse range of (0, 1) will return EOPNOTSUPP if the file + * system does not support collapse range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + /* Collapse range works only on fs block size aligned offsets. */ if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || len & (EXT4_CLUSTER_SIZE(sb) - 1)) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e04d45733976..26724aeece73 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -9,12 +9,10 @@ * * Ext4 extents status tree core functions. */ -#include <linux/rbtree.h> #include <linux/list_sort.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "ext4.h" -#include "extents_status.h" #include <trace/events/ext4.h> @@ -705,6 +703,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, BUG_ON(end < lblk); + if ((status & EXTENT_STATUS_DELAYED) && + (status & EXTENT_STATUS_WRITTEN)) { + ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " + " delayed and written which can potentially " + " cause data loss.\n", lblk, len); + WARN_ON(1); + } + newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index e576d682b353..0613c256c344 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -20,7 +20,6 @@ #include <linux/time.h> #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/quotaops.h> @@ -221,6 +220,13 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct inode *inode = file->f_mapping->host; + + if (ext4_encrypted_inode(inode)) { + int err = ext4_generate_encryption_key(inode); + if (err) + return 0; + } file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -238,6 +244,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; + int ret; if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && !(sb->s_flags & MS_RDONLY))) { @@ -276,11 +283,17 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * writing and the journal is present */ if (filp->f_mode & FMODE_WRITE) { - int ret = ext4_inode_attach_jinode(inode); + ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } - return dquot_file_open(inode, filp); + ret = dquot_file_open(inode, filp); + if (!ret && ext4_encrypted_inode(inode)) { + ret = ext4_generate_encryption_key(inode); + if (ret) + ret = -EACCES; + } + return ret; } /* diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index a8bc47f75fa0..8850254136ae 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/writeback.h> -#include <linux/jbd2.h> #include <linux/blkdev.h> #include "ext4.h" @@ -56,7 +55,7 @@ static int ext4_sync_parent(struct inode *inode) dentry = d_find_any_alias(inode); if (!dentry) break; - next = igrab(dentry->d_parent->d_inode); + next = igrab(d_inode(dentry->d_parent)); dput(dentry); if (!next) break; diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 3d586f02883e..e026aa941fd5 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -10,7 +10,6 @@ */ #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/cryptohash.h> #include "ext4.h" diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ac644c31ca67..1eaa6cb96cd0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -14,7 +14,6 @@ #include <linux/time.h> #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> @@ -444,7 +443,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && - ((parent == sb->s_root->d_inode) || + ((parent == d_inode(sb->s_root)) || (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; int ret = -1; @@ -997,6 +996,12 @@ got: ei->i_block_group = group; ei->i_last_alloc_group = ~0; + /* If the directory encrypted, then we should encrypt the inode. */ + if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) && + (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(sbi))) + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); @@ -1029,11 +1034,28 @@ got: ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; - +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) && + (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) { + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + } else { + /* Inline data and encryption are incompatible + * We turn off inline data since encryption is enabled */ + ei->i_inline_off = 1; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + } +#else ei->i_inline_off = 0; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - +#endif ret = inode; err = dquot_alloc_inode(inode); if (err) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3580629e42d3..958824019509 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -682,11 +682,11 @@ retry: * via ext4_inode_block_unlocked_dio(). Check inode's state * while holding extra i_dio_count ref. */ - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); smp_mb(); if (unlikely(ext4_test_inode_state(inode, EXT4_STATE_DIOREAD_LOCK))) { - inode_dio_done(inode); + inode_dio_end(inode); goto locked; } if (IS_DAX(inode)) @@ -697,7 +697,7 @@ retry: inode->i_sb->s_bdev, iter, offset, ext4_get_block, NULL, NULL, 0); - inode_dio_done(inode); + inode_dio_end(inode); } else { locked: if (IS_DAX(inode)) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 4b143febf21f..095c7a258d97 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -11,11 +11,13 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ + +#include <linux/fiemap.h> + #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "truncate.h" -#include <linux/fiemap.h> #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -972,7 +974,7 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, offset = 0; while ((void *)de < dlimit) { de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); - trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", + trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", offset, de_len, de->name_len, de->name, de->name_len, le32_to_cpu(de->inode)); if (ext4_check_dir_entry(dir, NULL, de, bh, @@ -998,7 +1000,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_iloc *iloc, void *inline_start, int inline_size) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; int err; @@ -1014,7 +1016,8 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(inode, de, inline_size, name, namelen); + ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name, + name, namelen); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); @@ -1251,7 +1254,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, int ret, inline_size; void *inline_start; struct ext4_iloc iloc; - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); ret = ext4_get_inode_loc(dir, &iloc); if (ret) @@ -1327,6 +1330,7 @@ int htree_inlinedir_to_tree(struct file *dir_file, struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; + struct ext4_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1398,8 +1402,10 @@ int htree_inlinedir_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - err = ext4_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de); + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, hinfo->hash, + hinfo->minor_hash, de, &tmp_str); if (err) { count = err; goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b49cf6e59953..0554b0b5957b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -20,7 +20,6 @@ #include <linux/fs.h> #include <linux/time.h> -#include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> @@ -36,7 +35,6 @@ #include <linux/kernel.h> #include <linux/printk.h> #include <linux/slab.h> -#include <linux/ratelimit.h> #include <linux/bitops.h> #include "ext4_jbd2.h" @@ -140,7 +138,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, /* * Test whether an inode is a fast symlink. */ -static int ext4_inode_is_fast_symlink(struct inode *inode) +int ext4_inode_is_fast_symlink(struct inode *inode) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; @@ -533,6 +531,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && ext4_find_delalloc_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; @@ -637,6 +636,7 @@ found: status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && ext4_find_delalloc_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; @@ -887,6 +887,95 @@ int do_journal_get_write_access(handle_t *handle, static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct inode *inode = page->mapping->host; + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + bool decrypt = false; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + bbits = ilog2(blocksize); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for (bh = head, block_start = 0; bh != head || !block_start; + block++, block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + break; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + clear_buffer_new(bh); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + continue; + } + if (block_end > to || block_start < from) + zero_user_segments(page, to, block_end, + block_start, from); + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++ = bh; + decrypt = ext4_encrypted_inode(inode) && + S_ISREG(inode->i_mode); + } + } + /* + * If we issued read requests, let them complete. + */ + while (wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + err = -EIO; + } + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + else if (decrypt) + err = ext4_decrypt_one(inode, page); + return err; +} +#endif + static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -949,11 +1038,19 @@ retry_journal: /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_should_dioread_nolock(inode)) + ret = ext4_block_write_begin(page, pos, len, + ext4_get_block_write); + else + ret = ext4_block_write_begin(page, pos, len, + ext4_get_block); +#else if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); else ret = __block_write_begin(page, pos, len, ext4_get_block); - +#endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, page_buffers(page), from, to, NULL, @@ -2575,7 +2672,12 @@ retry_journal: /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ret = ext4_block_write_begin(page, pos, len, + ext4_da_get_block_prep); +#else ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); +#endif if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); @@ -2821,7 +2923,7 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return mpage_readpage(page, ext4_get_block); + return ext4_mpage_readpages(page->mapping, NULL, page, 1); return ret; } @@ -2836,7 +2938,7 @@ ext4_readpages(struct file *file, struct address_space *mapping, if (ext4_has_inline_data(inode)) return 0; - return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); + return ext4_mpage_readpages(mapping, pages, NULL, nr_pages); } static void ext4_invalidatepage(struct page *page, unsigned int offset, @@ -2977,7 +3079,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * overwrite DIO as i_dio_count needs to be incremented under i_mutex. */ if (iov_iter_rw(iter) == WRITE) - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); @@ -3033,6 +3135,9 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); +#endif if (IS_DAX(inode)) ret = dax_do_io(iocb, inode, iter, offset, get_block_func, ext4_end_io_dio, dio_flags); @@ -3079,7 +3184,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, retake_lock: if (iov_iter_rw(iter) == WRITE) - inode_dio_done(inode); + inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) { up_read(&EXT4_I(inode)->i_data_sem); @@ -3097,6 +3202,11 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return 0; +#endif + /* * If we are doing data journalling we don't support O_DIRECT */ @@ -3261,6 +3371,13 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) goto unlock; + if (S_ISREG(inode->i_mode) && + ext4_encrypted_inode(inode)) { + /* We expect the key to be set. */ + BUG_ON(!ext4_has_encryption_key(inode)); + BUG_ON(blocksize != PAGE_CACHE_SIZE); + WARN_ON_ONCE(ext4_decrypt_one(inode, page)); + } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); @@ -4096,7 +4213,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext4_inode_is_fast_symlink(inode)) { + if (ext4_inode_is_fast_symlink(inode) && + !ext4_encrypted_inode(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); @@ -4227,7 +4345,7 @@ static void ext4_update_other_inodes_time(struct super_block *sb, int inode_size = EXT4_INODE_SIZE(sb); oi.orig_ino = orig_ino; - ino = orig_ino & ~(inodes_per_block - 1); + ino = (orig_ino & ~(inodes_per_block - 1)) + 1; for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; @@ -4521,7 +4639,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) */ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; @@ -4669,7 +4787,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode; unsigned long long delalloc_blocks; - inode = dentry->d_inode; + inode = d_inode(dentry); generic_fillattr(inode, stat); /* diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f58a0d106726..2cb9e178d1c5 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -8,12 +8,12 @@ */ #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/capability.h> #include <linux/time.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/file.h> +#include <linux/random.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -196,6 +196,16 @@ journal_err_out: return err; } +static int uuid_is_zero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return 0; + return 1; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -615,7 +625,78 @@ resizefs_out: } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); + case EXT4_IOC_SET_ENCRYPTION_POLICY: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_encryption_policy policy; + int err = 0; + + if (copy_from_user(&policy, + (struct ext4_encryption_policy __user *)arg, + sizeof(policy))) { + err = -EFAULT; + goto encryption_policy_out; + } + err = ext4_process_policy(&policy, inode); +encryption_policy_out: + return err; +#else + return -EOPNOTSUPP; +#endif + } + case EXT4_IOC_GET_ENCRYPTION_PWSALT: { + int err, err2; + struct ext4_sb_info *sbi = EXT4_SB(sb); + handle_t *handle; + + if (!ext4_sb_has_crypto(sb)) + return -EOPNOTSUPP; + if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { + err = mnt_want_write_file(filp); + if (err) + return err; + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto pwsalt_err_exit; + } + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto pwsalt_err_journal; + generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); + err = ext4_handle_dirty_metadata(handle, NULL, + sbi->s_sbh); + pwsalt_err_journal: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + pwsalt_err_exit: + mnt_drop_write_file(filp); + if (err) + return err; + } + if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt, + 16)) + return -EFAULT; + return 0; + } + case EXT4_IOC_GET_ENCRYPTION_POLICY: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_encryption_policy policy; + int err = 0; + + if (!ext4_encrypted_inode(inode)) + return -ENOENT; + err = ext4_get_policy(inode, &policy); + if (err) + return err; + if (copy_to_user((void *)arg, &policy, sizeof(policy))) + return -EFAULT; + return 0; +#else + return -EOPNOTSUPP; +#endif + } default: return -ENOTTY; } @@ -680,6 +761,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FITRIM: case EXT4_IOC_RESIZE_FS: case EXT4_IOC_PRECACHE_EXTENTS: + case EXT4_IOC_SET_ENCRYPTION_POLICY: + case EXT4_IOC_GET_ENCRYPTION_PWSALT: + case EXT4_IOC_GET_ENCRYPTION_POLICY: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 3cb267aee802..b52374e42102 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode) EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); - tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2291923dae4e..814f3beb4369 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/jbd2.h> #include <linux/time.h> #include <linux/fcntl.h> #include <linux/stat.h> @@ -254,8 +253,9 @@ static struct dx_frame *dx_probe(const struct qstr *d_name, struct dx_hash_info *hinfo, struct dx_frame *frame); static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, + struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, struct dx_map_entry *offsets, int count, unsigned blocksize); @@ -586,8 +586,10 @@ struct stats unsigned bcount; }; -static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, - int size, int show_names) +static struct stats dx_show_leaf(struct inode *dir, + struct dx_hash_info *hinfo, + struct ext4_dir_entry_2 *de, + int size, int show_names) { unsigned names = 0, space = 0; char *base = (char *) de; @@ -600,12 +602,73 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent { if (show_names) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + int len; + char *name; + struct ext4_str fname_crypto_str + = {.name = NULL, .len = 0}; + struct ext4_fname_crypto_ctx *ctx = NULL; + int res; + + name = de->name; + len = de->name_len; + ctx = ext4_get_fname_crypto_ctx(dir, + EXT4_NAME_LEN); + if (IS_ERR(ctx)) { + printk(KERN_WARNING "Error acquiring" + " crypto ctxt--skipping crypto\n"); + ctx = NULL; + } + if (ctx == NULL) { + /* Directory is not encrypted */ + ext4fs_dirhash(de->name, + de->name_len, &h); + printk("%*.s:(U)%x.%u ", len, + name, h.hash, + (unsigned) ((char *) de + - base)); + } else { + /* Directory is encrypted */ + res = ext4_fname_crypto_alloc_buffer( + ctx, de->name_len, + &fname_crypto_str); + if (res < 0) { + printk(KERN_WARNING "Error " + "allocating crypto " + "buffer--skipping " + "crypto\n"); + ext4_put_fname_crypto_ctx(&ctx); + ctx = NULL; + } + res = ext4_fname_disk_to_usr(ctx, NULL, de, + &fname_crypto_str); + if (res < 0) { + printk(KERN_WARNING "Error " + "converting filename " + "from disk to usr" + "\n"); + name = "??"; + len = 2; + } else { + name = fname_crypto_str.name; + len = fname_crypto_str.len; + } + ext4fs_dirhash(de->name, de->name_len, + &h); + printk("%*.s:(E)%x.%u ", len, name, + h.hash, (unsigned) ((char *) de + - base)); + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer( + &fname_crypto_str); + } +#else int len = de->name_len; char *name = de->name; - while (len--) printk("%c", *name++); ext4fs_dirhash(de->name, de->name_len, &h); - printk(":%x.%u ", h.hash, + printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); +#endif } space += EXT4_DIR_REC_LEN(de->name_len); names++; @@ -623,7 +686,6 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; - int err; printk("%i indexed blocks...\n", count); for (i = 0; i < count; i++, entries++) { @@ -637,7 +699,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): - dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); + dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) + bh->b_data, blocksize, 0); names += stats.names; space += stats.space; bcount += stats.bcount; @@ -687,8 +750,28 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (d_name) { + struct ext4_fname_crypto_ctx *ctx = NULL; + int res; + + /* Check if the directory is encrypted */ + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) { + ret_err = ERR_PTR(PTR_ERR(ctx)); + goto fail; + } + res = ext4_fname_usr_to_hash(ctx, d_name, hinfo); + if (res < 0) { + ret_err = ERR_PTR(res); + goto fail; + } + ext4_put_fname_crypto_ctx(&ctx); + } +#else if (d_name) ext4fs_dirhash(d_name->name, d_name->len, hinfo); +#endif hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -773,6 +856,7 @@ fail: brelse(frame->bh); frame--; } + if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning(dir->i_sb, "Corrupt dir inode %lu, running e2fsck is " @@ -878,6 +962,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -889,6 +975,24 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Check if the directory is encrypted */ + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + brelse(bh); + return err; + } + if (ctx != NULL) { + err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (err < 0) { + ext4_put_fname_crypto_ctx(&ctx); + brelse(bh); + return err; + } + } +#endif for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, @@ -904,14 +1008,37 @@ static int htree_dirblock_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - if ((err = ext4_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de)) != 0) { - brelse(bh); - return err; + if (ctx == NULL) { + /* Directory is not encrypted */ + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &tmp_str); + } else { + /* Directory is encrypted */ + err = ext4_fname_disk_to_usr(ctx, hinfo, de, + &fname_crypto_str); + if (err < 0) { + count = err; + goto errout; + } + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &fname_crypto_str); + } + if (err != 0) { + count = err; + goto errout; } count++; } +errout: brelse(bh); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); +#endif return count; } @@ -935,6 +1062,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int count = 0; int ret, err; __u32 hashval; + struct ext4_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); @@ -970,14 +1098,22 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 0, 0, + de, &tmp_str); + if (err != 0) goto errout; count++; } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; de = ext4_next_entry(de, dir->i_sb->s_blocksize); - if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 2, 0, + de, &tmp_str); + if (err != 0) goto errout; count++; } @@ -1035,8 +1171,8 @@ static inline int search_dirblock(struct buffer_head *bh, * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, +static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; @@ -1106,57 +1242,89 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * `len <= EXT4_NAME_LEN' is guaranteed by caller. * `de != NULL' is guaranteed by caller. */ -static inline int ext4_match (int len, const char * const name, - struct ext4_dir_entry_2 * de) +static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, + struct ext4_str *fname_crypto_str, + int len, const char * const name, + struct ext4_dir_entry_2 *de) { - if (len != de->name_len) - return 0; + int res; + if (!de->inode) return 0; - return !memcmp(name, de->name, len); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ctx) + return ext4_fname_match(ctx, fname_crypto_str, len, name, de); +#endif + if (len != de->name_len) + return 0; + res = memcmp(name, de->name, len); + return (res == 0) ? 1 : 0; } /* * Returns 0 if not found, -1 on failure, and 1 on success */ -int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir) +int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + struct inode *dir, const struct qstr *d_name, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; const char *name = d_name->name; int namelen = d_name->len; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + int res; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return -1; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ + if ((char *) de + de->name_len <= dlimit) { + res = ext4_match(ctx, &fname_crypto_str, namelen, + name, de); + if (res < 0) { + res = -1; + goto return_result; + } + if (res > 0) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, + bh->b_size, offset)) { + res = -1; + goto return_result; + } + *res_dir = de; + res = 1; + goto return_result; + } - if ((char *) de + namelen <= dlimit && - ext4_match (namelen, name, de)) { - /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) - return -1; - *res_dir = de; - return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (de_len <= 0) - return -1; + if (de_len <= 0) { + res = -1; + goto return_result; + } offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } - return 0; + + res = 0; +return_result: + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return res; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, @@ -1345,6 +1513,9 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q ext4_lblk_t block; int retval; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + *res_dir = NULL; +#endif frame = dx_probe(d_name, dir, &hinfo, frames); if (IS_ERR(frame)) return (struct buffer_head *) frame; @@ -1417,6 +1588,18 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi ino); return ERR_PTR(-EIO); } + if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) && + !ext4_is_child_context_consistent_with_parent(dir, + inode)) { + iput(inode); + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu\n", + (unsigned long) dir->i_ino, + (unsigned long) inode->i_ino); + return ERR_PTR(-EPERM); + } } return d_splice_alias(inode, dentry); } @@ -1429,7 +1612,7 @@ struct dentry *ext4_get_parent(struct dentry *child) struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL); if (IS_ERR(bh)) return (struct dentry *) bh; if (!bh) @@ -1437,13 +1620,13 @@ struct dentry *ext4_get_parent(struct dentry *child) ino = le32_to_cpu(de->inode); brelse(bh); - if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - EXT4_ERROR_INODE(child->d_inode, + if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) { + EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); return ERR_PTR(-EIO); } - return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino)); } /* @@ -1541,7 +1724,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map((struct ext4_dir_entry_2 *) data1, + count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1, blocksize, hinfo, map); map -= count; dx_sort_map(map, count); @@ -1564,7 +1747,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); + de2 = dx_move_dirents(data1, data2, map + split, count - split, + blocksize); de = dx_pack_dirents(data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, @@ -1580,8 +1764,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, initialize_dirent_tail(t, blocksize); } - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1, + blocksize, 1)); + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); /* Which block gets the new entry? */ if (hinfo->hash >= hash2) { @@ -1618,15 +1804,40 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, int nlen, rlen; unsigned int offset = 0; char *top; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + int res; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return -1; + + if (ctx != NULL) { + /* Calculate record length needed to store the entry */ + res = ext4_fname_crypto_namelen_on_disk(ctx, namelen); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + return res; + } + reclen = EXT4_DIR_REC_LEN(res); + } de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, - buf, buf_size, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; + buf, buf_size, offset)) { + res = -EIO; + goto return_result; + } + /* Provide crypto context and crypto buffer to ext4 match */ + res = ext4_match(ctx, &fname_crypto_str, namelen, name, de); + if (res < 0) + goto return_result; + if (res > 0) { + res = -EEXIST; + goto return_result; + } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) @@ -1634,26 +1845,62 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } - if ((char *) de > top) - return -ENOSPC; - *dest_de = de; - return 0; + if ((char *) de > top) + res = -ENOSPC; + else { + *dest_de = de; + res = 0; + } +return_result: + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return res; } -void ext4_insert_dentry(struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const char *name, int namelen) +int ext4_insert_dentry(struct inode *dir, + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const struct qstr *iname, + const char *name, int namelen) { int nlen, rlen; + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + struct ext4_str tmp_str; + int res; + + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return -EIO; + /* By default, the input name would be written to the disk */ + tmp_str.name = (unsigned char *)name; + tmp_str.len = namelen; + if (ctx != NULL) { + /* Directory is encrypted */ + res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + &fname_crypto_str); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + return -ENOMEM; + } + res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return res; + } + tmp_str.name = fname_crypto_str.name; + tmp_str.len = fname_crypto_str.len; + } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = - (struct ext4_dir_entry_2 *)((char *)de + nlen); + (struct ext4_dir_entry_2 *)((char *)de + nlen); de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); de = de1; @@ -1661,9 +1908,14 @@ void ext4_insert_dentry(struct inode *inode, de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); - de->name_len = namelen; - memcpy(de->name, name, namelen); + de->name_len = tmp_str.len; + + memcpy(de->name, tmp_str.name, tmp_str.len); + ext4_put_fname_crypto_ctx(&ctx); + ext4_fname_crypto_free_buffer(&fname_crypto_str); + return 0; } + /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1676,7 +1928,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned int blocksize = dir->i_sb->s_blocksize; @@ -1700,8 +1952,12 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, return err; } - /* By now the buffer is marked for journaling */ - ext4_insert_dentry(inode, de, blocksize, name, namelen); + /* By now the buffer is marked for journaling. Due to crypto operations, + * the following function call may fail */ + err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name, + name, namelen); + if (err < 0) + return err; /* * XXX shouldn't update any times until successful @@ -1732,9 +1988,14 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_fname_crypto_ctx *ctx = NULL; + int res; +#else const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; +#endif struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -1748,7 +2009,13 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; - int csum_size = 0; + int csum_size = 0; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); +#endif if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -1815,7 +2082,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo); + if (res < 0) { + ext4_put_fname_crypto_ctx(&ctx); + ext4_mark_inode_dirty(handle, dir); + brelse(bh); + return res; + } + ext4_put_fname_crypto_ctx(&ctx); +#else ext4fs_dirhash(name, namelen, &hinfo); +#endif memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; @@ -1864,8 +2142,8 @@ out_frames: static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; - struct buffer_head *bh; + struct inode *dir = d_inode(dentry->d_parent); + struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; struct super_block *sb; @@ -1889,14 +2167,14 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (retval == 1) { retval = 0; - return retval; + goto out; } } if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) - return retval; + goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; ext4_mark_inode_dirty(handle, dir); @@ -1908,14 +2186,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return PTR_ERR(bh); retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) { - brelse(bh); - return retval; - } + if (retval != -ENOSPC) + goto out; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) - return make_indexed_dir(handle, dentry, inode, bh); + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + retval = make_indexed_dir(handle, dentry, inode, bh); + bh = NULL; /* make_indexed_dir releases bh */ + goto out; + } brelse(bh); } bh = ext4_append(handle, dir, &block); @@ -1931,6 +2210,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } retval = add_dirent_to_buf(handle, dentry, inode, de, bh); +out: brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); @@ -1947,7 +2227,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct dx_entry *entries, *at; struct dx_hash_info hinfo; struct buffer_head *bh; - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; @@ -2237,7 +2517,20 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = ext4_add_nondir(handle, dentry, inode); + err = 0; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (!err && (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) { + err = ext4_inherit_context(dir, inode); + if (err) { + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + } + } +#endif + if (!err) + err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) ext4_handle_sync(handle); } @@ -2418,6 +2711,14 @@ retry: err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) { + err = ext4_inherit_context(dir, inode); + if (err) + goto out_clear_inode; + } +#endif err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -2450,7 +2751,7 @@ out_stop: /* * routine to check that the specified directory is empty (for rmdir) */ -static int empty_dir(struct inode *inode) +int ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; @@ -2702,7 +3003,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -2711,14 +3012,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (!bh) goto end_rmdir; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_rmdir; retval = -ENOTEMPTY; - if (!empty_dir(inode)) + if (!ext4_empty_dir(inode)) goto end_rmdir; handle = ext4_journal_start(dir, EXT4_HT_DIR, @@ -2771,7 +3072,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -2780,7 +3081,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (!bh) goto end_unlink; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -2828,16 +3129,25 @@ static int ext4_symlink(struct inode *dir, { handle_t *handle; struct inode *inode; - int l, err, retries = 0; + int err, len = strlen(symname); int credits; - - l = strlen(symname)+1; - if (l > dir->i_sb->s_blocksize) + bool encryption_required; + struct ext4_str disk_link; + struct ext4_encrypted_symlink_data *sd = NULL; + + disk_link.len = len + 1; + disk_link.name = (char *) symname; + + encryption_required = (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); + if (encryption_required) + disk_link.len = encrypted_symlink_data_len(len) + 1; + if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; dquot_initialize(dir); - if (l > EXT4_N_BLOCKS * 4) { + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, @@ -2856,16 +3166,49 @@ static int ext4_symlink(struct inode *dir, credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } -retry: + inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; + if (IS_ERR(inode)) { + if (handle) + ext4_journal_stop(handle); + return PTR_ERR(inode); + } + + if (encryption_required) { + struct ext4_fname_crypto_ctx *ctx = NULL; + struct qstr istr; + struct ext4_str ostr; + + sd = kzalloc(disk_link.len, GFP_NOFS); + if (!sd) { + err = -ENOMEM; + goto err_drop_inode; + } + err = ext4_inherit_context(dir, inode); + if (err) + goto err_drop_inode; + ctx = ext4_get_fname_crypto_ctx(inode, + inode->i_sb->s_blocksize); + if (IS_ERR_OR_NULL(ctx)) { + /* We just set the policy, so ctx should not be NULL */ + err = (ctx == NULL) ? -EIO : PTR_ERR(ctx); + goto err_drop_inode; + } + istr.name = (const unsigned char *) symname; + istr.len = len; + ostr.name = sd->encrypted_path; + err = ext4_fname_usr_to_disk(ctx, &istr, &ostr); + ext4_put_fname_crypto_ctx(&ctx); + if (err < 0) + goto err_drop_inode; + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *) sd; + } - if (l > EXT4_N_BLOCKS * 4) { + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* @@ -2881,9 +3224,10 @@ retry: drop_nlink(inode); err = ext4_orphan_add(handle, inode); ext4_journal_stop(handle); + handle = NULL; if (err) goto err_drop_inode; - err = __page_symlink(inode, symname, l, 1); + err = __page_symlink(inode, disk_link.name, disk_link.len, 1); if (err) goto err_drop_inode; /* @@ -2895,34 +3239,37 @@ retry: EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); + handle = NULL; goto err_drop_inode; } set_nlink(inode, 1); err = ext4_orphan_del(handle, inode); - if (err) { - ext4_journal_stop(handle); - clear_nlink(inode); + if (err) goto err_drop_inode; - } } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - inode->i_op = &ext4_fast_symlink_inode_operations; - memcpy((char *)&EXT4_I(inode)->i_data, symname, l); - inode->i_size = l-1; + inode->i_op = encryption_required ? + &ext4_symlink_inode_operations : + &ext4_fast_symlink_inode_operations; + memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, + disk_link.len); + inode->i_size = disk_link.len - 1; } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) ext4_handle_sync(handle); -out_stop: if (handle) ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) - goto retry; + kfree(sd); return err; err_drop_inode: + if (handle) + ext4_journal_stop(handle); + kfree(sd); + clear_nlink(inode); unlock_new_inode(inode); iput(inode); return err; @@ -2932,12 +3279,14 @@ static int ext4_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int err, retries = 0; if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; - + if (ext4_encrypted_inode(dir) && + !ext4_is_child_context_consistent_with_parent(dir, inode)) + return -EPERM; dquot_initialize(dir); retry: @@ -3204,12 +3553,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, - .inode = old_dentry->d_inode, + .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, - .inode = new_dentry->d_inode, + .inode = d_inode(new_dentry), }; int force_reread; int retval; @@ -3238,6 +3587,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; + if ((old.dir != new.dir) && + ext4_encrypted_inode(new.dir) && + !ext4_is_child_context_consistent_with_parent(new.dir, + old.inode)) { + retval = -EPERM; + goto end_rename; + } + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { @@ -3258,12 +3615,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); if (!(flags & RENAME_WHITEOUT)) { handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rename; + } } else { whiteout = ext4_whiteout_for_rename(&old, credits, &handle); - if (IS_ERR(whiteout)) - return PTR_ERR(whiteout); + if (IS_ERR(whiteout)) { + retval = PTR_ERR(whiteout); + whiteout = NULL; + goto end_rename; + } } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) @@ -3272,7 +3635,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old.inode->i_mode)) { if (new.inode) { retval = -ENOTEMPTY; - if (!empty_dir(new.inode)) + if (!ext4_empty_dir(new.inode)) goto end_rename; } else { retval = -EMLINK; @@ -3346,8 +3709,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_dec_count(handle, old.dir); if (new.inode) { - /* checked empty_dir above, can't have another parent, - * ext4_dec_count() won't work for many-linked dirs */ + /* checked ext4_empty_dir above, can't have another + * parent, ext4_dec_count() won't work for many-linked + * dirs */ clear_nlink(new.inode); } else { ext4_inc_count(handle, new.dir); @@ -3385,12 +3749,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, - .inode = old_dentry->d_inode, + .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, - .inode = new_dentry->d_inode, + .inode = d_inode(new_dentry), }; u8 new_file_type; int retval; @@ -3427,8 +3791,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, handle = ext4_journal_start(old.dir, EXT4_HT_DIR, (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rename; + } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 464984261e69..5765f88b3904 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -8,7 +8,6 @@ #include <linux/fs.h> #include <linux/time.h> -#include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> @@ -24,7 +23,6 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> -#include <linux/ratelimit.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -68,6 +66,10 @@ static void ext4_finish_bio(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct page *data_page = NULL; + struct ext4_crypto_ctx *ctx = NULL; +#endif struct buffer_head *bh, *head; unsigned bio_start = bvec->bv_offset; unsigned bio_end = bio_start + bvec->bv_len; @@ -77,6 +79,15 @@ static void ext4_finish_bio(struct bio *bio) if (!page) continue; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (!page->mapping) { + /* The bounce data pages are unmapped. */ + data_page = page; + ctx = (struct ext4_crypto_ctx *)page_private(data_page); + page = ctx->control_page; + } +#endif + if (error) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); @@ -101,8 +112,13 @@ static void ext4_finish_bio(struct bio *bio) } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); local_irq_restore(flags); - if (!under_io) + if (!under_io) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ctx) + ext4_restore_control_page(data_page); +#endif end_page_writeback(page); + } } } @@ -377,6 +393,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, static int io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, + struct page *page, struct buffer_head *bh) { int ret; @@ -390,7 +407,7 @@ submit_and_retry: if (ret) return ret; } - ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; io->io_next_block++; @@ -403,6 +420,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc, bool keep_towrite) { + struct page *data_page = NULL; struct inode *inode = page->mapping->host; unsigned block_start, blocksize; struct buffer_head *bh, *head; @@ -462,19 +480,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, set_buffer_async_write(bh); } while ((bh = bh->b_this_page) != head); - /* Now submit buffers to write */ bh = head = page_buffers(page); + + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + data_page = ext4_encrypt(inode, page); + if (IS_ERR(data_page)) { + ret = PTR_ERR(data_page); + data_page = NULL; + goto out; + } + } + + /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, bh); + ret = io_submit_add_bh(io, inode, + data_page ? data_page : page, bh); if (ret) { /* * We only get here on ENOMEM. Not much else * we can do but mark the page as dirty, and * better luck next time. */ - redirty_page_for_writepage(wbc, page); break; } nr_submitted++; @@ -483,6 +511,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* Error stopped previous loop? Clean up buffers... */ if (ret) { + out: + if (data_page) + ext4_restore_control_page(data_page); + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + redirty_page_for_writepage(wbc, page); do { clear_buffer_async_write(bh); bh = bh->b_this_page; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c new file mode 100644 index 000000000000..171b9ac4b45e --- /dev/null +++ b/fs/ext4/readpage.c @@ -0,0 +1,328 @@ +/* + * linux/fs/ext4/readpage.c + * + * Copyright (C) 2002, Linus Torvalds. + * Copyright (C) 2015, Google, Inc. + * + * This was originally taken from fs/mpage.c + * + * The intent is the ext4_mpage_readpages() function here is intended + * to replace mpage_readpages() in the general case, not just for + * encrypted files. It has some limitations (see below), where it + * will fall back to read_block_full_page(), but these limitations + * should only be hit when page_size != block_size. + * + * This will allow us to attach a callback function to support ext4 + * encryption. + * + * If anything unusual happens, such as: + * + * - encountering a page which has buffers + * - encountering a page which has a non-hole after a hole + * - encountering a page with non-contiguous blocks + * + * then this code just gives up and calls the buffer_head-based read function. + * It does handle a page which has holes at the end - that is a common case: + * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. + * + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/kdev_t.h> +#include <linux/gfp.h> +#include <linux/bio.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/highmem.h> +#include <linux/prefetch.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/pagevec.h> +#include <linux/cleancache.h> + +#include "ext4.h" + +/* + * Call ext4_decrypt on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_crypto_ctx *ctx = + container_of(work, struct ext4_crypto_ctx, work); + struct bio *bio = ctx->bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + int ret = ext4_decrypt(ctx, page); + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else + SetPageUptodate(page); + unlock_page(page); + } + ext4_release_crypto_ctx(ctx); + bio_put(bio); +#else + BUG(); +#endif +} + +static inline bool ext4_bio_encrypted(struct bio *bio) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + return unlikely(bio->bi_private != NULL); +#else + return false; +#endif +} + +/* + * I/O completion handler for multipage BIOs. + * + * The mpage code never puts partial pages into a BIO (except for end-of-file). + * If a page does not map to a contiguous run of blocks then it simply falls + * back to block_read_full_page(). + * + * Why is this? If a page's completion depends on a number of different BIOs + * which can complete in any order (or at the same time) then determining the + * status of that page is hard. See end_buffer_async_read() for the details. + * There is no point in duplicating all that complexity. + */ +static void mpage_end_io(struct bio *bio, int err) +{ + struct bio_vec *bv; + int i; + + if (ext4_bio_encrypted(bio)) { + struct ext4_crypto_ctx *ctx = bio->bi_private; + + if (err) { + ext4_release_crypto_ctx(ctx); + } else { + INIT_WORK(&ctx->work, completion_pages); + ctx->bio = bio; + queue_work(ext4_read_workqueue, &ctx->work); + return; + } + } + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + + bio_put(bio); +} + +int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages) +{ + struct bio *bio = NULL; + unsigned page_idx; + sector_t last_block_in_bio = 0; + + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + struct block_device *bdev = inode->i_sb->s_bdev; + int length; + unsigned relative_block = 0; + struct ext4_map_blocks map; + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + + for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { + int fully_mapped = 1; + unsigned first_hole = blocks_per_page; + + prefetchw(&page->flags); + if (pages) { + page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) + goto next_page; + } + + if (page_has_buffers(page)) + goto confused; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = block_in_file + nr_pages * blocks_per_page; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + page_block = 0; + + /* + * Map blocks using the previous result first. + */ + if ((map.m_flags & EXT4_MAP_MAPPED) && + block_in_file > map.m_lblk && + block_in_file < (map.m_lblk + map.m_len)) { + unsigned map_offset = block_in_file - map.m_lblk; + unsigned last = map.m_len - map_offset; + + for (relative_block = 0; ; relative_block++) { + if (relative_block == last) { + /* needed? */ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } + if (page_block == blocks_per_page) + break; + blocks[page_block] = map.m_pblk + map_offset + + relative_block; + page_block++; + block_in_file++; + } + } + + /* + * Then do more ext4_map_blocks() calls until we are + * done with this page. + */ + while (page_block < blocks_per_page) { + if (block_in_file < last_block) { + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; + + if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { + set_error_page: + SetPageError(page); + zero_user_segment(page, 0, + PAGE_CACHE_SIZE); + unlock_page(page); + goto next_page; + } + } + if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { + fully_mapped = 0; + if (first_hole == blocks_per_page) + first_hole = page_block; + page_block++; + block_in_file++; + continue; + } + if (first_hole != blocks_per_page) + goto confused; /* hole -> non-hole */ + + /* Contiguous blocks? */ + if (page_block && blocks[page_block-1] != map.m_pblk-1) + goto confused; + for (relative_block = 0; ; relative_block++) { + if (relative_block == map.m_len) { + /* needed? */ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } else if (page_block == blocks_per_page) + break; + blocks[page_block] = map.m_pblk+relative_block; + page_block++; + block_in_file++; + } + } + if (first_hole != blocks_per_page) { + zero_user_segment(page, first_hole << blkbits, + PAGE_CACHE_SIZE); + if (first_hole == 0) { + SetPageUptodate(page); + unlock_page(page); + goto next_page; + } + } else if (fully_mapped) { + SetPageMappedToDisk(page); + } + if (fully_mapped && blocks_per_page == 1 && + !PageUptodate(page) && cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (last_block_in_bio != blocks[0] - 1)) { + submit_and_realloc: + submit_bio(READ, bio); + bio = NULL; + } + if (bio == NULL) { + struct ext4_crypto_ctx *ctx = NULL; + + if (ext4_encrypted_inode(inode) && + S_ISREG(inode->i_mode)) { + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + goto set_error_page; + } + bio = bio_alloc(GFP_KERNEL, + min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + if (!bio) { + if (ctx) + ext4_release_crypto_ctx(ctx); + goto set_error_page; + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); + bio->bi_end_io = mpage_end_io; + bio->bi_private = ctx; + } + + length = first_hole << blkbits; + if (bio_add_page(bio, page, length, 0) < length) + goto submit_and_realloc; + + if (((map.m_flags & EXT4_MAP_BOUNDARY) && + (relative_block == map.m_len)) || + (first_hole != blocks_per_page)) { + submit_bio(READ, bio); + bio = NULL; + } else + last_block_in_bio = blocks[blocks_per_page - 1]; + goto next_page; + confused: + if (bio) { + submit_bio(READ, bio); + bio = NULL; + } + if (!PageUptodate(page)) + block_read_full_page(page, ext4_get_block); + else + unlock_page(page); + next_page: + if (pages) + page_cache_release(page); + } + BUG_ON(pages && !list_empty(pages)); + if (bio) + submit_bio(READ, bio); + return 0; +} diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 8a8ec6293b19..cf0c472047e3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1432,12 +1432,15 @@ static int ext4_flex_group_add(struct super_block *sb, goto exit; /* * We will always be modifying at least the superblock and GDT - * block. If we are adding a group past the last current GDT block, + * blocks. If we are adding a group past the last current GDT block, * we will also modify the inode and the dindirect block. If we * are adding a group with superblock/GDT backups we will also * modify each of the reserved GDT dindirect blocks. */ - credit = flex_gd->count * 4 + reserved_gdb; + credit = 3; /* sb, resize inode, resize inode dindirect */ + /* GDT blocks */ + credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb)); + credit += reserved_gdb; /* Reserved GDT dindirect blocks */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); if (IS_ERR(handle)) { err = PTR_ERR(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d348c7d29d80..ca9d4a2fed41 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -21,7 +21,6 @@ #include <linux/fs.h> #include <linux/time.h> #include <linux/vmalloc.h> -#include <linux/jbd2.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> @@ -295,6 +294,8 @@ static void __save_error_info(struct super_block *sb, const char *func, struct ext4_super_block *es = EXT4_SB(sb)->s_es; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + if (bdev_read_only(sb->s_bdev)) + return; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); es->s_last_error_time = cpu_to_le32(get_seconds()); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); @@ -323,22 +324,6 @@ static void save_error_info(struct super_block *sb, const char *func, ext4_commit_super(sb, 1); } -/* - * The del_gendisk() function uninitializes the disk-specific data - * structures, including the bdi structure, without telling anyone - * else. Once this happens, any attempt to call mark_buffer_dirty() - * (for example, by ext4_commit_super), will cause a kernel OOPS. - * This is a kludge to prevent these oops until we can put in a proper - * hook in del_gendisk() to inform the VFS and file system layers. - */ -static int block_device_ejected(struct super_block *sb) -{ - struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = inode_to_bdi(bd_inode); - - return bdi->dev == NULL; -} - static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; @@ -893,6 +878,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID; +#endif return &ei->vfs_inode; } @@ -1120,7 +1108,7 @@ enum { Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, + Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, @@ -1211,6 +1199,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1412,6 +1401,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, + {Opt_test_dummy_encryption, 0, MOPT_GTE0}, {Opt_err, 0, 0} }; @@ -1568,7 +1558,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } - journal_inode = path.dentry->d_inode; + journal_inode = d_inode(path.dentry); if (!S_ISBLK(journal_inode->i_mode)) { ext4_msg(sb, KERN_ERR, "error: journal path %s " "is not a block device", journal_path); @@ -1588,6 +1578,15 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + } else if (token == Opt_test_dummy_encryption) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mode enabled"); +#else + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mount option ignored"); +#endif } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) @@ -2685,11 +2684,13 @@ static struct attribute *ext4_attrs[] = { EXT4_INFO_ATTR(lazy_itable_init); EXT4_INFO_ATTR(batched_discard); EXT4_INFO_ATTR(meta_bg_resize); +EXT4_INFO_ATTR(encryption); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), + ATTR_LIST(encryption), NULL, }; @@ -3448,6 +3449,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_bdev->bd_part) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, sectors[1]); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Modes of operations for file and directory encryption. */ + sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID; +#endif /* Cleanup superblock name */ for (cp = sb->s_id; (cp = strchr(cp, '/'));) @@ -3692,6 +3698,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) && + es->s_encryption_level) { + ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", + es->s_encryption_level); + goto failed_mount; + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -4054,6 +4067,13 @@ no_journal: } } + if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) && + !(sb->s_flags & MS_RDONLY) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); + ext4_commit_super(sb, 1); + } + /* * Get the # of file system overhead blocks from the * superblock if present. @@ -4570,7 +4590,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh || block_device_ejected(sb)) + if (!sbh) return error; if (buffer_write_io_error(sbh)) { /* @@ -5199,7 +5219,7 @@ static int ext4_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2); + handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -5247,7 +5267,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * all updates to the file when we bypass pagecache... */ if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(path->dentry->d_inode)) { + ext4_should_journal_data(d_inode(path->dentry))) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index ff3711932018..187b78920314 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -18,22 +18,115 @@ */ #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/namei.h> #include "ext4.h" #include "xattr.h" +#ifdef CONFIG_EXT4_FS_ENCRYPTION static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); + struct page *cpage = NULL; + char *caddr, *paddr = NULL; + struct ext4_str cstr, pstr; + struct inode *inode = d_inode(dentry); + struct ext4_fname_crypto_ctx *ctx = NULL; + struct ext4_encrypted_symlink_data *sd; + loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); + int res; + u32 plen, max_size = inode->i_sb->s_blocksize; + + if (!ext4_encrypted_inode(inode)) + return page_follow_link_light(dentry, nd); + + ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize); + if (IS_ERR(ctx)) + return ctx; + + if (ext4_inode_is_fast_symlink(inode)) { + caddr = (char *) EXT4_I(inode)->i_data; + max_size = sizeof(EXT4_I(inode)->i_data); + } else { + cpage = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(cpage)) { + ext4_put_fname_crypto_ctx(&ctx); + return cpage; + } + caddr = kmap(cpage); + caddr[size] = 0; + } + + /* Symlink is encrypted */ + sd = (struct ext4_encrypted_symlink_data *)caddr; + cstr.name = sd->encrypted_path; + cstr.len = le32_to_cpu(sd->len); + if ((cstr.len + + sizeof(struct ext4_encrypted_symlink_data) - 1) > + max_size) { + /* Symlink data on the disk is corrupted */ + res = -EIO; + goto errout; + } + plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ? + EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len; + paddr = kmalloc(plen + 1, GFP_NOFS); + if (!paddr) { + res = -ENOMEM; + goto errout; + } + pstr.name = paddr; + res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); + if (res < 0) + goto errout; + /* Null-terminate the name */ + if (res <= plen) + paddr[res] = '\0'; + nd_set_link(nd, paddr); + ext4_put_fname_crypto_ctx(&ctx); + if (cpage) { + kunmap(cpage); + page_cache_release(cpage); + } + return NULL; +errout: + ext4_put_fname_crypto_ctx(&ctx); + if (cpage) { + kunmap(cpage); + page_cache_release(cpage); + } + kfree(paddr); + return ERR_PTR(res); +} + +static void ext4_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + struct page *page = cookie; + + if (!page) { + kfree(nd_get_link(nd)); + } else { + kunmap(page); + page_cache_release(page); + } +} +#endif + +static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext4_inode_info *ei = EXT4_I(d_inode(dentry)); nd_set_link(nd, (char *) ei->i_data); return NULL; } const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, +#ifdef CONFIG_EXT4_FS_ENCRYPTION + .follow_link = ext4_follow_link, + .put_link = ext4_put_link, +#else .follow_link = page_follow_link_light, .put_link = page_put_link, +#endif .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -43,7 +136,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext4_follow_link, + .follow_link = ext4_follow_fast_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1e09fc77395c..16e28c08d1e8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -55,7 +55,6 @@ #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> -#include <linux/rwsem.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -179,7 +178,7 @@ ext4_xattr_handler(int name_index) /* * Inode operation listxattr() * - * dentry->d_inode->i_mutex: don't care + * d_inode(dentry)->i_mutex: don't care */ ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -424,7 +423,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, static int ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); @@ -461,7 +460,7 @@ cleanup: static int ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -502,7 +501,7 @@ ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; - down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + down_read(&EXT4_I(d_inode(dentry))->xattr_sem); ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; @@ -515,7 +514,7 @@ ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) goto errout; ret += ret2; errout: - up_read(&EXT4_I(dentry->d_inode)->xattr_sem); + up_read(&EXT4_I(d_inode(dentry))->xattr_sem); return ret; } @@ -639,8 +638,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) free += EXT4_XATTR_LEN(name_len); } if (i->value) { - if (free < EXT4_XATTR_SIZE(i->value_len) || - free < EXT4_XATTR_LEN(name_len) + + if (free < EXT4_XATTR_LEN(name_len) + EXT4_XATTR_SIZE(i->value_len)) return -ENOSPC; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 29bedf5589f6..ddc0957760ba 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -23,6 +23,7 @@ #define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SYSTEM 7 #define EXT4_XATTR_INDEX_RICHACL 8 +#define EXT4_XATTR_INDEX_ENCRYPTION 9 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ @@ -98,6 +99,8 @@ extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; +#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" + extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index d2a200624af5..95d90e0560f0 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -33,7 +33,7 @@ ext4_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -43,7 +43,7 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 95f1f4ab59a4..891ee2ddfbd6 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -36,7 +36,7 @@ ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -46,7 +46,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 0edb7611ffbe..6ed932b3c043 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -37,7 +37,7 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER, name, buffer, size); } @@ -49,7 +49,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 94e2d2ffabe1..05f0f663f14c 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -1,5 +1,5 @@ config F2FS_FS - tristate "F2FS filesystem support (EXPERIMENTAL)" + tristate "F2FS filesystem support" depends on BLOCK help F2FS is based on Log-structured File System (LFS), which supports diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 742202779bd5..4320ffab3495 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -351,13 +351,11 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, *acl = f2fs_acl_clone(p, GFP_NOFS); if (!*acl) - return -ENOMEM; + goto no_mem; ret = f2fs_acl_create_masq(*acl, mode); - if (ret < 0) { - posix_acl_release(*acl); - return -ENOMEM; - } + if (ret < 0) + goto no_mem_clone; if (ret == 0) { posix_acl_release(*acl); @@ -378,6 +376,12 @@ no_acl: *default_acl = NULL; *acl = NULL; return 0; + +no_mem_clone: + posix_acl_release(*acl); +no_mem: + posix_acl_release(p); + return -ENOMEM; } int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7f794b72b3b7..a5e17a2a0781 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -276,7 +276,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (f2fs_write_meta_page(page, &wbc)) { + if (mapping->a_ops->writepage(page, &wbc)) { unlock_page(page); break; } @@ -464,20 +464,19 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) void recover_orphan_inodes(struct f2fs_sb_info *sbi) { - block_t start_blk, orphan_blkaddr, i, j; + block_t start_blk, orphan_blocks, i, j; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) return; set_sbi_flag(sbi, SBI_POR_DOING); - start_blk = __start_cp_addr(sbi) + 1 + - le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); - orphan_blkaddr = __start_sum_addr(sbi) - 1; + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); + orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); - ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); + ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP); - for (i = 0; i < orphan_blkaddr; i++) { + for (i = 0; i < orphan_blocks; i++) { struct page *page = get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; @@ -615,7 +614,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) unsigned long blk_size = sbi->blocksize; unsigned long long cp1_version = 0, cp2_version = 0; unsigned long long cp_start_blk_no; - unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + unsigned int cp_blks = 1 + __cp_payload(sbi); block_t cp_blk_no; int i; @@ -796,6 +795,7 @@ retry: * wribacking dentry pages in the freeing inode. */ f2fs_submit_merged_bio(sbi, DATA, WRITE); + cond_resched(); } goto retry; } @@ -884,7 +884,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; void *kaddr; int i; - int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + int cp_payload_blks = __cp_payload(sbi); /* * This avoids to conduct wrong roll-forward operations and uses @@ -1048,17 +1048,18 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT) + (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC)) goto out; if (unlikely(f2fs_cp_error(sbi))) goto out; if (f2fs_readonly(sbi->sb)) goto out; + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); + if (block_operations(sbi)) goto out; @@ -1085,6 +1086,10 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); + + if (cpc->reason == CP_RECOVERY) + f2fs_msg(sbi->sb, KERN_NOTICE, + "checkpoint: version = %llx", ckpt_ver); out: mutex_unlock(&sbi->cp_mutex); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); @@ -1103,14 +1108,9 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) im->ino_num = 0; } - /* - * considering 512 blocks in a segment 8 blocks are needed for cp - * and log segment summaries. Remaining blocks are used to keep - * orphan entries with the limitation one reserved segment - * for cp pack we can have max 1020*504 orphan entries - */ sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - - NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; + NR_CURSEG_TYPE - __cp_payload(sbi)) * + F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 319eda511c4f..1e1aae669fa8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -25,6 +25,9 @@ #include "trace.h" #include <trace/events/f2fs.h> +static struct kmem_cache *extent_tree_slab; +static struct kmem_cache *extent_node_slab; + static void f2fs_read_end_io(struct bio *bio, int err) { struct bio_vec *bvec; @@ -197,7 +200,7 @@ alloc_new: * ->node_page * update block addresses in the node page */ -static void __set_data_blkaddr(struct dnode_of_data *dn) +void set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn; __le32 *addr_array; @@ -226,7 +229,7 @@ int reserve_new_block(struct dnode_of_data *dn) trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); dn->data_blkaddr = NEW_ADDR; - __set_data_blkaddr(dn); + set_data_blkaddr(dn); mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; @@ -248,73 +251,62 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -static int check_extent_cache(struct inode *inode, pgoff_t pgofs, - struct buffer_head *bh_result) +static void f2fs_map_bh(struct super_block *sb, pgoff_t pgofs, + struct extent_info *ei, struct buffer_head *bh_result) +{ + unsigned int blkbits = sb->s_blocksize_bits; + size_t max_size = bh_result->b_size; + size_t mapped_size; + + clear_buffer_new(bh_result); + map_bh(bh_result, sb, ei->blk + pgofs - ei->fofs); + mapped_size = (ei->fofs + ei->len - pgofs) << blkbits; + bh_result->b_size = min(max_size, mapped_size); +} + +static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) { struct f2fs_inode_info *fi = F2FS_I(inode); pgoff_t start_fofs, end_fofs; block_t start_blkaddr; - if (is_inode_flag_set(fi, FI_NO_EXTENT)) - return 0; - - read_lock(&fi->ext.ext_lock); + read_lock(&fi->ext_lock); if (fi->ext.len == 0) { - read_unlock(&fi->ext.ext_lock); - return 0; + read_unlock(&fi->ext_lock); + return false; } stat_inc_total_hit(inode->i_sb); start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk_addr; + start_blkaddr = fi->ext.blk; if (pgofs >= start_fofs && pgofs <= end_fofs) { - unsigned int blkbits = inode->i_sb->s_blocksize_bits; - size_t count; - - set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, - start_blkaddr + pgofs - start_fofs); - count = end_fofs - pgofs + 1; - if (count < (UINT_MAX >> blkbits)) - bh_result->b_size = (count << blkbits); - else - bh_result->b_size = UINT_MAX; - + *ei = fi->ext; stat_inc_read_hit(inode->i_sb); - read_unlock(&fi->ext.ext_lock); - return 1; + read_unlock(&fi->ext_lock); + return true; } - read_unlock(&fi->ext.ext_lock); - return 0; + read_unlock(&fi->ext_lock); + return false; } -void update_extent_cache(struct dnode_of_data *dn) +static bool update_extent_info(struct inode *inode, pgoff_t fofs, + block_t blkaddr) { - struct f2fs_inode_info *fi = F2FS_I(dn->inode); - pgoff_t fofs, start_fofs, end_fofs; + struct f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; int need_update = true; - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); - - /* Update the page address in the parent node */ - __set_data_blkaddr(dn); - - if (is_inode_flag_set(fi, FI_NO_EXTENT)) - return; - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - write_lock(&fi->ext.ext_lock); + write_lock(&fi->ext_lock); start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk_addr; - end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1; + start_blkaddr = fi->ext.blk; + end_blkaddr = fi->ext.blk + fi->ext.len - 1; /* Drop and initialize the matched extent */ if (fi->ext.len == 1 && fofs == start_fofs) @@ -322,24 +314,24 @@ void update_extent_cache(struct dnode_of_data *dn) /* Initial extent */ if (fi->ext.len == 0) { - if (dn->data_blkaddr != NULL_ADDR) { + if (blkaddr != NULL_ADDR) { fi->ext.fofs = fofs; - fi->ext.blk_addr = dn->data_blkaddr; + fi->ext.blk = blkaddr; fi->ext.len = 1; } goto end_update; } /* Front merge */ - if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) { + if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) { fi->ext.fofs--; - fi->ext.blk_addr--; + fi->ext.blk--; fi->ext.len++; goto end_update; } /* Back merge */ - if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) { + if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) { fi->ext.len++; goto end_update; } @@ -351,8 +343,7 @@ void update_extent_cache(struct dnode_of_data *dn) fi->ext.len = fofs - start_fofs; } else { fi->ext.fofs = fofs + 1; - fi->ext.blk_addr = start_blkaddr + - fofs - start_fofs + 1; + fi->ext.blk = start_blkaddr + fofs - start_fofs + 1; fi->ext.len -= fofs - start_fofs + 1; } } else { @@ -366,27 +357,583 @@ void update_extent_cache(struct dnode_of_data *dn) need_update = true; } end_update: - write_unlock(&fi->ext.ext_lock); - if (need_update) - sync_inode_page(dn); + write_unlock(&fi->ext_lock); + return need_update; +} + +static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node *parent, struct rb_node **p) +{ + struct extent_node *en; + + en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + if (!en) + return NULL; + + en->ei = *ei; + INIT_LIST_HEAD(&en->list); + + rb_link_node(&en->rb_node, parent, p); + rb_insert_color(&en->rb_node, &et->root); + et->count++; + atomic_inc(&sbi->total_ext_node); + return en; +} + +static void __detach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + rb_erase(&en->rb_node, &et->root); + et->count--; + atomic_dec(&sbi->total_ext_node); + + if (et->cached_en == en) + et->cached_en = NULL; +} + +static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi, + nid_t ino) +{ + struct extent_tree *et; + + down_read(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + up_read(&sbi->extent_tree_lock); + return NULL; + } + atomic_inc(&et->refcount); + up_read(&sbi->extent_tree_lock); + + return et; +} + +static struct extent_tree *__grab_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + nid_t ino = inode->i_ino; + + down_write(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + memset(et, 0, sizeof(struct extent_tree)); + et->ino = ino; + et->root = RB_ROOT; + et->cached_en = NULL; + rwlock_init(&et->lock); + atomic_set(&et->refcount, 0); + et->count = 0; + sbi->total_ext_tree++; + } + atomic_inc(&et->refcount); + up_write(&sbi->extent_tree_lock); + + return et; +} + +static struct extent_node *__lookup_extent_tree(struct extent_tree *et, + unsigned int fofs) +{ + struct rb_node *node = et->root.rb_node; + struct extent_node *en; + + if (et->cached_en) { + struct extent_info *cei = &et->cached_en->ei; + + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) + return et->cached_en; + } + + while (node) { + en = rb_entry(node, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) { + node = node->rb_left; + } else if (fofs >= en->ei.fofs + en->ei.len) { + node = node->rb_right; + } else { + et->cached_en = en; + return en; + } + } + return NULL; +} + +static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_node *prev; + struct rb_node *node; + + node = rb_prev(&en->rb_node); + if (!node) + return NULL; + + prev = rb_entry(node, struct extent_node, rb_node); + if (__is_back_mergeable(&en->ei, &prev->ei)) { + en->ei.fofs = prev->ei.fofs; + en->ei.blk = prev->ei.blk; + en->ei.len += prev->ei.len; + __detach_extent_node(sbi, et, prev); + return prev; + } + return NULL; +} + +static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_node *next; + struct rb_node *node; + + node = rb_next(&en->rb_node); + if (!node) + return NULL; + + next = rb_entry(node, struct extent_node, rb_node); + if (__is_front_mergeable(&en->ei, &next->ei)) { + en->ei.len += next->ei.len; + __detach_extent_node(sbi, et, next); + return next; + } + return NULL; +} + +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node **den) +{ + struct rb_node **p = &et->root.rb_node; + struct rb_node *parent = NULL; + struct extent_node *en; + + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) { + if (__is_front_mergeable(ei, &en->ei)) { + f2fs_bug_on(sbi, !den); + en->ei.fofs = ei->fofs; + en->ei.blk = ei->blk; + en->ei.len += ei->len; + *den = __try_back_merge(sbi, et, en); + return en; + } + p = &(*p)->rb_left; + } else if (ei->fofs >= en->ei.fofs + en->ei.len) { + if (__is_back_mergeable(ei, &en->ei)) { + f2fs_bug_on(sbi, !den); + en->ei.len += ei->len; + *den = __try_front_merge(sbi, et, en); + return en; + } + p = &(*p)->rb_right; + } else { + f2fs_bug_on(sbi, 1); + } + } + + return __attach_extent_node(sbi, et, ei, parent, p); +} + +static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, bool free_all) +{ + struct rb_node *node, *next; + struct extent_node *en; + unsigned int count = et->count; + + node = rb_first(&et->root); + while (node) { + next = rb_next(node); + en = rb_entry(node, struct extent_node, rb_node); + + if (free_all) { + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + } + + if (free_all || list_empty(&en->list)) { + __detach_extent_node(sbi, et, en); + kmem_cache_free(extent_node_slab, en); + } + node = next; + } + + return count - et->count; +} + +static void f2fs_init_extent_tree(struct inode *inode, + struct f2fs_extent *i_ext) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en; + struct extent_info ei; + + if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) + return; + + et = __grab_extent_tree(inode); + + write_lock(&et->lock); + if (et->count) + goto out; + + set_extent_info(&ei, le32_to_cpu(i_ext->fofs), + le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + + en = __insert_extent_tree(sbi, et, &ei, NULL); + if (en) { + et->cached_en = en; + + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + } +out: + write_unlock(&et->lock); + atomic_dec(&et->refcount); +} + +static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en; + + trace_f2fs_lookup_extent_tree_start(inode, pgofs); + + et = __find_extent_tree(sbi, inode->i_ino); + if (!et) + return false; + + read_lock(&et->lock); + en = __lookup_extent_tree(et, pgofs); + if (en) { + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_move_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + stat_inc_read_hit(sbi->sb); + } + stat_inc_total_hit(sbi->sb); + read_unlock(&et->lock); + + trace_f2fs_lookup_extent_tree_end(inode, pgofs, en); + + atomic_dec(&et->refcount); + return en ? true : false; +} + +static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, + block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *den = NULL; + struct extent_info ei, dei; + unsigned int endofs; + + trace_f2fs_update_extent_tree(inode, fofs, blkaddr); + + et = __grab_extent_tree(inode); + + write_lock(&et->lock); + + /* 1. lookup and remove existing extent info in cache */ + en = __lookup_extent_tree(et, fofs); + if (!en) + goto update_extent; + + dei = en->ei; + __detach_extent_node(sbi, et, en); + + /* 2. if extent can be split more, split and insert the left part */ + if (dei.len > 1) { + /* insert left part of split extent into cache */ + if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, dei.fofs, dei.blk, + fofs - dei.fofs); + en1 = __insert_extent_tree(sbi, et, &ei, NULL); + } + + /* insert right part of split extent into cache */ + endofs = dei.fofs + dei.len - 1; + if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, fofs + 1, + fofs - dei.fofs + dei.blk, endofs - fofs); + en2 = __insert_extent_tree(sbi, et, &ei, NULL); + } + } + +update_extent: + /* 3. update extent in extent cache */ + if (blkaddr) { + set_extent_info(&ei, fofs, blkaddr, 1); + en3 = __insert_extent_tree(sbi, et, &ei, &den); + } + + /* 4. update in global extent list */ + spin_lock(&sbi->extent_lock); + if (en && !list_empty(&en->list)) + list_del(&en->list); + /* + * en1 and en2 split from en, they will become more and more smaller + * fragments after splitting several times. So if the length is smaller + * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree. + */ + if (en1) + list_add_tail(&en1->list, &sbi->extent_list); + if (en2) + list_add_tail(&en2->list, &sbi->extent_list); + if (en3) { + if (list_empty(&en3->list)) + list_add_tail(&en3->list, &sbi->extent_list); + else + list_move_tail(&en3->list, &sbi->extent_list); + } + if (den && !list_empty(&den->list)) + list_del(&den->list); + spin_unlock(&sbi->extent_lock); + + /* 5. release extent node */ + if (en) + kmem_cache_free(extent_node_slab, en); + if (den) + kmem_cache_free(extent_node_slab, den); + + write_unlock(&et->lock); + atomic_dec(&et->refcount); +} + +void f2fs_preserve_extent_tree(struct inode *inode) +{ + struct extent_tree *et; + struct extent_info *ext = &F2FS_I(inode)->ext; + bool sync = false; + + if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) + return; + + et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino); + if (!et) { + if (ext->len) { + ext->len = 0; + update_inode_page(inode); + } + return; + } + + read_lock(&et->lock); + if (et->count) { + struct extent_node *en; + + if (et->cached_en) { + en = et->cached_en; + } else { + struct rb_node *node = rb_first(&et->root); + + if (!node) + node = rb_last(&et->root); + en = rb_entry(node, struct extent_node, rb_node); + } + + if (__is_extent_same(ext, &en->ei)) + goto out; + + *ext = en->ei; + sync = true; + } else if (ext->len) { + ext->len = 0; + sync = true; + } +out: + read_unlock(&et->lock); + atomic_dec(&et->refcount); + + if (sync) + update_inode_page(inode); +} + +void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_node *en, *tmp; + unsigned long ino = F2FS_ROOT_INO(sbi); + struct radix_tree_iter iter; + void **slot; + unsigned int found; + unsigned int node_cnt = 0, tree_cnt = 0; + + if (!test_opt(sbi, EXTENT_CACHE)) + return; + + if (available_free_memory(sbi, EXTENT_CACHE)) + return; + + spin_lock(&sbi->extent_lock); + list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { + if (!nr_shrink--) + break; + list_del_init(&en->list); + } + spin_unlock(&sbi->extent_lock); + + down_read(&sbi->extent_tree_lock); + while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + atomic_inc(&et->refcount); + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + atomic_dec(&et->refcount); + } + } + up_read(&sbi->extent_tree_lock); + + down_write(&sbi->extent_tree_lock); + radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter, + F2FS_ROOT_INO(sbi)) { + struct extent_tree *et = (struct extent_tree *)*slot; + + if (!atomic_read(&et->refcount) && !et->count) { + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + tree_cnt++; + } + } + up_write(&sbi->extent_tree_lock); + + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); +} + +void f2fs_destroy_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + unsigned int node_cnt = 0; + + if (!test_opt(sbi, EXTENT_CACHE)) + return; + + et = __find_extent_tree(sbi, inode->i_ino); + if (!et) + goto out; + + /* free all extent info belong to this extent tree */ + write_lock(&et->lock); + node_cnt = __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + atomic_dec(&et->refcount); + + /* try to find and delete extent tree entry in radix tree */ + down_write(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino); + if (!et) { + up_write(&sbi->extent_tree_lock); + goto out; + } + f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + up_write(&sbi->extent_tree_lock); +out: + trace_f2fs_destroy_extent_tree(inode, node_cnt); return; } +void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext) +{ + if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) + f2fs_init_extent_tree(inode, i_ext); + + write_lock(&F2FS_I(inode)->ext_lock); + get_extent_info(&F2FS_I(inode)->ext, *i_ext); + write_unlock(&F2FS_I(inode)->ext_lock); +} + +static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + return false; + + if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) + return f2fs_lookup_extent_tree(inode, pgofs, ei); + + return lookup_extent_info(inode, pgofs, ei); +} + +void f2fs_update_extent_cache(struct dnode_of_data *dn) +{ + struct f2fs_inode_info *fi = F2FS_I(dn->inode); + pgoff_t fofs; + + f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return; + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + + if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE)) + return f2fs_update_extent_tree(dn->inode, fofs, + dn->data_blkaddr); + + if (update_extent_info(dn->inode, fofs, dn->data_blkaddr)) + sync_inode_page(dn); +} + struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; + struct extent_info ei; int err; struct f2fs_io_info fio = { .type = DATA, .rw = sync ? READ_SYNC : READA, }; + /* + * If sync is false, it needs to check its block allocation. + * This is need and triggered by two flows: + * gc and truncate_partial_data_page. + */ + if (!sync) + goto search; + page = find_get_page(mapping, index); if (page && PageUptodate(page)) return page; f2fs_put_page(page, 0); +search: + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, LOOKUP_NODE); @@ -401,6 +948,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) if (unlikely(dn.data_blkaddr == NEW_ADDR)) return ERR_PTR(-EINVAL); +got_it: page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -435,6 +983,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; + struct extent_info ei; int err; struct f2fs_io_info fio = { .type = DATA, @@ -445,6 +994,11 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { @@ -458,6 +1012,7 @@ repeat: return ERR_PTR(-ENOENT); } +got_it: if (PageUptodate(page)) return page; @@ -569,19 +1124,26 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; + + dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + if (dn->data_blkaddr == NEW_ADDR) + goto alloc; + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; +alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg); + allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, + &sum, seg); /* direct IO doesn't use extent cache to maximize the performance */ - __set_data_blkaddr(dn); + set_data_blkaddr(dn); /* update i_size */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + @@ -615,7 +1177,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); while (dn.ofs_in_node < end_offset && len) { - if (dn.data_blkaddr == NULL_ADDR) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { if (__allocate_data_block(&dn)) goto sync_out; allocated = true; @@ -659,13 +1224,16 @@ static int __get_data_block(struct inode *inode, sector_t iblock, int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; pgoff_t pgofs, end_offset; int err = 0, ofs = 1; + struct extent_info ei; bool allocated = false; /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); - if (check_extent_cache(inode, pgofs, bh_result)) + if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + f2fs_map_bh(inode->i_sb, pgofs, &ei, bh_result); goto out; + } if (create) f2fs_lock_op(F2FS_I_SB(inode)); @@ -682,7 +1250,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, goto put_out; if (dn.data_blkaddr != NULL_ADDR) { - set_buffer_new(bh_result); + clear_buffer_new(bh_result); map_bh(bh_result, inode->i_sb, dn.data_blkaddr); } else if (create) { err = __allocate_data_block(&dn); @@ -727,6 +1295,7 @@ get_next: if (err) goto sync_out; allocated = true; + set_buffer_new(bh_result); blkaddr = dn.data_blkaddr; } /* Give more consecutive addresses for the readahead */ @@ -813,8 +1382,10 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) fio->blk_addr = dn.data_blkaddr; /* This page is already truncated */ - if (fio->blk_addr == NULL_ADDR) + if (fio->blk_addr == NULL_ADDR) { + ClearPageUptodate(page); goto out_writepage; + } set_page_writeback(page); @@ -827,10 +1398,15 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) need_inplace_update(inode))) { rewrite_data_page(page, fio); set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(page, &dn, fio); - update_extent_cache(&dn); + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + trace_f2fs_do_write_data_page(page, OPU); set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); } out_writepage: f2fs_put_dnode(&dn); @@ -909,6 +1485,8 @@ done: clear_cold_data(page); out: inode_dec_dirty_pages(inode); + if (err) + ClearPageUptodate(page); unlock_page(page); if (need_balance_fs) f2fs_balance_fs(sbi); @@ -950,6 +1528,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + diff = nr_pages_to_write(sbi, DATA, wbc); if (!S_ISDIR(inode->i_mode)) { @@ -1236,6 +1818,37 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block); } +void init_extent_cache_info(struct f2fs_sb_info *sbi) +{ + INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); + init_rwsem(&sbi->extent_tree_lock); + INIT_LIST_HEAD(&sbi->extent_list); + spin_lock_init(&sbi->extent_lock); + sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_node, 0); +} + +int __init create_extent_cache(void) +{ + extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", + sizeof(struct extent_tree)); + if (!extent_tree_slab) + return -ENOMEM; + extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", + sizeof(struct extent_node)); + if (!extent_node_slab) { + kmem_cache_destroy(extent_tree_slab); + return -ENOMEM; + } + return 0; +} + +void destroy_extent_cache(void) +{ + kmem_cache_destroy(extent_node_slab); + kmem_cache_destroy(extent_tree_slab); +} + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index e671373cc8ab..f5388f37217e 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -35,6 +35,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) /* validation check of the segment numbers */ si->hit_ext = sbi->read_hit_ext; si->total_ext = sbi->total_hit_ext; + si->ext_tree = sbi->total_ext_tree; + si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_dirs = sbi->n_dirty_dirs; @@ -185,6 +187,9 @@ get_cache: si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); for (i = 0; i <= UPDATE_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); + si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_node) * + sizeof(struct extent_node); si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; @@ -260,13 +265,20 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "CP calls: %d\n", si->cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); - seq_printf(s, " - data segments : %d\n", si->data_segs); - seq_printf(s, " - node segments : %d\n", si->node_segs); - seq_printf(s, "Try to move %d blocks\n", si->tot_blks); - seq_printf(s, " - data blocks : %d\n", si->data_blks); - seq_printf(s, " - node blocks : %d\n", si->node_blks); + seq_printf(s, " - data segments : %d (%d)\n", + si->data_segs, si->bg_data_segs); + seq_printf(s, " - node segments : %d (%d)\n", + si->node_segs, si->bg_node_segs); + seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, + si->bg_data_blks + si->bg_node_blks); + seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, + si->bg_data_blks); + seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, + si->bg_node_blks); seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", si->hit_ext, si->total_ext); + seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree); + seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - inmem: %4d, wb: %4d\n", si->inmem_pages, si->wb_pages); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index b74097a7f6d9..3a3302ab7871 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -59,9 +59,8 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, }; -void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) +void set_de_type(struct f2fs_dir_entry *de, umode_t mode) { - umode_t mode = inode->i_mode; de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } @@ -127,22 +126,19 @@ struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots, *max_slots = 0; while (bit_pos < d->max) { if (!test_bit_le(bit_pos, d->bitmap)) { - if (bit_pos == 0) - max_len = 1; - else if (!test_bit_le(bit_pos - 1, d->bitmap)) - max_len++; bit_pos++; + max_len++; continue; } + de = &d->dentry[bit_pos]; if (early_match_name(name->len, namehash, de) && !memcmp(d->filename[bit_pos], name->name, name->len)) goto found; - if (max_slots && *max_slots >= 0 && max_len > *max_slots) { + if (max_slots && max_len > *max_slots) *max_slots = max_len; - max_len = 0; - } + max_len = 0; /* remain bug on condition */ if (unlikely(!de->name_len)) @@ -219,14 +215,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; + *res_page = NULL; + if (f2fs_has_inline_dentry(dir)) return find_in_inline_dir(dir, child, res_page); if (npages == 0) return NULL; - *res_page = NULL; - name_hash = f2fs_dentry_hash(child); max_depth = F2FS_I(dir)->i_current_depth; @@ -285,7 +281,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, lock_page(page); f2fs_wait_on_page_writeback(page, type); de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode); + set_de_type(de, inode->i_mode); f2fs_dentry_kunmap(dir, page); set_page_dirty(page); dir->i_mtime = dir->i_ctime = CURRENT_TIME; @@ -331,14 +327,14 @@ void do_make_empty_dir(struct inode *inode, struct inode *parent, de->hash_code = 0; de->ino = cpu_to_le32(inode->i_ino); memcpy(d->filename[0], ".", 1); - set_de_type(de, inode); + set_de_type(de, inode->i_mode); de = &d->dentry[1]; de->hash_code = 0; de->name_len = cpu_to_le16(2); de->ino = cpu_to_le32(parent->i_ino); memcpy(d->filename[1], "..", 2); - set_de_type(de, inode); + set_de_type(de, parent->i_mode); test_and_set_bit_le(0, (void *)d->bitmap); test_and_set_bit_le(1, (void *)d->bitmap); @@ -435,7 +431,7 @@ error: void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { if (S_ISDIR(inode->i_mode)) { inc_nlink(dir); set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); @@ -450,7 +446,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) + if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); } @@ -474,30 +470,47 @@ next: goto next; } +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos) +{ + struct f2fs_dir_entry *de; + int slots = GET_DENTRY_SLOTS(name->len); + int i; + + de = &d->dentry[bit_pos]; + de->hash_code = name_hash; + de->name_len = cpu_to_le16(name->len); + memcpy(d->filename[bit_pos], name->name, name->len); + de->ino = cpu_to_le32(ino); + set_de_type(de, mode); + for (i = 0; i < slots; i++) + test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); +} + /* * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). */ int __f2fs_add_link(struct inode *dir, const struct qstr *name, - struct inode *inode) + struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; unsigned int level; unsigned int current_depth; unsigned long bidx, block; f2fs_hash_t dentry_hash; - struct f2fs_dir_entry *de; unsigned int nbucket, nblock; size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; + struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(namelen); - struct page *page; + struct page *page = NULL; int err = 0; - int i; if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, name, inode); + err = f2fs_add_inline_entry(dir, name, inode, ino, mode); if (!err || err != -EAGAIN) return err; else @@ -547,30 +560,31 @@ start: add_dentry: f2fs_wait_on_page_writeback(dentry_page, DATA); - down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; + if (inode) { + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, name, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } } - de = &dentry_blk->dentry[bit_pos]; - de->hash_code = dentry_hash; - de->name_len = cpu_to_le16(namelen); - memcpy(dentry_blk->filename[bit_pos], name->name, name->len); - de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + + make_dentry_ptr(&d, (void *)dentry_blk, 1); + f2fs_update_dentry(ino, mode, &d, name, dentry_hash, bit_pos); + set_page_dirty(dentry_page); - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); - f2fs_put_page(page, 1); + if (inode) { + /* we don't need to mark_inode_dirty now */ + F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + } update_parent_metadata(dir, inode, current_depth); fail: - up_write(&F2FS_I(inode)->i_sem); + if (inode) + up_write(&F2FS_I(inode)->i_sem); if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { update_inode_page(dir); @@ -669,6 +683,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK) { truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); + ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7fa3313ab0e2..8de34ab6d5b1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -50,6 +50,7 @@ #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 +#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -102,6 +103,7 @@ enum { CP_UMOUNT, CP_FASTBOOT, CP_SYNC, + CP_RECOVERY, CP_DISCARD, }; @@ -216,6 +218,15 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +/* + * should be same as XFS_IOC_GOINGDOWN. + * Flags for going down operation used by FS_IOC_GOINGDOWN + */ +#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */ +#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ +#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ +#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -273,14 +284,34 @@ enum { #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ +/* vector size for gang look-up from extent cache that consists of radix tree */ +#define EXT_TREE_VEC_SIZE 64 + /* for in-memory extent cache entry */ -#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */ +#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ + +/* number of extent info in extent cache we try to shrink */ +#define EXTENT_CACHE_SHRINK_NUMBER 128 struct extent_info { - rwlock_t ext_lock; /* rwlock for consistency */ - unsigned int fofs; /* start offset in a file */ - u32 blk_addr; /* start block address of the extent */ - unsigned int len; /* length of the extent */ + unsigned int fofs; /* start offset in a file */ + u32 blk; /* start block address of the extent */ + unsigned int len; /* length of the extent */ +}; + +struct extent_node { + struct rb_node rb_node; /* rb node located in rb-tree */ + struct list_head list; /* node in global extent list of sbi */ + struct extent_info ei; /* extent info */ +}; + +struct extent_tree { + nid_t ino; /* inode number */ + struct rb_root root; /* root of extent info rb-tree */ + struct extent_node *cached_en; /* recently accessed extent node */ + rwlock_t lock; /* protect extent info rb-tree */ + atomic_t refcount; /* reference count of rb-tree */ + unsigned int count; /* # of extent node in rb-tree*/ }; /* @@ -309,6 +340,7 @@ struct f2fs_inode_info { nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ + rwlock_t ext_lock; /* rwlock for single extent cache */ struct inode_entry *dirty_dir; /* the pointer of dirty dir */ struct radix_tree_root inmem_root; /* radix tree for inmem pages */ @@ -319,21 +351,51 @@ struct f2fs_inode_info { static inline void get_extent_info(struct extent_info *ext, struct f2fs_extent i_ext) { - write_lock(&ext->ext_lock); ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk_addr = le32_to_cpu(i_ext.blk_addr); + ext->blk = le32_to_cpu(i_ext.blk); ext->len = le32_to_cpu(i_ext.len); - write_unlock(&ext->ext_lock); } static inline void set_raw_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { - read_lock(&ext->ext_lock); i_ext->fofs = cpu_to_le32(ext->fofs); - i_ext->blk_addr = cpu_to_le32(ext->blk_addr); + i_ext->blk = cpu_to_le32(ext->blk); i_ext->len = cpu_to_le32(ext->len); - read_unlock(&ext->ext_lock); +} + +static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, + u32 blk, unsigned int len) +{ + ei->fofs = fofs; + ei->blk = blk; + ei->len = len; +} + +static inline bool __is_extent_same(struct extent_info *ei1, + struct extent_info *ei2) +{ + return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && + ei1->len == ei2->len); +} + +static inline bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front) +{ + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); +} + +static inline bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back) +{ + return __is_extent_mergeable(back, cur); +} + +static inline bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front) +{ + return __is_extent_mergeable(cur, front); } struct f2fs_nm_info { @@ -502,6 +564,10 @@ enum page_type { META, NR_PAGE_TYPE, META_FLUSH, + INMEM, /* the below types are used by tracepoints only. */ + INMEM_DROP, + IPU, + OPU, }; struct f2fs_io_info { @@ -571,6 +637,14 @@ struct f2fs_sb_info { struct list_head dir_inode_list; /* dir inode list */ spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for extent tree cache */ + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + int total_ext_tree; /* extent tree count */ + atomic_t total_ext_node; /* extent info count */ + /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ @@ -920,12 +994,17 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) return 0; } +static inline block_t __cp_payload(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); +} + static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); int offset; - if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) { + if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; else @@ -1166,8 +1245,10 @@ enum { FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ FI_VOLATILE_FILE, /* indicate volatile file */ + FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ + FI_INLINE_DOTS, /* indicate inline dot dentries */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -1204,6 +1285,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi, set_inode_flag(fi, FI_INLINE_DENTRY); if (ri->i_inline & F2FS_DATA_EXIST) set_inode_flag(fi, FI_DATA_EXIST); + if (ri->i_inline & F2FS_INLINE_DOTS) + set_inode_flag(fi, FI_INLINE_DOTS); } static inline void set_raw_inline(struct f2fs_inode_info *fi, @@ -1219,6 +1302,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, ri->i_inline |= F2FS_INLINE_DENTRY; if (is_inode_flag_set(fi, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; + if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + ri->i_inline |= F2FS_INLINE_DOTS; } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -1264,6 +1349,11 @@ static inline int f2fs_exist_data(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); } +static inline int f2fs_has_inline_dots(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); +} + static inline bool f2fs_is_atomic_file(struct inode *inode) { return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); @@ -1274,6 +1364,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); } +static inline bool f2fs_is_first_block_written(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); +} + static inline bool f2fs_is_drop_cache(struct inode *inode) { return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); @@ -1290,12 +1385,6 @@ static inline int f2fs_has_inline_dentry(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); } -static inline void *inline_dentry_addr(struct page *page) -{ - struct f2fs_inode *ri = F2FS_INODE(page); - return (void *)&(ri->i_addr[1]); -} - static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) { if (!f2fs_has_inline_dentry(dir)) @@ -1363,7 +1452,7 @@ struct dentry *f2fs_get_parent(struct dentry *child); * dir.c */ extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; -void set_de_type(struct f2fs_dir_entry *, struct inode *); +void set_de_type(struct f2fs_dir_entry *, umode_t); struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, @@ -1382,7 +1471,10 @@ ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, const struct qstr *); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, + const struct qstr *, f2fs_hash_t , unsigned int); +int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, + umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); int f2fs_do_tmpfile(struct inode *, struct inode *); @@ -1391,8 +1483,8 @@ bool f2fs_empty_dir(struct inode *); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { - return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name, - inode); + return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name, + inode, inode->i_ino, inode->i_mode); } /* @@ -1519,14 +1611,22 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, struct f2fs_io_info *); +void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -void update_extent_cache(struct dnode_of_data *); +void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); +void f2fs_destroy_extent_tree(struct inode *); +void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *); +void f2fs_update_extent_cache(struct dnode_of_data *); +void f2fs_preserve_extent_tree(struct inode *); struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct page *, struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void init_extent_cache_info(struct f2fs_sb_info *); +int __init create_extent_cache(void); +void destroy_extent_cache(void); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1554,7 +1654,7 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_ext, total_ext; + int hit_ext, total_ext, ext_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; @@ -1566,7 +1666,9 @@ struct f2fs_stat_info { int dirty_count, node_pages, meta_pages; int prefree_count, call_count, cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; + int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; + int bg_data_blks, bg_node_blks; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; @@ -1615,31 +1717,36 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) -#define stat_inc_seg_count(sbi, type) \ +#define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) \ + if (type == SUM_TYPE_DATA) { \ si->data_segs++; \ - else \ + si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ + } else { \ si->node_segs++; \ + si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \ + } \ } while (0) #define stat_inc_tot_blk_count(si, blks) \ (si->tot_blks += (blks)) -#define stat_inc_data_blk_count(sbi, blks) \ +#define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ + si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ } while (0) -#define stat_inc_node_blk_count(sbi, blks) \ +#define stat_inc_node_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ + si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ } while (0) int f2fs_build_stats(struct f2fs_sb_info *); @@ -1661,10 +1768,10 @@ void f2fs_destroy_root_stats(void); #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(si, type) +#define stat_inc_seg_count(sbi, type, gc_type) #define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(si, blks) -#define stat_inc_node_blk_count(sbi, blks) +#define stat_inc_data_blk_count(sbi, blks, gc_type) +#define stat_inc_node_blk_count(sbi, blks, gc_type) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -1688,6 +1795,7 @@ extern struct kmem_cache *inode_entry_slab; */ bool f2fs_may_inline(struct inode *); void read_inline_data(struct page *, struct page *); +bool truncate_inline_inode(struct page *, u64); int f2fs_read_inline_data(struct inode *, struct page *); int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); int f2fs_convert_inline_inode(struct inode *); @@ -1697,7 +1805,8 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, + nid_t, umode_t); void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index df6a0596eccf..2b52e48d7482 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -241,6 +241,8 @@ go_write: * will be used only for fsynced inodes after checkpoint. */ try_to_fix_pino(inode); + clear_inode_flag(fi, FI_APPEND_WRITE); + clear_inode_flag(fi, FI_UPDATE_WRITE); goto out; } sync_nodes: @@ -433,8 +435,12 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) continue; dn->data_blkaddr = NULL_ADDR; - update_extent_cache(dn); + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); invalidate_blocks(sbi, blkaddr); + if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) + clear_inode_flag(F2FS_I(dn->inode), + FI_FIRST_BLOCK_WRITTEN); nr_free++; } if (nr_free) { @@ -454,15 +460,16 @@ void truncate_data_blocks(struct dnode_of_data *dn) truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); } -static int truncate_partial_data_page(struct inode *inode, u64 from) +static int truncate_partial_data_page(struct inode *inode, u64 from, + bool force) { unsigned offset = from & (PAGE_CACHE_SIZE - 1); struct page *page; - if (!offset) + if (!offset && !force) return 0; - page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false); + page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, force); if (IS_ERR(page)) return 0; @@ -473,7 +480,8 @@ static int truncate_partial_data_page(struct inode *inode, u64 from) f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); - set_page_dirty(page); + if (!force) + set_page_dirty(page); out: f2fs_put_page(page, 1); return 0; @@ -487,6 +495,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) pgoff_t free_from; int count = 0, err = 0; struct page *ipage; + bool truncate_page = false; trace_f2fs_truncate_blocks_enter(inode, from); @@ -502,7 +511,10 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { + if (truncate_inline_inode(ipage, from)) + set_page_dirty(ipage); f2fs_put_page(ipage, 1); + truncate_page = true; goto out; } @@ -533,7 +545,7 @@ out: /* lastly zero out the first data page */ if (!err) - err = truncate_partial_data_page(inode, from); + err = truncate_partial_data_page(inode, from, truncate_page); trace_f2fs_truncate_blocks_exit(inode, err); return err; @@ -562,7 +574,7 @@ void f2fs_truncate(struct inode *inode) int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); stat->blocks <<= 3; return 0; @@ -601,7 +613,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); int err; @@ -997,6 +1009,9 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) if (!f2fs_is_volatile_file(inode)) return 0; + if (!f2fs_is_first_block_written(inode)) + return truncate_partial_data_page(inode, 0, true); + punch_hole(inode, 0, F2FS_BLKSIZE); return 0; } @@ -1029,6 +1044,41 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) return ret; } +static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + __u32 in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__u32 __user *)arg)) + return -EFAULT; + + switch (in) { + case F2FS_GOING_DOWN_FULLSYNC: + sb = freeze_bdev(sb->s_bdev); + if (sb && !IS_ERR(sb)) { + f2fs_stop_checkpoint(sbi); + thaw_bdev(sb->s_bdev, sb); + } + break; + case F2FS_GOING_DOWN_METASYNC: + /* do checkpoint only */ + f2fs_sync_fs(sb, 1); + f2fs_stop_checkpoint(sbi); + break; + case F2FS_GOING_DOWN_NOSYNC: + f2fs_stop_checkpoint(sbi); + break; + default: + return -EINVAL; + } + return 0; +} + static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1078,6 +1128,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_release_volatile_write(filp); case F2FS_IOC_ABORT_VOLATILE_WRITE: return f2fs_ioc_abort_volatile_write(filp); + case F2FS_IOC_SHUTDOWN: + return f2fs_ioc_shutdown(filp, arg); case FITRIM: return f2fs_ioc_fitrim(filp, arg); default: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 76adbc3641f1..ed58211fe79b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -435,7 +435,7 @@ next_step: set_page_dirty(node_page); } f2fs_put_page(node_page, 1); - stat_inc_node_blk_count(sbi, 1); + stat_inc_node_blk_count(sbi, 1, gc_type); } if (initial) { @@ -622,7 +622,7 @@ next_step: if (IS_ERR(data_page)) continue; move_data_page(inode, data_page, gc_type); - stat_inc_data_blk_count(sbi, 1); + stat_inc_data_blk_count(sbi, 1, gc_type); } } @@ -680,7 +680,7 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, } blk_finish_plug(&plug); - stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); + stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); stat_inc_call_count(sbi->stat_info); f2fs_put_page(sum_page, 1); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 1484c00133cd..8140e4f0e538 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -21,7 +21,7 @@ bool f2fs_may_inline(struct inode *inode) if (f2fs_is_atomic_file(inode)) return false; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; if (i_size_read(inode) > MAX_INLINE_DATA) @@ -50,10 +50,19 @@ void read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -static void truncate_inline_data(struct page *ipage) +bool truncate_inline_inode(struct page *ipage, u64 from) { + void *addr; + + if (from >= MAX_INLINE_DATA) + return false; + + addr = inline_data_addr(ipage); + f2fs_wait_on_page_writeback(ipage, NODE); - memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA); + memset(addr + from, 0, MAX_INLINE_DATA - from); + + return true; } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -122,7 +131,8 @@ no_update: set_page_writeback(page); fio.blk_addr = dn->data_blkaddr; write_data_page(page, dn, &fio); - update_extent_cache(dn); + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); f2fs_wait_on_page_writeback(page, DATA); if (dirty) inode_dec_dirty_pages(dn->inode); @@ -131,7 +141,7 @@ no_update: set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_data(dn->inode_page); + truncate_inline_inode(dn->inode_page, 0); clear_out: stat_dec_inline_inode(dn->inode); f2fs_clear_inline_inode(dn->inode); @@ -245,7 +255,7 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - truncate_inline_data(ipage); + truncate_inline_inode(ipage, 0); f2fs_clear_inline_inode(inode); update_inode(inode, ipage); f2fs_put_page(ipage, 1); @@ -363,7 +373,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_data(ipage); + truncate_inline_inode(ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); @@ -380,21 +390,18 @@ out: } int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, - struct inode *inode) + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_dir_entry *de; size_t namelen = name->len; struct f2fs_inline_dentry *dentry_blk = NULL; + struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(namelen); - struct page *page; + struct page *page = NULL; int err = 0; - int i; - - name_hash = f2fs_dentry_hash(name); ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) @@ -410,32 +417,34 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, goto out; } - down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, ipage); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; + if (inode) { + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, name, ipage); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } } f2fs_wait_on_page_writeback(ipage, NODE); - de = &dentry_blk->dentry[bit_pos]; - de->hash_code = name_hash; - de->name_len = cpu_to_le16(namelen); - memcpy(dentry_blk->filename[bit_pos], name->name, name->len); - de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + + name_hash = f2fs_dentry_hash(name); + make_dentry_ptr(&d, (void *)dentry_blk, 2); + f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + set_page_dirty(ipage); /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); - f2fs_put_page(page, 1); + if (inode) { + F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + } update_parent_metadata(dir, inode, 0); fail: - up_write(&F2FS_I(inode)->i_sem); + if (inode) + up_write(&F2FS_I(inode)->i_sem); if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { update_inode(dir, ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2d002e3738a7..e622ec95409e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -51,6 +51,15 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) } } +static bool __written_first_block(struct f2fs_inode *ri) +{ + block_t addr = le32_to_cpu(ri->i_addr[0]); + + if (addr != NEW_ADDR && addr != NULL_ADDR) + return true; + return false; +} + static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { @@ -130,7 +139,8 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - get_extent_info(&fi->ext, ri->i_ext); + f2fs_init_extent_cache(inode, &ri->i_ext); + get_inline_info(fi, ri); /* check data exist */ @@ -140,6 +150,9 @@ static int do_read_inode(struct inode *inode) /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); + if (__written_first_block(ri)) + set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + f2fs_put_page(node_page, 1); stat_inc_inline_inode(inode); @@ -220,7 +233,11 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); + + read_lock(&F2FS_I(inode)->ext_lock); set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); + read_unlock(&F2FS_I(inode)->ext_lock); + set_raw_inline(F2FS_I(inode), ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); @@ -328,6 +345,12 @@ void f2fs_evict_inode(struct inode *inode) no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + + /* update extent info in inode */ + if (inode->i_nlink) + f2fs_preserve_extent_tree(inode); + f2fs_destroy_extent_tree(inode); + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e79639a9787a..658e8079aaf9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -14,6 +14,7 @@ #include <linux/sched.h> #include <linux/ctype.h> #include <linux/dcache.h> +#include <linux/namei.h> #include "f2fs.h" #include "node.h" @@ -150,7 +151,7 @@ out: static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; @@ -181,10 +182,48 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = f2fs_inode_by_name(child->d_inode, &dotdot); + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); +} + +static int __recover_dot_dentries(struct inode *dir, nid_t pino) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); + struct f2fs_dir_entry *de; + struct page *page; + int err = 0; + + f2fs_lock_op(sbi); + + de = f2fs_find_entry(dir, &dot, &page); + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + } else { + err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); + if (err) + goto out; + } + + de = f2fs_find_entry(dir, &dotdot, &page); + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + } else { + err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); + } +out: + if (!err) { + clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); + mark_inode_dirty(dir); + } + + f2fs_unlock_op(sbi); + return err; } static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, @@ -206,6 +245,16 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, inode = f2fs_iget(dir->i_sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + + if (f2fs_has_inline_dots(inode)) { + int err; + + err = __recover_dot_dentries(inode, dir->i_ino); + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + } } return d_splice_alias(inode, dentry); @@ -214,7 +263,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, static int f2fs_unlink(struct inode *dir, struct dentry *dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct f2fs_dir_entry *de; struct page *page; int err = -ENOENT; @@ -247,6 +296,21 @@ fail: return err; } +static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct page *page = page_follow_link_light(dentry, nd); + + if (IS_ERR_OR_NULL(page)) + return page; + + /* this is broken symlink case */ + if (*nd_get_link(nd) == 0) { + page_put_link(dentry, nd, page); + return ERR_PTR(-ENOENT); + } + return page; +} + static int f2fs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { @@ -276,6 +340,17 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); unlock_new_inode(inode); + /* + * Let's flush symlink data in order to avoid broken symlink as much as + * possible. Nevertheless, fsyncing is the best way, but there is no + * way to get a file descriptor in order to flush that. + * + * Note that, it needs to do dir->fsync to make this recoverable. + * If the symlink path is stored into inline_data, there is no + * performance regression. + */ + filemap_write_and_wait_range(inode->i_mapping, 0, symlen - 1); + if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); return err; @@ -326,7 +401,7 @@ out_fail: static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (f2fs_empty_dir(inode)) return f2fs_unlink(dir, dentry); return -ENOTEMPTY; @@ -374,8 +449,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *old_dir_page; struct page *old_page, *new_page; struct f2fs_dir_entry *old_dir_entry = NULL; @@ -501,8 +576,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *old_dir_page, *new_dir_page; struct page *old_page, *new_page; struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; @@ -693,6 +768,8 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); + + stat_inc_inline_inode(inode); d_tmpfile(dentry, inode); unlock_new_inode(inode); return 0; @@ -729,7 +806,7 @@ const struct inode_operations f2fs_dir_inode_operations = { const struct inode_operations f2fs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .follow_link = f2fs_follow_link, .put_link = page_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 97bd9d3db882..8ab0cf1930bd 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -41,7 +41,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) /* only uses low memory */ avail_ram = val.totalram - val.totalhigh; - /* give 25%, 25%, 50%, 50% memory for each components respectively */ + /* + * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + */ if (type == FREE_NIDS) { mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> PAGE_CACHE_SHIFT; @@ -62,6 +64,11 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size += (sbi->im[i].ino_num * sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == EXTENT_CACHE) { + mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + atomic_read(&sbi->total_ext_node) * + sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { if (sbi->sb->s_bdi->dirty_exceeded) return false; @@ -494,7 +501,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) /* if inline_data is set, should not report any block indices */ if (f2fs_has_inline_data(dn->inode) && index) { - err = -EINVAL; + err = -ENOENT; f2fs_put_page(npage[0], 1); goto release_out; } @@ -995,6 +1002,7 @@ static int read_node_page(struct page *page, int rw) get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); f2fs_put_page(page, 1); return -ENOENT; } @@ -1306,6 +1314,7 @@ static int f2fs_write_node_page(struct page *page, /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); unlock_page(page); return 0; @@ -1821,6 +1830,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; + struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: @@ -1874,7 +1884,9 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, set->entry_cnt); + down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1902,6 +1914,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); + down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; @@ -1910,6 +1923,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) __adjust_nat_entry_set(setvec[idx], &sets, MAX_NAT_JENTRIES(sum)); } + up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index f405bbf2435a..c56026f1725c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -120,6 +120,7 @@ enum mem_type { NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ + EXTENT_CACHE, /* indicates extent cache */ BASE_CHECK, /* check kernel status */ }; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 41afb9534bbd..8d8ea99f2156 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -93,10 +93,9 @@ static int recover_dentry(struct inode *inode, struct page *ipage) } retry: de = f2fs_find_entry(dir, &name, &page); - if (de && inode->i_ino == le32_to_cpu(de->ino)) { - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_unmap_put; - } + if (de) { einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { @@ -115,7 +114,7 @@ retry: iput(einode); goto retry; } - err = __f2fs_add_link(dir, &name, inode); + err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); if (err) goto out_err; @@ -187,11 +186,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (IS_INODE(page) && is_dent_dnode(page)) - set_inode_flag(F2FS_I(entry->inode), - FI_INC_LINK); - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) @@ -212,8 +207,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); kmem_cache_free(fsync_entry_slab, entry); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; goto next; + } break; } list_add_tail(&entry->list, head); @@ -256,6 +253,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum_node; struct f2fs_summary sum; struct page *sum_page, *node_page; + struct dnode_of_data tdn = *dn; nid_t ino, nid; struct inode *inode; unsigned int offset; @@ -283,17 +281,15 @@ got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); if (dn->inode->i_ino == nid) { - struct dnode_of_data tdn = *dn; tdn.nid = nid; + if (!dn->inode_page_locked) + lock_page(dn->inode_page); tdn.node_page = dn->inode_page; tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); - truncate_data_blocks_range(&tdn, 1); - return 0; + goto truncate_out; } else if (dn->nid == nid) { - struct dnode_of_data tdn = *dn; tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); - truncate_data_blocks_range(&tdn, 1); - return 0; + goto truncate_out; } /* Get the node page */ @@ -317,18 +313,33 @@ got_it: bidx = start_bidx_of_node(offset, F2FS_I(inode)) + le16_to_cpu(sum.ofs_in_node); - if (ino != dn->inode->i_ino) { - truncate_hole(inode, bidx, bidx + 1); + /* + * if inode page is locked, unlock temporarily, but its reference + * count keeps alive. + */ + if (ino == dn->inode->i_ino && dn->inode_page_locked) + unlock_page(dn->inode_page); + + set_new_dnode(&tdn, inode, NULL, NULL, 0); + if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + goto out; + + if (tdn.data_blkaddr == blkaddr) + truncate_data_blocks_range(&tdn, 1); + + f2fs_put_dnode(&tdn); +out: + if (ino != dn->inode->i_ino) iput(inode); - } else { - struct dnode_of_data tdn; - set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0); - if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) - return 0; - if (tdn.data_blkaddr != NULL_ADDR) - truncate_data_blocks_range(&tdn, 1); - f2fs_put_page(tdn.node_page, 1); - } + else if (dn->inode_page_locked) + lock_page(dn->inode_page); + return 0; + +truncate_out: + if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + truncate_data_blocks_range(&tdn, 1); + if (dn->inode->i_ino == nid && !dn->inode_page_locked) + unlock_page(dn->inode_page); return 0; } @@ -384,7 +395,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, src = datablock_addr(dn.node_page, dn.ofs_in_node); dest = datablock_addr(page, dn.ofs_in_node); - if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { + if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR && + dest >= MAIN_BLKADDR(sbi) && dest < MAX_BLKADDR(sbi)) { + if (src == NULL_ADDR) { err = reserve_new_block(&dn); /* We should not get -ENOSPC */ @@ -401,14 +414,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* write dummy data page */ recover_data_page(sbi, NULL, &sum, src, dest); dn.data_blkaddr = dest; - update_extent_cache(&dn); + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); recovered++; } dn.ofs_in_node++; } - /* write node page in place */ - set_summary(&sum, dn.nid, 0, 0); if (IS_INODE(dn.node_page)) sync_inode_page(&dn); @@ -552,7 +564,7 @@ out: mutex_unlock(&sbi->cp_mutex); } else if (need_writecp) { struct cp_control cpc = { - .reason = CP_SYNC, + .reason = CP_RECOVERY, }; mutex_unlock(&sbi->cp_mutex); write_checkpoint(sbi, &cpc); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index daee4ab913da..f939660941bb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -205,6 +205,8 @@ retry: list_add_tail(&new->list, &fi->inmem_pages); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); mutex_unlock(&fi->inmem_lock); + + trace_f2fs_register_inmem_page(page, INMEM); } void commit_inmem_pages(struct inode *inode, bool abort) @@ -238,11 +240,13 @@ void commit_inmem_pages(struct inode *inode, bool abort) f2fs_wait_on_page_writeback(cur->page, DATA); if (clear_page_dirty_for_io(cur->page)) inode_dec_dirty_pages(inode); + trace_f2fs_commit_inmem_page(cur->page, INMEM); do_write_data_page(cur->page, &fio); submit_bio = true; } f2fs_put_page(cur->page, 1); } else { + trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); put_page(cur->page); } radix_tree_delete(&fi->inmem_root, cur->page->index); @@ -277,6 +281,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { + /* try to shrink extent cache when there is no enough memory */ + f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + /* check the # of cached NAT entries and prefree segments */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || excess_prefree_segs(sbi) || @@ -549,7 +556,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); - if (end - start < cpc->trim_minlen) + if (force && end - start < cpc->trim_minlen) continue; __add_discard_entry(sbi, cpc, start, end); @@ -1164,6 +1171,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); + mutex_lock(&sit_i->sentry_lock); /* direct_io'ed data is aligned to the segment for better performance */ if (direct_io && curseg->next_blkoff) @@ -1178,7 +1186,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, */ __add_sum_entry(sbi, type, sum); - mutex_lock(&sit_i->sentry_lock); __refresh_next_blkoff(sbi, curseg); stat_inc_block_count(sbi, curseg); @@ -1730,6 +1737,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); + if (!sit_i->dirty_sentries) + goto out; + /* * add and account sit entries of dirty bitmap in sit entry * set temporarily @@ -1744,9 +1754,6 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) remove_sits_in_journal(sbi); - if (!sit_i->dirty_sentries) - goto out; - /* * there are two steps to flush sit entries: * #1, flush sit entries to journal in current cold data summary block. diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7fd35111cf62..85d7fa7514b2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -336,7 +336,8 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) clear_bit(segno, free_i->free_segmap); free_i->free_segments++; - next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno); + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f2fe666a6ea9..b2dd1b01f076 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -57,6 +57,8 @@ enum { Opt_flush_merge, Opt_nobarrier, Opt_fastboot, + Opt_extent_cache, + Opt_noinline_data, Opt_err, }; @@ -78,6 +80,8 @@ static match_table_t f2fs_tokens = { {Opt_flush_merge, "flush_merge"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, + {Opt_extent_cache, "extent_cache"}, + {Opt_noinline_data, "noinline_data"}, {Opt_err, NULL}, }; @@ -367,6 +371,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_fastboot: set_opt(sbi, FASTBOOT); break; + case Opt_extent_cache: + set_opt(sbi, EXTENT_CACHE); + break; + case Opt_noinline_data: + clear_opt(sbi, INLINE_DATA); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -392,7 +402,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; - rwlock_init(&fi->ext.ext_lock); + rwlock_init(&fi->ext_lock); init_rwsem(&fi->i_sem); INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); INIT_LIST_HEAD(&fi->inmem_pages); @@ -591,6 +601,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",disable_ext_identify"); if (test_opt(sbi, INLINE_DATA)) seq_puts(seq, ",inline_data"); + else + seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) @@ -599,6 +611,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nobarrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); + if (test_opt(sbi, EXTENT_CACHE)) + seq_puts(seq, ",extent_cache"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -959,7 +973,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct buffer_head *raw_super_buf; struct inode *root; long err = -EINVAL; - bool retry = true; + bool retry = true, need_fsck = false; char *options = NULL; int i; @@ -984,6 +998,7 @@ try_onemore: sbi->active_logs = NR_CURSEG_TYPE; set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_DATA); #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -1072,6 +1087,8 @@ try_onemore: INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); + init_extent_cache_info(sbi); + init_ino_entry_info(sbi); /* setup f2fs internal modules */ @@ -1146,9 +1163,6 @@ try_onemore: if (err) goto free_proc; - if (!retry) - set_sbi_flag(sbi, SBI_NEED_FSCK); - /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { /* @@ -1160,8 +1174,13 @@ try_onemore: err = -EROFS; goto free_kobj; } + + if (need_fsck) + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = recover_fsync_data(sbi); if (err) { + need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%ld", err); goto free_kobj; @@ -1212,7 +1231,7 @@ free_sbi: /* give only one another chance */ if (retry) { - retry = 0; + retry = false; shrink_dcache_sb(sb); goto try_onemore; } @@ -1278,10 +1297,13 @@ static int __init init_f2fs_fs(void) err = create_checkpoint_caches(); if (err) goto free_segment_manager_caches; + err = create_extent_cache(); + if (err) + goto free_checkpoint_caches; f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); if (!f2fs_kset) { err = -ENOMEM; - goto free_checkpoint_caches; + goto free_extent_cache; } err = register_filesystem(&f2fs_fs_type); if (err) @@ -1292,6 +1314,8 @@ static int __init init_f2fs_fs(void) free_kset: kset_unregister(f2fs_kset); +free_extent_cache: + destroy_extent_cache(); free_checkpoint_caches: destroy_checkpoint_caches(); free_segment_manager_caches: @@ -1309,6 +1333,7 @@ static void __exit exit_f2fs_fs(void) remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); + destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 5072bf9ae0ef..9757f65a05bc 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -83,7 +83,7 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_getxattr(dentry->d_inode, type, name, buffer, size, NULL); + return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL); } static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, @@ -108,7 +108,7 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_setxattr(dentry->d_inode, type, name, + return f2fs_setxattr(d_inode(dentry), type, name, value, size, NULL, flags); } @@ -130,19 +130,20 @@ static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (strcmp(name, "") != 0) return -EINVAL; - *((char *)buffer) = F2FS_I(inode)->i_advise; + if (buffer) + *((char *)buffer) = F2FS_I(inode)->i_advise; return sizeof(char); } static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (strcmp(name, "") != 0) return -EINVAL; @@ -152,6 +153,7 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; + mark_inode_dirty(inode); return 0; } @@ -442,7 +444,7 @@ cleanup: ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct f2fs_xattr_entry *entry; void *base_addr; int error = 0; diff --git a/fs/fat/file.c b/fs/fat/file.c index cf50d93565a2..442d50a0e33e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -305,7 +305,7 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; @@ -377,7 +377,7 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) int fat_setattr(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index cc6a8541b668..b7e2b33aa793 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -308,7 +308,7 @@ out: static int msdos_rmdir(struct inode *dir, struct dentry *dentry) { struct super_block *sb = dir->i_sb; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct fat_slot_info sinfo; int err; @@ -402,7 +402,7 @@ out: /***** Unlink a file */ static int msdos_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct fat_slot_info sinfo; int err; @@ -440,8 +440,8 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, int err, old_attrs, is_dir, update_dotdot, corrupt = 0; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; - old_inode = old_dentry->d_inode; - new_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); err = fat_scan(old_dir, old_name, &old_sinfo); if (err) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 7e0974eebd8e..7092584f424a 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -33,7 +33,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) { int ret = 1; spin_lock(&dentry->d_lock); - if (dentry->d_time != dentry->d_parent->d_inode->i_version) + if (dentry->d_time != d_inode(dentry->d_parent)->i_version) ret = 0; spin_unlock(&dentry->d_lock); return ret; @@ -45,7 +45,7 @@ static int vfat_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; /* This is not negative dentry. Always valid. */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return 1; return vfat_revalidate_shortname(dentry); } @@ -65,7 +65,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) * positive dentry isn't good idea. So it's unsupported like * rename("filename", "FILENAME") for now. */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return 1; /* @@ -801,7 +801,7 @@ out: static int vfat_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; @@ -832,7 +832,7 @@ out: static int vfat_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; @@ -915,8 +915,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct super_block *sb = old_dir->i_sb; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; - old_inode = old_dentry->d_inode; - new_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); mutex_lock(&MSDOS_SB(sb)->s_lock); err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); if (err) diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index 93e14933dcb6..eb192656fba2 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -266,7 +266,7 @@ struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart) * Find the parent for a directory that is not currently connected to * the filesystem root. * - * On entry, the caller holds child_dir->d_inode->i_mutex. + * On entry, the caller holds d_inode(child_dir)->i_mutex. */ static struct dentry *fat_get_parent(struct dentry *child_dir) { @@ -276,7 +276,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir) struct inode *parent_inode = NULL; struct msdos_sb_info *sbi = MSDOS_SB(sb); - if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) { + if (!fat_get_dotdot_entry(d_inode(child_dir), &bh, &de)) { int parent_logstart = fat_get_start(sbi, de); parent_inode = fat_dget(sb, parent_logstart); if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO) diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index c36aeaf92e41..8b9229e2ca5c 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -76,7 +76,7 @@ const struct address_space_operations vxfs_immed_aops = { static void * vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np) { - struct vxfs_inode_info *vip = VXFS_INO(dp->d_inode); + struct vxfs_inode_info *vip = VXFS_INO(d_inode(dp)); nd_set_link(np, vip->vii_immed.vi_immed); return NULL; } diff --git a/fs/fs_pin.c b/fs/fs_pin.c index b06c98796afb..611b5408f6ec 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock); void pin_remove(struct fs_pin *pin) { spin_lock(&pin_lock); - hlist_del(&pin->m_list); - hlist_del(&pin->s_list); + hlist_del_init(&pin->m_list); + hlist_del_init(&pin->s_list); spin_unlock(&pin_lock); spin_lock_irq(&pin->wait.lock); pin->done = 1; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 205e0d5d5307..f863ac6647ac 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -244,7 +244,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) return 0; parent = fuse_control_sb->s_root; - inc_nlink(parent->d_inode); + inc_nlink(d_inode(parent)); sprintf(name, "%u", fc->dev); parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, &simple_dir_inode_operations, @@ -283,11 +283,11 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) for (i = fc->ctl_ndents - 1; i >= 0; i--) { struct dentry *dentry = fc->ctl_dentry[i]; - dentry->d_inode->i_private = NULL; + d_inode(dentry)->i_private = NULL; d_drop(dentry); dput(dentry); } - drop_nlink(fuse_control_sb->s_root->d_inode); + drop_nlink(d_inode(fuse_control_sb->s_root)); } static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1545b711ddcf..0572bca49f15 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -192,7 +192,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) struct fuse_inode *fi; int ret; - inode = ACCESS_ONCE(entry->d_inode); + inode = d_inode_rcu(entry); if (inode && is_bad_inode(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || @@ -220,7 +220,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version = fuse_get_attr_version(fc); parent = dget_parent(entry); - fuse_lookup_init(fc, &args, get_node_id(parent->d_inode), + fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); ret = fuse_simple_request(fc, &args); dput(parent); @@ -254,7 +254,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { parent = dget_parent(entry); - fuse_advise_use_readdirplus(parent->d_inode); + fuse_advise_use_readdirplus(d_inode(parent)); dput(parent); } } @@ -487,7 +487,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, entry = res; } - if (!(flags & O_CREAT) || entry->d_inode) + if (!(flags & O_CREAT) || d_really_is_positive(entry)) goto no_open; /* Only creates */ @@ -653,7 +653,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in.args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fc->lock); @@ -689,7 +689,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in.args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { - clear_nlink(entry->d_inode); + clear_nlink(d_inode(entry)); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) @@ -721,12 +721,12 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ - fuse_invalidate_attr(oldent->d_inode); - fuse_update_ctime(oldent->d_inode); + fuse_invalidate_attr(d_inode(oldent)); + fuse_update_ctime(d_inode(oldent)); if (flags & RENAME_EXCHANGE) { - fuse_invalidate_attr(newent->d_inode); - fuse_update_ctime(newent->d_inode); + fuse_invalidate_attr(d_inode(newent)); + fuse_update_ctime(d_inode(newent)); } fuse_invalidate_attr(olddir); @@ -734,10 +734,10 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { - fuse_invalidate_attr(newent->d_inode); + if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { + fuse_invalidate_attr(d_inode(newent)); fuse_invalidate_entry_cache(newent); - fuse_update_ctime(newent->d_inode); + fuse_update_ctime(d_inode(newent)); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -746,7 +746,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, directory), then there can be inconsistency between the dcache and the real filesystem. Tough luck. */ fuse_invalidate_entry(oldent); - if (newent->d_inode) + if (d_really_is_positive(newent)) fuse_invalidate_entry(newent); } @@ -788,7 +788,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, { int err; struct fuse_link_in inarg; - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); @@ -961,9 +961,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_attr(parent); fuse_invalidate_entry(entry); - if (child_nodeid != 0 && entry->d_inode) { - mutex_lock(&entry->d_inode->i_mutex); - if (get_node_id(entry->d_inode) != child_nodeid) { + if (child_nodeid != 0 && d_really_is_positive(entry)) { + mutex_lock(&d_inode(entry)->i_mutex); + if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; } @@ -977,13 +977,13 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, err = -ENOTEMPTY; goto badentry; } - entry->d_inode->i_flags |= S_DEAD; + d_inode(entry)->i_flags |= S_DEAD; } dont_mount(entry); - clear_nlink(entry->d_inode); + clear_nlink(d_inode(entry)); err = 0; badentry: - mutex_unlock(&entry->d_inode->i_mutex); + mutex_unlock(&d_inode(entry)->i_mutex); if (!err) d_delete(entry); } else { @@ -1169,7 +1169,7 @@ static int fuse_direntplus_link(struct file *file, struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); struct dentry *dentry; struct dentry *alias; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; @@ -1205,7 +1205,7 @@ static int fuse_direntplus_link(struct file *file, name.hash = full_name_hash(name.name, name.len); dentry = d_lookup(parent, &name); if (dentry) { - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) { d_drop(dentry); } else if (get_node_id(inode) != o->nodeid || @@ -1367,7 +1367,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) static char *read_link(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); char *link; @@ -1712,7 +1712,7 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; @@ -1726,7 +1726,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, struct kstat *stat) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); if (!fuse_allow_current_process(fc)) @@ -1738,7 +1738,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, static int fuse_setxattr(struct dentry *entry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; @@ -1774,7 +1774,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, static ssize_t fuse_getxattr(struct dentry *entry, const char *name, void *value, size_t size) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; @@ -1815,7 +1815,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; @@ -1857,7 +1857,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) static int fuse_removexattr(struct dentry *entry, const char *name) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); int err; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e8799c11424b..082ac1c97f39 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -421,7 +421,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) memset(&outarg, 0, sizeof(outarg)); args.in.numargs = 0; args.in.h.opcode = FUSE_STATFS; - args.in.h.nodeid = get_node_id(dentry->d_inode); + args.in.h.nodeid = get_node_id(d_inode(dentry)); args.out.numargs = 1; args.out.args[0].size = sizeof(outarg); args.out.args[0].value = &outarg; @@ -740,7 +740,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb, static struct dentry *fuse_get_parent(struct dentry *child) { - struct inode *child_inode = child->d_inode; + struct inode *child_inode = d_inode(child); struct fuse_conn *fc = get_fuse_conn(child_inode); struct inode *inode; struct dentry *parent; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 589f4ea9381c..30822b148f3e 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -48,9 +48,9 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; parent = dget_parent(dentry); - sdp = GFS2_SB(parent->d_inode); - dip = GFS2_I(parent->d_inode); - inode = dentry->d_inode; + sdp = GFS2_SB(d_inode(parent)); + dip = GFS2_I(d_inode(parent)); + inode = d_inode(dentry); if (inode) { if (is_bad_inode(inode)) @@ -68,7 +68,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) goto fail; } - error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); + error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip); switch (error) { case 0: if (!inode) @@ -113,10 +113,10 @@ static int gfs2_dentry_delete(const struct dentry *dentry) { struct gfs2_inode *ginode; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return 0; - ginode = GFS2_I(dentry->d_inode); + ginode = GFS2_I(d_inode(dentry)); if (!ginode->i_iopen_gh.gh_gl) return 0; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index c41d255b6a7b..5d15e9498b48 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -49,7 +49,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); *len = GFS2_SMALL_FH_SIZE; - if (!parent || inode == sb->s_root->d_inode) + if (!parent || inode == d_inode(sb->s_root)) return *len; ip = GFS2_I(parent); @@ -88,8 +88,8 @@ static int get_name_filldir(struct dir_context *ctx, const char *name, static int gfs2_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *dir = parent->d_inode; - struct inode *inode = child->d_inode; + struct inode *dir = d_inode(parent); + struct inode *inode = d_inode(child); struct gfs2_inode *dip, *ip; struct get_name_filldir gnfd = { .ctx.actor = get_name_filldir, @@ -128,7 +128,7 @@ static int gfs2_get_name(struct dentry *parent, char *name, static struct dentry *gfs2_get_parent(struct dentry *child) { - return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); + return d_obtain_alias(gfs2_lookupi(d_inode(child), &gfs2_qdotdot, 1)); } static struct dentry *gfs2_get_dentry(struct super_block *sb, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 08bc84d7e768..1b3ca7a2e3fc 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -295,7 +295,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) || (name->len == 2 && memcmp(name->name, "..", 2) == 0 && - dir == sb->s_root->d_inode)) { + dir == d_inode(sb->s_root))) { igrab(dir); return dir; } @@ -687,7 +687,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, } gfs2_set_inode_flags(inode); - if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || + if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) || (dip->i_diskflags & GFS2_DIF_TOPDIR)) aflags |= GFS2_AF_ORLOV; @@ -888,7 +888,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, { struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; @@ -1055,7 +1055,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, static int gfs2_unlink_inode(struct gfs2_inode *dip, const struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); int error; @@ -1091,7 +1091,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) { struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; @@ -1241,7 +1241,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, return PTR_ERR(d); if (d != NULL) dentry = d; - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { if (!(*opened & FILE_OPENED)) return finish_no_open(file, d); dput(d); @@ -1282,7 +1282,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) error = -EINVAL; break; } - if (dir == sb->s_root->d_inode) { + if (dir == d_inode(sb->s_root)) { error = 0; break; } @@ -1321,7 +1321,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, { struct gfs2_inode *odip = GFS2_I(odir); struct gfs2_inode *ndip = GFS2_I(ndir); - struct gfs2_inode *ip = GFS2_I(odentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(odentry)); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; @@ -1332,8 +1332,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, unsigned int x; int error; - if (ndentry->d_inode) { - nip = GFS2_I(ndentry->d_inode); + if (d_really_is_positive(ndentry)) { + nip = GFS2_I(d_inode(ndentry)); if (ip == nip) return 0; } @@ -1457,7 +1457,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE); + error = gfs2_permission(d_inode(odentry), MAY_WRITE); if (error) goto out_gunlock; } @@ -1550,7 +1550,7 @@ out: static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_holder i_gh; struct buffer_head *dibh; unsigned int size; @@ -1742,7 +1742,7 @@ out: static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; int error; @@ -1798,7 +1798,7 @@ out: static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; @@ -1821,7 +1821,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, static int gfs2_setxattr(struct dentry *dentry, const char *name, const void *data, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; @@ -1841,7 +1841,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, void *data, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; @@ -1862,7 +1862,7 @@ static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, static int gfs2_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index efc8e254787c..35b49f44c72f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -647,7 +647,7 @@ out_unlock: static int init_journal(struct gfs2_sbd *sdp, int undo) { - struct inode *master = sdp->sd_master_dir->d_inode; + struct inode *master = d_inode(sdp->sd_master_dir); struct gfs2_holder ji_gh; struct gfs2_inode *ip; int jindex = 1; @@ -782,7 +782,7 @@ static struct lock_class_key gfs2_quota_imutex_key; static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; - struct inode *master = sdp->sd_master_dir->d_inode; + struct inode *master = d_inode(sdp->sd_master_dir); if (undo) goto fail_qinode; @@ -848,7 +848,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) char buf[30]; int error = 0; struct gfs2_inode *ip; - struct inode *master = sdp->sd_master_dir->d_inode; + struct inode *master = d_inode(sdp->sd_master_dir); if (sdp->sd_args.ar_spectator) return 0; @@ -1357,7 +1357,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, return ERR_PTR(error); } s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, - path.dentry->d_inode->i_sb->s_bdev); + d_inode(path.dentry)->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { pr_warn("gfs2 mount does not exist\n"); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 1666382b198d..859c6edbf81a 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1171,7 +1171,7 @@ static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *s static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct super_block *sb = dentry->d_inode->i_sb; + struct super_block *sb = d_inode(dentry)->i_sb; struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_statfs_change_host sc; int error; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index fd260ce8869a..4c096fa9e2a1 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -420,7 +420,7 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_ea_request er; struct gfs2_holder i_gh; int error; @@ -586,7 +586,7 @@ out: static int gfs2_xattr_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_ea_location el; int error; @@ -1230,7 +1230,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, static int gfs2_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - return __gfs2_xattr_set(dentry->d_inode, name, value, + return __gfs2_xattr_set(d_inode(dentry), name, value, size, flags, type); } diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index e057ec542a6a..8d931b157bbe 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -16,7 +16,7 @@ int hfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; @@ -59,7 +59,7 @@ out: ssize_t hfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; @@ -105,7 +105,7 @@ out: ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) return -EOPNOTSUPP; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 36d1a6ae7655..70788e03820a 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -253,7 +253,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) */ static int hfs_remove(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int res; if (S_ISDIR(inode->i_mode) && inode->i_size != 2) @@ -285,18 +285,18 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, int res; /* Unlink destination if it already exists */ - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { res = hfs_remove(new_dir, new_dentry); if (res) return res; } - res = hfs_cat_move(old_dentry->d_inode->i_ino, + res = hfs_cat_move(d_inode(old_dentry)->i_ino, old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); if (!res) hfs_cat_build_key(old_dir->i_sb, - (btree_key *)&HFS_I(old_dentry->d_inode)->cat_key, + (btree_key *)&HFS_I(d_inode(old_dentry))->cat_key, new_dir->i_ino, &new_dentry->d_name); return res; } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 75fd5d873c19..b99ebddb10cb 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -600,7 +600,7 @@ static int hfs_file_release(struct inode *inode, struct file *file) int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 91b91fd3a901..2875961fdc10 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -21,7 +21,7 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); if(!inode) return 1; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 3074609befc3..d0f39dcbb58e 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -81,7 +81,7 @@ again: HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)-> create_date || entry.file.create_date == - HFSPLUS_I(sb->s_root->d_inode)-> + HFSPLUS_I(d_inode(sb->s_root))-> create_date) && HFSPLUS_SB(sb)->hidden_dir) { struct qstr str; @@ -296,8 +296,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, struct dentry *dst_dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); - struct inode *inode = src_dentry->d_inode; - struct inode *src_dir = src_dentry->d_parent->d_inode; + struct inode *inode = d_inode(src_dentry); + struct inode *src_dir = d_inode(src_dentry->d_parent); struct qstr str; char name[32]; u32 cnid, id; @@ -353,7 +353,7 @@ out: static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct qstr str; char name[32]; u32 cnid; @@ -410,7 +410,7 @@ out: static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int res; if (inode->i_size != 2) @@ -529,7 +529,7 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, int res; /* Unlink destination if it already exists */ - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { if (d_is_dir(new_dentry)) res = hfsplus_rmdir(new_dir, new_dentry); else diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b0afedbef12b..6dd107d7421e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -243,7 +243,7 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 8e98f5db6ad6..0624ce4e0702 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -26,7 +26,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); struct hfsplus_vh *vh = sbi->s_vhdr; struct hfsplus_vh *bvh = sbi->s_backup_vhdr; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 89f262d8fcd8..416b1dbafe51 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -440,7 +440,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_setxattr(dentry->d_inode, xattr_name, value, size, + res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size, flags); kfree(xattr_name); return res; @@ -600,7 +600,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_getxattr(dentry->d_inode, xattr_name, value, size); + res = __hfsplus_getxattr(d_inode(dentry), xattr_name, value, size); kfree(xattr_name); return res; @@ -620,7 +620,7 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, char *buffer, size_t size) { ssize_t res = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 entry_type; u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; @@ -688,7 +688,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) { ssize_t err; ssize_t res = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 key_len = 0; struct hfsplus_attr_key attr_key; @@ -868,7 +868,7 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ - return __hfsplus_getxattr(dentry->d_inode, name, buffer, size); + return __hfsplus_getxattr(d_inode(dentry), name, buffer, size); } static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, @@ -890,7 +890,7 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ - return __hfsplus_setxattr(dentry->d_inode, name, buffer, size, flags); + return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); } static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index b83a0343378b..07d8d8f52faf 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -581,7 +581,7 @@ static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (name == NULL) goto out_put; - fd = file_create(name, mode & S_IFMT); + fd = file_create(name, mode & 0777); if (fd < 0) error = fd; else @@ -807,7 +807,7 @@ static int hostfs_permission(struct inode *ino, int desired) static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hostfs_iattr attrs; char *name; int err; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 7ce4b74234a1..933c73780813 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -257,7 +257,7 @@ void hpfs_write_inode_nolock(struct inode *i) int hpfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error = -EINVAL; hpfs_lock(inode->i_sb); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index bdbc2c3080a4..a0872f239f04 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -359,7 +359,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); dnode_secno dno; int r; int rep = 0; @@ -433,7 +433,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); dnode_secno dno; int n_items = 0; int err; @@ -522,8 +522,8 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned old_len = old_dentry->d_name.len; const unsigned char *new_name = new_dentry->d_name.name; unsigned new_len = new_dentry->d_name.len; - struct inode *i = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *i = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct quad_buffer_head qbh, qbh1; struct hpfs_dirent *dep, *nde; struct hpfs_dirent de; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 043ac9d77262..fa2bd5366ecf 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -153,9 +153,9 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, return ERR_PTR(-ENOENT); parent = HPPFS_I(ino)->proc_dentry; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); proc_dentry = lookup_one_len(name->name, parent, name->len); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); if (IS_ERR(proc_dentry)) return proc_dentry; @@ -637,25 +637,25 @@ static const struct super_operations hppfs_sbops = { static int hppfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; - return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, + struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; + return d_inode(proc_dentry)->i_op->readlink(proc_dentry, buffer, buflen); } static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); + return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, nd); } static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { - struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - if (proc_dentry->d_inode->i_op->put_link) - proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie); + if (d_inode(proc_dentry)->i_op->put_link) + d_inode(proc_dentry)->i_op->put_link(proc_dentry, nd, cookie); } static const struct inode_operations hppfs_dir_iops = { @@ -670,7 +670,7 @@ static const struct inode_operations hppfs_link_iops = { static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) { - struct inode *proc_ino = dentry->d_inode; + struct inode *proc_ino = d_inode(dentry); struct inode *inode = new_inode(sb); if (!inode) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2640d88b0e63..87724c1d7be6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -393,7 +393,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; @@ -587,7 +587,7 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); - struct hstate *h = hstate_inode(dentry->d_inode); + struct hstate *h = hstate_inode(d_inode(dentry)); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); diff --git a/fs/inode.c b/fs/inode.c index f00b16f45507..ea37cd17b53f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1587,7 +1587,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; - struct inode *inode = path->dentry->d_inode; + struct inode *inode = d_inode(path->dentry); struct timespec now; if (inode->i_flags & S_NOATIME) @@ -1639,7 +1639,7 @@ EXPORT_SYMBOL(touch_atime); */ int should_remove_suid(struct dentry *dentry) { - umode_t mode = dentry->d_inode->i_mode; + umode_t mode = d_inode(dentry)->i_mode; int kill = 0; /* suid always must be killed */ @@ -1675,7 +1675,7 @@ static int __remove_suid(struct dentry *dentry, int kill) int file_remove_suid(struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int killsuid; int killpriv; int error = 0; @@ -1946,20 +1946,6 @@ void inode_dio_wait(struct inode *inode) EXPORT_SYMBOL(inode_dio_wait); /* - * inode_dio_done - signal finish of a direct I/O requests - * @inode: inode the direct I/O happens on - * - * This is called once we've finished processing a direct I/O request, - * and is used to wake up callers waiting for direct I/O to be quiesced. - */ -void inode_dio_done(struct inode *inode) -{ - if (atomic_dec_and_test(&inode->i_dio_count)) - wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); -} -EXPORT_SYMBOL(inode_dio_done); - -/* * inode_set_flags - atomically set some inode flags * * Note: the caller should be holding i_mutex, or else be sure that diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 12088d8de3fa..0c5f721b4e91 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -44,7 +44,7 @@ static struct dentry *isofs_export_get_parent(struct dentry *child) { unsigned long parent_block = 0; unsigned long parent_offset = 0; - struct inode *child_inode = child->d_inode; + struct inode *child_inode = d_inode(child); struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); struct iso_directory_record *de = NULL; struct buffer_head * bh = NULL; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index b5128c6e63ad..a9079d035ae5 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -842,15 +842,23 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, { jbd2_journal_revoke_header_t *header; int offset, max; + int csum_size = 0; + __u32 rcount; int record_len = 4; header = (jbd2_journal_revoke_header_t *) bh->b_data; offset = sizeof(jbd2_journal_revoke_header_t); - max = be32_to_cpu(header->r_count); + rcount = be32_to_cpu(header->r_count); if (!jbd2_revoke_block_csum_verify(journal, header)) return -EINVAL; + if (jbd2_journal_has_csum_v2or3(journal)) + csum_size = sizeof(struct jbd2_journal_revoke_tail); + if (rcount > journal->j_blocksize - csum_size) + return -EINVAL; + max = rcount; + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) record_len = 8; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index c6cbaef2bda1..14214da80eb8 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -577,7 +577,7 @@ static void write_one_revoke_record(journal_t *journal, { int csum_size = 0; struct buffer_head *descriptor; - int offset; + int sz, offset; journal_header_t *header; /* If we are already aborting, this all becomes a noop. We @@ -594,9 +594,14 @@ static void write_one_revoke_record(journal_t *journal, if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + sz = 8; + else + sz = 4; + /* Make sure we have a descriptor with space left for the record */ if (descriptor) { - if (offset >= journal->j_blocksize - csum_size) { + if (offset + sz > journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } @@ -619,16 +624,13 @@ static void write_one_revoke_record(journal_t *journal, *descriptorp = descriptor; } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); - offset += 8; - - } else { + else * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); - offset += 4; - } + offset += sz; *offsetp = offset; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5f09370c90a8..ff2f2e6ad311 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -551,7 +551,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) int result; int wanted; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -627,7 +626,6 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) tid_t tid; int need_to_start, ret; - WARN_ON(!transaction); /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) @@ -785,7 +783,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int need_copy = 0; unsigned long start_lock, time_lock; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -1051,7 +1048,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) int err; jbd_debug(5, "journal_head %p\n", jh); - WARN_ON(!transaction); err = -EROFS; if (is_handle_aborted(handle)) goto out; @@ -1266,7 +1262,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int ret = 0; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -1397,7 +1392,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) int err = 0; int was_modified = 0; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -1530,8 +1524,22 @@ int jbd2_journal_stop(handle_t *handle) tid_t tid; pid_t pid; - if (!transaction) - goto free_and_exit; + if (!transaction) { + /* + * Handle is already detached from the transaction so + * there is nothing to do other than decrease a refcount, + * or free the handle if refcount drops to zero + */ + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } else { + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); + goto free_and_exit; + } + } journal = transaction->t_journal; J_ASSERT(journal_current_handle() == handle); @@ -2373,7 +2381,6 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) transaction_t *transaction = handle->h_transaction; journal_t *journal; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index f21b6fb5e4c4..1ba5c97943b8 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -224,14 +224,14 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) { struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); - struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(d_inode(dentry)); int ret; uint32_t now = get_seconds(); ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, dead_f, now); if (dead_f->inocache) - set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink); + set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); if (!ret) dir_i->i_mtime = dir_i->i_ctime = ITIME(now); return ret; @@ -241,8 +241,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry) { - struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dentry->d_inode->i_sb); - struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(d_inode(old_dentry)->i_sb); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry)); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); int ret; uint8_t type; @@ -256,7 +256,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de return -EPERM; /* XXX: This is ugly */ - type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12; if (!type) type = DT_REG; now = get_seconds(); @@ -264,11 +264,11 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de if (!ret) { mutex_lock(&f->sem); - set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink); + set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); mutex_unlock(&f->sem); - d_instantiate(dentry, old_dentry->d_inode); + d_instantiate(dentry, d_inode(old_dentry)); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); - ihold(old_dentry->d_inode); + ihold(d_inode(old_dentry)); } return ret; } @@ -585,7 +585,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) { struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); - struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry)); struct jffs2_full_dirent *fd; int ret; uint32_t now = get_seconds(); @@ -599,7 +599,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) dentry->d_name.len, f, now); if (!ret) { dir_i->i_mtime = dir_i->i_ctime = ITIME(now); - clear_nlink(dentry->d_inode); + clear_nlink(d_inode(dentry)); drop_nlink(dir_i); } return ret; @@ -770,8 +770,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, * the VFS can't check whether the victim is empty. The filesystem * needs to do that for itself. */ - if (new_dentry->d_inode) { - victim_f = JFFS2_INODE_INFO(new_dentry->d_inode); + if (d_really_is_positive(new_dentry)) { + victim_f = JFFS2_INODE_INFO(d_inode(new_dentry)); if (d_is_dir(new_dentry)) { struct jffs2_full_dirent *fd; @@ -794,12 +794,12 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, /* Make a hard link */ /* XXX: This is ugly */ - type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12; if (!type) type = DT_REG; now = get_seconds(); ret = jffs2_do_link(c, JFFS2_INODE_INFO(new_dir_i), - old_dentry->d_inode->i_ino, type, + d_inode(old_dentry)->i_ino, type, new_dentry->d_name.name, new_dentry->d_name.len, now); if (ret) @@ -808,9 +808,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (victim_f) { /* There was a victim. Kill it off nicely */ if (d_is_dir(new_dentry)) - clear_nlink(new_dentry->d_inode); + clear_nlink(d_inode(new_dentry)); else - drop_nlink(new_dentry->d_inode); + drop_nlink(d_inode(new_dentry)); /* Don't oops if the victim was a dirent pointing to an inode which didn't exist. */ if (victim_f->inocache) { @@ -836,9 +836,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (ret) { /* Oh shit. We really ought to make a single node which can do both atomically */ - struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry)); mutex_lock(&f->sem); - inc_nlink(old_dentry->d_inode); + inc_nlink(d_inode(old_dentry)); if (f->inocache && !d_is_dir(old_dentry)) f->inocache->pino_nlink++; mutex_unlock(&f->sem); @@ -846,8 +846,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n", __func__, ret); /* Might as well let the VFS know */ - d_instantiate(new_dentry, old_dentry->d_inode); - ihold(old_dentry->d_inode); + d_instantiate(new_dentry, d_inode(old_dentry)); + ihold(d_inode(old_dentry)); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 601afd1afddf..fe5ea080b4ec 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,7 +190,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; rc = inode_change_ok(inode, iattr); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index aca97f35b292..d4b43fb7adb1 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -54,7 +54,7 @@ static int jffs2_security_getxattr(struct dentry *dentry, const char *name, if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size); } @@ -64,7 +64,7 @@ static int jffs2_security_setxattr(struct dentry *dentry, const char *name, if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 3d76f28a2ba9..d86c5e3176a1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -140,14 +140,14 @@ static struct dentry *jffs2_get_parent(struct dentry *child) BUG_ON(!d_is_dir(child)); - f = JFFS2_INODE_INFO(child->d_inode); + f = JFFS2_INODE_INFO(d_inode(child)); pino = f->inocache->pino_nlink; JFFS2_DEBUG("Parent of directory ino #%u is #%u\n", f->inocache->ino, pino); - return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); + return d_obtain_alias(jffs2_iget(d_inode(child)->i_sb, pino)); } static const struct export_operations jffs2_export_ops = { diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index c7c77b0dfccd..1fefa25d0fa5 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -31,7 +31,7 @@ const struct inode_operations jffs2_symlink_inode_operations = static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry)); char *p = (char *)f->target; /* diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 762c7a3cf43d..f092fee5be50 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -960,7 +960,7 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) { ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_inode_cache *ic = f->inocache; @@ -1266,7 +1266,6 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ if (rc) { JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", __func__, rc, totlen); - rc = rc ? rc : -EBADFD; goto out; } rc = save_xattr_ref(c, ref); diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 1c868194c504..ceaf9c693225 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -21,7 +21,7 @@ static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size); } @@ -30,7 +30,7 @@ static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 916b5c966039..a71391eba514 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -21,7 +21,7 @@ static int jffs2_user_getxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size); } @@ -30,7 +30,7 @@ static int jffs2_user_setxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size, flags); } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index ae46788b9723..e98d39d75cf4 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -100,7 +100,7 @@ static int jfs_release(struct inode *inode, struct file *file) int jfs_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; rc = inode_change_ok(inode, iattr); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 38fdc533f4ec..66db7bc0ed10 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -346,7 +346,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) { int rc; tid_t tid; /* transaction id */ - struct inode *ip = dentry->d_inode; + struct inode *ip = d_inode(dentry); ino_t ino; struct component_name dname; struct inode *iplist[2]; @@ -472,7 +472,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) { int rc; tid_t tid; /* transaction id */ - struct inode *ip = dentry->d_inode; + struct inode *ip = d_inode(dentry); ino_t ino; struct component_name dname; /* object name */ struct inode *iplist[2]; @@ -791,7 +791,7 @@ static int jfs_link(struct dentry *old_dentry, { int rc; tid_t tid; - struct inode *ip = old_dentry->d_inode; + struct inode *ip = d_inode(old_dentry); ino_t ino; struct component_name dname; struct btstack btstack; @@ -879,7 +879,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, struct component_name dname; int ssize; /* source pathname size */ struct btstack btstack; - struct inode *ip = dentry->d_inode; + struct inode *ip = d_inode(dentry); unchar *i_fastsymlink; s64 xlen = 0; int bmask = 0, xsize; @@ -1086,8 +1086,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, dquot_initialize(old_dir); dquot_initialize(new_dir); - old_ip = old_dentry->d_inode; - new_ip = new_dentry->d_inode; + old_ip = d_inode(old_dentry); + new_ip = d_inode(new_dentry); if ((rc = get_UCSname(&old_dname, old_dentry))) goto out1; @@ -1500,9 +1500,9 @@ struct dentry *jfs_get_parent(struct dentry *dentry) unsigned long parent_ino; parent_ino = - le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot); + le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot); - return d_obtain_alias(jfs_iget(dentry->d_inode->i_sb, parent_ino)); + return d_obtain_alias(jfs_iget(d_inode(dentry)->i_sb, parent_ino)); } const struct inode_operations jfs_dir_inode_operations = { @@ -1578,7 +1578,7 @@ static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) * positive dentry isn't good idea. So it's unsupported like * rename("filename", "FILENAME") for now. */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return 1; /* diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 205b946d8e0d..80f42bcc4ef1 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -24,7 +24,7 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - char *s = JFS_IP(dentry->d_inode)->i_inline; + char *s = JFS_IP(d_inode(dentry))->i_inline; nd_set_link(nd, s); return NULL; } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 46325d5c34fc..48b15a6e5558 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -849,7 +849,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t value_len, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct jfs_inode_info *ji = JFS_IP(inode); int rc; tid_t tid; @@ -872,7 +872,7 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); - rc = __jfs_setxattr(tid, dentry->d_inode, name, value, value_len, + rc = __jfs_setxattr(tid, d_inode(dentry), name, value, value_len, flags); if (!rc) rc = txCommit(tid, 1, &inode, 0); @@ -959,7 +959,7 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, return -EOPNOTSUPP; } - err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); + err = __jfs_getxattr(d_inode(dentry), name, data, buf_size); return err; } @@ -976,7 +976,7 @@ static inline int can_list(struct jfs_ea *ea) ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); char *buffer; ssize_t size = 0; int xattr_size; @@ -1029,7 +1029,7 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) int jfs_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct jfs_inode_info *ji = JFS_IP(inode); int rc; tid_t tid; @@ -1047,7 +1047,7 @@ int jfs_removexattr(struct dentry *dentry, const char *name) tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); - rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); + rc = __jfs_setxattr(tid, d_inode(dentry), name, NULL, 0, XATTR_REPLACE); if (!rc) rc = txCommit(tid, 1, &inode, 0); txEnd(tid); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 6acc9648f986..fffca9517321 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -444,7 +444,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; /* Always perform fresh lookup for negatives */ - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out_bad_unlocked; kn = dentry->d_fsdata; @@ -518,7 +518,14 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + /* + * If the ino of the sysfs entry created for a kmem cache gets + * allocated from an ida layer, which is accounted to the memcg that + * owns the cache, the memcg will get pinned forever. So do not account + * ino ida allocations. + */ + ret = ida_simple_get(&root->ino_ida, 1, 0, + GFP_KERNEL | __GFP_NOACCOUNT); if (ret < 0) goto err_out2; kn->ino = ret; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 9000874a945b..2da8493a380b 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -111,7 +111,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct kernfs_node *kn = dentry->d_fsdata; int error; @@ -172,11 +172,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(dentry->d_inode, suffix, + error = security_inode_setsecurity(d_inode(dentry), suffix, value, size, flags); if (error) return error; - error = security_inode_getsecctx(dentry->d_inode, + error = security_inode_getsecctx(d_inode(dentry), &secdata, &secdata_len); if (error) return error; @@ -271,7 +271,7 @@ int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct kernfs_node *kn = dentry->d_fsdata; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mutex_lock(&kernfs_mutex); kernfs_refresh_inode(kn, inode); diff --git a/fs/libfs.c b/fs/libfs.c index 0ab65122ee45..cb1fb4b9b637 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -22,13 +22,13 @@ static inline int simple_positive(struct dentry *dentry) { - return dentry->d_inode && !d_unhashed(dentry); + return d_really_is_positive(dentry) && !d_unhashed(dentry); } int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9); return 0; @@ -94,7 +94,7 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); switch (whence) { case 1: offset += file->f_pos; @@ -102,7 +102,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -129,7 +129,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -169,7 +169,7 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) spin_unlock(&next->d_lock); spin_unlock(&dentry->d_lock); if (!dir_emit(ctx, next->d_name.name, next->d_name.len, - next->d_inode->i_ino, dt_type(next->d_inode))) + d_inode(next)->i_ino, dt_type(d_inode(next)))) return 0; spin_lock(&dentry->d_lock); spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); @@ -270,7 +270,7 @@ EXPORT_SYMBOL(simple_open); int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); @@ -304,7 +304,7 @@ EXPORT_SYMBOL(simple_empty); int simple_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; drop_nlink(inode); @@ -318,7 +318,7 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); simple_unlink(dir, dentry); drop_nlink(dir); return 0; @@ -328,16 +328,16 @@ EXPORT_SYMBOL(simple_rmdir); int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { simple_unlink(new_dir, new_dentry); if (they_are_dirs) { - drop_nlink(new_dentry->d_inode); + drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { @@ -368,7 +368,7 @@ EXPORT_SYMBOL(simple_rename); */ int simple_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, iattr); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 665ef5a05183..a563ddbc19e6 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -31,7 +31,7 @@ static struct hlist_head nlm_files[FILE_NRHASH]; static DEFINE_MUTEX(nlm_file_mutex); -#ifdef NFSD_DEBUG +#ifdef CONFIG_SUNRPC_DEBUG static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) { u32 *fhp = (u32*)f->data; diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 6bdc347008f5..4cf38f118549 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -213,7 +213,7 @@ static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) static int logfs_unlink(struct inode *dir, struct dentry *dentry) { struct logfs_super *super = logfs_super(dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct logfs_transaction *ta; struct page *page; pgoff_t index; @@ -271,7 +271,7 @@ static inline int logfs_empty_dir(struct inode *dir) static int logfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!logfs_empty_dir(inode)) return -ENOTEMPTY; @@ -537,7 +537,7 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, static int logfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ihold(inode); @@ -607,7 +607,7 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, /* 2. write target dd */ mutex_lock(&super->s_dirop_mutex); logfs_add_transaction(new_dir, ta); - err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry)); if (!err) err = write_inode(new_dir); @@ -658,8 +658,8 @@ static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct logfs_super *super = logfs_super(old_dir->i_sb); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); int isdir = S_ISDIR(old_inode->i_mode); struct logfs_disk_dentry dd; struct logfs_transaction *ta; @@ -719,7 +719,7 @@ out: static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - if (new_dentry->d_inode) + if (d_really_is_positive(new_dentry)) return logfs_rename_target(old_dir, old_dentry, new_dir, new_dentry); return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/logfs/file.c b/fs/logfs/file.c index b2c13f739ffa..1a6f0167b16a 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -241,7 +241,7 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) static int logfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = 0; err = inode_change_ok(inode, attr); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index dfaf6fa9b7b5..118e4e7bc935 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -156,7 +156,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; - struct inode * dir = dentry->d_parent->d_inode; + struct inode * dir = d_inode(dentry->d_parent); struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); unsigned long n; @@ -203,7 +203,7 @@ found: int minix_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct super_block * sb = dir->i_sb; diff --git a/fs/minix/file.c b/fs/minix/file.c index 6d63e27ec961..94f0eb9a6e2c 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -23,7 +23,7 @@ const struct file_operations minix_file_operations = { static int minix_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 3f57af196a7d..1182d1e26a9c 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -626,8 +626,8 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct super_block *sb = dentry->d_sb; - generic_fillattr(dentry->d_inode, stat); - if (INODE_VERSION(dentry->d_inode) == MINIX_V1) + generic_fillattr(d_inode(dentry), stat); + if (INODE_VERSION(d_inode(dentry)) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index cd950e2331b6..a795a11e50c7 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -104,7 +104,7 @@ out_fail: static int minix_link(struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -151,7 +151,7 @@ out_dir: static int minix_unlink(struct inode * dir, struct dentry *dentry) { int err = -ENOENT; - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct page * page; struct minix_dir_entry * de; @@ -171,7 +171,7 @@ end_unlink: static int minix_rmdir(struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); int err = -ENOTEMPTY; if (minix_empty_dir(inode)) { @@ -187,8 +187,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir, struct dentry *new_dentry) { - struct inode * old_inode = old_dentry->d_inode; - struct inode * new_inode = new_dentry->d_inode; + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct minix_dir_entry * dir_de = NULL; struct page * old_page; diff --git a/fs/namei.c b/fs/namei.c index ffab2e06e147..fe30d3be43a8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1415,6 +1415,7 @@ static int lookup_fast(struct nameidata *nd, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; + bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (!dentry) goto unlazy; @@ -1424,8 +1425,11 @@ static int lookup_fast(struct nameidata *nd, * the dentry name information from lookup. */ *inode = dentry->d_inode; + negative = d_is_negative(dentry); if (read_seqcount_retry(&dentry->d_seq, seq)) return -ECHILD; + if (negative) + return -ENOENT; /* * This sequence count validates that the parent had no @@ -1472,6 +1476,10 @@ unlazy: goto need_lookup; } + if (unlikely(d_is_negative(dentry))) { + dput(dentry); + return -ENOENT; + } path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1583,14 +1591,15 @@ static inline int walk_component(struct nameidata *nd, struct path *path, goto out_err; inode = path->dentry->d_inode; + err = -ENOENT; + if (d_is_negative(path->dentry)) + goto out_path_put; } - err = -ENOENT; - if (d_is_negative(path->dentry)) - goto out_path_put; if (should_follow_link(path->dentry, follow)) { if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { + if (unlikely(nd->path.mnt != path->mnt || + unlazy_walk(nd, path->dentry))) { err = -ECHILD; goto out_err; } @@ -3035,17 +3044,17 @@ retry_lookup: BUG_ON(nd->flags & LOOKUP_RCU); inode = path->dentry->d_inode; -finish_lookup: - /* we _can_ be in RCU mode here */ error = -ENOENT; if (d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } - +finish_lookup: + /* we _can_ be in RCU mode here */ if (should_follow_link(path->dentry, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { + if (unlikely(nd->path.mnt != path->mnt || + unlazy_walk(nd, path->dentry))) { error = -ECHILD; goto out; } @@ -3224,7 +3233,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); - goto out; + goto out2; } error = path_init(dfd, pathname, flags, nd); @@ -3254,6 +3263,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, } out: path_cleanup(nd); +out2: if (!(opened & FILE_OPENED)) { BUG_ON(!error); put_filp(file); diff --git a/fs/namespace.c b/fs/namespace.c index 82ef1405260e..1b9e11167bae 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -632,14 +632,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) */ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) { - struct mount *p, *res; - res = p = __lookup_mnt(mnt, dentry); + struct mount *p, *res = NULL; + p = __lookup_mnt(mnt, dentry); if (!p) goto out; + if (!(p->mnt.mnt_flags & MNT_UMOUNT)) + res = p; hlist_for_each_entry_continue(p, mnt_hash) { if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) break; - res = p; + if (!(p->mnt.mnt_flags & MNT_UMOUNT)) + res = p; } out: return res; @@ -795,10 +798,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) /* * vfsmount lock must be held for write */ -static void detach_mnt(struct mount *mnt, struct path *old_path) +static void unhash_mnt(struct mount *mnt) { - old_path->dentry = mnt->mnt_mountpoint; - old_path->mnt = &mnt->mnt_parent->mnt; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); @@ -811,6 +812,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) /* * vfsmount lock must be held for write */ +static void detach_mnt(struct mount *mnt, struct path *old_path) +{ + old_path->dentry = mnt->mnt_mountpoint; + old_path->mnt = &mnt->mnt_parent->mnt; + unhash_mnt(mnt); +} + +/* + * vfsmount lock must be held for write + */ +static void umount_mnt(struct mount *mnt) +{ + /* old mountpoint will be dropped when we can do that */ + mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint; + unhash_mnt(mnt); +} + +/* + * vfsmount lock must be held for write + */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) @@ -1078,6 +1099,13 @@ static void mntput_no_expire(struct mount *mnt) rcu_read_unlock(); list_del(&mnt->mnt_instance); + + if (unlikely(!list_empty(&mnt->mnt_mounts))) { + struct mount *p, *tmp; + list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { + umount_mnt(p); + } + } unlock_mount_hash(); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { @@ -1298,17 +1326,15 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static void namespace_unlock(void) { - struct hlist_head head = unmounted; + struct hlist_head head; - if (likely(hlist_empty(&head))) { - up_write(&namespace_sem); - return; - } + hlist_move_list(&unmounted, &head); - head.first->pprev = &head.first; - INIT_HLIST_HEAD(&unmounted); up_write(&namespace_sem); + if (likely(hlist_empty(&head))) + return; + synchronize_rcu(); group_pin_kill(&head); @@ -1319,49 +1345,63 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } +enum umount_tree_flags { + UMOUNT_SYNC = 1, + UMOUNT_PROPAGATE = 2, + UMOUNT_CONNECTED = 4, +}; /* * mount_lock must be held * namespace_sem must be held for write - * how = 0 => just this tree, don't propagate - * how = 1 => propagate; we know that nobody else has reference to any victims - * how = 2 => lazy umount */ -void umount_tree(struct mount *mnt, int how) +static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { - HLIST_HEAD(tmp_list); + LIST_HEAD(tmp_list); struct mount *p; + if (how & UMOUNT_PROPAGATE) + propagate_mount_unlock(mnt); + + /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { - hlist_del_init_rcu(&p->mnt_hash); - hlist_add_head(&p->mnt_hash, &tmp_list); + p->mnt.mnt_flags |= MNT_UMOUNT; + list_move(&p->mnt_list, &tmp_list); } - hlist_for_each_entry(p, &tmp_list, mnt_hash) + /* Hide the mounts from mnt_mounts */ + list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child); + } - if (how) + /* Add propogated mounts to the tmp_list */ + if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); - while (!hlist_empty(&tmp_list)) { - p = hlist_entry(tmp_list.first, struct mount, mnt_hash); - hlist_del_init_rcu(&p->mnt_hash); + while (!list_empty(&tmp_list)) { + bool disconnect; + p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; - if (how < 2) + if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; - pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted); + disconnect = !(((how & UMOUNT_CONNECTED) && + mnt_has_parent(p) && + (p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) || + IS_MNT_LOCKED_AND_LAZY(p)); + + pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, + disconnect ? &unmounted : NULL); if (mnt_has_parent(p)) { - hlist_del_init(&p->mnt_mp_list); - put_mountpoint(p->mnt_mp); mnt_add_count(p->mnt_parent, -1); - /* old mountpoint will be dropped when we can do that */ - p->mnt_ex_mountpoint = p->mnt_mountpoint; - p->mnt_mountpoint = p->mnt.mnt_root; - p->mnt_parent = p; - p->mnt_mp = NULL; + if (!disconnect) { + /* Don't forget about p */ + list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); + } else { + umount_mnt(p); + } } change_mnt_propagation(p, MS_PRIVATE); } @@ -1447,14 +1487,14 @@ static int do_umount(struct mount *mnt, int flags) if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 2); + umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } @@ -1480,13 +1520,20 @@ void __detach_mounts(struct dentry *dentry) namespace_lock(); mp = lookup_mountpoint(dentry); - if (!mp) + if (IS_ERR_OR_NULL(mp)) goto out_unlock; lock_mount_hash(); while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); - umount_tree(mnt, 2); + if (mnt->mnt.mnt_flags & MNT_UMOUNT) { + struct mount *p, *tmp; + list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { + hlist_add_head(&p->mnt_umount.s_list, &unmounted); + umount_mnt(p); + } + } + else umount_tree(mnt, UMOUNT_CONNECTED); } unlock_mount_hash(); put_mountpoint(mp); @@ -1648,7 +1695,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, out: if (res) { lock_mount_hash(); - umount_tree(res, 0); + umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } return q; @@ -1660,8 +1707,11 @@ struct vfsmount *collect_mounts(struct path *path) { struct mount *tree; namespace_lock(); - tree = copy_tree(real_mount(path->mnt), path->dentry, - CL_COPY_ALL | CL_PRIVATE); + if (!check_mnt(real_mount(path->mnt))) + tree = ERR_PTR(-EINVAL); + else + tree = copy_tree(real_mount(path->mnt), path->dentry, + CL_COPY_ALL | CL_PRIVATE); namespace_unlock(); if (IS_ERR(tree)) return ERR_CAST(tree); @@ -1672,7 +1722,7 @@ void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); lock_mount_hash(); - umount_tree(real_mount(mnt), 0); + umount_tree(real_mount(mnt), UMOUNT_SYNC); unlock_mount_hash(); namespace_unlock(); } @@ -1855,7 +1905,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); - umount_tree(child, 0); + umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); @@ -2035,7 +2085,7 @@ static int do_loopback(struct path *path, const char *old_name, err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); - umount_tree(mnt, 0); + umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } out2: @@ -2406,7 +2456,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } unlock_mount_hash(); namespace_unlock(); @@ -2477,7 +2527,7 @@ static void shrink_submounts(struct mount *mnt) m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); - umount_tree(m, 1); + umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } } @@ -3129,6 +3179,12 @@ bool fs_fully_visible(struct file_system_type *type) if (mnt->mnt.mnt_sb->s_type != type) continue; + /* This mount is not fully visible if it's root directory + * is not the root directory of the filesystem. + */ + if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + continue; + /* This mount is not fully visible if there are any child mounts * that cover anything except for empty directories. */ diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index e7ca827d7694..80021c709af9 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -127,7 +127,7 @@ static inline int ncp_case_sensitive(const struct inode *i) static int ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) { - struct inode *inode = ACCESS_ONCE(dentry->d_inode); + struct inode *inode = d_inode_rcu(dentry); if (!inode) return 0; @@ -162,7 +162,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, if (len != name->len) return 1; - pinode = ACCESS_ONCE(parent->d_inode); + pinode = d_inode_rcu(parent); if (!pinode) return 1; @@ -180,7 +180,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, static int ncp_delete_dentry(const struct dentry * dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode) { if (is_bad_inode(inode)) @@ -224,7 +224,7 @@ ncp_force_unlink(struct inode *dir, struct dentry* dentry) memset(&info, 0, sizeof(info)); /* remove the Read-Only flag on the NW server */ - inode = dentry->d_inode; + inode = d_inode(dentry); old_nwattr = NCP_FINFO(inode)->nwattr; info.attributes = old_nwattr & ~(aRONLY|aDELETEINHIBIT|aRENAMEINHIBIT); @@ -254,7 +254,7 @@ ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_na { struct nw_modify_dos_info info; int res=0x90,res2; - struct inode *old_inode = old_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); __le32 old_nwattr = NCP_FINFO(old_inode)->nwattr; __le32 new_nwattr = 0; /* shut compiler warning */ int old_nwattr_changed = 0; @@ -268,8 +268,8 @@ ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_na res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); if (!res2) old_nwattr_changed = 1; - if (new_dentry && new_dentry->d_inode) { - new_nwattr = NCP_FINFO(new_dentry->d_inode)->nwattr; + if (new_dentry && d_really_is_positive(new_dentry)) { + new_nwattr = NCP_FINFO(d_inode(new_dentry))->nwattr; info.attributes = new_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); if (!res2) @@ -324,9 +324,9 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) return -ECHILD; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto finished; server = NCP_SERVER(dir); @@ -367,7 +367,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) * what we remember, it's not valid any more. */ if (!res) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mutex_lock(&inode->i_mutex); if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { @@ -388,7 +388,7 @@ finished: static time_t ncp_obtain_mtime(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ncp_server *server = NCP_SERVER(inode); struct nw_info_struct i; @@ -404,7 +404,7 @@ static time_t ncp_obtain_mtime(struct dentry *dentry) static inline void ncp_invalidate_dircache_entries(struct dentry *parent) { - struct ncp_server *server = NCP_SERVER(parent->d_inode); + struct ncp_server *server = NCP_SERVER(d_inode(parent)); struct dentry *dentry; spin_lock(&parent->d_lock); @@ -418,7 +418,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent) static int ncp_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct page *page = NULL; struct ncp_server *server = NCP_SERVER(inode); union ncp_dir_cache *cache = NULL; @@ -491,13 +491,13 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx) goto invalid_cache; } spin_unlock(&dentry->d_lock); - if (!dent->d_inode) { + if (d_really_is_negative(dent)) { dput(dent); goto invalid_cache; } over = !dir_emit(ctx, dent->d_name.name, dent->d_name.len, - dent->d_inode->i_ino, DT_UNKNOWN); + d_inode(dent)->i_ino, DT_UNKNOWN); dput(dent); if (over) goto finished; @@ -571,7 +571,7 @@ static void ncp_d_prune(struct dentry *dentry) { if (!dentry->d_fsdata) /* not referenced from page cache */ return; - NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE; + NCP_FINFO(d_inode(dentry->d_parent))->flags &= ~NCPI_DIR_CACHE; } static int @@ -580,7 +580,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, int inval_childs) { struct dentry *newdent, *dentry = file->f_path.dentry; - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct ncp_cache_control ctl = *ctrl; struct qstr qname; int valid = 0; @@ -621,7 +621,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, dentry_update_name_case(newdent, &qname); } - if (!newdent->d_inode) { + if (d_really_is_negative(newdent)) { struct inode *inode; entry->opened = 0; @@ -637,7 +637,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, spin_unlock(&dentry->d_lock); } } else { - struct inode *inode = newdent->d_inode; + struct inode *inode = d_inode(newdent); mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); @@ -659,10 +659,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, ctl.cache = kmap(ctl.page); } if (ctl.cache) { - if (newdent->d_inode) { + if (d_really_is_positive(newdent)) { newdent->d_fsdata = newdent; ctl.cache->dentry[ctl.idx] = newdent; - ino = newdent->d_inode->i_ino; + ino = d_inode(newdent)->i_ino; ncp_new_dentry(newdent); } valid = 1; @@ -807,7 +807,7 @@ int ncp_conn_logged_in(struct super_block *sb) } dent = sb->s_root; if (dent) { - struct inode* ino = dent->d_inode; + struct inode* ino = d_inode(dent); if (ino) { ncp_update_known_namespace(server, volNumber, NULL); NCP_FINFO(ino)->volNumber = volNumber; @@ -815,7 +815,7 @@ int ncp_conn_logged_in(struct super_block *sb) NCP_FINFO(ino)->DosDirNum = DosDirNum; result = 0; } else { - ncp_dbg(1, "sb->s_root->d_inode == NULL!\n"); + ncp_dbg(1, "d_inode(sb->s_root) == NULL!\n"); } } else { ncp_dbg(1, "sb->s_root == NULL!\n"); @@ -1055,7 +1055,7 @@ out: static int ncp_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ncp_server *server; int error; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 01a9e16e9782..9605a2f63549 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -812,7 +812,7 @@ static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) if (!d) { goto dflt; } - i = d->d_inode; + i = d_inode(d); if (!i) { goto dflt; } @@ -865,7 +865,7 @@ dflt:; int ncp_notify_change(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int result = 0; __le32 info_mask; struct nw_modify_dos_info info; @@ -878,7 +878,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) goto out; result = -EPERM; - if (IS_DEADDIR(dentry->d_inode)) + if (IS_DEADDIR(d_inode(dentry))) goto out; /* ageing the dentry to force validation */ diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index cf7e043a9447..79b113048eac 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -376,7 +376,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg struct dentry* dentry = inode->i_sb->s_root; if (dentry) { - struct inode* s_inode = dentry->d_inode; + struct inode* s_inode = d_inode(dentry); if (s_inode) { sr.volNumber = NCP_FINFO(s_inode)->volNumber; @@ -384,7 +384,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg sr.namespace = server->name_space[sr.volNumber]; result = 0; } else - ncp_dbg(1, "s_root->d_inode==NULL\n"); + ncp_dbg(1, "d_inode(s_root)==NULL\n"); } else ncp_dbg(1, "s_root==NULL\n"); } else { @@ -431,7 +431,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg if (result == 0) { dentry = inode->i_sb->s_root; if (dentry) { - struct inode* s_inode = dentry->d_inode; + struct inode* s_inode = d_inode(dentry); if (s_inode) { NCP_FINFO(s_inode)->volNumber = vnum; @@ -439,7 +439,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg NCP_FINFO(s_inode)->DosDirNum = dosde; server->root_setuped = 1; } else { - ncp_dbg(1, "s_root->d_inode==NULL\n"); + ncp_dbg(1, "d_inode(s_root)==NULL\n"); result = -EIO; } } else { diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 2b502a0d7941..88dbbc9fcf4d 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -727,7 +727,7 @@ int ncp_del_file_or_subdir2(struct ncp_server *server, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); __u8 volnum; __le32 dirent; diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 1a63bfdb4a65..421b6f91e8ec 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -156,7 +156,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { goto failfree; } - inode=dentry->d_inode; + inode=d_inode(dentry); if (ncp_make_open(inode, O_WRONLY)) goto failfree; diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 1e987acf20c9..8664417955a2 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -22,7 +22,7 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o obj-$(CONFIG_NFS_V4) += nfsv4.o CFLAGS_nfs4trace.o += -I$(src) nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ - delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ + delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \ dns_resolve.o nfs4trace.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 1cac3c175d18..d2554fe140a3 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -890,6 +890,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .free_deviceid_node = bl_free_deviceid_node, .pg_read_ops = &bl_pg_read_ops, .pg_write_ops = &bl_pg_write_ops, + .sync = pnfs_generic_sync, }; static int __init nfs4blocklayout_init(void) diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 5aed4f98df41..e535599a0719 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -33,7 +33,7 @@ bl_free_deviceid_node(struct nfs4_deviceid_node *d) container_of(d, struct pnfs_block_dev, node); bl_free_device(dev); - kfree(dev); + kfree_rcu(dev, node.rcu); } static int diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 351be9205bf8..8d129bb7355a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -128,7 +128,7 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -142,10 +142,10 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - /* schedule_timeout to game the hung task watchdog */ - schedule_timeout(60 * HZ); + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } + flush_signals(current); } return 0; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 19874151e95c..892aefff3630 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -31,7 +31,6 @@ #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index a6ad68865880..029d688a969f 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -378,7 +378,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct if (freeme == NULL) goto out; } - list_add_rcu(&delegation->super_list, &server->delegations); + list_add_tail_rcu(&delegation->super_list, &server->delegations); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -514,7 +514,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) delegation = nfs_inode_detach_delegation(inode); if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); + nfs_do_return_delegation(inode, delegation, 1); } /** diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c19e16f0b2d0..b2c8b31b2be7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -416,15 +416,14 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { struct nfs_inode *nfsi; - if (dentry->d_inode == NULL) - goto different; + if (d_really_is_negative(dentry)) + return 0; - nfsi = NFS_I(dentry->d_inode); + nfsi = NFS_I(d_inode(dentry)); if (entry->fattr->fileid == nfsi->fileid) return 1; if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) return 1; -different: return 0; } @@ -473,7 +472,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct qstr filename = QSTR_INIT(entry->name, entry->len); struct dentry *dentry; struct dentry *alias; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; int status; @@ -497,9 +496,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) goto out; if (nfs_same_file(dentry, entry)) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - status = nfs_refresh_inode(dentry->d_inode, entry->fattr); + status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) - nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label); + nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label); goto out; } else { d_invalidate(dentry); @@ -544,6 +543,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (scratch == NULL) return -ENOMEM; + if (buflen == 0) + goto out_nopages; + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); @@ -565,6 +567,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } while (!entry->eof); +out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { array = nfs_readdir_get_array(page); if (!IS_ERR(array)) { @@ -870,7 +873,7 @@ static bool nfs_dir_mapping_need_revalidate(struct inode *dir) static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); nfs_readdir_descriptor_t my_desc, *desc = &my_desc; struct nfs_open_dir_context *dir_ctx = file->private_data; @@ -1118,15 +1121,15 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) { parent = ACCESS_ONCE(dentry->d_parent); - dir = ACCESS_ONCE(parent->d_inode); + dir = d_inode_rcu(parent); if (!dir) return -ECHILD; } else { parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); } nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) { if (nfs_neg_need_reval(dir, dentry, flags)) { @@ -1242,7 +1245,7 @@ out_error: } /* - * A weaker form of d_revalidate for revalidating just the dentry->d_inode + * A weaker form of d_revalidate for revalidating just the d_inode(dentry) * when we don't really care about the dentry name. This is called when a * pathwalk ends on a dentry that was not found via a normal lookup in the * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). @@ -1253,7 +1256,7 @@ out_error: static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); /* * I believe we can only get a negative dentry here in the case of a @@ -1287,7 +1290,7 @@ static int nfs_dentry_delete(const struct dentry *dentry) dentry, dentry->d_flags); /* Unhash any dentry with a stale inode */ - if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) + if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry))) return 1; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { @@ -1491,7 +1494,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, int err; /* Expect a negative dentry */ - BUG_ON(dentry->d_inode); + BUG_ON(d_inode(dentry)); dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); @@ -1587,7 +1590,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) goto no_open; - inode = dentry->d_inode; + inode = d_inode(dentry); /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. @@ -1598,12 +1601,12 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) { parent = ACCESS_ONCE(dentry->d_parent); - dir = ACCESS_ONCE(parent->d_inode); + dir = d_inode_rcu(parent); if (!dir) return -ECHILD; } else { parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); } if (!nfs_neg_need_reval(dir, dentry, flags)) ret = 1; @@ -1643,14 +1646,14 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs4_label *label) { struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; int error = -EACCES; d_drop(dentry); /* We may have been initialized further down */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out; if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); @@ -1768,7 +1771,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { - if (dentry->d_inode != NULL && !d_unhashed(dentry)) + if (d_really_is_positive(dentry) && !d_unhashed(dentry)) d_delete(dentry); } @@ -1780,13 +1783,13 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { nfs_wait_on_sillyrename(dentry); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ switch (error) { case 0: - clear_nlink(dentry->d_inode); + clear_nlink(d_inode(dentry)); break; case -ENOENT: nfs_dentry_handle_enoent(dentry); @@ -1808,8 +1811,8 @@ EXPORT_SYMBOL_GPL(nfs_rmdir); */ static int nfs_safe_remove(struct dentry *dentry) { - struct inode *dir = dentry->d_parent->d_inode; - struct inode *inode = dentry->d_inode; + struct inode *dir = d_inode(dentry->d_parent); + struct inode *inode = d_inode(dentry); int error = -EBUSY; dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry); @@ -1853,7 +1856,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) if (d_count(dentry) > 1) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ - write_inode_now(dentry->d_inode, 0); + write_inode_now(d_inode(dentry), 0); error = nfs_sillyrename(dir, dentry); goto out; } @@ -1931,7 +1934,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0, + if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0, GFP_KERNEL)) { SetPageUptodate(page); unlock_page(page); @@ -1950,7 +1953,7 @@ EXPORT_SYMBOL_GPL(nfs_symlink); int nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n", @@ -1997,8 +2000,8 @@ EXPORT_SYMBOL_GPL(nfs_link); int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct dentry *dentry = NULL, *rehash = NULL; struct rpc_task *task; int error = -EBUSY; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 682f65fe09b5..38678d9a5cc4 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -129,22 +129,25 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) int i; ssize_t count; - WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count); - - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - - /* update the dreq->count by finding the minimum agreed count from all - * mirrors */ - count = dreq->mirrors[0].count; + if (dreq->mirror_count == 1) { + dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; + dreq->count += hdr->good_bytes; + } else { + /* mirrored writes */ + count = dreq->mirrors[hdr->pgio_mirror_idx].count; + if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { + count = hdr->io_start + hdr->good_bytes - dreq->io_start; + dreq->mirrors[hdr->pgio_mirror_idx].count = count; + } + /* update the dreq->count by finding the minimum agreed count from all + * mirrors */ + count = dreq->mirrors[0].count; - for (i = 1; i < dreq->mirror_count; i++) - count = min(count, dreq->mirrors[i].count); + for (i = 1; i < dreq->mirror_count; i++) + count = min(count, dreq->mirrors[i].count); - dreq->count = count; + dreq->count = count; + } } /* @@ -258,18 +261,11 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) if (!IS_SWAPFILE(inode)) return 0; -#ifndef CONFIG_NFS_SWAP - dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp, (long long) pos, iter->nr_segs); - - return -EINVAL; -#else VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) return nfs_file_direct_read(iocb, iter, pos); return nfs_file_direct_write(iocb, iter); -#endif /* CONFIG_NFS_SWAP */ } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -386,7 +382,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) if (write) nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_done(inode); + inode_dio_end(inode); if (dreq->iocb) { long res = (long) dreq->error; @@ -403,8 +399,8 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) static void nfs_direct_readpage_release(struct nfs_page *req) { dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", - req->wb_context->dentry->d_inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), + d_inode(req->wb_context->dentry)->i_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); @@ -486,7 +482,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, &nfs_direct_read_completion_ops); get_dreq(dreq); desc.pg_dreq = dreq; - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); while (iov_iter_count(iter)) { struct page **pagevec; @@ -538,7 +534,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { - inode_dio_done(inode); + inode_dio_end(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } @@ -872,7 +868,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); NFS_I(inode)->write_io += iov_iter_count(iter); while (iov_iter_count(iter)) { @@ -928,7 +924,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { - inode_dio_done(inode); + inode_dio_end(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } @@ -1030,6 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (i_size_read(inode) < iocb->ki_pos) i_size_write(inode, iocb->ki_pos); spin_unlock(&inode->i_lock); + generic_write_sync(file, pos, result); } } nfs_direct_req_release(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c40e4363e746..8b8d83a526ce 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -280,6 +280,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -782,7 +783,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * Flush all pending writes before doing anything * with locks.. */ - nfs_sync_mapping(filp->f_mapping); + vfs_fsync(filp, 0); l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 91e88a7ecef0..a46bf6de9ce4 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -258,7 +258,8 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) hdr->res.verf->committed != NFS_DATA_SYNC) return; - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -373,7 +374,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, } if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -1086,7 +1087,7 @@ filelayout_alloc_deviceid_node(struct nfs_server *server, } static void -filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +filelayout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); } @@ -1137,7 +1138,8 @@ static struct pnfs_layoutdriver_type filelayout_type = { .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, - .free_deviceid_node = filelayout_free_deveiceid_node, + .free_deviceid_node = filelayout_free_deviceid_node, + .sync = pnfs_nfs_generic_sync, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4f372e224603..4946ef40ba87 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -55,7 +55,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) nfs4_pnfs_ds_put(ds); } kfree(dsaddr->stripe_indices); - kfree(dsaddr); + kfree_rcu(dsaddr, id_node.rcu); } /* Decode opaque device data and return the result */ diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 315cc68945b9..7d05089e52d6 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -11,10 +11,10 @@ #include <linux/module.h> #include <linux/sunrpc/metrics.h> -#include <linux/nfs_idmap.h> #include "flexfilelayout.h" #include "../nfs4session.h" +#include "../nfs4idmap.h" #include "../internal.h" #include "../delegation.h" #include "../nfs4trace.h" @@ -891,7 +891,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, static void ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) { - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -1074,7 +1075,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -1414,7 +1415,7 @@ ff_layout_get_ds_info(struct inode *inode) } static void -ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d) +ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, id_node)); @@ -1498,7 +1499,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, - .free_deviceid_node = ff_layout_free_deveiceid_node, + .free_deviceid_node = ff_layout_free_deviceid_node, .mark_request_commit = pnfs_layout_mark_request_commit, .clear_request_commit = pnfs_generic_clear_request_commit, .scan_commit_lists = pnfs_generic_scan_commit_lists, @@ -1508,6 +1509,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, .encode_layoutreturn = ff_layout_encode_layoutreturn, + .sync = pnfs_nfs_generic_sync, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e2c01f204a95..77a2d026aa12 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -30,7 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); - kfree(mirror_ds); + kfree_rcu(mirror_ds, id_node.rcu); } /* Decode opaque device data and construct new_ds using it */ diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 9ac3846cb59e..a608ffd28acc 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -56,11 +56,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i * This again causes shrink_dcache_for_umount_subtree() to * Oops, since the test for IS_ROOT() will fail. */ - spin_lock(&sb->s_root->d_inode->i_lock); + spin_lock(&d_inode(sb->s_root)->i_lock); spin_lock(&sb->s_root->d_lock); hlist_del_init(&sb->s_root->d_u.d_alias); spin_unlock(&sb->s_root->d_lock); - spin_unlock(&sb->s_root->d_inode->i_lock); + spin_unlock(&d_inode(sb->s_root)->i_lock); } return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d42dff6d5e98..f734562c6d24 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -133,6 +133,13 @@ void nfs_evict_inode(struct inode *inode) nfs_clear_inode(inode); } +int nfs_sync_inode(struct inode *inode) +{ + nfs_inode_dio_wait(inode); + return nfs_wb_all(inode); +} +EXPORT_SYMBOL_GPL(nfs_sync_inode); + /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk */ @@ -192,7 +199,6 @@ void nfs_zap_caches(struct inode *inode) nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } -EXPORT_SYMBOL_GPL(nfs_zap_caches); void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { @@ -495,7 +501,7 @@ EXPORT_SYMBOL_GPL(nfs_fhget); int nfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; int error = -ENOMEM; @@ -525,10 +531,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - nfs_wb_all(inode); - } + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -621,7 +625,7 @@ static void nfs_request_parent_use_readdirplus(struct dentry *dentry) struct dentry *parent; parent = dget_parent(dentry); - nfs_force_use_readdirplus(parent->d_inode); + nfs_force_use_readdirplus(d_inode(parent)); dput(parent); } @@ -637,15 +641,16 @@ static bool nfs_need_revalidate_inode(struct inode *inode) int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err = 0; trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - err = filemap_write_and_wait(inode->i_mapping); + mutex_lock(&inode->i_mutex); + err = nfs_sync_inode(inode); + mutex_unlock(&inode->i_mutex); if (err) goto out; } @@ -708,7 +713,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *res, *new = NULL; - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); @@ -736,7 +741,7 @@ EXPORT_SYMBOL_GPL(nfs_get_lock_context); void nfs_put_lock_context(struct nfs_lock_context *l_ctx) { struct nfs_open_context *ctx = l_ctx->open_context; - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; @@ -763,7 +768,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) return; if (!is_sync) return; - inode = ctx->dentry->d_inode; + inode = d_inode(ctx->dentry); if (!list_empty(&NFS_I(inode)->open_files)) return; server = NFS_SERVER(inode); @@ -810,7 +815,7 @@ EXPORT_SYMBOL_GPL(get_nfs_open_context); static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; if (!list_empty(&ctx->list)) { @@ -842,7 +847,7 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context); */ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); @@ -885,7 +890,7 @@ static void nfs_file_clear_open_context(struct file *filp) struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); filp->private_data = NULL; spin_lock(&inode->i_lock); @@ -1588,6 +1593,19 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); + +static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, + struct nfs_fattr *fattr) +{ + bool ret1 = true, ret2 = true; + + if (fattr->valid & NFS_ATTR_FATTR_FILEID) + ret1 = (nfsi->fileid == fattr->fileid); + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + ret2 = (nfsi->fileid == fattr->mounted_on_fileid); + return ret1 || ret2; +} + /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state @@ -1614,7 +1632,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) { + if (!nfs_fileid_valid(nfsi, fattr)) { printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, @@ -1819,7 +1837,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; - nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index b5a0afc3ee10..c8162c660c44 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -139,7 +139,7 @@ EXPORT_SYMBOL_GPL(nfs_path); struct vfsmount *nfs_d_automount(struct path *path) { struct vfsmount *mnt; - struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); + struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; @@ -180,16 +180,16 @@ out_nofree: static int nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - if (NFS_FH(dentry->d_inode)->size != 0) + if (NFS_FH(d_inode(dentry))->size != 0) return nfs_getattr(mnt, dentry, stat); - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return 0; } static int nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) { - if (NFS_FH(dentry->d_inode)->size != 0) + if (NFS_FH(d_inode(dentry))->size != 0) return nfs_setattr(dentry, attr); return -EACCES; } @@ -279,7 +279,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, struct dentry *parent = dget_parent(dentry); /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL); + err = server->nfs_client->rpc_ops->lookup(d_inode(parent), &dentry->d_name, fh, fattr, NULL); dput(parent); if (err != 0) return ERR_PTR(err); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 658e586ca438..1ebe2fc7cda2 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -279,7 +279,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, ssize_t nfs3_listxattr(struct dentry *dentry, char *data, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); ssize_t result = 0; int error; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1f11d2533ee4..cb28cceefebe 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -120,7 +120,7 @@ static int nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nfs3_sattrargs arg = { .fh = NFS_FH(inode), .sattr = sattr, @@ -386,13 +386,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, * not sure this buys us anything (and I'd have * to revamp the NFSv3 XDR code) */ status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); - nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); + nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) goto out_release_acls; } - status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); out_release_acls: posix_acl_release(acl); @@ -570,7 +570,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) if (status != 0) goto out_release_acls; - status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); out_release_acls: posix_acl_release(acl); @@ -623,7 +623,7 @@ static int nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); __be32 *verf = NFS_I(dir)->cookieverf; struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), @@ -715,7 +715,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status != 0) goto out_release_acls; - status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); out_release_acls: posix_acl_release(acl); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index cb170722769c..3a9e75235f30 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -36,13 +36,16 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, loff_t offset, loff_t len) { struct inode *inode = file_inode(filep); + struct nfs_server *server = NFS_SERVER(inode); struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, .falloc_length = len, + .falloc_bitmask = server->cache_consistency_bitmask, + }; + struct nfs42_falloc_res res = { + .falloc_server = server, }; - struct nfs42_falloc_res res; - struct nfs_server *server = NFS_SERVER(inode); int status; msg->rpc_argp = &args; @@ -52,8 +55,17 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (status) return status; - return nfs4_call_sync(server->client, server, msg, - &args.seq_args, &res.seq_res, 0); + res.falloc_fattr = nfs_alloc_fattr(); + if (!res.falloc_fattr) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, msg, + &args.seq_args, &res.seq_res, 0); + if (status == 0) + status = nfs_post_op_update_inode(inode, res.falloc_fattr); + + kfree(res.falloc_fattr); + return status; } static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, @@ -84,9 +96,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } @@ -101,9 +117,16 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; + nfs_wb_all(inode); + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 038a7e1521fa..1a25b27248f2 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -25,16 +25,20 @@ #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_allocate_maxsz) + encode_allocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_allocate_maxsz) + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_deallocate_maxsz) + encode_deallocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_deallocate_maxsz) + decode_deallocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) @@ -92,6 +96,7 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -110,6 +115,7 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_deallocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -183,6 +189,9 @@ static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); out: return status; } @@ -207,6 +216,9 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_deallocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); out: return status; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 86d6214ea022..e42be52a8c18 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -4,7 +4,6 @@ */ #include <linux/module.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_mount.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/auth.h> @@ -15,6 +14,7 @@ #include "callback.h" #include "delegation.h" #include "nfs4session.h" +#include "nfs4idmap.h" #include "pnfs.h" #include "netns.h" @@ -1130,7 +1130,7 @@ error: */ static int nfs_probe_destination(struct nfs_server *server) { - struct inode *inode = server->super->s_root->d_inode; + struct inode *inode = d_inode(server->super->s_root); struct nfs_fattr *fattr; int error; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 0181cde1d102..f58c17b3b480 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -10,6 +10,8 @@ #include "fscache.h" #include "pnfs.h" +#include "nfstrace.h" + #ifdef CONFIG_NFS_V4_2 #include "nfs42.h" #endif @@ -46,7 +48,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) openflags &= ~(O_CREAT|O_EXCL); parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); err = PTR_ERR(ctx); @@ -57,7 +59,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_wb_all(inode); + nfs_sync_inode(inode); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); @@ -74,7 +76,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) goto out_drop; } } - if (inode != dentry->d_inode) + if (inode != d_inode(dentry)) goto out_drop; nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); @@ -100,6 +102,9 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct inode *inode = file_inode(file); + trace_nfs_fsync_enter(inode); + + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -107,7 +112,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) mutex_lock(&inode->i_mutex); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) - ret = pnfs_layoutcommit_inode(inode, true); + ret = pnfs_sync_inode(inode, !!datasync); mutex_unlock(&inode->i_mutex); /* * If nfs_file_fsync_commit detected a server reboot, then @@ -118,6 +123,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) end = LLONG_MAX; } while (ret == -EAGAIN); + trace_nfs_fsync_exit(inode, ret); return ret; } @@ -152,15 +158,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); if (mode & FALLOC_FL_PUNCH_HOLE) - ret = nfs42_proc_deallocate(filep, offset, len); - else - ret = nfs42_proc_allocate(filep, offset, len); - mutex_unlock(&inode->i_mutex); - - nfs_zap_caches(inode); - return ret; + return nfs42_proc_deallocate(filep, offset, len); + return nfs42_proc_allocate(filep, offset, len); } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/nfs4idmap.c index 857e2a99acc8..2e1737c40a29 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -36,7 +36,6 @@ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> -#include <linux/nfs_idmap.h> #include <net/net_namespace.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> @@ -49,6 +48,7 @@ #include "internal.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4trace.h" #define NFS_UINT_MAXLEN 11 diff --git a/fs/nfs/nfs4idmap.h b/fs/nfs/nfs4idmap.h new file mode 100644 index 000000000000..de44d7330ab3 --- /dev/null +++ b/fs/nfs/nfs4idmap.h @@ -0,0 +1,68 @@ +/* + * fs/nfs/nfs4idmap.h + * + * UID and GID to name mapping for clients. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef NFS_IDMAP_H +#define NFS_IDMAP_H + +#include <linux/uidgid.h> +#include <uapi/linux/nfs_idmap.h> + + +/* Forward declaration to make this header independent of others */ +struct nfs_client; +struct nfs_server; +struct nfs_fattr; +struct nfs4_string; + +int nfs_idmap_init(void); +void nfs_idmap_quit(void); +int nfs_idmap_new(struct nfs_client *); +void nfs_idmap_delete(struct nfs_client *); + +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name); +void nfs_fattr_free_names(struct nfs_fattr *); +void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *); + +int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, kuid_t *); +int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t *); +int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t); +int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t); + +int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res); + +extern unsigned int nfs_idmap_cache_timeout; +#endif /* NFS_IDMAP_H */ diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3d83cb1fdc70..f592672373cb 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -375,7 +375,7 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * dprintk("%s: getting locations for %pd2\n", __func__, dentry); - err = nfs4_proc_fs_locations(client, parent->d_inode, &dentry->d_name, fs_locations, page); + err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page); dput(parent); if (err != 0 || fs_locations->nlocations <= 0 || @@ -396,7 +396,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, { rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct qstr *name = &dentry->d_name; struct rpc_clnt *client; struct vfsmount *mnt; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 627f37c44456..55e1e3af23a3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -38,6 +38,7 @@ #include <linux/mm.h> #include <linux/delay.h> #include <linux/errno.h> +#include <linux/file.h> #include <linux/string.h> #include <linux/ratelimit.h> #include <linux/printk.h> @@ -51,7 +52,6 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/module.h> -#include <linux/nfs_idmap.h> #include <linux/xattr.h> #include <linux/utsname.h> #include <linux/freezer.h> @@ -63,6 +63,7 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "fscache.h" @@ -185,7 +186,8 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA - | FATTR4_WORD1_TIME_MODIFY, + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID, #ifdef CONFIG_NFS_V4_SECURITY_LABEL FATTR4_WORD2_SECURITY_LABEL #endif @@ -293,7 +295,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = xdr_one; /* bitmap length */ *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ *p++ = htonl(8); /* attribute buffer length */ - p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode)); + p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry))); } *p++ = xdr_one; /* next */ @@ -305,7 +307,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = xdr_one; /* bitmap length */ *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ *p++ = htonl(8); /* attribute buffer length */ - p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode)); + p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; @@ -1004,7 +1006,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, gfp_t gfp_mask) { struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct nfs_server *server = NFS_SERVER(dir); struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); struct nfs4_opendata *p; @@ -1057,7 +1059,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: - p->o_arg.fh = NFS_FH(dentry->d_inode); + p->o_arg.fh = NFS_FH(d_inode(dentry)); } if (attrs != NULL && attrs->ia_valid != 0) { __u32 verf[2]; @@ -1794,7 +1796,7 @@ static const struct rpc_call_ops nfs4_open_confirm_ops = { */ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) { - struct nfs_server *server = NFS_SERVER(data->dir->d_inode); + struct nfs_server *server = NFS_SERVER(d_inode(data->dir)); struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], @@ -1951,7 +1953,7 @@ static const struct rpc_call_ops nfs4_open_ops = { static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) { - struct inode *dir = data->dir->d_inode; + struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); struct nfs_openargs *o_arg = &data->o_arg; struct nfs_openres *o_res = &data->o_res; @@ -1998,7 +2000,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) static int _nfs4_recover_proc_open(struct nfs4_opendata *data) { - struct inode *dir = data->dir->d_inode; + struct inode *dir = d_inode(data->dir); struct nfs_openres *o_res = &data->o_res; int status; @@ -2067,7 +2069,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, */ static int _nfs4_proc_open(struct nfs4_opendata *data) { - struct inode *dir = data->dir->d_inode; + struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); struct nfs_openargs *o_arg = &data->o_arg; struct nfs_openres *o_res = &data->o_res; @@ -2314,7 +2316,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); dentry = opendata->dentry; - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { /* FIXME: Is this d_drop() ever needed? */ d_drop(dentry); dentry = d_add_unique(dentry, igrab(state->inode)); @@ -2325,7 +2327,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ctx->dentry = dget(dentry); } nfs_set_verifier(dentry, - nfs_save_change_attribute(opendata->dir->d_inode)); + nfs_save_change_attribute(d_inode(opendata->dir))); } ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); @@ -2333,7 +2335,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, goto out; ctx->state = state; - if (dentry->d_inode == state->inode) { + if (d_inode(dentry) == state->inode) { nfs_inode_attach_open_context(ctx); if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) nfs4_schedule_stateid_recovery(server, state); @@ -2374,10 +2376,10 @@ static int _nfs4_do_open(struct inode *dir, status = nfs4_recover_expired_lease(server); if (status != 0) goto err_put_state_owner; - if (dentry->d_inode != NULL) - nfs4_return_incompatible_delegation(dentry->d_inode, fmode); + if (d_really_is_positive(dentry)) + nfs4_return_incompatible_delegation(d_inode(dentry), fmode); status = -ENOMEM; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) claim = NFS4_OPEN_CLAIM_FH; opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, label, claim, GFP_KERNEL); @@ -2400,8 +2402,8 @@ static int _nfs4_do_open(struct inode *dir, } opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } - if (dentry->d_inode != NULL) - opendata->state = nfs4_get_open_state(dentry->d_inode, sp); + if (d_really_is_positive(dentry)) + opendata->state = nfs4_get_open_state(d_inode(dentry), sp); status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx); if (status != 0) @@ -3095,16 +3097,13 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info, bool auth_probe) { - int status; + int status = 0; - switch (auth_probe) { - case false: + if (!auth_probe) status = nfs4_lookup_root(server, fhandle, info); - if (status != -NFS4ERR_WRONGSEC) - break; - default: + + if (auth_probe || status == NFS4ERR_WRONGSEC) status = nfs4_do_find_root_sec(server, fhandle, info); - } if (status == 0) status = nfs4_server_capabilities(server, fhandle); @@ -3254,7 +3253,7 @@ static int nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct rpc_cred *cred = NULL; struct nfs4_state *state = NULL; struct nfs4_label *label = NULL; @@ -3871,13 +3870,13 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), .pages = pages, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask, .plus = plus, }; struct nfs4_readdir_res res; @@ -3914,8 +3913,8 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, do { err = _nfs4_proc_readdir(dentry, cred, cookie, pages, count, plus); - trace_nfs4_readdir(dentry->d_inode, err); - err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), err, + trace_nfs4_readdir(d_inode(dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, &exception); } while (exception.retry); return err; @@ -4830,7 +4829,7 @@ nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) struct nfs4_label ilabel, *olabel = NULL; struct nfs_fattr fattr; struct rpc_cred *cred; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int status; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -5606,6 +5605,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->server = server; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); + get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); return p; out_free_seqid: @@ -5670,7 +5670,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; switch (task->tk_status) { case 0: - renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), + renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), data->timestamp); if (data->arg.new_lock) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); @@ -5718,6 +5718,7 @@ static void nfs4_lock_release(void *calldata) nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); + fput(data->fl.fl_file); kfree(data); dprintk("%s: done!\n", __func__); } @@ -6112,7 +6113,7 @@ static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, if (strcmp(key, "") != 0) return -EINVAL; - return nfs4_proc_set_acl(dentry->d_inode, buf, buflen); + return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); } static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, @@ -6121,7 +6122,7 @@ static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, if (strcmp(key, "") != 0) return -EINVAL; - return nfs4_proc_get_acl(dentry->d_inode, buf, buflen); + return nfs4_proc_get_acl(d_inode(dentry), buf, buflen); } static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, @@ -6130,7 +6131,7 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, { size_t len = sizeof(XATTR_NAME_NFSV4_ACL); - if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) + if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)))) return 0; if (list && len <= list_len) @@ -6158,7 +6159,7 @@ static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key, void *buf, size_t buflen, int type) { if (security_ismaclabel(key)) - return nfs4_get_security_label(dentry->d_inode, buf, buflen); + return nfs4_get_security_label(d_inode(dentry), buf, buflen); return -EOPNOTSUPP; } @@ -6168,10 +6169,10 @@ static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list, { size_t len = 0; - if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) { - len = security_inode_listsecurity(dentry->d_inode, NULL, 0); + if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(d_inode(dentry), NULL, 0); if (list && len <= list_len) - security_inode_listsecurity(dentry->d_inode, list, len); + security_inode_listsecurity(d_inode(dentry), list, len); } return len; } @@ -7944,6 +7945,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, { struct nfs4_getdeviceinfo_args args = { .pdev = pdev, + .notify_types = NOTIFY_DEVICEID4_CHANGE | + NOTIFY_DEVICEID4_DELETE, }; struct nfs4_getdeviceinfo_res res = { .pdev = pdev, @@ -7958,6 +7961,11 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, dprintk("--> %s\n", __func__); status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (res.notification & ~args.notify_types) + dprintk("%s: unsupported notification\n", __func__); + if (res.notification != args.notify_types) + pdev->nocache = 1; + dprintk("<-- %s status=%d\n", __func__, status); return status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f95e3b58bbc3..2782cfca2265 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -42,7 +42,6 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> @@ -57,6 +56,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -1902,7 +1902,7 @@ static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) goto out; } - inode = server->super->s_root->d_inode; + inode = d_inode(server->super->s_root); result = nfs4_proc_get_locations(inode, locations, page, cred); if (result) { dprintk("<-- %s: failed to retrieve fs_locations: %d\n", @@ -2021,7 +2021,7 @@ restart: rcu_read_unlock(); - inode = server->super->s_root->d_inode; + inode = d_inode(server->super->s_root); status = nfs4_proc_fsid_present(inode, cred); if (status != -NFS4ERR_MOVED) goto restart; /* wasn't this one */ diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 75090feeafad..6fb7cb6b3f4b 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -3,12 +3,12 @@ */ #include <linux/init.h> #include <linux/module.h> -#include <linux/nfs_idmap.h> #include <linux/nfs4_mount.h> #include <linux/nfs_fs.h> #include "delegation.h" #include "internal.h" #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "dns_resolve.h" #include "pnfs.h" #include "nfs.h" @@ -91,10 +91,11 @@ static void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! */ nfs_inode_return_delegation_noreclaim(inode); + /* Note that above delegreturn would trigger pnfs return-on-close */ + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); } diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index b6ebe7e445f6..0fbd3ab1be22 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -6,10 +6,10 @@ * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com> */ #include <linux/sysctl.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "callback.h" static const int nfs_set_port_min = 0; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 1c32adbe728d..470af1a78bec 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -418,7 +418,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fileid = 0; __entry->fhandle = 0; } - __entry->dir = NFS_FILEID(ctx->dentry->d_parent->d_inode); + __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent)); __assign_str(name, ctx->dentry->d_name.name); ), @@ -1110,7 +1110,7 @@ TRACE_EVENT(nfs4_layoutget, ), TP_fast_assign( - const struct inode *inode = ctx->dentry->d_inode; + const struct inode *inode = d_inode(ctx->dentry); __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5c399ec41079..0aea97841d30 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,10 +52,10 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include "nfs4_fs.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -1920,7 +1920,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr, p = reserve_space(xdr, 4 + 4); *p++ = cpu_to_be32(1); /* bitmap length */ - *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE); + *p++ = cpu_to_be32(args->notify_types); } static void @@ -5753,8 +5753,9 @@ out_overflow: #if defined(CONFIG_NFS_V4_1) static int decode_getdeviceinfo(struct xdr_stream *xdr, - struct pnfs_device *pdev) + struct nfs4_getdeviceinfo_res *res) { + struct pnfs_device *pdev = res->pdev; __be32 *p; uint32_t len, type; int status; @@ -5802,12 +5803,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, if (unlikely(!p)) goto out_overflow; - if (be32_to_cpup(p++) & - ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) { - dprintk("%s: unsupported notification\n", - __func__); - } - + res->notification = be32_to_cpup(p++); for (i = 1; i < len; i++) { if (be32_to_cpup(p++)) { dprintk("%s: unsupported notification\n", @@ -7061,7 +7057,7 @@ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - status = decode_getdeviceinfo(xdr, res->pdev); + status = decode_getdeviceinfo(xdr, res); out: return status; } @@ -7365,6 +7361,11 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#define STUB(proc) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_name = #proc, \ +} + struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7417,6 +7418,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + STUB(GETDEVICELIST), PROC(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index 4eb0aead69b6..c74f7af23d77 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -7,3 +7,6 @@ #define CREATE_TRACE_POINTS #include "nfstrace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 24e1d7403c0b..5aaed363556a 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -57,7 +57,7 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) dprintk("%s: free od=%p\n", __func__, de->od.od); osduld_put_device(de->od.od); - kfree(de); + kfree_rcu(d, rcu); } struct objio_segment { @@ -637,6 +637,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .pg_read_ops = &objio_pg_read_ops, .pg_write_ops = &objio_pg_write_ops, + .sync = pnfs_generic_sync, + .free_deviceid_node = objio_free_deviceid_node, .encode_layoutcommit = objlayout_encode_layoutcommit, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d57190a0d533..282b39369510 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -938,7 +938,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, if (prev) { if (!nfs_match_open_context(req->wb_context, prev->wb_context)) return false; - flctx = req->wb_context->dentry->d_inode->i_flctx; + flctx = d_inode(req->wb_context->dentry)->i_flctx; if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4f802b02fbb9..230606243be6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1090,6 +1090,7 @@ bool pnfs_roc(struct inode *ino) pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); return true; out_noroc: @@ -1104,8 +1105,10 @@ out_noroc: } } spin_unlock(&ino->i_lock); - if (layoutreturn) + if (layoutreturn) { + pnfs_layoutcommit_inode(ino, true); pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + } return false; } @@ -1841,7 +1844,8 @@ void pnfs_ld_write_done(struct nfs_pgio_header *hdr) { trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); if (!hdr->pnfs_error) { - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); hdr->mds_ops->rpc_call_done(&hdr->task, hdr); } else pnfs_ld_handle_write_error(hdr); @@ -1902,7 +1906,6 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) @@ -2032,7 +2035,6 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) @@ -2099,64 +2101,34 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) +pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, + loff_t end_pos) { - struct inode *inode = hdr->inode; struct nfs_inode *nfsi = NFS_I(inode); - loff_t end_pos = hdr->mds_offset + hdr->res.count; bool mark_as_dirty = false; spin_lock(&inode->i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - mark_as_dirty = true; - dprintk("%s: Set layoutcommit for inode %lu ", - __func__, inode->i_ino); - } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { - /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(hdr->lseg); - } - if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - spin_unlock(&inode->i_lock); - dprintk("%s: lseg %p end_pos %llu\n", - __func__, hdr->lseg, nfsi->layout->plh_lwb); - - /* if pnfs_layoutcommit_inode() runs between inode locks, the next one - * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ - if (mark_as_dirty) - mark_inode_dirty_sync(inode); -} -EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); - -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) -{ - struct inode *inode = data->inode; - struct nfs_inode *nfsi = NFS_I(inode); - bool mark_as_dirty = false; - - spin_lock(&inode->i_lock); - if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, inode->i_ino); - } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { + } else if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(data->lseg); + pnfs_get_lseg(lseg); } - if (data->lwb > nfsi->layout->plh_lwb) - nfsi->layout->plh_lwb = data->lwb; spin_unlock(&inode->i_lock); dprintk("%s: lseg %p end_pos %llu\n", - __func__, data->lseg, nfsi->layout->plh_lwb); + __func__, lseg, nfsi->layout->plh_lwb); /* if pnfs_layoutcommit_inode() runs between inode locks, the next one * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ if (mark_as_dirty) mark_inode_dirty_sync(inode); } -EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit); +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) { @@ -2216,7 +2188,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; - nfsi->layout->plh_lwb = 0; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); spin_unlock(&inode->i_lock); @@ -2233,11 +2204,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = ld->prepare_layoutcommit(&data->args); if (status) { spin_lock(&inode->i_lock); - if (end_pos < nfsi->layout->plh_lwb) + set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); + if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; spin_unlock(&inode->i_lock); put_rpccred(data->cred); - set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); goto clear_layoutcommitting; } } @@ -2258,6 +2229,13 @@ clear_layoutcommitting: } EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); +int +pnfs_generic_sync(struct inode *inode, bool datasync) +{ + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_generic_sync); + struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { struct nfs4_threshold *thp; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 635f0865671c..1e6308f82fc3 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -155,6 +155,8 @@ struct pnfs_layoutdriver_type { int how, struct nfs_commit_info *cinfo); + int (*sync)(struct inode *inode, bool datasync); + /* * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS @@ -203,6 +205,7 @@ struct pnfs_device { struct page **pages; unsigned int pgbase; unsigned int pglen; /* reply buffer length */ + unsigned char nocache : 1;/* May not be cached */ }; #define NFS4_PNFS_GETDEVLIST_MAXNUM 16 @@ -263,10 +266,11 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_pgio_header *); -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data); +void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int pnfs_generic_sync(struct inode *inode, bool datasync); +int pnfs_nfs_generic_sync(struct inode *inode, bool datasync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); @@ -291,6 +295,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ + NFS_DEVICEID_NOCACHE, /* device may not be cached */ }; /* pnfs_dev.c */ @@ -302,6 +307,7 @@ struct nfs4_deviceid_node { unsigned long flags; unsigned long timestamp_unavailable; struct nfs4_deviceid deviceid; + struct rcu_head rcu; atomic_t ref; }; @@ -426,7 +432,7 @@ static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (lseg == NULL || ld->mark_request_commit == NULL) @@ -438,7 +444,7 @@ pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static inline bool pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (ld == NULL || ld->clear_request_commit == NULL) @@ -486,6 +492,14 @@ pnfs_ld_read_whole_page(struct inode *inode) return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return 0; + return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync); +} + static inline bool pnfs_layoutcommit_outstanding(struct inode *inode) { @@ -568,6 +582,12 @@ pnfs_ld_read_whole_page(struct inode *inode) return false; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + return 0; +} + static inline bool pnfs_roc(struct inode *ino) { diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index aa2ec0015183..2961fcd7a2df 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -149,6 +149,8 @@ nfs4_get_device_info(struct nfs_server *server, */ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, gfp_flags); + if (d && pdev->nocache) + set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: for (i = 0; i < max_pages; i++) @@ -175,8 +177,8 @@ __nfs4_find_get_deviceid(struct nfs_server *server, rcu_read_lock(); d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, hash); - if (d != NULL) - atomic_inc(&d->ref); + if (d != NULL && !atomic_inc_not_zero(&d->ref)) + d = NULL; rcu_read_unlock(); return d; } @@ -235,12 +237,11 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, return; } hlist_del_init_rcu(&d->node); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); spin_unlock(&nfs4_deviceid_lock); - synchronize_rcu(); /* balance the initial ref set in pnfs_insert_deviceid */ - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); @@ -271,6 +272,11 @@ EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) { + if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) { + if (atomic_add_unless(&d->ref, -1, 2)) + return false; + nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid); + } if (!atomic_dec_and_test(&d->ref)) return false; d->ld->free_deviceid_node(d); @@ -314,6 +320,7 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); hlist_add_head(&d->tmpnode, &tmp); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); } rcu_read_unlock(); spin_unlock(&nfs4_deviceid_lock); @@ -321,12 +328,10 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) if (hlist_empty(&tmp)) return; - synchronize_rcu(); while (!hlist_empty(&tmp)) { d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); hlist_del(&d->tmpnode); - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 54e36b38fb5f..f37e25b6311c 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -561,7 +561,7 @@ static bool load_v3_ds_connect(void) return(get_v3_ds_connect != NULL); } -void __exit nfs4_pnfs_v3_ds_connect_unload(void) +void nfs4_pnfs_v3_ds_connect_unload(void) { if (get_v3_ds_connect) { symbol_put(nfs3_set_ds_client); @@ -868,3 +868,13 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, nfs_request_add_commit_list(req, list, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); + +int +pnfs_nfs_generic_sync(struct inode *inode, bool datasync) +{ + if (datasync) + return 0; + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync); + diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index c63189acd052..b417bbcd9704 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -118,7 +118,7 @@ static int nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nfs_sattrargs arg = { .fh = NFS_FH(inode), .sattr = sattr @@ -487,7 +487,7 @@ static int nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), .cookie = cookie, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index b8f5c63f77b2..ae0ff7a11b40 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -117,7 +117,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, static void nfs_readpage_release(struct nfs_page *req) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), req->wb_bytes, @@ -284,7 +284,7 @@ int nfs_readpage(struct file *file, struct page *page) dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_inc_stats(inode, NFSIOS_READPAGES); + nfs_add_stats(inode, NFSIOS_READPAGES, 1); /* * Try to flush any pending writes to the file.. diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 322b2de02988..f175b833b6ba 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -43,7 +43,6 @@ #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> @@ -433,7 +432,7 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct nfs_server *server = NFS_SB(dentry->d_sb); unsigned char blockbits; unsigned long blockres; - struct nfs_fh *fh = NFS_FH(dentry->d_inode); + struct nfs_fh *fh = NFS_FH(d_inode(dentry)); struct nfs_fsstat res; int error = -ENOMEM; @@ -447,7 +446,7 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) pd_dentry = dget_parent(dentry); if (pd_dentry != NULL) { - nfs_zap_caches(pd_dentry->d_inode); + nfs_zap_caches(d_inode(pd_dentry)); dput(pd_dentry); } } @@ -2193,7 +2192,7 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->version != nfss->nfs_client->rpc_ops->version || data->minorversion != nfss->nfs_client->cl_minorversion || data->retrans != nfss->client->cl_timeout->to_retries || - data->selected_flavor != nfss->client->cl_auth->au_flavor || + !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) || data->acregmin != nfss->acregmin / HZ || data->acregmax != nfss->acregmax / HZ || data->acdirmin != nfss->acdirmin / HZ || @@ -2241,7 +2240,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; data->selected_flavor = nfss->client->cl_auth->au_flavor; - data->auth_info = nfss->auth_info; data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; @@ -2526,7 +2524,7 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { /* clone any lsm security options from the parent to the new sb */ - if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) + if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); } diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 05c9e02f4153..2d56200655fe 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -45,7 +45,7 @@ error: static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct page *page; void *err; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index de54129336c6..fa538b2ba251 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -143,7 +143,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n nfs_free_dname(data); ret = nfs_copy_dname(alias, data); spin_lock(&alias->d_lock); - if (ret == 0 && alias->d_inode != NULL && + if (ret == 0 && d_really_is_positive(alias) && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; @@ -190,7 +190,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) parent = dget_parent(dentry); if (parent == NULL) goto out_free; - dir = parent->d_inode; + dir = d_inode(parent); /* Non-exclusive lock protects against concurrent lookup() calls */ spin_lock(&dir->i_lock); if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { @@ -210,21 +210,21 @@ out_free: void nfs_wait_on_sillyrename(struct dentry *dentry) { - struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1); } void nfs_block_sillyrename(struct dentry *dentry) { - struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1); } void nfs_unblock_sillyrename(struct dentry *dentry) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct nfs_inode *nfsi = NFS_I(dir); struct nfs_unlinkdata *data; @@ -367,8 +367,8 @@ static void nfs_async_rename_release(void *calldata) struct nfs_renamedata *data = calldata; struct super_block *sb = data->old_dir->i_sb; - if (data->old_dentry->d_inode) - nfs_mark_for_revalidate(data->old_dentry->d_inode); + if (d_really_is_positive(data->old_dentry)) + nfs_mark_for_revalidate(d_inode(data->old_dentry)); dput(data->old_dentry); dput(data->new_dentry); @@ -529,10 +529,10 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out; - fileid = NFS_FILEID(dentry->d_inode); + fileid = NFS_FILEID(d_inode(dentry)); /* Return delegation in anticipation of the rename */ - NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode); + NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry)); sdentry = NULL; do { @@ -554,7 +554,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) */ if (IS_ERR(sdentry)) goto out; - } while (sdentry->d_inode != NULL); /* need negative lookup */ + } while (d_inode(sdentry) != NULL); /* need negative lookup */ /* queue unlink first. Can't do this from rpc_release as it * has to allocate memory diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 759931088094..dfc19f1575a1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -580,7 +580,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_inc_stats(inode, NFSIOS_WRITEPAGES); + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page_file_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); @@ -702,7 +702,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; @@ -861,7 +861,7 @@ static void nfs_clear_request_commit(struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct nfs_commit_info cinfo; nfs_init_cinfo_from_inode(&cinfo, inode); @@ -1591,7 +1591,7 @@ void nfs_init_commit(struct nfs_commit_data *data, struct nfs_commit_info *cinfo) { struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = first->wb_context->dentry->d_inode; + struct inode *inode = d_inode(first->wb_context->dentry); /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -1690,7 +1690,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), req->wb_bytes, (long long)req_offset(req)); if (status < 0) { @@ -1840,18 +1840,20 @@ EXPORT_SYMBOL_GPL(nfs_write_inode); */ int nfs_wb_all(struct inode *inode) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - }; int ret; trace_nfs_writeback_inode_enter(inode); - ret = sync_inode(inode, &wbc); + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + goto out; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out; + pnfs_sync_inode(inode, true); + ret = 0; +out: trace_nfs_writeback_inode_exit(inode, ret); return ret; } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index fc2d108f5272..a0b77fc1bd39 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -108,7 +108,7 @@ config NFSD_V4_SECURITY_LABEL config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" - depends on NFSD_V4 && DEBUG_KERNEL + depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS help This option enables support for manually injecting faults into the NFS server. This is intended to be used for diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 03d647bf195d..cdefaa331a07 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -181,6 +181,17 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, } const struct nfsd4_layout_ops bl_layout_ops = { + /* + * Pretend that we send notification to the client. This is a blatant + * lie to force recent Linux clients to cache our device IDs. + * We rarely ever change the device ID, so the harm of leaking deviceids + * for a while isn't too bad. Unfortunately RFC5661 is a complete mess + * in this regard, but I filed errata 4119 for this a while ago, and + * hopefully the Linux client will eventually start caching deviceids + * without this again. + */ + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo, .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, .proc_layoutget = nfsd4_block_proc_layoutget, diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c3e3b6e55ae2..f79521a59747 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -599,7 +599,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out4; } - err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags, + err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags, exp.ex_uuid); if (err) goto out4; @@ -691,8 +691,7 @@ static int svc_export_match(struct cache_head *a, struct cache_head *b) struct svc_export *orig = container_of(a, struct svc_export, h); struct svc_export *new = container_of(b, struct svc_export, h); return orig->ex_client == new->ex_client && - orig->ex_path.dentry == new->ex_path.dentry && - orig->ex_path.mnt == new->ex_path.mnt; + path_equal(&orig->ex_path, &new->ex_path); } static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) @@ -891,7 +890,7 @@ exp_rootfh(struct net *net, struct auth_domain *clp, char *name, printk("nfsd: exp_rootfh path not found %s", name); return err; } - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", name, path.dentry, clp->name, @@ -1159,6 +1158,7 @@ static struct flags { { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, + { NFSEXP_PNFS, {"pnfs", ""}}, { 0, {"", ""}} }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index ac54ea60b3f6..d54701f6dc78 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -42,7 +42,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); @@ -103,7 +103,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, if (nfserr) goto out; - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { error = -EOPNOTSUPP; goto out_errno; @@ -266,9 +266,9 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, * nfsd_dispatch actually ensures the following cannot happen. * However, it seems fragile to depend on that. */ - if (dentry == NULL || dentry->d_inode == NULL) + if (dentry == NULL || d_really_is_negative(dentry)) return 0; - inode = dentry->d_inode; + inode = d_inode(dentry); p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); *p++ = htonl(resp->mask); diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 34cbbab6abd7..882b1a14bc3e 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -39,7 +39,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); @@ -94,7 +94,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, if (nfserr) goto out; - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { error = -EOPNOTSUPP; goto out_errno; @@ -174,8 +174,8 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, struct dentry *dentry = resp->fh.fh_dentry; p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0 && dentry && dentry->d_inode) { - struct inode *inode = dentry->d_inode; + if (resp->status == 0 && dentry && d_really_is_positive(dentry)) { + struct inode *inode = d_inode(dentry); struct kvec *head = rqstp->rq_res.head; unsigned int base; int n; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 12f2aab4f614..7b755b7f785c 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -166,7 +166,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, rqstp->rq_vec, argp->vlen, &resp->count); if (nfserr == 0) { - struct inode *inode = resp->fh.fh_dentry->d_inode; + struct inode *inode = d_inode(resp->fh.fh_dentry); resp->eof = (argp->offset + resp->count) >= inode->i_size; } @@ -551,7 +551,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, * different read/write sizes for file systems known to have * problems with large blocks */ if (nfserr == 0) { - struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; /* Note that we don't care for remote fs's here */ if (sb->s_magic == MSDOS_SUPER_MAGIC) { @@ -587,7 +587,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); if (nfserr == 0) { - struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; /* Note that we don't care for remote fs's here */ switch (sb->s_magic) { diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 39c5eb3ad33a..e4b2b4322553 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -146,7 +146,7 @@ static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) default: case FSIDSOURCE_DEV: p = xdr_encode_hyper(p, (u64)huge_encode_dev - (fhp->fh_dentry->d_inode->i_sb->s_dev)); + (d_inode(fhp->fh_dentry)->i_sb->s_dev)); break; case FSIDSOURCE_FSID: p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); @@ -203,14 +203,14 @@ static __be32 * encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; - if (dentry && dentry->d_inode) { + if (dentry && d_really_is_positive(dentry)) { __be32 err; struct kstat stat; err = fh_getattr(fhp, &stat); if (!err) { *p++ = xdr_one; /* attributes follow */ - lease_get_mtime(dentry->d_inode, &stat.mtime); + lease_get_mtime(d_inode(dentry), &stat.mtime); return encode_fattr3(rqstp, p, fhp, &stat); } } @@ -233,7 +233,7 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; - if (dentry && dentry->d_inode && fhp->fh_post_saved) { + if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) { if (fhp->fh_pre_saved) { *p++ = xdr_one; p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); @@ -260,11 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); - fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; + fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version; if (err) { fhp->fh_post_saved = 0; /* Grab the ctime anyway - set_change_info might use it */ - fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime; + fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; } else fhp->fh_post_saved = 1; } @@ -628,7 +628,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_attrstat *resp) { if (resp->status == 0) { - lease_get_mtime(resp->fh.fh_dentry->d_inode, + lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime); p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); } @@ -828,7 +828,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, return rv; if (d_mountpoint(dchild)) goto out; - if (!dchild->d_inode) + if (d_really_is_negative(dchild)) goto out; rv = fh_compose(fhp, exp, dchild, &cd->fh); out: diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 59fd76651781..67242bf7c6cc 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -139,7 +139,7 @@ int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error = 0; struct posix_acl *pacl = NULL, *dpacl = NULL; unsigned int flags = 0; @@ -499,43 +499,13 @@ static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_s state->mask.allow |= astate->allow; } -/* - * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS, - * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate - * to traditional read/write/execute permissions. - * - * It's problematic to reject acls that use certain mode bits, because it - * places the burden on users to learn the rules about which bits one - * particular server sets, without giving the user a lot of help--we return an - * error that could mean any number of different things. To make matters - * worse, the problematic bits might be introduced by some application that's - * automatically mapping from some other acl model. - * - * So wherever possible we accept anything, possibly erring on the side of - * denying more permissions than necessary. - * - * However we do reject *explicit* DENY's of a few bits representing - * permissions we could never deny: - */ - -static inline int check_deny(u32 mask, int isowner) -{ - if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL)) - return -EINVAL; - if (!isowner) - return 0; - if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)) - return -EINVAL; - return 0; -} - static struct posix_acl * posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) { struct posix_acl_entry *pace; struct posix_acl *pacl; int nace; - int i, error = 0; + int i; /* * ACLs with no ACEs are treated differently in the inheritable @@ -560,17 +530,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) pace = pacl->a_entries; pace->e_tag = ACL_USER_OBJ; - error = check_deny(state->owner.deny, 1); - if (error) - goto out_err; low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); for (i=0; i < state->users->n; i++) { pace++; pace->e_tag = ACL_USER; - error = check_deny(state->users->aces[i].perms.deny, 0); - if (error) - goto out_err; low_mode_from_nfs4(state->users->aces[i].perms.allow, &pace->e_perm, flags); pace->e_uid = state->users->aces[i].uid; @@ -579,18 +543,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) pace++; pace->e_tag = ACL_GROUP_OBJ; - error = check_deny(state->group.deny, 0); - if (error) - goto out_err; low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); add_to_mask(state, &state->group); for (i=0; i < state->groups->n; i++) { pace++; pace->e_tag = ACL_GROUP; - error = check_deny(state->groups->aces[i].perms.deny, 0); - if (error) - goto out_err; low_mode_from_nfs4(state->groups->aces[i].perms.allow, &pace->e_perm, flags); pace->e_gid = state->groups->aces[i].gid; @@ -605,15 +563,9 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) pace++; pace->e_tag = ACL_OTHER; - error = check_deny(state->other.deny, 0); - if (error) - goto out_err; low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); return pacl; -out_err: - posix_acl_release(pacl); - return ERR_PTR(error); } static inline void allow_bits(struct posix_ace_state *astate, u32 mask) @@ -828,7 +780,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, return error; dentry = fhp->fh_dentry; - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) return nfserr_attrnotsupp; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 58277859a467..5694cfb7a47b 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -224,7 +224,7 @@ static int nfs_cb_stat_to_errno(int status) } static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, - enum nfsstat4 *status) + int *status) { __be32 *p; u32 op; @@ -235,7 +235,7 @@ static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, op = be32_to_cpup(p++); if (unlikely(op != expected)) goto out_unexpected; - *status = be32_to_cpup(p); + *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -446,22 +446,16 @@ out_overflow: static int decode_cb_sequence4res(struct xdr_stream *xdr, struct nfsd4_callback *cb) { - enum nfsstat4 nfserr; int status; if (cb->cb_minorversion == 0) return 0; - status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - goto out_default; - status = decode_cb_sequence4resok(xdr, cb); -out: - return status; -out_default: - return nfs_cb_stat_to_errno(nfserr); + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status); + if (unlikely(status || cb->cb_status)) + return status; + + return decode_cb_sequence4resok(xdr, cb); } /* @@ -524,26 +518,19 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, struct nfsd4_callback *cb) { struct nfs4_cb_compound_hdr hdr; - enum nfsstat4 nfserr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) - goto out; + return status; if (cb != NULL) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status)) - goto out; + if (unlikely(status || cb->cb_status)) + return status; } - status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - status = nfs_cb_stat_to_errno(nfserr); -out: - return status; + return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } #ifdef CONFIG_NFSD_PNFS @@ -621,24 +608,18 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, struct nfsd4_callback *cb) { struct nfs4_cb_compound_hdr hdr; - enum nfsstat4 nfserr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) - goto out; + return status; + if (cb) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status)) - goto out; + if (unlikely(status || cb->cb_status)) + return status; } - status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - status = nfs_cb_stat_to_errno(nfserr); -out: - return status; + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ @@ -898,13 +879,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) if (!nfsd41_cb_get_slot(clp, task)) return; } - spin_lock(&clp->cl_lock); - if (list_empty(&cb->cb_per_client)) { - /* This is the first call, not a restart */ - cb->cb_done = false; - list_add(&cb->cb_per_client, &clp->cl_callbacks); - } - spin_unlock(&clp->cl_lock); rpc_call_start(task); } @@ -918,22 +892,33 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) if (clp->cl_minorversion) { /* No need for lock, access serialized in nfsd4_cb_prepare */ - ++clp->cl_cb_session->se_cb_seq_nr; + if (!task->tk_status) + ++clp->cl_cb_session->se_cb_seq_nr; clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, clp->cl_cb_session->se_cb_seq_nr); } - if (clp->cl_cb_client != task->tk_client) { - /* We're shutting down or changing cl_cb_client; leave - * it to nfsd4_process_cb_update to restart the call if - * necessary. */ + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. + */ + if (task->tk_flags & RPC_TASK_KILLED) { + task->tk_status = 0; + cb->cb_need_restart = true; return; } - if (cb->cb_done) - return; + if (cb->cb_status) { + WARN_ON_ONCE(task->tk_status); + task->tk_status = cb->cb_status; + } switch (cb->cb_ops->done(cb, task)) { case 0: @@ -949,21 +934,17 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) default: BUG(); } - cb->cb_done = true; } static void nfsd4_cb_release(void *calldata) { struct nfsd4_callback *cb = calldata; - struct nfs4_client *clp = cb->cb_clp; - - if (cb->cb_done) { - spin_lock(&clp->cl_lock); - list_del(&cb->cb_per_client); - spin_unlock(&clp->cl_lock); + if (cb->cb_need_restart) + nfsd4_run_cb(cb); + else cb->cb_ops->release(cb); - } + } static const struct rpc_call_ops nfsd4_cb_ops = { @@ -1058,9 +1039,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) nfsd4_mark_cb_down(clp, err); return; } - /* Yay, the callback channel's back! Restart any callbacks: */ - list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) - queue_work(callback_wq, &cb->cb_work); } static void @@ -1071,8 +1049,12 @@ nfsd4_run_cb_work(struct work_struct *work) struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; - if (cb->cb_ops && cb->cb_ops->prepare) - cb->cb_ops->prepare(cb); + if (cb->cb_need_restart) { + cb->cb_need_restart = false; + } else { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); @@ -1084,6 +1066,15 @@ nfsd4_run_cb_work(struct work_struct *work) cb->cb_ops->release(cb); return; } + + /* + * Don't send probe messages for 4.1 or later. + */ + if (!cb->cb_ops && clp->cl_minorversion) { + clp->cl_cb_state = NFSD4_CB_UP; + return; + } + cb->cb_msg.rpc_cred = clp->cl_cb_cred; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); @@ -1098,8 +1089,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); - INIT_LIST_HEAD(&cb->cb_per_client); - cb->cb_done = true; + cb->cb_status = 0; + cb->cb_need_restart = false; } void nfsd4_run_cb(struct nfsd4_callback *cb) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 92b9d97aff4f..864e2003e8de 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -52,7 +52,7 @@ static inline void nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) { - struct inode *inode = resfh->fh_dentry->d_inode; + struct inode *inode = d_inode(resfh->fh_dentry); int status; mutex_lock(&inode->i_mutex); @@ -110,7 +110,7 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * in current environment or not. */ if (bmval[0] & FATTR4_WORD0_ACL) { - if (!IS_POSIXACL(dentry->d_inode)) + if (!IS_POSIXACL(d_inode(dentry))) return nfserr_attrnotsupp; } @@ -209,7 +209,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) { - umode_t mode = fh->fh_dentry->d_inode->i_mode; + umode_t mode = d_inode(fh->fh_dentry)->i_mode; if (S_ISREG(mode)) return nfs_ok; @@ -470,7 +470,7 @@ out: fh_put(resfh); kfree(resfh); } - nfsd4_cleanup_open_state(cstate, open, status); + nfsd4_cleanup_open_state(cstate, open); nfsd4_bump_seqid(cstate, status); return status; } @@ -881,7 +881,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &exp, &dentry); if (err) return err; - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { exp_put(exp); err = nfserr_noent; } else @@ -1030,6 +1030,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); return status; } + if (!file) + return nfserr_bad_stateid; status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, fallocate->falloc_offset, @@ -1069,6 +1071,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); return status; } + if (!file) + return nfserr_bad_stateid; switch (seek->seek_whence) { case NFS4_CONTENT_DATA: @@ -1308,7 +1312,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) goto out_put_stid; - nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode, + nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry), current_fh, lgp); if (nfserr) goto out_put_stid; @@ -1342,7 +1346,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); if (!ops) goto out; - inode = current_fh->fh_dentry->d_inode; + inode = d_inode(current_fh->fh_dentry); nfserr = nfserr_inval; if (new_size <= seg->offset) { @@ -1815,7 +1819,7 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, bmap0 &= ~FATTR4_WORD0_FILEHANDLE; } if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { - ret += NFSD4_MAX_SEC_LABEL_LEN + 12; + ret += NFS4_MAXLABELLEN + 12; bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; } /* @@ -2282,13 +2286,13 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_allocate, .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_ALLOCATE", - .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_DEALLOCATE] = { .op_func = (nfsd4op_func)nfsd4_deallocate, .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_DEALLOCATE", - .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 1c307f02baa8..d88ea7b9a85c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -192,14 +192,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dir = nn->rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } - if (dentry->d_inode) + if (d_really_is_positive(dentry)) /* * In the 4.1 case, where we're called from * reclaim_complete(), records from the previous reboot @@ -209,11 +209,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); + status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); if (status == 0) { if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); @@ -285,7 +285,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); while (!list_empty(&ctx.names)) { struct name_list *entry; entry = list_entry(ctx.names.next, struct name_list, list); @@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_del(&entry->list); kfree(entry); } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); nfs4_reset_creds(original_cred); return status; } @@ -316,20 +316,20 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } status = -ENOENT; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out; - status = vfs_rmdir(dir->d_inode, dentry); + status = vfs_rmdir(d_inode(dir), dentry); out: dput(dentry); out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); return status; } @@ -385,7 +385,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(child->d_name.name, nn)) return 0; - status = vfs_rmdir(parent->d_inode, child); + status = vfs_rmdir(d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 326a545ea7b2..039f9c8a95e8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -94,6 +94,7 @@ static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; static struct kmem_cache *stateid_slab; static struct kmem_cache *deleg_slab; +static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); @@ -281,6 +282,7 @@ put_nfs4_file(struct nfs4_file *fi) if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); + WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); } @@ -471,6 +473,86 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) __nfs4_file_put_access(fp, O_RDONLY); } +/* + * Allocate a new open/delegation state counter. This is needed for + * pNFS for proper return on close semantics. + * + * Note that we only allocate it for pNFS-enabled exports, otherwise + * all pointers to struct nfs4_clnt_odstate are always NULL. + */ +static struct nfs4_clnt_odstate * +alloc_clnt_odstate(struct nfs4_client *clp) +{ + struct nfs4_clnt_odstate *co; + + co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); + if (co) { + co->co_client = clp; + atomic_set(&co->co_odcount, 1); + } + return co; +} + +static void +hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp = co->co_file; + + lockdep_assert_held(&fp->fi_lock); + list_add(&co->co_perfile, &fp->fi_clnt_odstate); +} + +static inline void +get_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + if (co) + atomic_inc(&co->co_odcount); +} + +static void +put_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp; + + if (!co) + return; + + fp = co->co_file; + if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { + list_del(&co->co_perfile); + spin_unlock(&fp->fi_lock); + + nfsd4_return_all_file_layouts(co->co_client, fp); + kmem_cache_free(odstate_slab, co); + } +} + +static struct nfs4_clnt_odstate * +find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new) +{ + struct nfs4_clnt_odstate *co; + struct nfs4_client *cl; + + if (!new) + return NULL; + + cl = new->co_client; + + spin_lock(&fp->fi_lock); + list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { + if (co->co_client == cl) { + get_clnt_odstate(co); + goto out; + } + } + co = new; + co->co_file = fp; + hash_clnt_odstate_locked(new); +out: + spin_unlock(&fp->fi_lock); + return co; +} + struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { @@ -606,7 +688,8 @@ static void block_delegations(struct knfsd_fh *fh) } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) +alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, + struct nfs4_clnt_odstate *odstate) { struct nfs4_delegation *dp; long n; @@ -631,6 +714,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); + dp->dl_clnt_odstate = odstate; + get_clnt_odstate(odstate); dp->dl_type = NFS4_OPEN_DELEGATE_READ; dp->dl_retries = 1; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, @@ -714,6 +799,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) spin_lock(&state_lock); unhash_delegation_locked(dp); spin_unlock(&state_lock); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } @@ -724,6 +810,7 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); if (clp->cl_minorversion == 0) @@ -933,6 +1020,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); + put_clnt_odstate(stp->st_clnt_odstate); release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); @@ -1139,7 +1227,7 @@ hash_sessionid(struct nfs4_sessionid *sessionid) return sid->sequence % SESSION_HASH_SIZE; } -#ifdef NFSD_DEBUG +#ifdef CONFIG_SUNRPC_DEBUG static inline void dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) { @@ -1538,7 +1626,6 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); INIT_LIST_HEAD(&clp->cl_revoked); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&clp->cl_lo_states); @@ -1634,6 +1721,7 @@ __destroy_client(struct nfs4_client *clp) while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } @@ -3057,6 +3145,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); + INIT_LIST_HEAD(&fp->fi_clnt_odstate); fh_copy_shallow(&fp->fi_fhandle, fh); fp->fi_deleg_file = NULL; fp->fi_had_conflict = false; @@ -3073,6 +3162,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, void nfsd4_free_slabs(void) { + kmem_cache_destroy(odstate_slab); kmem_cache_destroy(openowner_slab); kmem_cache_destroy(lockowner_slab); kmem_cache_destroy(file_slab); @@ -3103,8 +3193,14 @@ nfsd4_init_slabs(void) sizeof(struct nfs4_delegation), 0, 0, NULL); if (deleg_slab == NULL) goto out_free_stateid_slab; + odstate_slab = kmem_cache_create("nfsd4_odstate", + sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + if (odstate_slab == NULL) + goto out_free_deleg_slab; return 0; +out_free_deleg_slab: + kmem_cache_destroy(deleg_slab); out_free_stateid_slab: kmem_cache_destroy(stateid_slab); out_free_file_slab: @@ -3581,6 +3677,14 @@ alloc_stateid: open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; + + if (nfsd4_has_session(cstate) && + (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) { + open->op_odstate = alloc_clnt_odstate(clp); + if (!open->op_odstate) + return nfserr_jukebox; + } + return nfs_ok; } @@ -3869,7 +3973,7 @@ out_fput: static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, - struct nfs4_file *fp) + struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) { int status; struct nfs4_delegation *dp; @@ -3877,7 +3981,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - dp = alloc_init_deleg(clp, fh); + dp = alloc_init_deleg(clp, fh, odstate); if (!dp) return ERR_PTR(-ENOMEM); @@ -3903,6 +4007,7 @@ out_unlock: spin_unlock(&state_lock); out: if (status) { + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); return ERR_PTR(status); } @@ -3980,7 +4085,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, default: goto out_no_deleg; } - dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file); + dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate); if (IS_ERR(dp)) goto out_no_deleg; @@ -4049,7 +4154,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfserr_bad_stateid; if (nfsd4_is_deleg_cur(open)) goto out; - status = nfserr_jukebox; } /* @@ -4070,6 +4174,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf release_open_stateid(stp); goto out; } + + stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp, + open->op_odstate); + if (stp->st_clnt_odstate == open->op_odstate) + open->op_odstate = NULL; } update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); @@ -4118,7 +4227,7 @@ out: } void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, - struct nfsd4_open *open, __be32 status) + struct nfsd4_open *open) { if (open->op_openowner) { struct nfs4_stateowner *so = &open->op_openowner->oo_owner; @@ -4130,6 +4239,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, kmem_cache_free(file_slab, open->op_file); if (open->op_stp) nfs4_put_stid(&open->op_stp->st_stid); + if (open->op_odstate) + kmem_cache_free(odstate_slab, open->op_odstate); } __be32 @@ -4386,10 +4497,17 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } +static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) +{ + if (ols->st_stateowner->so_is_open_owner && + !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; + return nfs_ok; +} + static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; - struct nfs4_ol_stateid *ols; __be32 status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) @@ -4419,13 +4537,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: - ols = openlockstateid(s); - if (ols->st_stateowner->so_is_open_owner - && !(openowner(ols->st_stateowner)->oo_flags - & NFS4_OO_CONFIRMED)) - status = nfserr_bad_stateid; - else - status = nfs_ok; + status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); @@ -4473,7 +4585,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; - struct inode *ino = current_fh->fh_dentry->d_inode; + struct inode *ino = d_inode(current_fh->fh_dentry); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct file *file = NULL; __be32 status; @@ -4517,8 +4629,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, status = nfs4_check_fh(current_fh, stp); if (status) goto out; - if (stp->st_stateowner->so_is_open_owner - && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + status = nfsd4_check_openowner_confirmed(stp); + if (status) goto out; status = nfs4_check_openmode(stp, flags); if (status) @@ -4853,9 +4965,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - nfsd4_return_all_file_layouts(stp->st_stateowner->so_client, - stp->st_stid.sc_file); - nfsd4_close_open_stateid(stp); /* put reference from nfs4_preprocess_seqid_op */ @@ -5171,7 +5280,7 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_file *fi = ost->st_stid.sc_file; struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *cl = oo->oo_owner.so_client; - struct inode *inode = cstate->current_fh.fh_dentry->d_inode; + struct inode *inode = d_inode(cstate->current_fh.fh_dentry); struct nfs4_lockowner *lo; unsigned int strhashval; @@ -6489,6 +6598,7 @@ nfs4_state_shutdown_net(struct net *net) list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5fb7e78169a6..158badf945df 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -424,7 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, len += 4; dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); - if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN) + if (dummy32 > NFS4_MAXLABELLEN) return nfserr_badlabel; len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); @@ -2020,7 +2020,7 @@ static __be32 nfsd4_encode_path(struct xdr_stream *xdr, * dentries/path components in an array. */ for (;;) { - if (cur.dentry == root->dentry && cur.mnt == root->mnt) + if (path_equal(&cur, root)) break; if (cur.dentry == cur.mnt->mnt_root) { if (follow_up(&cur)) @@ -2292,7 +2292,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) || bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { - err = security_inode_getsecctx(dentry->d_inode, + err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); contextsupport = (err == 0); if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { @@ -2384,7 +2384,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; - p = encode_change(p, &stat, dentry->d_inode); + p = encode_change(p, &stat, d_inode(dentry)); } if (bmval0 & FATTR4_WORD0_SIZE) { p = xdr_reserve_space(xdr, 8); @@ -2807,7 +2807,7 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { /* * nfsd_buffered_readdir drops the i_mutex between * readdir and calling this callback, leaving a window @@ -3324,7 +3324,7 @@ static __be32 nfsd4_encode_splice_read( } eof = (read->rd_offset + maxcount >= - read->rd_fhp->fh_dentry->d_inode->i_size); + d_inode(read->rd_fhp->fh_dentry)->i_size); *(p++) = htonl(eof); *(p++) = htonl(maxcount); @@ -3401,7 +3401,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); eof = (read->rd_offset + maxcount >= - read->rd_fhp->fh_dentry->d_inode->i_size); + d_inode(read->rd_fhp->fh_dentry)->i_size); tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); @@ -3422,6 +3422,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, unsigned long maxcount; struct xdr_stream *xdr = &resp->xdr; struct file *file = read->rd_filp; + struct svc_fh *fhp = read->rd_fhp; int starting_len = xdr->buf->len; struct raparms *ra; __be32 *p; @@ -3445,12 +3446,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len)); maxcount = min_t(unsigned long, maxcount, read->rd_length); - if (!read->rd_filp) { + if (read->rd_filp) + err = nfsd_permission(resp->rqstp, fhp->fh_export, + fhp->fh_dentry, + NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); + else err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, &file, &ra); - if (err) - goto err_truncate; - } + if (err) + goto err_truncate; if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) err = nfsd4_encode_splice_read(resp, read, file, maxcount); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index aa47d75ddb26..9690cb4dd588 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1250,15 +1250,15 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = register_cld_notifier(); - if (retval) - return retval; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_unregister_notifier; - retval = nfsd4_init_slabs(); + return retval; + retval = register_cld_notifier(); if (retval) goto out_unregister_pernet; + retval = nfsd4_init_slabs(); + if (retval) + goto out_unregister_notifier; retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; @@ -1290,10 +1290,10 @@ out_exit_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); -out_unregister_pernet: - unregister_pernet_subsys(&nfsd_net_ops); out_unregister_notifier: unregister_cld_notifier(); +out_unregister_pernet: + unregister_pernet_subsys(&nfsd_net_ops); return retval; } @@ -1308,8 +1308,8 @@ static void __exit exit_nfsd(void) nfsd4_exit_pnfs(); nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); - unregister_pernet_subsys(&nfsd_net_ops); unregister_cld_notifier(); + unregister_pernet_subsys(&nfsd_net_ops); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 565c4da1a9eb..cf980523898b 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -24,7 +24,7 @@ #include "export.h" #undef ifdebug -#ifdef NFSD_DEBUG +#ifdef CONFIG_SUNRPC_DEBUG # define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) #else # define ifdebug(flag) if (0) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index e9fa966fc37f..350041a40fe5 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -38,7 +38,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = inode_permission(parent->d_inode, MAY_EXEC); + err = inode_permission(d_inode(parent), MAY_EXEC); if (err < 0) { dput(parent); break; @@ -340,7 +340,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) if (error) goto out; - error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); + error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type); if (error) goto out; @@ -412,8 +412,8 @@ static inline void _fh_update_old(struct dentry *dentry, struct svc_export *exp, struct knfsd_fh *fh) { - fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino); - fh->ofh_generation = dentry->d_inode->i_generation; + fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino); + fh->ofh_generation = d_inode(dentry)->i_generation; if (d_is_dir(dentry) || (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) fh->ofh_dirino = 0; @@ -426,7 +426,7 @@ static bool is_root_export(struct svc_export *exp) static struct super_block *exp_sb(struct svc_export *exp) { - return exp->ex_path.dentry->d_inode->i_sb; + return d_inode(exp->ex_path.dentry)->i_sb; } static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) @@ -520,12 +520,12 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, * */ - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); dev_t ex_dev = exp_sb(exp)->s_dev; dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", MAJOR(ex_dev), MINOR(ex_dev), - (long) exp->ex_path.dentry->d_inode->i_ino, + (long) d_inode(exp->ex_path.dentry)->i_ino, dentry, (inode ? inode->i_ino : 0)); @@ -558,7 +558,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev); fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev; fhp->fh_handle.ofh_xino = - ino_t_to_u32(exp->ex_path.dentry->d_inode->i_ino); + ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino); fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry)); if (inode) _fh_update_old(dentry, exp, &fhp->fh_handle); @@ -570,7 +570,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, mk_fsid(fhp->fh_handle.fh_fsid_type, fhp->fh_handle.fh_fsid, ex_dev, - exp->ex_path.dentry->d_inode->i_ino, + d_inode(exp->ex_path.dentry)->i_ino, exp->ex_fsid, exp->ex_uuid); if (inode) @@ -597,7 +597,7 @@ fh_update(struct svc_fh *fhp) goto out_bad; dentry = fhp->fh_dentry; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out_negative; if (fhp->fh_handle.fh_version != 1) { _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index f22920442172..1e90dad4926b 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -225,7 +225,7 @@ fill_pre_wcc(struct svc_fh *fhp) { struct inode *inode; - inode = fhp->fh_dentry->d_inode; + inode = d_inode(fhp->fh_dentry); if (!fhp->fh_pre_saved) { fhp->fh_pre_mtime = inode->i_mtime; fhp->fh_pre_ctime = inode->i_ctime; @@ -264,7 +264,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) return; } - inode = dentry->d_inode; + inode = d_inode(dentry); mutex_lock_nested(&inode->i_mutex, subclass); fill_pre_wcc(fhp); fhp->fh_locked = 1; @@ -284,7 +284,7 @@ fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { fill_post_wcc(fhp); - mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); fhp->fh_locked = 0; } } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index b8680738f588..aecbcd34d336 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -223,7 +223,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, } fh_init(newfhp, NFS_FHSIZE); nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); - if (!nfserr && !dchild->d_inode) + if (!nfserr && d_really_is_negative(dchild)) nfserr = nfserr_noent; dput(dchild); if (nfserr) { @@ -241,7 +241,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, } } - inode = newfhp->fh_dentry->d_inode; + inode = d_inode(newfhp->fh_dentry); /* Unfudge the mode bits */ if (attr->ia_valid & ATTR_MODE) { diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 412d7061f9e5..79d964aa8079 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -187,7 +187,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, *p++ = htonl((u32) stat->ino); *p++ = htonl((u32) stat->atime.tv_sec); *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); - lease_get_mtime(dentry->d_inode, &time); + lease_get_mtime(d_inode(dentry), &time); *p++ = htonl((u32) time.tv_sec); *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); *p++ = htonl((u32) stat->ctime.tv_sec); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4f3bfeb11766..dbc4f85a5008 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -63,12 +63,12 @@ typedef struct { struct nfsd4_callback { struct nfs4_client *cb_clp; - struct list_head cb_per_client; u32 cb_minorversion; struct rpc_message cb_msg; struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; - bool cb_done; + int cb_status; + bool cb_need_restart; }; struct nfsd4_callback_ops { @@ -126,6 +126,7 @@ struct nfs4_delegation { struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ + struct nfs4_clnt_odstate *dl_clnt_odstate; u32 dl_type; time_t dl_time; /* For recall: */ @@ -332,7 +333,6 @@ struct nfs4_client { int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; - struct list_head cl_callbacks; /* list of in-progress callbacks */ /* for all client information that callback code might need: */ spinlock_t cl_lock; @@ -465,6 +465,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) } /* + * Per-client state indicating no. of opens and outstanding delegations + * on a file from a particular client.'od' stands for 'open & delegation' + */ +struct nfs4_clnt_odstate { + struct nfs4_client *co_client; + struct nfs4_file *co_file; + struct list_head co_perfile; + atomic_t co_odcount; +}; + +/* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * * These objects are global. nfsd keeps one instance of a nfs4_file per @@ -485,6 +496,7 @@ struct nfs4_file { struct list_head fi_delegations; struct rcu_head fi_rcu; }; + struct list_head fi_clnt_odstate; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* @@ -526,6 +538,7 @@ struct nfs4_ol_stateid { struct list_head st_perstateowner; struct list_head st_locks; struct nfs4_stateowner * st_stateowner; + struct nfs4_clnt_odstate * st_clnt_odstate; unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid * st_openstp; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 368526582429..84d770be056e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -174,7 +174,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) return 1; if (!(exp->ex_flags & NFSEXP_V4ROOT)) return 0; - return dentry->d_inode != NULL; + return d_inode(dentry) != NULL; } __be32 @@ -270,7 +270,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, * dentry may be negative, it may need to be updated. */ err = fh_compose(resfh, exp, dentry, fhp); - if (!err && !dentry->d_inode) + if (!err && d_really_is_negative(dentry)) err = nfserr_noent; out: dput(dentry); @@ -284,7 +284,7 @@ out: static int commit_metadata(struct svc_fh *fhp) { - struct inode *inode = fhp->fh_dentry->d_inode; + struct inode *inode = d_inode(fhp->fh_dentry); const struct export_operations *export_ops = inode->i_sb->s_export_op; if (!EX_ISSYNC(fhp->fh_export)) @@ -364,7 +364,7 @@ static __be32 nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap) { - struct inode *inode = fhp->fh_dentry->d_inode; + struct inode *inode = d_inode(fhp->fh_dentry); int host_err; if (iap->ia_size < inode->i_size) { @@ -426,7 +426,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, } dentry = fhp->fh_dentry; - inode = dentry->d_inode; + inode = d_inode(dentry); /* Ignore any mode updates on symlinks */ if (S_ISLNK(inode->i_mode)) @@ -495,7 +495,7 @@ out: */ int nfsd4_is_junction(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode == NULL) return 0; @@ -521,9 +521,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); host_error = security_inode_setsecctx(dentry, label->data, label->len); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return nfserrno(host_error); } #else @@ -706,7 +706,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); /* Disallow write access to files with the append-only bit set * or any access when mandatory locking enabled @@ -1211,7 +1211,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; dentry = fhp->fh_dentry; - dirp = dentry->d_inode; + dirp = d_inode(dentry); err = nfserr_notdir; if (!dirp->i_op->lookup) @@ -1250,7 +1250,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, * Make sure the child dentry is still negative ... */ err = nfserr_exist; - if (dchild->d_inode) { + if (d_really_is_positive(dchild)) { dprintk("nfsd_create: dentry %pd/%pd not negative!\n", dentry, dchild); goto out; @@ -1353,7 +1353,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; dentry = fhp->fh_dentry; - dirp = dentry->d_inode; + dirp = d_inode(dentry); /* Get all the sanity checks out of the way before * we lock the parent. */ @@ -1376,7 +1376,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; /* If file doesn't exist, check for permissions to create one */ - if (!dchild->d_inode) { + if (d_really_is_negative(dchild)) { err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1397,7 +1397,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } - if (dchild->d_inode) { + if (d_really_is_positive(dchild)) { err = 0; switch (createmode) { @@ -1420,17 +1420,17 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, } break; case NFS3_CREATE_EXCLUSIVE: - if ( dchild->d_inode->i_mtime.tv_sec == v_mtime - && dchild->d_inode->i_atime.tv_sec == v_atime - && dchild->d_inode->i_size == 0 ) { + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { if (created) *created = 1; break; } case NFS4_CREATE_EXCLUSIVE4_1: - if ( dchild->d_inode->i_mtime.tv_sec == v_mtime - && dchild->d_inode->i_atime.tv_sec == v_atime - && dchild->d_inode->i_size == 0 ) { + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { if (created) *created = 1; goto set_attr; @@ -1513,7 +1513,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); err = nfserr_inval; if (!inode->i_op->readlink) @@ -1576,7 +1576,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = vfs_symlink(dentry->d_inode, dnew, path); + host_err = vfs_symlink(d_inode(dentry), dnew, path); err = nfserrno(host_err); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1632,7 +1632,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, fh_lock_nested(ffhp, I_MUTEX_PARENT); ddir = ffhp->fh_dentry; - dirp = ddir->d_inode; + dirp = d_inode(ddir); dnew = lookup_one_len(name, ddir, len); host_err = PTR_ERR(dnew); @@ -1642,7 +1642,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; err = nfserr_noent; - if (!dold->d_inode) + if (d_really_is_negative(dold)) goto out_dput; host_err = vfs_link(dold, dirp, dnew, NULL); if (!host_err) { @@ -1689,10 +1689,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; fdentry = ffhp->fh_dentry; - fdir = fdentry->d_inode; + fdir = d_inode(fdentry); tdentry = tfhp->fh_dentry; - tdir = tdentry->d_inode; + tdir = d_inode(tdentry); err = nfserr_perm; if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) @@ -1717,7 +1717,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out_nfserr; host_err = -ENOENT; - if (!odentry->d_inode) + if (d_really_is_negative(odentry)) goto out_dput_old; host_err = -EINVAL; if (odentry == trap) @@ -1790,21 +1790,21 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = fhp->fh_dentry; - dirp = dentry->d_inode; + dirp = d_inode(dentry); rdentry = lookup_one_len(fname, dentry, flen); host_err = PTR_ERR(rdentry); if (IS_ERR(rdentry)) goto out_nfserr; - if (!rdentry->d_inode) { + if (d_really_is_negative(rdentry)) { dput(rdentry); err = nfserr_noent; goto out; } if (!type) - type = rdentry->d_inode->i_mode & S_IFMT; + type = d_inode(rdentry)->i_mode & S_IFMT; if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry, NULL); @@ -2015,7 +2015,7 @@ __be32 nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct dentry *dentry, int acc) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err; if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 0bda93e58e1b..2f8c092be2b3 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -40,7 +40,6 @@ #include "state.h" #include "nfsd.h" -#define NFSD4_MAX_SEC_LABEL_LEN 2048 #define NFSD4_MAX_TAGLEN 128 #define XDR_LEN(n) (((n) + 3) & ~3) @@ -248,6 +247,7 @@ struct nfsd4_open { struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ + struct nfs4_clnt_odstate *op_odstate; /* used during processing */ struct nfs4_acl *op_acl; struct xdr_netobj op_label; }; @@ -632,7 +632,7 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { BUG_ON(!fhp->fh_pre_saved); cinfo->atomic = fhp->fh_post_saved; - cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); + cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); cinfo->before_change = fhp->fh_pre_change; cinfo->after_change = fhp->fh_post_change; @@ -683,7 +683,7 @@ extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate); extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, - struct nfsd4_open *open, __be32 status); + struct nfsd4_open *open); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 059f37137f9a..919fd5bb14a8 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -388,7 +388,7 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, nchildren = nilfs_btree_node_get_nchildren(node); if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || - level > NILFS_BTREE_LEVEL_MAX || + level >= NILFS_BTREE_LEVEL_MAX || nchildren < 0 || nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 197a63e9d102..0ee0bed3649b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -435,7 +435,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, */ int nilfs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = nilfs_chunk_size(dir); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index be936df4ba73..258d9fe2521a 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -835,7 +835,7 @@ void nilfs_evict_inode(struct inode *inode) int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) { struct nilfs_transaction_info ti; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int err; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 0f84b257932c..22180836ec22 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -192,7 +192,7 @@ out_fail: static int nilfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct nilfs_transaction_info ti; int err; @@ -283,7 +283,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (!de) goto out; - inode = dentry->d_inode; + inode = d_inode(dentry); err = -EIO; if (le64_to_cpu(de->inode) != inode->i_ino) goto out; @@ -318,7 +318,7 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry) if (!err) { nilfs_mark_inode_dirty(dir); - nilfs_mark_inode_dirty(dentry->d_inode); + nilfs_mark_inode_dirty(d_inode(dentry)); err = nilfs_transaction_commit(dir->i_sb); } else nilfs_transaction_abort(dir->i_sb); @@ -328,7 +328,7 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry) static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nilfs_transaction_info ti; int err; @@ -358,8 +358,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct nilfs_dir_entry *dir_de = NULL; struct page *old_page; @@ -453,13 +453,13 @@ static struct dentry *nilfs_get_parent(struct dentry *child) struct qstr dotdot = QSTR_INIT("..", 2); struct nilfs_root *root; - ino = nilfs_inode_by_name(child->d_inode, &dotdot); + ino = nilfs_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - root = NILFS_I(child->d_inode)->i_root; + root = NILFS_I(d_inode(child))->i_root; - inode = nilfs_iget(child->d_inode->i_sb, root, ino); + inode = nilfs_iget(d_inode(child)->i_sb, root, ino); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c1725f20a9d1..f47585bfeb01 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -610,7 +610,7 @@ static int nilfs_unfreeze(struct super_block *sb) static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; + struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; struct the_nilfs *nilfs = root->nilfs; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); unsigned long long blocks; @@ -681,7 +681,7 @@ static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; - struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; + struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; if (!nilfs_test_opt(nilfs, BARRIER)) seq_puts(seq, ",nobarrier"); @@ -1190,7 +1190,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; - root = NILFS_I(sb->s_root->d_inode)->i_root; + root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); if (err) goto restore_opts; diff --git a/fs/nsfs.c b/fs/nsfs.c index af1b24fa899d..99521e7c492b 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,7 +13,7 @@ static const struct file_operations ns_file_operations = { static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = dentry->d_fsdata; return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", @@ -22,7 +22,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) static void ns_prune_dentry(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode) { struct ns_common *ns = inode->i_private; atomic_long_set(&ns->stashed, 0); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 1d0c21df0d80..d284f07eda77 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2889,7 +2889,7 @@ void ntfs_truncate_vfs(struct inode *vi) { */ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *vi = dentry->d_inode; + struct inode *vi = d_inode(dentry); int err; unsigned int ia_valid = attr->ia_valid; diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index b3973c2fd190..0f35b80d17fe 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -292,14 +292,14 @@ const struct inode_operations ntfs_dir_inode_ops = { * The code is based on the ext3 ->get_parent() implementation found in * fs/ext3/namei.c::ext3_get_parent(). * - * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down. + * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down. * * Return the dentry of the parent directory on success or the error code on * error (IS_ERR() is true). */ static struct dentry *ntfs_get_parent(struct dentry *child_dent) { - struct inode *vi = child_dent->d_inode; + struct inode *vi = d_inode(child_dent); ntfs_inode *ni = NTFS_I(vi); MFT_RECORD *mrec; ntfs_attr_search_ctx *ctx; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 8e19b9d7aba8..16eff45727ee 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1312,9 +1312,7 @@ static int o2hb_debug_init(void) int ret = -ENOMEM; o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (IS_ERR_OR_NULL(o2hb_debug_dir)) { - ret = o2hb_debug_dir ? - PTR_ERR(o2hb_debug_dir) : -ENOMEM; + if (!o2hb_debug_dir) { mlog_errno(ret); goto bail; } @@ -1327,9 +1325,7 @@ static int o2hb_debug_init(void) sizeof(o2hb_live_node_bitmap), O2NM_MAX_NODES, o2hb_live_node_bitmap); - if (IS_ERR_OR_NULL(o2hb_debug_livenodes)) { - ret = o2hb_debug_livenodes ? - PTR_ERR(o2hb_debug_livenodes) : -ENOMEM; + if (!o2hb_debug_livenodes) { mlog_errno(ret); goto bail; } @@ -1342,9 +1338,7 @@ static int o2hb_debug_init(void) sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS, o2hb_live_region_bitmap); - if (IS_ERR_OR_NULL(o2hb_debug_liveregions)) { - ret = o2hb_debug_liveregions ? - PTR_ERR(o2hb_debug_liveregions) : -ENOMEM; + if (!o2hb_debug_liveregions) { mlog_errno(ret); goto bail; } @@ -1358,9 +1352,7 @@ static int o2hb_debug_init(void) sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS, o2hb_quorum_region_bitmap); - if (IS_ERR_OR_NULL(o2hb_debug_quorumregions)) { - ret = o2hb_debug_quorumregions ? - PTR_ERR(o2hb_debug_quorumregions) : -ENOMEM; + if (!o2hb_debug_quorumregions) { mlog_errno(ret); goto bail; } @@ -1374,9 +1366,7 @@ static int o2hb_debug_init(void) sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS, o2hb_failed_region_bitmap); - if (IS_ERR_OR_NULL(o2hb_debug_failedregions)) { - ret = o2hb_debug_failedregions ? - PTR_ERR(o2hb_debug_failedregions) : -ENOMEM; + if (!o2hb_debug_failedregions) { mlog_errno(ret); goto bail; } @@ -2010,8 +2000,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) reg->hr_debug_dir = debugfs_create_dir(config_item_name(®->hr_item), dir); - if (IS_ERR_OR_NULL(reg->hr_debug_dir)) { - ret = reg->hr_debug_dir ? PTR_ERR(reg->hr_debug_dir) : -ENOMEM; + if (!reg->hr_debug_dir) { mlog_errno(ret); goto bail; } @@ -2024,9 +2013,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) O2HB_DB_TYPE_REGION_LIVENODES, sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, reg); - if (IS_ERR_OR_NULL(reg->hr_debug_livenodes)) { - ret = reg->hr_debug_livenodes ? - PTR_ERR(reg->hr_debug_livenodes) : -ENOMEM; + if (!reg->hr_debug_livenodes) { mlog_errno(ret); goto bail; } @@ -2038,9 +2025,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) sizeof(*(reg->hr_db_regnum)), O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); - if (IS_ERR_OR_NULL(reg->hr_debug_regnum)) { - ret = reg->hr_debug_regnum ? - PTR_ERR(reg->hr_debug_regnum) : -ENOMEM; + if (!reg->hr_debug_regnum) { mlog_errno(ret); goto bail; } @@ -2052,9 +2037,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) sizeof(*(reg->hr_db_elapsed_time)), O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); - if (IS_ERR_OR_NULL(reg->hr_debug_elapsed_time)) { - ret = reg->hr_debug_elapsed_time ? - PTR_ERR(reg->hr_debug_elapsed_time) : -ENOMEM; + if (!reg->hr_debug_elapsed_time) { mlog_errno(ret); goto bail; } @@ -2066,16 +2049,13 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) sizeof(*(reg->hr_db_pinned)), O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); - if (IS_ERR_OR_NULL(reg->hr_debug_pinned)) { - ret = reg->hr_debug_pinned ? - PTR_ERR(reg->hr_debug_pinned) : -ENOMEM; + if (!reg->hr_debug_pinned) { mlog_errno(ret); goto bail; } - return 0; + ret = 0; bail: - debugfs_remove_recursive(reg->hr_debug_dir); return ret; } diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 4fda7a5f3088..290373024d9d 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -42,8 +42,8 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) { unsigned long gen = - OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; - BUG_ON(dentry->d_inode); + OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; + BUG_ON(d_inode(dentry)); dentry->d_fsdata = (void *)gen; } @@ -57,7 +57,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); osb = OCFS2_SB(dentry->d_sb); trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, @@ -71,7 +71,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) unsigned long gen = (unsigned long) dentry->d_fsdata; unsigned long pgen; spin_lock(&dentry->d_lock); - pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; spin_unlock(&dentry->d_lock); trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, dentry->d_name.name, @@ -146,7 +146,7 @@ static int ocfs2_match_dentry(struct dentry *dentry, if (skip_unhashed && d_unhashed(dentry)) return 0; - parent = dentry->d_parent->d_inode; + parent = d_inode(dentry->d_parent); /* Negative parent dentry? */ if (!parent) return 0; @@ -243,7 +243,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, if (!inode) return 0; - if (!dentry->d_inode && dentry->d_fsdata) { + if (d_really_is_negative(dentry) && dentry->d_fsdata) { /* Converting a negative dentry to positive Clear dentry->d_fsdata */ dentry->d_fsdata = dl = NULL; @@ -446,7 +446,7 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, { int ret; struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); /* * Move within the same directory, so the actual lock info won't diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index f0344b75b14d..3d8639f38973 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -72,7 +72,7 @@ static inline int ocfs2_add_entry(handle_t *handle, struct buffer_head *parent_fe_bh, struct ocfs2_dir_lookup_result *lookup) { - return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, + return __ocfs2_add_entry(handle, d_inode(dentry->d_parent), dentry->d_name.name, dentry->d_name.len, inode, blkno, parent_fe_bh, lookup); } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a6944b25fd5b..fdf4b41d0609 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -757,6 +757,19 @@ lookup: if (tmpres) { spin_unlock(&dlm->spinlock); spin_lock(&tmpres->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. + */ + if (hlist_unhashed(&tmpres->hash_node)) { + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + /* Wait on the thread that is mastering the resource */ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { __dlm_wait_on_lockres(tmpres); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 061ba6a91bf2..b5cf27dcb18a 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -208,7 +208,7 @@ static int dlmfs_file_release(struct inode *inode, static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; error = inode_change_ok(inode, attr); @@ -549,7 +549,7 @@ static int dlmfs_unlink(struct inode *dir, struct dentry *dentry) { int status; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mlog(0, "unlink inode %lu\n", inode->i_ino); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 956edf67be20..8b23aa2f52dd 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2959,7 +2959,7 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); - if (IS_ERR_OR_NULL(dlm_debug->d_locking_state)) { + if (!dlm_debug->d_locking_state) { ret = -EINVAL; mlog(ML_ERROR, "Unable to create locking state debugfs file.\n"); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 540dc4bdd042..827fc9809bc2 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -147,7 +147,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) int status; u64 blkno; struct dentry *parent; - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 913fc250d85a..d8b670cbd909 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1126,7 +1126,7 @@ out: int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; @@ -1275,8 +1275,8 @@ int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; - struct super_block *sb = dentry->d_inode->i_sb; + struct inode *inode = d_inode(dentry); + struct super_block *sb = d_inode(dentry)->i_sb; struct ocfs2_super *osb = sb->s_fs_info; int err; @@ -2114,7 +2114,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, { int ret = 0, meta_level = 0; struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); loff_t end; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index be71ca0937f7..b254416dc8d9 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1209,7 +1209,7 @@ int ocfs2_drop_inode(struct inode *inode) */ int ocfs2_inode_revalidate(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int status = 0; trace_ocfs2_inode_revalidate(inode, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 09f90cbf0e24..176fe6afd94e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -689,8 +689,8 @@ static int ocfs2_link(struct dentry *old_dentry, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; - struct inode *old_dir = old_dentry->d_parent->d_inode; + struct inode *inode = d_inode(old_dentry); + struct inode *old_dir = d_inode(old_dentry->d_parent); int err; struct buffer_head *fe_bh = NULL; struct buffer_head *old_dir_bh = NULL; @@ -879,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir, int status; int child_locked = 0; bool is_unlinkable = false; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); u64 blkno; @@ -898,7 +898,7 @@ static int ocfs2_unlink(struct inode *dir, dquot_initialize(dir); - BUG_ON(dentry->d_parent->d_inode != dir); + BUG_ON(d_inode(dentry->d_parent) != dir); if (inode == osb->root_inode) return -EPERM; @@ -1209,8 +1209,8 @@ static int ocfs2_rename(struct inode *old_dir, { int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct inode *orphan_dir = NULL; struct ocfs2_dinode *newfe = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; @@ -1454,7 +1454,7 @@ static int ocfs2_rename(struct inode *old_dir, should_add_orphan = true; } } else { - BUG_ON(new_dentry->d_parent->d_inode != new_dir); + BUG_ON(d_inode(new_dentry->d_parent) != new_dir); status = ocfs2_check_dir_for_entry(new_dir, new_dentry->d_name.name, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index df3a500789c7..d8c6af101f3f 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4194,7 +4194,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, bool preserve) { int ret; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct buffer_head *new_bh = NULL; if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { @@ -4263,7 +4263,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { int error; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; struct posix_acl *default_acl, *acl; @@ -4357,7 +4357,7 @@ out: /* copied from may_create in VFS. */ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) { - if (child->d_inode) + if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; @@ -4375,7 +4375,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; if (!inode) @@ -4463,7 +4463,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, } error = ocfs2_vfs_reflink(old_path.dentry, - new_path.dentry->d_inode, + d_inode(new_path.dentry), new_dentry, preserve); out_dput: done_path_create(&new_path, new_dentry); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 837ddce4b659..403c5660b306 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1112,7 +1112,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - if (IS_ERR_OR_NULL(osb->osb_debug_root)) { + if (!osb->osb_debug_root) { status = -EINVAL; mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); goto read_super_error; @@ -1122,7 +1122,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); - if (IS_ERR_OR_NULL(osb->osb_ctxt)) { + if (!osb->osb_ctxt) { status = -EINVAL; mlog_errno(status); goto read_super_error; @@ -1606,9 +1606,8 @@ static int __init ocfs2_init(void) } ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); - if (IS_ERR_OR_NULL(ocfs2_debugfs_root)) { - status = ocfs2_debugfs_root ? - PTR_ERR(ocfs2_debugfs_root) : -ENOMEM; + if (!ocfs2_debugfs_root) { + status = -ENOMEM; mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); goto out4; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4ca7533be479..d03bfbf3d27d 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1020,7 +1020,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, int ret = 0, i_ret = 0, b_ret = 0; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + struct ocfs2_inode_info *oi = OCFS2_I(d_inode(dentry)); if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) return -EOPNOTSUPP; @@ -1028,7 +1028,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return ret; - ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + ret = ocfs2_inode_lock(d_inode(dentry), &di_bh, 0); if (ret < 0) { mlog_errno(ret); return ret; @@ -1037,7 +1037,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, di = (struct ocfs2_dinode *)di_bh->b_data; down_read(&oi->ip_xattr_sem); - i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + i_ret = ocfs2_xattr_ibody_list(d_inode(dentry), di, buffer, size); if (i_ret < 0) b_ret = 0; else { @@ -1045,13 +1045,13 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, buffer += i_ret; size -= i_ret; } - b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + b_ret = ocfs2_xattr_block_list(d_inode(dentry), di, buffer, size); if (b_ret < 0) i_ret = 0; } up_read(&oi->ip_xattr_sem); - ocfs2_inode_unlock(dentry->d_inode, 0); + ocfs2_inode_unlock(d_inode(dentry), 0); brelse(di_bh); @@ -7257,7 +7257,7 @@ static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -7267,7 +7267,7 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7347,7 +7347,7 @@ static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -7357,7 +7357,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } @@ -7399,7 +7399,7 @@ static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name, + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, buffer, size); } @@ -7413,7 +7413,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER, + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 1b8e9e8405b2..f833bf8d5792 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -110,7 +110,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb) static int omfs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct omfs_inode *oi; @@ -155,7 +155,7 @@ out: static int omfs_delete_entry(struct dentry *dentry) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct inode *dirty; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; @@ -237,7 +237,7 @@ static int omfs_dir_is_empty(struct inode *inode) static int omfs_remove(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int ret; @@ -373,8 +373,8 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *new_inode = new_dentry->d_inode; - struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); int err; if (new_inode) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index f993be7f2156..d9e26cfbb793 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -346,7 +346,7 @@ const struct file_operations omfs_file_operations = { static int omfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/open.c b/fs/open.c index 6796f04d6032..98e5a52dc68c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -231,8 +231,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + if (mode & ~FALLOC_FL_SUPPORTED_MASK) return -EOPNOTSUPP; /* Punch hole and zero range are mutually exclusive */ @@ -250,6 +249,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) (mode & ~FALLOC_FL_COLLAPSE_RANGE)) return -EINVAL; + /* Insert range should only be used exclusively. */ + if ((mode & FALLOC_FL_INSERT_RANGE) && + (mode & ~FALLOC_FL_INSERT_RANGE)) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; diff --git a/fs/pipe.c b/fs/pipe.c index 822da5b7cff0..8865f7963700 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -627,7 +627,7 @@ static struct vfsmount *pipe_mnt __read_mostly; static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", - dentry->d_inode->i_ino); + d_inode(dentry)->i_ino); } static const struct dentry_operations pipefs_dentry_operations = { diff --git a/fs/pnode.c b/fs/pnode.c index 260ac8f898a4..6367e1e435c6 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -362,6 +362,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) } /* + * Clear MNT_LOCKED when it can be shown to be safe. + * + * mount_lock lock must be held for write + */ +void propagate_mount_unlock(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m, *child; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + if (child) + child->mnt.mnt_flags &= ~MNT_LOCKED; + } +} + +/* + * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted. + */ +static void mark_umount_candidates(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + struct mount *child = __lookup_mnt_last(&m->mnt, + mnt->mnt_mountpoint); + if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { + SET_MNT_MARK(child); + } + } +} + +/* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ @@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt) struct mount *child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); /* - * umount the child only if the child has no - * other children + * umount the child only if the child has no children + * and the child is marked safe to unmount. */ - if (child && list_empty(&child->mnt_mounts)) { + if (!child || !IS_MNT_MARKED(child)) + continue; + CLEAR_MNT_MARK(child); + if (list_empty(&child->mnt_mounts)) { list_del_init(&child->mnt_child); - hlist_del_init_rcu(&child->mnt_hash); - hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); + child->mnt.mnt_flags |= MNT_UMOUNT; + list_move_tail(&child->mnt_list, &mnt->mnt_list); } } } @@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt) * * vfsmount lock must be held for write */ -int propagate_umount(struct hlist_head *list) +int propagate_umount(struct list_head *list) { struct mount *mnt; - hlist_for_each_entry(mnt, list, mnt_hash) + list_for_each_entry_reverse(mnt, list, mnt_list) + mark_umount_candidates(mnt); + + list_for_each_entry(mnt, list, mnt_list) __propagate_umount(mnt); return 0; } diff --git a/fs/pnode.h b/fs/pnode.h index 4a246358b031..7114ce6e6b9e 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -19,6 +19,9 @@ #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) +#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) +#define IS_MNT_LOCKED_AND_LAZY(m) \ + (((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 @@ -40,14 +43,14 @@ static inline void set_mnt_shared(struct mount *mnt) void change_mnt_propagation(struct mount *, int); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); -int propagate_umount(struct hlist_head *); +int propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); +void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); -void umount_tree(struct mount *, int); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 3a48bb789c9f..84bb65b83570 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -774,12 +774,12 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name, struct posix_acl *acl; int error; - if (!IS_POSIXACL(dentry->d_inode)) + if (!IS_POSIXACL(d_backing_inode(dentry))) return -EOPNOTSUPP; if (d_is_symlink(dentry)) return -EOPNOTSUPP; - acl = get_acl(dentry->d_inode, type); + acl = get_acl(d_backing_inode(dentry), type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -795,7 +795,7 @@ static int posix_acl_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); struct posix_acl *acl = NULL; int ret; @@ -834,7 +834,7 @@ posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, const char *xname; size_t size; - if (!IS_POSIXACL(dentry->d_inode)) + if (!IS_POSIXACL(d_backing_inode(dentry))) return -EOPNOTSUPP; if (d_is_symlink(dentry)) return -EOPNOTSUPP; diff --git a/fs/proc/base.c b/fs/proc/base.c index 7a3b82f986dd..093ca14f5701 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -169,7 +169,7 @@ static int get_task_root(struct task_struct *task, struct path *root) static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(dentry->d_inode); + struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { @@ -186,7 +186,7 @@ static int proc_cwd_link(struct dentry *dentry, struct path *path) static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(dentry->d_inode); + struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { @@ -514,7 +514,7 @@ static int proc_fd_access_allowed(struct inode *inode) int proc_setattr(struct dentry *dentry, struct iattr *attr) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (attr->ia_valid & ATTR_MODE) return -EPERM; @@ -1362,7 +1362,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(dentry->d_inode); + task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1382,7 +1382,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct path path; int error = -EACCES; @@ -1427,7 +1427,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { int error = -EACCES; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct path path; /* Are we allowed to snoop on the tasks file descriptors? */ @@ -1497,7 +1497,7 @@ out_unlock: int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct task_struct *task; const struct cred *cred; struct pid_namespace *pid = dentry->d_sb->s_fs_info; @@ -1554,7 +1554,7 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); task = get_proc_task(inode); if (task) { @@ -1588,7 +1588,7 @@ int pid_delete_dentry(const struct dentry *dentry) * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return proc_inode_is_dead(dentry->d_inode); + return proc_inode_is_dead(d_inode(dentry)); } const struct dentry_operations pid_dentry_operations = @@ -1626,12 +1626,12 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, child = d_alloc(dir, &qname); if (!child) goto end_instantiate; - if (instantiate(dir->d_inode, child, task, ptr) < 0) { + if (instantiate(d_inode(dir), child, task, ptr) < 0) { dput(child); goto end_instantiate; } } - inode = child->d_inode; + inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); @@ -1674,7 +1674,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_notask; } - inode = dentry->d_inode; + inode = d_inode(dentry); task = get_proc_task(inode); if (!task) goto out_notask; @@ -1727,7 +1727,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path) int rc; rc = -ENOENT; - task = get_proc_task(dentry->d_inode); + task = get_proc_task(d_inode(dentry)); if (!task) goto out; @@ -2863,13 +2863,13 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = ns->proc_self->d_inode; + struct inode *inode = d_inode(ns->proc_self); if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = ns->proc_thread_self->d_inode; + struct inode *inode = d_inode(ns->proc_thread_self); if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; @@ -3188,7 +3188,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct task_struct *p = get_proc_task(inode); generic_fillattr(inode, stat); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index af84ad04df77..6e5fcd00733e 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -91,7 +91,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); task = get_proc_task(inode); fd = proc_fd(inode); @@ -151,14 +151,14 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) struct task_struct *task; int ret = -ENOENT; - task = get_proc_task(dentry->d_inode); + task = get_proc_task(d_inode(dentry)); if (task) { files = get_files_struct(task); put_task_struct(task); } if (files) { - int fd = proc_fd(dentry->d_inode); + int fd = proc_fd(d_inode(dentry)); struct file *fd_file; spin_lock(&files->file_lock); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index be65b2082135..df6327a2b865 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -101,7 +101,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; @@ -120,7 +120,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); if (de && de->nlink) set_nlink(inode, de->nlink); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7697b6621cfd..8272aaba1bb0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -396,7 +396,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = { static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct proc_dir_entry *pde = PDE(dentry->d_inode); + struct proc_dir_entry *pde = PDE(d_inode(dentry)); if (unlikely(!use_pde(pde))) return ERR_PTR(-EINVAL); nd_set_link(nd, pde->data); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index c9eac4563fa8..e512642dbbdc 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -32,7 +32,7 @@ static const struct proc_ns_operations *ns_entries[] = { static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; @@ -53,7 +53,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; char name[50]; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 1bde894bc624..350984a19c83 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -142,7 +142,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct net *net; net = get_proc_task_net(inode); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f92d5dd578a4..fea2561d773b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -604,7 +604,7 @@ static bool proc_sys_fill_cache(struct file *file, return false; } } - inode = child->d_inode; + inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); @@ -710,7 +710,7 @@ static int proc_sys_permission(struct inode *inode, int mask) static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) @@ -727,7 +727,7 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; @@ -773,12 +773,12 @@ static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; - return !PROC_I(dentry->d_inode)->sysctl->unregistering; + return !PROC_I(d_inode(dentry))->sysctl->unregistering; } static int proc_sys_delete(const struct dentry *dentry) { - return !!PROC_I(dentry->d_inode)->sysctl->unregistering; + return !!PROC_I(d_inode(dentry))->sysctl->unregistering; } static int sysctl_is_seen(struct ctl_table_header *p) @@ -805,7 +805,7 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *de /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ /* AV: can it, indeed? */ - inode = ACCESS_ONCE(dentry->d_inode); + inode = d_inode_rcu(dentry); if (!inode) return 1; if (name->len != len) diff --git a/fs/proc/root.c b/fs/proc/root.c index e74ac9f1a2c0..b7fa4bfe896a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -195,7 +195,7 @@ void __init proc_root_init(void) static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat ) { - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 4348bb8907c2..6195b4a7c3b1 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -46,7 +46,7 @@ static unsigned self_inum; int proc_setup_self(struct super_block *s) { - struct inode *root_inode = s->s_root->d_inode; + struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = s->s_fs_info; struct dentry *self; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 59075b509df3..a8371993b4fb 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -47,7 +47,7 @@ static unsigned thread_self_inum; int proc_setup_thread_self(struct super_block *s) { - struct inode *root_inode = s->s_root->d_inode; + struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 56e1ffda4d89..dc43b5f29305 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -190,7 +190,7 @@ static const struct file_operations pstore_file_operations = { */ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { - struct pstore_private *p = dentry->d_inode->i_private; + struct pstore_private *p = d_inode(dentry)->i_private; int err; err = pstore_check_syslog_permissions(p); @@ -199,7 +199,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) if (p->psi->erase) p->psi->erase(p->type, p->id, p->count, - dentry->d_inode->i_ctime, p->psi); + d_inode(dentry)->i_ctime, p->psi); else return -EPERM; @@ -376,7 +376,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, break; } - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_alloc_name(root, name); if (!dentry) @@ -396,12 +396,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return 0; fail_lockedalloc: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); kfree(private); fail_alloc: iput(inode); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 44e73923670d..32d2e1a9774c 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -182,7 +182,7 @@ static const char *qnx6_checkroot(struct super_block *s) static char match_root[2][3] = {".\0\0", "..\0"}; int i, error = 0; struct qnx6_dir_entry *dir_entry; - struct inode *root = s->s_root->d_inode; + struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; struct page *page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ecc25cf0ee6e..20d1f74561cf 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2328,7 +2328,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) error = -EXDEV; else - error = vfs_load_quota_inode(path->dentry->d_inode, type, + error = vfs_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; @@ -2392,20 +2392,20 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - mutex_lock(&sb->s_root->d_inode->i_mutex); + mutex_lock(&d_inode(sb->s_root)->i_mutex); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - mutex_unlock(&sb->s_root->d_inode->i_mutex); + mutex_unlock(&d_inode(sb->s_root)->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { error = -ENOENT; goto out; } error = security_quota_on(dentry); if (!error) - error = vfs_load_quota_inode(dentry->d_inode, type, format_id, + error = vfs_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); out: diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0b38befa69f3..ba1323a94924 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -163,7 +163,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) */ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int old_ia_valid = ia->ia_valid; int ret = 0; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 0a7dc941aaf4..4a024e2ceb9f 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -53,8 +53,8 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh) { struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root; - return (privroot->d_inode && - deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); + return (d_really_is_positive(privroot) && + deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid); } int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 742242b60972..f6f2fbad9777 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3308,7 +3308,7 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index cd11358b10c7..b55a074653d7 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -400,7 +400,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child) struct inode *inode = NULL; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); if (dir->i_nlink == 0) { return ERR_PTR(-ENOENT); @@ -917,7 +917,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; } - inode = dentry->d_inode; + inode = d_inode(dentry); reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); @@ -987,7 +987,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) dquot_initialize(dir); - inode = dentry->d_inode; + inode = d_inode(dentry); /* * in this transaction we can be doing at max two balancings and @@ -1174,7 +1174,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int retval; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct reiserfs_transaction_handle th; /* * We need blocks for transaction + update of quotas for @@ -1311,8 +1311,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, dquot_initialize(old_dir); dquot_initialize(new_dir); - old_inode = old_dentry->d_inode; - new_dentry_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_dentry_inode = d_inode(new_dentry); /* * make sure that oldname still exists and points to an object we diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 68b5f182984e..0111ad0466ed 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1687,7 +1687,7 @@ static __u32 find_hash_out(struct super_block *s) __u32 hash = DEFAULT_HASH; __u32 deh_hashval, teahash, r5hash, yurahash; - inode = s->s_root->d_inode; + inode = d_inode(s->s_root); make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); retval = search_by_entry_key(s, &key, &path, &de); @@ -2347,7 +2347,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, err = -EXDEV; goto out; } - inode = path->dentry->d_inode; + inode = d_inode(path->dentry); /* * We must not pack tails for quota files on reiserfs for quota * IO to work diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4e781e697c90..e87f9b52bf06 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -87,9 +87,9 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); if (!error) d_delete(dentry); @@ -102,11 +102,11 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) - dentry->d_inode->i_flags |= S_DEAD; - mutex_unlock(&dentry->d_inode->i_mutex); + d_inode(dentry)->i_flags |= S_DEAD; + mutex_unlock(&d_inode(dentry)->i_mutex); if (!error) d_delete(dentry); @@ -120,26 +120,26 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) struct dentry *privroot = REISERFS_SB(sb)->priv_root; struct dentry *xaroot; - if (!privroot->d_inode) + if (d_really_is_negative(privroot)) return ERR_PTR(-ENODATA); - mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) xaroot = ERR_PTR(-ENODATA); - else if (!xaroot->d_inode) { + else if (d_really_is_negative(xaroot)) { int err = -ENODATA; if (xattr_may_create(flags)) - err = xattr_mkdir(privroot->d_inode, xaroot, 0700); + err = xattr_mkdir(d_inode(privroot), xaroot, 0700); if (err) { dput(xaroot); xaroot = ERR_PTR(err); } } - mutex_unlock(&privroot->d_inode->i_mutex); + mutex_unlock(&d_inode(privroot)->i_mutex); return xaroot; } @@ -156,21 +156,21 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); - if (!IS_ERR(xadir) && !xadir->d_inode) { + if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { int err = -ENODATA; if (xattr_may_create(flags)) - err = xattr_mkdir(xaroot->d_inode, xadir, 0700); + err = xattr_mkdir(d_inode(xaroot), xadir, 0700); if (err) { dput(xadir); xadir = ERR_PTR(err); } } - mutex_unlock(&xaroot->d_inode->i_mutex); + mutex_unlock(&d_inode(xaroot)->i_mutex); dput(xaroot); return xadir; } @@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, container_of(ctx, struct reiserfs_dentry_buf, ctx); struct dentry *dentry; - WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); + WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -207,7 +207,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { return PTR_ERR(dentry); - } else if (!dentry->d_inode) { + } else if (d_really_is_negative(dentry)) { /* A directory entry exists, but no file? */ reiserfs_error(dentry->d_sb, "xattr-20003", "Corrupted directory: xattr %pd listed but " @@ -249,16 +249,16 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (IS_ERR(dir)) { err = PTR_ERR(dir); goto out; - } else if (!dir->d_inode) { + } else if (d_really_is_negative(dir)) { err = 0; goto out_dir; } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); buf.xadir = dir; while (1) { - err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); if (err) break; if (!buf.count) @@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, break; buf.count = 0; } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); cleanup_dentry_buf(&buf); @@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (!err) { int jerror; - mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, + mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&dir->d_parent->d_inode->i_mutex); + mutex_unlock(&d_inode(dir->d_parent)->i_mutex); err = jerror ?: err; } } @@ -319,7 +319,7 @@ out: static int delete_one_xattr(struct dentry *dentry, void *data) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); /* This is the xattr dir, handle specially. */ if (d_is_dir(dentry)) @@ -384,27 +384,27 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); - mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); goto out; } - if (xafile->d_inode && (flags & XATTR_CREATE)) + if (d_really_is_positive(xafile) && (flags & XATTR_CREATE)) err = -EEXIST; - if (!xafile->d_inode) { + if (d_really_is_negative(xafile)) { err = -ENODATA; if (xattr_may_create(flags)) - err = xattr_create(xadir->d_inode, xafile, + err = xattr_create(d_inode(xadir), xafile, 0700|S_IFREG); } if (err) dput(xafile); out: - mutex_unlock(&xadir->d_inode->i_mutex); + mutex_unlock(&d_inode(xadir)->i_mutex); dput(xadir); if (err) return ERR_PTR(err); @@ -469,21 +469,21 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); - mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_dput; } - if (dentry->d_inode) { - err = xattr_unlink(xadir->d_inode, dentry); + if (d_really_is_positive(dentry)) { + err = xattr_unlink(d_inode(xadir), dentry); update_ctime(inode); } dput(dentry); out_dput: - mutex_unlock(&xadir->d_inode->i_mutex); + mutex_unlock(&d_inode(xadir)->i_mutex); dput(xadir); return err; } @@ -533,7 +533,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, else chunk = buffer_size - buffer_pos; - page = reiserfs_get_page(dentry->d_inode, file_pos); + page = reiserfs_get_page(d_inode(dentry), file_pos); if (IS_ERR(page)) { err = PTR_ERR(page); goto out_unlock; @@ -573,18 +573,18 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, } new_size = buffer_size + sizeof(struct reiserfs_xattr_header); - if (!err && new_size < i_size_read(dentry->d_inode)) { + if (!err && new_size < i_size_read(d_inode(dentry))) { struct iattr newattrs = { .ia_ctime = current_fs_time(inode->i_sb), .ia_size = new_size, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - inode_dio_wait(dentry->d_inode); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_dio_wait(d_inode(dentry)); err = reiserfs_setattr(dentry, &newattrs); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); } else update_ctime(inode); out_unlock: @@ -657,7 +657,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, down_read(&REISERFS_I(inode)->i_xattr_sem); - isize = i_size_read(dentry->d_inode); + isize = i_size_read(d_inode(dentry)); /* Just return the size needed */ if (buffer == NULL) { @@ -680,7 +680,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, else chunk = isize - file_pos; - page = reiserfs_get_page(dentry->d_inode, file_pos); + page = reiserfs_get_page(d_inode(dentry), file_pos); if (IS_ERR(page)) { err = PTR_ERR(page); goto out_unlock; @@ -775,7 +775,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; return handler->get(dentry, name, buffer, size, handler->flags); @@ -784,7 +784,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, /* * Inode operation setxattr() * - * dentry->d_inode->i_mutex down + * d_inode(dentry)->i_mutex down */ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, @@ -794,7 +794,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; return handler->set(dentry, name, value, size, flags, handler->flags); @@ -803,7 +803,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, /* * Inode operation removexattr() * - * dentry->d_inode->i_mutex down + * d_inode(dentry)->i_mutex down */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { @@ -811,7 +811,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); @@ -875,14 +875,14 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) .size = buffer ? size : 0, }; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return -EINVAL; if (!dentry->d_sb->s_xattr || - get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; - dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE); + dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE); if (IS_ERR(dir)) { err = PTR_ERR(dir); if (err == -ENODATA) @@ -890,9 +890,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) goto out; } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); + mutex_unlock(&d_inode(dir)->i_mutex); if (!err) err = buf.pos; @@ -905,12 +905,12 @@ out: static int create_privroot(struct dentry *dentry) { int err; - struct inode *inode = dentry->d_parent->d_inode; + struct inode *inode = d_inode(dentry->d_parent); WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); err = xattr_mkdir(inode, dentry, 0700); - if (err || !dentry->d_inode) { + if (err || d_really_is_negative(dentry)) { reiserfs_warning(dentry->d_sb, "jdm-20006", "xattrs/ACLs enabled and couldn't " "find/create .reiserfs_priv. " @@ -918,7 +918,7 @@ static int create_privroot(struct dentry *dentry) return -EOPNOTSUPP; } - dentry->d_inode->i_flags |= S_PRIVATE; + d_inode(dentry)->i_flags |= S_PRIVATE; reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " "storage.\n", PRIVROOT_NAME); @@ -997,17 +997,17 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&s->s_root->d_inode->i_mutex); + mutex_lock(&d_inode(s->s_root)->i_mutex); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; d_set_d_op(dentry, &xattr_lookup_poison_ops); - if (dentry->d_inode) - dentry->d_inode->i_flags |= S_PRIVATE; + if (d_really_is_positive(dentry)) + d_inode(dentry)->i_flags |= S_PRIVATE; } else err = PTR_ERR(dentry); - mutex_unlock(&s->s_root->d_inode->i_mutex); + mutex_unlock(&d_inode(s->s_root)->i_mutex); return err; } @@ -1026,15 +1026,15 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (err) goto error; - if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { - mutex_lock(&s->s_root->d_inode->i_mutex); + if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { + mutex_lock(&d_inode(s->s_root)->i_mutex); err = create_privroot(REISERFS_SB(s)->priv_root); - mutex_unlock(&s->s_root->d_inode->i_mutex); + mutex_unlock(&d_inode(s->s_root)->i_mutex); } - if (privroot->d_inode) { + if (d_really_is_positive(privroot)) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&privroot->d_inode->i_mutex); + mutex_lock(&d_inode(privroot)->i_mutex); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; @@ -1045,7 +1045,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) else err = PTR_ERR(dentry); } - mutex_unlock(&privroot->d_inode->i_mutex); + mutex_unlock(&d_inode(privroot)->i_mutex); } error: diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index f620e9678dd5..15dde6262c00 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -78,7 +78,7 @@ static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode) if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) { nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); - if (!REISERFS_SB(inode->i_sb)->xattr_root->d_inode) + if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root)) nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index e7f8939a4cb5..9a3b0616f283 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -15,10 +15,10 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(dentry->d_inode)) + if (IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); } static int @@ -28,10 +28,10 @@ security_set(struct dentry *dentry, const char *name, const void *buffer, if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(dentry->d_inode)) + if (IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } static size_t security_list(struct dentry *dentry, char *list, size_t list_len, @@ -39,7 +39,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len, { const size_t len = namelen + 1; - if (IS_PRIVATE(dentry->d_inode)) + if (IS_PRIVATE(d_inode(dentry))) return 0; if (list && len <= list_len) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 5eeb0c48ba46..e4f1343714e0 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -14,10 +14,10 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); } static int @@ -27,10 +27,10 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer, if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, @@ -38,7 +38,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, { const size_t len = name_len + 1; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return 0; if (list && len <= list_size) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index e50eab046471..d0b08d3e5689 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -15,7 +15,7 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, return -EINVAL; if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); } static int @@ -27,7 +27,7 @@ user_set(struct dentry *dentry, const char *name, const void *buffer, if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } static size_t user_list(struct dentry *dentry, char *list, size_t list_size, diff --git a/fs/splice.c b/fs/splice.c index 476024bb6546..4f355a1c1a9e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -261,6 +261,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, return ret; } +EXPORT_SYMBOL_GPL(splice_to_pipe); void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { @@ -1161,7 +1162,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, long ret, bytes; umode_t i_mode; size_t len; - int i, flags; + int i, flags, more; /* * We require the input being a regular file, as we don't want to @@ -1204,6 +1205,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * Don't block on output, we have to drain the direct pipe. */ sd->flags &= ~SPLICE_F_NONBLOCK; + more = sd->flags & SPLICE_F_MORE; while (len) { size_t read_len; @@ -1217,6 +1219,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, sd->total_len = read_len; /* + * If more data is pending, set SPLICE_F_MORE + * If this is the last data and SPLICE_F_MORE was not set + * initially, clears it. + */ + if (read_len < len) + sd->flags |= SPLICE_F_MORE; + else if (!more) + sd->flags &= ~SPLICE_F_MORE; + /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 5e1101ff276f..8073b6532cf0 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -110,7 +110,7 @@ static struct dentry *squashfs_fh_to_parent(struct super_block *sb, static struct dentry *squashfs_get_parent(struct dentry *child) { - struct inode *inode = child->d_inode; + struct inode *inode = d_inode(child); unsigned int parent_ino = squashfs_i(inode)->parent; return squashfs_export_iget(inode->i_sb, parent_ino); diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 92fcde7b4d61..e5e0ddf5b143 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -39,7 +39,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int); ssize_t squashfs_listxattr(struct dentry *d, char *buffer, size_t buffer_size) { - struct inode *inode = d->d_inode; + struct inode *inode = d_inode(d); struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) @@ -229,7 +229,7 @@ static int squashfs_user_get(struct dentry *d, const char *name, void *buffer, if (name[0] == '\0') return -EINVAL; - return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name, + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name, buffer, size); } @@ -259,7 +259,7 @@ static int squashfs_trusted_get(struct dentry *d, const char *name, if (name[0] == '\0') return -EINVAL; - return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name, + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name, buffer, size); } @@ -286,7 +286,7 @@ static int squashfs_security_get(struct dentry *d, const char *name, if (name[0] == '\0') return -EINVAL; - return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name, + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name, buffer, size); } diff --git a/fs/stat.c b/fs/stat.c index 19636af5e75c..cccc1aab9a8b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(generic_fillattr); */ int vfs_getattr_nosec(struct path *path, struct kstat *stat) { - struct inode *inode = path->dentry->d_inode; + struct inode *inode = d_backing_inode(path->dentry); if (inode->i_op->getattr) return inode->i_op->getattr(path->mnt, path->dentry, stat); @@ -326,7 +326,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, retry: error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty); if (!error) { - struct inode *inode = path.dentry->d_inode; + struct inode *inode = d_backing_inode(path.dentry); error = empty ? -ENOENT : -EINVAL; if (inode->i_op->readlink) { diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index d42291d08215..8f3555f00c54 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -132,7 +132,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; - struct inode * dir = dentry->d_parent->d_inode; + struct inode * dir = d_inode(dentry->d_parent); unsigned long start, n; unsigned long npages = dir_pages(dir); struct page *page = NULL; @@ -176,7 +176,7 @@ found: int sysv_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct page *page = NULL; diff --git a/fs/sysv/file.c b/fs/sysv/file.c index a48e30410ad1..82ddc09061e2 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -30,7 +30,7 @@ const struct file_operations sysv_file_operations = { static int sysv_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 66bc316927e8..2fde40acf024 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -443,7 +443,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct super_block *s = dentry->d_sb; - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 731b2bbcaab3..11e83ed0b4bf 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -118,7 +118,7 @@ out_fail: static int sysv_link(struct dentry * old_dentry, struct inode * dir, struct dentry * dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -166,7 +166,7 @@ out_dir: static int sysv_unlink(struct inode * dir, struct dentry * dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct page * page; struct sysv_dir_entry * de; int err = -ENOENT; @@ -187,7 +187,7 @@ out: static int sysv_rmdir(struct inode * dir, struct dentry * dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = -ENOTEMPTY; if (sysv_empty_dir(inode)) { @@ -208,8 +208,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry) { - struct inode * old_inode = old_dentry->d_inode; - struct inode * new_inode = new_dentry->d_inode; + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct sysv_dir_entry * dir_de = NULL; struct page * old_page; diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c index 00d2f8a43e4e..d3fa0d703314 100644 --- a/fs/sysv/symlink.c +++ b/fs/sysv/symlink.c @@ -10,7 +10,7 @@ static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, (char *)SYSV_I(dentry->d_inode)->i_data); + nd_set_link(nd, (char *)SYSV_I(d_inode(dentry))->i_data); return NULL; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 02d1ee778df0..27060fc855d4 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -499,7 +499,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_inode *dir_ui = ubifs_inode(dir); int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); @@ -554,7 +554,7 @@ out_cancel: static int ubifs_unlink(struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ubifs_inode *dir_ui = ubifs_inode(dir); int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, budgeted = 1; @@ -646,7 +646,7 @@ static int check_dir_empty(struct ubifs_info *c, struct inode *dir) static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, budgeted = 1; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -662,7 +662,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino, dir->i_ino); ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - err = check_dir_empty(c, dentry->d_inode); + err = check_dir_empty(c, d_inode(dentry)); if (err) return err; @@ -970,8 +970,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct ubifs_info *c = old_dir->i_sb->s_fs_info; - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); int err, release, sync = 0, move = (new_dir != old_dir); int is_dir = S_ISDIR(old_inode->i_mode); @@ -1136,7 +1136,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { loff_t size; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 3ba3fef64e9e..35efc103c39c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1257,7 +1257,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, int ubifs_setattr(struct dentry *dentry, struct iattr *attr) { int err; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ubifs_info *c = inode->i_sb->s_fs_info; dbg_gen("ino %lu, mode %#x, ia_valid %#x", @@ -1302,7 +1302,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); + struct ubifs_inode *ui = ubifs_inode(d_inode(dentry)); nd_set_link(nd, ui->data); return NULL; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 90ae1a8439d9..0b9da5b6e0f9 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -930,8 +930,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, union ubifs_key key; struct ubifs_dent_node *dent, *dent2; int err, dlen1, dlen2, ilen, lnum, offs, len; - const struct inode *old_inode = old_dentry->d_inode; - const struct inode *new_inode = new_dentry->d_inode; + const struct inode *old_inode = d_inode(old_dentry); + const struct inode *new_inode = d_inode(new_dentry); int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 3659b1934500..96f3448b6eb4 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -364,15 +364,15 @@ int ubifs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", - name, dentry->d_inode->i_ino, dentry, size); + name, d_inode(dentry)->i_ino, dentry, size); - return setxattr(dentry->d_inode, name, value, size, flags); + return setxattr(d_inode(dentry), name, value, size, flags); } ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, size_t size) { - struct inode *inode, *host = dentry->d_inode; + struct inode *inode, *host = d_inode(dentry); struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_inode *ui; @@ -432,7 +432,7 @@ out_unlock: ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) { union ubifs_key key; - struct inode *host = dentry->d_inode; + struct inode *host = d_inode(dentry); struct ubifs_info *c = host->i_sb->s_fs_info; struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_dent_node *xent, *pxent = NULL; @@ -535,7 +535,7 @@ out_cancel: int ubifs_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode, *host = dentry->d_inode; + struct inode *inode, *host = d_inode(dentry); struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; diff --git a/fs/udf/file.c b/fs/udf/file.c index 5dadad9960b9..7a95b8fed302 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -249,7 +249,7 @@ const struct file_operations udf_file_operations = { static int udf_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 39661977c89c..5c03f0dfb98b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -551,7 +551,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct udf_fileident_bh fibh; struct fileIdentDesc cfi, *fi; int err; @@ -767,7 +767,7 @@ static int empty_dir(struct inode *dir) static int udf_rmdir(struct inode *dir, struct dentry *dentry) { int retval; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct udf_fileident_bh fibh; struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; @@ -809,7 +809,7 @@ out: static int udf_unlink(struct inode *dir, struct dentry *dentry) { int retval; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct udf_fileident_bh fibh; struct fileIdentDesc *fi; struct fileIdentDesc cfi; @@ -999,7 +999,7 @@ out_no_entry: static int udf_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct udf_fileident_bh fibh; struct fileIdentDesc cfi, *fi; int err; @@ -1038,8 +1038,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct udf_fileident_bh ofibh, nfibh; struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL; struct fileIdentDesc ocfi, ncfi; @@ -1179,7 +1179,7 @@ static struct dentry *udf_get_parent(struct dentry *child) struct fileIdentDesc cfi; struct udf_fileident_bh fibh; - if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) + if (!udf_find_entry(d_inode(child), &dotdot, &fibh, &cfi)) return ERR_PTR(-EACCES); if (fibh.sbh != fibh.ebh) @@ -1187,7 +1187,7 @@ static struct dentry *udf_get_parent(struct dentry *child) brelse(fibh.sbh); tloc = lelb_to_cpu(cfi.icb.extLocation); - inode = udf_iget(child->d_inode->i_sb, &tloc); + inode = udf_iget(d_inode(child)->i_sb, &tloc); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 0ecc2cebed8f..1bfe8cabff0f 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -311,7 +311,7 @@ found: */ int ufs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct super_block *sb = dir->i_sb; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index fd65deb4b5f0..e491a93a7e9a 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -165,7 +165,7 @@ out_fail: static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; lock_ufs(dir->i_sb); @@ -222,7 +222,7 @@ out_fail: static int ufs_unlink(struct inode *dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct ufs_dir_entry *de; struct page *page; int err = -ENOENT; @@ -244,7 +244,7 @@ out: static int ufs_rmdir (struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); int err= -ENOTEMPTY; lock_ufs(dir->i_sb); @@ -263,8 +263,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct ufs_dir_entry * dir_de = NULL; struct page *old_page; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 8092d3759a5e..b3bc3e7ae79d 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -144,10 +144,10 @@ static struct dentry *ufs_get_parent(struct dentry *child) struct qstr dot_dot = QSTR_INIT("..", 2); ino_t ino; - ino = ufs_inode_by_name(child->d_inode, &dot_dot); + ino = ufs_inode_by_name(d_inode(child), &dot_dot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ufs_iget(d_inode(child)->i_sb, ino)); } static const struct export_operations ufs_export_ops = { diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c index d283628b4778..5b537e2fdda3 100644 --- a/fs/ufs/symlink.c +++ b/fs/ufs/symlink.c @@ -34,7 +34,7 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ufs_inode_info *p = UFS_I(dentry->d_inode); + struct ufs_inode_info *p = UFS_I(d_inode(dentry)); nd_set_link(nd, (char*)p->i_u1.i_symlink); return NULL; } diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index f04f89fbd4d9..21154704c168 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -492,7 +492,7 @@ out: int ufs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; int error; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a6fbf4472017..516162be1398 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -260,6 +260,7 @@ xfs_alloc_fix_len( rlen = rlen - (k - args->mod); else rlen = rlen - args->prod + (args->mod - k); + /* casts to (int) catch length underflows */ if ((int)rlen < (int)args->minlen) return; ASSERT(rlen >= args->minlen && rlen <= args->maxlen); @@ -286,7 +287,8 @@ xfs_alloc_fix_minleft( if (diff >= 0) return 1; args->len += diff; /* shrink the allocated space */ - if (args->len >= args->minlen) + /* casts to (int) catch length underflows */ + if ((int)args->len >= (int)args->minlen) return 1; args->agbno = NULLAGBLOCK; return 0; @@ -315,6 +317,9 @@ xfs_alloc_fixup_trees( xfs_agblock_t nfbno2; /* second new free startblock */ xfs_extlen_t nflen1=0; /* first new free length */ xfs_extlen_t nflen2=0; /* second new free length */ + struct xfs_mount *mp; + + mp = cnt_cur->bc_mp; /* * Look up the record in the by-size tree if necessary. @@ -323,13 +328,13 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, i == 1 && nfbno1 == fbno && nflen1 == flen); #endif } else { if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } /* * Look up the record in the by-block tree if necessary. @@ -338,13 +343,13 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, i == 1 && nfbno1 == fbno && nflen1 == flen); #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } #ifdef DEBUG @@ -355,7 +360,7 @@ xfs_alloc_fixup_trees( bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, bnoblock->bb_numrecs == cntblock->bb_numrecs); } #endif @@ -386,25 +391,25 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* * Add new by-size btree entry(s). */ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } /* * Fix up the by-block btree entry(s). @@ -415,7 +420,7 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } else { /* * Update the by-block entry to start later|be shorter. @@ -429,10 +434,10 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } return 0; } @@ -682,7 +687,7 @@ xfs_alloc_ag_vextent_exact( error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(fbno <= args->agbno); /* @@ -783,7 +788,7 @@ xfs_alloc_find_best_extent( error = xfs_alloc_get_rec(*scur, sbno, slen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); /* @@ -946,7 +951,7 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (ltlen >= args->minlen) break; if ((error = xfs_btree_increment(cnt_cur, 0, &i))) @@ -966,7 +971,7 @@ restart: */ if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, <bnoa, <lena); if (ltlena < args->minlen) @@ -999,7 +1004,7 @@ restart: cnt_cur->bc_ptrs[0] = besti; if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; if (!xfs_alloc_fix_minleft(args)) { @@ -1088,7 +1093,7 @@ restart: if (bno_cur_lt) { if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, <bnoa, <lena); if (ltlena >= args->minlen) @@ -1104,7 +1109,7 @@ restart: if (bno_cur_gt) { if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, gtbno, gtlen, >bnoa, >lena); if (gtlena >= args->minlen) @@ -1303,7 +1308,7 @@ restart: error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); @@ -1342,7 +1347,7 @@ restart: * This can't happen in the second case above. */ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); if (rlen < args->maxlen) { xfs_agblock_t bestfbno; @@ -1362,13 +1367,13 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (flen < bestrlen) break; xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); if (rlen > bestrlen) { @@ -1383,7 +1388,7 @@ restart: if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); rlen = bestrlen; rbno = bestrbno; flen = bestflen; @@ -1408,7 +1413,7 @@ restart: if (!xfs_alloc_fix_minleft(args)) goto out_nominleft; rlen = args->len; - XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); /* * Allocate and initialize a cursor for the by-block tree. */ @@ -1422,7 +1427,7 @@ restart: cnt_cur = bno_cur = NULL; args->len = rlen; args->agbno = rbno; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(args->mp, args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); @@ -1467,7 +1472,7 @@ xfs_alloc_ag_vextent_small( if (i) { if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); } /* * Nothing in the btree, try the freelist. Make sure @@ -1493,7 +1498,7 @@ xfs_alloc_ag_vextent_small( } args->len = 1; args->agbno = fbno; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(args->mp, args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); @@ -1579,7 +1584,7 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * It's not contiguous, though. */ @@ -1591,7 +1596,8 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); + XFS_WANT_CORRUPTED_GOTO(mp, + ltbno + ltlen <= bno, error0); } } /* @@ -1606,7 +1612,7 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * It's not contiguous, though. */ @@ -1618,7 +1624,7 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); + XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0); } } /* @@ -1635,31 +1641,31 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Delete the old by-size entry on the right. */ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Delete the old by-block entry for the right block. */ if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Move the by-block cursor back to the left neighbor. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); #ifdef DEBUG /* * Check that this is the right record: delete didn't @@ -1672,7 +1678,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(mp, i == 1 && xxbno == ltbno && xxlen == ltlen, error0); } @@ -1695,17 +1701,17 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Back up the by-block cursor to the left neighbor, and * update its length. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); nbno = ltbno; nlen = len + ltlen; if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) @@ -1721,10 +1727,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Update the starting block and length of the right * neighbor in the by-block tree. @@ -1743,7 +1749,7 @@ xfs_free_ag_extent( nlen = len; if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); bno_cur = NULL; @@ -1752,10 +1758,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 0, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0); if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 15105dbc9e28..04e79d57bca6 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -86,8 +86,83 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, int move_count); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +/* + * attr3 block 'firstused' conversion helpers. + * + * firstused refers to the offset of the first used byte of the nameval region + * of an attr leaf block. The region starts at the tail of the block and expands + * backwards towards the middle. As such, firstused is initialized to the block + * size for an empty leaf block and is reduced from there. + * + * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k. + * The in-core firstused field is 32-bit and thus supports the maximum fsb size. + * The on-disk field is only 16-bit, however, and overflows at 64k. Since this + * only occurs at exactly 64k, we use zero as a magic on-disk value to represent + * the attr block size. The following helpers manage the conversion between the + * in-core and on-disk formats. + */ + +static void +xfs_attr3_leaf_firstused_from_disk( + struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + hdr3 = (struct xfs_attr3_leaf_hdr *) from; + to->firstused = be16_to_cpu(hdr3->firstused); + } else { + to->firstused = be16_to_cpu(from->hdr.firstused); + } + + /* + * Convert from the magic fsb size value to actual blocksize. This + * should only occur for empty blocks when the block size overflows + * 16-bits. + */ + if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) { + ASSERT(!to->count && !to->usedbytes); + ASSERT(geo->blksize > USHRT_MAX); + to->firstused = geo->blksize; + } +} + +static void +xfs_attr3_leaf_firstused_to_disk( + struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + uint32_t firstused; + + /* magic value should only be seen on disk */ + ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF); + + /* + * Scale down the 32-bit in-core firstused value to the 16-bit on-disk + * value. This only overflows at the max supported value of 64k. Use the + * magic on-disk value to represent block size in this case. + */ + firstused = from->firstused; + if (firstused > USHRT_MAX) { + ASSERT(from->firstused == geo->blksize); + firstused = XFS_ATTR3_LEAF_NULLOFF; + } + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + hdr3 = (struct xfs_attr3_leaf_hdr *) to; + hdr3->firstused = cpu_to_be16(firstused); + } else { + to->hdr.firstused = cpu_to_be16(firstused); + } +} + void xfs_attr3_leaf_hdr_from_disk( + struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from) { @@ -104,7 +179,7 @@ xfs_attr3_leaf_hdr_from_disk( to->magic = be16_to_cpu(hdr3->info.hdr.magic); to->count = be16_to_cpu(hdr3->count); to->usedbytes = be16_to_cpu(hdr3->usedbytes); - to->firstused = be16_to_cpu(hdr3->firstused); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = hdr3->holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { @@ -118,7 +193,7 @@ xfs_attr3_leaf_hdr_from_disk( to->magic = be16_to_cpu(from->hdr.info.magic); to->count = be16_to_cpu(from->hdr.count); to->usedbytes = be16_to_cpu(from->hdr.usedbytes); - to->firstused = be16_to_cpu(from->hdr.firstused); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = from->hdr.holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { @@ -129,10 +204,11 @@ xfs_attr3_leaf_hdr_from_disk( void xfs_attr3_leaf_hdr_to_disk( + struct xfs_da_geometry *geo, struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from) { - int i; + int i; ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || from->magic == XFS_ATTR3_LEAF_MAGIC); @@ -145,7 +221,7 @@ xfs_attr3_leaf_hdr_to_disk( hdr3->info.hdr.magic = cpu_to_be16(from->magic); hdr3->count = cpu_to_be16(from->count); hdr3->usedbytes = cpu_to_be16(from->usedbytes); - hdr3->firstused = cpu_to_be16(from->firstused); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); hdr3->holes = from->holes; hdr3->pad1 = 0; @@ -160,7 +236,7 @@ xfs_attr3_leaf_hdr_to_disk( to->hdr.info.magic = cpu_to_be16(from->magic); to->hdr.count = cpu_to_be16(from->count); to->hdr.usedbytes = cpu_to_be16(from->usedbytes); - to->hdr.firstused = cpu_to_be16(from->firstused); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); to->hdr.holes = from->holes; to->hdr.pad1 = 0; @@ -178,7 +254,7 @@ xfs_attr3_leaf_verify( struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr3_icleaf_hdr ichdr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; @@ -757,9 +833,10 @@ xfs_attr_shortform_allfit( struct xfs_attr3_icleaf_hdr leafhdr; int bytes; int i; + struct xfs_mount *mp = bp->b_target->bt_mount; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); bytes = sizeof(struct xfs_attr_sf_hdr); @@ -812,7 +889,7 @@ xfs_attr3_leaf_to_shortform( memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); /* XXX (dgc): buffer is about to be marked stale - why zero it? */ @@ -923,7 +1000,7 @@ xfs_attr3_leaf_to_node( btree = dp->d_ops->node_tree_p(node); leaf = bp2->b_addr; - xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); /* both on-disk, don't endian-flip twice */ @@ -988,7 +1065,7 @@ xfs_attr3_leaf_create( } ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; @@ -1073,7 +1150,7 @@ xfs_attr3_leaf_add( trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index >= 0 && args->index <= ichdr.count); entsize = xfs_attr_leaf_newentsize(args, NULL); @@ -1126,7 +1203,7 @@ xfs_attr3_leaf_add( tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); out_log_hdr: - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); @@ -1294,7 +1371,7 @@ xfs_attr3_leaf_compact( ichdr_dst->freemap[0].base; /* write the header back to initialise the underlying buffer */ - xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, @@ -1344,9 +1421,10 @@ xfs_attr_leaf_order( { struct xfs_attr3_icleaf_hdr ichdr1; struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_mount *mp = leaf1_bp->b_target->bt_mount; - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr); return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); } @@ -1388,8 +1466,8 @@ xfs_attr3_leaf_rebalance( ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2); ASSERT(ichdr2.count == 0); args = state->args; @@ -1490,8 +1568,8 @@ xfs_attr3_leaf_rebalance( ichdr1.count, count); } - xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); - xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2); xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); @@ -1684,7 +1762,7 @@ xfs_attr3_leaf_toosmall( */ blk = &state->path.blk[ state->path.active-1 ]; leaf = blk->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf); bytes = xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + ichdr.usedbytes; @@ -1740,7 +1818,7 @@ xfs_attr3_leaf_toosmall( if (error) return error; - xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr); bytes = state->args->geo->blksize - (state->args->geo->blksize >> 2) - @@ -1805,7 +1883,7 @@ xfs_attr3_leaf_remove( trace_xfs_attr_leaf_remove(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); ASSERT(args->index >= 0 && args->index < ichdr.count); @@ -1918,12 +1996,11 @@ xfs_attr3_leaf_remove( tmp = be16_to_cpu(entry->nameidx); } ichdr.firstused = tmp; - if (!ichdr.firstused) - ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN; + ASSERT(ichdr.firstused != 0); } else { ichdr.holes = 1; /* mark as needing compaction */ } - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); @@ -1957,8 +2034,8 @@ xfs_attr3_leaf_unbalance( drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); - xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf); entry = xfs_attr3_leaf_entryp(drop_leaf); /* @@ -2012,7 +2089,7 @@ xfs_attr3_leaf_unbalance( tmphdr.firstused = state->args->geo->blksize; /* write the header to the temp buffer to initialise it */ - xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr); if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { @@ -2039,7 +2116,7 @@ xfs_attr3_leaf_unbalance( kmem_free(tmp_leaf); } - xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->args->geo->blksize - 1); @@ -2085,7 +2162,7 @@ xfs_attr3_leaf_lookup_int( trace_xfs_attr_leaf_lookup(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); ASSERT(ichdr.count < args->geo->blksize / 8); @@ -2190,7 +2267,7 @@ xfs_attr3_leaf_getvalue( int valuelen; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count < args->geo->blksize / 8); ASSERT(args->index < ichdr.count); @@ -2391,8 +2468,9 @@ xfs_attr_leaf_lasthash( { struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; + struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr); entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) *count = ichdr.count; @@ -2486,7 +2564,7 @@ xfs_attr3_leaf_clearflag( ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); @@ -2550,7 +2628,7 @@ xfs_attr3_leaf_setflag( leaf = bp->b_addr; #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); #endif @@ -2629,11 +2707,11 @@ xfs_attr3_leaf_flipflags( entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1); ASSERT(args->index < ichdr1.count); ASSERT(args->index >= 0); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2); ASSERT(args->index2 < ichdr2.count); ASSERT(args->index2 >= 0); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index e2929da7c3ba..025c4b820c03 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -100,9 +100,11 @@ int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, +void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); -void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, +void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 61ec015dca16..aeffeaaac0ec 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -244,30 +244,6 @@ xfs_bmap_forkoff_reset( } } -/* - * Debug/sanity checking code - */ - -STATIC int -xfs_bmap_sanity_check( - struct xfs_mount *mp, - struct xfs_buf *bp, - int level) -{ - struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - - if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && - block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) - return 0; - - if (be16_to_cpu(block->bb_level) != level || - be16_to_cpu(block->bb_numrecs) == 0 || - be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return 0; - - return 1; -} - #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -410,9 +386,6 @@ xfs_bmap_check_leaf_extents( goto error_norelse; } block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); if (level == 0) break; @@ -424,7 +397,8 @@ xfs_bmap_check_leaf_extents( xfs_check_block(block, mp, 0, 0); pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -1029,7 +1003,7 @@ xfs_bmap_add_attrfork_btree( if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; /* must be at least one entry */ - XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); if ((error = xfs_btree_new_iroot(cur, flags, &stat))) goto error0; if (stat == 0) { @@ -1311,14 +1285,12 @@ xfs_bmap_read_extents( if (error) return error; block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); if (level == 0) break; pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); xfs_trans_brelse(tp, bp); } /* @@ -1345,9 +1317,6 @@ xfs_bmap_read_extents( XFS_ERRLEVEL_LOW, ip->i_mount, block); goto error0; } - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, 0), - error0); /* * Read-ahead the next leaf block, if any. */ @@ -1755,7 +1724,9 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + struct xfs_mount *mp; + mp = bma->tp ? bma->tp->t_mountp : NULL; ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); ASSERT(bma->idx >= 0); @@ -1866,15 +1837,15 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1907,7 +1878,7 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1938,7 +1909,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + @@ -1968,12 +1939,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2001,7 +1972,7 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -2038,12 +2009,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2084,7 +2055,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -2122,12 +2093,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2191,12 +2162,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2212,9 +2183,8 @@ xfs_bmap_add_extent_delay_real( diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); if (diff > 0) { - error = xfs_icsb_modify_counters(bma->ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0); + error = xfs_mod_fdblocks(bma->ip->i_mount, + -((int64_t)diff), false); ASSERT(!error); if (error) goto done; @@ -2265,9 +2235,8 @@ xfs_bmap_add_extent_delay_real( temp += bma->cur->bc_private.b.allocated; ASSERT(temp <= da_old); if (temp < da_old) - xfs_icsb_modify_counters(bma->ip->i_mount, - XFS_SBS_FDBLOCKS, - (int64_t)(da_old - temp), 0); + xfs_mod_fdblocks(bma->ip->i_mount, + (int64_t)(da_old - temp), false); } /* clear out the allocated field, done with it now in any case. */ @@ -2309,6 +2278,7 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ + struct xfs_mount *mp = tp->t_mountp; *logflagsp = 0; @@ -2421,19 +2391,19 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + @@ -2464,13 +2434,13 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount, @@ -2499,13 +2469,13 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, @@ -2532,7 +2502,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount, newext))) @@ -2569,7 +2539,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -2611,7 +2581,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -2621,7 +2591,7 @@ xfs_bmap_add_extent_unwritten_real( cur->bc_rec.b = *new; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2651,7 +2621,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -2689,7 +2659,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -2699,11 +2669,11 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2737,7 +2707,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* new right extent - oldext */ if ((error = xfs_bmbt_update(cur, r[1].br_startoff, r[1].br_startblock, r[1].br_blockcount, @@ -2749,7 +2719,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startoff - PREV.br_startoff; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* * Reset the cursor to the position of the new extent * we are about to insert as we can't trust it after @@ -2759,12 +2729,12 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); /* new middle extent - newext */ cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2944,8 +2914,8 @@ xfs_bmap_add_extent_hole_delay( } if (oldlen != newlen) { ASSERT(oldlen > newlen); - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(oldlen - newlen), 0); + xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen), + false); /* * Nothing to do for disk quota accounting here. */ @@ -2968,7 +2938,9 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ + struct xfs_mount *mp; + mp = bma->tp ? bma->tp->t_mountp : NULL; ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); @@ -3056,15 +3028,15 @@ xfs_bmap_add_extent_hole_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -3097,7 +3069,7 @@ xfs_bmap_add_extent_hole_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -3131,7 +3103,7 @@ xfs_bmap_add_extent_hole_real( right.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -3161,12 +3133,12 @@ xfs_bmap_add_extent_hole_real( new->br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = new->br_state; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; } @@ -4160,18 +4132,15 @@ xfs_bmapi_reserve_delalloc( ASSERT(indlen > 0); if (rt) { - error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); + error = xfs_mod_frextents(mp, -((int64_t)extsz)); } else { - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); + error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); } if (error) goto out_unreserve_quota; - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); + error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false); if (error) goto out_unreserve_blocks; @@ -4198,9 +4167,9 @@ xfs_bmapi_reserve_delalloc( out_unreserve_blocks: if (rt) - xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + xfs_mod_frextents(mp, extsz); else - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); + xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? @@ -4801,7 +4770,7 @@ xfs_bmap_del_extent( got.br_startblock, got.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } da_old = da_new = 0; } else { @@ -4835,7 +4804,7 @@ xfs_bmap_del_extent( } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); break; case 2: @@ -4935,7 +4904,8 @@ xfs_bmap_del_extent( got.br_startblock, temp, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, + i == 1, done); /* * Update the btree record back * to the original value. @@ -4956,7 +4926,7 @@ xfs_bmap_del_extent( error = -ENOSPC; goto done; } - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } else flags |= xfs_ilog_fext(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, @@ -5012,10 +4982,8 @@ xfs_bmap_del_extent( * Nothing to do for disk quota accounting here. */ ASSERT(da_old >= da_new); - if (da_old > da_new) { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); - } + if (da_old > da_new) + xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); done: *logflagsp = flags; return error; @@ -5284,14 +5252,13 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)rtexts, 0); + xfs_mod_frextents(mp, (int64_t)rtexts); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)del.br_blockcount, 0); + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, + false); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -5453,6 +5420,7 @@ xfs_bmse_merge( struct xfs_bmbt_irec left; xfs_filblks_t blockcount; int error, i; + struct xfs_mount *mp = ip->i_mount; xfs_bmbt_get_all(gotp, &got); xfs_bmbt_get_all(leftp, &left); @@ -5487,19 +5455,19 @@ xfs_bmse_merge( got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); error = xfs_btree_delete(cur, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, left.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); left.br_blockcount = blockcount; @@ -5518,50 +5486,92 @@ xfs_bmse_shift_one( int *current_ext, struct xfs_bmbt_rec_host *gotp, struct xfs_btree_cur *cur, - int *logflags) + int *logflags, + enum shift_direction direction) { struct xfs_ifork *ifp; + struct xfs_mount *mp; xfs_fileoff_t startoff; - struct xfs_bmbt_rec_host *leftp; + struct xfs_bmbt_rec_host *adj_irecp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; + struct xfs_bmbt_irec adj_irec; int error; int i; + int total_extents; + mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(gotp, &got); - startoff = got.br_startoff - offset_shift_fsb; /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); - /* - * Check for merge if we've got an extent to the left, otherwise make - * sure there's enough room at the start of the file for the shift. - */ - if (*current_ext) { - /* grab the left extent and check for a large enough hole */ - leftp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(leftp, &left); + if (direction == SHIFT_LEFT) { + startoff = got.br_startoff - offset_shift_fsb; + + /* + * Check for merge if we've got an extent to the left, + * otherwise make sure there's enough room at the start + * of the file for the shift. + */ + if (!*current_ext) { + if (got.br_startoff < offset_shift_fsb) + return -EINVAL; + goto update_current_ext; + } + /* + * grab the left extent and check for a large + * enough hole. + */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); - if (startoff < left.br_startoff + left.br_blockcount) + if (startoff < + adj_irec.br_startoff + adj_irec.br_blockcount) return -EINVAL; /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { + if (xfs_bmse_can_merge(&adj_irec, &got, + offset_shift_fsb)) { return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, leftp, cur, - logflags); + *current_ext, gotp, adj_irecp, + cur, logflags); } - } else if (got.br_startoff < offset_shift_fsb) - return -EINVAL; - + } else { + startoff = got.br_startoff + offset_shift_fsb; + /* nothing to move if this is the last extent */ + if (*current_ext >= (total_extents - 1)) + goto update_current_ext; + /* + * If this is not the last extent in the file, make sure there + * is enough room between current extent and next extent for + * accommodating the shift. + */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); + if (startoff + got.br_blockcount > adj_irec.br_startoff) + return -EINVAL; + /* + * Unlike a left shift (which involves a hole punch), + * a right shift does not modify extent neighbors + * in any way. We should never find mergeable extents + * in this scenario. Check anyways and warn if we + * encounter two extents that could be one. + */ + if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb)) + WARN_ON_ONCE(1); + } /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. */ - (*current_ext)++; +update_current_ext: + if (direction == SHIFT_LEFT) + (*current_ext)++; + else + (*current_ext)--; xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; if (!cur) { @@ -5573,18 +5583,18 @@ xfs_bmse_shift_one( got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); got.br_startoff = startoff; return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); + got.br_blockcount, got.br_state); } /* - * Shift extent records to the left to cover a hole. + * Shift extent records to the left/right to cover/create a hole. * * The maximum number of extents to be shifted in a single operation is - * @num_exts. @start_fsb specifies the file offset to start the shift and the + * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb * is the length by which each extent is shifted. If there is no hole to shift * the extents into, this will be considered invalid operation and we abort @@ -5594,12 +5604,13 @@ int xfs_bmap_shift_extents( struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t start_fsb, + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, int *done, - xfs_fileoff_t *next_fsb, + xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, + enum shift_direction direction, int num_exts) { struct xfs_btree_cur *cur = NULL; @@ -5609,10 +5620,11 @@ xfs_bmap_shift_extents( struct xfs_ifork *ifp; xfs_extnum_t nexts = 0; xfs_extnum_t current_ext; + xfs_extnum_t total_extents; + xfs_extnum_t stop_extent; int error = 0; int whichfork = XFS_DATA_FORK; int logflags = 0; - int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5628,6 +5640,8 @@ xfs_bmap_shift_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -5645,43 +5659,83 @@ xfs_bmap_shift_extents( } /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot use the count of real extents here. + * Instead we have to calculate it from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (total_extents == 0) { + *done = 1; + goto del_cursor; + } + + /* + * In case of first right shift, we need to initialize next_fsb + */ + if (*next_fsb == NULLFSBLOCK) { + gotp = xfs_iext_get_ext(ifp, total_extents - 1); + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + if (stop_fsb > *next_fsb) { + *done = 1; + goto del_cursor; + } + } + + /* Lookup the extent index at which we have to stop */ + if (direction == SHIFT_RIGHT) { + gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + /* Make stop_extent exclusive of shift range */ + stop_extent--; + } else + stop_extent = total_extents; + + /* * Look up the extent index for the fsb where we start shifting. We can * henceforth iterate with current_ext as extent list changes are locked * out via ilock. * * gotp can be null in 2 cases: 1) if there are no extents or 2) - * start_fsb lies in a hole beyond which there are no extents. Either + * *next_fsb lies in a hole beyond which there are no extents. Either * way, we are done. */ - gotp = xfs_iext_bno_to_ext(ifp, start_fsb, ¤t_ext); + gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); if (!gotp) { *done = 1; goto del_cursor; } - /* - * There may be delalloc extents in the data fork before the range we - * are collapsing out, so we cannot use the count of real extents here. - * Instead we have to calculate it from the incore fork. - */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - while (nexts++ < num_exts && current_ext < total_extents) { + /* some sanity checking before we finally start shifting extents */ + if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || + (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { + error = -EIO; + goto del_cursor; + } + + while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - ¤t_ext, gotp, cur, &logflags); + ¤t_ext, gotp, cur, &logflags, + direction); if (error) goto del_cursor; + /* + * If there was an extent merge during the shift, the extent + * count can change. Update the total and grade the next record. + */ + if (direction == SHIFT_LEFT) { + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + stop_extent = total_extents; + } - /* update total extent count and grab the next record */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - if (current_ext >= total_extents) + if (current_ext == stop_extent) { + *done = 1; + *next_fsb = NULLFSBLOCK; break; + } gotp = xfs_iext_get_ext(ifp, current_ext); } - /* Check if we are done */ - if (current_ext == total_extents) { - *done = 1; - } else if (next_fsb) { + if (!*done) { xfs_bmbt_get_all(gotp, &got); *next_fsb = got.br_startoff; } @@ -5696,3 +5750,189 @@ del_cursor: return error; } + +/* + * Splits an extent into two extents at split_fsb block such that it is + * the first block of the current_ext. @current_ext is a target extent + * to be split. @split_fsb is a block where the extents is split. + * If split_fsb lies in a hole or the first block of extents, just return 0. + */ +STATIC int +xfs_bmap_split_extent_at( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t split_fsb, + xfs_fsblock_t *firstfsb, + struct xfs_bmap_free *free_list) +{ + int whichfork = XFS_DATA_FORK; + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec new; /* split extent */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_fsblock_t gotblkcnt; /* new block count for got */ + xfs_extnum_t current_ext; + int error = 0; + int logflags = 0; + int i = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_split_extent_at", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * gotp can be null in 2 cases: 1) if there are no extents + * or 2) split_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); + if (!gotp) + return 0; + + xfs_bmbt_get_all(gotp, &got); + + /* + * Check split_fsb lies in a hole or the start boundary offset + * of the extent. + */ + if (got.br_startoff >= split_fsb) + return 0; + + gotblkcnt = split_fsb - got.br_startoff; + new.br_startoff = split_fsb; + new.br_startblock = got.br_startblock + gotblkcnt; + new.br_blockcount = got.br_blockcount - gotblkcnt; + new.br_state = got.br_state; + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstfsb; + cur->bc_private.b.flist = free_list; + cur->bc_private.b.flags = 0; + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + xfs_bmbt_set_blockcount(gotp, gotblkcnt); + got.br_blockcount = gotblkcnt; + + logflags = XFS_ILOG_CORE; + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); + if (error) + goto del_cursor; + } else + logflags |= XFS_ILOG_DEXT; + + /* Add new extent */ + current_ext++; + xfs_iext_insert(ip, current_ext, 1, &new, 0); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); + cur->bc_rec.b.br_state = new.br_state; + + error = xfs_btree_insert(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + /* + * Convert to a btree if necessary. + */ + if (xfs_bmap_needs_btree(ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + } + +del_cursor: + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + return error; +} + +int +xfs_bmap_split_extent( + struct xfs_inode *ip, + xfs_fileoff_t split_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + int committed; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&free_list, &firstfsb); + + error = xfs_bmap_split_extent_at(tp, ip, split_fsb, + &firstfsb, &free_list); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b9d8a499d2c4..6aaa0c1c7200 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -166,6 +166,11 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) */ #define XFS_BMAP_MAX_SHIFT_EXTENTS 1 +enum shift_direction { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + #ifdef DEBUG void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int whichfork, unsigned long caller_ip); @@ -211,8 +216,10 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, - int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int num_exts); + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, + int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, enum shift_direction direction, + int num_exts); +int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 81cad433df85..c72283dd8d44 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -168,7 +168,7 @@ xfs_btree_check_lptr( xfs_fsblock_t bno, /* btree block disk address */ int level) /* btree block level */ { - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, level > 0 && bno != NULLFSBLOCK && XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); @@ -187,7 +187,7 @@ xfs_btree_check_sptr( { xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, level > 0 && bno != NULLAGBLOCK && bno != 0 && @@ -1825,7 +1825,7 @@ xfs_btree_lookup( error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; @@ -2285,7 +2285,7 @@ xfs_btree_rshift( if (error) goto error0; i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) @@ -3138,7 +3138,7 @@ xfs_btree_insert( goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); level++; /* @@ -3582,15 +3582,15 @@ xfs_btree_delrec( * Actually any entry but the first would suffice. */ i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); /* Grab a pointer to the block. */ right = xfs_btree_get_block(tcur, level, &rbp); @@ -3634,12 +3634,12 @@ xfs_btree_delrec( rrecs = xfs_btree_get_numrecs(right); if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); } } @@ -3653,13 +3653,13 @@ xfs_btree_delrec( * previous block. */ i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); /* Grab a pointer to the block. */ left = xfs_btree_get_block(tcur, level, &lbp); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 9cb0115c6bd1..2385f8cd08ab 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -538,12 +538,12 @@ xfs_da3_root_split( oldroot = blk1->bp->b_addr; if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { - struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da3_icnode_hdr icnodehdr; - dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); + dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot); btree = dp->d_ops->node_tree_p(oldroot); - size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); - level = nodehdr.level; + size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); + level = icnodehdr.level; /* * we are about to copy oldroot to bp, so set up the type diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 0a49b0286372..74bcbabfa523 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -725,7 +725,13 @@ struct xfs_attr3_icleaf_hdr { __uint16_t magic; __uint16_t count; __uint16_t usedbytes; - __uint16_t firstused; + /* + * firstused is 32-bit here instead of 16-bit like the on-disk variant + * to support maximum fsb size of 64k without overflow issues throughout + * the attr code. Instead, the overflow condition is handled on + * conversion to/from disk. + */ + __uint32_t firstused; __u8 holes; struct { __uint16_t base; @@ -734,6 +740,12 @@ struct xfs_attr3_icleaf_hdr { }; /* + * Special value to represent fs block size in the leaf header firstused field. + * Only used when block size overflows the 2-bytes available on disk. + */ +#define XFS_ATTR3_LEAF_NULLOFF 0 + +/* * Flags used in the leaf_entry[i].flags field. * NOTE: the INCOMPLETE bit must not collide with the flags bits specified * on the system call, they are "or"ed together for various operations. diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 5ff31be9b1cd..de1ea16f5748 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -89,7 +89,7 @@ __xfs_dir3_data_check( * so just ensure that the count falls somewhere inside the * block right now. */ - XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < + XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) < ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): @@ -107,21 +107,21 @@ __xfs_dir3_data_check( bf = ops->data_bestfree_p(hdr); count = lastfree = freeseen = 0; if (!bf[0].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset); freeseen |= 1 << 2; } - XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. @@ -134,18 +134,18 @@ __xfs_dir3_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - XFS_WANT_CORRUPTED_RETURN(lastfree == 0); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)hdr); dfp = xfs_dir2_data_freefind(hdr, bf, dup); if (dfp) { i = (int)(dfp - bf); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(dup->length) <= be16_to_cpu(bf[2].length)); } @@ -160,13 +160,13 @@ __xfs_dir3_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*ops->data_entry_tag_p(dep)) == (char *)dep - (char *)hdr); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); count++; lastfree = 0; @@ -183,14 +183,15 @@ __xfs_dir3_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(mp, + i < be32_to_cpu(btp->count)); } p += ops->data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ - XFS_WANT_CORRUPTED_RETURN(freeseen == 7); + XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { @@ -198,13 +199,13 @@ __xfs_dir3_data_check( cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); } - XFS_WANT_CORRUPTED_RETURN(count == + XFS_WANT_CORRUPTED_RETURN(mp, count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale)); } return 0; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 8eb718979383..4daaa662337b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -264,68 +264,6 @@ typedef struct xfs_dsb { /* must be padded to 64 bit alignment */ } xfs_dsb_t; -/* - * Sequence number values for the fields. - */ -typedef enum { - XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, - XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, - XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, - XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, - XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, - XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, - XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, - XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, - XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, - XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, - XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, - XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, - XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, - XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, - XFS_SBS_PQUOTINO, XFS_SBS_LSN, - XFS_SBS_FIELDCOUNT -} xfs_sb_field_t; - -/* - * Mask values, defined based on the xfs_sb_field_t values. - * Only define the ones we're using. - */ -#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) -#define XFS_SB_UUID XFS_SB_MVAL(UUID) -#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) -#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) -#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) -#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) -#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) -#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) -#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) -#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) -#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) -#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) -#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) -#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) -#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) -#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) -#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \ - XFS_SB_MVAL(BAD_FEATURES2)) -#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) -#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) -#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) -#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) -#define XFS_SB_CRC XFS_SB_MVAL(CRC) -#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) -#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) -#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) -#define XFS_SB_MOD_BITS \ - (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ - XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ - XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ - XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \ - XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \ - XFS_SB_PQUOTINO) - /* * Misc. Flags - warning - these will be cleared by xfs_repair unless diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 116ef1ddb3e3..07349a183a11 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -376,7 +376,8 @@ xfs_ialloc_ag_alloc( */ newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && - args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) + percpu_counter_read(&args.mp->m_icount) + newlen > + args.mp->m_maxicount) return -ENOSPC; args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* @@ -700,7 +701,7 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); } return 0; @@ -724,7 +725,7 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); } return 0; @@ -783,12 +784,12 @@ xfs_dialloc_ag_inobt( error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(j == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0); if (rec.ir_freecount > 0) { /* @@ -944,19 +945,19 @@ newino: error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); for (;;) { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if (rec.ir_freecount > 0) break; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); } alloc_inode: @@ -1016,7 +1017,7 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1); /* * See if we've landed in the parent inode record. The finobt @@ -1039,10 +1040,10 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(rcur, &rrec, &j); if (error) goto error_rcur; - XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur); } - XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur); if (i == 1 && j == 1) { /* * Both the left and right records are valid. Choose the closer @@ -1095,7 +1096,7 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); return 0; } } @@ -1106,12 +1107,12 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); return 0; } @@ -1133,19 +1134,19 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; - XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) && (rec.ir_freecount == frec->ir_freecount)); return xfs_inobt_update(cur, &rec); @@ -1340,7 +1341,8 @@ xfs_dialloc( * inode. */ if (mp->m_maxicount && - mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { + percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos > + mp->m_maxicount) { noroom = 1; okalloc = 0; } @@ -1475,14 +1477,14 @@ xfs_difree_inobt( __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); error = xfs_inobt_get_rec(cur, &rec, &i); if (error) { xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Get the offset in the inode chunk. */ @@ -1592,7 +1594,7 @@ xfs_difree_finobt( * freed an inode in a previously fully allocated chunk. If not, * something is out of sync. */ - XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, ibtrec->ir_free, &i); @@ -1613,12 +1615,12 @@ xfs_difree_finobt( error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); rec.ir_free |= XFS_INOBT_MASK(offset); rec.ir_freecount++; - XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && + XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) && (rec.ir_freecount == ibtrec->ir_freecount), error); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index b0a5fe95a3e2..dc4bfc5d88fc 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -111,14 +111,6 @@ xfs_mount_validate_sb( bool check_inprogress, bool check_version) { - - /* - * If the log device and data device have the - * same device number, the log is internal. - * Consequently, the sb_logstart should be non-zero. If - * we have a zero sb_logstart in this case, we may be trying to mount - * a volume filesystem in a non-volume manner. - */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; @@ -743,17 +735,15 @@ xfs_initialize_perag_data( btree += pag->pagf_btreeblks; xfs_perag_put(pag); } - /* - * Overwrite incore superblock counters with just-read data - */ + + /* Overwrite incore superblock counters with just-read data */ spin_lock(&mp->m_sb_lock); sbp->sb_ifree = ifree; sbp->sb_icount = ialloc; sbp->sb_fdblocks = bfree + bfreelst + btree; spin_unlock(&mp->m_sb_lock); - /* Fixup the per-cpu counters as well. */ - xfs_icsb_reinit_counters(mp); + xfs_reinit_percpu_counters(mp); return 0; } @@ -771,6 +761,10 @@ xfs_log_sb( struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1d8eef9cf0f5..a56960dd1684 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1232,6 +1232,117 @@ xfs_vm_releasepage( return try_to_free_buffers(page); } +/* + * When we map a DIO buffer, we may need to attach an ioend that describes the + * type of write IO we are doing. This passes to the completion function the + * operations it needs to perform. If the mapping is for an overwrite wholly + * within the EOF then we don't need an ioend and so we don't allocate one. + * This avoids the unnecessary overhead of allocating and freeing ioends for + * workloads that don't require transactions on IO completion. + * + * If we get multiple mappings in a single IO, we might be mapping different + * types. But because the direct IO can only have a single private pointer, we + * need to ensure that: + * + * a) i) the ioend spans the entire region of unwritten mappings; or + * ii) the ioend spans all the mappings that cross or are beyond EOF; and + * b) if it contains unwritten extents, it is *permanently* marked as such + * + * We could do this by chaining ioends like buffered IO does, but we only + * actually get one IO completion callback from the direct IO, and that spans + * the entire IO regardless of how many mappings and IOs are needed to complete + * the DIO. There is only going to be one reference to the ioend and its life + * cycle is constrained by the DIO completion code. hence we don't need + * reference counting here. + */ +static void +xfs_map_direct( + struct inode *inode, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + struct xfs_ioend *ioend; + xfs_off_t size = bh_result->b_size; + int type; + + if (ISUNWRITTEN(imap)) + type = XFS_IO_UNWRITTEN; + else + type = XFS_IO_OVERWRITE; + + trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); + + if (bh_result->b_private) { + ioend = bh_result->b_private; + ASSERT(ioend->io_size > 0); + ASSERT(offset >= ioend->io_offset); + if (offset + size > ioend->io_offset + ioend->io_size) + ioend->io_size = offset - ioend->io_offset + size; + + if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) + ioend->io_type = XFS_IO_UNWRITTEN; + + trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, + ioend->io_size, ioend->io_type, + imap); + } else if (type == XFS_IO_UNWRITTEN || + offset + size > i_size_read(inode)) { + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_size = size; + + bh_result->b_private = ioend; + set_buffer_defer_completion(bh_result); + + trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, + imap); + } else { + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, + imap); + } +} + +/* + * If this is O_DIRECT or the mpage code calling tell them how large the mapping + * is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the mapping + * for blocks beyond EOF must be marked new so that sub block regions can be + * correctly zeroed. We can't do this for mappings within EOF unless the mapping + * was just allocated or is unwritten, otherwise the callers would overwrite + * existing data with zeros. Hence we have to split the mapping into a range up + * to and including EOF, and a second mapping for beyond EOF. + */ +static void +xfs_map_trim_size( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset, + ssize_t size) +{ + xfs_off_t mapping_size; + + mapping_size = imap->br_startoff + imap->br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; +} + STATIC int __xfs_get_blocks( struct inode *inode, @@ -1320,31 +1431,37 @@ __xfs_get_blocks( xfs_iunlock(ip, lockmode); } - - trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + trace_xfs_get_blocks_alloc(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_DELALLOC, &imap); } else if (nimaps) { - trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + trace_xfs_get_blocks_found(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_OVERWRITE, &imap); xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } + /* trim mapping down to size requested */ + if (direct || size > (1 << inode->i_blkbits)) + xfs_map_trim_size(inode, iblock, bh_result, + &imap, offset, size); + + /* + * For unwritten extents do not report a disk address in the buffered + * read case (treat as if we're reading into a hole). + */ if (imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK) { - /* - * For unwritten extents do not report a disk address on - * the read case (treat as if we're reading into a hole). - */ - if (create || !ISUNWRITTEN(&imap)) - xfs_map_buffer(inode, bh_result, &imap, offset); - if (create && ISUNWRITTEN(&imap)) { - if (direct) { - bh_result->b_private = inode; - set_buffer_defer_completion(bh_result); - } + imap.br_startblock != DELAYSTARTBLOCK && + (create || !ISUNWRITTEN(&imap))) { + xfs_map_buffer(inode, bh_result, &imap, offset); + if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); - } + /* direct IO needs special help */ + if (create && direct) + xfs_map_direct(inode, bh_result, &imap, offset); } /* @@ -1377,39 +1494,6 @@ __xfs_get_blocks( } } - /* - * If this is O_DIRECT or the mpage code calling tell them how large - * the mapping is, so that we can avoid repeated get_blocks calls. - * - * If the mapping spans EOF, then we have to break the mapping up as the - * mapping for blocks beyond EOF must be marked new so that sub block - * regions can be correctly zeroed. We can't do this for mappings within - * EOF unless the mapping was just allocated or is unwritten, otherwise - * the callers would overwrite existing data with zeros. Hence we have - * to split the mapping into a range up to and including EOF, and a - * second mapping for beyond EOF. - */ - if (direct || size > (1 << inode->i_blkbits)) { - xfs_off_t mapping_size; - - mapping_size = imap.br_startoff + imap.br_blockcount - iblock; - mapping_size <<= inode->i_blkbits; - - ASSERT(mapping_size > 0); - if (mapping_size > size) - mapping_size = size; - if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { - /* limit mapping to block that spans EOF */ - mapping_size = roundup_64(i_size_read(inode) - offset, - 1 << inode->i_blkbits); - } - if (mapping_size > LONG_MAX) - mapping_size = LONG_MAX; - - bh_result->b_size = mapping_size; - } - return 0; out_unlock: @@ -1440,9 +1524,11 @@ xfs_get_blocks_direct( /* * Complete a direct I/O write request. * - * If the private argument is non-NULL __xfs_get_blocks signals us that we - * need to issue a transaction to convert the range from unwritten to written - * extents. + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). */ STATIC void xfs_end_io_direct_write( @@ -1454,43 +1540,71 @@ xfs_end_io_direct_write( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + struct xfs_ioend *ioend = private; - if (XFS_FORCED_SHUTDOWN(mp)) + trace_xfs_gbmap_direct_endio(ip, offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); return; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_end_io; /* - * While the generic direct I/O code updates the inode size, it does - * so only after the end_io handler is called, which means our - * end_io handler thinks the on-disk size is outside the in-core - * size. To prevent this just update it a little bit earlier here. + * dio completion end_io functions are only called on writes if more + * than 0 bytes was written. */ + ASSERT(size > 0); + + /* + * The ioend only maps whole blocks, while the IO may be sector aligned. + * Hence the ioend offset/size may not match the IO offset/size exactly. + * Because we don't map overwrites within EOF into the ioend, the offset + * may not match, but only if the endio spans EOF. Either way, write + * the IO sizes into the ioend so that completion processing does the + * right thing. + */ + ASSERT(offset + size <= ioend->io_offset + ioend->io_size); + ioend->io_size = size; + ioend->io_offset = offset; + + /* + * The ioend tells us whether we are doing unwritten extent conversion + * or an append transaction that updates the on-disk file size. These + * cases are the only cases where we should *potentially* be needing + * to update the VFS inode size. + * + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + */ + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); + spin_unlock(&ip->i_flags_lock); /* - * For direct I/O we do not know if we need to allocate blocks or not, - * so we can't preallocate an append transaction, as that results in - * nested reservations and log space deadlocks. Hence allocate the - * transaction here. While this is sub-optimal and can block IO - * completion for some time, we're stuck with doing it this way until - * we can pass the ioend to the direct IO allocation callbacks and - * avoid nesting that way. + * If we are doing an append IO that needs to update the EOF on disk, + * do the transaction reserve now so we can use common end io + * processing. Stashing the error (if there is one) in the ioend will + * result in the ioend processing passing on the error if it is + * possible as we can't return it from here. */ - if (private && size > 0) { - xfs_iomap_write_unwritten(ip, offset, size); - } else if (offset + size > ip->i_d.di_size) { - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return; - } + if (ioend->io_type == XFS_IO_OVERWRITE) + ioend->io_error = xfs_setfilesize_trans_alloc(ioend); - xfs_setfilesize(ip, tp, offset, size); - } +out_end_io: + xfs_end_io(&ioend->io_work); + return; } STATIC ssize_t diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 83af4c149635..f9c1c64782d3 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -132,9 +132,10 @@ xfs_attr3_leaf_inactive( int size; int tmp; int i; + struct xfs_mount *mp = bp->b_target->bt_mount; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* * Count the number of "remote" value extents. diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a43d370d2c58..65fb37a18e92 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -225,6 +225,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) int error, i; struct xfs_buf *bp; struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; trace_xfs_attr_node_list(context); @@ -256,7 +257,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, + &leafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); if (cursor->hashval > be32_to_cpu( entries[leafhdr.count - 1].hashval)) { @@ -340,7 +342,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) xfs_trans_brelse(NULL, bp); return error; } - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; cursor->blkno = leafhdr.forw; @@ -368,11 +370,12 @@ xfs_attr3_leaf_list_int( struct xfs_attr_leaf_entry *entry; int retval; int i; + struct xfs_mount *mp = context->dp->i_mount; trace_xfs_attr_list_leaf(context); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); cursor = context->cursor; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 22a5dcb70b32..a52bbd3abc7d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1376,22 +1376,19 @@ out: } /* - * xfs_collapse_file_space() - * This routine frees disk space and shift extent for the given file. - * The first thing we do is to free data blocks in the specified range - * by calling xfs_free_file_space(). It would also sync dirty data - * and invalidate page cache over the region on which collapse range - * is working. And Shift extent records to the left to cover a hole. - * RETURNS: - * 0 on success - * errno on error - * + * @next_fsb will keep track of the extent currently undergoing shift. + * @stop_fsb will keep track of the extent at which we have to stop. + * If we are shifting left, we will start with block (offset + len) and + * shift each extent till last extent. + * If we are shifting right, we will start with last extent inside file space + * and continue until we reach the block corresponding to offset. */ -int -xfs_collapse_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len) +static int +xfs_shift_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + enum shift_direction direction) { int done = 0; struct xfs_mount *mp = ip->i_mount; @@ -1400,21 +1397,26 @@ xfs_collapse_file_space( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; int committed; - xfs_fileoff_t start_fsb; + xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - trace_xfs_collapse_file_space(ip); + if (direction == SHIFT_LEFT) { + next_fsb = XFS_B_TO_FSB(mp, offset + len); + stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + } else { + /* + * If right shift, delegate the work of initialization of + * next_fsb to xfs_bmap_shift_extent as it has ilock held. + */ + next_fsb = NULLFSBLOCK; + stop_fsb = XFS_B_TO_FSB(mp, offset); + } - next_fsb = XFS_B_TO_FSB(mp, offset + len); shift_fsb = XFS_B_TO_FSB(mp, len); - error = xfs_free_file_space(ip, offset, len); - if (error) - return error; - /* * Trim eofblocks to avoid shifting uninitialized post-eof preallocation * into the accessible region of the file. @@ -1427,20 +1429,28 @@ xfs_collapse_file_space( /* * Writeback and invalidate cache for the remainder of the file as we're - * about to shift down every extent from the collapse range to EOF. The - * free of the collapse range above might have already done some of - * this, but we shouldn't rely on it to do anything outside of the range - * that was freed. + * about to shift down every extent from offset to EOF. */ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - offset + len, -1); + offset, -1); if (error) return error; error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - (offset + len) >> PAGE_CACHE_SHIFT, -1); + offset >> PAGE_CACHE_SHIFT, -1); if (error) return error; + /* + * The extent shiting code works on extent granularity. So, if + * stop_fsb is not the starting block of extent, we need to split + * the extent at stop_fsb. + */ + if (direction == SHIFT_RIGHT) { + error = xfs_bmap_split_extent(ip, stop_fsb); + if (error) + return error; + } + while (!error && !done) { tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); /* @@ -1464,7 +1474,7 @@ xfs_collapse_file_space( if (error) goto out; - xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_bmap_init(&free_list, &first_block); @@ -1472,10 +1482,9 @@ xfs_collapse_file_space( * We are using the write transaction in which max 2 bmbt * updates are allowed */ - start_fsb = next_fsb; - error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb, - &done, &next_fsb, &first_block, &free_list, - XFS_BMAP_MAX_SHIFT_EXTENTS); + error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb, + &done, stop_fsb, &first_block, &free_list, + direction, XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) goto out; @@ -1484,18 +1493,70 @@ xfs_collapse_file_space( goto out; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } return error; out: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_collapse_file_space(ip); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT); +} + +/* + * xfs_insert_file_space() + * This routine create hole space by shifting extents for the given file. + * The first thing we do is to sync dirty data and invalidate page cache + * over the region on which insert range is working. And split an extent + * to two extents at given offset by calling xfs_bmap_split_extent. + * And shift all extent records which are laying between [offset, + * last allocated extent] to the right to reserve hole range. + * RETURNS: + * 0 on success + * errno on error + */ +int +xfs_insert_file_space( + struct xfs_inode *ip, + loff_t offset, + loff_t len) +{ + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_insert_file_space(ip); + + return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT); +} + +/* * We need to check that the format of the data fork in the temporary inode is * valid for the target inode before doing the swap. This is not a problem with * attr1 because of the fixed fork offset, but attr2 has a dynamically sized @@ -1599,13 +1660,6 @@ xfs_swap_extent_flush( /* Verify O_DIRECT for ftmp */ if (VFS_I(ip)->i_mapping->nrpages) return -EINVAL; - - /* - * Don't try to swap extents on mmap()d files because we can't lock - * out races against page faults safely. - */ - if (mapping_mapped(VFS_I(ip)->i_mapping)) - return -EBUSY; return 0; } @@ -1633,13 +1687,14 @@ xfs_swap_extents( } /* - * Lock up the inodes against other IO and truncate to begin with. - * Then we can ensure the inodes are flushed and have no page cache - * safely. Once we have done this we can take the ilocks and do the rest - * of the checks. + * Lock the inodes against other IO, page faults and truncate to + * begin with. Then we can ensure the inodes are flushed and have no + * page cache safely. Once we have done this we can take the ilocks and + * do the rest of the checks. */ - lock_flags = XFS_IOLOCK_EXCL; + lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { @@ -1666,8 +1721,16 @@ xfs_swap_extents( xfs_trans_cancel(tp, 0); goto out_unlock; } + + /* + * Lock and join the inodes to the tansaction so that transaction commit + * or cancel will unlock the inodes from this point onwards. + */ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); lock_flags |= XFS_ILOCK_EXCL; + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, tip, lock_flags); + /* Verify all data are being swapped */ if (sxp->sx_offset != 0 || @@ -1720,9 +1783,6 @@ xfs_swap_extents( goto out_trans_cancel; } - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ijoin(tp, tip, lock_flags); - /* * Before we've swapped the forks, lets set the owners of the forks * appropriately. We have to do this as we are demand paging the btree @@ -1856,5 +1916,5 @@ out_unlock: out_trans_cancel: xfs_trans_cancel(tp, 0); - goto out_unlock; + goto out; } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 736429a72a12..af97d9a1dfb4 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -63,6 +63,8 @@ int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, xfs_off_t len); +int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 507d96a57ac7..092d652bc03d 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -537,9 +537,9 @@ xfs_buf_item_push( /* has a previous flush failed due to IO errors? */ if ((bp->b_flags & XBF_WRITE_FAIL) && - ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { xfs_warn(bp->b_target->bt_mount, -"Detected failing async write on buffer block 0x%llx. Retrying async write.", +"Failing async write on buffer block 0x%llx. Retrying async write.", (long long)bp->b_bn); } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 799e5a2d334d..e85a9519a5ae 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -84,7 +84,7 @@ xfs_trim_extents( error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) goto out_del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor); ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 3ee186ac1093..338e50bbfd1e 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -131,7 +131,7 @@ xfs_error_report( { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, - "Internal error %s at line %d of file %s. Caller %pF", + "Internal error %s at line %d of file %s. Caller %pS", tag, linenum, filename, ra); xfs_stack_trace(); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 279a76e52791..c0394ed126fc 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -40,25 +40,25 @@ extern void xfs_verifier_error(struct xfs_buf *bp); /* * Macros to set EFSCORRUPTED & return/branch. */ -#define XFS_WANT_CORRUPTED_GOTO(x,l) \ +#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \ { \ int fs_is_ok = (x); \ ASSERT(fs_is_ok); \ if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ - XFS_ERRLEVEL_LOW, NULL); \ + XFS_ERRLEVEL_LOW, mp); \ error = -EFSCORRUPTED; \ goto l; \ } \ } -#define XFS_WANT_CORRUPTED_RETURN(x) \ +#define XFS_WANT_CORRUPTED_RETURN(mp, x) \ { \ int fs_is_ok = (x); \ ASSERT(fs_is_ok); \ if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ - XFS_ERRLEVEL_LOW, NULL); \ + XFS_ERRLEVEL_LOW, mp); \ return -EFSCORRUPTED; \ } \ } diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index b97359ba2648..652cd3c5b58c 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -215,7 +215,7 @@ xfs_fs_get_parent( int error; struct xfs_inode *cip; - error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); + error = xfs_lookup(XFS_I(d_inode(child)), &xfs_name_dotdot, &cip, NULL); if (unlikely(error)) return ERR_PTR(error); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1f12ad0a8585..8121e75352ee 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -559,7 +559,7 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock); + error = xfs_break_layouts(inode, iolock, true); if (error) return error; @@ -569,21 +569,42 @@ restart: * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. + * + * We need to serialise against EOF updates that occur in IO + * completions here. We want to make sure that nobody is changing the + * size while we do this check until we have placed an IO barrier (i.e. + * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. + * The spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value + * and hence be able to correctly determine if we need to run zeroing. */ + spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { bool zero = false; + spin_unlock(&ip->i_flags_lock); if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); iov_iter_reexpand(from, count); + + /* + * We now have an IO submission barrier in place, but + * AIO can do EOF updates during IO completion and hence + * we now need to wait for all of them to drain. Non-AIO + * DIO will have drained before we are given the + * XFS_IOLOCK_EXCL, and so for most cases this wait is a + * no-op. + */ + inode_dio_wait(inode); goto restart; } error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); if (error) return error; - } + } else + spin_unlock(&ip->i_flags_lock); /* * Updating the timestamps will grab the ilock again from @@ -645,6 +666,8 @@ xfs_file_dio_aio_write( int iolock; size_t count = iov_iter_count(from); loff_t pos = iocb->ki_pos; + loff_t end; + struct iov_iter data; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -685,10 +708,11 @@ xfs_file_dio_aio_write( goto out; count = iov_iter_count(from); pos = iocb->ki_pos; + end = pos + count - 1; if (mapping->nrpages) { ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, pos + count - 1); + pos, end); if (ret) goto out; /* @@ -698,7 +722,7 @@ xfs_file_dio_aio_write( */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + count - 1) >> PAGE_CACHE_SHIFT); + end >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } @@ -715,8 +739,22 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, from, pos); + data = *from; + ret = mapping->a_ops->direct_IO(iocb, &data, pos); + + /* see generic_file_direct_write() for why this is necessary */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); + } + + if (ret > 0) { + pos += ret; + iov_iter_advance(from, ret); + iocb->ki_pos = pos; + } out: xfs_rw_iunlock(ip, iolock); @@ -822,6 +860,11 @@ xfs_file_write_iter( return ret; } +#define XFS_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE) + STATIC long xfs_file_fallocate( struct file *file, @@ -835,18 +878,21 @@ xfs_file_fallocate( enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; + bool do_file_insert = 0; if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + if (mode & ~XFS_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, false); if (error) goto out_unlock; + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -873,6 +919,27 @@ xfs_file_fallocate( error = xfs_collapse_file_space(ip, offset, len); if (error) goto out_unlock; + } else if (mode & FALLOC_FL_INSERT_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + new_size = i_size_read(inode) + len; + if (offset & blksize_mask || len & blksize_mask) { + error = -EINVAL; + goto out_unlock; + } + + /* check the new inode size does not wrap through zero */ + if (new_size > inode->i_sb->s_maxbytes) { + error = -EFBIG; + goto out_unlock; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + do_file_insert = 1; } else { flags |= XFS_PREALLOC_SET; @@ -907,8 +974,19 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; error = xfs_setattr_size(ip, &iattr); + if (error) + goto out_unlock; } + /* + * Perform hole insertion now that the file size has been + * updated so that if we crash during the operation we don't + * leave shifted extents past EOF and hence losing access to + * the data that is contained within them. + */ + if (do_file_insert) + error = xfs_insert_file_space(ip, offset, len); + out_unlock: xfs_iunlock(ip, iolock); return error; @@ -997,20 +1075,6 @@ xfs_file_mmap( } /* - * mmap()d file has taken write protection fault and is being made - * writable. We can set the page state up correctly for a writable - * page, which means we can do correct delalloc accounting (ENOSPC - * checking!) and unwritten extent mapping. - */ -STATIC int -xfs_vm_page_mkwrite( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - return block_page_mkwrite(vma, vmf, xfs_get_blocks); -} - -/* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). */ @@ -1385,6 +1449,55 @@ xfs_file_llseek( } } +/* + * Locking for serialisation of IO during page faults. This results in a lock + * ordering of: + * + * mmap_sem (MM) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ +STATIC int +xfs_filemap_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_fault(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = filemap_fault(vma, vmf); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. + */ +STATIC int +xfs_filemap_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_page_mkwrite(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, @@ -1415,7 +1528,7 @@ const struct file_operations xfs_dir_file_operations = { }; static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = filemap_fault, + .fault = xfs_filemap_fault, .map_pages = filemap_map_pages, - .page_mkwrite = xfs_vm_page_mkwrite, + .page_mkwrite = xfs_filemap_page_mkwrite, }; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a2e86e8a0fea..da82f1cb4b9b 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -294,7 +294,7 @@ xfs_filestream_get_parent( if (!parent) goto out_dput; - dir = igrab(parent->d_inode); + dir = igrab(d_inode(parent)); dput(parent); out_dput: @@ -322,7 +322,7 @@ xfs_filestream_lookup_ag( pip = xfs_filestream_get_parent(ip); if (!pip) - goto out; + return NULLAGNUMBER; mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); if (mru) { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 74efe5b760dc..cb7e8a29dfb6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -637,12 +637,13 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + cnt->allocino = percpu_counter_read_positive(&mp->m_icount); + cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); + cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); + spin_lock(&mp->m_sb_lock); - cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; - cnt->freeino = mp->m_sb.sb_ifree; - cnt->allocino = mp->m_sb.sb_icount; spin_unlock(&mp->m_sb_lock); return 0; } @@ -692,14 +693,9 @@ xfs_reserve_blocks( * what to do. This means that the amount of free space can * change while we do this, so we need to retry if we end up * trying to reserve more space than is available. - * - * We also use the xfs_mod_incore_sb() interface so that we - * don't have to care about whether per cpu counter are - * enabled, disabled or even compiled in.... */ retry: spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, 0); /* * If our previous reservation was larger than the current value, @@ -716,7 +712,8 @@ retry: } else { __int64_t free; - free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + free = percpu_counter_sum(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); if (!free) goto out; /* ENOSPC and fdblks_delta = 0 */ @@ -755,8 +752,7 @@ out: * the extra reserve blocks from the reserve..... */ int error; - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - fdblks_delta, 0); + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); if (error == -ENOSPC) goto retry; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9771b7ef62ed..76a9f2783282 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -439,11 +439,11 @@ again: *ipp = ip; /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) + * If we have a real type for an on-disk inode, we can setup the inode * now. If it's a new inode being created, xfs_ialloc will handle it. */ if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) - xfs_setup_inode(ip); + xfs_setup_existing_inode(ip); return 0; out_error_or_again: diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6163767aa856..d6ebc85192b7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. + * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and + * the i_lock. This routine allows various combinations of the locks to be + * obtained. * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. + * The 3 locks should always be ordered so that the IO lock is obtained first, + * the mmap lock second and the ilock last in order to prevent deadlock. * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + * Basic locking order: + * + * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * + * mmap_sem locking order: + * + * i_iolock -> page lock -> mmap_sem + * mmap_sem -> i_mmap_lock -> page_lock + * + * The difference in mmap_sem locking order mean that we cannot hold the + * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can + * fault in pages during copy in/out (for buffered IO) or require the mmap_sem + * in get_user_pages() to map the user pages into the kernel address space for + * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * page faults already hold the mmap_sem. + * + * Hence to serialise fully against both syscall and mmap based IO, we need to + * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * taken in places where we need to invalidate the page cache in a race + * free manner (e.g. truncate, hole punch and other extent manipulation + * functions). */ void xfs_ilock( @@ -150,6 +160,8 @@ xfs_ilock( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -159,6 +171,11 @@ xfs_ilock( else if (lock_flags & XFS_IOLOCK_SHARED) mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); else if (lock_flags & XFS_ILOCK_SHARED) @@ -191,6 +208,8 @@ xfs_ilock_nowait( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -202,21 +221,35 @@ xfs_ilock_nowait( if (!mrtryaccess(&ip->i_iolock)) goto out; } + + if (lock_flags & XFS_MMAPLOCK_EXCL) { + if (!mrtryupdate(&ip->i_mmaplock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + if (!mrtryaccess(&ip->i_mmaplock)) + goto out_undo_iolock; + } + if (lock_flags & XFS_ILOCK_EXCL) { if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } else if (lock_flags & XFS_ILOCK_SHARED) { if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } return 1; - out_undo_iolock: +out_undo_mmaplock: + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); +out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) mrunlock_excl(&ip->i_iolock); else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); - out: +out: return 0; } @@ -244,6 +277,8 @@ xfs_iunlock( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -254,6 +289,11 @@ xfs_iunlock( else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); + if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); else if (lock_flags & XFS_ILOCK_SHARED) @@ -271,11 +311,14 @@ xfs_ilock_demote( xfs_inode_t *ip, uint lock_flags) { - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & + ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); @@ -294,6 +337,12 @@ xfs_isilocked( return rwsem_is_locked(&ip->i_lock.mr_lock); } + if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { + if (!(lock_flags & XFS_MMAPLOCK_SHARED)) + return !!ip->i_mmaplock.mr_writer; + return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + } + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) return !!ip->i_iolock.mr_writer; @@ -314,14 +363,27 @@ int xfs_lock_delays; #endif /* - * Bump the subclass so xfs_lock_inodes() acquires each lock with - * a different value + * Bump the subclass so xfs_lock_inodes() acquires each lock with a different + * value. This shouldn't be called for page fault locking, but we also need to + * ensure we don't overrun the number of lockdep subclasses for the iolock or + * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. */ static inline int xfs_lock_inumorder(int lock_mode, int subclass) { - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + } + + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << + XFS_MMAPLOCK_SHIFT; + } + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; @@ -329,15 +391,14 @@ xfs_lock_inumorder(int lock_mode, int subclass) } /* - * The following routine will lock n inodes in exclusive mode. - * We assume the caller calls us with the inodes in i_ino order. + * The following routine will lock n inodes in exclusive mode. We assume the + * caller calls us with the inodes in i_ino order. * - * We need to detect deadlock where an inode that we lock - * is in the AIL and we start waiting for another inode that is locked - * by a thread in a long running transaction (such as truncate). This can - * result in deadlock since the long running trans might need to wait - * for the inode we just locked in order to push the tail and free space - * in the log. + * We need to detect deadlock where an inode that we lock is in the AIL and we + * start waiting for another inode that is locked by a thread in a long running + * transaction (such as truncate). This can result in deadlock since the long + * running trans might need to wait for the inode we just locked in order to + * push the tail and free space in the log. */ void xfs_lock_inodes( @@ -348,30 +409,27 @@ xfs_lock_inodes( int attempts = 0, i, j, try_lock; xfs_log_item_t *lp; - ASSERT(ips && (inodes >= 2)); /* we need at least two */ + /* currently supports between 2 and 5 inodes */ + ASSERT(ips && inodes >= 2 && inodes <= 5); try_lock = 0; i = 0; - again: for (; i < inodes; i++) { ASSERT(ips[i]); - if (i && (ips[i] == ips[i-1])) /* Already locked */ + if (i && (ips[i] == ips[i - 1])) /* Already locked */ continue; /* - * If try_lock is not set yet, make sure all locked inodes - * are not in the AIL. - * If any are, set try_lock to be used later. + * If try_lock is not set yet, make sure all locked inodes are + * not in the AIL. If any are, set try_lock to be used later. */ - if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) try_lock++; - } } } @@ -381,51 +439,42 @@ again: * we can't get any, we must release all we have * and try again. */ + if (!try_lock) { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + continue; + } + + /* try_lock means we have an inode locked that is in the AIL. */ + ASSERT(i != 0); + if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) + continue; - if (try_lock) { - /* try_lock must be 0 if i is 0. */ + /* + * Unlock all previous guys and try again. xfs_iunlock will try + * to push the tail if the inode is in the AIL. + */ + attempts++; + for (j = i - 1; j >= 0; j--) { /* - * try_lock means we have an inode locked - * that is in the AIL. + * Check to see if we've already unlocked this one. Not + * the first one going back, and the inode ptr is the + * same. */ - ASSERT(i != 0); - if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { - attempts++; - - /* - * Unlock all previous guys and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - for(j = i - 1; j >= 0; j--) { - - /* - * Check to see if we've already - * unlocked this one. - * Not the first one going back, - * and the inode ptr is the same. - */ - if ((j != (i - 1)) && ips[j] == - ips[j+1]) - continue; - - xfs_iunlock(ips[j], lock_mode); - } + if (j != (i - 1) && ips[j] == ips[j + 1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ #ifdef DEBUG - xfs_lock_delays++; + xfs_lock_delays++; #endif - } - i = 0; - try_lock = 0; - goto again; - } - } else { - xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); } + i = 0; + try_lock = 0; + goto again; } #ifdef DEBUG @@ -440,10 +489,10 @@ again: } /* - * xfs_lock_two_inodes() can only be used to lock one type of lock - * at a time - the iolock or the ilock, but not both at once. If - * we lock both at once, lockdep will report false positives saying - * we have violated locking orders. + * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. */ void xfs_lock_two_inodes( @@ -455,8 +504,12 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) - ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -818,7 +871,7 @@ xfs_ialloc( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); - /* now that we have an i_mode we can setup inode ops and unlock */ + /* now that we have an i_mode we can setup the inode structure */ xfs_setup_inode(ip); *ipp = ip; @@ -1235,12 +1288,14 @@ xfs_create( xfs_trans_cancel(tp, cancel_flags); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. */ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1345,12 +1400,14 @@ xfs_create_tmpfile( xfs_trans_cancel(tp, cancel_flags); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. */ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -2611,19 +2668,22 @@ xfs_remove( /* * Enter all inodes for a rename transaction into a sorted array. */ +#define __XFS_SORT_INODES 5 STATIC void xfs_sort_for_rename( - xfs_inode_t *dp1, /* in: old (source) directory inode */ - xfs_inode_t *dp2, /* in: new (target) directory inode */ - xfs_inode_t *ip1, /* in: inode of old entry */ - xfs_inode_t *ip2, /* in: inode of new entry, if it - already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ - int *num_inodes) /* out: number of inodes in array */ + struct xfs_inode *dp1, /* in: old (source) directory inode */ + struct xfs_inode *dp2, /* in: new (target) directory inode */ + struct xfs_inode *ip1, /* in: inode of old entry */ + struct xfs_inode *ip2, /* in: inode of new entry */ + struct xfs_inode *wip, /* in: whiteout inode */ + struct xfs_inode **i_tab,/* out: sorted array of inodes */ + int *num_inodes) /* in/out: inodes in array */ { - xfs_inode_t *temp; int i, j; + ASSERT(*num_inodes == __XFS_SORT_INODES); + memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); + /* * i_tab contains a list of pointers to inodes. We initialize * the table here & we'll sort it. We will then use it to @@ -2631,25 +2691,24 @@ xfs_sort_for_rename( * * Note that the table may contain duplicates. e.g., dp1 == dp2. */ - i_tab[0] = dp1; - i_tab[1] = dp2; - i_tab[2] = ip1; - if (ip2) { - *num_inodes = 4; - i_tab[3] = ip2; - } else { - *num_inodes = 3; - i_tab[3] = NULL; - } + i = 0; + i_tab[i++] = dp1; + i_tab[i++] = dp2; + i_tab[i++] = ip1; + if (ip2) + i_tab[i++] = ip2; + if (wip) + i_tab[i++] = wip; + *num_inodes = i; /* * Sort the elements via bubble sort. (Remember, there are at - * most 4 elements to sort, so this is adequate.) + * most 5 elements to sort, so this is adequate.) */ for (i = 0; i < *num_inodes; i++) { for (j = 1; j < *num_inodes; j++) { if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - temp = i_tab[j]; + struct xfs_inode *temp = i_tab[j]; i_tab[j] = i_tab[j-1]; i_tab[j-1] = temp; } @@ -2657,6 +2716,31 @@ xfs_sort_for_rename( } } +static int +xfs_finish_rename( + struct xfs_trans *tp, + struct xfs_bmap_free *free_list) +{ + int committed = 0; + int error; + + /* + * If this is a synchronous mount, make sure that the rename transaction + * goes to disk before returning to the user. + */ + if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, free_list, &committed); + if (error) { + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +} + /* * xfs_cross_rename() * @@ -2685,14 +2769,14 @@ xfs_cross_rename( ip2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* Swap inode number for dirent in second parent */ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* * If we're renaming one or more directories across different parents, @@ -2707,16 +2791,16 @@ xfs_cross_rename( dp1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip2 ".." reference to dp1 */ if (!S_ISDIR(ip1->i_d.di_mode)) { error = xfs_droplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; } /* @@ -2734,16 +2818,16 @@ xfs_cross_rename( dp2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip1 ".." reference to dp2 */ if (!S_ISDIR(ip2->i_d.di_mode)) { error = xfs_droplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; } /* @@ -2771,66 +2855,108 @@ xfs_cross_rename( } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); -out: + return xfs_finish_rename(tp, free_list); + +out_trans_abort: + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); return error; } /* + * xfs_rename_alloc_whiteout() + * + * Return a referenced, unlinked, unlocked inode that that can be used as a + * whiteout in a rename transaction. We use a tmpfile inode here so that if we + * crash between allocating the inode and linking it into the rename transaction + * recovery will free the inode and we won't leak it. + */ +static int +xfs_rename_alloc_whiteout( + struct xfs_inode *dp, + struct xfs_inode **wip) +{ + struct xfs_inode *tmpfile; + int error; + + error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + if (error) + return error; + + /* Satisfy xfs_bumplink that this is a real tmpfile */ + xfs_finish_inode_setup(tmpfile); + VFS_I(tmpfile)->i_state |= I_LINKABLE; + + *wip = tmpfile; + return 0; +} + +/* * xfs_rename */ int xfs_rename( - xfs_inode_t *src_dp, - struct xfs_name *src_name, - xfs_inode_t *src_ip, - xfs_inode_t *target_dp, - struct xfs_name *target_name, - xfs_inode_t *target_ip, - unsigned int flags) + struct xfs_inode *src_dp, + struct xfs_name *src_name, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, + unsigned int flags) { - xfs_trans_t *tp = NULL; - xfs_mount_t *mp = src_dp->i_mount; - int new_parent; /* moving to a new dir */ - int src_is_directory; /* src_name is a directory */ - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - xfs_inode_t *inodes[4]; - int spaceres; - int num_inodes; + struct xfs_mount *mp = src_dp->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + struct xfs_inode *wip = NULL; /* whiteout inode */ + struct xfs_inode *inodes[__XFS_SORT_INODES]; + int num_inodes = __XFS_SORT_INODES; + bool new_parent = (src_dp != target_dp); + bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + int cancel_flags = 0; + int spaceres; + int error; trace_xfs_rename(src_dp, target_dp, src_name, target_name); - new_parent = (src_dp != target_dp); - src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + if ((flags & RENAME_EXCHANGE) && !target_ip) + return -EINVAL; - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + /* + * If we are doing a whiteout operation, allocate the whiteout inode + * we will be placing at the target and ensure the type is set + * appropriately. + */ + if (flags & RENAME_WHITEOUT) { + ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); + error = xfs_rename_alloc_whiteout(target_dp, &wip); + if (error) + return error; + + /* setup target dirent info as whiteout */ + src_name->type = XFS_DIR3_FT_CHRDEV; + } + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); - xfs_bmap_init(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); if (error == -ENOSPC) { spaceres = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); } - if (error) { - xfs_trans_cancel(tp, 0); - goto std_return; - } + if (error) + goto out_trans_cancel; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * Attach the dquots to the inodes */ error = xfs_qm_vop_rename_dqattach(inodes); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } + if (error) + goto out_trans_cancel; /* * Lock all the participating inodes. Depending upon whether @@ -2851,6 +2977,8 @@ xfs_rename( xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + if (wip) + xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames @@ -2860,24 +2988,16 @@ xfs_rename( if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { error = -EXDEV; - goto error_return; + goto out_trans_cancel; } - /* - * Handle RENAME_EXCHANGE flags - */ - if (flags & RENAME_EXCHANGE) { - if (target_ip == NULL) { - error = -EINVAL; - goto error_return; - } - error = xfs_cross_rename(tp, src_dp, src_name, src_ip, - target_dp, target_name, target_ip, - &free_list, &first_block, spaceres); - if (error) - goto abort_return; - goto finish_rename; - } + xfs_bmap_init(&free_list, &first_block); + + /* RENAME_EXCHANGE is unique from here on. */ + if (flags & RENAME_EXCHANGE) + return xfs_cross_rename(tp, src_dp, src_name, src_ip, + target_dp, target_name, target_ip, + &free_list, &first_block, spaceres); /* * Set up the target. @@ -2890,7 +3010,7 @@ xfs_rename( if (!spaceres) { error = xfs_dir_canenter(tp, target_dp, target_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * If target does not exist and the rename crosses @@ -2901,9 +3021,9 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error == -ENOSPC) - goto error_return; + goto out_bmap_cancel; if (error) - goto abort_return; + goto out_trans_abort; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2911,7 +3031,7 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) - goto abort_return; + goto out_trans_abort; } } else { /* target_ip != NULL */ /* @@ -2926,7 +3046,7 @@ xfs_rename( if (!(xfs_dir_isempty(target_ip)) || (target_ip->i_d.di_nlink > 2)) { error = -EEXIST; - goto error_return; + goto out_trans_cancel; } } @@ -2943,7 +3063,7 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto abort_return; + goto out_trans_abort; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2954,7 +3074,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_trans_abort; if (src_is_directory) { /* @@ -2962,7 +3082,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_trans_abort; } } /* target_ip != NULL */ @@ -2979,7 +3099,7 @@ xfs_rename( &first_block, &free_list, spaceres); ASSERT(error != -EEXIST); if (error) - goto abort_return; + goto out_trans_abort; } /* @@ -3005,49 +3125,67 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) - goto abort_return; + goto out_trans_abort; } - error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. + */ + if (wip) { + error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, &first_block, &free_list, spaceres); + } else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); if (error) - goto abort_return; - - xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); - if (new_parent) - xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + goto out_trans_abort; -finish_rename: /* - * If this is a synchronous mount, make sure that the - * rename transaction goes to disk before returning to - * the user. + * For whiteouts, we need to bump the link count on the whiteout inode. + * This means that failures all the way up to this point leave the inode + * on the unlinked list and so cleanup is a simple matter of dropping + * the remaining reference to it. If we fail here after bumping the link + * count, we're shutting down the filesystem so we'll never see the + * intermediate state on disk. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } + if (wip) { + ASSERT(wip->i_d.di_nlink == 0); + error = xfs_bumplink(tp, wip); + if (error) + goto out_trans_abort; + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_trans_abort; + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - goto std_return; + /* + * Now we have a real link, clear the "I'm a tmpfile" state + * flag from the inode so it doesn't accidentally get misused in + * future. + */ + VFS_I(wip)->i_state &= ~I_LINKABLE; } - /* - * trans_commit will unlock src_ip, target_ip & decrement - * the vnode references. - */ - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - abort_return: + error = xfs_finish_rename(tp, &free_list); + if (wip) + IRELE(wip); + return error; + +out_trans_abort: cancel_flags |= XFS_TRANS_ABORT; - error_return: +out_bmap_cancel: xfs_bmap_cancel(&free_list); +out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); - std_return: + if (wip) + IRELE(wip); return error; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a1cd55f3f351..8f22d20368d8 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,6 +56,7 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ + mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ @@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_SHARED (1<<1) #define XFS_ILOCK_EXCL (1<<2) #define XFS_ILOCK_SHARED (1<<3) +#define XFS_MMAPLOCK_EXCL (1<<4) +#define XFS_MMAPLOCK_SHARED (1<<5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ - | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ + | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED) #define XFS_LOCK_FLAGS \ { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ - { XFS_ILOCK_SHARED, "ILOCK_SHARED" } + { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ + { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \ + { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" } /* @@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_SHIFT 20 + #define XFS_ILOCK_SHIFT 24 #define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) -#define XFS_IOLOCK_DEP_MASK 0x00ff0000 +#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 #define XFS_ILOCK_DEP_MASK 0xff000000 -#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \ + XFS_MMAPLOCK_DEP_MASK | \ + XFS_ILOCK_DEP_MASK) -#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) -#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \ + >> XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \ + >> XFS_MMAPLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ + >> XFS_ILOCK_SHIFT) /* * For multiple groups support: if S_ISGID bit is set in the parent @@ -391,6 +406,28 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +/* from xfs_iops.c */ +/* + * When setting up a newly allocated inode, we need to call + * xfs_finish_inode_setup() once the inode is fully instantiated at + * the VFS level to prevent the rest of the world seeing the inode + * before we've completed instantiation. Otherwise we can do it + * the moment the inode lookup is complete. + */ +extern void xfs_setup_inode(struct xfs_inode *ip); +static inline void xfs_finish_inode_setup(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + unlock_new_inode(VFS_I(ip)); +} + +static inline void xfs_setup_existing_inode(struct xfs_inode *ip) +{ + xfs_setup_inode(ip); + xfs_finish_inode_setup(ip); +} + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ac4feae45eb3..87f67c6b654c 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -82,7 +82,7 @@ xfs_find_handle( error = user_lpath((const char __user *)hreq->path, &path); if (error) return error; - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); } ip = XFS_I(inode); @@ -210,7 +210,7 @@ xfs_open_by_handle( dentry = xfs_handlereq_to_dentry(parfilp, hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); - inode = dentry->d_inode; + inode = d_inode(dentry); /* Restrict xfs_open_by_handle to directories & regular files. */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { @@ -303,7 +303,7 @@ xfs_readlink_by_handle( goto out_dput; } - error = xfs_readlink(XFS_I(dentry->d_inode), link); + error = xfs_readlink(XFS_I(d_inode(dentry)), link); if (error) goto out_kfree; error = readlink_copy(hreq->ohandle, olen, link); @@ -376,7 +376,7 @@ xfs_fssetdm_by_handle( return PTR_ERR(dentry); } - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { error = -EPERM; goto out; } @@ -386,7 +386,7 @@ xfs_fssetdm_by_handle( goto out; } - error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: @@ -429,7 +429,7 @@ xfs_attrlist_by_handle( goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -559,7 +559,7 @@ xfs_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: ops[i].am_error = xfs_attrmulti_attr_get( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_attrvalue, &ops[i].am_length, ops[i].am_flags); break; @@ -568,7 +568,7 @@ xfs_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); mnt_drop_write_file(parfilp); @@ -578,7 +578,7 @@ xfs_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_flags); mnt_drop_write_file(parfilp); break; @@ -631,7 +631,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (ioflags & XFS_IO_INVIS) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -639,10 +639,13 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, false); if (error) goto out_unlock; + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + switch (bf->l_whence) { case 0: /*SEEK_SET*/ break; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index bfc7c7c8a0c8..b88bdc85dd3d 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -375,7 +375,7 @@ xfs_compat_attrlist_by_handle( goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -445,7 +445,7 @@ xfs_compat_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: ops[i].am_error = xfs_attrmulti_attr_get( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, compat_ptr(ops[i].am_attrvalue), &ops[i].am_length, ops[i].am_flags); break; @@ -454,7 +454,7 @@ xfs_compat_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); mnt_drop_write_file(parfilp); @@ -464,7 +464,7 @@ xfs_compat_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_flags); mnt_drop_write_file(parfilp); break; @@ -504,7 +504,7 @@ xfs_compat_fssetdm_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { error = -EPERM; goto out; } @@ -514,7 +514,7 @@ xfs_compat_fssetdm_by_handle( goto out; } - error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ccb1dd0d509e..38e633bad8c2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -460,8 +460,7 @@ xfs_iomap_prealloc_size( alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), alloc_blocks); - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - freesp = mp->m_sb.sb_fdblocks; + freesp = percpu_counter_read_positive(&mp->m_fdblocks); if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { shift = 2; if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e53a90331422..f4cd7204e236 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -187,6 +187,8 @@ xfs_generic_create( else d_instantiate(dentry, inode); + xfs_finish_inode_setup(ip); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -195,6 +197,7 @@ xfs_generic_create( return error; out_cleanup_inode: + xfs_finish_inode_setup(ip); if (!tmpfile) xfs_cleanup_inode(dir, inode, dentry); iput(inode); @@ -301,7 +304,7 @@ xfs_vn_link( struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct xfs_name name; int error; @@ -326,7 +329,7 @@ xfs_vn_unlink( xfs_dentry_to_name(&name, dentry, 0); - error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry))); if (error) return error; @@ -367,9 +370,11 @@ xfs_vn_symlink( goto out_cleanup_inode; d_instantiate(dentry, inode); + xfs_finish_inode_setup(cip); return 0; out_cleanup_inode: + xfs_finish_inode_setup(cip); xfs_cleanup_inode(dir, inode, dentry); iput(inode); out: @@ -384,22 +389,22 @@ xfs_vn_rename( struct dentry *ndentry, unsigned int flags) { - struct inode *new_inode = ndentry->d_inode; + struct inode *new_inode = d_inode(ndentry); int omode = 0; struct xfs_name oname; struct xfs_name nname; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; /* if we are exchanging files, we need to set i_mode of both files */ if (flags & RENAME_EXCHANGE) - omode = ndentry->d_inode->i_mode; + omode = d_inode(ndentry)->i_mode; xfs_dentry_to_name(&oname, odentry, omode); - xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); + xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode); - return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, new_inode ? XFS_I(new_inode) : NULL, flags); } @@ -421,7 +426,7 @@ xfs_vn_follow_link( if (!link) goto out_err; - error = xfs_readlink(XFS_I(dentry->d_inode), link); + error = xfs_readlink(XFS_I(d_inode(dentry)), link); if (unlikely(error)) goto out_kfree; @@ -441,7 +446,7 @@ xfs_vn_getattr( struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -766,6 +771,7 @@ xfs_setattr_size( return error; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); @@ -829,55 +835,27 @@ xfs_setattr_size( inode_dio_wait(inode); /* - * Do all the page cache truncate work outside the transaction context - * as the "lock" order is page lock->log space reservation. i.e. - * locking pages inside the transaction can ABBA deadlock with - * writeback. We have to do the VFS inode size update before we truncate - * the pagecache, however, to avoid racing with page faults beyond the - * new EOF they are not serialised against truncate operations except by - * page locks and size updates. + * We've already locked out new page faults, so now we can safely remove + * pages from the page cache knowing they won't get refaulted until we + * drop the XFS_MMAP_EXCL lock after the extent manipulations are + * complete. The truncate_setsize() call also cleans partial EOF page + * PTEs on extending truncates and hence ensures sub-page block size + * filesystems are correctly handled, too. * - * Hence we are in a situation where a truncate can fail with ENOMEM - * from xfs_trans_reserve(), but having already truncated the in-memory - * version of the file (i.e. made user visible changes). There's not - * much we can do about this, except to hope that the caller sees ENOMEM - * and retries the truncate operation. + * We have to do all the page cache truncate work outside the + * transaction context as the "lock" order is page lock->log space + * reservation as defined by extent allocation in the writeback path. + * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * having already truncated the in-memory version of the file (i.e. made + * user visible changes). There's not much we can do about this, except + * to hope that the caller sees ENOMEM and retries the truncate + * operation. */ error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); - /* - * The "we can't serialise against page faults" pain gets worse. - * - * If the file is mapped then we have to clean the page at the old EOF - * when extending the file. Extending the file can expose changes the - * underlying page mapping (e.g. from beyond EOF to a hole or - * unwritten), and so on the next attempt to write to that page we need - * to remap it for write. i.e. we need .page_mkwrite() to be called. - * Hence we need to clean the page to clean the pte and so a new write - * fault will be triggered appropriately. - * - * If we do it before we change the inode size, then we can race with a - * page fault that maps the page with exactly the same problem. If we do - * it after we change the file size, then a new page fault can come in - * and allocate space before we've run the rest of the truncate - * transaction. That's kinda grotesque, but it's better than have data - * over a hole, and so that's the lesser evil that has been chosen here. - * - * The real solution, however, is to have some mechanism for locking out - * page faults while a truncate is in progress. - */ - if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) { - error = filemap_write_and_wait_range( - VFS_I(ip)->i_mapping, - round_down(oldsize, PAGE_CACHE_SIZE), - round_up(oldsize, PAGE_CACHE_SIZE) - 1); - if (error) - return error; - } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) @@ -968,16 +946,20 @@ xfs_vn_setattr( struct dentry *dentry, struct iattr *iattr) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error; if (iattr->ia_valid & ATTR_SIZE) { uint iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); - error = xfs_break_layouts(dentry->d_inode, &iolock); - if (!error) + error = xfs_break_layouts(d_inode(dentry), &iolock, true); + if (!error) { + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_setattr_size(ip, iattr); + } xfs_iunlock(ip, iolock); } else { error = xfs_setattr_nonsize(ip, iattr, 0); @@ -1228,16 +1210,12 @@ xfs_diflags_to_iflags( } /* - * Initialize the Linux inode, set up the operation vectors and - * unlock the inode. + * Initialize the Linux inode and set up the operation vectors. * - * When reading existing inodes from disk this is called directly - * from xfs_iget, when creating a new inode it is called from - * xfs_ialloc after setting up the inode. - * - * We are always called with an uninitialised linux inode here. - * We need to initialise the necessary fields and take a reference - * on it. + * When reading existing inodes from disk this is called directly from xfs_iget, + * when creating a new inode it is called from xfs_ialloc after setting up the + * inode. These callers have different criteria for clearing XFS_INEW, so leave + * it up to the caller to deal with unlocking the inode appropriately. */ void xfs_setup_inode( @@ -1324,9 +1302,4 @@ xfs_setup_inode( inode_has_no_xattr(inode); cache_no_acl(inode); } - - xfs_iflags_clear(ip, XFS_INEW); - barrier(); - - unlock_new_inode(inode); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index ea7a98e9cb70..a0f84abb0d09 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setup_inode(struct xfs_inode *); - /* * Internal setattr interfaces. */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 82e314258f73..80429891dc9b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -229,7 +229,7 @@ xfs_bulkstat_grab_ichunk( error = xfs_inobt_get_rec(cur, irec, &stat); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(stat == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1); /* Check if the record contains the inode in request */ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index c31d2c2eadc4..7c7842c85a08 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -116,15 +116,6 @@ typedef __uint64_t __psunsigned_t; #undef XFS_NATIVE_HOST #endif -/* - * Feature macros (disable/enable) - */ -#ifdef CONFIG_SMP -#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#else -#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#endif - #define irix_sgid_inherit xfs_params.sgid_inherit.val #define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a5a945fc3bdc..4f5784f85a5b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4463,10 +4463,10 @@ xlog_do_recover( xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); + xfs_reinit_percpu_counters(log->l_mp); + xfs_buf_relse(bp); - /* We've re-read the superblock so re-initialize per-cpu counters */ - xfs_icsb_reinit_counters(log->l_mp); xlog_recover_check_summary(log); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4fa80e63eea2..2ce7ee3b4ec1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,18 +43,6 @@ #include "xfs_sysfs.h" -#ifdef HAVE_PERCPU_SB -STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); -#else - -#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) -#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) -#endif - static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; @@ -347,8 +335,7 @@ reread: goto reread; } - /* Initialize per-cpu counters */ - xfs_icsb_reinit_counters(mp); + xfs_reinit_percpu_counters(mp); /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; @@ -1087,8 +1074,6 @@ xfs_log_sbcount(xfs_mount_t *mp) if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) return 0; - xfs_icsb_sync_counters(mp, 0); - /* * we don't need to do this if we are updating the superblock * counters on every modification. @@ -1099,253 +1084,136 @@ xfs_log_sbcount(xfs_mount_t *mp) return xfs_sync_sb(mp, true); } -/* - * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply - * a delta to a specified field in the in-core superblock. Simply - * switch on the field indicated and apply the delta to that field. - * Fields are not allowed to dip below zero, so if the delta would - * do this do not apply it and return EINVAL. - * - * The m_sb_lock must be held when this routine is called. - */ -STATIC int -xfs_mod_incore_sb_unlocked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) +int +xfs_mod_icount( + struct xfs_mount *mp, + int64_t delta) { - int scounter; /* short counter for 32 bit fields */ - long long lcounter; /* long counter for 64 bit fields */ - long long res_used, rem; - - /* - * With the in-core superblock spin lock held, switch - * on the indicated field. Apply the delta to the - * proper field. If the fields value would dip below - * 0, then do not apply the delta and return EINVAL. - */ - switch (field) { - case XFS_SBS_ICOUNT: - lcounter = (long long)mp->m_sb.sb_icount; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_icount = lcounter; - return 0; - case XFS_SBS_IFREE: - lcounter = (long long)mp->m_sb.sb_ifree; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_ifree = lcounter; - return 0; - case XFS_SBS_FDBLOCKS: - lcounter = (long long) - mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); - - if (delta > 0) { /* Putting blocks back */ - if (res_used > delta) { - mp->m_resblks_avail += delta; - } else { - rem = delta - res_used; - mp->m_resblks_avail = mp->m_resblks; - lcounter += rem; - } - } else { /* Taking blocks away */ - lcounter += delta; - if (lcounter >= 0) { - mp->m_sb.sb_fdblocks = lcounter + - XFS_ALLOC_SET_ASIDE(mp); - return 0; - } - - /* - * We are out of blocks, use any available reserved - * blocks if were allowed to. - */ - if (!rsvd) - return -ENOSPC; - - lcounter = (long long)mp->m_resblks_avail + delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; - return 0; - } - printk_once(KERN_WARNING - "Filesystem \"%s\": reserve blocks depleted! " - "Consider increasing reserve pool size.", - mp->m_fsname); - return -ENOSPC; - } - - mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - return 0; - case XFS_SBS_FREXTENTS: - lcounter = (long long)mp->m_sb.sb_frextents; - lcounter += delta; - if (lcounter < 0) { - return -ENOSPC; - } - mp->m_sb.sb_frextents = lcounter; - return 0; - case XFS_SBS_DBLOCKS: - lcounter = (long long)mp->m_sb.sb_dblocks; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_dblocks = lcounter; - return 0; - case XFS_SBS_AGCOUNT: - scounter = mp->m_sb.sb_agcount; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_agcount = scounter; - return 0; - case XFS_SBS_IMAX_PCT: - scounter = mp->m_sb.sb_imax_pct; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_imax_pct = scounter; - return 0; - case XFS_SBS_REXTSIZE: - scounter = mp->m_sb.sb_rextsize; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextsize = scounter; - return 0; - case XFS_SBS_RBMBLOCKS: - scounter = mp->m_sb.sb_rbmblocks; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rbmblocks = scounter; - return 0; - case XFS_SBS_RBLOCKS: - lcounter = (long long)mp->m_sb.sb_rblocks; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rblocks = lcounter; - return 0; - case XFS_SBS_REXTENTS: - lcounter = (long long)mp->m_sb.sb_rextents; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextents = lcounter; - return 0; - case XFS_SBS_REXTSLOG: - scounter = mp->m_sb.sb_rextslog; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextslog = scounter; - return 0; - default: + /* deltas are +/-64, hence the large batch size of 128. */ + __percpu_counter_add(&mp->m_icount, delta, 128); + if (percpu_counter_compare(&mp->m_icount, 0) < 0) { ASSERT(0); + percpu_counter_add(&mp->m_icount, -delta); return -EINVAL; } + return 0; } -/* - * xfs_mod_incore_sb() is used to change a field in the in-core - * superblock structure by the specified delta. This modification - * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked() - * routine to do the work. - */ int -xfs_mod_incore_sb( +xfs_mod_ifree( struct xfs_mount *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) + int64_t delta) { - int status; - -#ifdef HAVE_PERCPU_SB - ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); -#endif - spin_lock(&mp->m_sb_lock); - status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); - - return status; + percpu_counter_add(&mp->m_ifree, delta); + if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { + ASSERT(0); + percpu_counter_add(&mp->m_ifree, -delta); + return -EINVAL; + } + return 0; } -/* - * Change more than one field in the in-core superblock structure at a time. - * - * The fields and changes to those fields are specified in the array of - * xfs_mod_sb structures passed in. Either all of the specified deltas - * will be applied or none of them will. If any modified field dips below 0, - * then all modifications will be backed out and EINVAL will be returned. - * - * Note that this function may not be used for the superblock values that - * are tracked with the in-memory per-cpu counters - a direct call to - * xfs_icsb_modify_counters is required for these. - */ int -xfs_mod_incore_sb_batch( +xfs_mod_fdblocks( struct xfs_mount *mp, - xfs_mod_sb_t *msb, - uint nmsb, - int rsvd) + int64_t delta, + bool rsvd) { - xfs_mod_sb_t *msbp; - int error = 0; + int64_t lcounter; + long long res_used; + s32 batch; + + if (delta > 0) { + /* + * If the reserve pool is depleted, put blocks back into it + * first. Most of the time the pool is full. + */ + if (likely(mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(&mp->m_fdblocks, delta); + return 0; + } + + spin_lock(&mp->m_sb_lock); + res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); + + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + delta -= res_used; + mp->m_resblks_avail = mp->m_resblks; + percpu_counter_add(&mp->m_fdblocks, delta); + } + spin_unlock(&mp->m_sb_lock); + return 0; + } /* - * Loop through the array of mod structures and apply each individually. - * If any fail, then back out all those which have already been applied. - * Do all of this within the scope of the m_sb_lock so that all of the - * changes will be atomic. + * Taking blocks away, need to be more accurate the closer we + * are to zero. + * + * batch size is set to a maximum of 1024 blocks - if we are + * allocating of freeing extents larger than this then we aren't + * going to be hammering the counter lock so a lock per update + * is not a problem. + * + * If the counter has a value of less than 2 * max batch size, + * then make everything serialise as we are real close to + * ENOSPC. + */ +#define __BATCH 1024 + if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0) + batch = 1; + else + batch = __BATCH; +#undef __BATCH + + __percpu_counter_add(&mp->m_fdblocks, delta, batch); + if (percpu_counter_compare(&mp->m_fdblocks, + XFS_ALLOC_SET_ASIDE(mp)) >= 0) { + /* we had space! */ + return 0; + } + + /* + * lock up the sb for dipping into reserves before releasing the space + * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - for (msbp = msb; msbp < (msb + nmsb); msbp++) { - ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || - msbp->msb_field > XFS_SBS_FDBLOCKS); + percpu_counter_add(&mp->m_fdblocks, -delta); + if (!rsvd) + goto fdblocks_enospc; - error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, - msbp->msb_delta, rsvd); - if (error) - goto unwind; + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + spin_unlock(&mp->m_sb_lock); + return 0; } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); +fdblocks_enospc: spin_unlock(&mp->m_sb_lock); - return 0; + return -ENOSPC; +} -unwind: - while (--msbp >= msb) { - error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, - -msbp->msb_delta, rsvd); - ASSERT(error == 0); - } +int +xfs_mod_frextents( + struct xfs_mount *mp, + int64_t delta) +{ + int64_t lcounter; + int ret = 0; + + spin_lock(&mp->m_sb_lock); + lcounter = mp->m_sb.sb_frextents + delta; + if (lcounter < 0) + ret = -ENOSPC; + else + mp->m_sb.sb_frextents = lcounter; spin_unlock(&mp->m_sb_lock); - return error; + return ret; } /* @@ -1407,573 +1275,3 @@ xfs_dev_is_read_only( } return 0; } - -#ifdef HAVE_PERCPU_SB -/* - * Per-cpu incore superblock counters - * - * Simple concept, difficult implementation - * - * Basically, replace the incore superblock counters with a distributed per cpu - * counter for contended fields (e.g. free block count). - * - * Difficulties arise in that the incore sb is used for ENOSPC checking, and - * hence needs to be accurately read when we are running low on space. Hence - * there is a method to enable and disable the per-cpu counters based on how - * much "stuff" is available in them. - * - * Basically, a counter is enabled if there is enough free resource to justify - * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local - * ENOSPC), then we disable the counters to synchronise all callers and - * re-distribute the available resources. - * - * If, once we redistributed the available resources, we still get a failure, - * we disable the per-cpu counter and go through the slow path. - * - * The slow path is the current xfs_mod_incore_sb() function. This means that - * when we disable a per-cpu counter, we need to drain its resources back to - * the global superblock. We do this after disabling the counter to prevent - * more threads from queueing up on the counter. - * - * Essentially, this means that we still need a lock in the fast path to enable - * synchronisation between the global counters and the per-cpu counters. This - * is not a problem because the lock will be local to a CPU almost all the time - * and have little contention except when we get to ENOSPC conditions. - * - * Basically, this lock becomes a barrier that enables us to lock out the fast - * path while we do things like enabling and disabling counters and - * synchronising the counters. - * - * Locking rules: - * - * 1. m_sb_lock before picking up per-cpu locks - * 2. per-cpu locks always picked up via for_each_online_cpu() order - * 3. accurate counter sync requires m_sb_lock + per cpu locks - * 4. modifying per-cpu counters requires holding per-cpu lock - * 5. modifying global counters requires holding m_sb_lock - * 6. enabling or disabling a counter requires holding the m_sb_lock - * and _none_ of the per-cpu locks. - * - * Disabled counters are only ever re-enabled by a balance operation - * that results in more free resources per CPU than a given threshold. - * To ensure counters don't remain disabled, they are rebalanced when - * the global resource goes above a higher threshold (i.e. some hysteresis - * is present to prevent thrashing). - */ - -#ifdef CONFIG_HOTPLUG_CPU -/* - * hot-plug CPU notifier support. - * - * We need a notifier per filesystem as we need to be able to identify - * the filesystem to balance the counters out. This is achieved by - * having a notifier block embedded in the xfs_mount_t and doing pointer - * magic to get the mount pointer from the notifier block address. - */ -STATIC int -xfs_icsb_cpu_notify( - struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - xfs_icsb_cnts_t *cntp; - xfs_mount_t *mp; - - mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier); - cntp = (xfs_icsb_cnts_t *) - per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - /* Easy Case - initialize the area and locks, and - * then rebalance when online does everything else for us. */ - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - xfs_icsb_lock(mp); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* Disable all the counters, then fold the dead cpu's - * count into the total on the global superblock and - * re-enable the counters. */ - xfs_icsb_lock(mp); - spin_lock(&mp->m_sb_lock); - xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); - xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); - xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS); - - mp->m_sb.sb_icount += cntp->icsb_icount; - mp->m_sb.sb_ifree += cntp->icsb_ifree; - mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks; - - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - - xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); - spin_unlock(&mp->m_sb_lock); - xfs_icsb_unlock(mp); - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ - -int -xfs_icsb_init_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t); - if (mp->m_sb_cnts == NULL) - return -ENOMEM; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - } - - mutex_init(&mp->m_icsb_mutex); - - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - -#ifdef CONFIG_HOTPLUG_CPU - mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; - mp->m_icsb_notifier.priority = 0; - register_hotcpu_notifier(&mp->m_icsb_notifier); -#endif /* CONFIG_HOTPLUG_CPU */ - - return 0; -} - -void -xfs_icsb_reinit_counters( - xfs_mount_t *mp) -{ - xfs_icsb_lock(mp); - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); -} - -void -xfs_icsb_destroy_counters( - xfs_mount_t *mp) -{ - if (mp->m_sb_cnts) { - unregister_hotcpu_notifier(&mp->m_icsb_notifier); - free_percpu(mp->m_sb_cnts); - } - mutex_destroy(&mp->m_icsb_mutex); -} - -STATIC void -xfs_icsb_lock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) { - ndelay(1000); - } -} - -STATIC void -xfs_icsb_unlock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags); -} - - -STATIC void -xfs_icsb_lock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_lock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_unlock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_unlock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_count( - xfs_mount_t *mp, - xfs_icsb_cnts_t *cnt, - int flags) -{ - xfs_icsb_cnts_t *cntp; - int i; - - memset(cnt, 0, sizeof(xfs_icsb_cnts_t)); - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_lock_all_counters(mp); - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - cnt->icsb_icount += cntp->icsb_icount; - cnt->icsb_ifree += cntp->icsb_ifree; - cnt->icsb_fdblocks += cntp->icsb_fdblocks; - } - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_unlock_all_counters(mp); -} - -STATIC int -xfs_icsb_counter_disabled( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - return test_bit(field, &mp->m_icsb_counters); -} - -STATIC void -xfs_icsb_disable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - xfs_icsb_cnts_t cnt; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - /* - * If we are already disabled, then there is nothing to do - * here. We check before locking all the counters to avoid - * the expensive lock operation when being called in the - * slow path and the counter is already disabled. This is - * safe because the only time we set or clear this state is under - * the m_icsb_mutex. - */ - if (xfs_icsb_counter_disabled(mp, field)) - return; - - xfs_icsb_lock_all_counters(mp); - if (!test_and_set_bit(field, &mp->m_icsb_counters)) { - /* drain back to superblock */ - - xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); - switch(field) { - case XFS_SBS_ICOUNT: - mp->m_sb.sb_icount = cnt.icsb_icount; - break; - case XFS_SBS_IFREE: - mp->m_sb.sb_ifree = cnt.icsb_ifree; - break; - case XFS_SBS_FDBLOCKS: - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; - break; - default: - BUG(); - } - } - - xfs_icsb_unlock_all_counters(mp); -} - -STATIC void -xfs_icsb_enable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field, - uint64_t count, - uint64_t resid) -{ - xfs_icsb_cnts_t *cntp; - int i; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - xfs_icsb_lock_all_counters(mp); - for_each_online_cpu(i) { - cntp = per_cpu_ptr(mp->m_sb_cnts, i); - switch (field) { - case XFS_SBS_ICOUNT: - cntp->icsb_icount = count + resid; - break; - case XFS_SBS_IFREE: - cntp->icsb_ifree = count + resid; - break; - case XFS_SBS_FDBLOCKS: - cntp->icsb_fdblocks = count + resid; - break; - default: - BUG(); - break; - } - resid = 0; - } - clear_bit(field, &mp->m_icsb_counters); - xfs_icsb_unlock_all_counters(mp); -} - -void -xfs_icsb_sync_counters_locked( - xfs_mount_t *mp, - int flags) -{ - xfs_icsb_cnts_t cnt; - - xfs_icsb_count(mp, &cnt, flags); - - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) - mp->m_sb.sb_icount = cnt.icsb_icount; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) - mp->m_sb.sb_ifree = cnt.icsb_ifree; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; -} - -/* - * Accurate update of per-cpu counters to incore superblock - */ -void -xfs_icsb_sync_counters( - xfs_mount_t *mp, - int flags) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, flags); - spin_unlock(&mp->m_sb_lock); -} - -/* - * Balance and enable/disable counters as necessary. - * - * Thresholds for re-enabling counters are somewhat magic. inode counts are - * chosen to be the same number as single on disk allocation chunk per CPU, and - * free blocks is something far enough zero that we aren't going thrash when we - * get near ENOSPC. We also need to supply a minimum we require per cpu to - * prevent looping endlessly when xfs_alloc_space asks for more than will - * be distributed to a single CPU but each CPU has enough blocks to be - * reenabled. - * - * Note that we can be called when counters are already disabled. - * xfs_icsb_disable_counter() optimises the counter locking in this case to - * prevent locking every per-cpu counter needlessly. - */ - -#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 -#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ - (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) -STATIC void -xfs_icsb_balance_counter_locked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int min_per_cpu) -{ - uint64_t count, resid; - int weight = num_online_cpus(); - uint64_t min = (uint64_t)min_per_cpu; - - /* disable counter and sync counter */ - xfs_icsb_disable_counter(mp, field); - - /* update counters - first CPU gets residual*/ - switch (field) { - case XFS_SBS_ICOUNT: - count = mp->m_sb.sb_icount; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_IFREE: - count = mp->m_sb.sb_ifree; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_FDBLOCKS: - count = mp->m_sb.sb_fdblocks; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) - return; - break; - default: - BUG(); - count = resid = 0; /* quiet, gcc */ - break; - } - - xfs_icsb_enable_counter(mp, field, count, resid); -} - -STATIC void -xfs_icsb_balance_counter( - xfs_mount_t *mp, - xfs_sb_field_t fields, - int min_per_cpu) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); - spin_unlock(&mp->m_sb_lock); -} - -int -xfs_icsb_modify_counters( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) -{ - xfs_icsb_cnts_t *icsbp; - long long lcounter; /* long counter for 64 bit fields */ - int ret = 0; - - might_sleep(); -again: - preempt_disable(); - icsbp = this_cpu_ptr(mp->m_sb_cnts); - - /* - * if the counter is disabled, go to slow path - */ - if (unlikely(xfs_icsb_counter_disabled(mp, field))) - goto slow_path; - xfs_icsb_lock_cntr(icsbp); - if (unlikely(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock_cntr(icsbp); - goto slow_path; - } - - switch (field) { - case XFS_SBS_ICOUNT: - lcounter = icsbp->icsb_icount; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_icount = lcounter; - break; - - case XFS_SBS_IFREE: - lcounter = icsbp->icsb_ifree; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_ifree = lcounter; - break; - - case XFS_SBS_FDBLOCKS: - BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0); - - lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - break; - default: - BUG(); - break; - } - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - return 0; - -slow_path: - preempt_enable(); - - /* - * serialise with a mutex so we don't burn lots of cpu on - * the superblock lock. We still need to hold the superblock - * lock, however, when we modify the global structures. - */ - xfs_icsb_lock(mp); - - /* - * Now running atomically. - * - * If the counter is enabled, someone has beaten us to rebalancing. - * Drop the lock and try again in the fast path.... - */ - if (!(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock(mp); - goto again; - } - - /* - * The counter is currently disabled. Because we are - * running atomically here, we know a rebalance cannot - * be in progress. Hence we can go straight to operating - * on the global superblock. We do not call xfs_mod_incore_sb() - * here even though we need to get the m_sb_lock. Doing so - * will cause us to re-enter this function and deadlock. - * Hence we get the m_sb_lock ourselves and then call - * xfs_mod_incore_sb_unlocked() as the unlocked path operates - * directly on the global counters. - */ - spin_lock(&mp->m_sb_lock); - ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); - - /* - * Now that we've modified the global superblock, we - * may be able to re-enable the distributed counters - * (e.g. lots of space just got freed). After that - * we are done. - */ - if (ret != -ENOSPC) - xfs_icsb_balance_counter(mp, field, 0); - xfs_icsb_unlock(mp); - return ret; - -balance_counter: - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - - /* - * We may have multiple threads here if multiple per-cpu - * counters run dry at the same time. This will mean we can - * do more balances than strictly necessary but it is not - * the common slowpath case. - */ - xfs_icsb_lock(mp); - - /* - * running atomically. - * - * This will leave the counter in the correct state for future - * accesses. After the rebalance, we simply try again and our retry - * will either succeed through the fast path or slow path without - * another balance operation being required. - */ - xfs_icsb_balance_counter(mp, field, delta); - xfs_icsb_unlock(mp); - goto again; -} - -#endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 0d8abd6364d9..8c995a2ccb6f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,8 +18,6 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ -#ifdef __KERNEL__ - struct xlog; struct xfs_inode; struct xfs_mru_cache; @@ -29,44 +27,6 @@ struct xfs_quotainfo; struct xfs_dir_ops; struct xfs_da_geometry; -#ifdef HAVE_PERCPU_SB - -/* - * Valid per-cpu incore superblock counters. Note that if you add new counters, - * you may need to define new counter disabled bit field descriptors as there - * are more possible fields in the superblock that can fit in a bitfield on a - * 32 bit platform. The XFS_SBS_* values for the current current counters just - * fit. - */ -typedef struct xfs_icsb_cnts { - uint64_t icsb_fdblocks; - uint64_t icsb_ifree; - uint64_t icsb_icount; - unsigned long icsb_flags; -} xfs_icsb_cnts_t; - -#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ - -#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ - -extern int xfs_icsb_init_counters(struct xfs_mount *); -extern void xfs_icsb_reinit_counters(struct xfs_mount *); -extern void xfs_icsb_destroy_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters(struct xfs_mount *, int); -extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); -extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, - int64_t, int); - -#else -#define xfs_icsb_init_counters(mp) (0) -#define xfs_icsb_destroy_counters(mp) do { } while (0) -#define xfs_icsb_reinit_counters(mp) do { } while (0) -#define xfs_icsb_sync_counters(mp, flags) do { } while (0) -#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) -#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ - xfs_mod_incore_sb(mp, field, delta, rsvd) -#endif - /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { XFS_LOWSP_1_PCNT = 0, @@ -81,8 +41,13 @@ typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ struct xfs_ail *m_ail; /* fs active log item list */ - xfs_sb_t m_sb; /* copy of fs superblock */ + + struct xfs_sb m_sb; /* copy of fs superblock */ spinlock_t m_sb_lock; /* sb counter lock */ + struct percpu_counter m_icount; /* allocated inodes counter */ + struct percpu_counter m_ifree; /* free inodes counter */ + struct percpu_counter m_fdblocks; /* free block counter */ + struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_fsname; /* filesystem name */ int m_fsname_len; /* strlen of fs name */ @@ -152,12 +117,6 @@ typedef struct xfs_mount { const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ uint m_chsize; /* size of next field */ atomic_t m_active_trans; /* number trans frozen */ -#ifdef HAVE_PERCPU_SB - xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ - unsigned long m_icsb_counters; /* disabled per-cpu counters */ - struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ - struct mutex m_icsb_mutex; /* balancer sync lock */ -#endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks @@ -301,35 +260,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* - * Per-cpu superblock locking functions - */ -#ifdef HAVE_PERCPU_SB -static inline void -xfs_icsb_lock(xfs_mount_t *mp) -{ - mutex_lock(&mp->m_icsb_mutex); -} - -static inline void -xfs_icsb_unlock(xfs_mount_t *mp) -{ - mutex_unlock(&mp->m_icsb_mutex); -} -#else -#define xfs_icsb_lock(mp) -#define xfs_icsb_unlock(mp) -#endif - -/* - * This structure is for use by the xfs_mod_incore_sb_batch() routine. - * xfs_growfs can specify a few fields which are more than int limit - */ -typedef struct xfs_mod_sb { - xfs_sb_field_t msb_field; /* Field to modify, see below */ - int64_t msb_delta; /* Change to make to specified field */ -} xfs_mod_sb_t; - -/* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. */ @@ -383,11 +313,14 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); - extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); -extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, - uint, int); + +extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, + bool reserved); +extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); + extern int xfs_mount_log_sb(xfs_mount_t *); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); @@ -399,6 +332,4 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern void xfs_set_low_space_thresholds(struct xfs_mount *); -#endif /* __KERNEL__ */ - #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 30ecca3037e3..f8a674d7f092 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -437,7 +437,7 @@ xfs_mru_cache_insert( if (!mru || !mru->lists) return -EINVAL; - if (radix_tree_preload(GFP_KERNEL)) + if (radix_tree_preload(GFP_NOFS)) return -ENOMEM; INIT_LIST_HEAD(&elem->list_node); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 365dd57ea760..981a657eca39 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -31,7 +31,8 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock) + uint *iolock, + bool with_imutex) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -40,8 +41,12 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); + if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) + mutex_unlock(&inode->i_mutex); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; + if (with_imutex) + mutex_lock(&inode->i_mutex); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index b7fbfce660f6..8147ac108820 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,9 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock); +int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); #else -static inline int xfs_break_layouts(struct inode *inode, uint *iolock) +static inline int +xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index fbbb9e62e274..5538468c7f63 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -719,6 +719,7 @@ xfs_qm_qino_alloc( xfs_trans_t *tp; int error; int committed; + bool need_alloc = true; *ip = NULL; /* @@ -747,6 +748,7 @@ xfs_qm_qino_alloc( return error; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; + need_alloc = false; } } @@ -758,7 +760,7 @@ xfs_qm_qino_alloc( return error; } - if (!*ip) { + if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); if (error) { @@ -794,11 +796,14 @@ xfs_qm_qino_alloc( spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); - return error; } - return 0; + if (need_alloc) + xfs_finish_inode_setup(*ip); + return error; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8fcc4ccc5c79..858e1e62bbaa 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -109,8 +109,6 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ -#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ @@ -361,28 +359,10 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_GQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { - xfs_warn(mp, - "delaylog is the default now, option is deprecated."); - } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { - xfs_warn(mp, - "nodelaylog support has been removed, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { mp->m_flags &= ~XFS_MOUNT_DISCARD; - } else if (!strcmp(this_char, "ihashsize")) { - xfs_warn(mp, - "ihashsize no longer used, option is deprecated."); - } else if (!strcmp(this_char, "osyncisdsync")) { - xfs_warn(mp, - "osyncisdsync has no effect, option is deprecated."); - } else if (!strcmp(this_char, "osyncisosync")) { - xfs_warn(mp, - "osyncisosync has no effect, option is deprecated."); - } else if (!strcmp(this_char, "irixsgid")) { - xfs_warn(mp, - "irixsgid is now a sysctl(2) variable, option is deprecated."); } else { xfs_warn(mp, "unknown mount option [%s].", this_char); return -EINVAL; @@ -986,6 +966,8 @@ xfs_fs_inode_init_once( atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); + mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); } @@ -1033,23 +1015,6 @@ xfs_free_fsname( kfree(mp->m_logname); } -STATIC void -xfs_fs_put_super( - struct super_block *sb) -{ - struct xfs_mount *mp = XFS_M(sb); - - xfs_filestream_unmount(mp); - xfs_unmountfs(mp); - - xfs_freesb(mp); - xfs_icsb_destroy_counters(mp); - xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); - xfs_free_fsname(mp); - kfree(mp); -} - STATIC int xfs_fs_sync_fs( struct super_block *sb, @@ -1083,8 +1048,11 @@ xfs_fs_statfs( { struct xfs_mount *mp = XFS_M(dentry->d_sb); xfs_sb_t *sbp = &mp->m_sb; - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); __uint64_t fakeinos, id; + __uint64_t icount; + __uint64_t ifree; + __uint64_t fdblocks; xfs_extlen_t lsize; __int64_t ffree; @@ -1095,17 +1063,21 @@ xfs_fs_statfs( statp->f_fsid.val[0] = (u32)id; statp->f_fsid.val[1] = (u32)(id >> 32); - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); spin_lock(&mp->m_sb_lock); statp->f_bsize = sbp->sb_blocksize; lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; statp->f_blocks = sbp->sb_dblocks - lsize; - statp->f_bfree = statp->f_bavail = - sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + spin_unlock(&mp->m_sb_lock); + + statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp); + statp->f_bavail = statp->f_bfree; + fakeinos = statp->f_bfree << sbp->sb_inopblog; - statp->f_files = - MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) statp->f_files = min_t(typeof(statp->f_files), statp->f_files, @@ -1117,10 +1089,9 @@ xfs_fs_statfs( sbp->sb_icount); /* make sure statp->f_ffree does not underflow */ - ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + ffree = statp->f_files - (icount - ifree); statp->f_ffree = max_t(__int64_t, ffree, 0); - spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == @@ -1256,6 +1227,12 @@ xfs_fs_remount( /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + xfs_warn(mp, + "ro->rw transition prohibited on norecovery mount"); + return -EINVAL; + } + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* @@ -1401,6 +1378,51 @@ xfs_finish_flags( return 0; } +static int +xfs_init_percpu_counters( + struct xfs_mount *mp) +{ + int error; + + error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); + if (error) + return -ENOMEM; + + error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL); + if (error) + goto free_icount; + + error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); + if (error) + goto free_ifree; + + return 0; + +free_ifree: + percpu_counter_destroy(&mp->m_ifree); +free_icount: + percpu_counter_destroy(&mp->m_icount); + return -ENOMEM; +} + +void +xfs_reinit_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); + percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); + percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); +} + +static void +xfs_destroy_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_destroy(&mp->m_icount); + percpu_counter_destroy(&mp->m_ifree); + percpu_counter_destroy(&mp->m_fdblocks); +} + STATIC int xfs_fs_fill_super( struct super_block *sb, @@ -1449,7 +1471,7 @@ xfs_fs_fill_super( if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = xfs_init_percpu_counters(mp); if (error) goto out_destroy_workqueues; @@ -1507,7 +1529,7 @@ xfs_fs_fill_super( out_free_sb: xfs_freesb(mp); out_destroy_counters: - xfs_icsb_destroy_counters(mp); + xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); out_close_devices: @@ -1524,6 +1546,24 @@ out_destroy_workqueues: goto out_free_sb; } +STATIC void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_notice(mp, "Unmounting Filesystem"); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_freesb(mp); + xfs_destroy_percpu_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_close_devices(mp); + xfs_free_fsname(mp); + kfree(mp); +} + STATIC struct dentry * xfs_fs_mount( struct file_system_type *fs_type, diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2b830c2f322e..499058fea303 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -72,6 +72,8 @@ extern const struct export_operations xfs_export_operations; extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; +extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); + #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 25791df6f638..3df411eadb86 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -177,7 +177,7 @@ xfs_symlink( int pathlen; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; - bool unlock_dp_on_error = false; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; xfs_fileoff_t first_fsb; @@ -221,7 +221,7 @@ xfs_symlink( XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) - goto std_return; + return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; @@ -241,7 +241,7 @@ xfs_symlink( } if (error) { cancel_flags = 0; - goto error_return; + goto out_trans_cancel; } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); @@ -252,7 +252,7 @@ xfs_symlink( */ if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { error = -EPERM; - goto error_return; + goto out_trans_cancel; } /* @@ -261,7 +261,7 @@ xfs_symlink( error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); if (error) - goto error_return; + goto out_trans_cancel; /* * Check for ability to enter directory entry, if no space reserved. @@ -269,7 +269,7 @@ xfs_symlink( if (!resblks) { error = xfs_dir_canenter(tp, dp, link_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * Initialize the bmap freelist prior to calling either @@ -282,15 +282,14 @@ xfs_symlink( */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, resblks > 0, &ip, NULL); - if (error) { - if (error == -ENOSPC) - goto error_return; - goto error1; - } + if (error) + goto out_trans_cancel; /* - * An error after we've joined dp to the transaction will result in the - * transaction cancel unlocking dp so don't do it explicitly in the + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); @@ -330,7 +329,7 @@ xfs_symlink( XFS_BMAPI_METADATA, &first_block, resblks, mval, &nmaps, &free_list); if (error) - goto error2; + goto out_bmap_cancel; if (resblks) resblks -= fs_blocks; @@ -348,7 +347,7 @@ xfs_symlink( BTOBB(byte_cnt), 0); if (!bp) { error = -ENOMEM; - goto error2; + goto out_bmap_cancel; } bp->b_ops = &xfs_symlink_buf_ops; @@ -378,7 +377,7 @@ xfs_symlink( error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, &first_block, &free_list, resblks); if (error) - goto error2; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -392,10 +391,13 @@ xfs_symlink( } error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error2; - } + if (error) + goto out_bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); @@ -403,20 +405,28 @@ xfs_symlink( *ipp = ip; return 0; - error2: - IRELE(ip); - error1: +out_bmap_cancel: xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; - error_return: +out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - std_return: return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 51372e34d988..615781bf4ee5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -115,7 +115,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __entry->refcount = refcount; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u refcount %d caller %pf", + TP_printk("dev %d:%d agno %u refcount %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, @@ -239,7 +239,7 @@ TRACE_EVENT(xfs_iext_insert, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -283,7 +283,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " - "lock %d flags %s caller %pf", + "lock %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -402,7 +402,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d flags %s caller %pf", + "lock %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -447,7 +447,7 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d error %d flags %s caller %pf", + "lock %d error %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -613,7 +613,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class, __entry->lock_flags = lock_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), @@ -664,6 +664,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); +DEFINE_INODE_EVENT(xfs_insert_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL DEFINE_INODE_EVENT(xfs_get_acl); @@ -685,6 +686,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); +DEFINE_INODE_EVENT(xfs_filemap_fault); +DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), @@ -702,7 +706,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, @@ -1217,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found); DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1333,7 +1342,7 @@ TRACE_EVENT(xfs_bunmap, __entry->flags = flags; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" - "flags %s caller %pf", + "flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1466,7 +1475,7 @@ TRACE_EVENT(xfs_agf, ), TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " "levels b %u c %u flfirst %u fllast %u flcount %u " - "freeblks %u longest %u caller %pf", + "freeblks %u longest %u caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index eb90cd59a0ec..220ef2c906b2 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -173,7 +173,7 @@ xfs_trans_reserve( uint rtextents) { int error = 0; - int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); @@ -184,8 +184,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, - -((int64_t)blocks), rsvd); + error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); return -ENOSPC; @@ -236,8 +235,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, - -((int64_t)rtextents), rsvd); + error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); if (error) { error = -ENOSPC; goto undo_log; @@ -268,8 +266,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, - (int64_t)blocks, rsvd); + xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); tp->t_blk_res = 0; } @@ -488,6 +485,54 @@ xfs_trans_apply_sb_deltas( sizeof(sbp->sb_frextents) - 1); } +STATIC int +xfs_sb_mod8( + uint8_t *field, + int8_t delta) +{ + int8_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod32( + uint32_t *field, + int32_t delta) +{ + int32_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod64( + uint64_t *field, + int64_t delta) +{ + int64_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + /* * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations * and apply superblock counter changes to the in-core superblock. The @@ -495,13 +540,6 @@ xfs_trans_apply_sb_deltas( * applied to the in-core superblock. The idea is that that has already been * done. * - * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). - * However, we have to ensure that we only modify each superblock field only - * once because the application of the delta values may not be atomic. That can - * lead to ENOSPC races occurring if we have two separate modifcations of the - * free space counter to put back the entire reservation and then take away - * what we used. - * * If we are not logging superblock counters, then the inode allocated/free and * used block counts are not updated in the on disk superblock. In this case, * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we @@ -509,21 +547,15 @@ xfs_trans_apply_sb_deltas( */ void xfs_trans_unreserve_and_mod_sb( - xfs_trans_t *tp) + struct xfs_trans *tp) { - xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ - xfs_mod_sb_t *msbp; - xfs_mount_t *mp = tp->t_mountp; - /* REFERENCED */ - int error; - int rsvd; - int64_t blkdelta = 0; - int64_t rtxdelta = 0; - int64_t idelta = 0; - int64_t ifreedelta = 0; - - msbp = msb; - rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + struct xfs_mount *mp = tp->t_mountp; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; + int64_t idelta = 0; + int64_t ifreedelta = 0; + int error; /* calculate deltas */ if (tp->t_blk_res > 0) @@ -547,97 +579,115 @@ xfs_trans_unreserve_and_mod_sb( /* apply the per-cpu counters */ if (blkdelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - blkdelta, rsvd); + error = xfs_mod_fdblocks(mp, blkdelta, rsvd); if (error) goto out; } if (idelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, - idelta, rsvd); + error = xfs_mod_icount(mp, idelta); if (error) goto out_undo_fdblocks; } if (ifreedelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, - ifreedelta, rsvd); + error = xfs_mod_ifree(mp, ifreedelta); if (error) goto out_undo_icount; } + if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + return; + /* apply remaining deltas */ - if (rtxdelta != 0) { - msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = rtxdelta; - msbp++; + spin_lock(&mp->m_sb_lock); + if (rtxdelta) { + error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); + if (error) + goto out_undo_ifree; } - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { - if (tp->t_dblocks_delta != 0) { - msbp->msb_field = XFS_SBS_DBLOCKS; - msbp->msb_delta = tp->t_dblocks_delta; - msbp++; - } - if (tp->t_agcount_delta != 0) { - msbp->msb_field = XFS_SBS_AGCOUNT; - msbp->msb_delta = tp->t_agcount_delta; - msbp++; - } - if (tp->t_imaxpct_delta != 0) { - msbp->msb_field = XFS_SBS_IMAX_PCT; - msbp->msb_delta = tp->t_imaxpct_delta; - msbp++; - } - if (tp->t_rextsize_delta != 0) { - msbp->msb_field = XFS_SBS_REXTSIZE; - msbp->msb_delta = tp->t_rextsize_delta; - msbp++; - } - if (tp->t_rbmblocks_delta != 0) { - msbp->msb_field = XFS_SBS_RBMBLOCKS; - msbp->msb_delta = tp->t_rbmblocks_delta; - msbp++; - } - if (tp->t_rblocks_delta != 0) { - msbp->msb_field = XFS_SBS_RBLOCKS; - msbp->msb_delta = tp->t_rblocks_delta; - msbp++; - } - if (tp->t_rextents_delta != 0) { - msbp->msb_field = XFS_SBS_REXTENTS; - msbp->msb_delta = tp->t_rextents_delta; - msbp++; - } - if (tp->t_rextslog_delta != 0) { - msbp->msb_field = XFS_SBS_REXTSLOG; - msbp->msb_delta = tp->t_rextslog_delta; - msbp++; - } + if (tp->t_dblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); + if (error) + goto out_undo_frextents; } - - /* - * If we need to change anything, do it. - */ - if (msbp > msb) { - error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, - (uint)(msbp - msb), rsvd); + if (tp->t_agcount_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); if (error) - goto out_undo_ifreecount; + goto out_undo_dblocks; } - + if (tp->t_imaxpct_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); + if (error) + goto out_undo_agcount; + } + if (tp->t_rextsize_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, + tp->t_rextsize_delta); + if (error) + goto out_undo_imaxpct; + } + if (tp->t_rbmblocks_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, + tp->t_rbmblocks_delta); + if (error) + goto out_undo_rextsize; + } + if (tp->t_rblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); + if (error) + goto out_undo_rbmblocks; + } + if (tp->t_rextents_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rextents, + tp->t_rextents_delta); + if (error) + goto out_undo_rblocks; + } + if (tp->t_rextslog_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, + tp->t_rextslog_delta); + if (error) + goto out_undo_rextents; + } + spin_unlock(&mp->m_sb_lock); return; -out_undo_ifreecount: +out_undo_rextents: + if (tp->t_rextents_delta) + xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); +out_undo_rblocks: + if (tp->t_rblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); +out_undo_rbmblocks: + if (tp->t_rbmblocks_delta) + xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); +out_undo_rextsize: + if (tp->t_rextsize_delta) + xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); +out_undo_imaxpct: + if (tp->t_rextsize_delta) + xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); +out_undo_agcount: + if (tp->t_agcount_delta) + xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); +out_undo_dblocks: + if (tp->t_dblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); +out_undo_frextents: + if (rtxdelta) + xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); +out_undo_ifree: + spin_unlock(&mp->m_sb_lock); if (ifreedelta) - xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); + xfs_mod_ifree(mp, -ifreedelta); out_undo_icount: if (idelta) - xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); + xfs_mod_icount(mp, -idelta); out_undo_fdblocks: if (blkdelta) - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); + xfs_mod_fdblocks(mp, -blkdelta, rsvd); out: ASSERT(error == 0); return; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 69f6e475de97..c036815183cb 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -35,7 +35,7 @@ static int xfs_xattr_get(struct dentry *dentry, const char *name, void *value, size_t size, int xflags) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error, asize = size; if (strcmp(name, "") == 0) @@ -57,7 +57,7 @@ static int xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int xflags) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); if (strcmp(name, "") == 0) return -EINVAL; @@ -197,7 +197,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) { struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; /* |