Diffstat (limited to 'fs')
328 files changed, 22742 insertions, 15283 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 34494fbead0a..646337dc5201 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -685,16 +685,12 @@ not_found: spin_unlock(&dentry->d_lock); out_bad: - if (dentry->d_inode) { - /* don't unhash if we have submounts */ - if (have_submounts(dentry)) - goto out_skip; - } + /* don't unhash if we have submounts */ + if (check_submounts_and_drop(dentry) != 0) + goto out_skip; _debug("dropping dentry %s/%s", parent->d_name.name, dentry->d_name.name); - shrink_dcache_parent(dentry); - d_drop(dentry); dput(parent); key_put(key); @@ -755,10 +751,6 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) _enter("{%x:%u},{%s},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -820,10 +812,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) _enter("{%x:%u},{%s}", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -940,10 +928,6 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, _enter("{%x:%u},{%s},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -1009,10 +993,6 @@ static int afs_link(struct dentry *from, struct inode *dir, dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); @@ -1057,10 +1037,6 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, content); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; - ret = -EINVAL; if (strlen(content) >= AFSPATHMAX) goto error; @@ -1131,10 +1107,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dvnode->fid.vid, new_dvnode->fid.vnode, new_dentry->d_name.name); - ret = -ENAMETOOLONG; - if (new_dentry->d_name.len >= AFSNAMEMAX) - goto error; - key = afs_request_key(orig_dvnode->volume->cell); if (IS_ERR(key)) { ret = PTR_ERR(key); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 743c7c2c949d..0f00da329e71 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -183,13 +183,14 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, return 0; } +/* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, int test(struct path *path, void *data), void *data) { struct path path; - int err = kern_path(pathname, 0, &path); + int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); if (err) return err; err = -ENOENT; @@ -197,10 +198,9 @@ static int find_autofs_mount(const char *pathname, if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); - if (!err) /* already found some */ - path_put(res); *res = path; err = 0; + break; } } if (!follow_up(&path)) @@ -486,12 +486,11 @@ static int autofs_dev_ioctl_askumount(struct file *fp, * mount if there is one or 0 if it isn't a mountpoint. 
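Editorial aside, not part of the commit: the lookup pattern find_autofs_mount() switches to above. kern_path_mountpoint() resolves a path to the mountpoint dentry itself rather than to the root of whatever is mounted on top of it, and follow_up() then steps through the mounts stacked at that point. A minimal sketch of the walk, with a hypothetical matches() predicate standing in for autofs's test callback:

#include <linux/fcntl.h>
#include <linux/namei.h>
#include <linux/path.h>
#include <linux/mount.h>

/* Sketch: visit each mount stacked at @pathname until @matches says stop. */
static int walk_mount_stack(const char *pathname,
			    int (*matches)(struct path *path, void *data),
			    void *data)
{
	struct path path;
	int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);

	if (err)
		return err;
	err = -ENOENT;
	while (path.dentry == path.mnt->mnt_root) {
		if (matches(&path, data)) {	/* hypothetical predicate */
			err = 0;
			break;
		}
		if (!follow_up(&path))	/* no more mounts under this one */
			break;
	}
	path_put(&path);
	return err;
}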
* * If we aren't supplied with a file descriptor then we - * lookup the nameidata of the path and check if it is the - * root of a mount. If a type is given we are looking for - * a particular autofs mount and if we don't find a match - * we return fail. If the located nameidata path is the - * root of a mount we return 1 along with the super magic - * of the mount or 0 otherwise. + * lookup the path and check if it is the root of a mount. + * If a type is given we are looking for a particular autofs + * mount and if we don't find a match we return fail. If the + * located path is the root of a mount we return 1 along with + * the super magic of the mount or 0 otherwise. * * In both cases the the device number (as returned by * new_encode_dev()) is also returned. @@ -519,9 +518,11 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) - err = kern_path(name, LOOKUP_FOLLOW, &path); + err = kern_path_mountpoint(AT_FDCWD, + name, &path, LOOKUP_FOLLOW); else - err = find_autofs_mount(name, &path, test_by_type, &type); + err = find_autofs_mount(name, &path, + test_by_type, &type); if (err) goto out; devid = new_encode_dev(path.dentry->d_sb->s_dev); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 5e376bb93419..8defc6b3f9a2 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -40,7 +40,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) int block, off; inode = iget_locked(sb, ino); - if (IS_ERR(inode)) + if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; @@ -1045,12 +1045,22 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - int ret = 0; + struct bio_vec *bvec; + int ret = 0, i; - if (!bio_flagged(bio, BIO_NULL_MAPPED)) - ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, - bmd->nr_sgvecs, bio_data_dir(bio) == READ, - 0, bmd->is_our_pages); + if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free. 
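Editorial sketch, not from the commit: the test the bio_uncopy_user() hunk above hinges on. Kernel threads, including the workqueue workers that complete orphaned requests, run with current->mm == NULL, so there is no user address space to copy the bounce data back into; copy_back_to_iov() below is a hypothetical stand-in for the __bio_copy_iov() call.

#include <linux/bio.h>
#include <linux/gfp.h>
#include <linux/sched.h>

static int finish_bounce_buffer(struct bio *bio, bool our_pages)
{
	struct bio_vec *bvec;
	int i;

	if (current->mm)	/* process context: a user mm to copy into */
		return copy_back_to_iov(bio);	/* hypothetical */

	/* Orphaned completion: no mm, so just release our bounce pages. */
	if (our_pages)
		bio_for_each_segment_all(bvec, bio, i)
			__free_page(bvec->bv_page);
	return 0;
}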
+ */ + if (current->mm) + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, + bmd->nr_sgvecs, bio_data_dir(bio) == READ, + 0, bmd->is_our_pages); + else if (bmd->is_our_pages) + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); + } bio_free_map_data(bmd); bio_put(bio); return ret; @@ -1946,7 +1956,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_subsys_state(current, blkio_subsys_id); + css = task_css(current, blkio_subsys_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/fs/block_dev.c b/fs/block_dev.c index c7bda5cd3da7..1173a4ee0830 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1519,7 +1519,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); - if (ret > 0 || ret == -EIOCBQUEUED) { + if (ret > 0) { ssize_t err; err = generic_write_sync(file, pos, ret); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index eaf133384a8f..8bc5e8ccb091 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -36,16 +36,23 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, u64 extent_item_pos, struct extent_inode_elem **eie) { - u64 data_offset; - u64 data_len; + u64 offset = 0; struct extent_inode_elem *e; - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); + if (!btrfs_file_extent_compression(eb, fi) && + !btrfs_file_extent_encryption(eb, fi) && + !btrfs_file_extent_other_encoding(eb, fi)) { + u64 data_offset; + u64 data_len; - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - return 1; + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + offset = extent_item_pos - data_offset; + } e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) @@ -53,7 +60,7 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, e->next = *eie; e->inum = key->objectid; - e->offset = key->offset + (extent_item_pos - data_offset); + e->offset = key->offset + offset; *eie = e; return 0; @@ -189,7 +196,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *eb; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct extent_inode_elem *eie = NULL; + struct extent_inode_elem *eie = NULL, *old = NULL; u64 disk_byte; if (level != 0) { @@ -223,6 +230,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (disk_byte == wanted_disk_byte) { eie = NULL; + old = NULL; if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, @@ -230,18 +238,20 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (ret < 0) break; } - if (!ret) { - ret = ulist_add(parents, eb->start, - (uintptr_t)eie, GFP_NOFS); - if (ret < 0) - break; - if (!extent_item_pos) { - ret = btrfs_next_old_leaf(root, path, - time_seq); - continue; - } + if (ret > 0) + goto next; + ret = ulist_add_merge(parents, eb->start, + (uintptr_t)eie, + (u64 *)&old, GFP_NOFS); + if (ret < 0) + break; + if (!ret && extent_item_pos) { + while (old->next) + old = old->next; + old->next = eie; } } +next: ret = btrfs_next_old_item(root, path, time_seq); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5bf4c39e2ad6..ed504607d8ec 100644 --- 
a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1271,7 +1271,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, BUG_ON(!eb_rewin); } - extent_buffer_get(eb_rewin); btrfs_tree_read_unlock(eb); free_extent_buffer(eb); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4253ad580e39..5f8f3341c099 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -747,7 +747,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) WARN_ON(atomic_xchg( &fs_info->mutually_exclusive_operation_running, 1)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); - return PTR_RET(task); + return PTR_ERR_OR_ZERO(task); } static int btrfs_dev_replace_kthread(void *data) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 583d98bd065e..fe443fece851 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4048,7 +4048,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - u64 offset_in_extent; + u64 offset_in_extent = 0; /* break if the extent we found is outside the range */ if (em->start >= max || extent_map_end(em) < off) @@ -4064,9 +4064,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* * record the offset from the start of the extent - * for adjusting the disk offset below + * for adjusting the disk offset below. Only do this if the + * extent isn't compressed since our in ram offset may be past + * what we have actually allocated on disk. */ - offset_in_extent = em_start - em->start; + if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; emflags = em->flags; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a005fe2c072a..4d2eb6417145 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -596,20 +596,29 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, if (no_splits) goto next; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - em->start < start) { + if (em->start < start) { split->start = em->start; split->len = start - em->start; - split->orig_start = em->orig_start; - split->block_start = em->block_start; - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->ram_bytes = em->ram_bytes; - split->orig_block_len = max(split->block_len, - em->orig_block_len); + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + split->generation = gen; split->bdev = em->bdev; split->flags = flags; @@ -620,8 +629,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = split2; split2 = NULL; } - if (em->block_start < EXTENT_MAP_LAST_BYTE && - testend && em->start + em->len > start + len) { + if (testend && em->start + em->len > start + len) { u64 diff = start + len - em->start; split->start = start + len; @@ -630,18 +638,28 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; - split->orig_block_len 
= max(em->block_len, + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, em->orig_block_len); - split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + + diff; + split->orig_start = em->orig_start; + } } else { - split->block_len = split->len; - split->block_start = em->block_start + diff; - split->orig_start = em->orig_start; + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; } ret = add_extent_mapping(em_tree, split, modified); @@ -1709,7 +1727,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, */ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; BTRFS_I(inode)->last_sub_trans = root->log_transid; - if (num_written > 0 || num_written == -EIOCBQUEUED) { + if (num_written > 0) { err = generic_write_sync(file, pos, num_written); if (err < 0 && num_written > 0) num_written = err; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6d1b93c8aafb..7bdc83d04d54 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2166,16 +2166,23 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) continue; - extent_offset = btrfs_file_extent_offset(leaf, extent); - if (key.offset - extent_offset != offset) + /* + * 'offset' refers to the exact key.offset, + * NOT the 'offset' field in btrfs_extent_data_ref, ie. + * (key.offset - extent_offset). 
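A worked example of the two offsets the record_one_backref() comment above distinguishes (numbers are hypothetical): a file extent item keyed at key.offset = 64K whose data begins 8K into its on-disk extent has extent_offset = 8K, and the offset field stored in btrfs_extent_data_ref is therefore key.offset - extent_offset = 56K. The rewritten check compares against the exact key.offset, 64K, not against the 56K backref value.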
+ */ + if (key.offset != offset) continue; + extent_offset = btrfs_file_extent_offset(leaf, extent); num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + if (extent_offset >= old->extent_offset + old->offset + old->len || extent_offset + num_bytes <= old->extent_offset + old->offset) continue; + ret = 0; break; } @@ -2187,7 +2194,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, backref->root_id = root_id; backref->inum = inum; - backref->file_pos = offset + extent_offset; + backref->file_pos = offset; backref->num_bytes = num_bytes; backref->extent_offset = extent_offset; backref->generation = btrfs_file_extent_generation(leaf, extent); @@ -2210,7 +2217,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, new->path = path; list_for_each_entry_safe(old, tmp, &new->head, list) { - ret = iterate_inodes_from_logical(old->bytenr, fs_info, + ret = iterate_inodes_from_logical(old->bytenr + + old->extent_offset, fs_info, path, record_one_backref, old); BUG_ON(ret < 0 && ret != -ENOENT); @@ -3166,7 +3174,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - ret = PTR_RET(inode); + ret = PTR_ERR_OR_ZERO(inode); if (ret && ret != -ESTALE) goto out; @@ -4391,9 +4399,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) int mask = attr->ia_valid; int ret; - if (newsize == oldsize) - return 0; - /* * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having @@ -5165,14 +5170,31 @@ next: } /* Reached end of directory/root. Bump pos past the last item. */ - if (key_type == BTRFS_DIR_INDEX_KEY) - /* - * 32-bit glibc will use getdents64, but then strtol - - * so the last number we can serve is this. - */ - ctx->pos = 0x7fffffff; - else - ctx->pos++; + ctx->pos++; + + /* + * Stop new entries from being returned after we return the last + * entry. + * + * New directory entries are assigned a strictly increasing + * offset. This means that new entries created during readdir + * are *guaranteed* to be seen in the future by that readdir. + * This has broken buggy programs which operate on names as + * they're returned by readdir. Until we re-use freed offsets + * we have this hack to stop new entries from being returned + * under the assumption that they'll never reach this huge + * offset. + * + * This is being careful not to overflow 32bit loff_t unless the + * last entry requires it because doing so has broken 32bit apps + * in the past. 
+ */ + if (key_type == BTRFS_DIR_INDEX_KEY) { + if (ctx->pos >= INT_MAX) + ctx->pos = LLONG_MAX; + else + ctx->pos = INT_MAX; + } nopos: ret = 0; err: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d3f3b43cae0b..2e14fd89a8b4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -219,7 +219,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) len = PAGE_ALIGN(len); if (p->buf == p->inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS); + tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); if (!tmp_buf) { tmp_buf = vmalloc(len); if (!tmp_buf) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d58cce77fc6c..af1931a5960d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -983,12 +983,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root) +void btrfs_add_dead_root(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - list_add_tail(&root->root_list, &root->fs_info->dead_roots); + if (list_empty(&root->root_list)) + list_add_tail(&root->root_list, &root->fs_info->dead_roots); spin_unlock(&root->fs_info->trans_lock); - return 0; } /* @@ -1925,7 +1925,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); - list_del(&root->root_list); + list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); pr_debug("btrfs: cleaner removing %llu\n", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 005b0375d18c..defbc4269897 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -143,7 +143,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root); +void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2c6791493637..ff60d8978ae2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3746,8 +3746,9 @@ next_slot: } log_extents: + btrfs_release_path(path); + btrfs_release_path(dst_path); if (fast_search) { - btrfs_release_path(dst_path); ret = btrfs_log_changed_extents(trans, root, inode, dst_path); if (ret) { err = ret; @@ -3764,8 +3765,6 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - btrfs_release_path(path); - btrfs_release_path(dst_path); ret = log_directory_changes(trans, root, inode, path, dst_path); if (ret) { err = ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 78b871753cb6..67a085381845 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3302,7 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); - return PTR_RET(tsk); + return PTR_ERR_OR_ZERO(tsk); } int btrfs_recover_balance(struct btrfs_fs_info *fs_info) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index d4c1206af9fc..43eb5592cdea 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -378,6 +378,31 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) } /* + * check if the backing cache is updated to FS-Cache + * - 
called by FS-Cache when evaluates if need to invalidate the cache + */ +static bool cachefiles_check_consistency(struct fscache_operation *op) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("{OBJ%x}", op->object->debug_id); + + object = container_of(op->object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_check_auxdata(object); + cachefiles_end_secure(cache, saved_cred); + + _leave(" = %d", ret); + return ret; +} + +/* * notification the attributes on an object have changed * - called with reads/writes excluded by FS-Cache */ @@ -522,4 +547,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = { .write_page = cachefiles_write_page, .uncache_page = cachefiles_uncache_page, .dissociate_pages = cachefiles_dissociate_pages, + .check_consistency = cachefiles_check_consistency, }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 49382519907a..5349473df1b1 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -235,6 +235,7 @@ extern int cachefiles_set_object_xattr(struct cachefiles_object *object, struct cachefiles_xattr *auxdata); extern int cachefiles_update_object_xattr(struct cachefiles_object *object, struct cachefiles_xattr *auxdata); +extern int cachefiles_check_auxdata(struct cachefiles_object *object); extern int cachefiles_check_object_xattr(struct cachefiles_object *object, struct cachefiles_xattr *auxdata); extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 2476e5162609..34c88b83e39f 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -157,6 +157,42 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, } /* + * check the consistency between the backing cache and the FS-Cache cookie + */ +int cachefiles_check_auxdata(struct cachefiles_object *object) +{ + struct cachefiles_xattr *auxbuf; + struct dentry *dentry = object->dentry; + unsigned int dlen; + int ret; + + ASSERT(dentry); + ASSERT(dentry->d_inode); + ASSERT(object->fscache.cookie->def->check_aux); + + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); + if (!auxbuf) + return -ENOMEM; + + auxbuf->len = vfs_getxattr(dentry, cachefiles_xattr_cache, + &auxbuf->type, 512 + 1); + if (auxbuf->len < 1) + return -ESTALE; + + if (auxbuf->type != object->fscache.cookie->def->type) + return -ESTALE; + + dlen = auxbuf->len - 1; + ret = fscache_check_aux(&object->fscache, &auxbuf->data, dlen); + + kfree(auxbuf); + if (ret != FSCACHE_CHECKAUX_OKAY) + return -ESTALE; + + return 0; +} + +/* * check the state xattr on a cache file * - return -ESTALE if the object should be deleted */ diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 49bc78243db9..ac9a2ef5bb9b 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -16,3 +16,12 @@ config CEPH_FS If unsure, say N. 
+if CEPH_FS +config CEPH_FSCACHE + bool "Enable Ceph client caching support" + depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y + help + Choose Y here to enable persistent, read-only local + caching support for Ceph clients using FS-Cache + +endif diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index bd352125e829..32e30106a2f0 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -9,3 +9,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5318a3b704f6..6df8bd481425 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -11,6 +11,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/osd_client.h> /* @@ -70,15 +71,16 @@ static int ceph_set_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct inode *inode; struct ceph_inode_info *ci; - int undo = 0; struct ceph_snap_context *snapc; + int ret; if (unlikely(!mapping)) return !TestSetPageDirty(page); - if (TestSetPageDirty(page)) { + if (PageDirty(page)) { dout("%p set_page_dirty %p idx %lu -- already dirty\n", mapping->host, page, page->index); + BUG_ON(!PagePrivate(page)); return 0; } @@ -107,35 +109,19 @@ static int ceph_set_page_dirty(struct page *page) snapc, snapc->seq, snapc->num_snaps); spin_unlock(&ci->i_ceph_lock); - /* now adjust page */ - spin_lock_irq(&mapping->tree_lock); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, page->mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - - /* - * Reference snap context in page->private. Also set - * PagePrivate so that we get invalidatepage callback. - */ - page->private = (unsigned long)snapc; - SetPagePrivate(page); - } else { - dout("ANON set_page_dirty %p (raced truncate?)\n", page); - undo = 1; - } - - spin_unlock_irq(&mapping->tree_lock); - - if (undo) - /* whoops, we failed to dirty the page */ - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. 
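Editorial sketch, not part of the commit, of the page->private handoff that the reworked ceph_set_page_dirty() and ceph_invalidatepage() share: the dirtying side stashes a referenced snap context and sets PagePrivate so the invalidatepage callback fires; the invalidate side is where that reference comes back out.

#include <linux/bug.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>

struct ceph_snap_context;

/* Dirtying side: one snapc reference rides along in page->private. */
static void page_attach_snapc(struct page *page,
			      struct ceph_snap_context *snapc)
{
	BUG_ON(PagePrivate(page));
	page->private = (unsigned long)snapc;
	SetPagePrivate(page);
}

/* Invalidate side: hand the reference back to the caller. */
static struct ceph_snap_context *page_detach_snapc(struct page *page)
{
	struct ceph_snap_context *snapc = (void *)page->private;

	page->private = 0;
	ClearPagePrivate(page);
	return snapc;	/* caller drops it via ceph_put_snap_context() */
}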
+ */ + BUG_ON(PagePrivate(page)); + page->private = (unsigned long)snapc; + SetPagePrivate(page); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + ret = __set_page_dirty_nobuffers(page); + WARN_ON(!PageLocked(page)); + WARN_ON(!page->mapping); - BUG_ON(!PageDirty(page)); - return 1; + return ret; } /* @@ -150,11 +136,19 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, struct ceph_inode_info *ci; struct ceph_snap_context *snapc = page_snap_context(page); - BUG_ON(!PageLocked(page)); - BUG_ON(!PagePrivate(page)); - BUG_ON(!page->mapping); - inode = page->mapping->host; + ci = ceph_inode(inode); + + if (offset != 0 || length != PAGE_CACHE_SIZE) { + dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", + inode, page, page->index, offset, length); + return; + } + + ceph_invalidate_fscache_page(inode, page); + + if (!PagePrivate(page)) + return; /* * We can get non-dirty pages here due to races between @@ -164,31 +158,28 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, if (!PageDirty(page)) pr_err("%p invalidatepage %p page not dirty\n", inode, page); - if (offset == 0 && length == PAGE_CACHE_SIZE) - ClearPageChecked(page); + ClearPageChecked(page); - ci = ceph_inode(inode); - if (offset == 0 && length == PAGE_CACHE_SIZE) { - dout("%p invalidatepage %p idx %lu full dirty page\n", - inode, page, page->index); - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); - page->private = 0; - ClearPagePrivate(page); - } else { - dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n", - inode, page, page->index, offset, length); - } + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); + + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); } -/* just a sanity check */ static int ceph_releasepage(struct page *page, gfp_t g) { struct inode *inode = page->mapping ? page->mapping->host : NULL; dout("%p releasepage %p idx %lu\n", inode, page, page->index); WARN_ON(PageDirty(page)); - WARN_ON(PagePrivate(page)); - return 0; + + /* Can we release the page from the cache? */ + if (!ceph_release_fscache_page(page, g)) + return 0; + + return !PagePrivate(page); } /* @@ -198,11 +189,16 @@ static int readpage_nounlock(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = + struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; int err = 0; u64 len = PAGE_CACHE_SIZE; + err = ceph_readpage_from_fscache(inode, page); + + if (err == 0) + goto out; + dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, @@ -220,6 +216,9 @@ static int readpage_nounlock(struct file *filp, struct page *page) } SetPageUptodate(page); + if (err == 0) + ceph_readpage_to_fscache(inode, page); + out: return err < 0 ? 
err : 0; } @@ -262,6 +261,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) page->index); flush_dcache_page(page); SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); unlock_page(page); page_cache_release(page); bytes -= PAGE_CACHE_SIZE; @@ -331,11 +331,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) page = list_entry(page_list->prev, struct page, lru); BUG_ON(PageLocked(page)); list_del(&page->lru); - + dout("start_read %p adding %p idx %lu\n", inode, page, page->index); if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_NOFS)) { + ceph_fscache_uncache_page(inode, page); page_cache_release(page); dout("start_read %p add_to_page_cache failed %p\n", inode, page); @@ -378,6 +379,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, int rc = 0; int max = 0; + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, + &nr_pages); + + if (rc == 0) + goto out; + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; @@ -392,6 +399,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, BUG_ON(rc == 0); } out: + ceph_fscache_readpages_cancel(inode, page_list); + dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } @@ -497,6 +506,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + ceph_readpage_to_fscache(inode, page); + set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, @@ -552,7 +563,6 @@ static void ceph_release_pages(struct page **pages, int num) pagevec_release(&pvec); } - /* * async writeback completion handler. * diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c new file mode 100644 index 000000000000..6bfe65e0b038 --- /dev/null +++ b/fs/ceph/cache.c @@ -0,0 +1,398 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include "super.h" +#include "cache.h" + +struct ceph_aux_inode { + struct timespec mtime; + loff_t size; +}; + +struct fscache_netfs ceph_cache_netfs = { + .name = "ceph", + .version = 0, +}; + +static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_fs_client* fsc = cookie_netfs_data; + uint16_t klen; + + klen = sizeof(fsc->client->fsid); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &fsc->client->fsid, klen); + return klen; +} + +static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { + .name = "CEPH.fsid", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = ceph_fscache_session_get_key, +}; + +int ceph_fscache_register(void) +{ + return fscache_register_netfs(&ceph_cache_netfs); +} + +void ceph_fscache_unregister(void) +{ + fscache_unregister_netfs(&ceph_cache_netfs); +} + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, + &ceph_fscache_fsid_object_def, + fsc); + + if (fsc->fscache == NULL) { + pr_err("Unable to resgister fsid: %p fscache cookie", fsc); + return 0; + } + + fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); + if (fsc->revalidate_wq == NULL) + return -ENOMEM; + + return 0; +} + +static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + uint16_t klen; + + /* use ceph virtual inode (id + snaphot) */ + klen = sizeof(ci->i_vino); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &ci->i_vino, klen); + return klen; +} + +static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct ceph_aux_inode aux; + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + memcpy(buffer, &aux, sizeof(aux)); + + return sizeof(aux); +} + +static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + *size = inode->i_size; +} + +static enum fscache_checkaux ceph_fscache_inode_check_aux( + void *cookie_netfs_data, const void *data, uint16_t dlen) +{ + struct ceph_aux_inode aux; + struct ceph_inode_info* ci = cookie_netfs_data; + struct inode* inode = &ci->vfs_inode; + + if (dlen != sizeof(aux)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + if (memcmp(data, &aux, sizeof(aux)) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + dout("ceph inode 0x%p cached okay", ci); + return FSCACHE_CHECKAUX_OKAY; +} + +static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) +{ + struct ceph_inode_info* ci = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dout("ceph inode 0x%p now uncached", ci); + + while (1) { + nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + 
ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +static const struct fscache_cookie_def ceph_fscache_inode_object_def = { + .name = "CEPH.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = ceph_fscache_inode_get_key, + .get_attr = ceph_fscache_inode_get_attr, + .get_aux = ceph_fscache_inode_get_aux, + .check_aux = ceph_fscache_inode_check_aux, + .now_uncached = ceph_fscache_inode_now_uncached, +}; + +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci) +{ + struct inode* inode = &ci->vfs_inode; + + /* No caching for filesystem */ + if (fsc->fscache == NULL) + return; + + /* Only cache for regular files that are read only */ + if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + return; + + /* Avoid multiple racing open requests */ + mutex_lock(&inode->i_mutex); + + if (ci->fscache) + goto done; + + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci); +done: + mutex_unlock(&inode->i_mutex); + +} + +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ + struct fscache_cookie* cookie; + + if ((cookie = ci->fscache) == NULL) + return; + + ci->fscache = NULL; + + fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); + fscache_relinquish_cookie(cookie, 0); +} + +static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); +} + +static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +static inline int cache_valid(struct ceph_inode_info *ci) +{ + return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && + (ci->i_fscache_gen == ci->i_rdcache_gen)); +} + + +/* Atempt to read from the fscache, + * + * This function is called from the readpage_nounlock context. DO NOT attempt to + * unlock the page here (or in the callback). 
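Editorial sketch of the read-through flow this helper slots into, mirroring readpage_nounlock() earlier in the diff; ceph_read_from_osd() is a hypothetical stand-in for the ceph_osdc_readpages() fallback. A return of 0 from ceph_readpage_from_fscache() means the cache read was submitted and its completion callback will mark the page uptodate.

/* Assumes the declarations from fs/ceph/cache.h added by this commit. */
static int ceph_read_through(struct inode *inode, struct file *filp,
			     struct page *page)
{
	int err = ceph_readpage_from_fscache(inode, page);

	if (err == 0)
		return 0;	/* served (asynchronously) from the local cache */

	err = ceph_read_from_osd(inode, filp, page);	/* hypothetical */
	if (err == 0)
		ceph_readpage_to_fscache(inode, page);	/* populate cache */
	return err;
}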
+ */ +int ceph_readpage_from_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(ci->fscache, page, + ceph_vfs_readpage_complete, NULL, + GFP_KERNEL); + + switch (ret) { + case 0: /* Page found */ + dout("page read submitted\n"); + return 0; + case -ENOBUFS: /* Pages were not found, and can't be */ + case -ENODATA: /* Pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, + ceph_vfs_readpage_complete_unlock, + NULL, mapping_gfp_mask(mapping)); + + switch (ret) { + case 0: /* All pages found */ + dout("all-page read submitted\n"); + return 0; + case -ENOBUFS: /* Some pages were not found, and can't be */ + case -ENODATA: /* some pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +void ceph_readpage_to_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!PageFsCache(page)) + return; + + if (!cache_valid(ci)) + return; + + ret = fscache_write_page(ci->fscache, page, GFP_KERNEL); + if (ret) + fscache_uncache_page(ci->fscache, page); +} + +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + fscache_wait_on_page_write(ci->fscache, page); + fscache_uncache_page(ci->fscache, page); +} + +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ + if (fsc->revalidate_wq) + destroy_workqueue(fsc->revalidate_wq); + + fscache_relinquish_cookie(fsc->fscache, 0); + fsc->fscache = NULL; +} + +static void ceph_revalidate_work(struct work_struct *work) +{ + int issued; + u32 orig_gen; + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_revalidate_work); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + orig_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); + + if (!(issued & CEPH_CAP_FILE_CACHE)) { + dout("revalidate_work lost cache before validation %p\n", + inode); + goto out; + } + + if (!fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + + spin_lock(&ci->i_ceph_lock); + /* Update the new valid generation (backwards sanity check too) */ + if (orig_gen > ci->i_fscache_gen) { + ci->i_fscache_gen = orig_gen; + } + spin_unlock(&ci->i_ceph_lock); + +out: + iput(&ci->vfs_inode); +} + +void ceph_queue_revalidate(struct inode *inode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + + if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + return; + + ihold(inode); + + if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, + &ci->i_revalidate_work)) { + dout("ceph_queue_revalidate %p\n", inode); + } else { + dout("ceph_queue_revalidate %p failed\n)", inode); + iput(inode); + } +} + +void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ + ci->fscache = NULL; + /* The first load is 
verifed cookie open time */ + ci->i_fscache_gen = 1; + INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); +} diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h new file mode 100644 index 000000000000..ba949408a336 --- /dev/null +++ b/fs/ceph/cache.h @@ -0,0 +1,159 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#ifndef _CEPH_CACHE_H +#define _CEPH_CACHE_H + +#ifdef CONFIG_CEPH_FSCACHE + +extern struct fscache_netfs ceph_cache_netfs; + +int ceph_fscache_register(void); +void ceph_fscache_unregister(void); + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc); +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); + +void ceph_fscache_inode_init(struct ceph_inode_info *ci); +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci); +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); + +int ceph_readpage_from_fscache(struct inode *inode, struct page *page); +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +void ceph_readpage_to_fscache(struct inode *inode, struct page *page); +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); +void ceph_queue_revalidate(struct inode *inode); + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ + fscache_invalidate(ceph_inode(inode)->fscache); +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_uncache_page(ci->fscache, page); +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + struct inode* inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_maybe_release_page(ci->fscache, page, gfp); +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_readpages_cancel(ci->fscache, pages); +} + +#else + +static inline int ceph_fscache_register(void) +{ + return 0; +} + +static inline void ceph_fscache_unregister(void) +{ +} + +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + return 0; +} + +static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ +} + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ +} + +static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, + struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *pages) +{ +} + +static inline int ceph_readpage_from_fscache(struct inode* inode, + struct page *page) +{ + return 
-ENOBUFS; +} + +static inline int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void ceph_readpage_to_fscache(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ +} + +static inline void ceph_invalidate_fscache_page(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + return 1; +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ +} + +static inline void ceph_queue_revalidate(struct inode *inode) +{ +} + +#endif + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 25442b40c25a..13976c33332e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -10,6 +10,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> @@ -479,8 +480,9 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * i_rdcache_gen. */ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; + } /* * if we are newly issued FILE_SHARED, mark dir not complete; we @@ -2072,19 +2074,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); - if (!(need & CEPH_CAP_FILE_WR)) - mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - if (!(need & CEPH_CAP_FILE_WR)) - mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); } - if (need & CEPH_CAP_FILE_WR) { + have = __ceph_caps_issued(ci, &implemented); + + if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); - if (endoff > ci->i_wanted_max_size) { + if (endoff > ci->i_requested_max_size) { *check_max = 1; ret = 1; } @@ -2099,7 +2099,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } } - have = __ceph_caps_issued(ci, &implemented); if ((have & need) == need) { /* @@ -2141,14 +2140,17 @@ static void check_max_size(struct inode *inode, loff_t endoff) /* do we need to explicitly request a larger max_size? */ spin_lock(&ci->i_ceph_lock); - if ((endoff >= ci->i_max_size || - endoff > (inode->i_size << 1)) && - endoff > ci->i_wanted_max_size) { + if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { dout("write %p at large endoff %llu, req max_size\n", inode, endoff); ci->i_wanted_max_size = endoff; - check = 1; } + /* duplicate ceph_check_caps()'s logic */ + if (ci->i_auth_cap && + (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && + ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) + check = 1; spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); @@ -2334,6 +2336,38 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } /* + * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. 
+ */ +static void invalidate_aliases(struct inode *inode) +{ + struct dentry *dn, *prev = NULL; + + dout("invalidate_aliases inode %p\n", inode); + d_prune_aliases(inode); + /* + * For non-directory inode, d_find_alias() only returns + * connected dentry. After calling d_invalidate(), the + * dentry become disconnected. + * + * For directory inode, d_find_alias() can return + * disconnected dentry. But directory inode should have + * one alias at most. + */ + while ((dn = d_find_alias(inode))) { + if (dn == prev) { + dput(dn); + break; + } + d_invalidate(dn); + if (prev) + dput(prev); + prev = dn; + } + if (prev) + dput(prev); +} + +/* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * @@ -2361,8 +2395,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, int check_caps = 0; int wake = 0; int writeback = 0; - int revoked_rdcache = 0; int queue_invalidate = 0; + int deleted_inode = 0; + int queue_revalidate = 0; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); @@ -2377,9 +2412,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { - if (try_nonblocking_invalidate(inode) == 0) { - revoked_rdcache = 1; - } else { + if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { @@ -2387,6 +2420,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } + + ceph_fscache_invalidate(inode); } /* side effects now are allowed */ @@ -2407,8 +2442,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) + if ((issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); + if (inode->i_nlink == 0 && + (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = 1; + } if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); @@ -2424,6 +2463,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } } + /* Do we need to revalidate our fscache cookie. Don't bother on the + * first cache cap as we already validate at cookie creation time. */ + if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) + queue_revalidate = 1; + /* size/ctime/mtime/atime? 
*/ ceph_fill_file_size(inode, issued, le32_to_cpu(grant->truncate_seq), @@ -2508,6 +2552,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, BUG_ON(cap->issued & ~cap->implemented); spin_unlock(&ci->i_ceph_lock); + if (writeback) /* * queue inode for writeback: we can't actually call @@ -2517,6 +2562,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); + if (deleted_inode) + invalidate_aliases(inode); + if (queue_revalidate) + ceph_queue_revalidate(inode); if (wake) wake_up_all(&ci->i_cap_wq); @@ -2673,8 +2722,10 @@ static void handle_cap_trunc(struct inode *inode, truncate_seq, truncate_size, size); spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) + if (queue_trunc) { ceph_queue_vmtruncate(inode); + ceph_fscache_invalidate(inode); + } } /* diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a40ceda47a32..868b61d56cac 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -793,6 +793,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_SHARED on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; err = ceph_mdsc_do_request(mdsc, dir, req); if (err) { d_drop(dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2ddf061c1c4a..3de89829e2a1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -8,9 +8,11 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/aio.h> +#include <linux/falloc.h> #include "super.h" #include "mds_client.h" +#include "cache.h" /* * Ceph file operations @@ -68,9 +70,23 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; switch (inode->i_mode & S_IFMT) { case S_IFREG: + /* First file open request creates the cookie, we want to keep + * this cookie around for the filetime of the inode as not to + * have to worry about fscache register / revoke / operation + * races. + * + * Also, if we know the operation is going to invalidate data + * (non readonly) just nuke the cache right away. 
+ */ + ceph_fscache_register_inode_cookie(mdsc->fsc, ci); + if ((fmode & CEPH_FILE_MODE_WR)) + ceph_fscache_invalidate(inode); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); @@ -181,6 +197,7 @@ int ceph_open(struct inode *inode, struct file *file) spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } + spin_unlock(&ci->i_ceph_lock); dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); @@ -191,6 +208,7 @@ int ceph_open(struct inode *inode, struct file *file) } req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; if (flags & (O_CREAT|O_TRUNC)) parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); @@ -313,9 +331,9 @@ static int striped_read(struct inode *inode, { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - u64 pos, this_len; + u64 pos, this_len, left; int io_align, page_align; - int left, pages_left; + int pages_left; int read; struct page **page_pos; int ret; @@ -346,47 +364,40 @@ more: ret = 0; hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; - dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, + dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); - if (ret > 0) { - int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; - - if (read < pos - off) { - dout(" zero gap %llu to %llu\n", off + read, pos); - ceph_zero_page_vector_range(page_align + read, - pos - off - read, pages); + if (ret >= 0) { + int didpages; + if (was_short && (pos + ret < inode->i_size)) { + u64 tmp = min(this_len - ret, + inode->i_size - pos - ret); + dout(" zero gap %llu to %llu\n", + pos + ret, pos + ret + tmp); + ceph_zero_page_vector_range(page_align + read + ret, + tmp, pages); + ret += tmp; } + + didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; pos += ret; read = pos - off; left -= ret; page_pos += didpages; pages_left -= didpages; - /* hit stripe? */ - if (left && hit_stripe) + /* hit stripe and need continue*/ + if (left && hit_stripe && pos < inode->i_size) goto more; } - if (was_short) { + if (read > 0) { + ret = read; /* did we bounce off eof? */ if (pos + left > inode->i_size) *checkeof = 1; - - /* zero trailing bytes (inside i_size) */ - if (left > 0 && pos < inode->i_size) { - if (pos + left > inode->i_size) - left = inode->i_size - pos; - - dout("zero tail %d\n", left); - ceph_zero_page_vector_range(page_align + read, left, - pages); - read += left; - } } - if (ret >= 0) - ret = read; dout("striped_read returns %d\n", ret); return ret; } @@ -618,6 +629,8 @@ out: if (check_caps) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + } else if (ret != -EOLDSNAPC && written > 0) { + ret = written; } return ret; } @@ -659,7 +672,6 @@ again: if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || (fi->flags & CEPH_F_SYNC)) /* hmm, this isn't really async... 
*/ ret = ceph_sync_read(filp, base, len, ppos, &checkeof); @@ -711,13 +723,11 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, &ceph_sb_to_client(inode->i_sb)->client->osdc; ssize_t count, written = 0; int err, want, got; - bool hold_mutex; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; mutex_lock(&inode->i_mutex); - hold_mutex = true; err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); if (err) @@ -763,18 +773,31 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || (fi->flags & CEPH_F_SYNC)) { mutex_unlock(&inode->i_mutex); written = ceph_sync_write(file, iov->iov_base, count, pos, &iocb->ki_pos); + if (written == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u" + "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), + pos, (unsigned)iov->iov_len); + mutex_lock(&inode->i_mutex); + goto retry_snap; + } } else { + /* + * No need to acquire the i_truncate_mutex. Because + * the MDS revokes Fwb caps before sending truncate + * message to us. We can't get Fwb cap while there + * are pending vmtruncate. So write and vmtruncate + * can not run at the same time + */ written = generic_file_buffered_write(iocb, iov, nr_segs, pos, &iocb->ki_pos, count, 0); mutex_unlock(&inode->i_mutex); } - hold_mutex = false; if (written >= 0) { int dirty; @@ -798,18 +821,12 @@ retry_snap: written = err; } - if (written == -EOLDSNAPC) { - dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); - mutex_lock(&inode->i_mutex); - hold_mutex = true; - goto retry_snap; - } + goto out_unlocked; + out: - if (hold_mutex) - mutex_unlock(&inode->i_mutex); + mutex_unlock(&inode->i_mutex); +out_unlocked: current->backing_dev_info = NULL; - return written ? written : err; } @@ -822,7 +839,6 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) int ret; mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); @@ -871,6 +887,204 @@ out: return offset; } +static inline void ceph_zero_partial_page( + struct inode *inode, loff_t offset, unsigned size) +{ + struct page *page; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_CACHE_SIZE) { + loff_t size = round_down(length, PAGE_CACHE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; + } + if (length) + ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op; + + if (!length) { + op = offset ? 
CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; + length = &zero; + } else { + op = CEPH_OSD_OP_ZERO; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 1, op, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, + &inode->i_mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; + } + ceph_osdc_put_request(req); + +out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + s32 stripe_unit = ceph_file_layout_su(ci->i_layout); + s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + s32 object_size = ceph_file_layout_object_size(ci->i_layout); + u64 object_set_size = object_size * stripe_count; + u64 nearly, t; + + /* round offset up to next period boundary */ + nearly = offset + object_set_size - 1; + t = nearly; + nearly -= do_div(t, object_set_size); + + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size; + } + while (length) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + return ret; +} + +static long ceph_fallocate(struct file *file, int mode, + loff_t offset, loff_t length) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int want, got = 0; + int dirty; + int ret = 0; + loff_t endoff = 0; + loff_t size; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + mutex_lock(&inode->i_mutex); + + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto unlock; + } + + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { + ret = -ENOSPC; + goto unlock; + } + + size = i_size_read(inode); + if (!(mode & FALLOC_FL_KEEP_SIZE)) + endoff = offset + length; + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); + if (ret < 0) + goto unlock; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset < size) + ceph_zero_pagecache_range(inode, offset, length); + ret = ceph_zero_objects(inode, offset, length); + } else if (endoff > size) { + truncate_pagecache_range(inode, size, -1); + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, NULL); + } + + if (!ret) { + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + ceph_put_cap_refs(ci, got); +unlock: + 
mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, @@ -887,5 +1101,6 @@ const struct file_operations ceph_file_fops = { .splice_write = generic_file_splice_write, .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, + .fallocate = ceph_fallocate, }; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f3a2abf28a77..8549a48115f7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -12,6 +12,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/decode.h> /* @@ -344,6 +345,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) for (i = 0; i < CEPH_FILE_MODE_NUM; i++) ci->i_nr_by_mode[i] = 0; + mutex_init(&ci->i_truncate_mutex); ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; @@ -377,6 +379,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); + ceph_fscache_inode_init(ci); + return &ci->vfs_inode; } @@ -396,6 +400,8 @@ void ceph_destroy_inode(struct inode *inode) dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + ceph_fscache_unregister_inode_cookie(ci); + ceph_queue_caps_release(inode); /* @@ -430,7 +436,6 @@ void ceph_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ceph_i_callback); } - /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date @@ -455,16 +460,20 @@ int ceph_fill_file_size(struct inode *inode, int issued, dout("truncate_seq %u -> %u\n", ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; + + /* the MDS should have revoked these caps */ + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_LAZYIO)); /* * If we hold relevant caps, or in the case where we're * not the only client referencing this file and we * don't hold those caps, then we need to check whether * the file is either opened or mmaped */ - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| - CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| - CEPH_CAP_FILE_EXCL| - CEPH_CAP_FILE_LAZYIO)) || + if ((issued & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_BUFFER)) || mapping_mapped(inode->i_mapping) || __ceph_caps_file_wanted(ci)) { ci->i_truncate_pending++; @@ -478,6 +487,10 @@ int ceph_fill_file_size(struct inode *inode, int issued, truncate_size); ci->i_truncate_size = truncate_size; } + + if (queue_trunc) + ceph_fscache_invalidate(inode); + return queue_trunc; } @@ -1066,7 +1079,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * complete. */ ceph_set_dentry_offset(req->r_old_dentry); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); dn = req->r_old_dentry; /* use old_dentry */ @@ -1419,18 +1432,20 @@ static void ceph_invalidate_work(struct work_struct *work) u32 orig_gen; int check = 0; + mutex_lock(&ci->i_truncate_mutex); spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! 
*/ spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); goto out; } orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(inode->i_mapping, 0); spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1445,6 +1460,7 @@ static void ceph_invalidate_work(struct work_struct *work) ci->i_rdcache_revoking); } spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); if (check) ceph_check_caps(ci, 0, NULL); @@ -1465,9 +1481,7 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - mutex_unlock(&inode->i_mutex); iput(inode); } @@ -1480,6 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); ihold(inode); + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); @@ -1500,11 +1515,13 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) u64 to; int wrbuffer_refs, finish = 0; + mutex_lock(&ci->i_truncate_mutex); retry: spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { dout("__do_pending_vmtruncate %p none pending\n", inode); spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); return; } @@ -1521,6 +1538,9 @@ retry: goto retry; } + /* there should be no reader or writer */ + WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); + to = ci->i_truncate_size; wrbuffer_refs = ci->i_wrbuffer_ref; dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, @@ -1538,13 +1558,14 @@ retry: if (!finish) goto retry; + mutex_unlock(&ci->i_truncate_mutex); + if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); wake_up_all(&ci->i_cap_wq); } - /* * symlinks */ @@ -1586,8 +1607,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - __ceph_do_pending_vmtruncate(inode); - err = inode_change_ok(inode, attr); if (err != 0) return err; @@ -1768,7 +1787,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode); + if (mask & CEPH_SETATTR_SIZE) + __ceph_do_pending_vmtruncate(inode); return err; out: spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index e0b4ef31d3c8..669622fd1ae3 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -196,8 +196,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); - if (r < 0) + if (r < 0) { + up_read(&osdc->map_sem); return -EIO; + } dl.file_offset -= dl.object_offset; dl.object_size = ceph_file_layout_object_size(ci->i_layout); dl.block_size = ceph_file_layout_su(ci->i_layout); @@ -209,8 +211,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, + ceph_file_layout_pg_pool(ci->i_layout)); + if (r < 0) { + up_read(&osdc->map_sem); + return r; + } dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { diff --git a/fs/ceph/mds_client.c 
b/fs/ceph/mds_client.c index 187bf214444d..b7bda5d9611d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -414,6 +414,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; + if (mds >= mdsc->mdsmap->m_max_mds) + return ERR_PTR(-EINVAL); + s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) return ERR_PTR(-ENOMEM); @@ -1028,6 +1031,37 @@ static void remove_session_caps(struct ceph_mds_session *session) { dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, NULL); + + spin_lock(&session->s_cap_lock); + if (session->s_nr_caps > 0) { + struct super_block *sb = session->s_mdsc->fsc->sb; + struct inode *inode; + struct ceph_cap *cap, *prev = NULL; + struct ceph_vino vino; + /* + * iterate_session_caps() skips inodes that are being + * deleted, we need to wait until deletions are complete. + * __wait_on_freeing_inode() is designed for the job, + * but it is not exported, so use lookup inode function + * to access it. + */ + while (!list_empty(&session->s_caps)) { + cap = list_entry(session->s_caps.next, + struct ceph_cap, session_caps); + if (cap == prev) + break; + prev = cap; + vino = cap->ci->i_vino; + spin_unlock(&session->s_cap_lock); + + inode = ceph_find_inode(sb, vino); + iput(inode); + + spin_lock(&session->s_cap_lock); + } + } + spin_unlock(&session->s_cap_lock); + BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); cleanup_cap_releases(session); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6627b26a800c..6a0951e43044 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -17,6 +17,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/decode.h> @@ -142,6 +143,8 @@ enum { Opt_nodcache, Opt_ino32, Opt_noino32, + Opt_fscache, + Opt_nofscache }; static match_table_t fsopt_tokens = { @@ -167,6 +170,8 @@ static match_table_t fsopt_tokens = { {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, {Opt_noino32, "noino32"}, + {Opt_fscache, "fsc"}, + {Opt_nofscache, "nofsc"}, {-1, NULL} }; @@ -260,6 +265,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noino32: fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; break; + case Opt_fscache: + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + break; + case Opt_nofscache: + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + break; default: BUG_ON(token); } @@ -422,6 +433,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",dcache"); else seq_puts(m, ",nodcache"); + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) + seq_puts(m, ",fsc"); + else + seq_puts(m, ",nofsc"); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); @@ -530,11 +545,18 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc->wb_pagevec_pool) goto fail_trunc_wq; + /* setup fscache */ + if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) && + (ceph_fscache_register_fs(fsc) != 0)) + goto fail_fscache; + /* caps */ fsc->min_caps = fsopt->max_readdir; return fsc; +fail_fscache: + ceph_fscache_unregister_fs(fsc); fail_trunc_wq: destroy_workqueue(fsc->trunc_wq); fail_pg_inv_wq: @@ -554,6 +576,8 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + ceph_fscache_unregister_fs(fsc); + destroy_workqueue(fsc->wb_wq); destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->trunc_wq); @@ -588,6 +612,8 @@ static void ceph_inode_init_once(void *foo) static int __init 
init_caches(void) { + int error = -ENOMEM; + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), @@ -611,15 +637,17 @@ static int __init init_caches(void) if (ceph_file_cachep == NULL) goto bad_file; - return 0; + if ((error = ceph_fscache_register())) + goto bad_file; + return 0; bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); - return -ENOMEM; + return error; } static void destroy_caches(void) @@ -629,10 +657,13 @@ static void destroy_caches(void) * destroy cache. */ rcu_barrier(); + kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); + + ceph_fscache_unregister(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cbded572345e..6014b0a3c405 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -16,6 +16,10 @@ #include <linux/ceph/libceph.h> +#ifdef CONFIG_CEPH_FSCACHE +#include <linux/fscache.h> +#endif + /* f_type in struct statfs */ #define CEPH_SUPER_MAGIC 0x00c36400 @@ -29,6 +33,7 @@ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ +#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) @@ -90,6 +95,11 @@ struct ceph_fs_client { struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; #endif + +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + struct workqueue_struct *revalidate_wq; +#endif }; @@ -288,6 +298,7 @@ struct ceph_inode_info { int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ @@ -319,6 +330,12 @@ struct ceph_inode_info { struct work_struct i_vmtruncate_work; +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + u32 i_fscache_gen; /* sequence, for delayed fscache validate */ + struct work_struct i_revalidate_work; +#endif + struct inode vfs_inode; /* at end */ }; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 45e57cc38200..fc6f4f3a1a9d 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -43,17 +43,18 @@ cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server) server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(server->secmech.md5)) { cifs_dbg(VFS, "could not allocate crypto md5\n"); - return PTR_ERR(server->secmech.md5); + rc = PTR_ERR(server->secmech.md5); + server->secmech.md5 = NULL; + return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(server->secmech.md5); server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); if (!server->secmech.sdescmd5) { - rc = -ENOMEM; crypto_free_shash(server->secmech.md5); server->secmech.md5 = NULL; - return rc; + return -ENOMEM; } server->secmech.sdescmd5->shash.tfm = server->secmech.md5; server->secmech.sdescmd5->shash.flags = 0x0; @@ -421,7 +422,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) if (blobptr + attrsize > blobend) break; if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { - if (!attrsize) + if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN) break; if (!ses->domainName) { ses->domainName = @@ -591,6 +592,7 @@ 
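[Editor's note: each shash-allocation failure path in these cifs hunks now saves PTR_ERR() and resets the cached tfm pointer to NULL, so later teardown code that tests the pointer can no longer call crypto_free_shash() on an ERR_PTR value. A minimal sketch of the pattern, with hypothetical names:]

#include <linux/err.h>
#include <crypto/hash.h>

/* hypothetical miniature of the secmech bookkeeping */
struct my_secmech {
	struct crypto_shash *md5;	/* NULL whenever not allocated */
};

static int my_secmech_alloc_md5(struct my_secmech *sec)
{
	int rc;

	sec->md5 = crypto_alloc_shash("md5", 0, 0);
	if (IS_ERR(sec->md5)) {
		rc = PTR_ERR(sec->md5);
		sec->md5 = NULL;	/* keep the field safe to free later */
		return rc;
	}
	return 0;
}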
CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server) { + int rc; unsigned int size; /* check if already allocated */ @@ -600,7 +602,9 @@ static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server) server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); if (IS_ERR(server->secmech.hmacmd5)) { cifs_dbg(VFS, "could not allocate crypto hmacmd5\n"); - return PTR_ERR(server->secmech.hmacmd5); + rc = PTR_ERR(server->secmech.hmacmd5); + server->secmech.hmacmd5 = NULL; + return rc; } size = sizeof(struct shash_desc) + diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 4bdd547dbf6f..85ea98d139fc 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -147,18 +147,17 @@ cifs_read_super(struct super_block *sb) goto out_no_root; } + if (cifs_sb_master_tcon(cifs_sb)->nocase) + sb->s_d_op = &cifs_ci_dentry_ops; + else + sb->s_d_op = &cifs_dentry_ops; + sb->s_root = d_make_root(inode); if (!sb->s_root) { rc = -ENOMEM; goto out_no_root; } - /* do that *after* d_make_root() - we want NULL ->d_op for root here */ - if (cifs_sb_master_tcon(cifs_sb)->nocase) - sb->s_d_op = &cifs_ci_dentry_ops; - else - sb->s_d_op = &cifs_dentry_ops; - #ifdef CONFIG_CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cifs_dbg(FYI, "export ops supported\n"); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1fdc37041057..52ca861ed35e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -44,6 +44,7 @@ #define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) #define MAX_SERVER_SIZE 15 #define MAX_SHARE_SIZE 80 +#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */ #define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */ #define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ @@ -369,6 +370,9 @@ struct smb_version_operations { void (*generate_signingkey)(struct TCP_Server_Info *server); int (*calc_signature)(struct smb_rqst *rqst, struct TCP_Server_Info *server); + int (*query_mf_symlink)(const unsigned char *path, char *pbuf, + unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, + unsigned int xid); }; struct smb_version_values { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f7e584d047e2..b29a012bed33 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -497,5 +497,7 @@ void cifs_writev_complete(struct work_struct *work); struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete); void cifs_writedata_release(struct kref *refcount); - +int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, + unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, + unsigned int xid); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index fa68813396b5..d67c550c4980 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1675,7 +1675,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (strnlen(string, 256) == 256) { + if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN) + == CIFS_MAX_DOMAINNAME_LEN) { printk(KERN_WARNING "CIFS: domain name too" " long\n"); goto cifs_parse_mount_err; @@ -2276,8 +2277,8 @@ cifs_put_smb_ses(struct cifs_ses *ses) #ifdef CONFIG_KEYS -/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */ -#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1) +/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */ +#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1) /* Populate 
username and pw fields from keyring if possible */ static int diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1e57f36ea1b2..9d0dd952ad79 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -647,6 +647,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) oflags, &oplock, &cfile->fid.netfid, xid); if (rc == 0) { cifs_dbg(FYI, "posix reopen succeeded\n"); + oparms.reconnect = true; goto reopen_success; } /* @@ -2552,7 +2553,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, mutex_unlock(&inode->i_mutex); } - if (rc > 0 || rc == -EIOCBQUEUED) { + if (rc > 0) { ssize_t err; err = generic_write_sync(file, pos, rc); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index b83c3f5646bd..562044f700e5 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -305,67 +305,89 @@ CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr) } int -CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid) +open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, + unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, + unsigned int xid) { int rc; int oplock = 0; __u16 netfid = 0; struct tcon_link *tlink; - struct cifs_tcon *pTcon; + struct cifs_tcon *ptcon; struct cifs_io_parms io_parms; - u8 *buf; - char *pbuf; - unsigned int bytes_read = 0; int buf_type = CIFS_NO_BUFFER; - unsigned int link_len = 0; FILE_ALL_INFO file_info; - if (!CIFSCouldBeMFSymlink(fattr)) - /* it's not a symlink */ - return 0; - tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); + ptcon = tlink_tcon(tlink); - rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ, + rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ, CREATE_NOT_DIR, &netfid, &oplock, &file_info, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) - goto out; + if (rc != 0) { + cifs_put_tlink(tlink); + return rc; + } if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { - CIFSSMBClose(xid, pTcon, netfid); + CIFSSMBClose(xid, ptcon, netfid); + cifs_put_tlink(tlink); /* it's not a symlink */ - goto out; + return rc; } - buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } - pbuf = buf; io_parms.netfid = netfid; io_parms.pid = current->tgid; - io_parms.tcon = pTcon; + io_parms.tcon = ptcon; io_parms.offset = 0; io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); - CIFSSMBClose(xid, pTcon, netfid); - if (rc != 0) { - kfree(buf); + rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type); + CIFSSMBClose(xid, ptcon, netfid); + cifs_put_tlink(tlink); + return rc; +} + + +int +CIFSCheckMFSymlink(struct cifs_fattr *fattr, + const unsigned char *path, + struct cifs_sb_info *cifs_sb, unsigned int xid) +{ + int rc = 0; + u8 *buf = NULL; + unsigned int link_len = 0; + unsigned int bytes_read = 0; + struct cifs_tcon *ptcon; + + if (!CIFSCouldBeMFSymlink(fattr)) + /* it's not a symlink */ + return 0; + + buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; goto out; } + ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb)); + if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink)) + rc = ptcon->ses->server->ops->query_mf_symlink(path, buf, + &bytes_read, cifs_sb, xid); + else + goto out; + + if (rc != 0) + goto out; + + if (bytes_read == 0) /* not a symlink */ + goto out; + rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL); - 
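[Editor's note: the rework above routes MF-symlink detection through the new query_mf_symlink operation instead of hard-coding SMB1 calls. A condensed sketch of the dispatch, with a hypothetical name, reusing the patch's helpers but adding the IS_ERR() check and the cifs_put_tlink() that a caller of cifs_sb_tlink() would normally pair with it:]

static int example_query_mf_symlink(const unsigned char *path, char *buf,
				    unsigned int *bytes_read,
				    struct cifs_sb_info *cifs_sb,
				    unsigned int xid)
{
	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
	struct cifs_tcon *tcon;
	int rc = -EOPNOTSUPP;

	if (IS_ERR(tlink))
		return PTR_ERR(tlink);
	tcon = tlink_tcon(tlink);
	if (tcon->ses && tcon->ses->server->ops->query_mf_symlink)
		rc = tcon->ses->server->ops->query_mf_symlink(path, buf,
						bytes_read, cifs_sb, xid);
	cifs_put_tlink(tlink);
	return rc;
}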
kfree(buf); if (rc == -EINVAL) { /* it's not a symlink */ rc = 0; @@ -381,7 +403,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr, fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; fattr->cf_dtype = DT_LNK; out: - cifs_put_tlink(tlink); + kfree(buf); return rc; } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index ab8778469394..69d2c826a23b 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -111,6 +111,14 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, return; } + /* + * If we know that the inode will need to be revalidated immediately, + * then don't create a new dentry for it. We'll end up doing an on + * the wire call either way and this spares us an invalidation. + */ + if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) + return; + dentry = d_alloc(parent, name); if (!dentry) return; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 79358e341fd2..08dd37bb23aa 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -197,7 +197,7 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, bytes_ret = 0; } else bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName, - 256, nls_cp); + CIFS_MAX_DOMAINNAME_LEN, nls_cp); bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* account for null terminator */ @@ -255,8 +255,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* copy domain */ if (ses->domainName != NULL) { - strncpy(bcc_ptr, ses->domainName, 256); - bcc_ptr += strnlen(ses->domainName, 256); + strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN); } /* else we will send a null domain name so the server will default to its own domain */ *bcc_ptr = 0; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 6457690731a2..60943978aec3 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -944,6 +944,7 @@ struct smb_version_operations smb1_operations = { .mand_lock = cifs_mand_lock, .mand_unlock_range = cifs_unlock_range, .push_mand_locks = cifs_push_mandatory_locks, + .query_mf_symlink = open_query_close_cifs_symlink, }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 301b191270b9..4f2300d020c7 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -42,6 +42,7 @@ static int smb2_crypto_shash_allocate(struct TCP_Server_Info *server) { + int rc; unsigned int size; if (server->secmech.sdeschmacsha256 != NULL) @@ -50,7 +51,9 @@ smb2_crypto_shash_allocate(struct TCP_Server_Info *server) server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0); if (IS_ERR(server->secmech.hmacsha256)) { cifs_dbg(VFS, "could not allocate crypto hmacsha256\n"); - return PTR_ERR(server->secmech.hmacsha256); + rc = PTR_ERR(server->secmech.hmacsha256); + server->secmech.hmacsha256 = NULL; + return rc; } size = sizeof(struct shash_desc) + @@ -87,7 +90,9 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) server->secmech.sdeschmacsha256 = NULL; crypto_free_shash(server->secmech.hmacsha256); server->secmech.hmacsha256 = NULL; - return PTR_ERR(server->secmech.cmacaes); + rc = PTR_ERR(server->secmech.cmacaes); + server->secmech.cmacaes = NULL; + return rc; } size = sizeof(struct shash_desc) + diff --git a/fs/dcache.c b/fs/dcache.c index 87bdb5329c3c..4d9df3c940e6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -88,6 +88,35 @@ EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; +/** + * read_seqbegin_or_lock - begin a sequence number check or locking block 
+ * lock: sequence lock + * seq : sequence number to be checked + * + * First try it once optimistically without taking the lock. If that fails, + * take the lock. The sequence number is also used as a marker for deciding + * whether to be a reader (even) or writer (odd). + * N.B. seq must be initialized to an even number to begin with. + */ +static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) +{ + if (!(*seq & 1)) /* Even */ + *seq = read_seqbegin(lock); + else /* Odd */ + write_seqlock(lock); +} + +static inline int need_seqretry(seqlock_t *lock, int seq) +{ + return !(seq & 1) && read_seqretry(lock, seq); +} + +static inline void done_seqretry(seqlock_t *lock, int seq) +{ + if (seq & 1) + write_sequnlock(lock); +} + /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try @@ -229,7 +258,7 @@ static void __d_free(struct rcu_head *head) */ static void d_free(struct dentry *dentry) { - BUG_ON(dentry->d_count); + BUG_ON((int)dentry->d_lockref.count > 0); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -308,8 +337,9 @@ static void dentry_unlink_inode(struct dentry * dentry) */ static void dentry_lru_add(struct dentry *dentry) { - if (list_empty(&dentry->d_lru)) { + if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) { spin_lock(&dcache_lru_lock); + dentry->d_flags |= DCACHE_LRU_LIST; list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); dentry->d_sb->s_nr_dentry_unused++; dentry_stat.nr_unused++; @@ -320,7 +350,7 @@ static void dentry_lru_add(struct dentry *dentry) static void __dentry_lru_del(struct dentry *dentry) { list_del_init(&dentry->d_lru); - dentry->d_flags &= ~DCACHE_SHRINK_LIST; + dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); dentry->d_sb->s_nr_dentry_unused--; dentry_stat.nr_unused--; } @@ -341,6 +371,7 @@ static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) { spin_lock(&dcache_lru_lock); if (list_empty(&dentry->d_lru)) { + dentry->d_flags |= DCACHE_LRU_LIST; list_add_tail(&dentry->d_lru, list); dentry->d_sb->s_nr_dentry_unused++; dentry_stat.nr_unused++; @@ -443,7 +474,7 @@ EXPORT_SYMBOL(d_drop); * If ref is non-zero, then decrement the refcount too. * Returns dentry requiring refcount drop, or NULL if we're done. */ -static inline struct dentry *dentry_kill(struct dentry *dentry, int ref) +static inline struct dentry *dentry_kill(struct dentry *dentry) __releases(dentry->d_lock) { struct inode *inode; @@ -466,13 +497,16 @@ relock: goto relock; } - if (ref) - dentry->d_count--; + /* + * The dentry is now unrecoverably dead to the world. + */ + lockref_mark_dead(&dentry->d_lockref); + /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ - if (dentry->d_flags & DCACHE_OP_PRUNE) + if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); dentry_lru_del(dentry); @@ -509,38 +543,31 @@ relock: */ void dput(struct dentry *dentry) { - if (!dentry) + if (unlikely(!dentry)) return; repeat: - if (dentry->d_count == 1) - might_sleep(); - spin_lock(&dentry->d_lock); - BUG_ON(!dentry->d_count); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } - if (dentry->d_flags & DCACHE_OP_DELETE) { + /* Unreachable? 
Get rid of it */ + if (unlikely(d_unhashed(dentry))) + goto kill_it; + + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { if (dentry->d_op->d_delete(dentry)) goto kill_it; } - /* Unreachable? Get rid of it */ - if (d_unhashed(dentry)) - goto kill_it; - dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); - dentry->d_count--; + dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return; kill_it: - dentry = dentry_kill(dentry, 1); + dentry = dentry_kill(dentry); if (dentry) goto repeat; } @@ -590,7 +617,7 @@ int d_invalidate(struct dentry * dentry) * We also need to leave mountpoints alone, * directory or not. */ - if (dentry->d_count > 1 && dentry->d_inode) { + if (dentry->d_lockref.count > 1 && dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; @@ -606,20 +633,33 @@ EXPORT_SYMBOL(d_invalidate); /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) { - dentry->d_count++; + dentry->d_lockref.count++; } static inline void __dget(struct dentry *dentry) { - spin_lock(&dentry->d_lock); - __dget_dlock(dentry); - spin_unlock(&dentry->d_lock); + lockref_get(&dentry->d_lockref); } struct dentry *dget_parent(struct dentry *dentry) { + int gotref; struct dentry *ret; + /* + * Do optimistic parent lookup without any + * locking. + */ + rcu_read_lock(); + ret = ACCESS_ONCE(dentry->d_parent); + gotref = lockref_get_not_zero(&ret->d_lockref); + rcu_read_unlock(); + if (likely(gotref)) { + if (likely(ret == ACCESS_ONCE(dentry->d_parent))) + return ret; + dput(ret); + } + repeat: /* * Don't need rcu_dereference because we re-check it was correct under @@ -634,8 +674,8 @@ repeat: goto repeat; } rcu_read_unlock(); - BUG_ON(!ret->d_count); - ret->d_count++; + BUG_ON(!ret->d_lockref.count); + ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } @@ -718,7 +758,15 @@ restart: spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); - if (!dentry->d_count) { + if (!dentry->d_lockref.count) { + /* + * inform the fs via d_prune that this dentry + * is about to be unhashed and destroyed. + */ + if ((dentry->d_flags & DCACHE_OP_PRUNE) && + !d_unhashed(dentry)) + dentry->d_op->d_prune(dentry); + __dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -744,7 +792,7 @@ static void try_prune_one_dentry(struct dentry *dentry) { struct dentry *parent; - parent = dentry_kill(dentry, 0); + parent = dentry_kill(dentry); /* * If dentry_kill returns NULL, we have nothing more to do. * if it returns the same dentry, trylocks failed. In either @@ -763,13 +811,9 @@ static void try_prune_one_dentry(struct dentry *dentry) /* Prune ancestors. */ dentry = parent; while (dentry) { - spin_lock(&dentry->d_lock); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } - dentry = dentry_kill(dentry, 1); + dentry = dentry_kill(dentry); } } @@ -793,7 +837,7 @@ static void shrink_dentry_list(struct list_head *list) * the LRU because of laziness during lookup. Do not free * it - just keep it off the LRU list. */ - if (dentry->d_count) { + if (dentry->d_lockref.count) { dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; @@ -907,13 +951,14 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) * inform the fs that this dentry is about to be * unhashed and destroyed. 
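 * (Editor's note: after this series ->d_prune() is only invoked for
 * dentries that are still hashed; the new !d_unhashed() guards here,
 * in d_prune_aliases() and in dentry_kill() presumably keep a
 * filesystem from being notified twice about the same dentry.)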
*/ - if (dentry->d_flags & DCACHE_OP_PRUNE) + if ((dentry->d_flags & DCACHE_OP_PRUNE) && + !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); dentry_lru_del(dentry); __d_shrink(dentry); - if (dentry->d_count != 0) { + if (dentry->d_lockref.count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" " still in use (%d)" @@ -922,7 +967,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry->d_name.name, - dentry->d_count, + dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); BUG(); @@ -933,7 +978,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) list_del(&dentry->d_u.d_child); } else { parent = dentry->d_parent; - parent->d_count--; + parent->d_lockref.count--; list_del(&dentry->d_u.d_child); } @@ -981,7 +1026,7 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; - dentry->d_count--; + dentry->d_lockref.count--; shrink_dcache_for_umount_subtree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { @@ -996,7 +1041,7 @@ void shrink_dcache_for_umount(struct super_block *sb) * the parenthood after dropping the lock and check * that the sequence number still matches. */ -static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +static struct dentry *try_to_ascend(struct dentry *old, unsigned seq) { struct dentry *new = old->d_parent; @@ -1010,7 +1055,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq */ if (new != old->d_parent || (old->d_flags & DCACHE_DENTRY_KILLED) || - (!locked && read_seqretry(&rename_lock, seq))) { + need_seqretry(&rename_lock, seq)) { spin_unlock(&new->d_lock); new = NULL; } @@ -1018,34 +1063,55 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq return new; } +/** + * enum d_walk_ret - action to take during tree walk + * @D_WALK_CONTINUE: continue walk + * @D_WALK_QUIT: quit walk + * @D_WALK_NORETRY: quit when retry is needed + * @D_WALK_SKIP: skip this dentry and its children + */ +enum d_walk_ret { + D_WALK_CONTINUE, + D_WALK_QUIT, + D_WALK_NORETRY, + D_WALK_SKIP, +}; -/* - * Search for at least 1 mount point in the dentry's subdirs. - * We descend to the next level whenever the d_subdirs - * list is non-empty and continue searching. - */ - /** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. + * d_walk - walk the dentry tree + * @parent: start of walk + * @data: data passed to @enter() and @finish() + * @enter: callback when first entering the dentry + * @finish: callback when successfully finished the walk * - * Return true if the parent or its subdirectories contain - * a mount point + * The @enter() and @finish() callbacks are called with d_lock held.
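 *
 * (Editor's note: a minimal illustrative walker with hypothetical names;
 * compare the check_mount()/have_submounts() pair further down:)
 *
 *	static enum d_walk_ret count_one(void *data, struct dentry *dentry)
 *	{
 *		(*(long *)data)++;
 *		return D_WALK_CONTINUE;
 *	}
 *
 *	long count = 0;
 *	d_walk(parent, &count, count_one, NULL);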
*/ -int have_submounts(struct dentry *parent) +static void d_walk(struct dentry *parent, void *data, + enum d_walk_ret (*enter)(void *, struct dentry *), + void (*finish)(void *)) { struct dentry *this_parent; struct list_head *next; - unsigned seq; - int locked = 0; + unsigned seq = 0; + enum d_walk_ret ret; + bool retry = true; - seq = read_seqbegin(&rename_lock); again: + read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; - - if (d_mountpoint(parent)) - goto positive; spin_lock(&this_parent->d_lock); + + ret = enter(data, this_parent); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: + case D_WALK_SKIP: + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + } repeat: next = this_parent->d_subdirs.next; resume: @@ -1055,12 +1121,22 @@ resume: next = tmp->next; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - /* Have we found a mount point ? */ - if (d_mountpoint(dentry)) { + + ret = enter(data, dentry); + switch (ret) { + case D_WALK_CONTINUE: + break; + case D_WALK_QUIT: spin_unlock(&dentry->d_lock); - spin_unlock(&this_parent->d_lock); - goto positive; + goto out_unlock; + case D_WALK_NORETRY: + retry = false; + break; + case D_WALK_SKIP: + spin_unlock(&dentry->d_lock); + continue; } + if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); @@ -1075,35 +1151,99 @@ resume: */ if (this_parent != parent) { struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, locked, seq); + this_parent = try_to_ascend(this_parent, seq); if (!this_parent) goto rename_retry; next = child->d_u.d_child.next; goto resume; } - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) + if (need_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return 0; /* No mount points found in tree */ -positive: - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return 1; + } + if (finish) + finish(data); + +out_unlock: + spin_unlock(&this_parent->d_lock); + done_seqretry(&rename_lock, seq); + return; rename_retry: - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); + if (!retry) + return; + seq = 1; goto again; } + +/* + * Search for at least 1 mount point in the dentry's subdirs. + * We descend to the next level whenever the d_subdirs + * list is non-empty and continue searching. + */ + +/** + * have_submounts - check for mounts over a dentry + * @parent: dentry to check. + * + * Return true if the parent or its subdirectories contain + * a mount point + */ + +static enum d_walk_ret check_mount(void *data, struct dentry *dentry) +{ + int *ret = data; + if (d_mountpoint(dentry)) { + *ret = 1; + return D_WALK_QUIT; + } + return D_WALK_CONTINUE; +} + +int have_submounts(struct dentry *parent) +{ + int ret = 0; + + d_walk(parent, &ret, check_mount, NULL); + + return ret; +} EXPORT_SYMBOL(have_submounts); /* + * Called by mount code to set a mountpoint and check if the mountpoint is + * reachable (e.g. NFS can unhash a directory dentry and then the complete + * subtree can become unreachable). + * + * Only one of check_submounts_and_drop() and d_set_mounted() must succeed. For + * this reason take rename_lock and d_lock on dentry and ancestors. 
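 * (Editor's note: the ancestor scan below bails out with -ENOENT as soon
 * as it sees an unhashed ancestor, so a mountpoint cannot be established
 * under a subtree that check_submounts_and_drop() is already dropping.)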
+ */ +int d_set_mounted(struct dentry *dentry) +{ + struct dentry *p; + int ret = -ENOENT; + write_seqlock(&rename_lock); + for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { + /* Need exclusion wrt. check_submounts_and_drop() */ + spin_lock(&p->d_lock); + if (unlikely(d_unhashed(p))) { + spin_unlock(&p->d_lock); + goto out; + } + spin_unlock(&p->d_lock); + } + spin_lock(&dentry->d_lock); + if (!d_unlinked(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } + spin_unlock(&dentry->d_lock); +out: + write_sequnlock(&rename_lock); + return ret; +} + +/* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level @@ -1117,93 +1257,46 @@ EXPORT_SYMBOL(have_submounts); * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry *parent, struct list_head *dispose) -{ - struct dentry *this_parent; - struct list_head *next; - unsigned seq; - int found = 0; - int locked = 0; - - seq = read_seqbegin(&rename_lock); -again: - this_parent = parent; - spin_lock(&this_parent->d_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); - next = tmp->next; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); +struct select_data { + struct dentry *start; + struct list_head dispose; + int found; +}; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - dentry_lru_move_list(dentry, dispose); - dentry->d_flags |= DCACHE_SHRINK_LIST; - found++; - } - /* - * We can return to the caller if we have found some (this - * ensures forward progress). We'll be coming back to find - * the rest. - */ - if (found && need_resched()) { - spin_unlock(&dentry->d_lock); - goto out; - } +static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) +{ + struct select_data *data = _data; + enum d_walk_ret ret = D_WALK_CONTINUE; - /* - * Descend a level if the d_subdirs list is non-empty. - */ - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } + if (data->start == dentry) + goto out; - spin_unlock(&dentry->d_lock); - } /* - * All done at this level ... ascend and resume the search. + * move only zero ref count dentries to the dispose list. + * + * Those which are presently on the shrink list, being processed + * by shrink_dentry_list(), shouldn't be moved. Otherwise the + * loop in shrink_dcache_parent() might not make any progress + * and loop forever. 
*/ - if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, locked, seq); - if (!this_parent) - goto rename_retry; - next = child->d_u.d_child.next; - goto resume; + if (dentry->d_lockref.count) { + dentry_lru_del(dentry); + } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { + dentry_lru_move_list(dentry, &data->dispose); + dentry->d_flags |= DCACHE_SHRINK_LIST; + data->found++; + ret = D_WALK_NORETRY; } + /* + * We can return to the caller if we have found some (this + * ensures forward progress). We'll be coming back to find + * the rest. + */ + if (data->found && need_resched()) + ret = D_WALK_QUIT; out: - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return found; - -rename_retry: - if (found) - return found; - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); - goto again; + return ret; } /** @@ -1212,18 +1305,90 @@ rename_retry: * * Prune the dcache to remove unused children of the parent dentry. */ -void shrink_dcache_parent(struct dentry * parent) +void shrink_dcache_parent(struct dentry *parent) { - LIST_HEAD(dispose); - int found; + for (;;) { + struct select_data data; + + INIT_LIST_HEAD(&data.dispose); + data.start = parent; + data.found = 0; + + d_walk(parent, &data, select_collect, NULL); + if (!data.found) + break; - while ((found = select_parent(parent, &dispose)) != 0) { - shrink_dentry_list(&dispose); + shrink_dentry_list(&data.dispose); cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); +static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry) +{ + struct select_data *data = _data; + + if (d_mountpoint(dentry)) { + data->found = -EBUSY; + return D_WALK_QUIT; + } + + return select_collect(_data, dentry); +} + +static void check_and_drop(void *_data) +{ + struct select_data *data = _data; + + if (d_mountpoint(data->start)) + data->found = -EBUSY; + if (!data->found) + __d_drop(data->start); +} + +/** + * check_submounts_and_drop - prune dcache, check for submounts and drop + * + * All done as a single atomic operation relative to has_unlinked_ancestor(). + * Returns 0 if successfully unhashed @parent. If there were submounts then + * return -EBUSY. + * + * @dentry: dentry to prune and drop + */ +int check_submounts_and_drop(struct dentry *dentry) +{ + int ret = 0; + + /* Negative dentries can be dropped without further checks */ + if (!dentry->d_inode) { + d_drop(dentry); + goto out; + } + + for (;;) { + struct select_data data; + + INIT_LIST_HEAD(&data.dispose); + data.start = dentry; + data.found = 0; + + d_walk(dentry, &data, check_and_collect, check_and_drop); + ret = data.found; + + if (!list_empty(&data.dispose)) + shrink_dentry_list(&data.dispose); + + if (ret <= 0) + break; + + cond_resched(); + } + +out: + return ret; +} +EXPORT_SYMBOL(check_submounts_and_drop); + /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to @@ -1269,7 +1434,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) smp_wmb(); dentry->d_name.name = dname; - dentry->d_count = 1; + dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_init(&dentry->d_seq); @@ -1782,7 +1947,7 @@ static noinline enum slow_d_compare slow_dentry_cmp( * without taking d_lock and checking d_seq sequence count against @seq * returned here. 
* - * A refcount may be taken on the found dentry with the __d_rcu_to_refcount + * A refcount may be taken on the found dentry with the d_rcu_to_refcount * function. * * Alternatively, __d_lookup_rcu may be called again to look up the child of @@ -1970,7 +2135,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) goto next; } - dentry->d_count++; + dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; @@ -2069,7 +2234,7 @@ again: spin_lock(&dentry->d_lock); inode = dentry->d_inode; isdir = S_ISDIR(inode->i_mode); - if (dentry->d_count == 1) { + if (dentry->d_lockref.count == 1) { if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); cpu_relax(); @@ -2506,9 +2671,39 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) return 0; } +/** + * prepend_name - prepend a pathname in front of current buffer pointer + * @buffer: buffer pointer + * @buflen: allocated length of the buffer + * @name: name string and length qstr structure + * + * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to + * make sure that either the old or the new name pointer and length are + * fetched. However, there may be a mismatch between length and pointer. + * The length cannot be trusted; we need to copy it byte-by-byte until + * the length is reached or a null byte is found. It also prepends "/" at + * the beginning of the name. The sequence number check at the caller will + * retry it again when a d_move() does happen. So any garbage in the buffer + * due to a mismatched pointer and length will be discarded. + */ static int prepend_name(char **buffer, int *buflen, struct qstr *name) { - return prepend(buffer, buflen, name->name, name->len); + const char *dname = ACCESS_ONCE(name->name); + u32 dlen = ACCESS_ONCE(name->len); + char *p; + + if (*buflen < dlen + 1) + return -ENAMETOOLONG; + *buflen -= dlen + 1; + p = *buffer -= dlen + 1; + *p++ = '/'; + while (dlen--) { + char c = *dname++; + if (!c) + break; + *p++ = c; + } + return 0; } /** @@ -2518,7 +2713,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * - * Caller holds the rename_lock. + * The function tries to write out the pathname without taking any lock other + * than the RCU read lock to make sure that dentries won't go away. It only + * checks the sequence number of the global rename_lock as any change in the + * dentry's d_seq will be preceded by changes in the rename_lock sequence + * number. If the sequence number has changed, it will restart the whole + * pathname back-tracing sequence again. It performs a total of 3 trials of + * lockless back-tracing sequences before falling back to take the + * rename_lock. */ static int prepend_path(const struct path *path, const struct path *root, @@ -2527,54 +2729,66 @@ static int prepend_path(const struct path *path, struct dentry *dentry = path->dentry; struct vfsmount *vfsmnt = path->mnt; struct mount *mnt = real_mount(vfsmnt); - bool slash = false; int error = 0; + unsigned seq = 0; + char *bptr; + int blen; + rcu_read_lock(); +restart: + bptr = *buffer; + blen = *buflen; + read_seqbegin_or_lock(&rename_lock, &seq); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { /* Global root?
*/ - if (!mnt_has_parent(mnt)) - goto global_root; - dentry = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - vfsmnt = &mnt->mnt; - continue; + if (mnt_has_parent(mnt)) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + vfsmnt = &mnt->mnt; + continue; + } + /* + * Filesystems needing to implement special "root names" + * should do so with ->d_dname() + */ + if (IS_ROOT(dentry) && + (dentry->d_name.len != 1 || + dentry->d_name.name[0] != '/')) { + WARN(1, "Root dentry has weird name <%.*s>\n", + (int) dentry->d_name.len, + dentry->d_name.name); + } + if (!error) + error = is_mounted(vfsmnt) ? 1 : 2; + break; } parent = dentry->d_parent; prefetch(parent); - spin_lock(&dentry->d_lock); - error = prepend_name(buffer, buflen, &dentry->d_name); - spin_unlock(&dentry->d_lock); - if (!error) - error = prepend(buffer, buflen, "/", 1); + error = prepend_name(&bptr, &blen, &dentry->d_name); if (error) break; - slash = true; dentry = parent; } + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; + goto restart; + } + done_seqretry(&rename_lock, seq); - if (!error && !slash) - error = prepend(buffer, buflen, "/", 1); - - return error; - -global_root: - /* - * Filesystems needing to implement special "root names" - * should do so with ->d_dname() - */ - if (IS_ROOT(dentry) && - (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) { - WARN(1, "Root dentry has weird name <%.*s>\n", - (int) dentry->d_name.len, dentry->d_name.name); - } - if (!slash) - error = prepend(buffer, buflen, "/", 1); - if (!error) - error = is_mounted(vfsmnt) ? 1 : 2; + if (error >= 0 && bptr == *buffer) { + if (--blen < 0) + error = -ENAMETOOLONG; + else + *--bptr = '/'; + } + *buffer = bptr; + *buflen = blen; return error; } @@ -2603,9 +2817,7 @@ char *__d_path(const struct path *path, prepend(&res, &buflen, "\0", 1); br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error < 0) @@ -2624,9 +2836,7 @@ char *d_absolute_path(const struct path *path, prepend(&res, &buflen, "\0", 1); br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); error = prepend_path(path, &root, &res, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error > 1) @@ -2692,9 +2902,7 @@ char *d_path(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); error = path_with_deleted(path, &root, &res, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error < 0) res = ERR_PTR(error); @@ -2724,35 +2932,58 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, return memcpy(buffer, temp, sz); } +char *simple_dname(struct dentry *dentry, char *buffer, int buflen) +{ + char *end = buffer + buflen; + /* these dentries are never renamed, so d_lock is not needed */ + if (prepend(&end, &buflen, " (deleted)", 11) || + prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) || + prepend(&end, &buflen, "/", 1)) + end = ERR_PTR(-ENAMETOOLONG); + return end; +} + /* * Write full pathname from the root of the filesystem into the buffer. 
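 * (Editor's note: after this patch the walk below runs under
 * rcu_read_lock() with read_seqbegin_or_lock()/need_seqretry() instead
 * of requiring the caller to hold rename_lock, which is why
 * dentry_path_raw() and dentry_path() further down lose their
 * write_seqlock()/write_sequnlock() pairs.)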
*/ static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) { - char *end = buf + buflen; - char *retval; + char *end, *retval; + int len, seq = 0; + int error = 0; - prepend(&end, &buflen, "\0", 1); + rcu_read_lock(); +restart: + end = buf + buflen; + len = buflen; + prepend(&end, &len, "\0", 1); if (buflen < 1) goto Elong; /* Get '/' right */ retval = end-1; *retval = '/'; - + read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { struct dentry *parent = dentry->d_parent; int error; prefetch(parent); - spin_lock(&dentry->d_lock); - error = prepend_name(&end, &buflen, &dentry->d_name); - spin_unlock(&dentry->d_lock); - if (error != 0 || prepend(&end, &buflen, "/", 1) != 0) - goto Elong; + error = prepend_name(&end, &len, &dentry->d_name); + if (error) + break; retval = end; dentry = parent; } + if (!(seq & 1)) + rcu_read_unlock(); + if (need_seqretry(&rename_lock, seq)) { + seq = 1; + goto restart; + } + done_seqretry(&rename_lock, seq); + if (error) + goto Elong; return retval; Elong: return ERR_PTR(-ENAMETOOLONG); @@ -2760,13 +2991,7 @@ Elong: char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) { - char *retval; - - write_seqlock(&rename_lock); - retval = __dentry_path(dentry, buf, buflen); - write_sequnlock(&rename_lock); - - return retval; + return __dentry_path(dentry, buf, buflen); } EXPORT_SYMBOL(dentry_path_raw); @@ -2775,7 +3000,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) char *p = NULL; char *retval; - write_seqlock(&rename_lock); if (d_unlinked(dentry)) { p = buf + buflen; if (prepend(&p, &buflen, "//deleted", 10) != 0) @@ -2783,7 +3007,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) buflen++; } retval = __dentry_path(dentry, buf, buflen); - write_sequnlock(&rename_lock); if (!IS_ERR(retval) && p) *p = '/'; /* restore '/' overriden with '\0' */ return retval; @@ -2822,7 +3045,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) error = -ENOENT; br_read_lock(&vfsmount_lock); - write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; char *cwd = page + PAGE_SIZE; @@ -2830,7 +3052,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); if (error < 0) @@ -2851,7 +3072,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) error = -EFAULT; } } else { - write_sequnlock(&rename_lock); br_read_unlock(&vfsmount_lock); } @@ -2904,68 +3124,24 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) return result; } -void d_genocide(struct dentry *root) +static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { - struct dentry *this_parent; - struct list_head *next; - unsigned seq; - int locked = 0; - - seq = read_seqbegin(&rename_lock); -again: - this_parent = root; - spin_lock(&this_parent->d_lock); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); - next = tmp->next; + struct dentry *root = data; + if (dentry != root) { + if (d_unhashed(dentry) || !dentry->d_inode) + return D_WALK_SKIP; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (d_unhashed(dentry) || !dentry->d_inode) { - spin_unlock(&dentry->d_lock); - continue; - } - if (!list_empty(&dentry->d_subdirs)) { - 
spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_count--; - } - spin_unlock(&dentry->d_lock); - } - if (this_parent != root) { - struct dentry *child = this_parent; - if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { - this_parent->d_flags |= DCACHE_GENOCIDE; - this_parent->d_count--; + dentry->d_lockref.count--; } - this_parent = try_to_ascend(this_parent, locked, seq); - if (!this_parent) - goto rename_retry; - next = child->d_u.d_child.next; - goto resume; } - spin_unlock(&this_parent->d_lock); - if (!locked && read_seqretry(&rename_lock, seq)) - goto rename_retry; - if (locked) - write_sequnlock(&rename_lock); - return; + return D_WALK_CONTINUE; +} -rename_retry: - if (locked) - goto again; - locked = 1; - write_seqlock(&rename_lock); - goto again; +void d_genocide(struct dentry *parent) +{ + d_walk(parent, parent, d_genocide_kill, NULL); } void d_tmpfile(struct dentry *dentry, struct inode *inode) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 4888cb3fdef7..c7c83ff0f752 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -533,8 +533,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove); */ void debugfs_remove_recursive(struct dentry *dentry) { - struct dentry *child; - struct dentry *parent; + struct dentry *child, *next, *parent; if (IS_ERR_OR_NULL(dentry)) return; @@ -544,61 +543,37 @@ void debugfs_remove_recursive(struct dentry *dentry) return; parent = dentry; + down: mutex_lock(&parent->d_inode->i_mutex); + list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { + if (!debugfs_positive(child)) + continue; - while (1) { - /* - * When all dentries under "parent" has been removed, - * walk up the tree until we reach our starting point. - */ - if (list_empty(&parent->d_subdirs)) { - mutex_unlock(&parent->d_inode->i_mutex); - if (parent == dentry) - break; - parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); - } - child = list_entry(parent->d_subdirs.next, struct dentry, - d_u.d_child); - next_sibling: - - /* - * If "child" isn't empty, walk down the tree and - * remove all its descendants first. - */ + /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { mutex_unlock(&parent->d_inode->i_mutex); parent = child; - mutex_lock(&parent->d_inode->i_mutex); - continue; + goto down; } - __debugfs_remove(child, parent); - if (parent->d_subdirs.next == &child->d_u.d_child) { - /* - * Try the next sibling. - */ - if (child->d_u.d_child.next != &parent->d_subdirs) { - child = list_entry(child->d_u.d_child.next, - struct dentry, - d_u.d_child); - goto next_sibling; - } - - /* - * Avoid infinite loop if we fail to remove - * one dentry. 
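d_genocide() is now a thin wrapper: the open-coded traversal with its ascend/retry labels is replaced by a callback handed to the generic d_walk() walker, which is told per dentry whether to continue or skip a subtree. A toy sketch of that walker-plus-callback split (recursive here for brevity, where the kernel walker is iterative and lock-aware; all names are illustrative):

#include <stdio.h>

enum walk_ret { WALK_CONTINUE, WALK_SKIP };

struct node {
        const char *name;
        struct node *child, *sibling;
};

/* Pre-order walk; the callback decides whether to descend below n. */
static void walk(struct node *n,
                 enum walk_ret (*enter)(void *data, struct node *n),
                 void *data)
{
        while (n) {
                if (enter(data, n) == WALK_CONTINUE)
                        walk(n->child, enter, data);
                n = n->sibling;
        }
}

static enum walk_ret print_node(void *data, struct node *n)
{
        (void)data;
        printf("%s\n", n->name);
        return WALK_CONTINUE;
}

int main(void)
{
        struct node c = { "c", NULL, NULL };
        struct node b = { "b", NULL, &c };
        struct node a = { "a", &b, NULL };

        walk(&a, print_node, NULL);     /* prints a, b, c */
        return 0;
}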
- */ - mutex_unlock(&parent->d_inode->i_mutex); - break; - } - simple_release_fs(&debugfs_mount, &debugfs_mount_count); + up: + if (!__debugfs_remove(child, parent)) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } - parent = dentry->d_parent; + mutex_unlock(&parent->d_inode->i_mutex); + child = parent; + parent = parent->d_parent; mutex_lock(&parent->d_inode->i_mutex); - __debugfs_remove(dentry, parent); + + if (child != dentry) { + next = list_entry(child->d_u.d_child.next, struct dentry, + d_u.d_child); + goto up; + } + + if (!__debugfs_remove(child, parent)) + simple_release_fs(&debugfs_mount, &debugfs_mount_count); mutex_unlock(&parent->d_inode->i_mutex); - simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/direct-io.c b/fs/direct-io.c index 7ab90f5081ee..0e04142d5962 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -127,6 +127,7 @@ struct dio { spinlock_t bio_lock; /* protects BIO fields below */ int page_errors; /* errno from get_user_pages() */ int is_async; /* is IO async ? */ + bool defer_completion; /* defer AIO completion to workqueue? */ int io_error; /* IO error in completion path */ unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ @@ -141,7 +142,10 @@ struct dio { * allocation time. Don't add new fields after pages[] unless you * wish that they not be zeroed. */ - struct page *pages[DIO_PAGES]; /* page buffer */ + union { + struct page *pages[DIO_PAGES]; /* page buffer */ + struct work_struct complete_work;/* deferred AIO completion */ + }; } ____cacheline_aligned_in_smp; static struct kmem_cache *dio_cache __read_mostly; @@ -221,16 +225,16 @@ static inline struct page *dio_get_page(struct dio *dio, * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation * - * This releases locks as dictated by the locking type, lets interested parties - * know that a DIO operation has completed, and calculates the resulting return - * code for the operation. + * This drops i_dio_count, lets interested parties know that a DIO operation + * has completed, and calculates the resulting return code for the operation. * * It lets the filesystem know if it registered an interest earlier via * get_block. Pass the private field of the map buffer_head so that * filesystems can use it to hold additional state between get_block calls and * dio_complete. 
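Note how struct dio gains the deferred-completion work item without growing: complete_work is overlaid on pages[] in a union, which is safe because the page list is dead by the time the completion work runs. A toy illustration of that phase-exclusive overlay (sizes and names are made up):

#include <stdio.h>

/* Two mutually exclusive phases share one allocation: the scratch
 * pages are only used while I/O is in flight, the completion hook
 * only after the last reference is dropped. */
struct toy_dio {
        int refcount;
        union {
                void *pages[64];        /* phase 1: submission */
                struct {
                        void (*fn)(struct toy_dio *);
                } work;                 /* phase 2: completion */
        };
};

int main(void)
{
        printf("pages: %zu bytes, work: %zu bytes, struct: %zu bytes\n",
               sizeof(((struct toy_dio *)0)->pages),
               sizeof(((struct toy_dio *)0)->work),
               sizeof(struct toy_dio));
        return 0;
}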
*/ -static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, + bool is_async) { ssize_t transferred = 0; @@ -258,19 +262,36 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (ret == 0) ret = transferred; - if (dio->end_io && dio->result) { - dio->end_io(dio->iocb, offset, transferred, - dio->private, ret, is_async); - } else { - inode_dio_done(dio->inode); - if (is_async) - aio_complete(dio->iocb, ret, 0); + if (dio->end_io && dio->result) + dio->end_io(dio->iocb, offset, transferred, dio->private); + + inode_dio_done(dio->inode); + if (is_async) { + if (dio->rw & WRITE) { + int err; + + err = generic_write_sync(dio->iocb->ki_filp, offset, + transferred); + if (err < 0 && ret > 0) + ret = err; + } + + aio_complete(dio->iocb, ret, 0); } + kmem_cache_free(dio_cache, dio); return ret; } +static void dio_aio_complete_work(struct work_struct *work) +{ + struct dio *dio = container_of(work, struct dio, complete_work); + + dio_complete(dio, dio->iocb->ki_pos, 0, true); +} + static int dio_bio_complete(struct dio *dio, struct bio *bio); + /* * Asynchronous IO callback. */ @@ -290,8 +311,13 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - dio_complete(dio, dio->iocb->ki_pos, 0, true); - kmem_cache_free(dio_cache, dio); + if (dio->result && dio->defer_completion) { + INIT_WORK(&dio->complete_work, dio_aio_complete_work); + queue_work(dio->inode->i_sb->s_dio_done_wq, + &dio->complete_work); + } else { + dio_complete(dio, dio->iocb->ki_pos, 0, true); + } } } @@ -511,6 +537,42 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) } /* + * Create workqueue for deferred direct IO completions. We allocate the + * workqueue when it's first needed. This avoids creating workqueue for + * filesystems that don't need it and also allows us to create the workqueue + * late enough so the we can include s_id in the name of the workqueue. + */ +static int sb_init_dio_done_wq(struct super_block *sb) +{ + struct workqueue_struct *old; + struct workqueue_struct *wq = alloc_workqueue("dio/%s", + WQ_MEM_RECLAIM, 0, + sb->s_id); + if (!wq) + return -ENOMEM; + /* + * This has to be atomic as more DIOs can race to create the workqueue + */ + old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); + /* Someone created workqueue before us? Free ours... */ + if (old) + destroy_workqueue(wq); + return 0; +} + +static int dio_set_defer_completion(struct dio *dio) +{ + struct super_block *sb = dio->inode->i_sb; + + if (dio->defer_completion) + return 0; + dio->defer_completion = true; + if (!sb->s_dio_done_wq) + return sb_init_dio_done_wq(sb); + return 0; +} + +/* * Call into the fs to map some more disk blocks. We record the current number * of available blocks at sdio->blocks_available. These are in units of the * fs blocksize, (1 << inode->i_blkbits). @@ -581,6 +643,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, /* Store for completion */ dio->private = map_bh->b_private; + + if (ret == 0 && buffer_defer_completion(map_bh)) + ret = dio_set_defer_completion(dio); } return ret; } @@ -1129,11 +1194,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } /* - * Will be decremented at I/O completion time. 
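sb_init_dio_done_wq() above publishes the per-sb workqueue with cmpxchg(), so when several DIOs race to create it, exactly one pointer wins and each loser destroys its own copy. A userspace sketch of the same lazy-initialization pattern, using the GCC/Clang atomic builtins in place of the kernel's cmpxchg() (a single-threaded demo of the shape, not the kernel code):

#include <stdio.h>
#include <stdlib.h>

static void *shared_ptr;        /* lazily initialized, shared object */

static void *get_shared(void)
{
        void *p = __atomic_load_n(&shared_ptr, __ATOMIC_ACQUIRE);
        void *expected = NULL;

        if (p)
                return p;
        p = malloc(128);                        /* our candidate object */
        if (!__atomic_compare_exchange_n(&shared_ptr, &expected, p, 0,
                                         __ATOMIC_ACQ_REL,
                                         __ATOMIC_ACQUIRE)) {
                free(p);                        /* somebody beat us to it */
                p = expected;
        }
        return p;
}

int main(void)
{
        printf("%p\n", get_shared());
        printf("%p\n", get_shared());           /* same pointer */
        return 0;
}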
- */ - atomic_inc(&inode->i_dio_count); - - /* * For file extending writes updating i_size before data * writeouts complete can expose uninitialized blocks. So * even for AIO, we need to wait for i/o to complete before @@ -1141,11 +1201,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); - - retval = 0; - dio->inode = inode; dio->rw = rw; + + /* + * For AIO O_(D)SYNC writes we need to defer completions to a workqueue + * so that we can call ->fsync. + */ + if (dio->is_async && (rw & WRITE) && + ((iocb->ki_filp->f_flags & O_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host))) { + retval = dio_set_defer_completion(dio); + if (retval) { + /* + * We grab i_mutex only for reads so we don't have + * to release it here + */ + kmem_cache_free(dio_cache, dio); + goto out; + } + } + + /* + * Will be decremented at I/O completion time. + */ + atomic_inc(&inode->i_dio_count); + + retval = 0; sdio.blkbits = blkbits; sdio.blkfactor = i_blkbits - blkbits; sdio.block_in_file = offset >> blkbits; @@ -1269,7 +1351,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (drop_refcount(dio) == 0) { retval = dio_complete(dio, offset, retval, false); - kmem_cache_free(dio_cache, dio); } else BUG_ON(retval != -EIOCBQUEUED); diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 27a6ba9aaeec..0e90f0c91b93 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -267,10 +267,7 @@ void dlm_callback_work(struct work_struct *work) int dlm_callback_start(struct dlm_ls *ls) { ls->ls_callback_wq = alloc_workqueue("dlm_callback", - WQ_UNBOUND | - WQ_MEM_RECLAIM | - WQ_NON_REENTRANT, - 0); + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!ls->ls_callback_wq) { log_print("can't start dlm_callback workqueue"); return -ENOMEM; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 911649a47dd5..142e21655eed 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -493,7 +493,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, { struct dlm_user_proc *proc = file->private_data; struct dlm_write_request *kbuf; - sigset_t tmpsig, allsigs; int error; #ifdef CONFIG_COMPAT @@ -557,9 +556,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, goto out_free; } - sigfillset(&allsigs); - sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); - error = -EINVAL; switch (kbuf->cmd) @@ -567,7 +563,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_LOCK: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_lock(proc, &kbuf->i.lock); break; @@ -575,7 +571,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_UNLOCK: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_unlock(proc, &kbuf->i.lock); break; @@ -583,7 +579,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_DEADLOCK: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_deadlock(proc, &kbuf->i.lock); break; @@ -591,7 +587,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_CREATE_LOCKSPACE: if (proc) { log_print("create/remove only on control device"); - goto out_sig; + goto out_free; } error = device_create_lockspace(&kbuf->i.lspace); break; @@ -599,7 +595,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, 
case DLM_USER_REMOVE_LOCKSPACE: if (proc) { log_print("create/remove only on control device"); - goto out_sig; + goto out_free; } error = device_remove_lockspace(&kbuf->i.lspace); break; @@ -607,7 +603,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, case DLM_USER_PURGE: if (!proc) { log_print("no locking on control device"); - goto out_sig; + goto out_free; } error = device_user_purge(proc, &kbuf->i.purge); break; @@ -617,8 +613,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, kbuf->cmd); } - out_sig: - sigprocmask(SIG_SETMASK, &tmpsig, NULL); out_free: kfree(kbuf); return error; @@ -659,15 +653,11 @@ static int device_close(struct inode *inode, struct file *file) { struct dlm_user_proc *proc = file->private_data; struct dlm_ls *ls; - sigset_t tmpsig, allsigs; ls = dlm_find_lockspace_local(proc->lockspace); if (!ls) return -ENOENT; - sigfillset(&allsigs); - sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); - set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags); dlm_clear_proc_locks(ls, proc); @@ -685,9 +675,6 @@ static int device_close(struct inode *inode, struct file *file) /* FIXME: AUTOFREE: if this ls is no longer used do device_remove_lockspace() */ - sigprocmask(SIG_SETMASK, &tmpsig, NULL); - recalc_sigpending(); - return 0; } diff --git a/fs/efs/inode.c b/fs/efs/inode.c index f3913eb2c474..d15ccf20f1b3 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -57,7 +57,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) struct inode *inode; inode = iget_locked(super, ino); - if (IS_ERR(inode)) + if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9ad17b15b454..293f86741ddb 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1792,7 +1792,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, { int error; int did_lock_epmutex = 0; - struct file *file, *tfile; + struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct epoll_event epds; @@ -1802,20 +1802,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, copy_from_user(&epds, event, sizeof(struct epoll_event))) goto error_return; - /* Get the "struct file *" for the eventpoll file */ error = -EBADF; - file = fget(epfd); - if (!file) + f = fdget(epfd); + if (!f.file) goto error_return; /* Get the "struct file *" for the target file */ - tfile = fget(fd); - if (!tfile) + tf = fdget(fd); + if (!tf.file) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; - if (!tfile->f_op || !tfile->f_op->poll) + if (!tf.file->f_op || !tf.file->f_op->poll) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ @@ -1828,14 +1827,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * adding an epoll file descriptor inside itself. */ error = -EINVAL; - if (file == tfile || !is_file_epoll(file)) + if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. 
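The one-line efs change above is worth pausing on: iget_locked() signals allocation failure with NULL, not with an ERR_PTR(), so the old IS_ERR() test could never fire. A userspace sketch of the two error-return conventions, with the kernel's IS_ERR/PTR_ERR helpers re-implemented for illustration:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095
static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
        void *a = NULL;                 /* "NULL on failure" convention */
        void *b = ERR_PTR(-ENOMEM);    /* "encoded errno" convention */

        printf("IS_ERR(NULL) = %d\n", IS_ERR(a));       /* 0: the bug */
        printf("IS_ERR(b)    = %d (err %ld)\n", IS_ERR(b), PTR_ERR(b));
        return 0;
}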
*/ - ep = file->private_data; + ep = f.file->private_data; /* * When we insert an epoll file descriptor, inside another epoll file @@ -1854,14 +1853,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, did_lock_epmutex = 1; } if (op == EPOLL_CTL_ADD) { - if (is_file_epoll(tfile)) { + if (is_file_epoll(tf.file)) { error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) { + if (ep_loop_check(ep, tf.file) != 0) { clear_tfile_check_list(); goto error_tgt_fput; } } else - list_add(&tfile->f_tfile_llink, &tfile_check_list); + list_add(&tf.file->f_tfile_llink, &tfile_check_list); } mutex_lock_nested(&ep->mtx, 0); @@ -1871,14 +1870,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ - epi = ep_find(ep, tfile, fd); + epi = ep_find(ep, tf.file, fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); + error = ep_insert(ep, &epds, tf.file, fd); } else error = -EEXIST; clear_tfile_check_list(); @@ -1903,9 +1902,9 @@ error_tgt_fput: if (did_lock_epmutex) mutex_unlock(&epmutex); - fput(tfile); + fdput(tf); error_fput: - fput(file); + fdput(f); error_return: return error; diff --git a/fs/exec.c b/fs/exec.c index 9c73def87642..fd774c7cb483 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -608,7 +608,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) return -ENOMEM; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, old_start, old_end); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. @@ -625,7 +625,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } - tlb_finish_mmu(&tlb, new_end, old_end); + tlb_finish_mmu(&tlb, old_start, old_end); /* * Shrink the vma to just the new range. Always succeeds. diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 293bc2e47a73..a235f0016889 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -231,7 +231,7 @@ static int filldir_one(void * __buf, const char * name, int len, int result = 0; buf->sequence++; - if (buf->ino == ino) { + if (buf->ino == ino && len <= NAME_MAX) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index f522425aaa24..bafdd48eefde 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -41,7 +41,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) /** * Check if the given dir-inode refers to an htree-indexed directory - * (or a directory which chould potentially get coverted to use htree + * (or a directory which could potentially get converted to use htree * indexing). 
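In exportfs, filldir_one() now refuses names longer than its fixed buffer instead of memcpy'ing them blindly, so a corrupted or hostile directory entry can no longer overflow buf->name. The bounded-copy shape, sketched standalone (the struct and names are illustrative):

#include <stdio.h>
#include <string.h>

#define NAME_MAX 255

struct getdents_callback {
        char name[NAME_MAX + 1];
        int found;
};

/* Copy only if the name fits; reject oversized input instead of
 * writing past the end of the fixed buffer. */
static int remember_name(struct getdents_callback *buf,
                         const char *name, size_t len)
{
        if (len > NAME_MAX)
                return -1;
        memcpy(buf->name, name, len);
        buf->name[len] = '\0';
        buf->found = 1;
        return 0;
}

int main(void)
{
        struct getdents_callback cb = { .found = 0 };

        if (remember_name(&cb, "dcache.c", 8) == 0)
                printf("kept: %s\n", cb.name);
        return 0;
}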
* * Return 1 if it is a dx dir, 0 if not diff --git a/fs/ext3/super.c b/fs/ext3/super.c index c47f14750722..c50c76190373 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -27,6 +27,7 @@ #include <linux/seq_file.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/namei.h> #include <asm/uaccess.h> @@ -819,6 +820,7 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_path, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -860,6 +862,7 @@ static const match_table_t tokens = { {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, @@ -975,6 +978,11 @@ static int parse_options (char *options, struct super_block *sb, int option; kuid_t uid; kgid_t gid; + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + #ifdef CONFIG_QUOTA int qfmt; #endif @@ -1129,6 +1137,41 @@ static int parse_options (char *options, struct super_block *sb, return 0; *journal_devnum = option; break; + case Opt_journal_path: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext3_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return 0; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext3_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return 0; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext3_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return 0; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); + break; case Opt_noload: set_opt (sbi->s_mount_opt, NOLOAD); break; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index ddd715e42a5c..dc5d572ebd6a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -184,6 +184,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; + struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); @@ -191,11 +192,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag. 
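The new ext3 journal_path= option resolves a path with kern_path() and derives the device number from the resolved inode, instead of making the admin compute journal_dev=<encoded dev> by hand. The userspace analogue of that resolution is stat() plus major()/minor(), sketched here (the device path is just an example):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int main(int argc, char **argv)
{
        struct stat st;
        const char *path = argc > 1 ? argv[1] : "/dev/sda1";

        if (stat(path, &st) != 0) {
                perror("stat");
                return 1;
        }
        if (!S_ISBLK(st.st_mode)) {
                fprintf(stderr, "%s is not a block device\n", path);
                return 1;
        }
        printf("%s -> dev %u:%u\n", path,
               major(st.st_rdev), minor(st.st_rdev));
        return 0;
}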
*/ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } memset(bh->b_data, 0, sb->s_blocksize); @@ -305,7 +304,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, - unsigned int block_group, + ext4_group_t block_group, struct buffer_head *bh) { ext4_grpblk_t offset; @@ -352,10 +351,11 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, - unsigned int block_group, + ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return; @@ -366,12 +366,14 @@ void ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } set_buffer_verified(bh); @@ -445,7 +447,10 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) return bh; verify: ext4_validate_block_bitmap(sb, desc, block_group, bh); - return bh; + if (buffer_verified(bh)) + return bh; + put_bh(bh); + return NULL; } /* Returns 0 on success, 1 on error */ @@ -469,7 +474,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ ext4_validate_block_bitmap(sb, desc, block_group, bh); - return 0; + /* ...but check for error just in case errors=continue. */ + return !buffer_verified(bh); } struct buffer_head * diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 3c7d288ae94c..680bb3388919 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -33,7 +33,7 @@ static int ext4_dx_readdir(struct file *, struct dir_context *); /** * Check if the given dir-inode refers to an htree-indexed directory - * (or a directory which chould potentially get coverted to use htree + * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b577e45425b0..af815ea9d7cc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -180,7 +180,6 @@ struct ext4_map_blocks { * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 -#define EXT4_IO_END_DIRECT 0x0002 /* * For converting uninitialized extents on a work queue. 
'handle' is used for @@ -196,8 +195,6 @@ typedef struct ext4_io_end { unsigned int flag; /* unwritten or not */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ - struct kiocb *iocb; /* iocb struct for AIO */ - int result; /* error value for AIO */ atomic_t count; /* reference counter */ } ext4_io_end_t; @@ -561,6 +558,18 @@ enum { #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x0400 +#define EXT4_EX_FORCE_CACHE 0x0800 + +/* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 @@ -569,6 +578,7 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RESERVE 0x0040 /* * ioctl commands @@ -590,6 +600,7 @@ enum { #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -900,11 +911,9 @@ struct ext4_inode_info { * Completed IOs that need unwritten extents handling and don't have * transaction reserved */ - struct list_head i_unrsv_conversion_list; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ atomic_t i_unwritten; /* Nr. 
of inflight conversions pending */ struct work_struct i_rsv_conversion_work; - struct work_struct i_unrsv_conversion_work; spinlock_t i_block_reservation_lock; @@ -1276,8 +1285,6 @@ struct ext4_sb_info { struct flex_groups *s_flex_groups; ext4_group_t s_flex_groups_allocated; - /* workqueue for unreserved extent convertions (dio) */ - struct workqueue_struct *unrsv_conversion_wq; /* workqueue for reserved extent conversions (buffered io) */ struct workqueue_struct *rsv_conversion_wq; @@ -1340,9 +1347,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - /* Writeback has to have coversion transaction reserved */ - WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle && - !(io_end->flag & EXT4_IO_END_DIRECT)); io_end->flag |= EXT4_IO_END_UNWRITTEN; atomic_inc(&EXT4_I(inode)->i_unwritten); } @@ -1375,6 +1379,7 @@ enum { nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1915,7 +1920,7 @@ extern ext4_group_t ext4_get_group_number(struct super_block *sb, extern void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, - unsigned int block_group, + ext4_group_t block_group, struct buffer_head *bh); extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); @@ -2086,6 +2091,7 @@ extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); @@ -2416,16 +2422,32 @@ do { \ #define EXT4_FREECLUSTERS_WATERMARK 0 #endif +/* Update i_disksize. Requires i_mutex to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { - /* - * XXX: replace with spinlock if seen contended -bzzz - */ + WARN_ON_ONCE(S_ISREG(inode->i_mode) && + !mutex_is_locked(&inode->i_mutex)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; up_write(&EXT4_I(inode)->i_data_sem); - return ; +} + +/* + * Update i_disksize after writeback has been started. Races with truncate + * are avoided by checking i_size under i_data_sem. 
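ext4_wb_update_i_disksize() below is careful to clamp the candidate size to the current i_size before raising i_disksize, all inside i_data_sem, so a truncate racing with writeback cannot be undone. The same clamp-then-raise shape in a small pthread sketch (names illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long i_size = 4096;
static long i_disksize = 0;

/* Raise i_disksize toward newsize, but never past the current i_size:
 * if a truncate shrank the file meanwhile, the clamp wins. */
static void wb_update_disksize(long newsize)
{
        pthread_mutex_lock(&lock);
        if (newsize > i_size)
                newsize = i_size;
        if (newsize > i_disksize)
                i_disksize = newsize;
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        wb_update_disksize(8192);               /* clamped to i_size */
        printf("i_disksize = %ld\n", i_disksize);   /* 4096 */
        return 0;
}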
+ */ +static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) +{ + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (newsize > i_size) + newsize = i_size; + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); } struct ext4_group_info { @@ -2448,9 +2470,15 @@ struct ext4_group_info { #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) @@ -2654,6 +2682,12 @@ extern int ext4_check_blockref(const char *, unsigned int, struct ext4_ext_path; struct ext4_extent; +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff + extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); @@ -2683,7 +2717,8 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); + struct ext4_ext_path *, + int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern int ext4_find_delalloc_range(struct inode *inode, @@ -2692,7 +2727,7 @@ extern int ext4_find_delalloc_range(struct inode *inode, extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); - +extern int ext4_ext_precache(struct inode *inode); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -2715,7 +2750,6 @@ extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); -extern void ext4_end_io_unrsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 51bc821ade90..5074fe23f19e 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -134,12 +134,6 @@ struct ext4_ext_path { */ /* - * Maximum number of logical blocks in a file; ext4_extent's ee_block is - * __le32. - */ -#define EXT_MAX_BLOCKS 0xffffffff - -/* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. 
This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 72a3600aedbd..17ac112ab101 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -255,10 +255,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) { - /* Errors can only happen if there is a bug */ - handle->h_err = err; - __ext4_journal_stop(where, line, handle); + /* Errors can only happen if there is a bug */ + if (WARN_ON_ONCE(err)) { + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); } } else { if (inode) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 2877258d9497..81cfefa9dc0c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -197,7 +197,7 @@ static inline void ext4_journal_callback_add(handle_t *handle, * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister - * Return true if object was sucessfully removed + * Return true if object was successfully removed */ static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a61873808f76..54d52afcdb19 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -407,7 +407,7 @@ static int ext4_valid_extent_entries(struct inode *inode, static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, - int depth) + int depth, ext4_fsblk_t pblk) { const char *error_msg; int max = 0; @@ -447,42 +447,149 @@ static int __ext4_ext_check(const char *function, unsigned int line, corrupted: ext4_error_inode(inode, function, line, 0, - "bad header/extent: %s - magic %x, " - "entries %u, max %u(%u), depth %u(%u)", - error_msg, le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - max, le16_to_cpu(eh->eh_depth), depth); - + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); return -EIO; } -#define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, __LINE__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth, pblk) \ + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) int ext4_ext_check_inode(struct inode *inode) { - return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } -static int __ext4_ext_check_block(const char *function, unsigned int line, - struct inode *inode, - struct ext4_extent_header *eh, - int depth, - struct buffer_head *bh) +static struct buffer_head * +__read_extent_tree_block(const char *function, unsigned int line, + struct inode *inode, ext4_fsblk_t pblk, int depth, + int flags) { - int ret; + struct buffer_head *bh; + int err; - if (buffer_verified(bh)) - return 0; - ret = ext4_ext_check(inode, eh, depth); - if (ret) - return ret; + bh = sb_getblk(inode->i_sb, pblk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); + err = 
bh_submit_read(bh); + if (err < 0) + goto errout; + } + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) + return bh; + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; set_buffer_verified(bh); - return ret; + /* + * If this is a leaf block, cache all of its entries + */ + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { + struct ext4_extent_header *eh = ext_block_hdr(bh); + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, + lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_uninitialized(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } + } + return bh; +errout: + put_bh(bh); + return ERR_PTR(err); + } -#define ext4_ext_check_block(inode, eh, depth, bh) \ - __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh) +#define read_extent_tree_block(inode, pblk, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ + (depth), (flags)) + +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_ext_path *path = NULL; + struct buffer_head *bh; + int i = 0, depth, ret = 0; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return 0; /* not an extent-mapped inode */ + + down_read(&ei->i_data_sem); + depth = ext_depth(inode); + + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + up_read(&ei->i_data_sem); + return -ENOMEM; + } + + /* Don't cache anything if there are no external extent blocks */ + if (depth == 0) + goto out; + path[0].p_hdr = ext_inode_hdr(inode); + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); + if (ret) + goto out; + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); + while (i >= 0) { + /* + * If this is a leaf block or we've reached the end of + * the index block, go up + */ + if ((i == depth) || + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx++), + depth - i - 1, + EXT4_EX_FORCE_CACHE); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + break; + } + i++; + path[i].p_bh = bh; + path[i].p_hdr = ext_block_hdr(bh); + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); + } + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: + up_read(&ei->i_data_sem); + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -716,7 +823,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) struct ext4_ext_path * ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; @@ -748,20 +855,13 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) { - ret = -ENOMEM; + bh = 
read_extent_tree_block(inode, path[ppos].p_block, --i, + flags); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); goto err; } - if (!bh_uptodate_or_lock(bh)) { - trace_ext4_ext_load_extent(inode, block, - path[ppos].p_block); - ret = bh_submit_read(bh); - if (ret < 0) { - put_bh(bh); - goto err; - } - } + eh = ext_block_hdr(bh); ppos++; if (unlikely(ppos > depth)) { @@ -773,11 +873,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; - i--; - - ret = ext4_ext_check_block(inode, eh, i, bh); - if (ret < 0) - goto err; } path[ppos].p_depth = i; @@ -1198,7 +1293,8 @@ out: * if no free index is found, then it requests in-depth growing. */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int flags, + unsigned int mb_flags, + unsigned int gb_flags, struct ext4_ext_path *path, struct ext4_extent *newext) { @@ -1220,7 +1316,7 @@ repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, flags, path, newext, i); + err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) goto out; @@ -1228,12 +1324,12 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, newext); + err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); if (err) goto out; @@ -1241,7 +1337,7 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1412,29 +1508,21 @@ got_index: ix++; block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; - eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check_block(inode, eh, - path->p_depth - depth, bh)) { - put_bh(bh); - return -EIO; - } + bh = read_extent_tree_block(inode, block, + path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); + eh = ext_block_hdr(bh); ix = EXT_FIRST_INDEX(eh); block = ext4_idx_pblock(ix); put_bh(bh); } - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; + bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = ext_block_hdr(bh); - if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) { - put_bh(bh); - return -EIO; - } ex = EXT_FIRST_EXTENT(eh); found_extent: *logical = le32_to_cpu(ex->ee_block); @@ -1705,7 +1793,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | + EXT4_FREE_BLOCKS_RESERVE); } /* @@ -1793,7 +1882,7 @@ out: */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext, int flag) + struct ext4_extent *newext, int gb_flags) { struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; @@ -1802,7 +1891,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; - int flags = 0; + int mb_flags = 0; if 
(unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1817,7 +1906,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { + if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { /* * Try to see whether we should rather test the extent on @@ -1920,7 +2009,7 @@ prepend: if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL); + npath = ext4_ext_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -1939,9 +2028,10 @@ prepend: * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) - flags = EXT4_MB_USE_RESERVED; - err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + mb_flags = EXT4_MB_USE_RESERVED; + err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + path, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2007,7 +2097,7 @@ has_space: merge: /* try to merge extents */ - if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) + if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, nearex); @@ -2050,7 +2140,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, path = NULL; } - path = ext4_ext_find_extent(inode, block, path); + path = ext4_ext_find_extent(inode, block, path, 0); if (IS_ERR(path)) { up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); @@ -2195,8 +2285,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { int depth = ext_depth(inode); - unsigned long len; - ext4_lblk_t lblock; + unsigned long len = 0; + ext4_lblk_t lblock = 0; struct ext4_extent *ex; ex = path[depth].p_ext; @@ -2233,7 +2323,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); } else { - lblock = len = 0; BUG(); } @@ -2712,7 +2801,7 @@ again: ext4_lblk_t ee_block; /* find extent for this block */ - path = ext4_ext_find_extent(inode, end, NULL); + path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2754,6 +2843,7 @@ again: */ err = ext4_split_extent_at(handle, inode, path, end + 1, split_flag, + EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); @@ -2782,7 +2872,7 @@ again: path[0].p_hdr = ext_inode_hdr(inode); i = 0; - if (ext4_ext_check(inode, path[0].p_hdr, depth)) { + if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { err = -EIO; goto out; } @@ -2829,10 +2919,12 @@ again: ext_debug("move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); - if (!bh) { + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx), depth - i - 1, + EXT4_EX_NOCACHE); + if (IS_ERR(bh)) { /* should we reset i_size? */ - err = -EIO; + err = PTR_ERR(bh); break; } /* Yield here to deal with large extent trees. 
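The flag to mb_flags/gb_flags split above keeps the mballoc allocation flags and the get_blocks flags from being conflated, and the earlier EXT4_EX_NOCACHE/EXT4_EX_FORCE_CACHE definitions are deliberately placed above the EXT4_GET_BLOCKS_* bits so both families can travel in a single integer. A sketch of carving disjoint bit ranges out of one flags word (values illustrative):

#include <stdio.h>

/* Low bits: one subsystem's flags. */
#define GB_PRE_IO        0x0008
#define GB_METADATA      0x0100
/* High bits: another subsystem's flags, guaranteed not to collide. */
#define EX_NOCACHE       0x0400
#define EX_FORCE_CACHE   0x0800

int main(void)
{
        unsigned int flags = GB_PRE_IO | EX_NOCACHE;

        printf("get_blocks part: 0x%x\n",
               flags & (GB_PRE_IO | GB_METADATA));
        printf("extent-tree part: 0x%x\n",
               flags & (EX_NOCACHE | EX_FORCE_CACHE));
        return 0;
}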
@@ -2842,11 +2934,6 @@ again: err = -EIO; break; } - if (ext4_ext_check_block(inode, ext_block_hdr(bh), - depth - i - 1, bh)) { - err = -EIO; - break; - } path[i + 1].p_bh = bh; /* save actual number of indexes since this @@ -2961,6 +3048,23 @@ void ext4_ext_release(struct super_block *sb) #endif } +static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { @@ -3113,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle, goto fix_extent_len; /* update extent status tree */ - err = ext4_es_zeroout(inode, &zero_ex); + err = ext4_zeroout_es(inode, &zero_ex); goto out; } else if (err) @@ -3133,7 +3237,7 @@ fix_extent_len: * ext4_split_extents() splits an extent and mark extent which is covered * by @map as split_flags indicates * - * It may result in splitting the extent into multiple extents (upto three) + * It may result in splitting the extent into multiple extents (up to three) * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent @@ -3181,7 +3285,7 @@ static int ext4_split_extent(handle_t *handle, * result in split of original leaf or extent zeroout. */ ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3464,7 +3568,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, out: /* If we have gotten a failure, don't zero out status tree */ if (!err) - err = ext4_es_zeroout(inode, &zero_ex); + err = ext4_zeroout_es(inode, &zero_ex); return err ? 
err : allocated; } @@ -3565,7 +3669,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, if (err < 0) goto out; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -4052,7 +4156,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_ext_find_extent(inode, map->m_lblk, NULL); + path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -4412,7 +4516,7 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode) retry: err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); - if (err == ENOMEM) { + if (err == -ENOMEM) { cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; @@ -4744,6 +4848,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + } + /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, @@ -4771,6 +4881,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_fill_fiemap_extents(inode, start_blk, len_blks, fieinfo); } - + ext4_es_lru_add(inode); return error; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 91cb110da1b4..2d1bdbe78c04 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -13,7 +13,6 @@ #include <linux/list_sort.h> #include "ext4.h" #include "extents_status.h" -#include "ext4_extents.h" #include <trace/events/ext4.h> @@ -263,7 +262,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, if (tree->cache_es) { es1 = tree->cache_es; if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u) %llu %llx\n", + es_debug("%u cached by [%u/%u) %llu %x\n", lblk, es1->es_lblk, es1->es_len, ext4_es_pblock(es1), ext4_es_status(es1)); goto out; @@ -409,6 +408,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) } #ifdef ES_AGGRESSIVE_TEST +#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ + static void ext4_es_insert_extent_ext_check(struct inode *inode, struct extent_status *es) { @@ -419,7 +420,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, unsigned short ee_len; int depth, ee_status, es_status; - path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return; @@ -641,13 +642,13 @@ out: */ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned long long status) + unsigned int status) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err = 0; - es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n", + es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); if (!len) @@ -684,6 +685,38 @@ error: } /* + * ext4_es_cache_extent() inserts information into the extent status + * tree if and only if there isn't information about the range in + * question already. 
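ext4_es_cache_extent() inserts only when the search finds no entry overlapping the range, so stale data read back from disk never overwrites status the tree already holds. The check-then-insert shape on a flat array (a sketch; the kernel uses an rbtree under i_es_lock):

#include <stdio.h>

struct range { unsigned int start, len; };

static struct range cached[16];
static int nr;

/* Insert [start, start+len) only when no cached range overlaps it;
 * information already in the cache is assumed fresher. */
static void cache_range(unsigned int start, unsigned int len)
{
        unsigned int end = start + len - 1;

        for (int i = 0; i < nr; i++) {
                unsigned int cend = cached[i].start + cached[i].len - 1;

                if (start <= cend && cached[i].start <= end)
                        return;         /* overlap: keep the old info */
        }
        if (nr < 16)
                cached[nr++] = (struct range){ start, len };
}

int main(void)
{
        cache_range(0, 8);
        cache_range(4, 8);              /* overlaps [0,8): dropped */
        cache_range(16, 4);
        printf("%d ranges cached\n", nr);       /* prints 2 */
        return 0;
}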
+ */ +void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status *es; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock(&newes, pblk); + ext4_es_store_status(&newes, status); + trace_ext4_es_cache_extent(inode, &newes); + + if (!len) + return; + + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + + es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); + if (!es || es->es_lblk > end) + __es_insert_extent(inode, &newes); + write_unlock(&EXT4_I(inode)->i_es_lock); +} + +/* * ext4_es_lookup_extent() looks up an extent in extent status tree. * * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. @@ -871,23 +904,6 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } -int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) -{ - ext4_lblk_t ee_block; - ext4_fsblk_t ee_pblock; - unsigned int ee_len; - - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - ee_pblock = ext4_ext_pblock(ex); - - if (ee_len == 0) - return 0; - - return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN); -} - static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, struct list_head *b) { @@ -895,6 +911,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, eia = list_entry(a, struct ext4_inode_info, i_es_lru); eib = list_entry(b, struct ext4_inode_info, i_es_lru); + if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return 1; + if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return -1; if (eia->i_touch_when == eib->i_touch_when) return 0; if (time_after(eia->i_touch_when, eib->i_touch_when)) @@ -908,21 +930,13 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, { struct ext4_inode_info *ei; struct list_head *cur, *tmp; - LIST_HEAD(skiped); + LIST_HEAD(skipped); int ret, nr_shrunk = 0; + int retried = 0, skip_precached = 1, nr_skipped = 0; spin_lock(&sbi->s_es_lru_lock); - /* - * If the inode that is at the head of LRU list is newer than - * last_sorted time, that means that we need to sort this list. - */ - ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); - if (sbi->s_es_last_sorted < ei->i_touch_when) { - list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - sbi->s_es_last_sorted = jiffies; - } - +retry: list_for_each_safe(cur, tmp, &sbi->s_es_lru) { /* * If we have already reclaimed all extents from extent @@ -933,9 +947,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ei = list_entry(cur, struct ext4_inode_info, i_es_lru); - /* Skip the inode that is newer than the last_sorted time */ - if (sbi->s_es_last_sorted < ei->i_touch_when) { - list_move_tail(cur, &skiped); + /* + * Skip the inode that is newer than the last_sorted + * time. Normally we try hard to avoid shrinking + * precached inodes, but we will as a last resort. 
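The reworked shrinker below effectively makes two passes: precached inodes are skipped while anything cheaper is available, and only a fruitless first pass makes them eligible. That retry-with-relaxed-criteria shape, reduced to an array sketch (illustrative, not the LRU-list code):

#include <stdio.h>

/* Reclaim up to want items; items marked precious are skipped on the
 * first pass and only touched if the first pass freed nothing. */
static int shrink(int *precious, int n, int want)
{
        int freed = 0;

        for (int pass = 0; pass < 2 && freed == 0; pass++) {
                for (int i = 0; i < n && freed < want; i++) {
                        if (precious[i] < 0)
                                continue;       /* already freed */
                        if (pass == 0 && precious[i])
                                continue;       /* skip precached */
                        precious[i] = -1;
                        freed++;
                }
        }
        return freed;
}

int main(void)
{
        int items[4] = { 1, 1, 1, 1 };  /* everything precached */

        printf("freed %d\n", shrink(items, 4, 2));  /* 2, on pass two */
        return 0;
}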
+ */ + if ((sbi->s_es_last_sorted < ei->i_touch_when) || + (skip_precached && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED))) { + nr_skipped++; + list_move_tail(cur, &skipped); continue; } @@ -955,11 +976,33 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, } /* Move the newer inodes into the tail of the LRU list. */ - list_splice_tail(&skiped, &sbi->s_es_lru); + list_splice_tail(&skipped, &sbi->s_es_lru); + INIT_LIST_HEAD(&skipped); + + /* + * If we skipped any inodes, and we weren't able to make any + * forward progress, sort the list and try again. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); + sbi->s_es_last_sorted = jiffies; + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, + i_es_lru); + /* + * If there are no non-precached inodes left on the + * list, start releasing precached extents. + */ + if (ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) + skip_precached = 0; + goto retry; + } + spin_unlock(&sbi->s_es_lru_lock); if (locked_ei && nr_shrunk == 0) - nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); return nr_shrunk; } @@ -1034,10 +1077,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, struct rb_node *node; struct extent_status *es; int nr_shrunk = 0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (ei->i_es_lru_nr == 0) return 0; + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && + __ratelimit(&_rs)) + ext4_warning(inode->i_sb, "forced shrink of precached extents"); + node = rb_first(&tree->root); while (node != NULL) { es = rb_entry(node, struct extent_status, rb_node); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index e936730cc5b0..167f4ab8ecc3 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -29,16 +29,26 @@ /* * These flags live in the high bits of extent_status.es_pblk */ -#define EXTENT_STATUS_WRITTEN (1ULL << 63) -#define EXTENT_STATUS_UNWRITTEN (1ULL << 62) -#define EXTENT_STATUS_DELAYED (1ULL << 61) -#define EXTENT_STATUS_HOLE (1ULL << 60) +#define ES_SHIFT 60 + +#define EXTENT_STATUS_WRITTEN (1 << 3) +#define EXTENT_STATUS_UNWRITTEN (1 << 2) +#define EXTENT_STATUS_DELAYED (1 << 1) +#define EXTENT_STATUS_HOLE (1 << 0) #define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +#define ES_WRITTEN (1ULL << 63) +#define ES_UNWRITTEN (1ULL << 62) +#define ES_DELAYED (1ULL << 61) +#define ES_HOLE (1ULL << 60) + +#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ + ES_DELAYED | ES_HOLE) + struct ext4_sb_info; struct ext4_extent; @@ -60,7 +70,10 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned long long status); + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_delayed_extent_range(struct inode *inode, @@ -68,36 +81,35 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status 
*es); -extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); static inline int ext4_es_is_written(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0; + return (es->es_pblk & ES_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0; + return (es->es_pblk & ES_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0; + return (es->es_pblk & ES_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_HOLE) != 0; + return (es->es_pblk & ES_HOLE) != 0; } -static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) +static inline unsigned int ext4_es_status(struct extent_status *es) { - return (es->es_pblk & EXTENT_STATUS_FLAGS); + return es->es_pblk >> ES_SHIFT; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { - return (es->es_pblk & ~EXTENT_STATUS_FLAGS); + return es->es_pblk & ~ES_MASK; } static inline void ext4_es_store_pblock(struct extent_status *es, @@ -105,19 +117,16 @@ static inline void ext4_es_store_pblock(struct extent_status *es, { ext4_fsblk_t block; - block = (pb & ~EXTENT_STATUS_FLAGS) | - (es->es_pblk & EXTENT_STATUS_FLAGS); + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_status(struct extent_status *es, - unsigned long long status) + unsigned int status) { - ext4_fsblk_t block; - - block = (status & EXTENT_STATUS_FLAGS) | - (es->es_pblk & ~EXTENT_STATUS_FLAGS); - es->es_pblk = block; + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (es->es_pblk & ~ES_MASK)); } extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6f4cc567c382..3da21945ff1f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -149,7 +149,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 || ret == -EIOCBQUEUED) { + if (ret > 0) { ssize_t err; err = generic_write_sync(file, pos, ret); @@ -219,7 +219,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -259,22 +258,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ - if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { - struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); - - spin_lock(&inode->i_lock); - if (!ei->jinode) { - if (!jinode) { - spin_unlock(&inode->i_lock); - return -ENOMEM; - } - ei->jinode = jinode; - jbd2_journal_init_jbd_inode(ei->jinode, inode); - jinode = NULL; - } - spin_unlock(&inode->i_lock); - if (unlikely(jinode != NULL)) - jbd2_free_inode(jinode); + if (filp->f_mode & FMODE_WRITE) { + int ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; } return dquot_file_open(inode, filp); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f03598c6ffd3..137193ff389b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -70,18 +70,16 @@ static unsigned ext4_init_inode_bitmap(struct 
super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { + struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -117,6 +115,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; + struct ext4_group_info *grp; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +184,8 @@ verify: put_bh(bh); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); + grp = ext4_get_group_info(sb, block_group); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } ext4_unlock_group(sb, block_group); @@ -221,6 +222,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; if (!sb) { printk(KERN_ERR "EXT4-fs: %s:%d: inode on " @@ -266,7 +268,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) + /* Don't bother if the inode bitmap is corrupt. */ + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) goto error_return; BUFFER_TRACE(bitmap_bh, "get_write_access"); @@ -315,8 +319,10 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - } else + } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + } error_return: brelse(bitmap_bh); @@ -625,6 +631,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) + */ +#define RECENTCY_MIN 5 +#define RECENTCY_DIRTY 30 + +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + unsigned long dtime, now; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0, recentcy = RECENTCY_MIN; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + + bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (unlikely(!bh) || !buffer_uptodate(bh)) + /* + * If the block is not in the buffer cache, then it + * must have been written out. 
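When the group-descriptor checksum fails, the ialloc.c hunk above no longer zeroes the free counters and rewrites the bitmap; it sets per-group corruption bits that the allocation and free paths test later and skip. A sketch of that poison-and-skip pattern, with invented names and C11 atomics standing in for the kernel's set_bit()/test_bit():

```c
#include <stdatomic.h>
#include <stdio.h>

#define GRP_BBITMAP_CORRUPT (1u << 0)   /* block bitmap suspect */
#define GRP_IBITMAP_CORRUPT (1u << 1)   /* inode bitmap suspect */

struct group_info { atomic_uint state; };

static void mark_corrupt(struct group_info *grp, unsigned int flags)
{
	/* like set_bit(): later readers see the group as poisoned */
	atomic_fetch_or(&grp->state, flags);
}

static int inode_bitmap_usable(struct group_info *grp)
{
	return !(atomic_load(&grp->state) & GRP_IBITMAP_CORRUPT);
}

int main(void)
{
	struct group_info grp;
	atomic_init(&grp.state, 0);

	/* descriptor checksum failed: poison instead of zeroing counters */
	mark_corrupt(&grp, GRP_BBITMAP_CORRUPT | GRP_IBITMAP_CORRUPT);

	if (!inode_bitmap_usable(&grp))
		puts("skipping group with suspect inode bitmap");
	return 0;
}
```

Poisoning instead of zeroing keeps the on-disk state untouched for later fsck, which is the design point of the EXT4_GROUP_INFO_*_CORRUPT_BIT flags introduced here.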
+ */ + goto out; + + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + dtime = le32_to_cpu(raw_inode->i_dtime); + now = get_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && (dtime < now) && (now < dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; +} + +/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -652,6 +703,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, struct inode *ret; ext4_group_t i; ext4_group_t flex_group; + struct ext4_group_info *grp; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -725,25 +777,39 @@ got_group: continue; } + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (++group == ngroups) + group = 0; + continue; + } + brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!inode_bitmap_bh) - goto out; + /* Skip groups with suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (++group == ngroups) + group = 0; + continue; + } repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - if (ino >= EXT4_INODES_PER_GROUP(sb)) { - if (++group == ngroups) - group = 0; - continue; - } + if (ino >= EXT4_INODES_PER_GROUP(sb)) + goto next_group; if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); continue; } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { + ino++; + goto next_inode; + } if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, @@ -767,8 +833,12 @@ repeat_in_this_group: ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ +next_inode: if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; +next_group: + if (++group == ngroups) + group = 0; } err = -ENOSPC; goto out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 87b30cd357e7..594009f5f523 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -23,7 +23,6 @@ #include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" -#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ #include <trace/events/ext4.h> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ba33c67d6e48..c79fd7dabe79 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -553,16 +553,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertion failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (lookup)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -654,16 +653,15 @@ found: if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertion failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (allocation)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif /* * If the extent has been zeroed out, we don't need to update @@ -729,8 +727,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, ret = ext4_map_blocks(handle, inode, &map, flags); if (ret > 0) { + ext4_io_end_t *io_end = ext4_inode_aio(inode); + map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) + set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } @@ -971,7 +973,8 @@ retry_journal: ext4_journal_stop(handle); goto retry_grab; } - wait_on_page_writeback(page); + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); @@ -1635,16 +1638,15 @@ add_delayed: set_buffer_delay(bh); } else if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertion failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (lookup)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -1893,12 +1895,32 @@ static int ext4_writepage(struct page *page, return ret; } +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ + int len; + loff_t size = i_size_read(mpd->inode); + int err; + + BUG_ON(page->index != mpd->first_page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + clear_page_dirty_for_io(page); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); + if (!err) + mpd->wbc->nr_to_write--; + mpd->first_page++; + + return err; +} + #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). - * The rest of mballoc seems to handle chunks upto full group size. + * The rest of mballoc seems to handle chunks up to full group size. */ #define MAX_WRITEPAGES_EXTENT_LEN 2048 @@ -1907,82 +1929,94 @@ static int ext4_writepage(struct page *page, * * @mpd - extent of blocks * @lblk - logical number of the block in the file - * @b_state - b_state of the buffer head added + * @bh - buffer head we want to add to the extent * - * the function is used to collect contig. blocks in same state + * The function is used to collect contig. blocks in the same state. 
If the + * buffer doesn't require mapping for writeback and we haven't started the + * extent of buffers to map yet, the function returns 'true' immediately - the + * caller can write the buffer right away. Otherwise the function returns true + * if the block has been added to the extent, false if the block couldn't be + * added. */ -static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, - unsigned long b_state) +static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; - /* Don't go larger than mballoc is willing to allocate */ - if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) - return 0; + /* Buffer that doesn't need mapping for writeback? */ + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh))) { + /* So far no extent to map => we write the buffer right away */ + if (map->m_len == 0) + return true; + return false; + } /* First block in the extent? */ if (map->m_len == 0) { map->m_lblk = lblk; map->m_len = 1; - map->m_flags = b_state & BH_FLAGS; - return 1; + map->m_flags = bh->b_state & BH_FLAGS; + return true; } + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return false; + /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && - (b_state & BH_FLAGS) == map->m_flags) { + (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; - return 1; + return true; } - return 0; + return false; } -static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, - struct buffer_head *head, - struct buffer_head *bh, - ext4_lblk_t lblk) +/* + * mpage_process_page_bufs - submit page buffers for IO or add them to extent + * + * @mpd - extent of blocks for mapping + * @head - the first buffer in the page + * @bh - buffer we should start processing from + * @lblk - logical number of the block in the file corresponding to @bh + * + * Walk through page buffers from @bh upto @head (exclusive) and either submit + * the page for IO if all buffers in this page were mapped and there's no + * accumulated extent of buffers to map or add buffers in the page to the + * extent of buffers to map. The function returns 1 if the caller can continue + * by processing the next page, 0 if it should stop adding buffers to the + * extent to map because we cannot extend it anymore. It can also return value + * < 0 in case of error during IO submission. + */ +static int mpage_process_page_bufs(struct mpage_da_data *mpd, + struct buffer_head *head, + struct buffer_head *bh, + ext4_lblk_t lblk) { struct inode *inode = mpd->inode; + int err; ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) >> inode->i_blkbits; do { BUG_ON(buffer_locked(bh)); - if (!buffer_dirty(bh) || !buffer_mapped(bh) || - (!buffer_delay(bh) && !buffer_unwritten(bh)) || - lblk >= blocks) { + if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? 
*/ if (mpd->map.m_len) - return false; - if (lblk >= blocks) - return true; - continue; + return 0; + /* Everything mapped so far and we hit EOF */ + break; } - if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) - return false; } while (lblk++, (bh = bh->b_this_page) != head); - return true; -} - -static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) -{ - int len; - loff_t size = i_size_read(mpd->inode); - int err; - - BUG_ON(page->index != mpd->first_page); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - clear_page_dirty_for_io(page); - err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); - if (!err) - mpd->wbc->nr_to_write--; - mpd->first_page++; - - return err; + /* So far everything mapped? Submit the page for IO. */ + if (mpd->map.m_len == 0) { + err = mpage_submit_page(mpd, head->b_page); + if (err < 0) + return err; + } + return lblk < blocks; } /* @@ -2006,8 +2040,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct inode *inode = mpd->inode; struct buffer_head *head, *bh; int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; - ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) - >> inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; sector_t pblock; @@ -2029,7 +2061,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (page->index > end) break; - /* Upto 'end' pages must be contiguous */ + /* Up to 'end' pages must be contiguous */ BUG_ON(page->index != start); bh = head = page_buffers(page); do { @@ -2042,18 +2074,26 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) */ mpd->map.m_len = 0; mpd->map.m_flags = 0; - add_page_bufs_to_extent(mpd, head, bh, - lblk); + /* + * FIXME: If dioread_nolock supports + * blocksize < pagesize, we need to make + * sure we add size mapped so far to + * io_end->size as the following call + * can submit the page for IO. + */ + err = mpage_process_page_bufs(mpd, head, + bh, lblk); pagevec_release(&pvec); - return 0; + if (err > 0) + err = 0; + return err; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); - } while (++lblk < blocks && - (bh = bh->b_this_page) != head); + } while (lblk++, (bh = bh->b_this_page) != head); /* * FIXME: This is going to break if dioread_nolock @@ -2202,12 +2242,10 @@ static int mpage_map_and_submit_extent(handle_t *handle, /* Update on-disk size after IO is submitted */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); if (disksize > EXT4_I(inode)->i_disksize) { int err2; - ext4_update_i_disksize(inode, disksize); + ext4_wb_update_i_disksize(inode, disksize); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) ext4_error(inode->i_sb, @@ -2222,7 +2260,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, /* * Calculate the total number of credits to reserve for one writepages * iteration. This is called from ext4_writepages(). We map an extent of - * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + * bpp - 1 blocks in bpp different extents. 
*/ @@ -2322,14 +2360,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) lblk = ((ext4_lblk_t)page->index) << (PAGE_CACHE_SHIFT - blkbits); head = page_buffers(page); - if (!add_page_bufs_to_extent(mpd, head, head, lblk)) + err = mpage_process_page_bufs(mpd, head, head, lblk); + if (err <= 0) goto out; - /* So far everything mapped? Submit the page for IO. */ - if (mpd->map.m_len == 0) { - err = mpage_submit_page(mpd, page); - if (err < 0) - goto out; - } + err = 0; /* * Accumulated enough dirty pages? This doesn't apply @@ -2413,7 +2447,7 @@ static int ext4_writepages(struct address_space *mapping, if (ext4_should_dioread_nolock(inode)) { /* - * We may need to convert upto one extent per block in + * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); @@ -2649,7 +2683,7 @@ retry_journal: goto retry_grab; } /* In case writeback began while the page was unlocked */ - wait_on_page_writeback(page); + wait_for_stable_page(page); ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -2994,19 +3028,13 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private, int ret, - bool is_async) + ssize_t size, void *private) { - struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; /* if not async direct IO just return */ - if (!io_end) { - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); + if (!io_end) return; - } ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3016,11 +3044,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, iocb->private = NULL; io_end->offset = offset; io_end->size = size; - if (is_async) { - io_end->iocb = iocb; - io_end->result = ret; - } - ext4_put_io_end_defer(io_end); + ext4_put_io_end(io_end); } /* @@ -3105,7 +3129,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ret = -ENOMEM; goto retake_lock; } - io_end->flag |= EXT4_IO_END_DIRECT; /* * Grab reference for DIO. Will be dropped in ext4_end_io_dio() */ @@ -3150,13 +3173,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { WARN_ON(iocb->private != io_end); WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - WARN_ON(io_end->iocb); - /* - * Generic code already did inode_dio_done() so we - * have to clear EXT4_IO_END_DIRECT to not do it for - * the second time. 
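With AIO completion now handled by the generic direct-I/O layer (the end_io callback above loses its is_async and ret arguments), ext4_end_io_dio() only records the completed range and drops its io_end reference; the final ext4_put_io_end() performs the unwritten-extent conversion work. A toy refcount sketch of that handoff, assuming simplified types rather than the real ext4_io_end_t:

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct io_end {
	atomic_int ref;
	long long  offset, size;   /* filled in at completion time */
};

static struct io_end *io_end_alloc(void)
{
	struct io_end *io = calloc(1, sizeof(*io));
	atomic_init(&io->ref, 1);  /* submitter's reference */
	return io;
}

static void io_end_get(struct io_end *io) { atomic_fetch_add(&io->ref, 1); }

static void io_end_put(struct io_end *io)
{
	if (atomic_fetch_sub(&io->ref, 1) == 1) {
		/* last reference: safe to convert unwritten extents here */
		printf("converting [%lld, +%lld)\n", io->offset, io->size);
		free(io);
	}
}

int main(void)
{
	struct io_end *io = io_end_alloc();

	io_end_get(io);            /* extra ref handed to the DIO path */
	io_end_put(io);            /* submission path finishes, drops its ref */

	/* end_io callback: record the range, drop the DIO reference */
	io->offset = 4096;
	io->size = 8192;
	io_end_put(io);            /* last put performs the conversion */
	return 0;
}
```

Whichever path drops the last reference, and only that path, does the expensive work; that is what lets the patch delete the aio_complete()/inode_dio_done() special cases from the callback.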
- */ - io_end->flag = 0; ext4_put_io_end(io_end); iocb->private = NULL; } @@ -3536,6 +3552,18 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) offset; } + if (offset & (sb->s_blocksize - 1) || + (offset + length) & (sb->s_blocksize - 1)) { + /* + * Attach jinode to inode for jbd2 if we do any zeroing of + * partial block + */ + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + goto out_mutex; + + } + first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; @@ -3604,6 +3632,31 @@ out_mutex: return ret; } +int ext4_inode_attach_jinode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct jbd2_inode *jinode; + + if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) + return 0; + + jinode = jbd2_alloc_inode(GFP_KERNEL); + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + return 0; +} + /* * ext4_truncate() * @@ -3664,6 +3717,12 @@ void ext4_truncate(struct inode *inode) return; } + /* If we zero-out tail of the page, we have to create jinode for jbd2 */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { + if (ext4_inode_attach_jinode(inode) < 0) + return; + } + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else @@ -4526,7 +4585,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + handle_t *handle; + loff_t oldsize = inode->i_size; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4534,73 +4595,69 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } - } - - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size)) { - handle_t *handle; - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - if (ext4_handle_valid(handle)) { - error = ext4_orphan_add(handle, inode); - orphan = 1; - } - EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; - ext4_journal_stop(handle); - - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, + if (S_ISREG(inode->i_mode) && + (attr->ia_size < inode->i_size)) { + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, attr->ia_size); - if (error) { - /* Do as much error cleanup as possible */ - handle = ext4_journal_start(inode, - EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); + if (error) goto err_out; - } - ext4_orphan_del(handle, inode); - orphan = 0; - ext4_journal_stop(handle); + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); goto err_out; } - } - } - - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != inode->i_size) { - loff_t oldsize = inode->i_size; - - i_size_write(inode, attr->ia_size); - /* - * Blocks are going to be removed from the inode. Wait - * for dio in flight. Temporarily disable - * dioread_nolock to prevent livelock. 
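ext4_inode_attach_jinode() above allocates the jbd2_inode before taking i_lock, re-checks the pointer under the lock, and frees the allocation if another thread won the race. A user-space sketch of this allocate-then-double-check pattern, with a pthread mutex standing in for i_lock:

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct jinode { int dummy; };

struct my_inode {
	pthread_mutex_t lock;          /* stands in for inode->i_lock */
	struct jinode  *jinode;
};

static int attach_jinode(struct my_inode *inode)
{
	struct jinode *j;

	if (inode->jinode)             /* fast path: already attached */
		return 0;

	j = malloc(sizeof(*j));        /* may sleep: do it before locking */
	pthread_mutex_lock(&inode->lock);
	if (!inode->jinode) {
		if (!j) {
			pthread_mutex_unlock(&inode->lock);
			return -1;     /* -ENOMEM in the kernel version */
		}
		inode->jinode = j;     /* we won the race */
		j = NULL;
	}
	pthread_mutex_unlock(&inode->lock);
	free(j);                       /* lost the race: discard our copy */
	return 0;
}

int main(void)
{
	struct my_inode ino = { PTHREAD_MUTEX_INITIALIZER, NULL };

	printf("attach: %d, jinode=%p\n", attach_jinode(&ino),
	       (void *)ino.jinode);
	return 0;
}
```

Allocating outside the lock keeps a possibly-sleeping allocation out of the spinlocked section; the double check plus the trailing free makes the race loser harmless.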
- */ - if (orphan) { - if (!ext4_should_journal_data(inode)) { - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ext4_inode_resume_unlocked_dio(inode); - } else - ext4_wait_for_tail_page_commit(inode); + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; } + down_write(&EXT4_I(inode)->i_data_sem); + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; /* - * Truncate pagecache after we've waited for commit - * in data=journal mode to make pages freeable. + * We have to update i_size under i_data_sem together + * with i_disksize to avoid races with writeback code + * running ext4_wb_update_i_disksize(). */ - truncate_pagecache(inode, oldsize, inode->i_size); + if (!error) + i_size_write(inode, attr->ia_size); + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); + if (error) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + } else + i_size_write(inode, attr->ia_size); + + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. Temporarily disable + * dioread_nolock to prevent livelock. + */ + if (orphan) { + if (!ext4_should_journal_data(inode)) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } else + ext4_wait_for_tail_page_commit(inode); } - ext4_truncate(inode); + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, oldsize, inode->i_size); } + /* + * We want to call ext4_truncate() even if attr->ia_size == + * inode->i_size for cases like truncation of fallocated space + */ + if (attr->ia_valid & ATTR_SIZE) + ext4_truncate(inode); if (!rc) { setattr_copy(inode, attr); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 9491ac0590f7..a569d335f804 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -17,7 +17,6 @@ #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" -#include "ext4_extents.h" #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) @@ -77,8 +76,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); - memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); - memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); + ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); + ext4_es_lru_del(inode1); + ext4_es_lru_del(inode2); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); @@ -622,6 +623,8 @@ resizefs_out: return 0; } + case EXT4_IOC_PRECACHE_EXTENTS: + return ext4_ext_precache(inode); default: return -ENOTTY; @@ -686,6 +689,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_MOVE_EXT: case FITRIM: case EXT4_IOC_RESIZE_FS: + case EXT4_IOC_PRECACHE_EXTENTS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4bbbf13bd743..a41e3ba8cfaa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -751,13 +751,15 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd", + "%u clusters in bitmap, %u in gd; " + "block bitmap corrupt.", free, grp->bb_free); /* - * If we intent 
to continue, we consider group descritor + * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -1398,6 +1400,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + /* Don't bother if the block group is corrupt. */ + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return; + mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1423,7 +1429,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, inode ? inode->i_ino : 0, blocknr, "freeing already freed block " - "(bit %u)", block); + "(bit %u); block bitmap corrupt.", + block); + /* Mark the block group as corrupt. */ + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &e4b->bd_info->bb_state); mb_regenerate_buddy(e4b); goto done; } @@ -1790,6 +1800,11 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (err) return err; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { + ext4_mb_unload_buddy(e4b); + return 0; + } + ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); @@ -1987,6 +2002,9 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (cr <= 2 && free < ac->ac_g_ex.fe_len) return 0; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return 0; + /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); @@ -4585,6 +4603,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; @@ -4673,6 +4692,10 @@ do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( + ext4_get_group_info(sb, block_group)))) + return; + /* * Check to see if we are freeing blocks across a group * boundary. 
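mb_free_blocks() above now treats freeing an already-freed block as proof that the block bitmap is corrupt and poisons the group rather than trusting it further. A simplified test-before-clear sketch over a plain bitmap (a set bit means the block is allocated):

```c
#include <stdio.h>

#define BITS 64
static unsigned long bitmap[BITS / (8 * sizeof(unsigned long))];
static int group_corrupt;

static int test_and_clear(unsigned int bit)
{
	unsigned long mask = 1UL << (bit % (8 * sizeof(unsigned long)));
	unsigned long *w = &bitmap[bit / (8 * sizeof(unsigned long))];
	int was_set = (*w & mask) != 0;

	*w &= ~mask;
	return was_set;
}

static void free_block(unsigned int bit)
{
	if (group_corrupt)
		return;            /* don't touch a poisoned group */
	if (!test_and_clear(bit)) {
		fprintf(stderr, "freeing already freed block (bit %u); "
			"block bitmap corrupt\n", bit);
		group_corrupt = 1; /* like EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT */
	}
}

int main(void)
{
	bitmap[0] = 0x8;           /* block 3 currently allocated */
	free_block(3);             /* fine */
	free_block(3);             /* double free -> poison the group */
	printf("corrupt=%d\n", group_corrupt);
	return 0;
}
```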
@@ -4784,7 +4807,6 @@ do_more: ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); @@ -4792,10 +4814,23 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - ext4_mb_unload_buddy(&e4b); - - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, + count_clusters); + spin_lock(&ei->i_block_reservation_lock); + if (flags & EXT4_FREE_BLOCKS_METADATA) + ei->i_reserved_meta_blocks += count_clusters; + else + ei->i_reserved_data_blocks += count_clusters; + spin_unlock(&ei->i_block_reservation_lock); + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_reclaim_block(inode, + EXT4_C2B(sbi, count_clusters)); + } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + + ext4_mb_unload_buddy(&e4b); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 49e8bdff9163..2ae73a80c19b 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); - path = ext4_ext_find_extent(inode, lb->first_block, NULL); + path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); @@ -494,7 +494,7 @@ int ext4_ext_migrate(struct inode *inode) * superblock modification. * * For the tmp_inode we already have committed the - * trascation that created the inode. Later as and + * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e86dddbd8296..7fa4d855dbd5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, int ret = 0; struct ext4_ext_path *path; - path = ext4_ext_find_extent(inode, lblock, *orig_path); + path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); if (IS_ERR(path)) ret = PTR_ERR(path); else if (path[ext_depth(inode)].p_ext == NULL) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 35f55a0dbc4b..1bec5a5c1e45 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3005,15 +3005,19 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. + * + * n.b. 
old_{dentry,inode) refers to the source dentry/inode + * while new_{dentry,inode) refers to the destination dentry/inode + * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - handle_t *handle; + handle_t *handle = NULL; struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; - int retval, force_da_alloc = 0; + int retval; int inlined = 0, new_inlined = 0; struct ext4_dir_entry_2 *parent_de; @@ -3026,14 +3030,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * in separate transaction */ if (new_dentry->d_inode) dquot_initialize(new_dentry->d_inode); - handle = ext4_journal_start(old_dir, EXT4_HT_DIR, - (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - ext4_handle_sync(handle); old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); /* @@ -3056,6 +3052,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, new_bh = NULL; } } + if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) + ext4_alloc_da_blocks(old_inode); + + handle = ext4_journal_start(old_dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + ext4_handle_sync(handle); + if (S_ISDIR(old_inode->i_mode)) { if (new_inode) { retval = -ENOTEMPTY; @@ -3186,8 +3194,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_mark_inode_dirty(handle, new_inode); if (!new_inode->i_nlink) ext4_orphan_add(handle, new_inode); - if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) - force_da_alloc = 1; } retval = 0; @@ -3195,9 +3201,8 @@ end_rename: brelse(dir_bh); brelse(old_bh); brelse(new_bh); - ext4_journal_stop(handle); - if (retval == 0 && force_da_alloc) - ext4_alloc_da_blocks(old_inode); + if (handle) + ext4_journal_stop(handle); return retval; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 6625d210fb45..d7d0c7b46ed4 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -123,10 +123,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) ext4_finish_bio(bio); bio_put(bio); } - if (io_end->flag & EXT4_IO_END_DIRECT) - inode_dio_done(io_end->inode); - if (io_end->iocb) - aio_complete(io_end->iocb, io_end->result, 0); kmem_cache_free(io_end_cachep, io_end); } @@ -204,19 +200,14 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + /* Only reserved conversions from writeback should enter here */ + WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle); spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (io_end->handle) { - wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; - if (list_empty(&ei->i_rsv_conversion_list)) - queue_work(wq, &ei->i_rsv_conversion_work); - list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); - } else { - wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq; - if (list_empty(&ei->i_unrsv_conversion_list)) - queue_work(wq, &ei->i_unrsv_conversion_work); - list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list); - } + wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; 
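ext4_add_complete_io() in the page-io.c hunk here, under i_completed_io_lock, appends each finished io_end to the inode's reserved-conversion list and kicks the workqueue only when the list was empty, so one worker pass drains a whole batch. A sketch of that schedule-on-empty-transition idea, using a plain list and flag instead of a real workqueue (the spinlock is omitted for brevity):

```c
#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int id; };

static struct item *head, **tail = &head;
static int work_scheduled;

static void queue_work(void) { work_scheduled = 1; puts("worker kicked"); }

static void add_complete_io(int id)
{
	struct item *it = malloc(sizeof(*it));

	it->id = id;
	it->next = NULL;
	if (!head)            /* list was empty: one kick covers the batch */
		queue_work();
	*tail = it;
	tail = &it->next;
}

static void worker(void)
{
	for (struct item *it = head; it; ) {
		struct item *next = it->next;

		printf("converting io_end %d\n", it->id);
		free(it);
		it = next;
	}
	head = NULL;
	tail = &head;
	work_scheduled = 0;
}

int main(void)
{
	add_complete_io(1);   /* kicks the worker */
	add_complete_io(2);   /* batched, no second kick */
	if (work_scheduled)
		worker();
	return 0;
}
```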
+ if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } @@ -256,13 +247,6 @@ void ext4_end_io_rsv_work(struct work_struct *work) ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); } -void ext4_end_io_unrsv_work(struct work_struct *work) -{ - struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, - i_unrsv_conversion_work); - ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list); -} - ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bca26f34edf4..2c2e6cbc6bed 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -162,7 +162,7 @@ void *ext4_kvmalloc(size_t size, gfp_t flags) { void *ret; - ret = kmalloc(size, flags); + ret = kmalloc(size, flags | __GFP_NOWARN); if (!ret) ret = __vmalloc(size, flags, PAGE_KERNEL); return ret; @@ -172,7 +172,7 @@ void *ext4_kvzalloc(size_t size, gfp_t flags) { void *ret; - ret = kzalloc(size, flags); + ret = kzalloc(size, flags | __GFP_NOWARN); if (!ret) ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); return ret; @@ -762,9 +762,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - flush_workqueue(sbi->unrsv_conversion_wq); flush_workqueue(sbi->rsv_conversion_wq); - destroy_workqueue(sbi->unrsv_conversion_wq); destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { @@ -875,14 +873,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); - INIT_LIST_HEAD(&ei->i_unrsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); - INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work); return &ei->vfs_inode; } @@ -1134,8 +1130,8 @@ enum { Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, - Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, + Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -1179,6 +1175,7 @@ static const match_table_t tokens = { {Opt_min_batch_time, "min_batch_time=%u"}, {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_abort, "abort"}, @@ -1338,6 +1335,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_NO_EXT2 0x0100 #define MOPT_NO_EXT3 0x0200 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) +#define MOPT_STRING 0x0400 static const struct mount_opts { int token; @@ -1359,7 +1357,7 @@ static const struct mount_opts { {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, 
EXT4_MOUNT_DELALLOC, - MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT}, + MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | @@ -1387,6 +1385,7 @@ static const struct mount_opts { {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, {Opt_journal_dev, 0, MOPT_GTE0}, + {Opt_journal_path, 0, MOPT_STRING}, {Opt_journal_ioprio, 0, MOPT_GTE0}, {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, @@ -1480,7 +1479,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } - if (args->from && match_int(args, &arg)) + if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg)) return -1; if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) return -1; @@ -1544,6 +1543,44 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } *journal_devnum = arg; + } else if (token == Opt_journal_path) { + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext4_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return -1; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext4_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return -1; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return -1; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); } else if (token == Opt_journal_ioprio) { if (arg > 7) { ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" @@ -3483,7 +3520,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (test_opt(sb, DIOREAD_NOLOCK)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and delalloc"); + "both data=journal and dioread_nolock"); goto failed_mount; } if (test_opt(sb, DELALLOC)) @@ -3954,14 +3991,6 @@ no_journal: goto failed_mount4; } - EXT4_SB(sb)->unrsv_conversion_wq = - alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!EXT4_SB(sb)->unrsv_conversion_wq) { - printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); - ret = -ENOMEM; - goto failed_mount4; - } - /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. 
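The new journal_path= option above resolves the pathname, insists the target is a block device, and encodes its device number for the journal code. A user-space approximation with stat(2); major()/minor() come from sys/sysmacros.h, and there is no exact userspace equivalent of kern_path()/new_encode_dev(), so those lines are analogues only:

```c
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

/* Return the device number for a block-device path, or -1 on error. */
static long long journal_devnum(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0) {      /* kern_path() analogue; follows symlinks */
		perror(path);
		return -1;
	}
	if (!S_ISBLK(st.st_mode)) {      /* same check as the patch */
		fprintf(stderr, "%s: not a block device\n", path);
		return -1;
	}
	return (long long)st.st_rdev;    /* new_encode_dev() analogue */
}

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/dev/sda1";
	long long dev = journal_devnum(path);

	if (dev >= 0)
		printf("%s -> dev %u:%u\n", path,
		       major((dev_t)dev), minor((dev_t)dev));
	return dev >= 0 ? 0 : 1;
}
```

Accepting a path rather than a bare journal_dev=%u number is friendlier to mount scripts and is why the option table gains the MOPT_STRING flag so match_int() is skipped for it.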
@@ -4115,8 +4144,6 @@ failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); - if (EXT4_SB(sb)->unrsv_conversion_wq) - destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); failed_mount_wq: if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -4564,7 +4591,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); - flush_workqueue(sbi->unrsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots @@ -4600,7 +4626,6 @@ static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) trace_ext4_sync_fs(sb, wait); flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); - flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); dquot_writeback_dquots(sb, -1); if (wait && test_opt(sb, BARRIER)) ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); @@ -4727,6 +4752,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + err = -EINVAL; + goto restore_opts; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dioread_nolock"); + err = -EINVAL; + goto restore_opts; + } + } + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, "Abort forced by user"); @@ -5481,6 +5521,7 @@ static void __exit ext4_exit_fs(void) kset_unregister(ext4_kset); ext4_exit_system_zone(); ext4_exit_pageio(); + ext4_exit_es(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 66a6b85a51d8..bb312201ca95 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -182,7 +182,7 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, }; -int check_orphan_space(struct f2fs_sb_info *sbi) +int acquire_orphan_inode(struct f2fs_sb_info *sbi) { unsigned int max_orphans; int err = 0; @@ -197,10 +197,19 @@ int check_orphan_space(struct f2fs_sb_info *sbi) mutex_lock(&sbi->orphan_inode_mutex); if (sbi->n_orphans >= max_orphans) err = -ENOSPC; + else + sbi->n_orphans++; mutex_unlock(&sbi->orphan_inode_mutex); return err; } +void release_orphan_inode(struct f2fs_sb_info *sbi) +{ + mutex_lock(&sbi->orphan_inode_mutex); + sbi->n_orphans--; + mutex_unlock(&sbi->orphan_inode_mutex); +} + void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct list_head *head, *this; @@ -229,21 +238,18 @@ retry: list_add(&new->list, this->prev); else list_add_tail(&new->list, head); - - sbi->n_orphans++; out: mutex_unlock(&sbi->orphan_inode_mutex); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *this, *next, *head; + struct list_head *head; struct orphan_inode_entry *orphan; mutex_lock(&sbi->orphan_inode_mutex); head = &sbi->orphan_inode_list; - list_for_each_safe(this, next, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); + list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { list_del(&orphan->list); kmem_cache_free(orphan_entry_slab, orphan); @@ -373,7 +379,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp1; - pre_version = 
le64_to_cpu(cp_block->checkpoint_ver); + pre_version = cur_cp_version(cp_block); /* Read the 2nd cp block in this CP pack */ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; @@ -388,7 +394,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp2; - cur_version = le64_to_cpu(cp_block->checkpoint_ver); + cur_version = cur_cp_version(cp_block); if (cur_version == pre_version) { *version = cur_version; @@ -793,7 +799,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) * Increase the version number so that * SIT entries and seg summaries are written at correct place */ - ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver); + ckpt_ver = cur_cp_version(ckpt); ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 035f9a345cdf..941f9b9ca3a5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -37,9 +37,9 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) struct page *node_page = dn->node_page; unsigned int ofs_in_node = dn->ofs_in_node; - wait_on_page_writeback(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, false); - rn = (struct f2fs_node *)page_address(node_page); + rn = F2FS_NODE(node_page); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); @@ -117,7 +117,8 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) block_t start_blkaddr, end_blkaddr; BUG_ON(blk_addr == NEW_ADDR); - fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node; + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; /* Update the page address in the parent node */ __set_data_blkaddr(dn, blk_addr); @@ -176,7 +177,6 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) end_update: write_unlock(&fi->ext.ext_lock); sync_inode_page(dn); - return; } struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) @@ -260,8 +260,17 @@ repeat: if (PageUptodate(page)) return page; - BUG_ON(dn.data_blkaddr == NEW_ADDR); - BUG_ON(dn.data_blkaddr == NULL_ADDR); + /* + * A new dentry page is allocated but not able to be written, since its + * new inode page couldn't be allocated due to -ENOSPC. + * In such the case, its blkaddr can be remained as NEW_ADDR. + * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. + */ + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return page; + } err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) @@ -365,7 +374,6 @@ static void read_end_io(struct bio *bio, int err) } unlock_page(page); } while (bvec >= bio->bi_io_vec); - kfree(bio->bi_private); bio_put(bio); } @@ -391,7 +399,6 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, bio->bi_end_io = read_end_io; if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - kfree(bio->bi_private); bio_put(bio); up_read(&sbi->bio_sem); f2fs_put_page(page, 1); @@ -442,7 +449,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, unsigned int end_offset; end_offset = IS_INODE(dn.node_page) ? 
- ADDRS_PER_INODE : + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; clear_buffer_new(bh_result); @@ -636,9 +643,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, int err = 0; int ilock; - /* for nobh_write_end */ - *fsdata = NULL; - f2fs_balance_fs(sbi); repeat: page = grab_cache_page_write_begin(mapping, index, flags); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0d6c6aafb235..a84b0a8e6854 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -29,7 +29,7 @@ static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); int i; /* valid check of the segment numbers */ @@ -83,7 +83,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) */ static void update_sit_info(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, vblocks; @@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) */ static void update_mem_info(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned npages; if (si->base_mem) @@ -253,21 +253,21 @@ static int stat_show(struct seq_file *s, void *v) si->nats, NM_WOUT_THRESHOLD); seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", si->sits, si->fnids); - seq_printf(s, "\nDistribution of User Blocks:"); - seq_printf(s, " [ valid | invalid | free ]\n"); - seq_printf(s, " ["); + seq_puts(s, "\nDistribution of User Blocks:"); + seq_puts(s, " [ valid | invalid | free ]\n"); + seq_puts(s, " ["); for (j = 0; j < si->util_valid; j++) - seq_printf(s, "-"); - seq_printf(s, "|"); + seq_putc(s, '-'); + seq_putc(s, '|'); for (j = 0; j < si->util_invalid; j++) - seq_printf(s, "-"); - seq_printf(s, "|"); + seq_putc(s, '-'); + seq_putc(s, '|'); for (j = 0; j < si->util_free; j++) - seq_printf(s, "-"); - seq_printf(s, "]\n\n"); + seq_putc(s, '-'); + seq_puts(s, "]\n\n"); seq_printf(s, "SSR: %u blocks in %u segments\n", si->block_count[SSR], si->segment_count[SSR]); seq_printf(s, "LFS: %u blocks in %u segments\n", @@ -305,11 +305,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; - sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); - if (!sbi->stat_info) + si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); + if (!si) return -ENOMEM; - si = sbi->stat_info; si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -319,6 +318,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); si->sbi = sbi; + sbi->stat_info = si; mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); @@ -329,13 +329,13 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); - kfree(sbi->stat_info); + kfree(si); } void __init f2fs_create_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 
62f0d5977c64..384c6daf9a89 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -270,12 +270,27 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) struct f2fs_node *rn; /* copy name info. to this inode page */ - rn = (struct f2fs_node *)page_address(ipage); + rn = F2FS_NODE(ipage); rn->i.i_namelen = cpu_to_le32(name->len); memcpy(rn->i.i_name, name->name, name->len); set_page_dirty(ipage); } +int update_dent_inode(struct inode *inode, const struct qstr *name) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *page; + + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + init_dent_inode(name, page); + f2fs_put_page(page, 1); + + return 0; +} + static int make_empty_dir(struct inode *inode, struct inode *parent, struct page *page) { @@ -557,6 +572,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode->i_nlink == 0) add_orphan_inode(sbi, inode->i_ino); + else + release_orphan_inode(sbi); } if (bit_pos == NR_DENTRY_IN_BLOCK) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 467d42d65c48..608f0df5b919 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/crc32.h> #include <linux/magic.h> +#include <linux/kobject.h> /* * For mount options @@ -28,6 +29,7 @@ #define F2FS_MOUNT_XATTR_USER 0x00000010 #define F2FS_MOUNT_POSIX_ACL 0x00000020 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 +#define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -134,11 +136,13 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) /* * For INODE and NODE manager */ -#define XATTR_NODE_OFFSET (-1) /* - * store xattrs to one node block per - * file keeping -1 as its node offset to - * distinguish from index node blocks. - */ +/* + * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 + * as its node offset to distinguish from index node blocks. + * But some bits are used to mark the node block. 
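The checkpoint.c hunk earlier renames check_orphan_space() to acquire_orphan_inode() and turns the capacity check into a reservation: n_orphans is bumped while the mutex is held, and f2fs_delete_entry() above calls release_orphan_inode() on the path where the inode keeps a link and no orphan record is actually needed. A sketch of that acquire/release discipline:

```c
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t orphan_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned int n_orphans, max_orphans = 2;

static int acquire_orphan_slot(void)    /* like acquire_orphan_inode() */
{
	int err = 0;

	pthread_mutex_lock(&orphan_mutex);
	if (n_orphans >= max_orphans)
		err = -ENOSPC;
	else
		n_orphans++;            /* check and reserve atomically */
	pthread_mutex_unlock(&orphan_mutex);
	return err;
}

static void release_orphan_slot(void)   /* undo an unused reservation */
{
	pthread_mutex_lock(&orphan_mutex);
	n_orphans--;
	pthread_mutex_unlock(&orphan_mutex);
}

int main(void)
{
	if (acquire_orphan_slot() == 0) {
		int inode_still_linked = 1;    /* i_nlink != 0 */

		if (inode_still_linked)
			release_orphan_slot(); /* no orphan created */
	}
	printf("n_orphans=%u\n", n_orphans);
	return 0;
}
```

Reserving up front closes the race where two deletions both pass a bare capacity check and then overflow the orphan area; the cost is that every path which reserves but does not orphan must release.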
+ */ +#define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \ + >> OFFSET_BIT_SHIFT) enum { ALLOC_NODE, /* allocate a new node page if needed */ LOOKUP_NODE, /* look up a node without readahead */ @@ -178,6 +182,7 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ + unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ }; @@ -296,15 +301,6 @@ struct f2fs_sm_info { }; /* - * For directory operation - */ -#define NODE_DIR1_BLOCK (ADDRS_PER_INODE + 1) -#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2) -#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3) -#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4) -#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5) - -/* * For superblock */ /* @@ -350,6 +346,7 @@ enum page_type { struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ + struct proc_dir_entry *s_proc; /* proc entry */ struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ int s_dirty; /* dirty flag for checkpoint */ @@ -429,6 +426,10 @@ struct f2fs_sb_info { #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ + + /* For sysfs support */ + struct kobject s_kobj; + struct completion s_kobj_unregister; }; /* @@ -454,6 +455,11 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) return (struct f2fs_checkpoint *)(sbi->ckpt); } +static inline struct f2fs_node *F2FS_NODE(struct page *page) +{ + return (struct f2fs_node *)page_address(page); +} + static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); @@ -489,6 +495,11 @@ static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi) sbi->s_dirty = 0; } +static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) +{ + return le64_to_cpu(cp->checkpoint_ver); +} + static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); @@ -677,7 +688,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { block_t start_addr; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver); + unsigned long long ckpt_version = cur_cp_version(ckpt); start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); @@ -812,7 +823,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, static inline bool IS_INODE(struct page *page) { - struct f2fs_node *p = (struct f2fs_node *)page_address(page); + struct f2fs_node *p = F2FS_NODE(page); return RAW_IS_INODE(p); } @@ -826,7 +837,7 @@ static inline block_t datablock_addr(struct page *node_page, { struct f2fs_node *raw_node; __le32 *addr_array; - raw_node = (struct f2fs_node *)page_address(node_page); + raw_node = F2FS_NODE(node_page); addr_array = blkaddr_in_node(raw_node); return le32_to_cpu(addr_array[offset]); } @@ -873,6 +884,7 @@ enum { FI_NO_ALLOC, /* should not allocate any blocks */ FI_UPDATE_DIR, /* should update inode block for consistency */ FI_DELAY_IPUT, /* used for the recovery */ + FI_INLINE_XATTR, /* used for inline xattr */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -905,6 +917,45 @@ static inline int cond_clear_inode_flag(struct
f2fs_inode_info *fi, int flag) return 0; } +static inline void get_inline_info(struct f2fs_inode_info *fi, + struct f2fs_inode *ri) +{ + if (ri->i_inline & F2FS_INLINE_XATTR) + set_inode_flag(fi, FI_INLINE_XATTR); +} + +static inline void set_raw_inline(struct f2fs_inode_info *fi, + struct f2fs_inode *ri) +{ + ri->i_inline = 0; + + if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + ri->i_inline |= F2FS_INLINE_XATTR; +} + +static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) +{ + if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; + return DEF_ADDRS_PER_INODE; +} + +static inline void *inline_xattr_addr(struct page *page) +{ + struct f2fs_inode *ri; + ri = (struct f2fs_inode *)page_address(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - + F2FS_INLINE_XATTR_ADDRS]); +} + +static inline int inline_xattr_size(struct inode *inode) +{ + if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + return F2FS_INLINE_XATTR_ADDRS << 2; + else + return 0; +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; @@ -947,6 +998,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); +int update_dent_inode(struct inode *, const struct qstr *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); @@ -980,6 +1032,7 @@ int is_checkpointed_node(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); +int truncate_xattr_node(struct inode *, struct page *); int remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *, const struct qstr *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); @@ -1012,7 +1065,8 @@ int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); struct bio *f2fs_bio_alloc(struct block_device *, int); -void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); +void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool); +void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, block_t, block_t *); @@ -1037,7 +1091,8 @@ void destroy_segment_manager(struct f2fs_sb_info *); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -int check_orphan_space(struct f2fs_sb_info *); +int acquire_orphan_inode(struct f2fs_sb_info *); +void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); @@ -1068,7 +1123,7 @@ int do_write_data_page(struct page *); */ int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int); +block_t start_bidx_of_node(unsigned 
int, struct f2fs_inode_info *); int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); int __init create_gc_caches(void); @@ -1112,11 +1167,16 @@ struct f2fs_stat_info { unsigned base_mem, cache_mem; }; +static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_stat_info *)sbi->stat_info; +} + #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_seg_count(sbi, type) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ (si)->tot_segs++; \ if (type == SUM_TYPE_DATA) \ si->data_segs++; \ @@ -1129,14 +1189,14 @@ struct f2fs_stat_info { #define stat_inc_data_blk_count(sbi, blks) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ } while (0) #define stat_inc_node_blk_count(sbi, blks) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ } while (0) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d2d2b7dbdcc1..02c906971cc6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -112,11 +112,13 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - inode = igrab(dentry->d_parent->d_inode); - dput(dentry); + if (update_dent_inode(inode, &dentry->d_name)) { + dput(dentry); + return 0; + } - *pino = inode->i_ino; - iput(inode); + *pino = parent_ino(dentry); + dput(dentry); return 1; } @@ -147,9 +149,10 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) mutex_lock(&inode->i_mutex); - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; - + /* + * Both fdatasync() and fsync() are able to be recovered from a + * sudden power-off.
+ */ if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; else if (file_wrong_pino(inode)) @@ -158,10 +161,14 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) need_cp = true; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; + else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) + need_cp = true; if (need_cp) { nid_t pino; + F2FS_I(inode)->xattr_ver = 0; + /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); if (file_wrong_pino(inode) && inode->i_nlink == 1 && @@ -205,7 +212,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; __le32 *addr; - raw_node = page_address(dn->node_page); + raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + ofs; for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { @@ -283,7 +290,7 @@ static int truncate_blocks(struct inode *inode, u64 from) } if (IS_INODE(dn.node_page)) - count = ADDRS_PER_INODE; + count = ADDRS_PER_INODE(F2FS_I(inode)); else count = ADDRS_PER_BLOCK; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 35f9b1a196aa..2f157e883687 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -29,10 +29,11 @@ static struct kmem_cache *winode_slab; static int gc_thread_func(void *data) { struct f2fs_sb_info *sbi = data; + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; long wait_ms; - wait_ms = GC_THREAD_MIN_SLEEP_TIME; + wait_ms = gc_th->min_sleep_time; do { if (try_to_freeze()) @@ -45,7 +46,7 @@ static int gc_thread_func(void *data) break; if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { - wait_ms = GC_THREAD_MAX_SLEEP_TIME; + wait_ms = increase_sleep_time(gc_th, wait_ms); continue; } @@ -66,15 +67,15 @@ static int gc_thread_func(void *data) continue; if (!is_idle(sbi)) { - wait_ms = increase_sleep_time(wait_ms); + wait_ms = increase_sleep_time(gc_th, wait_ms); mutex_unlock(&sbi->gc_mutex); continue; } if (has_enough_invalid_blocks(sbi)) - wait_ms = decrease_sleep_time(wait_ms); + wait_ms = decrease_sleep_time(gc_th, wait_ms); else - wait_ms = increase_sleep_time(wait_ms); + wait_ms = increase_sleep_time(gc_th, wait_ms); #ifdef CONFIG_F2FS_STAT_FS sbi->bg_gc++; @@ -82,7 +83,7 @@ static int gc_thread_func(void *data) /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi)) - wait_ms = GC_THREAD_NOGC_SLEEP_TIME; + wait_ms = gc_th->no_gc_sleep_time; } while (!kthread_should_stop()); return 0; } @@ -101,6 +102,12 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; + gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; + gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; + + gc_th->gc_idle = 0; + sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, @@ -125,9 +132,17 @@ void stop_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = NULL; } -static int select_gc_type(int gc_type) +static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) { - return (gc_type == BG_GC) ? GC_CB : GC_GREEDY; + int gc_mode = (gc_type == BG_GC) ? 
GC_CB : GC_GREEDY; + + if (gc_th && gc_th->gc_idle) { + if (gc_th->gc_idle == 1) + gc_mode = GC_CB; + else if (gc_th->gc_idle == 2) + gc_mode = GC_GREEDY; + } + return gc_mode; } static void select_policy(struct f2fs_sb_info *sbi, int gc_type, @@ -138,12 +153,18 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (p->alloc_mode == SSR) { p->gc_mode = GC_GREEDY; p->dirty_segmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { - p->gc_mode = select_gc_type(gc_type); + p->gc_mode = select_gc_type(sbi->gc_thread, gc_type); p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; p->ofs_unit = sbi->segs_per_sec; } + + if (p->max_search > MAX_VICTIM_SEARCH) + p->max_search = MAX_VICTIM_SEARCH; + p->offset = sbi->last_victim[p->gc_mode]; } @@ -290,7 +311,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (cost == max_cost) continue; - if (nsearched++ >= MAX_VICTIM_SEARCH) { + if (nsearched++ >= p.max_search) { sbi->last_victim[p.gc_mode] = segno; break; } @@ -407,8 +428,7 @@ next_step: /* set page dirty and write it */ if (gc_type == FG_GC) { - f2fs_submit_bio(sbi, NODE, true); - wait_on_page_writeback(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); set_page_dirty(node_page); } else { if (!PageWriteback(node_page)) @@ -447,7 +467,7 @@ next_step: * as indirect or double indirect node blocks, are given, it must be a caller's * bug. */ -block_t start_bidx_of_node(unsigned int node_ofs) +block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -464,7 +484,7 @@ block_t start_bidx_of_node(unsigned int node_ofs) int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); } static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -508,10 +528,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) } else { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (PageWriteback(page)) { - f2fs_submit_bio(sbi, DATA, true); - wait_on_page_writeback(page); - } + f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page) && S_ISDIR(inode->i_mode)) { @@ -575,7 +592,6 @@ next_step: continue; } - start_bidx = start_bidx_of_node(nofs); ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 2) { @@ -583,6 +599,8 @@ next_step: if (IS_ERR(inode)) continue; + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + data_page = find_data_page(inode, start_bidx + ofs_in_node, false); if (IS_ERR(data_page)) @@ -593,6 +611,8 @@ next_step: } else { inode = find_gc_inode(dni.ino, ilist); if (inode) { + start_bidx = start_bidx_of_node(nofs, + F2FS_I(inode)); data_page = get_lock_data_page(inode, start_bidx + ofs_in_node); if (IS_ERR(data_page)) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 2c6a6bd08322..507056d22205 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,18 +13,26 @@ * whether IO subsystem is idle * or not */ -#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ -#define GC_THREAD_MAX_SLEEP_TIME 60000 -#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ +#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 +#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ #define LIMIT_INVALID_BLOCK 40 /* 
percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ /* Search max. number of dirty segments to select a victim segment */ -#define MAX_VICTIM_SEARCH 20 +#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; wait_queue_head_t gc_wait_queue_head; + + /* for gc sleep time */ + unsigned int min_sleep_time; + unsigned int max_sleep_time; + unsigned int no_gc_sleep_time; + + /* for changing gc mode */ + unsigned int gc_idle; }; struct inode_entry { @@ -56,25 +64,25 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } -static inline long increase_sleep_time(long wait) +static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) { - if (wait == GC_THREAD_NOGC_SLEEP_TIME) + if (wait == gc_th->no_gc_sleep_time) return wait; - wait += GC_THREAD_MIN_SLEEP_TIME; - if (wait > GC_THREAD_MAX_SLEEP_TIME) - wait = GC_THREAD_MAX_SLEEP_TIME; + wait += gc_th->min_sleep_time; + if (wait > gc_th->max_sleep_time) + wait = gc_th->max_sleep_time; return wait; } -static inline long decrease_sleep_time(long wait) +static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) { - if (wait == GC_THREAD_NOGC_SLEEP_TIME) - wait = GC_THREAD_MAX_SLEEP_TIME; + if (wait == gc_th->no_gc_sleep_time) + wait = gc_th->max_sleep_time; - wait -= GC_THREAD_MIN_SLEEP_TIME; - if (wait <= GC_THREAD_MIN_SLEEP_TIME) - wait = GC_THREAD_MIN_SLEEP_TIME; + wait -= gc_th->min_sleep_time; + if (wait <= gc_th->min_sleep_time) + wait = gc_th->min_sleep_time; return wait; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2b2d45d19e3e..9339cd292047 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -56,7 +56,7 @@ static int do_read_inode(struct inode *inode) if (IS_ERR(node_page)) return PTR_ERR(node_page); - rn = page_address(node_page); + rn = F2FS_NODE(node_page); ri = &(rn->i); inode->i_mode = le16_to_cpu(ri->i_mode); @@ -85,6 +85,7 @@ static int do_read_inode(struct inode *inode) fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); get_extent_info(&fi->ext, ri->i_ext); + get_inline_info(fi, ri); f2fs_put_page(node_page, 1); return 0; } @@ -151,9 +152,9 @@ void update_inode(struct inode *inode, struct page *node_page) struct f2fs_node *rn; struct f2fs_inode *ri; - wait_on_page_writeback(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, false); - rn = page_address(node_page); + rn = F2FS_NODE(node_page); ri = &(rn->i); ri->i_mode = cpu_to_le16(inode->i_mode); @@ -164,6 +165,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); + set_raw_inline(F2FS_I(inode), ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); @@ -221,9 +223,6 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) return 0; - if (wbc) - f2fs_balance_fs(sbi); - /* * We need to lock here to prevent from producing dirty node pages * during the urgent cleaning time when running out of free sections.
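
Read together, this hunk and the next reorder f2fs_write_inode(): the inode page is now updated under the op mutex first, and f2fs_balance_fs() runs only afterwards, apparently so that writeback no longer kicks the cleaner while the inode itself is still dirty. A rough reconstruction of the post-patch function, pieced together from these two hunks (locals and error paths abbreviated; a sketch, not verbatim kernel source):

	int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
	{
		struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
		int ret, ilock;

		if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
			return 0;

		/* write the on-disk inode copy under the op mutex first ... */
		ilock = mutex_lock_op(sbi);
		ret = update_inode_page(inode);
		mutex_unlock_op(sbi, ilock);

		/* ... and only then trigger balancing, which may start cleaning */
		if (wbc)
			f2fs_balance_fs(sbi);

		return ret;
	}
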
@@ -231,6 +230,10 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) ilock = mutex_lock_op(sbi); ret = update_inode_page(inode); mutex_unlock_op(sbi, ilock); + + if (wbc) + f2fs_balance_fs(sbi); + return ret; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 64c07169df05..2a5359c990fc 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -83,21 +83,11 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); - int ret; if (sublen > slen) return 0; - ret = memcmp(s + slen - sublen, sub, sublen); - if (ret) { /* compare upper case */ - int i; - char upper_sub[8]; - for (i = 0; i < sublen && i < sizeof(upper_sub); i++) - upper_sub[i] = toupper(sub[i]); - return !memcmp(s + slen - sublen, upper_sub, sublen); - } - - return !ret; + return !strncasecmp(s + slen - sublen, sub, sublen); } /* @@ -239,7 +229,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (!de) goto fail; - err = check_orphan_space(sbi); + err = acquire_orphan_inode(sbi); if (err) { kunmap(page); f2fs_put_page(page, 0); @@ -393,7 +383,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct page *old_dir_page; - struct page *old_page; + struct page *old_page, *new_page; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; @@ -415,7 +405,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, ilock = mutex_lock_op(sbi); if (new_inode) { - struct page *new_page; err = -ENOTEMPTY; if (old_dir_entry && !f2fs_empty_dir(new_inode)) @@ -427,14 +416,28 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_dir; + err = acquire_orphan_inode(sbi); + if (err) + goto put_out_dir; + + if (update_dent_inode(old_inode, &new_dentry->d_name)) { + release_orphan_inode(sbi); + goto put_out_dir; + } + f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME; if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); + else + release_orphan_inode(sbi); + + update_inode_page(old_inode); update_inode_page(new_inode); } else { err = f2fs_add_link(new_dentry, old_inode); @@ -467,6 +470,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, mutex_unlock_op(sbi, ilock); return 0; +put_out_dir: + f2fs_put_page(new_page, 1); out_dir: if (old_dir_entry) { kunmap(old_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b418aee09573..51ef27894433 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -315,9 +315,10 @@ cache: * The maximum depth is four. * Offset[0] will have raw inode offset. 
*/ -static int get_node_path(long block, int offset[4], unsigned int noffset[4]) +static int get_node_path(struct f2fs_inode_info *fi, long block, + int offset[4], unsigned int noffset[4]) { - const long direct_index = ADDRS_PER_INODE; + const long direct_index = ADDRS_PER_INODE(fi); const long direct_blks = ADDRS_PER_BLOCK; const long dptrs_per_blk = NIDS_PER_BLOCK; const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; @@ -405,7 +406,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int level, i; int err = 0; - level = get_node_path(index, offset, noffset); + level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -565,7 +566,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } - rn = (struct f2fs_node *)page_address(page); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { child_nid = le32_to_cpu(rn->in.nid[i]); @@ -687,7 +688,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); - level = get_node_path(from, offset, noffset); + level = get_node_path(F2FS_I(inode), from, offset, noffset); restart: page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -698,7 +699,7 @@ restart: set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); - rn = page_address(page); + rn = F2FS_NODE(page); switch (level) { case 0: case 1: @@ -771,6 +772,33 @@ fail: return err > 0 ? 0 : err; } +int truncate_xattr_node(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t nid = F2FS_I(inode)->i_xattr_nid; + struct dnode_of_data dn; + struct page *npage; + + if (!nid) + return 0; + + npage = get_node_page(sbi, nid); + if (IS_ERR(npage)) + return PTR_ERR(npage); + + F2FS_I(inode)->i_xattr_nid = 0; + + /* need to do checkpoint during fsync */ + F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); + + set_new_dnode(&dn, inode, page, npage, nid); + + if (page) + dn.inode_page_locked = 1; + truncate_node(&dn); + return 0; +} + /* * Caller should grab and release a mutex by calling mutex_lock_op() and * mutex_unlock_op(). 
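
The xattr_ver stamp written by truncate_xattr_node() above pairs with the new check added to f2fs_sync_file() in the file.c hunk earlier: if an xattr node was touched within the current checkpoint interval, fsync falls back to a full checkpoint, presumably because roll-forward recovery cannot replay xattr node updates. A condensed sketch of the consumer side, assembled from that file.c hunk (an excerpt, not a self-contained function):

	/* in f2fs_sync_file(): decide whether a full checkpoint is needed */
	else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
		need_cp = true;

	if (need_cp) {
		/* the checkpoint persists the xattr change, so clear the stamp */
		F2FS_I(inode)->xattr_ver = 0;
		ret = f2fs_sync_fs(inode->i_sb, 1);
	}
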
@@ -781,22 +809,16 @@ int remove_inode_page(struct inode *inode) struct page *page; nid_t ino = inode->i_ino; struct dnode_of_data dn; + int err; page = get_node_page(sbi, ino); if (IS_ERR(page)) return PTR_ERR(page); - if (F2FS_I(inode)->i_xattr_nid) { - nid_t nid = F2FS_I(inode)->i_xattr_nid; - struct page *npage = get_node_page(sbi, nid); - - if (IS_ERR(npage)) - return PTR_ERR(npage); - - F2FS_I(inode)->i_xattr_nid = 0; - set_new_dnode(&dn, inode, page, npage, nid); - dn.inode_page_locked = 1; - truncate_node(&dn); + err = truncate_xattr_node(inode, page); + if (err) { + f2fs_put_page(page, 1); + return err; } /* 0 is possible, after f2fs_new_inode() is failed */ @@ -833,29 +855,32 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - get_node_info(sbi, dn->nid, &old_ni); + if (!inc_valid_node_count(sbi, dn->inode, 1)) { + err = -ENOSPC; + goto fail; + } - SetPageUptodate(page); - fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); + get_node_info(sbi, dn->nid, &old_ni); /* Reinitialize old_ni with new node page */ BUG_ON(old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; - - if (!inc_valid_node_count(sbi, dn->inode, 1)) { - err = -ENOSPC; - goto fail; - } set_node_addr(sbi, &new_ni, NEW_ADDR); + + fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); + SetPageUptodate(page); + set_page_dirty(page); + + if (ofs == XATTR_NODE_OFFSET) + F2FS_I(dn->inode)->i_xattr_nid = dn->nid; dn->node_page = page; if (ipage) update_inode(dn->inode, ipage); else sync_inode_page(dn); - set_page_dirty(page); if (ofs == 0) inc_valid_inode_count(sbi); @@ -916,7 +941,6 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, 0); else if (err == LOCKED_PAGE) f2fs_put_page(apage, 1); - return; } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1167,9 +1191,9 @@ static int f2fs_write_node_page(struct page *page, /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. - * Be default, 512 pages (2MB), a segment size, is quite reasonable. + * By default, 512 pages (2MB) * 3 node types, is more reasonable.
*/ -#define COLLECT_DIRTY_NODES 512 +#define COLLECT_DIRTY_NODES 1536 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1187,9 +1211,10 @@ static int f2fs_write_node_pages(struct address_space *mapping, return 0; /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = max_hw_blocks(sbi); + wbc->nr_to_write = 3 * max_hw_blocks(sbi); sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write); + wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - + wbc->nr_to_write); return 0; } @@ -1444,6 +1469,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; + if (!nid) + return; + spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); BUG_ON(!i || i->state != NID_ALLOC); @@ -1484,8 +1512,8 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - src = (struct f2fs_node *)page_address(page); - dst = (struct f2fs_node *)page_address(ipage); + src = F2FS_NODE(page); + dst = F2FS_NODE(ipage); memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); dst->i.i_size = 0; @@ -1515,8 +1543,8 @@ int restore_node_summary(struct f2fs_sb_info *sbi, /* alloc temporary page for read node */ page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (IS_ERR(page)) - return PTR_ERR(page); + if (!page) + return -ENOMEM; lock_page(page); /* scan the node segment */ @@ -1535,7 +1563,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, goto out; lock_page(page); - rn = (struct f2fs_node *)page_address(page); + rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c65fb4f4230f..3496bb3e15dc 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -155,8 +155,7 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(page); if (reset) memset(rn, 0, sizeof(*rn)); rn->footer.nid = cpu_to_le32(nid); @@ -166,10 +165,8 @@ static inline void copy_node_footer(struct page *dst, struct page *src) { - void *src_addr = page_address(src); - void *dst_addr = page_address(dst); - struct f2fs_node *src_rn = (struct f2fs_node *)src_addr; - struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr; + struct f2fs_node *src_rn = F2FS_NODE(src); + struct f2fs_node *dst_rn = F2FS_NODE(dst); memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); } @@ -177,45 +174,40 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(page); + rn->footer.cp_ver = ckpt->checkpoint_ver; rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } static inline nid_t ino_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.ino);
} static inline nid_t nid_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.nid); } static inline unsigned int ofs_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); unsigned flag = le32_to_cpu(rn->footer.flag); return flag >> OFFSET_BIT_SHIFT; } static inline unsigned long long cpver_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le64_to_cpu(rn->footer.cp_ver); } static inline block_t next_blkaddr_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.next_blkaddr); } @@ -237,6 +229,10 @@ static inline block_t next_blkaddr_of_node(struct page *node_page) static inline bool IS_DNODE(struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); + + if (ofs == XATTR_NODE_OFFSET) + return false; + if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) return false; @@ -250,7 +246,7 @@ static inline bool IS_DNODE(struct page *node_page) static inline void set_nid(struct page *p, int off, nid_t nid, bool i) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + struct f2fs_node *rn = F2FS_NODE(p); wait_on_page_writeback(p); @@ -263,7 +259,8 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i) static inline nid_t get_nid(struct page *p, int off, bool i) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + struct f2fs_node *rn = F2FS_NODE(p); + if (i) return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); return le32_to_cpu(rn->in.nid[off]); @@ -314,8 +311,7 @@ static inline void clear_cold_data(struct page *page) static inline int is_node(struct page *page, int type) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(page); return le32_to_cpu(rn->footer.flag) & (1 << type); } @@ -325,7 +321,7 @@ static inline int is_node(struct page *page, int type) static inline void set_cold_node(struct inode *inode, struct page *page) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(page); + struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (S_ISDIR(inode->i_mode)) @@ -337,7 +333,7 @@ static inline void set_cold_node(struct inode *inode, struct page *page) static inline void set_mark(struct page *page, int mark, int type) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(page); + struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) flag |= (0x1 << type); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d56d951c2253..51ef5eec33d7 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, static int recover_dentry(struct page *ipage, struct inode *inode) { - void *kaddr = page_address(ipage); - struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; + struct f2fs_node *raw_node = F2FS_NODE(ipage); struct f2fs_inode *raw_inode = &(raw_node->i); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry 
*de; @@ -93,8 +92,7 @@ out: static int recover_inode(struct inode *inode, struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; + struct f2fs_node *raw_node = F2FS_NODE(node_page); struct f2fs_inode *raw_inode = &(raw_node->i); if (!IS_INODE(node_page)) @@ -119,7 +117,7 @@ static int recover_inode(struct inode *inode, struct page *node_page) static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); + unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page; block_t blkaddr; @@ -131,8 +129,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) /* read node page */ page = alloc_page(GFP_F2FS_ZERO); - if (IS_ERR(page)) - return PTR_ERR(page); + if (!page) + return -ENOMEM; lock_page(page); while (1) { @@ -215,6 +213,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, void *kaddr; struct inode *inode; struct page *node_page; + unsigned int offset; block_t bidx; int i; @@ -259,8 +258,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) return PTR_ERR(node_page); - bidx = start_bidx_of_node(ofs_of_node(node_page)) + - le16_to_cpu(sum.ofs_in_node); + + offset = ofs_of_node(node_page); ino = ino_of_node(node_page); f2fs_put_page(node_page, 1); @@ -269,6 +268,9 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, if (IS_ERR(inode)) return PTR_ERR(inode); + bidx = start_bidx_of_node(offset, F2FS_I(inode)) + + le16_to_cpu(sum.ofs_in_node); + truncate_hole(inode, bidx, bidx + 1); iput(inode); return 0; @@ -277,6 +279,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { + struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int start, end; struct dnode_of_data dn; struct f2fs_summary sum; @@ -284,9 +287,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, int err = 0, recovered = 0; int ilock; - start = start_bidx_of_node(ofs_of_node(page)); + start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) - end = start + ADDRS_PER_INODE; + end = start + ADDRS_PER_INODE(fi); else end = start + ADDRS_PER_BLOCK; @@ -357,7 +360,7 @@ err: static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head, int type) { - unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); + unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page; int err = 0; @@ -369,7 +372,7 @@ static int recover_data(struct f2fs_sb_info *sbi, /* read node page */ page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (IS_ERR(page)) + if (!page) return -ENOMEM; lock_page(page); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a86d125a9885..09af9c7b0f52 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -117,7 +117,6 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } mutex_unlock(&dirty_i->seglist_lock); - return; } /* @@ -261,7 +260,6 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, void *addr = curseg->sum_blk; addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); - return; } /* @@ -542,12 +540,9 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, 
{ struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) { + if (force) new_curseg(sbi, type, true); - goto out; - } - - if (type == CURSEG_WARM_NODE) + else if (type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); @@ -555,11 +550,9 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); else new_curseg(sbi, type, false); -out: #ifdef CONFIG_F2FS_STAT_FS sbi->segment_count[curseg->alloc_type]++; #endif - return; } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -611,18 +604,12 @@ static void f2fs_end_io_write(struct bio *bio, int err) struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) { struct bio *bio; - struct bio_private *priv; -retry: - priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); - if (!priv) { - cond_resched(); - goto retry; - } /* No failure on bio allocation */ bio = bio_alloc(GFP_NOIO, npages); bio->bi_bdev = bdev; - bio->bi_private = priv; + bio->bi_private = NULL; + return bio; } @@ -681,8 +668,17 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, do_submit_bio(sbi, type, false); alloc_new: if (sbi->bio[type] == NULL) { + struct bio_private *priv; +retry: + priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); + if (!priv) { + cond_resched(); + goto retry; + } + sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi)); sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + sbi->bio[type]->bi_private = priv; /* * The end_io will be assigned at the submission phase. * Until then, let bio_add_page() merge consecutive IOs as much @@ -702,6 +698,16 @@ alloc_new: trace_f2fs_submit_write_page(page, blk_addr, type); } +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool sync) +{ + struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + if (PageWriteback(page)) { + f2fs_submit_bio(sbi, type, sync); + wait_on_page_writeback(page); + } +} + static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1179,7 +1185,6 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); - return; } int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 062424a0e4c3..bdd10eab8c40 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -142,6 +142,7 @@ struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ int gc_mode; /* GC_CB or GC_GREEDY */ unsigned long *dirty_segmap; /* dirty segment bitmap */ + unsigned int max_search; /* maximum # of segments to search */ unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ @@ -453,7 +454,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline bool need_SSR(struct f2fs_sb_info *sbi) { - return (free_sections(sbi) < overprovision_sections(sbi)); + return ((prefree_segments(sbi) / sbi->segs_per_sec) + + free_sections(sbi) < overprovision_sections(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) @@ -470,7 +472,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) static inline int utilization(struct f2fs_sb_info *sbi) { - return div_u64(valid_user_blocks(sbi)
* 100, sbi->user_block_count); + return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); } /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 75c7dc363e92..13d0a0fe49dd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -18,20 +18,25 @@ #include <linux/parser.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/proc_fs.h> #include <linux/random.h> #include <linux/exportfs.h> #include <linux/blkdev.h> #include <linux/f2fs_fs.h> +#include <linux/sysfs.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" +#include "gc.h" #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> +static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; +static struct kset *f2fs_kset; enum { Opt_gc_background, @@ -42,6 +47,7 @@ enum { Opt_noacl, Opt_active_logs, Opt_disable_ext_identify, + Opt_inline_xattr, Opt_err, }; @@ -54,9 +60,117 @@ static match_table_t f2fs_tokens = { {Opt_noacl, "noacl"}, {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, + {Opt_inline_xattr, "inline_xattr"}, {Opt_err, NULL}, }; +/* Sysfs support for f2fs */ +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int offset; +}; + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_gc_kthread *gc_kth = sbi->gc_thread; + unsigned int *ui; + + if (!gc_kth) + return -EINVAL; + + ui = (unsigned int *)(((char *)gc_kth) + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + struct f2fs_gc_kthread *gc_kth = sbi->gc_thread; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + if (!gc_kth) + return -EINVAL; + + ui = (unsigned int *)(((char *)gc_kth) + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; + *ui = t; + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? 
a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +#define F2FS_ATTR_OFFSET(_name, _mode, _show, _store, _elname) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .offset = offsetof(struct f2fs_gc_kthread, _elname), \ +} + +#define F2FS_RW_ATTR(name, elname) \ + F2FS_ATTR_OFFSET(name, 0644, f2fs_sbi_show, f2fs_sbi_store, elname) + +F2FS_RW_ATTR(gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(gc_idle, gc_idle); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -126,11 +240,18 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nouser_xattr: clear_opt(sbi, XATTR_USER); break; + case Opt_inline_xattr: + set_opt(sbi, INLINE_XATTR); + break; #else case Opt_nouser_xattr: f2fs_msg(sb, KERN_INFO, "nouser_xattr options not supported"); break; + case Opt_inline_xattr: + f2fs_msg(sb, KERN_INFO, + "inline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_noacl: @@ -180,6 +301,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) set_inode_flag(fi, FI_NEW_INODE); + if (test_opt(F2FS_SB(sb), INLINE_XATTR)) + set_inode_flag(fi, FI_INLINE_XATTR); + return &fi->vfs_inode; } @@ -205,7 +329,6 @@ static int f2fs_drop_inode(struct inode *inode) static void f2fs_dirty_inode(struct inode *inode, int flags) { set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); - return; } static void f2fs_i_callback(struct rcu_head *head) @@ -223,6 +346,12 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); + f2fs_destroy_stats(sbi); stop_gc_thread(sbi); @@ -236,6 +365,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; brelse(sbi->raw_super_buf); @@ -325,6 +456,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); + if (test_opt(sbi, INLINE_XATTR)) + seq_puts(seq, ",inline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -340,6 +473,36 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + for (i = 0; i < total_segs; i++) { + seq_printf(seq, 
"%u", get_valid_blocks(sbi, i, 1)); + if (i != 0 && (i % 10) == 0) + seq_puts(seq, "\n"); + else + seq_puts(seq, " "); + } + return 0; +} + +static int segment_info_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, segment_info_seq_show, PDE_DATA(inode)); +} + +static const struct file_operations f2fs_seq_segment_info_fops = { + .owner = THIS_MODULE, + .open = segment_info_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -455,7 +618,7 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_size(unsigned bits) { - loff_t result = ADDRS_PER_INODE; + loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; /* two direct node blocks */ @@ -766,6 +929,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto fail; + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + if (test_opt(sbi, DISCARD)) { struct request_queue *q = bdev_get_queue(sb->s_bdev); if (!blk_queue_discard(q)) @@ -774,6 +944,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) "the device does not support discard"); } + sbi->s_kobj.kset = f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, + "%s", sb->s_id); + if (err) + goto fail; + return 0; fail: stop_gc_thread(sbi); @@ -841,29 +1018,49 @@ static int __init init_f2fs_fs(void) goto fail; err = create_node_manager_caches(); if (err) - goto fail; + goto free_inodecache; err = create_gc_caches(); if (err) - goto fail; + goto free_node_manager_caches; err = create_checkpoint_caches(); if (err) - goto fail; + goto free_gc_caches; + f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); + if (!f2fs_kset) { + err = -ENOMEM; + goto free_checkpoint_caches; + } err = register_filesystem(&f2fs_fs_type); if (err) - goto fail; + goto free_kset; f2fs_create_root_stats(); + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return 0; + +free_kset: + kset_unregister(f2fs_kset); +free_checkpoint_caches: + destroy_checkpoint_caches(); +free_gc_caches: + destroy_gc_caches(); +free_node_manager_caches: + destroy_node_manager_caches(); +free_inodecache: + destroy_inodecache(); fail: return err; } static void __exit exit_f2fs_fs(void) { + remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); destroy_node_manager_caches(); destroy_inodecache(); + kset_unregister(f2fs_kset); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3ab07ecd86ca..1ac8a5f6e380 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -246,40 +246,170 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int name_index) return handler; } +static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int name_index, + size_t name_len, const char *name) +{ + struct f2fs_xattr_entry *entry; + + list_for_each_xattr(entry, base_addr) { + if (entry->e_name_index != name_index) + continue; + if (entry->e_name_len != name_len) + continue; + if (!memcmp(entry->e_name, name, name_len)) + break; + } + return entry; +} + +static void *read_all_xattrs(struct inode *inode, struct page *ipage) +{ + 
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_xattr_header *header; + size_t size = PAGE_SIZE, inline_size = 0; + void *txattr_addr; + + inline_size = inline_xattr_size(inode); + + txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); + if (!txattr_addr) + return NULL; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + goto fail; + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + } + + /* read from xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) + goto fail; + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + f2fs_put_page(xpage, 1); + } + + header = XATTR_HDR(txattr_addr); + + /* never been allocated xattrs */ + if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { + header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); + header->h_refcount = cpu_to_le32(1); + } + return txattr_addr; +fail: + kzfree(txattr_addr); + return NULL; +} + +static inline int write_all_xattrs(struct inode *inode, __u32 hsize, + void *txattr_addr, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + size_t inline_size = 0; + void *xattr_addr; + struct page *xpage; + nid_t new_nid = 0; + int err; + + inline_size = inline_xattr_size(inode); + + if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) + if (!alloc_nid(sbi, &new_nid)) + return -ENOSPC; + + /* write to inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(page); + } + inline_addr = inline_xattr_addr(page); + } + memcpy(inline_addr, txattr_addr, inline_size); + f2fs_put_page(page, 1); + + /* no need to use xattr node block */ + if (hsize <= inline_size) { + err = truncate_xattr_node(inode, ipage); + alloc_nid_failed(sbi, new_nid); + return err; + } + } + + /* write to xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(xpage); + } + BUG_ON(new_nid); + } else { + struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(xpage); + } + alloc_nid_done(sbi, new_nid); + } + + xattr_addr = page_address(xpage); + memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - + sizeof(struct node_footer)); + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + /* need to checkpoint during fsync */ + F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); + return 0; +} + int f2fs_getxattr(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *entry; - struct page *page; void *base_addr; - int error = 0, found = 0; + int error = 0; size_t value_len, name_len; if (name == NULL) return -EINVAL; 
name_len = strlen(name); - if (!fi->i_xattr_nid) - return -ENODATA; + base_addr = read_all_xattrs(inode, NULL); + if (!base_addr) + return -ENOMEM; - page = get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) - return PTR_ERR(page); - base_addr = page_address(page); - - list_for_each_xattr(entry, base_addr) { - if (entry->e_name_index != name_index) - continue; - if (entry->e_name_len != name_len) - continue; - if (!memcmp(entry->e_name, name, name_len)) { - found = 1; - break; - } - } - if (!found) { + entry = __find_xattr(base_addr, name_index, name_len, name); + if (IS_XATTR_LAST_ENTRY(entry)) { error = -ENODATA; goto cleanup; } @@ -298,28 +428,21 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, error = value_len; cleanup: - f2fs_put_page(page, 1); + kzfree(base_addr); return error; } ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *entry; - struct page *page; void *base_addr; int error = 0; size_t rest = buffer_size; - if (!fi->i_xattr_nid) - return 0; - - page = get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) - return PTR_ERR(page); - base_addr = page_address(page); + base_addr = read_all_xattrs(inode, NULL); + if (!base_addr) + return -ENOMEM; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -342,7 +465,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - f2fs_put_page(page, 1); + kzfree(base_addr); return error; } @@ -351,14 +474,13 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_xattr_header *header = NULL; struct f2fs_xattr_entry *here, *last; - struct page *page; void *base_addr; - int error, found, free, newsize; + int found, newsize; size_t name_len; - char *pval; int ilock; + __u32 new_hsize; + int error = -ENOMEM; if (name == NULL) return -EINVAL; @@ -368,67 +490,21 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, name_len = strlen(name); - if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN) + if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN(inode)) return -ERANGE; f2fs_balance_fs(sbi); ilock = mutex_lock_op(sbi); - if (!fi->i_xattr_nid) { - /* Allocate new attribute block */ - struct dnode_of_data dn; - - if (!alloc_nid(sbi, &fi->i_xattr_nid)) { - error = -ENOSPC; - goto exit; - } - set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); - mark_inode_dirty(inode); - - page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); - if (IS_ERR(page)) { - alloc_nid_failed(sbi, fi->i_xattr_nid); - fi->i_xattr_nid = 0; - error = PTR_ERR(page); - goto exit; - } - - alloc_nid_done(sbi, fi->i_xattr_nid); - base_addr = page_address(page); - header = XATTR_HDR(base_addr); - header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); - header->h_refcount = cpu_to_le32(1); - } else { - /* The inode already has an extended attribute block. 
*/ - page = get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) { - error = PTR_ERR(page); - goto exit; - } - - base_addr = page_address(page); - header = XATTR_HDR(base_addr); - } - - if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { - error = -EIO; - goto cleanup; - } + base_addr = read_all_xattrs(inode, ipage); + if (!base_addr) + goto exit; /* find entry with wanted name. */ - found = 0; - list_for_each_xattr(here, base_addr) { - if (here->e_name_index != name_index) - continue; - if (here->e_name_len != name_len) - continue; - if (!memcmp(here->e_name, name, name_len)) { - found = 1; - break; - } - } + here = __find_xattr(base_addr, name_index, name_len, name); + found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; last = here; while (!IS_XATTR_LAST_ENTRY(last)) @@ -439,22 +515,25 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, /* 1. Check space */ if (value) { - /* If value is NULL, it is remove operation. + int free; + /* + * If value is NULL, it is remove operation. * In case of update operation, we calculate free. */ - free = MIN_OFFSET - ((char *)last - (char *)header); + free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); if (found) free = free - ENTRY_SIZE(here); if (free < newsize) { error = -ENOSPC; - goto cleanup; + goto exit; } } /* 2. Remove old entry */ if (found) { - /* If entry is found, remove old entry. + /* + * If entry is found, remove old entry. * If not found, remove operation is not needed. */ struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); @@ -465,10 +544,15 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, memset(last, 0, oldsize); } + new_hsize = (char *)last - (char *)base_addr; + /* 3. Write new entry */ if (value) { - /* Before we come here, old entry is removed. - * We just write new entry. */ + char *pval; + /* + * Before we come here, old entry is removed. + * We just write new entry.
+ */ memset(last, 0, newsize); last->e_name_index = name_index; last->e_name_len = name_len; @@ -476,26 +560,25 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, pval = last->e_name + name_len; memcpy(pval, value, value_len); last->e_value_size = cpu_to_le16(value_len); + new_hsize += newsize; } - set_page_dirty(page); - f2fs_put_page(page, 1); + error = write_all_xattrs(inode, new_hsize, base_addr, ipage); + if (error) + goto exit; if (is_inode_flag_set(fi, FI_ACL_MODE)) { inode->i_mode = fi->i_acl_mode; inode->i_ctime = CURRENT_TIME; clear_inode_flag(fi, FI_ACL_MODE); } + if (ipage) update_inode(inode, ipage); else update_inode_page(inode); - mutex_unlock_op(sbi, ilock); - - return 0; -cleanup: - f2fs_put_page(page, 1); exit: mutex_unlock_op(sbi, ilock); + kzfree(base_addr); return error; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 3c0817bef25d..02a08fb88a15 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -51,7 +51,7 @@ struct f2fs_xattr_entry { #define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) #define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr)) -#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1)) +#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) #define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) @@ -69,17 +69,16 @@ struct f2fs_xattr_entry { !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ + sizeof(struct node_footer) - sizeof(__u32)) -#define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \ - sizeof(struct node_footer) - \ - sizeof(__u32)) - -#define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \ - sizeof(struct f2fs_xattr_entry)) +#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ + sizeof(struct f2fs_xattr_header) - \ + sizeof(struct f2fs_xattr_entry)) /* * On-disk structure of f2fs_xattr - * We use only 1 block for xattr. + * We use inline xattrs space + 1 block for xattr. * * +--------------------+ * | f2fs_xattr_header | diff --git a/fs/fcntl.c b/fs/fcntl.c index 6599222536eb..65343c3741ff 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -730,14 +730,14 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. 
*/ - BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH + __FMODE_EXEC | O_PATH | __O_TMPFILE )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/fs/file_table.c b/fs/file_table.c index b44e4c559786..322cd37626cb 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -385,6 +385,10 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb) */ void file_sb_list_add(struct file *file, struct super_block *sb) { + if (likely(!(file->f_mode & FMODE_WRITE))) + return; + if (!S_ISREG(file_inode(file)->i_mode)) + return; lg_local_lock(&files_lglock); __file_sb_list_add(file, sb); lg_local_unlock(&files_lglock); @@ -450,8 +454,6 @@ void mark_files_ro(struct super_block *sb) lg_global_lock(&files_lglock); do_file_list_for_each_entry(sb, f) { - if (!S_ISREG(file_inode(f)->i_mode)) - continue; if (!file_count(f)) continue; if (!(f->f_mode & FMODE_WRITE)) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 0e91a3c9fdb2..318e8433527c 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -558,3 +558,74 @@ void __fscache_cookie_put(struct fscache_cookie *cookie) _leave(""); } + +/* + * check the consistency between the netfs inode and the backing cache + * + * NOTE: it only serves cookies of the no-index (data file) type + */ +int __fscache_check_consistency(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + int ret; + + _enter("%p,", cookie); + + ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + if (hlist_empty(&cookie->backing_objects)) + return 0; + + op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!op) + return -ENOMEM; + + fscache_operation_init(op, NULL, NULL); + op->flags = FSCACHE_OP_MYTHREAD | + (1 << FSCACHE_OP_WAITING); + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto inconsistent; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto inconsistent; + + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + + atomic_inc(&cookie->n_active); + if (fscache_submit_op(object, op) < 0) + goto submit_failed; + + /* the work queue now carries its own ref on the object */ + spin_unlock(&cookie->lock); + + ret = fscache_wait_for_operation_activation(object, op, + NULL, NULL, NULL); + if (ret == 0) { + /* ask the cache to honour the operation */ + ret = object->cache->ops->check_consistency(op); + fscache_op_complete(op, false); + } else if (ret == -ENOBUFS) { + ret = 0; + } + + fscache_put_operation(op); + _leave(" = %d", ret); + return ret; + +submit_failed: + atomic_dec(&cookie->n_active); +inconsistent: + spin_unlock(&cookie->lock); + kfree(op); + _leave(" = -ESTALE"); + return -ESTALE; +} +EXPORT_SYMBOL(__fscache_check_consistency); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 12d505bedb5c..4226f6680b06 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -130,6 +130,12 @@ extern void fscache_operation_gc(struct work_struct *); /* * page.c */ +extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); +extern int
fscache_wait_for_operation_activation(struct fscache_object *, + struct fscache_operation *, + atomic_t *, + atomic_t *, + void (*)(struct fscache_operation *)); extern void fscache_invalidate_writes(struct fscache_cookie *); /* diff --git a/fs/fscache/page.c b/fs/fscache/page.c index d479ab3c63e4..8702b732109a 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -278,7 +278,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval( /* * wait for a deferred lookup to complete */ -static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) +int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) { unsigned long jif; @@ -322,42 +322,46 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op) /* * wait for an object to become active (or dead) */ -static int fscache_wait_for_retrieval_activation(struct fscache_object *object, - struct fscache_retrieval *op, - atomic_t *stat_op_waits, - atomic_t *stat_object_dead) +int fscache_wait_for_operation_activation(struct fscache_object *object, + struct fscache_operation *op, + atomic_t *stat_op_waits, + atomic_t *stat_object_dead, + void (*do_cancel)(struct fscache_operation *)) { int ret; - if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags)) + if (!test_bit(FSCACHE_OP_WAITING, &op->flags)) goto check_if_dead; _debug(">>> WT"); - fscache_stat(stat_op_waits); - if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + if (stat_op_waits) + fscache_stat(stat_op_waits); + if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval); + ret = fscache_cancel_op(op, do_cancel); if (ret == 0) return -ERESTARTSYS; /* it's been removed from the pending queue by another party, * so we should get to run shortly */ - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + wait_on_bit(&op->flags, FSCACHE_OP_WAITING, fscache_wait_bit, TASK_UNINTERRUPTIBLE); } _debug("<<< GO"); check_if_dead: - if (op->op.state == FSCACHE_OP_ST_CANCELLED) { - fscache_stat(stat_object_dead); + if (op->state == FSCACHE_OP_ST_CANCELLED) { + if (stat_object_dead) + fscache_stat(stat_object_dead); _leave(" = -ENOBUFS [cancelled]"); return -ENOBUFS; } if (unlikely(fscache_object_is_dead(object))) { - pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state); - fscache_cancel_op(&op->op, fscache_do_cancel_retrieval); - fscache_stat(stat_object_dead); + pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state); + fscache_cancel_op(op, do_cancel); + if (stat_object_dead) + fscache_stat(stat_object_dead); return -ENOBUFS; } return 0; @@ -432,10 +436,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_retrieval_activation( - object, op, + ret = fscache_wait_for_operation_activation( + object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); + __fscache_stat(&fscache_n_retrievals_object_dead), + fscache_do_cancel_retrieval); if (ret < 0) goto error; @@ -557,10 +562,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_retrieval_activation( - object, op, + ret = fscache_wait_for_operation_activation( + object, &op->op, 
__fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); + __fscache_stat(&fscache_n_retrievals_object_dead), + fscache_do_cancel_retrieval); if (ret < 0) goto error; @@ -658,10 +664,11 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_alloc_ops); - ret = fscache_wait_for_retrieval_activation( - object, op, + ret = fscache_wait_for_operation_activation( + object, &op->op, __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead)); + __fscache_stat(&fscache_n_allocs_object_dead), + fscache_do_cancel_retrieval); if (ret < 0) goto error; @@ -694,6 +701,22 @@ nobufs: EXPORT_SYMBOL(__fscache_alloc_page); /* + * Unmark pages allocated in the readahead code path (via: + * fscache_readpages_or_alloc) after delegating to the base filesystem + */ +void __fscache_readpages_cancel(struct fscache_cookie *cookie, + struct list_head *pages) +{ + struct page *page; + + list_for_each_entry(page, pages, lru) { + if (PageFsCache(page)) + __fscache_uncache_page(cookie, page); + } +} +EXPORT_SYMBOL(__fscache_readpages_cancel); + +/* * release a write op reference */ static void fscache_release_write_op(struct fscache_operation *_op) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index aef34b1e635e..adbfd66b380f 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -568,6 +568,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev, return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); } +static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL); static ssize_t cuse_class_abort_store(struct device *dev, struct device_attribute *attr, @@ -578,12 +579,14 @@ static ssize_t cuse_class_abort_store(struct device *dev, fuse_abort_conn(&cc->fc); return count; } +static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store); -static struct device_attribute cuse_class_dev_attrs[] = { - __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), - __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), - { } +static struct attribute *cuse_class_dev_attrs[] = { + &dev_attr_waiting.attr, + &dev_attr_abort.attr, + NULL, }; +ATTRIBUTE_GROUPS(cuse_class_dev); static struct miscdevice cuse_miscdev = { .minor = MISC_DYNAMIC_MINOR, @@ -609,7 +612,7 @@ static int __init cuse_init(void) if (IS_ERR(cuse_class)) return PTR_ERR(cuse_class); - cuse_class->dev_attrs = cuse_class_dev_attrs; + cuse_class->dev_groups = cuse_class_dev_groups; rc = misc_register(&cuse_miscdev); if (rc) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1d55f9465400..ef74ad5fd362 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1765,11 +1765,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, /* Look up request on processing list by unique ID */ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) { - struct list_head *entry; + struct fuse_req *req; - list_for_each(entry, &fc->processing) { - struct fuse_req *req; - req = list_entry(entry, struct fuse_req, list); + list_for_each_entry(req, &fc->processing, list) { if (req->in.h.unique == unique || req->intr_unique == unique) return req; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 72a5d5b04494..3ac91086f41f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -182,10 +182,11 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) struct inode *inode; struct dentry *parent; struct fuse_conn *fc; + int ret; inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) - return
0; + goto invalid; else if (fuse_dentry_time(entry) < get_jiffies_64()) { int err; struct fuse_entry_out outarg; @@ -195,20 +196,23 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) /* For negative dentries, always do a fresh lookup */ if (!inode) - return 0; + goto invalid; + ret = -ECHILD; if (flags & LOOKUP_RCU) - return -ECHILD; + goto out; fc = get_fuse_conn(inode); req = fuse_get_req_nopages(fc); + ret = PTR_ERR(req); if (IS_ERR(req)) - return 0; + goto out; forget = fuse_alloc_forget(); if (!forget) { fuse_put_request(fc, req); - return 0; + ret = -ENOMEM; + goto out; } attr_version = fuse_get_attr_version(fc); @@ -227,7 +231,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) struct fuse_inode *fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { fuse_queue_forget(fc, forget, outarg.nodeid, 1); - return 0; + goto invalid; } spin_lock(&fc->lock); fi->nlookup++; @@ -235,7 +239,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) } kfree(forget); if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) - return 0; + goto invalid; fuse_change_attributes(inode, &outarg.attr, entry_attr_timeout(&outarg), @@ -249,7 +253,15 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) dput(parent); } } - return 1; + ret = 1; +out: + return ret; + +invalid: + ret = 0; + if (check_submounts_and_drop(entry) != 0) + ret = 1; + goto out; } static int invalid_nodeid(u64 nodeid) @@ -267,26 +279,6 @@ int fuse_valid_type(int m) S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } -/* - * Add a directory inode to a dentry, ensuring that no other dentry - * refers to this inode. Called with fc->inst_mutex. - */ -static struct dentry *fuse_d_add_directory(struct dentry *entry, - struct inode *inode) -{ - struct dentry *alias = d_find_alias(inode); - if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { - /* This tries to shrink the subtree below alias */ - fuse_invalidate_entry(alias); - dput(alias); - if (!hlist_empty(&inode->i_dentry)) - return ERR_PTR(-EBUSY); - } else { - dput(alias); - } - return d_splice_alias(inode, entry); -} - int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { @@ -345,6 +337,24 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, return err; } +static struct dentry *fuse_materialise_dentry(struct dentry *dentry, + struct inode *inode) +{ + struct dentry *newent; + + if (inode && S_ISDIR(inode->i_mode)) { + struct fuse_conn *fc = get_fuse_conn(inode); + + mutex_lock(&fc->inst_mutex); + newent = d_materialise_unique(dentry, inode); + mutex_unlock(&fc->inst_mutex); + } else { + newent = d_materialise_unique(dentry, inode); + } + + return newent; +} + static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { @@ -352,7 +362,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, struct fuse_entry_out outarg; struct inode *inode; struct dentry *newent; - struct fuse_conn *fc = get_fuse_conn(dir); bool outarg_valid = true; err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, @@ -368,16 +377,10 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (inode && get_node_id(inode) == FUSE_ROOT_ID) goto out_iput; - if (inode && S_ISDIR(inode->i_mode)) { - mutex_lock(&fc->inst_mutex); - newent = fuse_d_add_directory(entry, inode); - mutex_unlock(&fc->inst_mutex); - err = 
PTR_ERR(newent); - if (IS_ERR(newent)) - goto out_iput; - } else { - newent = d_splice_alias(inode, entry); - } + newent = fuse_materialise_dentry(entry, inode); + err = PTR_ERR(newent); + if (IS_ERR(newent)) + goto out_err; entry = newent ? newent : entry; if (outarg_valid) @@ -1174,6 +1177,8 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, return -EIO; if (reclen > nbytes) break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; if (!dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, dirent->type)) @@ -1275,18 +1280,10 @@ static int fuse_direntplus_link(struct file *file, if (!inode) goto out; - if (S_ISDIR(inode->i_mode)) { - mutex_lock(&fc->inst_mutex); - alias = fuse_d_add_directory(dentry, inode); - mutex_unlock(&fc->inst_mutex); - err = PTR_ERR(alias); - if (IS_ERR(alias)) { - iput(inode); - goto out; - } - } else { - alias = d_splice_alias(inode, dentry); - } + alias = fuse_materialise_dentry(dentry, inode); + err = PTR_ERR(alias); + if (IS_ERR(alias)) + goto out; if (alias) { dput(dentry); @@ -1320,6 +1317,8 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, return -EIO; if (reclen > nbytes) break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; if (!over) { /* We fill entries into dstbuf only as much as @@ -1590,6 +1589,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_req *req; struct fuse_setattr_in inarg; struct fuse_attr_out outarg; @@ -1617,8 +1617,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (IS_ERR(req)) return PTR_ERR(req); - if (is_truncate) + if (is_truncate) { fuse_set_nowrite(inode); + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); @@ -1680,12 +1682,14 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, invalidate_inode_pages2(inode->i_mapping); } + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); return 0; error: if (is_truncate) fuse_release_nowrite(inode); + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); return err; } @@ -1749,6 +1753,8 @@ static int fuse_setxattr(struct dentry *entry, const char *name, fc->no_setxattr = 1; err = -EOPNOTSUPP; } + if (!err) + fuse_invalidate_attr(inode); return err; } @@ -1878,6 +1884,8 @@ static int fuse_removexattr(struct dentry *entry, const char *name) fc->no_removexattr = 1; err = -EOPNOTSUPP; } + if (!err) + fuse_invalidate_attr(inode); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5c121fe19c5f..d409deafc67b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -629,7 +629,8 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fc->lock); - if (attr_ver == fi->attr_version && size < inode->i_size) { + if (attr_ver == fi->attr_version && size < inode->i_size && + !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { fi->attr_version = ++fc->attr_version; i_size_write(inode, size); } @@ -1032,12 +1033,16 @@ static ssize_t fuse_perform_write(struct file *file, { struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; ssize_t res = 0; if (is_bad_inode(inode)) return -EIO; + if (inode->i_size < pos + iov_iter_count(ii)) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + do { struct fuse_req *req; ssize_t count; @@ -1073,6 
+1078,7 @@ static ssize_t fuse_perform_write(struct file *file, if (res > 0) fuse_write_update_size(inode, pos); + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); fuse_invalidate_attr(inode); return res > 0 ? res : err; @@ -1529,7 +1535,6 @@ static int fuse_writepage_locked(struct page *page) inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); - end_page_writeback(page); spin_lock(&fc->lock); list_add(&req->writepages_entry, &fi->writepages); @@ -1537,6 +1542,8 @@ static int fuse_writepage_locked(struct page *page) fuse_flush_writepages(inode); spin_unlock(&fc->lock); + end_page_writeback(page); + return 0; err_free: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fde7249a3a96..5ced199b50bb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -115,6 +115,8 @@ struct fuse_inode { enum { /** Advise readdirplus */ FUSE_I_ADVISE_RDPLUS, + /** An operation changing file size is in progress */ + FUSE_I_SIZE_UNSTABLE, }; struct fuse_conn; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0b578598c6ac..e0fe703ee3d6 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -201,7 +201,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, struct timespec old_mtime; spin_lock(&fc->lock); - if (attr_version != 0 && fi->attr_version > attr_version) { + if ((attr_version != 0 && fi->attr_version > attr_version) || + test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { spin_unlock(&fc->lock); return; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ee48ad37d9c0..1f7d8057ea68 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -122,14 +122,13 @@ out: } /** - * gfs2_writeback_writepage - Write page for writeback mappings + * gfs2_writepage - Write page for writeback mappings * @page: The page * @wbc: The writeback control * */ -static int gfs2_writeback_writepage(struct page *page, - struct writeback_control *wbc) +static int gfs2_writepage(struct page *page, struct writeback_control *wbc) { int ret; @@ -141,32 +140,6 @@ static int gfs2_writeback_writepage(struct page *page, } /** - * gfs2_ordered_writepage - Write page for ordered data files - * @page: The page to write - * @wbc: The writeback control - * - */ - -static int gfs2_ordered_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - int ret; - - ret = gfs2_writepage_common(page, wbc); - if (ret <= 0) - return ret; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1); - return block_write_full_page(page, gfs2_get_block_noalloc, wbc); -} - -/** * __gfs2_jdata_writepage - The core of jdata writepage * @page: The page to write * @wbc: The writeback control @@ -842,6 +815,8 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, unsigned int from = pos & (PAGE_CACHE_SIZE - 1); unsigned int to = from + len; int ret; + struct gfs2_trans *tr = current->journal_info; + BUG_ON(!tr); BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); @@ -852,8 +827,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, goto failed; } - gfs2_trans_add_meta(ip->i_gl, dibh); - if (gfs2_is_stuffed(ip)) return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); @@ -861,6 +834,11 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, gfs2_page_add_databufs(ip, page, from, to); 
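The FUSE_I_SIZE_UNSTABLE bit threaded through the fuse hunks above follows a simple protocol: every path about to change i_size (truncation in fuse_do_setattr(), extending writes in fuse_perform_write()) sets the bit first and clears it when finished, and consumers of attribute replies (fuse_read_update_size() here, fuse_change_attributes() below) refuse to act on a possibly stale size while the bit is set. Condensed into a single guarded update -- a sketch assembled from those hunks, not a separate helper added by the patch:

static void fuse_read_update_size_sketch(struct inode *inode, loff_t size,
					 u64 attr_ver)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fc->lock);
	/* only trust a shrinking size if no size-changing op is in flight */
	if (attr_ver == fi->attr_version && size < inode->i_size &&
	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
		fi->attr_version = ++fc->attr_version;
		i_size_write(inode, size);
	}
	spin_unlock(&fc->lock);
}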
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (tr->tr_num_buf_new) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + else + gfs2_trans_add_meta(ip->i_gl, dibh); + if (inode == sdp->sd_rindex) { adjust_fs_space(inode); @@ -1107,7 +1085,7 @@ cannot_release: } static const struct address_space_operations gfs2_writeback_aops = { - .writepage = gfs2_writeback_writepage, + .writepage = gfs2_writepage, .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, @@ -1123,7 +1101,7 @@ static const struct address_space_operations gfs2_writeback_aops = { }; static const struct address_space_operations gfs2_ordered_aops = { - .writepage = gfs2_ordered_writepage, + .writepage = gfs2_writepage, .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index f2448ab2aac5..d3a5d4e29ba5 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -93,12 +93,9 @@ invalid_gunlock: if (!had_lock) gfs2_glock_dq_uninit(&d_gh); invalid: - if (inode && S_ISDIR(inode->i_mode)) { - if (have_submounts(dentry)) - goto valid; - shrink_dcache_parent(dentry); - } - d_drop(dentry); + if (check_submounts_and_drop(dentry) != 0) + goto valid; + dput(parent); return 0; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 0cb4c1557f20..2e5fc268d324 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1859,7 +1859,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - ht = kzalloc(size, GFP_NOFS); + ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) ht = vzalloc(size); if (!ht) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 72c3866a7320..0621b46d474d 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -650,7 +650,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); + int sync_state = inode->i_state & I_DIRTY; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; @@ -660,6 +660,8 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret1; } + if (!gfs2_is_jdata(ip)) + sync_state &= ~I_DIRTY_PAGES; if (datasync) sync_state &= ~I_DIRTY_SYNC; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9435384562a2..722329cac98f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1411,7 +1411,6 @@ __acquires(&lru_lock) if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); - smp_mb__after_clear_bit(); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put_nolock(gl); spin_unlock(&gl->gl_spin); @@ -1488,7 +1487,7 @@ static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, rcu_read_lock(); hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { - if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref)) + if ((gl->gl_sbd == sdp) && atomic_inc_not_zero(&gl->gl_ref)) examiner(gl); } rcu_read_unlock(); @@ -1508,18 +1507,17 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) * thaw_glock - thaw out a glock which has an unprocessed reply waiting * @gl: The glock to thaw * - * N.B. When we freeze a glock, we leave a ref to the glock outstanding, - * so this has to result in the ref count being dropped by one. 
*/ static void thaw_glock(struct gfs2_glock *gl) { if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) - return; + goto out; set_bit(GLF_REPLY_PENDING, &gl->gl_flags); - gfs2_glock_hold(gl); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) { +out: gfs2_glock_put(gl); + } } /** @@ -1536,7 +1534,6 @@ static void clear_glock(struct gfs2_glock *gl) if (gl->gl_state != LM_ST_UNLOCKED) handle_callback(gl, LM_ST_UNLOCKED, 0, false); spin_unlock(&gl->gl_spin); - gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); } @@ -1838,14 +1835,14 @@ int __init gfs2_glock_init(void) glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0); - if (IS_ERR(glock_workqueue)) - return PTR_ERR(glock_workqueue); + if (!glock_workqueue) + return -ENOMEM; gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); - if (IS_ERR(gfs2_delete_workqueue)) { + if (!gfs2_delete_workqueue) { destroy_workqueue(glock_workqueue); - return PTR_ERR(gfs2_delete_workqueue); + return -ENOMEM; } register_shrinker(&glock_shrinker); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5f2e5224c51c..e2e0a90396e7 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -47,7 +47,8 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) * None of the buffers should be dirty, locked, or pinned. */ -static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, + unsigned int nr_revokes) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; @@ -57,7 +58,9 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) { + list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) { + if (nr_revokes == 0) + break; bh = bd->bd_bh; if (bh->b_state & b_state) { if (fsync) @@ -65,6 +68,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_ail_error(gl, bh); } gfs2_trans_add_revoke(sdp, bd); + nr_revokes--; } GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); @@ -91,7 +95,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl, 0); + __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); @@ -101,15 +105,19 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); + unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); int ret; if (!revokes) return; - ret = gfs2_trans_begin(sdp, 0, revokes); + while (revokes > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + + ret = gfs2_trans_begin(sdp, 0, max_revokes); if (ret) return; - __gfs2_ail_flush(gl, fsync); + __gfs2_ail_flush(gl, fsync, max_revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index bbb2715171cd..64915eeae5a7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -594,7 +594,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, } gfs2_glock_dq_uninit(ghs); if (IS_ERR(d)) - return PTR_RET(d); + return PTR_ERR(d); 
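The gfs2_glock_init() hunk above fixes a real API mix-up: alloc_workqueue() reports failure by returning NULL, not an ERR_PTR value, so the removed IS_ERR()/PTR_ERR() checks could never trigger and a failed allocation would have been dereferenced later. The corrected pattern in isolation (example_wq and example_init are illustrative names, not part of the patch):

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example_wq",
				     WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
	if (!example_wq)	/* NULL on failure, never an ERR_PTR */
		return -ENOMEM;
	return 0;
}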
return error; } else if (error != -ENOENT) { goto fail_gunlock; @@ -1750,6 +1750,10 @@ static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, struct gfs2_holder gh; int ret; + /* For selinux during lookup */ + if (gfs2_glock_is_locked_by_me(ip->i_gl)) + return generic_getxattr(dentry, name, data, size); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 17c5b5d7dc88..010b9fb9fec6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -579,6 +579,24 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, return error; } +/** + * gfs2_meta_sync - Sync all buffers associated with a glock + * @gl: The glock + * + */ + +static void gfs2_meta_sync(struct gfs2_glock *gl) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + int error; + + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + + if (error) + gfs2_io_error(gl->gl_sbd); +} + static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index e04d0e09ee7b..7b0f5043cf24 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -155,7 +155,7 @@ static int __init init_gfs2_fs(void) goto fail_wq; gfs2_control_wq = alloc_workqueue("gfs2_control", - WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0); + WQ_UNBOUND | WQ_FREEZABLE, 0); if (!gfs2_control_wq) goto fail_recovery; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0da390686c08..932415050540 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -98,24 +98,6 @@ const struct address_space_operations gfs2_meta_aops = { }; /** - * gfs2_meta_sync - Sync all buffers associated with a glock - * @gl: The glock - * - */ - -void gfs2_meta_sync(struct gfs2_glock *gl) -{ - struct address_space *mapping = gfs2_glock2aspace(gl); - int error; - - filemap_fdatawrite(mapping); - error = filemap_fdatawait(mapping); - - if (error) - gfs2_io_error(gl->gl_sbd); -} - -/** * gfs2_getbuf - Get a buffer with a given address space * @gl: the glock * @blkno: the block number (filesystem scope) diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 0d4c843b6f8e..4823b934208a 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -48,21 +48,17 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) return inode->i_sb->s_fs_info; } -void gfs2_meta_sync(struct gfs2_glock *gl); - -struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); -int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, - int flags, struct buffer_head **bhp); -int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); -struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); - -void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, - int meta); - -void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); - -int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, - struct buffer_head **bhp); +extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); +extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + struct buffer_head **bhp); +extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, + int create); +extern void gfs2_remove_from_journal(struct buffer_head *bh, + struct gfs2_trans *tr, int meta); +extern void gfs2_meta_wipe(struct 
gfs2_inode *ip, u64 bstart, u32 blen); +extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + struct buffer_head **bhp); static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, struct buffer_head **bhp) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 0262c190b6f9..19ff5e8c285c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -646,6 +646,48 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) return error; } +/** + * check_journal_clean - Make sure a journal is clean for a spectator mount + * @sdp: The GFS2 superblock + * @jd: The journal descriptor + * + * Returns: 0 if the journal is clean or locked, else an error + */ +static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) +{ + int error; + struct gfs2_holder j_gh; + struct gfs2_log_header_host head; + struct gfs2_inode *ip; + + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | + GL_EXACT | GL_NOCACHE, &j_gh); + if (error) { + fs_err(sdp, "Error locking journal for spectator mount.\n"); + return -EPERM; + } + error = gfs2_jdesc_check(jd); + if (error) { + fs_err(sdp, "Error checking journal for spectator mount.\n"); + goto out_unlock; + } + error = gfs2_find_jhead(jd, &head); + if (error) { + fs_err(sdp, "Error parsing journal for spectator mount.\n"); + goto out_unlock; + } + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EPERM; + fs_err(sdp, "jid=%u: Journal is dirty, so the first mounter " + "must not be a spectator.\n", jd->jd_jid); + } + +out_unlock: + gfs2_glock_dq_uninit(&j_gh); + return error; +} + static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = sdp->sd_master_dir->d_inode; @@ -732,8 +774,15 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_lockstruct.ls_first) { unsigned int x; for (x = 0; x < sdp->sd_journals; x++) { - error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x), - true); + struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x); + + if (sdp->sd_args.ar_spectator) { + error = check_journal_clean(sdp, jd); + if (error) + goto fail_jinode_gh; + continue; + } + error = gfs2_recover_journal(jd, true); if (error) { fs_err(sdp, "error recovering journal %u: %d\n", x, error); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cddb05217512..25437280a207 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -361,6 +361,13 @@ retry: return 0; } +static int hostfs_file_release(struct inode *inode, struct file *file) +{ + filemap_write_and_wait(inode->i_mapping); + + return 0; +} + int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; @@ -386,7 +393,7 @@ static const struct file_operations hostfs_file_fops = { .write = do_sync_write, .mmap = generic_file_mmap, .open = hostfs_file_open, - .release = NULL, + .release = hostfs_file_release, .fsync = hostfs_fsync, }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a3f868ae3fd4..d19b30ababf1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -463,6 +463,14 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, return inode; } +/* + * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never + * be taken from reclaim -- unlike regular filesystems. This needs an + * annotation because huge_pmd_share() does an allocation under + * i_mmap_mutex. 
+ */ +struct lock_class_key hugetlbfs_i_mmap_mutex_key; + static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev) @@ -474,6 +482,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); + lockdep_set_class(&inode->i_mapping->i_mmap_mutex, + &hugetlbfs_i_mmap_mutex_key); inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info = &hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -916,14 +926,8 @@ static int get_hstate_idx(int page_size_log) return h - hstates; } -static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) -{ - return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", - dentry->d_name.name); -} - static struct dentry_operations anon_ops = { - .d_dname = hugetlb_dname + .d_dname = simple_dname }; /* diff --git a/fs/inode.c b/fs/inode.c index d6dfb09c8280..93a0625b46e4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1525,7 +1525,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -void touch_atime(struct path *path) +void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = path->dentry->d_inode; diff --git a/fs/internal.h b/fs/internal.h index 7c5f01cf619d..2be46ea5dd0b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -45,6 +45,9 @@ extern void __init chrdev_init(void); * namei.c */ extern int __inode_permission(struct inode *, int); +extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); +extern int vfs_path_lookup(struct dentry *, struct vfsmount *, + const char *, unsigned int, struct path *); /* * namespace.c @@ -126,6 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool); * dcache.c */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); +extern int d_set_mounted(struct dentry *dentry); /* * read_write.c diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index c348d6d88624..e5d408a7ea4a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -117,8 +117,8 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { - /* we probably want a lot more here */ - *flags |= MS_RDONLY; + if (!(*flags & MS_RDONLY)) + return -EROFS; return 0; } @@ -763,15 +763,6 @@ root_found: */ s->s_maxbytes = 0x80000000000LL; - /* - * The CDROM is read-only, has no nodes (devices) on it, and since - * all of the files appear to be owned by root, we really do not want - * to allow suid. (suid or devices will not show up unless we have - * Rock Ridge extensions) - */ - - s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */; - /* Set this for reference. It's not currently used except on write which we don't have ..
*/ @@ -1530,6 +1521,9 @@ struct inode *isofs_iget(struct super_block *sb, static struct dentry *isofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { + /* We don't support read-write mounts */ + if (!(flags & MS_RDONLY)) + return ERR_PTR(-EACCES); return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 11bb11f48b3a..bb217dcb41af 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -340,13 +340,13 @@ void journal_commit_transaction(journal_t *journal) J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; - J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_RUNNING); commit_transaction->t_state = T_LOCKED; trace_jbd_commit_locking(journal, commit_transaction); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 6510d6355729..2d04f9afafd7 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -90,6 +90,24 @@ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); static const char *journal_dev_name(journal_t *journal, char *buffer); +#ifdef CONFIG_JBD_DEBUG +void __jbd_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + va_end(args); +} +EXPORT_SYMBOL(__jbd_debug); +#endif + /* * Helper function used to manage commit timeouts */ diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 559bec1a37b4..cf2fc0594063 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -343,14 +343,14 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct page *page = bh->b_page; __u8 *addr; __u32 csum32; + __be32 seq; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; - sequence = cpu_to_be32(sequence); + seq = cpu_to_be32(sequence); addr = kmap_atomic(page); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, - sizeof(sequence)); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), bh->b_size); kunmap_atomic(addr); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 02c7ad9d7a41..52032647dd4a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -130,9 +130,10 @@ int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; } -static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) { - __u32 csum, old_csum; + __u32 csum; + __be32 old_csum; old_csum = sb->s_checksum; sb->s_checksum = 0; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d4851464b57e..3929c50428b1 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -178,7 +178,8 @@ static int jbd2_descr_block_csum_verify(journal_t *j, void *buf) { struct jbd2_journal_block_tail *tail; - __u32 provided, calculated; + __be32 provided; + __u32 calculated; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ 
-190,8 +191,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j, calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); tail->t_checksum = provided; - provided = be32_to_cpu(provided); - return provided == calculated; + return provided == cpu_to_be32(calculated); } /* @@ -381,7 +381,8 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) { struct commit_header *h; - __u32 provided, calculated; + __be32 provided; + __u32 calculated; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -392,21 +393,20 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); h->h_chksum[0] = provided; - provided = be32_to_cpu(provided); - return provided == calculated; + return provided == cpu_to_be32(calculated); } static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, void *buf, __u32 sequence) { __u32 csum32; + __be32 seq; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; - sequence = cpu_to_be32(sequence); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, - sizeof(sequence)); + seq = cpu_to_be32(sequence); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); return tag->t_checksum == cpu_to_be16(csum32); @@ -808,7 +808,8 @@ static int jbd2_revoke_block_csum_verify(journal_t *j, void *buf) { struct jbd2_journal_revoke_tail *tail; - __u32 provided, calculated; + __be32 provided; + __u32 calculated; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -820,8 +821,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j, calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); tail->r_checksum = provided; - provided = be32_to_cpu(provided); - return provided == calculated; + return provided == cpu_to_be32(calculated); } /* Scan a revoke record, marking all blocks mentioned as revoked. */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 8743ba9c6742..984c2bbf4f61 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -3047,6 +3047,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) dir_index = (u32) ctx->pos; + /* + * NFSv4 reserves cookies 1 and 2 for . and .. so the value + * we return to the vfs is one greater than the one we use + * internally. + */ + if (dir_index) + dir_index--; + if (dir_index > 1) { struct dir_table_slot dirtab_slot; @@ -3086,7 +3094,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); - ctx->pos = -1; + ctx->pos = DIREND; return 0; } } else { @@ -3094,14 +3102,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) /* * self "." */ - ctx->pos = 0; + ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ - ctx->pos = 1; + ctx->pos = 2; if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; @@ -3122,22 +3130,23 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * - * pn = index = 0: First entry "." - * pn = 0; index = 1: Second entry ".." + * pn = 0; index = 1: First entry "." + * pn = 0; index = 2: Second entry ".." * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ dtpos = ctx->pos; - if (dtpos == 0) { + if (dtpos < 2) { /* build "." 
entry */ + ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; - dtoffset->index = 1; + dtoffset->index = 2; ctx->pos = dtpos; } if (dtoffset->pn == 0) { - if (dtoffset->index == 1) { + if (dtoffset->index == 2) { /* build ".." entry */ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; @@ -3228,6 +3237,12 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) } jfs_dirent->position = unique_pos++; } + /* + * We add 1 to the index because we may + * use a value of 2 internally, and NFSv4 + * doesn't like that. + */ + jfs_dirent->position++; } else { jfs_dirent->position = dtpos; len = min(d_namleft, DTLHDRDATALEN_LEGACY); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 01bfe7662751..41e491b8e5d7 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -64,12 +64,17 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) nlm_init->protocol, nlm_version, nlm_init->hostname, nlm_init->noresvport, nlm_init->net); - if (host == NULL) { - lockd_down(nlm_init->net); - return ERR_PTR(-ENOLCK); - } + if (host == NULL) + goto out_nohost; + if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) + goto out_nobind; return host; +out_nobind: + nlmclnt_release_host(host); +out_nohost: + lockd_down(nlm_init->net); + return ERR_PTR(-ENOLCK); } EXPORT_SYMBOL_GPL(nlmclnt_init); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 9760ecb9b60f..acd394716349 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -125,14 +125,15 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) { struct nlm_args *argp = &req->a_args; struct nlm_lock *lock = &argp->lock; + char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); - lock->caller = utsname()->nodename; + lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", (unsigned int)fl->fl_u.nfs_fl.owner->pid, - utsname()->nodename); + nodename); lock->svid = fl->fl_u.nfs_fl.owner->pid; lock->fl.fl_start = fl->fl_start; lock->fl.fl_end = fl->fl_end; diff --git a/fs/namei.c b/fs/namei.c index 8b61d103a8a7..409a441ba2ae 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -508,56 +508,78 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; - int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - want_root = 1; - spin_lock(&fs->lock); - if (nd->root.mnt != fs->root.mnt || - nd->root.dentry != fs->root.dentry) - goto err_root; - } - spin_lock(&parent->d_lock); + + /* + * Get a reference to the parent first: we're + * going to make "path_put(nd->path)" valid in + * non-RCU context for "terminate_walk()". + * + * If this doesn't work, return immediately with + * RCU walking still active (and then we will do + * the RCU walk cleanup in terminate_walk()). + */ + if (!lockref_get_not_dead(&parent->d_lockref)) + return -ECHILD; + + /* + * After the mntget(), terminate_walk() will do + * the right thing for non-RCU mode, and all our + * subsequent exit cases should unlock_rcu_walk() + * before returning. + */ + mntget(nd->path.mnt); + nd->flags &= ~LOOKUP_RCU; + + /* + * For a negative lookup, the lookup sequence point is the parent's + * sequence point, and it only needs to revalidate the parent dentry.
+ * + * For a positive lookup, we need to move both the parent and the + * dentry from the RCU domain to be properly refcounted. And the + * sequence number in the dentry validates *both* dentry counters, + * since we checked the sequence number of the parent after we got + * the child sequence number. So we know the parent must still + * be valid if the child sequence number is still valid. + */ if (!dentry) { - if (!__d_rcu_to_refcount(parent, nd->seq)) - goto err_parent; + if (read_seqcount_retry(&parent->d_seq, nd->seq)) + goto out; BUG_ON(nd->inode != parent->d_inode); } else { - if (dentry->d_parent != parent) - goto err_parent; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err_child; - /* - * If the sequence check on the child dentry passed, then - * the child has not been removed from its parent. This - * means the parent dentry must be valid and able to take - * a reference at this point. - */ - BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; - spin_unlock(&dentry->d_lock); + if (!lockref_get_not_dead(&dentry->d_lockref)) + goto out; + if (read_seqcount_retry(&dentry->d_seq, nd->seq)) + goto drop_dentry; } - spin_unlock(&parent->d_lock); - if (want_root) { + + /* + * Sequence counts matched. Now make sure that the root is + * still valid and get it if required. + */ + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) + goto unlock_and_drop_dentry; path_get(&nd->root); spin_unlock(&fs->lock); } - mntget(nd->path.mnt); unlock_rcu_walk(); - nd->flags &= ~LOOKUP_RCU; return 0; -err_child: - spin_unlock(&dentry->d_lock); -err_parent: - spin_unlock(&parent->d_lock); -err_root: - if (want_root) - spin_unlock(&fs->lock); +unlock_and_drop_dentry: + spin_unlock(&fs->lock); +drop_dentry: + unlock_rcu_walk(); + dput(dentry); + goto drop_root_mnt; +out: + unlock_rcu_walk(); +drop_root_mnt: + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; return -ECHILD; } @@ -585,14 +607,16 @@ static int complete_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - spin_lock(&dentry->d_lock); - if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { - spin_unlock(&dentry->d_lock); + + if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { unlock_rcu_walk(); return -ECHILD; } - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); + if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { + unlock_rcu_walk(); + dput(dentry); + return -ECHILD; + } mntget(nd->path.mnt); unlock_rcu_walk(); } @@ -2184,6 +2208,194 @@ user_path_parent(int dfd, const char __user *path, struct nameidata *nd, return s; } +/** + * mountpoint_last - look up last component for umount + * @nd: pathwalk nameidata - currently pointing at parent directory of "last" + * @path: pointer to container for result + * + * This is a special lookup_last function just for umount. In this case, we + * need to resolve the path without doing any revalidation. + * + * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since + * mountpoints are always pinned in the dcache, their ancestors are too. Thus, + * in almost all cases, this lookup will be served out of the dcache. The only + * cases where it won't are if nd->last refers to a symlink or the path is + * bogus and it doesn't exist. 
+ * + * Returns: + * -error: if there was an error during lookup. This includes -ENOENT if the + * lookup found a negative dentry. The nd->path reference will also be + * put in this case. + * + * 0: if we successfully resolved nd->path and found it not to be a + * symlink that needs to be followed. "path" will also be populated. + * The nd->path reference will also be put. + * + * 1: if we successfully resolved nd->last and found it to be a symlink + * that needs to be followed. "path" will be populated with the path + * to the link, and nd->path will *not* be put. + */ +static int +mountpoint_last(struct nameidata *nd, struct path *path) +{ + int error = 0; + struct dentry *dentry; + struct dentry *dir = nd->path.dentry; + + /* If we're in rcuwalk, drop out of it to handle last component */ + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL)) { + error = -ECHILD; + goto out; + } + } + + nd->flags &= ~LOOKUP_PARENT; + + if (unlikely(nd->last_type != LAST_NORM)) { + error = handle_dots(nd, nd->last_type); + if (error) + goto out; + dentry = dget(nd->path.dentry); + goto done; + } + + mutex_lock(&dir->d_inode->i_mutex); + dentry = d_lookup(dir, &nd->last); + if (!dentry) { + /* + * No cached dentry. Mounted dentries are pinned in the cache, + * so that means that this dentry is probably a symlink or the + * path doesn't actually point to a mounted dentry. + */ + dentry = d_alloc(dir, &nd->last); + if (!dentry) { + error = -ENOMEM; + goto out; + } + dentry = lookup_real(dir->d_inode, dentry, nd->flags); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out; + } + mutex_unlock(&dir->d_inode->i_mutex); + +done: + if (!dentry->d_inode) { + error = -ENOENT; + dput(dentry); + goto out; + } + path->dentry = dentry; + path->mnt = mntget(nd->path.mnt); + if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW)) + return 1; + follow_mount(path); + error = 0; +out: + terminate_walk(nd); + return error; +} + +/** + * path_mountpoint - look up a path to be umounted + * @dfd: directory file descriptor to start walk from + * @name: full pathname to walk + * @flags: lookup flags + * + * Look up the given name, but don't attempt to revalidate the last component. + * Returns 0 and "path" will be valid on success; Returns error otherwise.
+ */ +static int +path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) +{ + struct file *base = NULL; + struct nameidata nd; + int err; + + err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base); + if (unlikely(err)) + return err; + + current->total_link_count = 0; + err = link_path_walk(name, &nd); + if (err) + goto out; + + err = mountpoint_last(&nd, path); + while (err > 0) { + void *cookie; + struct path link = *path; + err = may_follow_link(&link, &nd); + if (unlikely(err)) + break; + nd.flags |= LOOKUP_PARENT; + err = follow_link(&link, &nd, &cookie); + if (err) + break; + err = mountpoint_last(&nd, path); + put_link(&nd, &link, cookie); + } +out: + if (base) + fput(base); + + if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT)) + path_put(&nd.root); + + return err; +} + +static int +filename_mountpoint(int dfd, struct filename *s, struct path *path, + unsigned int flags) +{ + int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); + if (unlikely(error == -ECHILD)) + error = path_mountpoint(dfd, s->name, path, flags); + if (unlikely(error == -ESTALE)) + error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); + if (likely(!error)) + audit_inode(s, path->dentry, 0); + return error; +} + +/** + * user_path_mountpoint_at - lookup a path from userland in order to umount it + * @dfd: directory file descriptor + * @name: pathname from userland + * @flags: lookup flags + * @path: pointer to container to hold result + * + * A umount is a special case for path walking. We're not actually interested + * in the inode in this situation, and ESTALE errors can be a problem. We + * simply want to track down the dentry and vfsmount attached at the mountpoint + * and avoid revalidating the last component. + * + * Returns 0 and populates "path" on success. + */ +int +user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, + struct path *path) +{ + struct filename *s = getname(name); + int error; + if (IS_ERR(s)) + return PTR_ERR(s); + error = filename_mountpoint(dfd, s, path, flags); + putname(s); + return error; +} + +int +kern_path_mountpoint(int dfd, const char *name, struct path *path, + unsigned int flags) +{ + struct filename s = {.name = name}; + return filename_mountpoint(dfd, &s, path, flags); +} +EXPORT_SYMBOL(kern_path_mountpoint); + /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal.
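filename_mountpoint() above encodes the usual VFS fallback ladder: attempt the walk in RCU mode, redo it in ref-walk mode on -ECHILD, and force revalidation with LOOKUP_REVAL on -ESTALE. A minimal, runnable userspace model of just that retry policy (every name in it is invented; only the error-code ladder mirrors the kernel code):

#include <errno.h>
#include <stdio.h>

/* Stand-in for path_mountpoint(); the first attempt fails with
 * -ECHILD to mimic an aborted RCU walk. */
static int fake_walk(const char *name, int mode, int *attempts)
{
	(void)name;
	(void)mode;
	if (++(*attempts) == 1)
		return -ECHILD;
	return 0;
}

static int lookup_with_fallback(const char *name)
{
	int attempts = 0;
	int error = fake_walk(name, 0 /* "RCU" */, &attempts);
	if (error == -ECHILD)
		error = fake_walk(name, 1 /* "ref-walk" */, &attempts);
	if (error == -ESTALE)
		error = fake_walk(name, 2 /* "reval" */, &attempts);
	return error;
}

int main(void)
{
	printf("result=%d\n", lookup_with_fallback("/mnt"));	/* result=0 */
	return 0;
}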
@@ -3327,7 +3539,7 @@ void dentry_unhash(struct dentry *dentry) { shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) __d_drop(dentry); spin_unlock(&dentry->d_lock); } diff --git a/fs/namespace.c b/fs/namespace.c index 7b1ca9ba0b0a..25845d1b300b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -611,6 +611,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) { struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry); struct mountpoint *mp; + int ret; list_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { @@ -626,14 +627,12 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) if (!mp) return ERR_PTR(-ENOMEM); - spin_lock(&dentry->d_lock); - if (d_unlinked(dentry)) { - spin_unlock(&dentry->d_lock); + ret = d_set_mounted(dentry); + if (ret) { kfree(mp); - return ERR_PTR(-ENOENT); + return ERR_PTR(ret); } - dentry->d_flags |= DCACHE_MOUNTED; - spin_unlock(&dentry->d_lock); + mp->m_dentry = dentry; mp->m_count = 1; list_add(&mp->m_hash, chain); @@ -831,6 +830,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + /* Don't allow unprivileged users to reveal what is under a mount */ + if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) + mnt->mnt.mnt_flags |= MNT_LOCKED; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -1318,7 +1321,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); + retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; mnt = real_mount(path.mnt); @@ -1327,6 +1330,8 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; if (!check_mnt(mnt)) goto dput_and_out; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + goto dput_and_out; retval = do_umount(mnt, flags); dput_and_out: @@ -1349,14 +1354,11 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) #endif -static bool mnt_ns_loop(struct path *path) +static bool is_mnt_ns_file(struct dentry *dentry) { - /* Could bind mounting the mount namespace inode cause a - * mount namespace loop? - */ - struct inode *inode = path->dentry->d_inode; + /* Is this a proxy for a mount namespace? */ + struct inode *inode = dentry->d_inode; struct proc_ns *ei; - struct mnt_namespace *mnt_ns; if (!proc_ns_inode(inode)) return false; @@ -1365,7 +1367,19 @@ static bool mnt_ns_loop(struct path *path) if (ei->ns_ops != &mntns_operations) return false; - mnt_ns = ei->ns; + return true; +} + +static bool mnt_ns_loop(struct dentry *dentry) +{ + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? 
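The check completing just after this comment answers the question by creation order: every mount namespace is stamped with an increasing seq, and bind-mounting a namespace proxy file is refused unless the proxied namespace is strictly newer than the binder's, so chains of namespace references can never cycle. A runnable userspace model of that invariant (all types and names invented):

#include <stdbool.h>
#include <stdio.h>

struct mntns { unsigned long long seq; };	/* creation sequence number */

/* Mirrors mnt_ns_loop(): binding is a potential loop unless the
 * proxied namespace was created strictly after the current one. */
static bool ns_loop(const struct mntns *cur, const struct mntns *target)
{
	return cur->seq >= target->seq;
}

int main(void)
{
	struct mntns parent = { .seq = 1 }, child = { .seq = 2 };
	printf("parent binds child: loop=%d\n", ns_loop(&parent, &child)); /* 0 */
	printf("child binds parent: loop=%d\n", ns_loop(&child, &parent)); /* 1 */
	return 0;
}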
+ */ + struct mnt_namespace *mnt_ns; + if (!is_mnt_ns_file(dentry)) + return false; + + mnt_ns = get_proc_ns(dentry->d_inode)->ns; return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } @@ -1374,13 +1388,17 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, { struct mount *res, *p, *q, *r, *parent; - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) + return ERR_PTR(-EINVAL); + + if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); res = q = clone_mnt(mnt, dentry, flag); if (IS_ERR(q)) return q; + q->mnt.mnt_flags &= ~MNT_LOCKED; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; @@ -1390,7 +1408,13 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { + if (!(flag & CL_COPY_UNBINDABLE) && + IS_MNT_UNBINDABLE(s)) { + s = skip_mnt_tree(s); + continue; + } + if (!(flag & CL_COPY_MNT_NS_FILE) && + is_mnt_ns_file(s->mnt.mnt_root)) { s = skip_mnt_tree(s); continue; } @@ -1429,7 +1453,7 @@ struct vfsmount *collect_mounts(struct path *path) CL_COPY_ALL | CL_PRIVATE); namespace_unlock(); if (IS_ERR(tree)) - return NULL; + return ERR_CAST(tree); return &tree->mnt; } @@ -1696,6 +1720,19 @@ static int do_change_type(struct path *path, int flag) return err; } +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return false; +} + /* * do loopback mount. */ @@ -1713,7 +1750,7 @@ static int do_loopback(struct path *path, const char *old_name, return err; err = -EINVAL; - if (mnt_ns_loop(&old_path)) + if (mnt_ns_loop(old_path.dentry)) goto out; mp = lock_mount(path); @@ -1731,8 +1768,11 @@ static int do_loopback(struct path *path, const char *old_name, if (!check_mnt(parent) || !check_mnt(old)) goto out2; + if (!recurse && has_locked_children(old, old_path.dentry)) + goto out2; + if (recurse) - mnt = copy_tree(old, old_path.dentry, 0); + mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); else mnt = clone_mnt(old, old_path.dentry, 0); @@ -1741,6 +1781,8 @@ static int do_loopback(struct path *path, const char *old_name, goto out2; } + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + err = graft_tree(mnt, parent, mp); if (err) { br_write_lock(&vfsmount_lock); @@ -1853,6 +1895,9 @@ static int do_move_mount(struct path *path, const char *old_name) if (!check_mnt(p) || !check_mnt(old)) goto out1; + if (old->mnt.mnt_flags & MNT_LOCKED) + goto out1; + err = -EINVAL; if (old_path.dentry != old_path.mnt->mnt_root) goto out1; @@ -2389,7 +2434,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, namespace_lock(); /* First pass: copy the tree topology */ - copy_flags = CL_COPY_ALL | CL_EXPIRE; + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); @@ -2424,6 +2469,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, } p = next_mnt(p, old); q = next_mnt(q, new); + if (!q) + break; + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(p, old); } namespace_unlock(); @@ -2630,6 +2679,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out4; if (!check_mnt(root_mnt) || 
!check_mnt(new_mnt)) goto out4; + if (new_mnt->mnt.mnt_flags & MNT_LOCKED) + goto out4; error = -ENOENT; if (d_unlinked(new.dentry)) goto out4; @@ -2653,6 +2704,10 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, br_write_lock(&vfsmount_lock); detach_mnt(new_mnt, &parent_path); detach_mnt(root_mnt, &root_parent); + if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { + new_mnt->mnt.mnt_flags |= MNT_LOCKED; + root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; + } /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ @@ -2811,25 +2866,38 @@ bool current_chrooted(void) return chrooted; } -void update_mnt_policy(struct user_namespace *userns) +bool fs_fully_visible(struct file_system_type *type) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt; + bool visible = false; - down_read(&namespace_sem); + if (unlikely(!ns)) + return false; + + namespace_lock(); list_for_each_entry(mnt, &ns->list, mnt_list) { - switch (mnt->mnt.mnt_sb->s_magic) { - case SYSFS_MAGIC: - userns->may_mount_sysfs = true; - break; - case PROC_SUPER_MAGIC: - userns->may_mount_proc = true; - break; + struct mount *child; + if (mnt->mnt.mnt_sb->s_type != type) + continue; + + /* This mount is not fully visible if there are any child mounts + * that cover anything except for empty directories. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + struct inode *inode = child->mnt_mountpoint->d_inode; + if (!S_ISDIR(inode->i_mode)) + goto next; + if (inode->i_nlink != 2) + goto next; } - if (userns->may_mount_sysfs && userns->may_mount_proc) - break; + visible = true; + goto found; + next: ; } - up_read(&namespace_sem); +found: + namespace_unlock(); + return visible; } static void *mntns_get(struct task_struct *task) @@ -2860,8 +2928,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct path root; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_CHROOT) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; if (fs->users != 1) diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index e0bb048e9576..03192a66c143 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -4,9 +4,10 @@ obj-$(CONFIG_NFS_FS) += nfs.o +CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ direct.o pagelist.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o + write.o namespace.o mount_clnt.o nfstrace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o @@ -19,12 +20,14 @@ nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o obj-$(CONFIG_NFS_V4) += nfsv4.o +CFLAGS_nfs4trace.o += -I$(src) nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ - nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o + nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \ + dns_resolve.o nfs4trace.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o -nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o +nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index e6ebc4c38c81..ae2e87b95453 
100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -15,6 +15,7 @@ #include "internal.h" #include "pnfs.h" #include "nfs4session.h" +#include "nfs4trace.h" #ifdef NFS_DEBUG #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -93,6 +94,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, default: res = htonl(NFS4ERR_RESOURCE); } + trace_nfs4_recall_delegation(inode, -ntohl(res)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); @@ -301,14 +303,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) { struct nfs4_slot *slot; - dprintk("%s enter. slotid %d seqid %d\n", + dprintk("%s enter. slotid %u seqid %u\n", __func__, args->csa_slotid, args->csa_sequenceid); if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) return htonl(NFS4ERR_BADSLOT); slot = tbl->slots + args->csa_slotid; - dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); + dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr); /* Normal */ if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { @@ -318,7 +320,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { - dprintk("%s seqid %d is a replay\n", + dprintk("%s seqid %u is a replay\n", __func__, args->csa_sequenceid); /* Signal process_op to set this error on next op */ if (args->csa_cachethis == 0) @@ -462,6 +464,7 @@ out: } else res->csr_status = status; + trace_nfs4_cb_sequence(args, res, status); dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, ntohl(status), ntohl(res->csr_status)); return status; @@ -518,7 +521,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, if (!cps->clp) /* set in cb_sequence */ goto out; - dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %d\n", + dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %u\n", rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), args->crsa_target_highest_slotid); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 340b1eff0267..2dceee4db076 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -501,8 +501,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new->cl_flags = cl_init->init_flags; - return rpc_ops->init_client(new, timeparms, ip_addr, - authflavour); + return rpc_ops->init_client(new, timeparms, ip_addr); } spin_unlock(&nn->nfs_client_lock); @@ -694,13 +693,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); * @clp: nfs_client to initialise * @timeparms: timeout parameters for underlying RPC transport * @ip_addr: IP presentation address (not used) - * @authflavor: authentication flavor for underlying RPC transport * * Returns pointer to an NFS client, or an ERR_PTR value. 
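A few hunks back, validate_seqid() sorts backchannel requests into three cases: slot->seq_nr + 1 is the next request in order, an equal seqid is a retransmitted replay whose cached reply should be resent, and anything else is misordered. A runnable model of that classification (names and return constants are stand-ins; the kernel returns NFS4ERR_* status codes):

#include <stdio.h>

enum { SEQ_OK, SEQ_REPLAY, SEQ_MISORDERED };

static int classify_seqid(unsigned int slot_seq_nr, unsigned int csa_sequenceid)
{
	if (csa_sequenceid == slot_seq_nr + 1)
		return SEQ_OK;		/* normal: the next request */
	if (csa_sequenceid == slot_seq_nr)
		return SEQ_REPLAY;	/* retransmission of the last one */
	return SEQ_MISORDERED;
}

int main(void)
{
	printf("%d %d %d\n",
	       classify_seqid(7, 8),	/* 0: SEQ_OK */
	       classify_seqid(7, 7),	/* 1: SEQ_REPLAY */
	       classify_seqid(7, 9));	/* 2: SEQ_MISORDERED */
	return 0;
}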
*/ struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour) + const char *ip_addr) { int error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 7ec4814e298d..ef792f29f831 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -20,6 +20,7 @@ #include "nfs4_fs.h" #include "delegation.h" #include "internal.h" +#include "nfs4trace.h" static void nfs_free_delegation(struct nfs_delegation *delegation) { @@ -160,6 +161,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, spin_unlock(&delegation->lock); put_rpccred(oldcred); rcu_read_unlock(); + trace_nfs4_reclaim_delegation(inode, res->delegation_type); } else { /* We appear to have raced with a delegation return. */ spin_unlock(&delegation->lock); @@ -344,6 +346,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct spin_lock(&inode->i_lock); nfsi->cache_validity |= NFS_INO_REVAL_FORCED; spin_unlock(&inode->i_lock); + trace_nfs4_set_delegation(inode, res->delegation_type); out: spin_unlock(&clp->cl_lock); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e474ca2b2bfe..e79bc6ce828e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -43,6 +43,8 @@ #include "internal.h" #include "fscache.h" +#include "nfstrace.h" + /* #define NFS_DEBUG_VERBOSE 1 */ static int nfs_opendir(struct inode *, struct file *); @@ -1100,7 +1102,9 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (IS_ERR(label)) goto out_error; + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); + trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); if (error) goto out_bad; if (nfs_compare_fh(NFS_FH(inode), fhandle)) @@ -1135,14 +1139,13 @@ out_zap_parent: if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ nfs_zap_caches(inode); - /* If we have submounts, don't unhash ! */ - if (have_submounts(dentry)) - goto out_valid; if (dentry->d_flags & DCACHE_DISCONNECTED) goto out_valid; - shrink_dcache_parent(dentry); } - d_drop(dentry); + /* If we have submounts, don't unhash ! 
*/ + if (check_submounts_and_drop(dentry) != 0) + goto out_valid; + dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", __func__, dentry->d_parent->d_name.name, @@ -1313,6 +1316,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ + trace_nfs_lookup_enter(dir, dentry, flags); nfs_block_sillyrename(parent); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error == -ENOENT) @@ -1339,6 +1343,7 @@ no_entry: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unblock_sillyrename: nfs_unblock_sillyrename(parent); + trace_nfs_lookup_exit(dir, dentry, flags, error); nfs4_label_free(label); out: nfs_free_fattr(fattr); @@ -1393,7 +1398,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx, nfs_file_set_open_context(file, ctx); out: - put_nfs_open_context(ctx); return err; } @@ -1405,6 +1409,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; + unsigned int lookup_flags = 0; int err; /* Expect a negative dentry */ @@ -1413,6 +1418,10 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + err = nfs_check_flags(open_flags); + if (err) + return err; + /* NFS only supports OPEN on regular files */ if ((open_flags & O_DIRECTORY)) { if (!d_unhashed(dentry)) { @@ -1423,6 +1432,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, */ return -ENOENT; } + lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY; goto no_open; } @@ -1443,12 +1453,14 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, if (IS_ERR(ctx)) goto out; + trace_nfs_atomic_open_enter(dir, ctx, open_flags); nfs_block_sillyrename(dentry->d_parent); inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); nfs_unblock_sillyrename(dentry->d_parent); if (IS_ERR(inode)) { - put_nfs_open_context(ctx); err = PTR_ERR(inode); + trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); + put_nfs_open_context(ctx); switch (err) { case -ENOENT: d_drop(dentry); @@ -1469,11 +1481,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, } err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened); + trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); + put_nfs_open_context(ctx); out: return err; no_open: - res = nfs_lookup(dir, dentry, 0); + res = nfs_lookup(dir, dentry, lookup_flags); err = PTR_ERR(res); if (IS_ERR(res)) goto out; @@ -1597,7 +1611,9 @@ int nfs_create(struct inode *dir, struct dentry *dentry, attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; + trace_nfs_create_enter(dir, dentry, open_flags); error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); + trace_nfs_create_exit(dir, dentry, open_flags, error); if (error != 0) goto out_err; return 0; @@ -1625,7 +1641,9 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; + trace_nfs_mknod_enter(dir, dentry); status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); + trace_nfs_mknod_exit(dir, dentry, status); if (status != 0) goto out_err; return 0; @@ -1649,7 +1667,9 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) attr.ia_valid = ATTR_MODE; attr.ia_mode = mode | S_IFDIR; + trace_nfs_mkdir_enter(dir, dentry); error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + 
trace_nfs_mkdir_exit(dir, dentry, error); if (error != 0) goto out_err; return 0; @@ -1672,12 +1692,21 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); - /* Ensure the VFS deletes this inode */ - if (error == 0 && dentry->d_inode != NULL) - clear_nlink(dentry->d_inode); - else if (error == -ENOENT) - nfs_dentry_handle_enoent(dentry); + trace_nfs_rmdir_enter(dir, dentry); + if (dentry->d_inode) { + nfs_wait_on_sillyrename(dentry); + error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + /* Ensure the VFS deletes this inode */ + switch (error) { + case 0: + clear_nlink(dentry->d_inode); + break; + case -ENOENT: + nfs_dentry_handle_enoent(dentry); + } + } else + error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + trace_nfs_rmdir_exit(dir, dentry, error); return error; } @@ -1705,6 +1734,7 @@ static int nfs_safe_remove(struct dentry *dentry) goto out; } + trace_nfs_remove_enter(dir, dentry); if (inode != NULL) { NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); @@ -1714,6 +1744,7 @@ static int nfs_safe_remove(struct dentry *dentry) error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); + trace_nfs_remove_exit(dir, dentry, error); out: return error; } @@ -1731,13 +1762,14 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + trace_nfs_unlink_enter(dir, dentry); spin_lock(&dentry->d_lock); if (d_count(dentry) > 1) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); error = nfs_sillyrename(dir, dentry); - return error; + goto out; } if (!d_unhashed(dentry)) { __d_drop(dentry); @@ -1749,6 +1781,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } else if (need_rehash) d_rehash(dentry); +out: + trace_nfs_unlink_exit(dir, dentry, error); return error; } EXPORT_SYMBOL_GPL(nfs_unlink); @@ -1795,7 +1829,9 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); kunmap_atomic(kaddr); + trace_nfs_symlink_enter(dir, dentry); error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); + trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, @@ -1830,6 +1866,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry->d_parent->d_name.name, old_dentry->d_name.name, dentry->d_parent->d_name.name, dentry->d_name.name); + trace_nfs_link_enter(inode, dir, dentry); NFS_PROTO(inode)->return_delegation(inode); d_drop(dentry); @@ -1838,6 +1875,7 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) ihold(inode); d_add(dentry, inode); } + trace_nfs_link_exit(inode, dir, dentry, error); return error; } EXPORT_SYMBOL_GPL(nfs_link); @@ -1879,6 +1917,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_parent->d_name.name, new_dentry->d_name.name, d_count(new_dentry)); + trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry); /* * For non-directories, check whether the target is busy and if so, * make a copy of the dentry and then do a silly-rename. 
If the @@ -1925,6 +1964,8 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, out: if (rehash) d_rehash(rehash); + trace_nfs_rename_exit(old_dir, old_dentry, + new_dir, new_dentry, error); if (!error) { if (new_inode != NULL) nfs_drop_nlink(new_inode); @@ -2174,9 +2215,11 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) struct nfs_access_entry cache; int status; + trace_nfs_access_enter(inode); + status = nfs_access_get_cached(inode, cred, &cache); if (status == 0) - goto out; + goto out_cached; /* Be clever: ask server to check for all possible rights */ cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; @@ -2189,13 +2232,15 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) if (!S_ISDIR(inode->i_mode)) set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } - return status; + goto out; } nfs_access_add_cache(inode, &cache); +out_cached: + if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) + status = -EACCES; out: - if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) - return 0; - return -EACCES; + trace_nfs_access_exit(inode, status); + return status; } static int nfs_open_permission_mask(int openflags) @@ -2241,11 +2286,6 @@ int nfs_permission(struct inode *inode, int mask) case S_IFLNK: goto out; case S_IFREG: - /* NFSv4 has atomic_open... */ - if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) - && (mask & MAY_OPEN) - && !(mask & MAY_EXEC)) - goto out; break; case S_IFDIR: /* diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 94e94bd11aae..1e6bfdbc1aff 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -37,6 +37,8 @@ #include "iostat.h" #include "fscache.h" +#include "nfstrace.h" + #define NFSDBG_FACILITY NFSDBG_FILE static const struct vm_operations_struct nfs_file_vm_ops; @@ -294,6 +296,8 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct inode *inode = file_inode(file); + trace_nfs_fsync_enter(inode); + do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -310,6 +314,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) end = LLONG_MAX; } while (ret == -EAGAIN); + trace_nfs_fsync_exit(inode, ret); return ret; } @@ -406,6 +411,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct nfs_open_context *ctx = nfs_file_open_context(file); int status; dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", @@ -441,6 +447,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, if (status < 0) return status; NFS_I(mapping->host)->write_io += copied; + + if (nfs_ctx_key_to_expire(ctx)) { + status = nfs_wb_all(mapping->host); + if (status < 0) + return status; + } + return copied; } @@ -637,7 +650,8 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode) if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) return 1; ctx = nfs_file_open_context(filp); - if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || + nfs_ctx_key_to_expire(ctx)) return 1; return 0; } @@ -651,6 +665,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, ssize_t result; size_t count = iov_length(iov, nr_segs); + result = nfs_key_timeout_notify(iocb->ki_filp, inode); + if (result) + return result; + if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); diff 
--git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index c2c4163d5683..567983d2c0eb 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -49,6 +49,7 @@ #include "internal.h" #include "netns.h" +#include "nfs4trace.h" #define NFS_UINT_MAXLEN 11 @@ -63,6 +64,7 @@ struct idmap_legacy_upcalldata { }; struct idmap { + struct rpc_pipe_dir_object idmap_pdo; struct rpc_pipe *idmap_pipe; struct idmap_legacy_upcalldata *idmap_upcall_data; struct mutex idmap_mutex; @@ -310,7 +312,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, if (ret < 0) goto out_up; - payload = rcu_dereference(rkey->payload.data); + payload = rcu_dereference(rkey->payload.rcudata); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; @@ -401,16 +403,23 @@ static struct key_type key_type_id_resolver_legacy = { .request_key = nfs_idmap_legacy_upcall, }; -static void __nfs_idmap_unregister(struct rpc_pipe *pipe) +static void nfs_idmap_pipe_destroy(struct dentry *dir, + struct rpc_pipe_dir_object *pdo) { - if (pipe->dentry) + struct idmap *idmap = pdo->pdo_data; + struct rpc_pipe *pipe = idmap->idmap_pipe; + + if (pipe->dentry) { rpc_unlink(pipe->dentry); + pipe->dentry = NULL; + } } -static int __nfs_idmap_register(struct dentry *dir, - struct idmap *idmap, - struct rpc_pipe *pipe) +static int nfs_idmap_pipe_create(struct dentry *dir, + struct rpc_pipe_dir_object *pdo) { + struct idmap *idmap = pdo->pdo_data; + struct rpc_pipe *pipe = idmap->idmap_pipe; struct dentry *dentry; dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); @@ -420,36 +429,10 @@ static int __nfs_idmap_register(struct dentry *dir, return 0; } -static void nfs_idmap_unregister(struct nfs_client *clp, - struct rpc_pipe *pipe) -{ - struct net *net = clp->cl_net; - struct super_block *pipefs_sb; - - pipefs_sb = rpc_get_sb_net(net); - if (pipefs_sb) { - __nfs_idmap_unregister(pipe); - rpc_put_sb_net(net); - } -} - -static int nfs_idmap_register(struct nfs_client *clp, - struct idmap *idmap, - struct rpc_pipe *pipe) -{ - struct net *net = clp->cl_net; - struct super_block *pipefs_sb; - int err = 0; - - pipefs_sb = rpc_get_sb_net(net); - if (pipefs_sb) { - if (clp->cl_rpcclient->cl_dentry) - err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, - idmap, pipe); - rpc_put_sb_net(net); - } - return err; -} +static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { + .create = nfs_idmap_pipe_create, + .destroy = nfs_idmap_pipe_destroy, +}; int nfs_idmap_new(struct nfs_client *clp) @@ -462,23 +445,31 @@ nfs_idmap_new(struct nfs_client *clp) if (idmap == NULL) return -ENOMEM; + rpc_init_pipe_dir_object(&idmap->idmap_pdo, + &nfs_idmap_pipe_dir_object_ops, + idmap); + pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); if (IS_ERR(pipe)) { error = PTR_ERR(pipe); - kfree(idmap); - return error; - } - error = nfs_idmap_register(clp, idmap, pipe); - if (error) { - rpc_destroy_pipe_data(pipe); - kfree(idmap); - return error; + goto err; } idmap->idmap_pipe = pipe; mutex_init(&idmap->idmap_mutex); + error = rpc_add_pipe_dir_object(clp->cl_net, + &clp->cl_rpcclient->cl_pipedir_objects, + &idmap->idmap_pdo); + if (error) + goto err_destroy_pipe; + clp->cl_idmap = idmap; return 0; +err_destroy_pipe: + rpc_destroy_pipe_data(idmap->idmap_pipe); +err: + kfree(idmap); + return error; } void @@ -488,130 +479,26 @@ nfs_idmap_delete(struct nfs_client *clp) if (!idmap) return; - nfs_idmap_unregister(clp, idmap->idmap_pipe); - rpc_destroy_pipe_data(idmap->idmap_pipe); clp->cl_idmap = NULL; + rpc_remove_pipe_dir_object(clp->cl_net, + 
&clp->cl_rpcclient->cl_pipedir_objects, + &idmap->idmap_pdo); + rpc_destroy_pipe_data(idmap->idmap_pipe); kfree(idmap); } -static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event, - struct super_block *sb) -{ - int err = 0; - - switch (event) { - case RPC_PIPEFS_MOUNT: - err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, - clp->cl_idmap, - clp->cl_idmap->idmap_pipe); - break; - case RPC_PIPEFS_UMOUNT: - if (clp->cl_idmap->idmap_pipe) { - struct dentry *parent; - - parent = clp->cl_idmap->idmap_pipe->dentry->d_parent; - __nfs_idmap_unregister(clp->cl_idmap->idmap_pipe); - /* - * Note: This is a dirty hack. SUNRPC hook has been - * called already but simple_rmdir() call for the - * directory returned with error because of idmap pipe - * inside. Thus now we have to remove this directory - * here. - */ - if (rpc_rmdir(parent)) - printk(KERN_ERR "NFS: %s: failed to remove " - "clnt dir!\n", __func__); - } - break; - default: - printk(KERN_ERR "NFS: %s: unknown event: %ld\n", __func__, - event); - return -ENOTSUPP; - } - return err; -} - -static struct nfs_client *nfs_get_client_for_event(struct net *net, int event) -{ - struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *cl_dentry; - struct nfs_client *clp; - int err; - -restart: - spin_lock(&nn->nfs_client_lock); - list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { - /* Wait for initialisation to finish */ - if (clp->cl_cons_state == NFS_CS_INITING) { - atomic_inc(&clp->cl_count); - spin_unlock(&nn->nfs_client_lock); - err = nfs_wait_client_init_complete(clp); - nfs_put_client(clp); - if (err) - return NULL; - goto restart; - } - /* Skip nfs_clients that failed to initialise */ - if (clp->cl_cons_state < 0) - continue; - smp_rmb(); - if (clp->rpc_ops != &nfs_v4_clientops) - continue; - cl_dentry = clp->cl_idmap->idmap_pipe->dentry; - if (((event == RPC_PIPEFS_MOUNT) && cl_dentry) || - ((event == RPC_PIPEFS_UMOUNT) && !cl_dentry)) - continue; - atomic_inc(&clp->cl_count); - spin_unlock(&nn->nfs_client_lock); - return clp; - } - spin_unlock(&nn->nfs_client_lock); - return NULL; -} - -static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, - void *ptr) -{ - struct super_block *sb = ptr; - struct nfs_client *clp; - int error = 0; - - if (!try_module_get(THIS_MODULE)) - return 0; - - while ((clp = nfs_get_client_for_event(sb->s_fs_info, event))) { - error = __rpc_pipefs_event(clp, event, sb); - nfs_put_client(clp); - if (error) - break; - } - module_put(THIS_MODULE); - return error; -} - -#define PIPEFS_NFS_PRIO 1 - -static struct notifier_block nfs_idmap_block = { - .notifier_call = rpc_pipefs_event, - .priority = SUNRPC_PIPEFS_NFS_PRIO, -}; - int nfs_idmap_init(void) { int ret; ret = nfs_idmap_init_keyring(); if (ret != 0) goto out; - ret = rpc_pipefs_notifier_register(&nfs_idmap_block); - if (ret != 0) - nfs_idmap_quit_keyring(); out: return ret; } void nfs_idmap_quit(void) { - rpc_pipefs_notifier_unregister(&nfs_idmap_block); nfs_idmap_quit_keyring(); } @@ -849,6 +736,7 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_ if (!uid_valid(*uid)) ret = -ERANGE; } + trace_nfs4_map_name_to_uid(name, namelen, id, ret); return ret; } @@ -865,6 +753,7 @@ int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size if (!gid_valid(*gid)) ret = -ERANGE; } + trace_nfs4_map_group_to_gid(name, namelen, id, ret); return ret; } @@ -879,6 +768,7 @@ int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, ret = 
nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); if (ret < 0) ret = nfs_map_numeric_to_string(id, buf, buflen); + trace_nfs4_map_uid_to_name(buf, ret, id, ret); return ret; } int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) @@ -892,5 +782,6 @@ int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); if (ret < 0) ret = nfs_map_numeric_to_string(id, buf, buflen); + trace_nfs4_map_gid_to_group(buf, ret, id, ret); return ret; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index af6e806044d7..87e797640828 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -38,7 +38,6 @@ #include <linux/slab.h> #include <linux/compat.h> #include <linux/freezer.h> -#include <linux/crc32.h> #include <asm/uaccess.h> @@ -52,6 +51,8 @@ #include "nfs.h" #include "netns.h" +#include "nfstrace.h" + #define NFSDBG_FACILITY NFSDBG_VFS #define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 @@ -463,7 +464,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); - nfs_setsecurity(inode, fattr, label); dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -504,6 +504,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; + trace_nfs_setattr_enter(inode); + /* Write all dirty data */ if (S_ISREG(inode->i_mode)) { nfs_inode_dio_wait(inode); @@ -523,6 +525,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) error = nfs_refresh_inode(inode, fattr); nfs_free_fattr(fattr); out: + trace_nfs_setattr_exit(inode, error); return error; } EXPORT_SYMBOL_GPL(nfs_setattr); @@ -592,6 +595,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err; + trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. 
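The mapping helpers traced here share one fallback: when the idmapper yields no name, the id goes on the wire as a plain decimal string, which NFSv4 accepts as a numeric identity. A runnable model of that fallback (fake_idmap_lookup() is invented; the real path is nfs_idmap_lookup_name() followed by nfs_map_numeric_to_string()):

#include <stdio.h>

static int fake_idmap_lookup(unsigned int id, char *buf, size_t buflen)
{
	(void)id; (void)buf; (void)buflen;
	return -1;	/* pretend the idmapper has no mapping */
}

static int map_id_to_name(unsigned int id, char *buf, size_t buflen)
{
	int ret = fake_idmap_lookup(id, buf, buflen);
	if (ret < 0)	/* fall back to the NFSv4 numeric-id string */
		ret = snprintf(buf, buflen, "%u", id);
	return ret;
}

int main(void)
{
	char buf[16];
	int len = map_id_to_name(1000, buf, sizeof(buf));
	printf("%.*s\n", len, buf);	/* prints "1000" */
	return 0;
}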
*/ if (S_ISREG(inode->i_mode)) { nfs_inode_dio_wait(inode); @@ -622,6 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); } out: + trace_nfs_getattr_exit(inode, err); return err; } EXPORT_SYMBOL_GPL(nfs_getattr); @@ -876,6 +881,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + trace_nfs_revalidate_inode_enter(inode); + if (is_bad_inode(inode)) goto out; if (NFS_STALE(inode)) @@ -926,6 +933,7 @@ err_out: nfs4_label_free(label); out: nfs_free_fattr(fattr); + trace_nfs_revalidate_inode_exit(inode, status); return status; } @@ -963,9 +971,15 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); - + int ret; + if (mapping->nrpages != 0) { - int ret = invalidate_inode_pages2(mapping); + if (S_ISREG(inode->i_mode)) { + ret = nfs_sync_mapping(mapping); + if (ret < 0) + return ret; + } + ret = invalidate_inode_pages2(mapping); if (ret < 0) return ret; } @@ -976,6 +990,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map spin_unlock(&inode->i_lock); nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); + dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); return 0; @@ -1009,8 +1024,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) if (ret < 0) goto out; } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); + trace_nfs_invalidate_mapping_exit(inode, ret); + } + out: return ret; } @@ -1190,7 +1209,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) { /* wireshark uses 32-bit AUTODIN crc and does a bitwise * not on the result */ - return ~crc32(0xFFFFFFFF, &fh->data[0], fh->size); + return nfs_fhandle_hash(fh); } /* @@ -1269,9 +1288,17 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { + int ret; + + trace_nfs_refresh_inode_enter(inode); + if (nfs_inode_attrs_need_update(inode, fattr)) - return nfs_update_inode(inode, fattr); - return nfs_check_inode_attributes(inode, fattr); + ret = nfs_update_inode(inode, fattr); + else + ret = nfs_check_inode_attributes(inode, fattr); + + trace_nfs_refresh_inode_exit(inode, ret); + return ret; } /** diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 3c8373f90ab3..d388302c005f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -5,6 +5,7 @@ #include "nfs4_fs.h" #include <linux/mount.h> #include <linux/security.h> +#include <linux/crc32.h> #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) @@ -185,6 +186,8 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans); +extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, + struct inode *); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -267,7 +270,7 @@ extern struct rpc_procinfo nfs4_procedures[]; void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 
extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour); + const char *ip_addr); /* dir.c */ extern int nfs_access_cache_shrinker(struct shrinker *shrink, @@ -355,7 +358,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, const char *); -extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); +extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool); #endif struct nfs_pgio_completion_ops; @@ -430,6 +433,8 @@ void nfs_request_remove_commit_list(struct nfs_page *req, void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); +int nfs_key_timeout_notify(struct file *filp, struct inode *inode); +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, @@ -451,8 +456,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); extern void __nfs4_read_done_cb(struct nfs_read_data *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, - const char *ip_addr, - rpc_authflavor_t authflavour); + const char *ip_addr); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); @@ -575,3 +579,22 @@ u64 nfs_timespec_to_change_attr(const struct timespec *ts) { return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } + +#ifdef CONFIG_CRC32 +/** + * nfs_fhandle_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". 
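The helper documented in this comment reproduces the filehandle hash wireshark displays, via the kernel's crc32_le(). The same value should be obtainable in userspace from zlib, whose crc32() performs the pre- and post-inversion that ~crc32_le(0xFFFFFFFF, ...) spells out by hand; that equivalence is an assumption of mine and worth verifying against a capture (build with -lz; the sample filehandle bytes are made up):

#include <stdint.h>
#include <stdio.h>
#include <zlib.h>

int main(void)
{
	const unsigned char fh[] = { 0x01, 0x00, 0x07, 0x02 };
	/* zlib's crc32(0, ...) == ~crc32_le(0xFFFFFFFF, ...) (assumed) */
	uint32_t hash = (uint32_t)crc32(0L, fh, sizeof(fh));
	printf("fh_crc=0x%08x\n", hash);
	return 0;
}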
+ */ +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ + return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); +} +#else +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ + return 0; +} +#endif diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index f5c84c3efbca..90cb10d7b693 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -336,8 +336,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.createmode = NFS3_CREATE_UNCHECKED; if (flags & O_EXCL) { data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; - data->arg.create.verifier[0] = jiffies; - data->arg.create.verifier[1] = current->pid; + data->arg.create.verifier[0] = cpu_to_be32(jiffies); + data->arg.create.verifier[1] = cpu_to_be32(current->pid); } sattr->ia_mode &= ~current_umask(); @@ -826,9 +826,10 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; } -static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { rpc_call_start(task); + return 0; } static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) @@ -847,9 +848,10 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } -static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { rpc_call_start(task); + return 0; } static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ee81e354bce7..f520a1113b38 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -38,17 +38,15 @@ struct nfs4_minor_version_ops { u32 minor_version; unsigned init_caps; - int (*call_sync)(struct rpc_clnt *clnt, - struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res); + int (*init_client)(struct nfs_client *); + void (*shutdown_client)(struct nfs_client *); bool (*match_stateid)(const nfs4_stateid *, const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); int (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); + const struct rpc_call_ops *call_sync_ops; const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; @@ -135,6 +133,7 @@ struct nfs4_lock_state { struct list_head ls_locks; /* Other lock stateids */ struct nfs4_state * ls_state; /* Pointer to open state */ #define NFS_LOCK_INITIALIZED 0 +#define NFS_LOCK_LOST 1 unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; @@ -193,7 +192,6 @@ struct nfs4_state_recovery_ops { int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); int (*recover_lock)(struct nfs4_state *, struct file_lock *); int (*establish_clid)(struct nfs_client *, struct rpc_cred *); - struct rpc_cred * (*get_clid_cred)(struct nfs_client *); int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *); int (*detect_trunking)(struct nfs_client *, struct nfs_client **, struct rpc_cred *); @@ -223,7 +221,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, /* nfs4proc.c */ extern int 
nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); -extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred); extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); extern int nfs4_destroy_clientid(struct nfs_client *clp); @@ -248,9 +246,6 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser return server->nfs_client->cl_session; } -extern int nfs4_setup_sequence(const struct nfs_server *server, - struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - struct rpc_task *task); extern int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); @@ -273,18 +268,63 @@ is_ds_client(struct nfs_client *clp) { return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; } -#else /* CONFIG_NFS_v4_1 */ -static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) + +static inline bool +_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, + struct rpc_clnt **clntp, struct rpc_message *msg) { - return NULL; + struct rpc_cred *newcred = NULL; + rpc_authflavor_t flavor; + + if (test_bit(sp4_mode, &clp->cl_sp4_flags)) { + spin_lock(&clp->cl_lock); + if (clp->cl_machine_cred != NULL) + newcred = get_rpccred(clp->cl_machine_cred); + spin_unlock(&clp->cl_lock); + if (msg->rpc_cred) + put_rpccred(msg->rpc_cred); + msg->rpc_cred = newcred; + + flavor = clp->cl_rpcclient->cl_auth->au_flavor; + WARN_ON(flavor != RPC_AUTH_GSS_KRB5I && + flavor != RPC_AUTH_GSS_KRB5P); + *clntp = clp->cl_rpcclient; + + return true; + } + return false; } -static inline int nfs4_setup_sequence(const struct nfs_server *server, - struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - struct rpc_task *task) +/* + * Function responsible for determining if an rpc_message should use the + * machine cred under SP4_MACH_CRED and if so switching the credential and + * authflavor (using the nfs_client's rpc_clnt which will be krb5i/p). + * Should be called before rpc_call_sync/rpc_call_async. + */ +static inline void +nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, + struct rpc_clnt **clntp, struct rpc_message *msg) { - rpc_call_start(task); - return 0; + _nfs4_state_protect(clp, sp4_mode, clntp, msg); +} + +/* + * Special wrapper to nfs4_state_protect for write. + * If WRITE can use machine cred but COMMIT cannot, make sure all writes + * that use machine cred use NFS_FILE_SYNC. 
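Together with _nfs4_state_protect() above, these wrappers let an operation opt in with one call before issuing the RPC. A condensed kernel-style sketch of a call site (example_commit_rpc() is invented; nfs4_state_protect(), NFS_SP4_MACH_CRED_COMMIT and rpc_call_sync() are the real names; not buildable standalone):

static int example_commit_rpc(struct nfs_server *server,
			      struct rpc_message *msg)
{
	struct rpc_clnt *clnt = server->client;

	/* Swaps in the machine cred and the krb5i/p rpc_clnt iff the
	 * server granted SP4_MACH_CRED for COMMIT; otherwise a no-op. */
	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_COMMIT,
			   &clnt, msg);
	return rpc_call_sync(clnt, msg, 0);
}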
+ */ +static inline void +nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, + struct rpc_message *msg, struct nfs_write_data *wdata) +{ + if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && + !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) + wdata->args.stable = NFS_FILE_SYNC; +} +#else /* CONFIG_NFS_v4_1 */ +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return NULL; } static inline bool @@ -298,6 +338,18 @@ is_ds_client(struct nfs_client *clp) { return false; } + +static inline void +nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags, + struct rpc_clnt **clntp, struct rpc_message *msg) +{ +} + +static inline void +nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, + struct rpc_message *msg, struct nfs_write_data *wdata) +{ +} #endif /* CONFIG_NFS_V4_1 */ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; @@ -308,6 +360,10 @@ extern const u32 nfs4_pathconf_bitmap[3]; extern const u32 nfs4_fsinfo_bitmap[3]; extern const u32 nfs4_fs_locations_bitmap[3]; +void nfs40_shutdown_client(struct nfs_client *); +void nfs41_shutdown_client(struct nfs_client *); +int nfs40_init_client(struct nfs_client *); +int nfs41_init_client(struct nfs_client *); void nfs4_free_client(struct nfs_client *); struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); @@ -319,7 +375,7 @@ extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); /* nfs4state.c */ -struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp); struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); int nfs4_discover_server_trunking(struct nfs_client *clp, @@ -327,7 +383,6 @@ int nfs4_discover_server_trunking(struct nfs_client *clp, int nfs40_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); #if defined(CONFIG_NFS_V4_1) -struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); @@ -382,6 +437,7 @@ struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct extern bool nfs4_disable_idmapping; extern unsigned short max_session_slots; extern unsigned short send_implementation_id; +extern bool recover_lost_locks; #define NFS4_CLIENT_ID_UNIQ_LEN (64) extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; @@ -429,6 +485,8 @@ static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) #define nfs4_close_state(a, b) do { } while (0) #define nfs4_close_sync(a, b) do { } while (0) +#define nfs4_state_protect(a, b, c, d) do { } while (0) +#define nfs4_state_protect_write(a, b, c, d) do { } while (0) #endif /* CONFIG_NFS_V4 */ #endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 90dce91dd5b5..a860ab566d6e 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -41,19 +41,138 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) } #ifdef CONFIG_NFS_V4_1 -static void nfs4_shutdown_session(struct nfs_client *clp) +/** + * Per auth flavor data server rpc clients + */ +struct nfs4_ds_server { + struct list_head list; /* ds_clp->cl_ds_clients */ + struct rpc_clnt 
*rpc_clnt; +}; + +/** + * Common lookup case for DS I/O + */ +static struct nfs4_ds_server * +nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor) +{ + struct nfs4_ds_server *dss; + + rcu_read_lock(); + list_for_each_entry_rcu(dss, &ds_clp->cl_ds_clients, list) { + if (dss->rpc_clnt->cl_auth->au_flavor != flavor) + continue; + goto out; + } + dss = NULL; +out: + rcu_read_unlock(); + return dss; +} + +static struct nfs4_ds_server * +nfs4_add_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor, + struct nfs4_ds_server *new) +{ + struct nfs4_ds_server *dss; + + spin_lock(&ds_clp->cl_lock); + list_for_each_entry(dss, &ds_clp->cl_ds_clients, list) { + if (dss->rpc_clnt->cl_auth->au_flavor != flavor) + continue; + goto out; + } + if (new) + list_add_rcu(&new->list, &ds_clp->cl_ds_clients); + dss = new; +out: + spin_unlock(&ds_clp->cl_lock); /* need some lock to protect list */ + return dss; +} + +static struct nfs4_ds_server * +nfs4_alloc_ds_server(struct nfs_client *ds_clp, rpc_authflavor_t flavor) +{ + struct nfs4_ds_server *dss; + + dss = kmalloc(sizeof(*dss), GFP_NOFS); + if (dss == NULL) + return ERR_PTR(-ENOMEM); + + dss->rpc_clnt = rpc_clone_client_set_auth(ds_clp->cl_rpcclient, flavor); + if (IS_ERR(dss->rpc_clnt)) { + int err = PTR_ERR(dss->rpc_clnt); + kfree(dss); + return ERR_PTR(err); + } + INIT_LIST_HEAD(&dss->list); + + return dss; +} + +static void +nfs4_free_ds_server(struct nfs4_ds_server *dss) +{ + rpc_release_client(dss->rpc_clnt); + kfree(dss); +} + +/** +* Find or create a DS rpc client with the MDS server rpc client auth flavor +* in the nfs_client cl_ds_clients list. +*/ +struct rpc_clnt * +nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode) +{ + struct nfs4_ds_server *dss, *new; + rpc_authflavor_t flavor = NFS_SERVER(inode)->client->cl_auth->au_flavor; + + dss = nfs4_find_ds_client(ds_clp, flavor); + if (dss != NULL) + goto out; + new = nfs4_alloc_ds_server(ds_clp, flavor); + if (IS_ERR(new)) + return ERR_CAST(new); + dss = nfs4_add_ds_client(ds_clp, flavor, new); + if (dss != new) + nfs4_free_ds_server(new); +out: + return dss->rpc_clnt; +} +EXPORT_SYMBOL_GPL(nfs4_find_or_create_ds_client); + +static void +nfs4_shutdown_ds_clients(struct nfs_client *clp) +{ + struct nfs4_ds_server *dss; + LIST_HEAD(shutdown_list); + + while (!list_empty(&clp->cl_ds_clients)) { + dss = list_entry(clp->cl_ds_clients.next, + struct nfs4_ds_server, list); + list_del(&dss->list); + rpc_shutdown_client(dss->rpc_clnt); + kfree(dss); + } +} + +void nfs41_shutdown_client(struct nfs_client *clp) { if (nfs4_has_session(clp)) { + nfs4_shutdown_ds_clients(clp); nfs4_destroy_session(clp->cl_session); nfs4_destroy_clientid(clp); } } -#else /* CONFIG_NFS_V4_1 */ -static void nfs4_shutdown_session(struct nfs_client *clp) +#endif /* CONFIG_NFS_V4_1 */ + +void nfs40_shutdown_client(struct nfs_client *clp) { + if (clp->cl_slot_tbl) { + nfs4_release_slot_table(clp->cl_slot_tbl); + kfree(clp->cl_slot_tbl); + } } -#endif /* CONFIG_NFS_V4_1 */ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) { @@ -73,6 +192,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); + INIT_LIST_HEAD(&clp->cl_ds_clients); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; @@ -97,7 +217,7 @@ static void nfs4_shutdown_client(struct nfs_client
*clp) { if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) nfs4_kill_renewd(clp); - nfs4_shutdown_session(clp); + clp->cl_mvops->shutdown_client(clp); nfs4_destroy_callback(clp); if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) nfs_idmap_delete(clp); @@ -144,34 +264,77 @@ static int nfs4_init_callback(struct nfs_client *clp) return 0; } +/** + * nfs40_init_client - nfs_client initialization tasks for NFSv4.0 + * @clp - nfs_client to initialize + * + * Returns zero on success, or a negative errno if some error occurred. + */ +int nfs40_init_client(struct nfs_client *clp) +{ + struct nfs4_slot_table *tbl; + int ret; + + tbl = kzalloc(sizeof(*tbl), GFP_NOFS); + if (tbl == NULL) + return -ENOMEM; + + ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE, + "NFSv4.0 transport Slot table"); + if (ret) { + kfree(tbl); + return ret; + } + + clp->cl_slot_tbl = tbl; + return 0; +} + +#if defined(CONFIG_NFS_V4_1) + +/** + * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+ + * @clp - nfs_client to initialize + * + * Returns zero on success, or a negative errno if some error occurred. + */ +int nfs41_init_client(struct nfs_client *clp) +{ + struct nfs4_session *session = NULL; + + /* + * Create the session and mark it expired. + * When a SEQUENCE operation encounters the expired session + * it will do session recovery to initialize it. + */ + session = nfs4_alloc_session(clp); + if (!session) + return -ENOMEM; + + clp->cl_session = session; + + /* + * The create session reply races with the server back + * channel probe. Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING); + return 0; +} + +#endif /* CONFIG_NFS_V4_1 */ + /* * Initialize the minor version specific parts of an NFS4 client record */ static int nfs4_init_client_minor_version(struct nfs_client *clp) { -#if defined(CONFIG_NFS_V4_1) - if (clp->cl_mvops->minor_version) { - struct nfs4_session *session = NULL; - /* - * Create the session and mark it expired. - * When a SEQUENCE operation encounters the expired session - * it will do session recovery to initialize it. - */ - session = nfs4_alloc_session(clp); - if (!session) - return -ENOMEM; - - clp->cl_session = session; - /* - * The create session reply races with the server back - * channel probe. 
Mark the client NFS_CS_SESSION_INITING - * so that the client back channel can find the - * nfs_client struct - */ - nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING); - } -#endif /* CONFIG_NFS_V4_1 */ + int ret; + ret = clp->cl_mvops->init_client(clp); + if (ret) + return ret; return nfs4_init_callback(clp); } @@ -187,8 +350,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) */ struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, - const char *ip_addr, - rpc_authflavor_t authflavour) + const char *ip_addr) { char buf[INET6_ADDRSTRLEN + 1]; struct nfs_client *old; @@ -723,7 +885,7 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) } static int nfs4_server_common_setup(struct nfs_server *server, - struct nfs_fh *mntfh) + struct nfs_fh *mntfh, bool auth_probe) { struct nfs_fattr *fattr; int error; @@ -755,7 +917,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, /* Probe the root fh to retrieve its FSID and filehandle */ - error = nfs4_get_rootfh(server, mntfh); + error = nfs4_get_rootfh(server, mntfh, auth_probe); if (error < 0) goto out; @@ -787,6 +949,7 @@ out: static int nfs4_init_server(struct nfs_server *server, const struct nfs_parsed_mount_data *data) { + rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; struct rpc_timeout timeparms; int error; @@ -799,13 +962,16 @@ static int nfs4_init_server(struct nfs_server *server, server->flags = data->flags; server->options = data->options; + if (data->auth_flavor_len >= 1) + pseudoflavor = data->auth_flavors[0]; + /* Get a client record */ error = nfs4_set_client(server, data->nfs_server.hostname, (const struct sockaddr *)&data->nfs_server.address, data->nfs_server.addrlen, data->client_address, - data->auth_flavors[0], + pseudoflavor, data->nfs_server.protocol, &timeparms, data->minorversion, @@ -825,7 +991,7 @@ static int nfs4_init_server(struct nfs_server *server, server->port = data->nfs_server.port; - error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor); error: /* Done */ @@ -843,6 +1009,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, struct nfs_subversion *nfs_mod) { struct nfs_server *server; + bool auth_probe; int error; dprintk("--> nfs4_create_server()\n"); @@ -851,12 +1018,14 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (!server) return ERR_PTR(-ENOMEM); + auth_probe = mount_info->parsed->auth_flavor_len < 1; + /* set up the general RPC client */ error = nfs4_init_server(server, mount_info->parsed); if (error < 0) goto error; - error = nfs4_server_common_setup(server, mount_info->mntfh); + error = nfs4_server_common_setup(server, mount_info->mntfh, auth_probe); if (error < 0) goto error; @@ -909,7 +1078,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - error = nfs4_server_common_setup(server, mntfh); + error = nfs4_server_common_setup(server, mntfh, + !(parent_server->flags & NFS_MOUNT_SECFLAVOUR)); if (error < 0) goto error; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 17ed87ef9de8..b86464ba25e1 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -39,6 +39,7 @@ #include "internal.h" #include "delegation.h" #include "nfs4filelayout.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -247,6 +248,7 @@ static int filelayout_read_done_cb(struct rpc_task *task, struct 
nfs_pgio_header *hdr = data->header; int err; + trace_nfs4_pnfs_read(data, task->tk_status); err = filelayout_async_handle_error(task, data->args.context->state, data->ds_clp, hdr->lseg); @@ -363,6 +365,7 @@ static int filelayout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr = data->header; int err; + trace_nfs4_pnfs_write(data, task->tk_status); err = filelayout_async_handle_error(task, data->args.context->state, data->ds_clp, hdr->lseg); @@ -395,6 +398,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, { int err; + trace_nfs4_pnfs_commit_ds(data, task->tk_status); err = filelayout_async_handle_error(task, NULL, data->ds_clp, data->lseg); @@ -524,6 +528,7 @@ filelayout_read_pagelist(struct nfs_read_data *data) struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; loff_t offset = data->args.offset; u32 j, idx; struct nfs_fh *fh; @@ -538,6 +543,11 @@ filelayout_read_pagelist(struct nfs_read_data *data) ds = nfs4_fl_prepare_ds(lseg, idx); if (!ds) return PNFS_NOT_ATTEMPTED; + + ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); + if (IS_ERR(ds_clnt)) + return PNFS_NOT_ATTEMPTED; + dprintk("%s USE DS: %s cl_count %d\n", __func__, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); @@ -552,7 +562,7 @@ filelayout_read_pagelist(struct nfs_read_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, + nfs_initiate_read(ds_clnt, data, &filelayout_read_call_ops, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; } @@ -564,6 +574,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; loff_t offset = data->args.offset; u32 j, idx; struct nfs_fh *fh; @@ -574,6 +585,11 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) ds = nfs4_fl_prepare_ds(lseg, idx); if (!ds) return PNFS_NOT_ATTEMPTED; + + ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); + if (IS_ERR(ds_clnt)) + return PNFS_NOT_ATTEMPTED; + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); @@ -591,7 +607,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, + nfs_initiate_write(ds_clnt, data, &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; @@ -1101,16 +1117,19 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) { struct pnfs_layout_segment *lseg = data->lseg; struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; u32 idx; struct nfs_fh *fh; idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); ds = nfs4_fl_prepare_ds(lseg, idx); - if (!ds) { - prepare_to_resend_writes(data); - filelayout_commit_release(data); - return -EAGAIN; - } + if (!ds) + goto out_err; + + ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode); + if (IS_ERR(ds_clnt)) + goto out_err; + dprintk("%s ino %lu, how %d cl_count %d\n", __func__, data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); data->commit_done_cb = filelayout_commit_done_cb; @@ -1119,9 +1138,13 @@ static int 
filelayout_initiate_commit(struct nfs_commit_data *data, int how) fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) data->args.fh = fh; - return nfs_initiate_commit(ds->ds_clp->cl_rpcclient, data, + return nfs_initiate_commit(ds_clnt, data, &filelayout_commit_call_ops, how, RPC_TASK_SOFTCONN); +out_err: + prepare_to_resend_writes(data); + filelayout_commit_release(data); + return -EAGAIN; } static int diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index 549462e5b9b0..c0b3a16b4a00 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -9,7 +9,7 @@ #define NFSDBG_FACILITY NFSDBG_CLIENT -int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) +int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) { struct nfs_fsinfo fsinfo; int ret = -ENOMEM; @@ -21,7 +21,7 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) goto out; /* Start by getting the root filehandle from the server */ - ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo); + ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe); if (ret < 0) { dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); goto out; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index cdb0b41a4810..2288cd3c9278 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -11,6 +11,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/sunrpc/clnt.h> @@ -369,21 +370,33 @@ out: struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr) { + rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; struct dentry *parent = dget_parent(dentry); + struct inode *dir = parent->d_inode; + struct qstr *name = &dentry->d_name; struct rpc_clnt *client; struct vfsmount *mnt; /* Look it up again to get its attributes and sec flavor */ - client = nfs4_proc_lookup_mountpoint(parent->d_inode, &dentry->d_name, fh, fattr); + client = nfs4_proc_lookup_mountpoint(dir, name, fh, fattr); dput(parent); if (IS_ERR(client)) return ERR_CAST(client); - if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { mnt = nfs_do_refmount(client, dentry); - else - mnt = nfs_do_submount(dentry, fh, fattr, client->cl_auth->au_flavor); + goto out; + } + if (client->cl_auth->au_flavor != flavor) + flavor = client->cl_auth->au_flavor; + else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) { + rpc_authflavor_t new = nfs4_negotiate_security(dir, name); + if ((int)new >= 0) + flavor = new; + } + mnt = nfs_do_submount(dentry, fh, fattr, flavor); +out: rpc_shutdown_client(client); return mnt; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cf11799297c4..39b6cf2d1683 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -66,6 +66,8 @@ #include "nfs4session.h" #include "fscache.h" +#include "nfs4trace.h" + #define NFSDBG_FACILITY NFSDBG_PROC #define NFS4_POLL_RETRY_MIN (HZ/10) @@ -150,6 +152,7 @@ static int nfs4_map_errors(int err) case -NFS4ERR_RECALLCONFLICT: return -EREMOTEIO; case -NFS4ERR_WRONGSEC: + case -NFS4ERR_WRONG_CRED: return -EPERM; case -NFS4ERR_BADOWNER: case -NFS4ERR_BADNAME: @@ -433,6 +436,20 @@ wait_on_recovery: return ret; } +/* + * Return 'true' if 'clp' is using an rpc_client that is integrity protected + * or 'false' otherwise. 
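The nfs4_find_or_create_ds_client() calls wired into the filelayout read, write, and commit paths above follow the optimistic-insert shape from the nfs4client.c hunk: look the entry up lock-free, allocate a candidate outside the lock, re-check under the lock, and free the candidate if another task won the race. A minimal userspace sketch of that shape, with a pthread mutex standing in for cl_lock and a plain singly linked list for the RCU list (every name below is illustrative, not kernel API):

    #include <pthread.h>
    #include <stdlib.h>

    struct ds_server {
        struct ds_server *next;
        int flavor;                     /* stands in for the rpc auth flavor */
    };

    static struct ds_server *ds_list;   /* stands in for cl_ds_clients */
    static pthread_mutex_t ds_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Lock-free walk; the kernel relies on RCU to make this safe, a
     * strictly correct userspace port would need atomics. */
    static struct ds_server *find_ds(int flavor)
    {
        struct ds_server *s;

        for (s = ds_list; s != NULL; s = s->next)
            if (s->flavor == flavor)
                return s;
        return NULL;
    }

    static struct ds_server *find_or_create_ds(int flavor)
    {
        struct ds_server *s = find_ds(flavor);
        struct ds_server *new;

        if (s != NULL)
            return s;
        new = calloc(1, sizeof(*new));  /* allocate outside the lock */
        if (new == NULL)
            return NULL;
        new->flavor = flavor;

        pthread_mutex_lock(&ds_lock);
        s = find_ds(flavor);            /* re-check: did we lose a race? */
        if (s == NULL) {
            new->next = ds_list;
            ds_list = new;
            s = new;
            new = NULL;
        }
        pthread_mutex_unlock(&ds_lock);
        free(new);                      /* free the losing candidate, if any */
        return s;
    }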
+ */ +static bool _nfs4_is_integrity_protected(struct nfs_client *clp) +{ + rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor; + + if (flavor == RPC_AUTH_GSS_KRB5I || + flavor == RPC_AUTH_GSS_KRB5P) + return true; + + return false; +} static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) { @@ -447,6 +464,88 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp do_renew_lease(server->nfs_client, timestamp); } +struct nfs4_call_sync_data { + const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; +}; + +static void nfs4_init_sequence(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, int cache_reply) +{ + args->sa_slot = NULL; + args->sa_cache_this = cache_reply; + args->sa_privileged = 0; + + res->sr_slot = NULL; +} + +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ + args->sa_privileged = 1; +} + +static int nfs40_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl; + struct nfs4_slot *slot; + + /* slot already allocated? */ + if (res->sr_slot != NULL) + goto out_start; + + spin_lock(&tbl->slot_tbl_lock); + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; + + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; + } + spin_unlock(&tbl->slot_tbl_lock); + + args->sa_slot = slot; + res->sr_slot = slot; + +out_start: + rpc_call_start(task); + return 0; + +out_sleep: + if (args->sa_privileged) + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; +} + +static int nfs40_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + struct nfs4_slot *slot = res->sr_slot; + struct nfs4_slot_table *tbl; + + if (!RPC_WAS_SENT(task)) + goto out; + + tbl = slot->table; + spin_lock(&tbl->slot_tbl_lock); + if (!nfs41_wake_and_assign_slot(tbl, slot)) + nfs4_free_slot(tbl, slot); + spin_unlock(&tbl->slot_tbl_lock); + + res->sr_slot = NULL; +out: + return 1; +} + #if defined(CONFIG_NFS_V4_1) static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) @@ -506,6 +605,7 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * interrupted = true; } + trace_nfs4_sequence_done(session, res); /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: @@ -591,25 +691,11 @@ static int nfs4_sequence_done(struct rpc_task *task, { if (res->sr_slot == NULL) return 1; + if (!res->sr_slot->table->session) + return nfs40_sequence_done(task, res); return nfs41_sequence_done(task, res); } -static void nfs41_init_sequence(struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, int cache_reply) -{ - args->sa_slot = NULL; - args->sa_cache_this = 0; - args->sa_privileged = 0; - if (cache_reply) - args->sa_cache_this = 1; - res->sr_slot = NULL; -} - -static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) -{ - args->sa_privileged = 1; -} - int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -647,7 +733,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, args->sa_slot = slot; - dprintk("<-- %s slotid=%d 
seqid=%d\n", __func__, + dprintk("<-- %s slotid=%u seqid=%u\n", __func__, slot->slot_nr, slot->seq_nr); res->sr_slot = slot; @@ -658,6 +744,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, * set to 1 if an rpc level failure occurs. */ res->sr_status = 1; + trace_nfs4_setup_sequence(session, args); out_success: rpc_call_start(task); return 0; @@ -673,38 +760,30 @@ out_sleep: } EXPORT_SYMBOL_GPL(nfs41_setup_sequence); -int nfs4_setup_sequence(const struct nfs_server *server, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) +static int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) { struct nfs4_session *session = nfs4_get_session(server); int ret = 0; - if (session == NULL) { - rpc_call_start(task); - goto out; - } + if (!session) + return nfs40_setup_sequence(server, args, res, task); - dprintk("--> %s clp %p session %p sr_slot %d\n", + dprintk("--> %s clp %p session %p sr_slot %u\n", __func__, session->clp, session, res->sr_slot ? - res->sr_slot->slot_nr : -1); + res->sr_slot->slot_nr : NFS4_NO_SLOT); ret = nfs41_setup_sequence(session, args, res, task); -out: + dprintk("<-- %s status=%d\n", __func__, ret); return ret; } -struct nfs41_call_sync_data { - const struct nfs_server *seq_server; - struct nfs4_sequence_args *seq_args; - struct nfs4_sequence_res *seq_res; -}; - static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { - struct nfs41_call_sync_data *data = calldata; + struct nfs4_call_sync_data *data = calldata; struct nfs4_session *session = nfs4_get_session(data->seq_server); dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); @@ -714,7 +793,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) { - struct nfs41_call_sync_data *data = calldata; + struct nfs4_call_sync_data *data = calldata; nfs41_sequence_done(task, data->seq_res); } @@ -724,6 +803,42 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; +#else /* !CONFIG_NFS_V4_1 */ + +static int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + return nfs40_setup_sequence(server, args, res, task); +} + +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + return nfs40_sequence_done(task, res); +} + +#endif /* !CONFIG_NFS_V4_1 */ + +static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + nfs4_setup_sequence(data->seq_server, + data->seq_args, data->seq_res, task); +} + +static void nfs40_call_sync_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + nfs4_sequence_done(task, data->seq_res); +} + +static const struct rpc_call_ops nfs40_call_sync_ops = { + .rpc_call_prepare = nfs40_call_sync_prepare, + .rpc_call_done = nfs40_call_sync_done, +}; + static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, @@ -732,7 +847,8 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, { int ret; struct rpc_task *task; - struct nfs41_call_sync_data data = { + struct nfs_client *clp = server->nfs_client; + struct nfs4_call_sync_data data = { .seq_server = server, .seq_args = args, .seq_res = res, @@ -740,7 
+856,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct rpc_task_setup task_setup = { .rpc_client = clnt, .rpc_message = msg, - .callback_ops = &nfs41_call_sync_ops, + .callback_ops = clp->cl_mvops->call_sync_ops, .callback_data = &data }; @@ -754,35 +870,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, return ret; } -#else -static -void nfs41_init_sequence(struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, int cache_reply) -{ -} - -static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) -{ -} - - -static int nfs4_sequence_done(struct rpc_task *task, - struct nfs4_sequence_res *res) -{ - return 1; -} -#endif /* CONFIG_NFS_V4_1 */ - -static -int _nfs4_call_sync(struct rpc_clnt *clnt, - struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res) -{ - return rpc_call_sync(clnt, msg, 0); -} - static int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, @@ -791,9 +878,8 @@ int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs4_sequence_res *res, int cache_reply) { - nfs41_init_sequence(args, res, cache_reply); - return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, - args, res); + nfs4_init_sequence(args, res, cache_reply); + return nfs4_call_sync_sequence(clnt, server, msg, args, res); } static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) @@ -933,7 +1019,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.fh = NFS_FH(dentry->d_inode); } if (attrs != NULL && attrs->ia_valid != 0) { - __be32 verf[2]; + __u32 verf[2]; p->o_arg.u.attrs = &p->attrs; memcpy(&p->attrs, attrs, sizeof(p->attrs)); @@ -1103,7 +1189,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat goto no_delegation; spin_lock(&deleg_cur->lock); - if (nfsi->delegation != deleg_cur || + if (rcu_dereference(nfsi->delegation) != deleg_cur || test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) || (deleg_cur->type & fmode) != fmode) goto no_delegation_unlock; @@ -1440,6 +1526,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); + trace_nfs4_open_reclaim(ctx, 0, err); if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) continue; if (err != -NFS4ERR_DELAY) @@ -1524,10 +1611,20 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state return nfs4_handle_delegation_recall_error(server, state, stateid, err); } +static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, + &data->o_res.seq_res, task); +} + static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; + nfs40_sequence_done(task, &data->o_res.seq_res); + data->rpc_status = task->tk_status; if (data->rpc_status == 0) { nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid); @@ -1556,6 +1653,7 @@ out_free: } static const struct rpc_call_ops nfs4_open_confirm_ops = { + .rpc_call_prepare = nfs4_open_confirm_prepare, .rpc_call_done = nfs4_open_confirm_done, .rpc_release = nfs4_open_confirm_release, }; @@ -1583,6 +1681,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) }; int status; + nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; @@ 
-1742,7 +1841,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) }; int status; - nfs41_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1); + nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; @@ -1895,6 +1994,7 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state do { err = _nfs4_open_expired(ctx, state); + trace_nfs4_open_expired(ctx, 0, err); if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) continue; switch (err) { @@ -1944,6 +2044,7 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state) cred = get_rpccred(delegation->cred); rcu_read_unlock(); status = nfs41_test_stateid(server, stateid, cred); + trace_nfs4_test_delegation_stateid(state, NULL, status); } else rcu_read_unlock(); @@ -1986,6 +2087,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) return -NFS4ERR_BAD_STATEID; status = nfs41_test_stateid(server, stateid, cred); + trace_nfs4_test_open_stateid(state, NULL, status); if (status != NFS_OK) { /* Free the stateid unless the server explicitly * informs us the stateid is unrecognized. */ @@ -2197,6 +2299,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, do { status = _nfs4_do_open(dir, ctx, flags, sattr, label); res = ctx->state; + trace_nfs4_open_file(ctx, flags, status); if (status == 0) break; /* NOTE: BAD_SEQID means the server and client disagree about the @@ -2310,6 +2413,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, int err; do { err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); + trace_nfs4_setattr(inode, err); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -2387,6 +2491,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; + trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); /* hmm. we are done with the inode, and in the process of freeing * the state_owner. 
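The OPEN_CONFIRM prepare/done callbacks added above are the standard NFSv4.0 slot discipline introduced earlier in this patch: rpc_call_prepare must obtain a slot from the client-wide table (or put the task to sleep on the slot waitqueue), and rpc_call_done must hand the slot to a waiter or return it. Reduced to its essence the lifecycle is a counting semaphore; a toy model under that simplification (illustrative names, not the kernel API):

    #include <semaphore.h>

    #define MAX_SLOTS 16            /* cf. the NFS4_MAX_SLOT_TABLE sizing */

    static sem_t slot_sem;          /* counts free slots in the table */

    static void slot_table_init(void)
    {
        sem_init(&slot_sem, 0, MAX_SLOTS);
    }

    /* prepare: block until a slot is free, like sleeping on slot_tbl_waitq */
    static void sequence_prepare(void)
    {
        sem_wait(&slot_sem);
    }

    /* done: release the slot, waking one sleeper, like
     * nfs41_wake_and_assign_slot()/nfs4_free_slot() */
    static void sequence_done(void)
    {
        sem_post(&slot_sem);
    }

The real table also tracks per-slot sequence numbers and a privileged bypass for draining, which the semaphore model deliberately omits.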
we keep this around to process errors */ @@ -2511,10 +2616,13 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) }; int status = -ENOMEM; + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, + &task_setup_data.rpc_client, &msg); + calldata = kzalloc(sizeof(*calldata), gfp_mask); if (calldata == NULL) goto out; - nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1); + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1); calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); @@ -2690,6 +2798,7 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, int err; do { err = _nfs4_lookup_root(server, fhandle, info); + trace_nfs4_lookup_root(server, fhandle, info->fattr, err); switch (err) { case 0: case -NFS4ERR_WRONGSEC: @@ -2705,10 +2814,13 @@ out: static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info, rpc_authflavor_t flavor) { + struct rpc_auth_create_args auth_args = { + .pseudoflavor = flavor, + }; struct rpc_auth *auth; int ret; - auth = rpcauth_create(flavor, server->client); + auth = rpcauth_create(&auth_args, server->client); if (IS_ERR(auth)) { ret = -EACCES; goto out; @@ -2772,18 +2884,27 @@ static int nfs4_do_find_root_sec(struct nfs_server *server, * @server: initialized nfs_server handle * @fhandle: we fill in the pseudo-fs root file handle * @info: we fill in an FSINFO struct + * @auth_probe: probe the auth flavours * * Returns zero on success, or a negative errno. */ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fsinfo *info, + bool auth_probe) { int status; - status = nfs4_lookup_root(server, fhandle, info); - if ((status == -NFS4ERR_WRONGSEC) && - !(server->flags & NFS_MOUNT_SECFLAVOUR)) + switch (auth_probe) { + case false: + status = nfs4_lookup_root(server, fhandle, info); + if (status != -NFS4ERR_WRONGSEC) + break; + /* Did user force a 'sec=' mount option? 
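The switch (auth_probe) in nfs4_proc_get_rootfh here leans on fallthrough, which is easy to misread: the false arm breaks out only when the plain lookup succeeded, failed with something other than NFS4ERR_WRONGSEC, or the user pinned a flavor with sec=; every other path falls into the probing default. Written as if/else the flow is roughly this (a sketch with stubbed-out helpers, not the kernel code):

    #define NFS4ERR_WRONGSEC 10016

    static int lookup_root(void)    { return -NFS4ERR_WRONGSEC; } /* stub */
    static int probe_root_sec(void) { return 0; }                 /* stub */

    static int get_rootfh_status(int auth_probe, int sec_forced)
    {
        int status;

        if (!auth_probe) {
            status = lookup_root();
            if (status != -NFS4ERR_WRONGSEC)
                return status;      /* success, or a hard error */
            if (sec_forced)
                return status;      /* user pinned sec=, do not probe */
        }
        return probe_root_sec();    /* the switch's default: probe flavors */
    }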
*/ + if (server->flags & NFS_MOUNT_SECFLAVOUR) + break; + default: status = nfs4_do_find_root_sec(server, fhandle, info); + } if (status == 0) status = nfs4_server_capabilities(server, fhandle); @@ -2899,8 +3020,9 @@ static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(server, - _nfs4_proc_getattr(server, fhandle, fattr, label), + err = _nfs4_proc_getattr(server, fhandle, fattr, label); + trace_nfs4_getattr(server, fhandle, fattr, err); + err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; @@ -2940,10 +3062,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, /* Deal with open(O_TRUNC) */ if (sattr->ia_valid & ATTR_OPEN) - sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); + sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME); /* Optimization: if the end result is no change, don't RPC */ - if ((sattr->ia_valid & ~(ATTR_FILE)) == 0) + if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; /* Search for an existing open(O_WRITE) file */ @@ -3020,6 +3142,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, int err; do { err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label); + trace_nfs4_lookup(dir, name, err); switch (err) { case -NFS4ERR_BADNAME: err = -ENOENT; @@ -3031,7 +3154,9 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, err = -EPERM; if (client != *clnt) goto out; - + /* No security negotiation if the user specified 'sec=' */ + if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR) + goto out; client = nfs4_create_sec_client(client, dir, name); if (IS_ERR(client)) return PTR_ERR(client); @@ -3071,15 +3196,13 @@ struct rpc_clnt * nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { + struct rpc_clnt *client = NFS_CLIENT(dir); int status; - struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir)); status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL); - if (status < 0) { - rpc_shutdown_client(client); + if (status < 0) return ERR_PTR(status); - } - return client; + return (client == NFS_CLIENT(dir)) ? 
rpc_clone_client(client) : client; } static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) @@ -3136,8 +3259,9 @@ static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(inode), - _nfs4_proc_access(inode, entry), + err = _nfs4_proc_access(inode, entry); + trace_nfs4_access(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); return err; @@ -3190,8 +3314,9 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page, struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(inode), - _nfs4_proc_readlink(inode, page, pgbase, pglen), + err = _nfs4_proc_readlink(inode, page, pgbase, pglen); + trace_nfs4_readlink(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); return err; @@ -3255,8 +3380,9 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name) struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_remove(dir, name), + err = _nfs4_proc_remove(dir, name); + trace_nfs4_remove(dir, name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); return err; @@ -3270,7 +3396,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; - nfs41_init_sequence(&args->seq_args, &res->seq_res, 1); + nfs4_init_sequence(&args->seq_args, &res->seq_res, 1); nfs_fattr_init(res->dir_attr); } @@ -3285,7 +3411,8 @@ static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlin static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) { - struct nfs_removeres *res = task->tk_msg.rpc_resp; + struct nfs_unlinkdata *data = task->tk_calldata; + struct nfs_removeres *res = &data->res; if (!nfs4_sequence_done(task, &res->seq_res)) return 0; @@ -3303,7 +3430,7 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; res->server = server; - nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1); + nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1); } static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) @@ -3317,7 +3444,8 @@ static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renam static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, struct inode *new_dir) { - struct nfs_renameres *res = task->tk_msg.rpc_resp; + struct nfs_renamedata *data = task->tk_calldata; + struct nfs_renameres *res = &data->res; if (!nfs4_sequence_done(task, &res->seq_res)) return 0; @@ -3363,9 +3491,10 @@ static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(old_dir), - _nfs4_proc_rename(old_dir, old_name, - new_dir, new_name), + err = _nfs4_proc_rename(old_dir, old_name, + new_dir, new_name); + trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err); + err = nfs4_handle_exception(NFS_SERVER(old_dir), err, &exception); } while (exception.retry); return err; @@ -3527,9 +3656,9 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, label = nfs4_label_init_security(dir, dentry, sattr, &l); do { - err = 
nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_symlink(dir, dentry, page, - len, sattr, label), + err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label); + trace_nfs4_symlink(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); @@ -3566,8 +3695,9 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, sattr->ia_mode &= ~current_umask(); do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_mkdir(dir, dentry, sattr, label), + err = _nfs4_proc_mkdir(dir, dentry, sattr, label); + trace_nfs4_mkdir(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); nfs4_label_release_security(label); @@ -3620,9 +3750,10 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), - _nfs4_proc_readdir(dentry, cred, cookie, - pages, count, plus), + err = _nfs4_proc_readdir(dentry, cred, cookie, + pages, count, plus); + trace_nfs4_readdir(dentry->d_inode, err); + err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), err, &exception); } while (exception.retry); return err; @@ -3674,8 +3805,9 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, sattr->ia_mode &= ~current_umask(); do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_mknod(dir, dentry, sattr, label, rdev), + err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev); + trace_nfs4_mknod(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); @@ -3743,6 +3875,7 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str do { err = _nfs4_do_fsinfo(server, fhandle, fsinfo); + trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err); if (err == 0) { struct nfs_client *clp = server->nfs_client; @@ -3861,6 +3994,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_server *server = NFS_SERVER(data->header->inode); + trace_nfs4_read(data, task->tk_status); if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -3904,24 +4038,29 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message data->timestamp = jiffies; data->read_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; - nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); } -static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, task)) - return; - nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_READ); + return 0; + if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_READ) == -EIO) + return -EIO; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + return -EIO; + return 0; } static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->header->inode; + trace_nfs4_write(data, task->tk_status); if (nfs4_async_handle_error(task, NFS_SERVER(inode), 
data->args.context->state) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -3987,18 +4126,22 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag data->timestamp = jiffies; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; - nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); } -static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, task)) - return; - nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_WRITE); + return 0; + if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_WRITE) == -EIO) + return -EIO; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + return -EIO; + return 0; } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) @@ -4013,6 +4156,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da { struct inode *inode = data->inode; + trace_nfs4_commit(data, task->tk_status); if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -4035,7 +4179,7 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess data->commit_done_cb = nfs4_commit_done_cb; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; - nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); } struct nfs4_renewdata { @@ -4064,6 +4208,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) struct nfs_client *clp = data->client; unsigned long timestamp = data->timestamp; + trace_nfs4_renew_async(clp, task->tk_status); if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! 
*/ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) @@ -4321,6 +4466,7 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl ssize_t ret; do { ret = __nfs4_get_acl_uncached(inode, buf, buflen); + trace_nfs4_get_acl(inode, ret); if (ret >= 0) break; ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); @@ -4400,8 +4546,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(inode), - __nfs4_proc_set_acl(inode, buf, buflen), + err = __nfs4_proc_set_acl(inode, buf, buflen); + trace_nfs4_set_acl(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); return err; @@ -4454,8 +4601,9 @@ static int nfs4_get_security_label(struct inode *inode, void *buf, return -EOPNOTSUPP; do { - err = nfs4_handle_exception(NFS_SERVER(inode), - _nfs4_get_security_label(inode, buf, buflen), + err = _nfs4_get_security_label(inode, buf, buflen); + trace_nfs4_get_security_label(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); return err; @@ -4507,9 +4655,10 @@ static int nfs4_do_set_security_label(struct inode *inode, int err; do { - err = nfs4_handle_exception(NFS_SERVER(inode), - _nfs4_do_set_security_label(inode, ilabel, - fattr, olabel), + err = _nfs4_do_set_security_label(inode, ilabel, + fattr, olabel); + trace_nfs4_set_security_label(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); return err; @@ -4632,11 +4781,11 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, /* An impossible timestamp guarantees this value * will never match a generated boot time. 
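The boot-verifier hunk just below swaps bare (__be32) casts for cpu_to_be32() because a cast only relabels the type while leaving the bytes in CPU order; on little-endian hardware the wire format needs an actual byte swap. A standalone demonstration, using htonl() as the userspace analogue of cpu_to_be32() (output shown for a little-endian host):

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    int main(void)
    {
        uint32_t secs = 1000000001;          /* NSEC_PER_SEC + 1 */
        uint32_t cast = (uint32_t)secs;      /* bytes stay in CPU order */
        uint32_t be   = htonl(secs);         /* cpu_to_be32() analogue */

        printf("cast:    %08x\n", cast);     /* 3b9aca01 */
        printf("swapped: %08x\n", be);       /* 01ca9a3b on little-endian */
        return 0;
    }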
*/ verf[0] = 0; - verf[1] = (__be32)(NSEC_PER_SEC + 1); + verf[1] = cpu_to_be32(NSEC_PER_SEC + 1); } else { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); - verf[0] = (__be32)nn->boot_time.tv_sec; - verf[1] = (__be32)nn->boot_time.tv_nsec; + verf[0] = cpu_to_be32(nn->boot_time.tv_sec); + verf[1] = cpu_to_be32(nn->boot_time.tv_nsec); } memcpy(bootverf->data, verf, sizeof(bootverf->data)); } @@ -4662,10 +4811,14 @@ static unsigned int nfs4_init_uniform_client_string(const struct nfs_client *clp, char *buf, size_t len) { - char *nodename = clp->cl_rpcclient->cl_nodename; + const char *nodename = clp->cl_rpcclient->cl_nodename; if (nfs4_client_id_uniquifier[0] != '\0') - nodename = nfs4_client_id_uniquifier; + return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", + clp->rpc_ops->version, + clp->cl_minorversion, + nfs4_client_id_uniquifier, + nodename); return scnprintf(buf, len, "Linux NFSv%u.%u %s", clp->rpc_ops->version, clp->cl_minorversion, nodename); @@ -4726,6 +4879,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, clp->cl_rpcclient->cl_auth->au_ops->au_name, setclientid.sc_name_len, setclientid.sc_name); status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_setclientid(clp, status); dprintk("NFS reply setclientid: %d\n", status); return status; } @@ -4753,6 +4907,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_clientid); status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_setclientid_confirm(clp, status); dprintk("NFS reply setclientid_confirm: %d\n", status); return status; } @@ -4774,6 +4929,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) if (!nfs4_sequence_done(task, &data->res.seq_res)) return; + trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); switch (task->tk_status) { case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -4795,7 +4951,6 @@ static void nfs4_delegreturn_release(void *calldata) kfree(calldata); } -#if defined(CONFIG_NFS_V4_1) static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) { struct nfs4_delegreturndata *d_data; @@ -4807,12 +4962,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) &d_data->res.seq_res, task); } -#endif /* CONFIG_NFS_V4_1 */ static const struct rpc_call_ops nfs4_delegreturn_ops = { -#if defined(CONFIG_NFS_V4_1) .rpc_call_prepare = nfs4_delegreturn_prepare, -#endif /* CONFIG_NFS_V4_1 */ .rpc_call_done = nfs4_delegreturn_done, .rpc_release = nfs4_delegreturn_release, }; @@ -4837,7 +4989,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co data = kzalloc(sizeof(*data), GFP_NOFS); if (data == NULL) return -ENOMEM; - nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; data->args.bitmask = server->cache_consistency_bitmask; @@ -4877,6 +5029,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 int err; do { err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); + trace_nfs4_delegreturn(inode, err); switch (err) { case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -4951,8 +5104,9 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * int err; do { - err = nfs4_handle_exception(NFS_SERVER(state->inode), - _nfs4_proc_getlk(state, cmd, request), + err = 
_nfs4_proc_getlk(state, cmd, request); + trace_nfs4_get_lock(request, state, cmd, err); + err = nfs4_handle_exception(NFS_SERVER(state->inode), err, &exception); } while (exception.retry); return err; @@ -5089,6 +5243,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .flags = RPC_TASK_ASYNC, }; + nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, + NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); + /* Ensure this is an unlock - when canceling a lock, the * canceled lock is passed in, and it won't be an unlock. */ @@ -5100,7 +5257,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, return ERR_PTR(-ENOMEM); } - nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; @@ -5150,6 +5307,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * rpc_put_task(task); out: request->fl_flags = fl_flags; + trace_nfs4_unlock(request, state, F_SETLK, status); return status; } @@ -5335,7 +5493,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f return -ENOMEM; if (IS_SETLKW(cmd)) data->arg.block = 1; - nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; @@ -5373,6 +5531,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); + trace_nfs4_lock_reclaim(request, state, F_SETLK, err); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -5391,10 +5550,15 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request err = nfs4_set_lock_state(state, request); if (err != 0) return err; + if (!recover_lost_locks) { + set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags); + return 0; + } do { if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); + trace_nfs4_lock_expired(request, state, F_SETLK, err); switch (err) { default: goto out; @@ -5430,6 +5594,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) status = nfs41_test_stateid(server, &lsp->ls_stateid, cred); + trace_nfs4_test_lock_stateid(state, lsp, status); if (status != NFS_OK) { /* Free the stateid unless the server * informs us the stateid is unrecognized. 
 */
@@ -5517,6 +5682,7 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
 	do {
 		err = _nfs4_proc_setlk(state, cmd, request);
+		trace_nfs4_set_lock(request, state, cmd, err);
 		if (err == -NFS4ERR_DENIED)
 			err = -EAGAIN;
 		err = nfs4_handle_exception(NFS_SERVER(state->inode),
@@ -5599,8 +5765,23 @@ struct nfs_release_lockowner_data {
 	struct nfs4_lock_state *lsp;
 	struct nfs_server *server;
 	struct nfs_release_lockowner_args args;
+	struct nfs4_sequence_args seq_args;
+	struct nfs4_sequence_res seq_res;
 };
 
+static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_release_lockowner_data *data = calldata;
+	nfs40_setup_sequence(data->server,
+			     &data->seq_args, &data->seq_res, task);
+}
+
+static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_release_lockowner_data *data = calldata;
+	nfs40_sequence_done(task, &data->seq_res);
+}
+
 static void nfs4_release_lockowner_release(void *calldata)
 {
 	struct nfs_release_lockowner_data *data = calldata;
@@ -5609,6 +5790,8 @@ static void nfs4_release_lockowner_release(void *calldata)
 }
 
 static const struct rpc_call_ops nfs4_release_lockowner_ops = {
+	.rpc_call_prepare = nfs4_release_lockowner_prepare,
+	.rpc_call_done = nfs4_release_lockowner_done,
 	.rpc_release = nfs4_release_lockowner_release,
 };
 
@@ -5621,14 +5804,17 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
 	if (server->nfs_client->cl_mvops->minor_version != 0)
 		return -EINVAL;
+
 	data = kmalloc(sizeof(*data), GFP_NOFS);
 	if (!data)
 		return -ENOMEM;
+	nfs4_init_sequence(&data->seq_args, &data->seq_res, 0);
 	data->lsp = lsp;
 	data->server = server;
 	data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
 	data->args.lock_owner.id = lsp->ls_seqid.owner_id;
 	data->args.lock_owner.s_dev = server->s_dev;
+
 	msg.rpc_argp = &data->args;
 	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
 	return 0;
@@ -5783,14 +5969,23 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
 	struct nfs4_exception exception = { };
 	int err;
 	do {
-		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_fs_locations(client, dir, name, fs_locations, page),
+		err = _nfs4_proc_fs_locations(client, dir, name,
+					      fs_locations, page);
+		trace_nfs4_get_fs_locations(dir, name, err);
+		err = nfs4_handle_exception(NFS_SERVER(dir), err,
 				&exception);
 	} while (exception.retry);
 	return err;
 }
 
-static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
+/*
+ * If 'use_integrity' is true and the state management nfs_client
+ * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
+ * and the machine credential as per RFC3530bis and RFC5661 Security
+ * Considerations sections. Otherwise, just use the user cred with the
+ * filesystem's rpc_client.
+ */ +static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) { int status; struct nfs4_secinfo_arg args = { @@ -5805,10 +6000,25 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct .rpc_argp = &args, .rpc_resp = &res, }; + struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + + if (use_integrity) { + clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; + msg.rpc_cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + } dprintk("NFS call secinfo %s\n", name->name); - status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + + nfs4_state_protect(NFS_SERVER(dir)->nfs_client, + NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + + status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, + &res.seq_res, 0); dprintk("NFS reply secinfo: %d\n", status); + + if (msg.rpc_cred) + put_rpccred(msg.rpc_cred); + return status; } @@ -5818,8 +6028,23 @@ int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_secinfo(dir, name, flavors), + err = -NFS4ERR_WRONGSEC; + + /* try to use integrity protection with machine cred */ + if (_nfs4_is_integrity_protected(NFS_SERVER(dir)->nfs_client)) + err = _nfs4_proc_secinfo(dir, name, flavors, true); + + /* + * if unable to use integrity protection, or SECINFO with + * integrity protection returns NFS4ERR_WRONGSEC (which is + * disallowed by spec, but exists in deployed servers) use + * the current filesystem's rpc_client and the user cred. + */ + if (err == -NFS4ERR_WRONGSEC) + err = _nfs4_proc_secinfo(dir, name, flavors, false); + + trace_nfs4_secinfo(dir, name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); return err; @@ -5883,6 +6108,7 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred } status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_bind_conn_to_session(clp, status); if (status == 0) { if (memcmp(res.session->sess_id.data, clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { @@ -5911,16 +6137,124 @@ out: } /* - * nfs4_proc_exchange_id() + * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map + * and operations we'd like to see to enable certain features in the allow map + */ +static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = { + .how = SP4_MACH_CRED, + .enforce.u.words = { + [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32) + }, + .allow.u.words = { + [0] = 1 << (OP_CLOSE) | + 1 << (OP_LOCKU), + [1] = 1 << (OP_SECINFO - 32) | + 1 << (OP_SECINFO_NO_NAME - 32) | + 1 << (OP_TEST_STATEID - 32) | + 1 << (OP_FREE_STATEID - 32) + } +}; + +/* + * Select the state protection mode for client `clp' given the server results + * from exchange_id in `sp'. * - * Returns zero, a negative errno, or a negative NFS4ERR status code. + * Returns 0 on success, negative errno otherwise. 
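The enforce/allow bitmaps in nfs4_sp4_mach_cred_request above are arrays of 32-bit words, so operation number op lands in word op / 32 at bit op % 32; that is why every op numbered 32 or higher is written as 1 << (OP_x - 32) inside word [1]. A quick standalone check of that mapping (op numbers per RFC 5661: OP_CLOSE is 4, OP_EXCHANGE_ID is 42):

    #include <stdio.h>

    #define OP_CLOSE       4        /* word 0, bit 4  */
    #define OP_EXCHANGE_ID 42       /* word 1, bit 10 */

    static void show(const char *name, unsigned int op)
    {
        printf("%-15s word %u, mask 0x%08x\n", name, op / 32, 1u << (op % 32));
    }

    int main(void)
    {
        show("OP_CLOSE", OP_CLOSE);             /* word 0, mask 0x00000010 */
        show("OP_EXCHANGE_ID", OP_EXCHANGE_ID); /* word 1, mask 0x00000400 */
        return 0;
    }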
+ */ +static int nfs4_sp4_select_mode(struct nfs_client *clp, + struct nfs41_state_protection *sp) +{ + static const u32 supported_enforce[NFS4_OP_MAP_NUM_WORDS] = { + [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32) + }; + unsigned int i; + + if (sp->how == SP4_MACH_CRED) { + /* Print state protect result */ + dfprintk(MOUNT, "Server SP4_MACH_CRED support:\n"); + for (i = 0; i <= LAST_NFS4_OP; i++) { + if (test_bit(i, sp->enforce.u.longs)) + dfprintk(MOUNT, " enforce op %d\n", i); + if (test_bit(i, sp->allow.u.longs)) + dfprintk(MOUNT, " allow op %d\n", i); + } + + /* make sure nothing is on enforce list that isn't supported */ + for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) { + if (sp->enforce.u.words[i] & ~supported_enforce[i]) { + dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); + return -EINVAL; + } + } + + /* + * Minimal mode - state operations are allowed to use machine + * credential. Note this already happens by default, so the + * client doesn't have to do anything more than the negotiation. + * + * NOTE: we don't care if EXCHANGE_ID is in the list - + * we're already using the machine cred for exchange_id + * and will never use a different cred. + */ + if (test_bit(OP_BIND_CONN_TO_SESSION, sp->enforce.u.longs) && + test_bit(OP_CREATE_SESSION, sp->enforce.u.longs) && + test_bit(OP_DESTROY_SESSION, sp->enforce.u.longs) && + test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) { + dfprintk(MOUNT, "sp4_mach_cred:\n"); + dfprintk(MOUNT, " minimal mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags); + } else { + dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); + return -EINVAL; + } + + if (test_bit(OP_CLOSE, sp->allow.u.longs) && + test_bit(OP_LOCKU, sp->allow.u.longs)) { + dfprintk(MOUNT, " cleanup mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); + } + + if (test_bit(OP_SECINFO, sp->allow.u.longs) && + test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { + dfprintk(MOUNT, " secinfo mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags); + } + + if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) && + test_bit(OP_FREE_STATEID, sp->allow.u.longs)) { + dfprintk(MOUNT, " stateid mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags); + } + + if (test_bit(OP_WRITE, sp->allow.u.longs)) { + dfprintk(MOUNT, " write mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags); + } + + if (test_bit(OP_COMMIT, sp->allow.u.longs)) { + dfprintk(MOUNT, " commit mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags); + } + } + + return 0; +} + +/* + * _nfs4_proc_exchange_id() * - * Since the clientid has expired, all compounds using sessions - * associated with the stale clientid will be returning - * NFS4ERR_BADSESSION in the sequence operation, and will therefore - * be in some phase of session reset. + * Wrapper for EXCHANGE_ID operation. 
*/ -int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, + u32 sp4_how) { nfs4_verifier verifier; struct nfs41_exchange_id_args args = { @@ -5967,10 +6301,30 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) goto out_server_scope; } + switch (sp4_how) { + case SP4_NONE: + args.state_protect.how = SP4_NONE; + break; + + case SP4_MACH_CRED: + args.state_protect = nfs4_sp4_mach_cred_request; + break; + + default: + /* unsupported! */ + WARN_ON_ONCE(1); + status = -EINVAL; + goto out_server_scope; + } + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_exchange_id(clp, status); if (status == 0) status = nfs4_check_cl_exchange_flags(res.flags); + if (status == 0) + status = nfs4_sp4_select_mode(clp, &res.state_protect); + if (status == 0) { clp->cl_clientid = res.clientid; clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R); @@ -6017,6 +6371,35 @@ out: return status; } +/* + * nfs4_proc_exchange_id() + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + * + * Since the clientid has expired, all compounds using sessions + * associated with the stale clientid will be returning + * NFS4ERR_BADSESSION in the sequence operation, and will therefore + * be in some phase of session reset. + * + * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used. + */ +int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +{ + rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor; + int status; + + /* try SP4_MACH_CRED if krb5i/p */ + if (authflavor == RPC_AUTH_GSS_KRB5I || + authflavor == RPC_AUTH_GSS_KRB5P) { + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); + if (!status) + return 0; + } + + /* try SP4_NONE */ + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); +} + static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, struct rpc_cred *cred) { @@ -6028,6 +6411,7 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, int status; status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_destroy_clientid(clp, status); if (status) dprintk("NFS: Got error %d from the server %s on " "DESTROY_CLIENTID.", status, clp->cl_hostname); @@ -6065,7 +6449,7 @@ int nfs4_destroy_clientid(struct nfs_client *clp) goto out; if (clp->cl_preserve_clid) goto out; - cred = nfs4_get_exchange_id_cred(clp); + cred = nfs4_get_clid_cred(clp); ret = nfs4_proc_destroy_clientid(clp, cred); if (cred) put_rpccred(cred); @@ -6157,7 +6541,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) }; int status; - nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); + nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); nfs4_set_sequence_privileged(&args.la_seq_args); dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); @@ -6291,6 +6675,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_create_session(clp, status); if (!status) { /* Verify the session's negotiated channel_attrs values */ @@ -6354,6 +6739,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, return status; status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_destroy_session(session->clp, status); if (status) dprintk("NFS: Got error %d 
from the server on DESTROY_SESSION. " @@ -6403,6 +6789,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) return; + trace_nfs4_sequence(clp, task->tk_status); if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); if (atomic_read(&clp->cl_count) == 1) @@ -6460,7 +6847,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, nfs_put_client(clp); return ERR_PTR(-ENOMEM); } - nfs41_init_sequence(&calldata->args, &calldata->res, 0); + nfs4_init_sequence(&calldata->args, &calldata->res, 0); if (is_privileged) nfs4_set_sequence_privileged(&calldata->args); msg.rpc_argp = &calldata->args; @@ -6555,6 +6942,7 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) if (!nfs41_sequence_done(task, res)) return; + trace_nfs4_reclaim_complete(clp, task->tk_status); if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { rpc_restart_call_prepare(task); return; @@ -6602,7 +6990,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, calldata->clp = clp; calldata->arg.one_fs = 0; - nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); nfs4_set_sequence_privileged(&calldata->arg.seq_args); msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; @@ -6793,7 +7181,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; - nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ pnfs_get_layout_hdr(NFS_I(inode)->layout); @@ -6804,6 +7192,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) status = nfs4_wait_for_completion_rpc_task(task); if (status == 0) status = task->tk_status; + trace_nfs4_layoutget(lgp->args.ctx, + &lgp->args.range, + &lgp->res.range, + status); /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); @@ -6876,7 +7268,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) .rpc_cred = lrp->cred, }; struct rpc_task_setup task_setup_data = { - .rpc_client = lrp->clp->cl_rpcclient, + .rpc_client = NFS_SERVER(lrp->args.inode)->client, .rpc_message = &msg, .callback_ops = &nfs4_layoutreturn_call_ops, .callback_data = lrp, @@ -6884,11 +7276,12 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) int status; dprintk("--> %s\n", __func__); - nfs41_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); status = task->tk_status; + trace_nfs4_layoutreturn(lrp->args.inode, status); dprintk("<-- %s status=%d\n", __func__, status); rpc_put_task(task); return status; @@ -7065,7 +7458,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) data->args.lastbytewritten, data->args.inode->i_ino); - nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -7075,15 +7468,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) if (status != 0) goto out; status = task->tk_status; + 
trace_nfs4_layoutcommit(data->args.inode, status); out: dprintk("%s: status %d\n", __func__, status); rpc_put_task(task); return status; } +/** + * Use the state management nfs_client cl_rpcclient, which uses krb5i (if + * possible) as per RFC3530bis and RFC5661 Security Considerations sections + */ static int _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) + struct nfs_fsinfo *info, + struct nfs4_secinfo_flavors *flavors, bool use_integrity) { struct nfs41_secinfo_no_name_args args = { .style = SECINFO_STYLE_CURRENT_FH, @@ -7096,7 +7495,23 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + struct rpc_clnt *clnt = server->client; + int status; + + if (use_integrity) { + clnt = server->nfs_client->cl_rpcclient; + msg.rpc_cred = nfs4_get_clid_cred(server->nfs_client); + } + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, + &res.seq_res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + + if (msg.rpc_cred) + put_rpccred(msg.rpc_cred); + + return status; } static int @@ -7106,7 +7521,24 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs4_exception exception = { }; int err; do { - err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + /* first try using integrity protection */ + err = -NFS4ERR_WRONGSEC; + + /* try to use integrity protection with machine cred */ + if (_nfs4_is_integrity_protected(server->nfs_client)) + err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + flavors, true); + + /* + * if unable to use integrity protection, or SECINFO with + * integrity protection returns NFS4ERR_WRONGSEC (which is + * disallowed by spec, but exists in deployed servers) use + * the current filesystem's rpc_client and the user cred.
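The retry scheme this comment describes is worth spelling out: err is seeded with -NFS4ERR_WRONGSEC so that a single fallback test covers both "no integrity-protected transport available" and "server answered WRONGSEC anyway". A minimal user-space sketch of that shape, with hypothetical stand-ins for the two RPC variants (the helper names are illustrative only):

	#include <stdio.h>

	#define NFS4ERR_WRONGSEC 10016	/* status code from the NFSv4 specs */

	/* hypothetical stand-ins for the krb5i/machine-cred and
	 * filesystem-client/user-cred RPC variants */
	static int secinfo_integrity(void) { return -NFS4ERR_WRONGSEC; }
	static int secinfo_plain(void)     { return 0; }

	static int secinfo_no_name(int have_integrity)
	{
		/* seed with WRONGSEC so the fallback also covers the
		 * "no integrity-protected transport" case */
		int err = -NFS4ERR_WRONGSEC;

		if (have_integrity)
			err = secinfo_integrity();

		/* spec-violating servers may still answer WRONGSEC;
		 * retry with the ordinary client and user cred */
		if (err == -NFS4ERR_WRONGSEC)
			err = secinfo_plain();
		return err;
	}

	int main(void)
	{
		printf("status=%d\n", secinfo_no_name(1));
		return 0;
	}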
+ */ + if (err == -NFS4ERR_WRONGSEC) + err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + flavors, false); + switch (err) { case 0: case -NFS4ERR_WRONGSEC: @@ -7176,11 +7608,15 @@ static int _nfs41_test_stateid(struct nfs_server *server, .rpc_resp = &res, .rpc_cred = cred, }; + struct rpc_clnt *rpc_client = server->client; + + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, + &rpc_client, &msg); dprintk("NFS call test_stateid %p\n", stateid); - nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); nfs4_set_sequence_privileged(&args.seq_args); - status = nfs4_call_sync_sequence(server->client, server, &msg, + status = nfs4_call_sync_sequence(rpc_client, server, &msg, &args.seq_args, &res.seq_res); if (status != NFS_OK) { dprintk("NFS reply test_stateid: failed, %d\n", status); @@ -7249,7 +7685,7 @@ static void nfs41_free_stateid_release(void *calldata) kfree(calldata); } -const struct rpc_call_ops nfs41_free_stateid_ops = { +static const struct rpc_call_ops nfs41_free_stateid_ops = { .rpc_call_prepare = nfs41_free_stateid_prepare, .rpc_call_done = nfs41_free_stateid_done, .rpc_release = nfs41_free_stateid_release, @@ -7272,6 +7708,9 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, }; struct nfs_free_stateid_data *data; + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, + &task_setup.rpc_client, &msg); + dprintk("NFS call free_stateid %p\n", stateid); data = kmalloc(sizeof(*data), GFP_NOFS); if (!data) @@ -7283,7 +7722,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; - nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); if (privileged) nfs4_set_sequence_privileged(&data->args.seq_args); @@ -7359,7 +7798,6 @@ static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { .recover_open = nfs4_open_reclaim, .recover_lock = nfs4_lock_reclaim, .establish_clid = nfs4_init_clientid, - .get_clid_cred = nfs4_get_setclientid_cred, .detect_trunking = nfs40_discover_server_trunking, }; @@ -7370,7 +7808,6 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { .recover_open = nfs4_open_reclaim, .recover_lock = nfs4_lock_reclaim, .establish_clid = nfs41_init_clientid, - .get_clid_cred = nfs4_get_exchange_id_cred, .reclaim_complete = nfs41_proc_reclaim_complete, .detect_trunking = nfs41_discover_server_trunking, }; @@ -7382,7 +7819,6 @@ static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { .recover_open = nfs4_open_expired, .recover_lock = nfs4_lock_expired, .establish_clid = nfs4_init_clientid, - .get_clid_cred = nfs4_get_setclientid_cred, }; #if defined(CONFIG_NFS_V4_1) @@ -7392,7 +7828,6 @@ static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { .recover_open = nfs41_open_expired, .recover_lock = nfs41_lock_expired, .establish_clid = nfs41_init_clientid, - .get_clid_cred = nfs4_get_exchange_id_cred, }; #endif /* CONFIG_NFS_V4_1 */ @@ -7416,10 +7851,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { | NFS_CAP_ATOMIC_OPEN | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK, - .call_sync = _nfs4_call_sync, + .init_client = nfs40_init_client, + .shutdown_client = nfs40_shutdown_client, .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, .free_lock_state = nfs4_release_lockowner, + .call_sync_ops = &nfs40_call_sync_ops, 
.reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, @@ -7434,10 +7871,12 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1, - .call_sync = nfs4_call_sync_sequence, + .init_client = nfs41_init_client, + .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, + .call_sync_ops = &nfs41_call_sync_ops, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, @@ -7453,10 +7892,12 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1, - .call_sync = nfs4_call_sync_sequence, + .init_client = nfs41_init_client, + .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, + .call_sync_ops = &nfs41_call_sync_ops, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, @@ -7473,7 +7914,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #endif }; -const struct inode_operations nfs4_dir_inode_operations = { +static const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, .atomic_open = nfs_atomic_open, diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 36e21cb29d65..cf883c7ae053 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -23,6 +23,14 @@ #define NFSDBG_FACILITY NFSDBG_STATE +static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue) +{ + tbl->highest_used_slotid = NFS4_NO_SLOT; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue); + init_completion(&tbl->complete); +} + /* * nfs4_shrink_slot_table - free retired slots from the slot table */ @@ -44,6 +52,17 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) } } +/** + * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete + * @tbl - controlling slot table + * + */ +void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) +{ + if (nfs4_slot_tbl_draining(tbl)) + complete(&tbl->complete); +} + /* * nfs4_free_slot - free a slot and efficiently update slot table. * @@ -76,7 +95,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) nfs4_slot_tbl_drain_complete(tbl); } } - dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, + dprintk("%s: slotid %u highest_used_slotid %u\n", __func__, slotid, tbl->highest_used_slotid); } @@ -146,9 +165,9 @@ struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) ret->generation = tbl->generation; out: - dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", + dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n", __func__, tbl->used_slots[0], tbl->highest_used_slotid, - !IS_ERR(ret) ? ret->slot_nr : -1); + !IS_ERR(ret) ? 
ret->slot_nr : NFS4_NO_SLOT); return ret; } @@ -191,7 +210,7 @@ static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, { int ret; - dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, + dprintk("--> %s: max_reqs=%u, tbl->max_slots %u\n", __func__, max_reqs, tbl->max_slots); if (max_reqs > NFS4_MAX_SLOT_TABLE) @@ -205,18 +224,36 @@ static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, + dprintk("%s: tbl=%p slots=%p max_slots=%u\n", __func__, tbl, tbl->slots, tbl->max_slots); out: dprintk("<-- %s: return %d\n", __func__, ret); return ret; } -/* Destroy the slot table */ -static void nfs4_destroy_slot_tables(struct nfs4_session *session) +/** + * nfs4_release_slot_table - release resources attached to a slot table + * @tbl: slot table to shut down + * + */ +void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +{ + nfs4_shrink_slot_table(tbl, 0); +} + +/** + * nfs4_setup_slot_table - prepare a stand-alone slot table for use + * @tbl: slot table to set up + * @max_reqs: maximum number of requests allowed + * @queue: name to give RPC wait queue + * + * Returns zero on success, or a negative errno. + */ +int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs, + const char *queue) { - nfs4_shrink_slot_table(&session->fc_slot_table, 0); - nfs4_shrink_slot_table(&session->bc_slot_table, 0); + nfs4_init_slot_table(tbl, queue); + return nfs4_realloc_slot_table(tbl, max_reqs, 0); } static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) @@ -273,6 +310,8 @@ void nfs41_wake_slot_table(struct nfs4_slot_table *tbl) } } +#if defined(CONFIG_NFS_V4_1) + static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl, u32 target_highest_slotid) { @@ -383,6 +422,12 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, spin_unlock(&tbl->slot_tbl_lock); } +static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +{ + nfs4_release_slot_table(&session->fc_slot_table); + nfs4_release_slot_table(&session->bc_slot_table); +} + /* * Initialize or reset the forechannel and backchannel tables */ @@ -405,31 +450,20 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses) if (status && tbl->slots == NULL) /* Fore and back channel share a connection so get * both slot tables or neither */ - nfs4_destroy_slot_tables(ses); + nfs4_destroy_session_slot_tables(ses); return status; } struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) { struct nfs4_session *session; - struct nfs4_slot_table *tbl; session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); if (!session) return NULL; - tbl = &session->fc_slot_table; - tbl->highest_used_slotid = NFS4_NO_SLOT; - spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); - init_completion(&tbl->complete); - - tbl = &session->bc_slot_table; - tbl->highest_used_slotid = NFS4_NO_SLOT; - spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); - init_completion(&tbl->complete); - + nfs4_init_slot_table(&session->fc_slot_table, "ForeChannel Slot table"); + nfs4_init_slot_table(&session->bc_slot_table, "BackChannel Slot table"); session->session_state = 1<<NFS4_SESSION_INITING; session->clp = clp; @@ -441,7 +475,7 @@ void nfs4_destroy_session(struct nfs4_session *session) struct rpc_xprt *xprt; struct rpc_cred *cred; - cred = 
nfs4_get_exchange_id_cred(session->clp); + cred = nfs4_get_clid_cred(session->clp); nfs4_proc_destroy_session(session, cred); if (cred) put_rpccred(cred); @@ -452,7 +486,7 @@ void nfs4_destroy_session(struct nfs4_session *session) dprintk("%s Destroy backchannel for xprt %p\n", __func__, xprt); xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); - nfs4_destroy_slot_tables(session); + nfs4_destroy_session_slot_tables(session); kfree(session); } @@ -513,4 +547,4 @@ int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) } EXPORT_SYMBOL_GPL(nfs4_init_ds_session); - +#endif /* defined(CONFIG_NFS_V4_1) */ diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 3a153d82b90c..232306100651 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -8,7 +8,7 @@ #define __LINUX_FS_NFS_NFS4SESSION_H /* maximum number of slots to use */ -#define NFS4_DEF_SLOT_TABLE_SIZE (16U) +#define NFS4_DEF_SLOT_TABLE_SIZE (64U) #define NFS4_MAX_SLOT_TABLE (1024U) #define NFS4_NO_SLOT ((u32)-1) @@ -72,10 +72,22 @@ enum nfs4_session_state { NFS4_SESSION_INITING, }; -#if defined(CONFIG_NFS_V4_1) +extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, + unsigned int max_reqs, const char *queue); +extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); +extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot); +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl); + +static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl) +{ + return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); +} +#if defined(CONFIG_NFS_V4_1) extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, u32 target_highest_slotid); extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, @@ -89,17 +101,6 @@ extern void nfs4_destroy_session(struct nfs4_session *session); extern int nfs4_init_session(struct nfs_client *clp); extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); -extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); - -static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl) -{ - return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); -} - -bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, - struct nfs4_slot *slot); -void nfs41_wake_slot_table(struct nfs4_slot_table *tbl); - /* * Determine if sessions are in use. 
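For context, the slot table these declarations manage is a small bitmap allocator: used_slots marks busy slot ids and highest_used_slotid caches the top of the in-use range, with NFS4_NO_SLOT ((u32)-1) meaning "empty". A rough user-space model of the nfs4_alloc_slot()/nfs4_free_slot() bookkeeping, simplified here to a single 64-bit word rather than the kernel's variable-size table:

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_SLOTS	64
	#define NO_SLOT		((uint32_t)-1)	/* mirrors NFS4_NO_SLOT */

	struct slot_table {
		uint64_t used_slots;	/* one bit per slot id */
		uint32_t highest_used;	/* NO_SLOT when table is empty */
	};

	static uint32_t slot_alloc(struct slot_table *tbl)
	{
		uint32_t i;

		for (i = 0; i < MAX_SLOTS; i++) {
			if (tbl->used_slots & (1ULL << i))
				continue;
			tbl->used_slots |= 1ULL << i;
			if (tbl->highest_used == NO_SLOT ||
			    i > tbl->highest_used)
				tbl->highest_used = i;
			return i;	/* lowest free slot id */
		}
		return NO_SLOT;
	}

	static void slot_free(struct slot_table *tbl, uint32_t slotid)
	{
		tbl->used_slots &= ~(1ULL << slotid);
		/* walk highest_used down past freed slots; decrementing
		 * below slot 0 wraps to NO_SLOT, i.e. "table empty" */
		while (tbl->highest_used != NO_SLOT &&
		       !(tbl->used_slots & (1ULL << tbl->highest_used)))
			tbl->highest_used--;
	}

	int main(void)
	{
		struct slot_table tbl = { 0, NO_SLOT };
		uint32_t a = slot_alloc(&tbl);
		uint32_t b = slot_alloc(&tbl);

		slot_free(&tbl, b);
		slot_free(&tbl, a);
		printf("highest_used=%d\n", (int)tbl.highest_used); /* -1 */
		return 0;
	}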
*/ @@ -117,6 +118,16 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) return 0; } +#ifdef CONFIG_CRC32 +/* + * nfs_session_id_hash - calculate the crc32 hash for the session id + * @session - pointer to session + */ +#define nfs_session_id_hash(sess_id) \ + (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) +#else +#define nfs_session_id_hash(session) (0) +#endif #else /* defined(CONFIG_NFS_V4_1) */ static inline int nfs4_init_session(struct nfs_client *clp) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e22862f13564..cc14cbb78b73 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -154,6 +154,19 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) return cred; } +static void nfs4_root_machine_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred, *new; + + new = rpc_lookup_machine_cred(NULL); + spin_lock(&clp->cl_lock); + cred = clp->cl_machine_cred; + clp->cl_machine_cred = new; + spin_unlock(&clp->cl_lock); + if (cred != NULL) + put_rpccred(cred); +} + static struct rpc_cred * nfs4_get_renew_cred_server_locked(struct nfs_server *server) { @@ -202,32 +215,6 @@ out: return cred; } -#if defined(CONFIG_NFS_V4_1) - -static int nfs41_setup_state_renewal(struct nfs_client *clp) -{ - int status; - struct nfs_fsinfo fsinfo; - - if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { - nfs4_schedule_state_renewal(clp); - return 0; - } - - status = nfs4_proc_get_lease_time(clp, &fsinfo); - if (status == 0) { - /* Update lease time and schedule renewal */ - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = jiffies; - spin_unlock(&clp->cl_lock); - - nfs4_schedule_state_renewal(clp); - } - - return status; -} - static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl) { if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { @@ -241,20 +228,18 @@ static void nfs4_end_drain_session(struct nfs_client *clp) { struct nfs4_session *ses = clp->cl_session; + if (clp->cl_slot_tbl) { + nfs4_end_drain_slot_table(clp->cl_slot_tbl); + return; + } + if (ses != NULL) { nfs4_end_drain_slot_table(&ses->bc_slot_table); nfs4_end_drain_slot_table(&ses->fc_slot_table); } } -/* - * Signal state manager thread if session fore channel is drained - */ -void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) -{ - if (nfs4_slot_tbl_draining(tbl)) - complete(&tbl->complete); -} +#if defined(CONFIG_NFS_V4_1) static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) { @@ -274,6 +259,9 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) struct nfs4_session *ses = clp->cl_session; int ret = 0; + if (clp->cl_slot_tbl) + return nfs4_drain_slot_tbl(clp->cl_slot_tbl); + /* back channel */ ret = nfs4_drain_slot_tbl(&ses->bc_slot_table); if (ret) @@ -282,6 +270,30 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) return nfs4_drain_slot_tbl(&ses->fc_slot_table); } +static int nfs41_setup_state_renewal(struct nfs_client *clp) +{ + int status; + struct nfs_fsinfo fsinfo; + + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = jiffies; + spin_unlock(&clp->cl_lock); + + nfs4_schedule_state_renewal(clp); + } + + return status; +} + static void 
nfs41_finish_session_reset(struct nfs_client *clp) { clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); @@ -339,62 +351,21 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, return nfs41_walk_client_list(clp, result, cred); } -struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) -{ - struct rpc_cred *cred; - - spin_lock(&clp->cl_lock); - cred = nfs4_get_machine_cred_locked(clp); - spin_unlock(&clp->cl_lock); - return cred; -} - #endif /* CONFIG_NFS_V4_1 */ -static struct rpc_cred * -nfs4_get_setclientid_cred_server(struct nfs_server *server) -{ - struct nfs_client *clp = server->nfs_client; - struct rpc_cred *cred = NULL; - struct nfs4_state_owner *sp; - struct rb_node *pos; - - spin_lock(&clp->cl_lock); - pos = rb_first(&server->state_owners); - if (pos != NULL) { - sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); - cred = get_rpccred(sp->so_cred); - } - spin_unlock(&clp->cl_lock); - return cred; -} - /** - * nfs4_get_setclientid_cred - Acquire credential for a setclientid operation + * nfs4_get_clid_cred - Acquire credential for a setclientid operation * @clp: client state handle * * Returns an rpc_cred with reference count bumped, or NULL. */ -struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp) { - struct nfs_server *server; struct rpc_cred *cred; spin_lock(&clp->cl_lock); cred = nfs4_get_machine_cred_locked(clp); spin_unlock(&clp->cl_lock); - if (cred != NULL) - goto out; - - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - cred = nfs4_get_setclientid_cred_server(server); - if (cred != NULL) - break; - } - rcu_read_unlock(); - -out: return cred; } @@ -998,7 +969,9 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, fl_pid = lockowner->l_pid; spin_lock(&state->state_lock); lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); - if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { + if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) + ret = -EIO; + else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { nfs4_stateid_copy(dst, &lsp->ls_stateid); ret = 0; smp_rmb(); @@ -1038,11 +1011,17 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, fmode_t fmode, const struct nfs_lockowner *lockowner) { - int ret = 0; + int ret = nfs4_copy_lock_stateid(dst, state, lockowner); + if (ret == -EIO) + /* A lost lock - don't even consider delegations */ + goto out; if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) goto out; - ret = nfs4_copy_lock_stateid(dst, state, lockowner); if (ret != -ENOENT) + /* nfs4_copy_delegation_stateid() didn't over-write + * dst, so it still has the lock stateid which we now + * choose to use. 
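The reordering in this hunk gives the read/write stateid lookup a fixed precedence: a lost lock fails immediately with -EIO, a delegation stateid is preferred next, then an already-copied lock stateid, and finally the open stateid. The same control flow in miniature (the three helpers are toy stand-ins, and an int stands in for the nfs4_stateid):

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* toy stand-ins returning 0 / -ENOENT / -EIO like the kernel helpers */
	static int copy_lock_stateid(int *dst)        { return -ENOENT; }
	static bool copy_delegation_stateid(int *dst) { return false; }
	static int copy_open_stateid(int *dst)        { *dst = 3; return 0; }

	static int select_rw_stateid(int *dst)
	{
		int ret = copy_lock_stateid(dst);

		if (ret == -EIO)	/* lost lock: skip delegations too */
			goto out;
		if (copy_delegation_stateid(dst))
			goto out;	/* delegation overwrote *dst */
		if (ret != -ENOENT)
			goto out;	/* *dst still holds the lock stateid */
		ret = copy_open_stateid(dst);
	out:
		return ret;
	}

	int main(void)
	{
		int stateid = 0;

		printf("ret=%d stateid=%d\n",
		       select_rw_stateid(&stateid), stateid);
		return 0;
	}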
+ */ goto out; ret = nfs4_copy_open_stateid(dst, state); out: @@ -1443,14 +1422,16 @@ restart: if (status >= 0) { status = nfs4_reclaim_locks(state, ops); if (status >= 0) { - spin_lock(&state->state_lock); - list_for_each_entry(lock, &state->lock_states, ls_locks) { - if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) - pr_warn_ratelimited("NFS: " - "%s: Lock reclaim " - "failed!\n", __func__); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) { + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) + pr_warn_ratelimited("NFS: " + "%s: Lock reclaim " + "failed!\n", __func__); + } + spin_unlock(&state->state_lock); } - spin_unlock(&state->state_lock); nfs4_put_open_state(state); spin_lock(&sp->so_lock); goto restart; @@ -1618,7 +1599,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) if (!nfs4_state_clear_reclaim_reboot(clp)) return; ops = clp->cl_mvops->reboot_recovery_ops; - cred = ops->get_clid_cred(clp); + cred = nfs4_get_clid_cred(clp); nfs4_reclaim_complete(clp, ops, cred); put_rpccred(cred); } @@ -1732,7 +1713,7 @@ static int nfs4_check_lease(struct nfs_client *clp) cred = ops->get_state_renewal_cred_locked(clp); spin_unlock(&clp->cl_lock); if (cred == NULL) { - cred = nfs4_get_setclientid_cred(clp); + cred = nfs4_get_clid_cred(clp); status = -ENOKEY; if (cred == NULL) goto out; @@ -1804,7 +1785,7 @@ static int nfs4_establish_lease(struct nfs_client *clp) clp->cl_mvops->reboot_recovery_ops; int status; - cred = ops->get_clid_cred(clp); + cred = nfs4_get_clid_cred(clp); if (cred == NULL) return -ENOENT; status = ops->establish_clid(clp, cred); @@ -1878,7 +1859,7 @@ int nfs4_discover_server_trunking(struct nfs_client *clp, mutex_lock(&nfs_clid_init_mutex); again: status = -ENOENT; - cred = ops->get_clid_cred(clp); + cred = nfs4_get_clid_cred(clp); if (cred == NULL) goto out_unlock; @@ -1896,7 +1877,11 @@ again: __func__, status); goto again; case -EACCES: - if (i++) + if (i++ == 0) { + nfs4_root_machine_cred(clp); + goto again; + } + if (i > 2) break; case -NFS4ERR_CLID_INUSE: case -NFS4ERR_WRONGSEC: @@ -2052,7 +2037,7 @@ static int nfs4_reset_session(struct nfs_client *clp) if (!nfs4_has_session(clp)) return 0; nfs4_begin_drain_session(clp); - cred = nfs4_get_exchange_id_cred(clp); + cred = nfs4_get_clid_cred(clp); status = nfs4_proc_destroy_session(clp->cl_session, cred); switch (status) { case 0: @@ -2095,7 +2080,7 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) if (!nfs4_has_session(clp)) return 0; nfs4_begin_drain_session(clp); - cred = nfs4_get_exchange_id_cred(clp); + cred = nfs4_get_clid_cred(clp); ret = nfs4_proc_bind_conn_to_session(clp, cred); if (cred) put_rpccred(cred); @@ -2116,7 +2101,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) } #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } -static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } static int nfs4_bind_conn_to_session(struct nfs_client *clp) { diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 5dbe2d269210..e26acdd1a645 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -253,8 +253,6 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name, dfprintk(MOUNT, "--> nfs4_try_mount()\n"); - if (data->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) - data->auth_flavors[0] = RPC_AUTH_UNIX; export_path = data->nfs_server.export_path; data->nfs_server.export_path = "/"; root_mnt = 
nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c new file mode 100644 index 000000000000..d774335cc8bc --- /dev/null +++ b/fs/nfs/nfs4trace.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/nfs_fs.h> +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define CREATE_TRACE_POINTS +#include "nfs4trace.h" + +#ifdef CONFIG_NFS_V4_1 +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds); +#endif diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h new file mode 100644 index 000000000000..849cf146db30 --- /dev/null +++ b/fs/nfs/nfs4trace.h @@ -0,0 +1,1148 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs4 + +#if !defined(_TRACE_NFS4_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS4_H + +#include <linux/tracepoint.h> + +#define show_nfsv4_errors(error) \ + __print_symbolic(error, \ + { NFS4_OK, "OK" }, \ + /* Mapped by nfs4_stat_to_errno() */ \ + { -EPERM, "EPERM" }, \ + { -ENOENT, "ENOENT" }, \ + { -EIO, "EIO" }, \ + { -ENXIO, "ENXIO" }, \ + { -EACCES, "EACCES" }, \ + { -EEXIST, "EEXIST" }, \ + { -EXDEV, "EXDEV" }, \ + { -ENOTDIR, "ENOTDIR" }, \ + { -EISDIR, "EISDIR" }, \ + { -EFBIG, "EFBIG" }, \ + { -ENOSPC, "ENOSPC" }, \ + { -EROFS, "EROFS" }, \ + { -EMLINK, "EMLINK" }, \ + { -ENAMETOOLONG, "ENAMETOOLONG" }, \ + { -ENOTEMPTY, "ENOTEMPTY" }, \ + { -EDQUOT, "EDQUOT" }, \ + { -ESTALE, "ESTALE" }, \ + { -EBADHANDLE, "EBADHANDLE" }, \ + { -EBADCOOKIE, "EBADCOOKIE" }, \ + { -ENOTSUPP, "ENOTSUPP" }, \ + { -ETOOSMALL, "ETOOSMALL" }, \ + { -EREMOTEIO, "EREMOTEIO" }, \ + { -EBADTYPE, "EBADTYPE" }, \ + { -EAGAIN, "EAGAIN" }, \ + { -ELOOP, "ELOOP" }, \ + { -EOPNOTSUPP, "EOPNOTSUPP" }, \ + { -EDEADLK, "EDEADLK" }, \ + /* RPC errors */ \ + { -ENOMEM, "ENOMEM" }, \ + { -EKEYEXPIRED, "EKEYEXPIRED" }, \ + { -ETIMEDOUT, "ETIMEDOUT" }, \ + { -ERESTARTSYS, "ERESTARTSYS" }, \ + { -ECONNREFUSED, "ECONNREFUSED" }, \ + { -ECONNRESET, "ECONNRESET" }, \ + { -ENETUNREACH, "ENETUNREACH" }, \ + { -EHOSTUNREACH, "EHOSTUNREACH" }, \ + { -EHOSTDOWN, "EHOSTDOWN" }, \ + { -EPIPE, "EPIPE" }, \ + { -EPFNOSUPPORT, "EPFNOSUPPORT" }, \ + { -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \ + /* NFSv4 native errors */ \ + { -NFS4ERR_ACCESS, "ACCESS" }, \ + { -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \ + { -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \ + { -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \ + { -NFS4ERR_BADCHAR, "BADCHAR" }, \ + { -NFS4ERR_BADHANDLE, "BADHANDLE" }, \ + { -NFS4ERR_BADIOMODE, "BADIOMODE" }, \ + { -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \ + { -NFS4ERR_BADLABEL, "BADLABEL" }, \ + { -NFS4ERR_BADNAME, "BADNAME" }, \ + { -NFS4ERR_BADOWNER, "BADOWNER" }, \ + { -NFS4ERR_BADSESSION, "BADSESSION" }, \ + { -NFS4ERR_BADSLOT, "BADSLOT" }, \ + { -NFS4ERR_BADTYPE, "BADTYPE" }, \ + { -NFS4ERR_BADXDR, "BADXDR" }, \ + { -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \ + { -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \ + { -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \ + { -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \ + { -NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \ + { -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \ + { -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ + { -NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \ + { -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \ + { -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \ + { 
-NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \ + "CONN_NOT_BOUND_TO_SESSION" }, \ + { -NFS4ERR_DEADLOCK, "DEADLOCK" }, \ + { -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \ + { -NFS4ERR_DELAY, "DELAY" }, \ + { -NFS4ERR_DELEG_ALREADY_WANTED, \ + "DELEG_ALREADY_WANTED" }, \ + { -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \ + { -NFS4ERR_DENIED, "DENIED" }, \ + { -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \ + { -NFS4ERR_DQUOT, "DQUOT" }, \ + { -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \ + { -NFS4ERR_EXIST, "EXIST" }, \ + { -NFS4ERR_EXPIRED, "EXPIRED" }, \ + { -NFS4ERR_FBIG, "FBIG" }, \ + { -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \ + { -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \ + { -NFS4ERR_GRACE, "GRACE" }, \ + { -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \ + { -NFS4ERR_INVAL, "INVAL" }, \ + { -NFS4ERR_IO, "IO" }, \ + { -NFS4ERR_ISDIR, "ISDIR" }, \ + { -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \ + { -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \ + { -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \ + { -NFS4ERR_LOCKED, "LOCKED" }, \ + { -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \ + { -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \ + { -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \ + { -NFS4ERR_MLINK, "MLINK" }, \ + { -NFS4ERR_MOVED, "MOVED" }, \ + { -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \ + { -NFS4ERR_NOENT, "NOENT" }, \ + { -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \ + { -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \ + { -NFS4ERR_NOSPC, "NOSPC" }, \ + { -NFS4ERR_NOTDIR, "NOTDIR" }, \ + { -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \ + { -NFS4ERR_NOTSUPP, "NOTSUPP" }, \ + { -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \ + { -NFS4ERR_NOT_SAME, "NOT_SAME" }, \ + { -NFS4ERR_NO_GRACE, "NO_GRACE" }, \ + { -NFS4ERR_NXIO, "NXIO" }, \ + { -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \ + { -NFS4ERR_OPENMODE, "OPENMODE" }, \ + { -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \ + { -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \ + { -NFS4ERR_PERM, "PERM" }, \ + { -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \ + { -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \ + { -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \ + { -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \ + { -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \ + { -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \ + { -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \ + { -NFS4ERR_REP_TOO_BIG_TO_CACHE, \ + "REP_TOO_BIG_TO_CACHE" }, \ + { -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \ + { -NFS4ERR_RESOURCE, "RESOURCE" }, \ + { -NFS4ERR_RESTOREFH, "RESTOREFH" }, \ + { -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \ + { -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \ + { -NFS4ERR_ROFS, "ROFS" }, \ + { -NFS4ERR_SAME, "SAME" }, \ + { -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \ + { -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \ + { -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \ + { -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \ + { -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \ + { -NFS4ERR_STALE, "STALE" }, \ + { -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \ + { -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \ + { -NFS4ERR_SYMLINK, "SYMLINK" }, \ + { -NFS4ERR_TOOSMALL, "TOOSMALL" }, \ + { -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \ + { -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \ + { -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \ + { -NFS4ERR_WRONGSEC, "WRONGSEC" }, \ + { -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \ + { -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \ + { -NFS4ERR_XDEV, "XDEV" }) + +#define show_open_flags(flags) \ + __print_flags(flags, "|", \ + { O_CREAT, "O_CREAT" }, \ + { O_EXCL, "O_EXCL" }, \ + { O_TRUNC, "O_TRUNC" 
}, \ + { O_DIRECT, "O_DIRECT" }) + +#define show_fmode_flags(mode) \ + __print_flags(mode, "|", \ + { ((__force unsigned long)FMODE_READ), "READ" }, \ + { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ + { ((__force unsigned long)FMODE_EXEC), "EXEC" }) + +#define show_nfs_fattr_flags(valid) \ + __print_flags((unsigned long)valid, "|", \ + { NFS_ATTR_FATTR_TYPE, "TYPE" }, \ + { NFS_ATTR_FATTR_MODE, "MODE" }, \ + { NFS_ATTR_FATTR_NLINK, "NLINK" }, \ + { NFS_ATTR_FATTR_OWNER, "OWNER" }, \ + { NFS_ATTR_FATTR_GROUP, "GROUP" }, \ + { NFS_ATTR_FATTR_RDEV, "RDEV" }, \ + { NFS_ATTR_FATTR_SIZE, "SIZE" }, \ + { NFS_ATTR_FATTR_FSID, "FSID" }, \ + { NFS_ATTR_FATTR_FILEID, "FILEID" }, \ + { NFS_ATTR_FATTR_ATIME, "ATIME" }, \ + { NFS_ATTR_FATTR_MTIME, "MTIME" }, \ + { NFS_ATTR_FATTR_CTIME, "CTIME" }, \ + { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \ + { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \ + { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }) + +DECLARE_EVENT_CLASS(nfs4_clientid_event, + TP_PROTO( + const struct nfs_client *clp, + int error + ), + + TP_ARGS(clp, error), + + TP_STRUCT__entry( + __string(dstaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)) + __field(int, error) + ), + + TP_fast_assign( + __entry->error = error; + __assign_str(dstaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)); + ), + + TP_printk( + "error=%d (%s) dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + __get_str(dstaddr) + ) +); +#define DEFINE_NFS4_CLIENTID_EVENT(name) \ + DEFINE_EVENT(nfs4_clientid_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + int error \ + ), \ + TP_ARGS(clp, error)) +DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid_confirm); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew_async); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_CLIENTID_EVENT(nfs4_exchange_id); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_create_session); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_session); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_clientid); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete); + +TRACE_EVENT(nfs4_setup_sequence, + TP_PROTO( + const struct nfs4_session *session, + const struct nfs4_sequence_args *args + ), + TP_ARGS(session, args), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_used_slotid) + ), + + TP_fast_assign( + const struct nfs4_slot *sa_slot = args->sa_slot; + __entry->session = nfs_session_id_hash(&session->sess_id); + __entry->slot_nr = sa_slot->slot_nr; + __entry->seq_nr = sa_slot->seq_nr; + __entry->highest_used_slotid = + sa_slot->table->highest_used_slotid; + ), + TP_printk( + "session=0x%08x slot_nr=%u seq_nr=%u " + "highest_used_slotid=%u", + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_used_slotid + ) +); + +#define show_nfs4_sequence_status_flags(status) \ + __print_flags((unsigned long)status, "|", \ + { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ + { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \ + "CB_GSS_CONTEXTS_EXPIRING" }, \ + { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \ + "CB_GSS_CONTEXTS_EXPIRED" }, \ + { SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \ + "EXPIRED_ALL_STATE_REVOKED" }, \ + { SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \ + "EXPIRED_SOME_STATE_REVOKED" }, \ + { SEQ4_STATUS_ADMIN_STATE_REVOKED, \ + "ADMIN_STATE_REVOKED" }, \ + { 
SEQ4_STATUS_RECALLABLE_STATE_REVOKED, \ + "RECALLABLE_STATE_REVOKED" }, \ + { SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \ + { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \ + "RESTART_RECLAIM_NEEDED" }, \ + { SEQ4_STATUS_CB_PATH_DOWN_SESSION, \ + "CB_PATH_DOWN_SESSION" }, \ + { SEQ4_STATUS_BACKCHANNEL_FAULT, \ + "BACKCHANNEL_FAULT" }) + +TRACE_EVENT(nfs4_sequence_done, + TP_PROTO( + const struct nfs4_session *session, + const struct nfs4_sequence_res *res + ), + TP_ARGS(session, res), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_slotid) + __field(unsigned int, target_highest_slotid) + __field(unsigned int, status_flags) + __field(int, error) + ), + + TP_fast_assign( + const struct nfs4_slot *sr_slot = res->sr_slot; + __entry->session = nfs_session_id_hash(&session->sess_id); + __entry->slot_nr = sr_slot->slot_nr; + __entry->seq_nr = sr_slot->seq_nr; + __entry->highest_slotid = res->sr_highest_slotid; + __entry->target_highest_slotid = + res->sr_target_highest_slotid; + __entry->error = res->sr_status; + ), + TP_printk( + "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "highest_slotid=%u target_highest_slotid=%u " + "status_flags=%u (%s)", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_slotid, + __entry->target_highest_slotid, + __entry->status_flags, + show_nfs4_sequence_status_flags(__entry->status_flags) + ) +); + +struct cb_sequenceargs; +struct cb_sequenceres; + +TRACE_EVENT(nfs4_cb_sequence, + TP_PROTO( + const struct cb_sequenceargs *args, + const struct cb_sequenceres *res, + __be32 status + ), + TP_ARGS(args, res, status), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_slotid) + __field(unsigned int, cachethis) + __field(int, error) + ), + + TP_fast_assign( + __entry->session = nfs_session_id_hash(&args->csa_sessionid); + __entry->slot_nr = args->csa_slotid; + __entry->seq_nr = args->csa_sequenceid; + __entry->highest_slotid = args->csa_highestslotid; + __entry->cachethis = args->csa_cachethis; + __entry->error = -be32_to_cpu(status); + ), + + TP_printk( + "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "highest_slotid=%u", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_slotid + ) +); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_open_event, + TP_PROTO( + const struct nfs_open_context *ctx, + int flags, + int error + ), + + TP_ARGS(ctx, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(unsigned int, fmode) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, dir) + __string(name, ctx->dentry->d_name.name) + ), + + TP_fast_assign( + const struct nfs4_state *state = ctx->state; + const struct inode *inode = NULL; + + __entry->error = error; + __entry->flags = flags; + __entry->fmode = (__force unsigned int)ctx->mode; + __entry->dev = ctx->dentry->d_sb->s_dev; + if (!IS_ERR(state)) + inode = state->inode; + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + } else { + __entry->fileid = 0; + __entry->fhandle = 0; + } + __entry->dir = NFS_FILEID(ctx->dentry->d_parent->d_inode); + __assign_str(name, ctx->dentry->d_name.name); + ), + + TP_printk( + 
"error=%d (%s) flags=%d (%s) fmode=%s " + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "name=%02x:%02x:%llu/%s", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->flags, + show_open_flags(__entry->flags), + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS4_OPEN_EVENT(name) \ + DEFINE_EVENT(nfs4_open_event, name, \ + TP_PROTO( \ + const struct nfs_open_context *ctx, \ + int flags, \ + int error \ + ), \ + TP_ARGS(ctx, flags, error)) +DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim); +DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired); +DEFINE_NFS4_OPEN_EVENT(nfs4_open_file); + +TRACE_EVENT(nfs4_close, + TP_PROTO( + const struct nfs4_state *state, + const struct nfs_closeargs *args, + const struct nfs_closeres *res, + int error + ), + + TP_ARGS(state, args, res, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)state->state; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " + "fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->fmode ? show_fmode_flags(__entry->fmode) : + "closed", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define show_lock_cmd(type) \ + __print_symbolic((int)type, \ + { F_GETLK, "GETLK" }, \ + { F_SETLK, "SETLK" }, \ + { F_SETLKW, "SETLKW" }) +#define show_lock_type(type) \ + __print_symbolic((int)type, \ + { F_RDLCK, "RDLCK" }, \ + { F_WRLCK, "WRLCK" }, \ + { F_UNLCK, "UNLCK" }) + +DECLARE_EVENT_CLASS(nfs4_lock_event, + TP_PROTO( + const struct file_lock *request, + const struct nfs4_state *state, + int cmd, + int error + ), + + TP_ARGS(request, state, cmd, error), + + TP_STRUCT__entry( + __field(int, error) + __field(int, cmd) + __field(char, type) + __field(loff_t, start) + __field(loff_t, end) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->cmd = cmd; + __entry->type = request->fl_type; + __entry->start = request->fl_start; + __entry->end = request->fl_end; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + ), + + TP_printk( + "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "fileid=%02x:%02x:%llu fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + show_lock_cmd(__entry->cmd), + show_lock_type(__entry->type), + (long long)__entry->start, + (long long)__entry->end, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define DEFINE_NFS4_LOCK_EVENT(name) \ + DEFINE_EVENT(nfs4_lock_event, name, \ + TP_PROTO( \ + const struct file_lock *request, \ + const struct nfs4_state *state, \ + int cmd, \ + int error \ + ), \ + TP_ARGS(request, state, cmd, error)) +DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock); +DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock); +DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim); 
+DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired); +DEFINE_NFS4_LOCK_EVENT(nfs4_unlock); + +DECLARE_EVENT_CLASS(nfs4_set_delegation_event, + TP_PROTO( + const struct inode *inode, + fmode_t fmode + ), + + TP_ARGS(inode, fmode), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)fmode; + ), + + TP_printk( + "fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x", + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); +#define DEFINE_NFS4_SET_DELEGATION_EVENT(name) \ + DEFINE_EVENT(nfs4_set_delegation_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + fmode_t fmode \ + ), \ + TP_ARGS(inode, fmode)) +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation); +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation); + +TRACE_EVENT(nfs4_delegreturn_exit, + TP_PROTO( + const struct nfs4_delegreturnargs *args, + const struct nfs4_delegreturnres *res, + int error + ), + + TP_ARGS(args, res, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(int, error) + ), + + TP_fast_assign( + __entry->dev = res->server->s_dev; + __entry->fhandle = nfs_fhandle_hash(args->fhandle); + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) dev=%02x:%02x fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->fhandle + ) +); + +#ifdef CONFIG_NFS_V4_1 +DECLARE_EVENT_CLASS(nfs4_test_stateid_event, + TP_PROTO( + const struct nfs4_state *state, + const struct nfs4_lock_state *lsp, + int error + ), + + TP_ARGS(state, lsp, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define DEFINE_NFS4_TEST_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_test_stateid_event, name, \ + TP_PROTO( \ + const struct nfs4_state *state, \ + const struct nfs4_lock_state *lsp, \ + int error \ + ), \ + TP_ARGS(state, lsp, error)) +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_delegation_stateid); +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_open_stateid); +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_lock_stateid); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_lookup_event, + TP_PROTO( + const struct inode *dir, + const struct qstr *name, + int error + ), + + TP_ARGS(dir, name, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, dir) + __string(name, name->name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __assign_str(name, name->name); + ), + + TP_printk( + "error=%d (%s) name=%02x:%02x:%llu/%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + 
) +); + +#define DEFINE_NFS4_LOOKUP_EVENT(name) \ + DEFINE_EVENT(nfs4_lookup_event, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct qstr *name, \ + int error \ + ), \ + TP_ARGS(dir, name, error)) + +DEFINE_NFS4_LOOKUP_EVENT(nfs4_lookup); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_symlink); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_mkdir); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_mknod); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_remove); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_get_fs_locations); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_secinfo); + +TRACE_EVENT(nfs4_rename, + TP_PROTO( + const struct inode *olddir, + const struct qstr *oldname, + const struct inode *newdir, + const struct qstr *newname, + int error + ), + + TP_ARGS(olddir, oldname, newdir, newname, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, olddir) + __string(oldname, oldname->name) + __field(u64, newdir) + __string(newname, newname->name) + ), + + TP_fast_assign( + __entry->dev = olddir->i_sb->s_dev; + __entry->olddir = NFS_FILEID(olddir); + __entry->newdir = NFS_FILEID(newdir); + __entry->error = error; + __assign_str(oldname, oldname->name); + __assign_str(newname, newname->name); + ), + + TP_printk( + "error=%d (%s) oldname=%02x:%02x:%llu/%s " + "newname=%02x:%02x:%llu/%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->olddir, + __get_str(oldname), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->newdir, + __get_str(newname) + ) +); + +DECLARE_EVENT_CLASS(nfs4_inode_event, + TP_PROTO( + const struct inode *inode, + int error + ), + + TP_ARGS(inode, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, error) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define DEFINE_NFS4_INODE_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + int error \ + ), \ + TP_ARGS(inode, error)) + +DEFINE_NFS4_INODE_EVENT(nfs4_setattr); +DEFINE_NFS4_INODE_EVENT(nfs4_access); +DEFINE_NFS4_INODE_EVENT(nfs4_readlink); +DEFINE_NFS4_INODE_EVENT(nfs4_readdir); +DEFINE_NFS4_INODE_EVENT(nfs4_get_acl); +DEFINE_NFS4_INODE_EVENT(nfs4_set_acl); +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); +DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label); +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ +DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation); +DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn); + +DECLARE_EVENT_CLASS(nfs4_getattr_event, + TP_PROTO( + const struct nfs_server *server, + const struct nfs_fh *fhandle, + const struct nfs_fattr *fattr, + int error + ), + + TP_ARGS(server, fhandle, fattr, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, valid) + __field(int, error) + ), + + TP_fast_assign( + __entry->dev = server->s_dev; + __entry->valid = fattr->valid; + __entry->fhandle = nfs_fhandle_hash(fhandle); + __entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? 
fattr->fileid : 0; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "valid=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_nfs_fattr_flags(__entry->valid) + ) +); + +#define DEFINE_NFS4_GETATTR_EVENT(name) \ + DEFINE_EVENT(nfs4_getattr_event, name, \ + TP_PROTO( \ + const struct nfs_server *server, \ + const struct nfs_fh *fhandle, \ + const struct nfs_fattr *fattr, \ + int error \ + ), \ + TP_ARGS(server, fhandle, fattr, error)) +DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr); +DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root); +DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo); + +DECLARE_EVENT_CLASS(nfs4_idmap_event, + TP_PROTO( + const char *name, + int len, + u32 id, + int error + ), + + TP_ARGS(name, len, id, error), + + TP_STRUCT__entry( + __field(int, error) + __field(u32, id) + __dynamic_array(char, name, len > 0 ? len + 1 : 1) + ), + + TP_fast_assign( + if (len < 0) + len = 0; + __entry->error = error < 0 ? error : 0; + __entry->id = id; + memcpy(__get_dynamic_array(name), name, len); + ((char *)__get_dynamic_array(name))[len] = 0; + ), + + TP_printk( + "error=%d id=%u name=%s", + __entry->error, + __entry->id, + __get_str(name) + ) +); +#define DEFINE_NFS4_IDMAP_EVENT(name) \ + DEFINE_EVENT(nfs4_idmap_event, name, \ + TP_PROTO( \ + const char *name, \ + int len, \ + u32 id, \ + int error \ + ), \ + TP_ARGS(name, len, id, error)) +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_name_to_uid); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); + +DECLARE_EVENT_CLASS(nfs4_read_event, + TP_PROTO( + const struct nfs_read_data *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(size_t, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = data->header->inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + __entry->count + ) +); +#define DEFINE_NFS4_READ_EVENT(name) \ + DEFINE_EVENT(nfs4_read_event, name, \ + TP_PROTO( \ + const struct nfs_read_data *data, \ + int error \ + ), \ + TP_ARGS(data, error)) +DEFINE_NFS4_READ_EVENT(nfs4_read); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_write_event, + TP_PROTO( + const struct nfs_write_data *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(size_t, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = data->header->inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->error = error; + ), + + 
TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + __entry->count + ) +); + +#define DEFINE_NFS4_WRITE_EVENT(name) \ + DEFINE_EVENT(nfs4_write_event, name, \ + TP_PROTO( \ + const struct nfs_write_data *data, \ + int error \ + ), \ + TP_ARGS(data, error)) +DEFINE_NFS4_WRITE_EVENT(nfs4_write); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_commit_event, + TP_PROTO( + const struct nfs_commit_data *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(size_t, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + __entry->count + ) +); +#define DEFINE_NFS4_COMMIT_EVENT(name) \ + DEFINE_EVENT(nfs4_commit_event, name, \ + TP_PROTO( \ + const struct nfs_commit_data *data, \ + int error \ + ), \ + TP_ARGS(data, error)) +DEFINE_NFS4_COMMIT_EVENT(nfs4_commit); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds); + +#define show_pnfs_iomode(iomode) \ + __print_symbolic(iomode, \ + { IOMODE_READ, "READ" }, \ + { IOMODE_RW, "RW" }, \ + { IOMODE_ANY, "ANY" }) + +TRACE_EVENT(nfs4_layoutget, + TP_PROTO( + const struct nfs_open_context *ctx, + const struct pnfs_layout_range *args, + const struct pnfs_layout_range *res, + int error + ), + + TP_ARGS(ctx, args, res, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u32, iomode) + __field(u64, offset) + __field(u64, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = ctx->dentry->d_inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->iomode = args->iomode; + __entry->offset = args->offset; + __entry->count = args->length; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s offset=%llu count=%llu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_iomode(__entry->iomode), + (unsigned long long)__entry->offset, + (unsigned long long)__entry->count + ) +); + +DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); +DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); + +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* _TRACE_NFS4_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
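Two notes on the tracepoint machinery defined above. First, the error=%d (%s) convention in the TP_printk strings relies on __print_symbolic(), which defers the value-to-name lookup until the trace buffer is read rather than paying for it in the tracing hot path. A user-space analogue of what show_nfsv4_errors does, with only a few table entries shown (NFS4ERR_WRONGSEC is 10016):

	#include <stdio.h>

	struct sym { int val; const char *name; };

	/* linear value->name scan, performed at read time like the
	 * trace formatter, not when the event fires */
	static const char *show_error(int error)
	{
		static const struct sym tab[] = {
			{ 0,      "OK" },
			{ -5,     "EIO" },	/* -EIO */
			{ -10016, "WRONGSEC" },	/* -NFS4ERR_WRONGSEC */
		};
		unsigned int i;

		for (i = 0; i < sizeof(tab) / sizeof(tab[0]); i++)
			if (tab[i].val == error)
				return tab[i].name;
		return "UNKNOWN";
	}

	int main(void)
	{
		printf("error=%d (%s)\n", -10016, show_error(-10016));
		return 0;
	}

Second, the DECLARE_EVENT_CLASS/DEFINE_EVENT split used throughout the header is a size optimization: the class carries the record layout, assignment logic and format string once, and each DEFINE_EVENT stamps out a tracepoint that reuses them. Adding another lock event would cost a single line, for example (hypothetical event name, not part of this patch):

	DEFINE_NFS4_LOCK_EVENT(nfs4_lock_wait);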
+#define TRACE_INCLUDE_FILE nfs4trace +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3850b018815f..fbdad9e1719f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -294,7 +294,9 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ 1 /* flags */ + \ 1 /* spa_how */ + \ - 0 /* SP4_NONE (for now) */ + \ + /* max is SP4_MACH_CRED (for now) */ + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ 1 /* implementation id array of size 1 */ + \ 1 /* nii_domain */ + \ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ @@ -306,7 +308,9 @@ static int nfs4_stat_to_errno(int); 1 /* eir_sequenceid */ + \ 1 /* eir_flags */ + \ 1 /* spr_how */ + \ - 0 /* SP4_NONE (for now) */ + \ + /* max is SP4_MACH_CRED (for now) */ + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ 2 /* eir_server_owner.so_minor_id */ + \ /* eir_server_owner.so_major_id<> */ \ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ @@ -997,12 +1001,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, int owner_namelen = 0; int owner_grouplen = 0; __be32 *p; - __be32 *q; - int len; - uint32_t bmval_len = 2; - uint32_t bmval0 = 0; - uint32_t bmval1 = 0; - uint32_t bmval2 = 0; + unsigned i; + uint32_t len = 0; + uint32_t bmval_len; + uint32_t bmval[3] = { 0 }; /* * We reserve enough space to write the entire attribute buffer at once. @@ -1011,13 +1013,14 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, * = 40 bytes, plus any contribution from variable-length fields * such as owner/group. */ - len = 8; - - /* Sigh */ - if (iap->ia_valid & ATTR_SIZE) + if (iap->ia_valid & ATTR_SIZE) { + bmval[0] |= FATTR4_WORD0_SIZE; len += 8; - if (iap->ia_valid & ATTR_MODE) + } + if (iap->ia_valid & ATTR_MODE) { + bmval[1] |= FATTR4_WORD1_MODE; len += 4; + } if (iap->ia_valid & ATTR_UID) { owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { @@ -1028,6 +1031,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, owner_namelen = sizeof("nobody") - 1; /* goto out; */ } + bmval[1] |= FATTR4_WORD1_OWNER; len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } if (iap->ia_valid & ATTR_GID) { @@ -1039,92 +1043,73 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, owner_grouplen = sizeof("nobody") - 1; /* goto out; */ } + bmval[1] |= FATTR4_WORD1_OWNER_GROUP; len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); } - if (iap->ia_valid & ATTR_ATIME_SET) + if (iap->ia_valid & ATTR_ATIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; len += 16; - else if (iap->ia_valid & ATTR_ATIME) + } else if (iap->ia_valid & ATTR_ATIME) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; len += 4; - if (iap->ia_valid & ATTR_MTIME_SET) + } + if (iap->ia_valid & ATTR_MTIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 16; - else if (iap->ia_valid & ATTR_MTIME) + } else if (iap->ia_valid & ATTR_MTIME) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 4; + } if (label) { len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); - bmval_len = 3; + bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } - len += bmval_len << 2; - p = reserve_space(xdr, len); + if (bmval[2] != 0) + bmval_len = 3; + else if (bmval[1] != 0) + bmval_len = 2; + else + bmval_len = 1; + + p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len); - /* - * We write the bitmap length now, but leave the bitmap and the attribute - * buffer length to be backfilled at the 
end of this routine. - */ *p++ = cpu_to_be32(bmval_len); - q = p; - /* Skip bitmap entries + attrlen */ - p += bmval_len + 1; + for (i = 0; i < bmval_len; i++) + *p++ = cpu_to_be32(bmval[i]); + *p++ = cpu_to_be32(len); - if (iap->ia_valid & ATTR_SIZE) { - bmval0 |= FATTR4_WORD0_SIZE; + if (bmval[0] & FATTR4_WORD0_SIZE) p = xdr_encode_hyper(p, iap->ia_size); - } - if (iap->ia_valid & ATTR_MODE) { - bmval1 |= FATTR4_WORD1_MODE; + if (bmval[1] & FATTR4_WORD1_MODE) *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); - } - if (iap->ia_valid & ATTR_UID) { - bmval1 |= FATTR4_WORD1_OWNER; + if (bmval[1] & FATTR4_WORD1_OWNER) p = xdr_encode_opaque(p, owner_name, owner_namelen); - } - if (iap->ia_valid & ATTR_GID) { - bmval1 |= FATTR4_WORD1_OWNER_GROUP; + if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) p = xdr_encode_opaque(p, owner_group, owner_grouplen); + if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + if (iap->ia_valid & ATTR_ATIME_SET) { + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); + *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + } else + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - if (iap->ia_valid & ATTR_ATIME_SET) { - bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); - *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); - } - else if (iap->ia_valid & ATTR_ATIME) { - bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); - } - if (iap->ia_valid & ATTR_MTIME_SET) { - bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); - *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); - } - else if (iap->ia_valid & ATTR_MTIME) { - bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); + if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + if (iap->ia_valid & ATTR_MTIME_SET) { + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + } else + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - if (label) { - bmval2 |= FATTR4_WORD2_SECURITY_LABEL; + if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { *p++ = cpu_to_be32(label->lfs); *p++ = cpu_to_be32(label->pi); *p++ = cpu_to_be32(label->len); p = xdr_encode_opaque_fixed(p, label->label, label->len); } - /* - * Now we backfill the bitmap and the attribute buffer length. 
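 * (Editor's aside: the rewrite above is why this backfill block can go —
 *  the bmval[] words and the attribute length are now computed from
 *  iap and label before reserve_space(), so bitmap, attrlen and payload
 *  are emitted in a single forward pass and the runtime length check
 *  below becomes unnecessary. The new ordering, sketched:
 *
 *	compute bmval[0..2] and len from iap and label;
 *	p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len);
 *	*p++ = cpu_to_be32(bmval_len);
 *	for (i = 0; i < bmval_len; i++)
 *		*p++ = cpu_to_be32(bmval[i]);
 *	*p++ = cpu_to_be32(len);
 *	... then each attribute, guarded by its bmval bit ...
 *  )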
- */ - if (len != ((char *)p - (char *)q) + 4) { - printk(KERN_ERR "NFS: Attr length error, %u != %Zu\n", - len, ((char *)p - (char *)q) + 4); - BUG(); - } - *q++ = htonl(bmval0); - *q++ = htonl(bmval1); - if (bmval_len == 3) - *q++ = htonl(bmval2); - len = (char *)p - (char *)(q + 1); - *q = htonl(len); - /* out: */ } @@ -1745,6 +1730,14 @@ static void encode_bind_conn_to_session(struct xdr_stream *xdr, *p = 0; /* use_conn_in_rdma_mode = False */ } +static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) +{ + unsigned int i; + encode_uint32(xdr, NFS4_OP_MAP_NUM_WORDS); + for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) + encode_uint32(xdr, op_map->u.words[i]); +} + static void encode_exchange_id(struct xdr_stream *xdr, struct nfs41_exchange_id_args *args, struct compound_hdr *hdr) @@ -1758,9 +1751,20 @@ static void encode_exchange_id(struct xdr_stream *xdr, encode_string(xdr, args->id_len, args->id); - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(args->flags); - *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ + encode_uint32(xdr, args->flags); + encode_uint32(xdr, args->state_protect.how); + + switch (args->state_protect.how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + encode_op_map(xdr, &args->state_protect.enforce); + encode_op_map(xdr, &args->state_protect.allow); + break; + default: + WARN_ON_ONCE(1); + break; + } if (send_implementation_id && sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && @@ -1771,7 +1775,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, utsname()->version, utsname()->machine); if (len > 0) { - *p = cpu_to_be32(1); /* implementation id array length=1 */ + encode_uint32(xdr, 1); /* implementation id array length=1 */ encode_string(xdr, sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1, @@ -1782,7 +1786,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, p = xdr_encode_hyper(p, 0); *p = cpu_to_be32(0); } else - *p = cpu_to_be32(0); /* implementation id array length=0 */ + encode_uint32(xdr, 0); /* implementation id array length=0 */ } static void encode_create_session(struct xdr_stream *xdr, @@ -1835,7 +1839,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - *p++ = (__be32)nn->boot_time.tv_nsec; /* stamp */ + *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ p = xdr_encode_opaque(p, machine_name, len); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ @@ -1877,11 +1881,10 @@ static void encode_sequence(struct xdr_stream *xdr, struct nfs4_slot *slot = args->sa_slot; __be32 *p; - if (slot == NULL) - return; - tp = slot->table; session = tp->session; + if (!session) + return; encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); @@ -2062,9 +2065,9 @@ static void encode_free_stateid(struct xdr_stream *xdr, static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) { #if defined(CONFIG_NFS_V4_1) - - if (args->sa_slot) - return args->sa_slot->table->session->clp->cl_mvops->minor_version; + struct nfs4_session *session = args->sa_slot->table->session; + if (session) + return session->clp->cl_mvops->minor_version; #endif /* CONFIG_NFS_V4_1 */ return 0; } @@ -4649,7 +4652,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, uint32_t *layouttype) { - uint32_t *p; + __be32 *p; int num; p = xdr_inline_decode(xdr, 4); @@ -5394,6 +5397,23 @@ static int decode_secinfo_no_name(struct xdr_stream 
*xdr, struct nfs4_secinfo_re return decode_secinfo_common(xdr, res); } +static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) +{ + __be32 *p; + uint32_t bitmap_words; + unsigned int i; + + p = xdr_inline_decode(xdr, 4); + bitmap_words = be32_to_cpup(p++); + if (bitmap_words > NFS4_OP_MAP_NUM_WORDS) + return -EIO; + p = xdr_inline_decode(xdr, 4 * bitmap_words); + for (i = 0; i < bitmap_words; i++) + op_map->u.words[i] = be32_to_cpup(p++); + + return 0; +} + static int decode_exchange_id(struct xdr_stream *xdr, struct nfs41_exchange_id_res *res) { @@ -5417,10 +5437,22 @@ static int decode_exchange_id(struct xdr_stream *xdr, res->seqid = be32_to_cpup(p++); res->flags = be32_to_cpup(p++); - /* We ask for SP4_NONE */ - dummy = be32_to_cpup(p); - if (dummy != SP4_NONE) + res->state_protect.how = be32_to_cpup(p); + switch (res->state_protect.how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + status = decode_op_map(xdr, &res->state_protect.enforce); + if (status) + return status; + status = decode_op_map(xdr, &res->state_protect.allow); + if (status) + return status; + break; + default: + WARN_ON_ONCE(1); return -EIO; + } /* server_owner4.so_minor_id */ p = xdr_inline_decode(xdr, 8); @@ -5614,6 +5646,8 @@ static int decode_sequence(struct xdr_stream *xdr, if (res->sr_slot == NULL) return 0; + if (!res->sr_slot->table->session) + return 0; status = decode_op_hdr(xdr, OP_SEQUENCE); if (!status) diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c new file mode 100644 index 000000000000..4eb0aead69b6 --- /dev/null +++ b/fs/nfs/nfstrace.c @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/nfs_fs.h> +#include <linux/namei.h> +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include "nfstrace.h" diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h new file mode 100644 index 000000000000..89fe741e58b1 --- /dev/null +++ b/fs/nfs/nfstrace.h @@ -0,0 +1,729 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs + +#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS_H + +#include <linux/tracepoint.h> + +#define nfs_show_file_type(ftype) \ + __print_symbolic(ftype, \ + { DT_UNKNOWN, "UNKNOWN" }, \ + { DT_FIFO, "FIFO" }, \ + { DT_CHR, "CHR" }, \ + { DT_DIR, "DIR" }, \ + { DT_BLK, "BLK" }, \ + { DT_REG, "REG" }, \ + { DT_LNK, "LNK" }, \ + { DT_SOCK, "SOCK" }, \ + { DT_WHT, "WHT" }) + +#define nfs_show_cache_validity(v) \ + __print_flags(v, "|", \ + { NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \ + { NFS_INO_INVALID_DATA, "INVALID_DATA" }, \ + { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ + { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ + { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ + { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \ + { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ + { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }) + +#define nfs_show_nfsi_flags(v) \ + __print_flags(v, "|", \ + { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ + { 1 << NFS_INO_STALE, "STALE" }, \ + { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ + { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ + { 1 << NFS_INO_COMMIT, "COMMIT" }, \ + { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ + { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) + +DECLARE_EVENT_CLASS(nfs_inode_event, + TP_PROTO( + const struct inode *inode + ), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + ), + + 
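Editor's note on decode_op_map() in the nfs4xdr.c hunk above (sketch, not part of the patch): a counted array coming off the wire is only safe to copy after the advertised count has been checked against the local buffer. The shape in isolation, with hypothetical names:

        uint32_t n = be32_to_cpup(p++);         /* word count sent by the peer */
        if (n > NFS4_OP_MAP_NUM_WORDS)
                return -EIO;                    /* never trust the peer's count */
        for (i = 0; i < n; i++)
                words[i] = be32_to_cpup(p++);

A hardened variant would also check each xdr_inline_decode() result for NULL before dereferencing p.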
TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode->i_version; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (unsigned long long)__entry->version + ) +); + +DECLARE_EVENT_CLASS(nfs_inode_event_done, + TP_PROTO( + const struct inode *inode, + int error + ), + + TP_ARGS(inode, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(unsigned char, type) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, size) + __field(unsigned long, nfsi_flags) + __field(unsigned long, cache_validity) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->error = error; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->type = nfs_umode_to_dtype(inode->i_mode); + __entry->version = inode->i_version; + __entry->size = i_size_read(inode); + __entry->nfsi_flags = nfsi->flags; + __entry->cache_validity = nfsi->cache_validity; + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "type=%u (%s) version=%llu size=%lld " + "cache_validity=%lu (%s) nfs_flags=%ld (%s)", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->type, + nfs_show_file_type(__entry->type), + (unsigned long long)__entry->version, + (long long)__entry->size, + __entry->cache_validity, + nfs_show_cache_validity(__entry->cache_validity), + __entry->nfsi_flags, + nfs_show_nfsi_flags(__entry->nfsi_flags) + ) +); + +#define DEFINE_NFS_INODE_EVENT(name) \ + DEFINE_EVENT(nfs_inode_event, name, \ + TP_PROTO( \ + const struct inode *inode \ + ), \ + TP_ARGS(inode)) +#define DEFINE_NFS_INODE_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_inode_event_done, name, \ + TP_PROTO( \ + const struct inode *inode, \ + int error \ + ), \ + TP_ARGS(inode, error)) +DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit); +DEFINE_NFS_INODE_EVENT(nfs_getattr_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit); +DEFINE_NFS_INODE_EVENT(nfs_setattr_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit); +DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit); +DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); +DEFINE_NFS_INODE_EVENT(nfs_access_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit); + +#define show_lookup_flags(flags) \ + __print_flags((unsigned long)flags, "|", \ + { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \ + { LOOKUP_DIRECTORY, "DIRECTORY" }, \ + { LOOKUP_OPEN, "OPEN" }, \ + { LOOKUP_CREATE, "CREATE" }, \ + { LOOKUP_EXCL, "EXCL" }) + +DECLARE_EVENT_CLASS(nfs_lookup_event, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags + ), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(unsigned int, flags) + 
__field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->flags, + show_lookup_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_LOOKUP_EVENT(name) \ + DEFINE_EVENT(nfs_lookup_event, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry, \ + unsigned int flags \ + ), \ + TP_ARGS(dir, dentry, flags)) + +DECLARE_EVENT_CLASS(nfs_lookup_event_done, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags, + int error + ), + + TP_ARGS(dir, dentry, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->error, + __entry->flags, + show_lookup_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_lookup_event_done, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry, \ + unsigned int flags, \ + int error \ + ), \ + TP_ARGS(dir, dentry, flags, error)) + +DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit); +DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit); + +#define show_open_flags(flags) \ + __print_flags((unsigned long)flags, "|", \ + { O_CREAT, "O_CREAT" }, \ + { O_EXCL, "O_EXCL" }, \ + { O_TRUNC, "O_TRUNC" }, \ + { O_APPEND, "O_APPEND" }, \ + { O_DSYNC, "O_DSYNC" }, \ + { O_DIRECT, "O_DIRECT" }, \ + { O_DIRECTORY, "O_DIRECTORY" }) + +#define show_fmode_flags(mode) \ + __print_flags(mode, "|", \ + { ((__force unsigned long)FMODE_READ), "READ" }, \ + { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ + { ((__force unsigned long)FMODE_EXEC), "EXEC" }) + +TRACE_EVENT(nfs_atomic_open_enter, + TP_PROTO( + const struct inode *dir, + const struct nfs_open_context *ctx, + unsigned int flags + ), + + TP_ARGS(dir, ctx, flags), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __field(unsigned int, fmode) + __field(dev_t, dev) + __field(u64, dir) + __string(name, ctx->dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __entry->fmode = (__force unsigned int)ctx->mode; + __assign_str(name, ctx->dentry->d_name.name); + ), + + TP_printk( + "flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s", + __entry->flags, + show_open_flags(__entry->flags), + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_atomic_open_exit, + TP_PROTO( + const struct inode *dir, + const struct nfs_open_context *ctx, + unsigned int flags, + int error + ), + + TP_ARGS(dir, ctx, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(unsigned 
int, fmode) + __field(dev_t, dev) + __field(u64, dir) + __string(name, ctx->dentry->d_name.name) + ), + + TP_fast_assign( + __entry->error = error; + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __entry->fmode = (__force unsigned int)ctx->mode; + __assign_str(name, ctx->dentry->d_name.name); + ), + + TP_printk( + "error=%d flags=%u (%s) fmode=%s " + "name=%02x:%02x:%llu/%s", + __entry->error, + __entry->flags, + show_open_flags(__entry->flags), + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_create_enter, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags + ), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->flags, + show_open_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_create_exit, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags, + int error + ), + + TP_ARGS(dir, dentry, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->error = error; + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->error, + __entry->flags, + show_open_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +DECLARE_EVENT_CLASS(nfs_directory_event, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry + ), + + TP_ARGS(dir, dentry), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "name=%02x:%02x:%llu/%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_DIRECTORY_EVENT(name) \ + DEFINE_EVENT(nfs_directory_event, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry \ + ), \ + TP_ARGS(dir, dentry)) + +DECLARE_EVENT_CLASS(nfs_directory_event_done, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + int error + ), + + TP_ARGS(dir, dentry, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_directory_event_done, 
name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry, \ + int error \ + ), \ + TP_ARGS(dir, dentry, error)) + +DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit); + +TRACE_EVENT(nfs_link_enter, + TP_PROTO( + const struct inode *inode, + const struct inode *dir, + const struct dentry *dentry + ), + + TP_ARGS(inode, dir, dentry), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->dir = NFS_FILEID(dir); + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->fileid, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_link_exit, + TP_PROTO( + const struct inode *inode, + const struct inode *dir, + const struct dentry *dentry, + int error + ), + + TP_ARGS(inode, dir, dentry, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u64, fileid) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->fileid, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +DECLARE_EVENT_CLASS(nfs_rename_event, + TP_PROTO( + const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry + ), + + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, old_dir) + __field(u64, new_dir) + __string(old_name, old_dentry->d_name.name) + __string(new_name, new_dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = old_dir->i_sb->s_dev; + __entry->old_dir = NFS_FILEID(old_dir); + __entry->new_dir = NFS_FILEID(new_dir); + __assign_str(old_name, old_dentry->d_name.name); + __assign_str(new_name, new_dentry->d_name.name); + ), + + TP_printk( + "old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->old_dir, + __get_str(old_name), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->new_dir, + __get_str(new_name) + ) +); +#define DEFINE_NFS_RENAME_EVENT(name) \ + DEFINE_EVENT(nfs_rename_event, name, \ + TP_PROTO( \ + const struct inode *old_dir, \ + const struct dentry *old_dentry, \ + const struct inode *new_dir, \ + const struct dentry *new_dentry \ + ), \ + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry)) + +DECLARE_EVENT_CLASS(nfs_rename_event_done, + 
TP_PROTO( + const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, + int error + ), + + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, old_dir) + __string(old_name, old_dentry->d_name.name) + __field(u64, new_dir) + __string(new_name, new_dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = old_dir->i_sb->s_dev; + __entry->old_dir = NFS_FILEID(old_dir); + __entry->new_dir = NFS_FILEID(new_dir); + __entry->error = error; + __assign_str(old_name, old_dentry->d_name.name); + __assign_str(new_name, new_dentry->d_name.name); + ), + + TP_printk( + "error=%d old_name=%02x:%02x:%llu/%s " + "new_name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->old_dir, + __get_str(old_name), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->new_dir, + __get_str(new_name) + ) +); +#define DEFINE_NFS_RENAME_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_rename_event_done, name, \ + TP_PROTO( \ + const struct inode *old_dir, \ + const struct dentry *old_dentry, \ + const struct inode *new_dir, \ + const struct dentry *new_dentry, \ + int error \ + ), \ + TP_ARGS(old_dir, old_dentry, new_dir, \ + new_dentry, error)) + +DEFINE_NFS_RENAME_EVENT(nfs_rename_enter); +DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit); + +DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename); + +TRACE_EVENT(nfs_sillyrename_unlink, + TP_PROTO( + const struct nfs_unlinkdata *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, dir) + __dynamic_array(char, name, data->args.name.len + 1) + ), + + TP_fast_assign( + struct inode *dir = data->dir; + size_t len = data->args.name.len; + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + memcpy(__get_dynamic_array(name), + data->args.name.name, len); + ((char *)__get_dynamic_array(name))[len] = 0; + ), + + TP_printk( + "error=%d name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); +#endif /* _TRACE_NFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
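Editor's note (usage illustration, not part of the patch): the _enter/_exit pairs defined in this header are meant to bracket an operation so that the exit event records its result; the fs/nfs/write.c hunk later in this diff wires writeback up exactly this way:

        trace_nfs_writeback_inode_enter(inode);
        ret = sync_inode(inode, &wbc);
        trace_nfs_writeback_inode_exit(inode, ret);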
+#define TRACE_INCLUDE_FILE nfstrace +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 29cfb7ade121..2ffebf2081ce 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -328,6 +328,19 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, } EXPORT_SYMBOL_GPL(nfs_pageio_init); +static bool nfs_match_open_context(const struct nfs_open_context *ctx1, + const struct nfs_open_context *ctx2) +{ + return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; +} + +static bool nfs_match_lock_context(const struct nfs_lock_context *l1, + const struct nfs_lock_context *l2) +{ + return l1->lockowner.l_owner == l2->lockowner.l_owner + && l1->lockowner.l_pid == l2->lockowner.l_pid; +} + /** * nfs_can_coalesce_requests - test two requests for compatibility * @prev: pointer to nfs_page @@ -343,13 +356,10 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, struct nfs_page *req, struct nfs_pageio_descriptor *pgio) { - if (req->wb_context->cred != prev->wb_context->cred) - return false; - if (req->wb_lock_context->lockowner.l_owner != prev->wb_lock_context->lockowner.l_owner) - return false; - if (req->wb_lock_context->lockowner.l_pid != prev->wb_lock_context->lockowner.l_pid) + if (!nfs_match_open_context(req->wb_context, prev->wb_context)) return false; - if (req->wb_context->state != prev->wb_context->state) + if (req->wb_context->dentry->d_inode->i_flock != NULL && + !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) return false; if (req->wb_pgbase != 0) return false; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3a3a79d6bf15..d75d938d36cb 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -33,6 +33,7 @@ #include "internal.h" #include "pnfs.h" #include "iostat.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) @@ -1526,6 +1527,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data) { struct nfs_pgio_header *hdr = data->header; + trace_nfs4_pnfs_write(data, hdr->pnfs_error); if (!hdr->pnfs_error) { pnfs_set_layoutcommit(data); hdr->mds_ops->rpc_call_done(&data->task, data); @@ -1680,6 +1682,7 @@ void pnfs_ld_read_done(struct nfs_read_data *data) { struct nfs_pgio_header *hdr = data->header; + trace_nfs4_pnfs_read(data, hdr->pnfs_error); if (likely(!hdr->pnfs_error)) { __nfs4_read_done_cb(data); hdr->mds_ops->rpc_call_done(&data->task, data); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index c041c41f7a52..a8f57c728df5 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -623,9 +623,10 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message * msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; } -static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { rpc_call_start(task); + return 0; } static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) @@ -644,9 +645,10 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; } -static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { rpc_call_start(task); + return 0; } static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 
70a26c651f09..31db5c366b81 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -513,9 +513,10 @@ static void nfs_readpage_release_common(void *calldata) void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) - rpc_exit(task, -EIO); + int err; + err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); + if (err) + rpc_exit(task, err); } static const struct rpc_call_ops nfs_read_common_ops = { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 71fdc0dfa0d2..5793f24613c8 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -923,7 +923,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data->nfs_server.port = NFS_UNSPEC_PORT; data->nfs_server.protocol = XPRT_TRANSPORT_TCP; data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR; - data->auth_flavor_len = 1; + data->auth_flavor_len = 0; data->minorversion = 0; data->need_mount = true; data->net = current->nsproxy->net_ns; @@ -1018,6 +1018,13 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) } } +static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data, + rpc_authflavor_t pseudoflavor) +{ + data->auth_flavors[0] = pseudoflavor; + data->auth_flavor_len = 1; +} + /* * Parse the value of the 'sec=' option. */ @@ -1025,49 +1032,50 @@ static int nfs_parse_security_flavors(char *value, struct nfs_parsed_mount_data *mnt) { substring_t args[MAX_OPT_ARGS]; + rpc_authflavor_t pseudoflavor; dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); switch (match_token(value, nfs_secflavor_tokens, args)) { case Opt_sec_none: - mnt->auth_flavors[0] = RPC_AUTH_NULL; + pseudoflavor = RPC_AUTH_NULL; break; case Opt_sec_sys: - mnt->auth_flavors[0] = RPC_AUTH_UNIX; + pseudoflavor = RPC_AUTH_UNIX; break; case Opt_sec_krb5: - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; + pseudoflavor = RPC_AUTH_GSS_KRB5; break; case Opt_sec_krb5i: - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; + pseudoflavor = RPC_AUTH_GSS_KRB5I; break; case Opt_sec_krb5p: - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; + pseudoflavor = RPC_AUTH_GSS_KRB5P; break; case Opt_sec_lkey: - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; + pseudoflavor = RPC_AUTH_GSS_LKEY; break; case Opt_sec_lkeyi: - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; + pseudoflavor = RPC_AUTH_GSS_LKEYI; break; case Opt_sec_lkeyp: - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; + pseudoflavor = RPC_AUTH_GSS_LKEYP; break; case Opt_sec_spkm: - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; + pseudoflavor = RPC_AUTH_GSS_SPKM; break; case Opt_sec_spkmi: - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; + pseudoflavor = RPC_AUTH_GSS_SPKMI; break; case Opt_sec_spkmp: - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; + pseudoflavor = RPC_AUTH_GSS_SPKMP; break; default: return 0; } mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; + nfs_set_auth_parsed_mount_data(mnt, pseudoflavor); return 1; } @@ -1729,7 +1737,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf * Was a sec= authflavor specified in the options? First, verify * whether the server supports it, and then just try to use it if so. 
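 * (Editor's aside: after this patch, "did the user specify sec=?" becomes
 *  a length test instead of a magic-flavor test — auth_flavor_len starts
 *  at 0 and nfs_set_auth_parsed_mount_data() is the one place that sets a
 *  flavor, so RPC_AUTH_MAXFLAVOR stops doubling as an "unset" sentinel.
 *  The invariant, sketched from the hunks above:
 *
 *	nfs_set_auth_parsed_mount_data(args, flavor);	<- sets len = 1
 *	...
 *	if (args->auth_flavor_len > 0)			<- sec= was given
 *		status = nfs_verify_authflavor(args, authlist, authlist_len);
 *  )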
*/ - if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) { + if (args->auth_flavor_len > 0) { status = nfs_verify_authflavor(args, authlist, authlist_len); dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); if (status) @@ -1760,7 +1768,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf /* Fallthrough */ } dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); - args->auth_flavors[0] = flavor; + nfs_set_auth_parsed_mount_data(args, flavor); server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); if (!IS_ERR(server)) return server; @@ -1776,7 +1784,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf /* Last chance! Try AUTH_UNIX */ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); - args->auth_flavors[0] = RPC_AUTH_UNIX; + nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); } @@ -1893,6 +1901,7 @@ static int nfs23_validate_mount_data(void *options, { struct nfs_mount_data *data = (struct nfs_mount_data *)options; struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; if (data == NULL) goto out_no_data; @@ -1908,6 +1917,8 @@ static int nfs23_validate_mount_data(void *options, goto out_no_v3; data->root.size = NFS2_FHSIZE; memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + /* Turn off security negotiation */ + extra_flags |= NFS_MOUNT_SECFLAVOUR; case 4: if (data->flags & NFS_MOUNT_SECFLAVOUR) goto out_no_sec; @@ -1935,7 +1946,7 @@ static int nfs23_validate_mount_data(void *options, * can deal with. */ args->flags = data->flags & NFS_MOUNT_FLAGMASK; - args->flags |= NFS_MOUNT_LEGACY_INTERFACE; + args->flags |= extra_flags; args->rsize = data->rsize; args->wsize = data->wsize; args->timeo = data->timeo; @@ -1959,9 +1970,10 @@ static int nfs23_validate_mount_data(void *options, args->namlen = data->namlen; args->bsize = data->bsize; - args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->flags & NFS_MOUNT_SECFLAVOUR) - args->auth_flavors[0] = data->pseudoflavor; + nfs_set_auth_parsed_mount_data(args, data->pseudoflavor); + else + nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); if (!args->nfs_server.hostname) goto out_nomem; @@ -2084,6 +2096,8 @@ static int nfs_validate_text_mount_data(void *options, max_namelen = NFS4_MAXNAMLEN; max_pathlen = NFS4_MAXPATHLEN; nfs_validate_transport_protocol(args); + if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) + goto out_invalid_transport_udp; nfs4_validate_mount_flags(args); #else goto out_v4_not_compiled; @@ -2106,6 +2120,10 @@ static int nfs_validate_text_mount_data(void *options, out_v4_not_compiled: dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); return -EPROTONOSUPPORT; +#else +out_invalid_transport_udp: + dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); + return -EINVAL; #endif /* !CONFIG_NFS_V4 */ out_no_address: @@ -2170,7 +2188,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->rsize = nfss->rsize; data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; - data->auth_flavors[0] = nfss->client->cl_auth->au_flavor; + nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor); data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; @@ -2277,6 +2295,18 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) 
nfs_initialise_sb(sb); } +#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ + | NFS_MOUNT_SECURE \ + | NFS_MOUNT_TCP \ + | NFS_MOUNT_VER3 \ + | NFS_MOUNT_KERBEROS \ + | NFS_MOUNT_NONLM \ + | NFS_MOUNT_BROKEN_SUID \ + | NFS_MOUNT_STRICTLOCK \ + | NFS_MOUNT_UNSHARED \ + | NFS_MOUNT_NORESVPORT \ + | NFS_MOUNT_LEGACY_INTERFACE) + static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) { const struct nfs_server *a = s->s_fs_info; @@ -2287,7 +2317,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->nfs_client != b->nfs_client) goto Ebusy; - if (a->flags != b->flags) + if ((a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK) goto Ebusy; if (a->wsize != b->wsize) goto Ebusy; @@ -2301,7 +2331,8 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->acdirmax != b->acdirmax) goto Ebusy; - if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + if (b->flags & NFS_MOUNT_SECFLAVOUR && + clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) goto Ebusy; return 1; Ebusy: @@ -2478,6 +2509,10 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, if (server->flags & NFS_MOUNT_NOAC) sb_mntdata.mntflags |= MS_SYNCHRONOUS; + if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) + if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); if (IS_ERR(s)) { @@ -2669,15 +2704,17 @@ static int nfs4_validate_mount_data(void *options, goto out_no_address; args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); - args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->auth_flavourlen) { + rpc_authflavor_t pseudoflavor; if (data->auth_flavourlen > 1) goto out_inval_auth; - if (copy_from_user(&args->auth_flavors[0], + if (copy_from_user(&pseudoflavor, data->auth_flavours, - sizeof(args->auth_flavors[0]))) + sizeof(pseudoflavor))) return -EFAULT; - } + nfs_set_auth_parsed_mount_data(args, pseudoflavor); + } else + nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); if (IS_ERR(c)) @@ -2711,6 +2748,8 @@ static int nfs4_validate_mount_data(void *options, args->acdirmax = data->acdirmax; args->nfs_server.protocol = data->proto; nfs_validate_transport_protocol(args); + if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) + goto out_invalid_transport_udp; break; default: @@ -2731,6 +2770,10 @@ out_inval_auth: out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; + +out_invalid_transport_udp: + dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); + return -EINVAL; } /* @@ -2746,6 +2789,7 @@ bool nfs4_disable_idmapping = true; unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; +bool recover_lost_locks = false; EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); EXPORT_SYMBOL_GPL(nfs_callback_tcpport); @@ -2754,6 +2798,7 @@ EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); EXPORT_SYMBOL_GPL(max_session_slots); EXPORT_SYMBOL_GPL(send_implementation_id); EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); +EXPORT_SYMBOL_GPL(recover_lost_locks); #define NFS_CALLBACK_MAXPORTNR (65535U) @@ -2791,4 +2836,10 @@ MODULE_PARM_DESC(send_implementation_id, "Send 
implementation ID with NFSv4.1 exchange_id"); MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); +module_param(recover_lost_locks, bool, 0644); +MODULE_PARM_DESC(recover_lost_locks, + "If the server reports that a lock might be lost, " + "try to recover it risking data corruption."); + + #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 60395ad3a2e4..bb939edd4c99 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -20,6 +20,8 @@ #include "iostat.h" #include "delegation.h" +#include "nfstrace.h" + /** * nfs_free_unlinkdata - release data from a sillydelete operation. * @data: pointer to unlink structure. @@ -77,6 +79,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) struct nfs_unlinkdata *data = calldata; struct inode *dir = data->dir; + trace_nfs_sillyrename_unlink(data, task->tk_status); if (!NFS_PROTO(dir)->unlink_done(task, dir)) rpc_restart_call_prepare(task); } @@ -204,6 +207,13 @@ out_free: return ret; } +void nfs_wait_on_sillyrename(struct dentry *dentry) +{ + struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + + wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1); +} + void nfs_block_sillyrename(struct dentry *dentry) { struct nfs_inode *nfsi = NFS_I(dentry->d_inode); @@ -336,6 +346,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct inode *new_dir = data->new_dir; struct dentry *old_dentry = data->old_dentry; + trace_nfs_sillyrename_rename(old_dir, old_dentry, + new_dir, data->new_dentry, task->tk_status); if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { rpc_restart_call_prepare(task); return; @@ -444,6 +456,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, return rpc_run_task(&task_setup_data); } +#define SILLYNAME_PREFIX ".nfs" +#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) +#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) +#define SILLYNAME_COUNTER_LEN ((unsigned)sizeof(unsigned int) << 1) +#define SILLYNAME_LEN (SILLYNAME_PREFIX_LEN + \ + SILLYNAME_FILEID_LEN + \ + SILLYNAME_COUNTER_LEN) + /** * nfs_sillyrename - Perform a silly-rename of a dentry * @dir: inode of directory that contains dentry @@ -469,10 +489,8 @@ int nfs_sillyrename(struct inode *dir, struct dentry *dentry) { static unsigned int sillycounter; - const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; - const int countersize = sizeof(sillycounter)*2; - const int slen = sizeof(".nfs")+fileidsize+countersize-1; - char silly[slen+1]; + unsigned char silly[SILLYNAME_LEN + 1]; + unsigned long long fileid; struct dentry *sdentry; struct rpc_task *task; int error = -EIO; @@ -489,20 +507,20 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out; - sprintf(silly, ".nfs%*.*Lx", - fileidsize, fileidsize, - (unsigned long long)NFS_FILEID(dentry->d_inode)); + fileid = NFS_FILEID(dentry->d_inode); /* Return delegation in anticipation of the rename */ NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode); sdentry = NULL; do { - char *suffix = silly + slen - countersize; - + int slen; dput(sdentry); sillycounter++; - sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); + slen = scnprintf(silly, sizeof(silly), + SILLYNAME_PREFIX "%0*llx%0*x", + SILLYNAME_FILEID_LEN, fileid, + SILLYNAME_COUNTER_LEN, sillycounter); dfprintk(VFS, "NFS: trying to rename %s to %s\n", dentry->d_name.name, silly); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 
f1bdb7254776..ac1dc331ba31 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -31,6 +31,8 @@ #include "fscache.h" #include "pnfs.h" +#include "nfstrace.h" + #define NFSDBG_FACILITY NFSDBG_PAGECACHE #define MIN_POOL_WRITE (32) @@ -861,7 +863,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) return 0; l_ctx = req->wb_lock_context; do_flush = req->wb_page != page || req->wb_context != ctx; - if (l_ctx) { + if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { do_flush |= l_ctx->lockowner.l_owner != current->files || l_ctx->lockowner.l_pid != current->tgid; } @@ -874,6 +876,33 @@ int nfs_flush_incompatible(struct file *file, struct page *page) } /* + * Avoid buffered writes when an open context credential's key would + * expire soon. + * + * Returns -EACCES if the key will expire within RPC_KEY_EXPIRE_FAIL. + * + * Return 0 and set a credential flag which triggers the inode to flush + * and performs NFS_FILE_SYNC writes if the key will expire within + * RPC_KEY_EXPIRE_TIMEO. + */ +int +nfs_key_timeout_notify(struct file *filp, struct inode *inode) +{ + struct nfs_open_context *ctx = nfs_file_open_context(filp); + struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; + + return rpcauth_key_timeout_notify(auth, ctx->cred); +} + +/* + * Test if the open context credential key is marked to expire soon. + */ +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) +{ + return rpcauth_cred_key_to_expire(ctx->cred); +} + +/* * If the page cache is marked as unsafe or invalid, then we can't rely on * the PageUptodate() flag. In this case, we will need to turn off * write optimisations that depend on the page contents being correct. @@ -993,6 +1022,9 @@ int nfs_initiate_write(struct rpc_clnt *clnt, data->args.count, (unsigned long long)data->args.offset); + nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, + &task_setup_data.rpc_client, &msg, data); + task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) { ret = PTR_ERR(task); @@ -1265,9 +1297,10 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); void nfs_write_prepare(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) - rpc_exit(task, -EIO); + int err; + err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); + if (err) + rpc_exit(task, err); } void nfs_commit_prepare(struct rpc_task *task, void *calldata) @@ -1458,6 +1491,9 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, + NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); + task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -1732,8 +1768,14 @@ int nfs_wb_all(struct inode *inode) .range_start = 0, .range_end = LLONG_MAX, }; + int ret; - return sync_inode(inode, &wbc); + trace_nfs_writeback_inode_enter(inode); + + ret = sync_inode(inode, &wbc); + + trace_nfs_writeback_inode_exit(inode, ret); + return ret; } EXPORT_SYMBOL_GPL(nfs_wb_all); @@ -1781,6 +1823,8 @@ int nfs_wb_page(struct inode *inode, struct page *page) }; int ret; + trace_nfs_writeback_page_enter(inode); + for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { @@ -1789,14 +1833,15 @@ int nfs_wb_page(struct inode *inode, struct page *page) goto out_error; continue; } + ret = 0; if (!PagePrivate(page)) break; ret =
nfs_commit_inode(inode, FLUSH_SYNC); if (ret < 0) goto out_error; } - return 0; out_error: + trace_nfs_writeback_page_exit(inode, ret); return ret; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0d4c410e4589..419572f33b72 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1524,7 +1524,7 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ - 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ + 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\ 2 + /*eir_server_owner.so_minor_id */\ /* eir_server_owner.so_major_id<> */\ XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 280acef6f0dc..43f42290e5df 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1264,6 +1264,8 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) struct svc_cred *cr = &rqstp->rq_cred; u32 service; + if (!cr->cr_gss_mech) + return false; service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor); return service == RPC_GSS_SVC_INTEGRITY || service == RPC_GSS_SVC_PRIVACY; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0c0f3ea90de5..d9454fe5653f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1816,10 +1816,7 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, static __be32 nfsd4_encode_path(const struct path *root, const struct path *path, __be32 **pp, int *buflen) { - struct path cur = { - .mnt = path->mnt, - .dentry = path->dentry, - }; + struct path cur = *path; __be32 *p = *pp; struct dentry **components = NULL; unsigned int ncomponents = 0; @@ -1859,14 +1856,19 @@ static __be32 nfsd4_encode_path(const struct path *root, while (ncomponents) { struct dentry *dentry = components[ncomponents - 1]; - unsigned int len = dentry->d_name.len; + unsigned int len; + spin_lock(&dentry->d_lock); + len = dentry->d_name.len; *buflen -= 4 + (XDR_QUADLEN(len) << 2); - if (*buflen < 0) + if (*buflen < 0) { + spin_unlock(&dentry->d_lock); goto out_free; + } WRITE32(len); WRITEMEM(dentry->d_name.name, len); dprintk("/%s", dentry->d_name.name); + spin_unlock(&dentry->d_lock); dput(dentry); ncomponents--; } @@ -3360,7 +3362,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, 8 /* eir_clientid */ + 4 /* eir_sequenceid */ + 4 /* eir_flags */ + - 4 /* spr_how (SP4_NONE) */ + + 4 /* spr_how */ + + 8 /* spo_must_enforce, spo_must_allow */ + 8 /* so_minor_id */ + 4 /* so_major_id.len */ + (XDR_QUADLEN(major_id_sz) * 4) + @@ -3372,8 +3375,6 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, WRITE32(exid->seqid); WRITE32(exid->flags); - /* state_protect4_r. 
Currently only support SP4_NONE */ - BUG_ON(exid->spa_how != SP4_NONE); WRITE32(exid->spa_how); switch (exid->spa_how) { case SP4_NONE: diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc9a913784ab..2d8be51f90dc 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -345,8 +345,7 @@ static void nilfs_end_bio_write(struct bio *bio, int err) if (err == -EOPNOTSUPP) { set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - bio_put(bio); - /* to be detected by submit_seg_bio() */ + /* to be detected by nilfs_segbuf_submit_bio() */ } if (!uptodate) @@ -377,12 +376,12 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_private = segbuf; bio_get(bio); submit_bio(mode, bio); + segbuf->sb_nbio++; if (bio_flagged(bio, BIO_EOPNOTSUPP)) { bio_put(bio); err = -EOPNOTSUPP; goto failed; } - segbuf->sb_nbio++; bio_put(bio); wi->bio = NULL; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index af3ba0478cdf..7ac2a122ca1d 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -994,23 +994,16 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, return ret; } -static int nilfs_tree_was_touched(struct dentry *root_dentry) -{ - return d_count(root_dentry) > 1; -} - /** - * nilfs_try_to_shrink_tree() - try to shrink dentries of a checkpoint + * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint * @root_dentry: root dentry of the tree to be shrunk * * This function returns true if the tree was in-use. */ -static int nilfs_try_to_shrink_tree(struct dentry *root_dentry) +static bool nilfs_tree_is_busy(struct dentry *root_dentry) { - if (have_submounts(root_dentry)) - return true; shrink_dcache_parent(root_dentry); - return nilfs_tree_was_touched(root_dentry); + return d_count(root_dentry) > 1; } int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) @@ -1034,8 +1027,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) if (inode) { dentry = d_find_alias(inode); if (dentry) { - if (nilfs_tree_was_touched(dentry)) - ret = nilfs_try_to_shrink_tree(dentry); + ret = nilfs_tree_is_busy(dentry); dput(dentry); } iput(inode); @@ -1331,11 +1323,8 @@ nilfs_mount(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; } else if (!sd.cno) { - int busy = false; - - if (nilfs_tree_was_touched(s->s_root)) { - busy = nilfs_try_to_shrink_tree(s->s_root); - if (busy && (flags ^ s->s_flags) & MS_RDONLY) { + if (nilfs_tree_is_busy(s->s_root)) { + if ((flags ^ s->s_flags) & MS_RDONLY) { printk(KERN_ERR "NILFS: the device already " "has a %s mount.\n", (s->s_flags & MS_RDONLY) ? @@ -1343,8 +1332,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, err = -EBUSY; goto failed_super; } - } - if (!busy) { + } else { /* * Try remount to setup mount states if the current * tree is not mounted and only snapshots use this sb. 
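A note on the nilfs2 change that closes above (editor's illustration, not part of the patch): the old code first asked whether the tree "was touched" and only then tried to shrink it; the new helper always prunes unused dentries first, which makes a single reference-count test authoritative. Minimal sketch, with a hypothetical caller mirroring the nilfs_mount() hunk:

        static bool tree_is_busy(struct dentry *root_dentry)
        {
                shrink_dcache_parent(root_dentry);      /* drop unused children */
                return d_count(root_dentry) > 1;        /* >1: a live user remains */
        }

        /* hypothetical caller: refuse to mix ro/rw mounts of a busy tree */
        if (tree_is_busy(s->s_root) && ((flags ^ s->s_flags) & MS_RDONLY))
                return -EBUSY;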
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 79736a28d84f..94417a85ce6e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -565,9 +565,7 @@ bail: static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, - void *private, - int ret, - bool is_async) + void *private) { struct inode *inode = file_inode(iocb->ki_filp); int level; @@ -592,10 +590,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); - - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); } /* @@ -1757,7 +1751,7 @@ try_again: goto out; } else if (ret == 1) { clusters_need = wc->w_clen; - ret = ocfs2_refcount_cow(inode, filp, di_bh, + ret = ocfs2_refcount_cow(inode, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index eb760d8acd50..30544ce8e9f7 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2153,11 +2153,9 @@ int ocfs2_empty_dir(struct inode *inode) { int ret; struct ocfs2_empty_dir_priv priv = { - .ctx.actor = ocfs2_empty_dir_filldir + .ctx.actor = ocfs2_empty_dir_filldir, }; - memset(&priv, 0, sizeof(priv)); - if (ocfs2_dir_indexed(inode)) { ret = ocfs2_empty_dir_dx(inode, &priv); if (ret) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 41000f223ca4..3261d71319ee 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -370,7 +370,7 @@ static int ocfs2_cow_file_pos(struct inode *inode, if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) goto out; - return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); + return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); out: return status; @@ -899,7 +899,7 @@ static int ocfs2_zero_extend_get_range(struct inode *inode, zero_clusters = last_cpos - zero_cpos; if (needs_cow) { - rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, + rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, UINT_MAX); if (rc) { mlog_errno(rc); @@ -2078,7 +2078,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode, *meta_level = 1; - ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); if (ret) mlog_errno(ret); out: diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 96f9ac237e86..0a992737dcaf 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -537,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + - ocfs2_quota_trans_credits(sb) + bits_wanted; + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index f1fc172175b6..452068b45749 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -69,7 +69,7 @@ static int __ocfs2_move_extent(handle_t *handle, u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); - ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, + ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos, p_cpos, new_p_cpos, len); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9f6b96a09615..a70d604593b6 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -49,7 +49,6 @@ struct ocfs2_cow_context { struct inode *inode; - struct file *file; u32 cow_start; u32 cow_len; 
struct ocfs2_extent_tree data_et; @@ -66,7 +65,7 @@ struct ocfs2_cow_context { u32 *num_clusters, unsigned int *extent_flags); int (*cow_duplicate_clusters)(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); }; @@ -2922,14 +2921,12 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) } int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0, partial; - struct inode *inode = file_inode(file); - struct ocfs2_caching_info *ci = INODE_CACHE(inode); - struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct super_block *sb = inode->i_sb; u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; @@ -2978,13 +2975,6 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, - &file->f_ra, file, - page, page_index, - readahead_pages); - } - if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); if (ret) { @@ -3004,7 +2994,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } } - ocfs2_map_and_dirty_page(inode, handle, from, to, + ocfs2_map_and_dirty_page(inode, + handle, from, to, page, 0, &new_block); mark_page_accessed(page); unlock: @@ -3020,12 +3011,11 @@ unlock: } int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0; - struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct ocfs2_caching_info *ci = INODE_CACHE(inode); int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); @@ -3150,7 +3140,7 @@ static int ocfs2_replace_clusters(handle_t *handle, /*If the old clusters is unwritten, no need to duplicate. */ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = context->cow_duplicate_clusters(handle, context->file, + ret = context->cow_duplicate_clusters(handle, context->inode, cpos, old, new, len); if (ret) { mlog_errno(ret); @@ -3428,35 +3418,12 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) return ret; } -static void ocfs2_readahead_for_cow(struct inode *inode, - struct file *file, - u32 start, u32 len) -{ - struct address_space *mapping; - pgoff_t index; - unsigned long num_pages; - int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; - - if (!file) - return; - - mapping = file->f_mapping; - num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; - if (!num_pages) - num_pages = 1; - - index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; - page_cache_sync_readahead(mapping, &file->f_ra, file, - index, num_pages); -} - /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an * unrefcounted extent. 
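A note on the ocfs2_empty_dir() hunk earlier in this diff: the memset() it removes was not merely redundant, it ran after the designated initializer and therefore wiped the .ctx.actor that had just been assigned; a C99 designated initializer already zeroes every member it does not name. A minimal, runnable userspace illustration of that rule (hypothetical struct names, not ocfs2 code):

#include <assert.h>
#include <string.h>

struct ctx  { int (*actor)(void); long long pos; };
struct priv { struct ctx ctx; int seen_other; };

static int my_actor(void) { return 0; }

int main(void)
{
	/* The named member is set; everything else is zero-initialized. */
	struct priv p = { .ctx.actor = my_actor };
	assert(p.ctx.pos == 0 && p.seen_other == 0);

	/* A follow-up memset() is therefore worse than useless: it also
	 * destroys the actor pointer, which is the bug fixed above. */
	memset(&p, 0, sizeof(p));
	assert(p.ctx.actor == 0);
	return 0;
}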
*/ static int ocfs2_refcount_cow_hunk(struct inode *inode, - struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3485,8 +3452,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, BUG_ON(cow_len == 0); - ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); - context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; @@ -3508,7 +3473,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, context->ref_root_bh = ref_root_bh; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; context->get_clusters = ocfs2_di_get_clusters; - context->file = file; ocfs2_init_dinode_extent_tree(&context->data_et, INODE_CACHE(inode), di_bh); @@ -3537,7 +3501,6 @@ out: * clusters between cpos and cpos+write_len are safe to modify. */ int ocfs2_refcount_cow(struct inode *inode, - struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3557,7 +3520,7 @@ int ocfs2_refcount_cow(struct inode *inode, num_clusters = write_len; if (ext_flags & OCFS2_EXT_REFCOUNTED) { - ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, + ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, num_clusters, max_cpos); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 7754608c83a4..6422bbcdb525 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -53,7 +53,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *credits, int *ref_blocks); int ocfs2_refcount_cow(struct inode *inode, - struct file *filep, struct buffer_head *di_bh, + struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); typedef int (ocfs2_post_refcount_func)(struct inode *inode, @@ -85,11 +85,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, u32 cpos, u32 write_len, struct ocfs2_post_refcount *post); int ocfs2_duplicate_clusters_by_page(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, - struct file *file, + struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len); int ocfs2_cow_sync_writeback(struct super_block *sb, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 854d80955bf8..121da2dc3be8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1022,7 +1022,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; - char nodestr[8]; + char nodestr[12]; struct ocfs2_blockcheck_stats stats; trace_ocfs2_fill_super(sb, data, silent); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index e5c7f15465b4..19f134e896a9 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -32,7 +32,7 @@ enum ocfs2_xattr_type { struct ocfs2_security_xattr_info { int enable; - char *name; + const char *name; void *value; size_t value_len; }; diff --git a/fs/open.c b/fs/open.c index d53e29895082..2a731b0d08bc 100644 --- a/fs/open.c +++ b/fs/open.c @@ -443,7 +443,7 @@ retry: goto dput_and_out; error = -EPERM; - if (!nsown_capable(CAP_SYS_CHROOT)) + if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) @@ -485,14 +485,13 @@ out_unlock: SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { - struct file * file; + struct fd f = fdget(fd); int err = -EBADF; - file = fget(fd); - if (file) { - audit_inode(NULL, file->f_path.dentry, 0); - err 
= chmod_common(&file->f_path, mode); - fput(file); + if (f.file) { + audit_inode(NULL, f.file->f_path.dentry, 0); + err = chmod_common(&f.file->f_path, mode); + fdput(f); } return err; } @@ -823,7 +822,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o int lookup_flags = 0; int acc_mode; - if (flags & O_CREAT) + if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; else op->mode = 0; diff --git a/fs/pnode.h b/fs/pnode.h index b091445c1c4a..59e7eda1851e 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -19,11 +19,14 @@ #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 -#define CL_COPY_ALL 0x04 +#define CL_COPY_UNBINDABLE 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 #define CL_UNPRIVILEGED 0x40 +#define CL_COPY_MNT_NS_FILE 0x80 + +#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) static inline void set_mnt_shared(struct mount *mnt) { diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 75f2890abbd8..0ff80f9b930f 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -230,8 +230,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - if (!dir_emit_dots(file, ctx)) - goto out; files = get_files_struct(p); if (!files) goto out; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 94441a407337..737e15615b04 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -271,7 +271,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, de = next; } while (de); spin_unlock(&proc_subdir_lock); - return 0; + return 1; } int proc_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 073aea60cf8f..9f8ef9b7674d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -285,6 +285,20 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) return rv; } +static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + int rv = -EIO; + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + if (use_pde(pde)) { + get_unmapped_area = pde->proc_fops->get_unmapped_area; + if (get_unmapped_area) + rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); + unuse_pde(pde); + } + return rv; +} + static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); @@ -356,6 +370,7 @@ static const struct file_operations proc_reg_file_ops = { .compat_ioctl = proc_reg_compat_ioctl, #endif .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; @@ -368,6 +383,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = { .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; diff --git a/fs/proc/root.c b/fs/proc/root.c index 229e366598da..87dbcbef7fe4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -110,7 +110,11 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = task_active_pid_ns(current); options = data; - if (!current_user_ns()->may_mount_proc) + if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) + return ERR_PTR(-EPERM); + + /* Does the mounter have privilege over the pid namespace? 
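Back in the open.c hunk above, build_open_flags() now applies the caller's mode argument to __O_TMPFILE opens exactly as it does for O_CREAT, so anonymous temporary files get a real file mode instead of 0. A small userspace check of the visible behavior (Linux-specific; assumes a kernel and C library new enough to expose O_TMPFILE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	/* Unnamed file in /tmp; the mode matters once the file is
	 * fstat()ed or later linked into the namespace. */
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");	/* older kernels lack O_TMPFILE */
		return 1;
	}
	struct stat st;
	if (fstat(fd, &st) == 0)
		printf("mode: %o\n", (unsigned)(st.st_mode & 07777)); /* expect 600 */
	close(fd);
	return 0;
}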
*/ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); } @@ -205,7 +209,9 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr static int proc_root_readdir(struct file *file, struct dir_context *ctx) { if (ctx->pos < FIRST_PROCESS_ENTRY) { - proc_readdir(file, ctx); + int error = proc_readdir(file, ctx); + if (unlikely(error <= 0)) + return error; ctx->pos = FIRST_PROCESS_ENTRY; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dbf61f6174f0..107d026f5d6e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -730,8 +730,16 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. */ pte_t ptent = *pte; - ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } else if (pte_file(ptent)) { + ptent = pte_file_clear_soft_dirty(ptent); + } + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -752,14 +760,15 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; - if (!pte_present(ptent)) - continue; if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } + if (!pte_present(ptent)) + continue; + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -859,7 +868,7 @@ typedef struct { } pagemap_entry_t; struct pagemapread { - int pos, len; + int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool v2; }; @@ -867,7 +876,7 @@ struct pagemapread { #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) -#define PM_ENTRY_BYTES sizeof(u64) +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) #define PM_STATUS_BITS 3 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) @@ -930,8 +939,10 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); } else if (is_swap_pte(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags = PM_SWAP; @@ -1116,8 +1127,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out_task; pm.v2 = soft_dirty_cleared; - pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); + pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); ret = -ENOMEM; if (!pm.buffer) goto out_task; diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index ca71db69da07..983d9510becc 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,6 +1,8 @@ config PSTORE bool "Persistent store support" default n + select ZLIB_DEFLATE + select ZLIB_INFLATE help This option enables generic access to platform level persistent storage via "pstore" filesystem that can diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 71bf5f4ae84c..12823845d324 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -275,8 +275,8 @@ int pstore_is_mounted(void) * Set the mtime & ctime to the date that this record was originally stored. 
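Stepping back to the task_mmu.c hunks above: the pagemap ABI is a flat array of one 64-bit entry per virtual page, which is exactly the invariant the PM_ENTRY_BYTES fix restores after pagemap_entry_t grew a struct wrapper, and the soft-dirty work threads the same flag through swap PTEs. A runnable userspace reader, assuming the documented bit layout of this era (bit 63 present, bit 62 swapped, bit 55 soft-dirty):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	int fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;

	uintptr_t vaddr = (uintptr_t)&fd;	/* any mapped address */
	uint64_t entry;
	/* One u64 per page: offset = (vaddr / pagesize) * sizeof(u64). */
	off_t off = (off_t)(vaddr / (uintptr_t)psize) * sizeof(entry);
	if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry))
		return 1;

	printf("present=%d swapped=%d soft-dirty=%d\n",
	       (int)((entry >> 63) & 1), (int)((entry >> 62) & 1),
	       (int)((entry >> 55) & 1));
	close(fd);
	return 0;
}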
*/ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, - char *data, size_t size, struct timespec time, - struct pstore_info *psi) + char *data, bool compressed, size_t size, + struct timespec time, struct pstore_info *psi) { struct dentry *root = pstore_sb->s_root; struct dentry *dentry; @@ -315,7 +315,8 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, switch (type) { case PSTORE_TYPE_DMESG: - sprintf(name, "dmesg-%s-%lld", psname, id); + sprintf(name, "dmesg-%s-%lld%s", psname, id, + compressed ? ".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: sprintf(name, "console-%s", psname); @@ -345,9 +346,8 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, mutex_lock(&root->d_inode->i_mutex); - rc = -ENOSPC; dentry = d_alloc_name(root, name); - if (IS_ERR(dentry)) + if (!dentry) goto fail_lockedalloc; memcpy(private->data, data, size); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 937d820f273c..3b3d305277c4 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -50,8 +50,9 @@ extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, - int count, char *data, size_t size, - struct timespec time, struct pstore_info *psi); + int count, char *data, bool compressed, + size_t size, struct timespec time, + struct pstore_info *psi); extern int pstore_is_mounted(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 422962ae9fc2..4ffb7ab5e397 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -26,6 +26,7 @@ #include <linux/console.h> #include <linux/module.h> #include <linux/pstore.h> +#include <linux/zlib.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/slab.h> @@ -65,6 +66,15 @@ struct pstore_info *psinfo; static char *backend; +/* Compression parameters */ +#define COMPR_LEVEL 6 +#define WINDOW_BITS 12 +#define MEM_LEVEL 4 +static struct z_stream_s stream; + +static char *big_oops_buf; +static size_t big_oops_buf_sz; + /* How much of the console log to snapshot */ static unsigned long kmsg_bytes = 10240; @@ -117,6 +127,121 @@ bool pstore_cannot_block_path(enum kmsg_dump_reason reason) } EXPORT_SYMBOL_GPL(pstore_cannot_block_path); +/* Derived from logfs_compress() */ +static int pstore_compress(const void *in, void *out, size_t inlen, + size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, + MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +/* Derived from logfs_uncompress */ +static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + 
if (err != Z_OK) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +static void allocate_buf_for_compression(void) +{ + size_t size; + + big_oops_buf_sz = (psinfo->bufsize * 100) / 45; + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (big_oops_buf) { + size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL), + zlib_inflate_workspacesize()); + stream.workspace = kmalloc(size, GFP_KERNEL); + if (!stream.workspace) { + pr_err("pstore: No memory for compression workspace; " + "skipping compression\n"); + kfree(big_oops_buf); + big_oops_buf = NULL; + } + } else { + pr_err("No memory for uncompressed data; " + "skipping compression\n"); + stream.workspace = NULL; + } + +} + +/* + * Called when compression fails: the printk buffer has already been + * fetched for compression, and fetching it again after a failure would + * advance the printk iterator and return stale contents. Instead, copy + * the most recent messages from big_oops_buf into psinfo->buf. + */ +static size_t copy_kmsg_to_buffer(int hsize, size_t len) +{ + size_t total_len; + size_t diff; + + total_len = hsize + len; + + if (total_len > psinfo->bufsize) { + diff = total_len - psinfo->bufsize + hsize; + memcpy(psinfo->buf, big_oops_buf, hsize); + memcpy(psinfo->buf + hsize, big_oops_buf + diff, + psinfo->bufsize - hsize); + total_len = psinfo->bufsize; + } else + memcpy(psinfo->buf, big_oops_buf, total_len); + + return total_len; +} + /* * callback from kmsg_dump. (s2,l2) has the most recently * written bytes, older bytes are in (s1,l1). Save as much @@ -148,22 +273,56 @@ static void pstore_dump(struct kmsg_dumper *dumper, char *dst; unsigned long size; int hsize; + int zipped_len = -1; size_t len; + bool compressed; + size_t total_len; - dst = psinfo->buf; - hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); - size = psinfo->bufsize - hsize; - dst += hsize; + if (big_oops_buf) { + dst = big_oops_buf; + hsize = sprintf(dst, "%s#%d Part%d\n", why, + oopscount, part); + size = big_oops_buf_sz - hsize; - if (!kmsg_dump_get_buffer(dumper, true, dst, size, &len)) - break; + if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, + size, &len)) + break; + + zipped_len = pstore_compress(dst, psinfo->buf, + hsize + len, psinfo->bufsize); + + if (zipped_len > 0) { + compressed = true; + total_len = zipped_len; + } else { + pr_err("pstore: compression failed for Part %d" + " returned %d\n", part, zipped_len); + pr_err("pstore: Capture uncompressed" + " oops/panic report of Part %d\n", part); + compressed = false; + total_len = copy_kmsg_to_buffer(hsize, len); + } + } else { + dst = psinfo->buf; + hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, + part); + size = psinfo->bufsize - hsize; + dst += hsize; + + if (!kmsg_dump_get_buffer(dumper, true, dst, + size, &len)) + break; + + compressed = false; + total_len = hsize + len; + } ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - oopscount, hsize, hsize + len, psinfo); + oopscount, compressed, total_len, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; - total += hsize + len; + total += total_len; part++; } if (pstore_cannot_block_path(reason)) { @@ -221,10 +380,10 @@ static void pstore_register_console(void) {} static int pstore_write_compat(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, int count, - size_t hsize, size_t size, + bool compressed, size_t size, struct pstore_info *psi) { - return psi->write_buf(type, reason, id, part,
psinfo->buf, hsize, + return psi->write_buf(type, reason, id, part, psinfo->buf, compressed, size, psi); } @@ -261,6 +420,8 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } + allocate_buf_for_compression(); + if (pstore_is_mounted()) pstore_get_records(0); @@ -297,6 +458,8 @@ void pstore_get_records(int quiet) enum pstore_type_id type; struct timespec time; int failed = 0, rc; + bool compressed; + int unzipped_len = -1; if (!psi) return; @@ -305,11 +468,32 @@ void pstore_get_records(int quiet) if (psi->open && psi->open(psi)) goto out; - while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) { + while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed, + psi)) > 0) { + if (compressed && (type == PSTORE_TYPE_DMESG)) { + if (big_oops_buf) + unzipped_len = pstore_decompress(buf, + big_oops_buf, size, + big_oops_buf_sz); + + if (unzipped_len > 0) { + buf = big_oops_buf; + size = unzipped_len; + compressed = false; + } else { + pr_err("pstore: decompression failed; " + "returned %d\n", unzipped_len); + compressed = true; + } + } rc = pstore_mkfile(type, psi->name, id, count, buf, - (size_t)size, time, psi); - kfree(buf); - buf = NULL; + compressed, (size_t)size, time, psi); + if (unzipped_len < 0) { + /* Free buffer other than big oops */ + kfree(buf); + buf = NULL; + } else + unzipped_len = -1; if (rc && (rc != -EEXIST || !quiet)) failed++; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index a6119f9469e2..fa8cef2cca3a 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -131,9 +131,31 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, return prz; } +static void ramoops_read_kmsg_hdr(char *buffer, struct timespec *time, + bool *compressed) +{ + char data_type; + + if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", + &time->tv_sec, &time->tv_nsec, &data_type) == 3) { + if (data_type == 'C') + *compressed = true; + else + *compressed = false; + } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", + &time->tv_sec, &time->tv_nsec) == 2) { + *compressed = false; + } else { + time->tv_sec = 0; + time->tv_nsec = 0; + *compressed = false; + } +} + static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, int *count, struct timespec *time, - char **buf, struct pstore_info *psi) + char **buf, bool *compressed, + struct pstore_info *psi) { ssize_t size; ssize_t ecc_notice_size; @@ -152,10 +174,6 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz) return 0; - /* TODO(kees): Bogus time for the moment.
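The ramoops_read_kmsg_hdr() helper above recovers both the record timestamp and the new compression marker from the header that ramoops_write_kmsg_hdr() emits ('C' for zlib-compressed, 'D' for plain dumps), while still accepting the old two-field header so pre-upgrade records stay readable. A standalone model of the parse; RAMOOPS_KERNMSG_HDR is assumed here to be ramoops' historical "====" prefix, which this diff does not show:

#include <stdbool.h>
#include <stdio.h>

#define RAMOOPS_KERNMSG_HDR "===="	/* assumption, not shown in this diff */

int main(void)
{
	const char *hdr = RAMOOPS_KERNMSG_HDR "1378214400.123456-C\n";
	unsigned long sec, usec;
	char type;
	bool compressed = false;

	/* New format: <sec>.<usec>-<C|D>. */
	if (sscanf(hdr, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c",
		   &sec, &usec, &type) == 3)
		compressed = (type == 'C');
	/* Old records match only the two-field form and are
	 * treated as uncompressed, exactly as in the kernel code. */

	printf("sec=%lu usec=%lu compressed=%d\n", sec, usec, (int)compressed);
	return 0;
}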
*/ - time->tv_sec = 0; - time->tv_nsec = 0; - size = persistent_ram_old_size(prz); /* ECC correction notice */ @@ -166,12 +184,14 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, return -ENOMEM; memcpy(*buf, persistent_ram_old(prz), size); + ramoops_read_kmsg_hdr(*buf, time, compressed); persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); return size + ecc_notice_size; } -static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) +static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, + bool compressed) { char *hdr; struct timespec timestamp; @@ -182,8 +202,9 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) timestamp.tv_sec = 0; timestamp.tv_nsec = 0; } - hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", - (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000)); + hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", + (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000), + compressed ? 'C' : 'D'); WARN_ON_ONCE(!hdr); len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); @@ -196,7 +217,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char *buf, - size_t hsize, size_t size, + bool compressed, size_t size, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; @@ -242,7 +263,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, prz = cxt->przs[cxt->dump_write_cnt]; - hlen = ramoops_write_kmsg_hdr(prz); + hlen = ramoops_write_kmsg_hdr(prz, compressed); if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; persistent_ram_write(prz, buf, size); @@ -400,11 +421,11 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; } - if (!is_power_of_2(pdata->record_size)) + if (pdata->record_size && !is_power_of_2(pdata->record_size)) pdata->record_size = rounddown_pow_of_two(pdata->record_size); - if (!is_power_of_2(pdata->console_size)) + if (pdata->console_size && !is_power_of_2(pdata->console_size)) pdata->console_size = rounddown_pow_of_two(pdata->console_size); - if (!is_power_of_2(pdata->ftrace_size)) + if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); cxt->dump_read_cnt = 0; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index fbad622841f9..9a702e193538 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1094,6 +1094,14 @@ static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) dquot->dq_dqb.dqb_rsvspace -= number; } +static void dquot_reclaim_reserved_space(struct dquot *dquot, qsize_t number) +{ + if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) + number = dquot->dq_dqb.dqb_curspace; + dquot->dq_dqb.dqb_rsvspace += number; + dquot->dq_dqb.dqb_curspace -= number; +} + static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { @@ -1528,6 +1536,15 @@ void inode_claim_rsv_space(struct inode *inode, qsize_t number) } EXPORT_SYMBOL(inode_claim_rsv_space); +void inode_reclaim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(inode_reclaim_rsv_space); + void inode_sub_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); @@ -1702,6 +1719,35 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) 
EXPORT_SYMBOL(dquot_claim_space_nodirty); /* + * Convert allocated space back to in-memory reserved quotas + */ +void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) +{ + int cnt; + + if (!dquot_active(inode)) { + inode_reclaim_rsv_space(inode, number); + return; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + spin_lock(&dq_data_lock); + /* Claim reserved quotas to allocated quotas */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + dquot_reclaim_reserved_space(inode->i_dquot[cnt], + number); + } + /* Update inode bytes */ + inode_reclaim_rsv_space(inode, number); + spin_unlock(&dq_data_lock); + mark_all_dquot_dirty(inode->i_dquot); + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + return; +} +EXPORT_SYMBOL(dquot_reclaim_space_nodirty); + +/* * This operation can block, but only after everything is updated */ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index c7314f1771f5..dea86e8967ee 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -27,6 +27,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, case Q_SYNC: case Q_GETINFO: case Q_XGETQSTAT: + case Q_XGETQSTATV: case Q_XQUOTASYNC: break; /* allow to query information for dquots we "own" */ @@ -217,6 +218,31 @@ static int quota_getxstate(struct super_block *sb, void __user *addr) return ret; } +static int quota_getxstatev(struct super_block *sb, void __user *addr) +{ + struct fs_quota_statv fqs; + int ret; + + if (!sb->s_qcop->get_xstatev) + return -ENOSYS; + + memset(&fqs, 0, sizeof(fqs)); + if (copy_from_user(&fqs, addr, 1)) /* Just read qs_version */ + return -EFAULT; + + /* If this kernel doesn't support user specified version, fail */ + switch (fqs.qs_version) { + case FS_QSTATV_VERSION1: + break; + default: + return -EINVAL; + } + ret = sb->s_qcop->get_xstatev(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} + static int quota_setxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { @@ -293,6 +319,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return quota_setxstate(sb, cmd, addr); case Q_XGETQSTAT: return quota_getxstate(sb, addr); + case Q_XGETQSTATV: + return quota_getxstatev(sb, addr); case Q_XSETQLIM: return quota_setxquota(sb, type, id, addr); case Q_XGETQUOTA: @@ -317,6 +345,7 @@ static int quotactl_cmd_write(int cmd) case Q_GETINFO: case Q_SYNC: case Q_XGETQSTAT: + case Q_XGETQSTATV: case Q_XGETQUOTA: case Q_XQUOTASYNC: return 0; diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index a98b7740a0fc..dc9a6829f7c6 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -423,8 +423,11 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); journal_mark_dirty(th, s, sbh); - if (for_unformatted) + if (for_unformatted) { + int depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(inode, 1); + reiserfs_write_lock_nested(s, depth); + } } void reiserfs_free_block(struct reiserfs_transaction_handle *th, @@ -1128,6 +1131,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; int passno = 0; int nr_allocated = 0; + int depth; determine_prealloc_size(hint); if (!hint->formatted_node) { @@ -1137,10 +1141,13 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start "reiserquota: allocating %d blocks id=%u", amount_needed, 
hint->inode->i_uid); #endif + depth = reiserfs_write_unlock_nested(s); quota_ret = dquot_alloc_block_nodirty(hint->inode, amount_needed); - if (quota_ret) /* Quota exceeded? */ + if (quota_ret) { /* Quota exceeded? */ + reiserfs_write_lock_nested(s, depth); return QUOTA_EXCEEDED; + } if (hint->preallocate && hint->prealloc_size) { #ifdef REISERQUOTA_DEBUG reiserfs_debug(s, REISERFS_DEBUG_CODE, @@ -1153,6 +1160,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start hint->preallocate = hint->prealloc_size = 0; } /* for unformatted nodes, force large allocations */ + reiserfs_write_lock_nested(s, depth); } do { @@ -1181,9 +1189,11 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start hint->inode->i_uid); #endif /* Free not allocated blocks */ + depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); + reiserfs_write_lock_nested(s, depth); } while (nr_allocated--) reiserfs_free_block(hint->th, hint->inode, @@ -1214,10 +1224,13 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); #endif + + depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)-> i_prealloc_count); + reiserfs_write_lock_nested(s, depth); } return CARRY_ON; @@ -1340,10 +1353,11 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, "reading failed", __func__, block); else { if (buffer_locked(bh)) { + int depth; PROC_INFO_INC(sb, scan_bitmap.wait); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); __wait_on_buffer(bh); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); } BUG_ON(!buffer_uptodate(bh)); BUG_ON(atomic_read(&bh->b_count) == 0); diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 03e4ca5624d6..1fd2051109a3 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -71,6 +71,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) char small_buf[32]; /* avoid kmalloc if we can */ struct reiserfs_dir_entry de; int ret = 0; + int depth; reiserfs_write_lock(inode->i_sb); @@ -181,17 +182,17 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) * Since filldir might sleep, we can release * the write lock here for other waiters */ - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(inode->i_sb); if (!dir_emit (ctx, local_buf, d_reclen, d_ino, DT_UNKNOWN)) { - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (local_buf != small_buf) { kfree(local_buf); } goto end; } - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (local_buf != small_buf) { kfree(local_buf); } diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 430e0658704c..dc4d41530316 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -1022,9 +1022,9 @@ static int get_far_parent(struct tree_balance *tb, if (buffer_locked(*pcom_father)) { /* Release the write lock while the buffer is busy */ - reiserfs_write_unlock(tb->tb_sb); + int depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(*pcom_father); - reiserfs_write_lock(tb->tb_sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) { brelse(*pcom_father); return REPEAT_SEARCH; @@ -1929,9 +1929,9 @@ static int get_direct_parent(struct tree_balance *tb, int h) return 
REPEAT_SEARCH; if (buffer_locked(bh)) { - reiserfs_write_unlock(tb->tb_sb); + int depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(bh); - reiserfs_write_lock(tb->tb_sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } @@ -1952,6 +1952,7 @@ static int get_neighbors(struct tree_balance *tb, int h) unsigned long son_number; struct super_block *sb = tb->tb_sb; struct buffer_head *bh; + int depth; PROC_INFO_INC(sb, get_neighbors[h]); @@ -1969,9 +1970,9 @@ static int get_neighbors(struct tree_balance *tb, int h) tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> FL[h]); son_number = B_N_CHILD_NUM(tb->FL[h], child_position); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(tb->tb_sb); bh = sb_bread(sb, son_number); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { @@ -2009,9 +2010,9 @@ static int get_neighbors(struct tree_balance *tb, int h) child_position = (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; son_number = B_N_CHILD_NUM(tb->FR[h], child_position); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(tb->tb_sb); bh = sb_bread(sb, son_number); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { @@ -2272,6 +2273,7 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) } if (locked) { + int depth; #ifdef CONFIG_REISERFS_CHECK repeat_counter++; if ((repeat_counter % 10000) == 0) { @@ -2286,9 +2288,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) REPEAT_SEARCH : CARRY_ON; } #endif - reiserfs_write_unlock(tb->tb_sb); + depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(locked); - reiserfs_write_lock(tb->tb_sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } @@ -2359,9 +2361,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb, /* if it possible in indirect_to_direct conversion */ if (buffer_locked(tbS0)) { - reiserfs_write_unlock(tb->tb_sb); + int depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(tbS0); - reiserfs_write_lock(tb->tb_sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0048cc16a6a8..ad62bdbb451e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -30,7 +30,6 @@ void reiserfs_evict_inode(struct inode *inode) JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; - int depth; int err; if (!inode->i_nlink && !is_bad_inode(inode)) @@ -40,12 +39,13 @@ void reiserfs_evict_inode(struct inode *inode) if (inode->i_nlink) goto no_delete; - depth = reiserfs_write_lock_once(inode->i_sb); - /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + reiserfs_delete_xattrs(inode); + reiserfs_write_lock(inode->i_sb); + if (journal_begin(&th, inode->i_sb, jbegin_count)) goto out; reiserfs_update_inode_transaction(inode); @@ -57,8 +57,11 @@ void reiserfs_evict_inode(struct inode *inode) /* Do quota update inside a transaction for journaled quotas. 
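A pattern worth naming before the hunks continue: wherever reiserfs must call something that can sleep or that takes locks ranked above its per-super write lock (quota operations such as the dquot_free_inode() just below, buffer waits, iget), this series brackets the call with the new depth-preserving pair instead of the removed *_once() variants. The shape of the bracket, lifted from these hunks (the pair itself is defined in the lock.c hunk near the end of this diff):

	int depth;

	/* Fully drop the write lock however deeply it is held;
	 * returns -1 if this task does not hold it at all. */
	depth = reiserfs_write_unlock_nested(inode->i_sb);
	dquot_free_inode(inode);	/* may sleep, takes quota locks */
	reiserfs_write_lock_nested(inode->i_sb, depth);	/* restore depth */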
We must do that * after delete_object so that quota updates go into the same transaction as * stat data deletion */ - if (!err) + if (!err) { + int depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_inode(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + } if (journal_end(&th, inode->i_sb, jbegin_count)) goto out; @@ -72,12 +75,12 @@ void reiserfs_evict_inode(struct inode *inode) /* all items of file are deleted, so we can remove "save" link */ remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything * about an error here */ +out: + reiserfs_write_unlock(inode->i_sb); } else { /* no object items are in the tree */ ; } - out: - reiserfs_write_unlock_once(inode->i_sb, depth); clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ dquot_drop(inode); inode->i_blocks = 0; @@ -610,7 +613,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block, __le32 *item; int done; int fs_gen; - int lock_depth; struct reiserfs_transaction_handle *th = NULL; /* space reserved in transaction batch: . 3 balancings in direct->indirect conversion @@ -626,11 +628,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, loff_t new_offset = (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); version = get_inode_item_key_version(inode); if (!file_capable(inode, block)) { - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); return -EFBIG; } @@ -642,7 +644,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, /* find number of block-th logical block of the file */ ret = _get_block_create_0(inode, block, bh_result, create | GET_BLOCK_READ_DIRECT); - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); return ret; } /* @@ -760,7 +762,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (!dangle && th) retval = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); /* the item was found, so new blocks were not added to the file ** there is no need to make sure the inode is updated with this @@ -1011,11 +1013,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, * long time. reschedule if needed and also release the write * lock for others. 
*/ - if (need_resched()) { - reiserfs_write_unlock_once(inode->i_sb, lock_depth); - schedule(); - lock_depth = reiserfs_write_lock_once(inode->i_sb); - } + reiserfs_cond_resched(inode->i_sb); retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval == IO_ERROR) { @@ -1050,7 +1048,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, retval = err; } - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); reiserfs_check_path(&path); return retval; } @@ -1509,14 +1507,15 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) { struct inode *inode; struct reiserfs_iget_args args; + int depth; args.objectid = key->on_disk_key.k_objectid; args.dirid = key->on_disk_key.k_dir_id; - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); inode = iget5_locked(s, key->on_disk_key.k_objectid, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); if (!inode) return ERR_PTR(-ENOMEM); @@ -1772,7 +1771,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct inode *inode, struct reiserfs_security_handle *security) { - struct super_block *sb; + struct super_block *sb = dir->i_sb; struct reiserfs_iget_args args; INITIALIZE_PATH(path_to_key); struct cpu_key key; @@ -1780,12 +1779,13 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct stat_data sd; int retval; int err; + int depth; BUG_ON(!th->t_trans_id); - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(sb); err = dquot_alloc_inode(inode); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(sb, depth); if (err) goto out_end_trans; if (!dir->i_nlink) { @@ -1793,8 +1793,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_bad_inode; } - sb = dir->i_sb; - /* item head of new item */ ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); @@ -1812,10 +1810,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(inode->i_sb); err = insert_inode_locked4(inode, args.objectid, reiserfs_find_actor, &args); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (err) { err = -EINVAL; goto out_bad_inode; @@ -1941,7 +1939,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } if (reiserfs_posixacl(inode->i_sb)) { + reiserfs_write_unlock(inode->i_sb); retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); + reiserfs_write_lock(inode->i_sb); if (retval) { err = retval; reiserfs_check_path(&path_to_key); @@ -1956,7 +1956,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, inode->i_flags |= S_PRIVATE; if (security->name) { + reiserfs_write_unlock(inode->i_sb); retval = reiserfs_security_write(th, inode, security); + reiserfs_write_lock(inode->i_sb); if (retval) { err = retval; reiserfs_check_path(&path_to_key); @@ -1982,14 +1984,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, INODE_PKEY(inode)->k_objectid = 0; /* Quota change must be inside a transaction for journaling */ + depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_inode(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); out_end_trans: journal_end(th, th->t_super, 
th->t_blocks_allocated); - reiserfs_write_unlock(inode->i_sb); /* Drop can be outside and it needs more credits so it's better to have it outside */ + depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_drop(inode); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); @@ -2103,9 +2107,8 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) int error; struct buffer_head *bh = NULL; int err2; - int lock_depth; - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); if (inode->i_size > 0) { error = grab_tail_page(inode, &page, &bh); @@ -2174,7 +2177,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) page_cache_release(page); } - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); return 0; out: @@ -2183,7 +2186,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) page_cache_release(page); } - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); return error; } @@ -2648,10 +2651,11 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) struct inode *inode = page->mapping->host; int ret; int old_ref = 0; + int depth; - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(inode->i_sb); reiserfs_wait_on_write_block(inode->i_sb); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); fix_tail_page_for_writing(page); if (reiserfs_transaction_running(inode->i_sb)) { @@ -2708,7 +2712,6 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, int update_sd = 0; struct reiserfs_transaction_handle *th; unsigned start; - int lock_depth = 0; bool locked = false; if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) @@ -2737,7 +2740,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, */ if (pos + copied > inode->i_size) { struct reiserfs_transaction_handle myth; - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); locked = true; /* If the file have grown beyond the border where it can have a tail, unmark it as needing a tail @@ -2768,7 +2771,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, } if (th) { if (!locked) { - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); locked = true; } if (!update_sd) @@ -2780,7 +2783,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, out: if (locked) - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); unlock_page(page); page_cache_release(page); @@ -2790,7 +2793,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, return ret == 0 ? 
copied : ret; journal_error: - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); locked = false; if (th) { if (!update_sd) @@ -2808,10 +2811,11 @@ int reiserfs_commit_write(struct file *f, struct page *page, int ret = 0; int update_sd = 0; struct reiserfs_transaction_handle *th = NULL; + int depth; - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(inode->i_sb); reiserfs_wait_on_write_block(inode->i_sb); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (reiserfs_transaction_running(inode->i_sb)) { th = current->journal_info; @@ -3110,7 +3114,6 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; unsigned int ia_valid; - int depth; int error; error = inode_change_ok(inode, attr); @@ -3122,13 +3125,14 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) if (is_quota_modification(inode, attr)) dquot_initialize(inode); - depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate */ if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && attr->ia_size > MAX_NON_LFS) { + reiserfs_write_unlock(inode->i_sb); error = -EFBIG; goto out; } @@ -3150,8 +3154,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) if (err) error = err; } - if (error) + if (error) { + reiserfs_write_unlock(inode->i_sb); goto out; + } /* * file size is changed, ctime and mtime are * to be updated @@ -3159,6 +3165,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME); } } + reiserfs_write_unlock(inode->i_sb); if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) || ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) && @@ -3183,14 +3190,16 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) return error; /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ + reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); if (error) goto out; - reiserfs_write_unlock_once(inode->i_sb, depth); error = dquot_transfer(inode, attr); - depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); goto out; } @@ -3202,17 +3211,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) inode->i_gid = attr->ia_gid; mark_inode_dirty(inode); error = journal_end(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); if (error) goto out; } - /* - * Relax the lock here, as it might truncate the - * inode pages and wait for inode pages locks. 
- * To release such page lock, the owner needs the - * reiserfs lock - */ - reiserfs_write_unlock_once(inode->i_sb, depth); if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { error = inode_newsize_ok(inode, attr->ia_size); @@ -3226,16 +3229,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) setattr_copy(inode, attr); mark_inode_dirty(inode); } - depth = reiserfs_write_lock_once(inode->i_sb); if (!error && reiserfs_posixacl(inode->i_sb)) { if (attr->ia_valid & ATTR_MODE) error = reiserfs_acl_chmod(inode); } - out: - reiserfs_write_unlock_once(inode->i_sb, depth); - +out: return error; } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 15cb5fe6b425..946ccbf5b5a1 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -167,7 +167,6 @@ int reiserfs_commit_write(struct file *f, struct page *page, int reiserfs_unpack(struct inode *inode, struct file *filp) { int retval = 0; - int depth; int index; struct page *page; struct address_space *mapping; @@ -183,11 +182,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) return 0; } - depth = reiserfs_write_lock_once(inode->i_sb); - /* we need to make sure nobody is changing the file size beneath us */ reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + reiserfs_write_lock(inode->i_sb); + write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ if (write_from == 0) { @@ -221,6 +220,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) out: mutex_unlock(&inode->i_mutex); - reiserfs_write_unlock_once(inode->i_sb, depth); + reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 742fdd4c209a..73feacc49b2e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -947,9 +947,11 @@ static int reiserfs_async_progress_wait(struct super_block *s) struct reiserfs_journal *j = SB_JOURNAL(s); if (atomic_read(&j->j_async_throttle)) { - reiserfs_write_unlock(s); + int depth; + + depth = reiserfs_write_unlock_nested(s); congestion_wait(BLK_RW_ASYNC, HZ / 10); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } return 0; @@ -972,6 +974,7 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); int retval = 0; int write_len; + int depth; reiserfs_check_lock_depth(s, "flush_commit_list"); @@ -1018,12 +1021,12 @@ static int flush_commit_list(struct super_block *s, * We might sleep in numerous places inside * write_ordered_buffers. Relax the write lock. 
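The flush_commit_list() hunks that follow keep the journal's existing strategy of submitting every commit-block write first and only then waiting, so the waits overlap with in-flight I/O; what changes is that each sleep now happens outside the write lock via the depth bracket. A simplified sketch of that shape (error handling, journal wraparound, and list bookkeeping omitted):

static void submit_then_wait(struct super_block *s, b_blocknr_t bn, int n)
{
	struct buffer_head *tbh;
	int i, depth;

	for (i = 0; i < n; i++) {		/* pass 1: async submits */
		tbh = journal_find_get_block(s, bn + i);
		if (tbh) {
			if (buffer_dirty(tbh)) {
				depth = reiserfs_write_unlock_nested(s);
				ll_rw_block(WRITE, 1, &tbh);
				reiserfs_write_lock_nested(s, depth);
			}
			put_bh(tbh);
		}
	}
	for (i = 0; i < n; i++) {		/* pass 2: wait, unlocked */
		tbh = journal_find_get_block(s, bn + i);
		if (!tbh)
			continue;
		depth = reiserfs_write_unlock_nested(s);
		__wait_on_buffer(tbh);
		reiserfs_write_lock_nested(s, depth);
		put_bh(tbh);
	}
}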
*/ - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_bh_list); if (ret < 0 && retval == 0) retval = ret; - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } BUG_ON(!list_empty(&jl->j_bh_list)); /* @@ -1043,9 +1046,9 @@ static int flush_commit_list(struct super_block *s, tbh = journal_find_get_block(s, bn); if (tbh) { if (buffer_dirty(tbh)) { - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); ll_rw_block(WRITE, 1, &tbh); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; } @@ -1057,17 +1060,17 @@ static int flush_commit_list(struct super_block *s, (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); - reiserfs_write_unlock(s); - wait_on_buffer(tbh); - reiserfs_write_lock(s); + depth = reiserfs_write_unlock_nested(s); + __wait_on_buffer(tbh); + reiserfs_write_lock_nested(s, depth); // since we're using ll_rw_blk above, it might have skipped over // a locked buffer. Double check here // /* redundant, sync_dirty_buffer() checks */ if (buffer_dirty(tbh)) { - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); sync_dirty_buffer(tbh); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } if (unlikely(!buffer_uptodate(tbh))) { #ifdef CONFIG_REISERFS_CHECK @@ -1091,12 +1094,12 @@ static int flush_commit_list(struct super_block *s, if (buffer_dirty(jl->j_commit_bh)) BUG(); mark_buffer_dirty(jl->j_commit_bh) ; - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); else sync_dirty_buffer(jl->j_commit_bh); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } /* If there was a write error in the journal - we can't commit this @@ -1228,15 +1231,16 @@ static int _update_journal_header_block(struct super_block *sb, { struct reiserfs_journal_header *jh; struct reiserfs_journal *journal = SB_JOURNAL(sb); + int depth; if (reiserfs_is_journal_aborted(journal)) return -EIO; if (trans_id >= journal->j_last_flush_trans_id) { if (buffer_locked((journal->j_header_bh))) { - reiserfs_write_unlock(sb); - wait_on_buffer((journal->j_header_bh)); - reiserfs_write_lock(sb); + depth = reiserfs_write_unlock_nested(sb); + __wait_on_buffer(journal->j_header_bh); + reiserfs_write_lock_nested(sb, depth); if (unlikely(!buffer_uptodate(journal->j_header_bh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(sb, "journal-699", @@ -1254,14 +1258,14 @@ static int _update_journal_header_block(struct super_block *sb, jh->j_mount_id = cpu_to_le32(journal->j_mount_id); set_buffer_dirty(journal->j_header_bh); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); if (reiserfs_barrier_flush(sb)) __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); else sync_dirty_buffer(journal->j_header_bh); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); if (!buffer_uptodate(journal->j_header_bh)) { reiserfs_warning(sb, "journal-837", "IO error during journal replay"); @@ -1341,6 +1345,7 @@ static int flush_journal_list(struct super_block *s, unsigned long j_len_saved = jl->j_len; struct reiserfs_journal *journal = SB_JOURNAL(s); int err = 0; + int depth; BUG_ON(j_len_saved <= 0); @@ -1495,9 +1500,9 @@ static int flush_journal_list(struct super_block *s, "cn->bh is NULL"); } - reiserfs_write_unlock(s); - wait_on_buffer(cn->bh); - reiserfs_write_lock(s); + depth = 
reiserfs_write_unlock_nested(s); + __wait_on_buffer(cn->bh); + reiserfs_write_lock_nested(s, depth); if (!cn->bh) { reiserfs_panic(s, "journal-1012", @@ -1974,6 +1979,7 @@ static int journal_compare_desc_commit(struct super_block *sb, /* returns 0 if it did not find a description block ** returns -1 if it found a corrupt commit block ** returns 1 if both desc and commit were valid +** NOTE: only called during fs mount */ static int journal_transaction_is_valid(struct super_block *sb, struct buffer_head *d_bh, @@ -2073,8 +2079,9 @@ static void brelse_array(struct buffer_head **heads, int num) /* ** given the start, and values for the oldest acceptable transactions, -** this either reads in a replays a transaction, or returns because the transaction -** is invalid, or too old. +** this either reads in and replays a transaction, or returns because the +** transaction is invalid, or too old. +** NOTE: only called during fs mount */ static int journal_read_transaction(struct super_block *sb, unsigned long cur_dblock, @@ -2208,10 +2215,7 @@ static int journal_read_transaction(struct super_block *sb, ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { - reiserfs_write_unlock(sb); wait_on_buffer(log_blocks[i]); - reiserfs_write_lock(sb); - if (!buffer_uptodate(log_blocks[i])) { reiserfs_warning(sb, "journal-1212", "REPLAY FAILURE fsck required! " @@ -2318,12 +2322,13 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, /* ** read and replay the log -** on a clean unmount, the journal header's next unflushed pointer will be to an invalid -** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast. -** -** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. -** +** on a clean unmount, the journal header's next unflushed pointer will +** be to an invalid transaction. This tests that before finding all the +** transactions in the log, which makes normal mount times fast. +** After a crash, this starts with the next unflushed transaction, and +** replays until it finds one too old, or invalid. ** On exit, it sets things up so the first transaction will work correctly. +** NOTE: only called during fs mount */ static int journal_read(struct super_block *sb) { @@ -2501,14 +2506,18 @@ static int journal_read(struct super_block *sb) "replayed %d transactions in %lu seconds\n", replay_count, get_seconds() - start); } + /* needed to satisfy the locking in _update_journal_header_block */ + reiserfs_write_lock(sb); if (!bdev_read_only(sb->s_bdev) && _update_journal_header_block(sb, journal->j_start, journal->j_last_flush_trans_id)) { + reiserfs_write_unlock(sb); /* replay failed, caller must call free_journal_ram and abort ** the mount */ return -1; } + reiserfs_write_unlock(sb); return 0; } @@ -2828,13 +2837,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } - /* - * Journal_read needs to be inspected in order to push down - * the lock further inside (or even remove it.
- */ - reiserfs_write_lock(sb); ret = journal_read(sb); - reiserfs_write_unlock(sb); if (ret < 0) { reiserfs_warning(sb, "reiserfs-2006", "Replay Failure, unable to mount"); @@ -2923,9 +2926,9 @@ static void queue_log_writer(struct super_block *s) add_wait_queue(&journal->j_join_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { - reiserfs_write_unlock(s); + int depth = reiserfs_write_unlock_nested(s); schedule(); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } __set_current_state(TASK_RUNNING); remove_wait_queue(&journal->j_join_wait, &wait); @@ -2943,9 +2946,12 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned long bcount = journal->j_bcount; while (1) { - reiserfs_write_unlock(sb); + int depth; + + depth = reiserfs_write_unlock_nested(sb); schedule_timeout_uninterruptible(1); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); + journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; while ((atomic_read(&journal->j_wcount) > 0 || atomic_read(&journal->j_jlock)) && @@ -2976,6 +2982,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct reiserfs_transaction_handle myth; int sched_count = 0; int retval; + int depth; reiserfs_check_lock_depth(sb, "journal_begin"); BUG_ON(nblocks > journal->j_trans_max); @@ -2996,9 +3003,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { unlock_journal(sb); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); reiserfs_wait_on_write_block(sb); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); PROC_INFO_INC(sb, journal.journal_relock_writers); goto relock; } @@ -3821,6 +3828,7 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb, if (test_clear_buffer_journal_restore_dirty(bh) && buffer_journal_dirty(bh)) { struct reiserfs_journal_cnode *cn; + reiserfs_write_lock(sb); cn = get_journal_hash_dev(sb, journal->j_list_hash_table, bh->b_blocknr); @@ -3828,6 +3836,7 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb, set_buffer_journal_test(bh); mark_buffer_dirty(bh); } + reiserfs_write_unlock(sb); } clear_buffer_journal_prepared(bh); } @@ -3911,6 +3920,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, unsigned long jindex; unsigned int commit_trans_id; int trans_half; + int depth; BUG_ON(th->t_refcount > 1); BUG_ON(!th->t_trans_id); @@ -4116,9 +4126,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, next = cn->next; free_cnode(sb, cn); cn = next; - reiserfs_write_unlock(sb); - cond_resched(); - reiserfs_write_lock(sb); + reiserfs_cond_resched(sb); } /* we are done with both the c_bh and d_bh, but @@ -4165,10 +4173,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, * is lost. 
*/ if (!list_empty(&jl->j_tail_bh_list)) { - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_tail_bh_list); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); } BUG_ON(!list_empty(&jl->j_tail_bh_list)); mutex_unlock(&jl->j_commit_mutex); diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index d735bc8470e3..045b83ef9fd9 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -48,30 +48,35 @@ void reiserfs_write_unlock(struct super_block *s) } } -/* - * If we already own the lock, just exit and don't increase the depth. - * Useful when we don't want to lock more than once. - * - * We always return the lock_depth we had before calling - * this function. - */ -int reiserfs_write_lock_once(struct super_block *s) +int __must_check reiserfs_write_unlock_nested(struct super_block *s) { struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + int depth; - if (sb_i->lock_owner != current) { - mutex_lock(&sb_i->lock); - sb_i->lock_owner = current; - return sb_i->lock_depth++; - } + /* this can happen when the lock isn't always held */ + if (sb_i->lock_owner != current) + return -1; + + depth = sb_i->lock_depth; + + sb_i->lock_depth = -1; + sb_i->lock_owner = NULL; + mutex_unlock(&sb_i->lock); - return sb_i->lock_depth; + return depth; } -void reiserfs_write_unlock_once(struct super_block *s, int lock_depth) +void reiserfs_write_lock_nested(struct super_block *s, int depth) { - if (lock_depth == -1) - reiserfs_write_unlock(s); + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + /* this can happen when the lock isn't always held */ + if (depth == -1) + return; + + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + sb_i->lock_depth = depth; } /* @@ -82,9 +87,7 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller) { struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); - if (sb_i->lock_depth < 0) - reiserfs_panic(sb, "%s called without kernel lock held %d", - caller); + WARN_ON(sb_i->lock_depth < 0); } #ifdef CONFIG_REISERFS_CHECK diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 8567fb847601..dc5236f6de1b 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -325,7 +325,6 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int retval; - int lock_depth; struct inode *inode = NULL; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); @@ -333,12 +332,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) return ERR_PTR(-ENAMETOOLONG); - /* - * Might be called with or without the write lock, must be careful - * to not recursively hold it in case we want to release the lock - * before rescheduling. 
- */ - lock_depth = reiserfs_write_lock_once(dir->i_sb); + reiserfs_write_lock(dir->i_sb); de.de_gen_number_bit_string = NULL; retval = @@ -349,7 +343,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); if (!inode || IS_ERR(inode)) { - reiserfs_write_unlock_once(dir->i_sb, lock_depth); + reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-EACCES); } @@ -358,7 +352,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_PRIVATE(dir)) inode->i_flags |= S_PRIVATE; } - reiserfs_write_unlock_once(dir->i_sb, lock_depth); + reiserfs_write_unlock(dir->i_sb); if (retval == IO_ERROR) { return ERR_PTR(-EIO); } @@ -727,7 +721,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct inode *inode; struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - int lock_depth; /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + @@ -753,7 +746,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode return retval; } jbegin_count += retval; - lock_depth = reiserfs_write_lock_once(dir->i_sb); + reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) { @@ -804,7 +797,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode d_instantiate(dentry, inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: - reiserfs_write_unlock_once(dir->i_sb, lock_depth); + reiserfs_write_unlock(dir->i_sb); return retval; } @@ -920,7 +913,6 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) struct reiserfs_transaction_handle th; int jbegin_count; unsigned long savelink; - int depth; dquot_initialize(dir); @@ -934,7 +926,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - depth = reiserfs_write_lock_once(dir->i_sb); + reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) goto out_unlink; @@ -995,7 +987,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) retval = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_check_path(&path); - reiserfs_write_unlock_once(dir->i_sb, depth); + reiserfs_write_unlock(dir->i_sb); return retval; end_unlink: @@ -1005,7 +997,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) if (err) retval = err; out_unlink: - reiserfs_write_unlock_once(dir->i_sb, depth); + reiserfs_write_unlock(dir->i_sb); return retval; } diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index c0b1112ab7e3..54944d5a4a6e 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -358,12 +358,13 @@ void __reiserfs_panic(struct super_block *sb, const char *id, dump_stack(); #endif if (sb) - panic(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", + printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", sb->s_id, id ? id : "", id ? " " : "", function, error_buf); else - panic(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", + printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", id ? id : "", id ? 
" " : "", function, error_buf); + BUG(); } void __reiserfs_error(struct super_block *sb, const char *id, diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 33532f79b4f7..a958444a75fc 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -19,12 +19,13 @@ /* * LOCKING: * - * We rely on new Alexander Viro's super-block locking. + * These guys are evicted from procfs as the very first step in ->kill_sb(). * */ -static int show_version(struct seq_file *m, struct super_block *sb) +static int show_version(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; char *format; if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { @@ -66,8 +67,9 @@ static int show_version(struct seq_file *m, struct super_block *sb) #define DJP( x ) le32_to_cpu( jp -> x ) #define JF( x ) ( r -> s_journal -> x ) -static int show_super(struct seq_file *m, struct super_block *sb) +static int show_super(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "state: \t%s\n" @@ -128,8 +130,9 @@ static int show_super(struct seq_file *m, struct super_block *sb) return 0; } -static int show_per_level(struct seq_file *m, struct super_block *sb) +static int show_per_level(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); int level; @@ -186,8 +189,9 @@ static int show_per_level(struct seq_file *m, struct super_block *sb) return 0; } -static int show_bitmap(struct seq_file *m, struct super_block *sb) +static int show_bitmap(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "free_block: %lu\n" @@ -218,8 +222,9 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb) return 0; } -static int show_on_disk_super(struct seq_file *m, struct super_block *sb) +static int show_on_disk_super(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; int hash_code = DFL(s_hash_function_code); @@ -261,8 +266,9 @@ static int show_on_disk_super(struct seq_file *m, struct super_block *sb) return 0; } -static int show_oidmap(struct seq_file *m, struct super_block *sb) +static int show_oidmap(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); @@ -291,8 +297,9 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb) return 0; } -static int show_journal(struct seq_file *m, struct super_block *sb) +static int show_journal(struct seq_file *m, void *unused) { + struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); struct reiserfs_super_block *rs = r->s_rs; struct journal_params *jp = &rs->s_v1.s_journal; @@ -383,92 +390,24 @@ static int show_journal(struct seq_file *m, struct super_block *sb) return 0; } -/* iterator */ -static int test_sb(struct super_block *sb, void *data) -{ - return data == sb; -} - -static int set_sb(struct super_block *sb, void *data) -{ - return -ENOENT; -} - -struct reiserfs_seq_private { - struct super_block *sb; - int (*show) (struct seq_file *, struct super_block *); -}; - -static void *r_start(struct seq_file *m, loff_t * pos) -{ - struct reiserfs_seq_private *priv = m->private; - loff_t l = *pos; 
- - if (l) - return NULL; - - if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb))) - return NULL; - - up_write(&priv->sb->s_umount); - return priv->sb; -} - -static void *r_next(struct seq_file *m, void *v, loff_t * pos) -{ - ++*pos; - if (v) - deactivate_super(v); - return NULL; -} - -static void r_stop(struct seq_file *m, void *v) -{ - if (v) - deactivate_super(v); -} - -static int r_show(struct seq_file *m, void *v) -{ - struct reiserfs_seq_private *priv = m->private; - return priv->show(m, v); -} - -static const struct seq_operations r_ops = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = r_show, -}; - static int r_open(struct inode *inode, struct file *file) { - struct reiserfs_seq_private *priv; - int ret = seq_open_private(file, &r_ops, - sizeof(struct reiserfs_seq_private)); - - if (!ret) { - struct seq_file *m = file->private_data; - priv = m->private; - priv->sb = proc_get_parent_data(inode); - priv->show = PDE_DATA(inode); - } - return ret; + return single_open(file, PDE_DATA(inode), + proc_get_parent_data(inode)); } static const struct file_operations r_file_operations = { .open = r_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, - .owner = THIS_MODULE, + .release = single_release, }; static struct proc_dir_entry *proc_info_root = NULL; static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, - int (*func) (struct seq_file *, struct super_block *)) + int (*func) (struct seq_file *, void *)) { proc_create_data(name, 0, REISERFS_SB(sb)->procdir, &r_file_operations, func); diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 3df5ce6c724d..f8adaee537c2 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -630,8 +630,8 @@ static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal */ void reiserfs_write_lock(struct super_block *s); void reiserfs_write_unlock(struct super_block *s); -int reiserfs_write_lock_once(struct super_block *s); -void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); +int __must_check reiserfs_write_unlock_nested(struct super_block *s); +void reiserfs_write_lock_nested(struct super_block *s, int depth); #ifdef CONFIG_REISERFS_CHECK void reiserfs_lock_check_recursive(struct super_block *s); @@ -667,31 +667,33 @@ static inline void reiserfs_lock_check_recursive(struct super_block *s) { } * - The inode mutex */ static inline void reiserfs_mutex_lock_safe(struct mutex *m, - struct super_block *s) + struct super_block *s) { - reiserfs_lock_check_recursive(s); - reiserfs_write_unlock(s); + int depth; + + depth = reiserfs_write_unlock_nested(s); mutex_lock(m); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } static inline void reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, - struct super_block *s) + struct super_block *s) { - reiserfs_lock_check_recursive(s); - reiserfs_write_unlock(s); + int depth; + + depth = reiserfs_write_unlock_nested(s); mutex_lock_nested(m, subclass); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } static inline void reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) { - reiserfs_lock_check_recursive(s); - reiserfs_write_unlock(s); - down_read(sem); - reiserfs_write_lock(s); + int depth; + depth = reiserfs_write_unlock_nested(s); + down_read(sem); + reiserfs_write_lock_nested(s, depth); } /* @@ -701,9 +703,11 @@ reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block 
*s) static inline void reiserfs_cond_resched(struct super_block *s) { if (need_resched()) { - reiserfs_write_unlock(s); + int depth; + + depth = reiserfs_write_unlock_nested(s); schedule(); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } } diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 3ce02cff5e90..a4ef5cd606eb 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -34,6 +34,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) unsigned long int block_count, free_blocks; int i; int copy_size; + int depth; sb = SB_DISK_SUPER_BLOCK(s); @@ -43,7 +44,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) } /* check the device size */ + depth = reiserfs_write_unlock_nested(s); bh = sb_bread(s, block_count_new - 1); + reiserfs_write_lock_nested(s, depth); if (!bh) { printk("reiserfs_resize: can\'t read last block\n"); return -EINVAL; @@ -125,9 +128,12 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) * transaction begins, and the new bitmaps don't matter if the * transaction fails. */ for (i = bmap_nr; i < bmap_nr_new; i++) { + int depth; /* don't use read_bitmap_block since it will cache * the uninitialized bitmap */ + depth = reiserfs_write_unlock_nested(s); bh = sb_bread(s, i * s->s_blocksize * 8); + reiserfs_write_lock_nested(s, depth); if (!bh) { vfree(bitmap); return -EIO; @@ -138,9 +144,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) set_buffer_uptodate(bh); mark_buffer_dirty(bh); - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); sync_dirty_buffer(bh); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); // update bitmap_info stuff bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; brelse(bh); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 2f40a4c70a4d..b14706a05d52 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -524,14 +524,14 @@ static int is_tree_node(struct buffer_head *bh, int level) * the caller (search_by_key) will perform other schedule-unsafe * operations just after calling this function. * - * @return true if we have unlocked + * @return depth of lock to be restored after read completes */ -static bool search_by_key_reada(struct super_block *s, +static int search_by_key_reada(struct super_block *s, struct buffer_head **bh, b_blocknr_t *b, int num) { int i, j; - bool unlocked = false; + int depth = -1; for (i = 0; i < num; i++) { bh[i] = sb_getblk(s, b[i]); @@ -549,15 +549,13 @@ static bool search_by_key_reada(struct super_block *s, * you have to make sure the prepared bit isn't set on this buffer */ if (!buffer_uptodate(bh[j])) { - if (!unlocked) { - reiserfs_write_unlock(s); - unlocked = true; - } + if (depth == -1) + depth = reiserfs_write_unlock_nested(s); ll_rw_block(READA, 1, bh + j); } brelse(bh[j]); } - return unlocked; + return depth; } /************************************************************************** @@ -645,26 +643,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s have a pointer to it. 
*/ if ((bh = last_element->pe_buffer = sb_getblk(sb, block_number))) { - bool unlocked = false; - if (!buffer_uptodate(bh) && reada_count > 1) - /* may unlock the write lock */ - unlocked = search_by_key_reada(sb, reada_bh, - reada_blocks, reada_count); /* - * If we haven't already unlocked the write lock, - * then we need to do that here before reading - * the current block + * We'll need to drop the lock if we encounter any + * buffers that need to be read. If all of them are + * already up to date, we don't need to drop the lock. */ - if (!buffer_uptodate(bh) && !unlocked) { - reiserfs_write_unlock(sb); - unlocked = true; - } + int depth = -1; + + if (!buffer_uptodate(bh) && reada_count > 1) + depth = search_by_key_reada(sb, reada_bh, + reada_blocks, reada_count); + + if (!buffer_uptodate(bh) && depth == -1) + depth = reiserfs_write_unlock_nested(sb); + ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); - if (unlocked) - reiserfs_write_lock(sb); + if (depth != -1) + reiserfs_write_lock_nested(sb, depth); if (!buffer_uptodate(bh)) goto io_error; } else { @@ -1059,9 +1057,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st reiserfs_free_block(th, inode, block, 1); } - reiserfs_write_unlock(sb); - cond_resched(); - reiserfs_write_lock(sb); + reiserfs_cond_resched(sb); if (item_moved (&s_ih, path)) { need_re_search = 1; @@ -1190,6 +1186,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct item_head *q_ih; int quota_cut_bytes; int ret_value, del_size, removed; + int depth; #ifdef CONFIG_REISERFS_CHECK char mode; @@ -1299,7 +1296,9 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, inode->i_uid, head2type(&s_ih)); #endif + depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); /* Return deleted body length */ return ret_value; @@ -1325,6 +1324,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, struct inode *inode, struct reiserfs_key *key) { + struct super_block *sb = th->t_super; struct tree_balance tb; INITIALIZE_PATH(path); int item_len = 0; @@ -1377,14 +1377,17 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, if (retval == CARRY_ON) { do_balance(&tb, NULL, NULL, M_DELETE); if (inode) { /* Should we count quota for item? 
(we don't count quotas for save-links) */ + int depth; #ifdef REISERQUOTA_DEBUG reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, "reiserquota delete_solid_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, key2type(key)); #endif + depth = reiserfs_write_unlock_nested(sb); dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(sb, depth); } break; } @@ -1561,6 +1564,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, int retval2 = -1; int quota_cut_bytes; loff_t tail_pos = 0; + int depth; BUG_ON(!th->t_trans_id); @@ -1733,7 +1737,9 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, '?'); #endif + depth = reiserfs_write_unlock_nested(sb); dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(sb, depth); return ret_value; } @@ -1953,9 +1959,11 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree const char *body, /* Pointer to the bytes to paste. */ int pasted_size) { /* Size of pasted bytes. */ + struct super_block *sb = inode->i_sb; struct tree_balance s_paste_balance; int retval; int fs_gen; + int depth; BUG_ON(!th->t_trans_id); @@ -1968,9 +1976,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(sb); retval = dquot_alloc_space_nodirty(inode, pasted_size); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(sb, depth); if (retval) { pathrelse(search_path); return retval; @@ -2027,7 +2035,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree pasted_size, inode->i_uid, key2type(&(key->on_disk_key))); #endif + depth = reiserfs_write_unlock_nested(sb); dquot_free_space_nodirty(inode, pasted_size); + reiserfs_write_lock_nested(sb, depth); return retval; } @@ -2050,6 +2060,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); if (inode) { /* Do we count quotas for item? */ + int depth; fs_gen = get_generation(inode->i_sb); quota_bytes = ih_item_len(ih); @@ -2063,11 +2074,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif - reiserfs_write_unlock(inode->i_sb); /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... 
*/ + depth = reiserfs_write_unlock_nested(inode->i_sb); retval = dquot_alloc_space_nodirty(inode, quota_bytes); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (retval) { pathrelse(path); return retval; @@ -2118,7 +2129,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif - if (inode) + if (inode) { + int depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_space_nodirty(inode, quota_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); + } return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f8a23c3078f8..3ead145dadc4 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -243,6 +243,7 @@ static int finish_unfinished(struct super_block *s) done = 0; REISERFS_SB(s)->s_is_unlinked_ok = 1; while (!retval) { + int depth; retval = search_item(s, &max_cpu_key, &path); if (retval != ITEM_NOT_FOUND) { reiserfs_error(s, "vs-2140", @@ -298,9 +299,9 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_initialize(inode); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(inode->i_sb, depth); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. @@ -356,10 +357,12 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Turn quotas off */ + reiserfs_write_unlock(s); for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(s)->files[i] && quota_enabled[i]) dquot_quota_off(s, i); } + reiserfs_write_lock(s); if (ms_active_set) /* Restore the flag back */ s->s_flags &= ~MS_ACTIVE; @@ -499,6 +502,7 @@ int remove_save_link(struct inode *inode, int truncate) static void reiserfs_kill_sb(struct super_block *s) { if (REISERFS_SB(s)) { + reiserfs_proc_info_done(s); /* * Force any pending inode evictions to occur now. 
Any * inodes to be removed that have extended attributes @@ -554,8 +558,6 @@ static void reiserfs_put_super(struct super_block *s) REISERFS_SB(s)->reserved_blocks); } - reiserfs_proc_info_done(s); - reiserfs_write_unlock(s); mutex_destroy(&REISERFS_SB(s)->lock); kfree(s->s_fs_info); @@ -624,7 +626,6 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags) struct reiserfs_transaction_handle th; int err = 0; - int lock_depth; if (inode->i_sb->s_flags & MS_RDONLY) { reiserfs_warning(inode->i_sb, "clm-6006", @@ -632,7 +633,7 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags) inode->i_ino); return; } - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); /* this is really only used for atime updates, so they don't have ** to be included in O_SYNC or fsync @@ -645,7 +646,7 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags) journal_end(&th, inode->i_sb, 1); out: - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); } static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) @@ -1335,7 +1336,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) kfree(qf_names[i]); #endif err = -EINVAL; - goto out_unlock; + goto out_err_unlock; } #ifdef CONFIG_QUOTA handle_quota_files(s, qf_names, &qfmt); @@ -1379,35 +1380,32 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (blocks) { err = reiserfs_resize(s, blocks); if (err != 0) - goto out_unlock; + goto out_err_unlock; } if (*mount_flags & MS_RDONLY) { + reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); /* remount read-only */ if (s->s_flags & MS_RDONLY) /* it is read-only already */ - goto out_ok; + goto out_ok_unlocked; - /* - * Drop write lock. Quota will retake it when needed and lock - * ordering requires calling dquot_suspend() without it. - */ - reiserfs_write_unlock(s); err = dquot_suspend(s, -1); if (err < 0) goto out_err; - reiserfs_write_lock(s); /* try to remount file system with read-only permissions */ if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { - goto out_ok; + goto out_ok_unlocked; } + reiserfs_write_lock(s); + err = journal_begin(&th, s, 10); if (err) - goto out_unlock; + goto out_err_unlock; /* Mounting a rw partition read-only. 
*/ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1416,13 +1414,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) } else { /* remount read-write */ if (!(s->s_flags & MS_RDONLY)) { + reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); - goto out_ok; /* We are read-write already */ + goto out_ok_unlocked; /* We are read-write already */ } if (reiserfs_is_journal_aborted(journal)) { err = journal->j_errno; - goto out_unlock; + goto out_err_unlock; } handle_data_mode(s, mount_options); @@ -1431,7 +1430,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ err = journal_begin(&th, s, 10); if (err) - goto out_unlock; + goto out_err_unlock; /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1448,26 +1447,22 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) SB_JOURNAL(s)->j_must_wait = 1; err = journal_end(&th, s, 10); if (err) - goto out_unlock; + goto out_err_unlock; + reiserfs_write_unlock(s); if (!(*mount_flags & MS_RDONLY)) { - /* - * Drop write lock. Quota will retake it when needed and lock - * ordering requires calling dquot_resume() without it. - */ - reiserfs_write_unlock(s); dquot_resume(s, -1); reiserfs_write_lock(s); finish_unfinished(s); + reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); } -out_ok: +out_ok_unlocked: replace_mount_options(s, new_opts); - reiserfs_write_unlock(s); return 0; -out_unlock: +out_err_unlock: reiserfs_write_unlock(s); out_err: kfree(new_opts); @@ -2014,12 +2009,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) goto error; } + reiserfs_write_unlock(s); if ((errval = reiserfs_lookup_privroot(s)) || (errval = reiserfs_xattr_init(s, s->s_flags))) { dput(s->s_root); s->s_root = NULL; - goto error; + goto error_unlocked; } + reiserfs_write_lock(s); /* look for files which were to be removed in previous session */ finish_unfinished(s); @@ -2028,12 +2025,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) reiserfs_info(s, "using 3.5.x disk format\n"); } + reiserfs_write_unlock(s); if ((errval = reiserfs_lookup_privroot(s)) || (errval = reiserfs_xattr_init(s, s->s_flags))) { dput(s->s_root); s->s_root = NULL; - goto error; + goto error_unlocked; } + reiserfs_write_lock(s); } // mark hash in super block: it could be unset. 
overwrite should be ok set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); @@ -2101,6 +2100,7 @@ static int reiserfs_write_dquot(struct dquot *dquot) { struct reiserfs_transaction_handle th; int ret, err; + int depth; reiserfs_write_lock(dquot->dq_sb); ret = @@ -2108,9 +2108,9 @@ static int reiserfs_write_dquot(struct dquot *dquot) REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (ret) goto out; - reiserfs_write_unlock(dquot->dq_sb); + depth = reiserfs_write_unlock_nested(dquot->dq_sb); ret = dquot_commit(dquot); - reiserfs_write_lock(dquot->dq_sb); + reiserfs_write_lock_nested(dquot->dq_sb, depth); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); @@ -2125,6 +2125,7 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) { struct reiserfs_transaction_handle th; int ret, err; + int depth; reiserfs_write_lock(dquot->dq_sb); ret = @@ -2132,9 +2133,9 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (ret) goto out; - reiserfs_write_unlock(dquot->dq_sb); + depth = reiserfs_write_unlock_nested(dquot->dq_sb); ret = dquot_acquire(dquot); - reiserfs_write_lock(dquot->dq_sb); + reiserfs_write_lock_nested(dquot->dq_sb, depth); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); @@ -2187,15 +2188,16 @@ static int reiserfs_write_info(struct super_block *sb, int type) { struct reiserfs_transaction_handle th; int ret, err; + int depth; /* Data block + inode block */ reiserfs_write_lock(sb); ret = journal_begin(&th, sb, 2); if (ret) goto out; - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); ret = dquot_commit_info(sb, type); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); err = journal_end(&th, sb, 2); if (!ret && err) ret = err; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index c69cdd749f09..8a9e2dcfe004 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -81,8 +81,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, - I_MUTEX_CHILD, dir->i_sb); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); mutex_unlock(&dentry->d_inode->i_mutex); @@ -96,8 +95,7 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, - I_MUTEX_CHILD, dir->i_sb); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) dentry->d_inode->i_flags |= S_DEAD; @@ -232,22 +230,17 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) return 0; - reiserfs_write_unlock(inode->i_sb); dir = open_xa_dir(inode, XATTR_REPLACE); if (IS_ERR(dir)) { err = PTR_ERR(dir); - reiserfs_write_lock(inode->i_sb); goto out; } else if (!dir->d_inode) { err = 0; - reiserfs_write_lock(inode->i_sb); goto out_dir; } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - reiserfs_write_lock(inode->i_sb); - buf.xadir = dir; while (1) { err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); @@ -281,14 +274,17 @@ static int reiserfs_for_each_xattr(struct inode *inode, int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; + reiserfs_write_lock(inode->i_sb); err = 
journal_begin(&th, inode->i_sb, blocks); + reiserfs_write_unlock(inode->i_sb); if (!err) { int jerror; - reiserfs_mutex_lock_nested_safe( - &dir->d_parent->d_inode->i_mutex, - I_MUTEX_XATTR, inode->i_sb); + mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, + I_MUTEX_XATTR); err = action(dir, data); + reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th, inode->i_sb, blocks); + reiserfs_write_unlock(inode->i_sb); mutex_unlock(&dir->d_parent->d_inode->i_mutex); err = jerror ?: err; } @@ -455,9 +451,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) } if (dentry->d_inode) { - reiserfs_write_lock(inode->i_sb); err = xattr_unlink(xadir->d_inode, dentry); - reiserfs_write_unlock(inode->i_sb); update_ctime(inode); } @@ -491,24 +485,17 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - reiserfs_write_unlock(inode->i_sb); - if (!buffer) { err = lookup_and_delete_xattr(inode, name); - reiserfs_write_lock(inode->i_sb); return err; } dentry = xattr_lookup(inode, name, flags); - if (IS_ERR(dentry)) { - reiserfs_write_lock(inode->i_sb); + if (IS_ERR(dentry)) return PTR_ERR(dentry); - } down_write(&REISERFS_I(inode)->i_xattr_sem); - reiserfs_write_lock(inode->i_sb); - xahash = xattr_hash(buffer, buffer_size); while (buffer_pos < buffer_size || buffer_pos == 0) { size_t chunk; @@ -538,6 +525,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, rxh->h_hash = cpu_to_le32(xahash); } + reiserfs_write_lock(inode->i_sb); err = __reiserfs_write_begin(page, page_offset, chunk + skip); if (!err) { if (buffer) @@ -546,6 +534,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, page_offset + chunk + skip); } + reiserfs_write_unlock(inode->i_sb); unlock_page(page); reiserfs_put_page(page); buffer_pos += chunk; @@ -563,10 +552,8 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - reiserfs_write_unlock(inode->i_sb); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); inode_dio_wait(dentry->d_inode); - reiserfs_write_lock(inode->i_sb); err = reiserfs_setattr(dentry, &newattrs); mutex_unlock(&dentry->d_inode->i_mutex); @@ -592,18 +579,19 @@ int reiserfs_xattr_set(struct inode *inode, const char *name, reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); if (error) { - reiserfs_write_unlock(inode->i_sb); return error; } error = reiserfs_xattr_set_handle(&th, inode, name, buffer, buffer_size, flags); + reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); if (error == 0) error = error2; - reiserfs_write_unlock(inode->i_sb); return error; } @@ -968,7 +956,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); + mutex_lock(&s->s_root->d_inode->i_mutex); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -996,14 +984,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) goto error; if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { - reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); + mutex_lock(&s->s_root->d_inode->i_mutex); err = create_privroot(REISERFS_SB(s)->priv_root); mutex_unlock(&s->s_root->d_inode->i_mutex); } if (privroot->d_inode) { 
s->s_xattr = reiserfs_xattr_handlers; - reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s); + mutex_lock(&privroot->d_inode->i_mutex); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; dentry = lookup_one_len(XAROOT_NAME, privroot, diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 6c8767fdfc6a..06c04f73da65 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -49,13 +49,15 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, jcreate_blocks); + reiserfs_write_unlock(inode->i_sb); if (error == 0) { error = reiserfs_set_acl(&th, inode, type, acl); + reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th, inode->i_sb, jcreate_blocks); + reiserfs_write_unlock(inode->i_sb); if (error2) error = error2; } - reiserfs_write_unlock(inode->i_sb); release_and_out: posix_acl_release(acl); @@ -435,12 +437,14 @@ int reiserfs_cache_default_acl(struct inode *inode) return nblocks; } +/* + * Called under i_mutex + */ int reiserfs_acl_chmod(struct inode *inode) { struct reiserfs_transaction_handle th; struct posix_acl *acl; size_t size; - int depth; int error; if (IS_PRIVATE(inode)) @@ -454,9 +458,7 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; } - reiserfs_write_unlock(inode->i_sb); acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - reiserfs_write_lock(inode->i_sb); if (!acl) return 0; if (IS_ERR(acl)) @@ -466,16 +468,18 @@ int reiserfs_acl_chmod(struct inode *inode) return error; size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count)); - depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, size * 2); + reiserfs_write_unlock(inode->i_sb); if (!error) { int error2; error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl); + reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th, inode->i_sb, size * 2); + reiserfs_write_unlock(inode->i_sb); if (error2) error = error2; } - reiserfs_write_unlock_once(inode->i_sb, depth); posix_acl_release(acl); return error; } diff --git a/fs/stat.c b/fs/stat.c index 04ce1ac20d20..d0ea7ef75e26 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -447,9 +447,8 @@ void inode_add_bytes(struct inode *inode, loff_t bytes) EXPORT_SYMBOL(inode_add_bytes); -void inode_sub_bytes(struct inode *inode, loff_t bytes) +void __inode_sub_bytes(struct inode *inode, loff_t bytes) { - spin_lock(&inode->i_lock); inode->i_blocks -= bytes >> 9; bytes &= 511; if (inode->i_bytes < bytes) { @@ -457,6 +456,14 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes) inode->i_bytes += 512; } inode->i_bytes -= bytes; +} + +EXPORT_SYMBOL(__inode_sub_bytes); + +void inode_sub_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_sub_bytes(inode, bytes); spin_unlock(&inode->i_lock); } diff --git a/fs/super.c b/fs/super.c index 68307c029228..f6961ea84c56 100644 --- a/fs/super.c +++ b/fs/super.c @@ -71,7 +71,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) if (!grab_super_passive(sb)) return -1; - if (sb->s_op && sb->s_op->nr_cached_objects) + if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb); total_objects = sb->s_nr_dentry_unused + @@ -152,15 +152,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) static const struct super_operations default_op; if (s) { - if (security_sb_alloc(s)) { - /* - * We cannot call security_sb_free() without - * 
security_sb_alloc() succeeding. So bail out manually - */ - kfree(s); - s = NULL; - goto out; - } + if (security_sb_alloc(s)) + goto out_free_sb; + #ifdef CONFIG_SMP s->s_files = alloc_percpu(struct list_head); if (!s->s_files) @@ -228,6 +222,7 @@ err_out: free_percpu(s->s_files); #endif destroy_sb_writers(s); +out_free_sb: kfree(s); s = NULL; goto out; @@ -414,6 +409,11 @@ void generic_shutdown_super(struct super_block *sb) evict_inodes(sb); + if (sb->s_dio_done_wq) { + destroy_workqueue(sb->s_dio_done_wq); + sb->s_dio_done_wq = NULL; + } + if (sop->put_super) sop->put_super(sb); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 15c68f9489ae..c590cabd57bb 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -22,8 +22,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/mm.h> - -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sysfs.h" @@ -391,7 +390,7 @@ out_unlock: return rc; } -static int open(struct inode * inode, struct file * file) +static int open(struct inode *inode, struct file *file) { struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; @@ -435,7 +434,7 @@ static int open(struct inode * inode, struct file * file) return error; } -static int release(struct inode * inode, struct file * file) +static int release(struct inode *inode, struct file *file) { struct bin_buffer *bb = file->private_data; @@ -481,7 +480,6 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd) * @kobj: object. * @attr: attribute descriptor. */ - int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { @@ -489,19 +487,16 @@ int sysfs_create_bin_file(struct kobject *kobj, return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); } - +EXPORT_SYMBOL_GPL(sysfs_create_bin_file); /** * sysfs_remove_bin_file - remove binary file for object. * @kobj: object. * @attr: attribute descriptor. */ - void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name); } - -EXPORT_SYMBOL_GPL(sysfs_create_bin_file); EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e068e744dbdd..4d83cedb9fcb 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -46,7 +46,7 @@ static unsigned int sysfs_name_hash(const void *ns, const char *name) unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); - hash = ( end_name_hash(hash) ^ hash_ptr( (void *)ns, 31 ) ); + hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 1) @@ -258,7 +258,7 @@ static void sysfs_free_ino(unsigned int ino) spin_unlock(&sysfs_ino_lock); } -void release_sysfs_dirent(struct sysfs_dirent * sd) +void release_sysfs_dirent(struct sysfs_dirent *sd) { struct sysfs_dirent *parent_sd; @@ -297,7 +297,6 @@ static int sysfs_dentry_delete(const struct dentry *dentry) static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) { struct sysfs_dirent *sd; - int is_dir; int type; if (flags & LOOKUP_RCU) @@ -341,18 +340,15 @@ out_bad: * is performed at its new name the dentry will be readded * to the dcache hashes. */ - is_dir = (sysfs_type(sd) == SYSFS_DIR); mutex_unlock(&sysfs_mutex); - if (is_dir) { - /* If we have submounts we must allow the vfs caches - * to lie about the state of the filesystem to prevent - * leaks and other nasty things. 
- */ - if (have_submounts(dentry)) - goto out_valid; - shrink_dcache_parent(dentry); - } - d_drop(dentry); + + /* If we have submounts we must allow the vfs caches + * to lie about the state of the filesystem to prevent + * leaks and other nasty things. + */ + if (check_submounts_and_drop(dentry) != 0) + goto out_valid; + return 0; } @@ -451,7 +447,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(acxt->parent_sd)? "required": "invalid", + sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid", acxt->parent_sd->s_name, sd->s_name); return -EINVAL; } @@ -619,7 +615,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, if (!!sysfs_ns_type(parent_sd) != !!ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd)? "required": "invalid", + sysfs_ns_type(parent_sd) ? "required" : "invalid", parent_sd->s_name, name); return NULL; } @@ -674,7 +670,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, enum kobj_ns_type type, const void *ns, const char *name, struct sysfs_dirent **p_sd) { - umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; int rc; @@ -735,9 +731,9 @@ static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) /** * sysfs_create_dir - create a directory for an object. - * @kobj: object we're creating directory for. + * @kobj: object we're creating directory for. */ -int sysfs_create_dir(struct kobject * kobj) +int sysfs_create_dir(struct kobject *kobj) { enum kobj_ns_type type; struct sysfs_dirent *parent_sd, *sd; @@ -764,8 +760,8 @@ int sysfs_create_dir(struct kobject * kobj) return error; } -static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) +static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) { struct dentry *ret = NULL; struct dentry *parent = dentry->d_parent; @@ -857,7 +853,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) * what used to be sysfs_rmdir() below, instead of calling separately. */ -void sysfs_remove_dir(struct kobject * kobj) +void sysfs_remove_dir(struct kobject *kobj) { struct sysfs_dirent *sd = kobj->sd; @@ -896,7 +892,9 @@ int sysfs_rename(struct sysfs_dirent *sd, sd->s_name = new_name; } - /* Move to the appropriate place in the appropriate directories rbtree. */ + /* + * Move to the appropriate place in the appropriate directories rbtree. 
+ */ sysfs_unlink_sibling(sd); sysfs_get(new_parent_sd); sysfs_put(sd->s_parent); @@ -988,20 +986,21 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) do { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } while (pos && pos->s_ns != ns); + if (pos) + do { + struct rb_node *node = rb_next(&pos->s_rb); + if (!node) + pos = NULL; + else + pos = to_sysfs_dirent(node); + } while (pos && pos->s_ns != ns); return pos; } static int sysfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct sysfs_dirent * parent_sd = dentry->d_fsdata; + struct sysfs_dirent *parent_sd = dentry->d_fsdata; struct sysfs_dirent *pos = file->private_data; enum kobj_ns_type type; const void *ns; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index d2bb7ed8fa74..15ef5eb13663 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -20,7 +20,7 @@ #include <linux/list.h> #include <linux/mutex.h> #include <linux/limits.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sysfs.h" @@ -45,8 +45,8 @@ struct sysfs_open_dirent { struct sysfs_buffer { size_t count; loff_t pos; - char * page; - const struct sysfs_ops * ops; + char *page; + const struct sysfs_ops *ops; struct mutex mutex; int needs_read_fill; int event; @@ -59,16 +59,16 @@ struct sysfs_buffer { * @buffer: data buffer for file. * * Allocate @buffer->page, if it hasn't been already, then call the - * kobject's show() method to fill the buffer with this attribute's - * data. + * kobject's show() method to fill the buffer with this attribute's + * data. * This is called only once, on the file's first read unless an error * is returned. */ -static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) +static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer) { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - const struct sysfs_ops * ops = buffer->ops; + const struct sysfs_ops *ops = buffer->ops; int ret = 0; ssize_t count; @@ -106,7 +106,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer } /** - * sysfs_read_file - read an attribute. + * sysfs_read_file - read an attribute. * @file: file pointer. * @buf: buffer to fill. * @count: number of bytes to read. @@ -127,12 +127,12 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer static ssize_t sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct sysfs_buffer * buffer = file->private_data; + struct sysfs_buffer *buffer = file->private_data; ssize_t retval = 0; mutex_lock(&buffer->mutex); if (buffer->needs_read_fill || *ppos == 0) { - retval = fill_read_buffer(file->f_path.dentry,buffer); + retval = fill_read_buffer(file->f_path.dentry, buffer); if (retval) goto out; } @@ -154,9 +154,8 @@ out: * Allocate @buffer->page if it hasn't been already, then * copy the user-supplied buffer into it. 
*/ - -static int -fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t count) +static int fill_write_buffer(struct sysfs_buffer *buffer, + const char __user *buf, size_t count) { int error; @@ -167,7 +166,7 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t if (count >= PAGE_SIZE) count = PAGE_SIZE - 1; - error = copy_from_user(buffer->page,buf,count); + error = copy_from_user(buffer->page, buf, count); buffer->needs_read_fill = 1; /* if buf is assumed to contain a string, terminate it by \0, so e.g. sscanf() can scan the string easily */ @@ -183,16 +182,15 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t * @count: number of bytes * * Get the correct pointers for the kobject and the attribute we're - * dealing with, then call the store() method for the attribute, + * dealing with, then call the store() method for the attribute, * passing the buffer that we acquired in fill_write_buffer(). */ - -static int -flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count) +static int flush_write_buffer(struct dentry *dentry, + struct sysfs_buffer *buffer, size_t count) { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - const struct sysfs_ops * ops = buffer->ops; + const struct sysfs_ops *ops = buffer->ops; int rc; /* need attr_sd for attr and ops, its parent for kobj */ @@ -219,15 +217,14 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t * then push it to the kobject in flush_write_buffer(). * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come - * on the first write. + * on the first write. * Hint: if you're writing a value, first read the file, modify only the - * the value you're changing, then write entire buffer back. + * value you're changing, then write the entire buffer back. */ - -static ssize_t -sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +static ssize_t sysfs_write_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { - struct sysfs_buffer * buffer = file->private_data; + struct sysfs_buffer *buffer = file->private_data; ssize_t len; mutex_lock(&buffer->mutex); @@ -339,13 +336,14 @@ static int sysfs_open_file(struct inode *inode, struct file *file) if (kobj->ktype && kobj->ktype->sysfs_ops) ops = kobj->ktype->sysfs_ops; else { - WARN(1, KERN_ERR "missing sysfs attribute operations for " - "kobject: %s\n", kobject_name(kobj)); + WARN(1, KERN_ERR + "missing sysfs attribute operations for kobject: %s\n", + kobject_name(kobj)); goto err_out; } /* File needs write support. - * The inode's perms must say it's ok, + * The inode's perms must say it's ok, * and we must have a store method. 
*/ if (file->f_mode & FMODE_WRITE) { @@ -420,7 +418,7 @@ static int sysfs_release(struct inode *inode, struct file *filp) */ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) { - struct sysfs_buffer * buffer = filp->private_data; + struct sysfs_buffer *buffer = filp->private_data; struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; struct sysfs_open_dirent *od = attr_sd->s_attr.open; @@ -518,8 +516,9 @@ static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, ns = ops->namespace(kobj, attr); out: if (err) { - WARN(1, KERN_ERR "missing sysfs namespace attribute operation for " - "kobject: %s\n", kobject_name(kobj)); + WARN(1, KERN_ERR + "missing sysfs namespace attribute operation for kobject: %s\n", + kobject_name(kobj)); } *pns = ns; return err; @@ -566,17 +565,17 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, /** * sysfs_create_file - create an attribute file for an object. - * @kobj: object we're creating for. + * @kobj: object we're creating for. * @attr: attribute descriptor. */ - -int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) +int sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { BUG_ON(!kobj || !kobj->sd || !attr); return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); } +EXPORT_SYMBOL_GPL(sysfs_create_file); int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) { @@ -590,6 +589,7 @@ int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) sysfs_remove_file(kobj, ptr[i]); return err; } +EXPORT_SYMBOL_GPL(sysfs_create_files); /** * sysfs_add_file_to_group - add an attribute file to a pre-existing group. @@ -654,7 +654,6 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, } EXPORT_SYMBOL_GPL(sysfs_chmod_file); - /** * sysfs_remove_file - remove an object attribute. * @kobj: object we're acting for. @@ -662,8 +661,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); * * Hash the attribute name and kill the victim. */ - -void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) +void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { const void *ns; @@ -672,13 +670,15 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) sysfs_hash_and_remove(kobj->sd, ns, attr->name); } +EXPORT_SYMBOL_GPL(sysfs_remove_file); -void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) +void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) { int i; for (i = 0; ptr[i]; i++) sysfs_remove_file(kobj, ptr[i]); } +EXPORT_SYMBOL_GPL(sysfs_remove_files); /** * sysfs_remove_file_from_group - remove an attribute file from a group. @@ -793,9 +793,3 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), return 0; } EXPORT_SYMBOL_GPL(sysfs_schedule_callback); - - -EXPORT_SYMBOL_GPL(sysfs_create_file); -EXPORT_SYMBOL_GPL(sysfs_remove_file); -EXPORT_SYMBOL_GPL(sysfs_remove_files); -EXPORT_SYMBOL_GPL(sysfs_create_files); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 09a1a25cd145..5f92cd2f61c1 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -3,8 +3,10 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab + * Copyright (c) 2013 Greg Kroah-Hartman + * Copyright (c) 2013 The Linux Foundation * - * This file is released undert the GPL v2. + * This file is released under the GPL v2. 
* */ @@ -19,8 +21,8 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, const struct attribute_group *grp) { - struct attribute *const* attr; - struct bin_attribute *const* bin_attr; + struct attribute *const *attr; + struct bin_attribute *const *bin_attr; if (grp->attrs) for (attr = grp->attrs; *attr; attr++) @@ -33,8 +35,8 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, const struct attribute_group *grp, int update) { - struct attribute *const* attr; - struct bin_attribute *const* bin_attr; + struct attribute *const *attr; + struct bin_attribute *const *bin_attr; int error = 0, i; if (grp->attrs) { @@ -129,6 +131,41 @@ int sysfs_create_group(struct kobject *kobj, { return internal_create_group(kobj, 0, grp); } +EXPORT_SYMBOL_GPL(sysfs_create_group); + +/** + * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups + * @kobj: The kobject to create the group on + * @groups: The attribute groups to create, NULL terminated + * + * This function creates a bunch of attribute groups. If an error occurs when + * creating a group, all previously created groups will be removed, unwinding + * everything back to the original state when this function was called. + * It will explicitly warn and error if any of the attribute files being + * created already exist. + * + * Returns 0 on success or error code from sysfs_create_group on error. + */ +int sysfs_create_groups(struct kobject *kobj, + const struct attribute_group **groups) +{ + int error = 0; + int i; + + if (!groups) + return 0; + + for (i = 0; groups[i]; i++) { + error = sysfs_create_group(kobj, groups[i]); + if (error) { + while (--i >= 0) + sysfs_remove_group(kobj, groups[i]); + break; + } + } + return error; +} +EXPORT_SYMBOL_GPL(sysfs_create_groups); /** * sysfs_update_group - given a directory kobject, update an attribute group @@ -152,11 +189,18 @@ int sysfs_update_group(struct kobject *kobj, { return internal_create_group(kobj, 1, grp); } +EXPORT_SYMBOL_GPL(sysfs_update_group); - - -void sysfs_remove_group(struct kobject * kobj, - const struct attribute_group * grp) +/** + * sysfs_remove_group: remove a group from a kobject + * @kobj: kobject to remove the group from + * @grp: group to remove + * + * This function removes a group of attributes from a kobject. The attributes + * previously have to have been created for this group, otherwise it will fail. + */ +void sysfs_remove_group(struct kobject *kobj, + const struct attribute_group *grp) { struct sysfs_dirent *dir_sd = kobj->sd; struct sysfs_dirent *sd; @@ -164,8 +208,9 @@ void sysfs_remove_group(struct kobject * kobj, if (grp->name) { sd = sysfs_get_dirent(dir_sd, NULL, grp->name); if (!sd) { - WARN(!sd, KERN_WARNING "sysfs group %p not found for " - "kobject '%s'\n", grp, kobject_name(kobj)); + WARN(!sd, KERN_WARNING + "sysfs group %p not found for kobject '%s'\n", + grp, kobject_name(kobj)); return; } } else @@ -177,6 +222,27 @@ void sysfs_remove_group(struct kobject * kobj, sysfs_put(sd); } +EXPORT_SYMBOL_GPL(sysfs_remove_group); + +/** + * sysfs_remove_groups - remove a list of groups + * + * @kobj: The kobject for the groups to be removed from + * @groups: NULL terminated list of groups to be removed + * + * If groups is not NULL, remove the specified groups from the kobject. 
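The sysfs_create_groups() helper added above unwinds on partial failure, so callers get all-or-nothing registration. A minimal usage sketch (not part of the patch; the kobject and group names are hypothetical):

	/* Hypothetical driver code: register two attribute groups and rely
	 * on sysfs_create_groups() to unwind the first if the second fails. */
	static const struct attribute_group *example_groups[] = {
		&example_stats_group,	/* assumed defined elsewhere */
		&example_ctl_group,
		NULL,			/* list must be NULL terminated */
	};

	static int example_register(struct kobject *kobj)
	{
		/* 0 on success, or the sysfs_create_group() error after unwind */
		return sysfs_create_groups(kobj, example_groups);
	}

	static void example_unregister(struct kobject *kobj)
	{
		sysfs_remove_groups(kobj, example_groups);	/* NULL groups tolerated */
	}
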
+ */ +void sysfs_remove_groups(struct kobject *kobj, + const struct attribute_group **groups) +{ + int i; + + if (!groups) + return; + for (i = 0; groups[i]; i++) + sysfs_remove_group(kobj, groups[i]); +} +EXPORT_SYMBOL_GPL(sysfs_remove_groups); /** * sysfs_merge_group - merge files into a pre-existing attribute group. @@ -273,7 +339,3 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); - -EXPORT_SYMBOL_GPL(sysfs_create_group); -EXPORT_SYMBOL_GPL(sysfs_update_group); -EXPORT_SYMBOL_GPL(sysfs_remove_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 3e2837a633ed..963f910c8034 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -10,7 +10,7 @@ * Please see Documentation/filesystems/sysfs.txt for more information. */ -#undef DEBUG +#undef DEBUG #include <linux/pagemap.h> #include <linux/namei.h> @@ -36,7 +36,7 @@ static struct backing_dev_info sysfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static const struct inode_operations sysfs_inode_operations ={ +static const struct inode_operations sysfs_inode_operations = { .permission = sysfs_permission, .setattr = sysfs_setattr, .getattr = sysfs_getattr, @@ -67,7 +67,7 @@ static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) return attrs; } -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr) +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr) { struct sysfs_inode_attrs *sd_attrs; struct iattr *iattrs; @@ -128,7 +128,8 @@ out: return error; } -static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len) +static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, + u32 *secdata_len) { struct sysfs_inode_attrs *iattrs; void *old_secdata; @@ -186,13 +187,13 @@ out: return error; } -static inline void set_default_inode_attr(struct inode * inode, umode_t mode) +static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; } -static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) { inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; @@ -220,7 +221,8 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) set_nlink(inode, sd->s_dir.subdirs + 2); } -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { struct sysfs_dirent *sd = dentry->d_fsdata; struct inode *inode = dentry->d_inode; @@ -285,7 +287,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) * RETURNS: * Pointer to allocated inode on success, NULL on failure. 
*/ -struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) +struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) { struct inode *inode; @@ -312,7 +314,8 @@ void sysfs_evict_inode(struct inode *inode) sysfs_put(sd); } -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name) +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, + const char *name) { struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index afd83273e6ce..834ec2cdb7a3 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -64,7 +64,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) /* instantiate and link root dentry */ root = d_make_root(inode); if (!root) { - pr_debug("%s: could not get root dentry!\n",__func__); + pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } root->d_fsdata = &sysfs_root; @@ -112,8 +112,15 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; - if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) - return ERR_PTR(-EPERM); + if (!(flags & MS_KERNMOUNT)) { + if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) + return ERR_PTR(-EPERM); + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { + if (!kobj_ns_current_may_mount(type)) + return ERR_PTR(-EPERM); + } + } info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 8c940df97a52..2dd4507d9edd 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -125,6 +125,7 @@ int sysfs_create_link(struct kobject *kobj, struct kobject *target, { return sysfs_do_create_link(kobj, target, name, 1); } +EXPORT_SYMBOL_GPL(sysfs_create_link); /** * sysfs_create_link_nowarn - create symlink between two objects. @@ -166,8 +167,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, * @kobj: object we're acting for. * @name: name of the symlink to remove. */ - -void sysfs_remove_link(struct kobject * kobj, const char * name) +void sysfs_remove_link(struct kobject *kobj, const char *name) { struct sysfs_dirent *parent_sd = NULL; @@ -178,6 +178,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name) sysfs_hash_and_remove(parent_sd, NULL, name); } +EXPORT_SYMBOL_GPL(sysfs_remove_link); /** * sysfs_rename_link - rename symlink in object's directory. 
@@ -223,6 +224,7 @@ out: sysfs_put(sd); return result; } +EXPORT_SYMBOL_GPL(sysfs_rename_link); static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, struct sysfs_dirent *target_sd, char *path) @@ -276,7 +278,7 @@ static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, return 0; } -static int sysfs_getlink(struct dentry *dentry, char * path) +static int sysfs_getlink(struct dentry *dentry, char *path) { struct sysfs_dirent *sd = dentry->d_fsdata; struct sysfs_dirent *parent_sd = sd->s_parent; @@ -295,7 +297,7 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); if (page) { - error = sysfs_getlink(dentry, (char *) page); + error = sysfs_getlink(dentry, (char *) page); if (error < 0) free_page((unsigned long)page); } @@ -303,7 +305,8 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) { char *page = nd_get_link(nd); if (!IS_ERR(page)) @@ -319,8 +322,3 @@ const struct inode_operations sysfs_symlink_inode_operations = { .getattr = sysfs_getattr, .permission = sysfs_permission, }; - - -EXPORT_SYMBOL_GPL(sysfs_create_link); -EXPORT_SYMBOL_GPL(sysfs_remove_link); -EXPORT_SYMBOL_GPL(sysfs_rename_link); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d1e4043eb0c3..b6deca3e301d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -78,7 +78,7 @@ struct sysfs_dirent { }; unsigned short s_flags; - umode_t s_mode; + umode_t s_mode; unsigned int s_ino; struct sysfs_inode_attrs *s_iattr; }; @@ -123,9 +123,9 @@ do { \ key = &attr->skey; \ \ lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ -} while(0) +} while (0) #else -#define sysfs_dirent_init_lockdep(sd) do {} while(0) +#define sysfs_dirent_init_lockdep(sd) do {} while (0) #endif /* @@ -186,8 +186,8 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); void sysfs_remove_subdir(struct sysfs_dirent *sd); -int sysfs_rename(struct sysfs_dirent *sd, - struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name); +int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, + const void *ns, const char *new_name); static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { @@ -214,10 +214,12 @@ void sysfs_evict_inode(struct inode *inode); int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); int sysfs_permission(struct inode *inode, int mask); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name); + size_t size, int flags); +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, + const char *name); int sysfs_inode_init(void); /* diff --git a/fs/udf/super.c b/fs/udf/super.c index 9ac4057a86c9..839a2bad7f45 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -630,6 +630,12 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) struct udf_sb_info *sbi = UDF_SB(sb); int error = 0; + if (sbi->s_lvid_bh) { + int 
write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); + if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) + return -EACCES; + } + uopt.flags = sbi->s_flags; uopt.uid = sbi->s_uid; uopt.gid = sbi->s_gid; @@ -649,12 +655,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sbi->s_dmode = uopt.dmode; write_unlock(&sbi->s_cred_lock); - if (sbi->s_lvid_bh) { - int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); - if (write_rev > UDF_MAX_WRITE_VERSION) - *flags |= MS_RDONLY; - } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) goto out_unlock; @@ -843,27 +843,38 @@ static int udf_find_fileset(struct super_block *sb, return 1; } +/* + * Load primary Volume Descriptor Sequence + * + * Return <0 on error, 0 on success. -EAGAIN is special meaning next sequence + * should be tried. + */ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) { struct primaryVolDesc *pvoldesc; struct ustr *instr, *outstr; struct buffer_head *bh; uint16_t ident; - int ret = 1; + int ret = -ENOMEM; instr = kmalloc(sizeof(struct ustr), GFP_NOFS); if (!instr) - return 1; + return -ENOMEM; outstr = kmalloc(sizeof(struct ustr), GFP_NOFS); if (!outstr) goto out1; bh = udf_read_tagged(sb, block, block, &ident); - if (!bh) + if (!bh) { + ret = -EAGAIN; goto out2; + } - BUG_ON(ident != TAG_IDENT_PVD); + if (ident != TAG_IDENT_PVD) { + ret = -EIO; + goto out_bh; + } pvoldesc = (struct primaryVolDesc *)bh->b_data; @@ -889,8 +900,9 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) if (udf_CS0toUTF8(outstr, instr)) udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); - brelse(bh); ret = 0; +out_bh: + brelse(bh); out2: kfree(outstr); out1: @@ -947,7 +959,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) if (mdata->s_mirror_fe == NULL) { udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); - goto error_exit; + return -EIO; } } @@ -964,23 +976,18 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_bitmap_fe = udf_iget(sb, &addr); - if (mdata->s_bitmap_fe == NULL) { if (sb->s_flags & MS_RDONLY) udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); - goto error_exit; + return -EIO; } } } udf_debug("udf_load_metadata_files Ok\n"); - return 0; - -error_exit: - return 1; } static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, @@ -1069,7 +1076,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (!map->s_uspace.s_table) { udf_debug("cannot load unallocSpaceTable (part %d)\n", p_index); - return 1; + return -EIO; } map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %ld\n", @@ -1079,7 +1086,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (phd->unallocSpaceBitmap.extLength) { struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); if (!bitmap) - return 1; + return -ENOMEM; map->s_uspace.s_bitmap = bitmap; bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); @@ -1102,7 +1109,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (!map->s_fspace.s_table) { udf_debug("cannot load freedSpaceTable (part %d)\n", p_index); - return 1; + return -EIO; } map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; @@ -1113,7 +1120,7 @@ static int 
udf_fill_partdesc_info(struct super_block *sb, if (phd->freedSpaceBitmap.extLength) { struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); if (!bitmap) - return 1; + return -ENOMEM; map->s_fspace.s_bitmap = bitmap; bitmap->s_extPosition = le32_to_cpu( phd->freedSpaceBitmap.extPosition); @@ -1165,7 +1172,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) - return 1; + return -EIO; if (map->s_partition_type == UDF_VIRTUAL_MAP15) { map->s_type_specific.s_virtual.s_start_offset = 0; @@ -1177,7 +1184,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) pos = udf_block_map(sbi->s_vat_inode, 0); bh = sb_bread(sb, pos); if (!bh) - return 1; + return -EIO; vat20 = (struct virtualAllocationTable20 *)bh->b_data; } else { vat20 = (struct virtualAllocationTable20 *) @@ -1195,6 +1202,12 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) return 0; } +/* + * Load partition descriptor block + * + * Returns <0 on error, 0 on success, -EAGAIN is special - try next descriptor + * sequence. + */ static int udf_load_partdesc(struct super_block *sb, sector_t block) { struct buffer_head *bh; @@ -1204,13 +1217,15 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) int i, type1_idx; uint16_t partitionNumber; uint16_t ident; - int ret = 0; + int ret; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) - return 1; - if (ident != TAG_IDENT_PD) + return -EAGAIN; + if (ident != TAG_IDENT_PD) { + ret = 0; goto out_bh; + } p = (struct partitionDesc *)bh->b_data; partitionNumber = le16_to_cpu(p->partitionNumber); @@ -1229,10 +1244,13 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) if (i >= sbi->s_partitions) { udf_debug("Partition (%d) not found in partition map\n", partitionNumber); + ret = 0; goto out_bh; } ret = udf_fill_partdesc_info(sb, p, i); + if (ret < 0) + goto out_bh; /* * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and @@ -1249,32 +1267,37 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) break; } - if (i >= sbi->s_partitions) + if (i >= sbi->s_partitions) { + ret = 0; goto out_bh; + } ret = udf_fill_partdesc_info(sb, p, i); - if (ret) + if (ret < 0) goto out_bh; if (map->s_partition_type == UDF_METADATA_MAP25) { ret = udf_load_metadata_files(sb, i); - if (ret) { + if (ret < 0) { udf_err(sb, "error loading MetaData partition map %d\n", i); goto out_bh; } } else { - ret = udf_load_vat(sb, i, type1_idx); - if (ret) - goto out_bh; /* - * Mark filesystem read-only if we have a partition with - * virtual map since we don't handle writing to it (we - * overwrite blocks instead of relocating them). + * If we have a partition with virtual map, we don't handle + * writing to it (we overwrite blocks instead of relocating + * them). 
*/ - sb->s_flags |= MS_RDONLY; - pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n"); + if (!(sb->s_flags & MS_RDONLY)) { + ret = -EACCES; + goto out_bh; + } + ret = udf_load_vat(sb, i, type1_idx); + if (ret < 0) + goto out_bh; } + ret = 0; out_bh: /* In case loading failed, we handle cleanup in udf_fill_super */ brelse(bh); @@ -1340,11 +1363,11 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, uint16_t ident; struct buffer_head *bh; unsigned int table_len; - int ret = 0; + int ret; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) - return 1; + return -EAGAIN; BUG_ON(ident != TAG_IDENT_LVD); lvd = (struct logicalVolDesc *)bh->b_data; table_len = le32_to_cpu(lvd->mapTableLength); @@ -1352,7 +1375,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, udf_err(sb, "error loading logical volume descriptor: " "Partition table too long (%u > %lu)\n", table_len, sb->s_blocksize - sizeof(*lvd)); - ret = 1; + ret = -EIO; goto out_bh; } @@ -1396,11 +1419,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, } else if (!strncmp(upm2->partIdent.ident, UDF_ID_SPARABLE, strlen(UDF_ID_SPARABLE))) { - if (udf_load_sparable_map(sb, map, - (struct sparablePartitionMap *)gpm) < 0) { - ret = 1; + ret = udf_load_sparable_map(sb, map, + (struct sparablePartitionMap *)gpm); + if (ret < 0) goto out_bh; - } } else if (!strncmp(upm2->partIdent.ident, UDF_ID_METADATA, strlen(UDF_ID_METADATA))) { @@ -1465,7 +1487,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, } if (lvd->integritySeqExt.extLength) udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); - + ret = 0; out_bh: brelse(bh); return ret; @@ -1503,22 +1525,18 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ } /* - * udf_process_sequence - * - * PURPOSE - * Process a main/reserve volume descriptor sequence. - * - * PRE-CONDITIONS - * sb Pointer to _locked_ superblock. - * block First block of first extent of the sequence. - * lastblock Lastblock of first extent of the sequence. + * Process a main/reserve volume descriptor sequence. + * @block First block of first extent of the sequence. + * @lastblock Lastblock of first extent of the sequence. + * @fileset There we store extent containing root fileset * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. + * Returns <0 on error, 0 on success. 
-EAGAIN is special - try next descriptor + * sequence */ -static noinline int udf_process_sequence(struct super_block *sb, long block, - long lastblock, struct kernel_lb_addr *fileset) +static noinline int udf_process_sequence( + struct super_block *sb, + sector_t block, sector_t lastblock, + struct kernel_lb_addr *fileset) { struct buffer_head *bh = NULL; struct udf_vds_record vds[VDS_POS_LENGTH]; @@ -1529,6 +1547,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, uint32_t vdsn; uint16_t ident; long next_s = 0, next_e = 0; + int ret; memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); @@ -1543,7 +1562,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, udf_err(sb, "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", (unsigned long long)block); - return 1; + return -EAGAIN; } /* Process each descriptor (ISO 13346 3/8.3-8.4) */ @@ -1616,14 +1635,19 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, */ if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { udf_err(sb, "Primary Volume Descriptor not found!\n"); - return 1; + return -EAGAIN; + } + ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block); + if (ret < 0) + return ret; + + if (vds[VDS_POS_LOGICAL_VOL_DESC].block) { + ret = udf_load_logicalvol(sb, + vds[VDS_POS_LOGICAL_VOL_DESC].block, + fileset); + if (ret < 0) + return ret; } - if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) - return 1; - - if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb, - vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset)) - return 1; if (vds[VDS_POS_PARTITION_DESC].block) { /* @@ -1632,19 +1656,27 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, */ for (block = vds[VDS_POS_PARTITION_DESC].block; block < vds[VDS_POS_TERMINATING_DESC].block; - block++) - if (udf_load_partdesc(sb, block)) - return 1; + block++) { + ret = udf_load_partdesc(sb, block); + if (ret < 0) + return ret; + } } return 0; } +/* + * Load Volume Descriptor Sequence described by anchor in bh + * + * Returns <0 on error, 0 on success + */ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, struct kernel_lb_addr *fileset) { struct anchorVolDescPtr *anchor; - long main_s, main_e, reserve_s, reserve_e; + sector_t main_s, main_e, reserve_s, reserve_e; + int ret; anchor = (struct anchorVolDescPtr *)bh->b_data; @@ -1662,18 +1694,26 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, /* Process the main & reserve sequences */ /* responsible for finding the PartitionDesc(s) */ - if (!udf_process_sequence(sb, main_s, main_e, fileset)) - return 1; - udf_sb_free_partitions(sb); - if (!udf_process_sequence(sb, reserve_s, reserve_e, fileset)) - return 1; + ret = udf_process_sequence(sb, main_s, main_e, fileset); + if (ret != -EAGAIN) + return ret; udf_sb_free_partitions(sb); - return 0; + ret = udf_process_sequence(sb, reserve_s, reserve_e, fileset); + if (ret < 0) { + udf_sb_free_partitions(sb); + /* No sequence was OK, return -EIO */ + if (ret == -EAGAIN) + ret = -EIO; + } + return ret; } /* * Check whether there is an anchor block in the given block and * load Volume Descriptor Sequence if so. 
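The UDF changes here replace 0/1 status returns with negative errnos, reserving -EAGAIN for "this candidate failed, try the next one"; udf_load_sequence() above shows the resulting fallback from the main to the reserve sequence. A condensed sketch of the pattern (illustrative only; probe() stands in for udf_check_anchor_block() and friends):

	/* Each probe returns <0 on hard error, 0 on success, and -EAGAIN
	 * for "not here". Only when every candidate says -EAGAIN is the
	 * result turned into a definite error. */
	static int probe_candidates(int (*probe)(sector_t), sector_t *cand, int n)
	{
		int i, ret;

		for (i = 0; i < n; i++) {
			ret = probe(cand[i]);
			if (ret != -EAGAIN)	/* success or hard error: stop */
				return ret;
		}
		return -EIO;			/* no candidate matched */
	}
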
+ * + * Returns <0 on error, 0 on success, -EAGAIN is special - try next anchor + * block */ static int udf_check_anchor_block(struct super_block *sb, sector_t block, struct kernel_lb_addr *fileset) @@ -1685,33 +1725,40 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && udf_fixed_to_variable(block) >= sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) - return 0; + return -EAGAIN; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) - return 0; + return -EAGAIN; if (ident != TAG_IDENT_AVDP) { brelse(bh); - return 0; + return -EAGAIN; } ret = udf_load_sequence(sb, bh, fileset); brelse(bh); return ret; } -/* Search for an anchor volume descriptor pointer */ -static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, - struct kernel_lb_addr *fileset) +/* + * Search for an anchor volume descriptor pointer. + * + * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set + * of anchors. + */ +static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, + struct kernel_lb_addr *fileset) { sector_t last[6]; int i; struct udf_sb_info *sbi = UDF_SB(sb); int last_count = 0; + int ret; /* First try user provided anchor */ if (sbi->s_anchor) { - if (udf_check_anchor_block(sb, sbi->s_anchor, fileset)) - return lastblock; + ret = udf_check_anchor_block(sb, sbi->s_anchor, fileset); + if (ret != -EAGAIN) + return ret; } /* * according to spec, anchor is in either: @@ -1720,39 +1767,46 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, * lastblock * however, if the disc isn't closed, it could be 512. */ - if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset)) - return lastblock; + ret = udf_check_anchor_block(sb, sbi->s_session + 256, fileset); + if (ret != -EAGAIN) + return ret; /* * The trouble is which block is the last one. Drives often misreport * this so we try various possibilities. */ - last[last_count++] = lastblock; - if (lastblock >= 1) - last[last_count++] = lastblock - 1; - last[last_count++] = lastblock + 1; - if (lastblock >= 2) - last[last_count++] = lastblock - 2; - if (lastblock >= 150) - last[last_count++] = lastblock - 150; - if (lastblock >= 152) - last[last_count++] = lastblock - 152; + last[last_count++] = *lastblock; + if (*lastblock >= 1) + last[last_count++] = *lastblock - 1; + last[last_count++] = *lastblock + 1; + if (*lastblock >= 2) + last[last_count++] = *lastblock - 2; + if (*lastblock >= 150) + last[last_count++] = *lastblock - 150; + if (*lastblock >= 152) + last[last_count++] = *lastblock - 152; for (i = 0; i < last_count; i++) { if (last[i] >= sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) continue; - if (udf_check_anchor_block(sb, last[i], fileset)) - return last[i]; + ret = udf_check_anchor_block(sb, last[i], fileset); + if (ret != -EAGAIN) { + if (!ret) + *lastblock = last[i]; + return ret; + } if (last[i] < 256) continue; - if (udf_check_anchor_block(sb, last[i] - 256, fileset)) - return last[i]; + ret = udf_check_anchor_block(sb, last[i] - 256, fileset); + if (ret != -EAGAIN) { + if (!ret) + *lastblock = last[i]; + return ret; + } } /* Finally try block 512 in case media is open */ - if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset)) - return last[0]; - return 0; + return udf_check_anchor_block(sb, sbi->s_session + 512, fileset); } /* @@ -1760,54 +1814,59 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, * area specified by it. 
The function expects sbi->s_lastblock to be the last * block on the media. * - * Return 1 if ok, 0 if not found. - * + * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor + * was not found. */ static int udf_find_anchor(struct super_block *sb, struct kernel_lb_addr *fileset) { - sector_t lastblock; struct udf_sb_info *sbi = UDF_SB(sb); + sector_t lastblock = sbi->s_last_block; + int ret; - lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); - if (lastblock) + ret = udf_scan_anchors(sb, &lastblock, fileset); + if (ret != -EAGAIN) goto out; /* No anchor found? Try VARCONV conversion of block numbers */ UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + lastblock = udf_variable_to_fixed(sbi->s_last_block); /* Firstly, we try to not convert number of the last block */ - lastblock = udf_scan_anchors(sb, - udf_variable_to_fixed(sbi->s_last_block), - fileset); - if (lastblock) + ret = udf_scan_anchors(sb, &lastblock, fileset); + if (ret != -EAGAIN) goto out; + lastblock = sbi->s_last_block; /* Secondly, we try with converted number of the last block */ - lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); - if (!lastblock) { + ret = udf_scan_anchors(sb, &lastblock, fileset); + if (ret < 0) { /* VARCONV didn't help. Clear it. */ UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); - return 0; } out: - sbi->s_last_block = lastblock; - return 1; + if (ret == 0) + sbi->s_last_block = lastblock; + return ret; } /* * Check Volume Structure Descriptor, find Anchor block and load Volume - * Descriptor Sequence + * Descriptor Sequence. + * + * Returns < 0 on error, 0 on success. -EAGAIN is special meaning anchor + * block was not found. */ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, int silent, struct kernel_lb_addr *fileset) { struct udf_sb_info *sbi = UDF_SB(sb); loff_t nsr_off; + int ret; if (!sb_set_blocksize(sb, uopt->blocksize)) { if (!silent) udf_warn(sb, "Bad block size\n"); - return 0; + return -EINVAL; } sbi->s_last_block = uopt->lastblock; if (!uopt->novrs) { @@ -1828,12 +1887,13 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, /* Look for anchor block and load Volume Descriptor Sequence */ sbi->s_anchor = uopt->anchor; - if (!udf_find_anchor(sb, fileset)) { - if (!silent) + ret = udf_find_anchor(sb, fileset); + if (ret < 0) { + if (!silent && ret == -EAGAIN) udf_warn(sb, "No anchor found\n"); - return 0; + return ret; } - return 1; + return 0; } static void udf_open_lvid(struct super_block *sb) @@ -1939,7 +1999,7 @@ u64 lvid_get_unique_id(struct super_block *sb) static int udf_fill_super(struct super_block *sb, void *options, int silent) { - int ret; + int ret = -EINVAL; struct inode *inode = NULL; struct udf_options uopt; struct kernel_lb_addr rootdir, fileset; @@ -2011,7 +2071,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } else { uopt.blocksize = bdev_logical_block_size(sb->s_bdev); ret = udf_load_vrs(sb, &uopt, silent, &fileset); - if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { + if (ret == -EAGAIN && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) pr_notice("Rescanning with blocksize %d\n", UDF_DEFAULT_BLOCKSIZE); @@ -2021,8 +2081,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = udf_load_vrs(sb, &uopt, silent, &fileset); } } - if (!ret) { - udf_warn(sb, "No partition found (1)\n"); + if (ret < 0) { + if (ret == -EAGAIN) { + udf_warn(sb, "No partition found (1)\n"); + ret = -EINVAL; + } goto error_out; } @@ -2040,9 +2103,13 @@ 
static int udf_fill_super(struct super_block *sb, void *options, int silent) udf_err(sb, "minUDFReadRev=%x (max is %x)\n", le16_to_cpu(lvidiu->minUDFReadRev), UDF_MAX_READ_VERSION); + ret = -EINVAL; + goto error_out; + } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION && + !(sb->s_flags & MS_RDONLY)) { + ret = -EACCES; goto error_out; - } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) - sb->s_flags |= MS_RDONLY; + } sbi->s_udfrev = minUDFWriteRev; @@ -2054,17 +2121,20 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (!sbi->s_partitions) { udf_warn(sb, "No partition found (2)\n"); + ret = -EINVAL; goto error_out; } if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & - UDF_PART_FLAG_READ_ONLY) { - pr_notice("Partition marked readonly; forcing readonly mount\n"); - sb->s_flags |= MS_RDONLY; + UDF_PART_FLAG_READ_ONLY && + !(sb->s_flags & MS_RDONLY)) { + ret = -EACCES; + goto error_out; } if (udf_find_fileset(sb, &fileset, &rootdir)) { udf_warn(sb, "No fileset found\n"); + ret = -EINVAL; goto error_out; } @@ -2086,6 +2156,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (!inode) { udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); + ret = -EIO; goto error_out; } @@ -2093,6 +2164,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sb->s_root = d_make_root(inode); if (!sb->s_root) { udf_err(sb, "Couldn't allocate root dentry\n"); + ret = -ENOMEM; goto error_out; } sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -2113,7 +2185,7 @@ error_out: kfree(sbi); sb->s_fs_info = NULL; - return -EINVAL; + return ret; } void _udf_err(struct super_block *sb, const char *function, diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 4a4508023a3c..0719e4db93f2 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -27,9 +27,12 @@ xfs-y += xfs_trace.o # highlevel code xfs-y += xfs_aops.o \ + xfs_attr_inactive.o \ + xfs_attr_list.o \ xfs_bit.o \ + xfs_bmap_util.o \ xfs_buf.o \ - xfs_dfrag.o \ + xfs_dir2_readdir.o \ xfs_discard.o \ xfs_error.o \ xfs_export.o \ @@ -44,11 +47,11 @@ xfs-y += xfs_aops.o \ xfs_iops.o \ xfs_itable.o \ xfs_message.o \ + xfs_mount.o \ xfs_mru_cache.o \ - xfs_rename.o \ xfs_super.o \ - xfs_utils.o \ - xfs_vnodeops.o \ + xfs_symlink.o \ + xfs_trans.o \ xfs_xattr.o \ kmem.o \ uuid.o @@ -73,10 +76,13 @@ xfs-y += xfs_alloc.o \ xfs_ialloc_btree.o \ xfs_icreate_item.o \ xfs_inode.o \ + xfs_inode_fork.o \ + xfs_inode_buf.o \ xfs_log_recover.o \ - xfs_mount.o \ - xfs_symlink.o \ - xfs_trans.o + xfs_log_rlimit.o \ + xfs_sb.o \ + xfs_symlink_remote.o \ + xfs_trans_resv.o # low-level transaction/log code xfs-y += xfs_log.o \ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 306d883d89bc..69518960b2ba 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -16,11 +16,13 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_acl.h" #include "xfs_attr.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_vnodeops.h" +#include "xfs_ag.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_trace.h" @@ -68,14 +70,15 @@ xfs_acl_from_disk( switch (acl_e->e_tag) { case ACL_USER: + acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + break; case ACL_GROUP: - acl_e->e_id = be32_to_cpu(ace->ae_id); + acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case 
ACL_OTHER: - acl_e->e_id = ACL_UNDEFINED_ID; break; default: goto fail; @@ -101,7 +104,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) acl_e = &acl->a_entries[i]; ace->ae_tag = cpu_to_be32(acl_e->e_tag); - ace->ae_id = cpu_to_be32(acl_e->e_id); + switch (acl_e->e_tag) { + case ACL_USER: + ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + break; + case ACL_GROUP: + ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); + break; + default: + ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); + break; + } + ace->ae_perm = cpu_to_be16(acl_e->e_perm); } } @@ -360,7 +374,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, return -EINVAL; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return value ? -EACCES : 0; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) + if (!inode_owner_or_capable(inode)) return -EPERM; if (!value) diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 317aa86d96ea..1cb740afd674 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -227,59 +227,6 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Per-ag incore structure, copies of information in agf and agi, - * to improve the performance of allocation group selection. - */ -#define XFS_PAGB_NUM_SLOTS 128 - -typedef struct xfs_perag { - struct xfs_mount *pag_mount; /* owner filesystem */ - xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* perag reference count */ - char pagf_init; /* this agf's entry is initialized */ - char pagi_init; /* this agi's entry is initialized */ - char pagf_metadata; /* the agf is preferred to be metadata */ - char pagi_inodeok; /* The agi is ok for inodes */ - __uint8_t pagf_levels[XFS_BTNUM_AGF]; - /* # of levels in bno & cnt btree */ - __uint32_t pagf_flcount; /* count of blocks in freelist */ - xfs_extlen_t pagf_freeblks; /* total free blocks */ - xfs_extlen_t pagf_longest; /* longest free space */ - __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ - xfs_agino_t pagi_freecount; /* number of free inodes */ - xfs_agino_t pagi_count; /* number of allocated inodes */ - - /* - * Inode allocation search lookup optimisation. 
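The ACL conversion above stops storing raw numeric ids in struct posix_acl_entry and instead maps the on-disk values through the kernel's kuid_t/kgid_t types. A sketch of the round-trip, assuming (as the xfs_uid_to_kuid()/xfs_kuid_to_uid() wrappers do) that on-disk ids are interpreted in init_user_ns:

	#include <linux/uidgid.h>

	/* Raw on-disk uid -> kernel-internal kuid_t, and back again. */
	static kuid_t disk_to_kernel_uid(u32 disk_uid)
	{
		return make_kuid(&init_user_ns, disk_uid);
	}

	static u32 kernel_to_disk_uid(kuid_t kuid)
	{
		return from_kuid(&init_user_ns, kuid);
	}
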
- * If the pagino matches, the search for new inodes - * doesn't need to search the near ones again straight away - */ - xfs_agino_t pagl_pagino; - xfs_agino_t pagl_leftrec; - xfs_agino_t pagl_rightrec; -#ifdef __KERNEL__ - spinlock_t pagb_lock; /* lock for pagb_tree */ - struct rb_root pagb_tree; /* ordered tree of busy extents */ - - atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - - spinlock_t pag_ici_lock; /* incore inode cache lock */ - struct radix_tree_root pag_ici_root; /* incore inode cache root */ - int pag_ici_reclaimable; /* reclaimable inodes */ - struct mutex pag_ici_reclaim_lock; /* serialisation point */ - unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ - - /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ - struct rb_root pag_buf_tree; /* ordered tree of active buffers */ - - /* for rcu-safe freeing */ - struct rcu_head rcu_head; -#endif - int pagb_count; /* pagb slots in use */ -} xfs_perag_t; - -/* * tags for inode radix tree */ #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 71596e57283a..5a1393f5e020 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -878,7 +878,7 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t ltnew; /* useful start bno of left side */ xfs_extlen_t rlen; /* length of returned extent */ int forced = 0; -#if defined(DEBUG) && defined(__KERNEL__) +#ifdef DEBUG /* * Randomly don't execute the first algorithm. */ @@ -938,8 +938,8 @@ restart: xfs_extlen_t blen=0; xfs_agblock_t bnew=0; -#if defined(DEBUG) && defined(__KERNEL__) - if (!dofirst) +#ifdef DEBUG + if (dofirst) break; #endif /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 596ec71da00e..977da0ec6604 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -28,9 +28,9 @@ #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_iomap.h" -#include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> @@ -86,14 +86,6 @@ xfs_destroy_ioend( bh->b_end_io(bh, !ioend->io_error); } - if (ioend->io_iocb) { - inode_dio_done(ioend->io_inode); - if (ioend->io_isasync) { - aio_complete(ioend->io_iocb, ioend->io_error ? - ioend->io_error : ioend->io_result, 0); - } - } - mempool_free(ioend, xfs_ioend_pool); } @@ -116,7 +108,7 @@ xfs_setfilesize_trans_alloc( tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -281,7 +273,6 @@ xfs_alloc_ioend( * all the I/O from calling the completion routine too early. 
*/ atomic_set(&ioend->io_remaining, 1); - ioend->io_isasync = 0; ioend->io_isdirect = 0; ioend->io_error = 0; ioend->io_list = NULL; @@ -291,8 +282,6 @@ xfs_alloc_ioend( ioend->io_buffer_tail = NULL; ioend->io_offset = 0; ioend->io_size = 0; - ioend->io_iocb = NULL; - ioend->io_result = 0; ioend->io_append_trans = NULL; INIT_WORK(&ioend->io_work, xfs_end_io); @@ -451,7 +440,7 @@ xfs_start_page_writeback( end_page_writeback(page); } -static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) +static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) { return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); } @@ -525,7 +514,7 @@ xfs_submit_ioend( goto retry; } - if (bio_add_buffer(bio, bh) != bh->b_size) { + if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { xfs_submit_ioend_bio(wbc, ioend, bio); goto retry; } @@ -1292,8 +1281,10 @@ __xfs_get_blocks( if (create || !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); if (create && ISUNWRITTEN(&imap)) { - if (direct) + if (direct) { bh_result->b_private = inode; + set_buffer_defer_completion(bh_result); + } set_buffer_unwritten(bh_result); } } @@ -1390,9 +1381,7 @@ xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, ssize_t size, - void *private, - int ret, - bool is_async) + void *private) { struct xfs_ioend *ioend = iocb->private; @@ -1414,17 +1403,10 @@ xfs_end_io_direct_write( ioend->io_offset = offset; ioend->io_size = size; - ioend->io_iocb = iocb; - ioend->io_result = ret; if (private && size > 0) ioend->io_type = XFS_IO_UNWRITTEN; - if (is_async) { - ioend->io_isasync = 1; - xfs_finish_ioend(ioend); - } else { - xfs_finish_ioend_sync(ioend); - } + xfs_finish_ioend_sync(ioend); } STATIC ssize_t @@ -1516,13 +1498,26 @@ xfs_vm_write_failed( loff_t pos, unsigned len) { - loff_t block_offset = pos & PAGE_MASK; + loff_t block_offset; loff_t block_start; loff_t block_end; loff_t from = pos & (PAGE_CACHE_SIZE - 1); loff_t to = from + len; struct buffer_head *bh, *head; + /* + * The request pos offset might be 32 or 64 bit, this is all fine + * on 64-bit platform. However, for 64-bit pos request on 32-bit + * platform, the high 32-bit will be masked off if we evaluate the + * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is + * 0xfffff000 as an unsigned long, hence the result is incorrect + * which could cause the following ASSERT failed in most cases. + * In order to avoid this, we can evaluate the block_offset of the + * start of the page by using shifts rather than masks the mismatch + * problem. + */ + block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + ASSERT(block_offset + from == pos); head = page_buffers(page); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index c325abb8d61a..f94dd459dff9 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -45,7 +45,6 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ - unsigned int io_isasync : 1; /* needs aio_complete */ unsigned int io_isdirect : 1;/* direct I/O */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ @@ -54,8 +53,6 @@ typedef struct xfs_ioend { xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. 
for size update */ - struct kiocb *io_iocb; - int io_result; } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 20fe3fe9d341..ddcf2267ffa6 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -17,10 +17,11 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -32,13 +33,13 @@ #include "xfs_alloc.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_attr_remote.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" -#include "xfs_vnodeops.h" #include "xfs_trace.h" /* @@ -62,7 +63,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); -STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); /* * Internal routines when attribute list is more than one block. @@ -70,7 +70,6 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC int xfs_attr_node_addname(xfs_da_args_t *args); STATIC int xfs_attr_node_removename(xfs_da_args_t *args); -STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); @@ -90,7 +89,7 @@ xfs_attr_name_to_xname( return 0; } -STATIC int +int xfs_inode_hasattr( struct xfs_inode *ip) { @@ -227,13 +226,14 @@ xfs_attr_set_int( int valuelen, int flags) { - xfs_da_args_t args; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - int error, err2, committed; - xfs_mount_t *mp = dp->i_mount; - int rsvd = (flags & ATTR_ROOT) != 0; - int local; + xfs_da_args_t args; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + int error, err2, committed; + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans_res tres; + int rsvd = (flags & ATTR_ROOT) != 0; + int local; /* * Attach the dquots to the inode. 
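The block_offset comment added to xfs_vm_write_failed() above is easy to verify in userspace: on a 32-bit build PAGE_MASK is a 32-bit unsigned long, so masking a 64-bit position zero-extends the mask and discards the upper half of the offset. A standalone demonstration (assumes 4K pages, i.e. a shift of 12):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		int64_t pos = 0x100003004LL;		/* file offset > 4GiB */
		unsigned long mask = 0xfffff000UL;	/* PAGE_MASK on a 32-bit kernel */

		/* the zero-extended mask clears bits 32..63 of pos */
		printf("masked:  %llx\n", (long long)(pos & mask));	   /* 3000 */
		/* shifting down and back up keeps the full 64-bit offset */
		printf("shifted: %llx\n", (long long)((pos >> 12) << 12)); /* 100003000 */
		return 0;
	}
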
@@ -293,11 +293,11 @@ xfs_attr_set_int( if (rsvd) args.trans->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(args.trans, args.total, - XFS_ATTRSETM_LOG_RES(mp) + - XFS_ATTRSETRT_LOG_RES(mp) * args.total, - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ATTRSET_LOG_COUNT); + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args.total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(args.trans, &tres, args.total, 0); if (error) { xfs_trans_cancel(args.trans, 0); return(error); @@ -517,11 +517,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) if (flags & ATTR_ROOT) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, - XFS_ATTRRM_SPACE_RES(mp), - XFS_ATTRRM_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ATTRRM_LOG_COUNT))) { + error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, + XFS_ATTRRM_SPACE_RES(mp), 0); + if (error) { xfs_trans_cancel(args.trans, 0); return(error); } @@ -611,228 +609,6 @@ xfs_attr_remove( return xfs_attr_remove_int(dp, &xname, flags); } -int -xfs_attr_list_int(xfs_attr_list_context_t *context) -{ - int error; - xfs_inode_t *dp = context->dp; - - XFS_STATS_INC(xs_attr_list); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return EIO; - - xfs_ilock(dp, XFS_ILOCK_SHARED); - - /* - * Decide on what work routines to call based on the inode size. - */ - if (!xfs_inode_hasattr(dp)) { - error = 0; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = xfs_attr_shortform_list(context); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_list(context); - } else { - error = xfs_attr_node_list(context); - } - - xfs_iunlock(dp, XFS_ILOCK_SHARED); - - return error; -} - -#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ - (((struct attrlist_ent *) 0)->a_name - (char *) 0) -#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ - & ~(sizeof(u_int32_t)-1)) - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. - */ -/*ARGSUSED*/ -STATIC int -xfs_attr_put_listent( - xfs_attr_list_context_t *context, - int flags, - unsigned char *name, - int namelen, - int valuelen, - unsigned char *value) -{ - struct attrlist *alist = (struct attrlist *)context->alist; - attrlist_ent_t *aep; - int arraytop; - - ASSERT(!(context->flags & ATTR_KERNOVAL)); - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*alist)); - ASSERT(context->firstu <= context->bufsize); - - /* - * Only list entries in the right namespace. 
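The xfs_trans_reserve() calls rewritten above show the new reservation API: the per-transaction log reservation now travels in a struct xfs_trans_res, usually fetched via M_RES(mp), instead of being passed as separate magnitude/flags/count arguments. Schematically, using the attr-remove reservation from this hunk:

	/* before: five positional reservation arguments */
	error = xfs_trans_reserve(tp, XFS_ATTRRM_SPACE_RES(mp),
				  XFS_ATTRRM_LOG_RES(mp), 0,
				  XFS_TRANS_PERM_LOG_RES,
				  XFS_ATTRRM_LOG_COUNT);

	/* after: log reservation bundled in a struct xfs_trans_res */
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrrm,
				  XFS_ATTRRM_SPACE_RES(mp), 0);
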
- */ - if (((context->flags & ATTR_SECURE) == 0) != - ((flags & XFS_ATTR_SECURE) == 0)) - return 0; - if (((context->flags & ATTR_ROOT) == 0) != - ((flags & XFS_ATTR_ROOT) == 0)) - return 0; - - arraytop = sizeof(*alist) + - context->count * sizeof(alist->al_offset[0]); - context->firstu -= ATTR_ENTSIZE(namelen); - if (context->firstu < arraytop) { - trace_xfs_attr_list_full(context); - alist->al_more = 1; - context->seen_enough = 1; - return 1; - } - - aep = (attrlist_ent_t *)&context->alist[context->firstu]; - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[namelen] = 0; - alist->al_offset[context->count++] = context->firstu; - alist->al_count = context->count; - trace_xfs_attr_list_add(context); - return 0; -} - -/* - * Generate a list of extended attribute names and optionally - * also value lengths. Positive return value follows the XFS - * convention of being an error, zero or negative return code - * is the length of the buffer returned (negated), indicating - * success. - */ -int -xfs_attr_list( - xfs_inode_t *dp, - char *buffer, - int bufsize, - int flags, - attrlist_cursor_kern_t *cursor) -{ - xfs_attr_list_context_t context; - struct attrlist *alist; - int error; - - /* - * Validate the cursor. - */ - if (cursor->pad1 || cursor->pad2) - return(XFS_ERROR(EINVAL)); - if ((cursor->initted == 0) && - (cursor->hashval || cursor->blkno || cursor->offset)) - return XFS_ERROR(EINVAL); - - /* - * Check for a properly aligned buffer. - */ - if (((long)buffer) & (sizeof(int)-1)) - return XFS_ERROR(EFAULT); - if (flags & ATTR_KERNOVAL) - bufsize = 0; - - /* - * Initialize the output buffer. - */ - memset(&context, 0, sizeof(context)); - context.dp = dp; - context.cursor = cursor; - context.resynch = 1; - context.flags = flags; - context.alist = buffer; - context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ - context.firstu = context.bufsize; - context.put_listent = xfs_attr_put_listent; - - alist = (struct attrlist *)context.alist; - alist->al_count = 0; - alist->al_more = 0; - alist->al_offset[0] = context.bufsize; - - error = xfs_attr_list_int(&context); - ASSERT(error >= 0); - return error; -} - -int /* error */ -xfs_attr_inactive(xfs_inode_t *dp) -{ - xfs_trans_t *trans; - xfs_mount_t *mp; - int error; - - mp = dp->i_mount; - ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); - - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return 0; - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); - if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ATTRINVAL_LOG_COUNT))) { - xfs_trans_cancel(trans, 0); - return(error); - } - xfs_ilock(dp, XFS_ILOCK_EXCL); - - /* - * No need to make quota reservations here. We expect to release some - * blocks, not allocate, in the common case. - */ - xfs_trans_ijoin(trans, dp, 0); - - /* - * Decide on what work routines to call based on the inode size. 
- */ - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = 0; - goto out; - } - error = xfs_attr3_root_inactive(&trans, dp); - if (error) - goto out; - - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); - if (error) - goto out; - - error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - return(error); - -out: - xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); -} - - /*======================================================================== * External routines when attribute list is inside the inode @@ -1166,28 +942,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args) return error; } -/* - * Copy out attribute entries for attr_list(), for leaf attribute lists. - */ -STATIC int -xfs_attr_leaf_list(xfs_attr_list_context_t *context) -{ - int error; - struct xfs_buf *bp; - - trace_xfs_attr_leaf_list(context); - - context->cursor->blkno = 0; - error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); - if (error) - return XFS_ERROR(error); - - error = xfs_attr3_leaf_list_int(bp, context); - xfs_trans_brelse(NULL, bp); - return XFS_ERROR(error); -} - - /*======================================================================== * External routines when attribute list size > XFS_LBSIZE(mp). *========================================================================*/ @@ -1260,6 +1014,7 @@ restart: * have been a b-tree. */ xfs_da_state_free(state); + state = NULL; xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_node(args); if (!error) { @@ -1780,143 +1535,3 @@ xfs_attr_node_get(xfs_da_args_t *args) xfs_da_state_free(state); return(retval); } - -STATIC int /* error */ -xfs_attr_node_list(xfs_attr_list_context_t *context) -{ - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - struct xfs_attr3_icleaf_hdr leafhdr; - struct xfs_da3_icnode_hdr nodehdr; - struct xfs_da_node_entry *btree; - int error, i; - struct xfs_buf *bp; - - trace_xfs_attr_node_list(context); - - cursor = context->cursor; - cursor->initted = 1; - - /* - * Do all sorts of validation on the passed-in cursor structure. - * If anything is amiss, ignore the cursor and look up the hashval - * starting from the btree root. - */ - bp = NULL; - if (cursor->blkno > 0) { - error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); - if ((error != 0) && (error != EFSCORRUPTED)) - return(error); - if (bp) { - struct xfs_attr_leaf_entry *entries; - - node = bp->b_addr; - switch (be16_to_cpu(node->hdr.info.magic)) { - case XFS_DA_NODE_MAGIC: - case XFS_DA3_NODE_MAGIC: - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); - bp = NULL; - break; - case XFS_ATTR_LEAF_MAGIC: - case XFS_ATTR3_LEAF_MAGIC: - leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); - entries = xfs_attr3_leaf_entryp(leaf); - if (cursor->hashval > be32_to_cpu( - entries[leafhdr.count - 1].hashval)) { - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); - bp = NULL; - } else if (cursor->hashval <= be32_to_cpu( - entries[0].hashval)) { - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); - bp = NULL; - } - break; - default: - trace_xfs_attr_list_wrong_blk(context); - xfs_trans_brelse(NULL, bp); - bp = NULL; - } - } - } - - /* - * We did not find what we expected given the cursor's contents, - * so we start from the top and work down based on the hash value. 
- * Note that start of node block is same as start of leaf block. - */ - if (bp == NULL) { - cursor->blkno = 0; - for (;;) { - __uint16_t magic; - - error = xfs_da3_node_read(NULL, context->dp, - cursor->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) - return(error); - node = bp->b_addr; - magic = be16_to_cpu(node->hdr.info.magic); - if (magic == XFS_ATTR_LEAF_MAGIC || - magic == XFS_ATTR3_LEAF_MAGIC) - break; - if (magic != XFS_DA_NODE_MAGIC && - magic != XFS_DA3_NODE_MAGIC) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, - node); - xfs_trans_brelse(NULL, bp); - return XFS_ERROR(EFSCORRUPTED); - } - - xfs_da3_node_hdr_from_disk(&nodehdr, node); - btree = xfs_da3_node_tree_p(node); - for (i = 0; i < nodehdr.count; btree++, i++) { - if (cursor->hashval - <= be32_to_cpu(btree->hashval)) { - cursor->blkno = be32_to_cpu(btree->before); - trace_xfs_attr_list_node_descend(context, - btree); - break; - } - } - if (i == nodehdr.count) { - xfs_trans_brelse(NULL, bp); - return 0; - } - xfs_trans_brelse(NULL, bp); - } - } - ASSERT(bp != NULL); - - /* - * Roll upward through the blocks, processing each leaf block in - * order. As long as there is space in the result buffer, keep - * adding the information. - */ - for (;;) { - leaf = bp->b_addr; - error = xfs_attr3_leaf_list_int(bp, context); - if (error) { - xfs_trans_brelse(NULL, bp); - return error; - } - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); - if (context->seen_enough || leafhdr.forw == 0) - break; - cursor->blkno = leafhdr.forw; - xfs_trans_brelse(NULL, bp); - error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1, - &bp); - if (error) - return error; - } - xfs_trans_brelse(NULL, bp); - return 0; -} diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index de8dd58da46c..dd4824589470 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -141,5 +141,14 @@ typedef struct xfs_attr_list_context { */ int xfs_attr_inactive(struct xfs_inode *dp); int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_inode_hasattr(struct xfs_inode *ip); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, + int flags, struct attrlist_cursor_kern *cursor); + #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c new file mode 100644 index 000000000000..bb24b07cbedb --- /dev/null +++ b/fs/xfs/xfs_attr_inactive.c @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_remote.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_trans_priv.h" + +/* + * Look at all the extents for this logical region, + * invalidate any buffers that are incore/in transactions. + */ +STATIC int +xfs_attr3_leaf_freextent( + struct xfs_trans **trans, + struct xfs_inode *dp, + xfs_dablk_t blkno, + int blkcnt) +{ + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_dablk_t tblkno; + xfs_daddr_t dblkno; + int tblkcnt; + int dblkcnt; + int nmap; + int error; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + tblkno = blkno; + tblkcnt = blkcnt; + while (tblkcnt > 0) { + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + + /* + * If it's a hole, these are already unmapped + * so there's nothing to invalidate. + */ + if (map.br_startblock != HOLESTARTBLOCK) { + + dblkno = XFS_FSB_TO_DADDR(dp->i_mount, + map.br_startblock); + dblkcnt = XFS_FSB_TO_BB(dp->i_mount, + map.br_blockcount); + bp = xfs_trans_get_buf(*trans, + dp->i_mount->m_ddev_targp, + dblkno, dblkcnt, 0); + if (!bp) + return ENOMEM; + xfs_trans_binval(*trans, bp); + /* + * Roll to next transaction. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return (error); + } + + tblkno += map.br_blockcount; + tblkcnt -= map.br_blockcount; + } + + return(0); +} + +/* + * Invalidate all of the "remote" value regions pointed to by a particular + * leaf block. + * Note that we must release the lock on the buffer so that we are not + * caught holding something that the logging code wants to flush to disk. + */ +STATIC int +xfs_attr3_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_attr_inactive_list *list; + struct xfs_attr_inactive_list *lp; + int error; + int count; + int size; + int tmp; + int i; + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + /* + * Count the number of "remote" value extents. + */ + count = 0; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) + count++; + } + } + + /* + * If there are no "remote" values, we're done. 
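+ *
+ * Entries flagged XFS_ATTR_LOCAL keep their value inline in this
+ * leaf block and need no extra invalidation; only entries whose
+ * value lives in separate remote blocks are counted here.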
+ */ + if (count == 0) { + xfs_trans_brelse(*trans, bp); + return 0; + } + + /* + * Allocate storage for a list of all the "remote" value extents. + */ + size = count * sizeof(xfs_attr_inactive_list_t); + list = kmem_alloc(size, KM_SLEEP); + + /* + * Identify each of the "remote" value extents. + */ + lp = list; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) { + lp->valueblk = be32_to_cpu(name_rmt->valueblk); + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + lp++; + } + } + } + xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + + /* + * Invalidate each of the "remote" value extents. + */ + error = 0; + for (lp = list, i = 0; i < count; i++, lp++) { + tmp = xfs_attr3_leaf_freextent(trans, dp, + lp->valueblk, lp->valuelen); + + if (error == 0) + error = tmp; /* save only the 1st errno */ + } + + kmem_free(list); + return error; +} + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +STATIC int +xfs_attr3_node_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) +{ + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + int error, i; + struct xfs_buf *child_bp; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr ichdr; + + /* + * Since this code is recursive (gasp!) we must protect ourselves. + */ + if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + return XFS_ERROR(EIO); + } + + node = bp->b_addr; + xfs_da3_node_hdr_from_disk(&ichdr, node); + parent_blkno = bp->b_bn; + if (!ichdr.count) { + xfs_trans_brelse(*trans, bp); + return 0; + } + btree = xfs_da3_node_tree_p(node); + child_fsb = be32_to_cpu(btree[0].before); + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + + /* + * If this is the node level just above the leaves, simply loop + * over the leaves removing all of them. If this is higher up + * in the tree, recurse downward. + */ + for (i = 0; i < ichdr.count; i++) { + /* + * Read the subsidiary block to see what we have to work with. + * Don't do this in a transaction. This is a depth-first + * traversal of the tree so we may deal with many blocks + * before we come back to this one. + */ + error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); + if (error) + return(error); + if (child_bp) { + /* save for re-read later */ + child_blkno = XFS_BUF_ADDR(child_bp); + + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, + child_bp, level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, + child_bp); + break; + default: + error = XFS_ERROR(EIO); + xfs_trans_brelse(*trans, child_bp); + break; + } + if (error) + return error; + + /* + * Remove the subsidiary block from the cache + * and from the log. 
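+ *
+ * The block's contents no longer matter at this point;
+ * xfs_da_get_buf() is used rather than a read because all we need
+ * is a locked buffer in the current transaction to pass to
+ * xfs_trans_binval().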
+ */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, + &child_bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, child_bp); + } + + /* + * If we're not done, re-read the parent to get the next + * child block number. + */ + if (i + 1 < ichdr.count) { + error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); + if (error) + return error; + child_fsb = be32_to_cpu(btree[i + 1].before); + xfs_trans_brelse(*trans, bp); + } + /* + * Atomically commit the whole invalidate stuff. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return error; + } + + return 0; +} + +/* + * Indiscriminately delete the entire attribute fork + * + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +int +xfs_attr3_root_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp) +{ + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + xfs_daddr_t blkno; + int error; + + /* + * Read block 0 to see what we have to work with. + * We only get here if we have extents, since we remove + * the extents in reverse order the extent containing + * block 0 must still be there. + */ + error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return error; + blkno = bp->b_bn; + + /* + * Invalidate the tree, even if the "tree" is only a single leaf block. + * This is a depth-first traversal! + */ + info = bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, bp, 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, bp); + break; + default: + error = XFS_ERROR(EIO); + xfs_trans_brelse(*trans, bp); + break; + } + if (error) + return error; + + /* + * Invalidate the incore copy of the root block. + */ + error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, bp); /* remove from cache */ + /* + * Commit the invalidate and start the next transaction. + */ + error = xfs_trans_roll(trans, dp); + + return error; +} + +int +xfs_attr_inactive(xfs_inode_t *dp) +{ + xfs_trans_t *trans; + xfs_mount_t *mp; + int error; + + mp = dp->i_mount; + ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return 0; + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); + error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); + if (error) { + xfs_trans_cancel(trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. + */ + xfs_trans_ijoin(trans, dp, 0); + + /* + * Decide on what work routines to call based on the inode size. 
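+ *
+ * This repeats the check made above under the shared lock; that
+ * lock was dropped before the transaction was set up, so the fork
+ * state must be re-verified now that XFS_ILOCK_EXCL is held.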
+ */ + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = 0; + goto out; + } + error = xfs_attr3_root_inactive(&trans, dp); + if (error) + goto out; + + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) + goto out; + + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index b800fbcafc7f..86db20a9cc02 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -22,6 +22,7 @@ #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -78,16 +79,6 @@ STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, int *number_usedbytes_in_blk1); /* - * Routines used for shrinking the Btree. - */ -STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - struct xfs_buf *bp, int level); -STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - struct xfs_buf *bp); -STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dablk_t blkno, int blkcnt); - -/* * Utility routines. */ STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf, @@ -635,7 +626,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) xfs_attr_sf_entry_t *sfe; int i; - ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE); + ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; @@ -751,182 +742,6 @@ out: return(error); } -STATIC int -xfs_attr_shortform_compare(const void *a, const void *b) -{ - xfs_attr_sf_sort_t *sa, *sb; - - sa = (xfs_attr_sf_sort_t *)a; - sb = (xfs_attr_sf_sort_t *)b; - if (sa->hash < sb->hash) { - return(-1); - } else if (sa->hash > sb->hash) { - return(1); - } else { - return(sa->entno - sb->entno); - } -} - - -#define XFS_ISRESET_CURSOR(cursor) \ - (!((cursor)->initted) && !((cursor)->hashval) && \ - !((cursor)->blkno) && !((cursor)->offset)) -/* - * Copy out entries of shortform attribute lists for attr_list(). - * Shortform attribute lists are not stored in hashval sorted order. - * If the output buffer is not large enough to hold them all, then we - * we have to calculate each entries' hashvalue and sort them before - * we can begin returning them to the user. - */ -/*ARGSUSED*/ -int -xfs_attr_shortform_list(xfs_attr_list_context_t *context) -{ - attrlist_cursor_kern_t *cursor; - xfs_attr_sf_sort_t *sbuf, *sbp; - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - xfs_inode_t *dp; - int sbsize, nsbuf, count, i; - int error; - - ASSERT(context != NULL); - dp = context->dp; - ASSERT(dp != NULL); - ASSERT(dp->i_afp != NULL); - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; - ASSERT(sf != NULL); - if (!sf->hdr.count) - return(0); - cursor = context->cursor; - ASSERT(cursor != NULL); - - trace_xfs_attr_list_sf(context); - - /* - * If the buffer is large enough and the cursor is at the start, - * do not bother with sorting since we will return everything in - * one buffer and another call using the cursor won't need to be - * made. - * Note the generous fudge factor of 16 overhead bytes per entry. 
- * If bufsize is zero then put_listent must be a search function - * and can just scan through what we have. - */ - if (context->bufsize == 0 || - (XFS_ISRESET_CURSOR(cursor) && - (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - error = context->put_listent(context, - sfe->flags, - sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen, - &sfe->nameval[sfe->namelen]); - - /* - * Either search callback finished early or - * didn't fit it all in the buffer after all. - */ - if (context->seen_enough) - break; - - if (error) - return error; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - } - trace_xfs_attr_list_sf_all(context); - return(0); - } - - /* do no more for a search callback */ - if (context->bufsize == 0) - return 0; - - /* - * It didn't all fit, so we have to sort everything on hashval. - */ - sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); - - /* - * Scan the attribute list for the rest of the entries, storing - * the relevant info from only those that match into a buffer. - */ - nsbuf = 0; - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - if (unlikely( - ((char *)sfe < (char *)sf) || - ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { - XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, sfe); - kmem_free(sbuf); - return XFS_ERROR(EFSCORRUPTED); - } - - sbp->entno = i; - sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); - sbp->name = sfe->nameval; - sbp->namelen = sfe->namelen; - /* These are bytes, and both on-disk, don't endian-flip */ - sbp->valuelen = sfe->valuelen; - sbp->flags = sfe->flags; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - sbp++; - nsbuf++; - } - - /* - * Sort the entries on hash then entno. - */ - xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); - - /* - * Re-find our place IN THE SORTED LIST. - */ - count = 0; - cursor->initted = 1; - cursor->blkno = 0; - for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { - if (sbp->hash == cursor->hashval) { - if (cursor->offset == count) { - break; - } - count++; - } else if (sbp->hash > cursor->hashval) { - break; - } - } - if (i == nsbuf) { - kmem_free(sbuf); - return(0); - } - - /* - * Loop putting entries into the user buffer. - */ - for ( ; i < nsbuf; i++, sbp++) { - if (cursor->hashval != sbp->hash) { - cursor->hashval = sbp->hash; - cursor->offset = 0; - } - error = context->put_listent(context, - sbp->flags, - sbp->name, - sbp->namelen, - sbp->valuelen, - &sbp->name[sbp->namelen]); - if (error) - return error; - if (context->seen_enough) - break; - cursor->offset++; - } - - kmem_free(sbuf); - return(0); -} - /* * Check a leaf attribute block to see if all the entries would fit into * a shortform attribute list. @@ -1121,7 +936,6 @@ out: return error; } - /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -1482,7 +1296,6 @@ xfs_attr3_leaf_compact( ichdr_dst->freemap[0].size = ichdr_dst->firstused - ichdr_dst->freemap[0].base; - /* write the header back to initialise the underlying buffer */ xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); @@ -2643,130 +2456,6 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) return size; } -/* - * Copy out attribute list entries for attr_list(), for leaf attribute lists. 
- */ -int -xfs_attr3_leaf_list_int( - struct xfs_buf *bp, - struct xfs_attr_list_context *context) -{ - struct attrlist_cursor_kern *cursor; - struct xfs_attr_leafblock *leaf; - struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_attr_leaf_entry *entries; - struct xfs_attr_leaf_entry *entry; - int retval; - int i; - - trace_xfs_attr_list_leaf(context); - - leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); - entries = xfs_attr3_leaf_entryp(leaf); - - cursor = context->cursor; - cursor->initted = 1; - - /* - * Re-find our place in the leaf block if this is a new syscall. - */ - if (context->resynch) { - entry = &entries[0]; - for (i = 0; i < ichdr.count; entry++, i++) { - if (be32_to_cpu(entry->hashval) == cursor->hashval) { - if (cursor->offset == context->dupcnt) { - context->dupcnt = 0; - break; - } - context->dupcnt++; - } else if (be32_to_cpu(entry->hashval) > - cursor->hashval) { - context->dupcnt = 0; - break; - } - } - if (i == ichdr.count) { - trace_xfs_attr_list_notfound(context); - return 0; - } - } else { - entry = &entries[0]; - i = 0; - } - context->resynch = 0; - - /* - * We have found our place, start copying out the new attributes. - */ - retval = 0; - for (; i < ichdr.count; entry++, i++) { - if (be32_to_cpu(entry->hashval) != cursor->hashval) { - cursor->hashval = be32_to_cpu(entry->hashval); - cursor->offset = 0; - } - - if (entry->flags & XFS_ATTR_INCOMPLETE) - continue; /* skip incomplete entries */ - - if (entry->flags & XFS_ATTR_LOCAL) { - xfs_attr_leaf_name_local_t *name_loc = - xfs_attr3_leaf_name_local(leaf, i); - - retval = context->put_listent(context, - entry->flags, - name_loc->nameval, - (int)name_loc->namelen, - be16_to_cpu(name_loc->valuelen), - &name_loc->nameval[name_loc->namelen]); - if (retval) - return retval; - } else { - xfs_attr_leaf_name_remote_t *name_rmt = - xfs_attr3_leaf_name_remote(leaf, i); - - int valuelen = be32_to_cpu(name_rmt->valuelen); - - if (context->put_value) { - xfs_da_args_t args; - - memset((char *)&args, 0, sizeof(args)); - args.dp = context->dp; - args.whichfork = XFS_ATTR_FORK; - args.valuelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); - args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = xfs_attr3_rmt_blocks( - args.dp->i_mount, valuelen); - retval = xfs_attr_rmtval_get(&args); - if (retval) - return retval; - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); - kmem_free(args.value); - } else { - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - NULL); - } - if (retval) - return retval; - } - if (context->seen_enough) - break; - cursor->offset++; - } - trace_xfs_attr_list_leaf_end(context); - return retval; -} - /*======================================================================== * Manage the INCOMPLETE flag in a leaf entry @@ -3011,345 +2700,3 @@ xfs_attr3_leaf_flipflags( return error; } - -/*======================================================================== - * Indiscriminately delete the entire attribute fork - *========================================================================*/ - -/* - * Recurse (gasp!) through the attribute nodes until we find leaves. - * We're doing a depth-first traversal in order to invalidate everything. 
- */ -int -xfs_attr3_root_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp) -{ - struct xfs_da_blkinfo *info; - struct xfs_buf *bp; - xfs_daddr_t blkno; - int error; - - /* - * Read block 0 to see what we have to work with. - * We only get here if we have extents, since we remove - * the extents in reverse order the extent containing - * block 0 must still be there. - */ - error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); - if (error) - return error; - blkno = bp->b_bn; - - /* - * Invalidate the tree, even if the "tree" is only a single leaf block. - * This is a depth-first traversal! - */ - info = bp->b_addr; - switch (info->magic) { - case cpu_to_be16(XFS_DA_NODE_MAGIC): - case cpu_to_be16(XFS_DA3_NODE_MAGIC): - error = xfs_attr3_node_inactive(trans, dp, bp, 1); - break; - case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): - case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): - error = xfs_attr3_leaf_inactive(trans, dp, bp); - break; - default: - error = XFS_ERROR(EIO); - xfs_trans_brelse(*trans, bp); - break; - } - if (error) - return error; - - /* - * Invalidate the incore copy of the root block. - */ - error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); - if (error) - return error; - xfs_trans_binval(*trans, bp); /* remove from cache */ - /* - * Commit the invalidate and start the next transaction. - */ - error = xfs_trans_roll(trans, dp); - - return error; -} - -/* - * Recurse (gasp!) through the attribute nodes until we find leaves. - * We're doing a depth-first traversal in order to invalidate everything. - */ -STATIC int -xfs_attr3_node_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp, - int level) -{ - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_dablk_t child_fsb; - xfs_daddr_t parent_blkno, child_blkno; - int error, i; - struct xfs_buf *child_bp; - struct xfs_da_node_entry *btree; - struct xfs_da3_icnode_hdr ichdr; - - /* - * Since this code is recursive (gasp!) we must protect ourselves. - */ - if (level > XFS_DA_NODE_MAXDEPTH) { - xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - return XFS_ERROR(EIO); - } - - node = bp->b_addr; - xfs_da3_node_hdr_from_disk(&ichdr, node); - parent_blkno = bp->b_bn; - if (!ichdr.count) { - xfs_trans_brelse(*trans, bp); - return 0; - } - btree = xfs_da3_node_tree_p(node); - child_fsb = be32_to_cpu(btree[0].before); - xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - - /* - * If this is the node level just above the leaves, simply loop - * over the leaves removing all of them. If this is higher up - * in the tree, recurse downward. - */ - for (i = 0; i < ichdr.count; i++) { - /* - * Read the subsidiary block to see what we have to work with. - * Don't do this in a transaction. This is a depth-first - * traversal of the tree so we may deal with many blocks - * before we come back to this one. - */ - error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK); - if (error) - return(error); - if (child_bp) { - /* save for re-read later */ - child_blkno = XFS_BUF_ADDR(child_bp); - - /* - * Invalidate the subtree, however we have to. 
- */ - info = child_bp->b_addr; - switch (info->magic) { - case cpu_to_be16(XFS_DA_NODE_MAGIC): - case cpu_to_be16(XFS_DA3_NODE_MAGIC): - error = xfs_attr3_node_inactive(trans, dp, - child_bp, level + 1); - break; - case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): - case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): - error = xfs_attr3_leaf_inactive(trans, dp, - child_bp); - break; - default: - error = XFS_ERROR(EIO); - xfs_trans_brelse(*trans, child_bp); - break; - } - if (error) - return error; - - /* - * Remove the subsidiary block from the cache - * and from the log. - */ - error = xfs_da_get_buf(*trans, dp, 0, child_blkno, - &child_bp, XFS_ATTR_FORK); - if (error) - return error; - xfs_trans_binval(*trans, child_bp); - } - - /* - * If we're not done, re-read the parent to get the next - * child block number. - */ - if (i + 1 < ichdr.count) { - error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); - if (error) - return error; - child_fsb = be32_to_cpu(btree[i + 1].before); - xfs_trans_brelse(*trans, bp); - } - /* - * Atomically commit the whole invalidate stuff. - */ - error = xfs_trans_roll(trans, dp); - if (error) - return error; - } - - return 0; -} - -/* - * Invalidate all of the "remote" value regions pointed to by a particular - * leaf block. - * Note that we must release the lock on the buffer so that we are not - * caught holding something that the logging code wants to flush to disk. - */ -STATIC int -xfs_attr3_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) -{ - struct xfs_attr_leafblock *leaf; - struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_attr_leaf_entry *entry; - struct xfs_attr_leaf_name_remote *name_rmt; - struct xfs_attr_inactive_list *list; - struct xfs_attr_inactive_list *lp; - int error; - int count; - int size; - int tmp; - int i; - - leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); - - /* - * Count the number of "remote" value extents. - */ - count = 0; - entry = xfs_attr3_leaf_entryp(leaf); - for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) - count++; - } - } - - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_trans_brelse(*trans, bp); - return 0; - } - - /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, KM_SLEEP); - - /* - * Identify each of the "remote" value extents. - */ - lp = list; - entry = xfs_attr3_leaf_entryp(leaf); - for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ - - /* - * Invalidate each of the "remote" value extents. - */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr3_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); - - if (error == 0) - error = tmp; /* save only the 1st errno */ - } - - kmem_free(list); - return error; -} - -/* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. 
- */ -STATIC int -xfs_attr3_leaf_freextent( - struct xfs_trans **trans, - struct xfs_inode *dp, - xfs_dablk_t blkno, - int blkcnt) -{ - struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_dablk_t tblkno; - xfs_daddr_t dblkno; - int tblkcnt; - int dblkcnt; - int nmap; - int error; - - /* - * Roll through the "value", invalidating the attribute value's - * blocks. - */ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { - /* - * Try to remember where we decided to put the value. - */ - nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, - &map, &nmap, XFS_BMAPI_ATTRFORK); - if (error) { - return(error); - } - ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK); - - /* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. - */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, 0); - if (!bp) - return ENOMEM; - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. - */ - error = xfs_trans_roll(trans, dp); - if (error) - return (error); - } - - tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; - } - - return(0); -} diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 444a7704596c..c1022138c7e6 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -333,6 +333,8 @@ int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_buf **bpp); void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); +void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from); extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c new file mode 100644 index 000000000000..cbc80d485177 --- /dev/null +++ b/fs/xfs/xfs_attr_list.c @@ -0,0 +1,655 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+ xfs_attr_sf_sort_t *sa, *sb;
+
+ sa = (xfs_attr_sf_sort_t *)a;
+ sb = (xfs_attr_sf_sort_t *)b;
+ if (sa->hash < sb->hash) {
+ return(-1);
+ } else if (sa->hash > sb->hash) {
+ return(1);
+ } else {
+ return(sa->entno - sb->entno);
+ }
+}
+
+#define XFS_ISRESET_CURSOR(cursor) \
+ (!((cursor)->initted) && !((cursor)->hashval) && \
+ !((cursor)->blkno) && !((cursor)->offset))
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform attribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then
+ * we have to calculate each entry's hashvalue and sort them before
+ * we can begin returning them to the user.
+ */
+int
+xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+{
+ attrlist_cursor_kern_t *cursor;
+ xfs_attr_sf_sort_t *sbuf, *sbp;
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ xfs_inode_t *dp;
+ int sbsize, nsbuf, count, i;
+ int error;
+
+ ASSERT(context != NULL);
+ dp = context->dp;
+ ASSERT(dp != NULL);
+ ASSERT(dp->i_afp != NULL);
+ sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ ASSERT(sf != NULL);
+ if (!sf->hdr.count)
+ return(0);
+ cursor = context->cursor;
+ ASSERT(cursor != NULL);
+
+ trace_xfs_attr_list_sf(context);
+
+ /*
+ * If the buffer is large enough and the cursor is at the start,
+ * do not bother with sorting since we will return everything in
+ * one buffer and another call using the cursor won't need to be
+ * made.
+ * Note the generous fudge factor of 16 overhead bytes per entry.
+ * If bufsize is zero then put_listent must be a search function
+ * and can just scan through what we have.
+ */
+ if (context->bufsize == 0 ||
+ (XFS_ISRESET_CURSOR(cursor) &&
+ (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ error = context->put_listent(context,
+ sfe->flags,
+ sfe->nameval,
+ (int)sfe->namelen,
+ (int)sfe->valuelen,
+ &sfe->nameval[sfe->namelen]);
+
+ /*
+ * Either search callback finished early or
+ * didn't fit it all in the buffer after all.
+ */
+ if (context->seen_enough)
+ break;
+
+ if (error)
+ return error;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ }
+ trace_xfs_attr_list_sf_all(context);
+ return(0);
+ }
+
+ /* do no more for a search callback */
+ if (context->bufsize == 0)
+ return 0;
+
+ /*
+ * It didn't all fit, so we have to sort everything on hashval.
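+ *
+ * Copy the entries into a scratch array and sort them by
+ * (hash, entno) so that the hashval/offset cursor below can be
+ * re-found deterministically on a later call.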
+ */ + sbsize = sf->hdr.count * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + + /* + * Scan the attribute list for the rest of the entries, storing + * the relevant info from only those that match into a buffer. + */ + nsbuf = 0; + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (unlikely( + ((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, sfe); + kmem_free(sbuf); + return XFS_ERROR(EFSCORRUPTED); + } + + sbp->entno = i; + sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); + sbp->name = sfe->nameval; + sbp->namelen = sfe->namelen; + /* These are bytes, and both on-disk, don't endian-flip */ + sbp->valuelen = sfe->valuelen; + sbp->flags = sfe->flags; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sbp++; + nsbuf++; + } + + /* + * Sort the entries on hash then entno. + */ + xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); + + /* + * Re-find our place IN THE SORTED LIST. + */ + count = 0; + cursor->initted = 1; + cursor->blkno = 0; + for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { + if (sbp->hash == cursor->hashval) { + if (cursor->offset == count) { + break; + } + count++; + } else if (sbp->hash > cursor->hashval) { + break; + } + } + if (i == nsbuf) { + kmem_free(sbuf); + return(0); + } + + /* + * Loop putting entries into the user buffer. + */ + for ( ; i < nsbuf; i++, sbp++) { + if (cursor->hashval != sbp->hash) { + cursor->hashval = sbp->hash; + cursor->offset = 0; + } + error = context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen, + &sbp->name[sbp->namelen]); + if (error) + return error; + if (context->seen_enough) + break; + cursor->offset++; + } + + kmem_free(sbuf); + return(0); +} + +STATIC int +xfs_attr_node_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int error, i; + struct xfs_buf *bp; + + trace_xfs_attr_node_list(context); + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Do all sorts of validation on the passed-in cursor structure. + * If anything is amiss, ignore the cursor and look up the hashval + * starting from the btree root. 
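+ *
+ * A cursor saved by an earlier call may name a block that has
+ * since been split, joined or converted, so both the magic number
+ * and the leaf's hashval range are checked before it is trusted.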
+ */ + bp = NULL; + if (cursor->blkno > 0) { + error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if ((error != 0) && (error != EFSCORRUPTED)) + return(error); + if (bp) { + struct xfs_attr_leaf_entry *entries; + + node = bp->b_addr; + switch (be16_to_cpu(node->hdr.info.magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + break; + default: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + } + } + + /* + * We did not find what we expected given the cursor's contents, + * so we start from the top and work down based on the hash value. + * Note that start of node block is same as start of leaf block. + */ + if (bp == NULL) { + cursor->blkno = 0; + for (;;) { + __uint16_t magic; + + error = xfs_da3_node_read(NULL, context->dp, + cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, + node); + xfs_trans_brelse(NULL, bp); + return XFS_ERROR(EFSCORRUPTED); + } + + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { + if (cursor->hashval + <= be32_to_cpu(btree->hashval)) { + cursor->blkno = be32_to_cpu(btree->before); + trace_xfs_attr_list_node_descend(context, + btree); + break; + } + } + if (i == nodehdr.count) { + xfs_trans_brelse(NULL, bp); + return 0; + } + xfs_trans_brelse(NULL, bp); + } + } + ASSERT(bp != NULL); + + /* + * Roll upward through the blocks, processing each leaf block in + * order. As long as there is space in the result buffer, keep + * adding the information. + */ + for (;;) { + leaf = bp->b_addr; + error = xfs_attr3_leaf_list_int(bp, context); + if (error) { + xfs_trans_brelse(NULL, bp); + return error; + } + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + if (context->seen_enough || leafhdr.forw == 0) + break; + cursor->blkno = leafhdr.forw; + xfs_trans_brelse(NULL, bp); + error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1, + &bp); + if (error) + return error; + } + xfs_trans_brelse(NULL, bp); + return 0; +} + +/* + * Copy out attribute list entries for attr_list(), for leaf attribute lists. 
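+ *
+ * Several names can hash to the same value, so when resuming the
+ * cursor's offset is matched against a running duplicate count
+ * (dupcnt) to land on the exact entry where the last call stopped.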
+ */ +int +xfs_attr3_leaf_list_int( + struct xfs_buf *bp, + struct xfs_attr_list_context *context) +{ + struct attrlist_cursor_kern *cursor; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_entry *entry; + int retval; + int i; + + trace_xfs_attr_list_leaf(context); + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Re-find our place in the leaf block if this is a new syscall. + */ + if (context->resynch) { + entry = &entries[0]; + for (i = 0; i < ichdr.count; entry++, i++) { + if (be32_to_cpu(entry->hashval) == cursor->hashval) { + if (cursor->offset == context->dupcnt) { + context->dupcnt = 0; + break; + } + context->dupcnt++; + } else if (be32_to_cpu(entry->hashval) > + cursor->hashval) { + context->dupcnt = 0; + break; + } + } + if (i == ichdr.count) { + trace_xfs_attr_list_notfound(context); + return 0; + } + } else { + entry = &entries[0]; + i = 0; + } + context->resynch = 0; + + /* + * We have found our place, start copying out the new attributes. + */ + retval = 0; + for (; i < ichdr.count; entry++, i++) { + if (be32_to_cpu(entry->hashval) != cursor->hashval) { + cursor->hashval = be32_to_cpu(entry->hashval); + cursor->offset = 0; + } + + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* skip incomplete entries */ + + if (entry->flags & XFS_ATTR_LOCAL) { + xfs_attr_leaf_name_local_t *name_loc = + xfs_attr3_leaf_name_local(leaf, i); + + retval = context->put_listent(context, + entry->flags, + name_loc->nameval, + (int)name_loc->namelen, + be16_to_cpu(name_loc->valuelen), + &name_loc->nameval[name_loc->namelen]); + if (retval) + return retval; + } else { + xfs_attr_leaf_name_remote_t *name_rmt = + xfs_attr3_leaf_name_remote(leaf, i); + + int valuelen = be32_to_cpu(name_rmt->valuelen); + + if (context->put_value) { + xfs_da_args_t args; + + memset((char *)&args, 0, sizeof(args)); + args.dp = context->dp; + args.whichfork = XFS_ATTR_FORK; + args.valuelen = valuelen; + args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); + args.rmtblkno = be32_to_cpu(name_rmt->valueblk); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); + retval = xfs_attr_rmtval_get(&args); + if (retval) + return retval; + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); + kmem_free(args.value); + } else { + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + NULL); + } + if (retval) + return retval; + } + if (context->seen_enough) + break; + cursor->offset++; + } + trace_xfs_attr_list_leaf_end(context); + return retval; +} + +/* + * Copy out attribute entries for attr_list(), for leaf attribute lists. 
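+ *
+ * A leaf-form attribute fork is a single block at logical offset 0
+ * of the fork, so reset the cursor and read that block directly.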
+ */ +STATIC int +xfs_attr_leaf_list(xfs_attr_list_context_t *context) +{ + int error; + struct xfs_buf *bp; + + trace_xfs_attr_leaf_list(context); + + context->cursor->blkno = 0; + error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); + if (error) + return XFS_ERROR(error); + + error = xfs_attr3_leaf_list_int(bp, context); + xfs_trans_brelse(NULL, bp); + return XFS_ERROR(error); +} + +int +xfs_attr_list_int( + xfs_attr_list_context_t *context) +{ + int error; + xfs_inode_t *dp = context->dp; + + XFS_STATS_INC(xs_attr_list); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + xfs_ilock(dp, XFS_ILOCK_SHARED); + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(context); + } else { + error = xfs_attr_node_list(context); + } + + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + return error; +} + +#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ + (((struct attrlist_ent *) 0)->a_name - (char *) 0) +#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ + & ~(sizeof(u_int32_t)-1)) + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +STATIC int +xfs_attr_put_listent( + xfs_attr_list_context_t *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + struct attrlist *alist = (struct attrlist *)context->alist; + attrlist_ent_t *aep; + int arraytop; + + ASSERT(!(context->flags & ATTR_KERNOVAL)); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. + */ + if (((context->flags & ATTR_SECURE) == 0) != + ((flags & XFS_ATTR_SECURE) == 0)) + return 0; + if (((context->flags & ATTR_ROOT) == 0) != + ((flags & XFS_ATTR_ROOT) == 0)) + return 0; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + context->firstu -= ATTR_ENTSIZE(namelen); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return 1; + } + + aep = (attrlist_ent_t *)&context->alist[context->firstu]; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); + return 0; +} + +/* + * Generate a list of extended attribute names and optionally + * also value lengths. Positive return value follows the XFS + * convention of being an error, zero or negative return code + * is the length of the buffer returned (negated), indicating + * success. + */ +int +xfs_attr_list( + xfs_inode_t *dp, + char *buffer, + int bufsize, + int flags, + attrlist_cursor_kern_t *cursor) +{ + xfs_attr_list_context_t context; + struct attrlist *alist; + int error; + + /* + * Validate the cursor. 
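+ *
+ * The cursor is round-tripped through the caller between calls,
+ * so sanity check it: padding must be zero, and an uninitialised
+ * cursor must not claim a position. As a purely illustrative
+ * sketch (a hypothetical caller, not part of this change), a
+ * first call starts from a zeroed cursor:
+ *
+ *	struct attrlist_cursor_kern cursor = { 0 };
+ *	error = xfs_attr_list(dp, buffer, bufsize, ATTR_ROOT, &cursor);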
+ */ + if (cursor->pad1 || cursor->pad2) + return(XFS_ERROR(EINVAL)); + if ((cursor->initted == 0) && + (cursor->hashval || cursor->blkno || cursor->offset)) + return XFS_ERROR(EINVAL); + + /* + * Check for a properly aligned buffer. + */ + if (((long)buffer) & (sizeof(int)-1)) + return XFS_ERROR(EFAULT); + if (flags & ATTR_KERNOVAL) + bufsize = 0; + + /* + * Initialize the output buffer. + */ + memset(&context, 0, sizeof(context)); + context.dp = dp; + context.cursor = cursor; + context.resynch = 1; + context.flags = flags; + context.alist = buffer; + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.put_listent = xfs_attr_put_listent; + + alist = (struct attrlist *)context.alist; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list_int(&context); + ASSERT(error >= 0); + return error; +} diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index ef6b0c124528..712a502de619 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -22,6 +22,7 @@ #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -33,6 +34,7 @@ #include "xfs_alloc.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_attr_remote.h" @@ -237,7 +239,7 @@ xfs_attr_rmtval_copyout( xfs_ino_t ino, int *offset, int *valuelen, - char **dst) + __uint8_t **dst) { char *src = bp->b_addr; xfs_daddr_t bno = bp->b_bn; @@ -249,7 +251,7 @@ xfs_attr_rmtval_copyout( int hdr_size = 0; int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); - byte_cnt = min_t(int, *valuelen, byte_cnt); + byte_cnt = min(*valuelen, byte_cnt); if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, @@ -284,7 +286,7 @@ xfs_attr_rmtval_copyin( xfs_ino_t ino, int *offset, int *valuelen, - char **src) + __uint8_t **src) { char *dst = bp->b_addr; xfs_daddr_t bno = bp->b_bn; @@ -337,7 +339,7 @@ xfs_attr_rmtval_get( struct xfs_mount *mp = args->dp->i_mount; struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; - char *dst = args->value; + __uint8_t *dst = args->value; int valuelen = args->valuelen; int nmap; int error; @@ -401,7 +403,7 @@ xfs_attr_rmtval_set( struct xfs_bmbt_irec map; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; - char *src = args->value; + __uint8_t *src = args->value; int blkcnt; int valuelen; int nmap; @@ -543,11 +545,6 @@ xfs_attr_rmtval_remove( /* * Roll through the "value", invalidating the attribute value's blocks. - * Note that args->rmtblkcnt is the minimum number of data blocks we'll - * see for a CRC enabled remote attribute. Each extent will have a - * header, and so we may have more blocks than we realise here. If we - * fail to map the blocks correctly, we'll have problems with the buffer - * lookups. 
*/ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; @@ -628,4 +625,3 @@ xfs_attr_rmtval_remove( } return(0); } - diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 05c698ccb238..92b830901d60 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -17,16 +17,17 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" @@ -39,6 +40,7 @@ #include "xfs_extfree_item.h" #include "xfs_alloc.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_attr_leaf.h" @@ -46,7 +48,6 @@ #include "xfs_trans_space.h" #include "xfs_buf_item.h" #include "xfs_filestream.h" -#include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_symlink.h" @@ -108,19 +109,6 @@ xfs_bmap_compute_maxlevels( mp->m_bm_maxlevels[whichfork] = level; } -/* - * Convert the given file system block to a disk block. We have to treat it - * differently based on whether the file is a real time file or not, because the - * bmap code does. - */ -xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} - STATIC int /* error */ xfs_bmbt_lookup_eq( struct xfs_btree_cur *cur, @@ -263,173 +251,6 @@ xfs_bmap_forkoff_reset( } /* - * Extent tree block counting routines. - */ - -/* - * Count leaf blocks given a range of extent records. - */ -STATIC void -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count) -{ - int b; - - for (b = 0; b < numrecs; b++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); - *count += xfs_bmbt_get_blockcount(frp); - } -} - -/* - * Count leaf blocks given a range of extent records originally - * in btree format. - */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - int *count) -{ - int b; - xfs_bmbt_rec_t *frp; - - for (b = 1; b <= numrecs; b++) { - frp = XFS_BMBT_REC_ADDR(mp, block, b); - *count += xfs_bmbt_disk_get_blockcount(frp); - } -} - -/* - * Recursively walks each level of a btree - * to count total fsblocks is use. 
- */ -STATIC int /* error */ -xfs_bmap_count_tree( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fsblock_t blockno, /* file system block number */ - int levelin, /* level in btree */ - int *count) /* Count of blocks */ -{ - int error; - xfs_buf_t *bp, *nbp; - int level = levelin; - __be64 *pp; - xfs_fsblock_t bno = blockno; - xfs_fsblock_t nextbno; - struct xfs_btree_block *block, *nextblock; - int numrecs; - - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - - if (--level) { - /* Not at node above leaves, count this level of nodes */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - while (nextbno != NULLFSBLOCK) { - error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - nextblock = XFS_BUF_TO_BLOCK(nbp); - nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); - xfs_trans_brelse(tp, nbp); - } - - /* Dive to the next level */ - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - if (unlikely((error = - xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_trans_brelse(tp, bp); - } else { - /* count all level 1 nodes and their leaves */ - for (;;) { - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - numrecs = be16_to_cpu(block->bb_numrecs); - xfs_bmap_disk_count_leaves(mp, block, numrecs, count); - xfs_trans_brelse(tp, bp); - if (nextbno == NULLFSBLOCK) - break; - bno = nextbno; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - } - } - return 0; -} - -/* - * Count fsblocks of the given fork. - */ -int /* error */ -xfs_bmap_count_blocks( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - int *count) /* out: count of blocks */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); - return 0; - } - - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
- */ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, - mp); - return XFS_ERROR(EFSCORRUPTED); - } - - return 0; -} - -/* * Debug/sanity checking code */ @@ -724,8 +545,8 @@ xfs_bmap_trace_exlist( /* * Validate that the bmbt_irecs being returned from bmapi are valid - * given the callers original parameters. Specifically check the - * ranges of the returned irecs to ensure that they only extent beyond + * given the caller's original parameters. Specifically check the + * ranges of the returned irecs to ensure that they only extend beyond * the given parameters if the XFS_BMAPI_ENTIRE flag was set. */ STATIC void @@ -823,7 +644,7 @@ xfs_bmap_add_free( * Remove the entry "free" from the free item list. Prev points to the * previous entry, unless "free" is the head of the list. */ -STATIC void +void xfs_bmap_del_free( xfs_bmap_free_t *flist, /* free item list header */ xfs_bmap_free_item_t *prev, /* previous item on list, if any */ @@ -837,92 +658,6 @@ xfs_bmap_del_free( kmem_zone_free(xfs_bmap_free_item_zone, free); } - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. We never free any extents in - * the first transaction. - * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. - */ -int /* error */ -xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed) /* xact committed or not */ -{ - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - unsigned int logres; /* new log reservation */ - unsigned int logcount; /* new log count */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ - - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; - return 0; - } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, - free->xbfi_blockcount); - logres = ntp->t_log_res; - logcount = ntp->t_log_count; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. 
- */ - if (error) - return error; - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(ntp->t_ticket); - - if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, - logcount))) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = ntp->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); - return error; - } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, - free->xbfi_blockcount); - xfs_bmap_del_free(flist, NULL, free); - } - return 0; -} - /* * Free up any items left in the list. */ @@ -1413,8 +1148,8 @@ xfs_bmap_add_attrfork( blks = XFS_ADDAFORK_SPACE_RES(mp); if (rsvd) tp->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); + if (error) goto error0; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? @@ -1815,7 +1550,7 @@ xfs_bmap_first_unused( } /* - * Returns the file-relative block number of the last block + 1 before + * Returns the file-relative block number of the last block - 1 before * last_block (input value) in the file. * This is not based on i_size, it is based on the extent records. * Returns 0 for local files, as they do not have extent records. @@ -1863,7 +1598,7 @@ xfs_bmap_last_before( return 0; } -STATIC int +int xfs_bmap_last_extent( struct xfs_trans *tp, struct xfs_inode *ip, @@ -1927,29 +1662,6 @@ xfs_bmap_isaeof( } /* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. All offsets are considered outside - * the end of file for an empty fork, so 1 is returned in *eof in that case. - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof) -{ - struct xfs_bmbt_irec rec; - int error; - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); - if (error || *eof) - return error; - - *eof = endoff >= rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* * Returns the file-relative block number of the first block past eof in * the file. This is not based on i_size, it is based on the extent records. * Returns 0 for local files, as they do not have extent records. @@ -3488,7 +3200,7 @@ done: /* * Adjust the size of the new extent based on di_extsize and rt extsize. 
*/ -STATIC int +int xfs_bmap_extsize_align( xfs_mount_t *mp, xfs_bmbt_irec_t *gotp, /* next extent pointer */ @@ -3650,9 +3362,9 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -STATIC void +void xfs_bmap_adjacent( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_fsblock_t adjust; /* adjustment to block numbers */ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ @@ -3799,109 +3511,6 @@ xfs_bmap_adjacent( } STATIC int -xfs_bmap_rtalloc( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ -{ - xfs_alloctype_t atype = 0; /* type for allocation routines */ - int error; /* error return value */ - xfs_mount_t *mp; /* mount point structure */ - xfs_extlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_rtblock_t rtb; - - mp = ap->ip->i_mount; - align = xfs_get_extsz_hint(ap->ip); - prod = align / mp->m_sb.sb_rextsize; - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 1, ap->eof, 0, - ap->conv, &ap->offset, &ap->length); - if (error) - return error; - ASSERT(ap->length); - ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); - - /* - * If the offset & length are not perfectly aligned - * then kill prod, it will just get us in trouble. - */ - if (do_mod(ap->offset, align) || ap->length % align) - prod = 1; - /* - * Set ralen to be the actual requested length in rtextents. - */ - ralen = ap->length / mp->m_sb.sb_rextsize; - /* - * If the old value was close enough to MAXEXTLEN that - * we rounded up to it, cut it back so it's valid again. - * Note that if it's a really large request (bigger than - * MAXEXTLEN), we don't hear about that number, and can't - * adjust the starting point to match it. - */ - if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) - ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; - - /* - * Lock out other modifications to the RT bitmap inode. - */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - - /* - * If it's an allocation to an empty file at offset 0, - * pick an extent that will space things out in the rt area. - */ - if (ap->eof && ap->offset == 0) { - xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ - - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); - if (error) - return error; - ap->blkno = rtx * mp->m_sb.sb_rextsize; - } else { - ap->blkno = 0; - } - - xfs_bmap_adjacent(ap); - - /* - * Realtime allocation, done through xfs_rtallocate_extent. - */ - atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; - do_div(ap->blkno, mp->m_sb.sb_rextsize); - rtb = ap->blkno; - ap->length = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, - &ralen, atype, ap->wasdel, prod, &rtb))) - return error; - if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, - ap->length, &ralen, atype, - ap->wasdel, 1, &rtb))) - return error; - ap->blkno = rtb; - if (ap->blkno != NULLFSBLOCK) { - ap->blkno *= mp->m_sb.sb_rextsize; - ralen *= mp->m_sb.sb_rextsize; - ap->length = ralen; - ap->ip->i_d.di_nblocks += ralen; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= ralen; - /* - * Adjust the disk quota also. This was reserved - * earlier. - */ - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? 
XFS_TRANS_DQ_DELRTBCOUNT : - XFS_TRANS_DQ_RTBCOUNT, (long) ralen); - } else { - ap->length = 0; - } - return 0; -} - -STATIC int xfs_bmap_btalloc_nullfb( struct xfs_bmalloca *ap, struct xfs_alloc_arg *args, @@ -4018,7 +3627,7 @@ xfs_bmap_btalloc_nullfb( STATIC int xfs_bmap_btalloc( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ @@ -4250,7 +3859,7 @@ xfs_bmap_btalloc( */ STATIC int xfs_bmap_alloc( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) return xfs_bmap_rtalloc(ap); @@ -4638,7 +4247,7 @@ xfs_bmapi_delay( } -STATIC int +int __xfs_bmapi_allocate( struct xfs_bmalloca *bma) { @@ -4648,12 +4257,9 @@ __xfs_bmapi_allocate( struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; - int rt; ASSERT(bma->length > 0); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); - /* * For the wasdelay case, we could also just allocate the stuff asked * for in this bmap call but that wouldn't be as good. @@ -4756,45 +4362,6 @@ __xfs_bmapi_allocate( return 0; } -static void -xfs_bmapi_allocate_worker( - struct work_struct *work) -{ - struct xfs_bmalloca *args = container_of(work, - struct xfs_bmalloca, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_bmapi_allocate(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Some allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Otherwise just - * call directly to avoid the context switch overhead here. - */ -int -xfs_bmapi_allocate( - struct xfs_bmalloca *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_bmapi_allocate(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -5789,359 +5356,3 @@ error0: } return error; } - -/* - * returns 1 for success, 0 if we failed to map the extent. - */ -STATIC int -xfs_getbmapx_fix_eof_hole( - xfs_inode_t *ip, /* xfs incore inode pointer */ - struct getbmapx *out, /* output structure */ - int prealloced, /* this is a file with - * preallocated data space */ - __int64_t end, /* last block requested */ - xfs_fsblock_t startblock) -{ - __int64_t fixlen; - xfs_mount_t *mp; /* file system mount point */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent pointer */ - xfs_fileoff_t fileblock; - - if (startblock == HOLESTARTBLOCK) { - mp = ip->i_mount; - out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); - fixlen -= out->bmv_offset; - if (prealloced && out->bmv_offset + out->bmv_length == end) { - /* Came to hole at EOF. Trim it. 
*/ - if (fixlen <= 0) - return 0; - out->bmv_length = fixlen; - } - } else { - if (startblock == DELAYSTARTBLOCK) - out->bmv_block = -2; - else - out->bmv_block = xfs_fsb_to_db(ip, startblock); - fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && - (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) - out->bmv_oflags |= BMV_OF_LAST; - } - - return 1; -} - -/* - * Get inode's extents as described in bmv, and format for output. - * Calls formatter to fill the user's buffer until all extents - * are mapped, until the passed-in bmv->bmv_count slots have - * been filled, or until the formatter short-circuits the loop, - * if it is tracking filled-in extents on its own. - */ -int /* error code */ -xfs_getbmap( - xfs_inode_t *ip, - struct getbmapx *bmv, /* user bmap structure */ - xfs_bmap_format_t formatter, /* format to user */ - void *arg) /* formatter arg */ -{ - __int64_t bmvend; /* last block requested */ - int error = 0; /* return value */ - __int64_t fixlen; /* length for -1 case */ - int i; /* extent number */ - int lock; /* lock state */ - xfs_bmbt_irec_t *map; /* buffer for user's data */ - xfs_mount_t *mp; /* file system mount point */ - int nex; /* # of user extents can do */ - int nexleft; /* # of user extents left */ - int subnex; /* # of bmapi's can do */ - int nmap; /* number of map entries */ - struct getbmapx *out; /* output structure */ - int whichfork; /* data or attr fork */ - int prealloced; /* this is a file with - * preallocated data space */ - int iflags; /* interface flags */ - int bmapi_flags; /* flags for xfs_bmapi */ - int cur_ext = 0; - - mp = ip->i_mount; - iflags = bmv->bmv_iflags; - whichfork = iflags & BMV_IF_ATTRFORK ? 
XFS_ATTR_FORK : XFS_DATA_FORK; - - if (whichfork == XFS_ATTR_FORK) { - if (XFS_IFORK_Q(ip)) { - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && - ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); - } else if (unlikely( - ip->i_d.di_aformat != 0 && - ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - - prealloced = 0; - fixlen = 1LL << 32; - } else { - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); - - if (xfs_get_extsz_hint(ip) || - ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ - prealloced = 1; - fixlen = mp->m_super->s_maxbytes; - } else { - prealloced = 0; - fixlen = XFS_ISIZE(ip); - } - } - - if (bmv->bmv_length == -1) { - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); - bmv->bmv_length = - max_t(__int64_t, fixlen - bmv->bmv_offset, 0); - } else if (bmv->bmv_length == 0) { - bmv->bmv_entries = 0; - return 0; - } else if (bmv->bmv_length < 0) { - return XFS_ERROR(EINVAL); - } - - nex = bmv->bmv_count - 1; - if (nex <= 0) - return XFS_ERROR(EINVAL); - bmvend = bmv->bmv_offset + bmv->bmv_length; - - - if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) - return XFS_ERROR(ENOMEM); - out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); - if (!out) { - out = kmem_zalloc_large(bmv->bmv_count * - sizeof(struct getbmapx)); - if (!out) - return XFS_ERROR(ENOMEM); - } - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { - error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); - if (error) - goto out_unlock_iolock; - } - /* - * even after flushing the inode, there can still be delalloc - * blocks on the inode beyond EOF due to speculative - * preallocation. These are not removed until the release - * function is called or the inode is inactivated. Hence we - * cannot assert here that ip->i_delayed_blks == 0. - */ - } - - lock = xfs_ilock_map_shared(ip); - - /* - * Don't let nex be bigger than the number of extents - * we can have assuming alternating holes and real extents. - */ - if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) - nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - - bmapi_flags = xfs_bmapi_aflag(whichfork); - if (!(iflags & BMV_IF_PREALLOC)) - bmapi_flags |= XFS_BMAPI_IGSTATE; - - /* - * Allocate enough space to handle "subnex" maps at a time. - */ - error = ENOMEM; - subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); - if (!map) - goto out_unlock_ilock; - - bmv->bmv_entries = 0; - - if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && - (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { - error = 0; - goto out_free_map; - } - - nexleft = nex; - - do { - nmap = (nexleft > subnex) ? 
subnex : nexleft; - error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - map, &nmap, bmapi_flags); - if (error) - goto out_free_map; - ASSERT(nmap <= subnex); - - for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { - out[cur_ext].bmv_oflags = 0; - if (map[i].br_state == XFS_EXT_UNWRITTEN) - out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; - else if (map[i].br_startblock == DELAYSTARTBLOCK) - out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; - out[cur_ext].bmv_offset = - XFS_FSB_TO_BB(mp, map[i].br_startoff); - out[cur_ext].bmv_length = - XFS_FSB_TO_BB(mp, map[i].br_blockcount); - out[cur_ext].bmv_unused1 = 0; - out[cur_ext].bmv_unused2 = 0; - - /* - * delayed allocation extents that start beyond EOF can - * occur due to speculative EOF allocation when the - * delalloc extent is larger than the largest freespace - * extent at conversion time. These extents cannot be - * converted by data writeback, so can exist here even - * if we are not supposed to be finding delalloc - * extents. - */ - if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) - ASSERT((iflags & BMV_IF_DELALLOC) != 0); - - if (map[i].br_startblock == HOLESTARTBLOCK && - whichfork == XFS_ATTR_FORK) { - /* came to the end of attribute fork */ - out[cur_ext].bmv_oflags |= BMV_OF_LAST; - goto out_free_map; - } - - if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], - prealloced, bmvend, - map[i].br_startblock)) - goto out_free_map; - - bmv->bmv_offset = - out[cur_ext].bmv_offset + - out[cur_ext].bmv_length; - bmv->bmv_length = - max_t(__int64_t, 0, bmvend - bmv->bmv_offset); - - /* - * In case we don't want to return the hole, - * don't increase cur_ext so that we can reuse - * it in the next loop. - */ - if ((iflags & BMV_IF_NO_HOLES) && - map[i].br_startblock == HOLESTARTBLOCK) { - memset(&out[cur_ext], 0, sizeof(out[cur_ext])); - continue; - } - - nexleft--; - bmv->bmv_entries++; - cur_ext++; - } - } while (nmap && nexleft && bmv->bmv_length); - - out_free_map: - kmem_free(map); - out_unlock_ilock: - xfs_iunlock_map_shared(ip, lock); - out_unlock_iolock: - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - for (i = 0; i < cur_ext; i++) { - int full = 0; /* user array is full */ - - /* format results & advance arg */ - error = formatter(&arg, &out[i], &full); - if (error || full) - break; - } - - if (is_vmalloc_addr(out)) - kmem_free_large(out); - else - kmem_free(out); - return error; -} - -/* - * dead simple method of punching delalyed allocation blocks from a range in - * the inode. Walks a block at a time so will be slow, but is only executed in - * rare error cases so the overhead is not critical. This will alays punch out - * both the start and end blocks, even if the ranges only partially overlap - * them, so it is up to the caller to ensure that partial blocks are not - * passed in. - */ -int -xfs_bmap_punch_delalloc_range( - struct xfs_inode *ip, - xfs_fileoff_t start_fsb, - xfs_fileoff_t length) -{ - xfs_fileoff_t remaining = length; - int error = 0; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - do { - int done; - xfs_bmbt_irec_t imap; - int nimaps = 1; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... 
- */ - error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, - XFS_BMAPI_ENTIRE); - - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "Failed delalloc mapping lookup ino %lld fsb %lld.", - ip->i_ino, start_fsb); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_block; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_block; - } - WARN_ON(imap.br_blockcount == 0); - - /* - * Note: while we initialise the firstblock/flist pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. - */ - xfs_bmap_init(&flist, &firstblock); - error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, - &flist, &done); - if (error) - break; - - ASSERT(!flist.xbf_count && !flist.xbf_first); -next_block: - start_fsb++; - remaining--; - } while(remaining > 0); - - return error; -} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 1cf1292d29b7..33b41f351225 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -108,41 +108,6 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) } /* - * Argument structure for xfs_bmap_alloc. - */ -typedef struct xfs_bmalloca { - xfs_fsblock_t *firstblock; /* i/o first block allocated */ - struct xfs_bmap_free *flist; /* bmap freelist */ - struct xfs_trans *tp; /* transaction pointer */ - struct xfs_inode *ip; /* incore inode pointer */ - struct xfs_bmbt_irec prev; /* extent before the new one */ - struct xfs_bmbt_irec got; /* extent after, or delayed */ - - xfs_fileoff_t offset; /* offset in file filling in */ - xfs_extlen_t length; /* i/o length asked/allocated */ - xfs_fsblock_t blkno; /* starting block of new extent */ - - struct xfs_btree_cur *cur; /* btree cursor */ - xfs_extnum_t idx; /* current extent index */ - int nallocs;/* number of extents alloc'd */ - int logflags;/* flags for transaction logging */ - - xfs_extlen_t total; /* total blocks needed for xaction */ - xfs_extlen_t minlen; /* minimum allocation size (blocks) */ - xfs_extlen_t minleft; /* amount must be left after alloc */ - char eof; /* set if allocating past last extent */ - char wasdel; /* replacing a delayed allocation */ - char userdata;/* set if is user data */ - char aeof; /* allocated space at eof */ - char conv; /* overwriting unwritten extents */ - char stack_switch; - int flags; - struct completion *done; - struct work_struct work; - int result; -} xfs_bmalloca_t; - -/* * Flags for xfs_bmap_add_extent*. 
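Two things happen in this xfs_bmap.h hunk: the xfs_bmalloca structure leaves the header (it moves along with the allocation code, presumably into the new xfs_bmap_util.h, which this excerpt does not show), and just below, the trace prototype guard is repaired. The old guard was dead code because of a misspelling: the kernel defines __KERNEL__, never __KERNEL.

        /* old guard, never true: __KERNEL (sic) is not a defined macro */
        #if defined(__KERNEL) && defined(DEBUG)
        #endif

        /* new guard: the header is kernel-only now, so DEBUG alone suffices */
        #ifdef DEBUG
        #endif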
*/ #define BMAP_LEFT_CONTIG (1 << 0) @@ -162,7 +127,7 @@ typedef struct xfs_bmalloca { { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" } -#if defined(__KERNEL) && defined(DEBUG) +#ifdef DEBUG void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int whichfork, unsigned long caller_ip); #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ @@ -205,23 +170,4 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); -#ifdef __KERNEL__ -/* bmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); - -int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, - int *committed); -int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, - xfs_bmap_format_t formatter, void *arg); -int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, - int whichfork, int *eof); -int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork, int *count); -int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t length); - -xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); - -#endif /* __KERNEL__ */ - #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 0c61a22be6fd..cf3bc76710c3 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -17,7 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_trans.h" @@ -722,7 +722,7 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } -static int +static bool xfs_bmbt_verify( struct xfs_buf *bp) { @@ -775,7 +775,6 @@ xfs_bmbt_verify( return false; return true; - } static void @@ -789,7 +788,6 @@ xfs_bmbt_read_verify( bp->b_target->bt_mount, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); } - } static void diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c new file mode 100644 index 000000000000..541d59f5e658 --- /dev/null +++ b/fs/xfs/xfs_bmap_util.c @@ -0,0 +1,2026 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_extfree_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_icache.h" + +/* Kernel only BMAP related definitions and functions */ + +/* + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. + */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. We never free any extents in + * the first transaction. + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent item */ + struct xfs_trans_res tres; /* new log reservation */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + + tres.tr_logres = ntp->t_log_res; + tres.tr_logcount = ntp->t_log_count; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. 
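The function being reassembled here, xfs_bmap_finish, is the canonical XFS transaction roll: log the free intents (EFI) in the current transaction, commit it, and carry the permanent log reservation over into a duplicate so the extent frees and their done items (EFD) happen in a fresh transaction. Condensed to just the roll, using only calls that appear in the function body below:

        ntp = xfs_trans_dup(*tp);               /* clone before committing */
        error = xfs_trans_commit(*tp, 0);
        *tp = ntp;
        *committed = 1;                         /* caller must know we rolled */
        if (error)
                return error;
        xfs_log_ticket_put(ntp->t_ticket);      /* drop the extra dup reference */
        error = xfs_trans_reserve(ntp, &tres, 0, 0);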
+ */ + if (error) + return error; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + + error = xfs_trans_reserve(ntp, &tres, 0, 0); + if (error) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount point structure */ + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_rtblock_t rtb; + + mp = ap->ip->i_mount; + align = xfs_get_extsz_hint(ap->ip); + prod = align / mp->m_sb.sb_rextsize; + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 1, ap->eof, 0, + ap->conv, &ap->offset, &ap->length); + if (error) + return error; + ASSERT(ap->length); + ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); + + /* + * If the offset & length are not perfectly aligned + * then kill prod, it will just get us in trouble. + */ + if (do_mod(ap->offset, align) || ap->length % align) + prod = 1; + /* + * Set ralen to be the actual requested length in rtextents. + */ + ralen = ap->length / mp->m_sb.sb_rextsize; + /* + * If the old value was close enough to MAXEXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * MAXEXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) + ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + + /* + * Lock out other modifications to the RT bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + /* + * If it's an allocation to an empty file at offset 0, + * pick an extent that will space things out in the rt area. + */ + if (ap->eof && ap->offset == 0) { + xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (error) + return error; + ap->blkno = rtx * mp->m_sb.sb_rextsize; + } else { + ap->blkno = 0; + } + + xfs_bmap_adjacent(ap); + + /* + * Realtime allocation, done through xfs_rtallocate_extent. + */ + atype = ap->blkno == 0 ? 
XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->blkno, mp->m_sb.sb_rextsize); + rtb = ap->blkno; + ap->length = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, + &ralen, atype, ap->wasdel, prod, &rtb))) + return error; + if (rtb == NULLFSBLOCK && prod > 1 && + (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, + ap->length, &ralen, atype, + ap->wasdel, 1, &rtb))) + return error; + ap->blkno = rtb; + if (ap->blkno != NULLFSBLOCK) { + ap->blkno *= mp->m_sb.sb_rextsize; + ralen *= mp->m_sb.sb_rextsize; + ap->length = ralen; + ap->ip->i_d.di_nblocks += ralen; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= ralen; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT, (long) ralen); + } else { + ap->length = 0; + } + return 0; +} + +/* + * Stack switching interfaces for allocation + */ +static void +xfs_bmapi_allocate_worker( + struct work_struct *work) +{ + struct xfs_bmalloca *args = container_of(work, + struct xfs_bmalloca, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_bmapi_allocate(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Some allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Otherwise just + * call directly to avoid the context switch overhead here. + */ +int +xfs_bmapi_allocate( + struct xfs_bmalloca *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->stack_switch) + return __xfs_bmapi_allocate(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Extent tree block counting routines. + */ + +/* + * Count leaf blocks given a range of extent records. + */ +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count) +{ + int b; + + for (b = 0; b < numrecs; b++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); + *count += xfs_bmbt_get_blockcount(frp); + } +} + +/* + * Count leaf blocks given a range of extent records originally + * in btree format. + */ +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count) +{ + int b; + xfs_bmbt_rec_t *frp; + + for (b = 1; b <= numrecs; b++) { + frp = XFS_BMBT_REC_ADDR(mp, block, b); + *count += xfs_bmbt_disk_get_blockcount(frp); + } +} + +/* + * Recursively walks each level of a btree + * to count total fsblocks in use. 
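The xfs_bmapi_allocate()/worker pair above is a general kernel idiom: when the current stack is nearly exhausted, bounce the call to a workqueue and block on a completion so the caller still sees a synchronous function. A minimal generic sketch of the same shape, with do_the_work() as a hypothetical payload:

        struct offload {
                struct work_struct      work;   /* queued on the workqueue */
                struct completion       *done;  /* caller sleeps on this */
                int                     result;
        };

        static void offload_worker(struct work_struct *work)
        {
                struct offload *o = container_of(work, struct offload, work);

                o->result = do_the_work();      /* hypothetical payload */
                complete(o->done);              /* wake the waiting caller */
        }

        static int offload_run(struct offload *o)
        {
                DECLARE_COMPLETION_ONSTACK(done);

                o->done = &done;
                INIT_WORK_ONSTACK(&o->work, offload_worker);
                queue_work(system_wq, &o->work);        /* fresh worker stack */
                wait_for_completion(&done);
                return o->result;
        }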
+ */ +STATIC int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + __be64 *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + struct xfs_btree_block *block, *nextblock; + int numrecs; + + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + + if (--level) { + /* Not at node above leaves, count this level of nodes */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + while (nextbno != NULLFSBLOCK) { + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + if (unlikely((error = + xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + numrecs = be16_to_cpu(block->bb_numrecs); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + } + } + return 0; +} + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + xfs_bmap_count_leaves(ifp, 0, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count); + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + + return 0; +} + +/* + * returns 1 for success, 0 if we failed to map the extent. 
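With the extent counting code landed here, xfs_bmap_count_blocks is the entry point the rest of the kernel uses. A hedged usage sketch, with the caller presumed to hold the inode lock:

        int     count = 0;
        int     error;

        /* total fsblocks referenced by the data fork: btree blocks plus
         * the blockcounts of every extent record */
        error = xfs_bmap_count_blocks(tp, ip, XFS_DATA_FORK, &count);
        if (error)
                return error;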
+ */ +STATIC int +xfs_getbmapx_fix_eof_hole( + xfs_inode_t *ip, /* xfs incore inode pointer */ + struct getbmapx *out, /* output structure */ + int prealloced, /* this is a file with + * preallocated data space */ + __int64_t end, /* last block requested */ + xfs_fsblock_t startblock) +{ + __int64_t fixlen; + xfs_mount_t *mp; /* file system mount point */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent pointer */ + xfs_fileoff_t fileblock; + + if (startblock == HOLESTARTBLOCK) { + mp = ip->i_mount; + out->bmv_block = -1; + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); + fixlen -= out->bmv_offset; + if (prealloced && out->bmv_offset + out->bmv_length == end) { + /* Came to hole at EOF. Trim it. */ + if (fixlen <= 0) + return 0; + out->bmv_length = fixlen; + } + } else { + if (startblock == DELAYSTARTBLOCK) + out->bmv_block = -2; + else + out->bmv_block = xfs_fsb_to_db(ip, startblock); + fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + out->bmv_oflags |= BMV_OF_LAST; + } + + return 1; +} + +/* + * Get inode's extents as described in bmv, and format for output. + * Calls formatter to fill the user's buffer until all extents + * are mapped, until the passed-in bmv->bmv_count slots have + * been filled, or until the formatter short-circuits the loop, + * if it is tracking filled-in extents on its own. + */ +int /* error code */ +xfs_getbmap( + xfs_inode_t *ip, + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg) /* formatter arg */ +{ + __int64_t bmvend; /* last block requested */ + int error = 0; /* return value */ + __int64_t fixlen; /* length for -1 case */ + int i; /* extent number */ + int lock; /* lock state */ + xfs_bmbt_irec_t *map; /* buffer for user's data */ + xfs_mount_t *mp; /* file system mount point */ + int nex; /* # of user extents can do */ + int nexleft; /* # of user extents left */ + int subnex; /* # of bmapi's can do */ + int nmap; /* number of map entries */ + struct getbmapx *out; /* output structure */ + int whichfork; /* data or attr fork */ + int prealloced; /* this is a file with + * preallocated data space */ + int iflags; /* interface flags */ + int bmapi_flags; /* flags for xfs_bmapi */ + int cur_ext = 0; + + mp = ip->i_mount; + iflags = bmv->bmv_iflags; + whichfork = iflags & BMV_IF_ATTRFORK ? 
XFS_ATTR_FORK : XFS_DATA_FORK; + + if (whichfork == XFS_ATTR_FORK) { + if (XFS_IFORK_Q(ip)) { + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && + ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + } else if (unlikely( + ip->i_d.di_aformat != 0 && + ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + + prealloced = 0; + fixlen = 1LL << 32; + } else { + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + + if (xfs_get_extsz_hint(ip) || + ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ + prealloced = 1; + fixlen = mp->m_super->s_maxbytes; + } else { + prealloced = 0; + fixlen = XFS_ISIZE(ip); + } + } + + if (bmv->bmv_length == -1) { + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); + bmv->bmv_length = + max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + } else if (bmv->bmv_length == 0) { + bmv->bmv_entries = 0; + return 0; + } else if (bmv->bmv_length < 0) { + return XFS_ERROR(EINVAL); + } + + nex = bmv->bmv_count - 1; + if (nex <= 0) + return XFS_ERROR(EINVAL); + bmvend = bmv->bmv_offset + bmv->bmv_length; + + + if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) + return XFS_ERROR(ENOMEM); + out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); + if (!out) { + out = kmem_zalloc_large(bmv->bmv_count * + sizeof(struct getbmapx)); + if (!out) + return XFS_ERROR(ENOMEM); + } + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { + if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out_unlock_iolock; + } + /* + * even after flushing the inode, there can still be delalloc + * blocks on the inode beyond EOF due to speculative + * preallocation. These are not removed until the release + * function is called or the inode is inactivated. Hence we + * cannot assert here that ip->i_delayed_blks == 0. + */ + } + + lock = xfs_ilock_map_shared(ip); + + /* + * Don't let nex be bigger than the number of extents + * we can have assuming alternating holes and real extents. + */ + if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) + nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; + + bmapi_flags = xfs_bmapi_aflag(whichfork); + if (!(iflags & BMV_IF_PREALLOC)) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + /* + * Allocate enough space to handle "subnex" maps at a time. + */ + error = ENOMEM; + subnex = 16; + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); + if (!map) + goto out_unlock_ilock; + + bmv->bmv_entries = 0; + + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && + (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { + error = 0; + goto out_free_map; + } + + nexleft = nex; + + do { + nmap = (nexleft > subnex) ? 
subnex : nexleft;
+ error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+ XFS_BB_TO_FSB(mp, bmv->bmv_length),
+ map, &nmap, bmapi_flags);
+ if (error)
+ goto out_free_map;
+ ASSERT(nmap <= subnex);
+
+ for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+ out[cur_ext].bmv_oflags = 0;
+ if (map[i].br_state == XFS_EXT_UNWRITTEN)
+ out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
+ else if (map[i].br_startblock == DELAYSTARTBLOCK)
+ out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+ out[cur_ext].bmv_offset =
+ XFS_FSB_TO_BB(mp, map[i].br_startoff);
+ out[cur_ext].bmv_length =
+ XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ out[cur_ext].bmv_unused1 = 0;
+ out[cur_ext].bmv_unused2 = 0;
+
+ /*
+ * delayed allocation extents that start beyond EOF can
+ * occur due to speculative EOF allocation when the
+ * delalloc extent is larger than the largest freespace
+ * extent at conversion time. These extents cannot be
+ * converted by data writeback, so can exist here even
+ * if we are not supposed to be finding delalloc
+ * extents.
+ */
+ if (map[i].br_startblock == DELAYSTARTBLOCK &&
+ map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+ ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
+ if (map[i].br_startblock == HOLESTARTBLOCK &&
+ whichfork == XFS_ATTR_FORK) {
+ /* came to the end of attribute fork */
+ out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+ goto out_free_map;
+ }
+
+ if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+ prealloced, bmvend,
+ map[i].br_startblock))
+ goto out_free_map;
+
+ bmv->bmv_offset =
+ out[cur_ext].bmv_offset +
+ out[cur_ext].bmv_length;
+ bmv->bmv_length =
+ max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+ /*
+ * In case we don't want to return the hole,
+ * don't increase cur_ext so that we can reuse
+ * it in the next loop.
+ */
+ if ((iflags & BMV_IF_NO_HOLES) &&
+ map[i].br_startblock == HOLESTARTBLOCK) {
+ memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+ continue;
+ }
+
+ nexleft--;
+ bmv->bmv_entries++;
+ cur_ext++;
+ }
+ } while (nmap && nexleft && bmv->bmv_length);
+
+ out_free_map:
+ kmem_free(map);
+ out_unlock_ilock:
+ xfs_iunlock_map_shared(ip, lock);
+ out_unlock_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ for (i = 0; i < cur_ext; i++) {
+ int full = 0; /* user array is full */
+
+ /* format results & advance arg */
+ error = formatter(&arg, &out[i], &full);
+ if (error || full)
+ break;
+ }
+
+ if (is_vmalloc_addr(out))
+ kmem_free_large(out);
+ else
+ kmem_free(out);
+ return error;
+}
+
+/*
+ * dead simple method of punching delayed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+ struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t length)
+{
+ xfs_fileoff_t remaining = length;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ do {
+ int done;
+ xfs_bmbt_irec_t imap;
+ int nimaps = 1;
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+
+ /*
+ * Map the range first and check that it is a delalloc extent
+ * before trying to unmap the range. Otherwise we will be
+ * trying to remove a real extent (which requires a
+ * transaction) or a hole, which is probably a bad idea...
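xfs_bmap_punch_delalloc_range, whose body continues below, is strictly an error-path helper. A hedged caller sketch, honoring the ilock ASSERT at the top of the function; count here is a hypothetical block count:

        /* writeback setup failed over [start_fsb, start_fsb + count):
         * throw away the delalloc blocks backing the range */
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_bmap_punch_delalloc_range(ip, start_fsb, count);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);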
+ */
+ error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
+ XFS_BMAPI_ENTIRE);
+
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "Failed delalloc mapping lookup ino %lld fsb %lld.",
+ ip->i_ino, start_fsb);
+ }
+ break;
+ }
+ if (!nimaps) {
+ /* nothing there */
+ goto next_block;
+ }
+ if (imap.br_startblock != DELAYSTARTBLOCK) {
+ /* been converted, ignore */
+ goto next_block;
+ }
+ WARN_ON(imap.br_blockcount == 0);
+
+ /*
+ * Note: while we initialise the firstblock/flist pair, they
+ * should never be used because blocks should never be
+ * allocated or freed for a delalloc extent and hence we don't
+ * need to cancel or finish them after the xfs_bunmapi() call.
+ */
+ xfs_bmap_init(&flist, &firstblock);
+ error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+ &flist, &done);
+ if (error)
+ break;
+
+ ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+ start_fsb++;
+ remaining--;
+ } while (remaining > 0);
+
+ return error;
+}
+
+/*
+ * Test whether it is appropriate to check an inode for and free post EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+ /* prealloc/delalloc exists only on regular files */
+ if (!S_ISREG(ip->i_d.di_mode))
+ return false;
+
+ /*
+ * Zero sized files with no cached pages and delalloc blocks will not
+ * have speculative prealloc/delalloc blocks to remove.
+ */
+ if (VFS_I(ip)->i_size == 0 &&
+ VN_CACHED(VFS_I(ip)) == 0 &&
+ ip->i_delayed_blks == 0)
+ return false;
+
+ /* If we haven't read in the extent list, then don't do it now. */
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+ return false;
+
+ /*
+ * Do not free real preallocated or append-only files unless the file
+ * has delalloc blocks and we are forced to remove them.
+ */
+ if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+ if (!force || ip->i_delayed_blks == 0)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is called by xfs_inactive to free any blocks beyond eof
+ * when the link count isn't zero and by xfs_dm_punch_hole() when
+ * punching a hole to EOF.
+ */
+int
+xfs_free_eofblocks(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip,
+ bool need_iolock)
+{
+ xfs_trans_t *tp;
+ int error;
+ xfs_fileoff_t end_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_filblks_t map_len;
+ int nimaps;
+ xfs_bmbt_irec_t imap;
+
+ /*
+ * Figure out if there are any blocks beyond the end
+ * of the file. If not, then there is nothing to do.
+ */
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+ last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ if (last_fsb <= end_fsb)
+ return 0;
+ map_len = last_fsb - end_fsb;
+
+ nimaps = 1;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!error && (nimaps != 0) &&
+ (imap.br_startblock != HOLESTARTBLOCK ||
+ ip->i_delayed_blks)) {
+ /*
+ * Attach the dquots to the inode up front.
+ */
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ /*
+ * There are blocks after the end of file.
+ * Free them up now by truncating the file to
+ * its current size.
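xfs_can_free_eofblocks is the cheap gate in front of the transaction-heavy xfs_free_eofblocks, whose body continues below. A hedged sketch of how a caller pairs them; note the EAGAIN return when the iolock cannot be taken:

        /* trim speculative post-EOF preallocation when it can possibly help;
         * force == false leaves PREALLOC/APPEND files alone */
        if (xfs_can_free_eofblocks(ip, false)) {
                error = xfs_free_eofblocks(mp, ip, true);  /* take the iolock */
                if (error && error != EAGAIN)
                        return error;
        }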
+ */ + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + if (need_iolock) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + xfs_trans_cancel(tp, 0); + return EAGAIN; + } + } + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Do not update the on-disk file size. If we update the + * on-disk file size and then the system crashes before the + * contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip)); + if (error) { + /* + * If we get an error at this point we simply don't + * bother truncating the file. + */ + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + } else { + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES); + if (!error) + xfs_inode_clear_eofblocks_tag(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + return error; +} + +/* + * xfs_alloc_file_space() + * This routine allocates disk space for the given file. + * + * If alloc_type == 0, this request is for an ALLOCSP type + * request which will change the file size. In this case, no + * DMAPI event will be generated by the call. A TRUNCATE event + * will be generated later by xfs_setattr. + * + * If alloc_type != 0, this request is for a RESVSP type + * request, and a DMAPI DM_EVENT_WRITE will be generated if the + * lower block boundary byte address is less than the file's + * length. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +STATIC int +xfs_alloc_file_space( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t len, + int alloc_type, + int attr_flags) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_off_t count; + xfs_filblks_t allocated_fsb; + xfs_filblks_t allocatesize_fsb; + xfs_extlen_t extsz, temp; + xfs_fileoff_t startoffset_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmbt_irec_t imaps[1], *imapp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + trace_xfs_alloc_file_space(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + if (len <= 0) + return XFS_ERROR(EINVAL); + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + count = len; + imapp = &imaps[0]; + nimaps = 1; + startoffset_fsb = XFS_B_TO_FSBT(mp, offset); + allocatesize_fsb = XFS_B_TO_FSB(mp, count); + + /* + * Allocate file space until done or until there is an error + */ + while (allocatesize_fsb && !error) { + xfs_fileoff_t s, e; + + /* + * Determine space reservations for data/realtime. + */ + if (unlikely(extsz)) { + s = startoffset_fsb; + do_div(s, extsz); + s *= extsz; + e = startoffset_fsb + allocatesize_fsb; + if ((temp = do_mod(startoffset_fsb, extsz))) + e += temp; + if ((temp = do_mod(e, extsz))) + e += extsz - temp; + } else { + s = 0; + e = allocatesize_fsb; + } + + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. 
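The s/e arithmetic above widens the request outward to extent size hint boundaries before sizing the reservation. A worked example with assumed values extsz = 16, startoffset_fsb = 21, allocatesize_fsb = 10, so the raw request covers fsblocks [21, 31):

        s = startoffset_fsb;                    /* s = 21 */
        do_div(s, extsz);
        s *= extsz;                             /* s = 16: start rounded down */
        e = startoffset_fsb + allocatesize_fsb; /* e = 31 */
        temp = do_mod(startoffset_fsb, extsz);  /* temp = 5 */
        e += temp;                              /* e = 36: widened by start skew */
        temp = do_mod(e, extsz);                /* temp = 4 */
        e += extsz - temp;                      /* e = 48: end rounded up */
        /* the reservation is then sized for the aligned span [16, 48) */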
+ */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + if (unlikely(rt)) { + resrtextents = qblocks = resblks; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); + /* + * Check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, + 0, quota_flag); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); + if (error) { + goto error0; + } + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) { + break; + } + + allocated_fsb = imapp->br_blockcount; + + if (nimaps == 0) { + error = XFS_ERROR(ENOSPC); + break; + } + + startoffset_fsb += allocated_fsb; + allocatesize_fsb -= allocated_fsb; + } + + return error; + +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Zero file bytes between startoff and endoff inclusive. + * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. + */ +STATIC int +xfs_zero_remaining_bytes( + xfs_inode_t *ip, + xfs_off_t startoff, + xfs_off_t endoff) +{ + xfs_bmbt_irec_t imap; + xfs_fileoff_t offset_fsb; + xfs_off_t lastoffset; + xfs_off_t offset; + xfs_buf_t *bp; + xfs_mount_t *mp = ip->i_mount; + int nimap; + int error = 0; + + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= XFS_ISIZE(ip)) + return 0; + + if (endoff > XFS_ISIZE(ip)) + endoff = XFS_ISIZE(ip); + + bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? 
+ mp->m_rtdev_targp : mp->m_ddev_targp, + BTOBB(mp->m_sb.sb_blocksize), 0); + if (!bp) + return XFS_ERROR(ENOMEM); + + xfs_buf_unlock(bp); + + for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimap = 1; + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); + if (error || nimap < 1) + break; + ASSERT(imap.br_blockcount >= 1); + ASSERT(imap.br_startoff == offset_fsb); + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; + if (lastoffset > endoff) + lastoffset = endoff; + if (imap.br_startblock == HOLESTARTBLOCK) + continue; + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + if (imap.br_state == XFS_EXT_UNWRITTEN) + continue; + XFS_BUF_UNDONE(bp); + XFS_BUF_UNWRITE(bp); + XFS_BUF_READ(bp); + XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); + xfsbdstrat(mp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(read)"); + break; + } + memset(bp->b_addr + + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); + XFS_BUF_UNDONE(bp); + XFS_BUF_UNREAD(bp); + XFS_BUF_WRITE(bp); + xfsbdstrat(mp, bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(write)"); + break; + } + } + xfs_buf_free(bp); + return error; +} + +/* + * xfs_free_file_space() + * This routine frees disk space for the given file. + * + * This routine is only called by xfs_change_file_space + * for an UNRESVSP type call. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +STATIC int +xfs_free_file_space( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t len, + int attr_flags) +{ + int committed; + int done; + xfs_fileoff_t endoffset_fsb; + int error; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + xfs_bmbt_irec_t imap; + xfs_off_t ioffset; + xfs_extlen_t mod=0; + xfs_mount_t *mp; + int nimap; + uint resblks; + xfs_off_t rounding; + int rt; + xfs_fileoff_t startoffset_fsb; + xfs_trans_t *tp; + int need_iolock = 1; + + mp = ip->i_mount; + + trace_xfs_free_file_space(ip); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + error = 0; + if (len <= 0) /* if nothing being freed */ + return error; + rt = XFS_IS_REALTIME_INODE(ip); + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + + if (attr_flags & XFS_ATTR_NOLOCK) + need_iolock = 0; + if (need_iolock) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + /* wait for the completion of any pending DIOs */ + inode_dio_wait(VFS_I(ip)); + } + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + ioffset = offset & ~(rounding - 1); + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ioffset, -1); + if (error) + goto out_unlock_iolock; + truncate_pagecache_range(VFS_I(ip), ioffset, -1); + + /* + * Need to zero the stuff we're not freeing, on disk. + * If it's a realtime file & can't use unwritten extents then we + * actually need to zero the extent edges. Otherwise xfs_bunmapi + * will take care of it for us. 
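xfs_zero_remaining_bytes above is a textbook read-modify-write through one uncached buffer: read the block raw, zero only the edge bytes, write it back. The per-block core, condensed; off and len stand in for the offset arithmetic done in the loop:

        XFS_BUF_UNDONE(bp);
        XFS_BUF_UNWRITE(bp);
        XFS_BUF_READ(bp);
        XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
        xfsbdstrat(mp, bp);                     /* issue the raw read */
        error = xfs_buf_iowait(bp);
        if (error)
                break;                          /* bail like the real loop */
        memset(bp->b_addr + off, 0, len);       /* zero just the edge bytes */
        XFS_BUF_UNDONE(bp);
        XFS_BUF_UNREAD(bp);
        XFS_BUF_WRITE(bp);
        xfsbdstrat(mp, bp);                     /* write the block back */
        error = xfs_buf_iowait(bp);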
+ */ + if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + nimap = 1; + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); + if (error) + goto out_unlock_iolock; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } + nimap = 1; + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); + if (error) + goto out_unlock_iolock; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && (mod != mp->m_sb.sb_rextsize)) + endoffset_fsb -= mod; + } + } + if ((done = (endoffset_fsb <= startoffset_fsb))) + /* + * One contiguous piece to clear + */ + error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); + else { + /* + * Some full blocks, possibly two pieces to clear + */ + if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) + error = xfs_zero_remaining_bytes(ip, offset, + XFS_FSB_TO_B(mp, startoffset_fsb) - 1); + if (!error && + XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) + error = xfs_zero_remaining_bytes(ip, + XFS_FSB_TO_B(mp, endoffset_fsb), + offset + len - 1); + } + + /* + * free file space until done or until there is an error + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + while (!error && !done) { + + /* + * allocate and setup the transaction. Allow this + * transaction to dip into the reserve blocks to ensure + * the freeing of the space succeeds at ENOSPC. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); + + /* + * check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, + ip->i_udquot, ip->i_gdquot, ip->i_pdquot, + resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + /* + * issue the bunmapi() call to free the blocks + */ + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, + 0, 2, &firstfsb, &free_list, &done); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + out_unlock_iolock: + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + + error0: + xfs_bmap_cancel(&free_list); + error1: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) : + XFS_ILOCK_EXCL); + return error; +} + + +STATIC int +xfs_zero_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + int attr_flags) +{ + struct xfs_mount *mp = ip->i_mount; + uint granularity; + xfs_off_t start_boundary; + xfs_off_t end_boundary; + int error; + + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + + /* + * Round the range of extents we are going to convert inwards. 
If the + * offset is aligned, then it doesn't get changed so we zero from the + * start of the block offset points to. + */ + start_boundary = round_up(offset, granularity); + end_boundary = round_down(offset + len, granularity); + + ASSERT(start_boundary >= offset); + ASSERT(end_boundary <= offset + len); + + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (start_boundary < end_boundary - 1) { + /* punch out the page cache over the conversion range */ + truncate_pagecache_range(VFS_I(ip), start_boundary, + end_boundary - 1); + /* convert the blocks */ + error = xfs_alloc_file_space(ip, start_boundary, + end_boundary - start_boundary - 1, + XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT, + attr_flags); + if (error) + goto out_unlock; + + /* We've handled the interior of the range, now for the edges */ + if (start_boundary != offset) + error = xfs_iozero(ip, offset, start_boundary - offset); + if (error) + goto out_unlock; + + if (end_boundary != offset + len) + error = xfs_iozero(ip, end_boundary, + offset + len - end_boundary); + + } else { + /* + * It's either a sub-granularity range or the range spanned lies + * partially across two adjacent blocks. + */ + error = xfs_iozero(ip, offset, len); + } + +out_unlock: + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + +} + +/* + * xfs_change_file_space() + * This routine allocates or frees disk space for the given file. + * The user specified parameters are checked for alignment and size + * limitations. + * + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_change_file_space( + xfs_inode_t *ip, + int cmd, + xfs_flock64_t *bf, + xfs_off_t offset, + int attr_flags) +{ + xfs_mount_t *mp = ip->i_mount; + int clrprealloc; + int error; + xfs_fsize_t fsize; + int setprealloc; + xfs_off_t startoffset; + xfs_trans_t *tp; + struct iattr iattr; + + if (!S_ISREG(ip->i_d.di_mode)) + return XFS_ERROR(EINVAL); + + switch (bf->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + bf->l_start += offset; + break; + case 2: /*SEEK_END*/ + bf->l_start += XFS_ISIZE(ip); + break; + default: + return XFS_ERROR(EINVAL); + } + + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. 
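+	 *
+	 * (For ALLOCSP/FREESP the new file size is taken from l_start
+	 * alone - see the xfs_setattr_size() call below - so l_len
+	 * carries no information for those commands.)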
+ */
+	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		if (bf->l_len <= 0)
+			return XFS_ERROR(EINVAL);
+		break;
+	default:
+		bf->l_len = 0;
+		break;
+	}
+
+	if (bf->l_start < 0 ||
+	    bf->l_start > mp->m_super->s_maxbytes ||
+	    bf->l_start + bf->l_len < 0 ||
+	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
+		return XFS_ERROR(EINVAL);
+
+	bf->l_whence = 0;
+
+	startoffset = bf->l_start;
+	fsize = XFS_ISIZE(ip);
+
+	setprealloc = clrprealloc = 0;
+	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+		error = xfs_zero_file_space(ip, startoffset, bf->l_len,
+						attr_flags);
+		if (error)
+			return error;
+		setprealloc = 1;
+		break;
+
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
+						XFS_BMAPI_PREALLOC, attr_flags);
+		if (error)
+			return error;
+		setprealloc = 1;
+		break;
+
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
+								attr_flags)))
+			return error;
+		break;
+
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_FREESP64:
+		/*
+		 * These operations actually do IO when extending the file, but
+		 * the allocation is done separately to the zeroing that is
+		 * done. This set of operations needs to be serialised against
+		 * other IO operations, such as truncate and buffered IO. We
+		 * need to take the IOLOCK here to serialise the allocation and
+		 * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
+		 * truncate, direct IO) from racing against the transient
+		 * allocated but not written state we can have here.
+		 */
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		if (startoffset > fsize) {
+			error = xfs_alloc_file_space(ip, fsize,
+					startoffset - fsize, 0,
+					attr_flags | XFS_ATTR_NOLOCK);
+			if (error) {
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+				break;
+			}
+		}
+
+		iattr.ia_valid = ATTR_SIZE;
+		iattr.ia_size = startoffset;
+
+		error = xfs_setattr_size(ip, &iattr,
+					 attr_flags | XFS_ATTR_NOLOCK);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+		if (error)
+			return error;
+
+		clrprealloc = 1;
+		break;
+
+	default:
+		ASSERT(0);
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * update the inode timestamp, mode, and prealloc flag bits
+	 */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	if ((attr_flags & XFS_ATTR_DMI) == 0) {
+		ip->i_d.di_mode &= ~S_ISUID;
+
+		/*
+		 * Note that we don't have to worry about mandatory
+		 * file locking being disabled here because we only
+		 * clear the S_ISGID bit if the Group execute bit is
+		 * on, but if it was on then mandatory locking wouldn't
+		 * have been enabled.
+		 */
+		if (ip->i_d.di_mode & S_IXGRP)
+			ip->i_d.di_mode &= ~S_ISGID;
+
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	}
+	if (setprealloc)
+		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+	else if (clrprealloc)
+		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	if (attr_flags & XFS_ATTR_SYNC)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
+}
+
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. 
This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6. If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. Basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip)	/* tmp inode */
+{
+
+	/* Should never get a local format */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		return EINVAL;
+
+	/*
+	 * if the target inode has fewer extents than the temporary inode then
+	 * why did userspace call us?
+	 */
+	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+		return EINVAL;
+
+	/*
+	 * if the target inode is in extent form and the temp inode is in btree
+	 * form then we will end up with the target inode in the wrong format
+	 * as we already know there are fewer extents in the temp inode.
+	 */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+		return EINVAL;
+
+	/* Check temp in extent form to max in target */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+		return EINVAL;
+
+	/* Check target in extent form to max in temp */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+		return EINVAL;
+
+	/*
+	 * If we are in a btree format, check that the temp root block will fit
+	 * in the target and that it has enough extents to be in btree format
+	 * in the target.
+	 *
+	 * Note that we have to be careful to allow btree->extent conversions
+	 * (a common defrag case) which will occur when the temp inode is in
+	 * extent format...
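+ *
+ * (In short: after the swap each data fork must still be valid in the
+ * format it will carry - an extent-format fork must fit within its new
+ * home's inline extent limit, and a btree root must fit within the new
+ * fork offset.)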
+ */
+	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		if (XFS_IFORK_BOFF(ip) &&
+		    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
+			return EINVAL;
+		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+			return EINVAL;
+	}
+
+	/* Reciprocal target->temp btree format checks */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		if (XFS_IFORK_BOFF(tip) &&
+		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+			return EINVAL;
+		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+			return EINVAL;
+	}
+
+	return 0;
+}
+
+int
+xfs_swap_extents(
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip,	/* tmp inode */
+	xfs_swapext_t	*sxp)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_trans_t	*tp;
+	xfs_bstat_t	*sbp = &sxp->sx_stat;
+	xfs_ifork_t	*tempifp, *ifp, *tifp;
+	int		src_log_flags, target_log_flags;
+	int		error = 0;
+	int		aforkblks = 0;
+	int		taforkblks = 0;
+	__uint64_t	tmp;
+
+	/*
+	 * We have no way of updating owner information in the BMBT blocks for
+	 * each inode on CRC enabled filesystems, so to avoid corrupting
+	 * this metadata we simply don't allow extent swaps to occur.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return XFS_ERROR(EINVAL);
+
+	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+	if (!tempifp) {
+		error = XFS_ERROR(ENOMEM);
+		goto out;
+	}
+
+	/*
+	 * we have to do two separate lock calls here to keep lockdep
+	 * happy. If we try to get all the locks in one call, lockdep will
+	 * report false positives when we drop the ILOCK and regain them
+	 * below.
+	 */
+	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+	/* Verify that both files have the same format */
+	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
+		error = XFS_ERROR(EINVAL);
+		goto out_unlock;
+	}
+
+	/* Verify both files are either real-time or non-realtime */
+	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+		error = XFS_ERROR(EINVAL);
+		goto out_unlock;
+	}
+
+	error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+	if (error)
+		goto out_unlock;
+	truncate_pagecache_range(VFS_I(tip), 0, -1);
+
+	/* Verify O_DIRECT for ftmp */
+	if (VN_CACHED(VFS_I(tip)) != 0) {
+		error = XFS_ERROR(EINVAL);
+		goto out_unlock;
+	}
+
+	/* Verify all data are being swapped */
+	if (sxp->sx_offset != 0 ||
+	    sxp->sx_length != ip->i_d.di_size ||
+	    sxp->sx_length != tip->i_d.di_size) {
+		error = XFS_ERROR(EFAULT);
+		goto out_unlock;
+	}
+
+	trace_xfs_swap_extent_before(ip, 0);
+	trace_xfs_swap_extent_before(tip, 1);
+
+	/* check inode formats now that data is flushed */
+	error = xfs_swap_extents_check_format(ip, tip);
+	if (error) {
+		xfs_notice(mp,
+		    "%s: inode 0x%llx format is incompatible for exchanging.",
+				__func__, ip->i_ino);
+		goto out_unlock;
+	}
+
+	/*
+	 * Compare the current change & modify times with that
+	 * passed in. If they differ, we abort this swap.
+	 * This is the mechanism used to assure the calling
+	 * process that the file was not changed out from
+	 * under it.
+	 */
+	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+		error = XFS_ERROR(EBUSY);
+		goto out_unlock;
+	}
+
+	/* We need to fail if the file is memory mapped. Once we have tossed
+	 * all existing pages, the page fault will have no option
+	 * but to go to the filesystem for pages. 
By making the page fault call + * vop_read (or write in the case of autogrow) they block on the iolock + * until we have switched the extents. + */ + if (VN_MAPPED(VFS_I(ip))) { + error = XFS_ERROR(EBUSY); + goto out_unlock; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); + + /* + * There is a race condition here since we gave up the + * ilock. However, the data fork will not change since + * we have the iolock (locked for truncation too) so we + * are safe. We don't really care if non-io related + * fields change. + */ + truncate_pagecache_range(VFS_I(ip), 0, -1); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_IOLOCK_EXCL); + xfs_trans_cancel(tp, 0); + goto out; + } + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + + /* + * Count the number of extended attribute blocks + */ + if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && + (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + if (error) + goto out_trans_cancel; + } + if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && + (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + &taforkblks); + if (error) + goto out_trans_cancel; + } + + /* + * Swap the data forks of the inodes + */ + ifp = &ip->i_df; + tifp = &tip->i_df; + *tempifp = *ifp; /* struct copy */ + *ifp = *tifp; /* struct copy */ + *tifp = *tempifp; /* struct copy */ + + /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks; + ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; + tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + + tmp = (__uint64_t) ip->i_d.di_nextents; + ip->i_d.di_nextents = tip->i_d.di_nextents; + tip->i_d.di_nextents = tmp; + + tmp = (__uint64_t) ip->i_d.di_format; + ip->i_d.di_format = tip->i_d.di_format; + tip->i_d.di_format = tmp; + + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + + src_log_flags = XFS_ILOG_CORE; + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + ifp->if_u1.if_extents = + ifp->if_u2.if_inline_ext; + } + src_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + src_log_flags |= XFS_ILOG_DBROOT; + break; + } + + target_log_flags = XFS_ILOG_CORE; + switch (tip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. 
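+		 * (After the struct copy above, if_u1.if_extents still
+		 * points at the other inode's inline array, so it must
+		 * be re-aimed at this inode's own copy.)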
+ */ + if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + tifp->if_u1.if_extents = + tifp->if_u2.if_inline_ext; + } + target_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + target_log_flags |= XFS_ILOG_DBROOT; + break; + } + + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + xfs_trans_log_inode(tp, ip, src_log_flags); + xfs_trans_log_inode(tp, tip, target_log_flags); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, 0); + + trace_xfs_swap_extent_after(ip, 0); + trace_xfs_swap_extent_after(tip, 1); +out: + kmem_free(tempifp); + return error; + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + goto out; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + goto out_unlock; +} diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h new file mode 100644 index 000000000000..061260946f7a --- /dev/null +++ b/fs/xfs/xfs_bmap_util.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_UTIL_H__ +#define __XFS_BMAP_UTIL_H__ + +/* Kernel only BMAP related definitions and functions */ + +struct xfs_bmbt_irec; +struct xfs_bmap_free_item; +struct xfs_ifork; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Argument structure for xfs_bmap_alloc. 
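+ *
+ * Fields marked i/o are filled in by the caller and updated by the
+ * allocator; blkno returns the starting block of the new extent.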
+ */ +struct xfs_bmalloca { + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ + + xfs_extlen_t total; /* total blocks needed for xaction */ + xfs_extlen_t minlen; /* minimum allocation size (blocks) */ + xfs_extlen_t minleft; /* amount must be left after alloc */ + char eof; /* set if allocating past last extent */ + char wasdel; /* replacing a delayed allocation */ + char userdata;/* set if is user data */ + char aeof; /* allocated space at eof */ + char conv; /* overwriting unwritten extents */ + char stack_switch; + int flags; + struct completion *done; + struct work_struct work; + int result; +}; + +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); +int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); +int xfs_bmapi_allocate(struct xfs_bmalloca *args); +int __xfs_bmapi_allocate(struct xfs_bmalloca *args); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); + +/* bmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); + +/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ +void xfs_bmap_del_free(struct xfs_bmap_free *flist, + struct xfs_bmap_free_item *prev, + struct xfs_bmap_free_item *free); +int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, + struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, + int rt, int eof, int delay, int convert, + xfs_fileoff_t *offp, xfs_extlen_t *lenp); +void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *rec, + int *is_empty); + +/* preallocation and hole punch interface */ +int xfs_change_file_space(struct xfs_inode *ip, int cmd, + xfs_flock64_t *bf, xfs_off_t offset, + int attr_flags); + +/* EOF block manipulation functions */ +bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); +int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, + bool need_iolock); + +int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, + struct xfs_swapext *sx); + +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + +#endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 0903960410a2..7a2b4da3c0db 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -510,7 +510,7 @@ xfs_btree_ptr_addr( } /* - * Get a the root block which is stored in the inode. + * Get the root block which is stored in the inode. 
* * For now this btree implementation assumes the btree root is always * stored in the if_broot field of an inode fork. @@ -978,6 +978,7 @@ xfs_btree_init_block_int( buf->bb_u.l.bb_owner = cpu_to_be64(owner); uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); buf->bb_u.l.bb_pad = 0; + buf->bb_u.l.bb_lsn = 0; } } else { /* owner is a 32 bit value on short blocks */ @@ -989,6 +990,7 @@ xfs_btree_init_block_int( buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); buf->bb_u.s.bb_owner = cpu_to_be32(__owner); uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.s.bb_lsn = 0; } } } @@ -1684,7 +1686,7 @@ xfs_lookup_get_search_key( /* * Lookup the record. The cursor is made to point to it, based on dir. - * Return 0 if can't find any such record, 1 for success. + * stat is set to 0 if can't find any such record, 1 for success. */ int /* error */ xfs_btree_lookup( @@ -2756,7 +2758,6 @@ xfs_btree_make_block_unfull( if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ - xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); } else { /* A root block that needs replacing */ diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 55e3c7cc3c3d..c8473c7ef45e 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -88,13 +88,11 @@ struct xfs_btree_block { #define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) #define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) - #define XFS_BTREE_SBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.s.bb_crc) #define XFS_BTREE_LBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.l.bb_crc) - /* * Generic key, ptr and record wrapper structures. * diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1b2472a46e46..c06823fe10d3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -35,6 +35,7 @@ #include <linux/freezer.h> #include "xfs_sb.h" +#include "xfs_trans_resv.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -303,7 +304,7 @@ _xfs_buf_free_pages( * Releases the specified buffer. * * The modification state of any associated pages is left unchanged. - * The buffer most not be on any hash - use xfs_buf_rele instead for + * The buffer must not be on any hash - use xfs_buf_rele instead for * hashed and refcounted buffers */ void @@ -1621,7 +1622,7 @@ xfs_setsize_buftarg_flags( /* * When allocating the initial buffer target we have not yet * read in the superblock, so don't know what sized sectors - * are being used is at this early stage. Play safe. + * are being used at this early stage. Play safe. */ STATIC int xfs_setsize_buftarg_early( diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index bfc4e0c26fd3..3a944b198e35 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -39,6 +39,14 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +static inline int +xfs_buf_log_format_size( + struct xfs_buf_log_format *blfp) +{ + return offsetof(struct xfs_buf_log_format, blf_data_map) + + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); +} + /* * This returns the number of log iovecs needed to log the * given buf log item. @@ -49,25 +57,27 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); * * If the XFS_BLI_STALE flag has been set, then log nothing. 
*/ -STATIC uint +STATIC void xfs_buf_item_size_segment( struct xfs_buf_log_item *bip, - struct xfs_buf_log_format *blfp) + struct xfs_buf_log_format *blfp, + int *nvecs, + int *nbytes) { struct xfs_buf *bp = bip->bli_buf; - uint nvecs; int next_bit; int last_bit; last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (last_bit == -1) - return 0; + return; /* * initial count for a dirty buffer is 2 vectors - the format structure * and the first dirty region. */ - nvecs = 2; + *nvecs += 2; + *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK; while (last_bit != -1) { /* @@ -87,18 +97,17 @@ xfs_buf_item_size_segment( break; } else if (next_bit != last_bit + 1) { last_bit = next_bit; - nvecs++; + (*nvecs)++; } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + XFS_BLF_CHUNK)) { last_bit = next_bit; - nvecs++; + (*nvecs)++; } else { last_bit++; } + *nbytes += XFS_BLF_CHUNK; } - - return nvecs; } /* @@ -118,12 +127,13 @@ xfs_buf_item_size_segment( * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log * format structures. */ -STATIC uint +STATIC void xfs_buf_item_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); - uint nvecs; int i; ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -135,7 +145,11 @@ xfs_buf_item_size( */ trace_xfs_buf_item_size_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - return bip->bli_format_count; + *nvecs += bip->bli_format_count; + for (i = 0; i < bip->bli_format_count; i++) { + *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]); + } + return; } ASSERT(bip->bli_flags & XFS_BLI_LOGGED); @@ -147,7 +161,8 @@ xfs_buf_item_size( * commit, so no vectors are used at all. */ trace_xfs_buf_item_size_ordered(bip); - return XFS_LOG_VEC_ORDERED; + *nvecs = XFS_LOG_VEC_ORDERED; + return; } /* @@ -159,13 +174,11 @@ xfs_buf_item_size( * count for the extra buf log format structure that will need to be * written. */ - nvecs = 0; for (i = 0; i < bip->bli_format_count; i++) { - nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]); + xfs_buf_item_size_segment(bip, &bip->bli_formats[i], + nvecs, nbytes); } - trace_xfs_buf_item_size(bip); - return nvecs; } static struct xfs_log_iovec * @@ -192,8 +205,7 @@ xfs_buf_item_format_segment( * the actual size of the dirty bitmap rather than the size of the in * memory structure. */ - base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + - (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + base_size = xfs_buf_log_format_size(blfp); nvecs = 0; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); @@ -601,11 +613,9 @@ xfs_buf_item_unlock( } } } - if (clean) - xfs_buf_item_relse(bp); - else if (aborted) { + if (clean || aborted) { if (atomic_dec_and_test(&bip->bli_refcount)) { - ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_buf_item_relse(bp); } } else diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 0f1c247dc680..db6371087fe8 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -18,101 +18,9 @@ #ifndef __XFS_BUF_ITEM_H__ #define __XFS_BUF_ITEM_H__ -extern kmem_zone_t *xfs_buf_item_zone; - -/* - * This flag indicates that the buffer contains on disk inodes - * and requires special recovery handling. 
- */ -#define XFS_BLF_INODE_BUF (1<<0) -/* - * This flag indicates that the buffer should not be replayed - * during recovery because its blocks are being freed. - */ -#define XFS_BLF_CANCEL (1<<1) - -/* - * This flag indicates that the buffer contains on disk - * user or group dquots and may require special recovery handling. - */ -#define XFS_BLF_UDQUOT_BUF (1<<2) -#define XFS_BLF_PDQUOT_BUF (1<<3) -#define XFS_BLF_GDQUOT_BUF (1<<4) - -#define XFS_BLF_CHUNK 128 -#define XFS_BLF_SHIFT 7 -#define BIT_TO_WORD_SHIFT 5 -#define NBWORD (NBBY * sizeof(unsigned int)) - -/* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - */ -#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) +/* kernel only definitions */ -typedef struct xfs_buf_log_format { - unsigned short blf_type; /* buf log item type indicator */ - unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ - __int64_t blf_blkno; /* starting blkno of this buf */ - unsigned int blf_map_size; /* used size of data bitmap in words */ - unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ -} xfs_buf_log_format_t; - -/* - * All buffers now need to tell recovery where the magic number - * is so that it can verify and calculate the CRCs on the buffer correctly - * once the changes have been replayed into the buffer. - * - * The type value is held in the upper 5 bits of the blf_flags field, which is - * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. - */ -#define XFS_BLFT_BITS 5 -#define XFS_BLFT_SHIFT 11 -#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) - -enum xfs_blft { - XFS_BLFT_UNKNOWN_BUF = 0, - XFS_BLFT_UDQUOT_BUF, - XFS_BLFT_PDQUOT_BUF, - XFS_BLFT_GDQUOT_BUF, - XFS_BLFT_BTREE_BUF, - XFS_BLFT_AGF_BUF, - XFS_BLFT_AGFL_BUF, - XFS_BLFT_AGI_BUF, - XFS_BLFT_DINO_BUF, - XFS_BLFT_SYMLINK_BUF, - XFS_BLFT_DIR_BLOCK_BUF, - XFS_BLFT_DIR_DATA_BUF, - XFS_BLFT_DIR_FREE_BUF, - XFS_BLFT_DIR_LEAF1_BUF, - XFS_BLFT_DIR_LEAFN_BUF, - XFS_BLFT_DA_NODE_BUF, - XFS_BLFT_ATTR_LEAF_BUF, - XFS_BLFT_ATTR_RMT_BUF, - XFS_BLFT_SB_BUF, - XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), -}; - -static inline void -xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) -{ - ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); - blf->blf_flags &= ~XFS_BLFT_MASK; - blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); -} - -static inline __uint16_t -xfs_blft_from_flags(struct xfs_buf_log_format *blf) -{ - return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; -} - -/* - * buf log item flags - */ +/* buf log item flags */ #define XFS_BLI_HOLD 0x01 #define XFS_BLI_DIRTY 0x02 #define XFS_BLI_STALE 0x04 @@ -133,8 +41,6 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf) { XFS_BLI_ORDERED, "ORDERED" } -#ifdef __KERNEL__ - struct xfs_buf; struct xfs_mount; struct xfs_buf_log_item; @@ -169,6 +75,6 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, enum xfs_blft); void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); -#endif /* __KERNEL__ */ +extern kmem_zone_t *xfs_buf_item_zone; #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 0b8b2a13cd24..d4e59a4ff59f 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -27,8 +27,8 @@ #include "xfs_mount.h" #include "xfs_da_btree.h" #include 
"xfs_bmap_btree.h" -#include "xfs_dir2.h" #include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -399,7 +399,7 @@ xfs_da3_split( struct xfs_da_intnode *node; struct xfs_buf *bp; int max; - int action; + int action = 0; int error; int i; @@ -2454,9 +2454,9 @@ static int xfs_buf_map_from_irec( struct xfs_mount *mp, struct xfs_buf_map **mapp, - unsigned int *nmaps, + int *nmaps, struct xfs_bmbt_irec *irecs, - unsigned int nirecs) + int nirecs) { struct xfs_buf_map *map; int i; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 6fb3371c63cf..b1f267995dea 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -133,12 +133,19 @@ extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from); static inline int -xfs_da3_node_hdr_size(struct xfs_da_intnode *dap) +__xfs_da3_node_hdr_size(bool v3) { - if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) + if (v3) return sizeof(struct xfs_da3_node_hdr); return sizeof(struct xfs_da_node_hdr); } +static inline int +xfs_da3_node_hdr_size(struct xfs_da_intnode *dap) +{ + bool v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC); + + return __xfs_da3_node_hdr_size(v3); +} static inline struct xfs_da_node_entry * xfs_da3_node_tree_p(struct xfs_da_intnode *dap) @@ -176,6 +183,7 @@ enum xfs_dacmp { typedef struct xfs_da_args { const __uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ + __uint8_t filetype; /* filetype of inode for directories */ __uint8_t *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ int flags; /* argument flags (eg: ATTR_NOCREATE) */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c deleted file mode 100644 index e36445ceaf80..000000000000 --- a/fs/xfs/xfs_dfrag.c +++ /dev/null @@ -1,459 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_itable.h" -#include "xfs_dfrag.h" -#include "xfs_error.h" -#include "xfs_vnodeops.h" -#include "xfs_trace.h" - - -static int xfs_swap_extents( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip, /* tmp inode */ - xfs_swapext_t *sxp); - -/* - * ioctl interface for swapext - */ -int -xfs_swapext( - xfs_swapext_t *sxp) -{ - xfs_inode_t *ip, *tip; - struct fd f, tmp; - int error = 0; - - /* Pull information for the target fd */ - f = fdget((int)sxp->sx_fdtarget); - if (!f.file) { - error = XFS_ERROR(EINVAL); - goto out; - } - - if (!(f.file->f_mode & FMODE_WRITE) || - !(f.file->f_mode & FMODE_READ) || - (f.file->f_flags & O_APPEND)) { - error = XFS_ERROR(EBADF); - goto out_put_file; - } - - tmp = fdget((int)sxp->sx_fdtmp); - if (!tmp.file) { - error = XFS_ERROR(EINVAL); - goto out_put_file; - } - - if (!(tmp.file->f_mode & FMODE_WRITE) || - !(tmp.file->f_mode & FMODE_READ) || - (tmp.file->f_flags & O_APPEND)) { - error = XFS_ERROR(EBADF); - goto out_put_tmp_file; - } - - if (IS_SWAPFILE(file_inode(f.file)) || - IS_SWAPFILE(file_inode(tmp.file))) { - error = XFS_ERROR(EINVAL); - goto out_put_tmp_file; - } - - ip = XFS_I(file_inode(f.file)); - tip = XFS_I(file_inode(tmp.file)); - - if (ip->i_mount != tip->i_mount) { - error = XFS_ERROR(EINVAL); - goto out_put_tmp_file; - } - - if (ip->i_ino == tip->i_ino) { - error = XFS_ERROR(EINVAL); - goto out_put_tmp_file; - } - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - error = XFS_ERROR(EIO); - goto out_put_tmp_file; - } - - error = xfs_swap_extents(ip, tip, sxp); - - out_put_tmp_file: - fdput(tmp); - out_put_file: - fdput(f); - out: - return error; -} - -/* - * We need to check that the format of the data fork in the temporary inode is - * valid for the target inode before doing the swap. This is not a problem with - * attr1 because of the fixed fork offset, but attr2 has a dynamically sized - * data fork depending on the space the attribute fork is taking so we can get - * invalid formats on the target inode. - * - * E.g. target has space for 7 extents in extent format, temp inode only has - * space for 6. If we defragment down to 7 extents, then the tmp format is a - * btree, but when swapped it needs to be in extent format. Hence we can't just - * blindly swap data forks on attr2 filesystems. - * - * Note that we check the swap in both directions so that we don't end up with - * a corrupt temporary inode, either. - * - * Note that fixing the way xfs_fsr sets up the attribute fork in the source - * inode will prevent this situation from occurring, so all we do here is - * reject and log the attempt. basically we are putting the responsibility on - * userspace to get this right. 
- */ -static int -xfs_swap_extents_check_format( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip) /* tmp inode */ -{ - - /* Should never get a local format */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || - tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) - return EINVAL; - - /* - * if the target inode has less extents that then temporary inode then - * why did userspace call us? - */ - if (ip->i_d.di_nextents < tip->i_d.di_nextents) - return EINVAL; - - /* - * if the target inode is in extent form and the temp inode is in btree - * form then we will end up with the target inode in the wrong format - * as we already know there are less extents in the temp inode. - */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) - return EINVAL; - - /* Check temp in extent form to max in target */ - if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) - return EINVAL; - - /* Check target in extent form to max in temp */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) - return EINVAL; - - /* - * If we are in a btree format, check that the temp root block will fit - * in the target and that it has enough extents to be in btree format - * in the target. - * - * Note that we have to be careful to allow btree->extent conversions - * (a common defrag case) which will occur when the temp inode is in - * extent format... - */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(ip) && - XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) - return EINVAL; - if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) - return EINVAL; - } - - /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_BOFF(tip) && - XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) - return EINVAL; - if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) - return EINVAL; - } - - return 0; -} - -static int -xfs_swap_extents( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip, /* tmp inode */ - xfs_swapext_t *sxp) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_trans_t *tp; - xfs_bstat_t *sbp = &sxp->sx_stat; - xfs_ifork_t *tempifp, *ifp, *tifp; - int src_log_flags, target_log_flags; - int error = 0; - int aforkblks = 0; - int taforkblks = 0; - __uint64_t tmp; - - /* - * We have no way of updating owner information in the BMBT blocks for - * each inode on CRC enabled filesystems, so to avoid corrupting the - * this metadata we simply don't allow extent swaps to occur. - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return XFS_ERROR(EINVAL); - - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); - if (!tempifp) { - error = XFS_ERROR(ENOMEM); - goto out; - } - - /* - * we have to do two separate lock calls here to keep lockdep - * happy. If we try to get all the locks in one call, lock will - * report false positives when we drop the ILOCK and regain them - * below. 
- */ - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - - /* Verify that both files have the same format */ - if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { - error = XFS_ERROR(EINVAL); - goto out_unlock; - } - - /* Verify both files are either real-time or non-realtime */ - if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { - error = XFS_ERROR(EINVAL); - goto out_unlock; - } - - error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); - if (error) - goto out_unlock; - truncate_pagecache_range(VFS_I(tip), 0, -1); - - /* Verify O_DIRECT for ftmp */ - if (VN_CACHED(VFS_I(tip)) != 0) { - error = XFS_ERROR(EINVAL); - goto out_unlock; - } - - /* Verify all data are being swapped */ - if (sxp->sx_offset != 0 || - sxp->sx_length != ip->i_d.di_size || - sxp->sx_length != tip->i_d.di_size) { - error = XFS_ERROR(EFAULT); - goto out_unlock; - } - - trace_xfs_swap_extent_before(ip, 0); - trace_xfs_swap_extent_before(tip, 1); - - /* check inode formats now that data is flushed */ - error = xfs_swap_extents_check_format(ip, tip); - if (error) { - xfs_notice(mp, - "%s: inode 0x%llx format is incompatible for exchanging.", - __func__, ip->i_ino); - goto out_unlock; - } - - /* - * Compare the current change & modify times with that - * passed in. If they differ, we abort this swap. - * This is the mechanism used to ensure the calling - * process that the file was not changed out from - * under it. - */ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || - (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || - (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { - error = XFS_ERROR(EBUSY); - goto out_unlock; - } - - /* We need to fail if the file is memory mapped. Once we have tossed - * all existing pages, the page fault will have no option - * but to go to the filesystem for pages. By making the page fault call - * vop_read (or write in the case of autogrow) they block on the iolock - * until we have switched the extents. - */ - if (VN_MAPPED(VFS_I(ip))) { - error = XFS_ERROR(EBUSY); - goto out_unlock; - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL); - - /* - * There is a race condition here since we gave up the - * ilock. However, the data fork will not change since - * we have the iolock (locked for truncation too) so we - * are safe. We don't really care if non-io related - * fields change. 
- */ - truncate_pagecache_range(VFS_I(ip), 0, -1); - - tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); - if ((error = xfs_trans_reserve(tp, 0, - XFS_ICHANGE_LOG_RES(mp), 0, - 0, 0))) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_IOLOCK_EXCL); - xfs_trans_cancel(tp, 0); - goto out; - } - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - - /* - * Count the number of extended attribute blocks - */ - if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && - (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); - if (error) - goto out_trans_cancel; - } - if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && - (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, - &taforkblks); - if (error) - goto out_trans_cancel; - } - - /* - * Swap the data forks of the inodes - */ - ifp = &ip->i_df; - tifp = &tip->i_df; - *tempifp = *ifp; /* struct copy */ - *ifp = *tifp; /* struct copy */ - *tifp = *tempifp; /* struct copy */ - - /* - * Fix the on-disk inode values - */ - tmp = (__uint64_t)ip->i_d.di_nblocks; - ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; - tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; - - tmp = (__uint64_t) ip->i_d.di_nextents; - ip->i_d.di_nextents = tip->i_d.di_nextents; - tip->i_d.di_nextents = tmp; - - tmp = (__uint64_t) ip->i_d.di_format; - ip->i_d.di_format = tip->i_d.di_format; - tip->i_d.di_format = tmp; - - /* - * The extents in the source inode could still contain speculative - * preallocation beyond EOF (e.g. the file is open but not modified - * while defrag is in progress). In that case, we need to copy over the - * number of delalloc blocks the data fork in the source inode is - * tracking beyond EOF so that when the fork is truncated away when the - * temporary inode is unlinked we don't underrun the i_delayed_blks - * counter on that inode. - */ - ASSERT(tip->i_delayed_blks == 0); - tip->i_delayed_blks = ip->i_delayed_blks; - ip->i_delayed_blks = 0; - - src_log_flags = XFS_ILOG_CORE; - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. - */ - if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { - ifp->if_u1.if_extents = - ifp->if_u2.if_inline_ext; - } - src_log_flags |= XFS_ILOG_DEXT; - break; - case XFS_DINODE_FMT_BTREE: - src_log_flags |= XFS_ILOG_DBROOT; - break; - } - - target_log_flags = XFS_ILOG_CORE; - switch (tip->i_d.di_format) { - case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. - */ - if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { - tifp->if_u1.if_extents = - tifp->if_u2.if_inline_ext; - } - target_log_flags |= XFS_ILOG_DEXT; - break; - case XFS_DINODE_FMT_BTREE: - target_log_flags |= XFS_ILOG_DBROOT; - break; - } - - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - xfs_trans_log_inode(tp, ip, src_log_flags); - xfs_trans_log_inode(tp, tip, target_log_flags); - - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. 
- */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(tp); - - error = xfs_trans_commit(tp, 0); - - trace_xfs_swap_extent_after(ip, 0); - trace_xfs_swap_extent_after(tip, 1); -out: - kmem_free(tempifp); - return error; - -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - goto out; - -out_trans_cancel: - xfs_trans_cancel(tp, 0); - goto out_unlock; -} diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h deleted file mode 100644 index 20bdd935c121..000000000000 --- a/fs/xfs/xfs_dfrag.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DFRAG_H__ -#define __XFS_DFRAG_H__ - -/* - * Structure passed to xfs_swapext - */ - -typedef struct xfs_swapext -{ - __int64_t sx_version; /* version */ - __int64_t sx_fdtarget; /* fd of target file */ - __int64_t sx_fdtmp; /* fd of tmp file */ - xfs_off_t sx_offset; /* offset into file */ - xfs_off_t sx_length; /* leng from offset */ - char sx_pad[16]; /* pad space, unused */ - xfs_bstat_t sx_stat; /* stat of target b4 copy */ -} xfs_swapext_t; - -/* - * Version flag - */ -#define XFS_SX_VERSION 0 - -#ifdef __KERNEL__ -/* - * Prototypes for visible xfs_dfrag.c routines. - */ - -/* - * Syscall interface for xfs_swapext - */ -int xfs_swapext(struct xfs_swapext *sx); - -#endif /* __KERNEL__ */ - -#endif /* __XFS_DFRAG_H__ */ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 8f023dee404d..edf203ab50af 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -31,14 +31,14 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_dir2.h" #include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" -#include "xfs_vnodeops.h" #include "xfs_trace.h" -struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2}; +struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; + /* * ASCII case-insensitive (ie. 
A-Z) support for directories that was @@ -90,6 +90,9 @@ void xfs_dir_mount( xfs_mount_t *mp) { + int nodehdr_size; + + ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb)); ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= XFS_MAX_BLOCKSIZE); @@ -98,12 +101,13 @@ xfs_dir_mount( mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); - mp->m_attr_node_ents = - (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / - (uint)sizeof(xfs_da_node_entry_t); - mp->m_dir_node_ents = - (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / - (uint)sizeof(xfs_da_node_entry_t); + + nodehdr_size = __xfs_da3_node_hdr_size(xfs_sb_version_hascrc(&mp->m_sb)); + mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; if (xfs_sb_version_hasasciici(&mp->m_sb)) mp->m_dirnameops = &xfs_ascii_ci_nameops; @@ -209,6 +213,7 @@ xfs_dir_createname( memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; + args.filetype = name->type; args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.inumber = inum; args.dp = dp; @@ -283,6 +288,7 @@ xfs_dir_lookup( memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; + args.filetype = name->type; args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.dp = dp; args.whichfork = XFS_DATA_FORK; @@ -338,6 +344,7 @@ xfs_dir_removename( memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; + args.filetype = name->type; args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.inumber = ino; args.dp = dp; @@ -363,37 +370,6 @@ xfs_dir_removename( } /* - * Read a directory. - */ -int -xfs_readdir( - xfs_inode_t *dp, - struct dir_context *ctx, - size_t bufsize) -{ - int rval; /* return value */ - int v; /* type-checking value */ - - trace_xfs_readdir(dp); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return XFS_ERROR(EIO); - - ASSERT(S_ISDIR(dp->i_d.di_mode)); - XFS_STATS_INC(xs_dir_getdents); - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(dp, ctx); - else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) - ; - else if (v) - rval = xfs_dir2_block_getdents(dp, ctx); - else - rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); - return rval; -} - -/* * Replace the inode number of a directory entry. 
*/ int @@ -418,6 +394,7 @@ xfs_dir_replace( memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; + args.filetype = name->type; args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.inumber = inum; args.dp = dp; @@ -465,6 +442,7 @@ xfs_dir_canenter( memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; args.namelen = name->len; + args.filetype = name->type; args.hashval = dp->i_mount->m_dirnameops->hashname(name); args.dp = dp; args.whichfork = XFS_DATA_FORK; diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index e937d9991c18..9910401327d4 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -23,6 +23,11 @@ struct xfs_da_args; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_dir2_sf_hdr; +struct xfs_dir2_sf_entry; +struct xfs_dir2_data_hdr; +struct xfs_dir2_data_entry; +struct xfs_dir2_data_unused; extern struct xfs_name xfs_name_dotdot; @@ -57,4 +62,45 @@ extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, */ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); +/* + * Interface routines used by userspace utilities + */ +extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); +extern void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *sfp, + xfs_ino_t ino); +extern xfs_ino_t xfs_dir3_sfe_get_ino(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep); +extern void xfs_dir3_sfe_put_ino(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino); + +extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r); +extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); +extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, + struct xfs_buf *bp); + +extern void xfs_dir2_data_freescan(struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr, int *loghead); +extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_dir2_data_entry *dep); +extern void xfs_dir2_data_log_header(struct xfs_trans *tp, + struct xfs_buf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_dir2_data_unused *dup); +extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp, + xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, + int *needlogp, int *needscanp); +extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); + +extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( + struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup); + +extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 5e7fbd72cf52..0957aa98b6c0 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -31,8 +31,8 @@ #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_buf_item.h" -#include "xfs_dir2.h" #include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -126,7 +126,7 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { 
.verify_write = xfs_dir3_block_write_verify, }; -static int +int xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, @@ -369,7 +369,7 @@ xfs_dir2_block_addname( if (error) return error; - len = xfs_dir2_data_entsize(args->namelen); + len = xfs_dir3_data_entsize(mp, args->namelen); /* * Set up pointers to parts of the block. @@ -549,7 +549,8 @@ xfs_dir2_block_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); - tagp = xfs_dir2_data_entry_tag_p(dep); + xfs_dir3_dirent_put_ftype(mp, dep, args->filetype); + tagp = xfs_dir3_data_entry_tag_p(mp, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Clean up the bestfree array and log the header, tail, and entry. @@ -565,101 +566,6 @@ xfs_dir2_block_addname( } /* - * Readdir for block directories. - */ -int /* error */ -xfs_dir2_block_getdents( - xfs_inode_t *dp, /* incore inode */ - struct dir_context *ctx) -{ - xfs_dir2_data_hdr_t *hdr; /* block header */ - struct xfs_buf *bp; /* buffer for block */ - xfs_dir2_block_tail_t *btp; /* block tail */ - xfs_dir2_data_entry_t *dep; /* block data entry */ - xfs_dir2_data_unused_t *dup; /* block unused entry */ - char *endptr; /* end of the data entries */ - int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ - char *ptr; /* current data entry */ - int wantoff; /* starting block offset */ - xfs_off_t cook; - - mp = dp->i_mount; - /* - * If the block number in the offset is out of range, we're done. - */ - if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) - return 0; - - error = xfs_dir3_block_read(NULL, dp, &bp); - if (error) - return error; - - /* - * Extract the byte offset we start at from the seek pointer. - * We'll skip entries before this. - */ - wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos); - hdr = bp->b_addr; - xfs_dir3_data_check(dp, bp); - /* - * Set up values for the loop. - */ - btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)xfs_dir3_data_entry_p(hdr); - endptr = (char *)xfs_dir2_block_leaf_p(btp); - - /* - * Loop over the data portion of the block. - * Each object is a real entry (dep) or an unused one (dup). - */ - while (ptr < endptr) { - dup = (xfs_dir2_data_unused_t *)ptr; - /* - * Unused, skip it. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ptr += be16_to_cpu(dup->length); - continue; - } - - dep = (xfs_dir2_data_entry_t *)ptr; - - /* - * Bump pointer for the next iteration. - */ - ptr += xfs_dir2_data_entsize(dep->namelen); - /* - * The entry is before the desired starting point, skip it. - */ - if ((char *)dep - (char *)hdr < wantoff) - continue; - - cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - (char *)dep - (char *)hdr); - - ctx->pos = cook & 0x7fffffff; - /* - * If it didn't fit, set the final offset to here & return. - */ - if (!dir_emit(ctx, (char *)dep->name, dep->namelen, - be64_to_cpu(dep->inumber), DT_UNKNOWN)) { - xfs_trans_brelse(NULL, bp); - return 0; - } - } - - /* - * Reached the end of the block. - * Set the offset to a non-existent block 1 and return. - */ - ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & - 0x7fffffff; - xfs_trans_brelse(NULL, bp); - return 0; -} - -/* * Log leaf entries from the block. */ static void @@ -736,6 +642,7 @@ xfs_dir2_block_lookup( * Fill in inode number, CI name if appropriate, release the block. 
*/ args->inumber = be64_to_cpu(dep->inumber); + args->filetype = xfs_dir3_dirent_get_ftype(mp, dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(args->trans, bp); return XFS_ERROR(error); @@ -894,7 +801,7 @@ xfs_dir2_block_removename( needlog = needscan = 0; xfs_dir2_data_make_free(tp, bp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan); /* * Fix up the block tail. */ @@ -968,6 +875,7 @@ xfs_dir2_block_replace( * Change the inode number to the new value. */ dep->inumber = cpu_to_be64(args->inumber); + xfs_dir3_dirent_put_ftype(mp, dep, args->filetype); xfs_dir2_data_log_entry(args->trans, bp, dep); xfs_dir3_data_check(dp, bp); return 0; @@ -1254,7 +1162,8 @@ xfs_dir2_sf_to_block( dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; - tagp = xfs_dir2_data_entry_tag_p(dep); + xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR); + tagp = xfs_dir3_data_entry_tag_p(mp, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(tp, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); @@ -1267,7 +1176,8 @@ xfs_dir2_sf_to_block( dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; - tagp = xfs_dir2_data_entry_tag_p(dep); + xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR); + tagp = xfs_dir3_data_entry_tag_p(mp, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(tp, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); @@ -1312,10 +1222,12 @@ xfs_dir2_sf_to_block( * Copy a real entry. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); - dep->inumber = cpu_to_be64(xfs_dir2_sfe_get_ino(sfp, sfep)); + dep->inumber = cpu_to_be64(xfs_dir3_sfe_get_ino(mp, sfp, sfep)); dep->namelen = sfep->namelen; + xfs_dir3_dirent_put_ftype(mp, dep, + xfs_dir3_sfe_get_ftype(mp, sfp, sfep)); memcpy(dep->name, sfep->name, dep->namelen); - tagp = xfs_dir2_data_entry_tag_p(dep); + tagp = xfs_dir3_data_entry_tag_p(mp, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(tp, bp, dep); name.name = sfep->name; @@ -1328,7 +1240,7 @@ xfs_dir2_sf_to_block( if (++i == sfp->count) sfep = NULL; else - sfep = xfs_dir2_sf_nextentry(sfp, sfep); + sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); } /* Done with the temporary buffer */ kmem_free(sfp); diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index c2930238005c..47e1326c169a 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -29,14 +29,12 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" -STATIC xfs_dir2_data_free_t * -xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); - /* * Check the consistency of the data block. * The input can also be a block-format directory. 
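The xfs_dir2_data.c hunks beginning above un-static xfs_dir2_data_freefind(), which locates the bestfree slot (one of the three largest free regions tracked in the data block header) that describes a given unused region. A simplified sketch of that scan, assuming a compacted three-slot table where an all-zero slot ends the search:

#include <stdio.h>

struct bestfree { unsigned short offset, length; };

/* find the slot describing the unused region at 'off', if it is tracked */
static struct bestfree *freefind(struct bestfree bf[3], unsigned short off)
{
	for (int i = 0; i < 3; i++) {
		if (!bf[i].offset && !bf[i].length)
			return NULL;		/* empty slot: nothing further */
		if (bf[i].offset == off)
			return &bf[i];
	}
	return NULL;	/* region exists but is too small to be tracked */
}

int main(void)
{
	struct bestfree bf[3] = { {64, 48}, {256, 16}, {0, 0} };
	struct bestfree *dfp = freefind(bf, 256);

	printf("%s\n", dfp ? "found" : "not tracked");
	return 0;
}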
@@ -149,8 +147,10 @@ __xfs_dir3_data_check( XFS_WANT_CORRUPTED_RETURN( !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); XFS_WANT_CORRUPTED_RETURN( - be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == + be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)) == (char *)dep - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN( + xfs_dir3_dirent_get_ftype(mp, dep) < XFS_DIR3_FT_MAX); count++; lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || @@ -168,7 +168,7 @@ __xfs_dir3_data_check( } XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); } - p += xfs_dir2_data_entsize(dep->namelen); + p += xfs_dir3_data_entsize(mp, dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. @@ -325,7 +325,7 @@ xfs_dir3_data_readahead( * Given a data block and an unused entry from that block, * return the bestfree entry if any that corresponds to it. */ -STATIC xfs_dir2_data_free_t * +xfs_dir2_data_free_t * xfs_dir2_data_freefind( xfs_dir2_data_hdr_t *hdr, /* data block */ xfs_dir2_data_unused_t *dup) /* data unused entry */ @@ -333,7 +333,7 @@ xfs_dir2_data_freefind( xfs_dir2_data_free_t *dfp; /* bestfree entry */ xfs_dir2_data_aoff_t off; /* offset value needed */ struct xfs_dir2_data_free *bf; -#if defined(DEBUG) && defined(__KERNEL__) +#ifdef DEBUG int matched; /* matched the value */ int seenzero; /* saw a 0 bestfree entry */ #endif @@ -341,7 +341,7 @@ xfs_dir2_data_freefind( off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); bf = xfs_dir3_data_bestfree_p(hdr); -#if defined(DEBUG) && defined(__KERNEL__) +#ifdef DEBUG /* * Validate some consistency in the bestfree table. * Check order, non-overlapping entries, and if we find the @@ -538,8 +538,8 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep))); - p += xfs_dir2_data_entsize(dep->namelen); + be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep))); + p += xfs_dir3_data_entsize(mp, dep->namelen); } } } @@ -629,7 +629,8 @@ xfs_dir2_data_log_entry( struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { - xfs_dir2_data_hdr_t *hdr = bp->b_addr; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + struct xfs_mount *mp = tp->t_mountp; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -637,7 +638,7 @@ xfs_dir2_data_log_entry( hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), - (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - + (uint)((char *)(xfs_dir3_data_entry_tag_p(mp, dep) + 1) - (char *)hdr - 1)); } diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index 7826782b8d78..a0961a61ac1a 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -69,6 +69,23 @@ #define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ /* + * Dirents in version 3 directories have a file type field. Additions to this + * list are an on-disk format change, requiring feature bits. Valid values + * are as follows: + */ +#define XFS_DIR3_FT_UNKNOWN 0 +#define XFS_DIR3_FT_REG_FILE 1 +#define XFS_DIR3_FT_DIR 2 +#define XFS_DIR3_FT_CHRDEV 3 +#define XFS_DIR3_FT_BLKDEV 4 +#define XFS_DIR3_FT_FIFO 5 +#define XFS_DIR3_FT_SOCK 6 +#define XFS_DIR3_FT_SYMLINK 7 +#define XFS_DIR3_FT_WHT 8 + +#define XFS_DIR3_FT_MAX 9 + +/* * Byte offset in data block and shortform entry. 
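The XFS_DIR3_FT_* values above are on-disk data, so every reader is defensive: __xfs_dir3_data_check() now flags any type at or beyond XFS_DIR3_FT_MAX, and the accessor helpers clamp such values to XFS_DIR3_FT_UNKNOWN rather than trusting them. A small runnable sketch of that clamp (enum values mirror the defines above):

#include <stdio.h>

enum { FT_UNKNOWN, FT_REG_FILE, FT_DIR, FT_CHRDEV, FT_BLKDEV,
       FT_FIFO, FT_SOCK, FT_SYMLINK, FT_WHT, FT_MAX };

/* never return an out-of-range on-disk value to the caller */
static unsigned char get_ftype(unsigned char disk_byte, int has_ftype)
{
	if (!has_ftype)
		return FT_UNKNOWN;	/* feature bit not set: no field exists */
	if (disk_byte >= FT_MAX)
		return FT_UNKNOWN;	/* corrupt, or from a future format */
	return disk_byte;
}

int main(void)
{
	printf("%u %u %u\n", get_ftype(FT_DIR, 1), get_ftype(200, 1),
	       get_ftype(FT_DIR, 0));
	return 0;
}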
*/ typedef __uint16_t xfs_dir2_data_off_t; @@ -138,6 +155,9 @@ typedef struct xfs_dir2_sf_entry { xfs_dir2_sf_off_t offset; /* saved offset */ __u8 name[]; /* name, variable size */ /* + * A single byte containing the file type field follows the inode + * number for version 3 directory entries. + * * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a * variable offset after the name. */ @@ -162,16 +182,6 @@ xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) put_unaligned_be16(off, &sfep->offset.i); } -static inline int -xfs_dir2_sf_entsize(struct xfs_dir2_sf_hdr *hdr, int len) -{ - return sizeof(struct xfs_dir2_sf_entry) + /* namelen + offset */ - len + /* name */ - (hdr->i8count ? /* ino */ - sizeof(xfs_dir2_ino8_t) : - sizeof(xfs_dir2_ino4_t)); -} - static inline struct xfs_dir2_sf_entry * xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) { @@ -179,14 +189,78 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); } +static inline int +xfs_dir3_sf_entsize( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ + + count += len; /* name */ + count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : + sizeof(xfs_dir2_ino4_t); /* ino # */ + if (xfs_sb_version_hasftype(&mp->m_sb)) + count += sizeof(__uint8_t); /* file type */ + return count; +} + static inline struct xfs_dir2_sf_entry * -xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr, - struct xfs_dir2_sf_entry *sfep) +xfs_dir3_sf_nextentry( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) { return (struct xfs_dir2_sf_entry *) - ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); + ((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen)); } +/* + * in dir3 shortform directories, the file type field is stored at a variable + * offset after the inode number. Because it's only a single byte, endian + * conversion is not necessary. + */ +static inline __uint8_t * +xfs_dir3_sfe_ftypep( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (__uint8_t *)&sfep->name[sfep->namelen]; +} + +static inline __uint8_t +xfs_dir3_sfe_get_ftype( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + __uint8_t *ftp; + + if (!xfs_sb_version_hasftype(&mp->m_sb)) + return XFS_DIR3_FT_UNKNOWN; + + ftp = xfs_dir3_sfe_ftypep(hdr, sfep); + if (*ftp >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return *ftp; +} + +static inline void +xfs_dir3_sfe_put_ftype( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + __uint8_t *ftp; + + ASSERT(ftype < XFS_DIR3_FT_MAX); + + if (!xfs_sb_version_hasftype(&mp->m_sb)) + return; + ftp = xfs_dir3_sfe_ftypep(hdr, sfep); + *ftp = ftype; +} /* * Data block structures. @@ -286,12 +360,18 @@ xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) * Active entry in a data block. * * Aligned to 8 bytes. After the variable length name field there is a - * 2 byte tag field, which can be accessed using xfs_dir2_data_entry_tag_p. + * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p. + * + * For dir3 structures, there is file type field between the name and the tag. + * This can only be manipulated by helper functions. It is packed hard against + * the end of the name so any padding for rounding is between the file type and + * the tag. 
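The shortform entry size computed by xfs_dir3_sf_entsize() above varies along three axes: name length, 4- versus 8-byte inode numbers (i8count), and the optional filetype byte. A userspace sketch of the same arithmetic; the 3-byte fixed portion (1-byte namelen plus 2-byte saved offset) is an assumption stated here, not the on-disk ABI:

#include <stdio.h>

#define SF_ENTRY_FIXED	3	/* assumed: u8 namelen + be16 offset */
#define INO8_SIZE	8
#define INO4_SIZE	4

static int sf_entsize(int i8count, int has_ftype, int namelen)
{
	int count = SF_ENTRY_FIXED;

	count += namelen;				/* name bytes */
	count += i8count ? INO8_SIZE : INO4_SIZE;	/* inode number */
	if (has_ftype)
		count += 1;		/* one type byte after the name */
	return count;
}

int main(void)
{
	/* "foo" in a v2 dir with small inodes vs a v3 dir with ftype */
	printf("v2: %d bytes\n", sf_entsize(0, 0, 3));	/* 3+3+4   = 10 */
	printf("v3: %d bytes\n", sf_entsize(1, 1, 3));	/* 3+3+8+1 = 15 */
	return 0;
}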
*/ typedef struct xfs_dir2_data_entry { __be64 inumber; /* inode number */ __u8 namelen; /* name length */ __u8 name[]; /* name bytes, no null */ + /* __u8 filetype; */ /* type of inode we point to */ /* __be16 tag; */ /* starting offset of us */ } xfs_dir2_data_entry_t; @@ -311,20 +391,67 @@ typedef struct xfs_dir2_data_unused { /* * Size of a data entry. */ -static inline int xfs_dir2_data_entsize(int n) +static inline int +__xfs_dir3_data_entsize( + bool ftype, + int n) { - return (int)roundup(offsetof(struct xfs_dir2_data_entry, name[0]) + n + - (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN); + int size = offsetof(struct xfs_dir2_data_entry, name[0]); + + size += n; + size += sizeof(xfs_dir2_data_off_t); + if (ftype) + size += sizeof(__uint8_t); + return roundup(size, XFS_DIR2_DATA_ALIGN); +} +static inline int +xfs_dir3_data_entsize( + struct xfs_mount *mp, + int n) +{ + bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false; + return __xfs_dir3_data_entsize(ftype, n); +} + +static inline __uint8_t +xfs_dir3_dirent_get_ftype( + struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep) +{ + if (xfs_sb_version_hasftype(&mp->m_sb)) { + __uint8_t type = dep->name[dep->namelen]; + + ASSERT(type < XFS_DIR3_FT_MAX); + if (type < XFS_DIR3_FT_MAX) + return type; + + } + return XFS_DIR3_FT_UNKNOWN; +} + +static inline void +xfs_dir3_dirent_put_ftype( + struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep, + __uint8_t type) +{ + ASSERT(type < XFS_DIR3_FT_MAX); + ASSERT(dep->namelen != 0); + + if (xfs_sb_version_hasftype(&mp->m_sb)) + dep->name[dep->namelen] = type; } /* * Pointer to an entry's tag word. */ static inline __be16 * -xfs_dir2_data_entry_tag_p(struct xfs_dir2_data_entry *dep) +xfs_dir3_data_entry_tag_p( + struct xfs_mount *mp, + struct xfs_dir2_data_entry *dep) { return (__be16 *)((char *)dep + - xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); + xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16)); } /* @@ -375,13 +502,17 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) * data block header because the sfe embeds the block offset of the entry into * it so that it doesn't change when format conversion occurs. Bad Things Happen * if we don't follow this rule. + * + * XXX: there is scope for significant optimisation of the logic here. Right + * now we are checking for "dir3 format" over and over again. Ideally we should + * only do it once for each operation. 
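For the data-block form, __xfs_dir3_data_entsize() above packs the optional type byte hard against the end of the name and rounds the whole entry, including the trailing 2-byte tag, up to 8 bytes, so any padding falls between the type byte and the tag. A sketch of the layout and size math, assuming the 9-byte fixed header (be64 inumber + u8 namelen) implied by the struct above:

#include <stdio.h>
#include <string.h>

#define DATA_ALIGN	8	/* XFS_DIR2_DATA_ALIGN */
#define ENTRY_FIXED	9	/* 8-byte inumber + 1-byte namelen */

static int data_entsize(int has_ftype, int namelen)
{
	int size = ENTRY_FIXED + namelen + 2;	/* +2 for the tag word */

	if (has_ftype)
		size += 1;
	return (size + DATA_ALIGN - 1) & ~(DATA_ALIGN - 1);
}

int main(void)
{
	unsigned char entry[64] = {0};
	int namelen = 3;
	int size = data_entsize(1, namelen);

	/* type byte lives at name[namelen]; the tag is the last 2 bytes */
	memcpy(entry + ENTRY_FIXED, "foo", namelen);
	entry[ENTRY_FIXED + namelen] = 2;	/* XFS_DIR3_FT_DIR */
	printf("entsize=%d, tag at offset %d\n", size, size - 2);
	return 0;
}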
*/ #define XFS_DIR3_DATA_DOT_OFFSET(mp) \ xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb)) #define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \ - (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1)) + (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 1)) #define XFS_DIR3_DATA_FIRST_OFFSET(mp) \ - (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2)) + (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 2)) static inline xfs_dir2_data_aoff_t xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr) @@ -392,13 +523,19 @@ xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr) static inline xfs_dir2_data_aoff_t xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr) { - return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1); + bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + return xfs_dir3_data_dot_offset(hdr) + + __xfs_dir3_data_entsize(dir3, 1); } static inline xfs_dir2_data_aoff_t xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr) { - return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2); + bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + return xfs_dir3_data_dotdot_offset(hdr) + + __xfs_dir3_data_entsize(dir3, 2); } /* @@ -519,6 +656,9 @@ struct xfs_dir3_leaf { #define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) +extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from); + static inline int xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp) { diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 2aed25cae04d..08984eeee159 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -31,6 +31,7 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -695,7 +696,7 @@ xfs_dir2_leaf_addname( ents = xfs_dir3_leaf_ents_p(leaf); xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); - length = xfs_dir2_data_entsize(args->namelen); + length = xfs_dir3_data_entsize(mp, args->namelen); /* * See if there are any entries with the same hash value @@ -896,7 +897,8 @@ xfs_dir2_leaf_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = xfs_dir2_data_entry_tag_p(dep); + xfs_dir3_dirent_put_ftype(mp, dep, args->filetype); + tagp = xfs_dir3_data_entry_tag_p(mp, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Need to scan fix up the bestfree table. 
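Because the entry size now depends on the format, the fixed offsets of ".", ".." and the first real entry shift too, which is why the XFS_DIR3_DATA_*_OFFSET macros above thread everything through xfs_dir3_data_entsize(). A sketch computing all three offsets for both formats; the 16-byte v2 and 64-byte v3 data header sizes are assumptions of this sketch:

#include <stdio.h>

#define HDR_V2	16	/* assumed v2 data header size */
#define HDR_V3	64	/* assumed v3 (CRC) data header size */

static int entsize(int has_ftype, int namelen)
{
	int size = 9 + namelen + 2 + (has_ftype ? 1 : 0);
	return (size + 7) & ~7;
}

static void offsets(const char *tag, int hdr, int ftype)
{
	int dot = hdr;
	int dotdot = dot + entsize(ftype, 1);	/* "." has namelen 1 */
	int first = dotdot + entsize(ftype, 2);	/* ".." has namelen 2 */

	printf("%s: dot=%d dotdot=%d first=%d\n", tag, dot, dotdot, first);
}

int main(void)
{
	offsets("v2", HDR_V2, 0);
	offsets("v3", HDR_V3, 1);
	return 0;
}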
@@ -1083,396 +1085,6 @@ xfs_dir3_leaf_compact_x1( *highstalep = highstale; } -struct xfs_dir2_leaf_map_info { - xfs_extlen_t map_blocks; /* number of fsbs in map */ - xfs_dablk_t map_off; /* last mapped file offset */ - int map_size; /* total entries in *map */ - int map_valid; /* valid entries in *map */ - int nmap; /* mappings to ask xfs_bmapi */ - xfs_dir2_db_t curdb; /* db for current block */ - int ra_current; /* number of read-ahead blks */ - int ra_index; /* *map index for read-ahead */ - int ra_offset; /* map entry offset for ra */ - int ra_want; /* readahead count wanted */ - struct xfs_bmbt_irec map[]; /* map vector for blocks */ -}; - -STATIC int -xfs_dir2_leaf_readbuf( - struct xfs_inode *dp, - size_t bufsize, - struct xfs_dir2_leaf_map_info *mip, - xfs_dir2_off_t *curoff, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = dp->i_mount; - struct xfs_buf *bp = *bpp; - struct xfs_bmbt_irec *map = mip->map; - struct blk_plug plug; - int error = 0; - int length; - int i; - int j; - - /* - * If we have a buffer, we need to release it and - * take it out of the mapping. - */ - - if (bp) { - xfs_trans_brelse(NULL, bp); - bp = NULL; - mip->map_blocks -= mp->m_dirblkfsbs; - /* - * Loop to get rid of the extents for the - * directory block. - */ - for (i = mp->m_dirblkfsbs; i > 0; ) { - j = min_t(int, map->br_blockcount, i); - map->br_blockcount -= j; - map->br_startblock += j; - map->br_startoff += j; - /* - * If mapping is done, pitch it from - * the table. - */ - if (!map->br_blockcount && --mip->map_valid) - memmove(&map[0], &map[1], - sizeof(map[0]) * mip->map_valid); - i -= j; - } - } - - /* - * Recalculate the readahead blocks wanted. - */ - mip->ra_want = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize) - 1; - ASSERT(mip->ra_want >= 0); - - /* - * If we don't have as many as we want, and we haven't - * run out of data blocks, get some more mappings. - */ - if (1 + mip->ra_want > mip->map_blocks && - mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { - /* - * Get more bmaps, fill in after the ones - * we already have in the table. - */ - mip->nmap = mip->map_size - mip->map_valid; - error = xfs_bmapi_read(dp, mip->map_off, - xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - - mip->map_off, - &map[mip->map_valid], &mip->nmap, 0); - - /* - * Don't know if we should ignore this or try to return an - * error. The trouble with returning errors is that readdir - * will just stop without actually passing the error through. - */ - if (error) - goto out; /* XXX */ - - /* - * If we got all the mappings we asked for, set the final map - * offset based on the last bmap value received. Otherwise, - * we've reached the end. - */ - if (mip->nmap == mip->map_size - mip->map_valid) { - i = mip->map_valid + mip->nmap - 1; - mip->map_off = map[i].br_startoff + map[i].br_blockcount; - } else - mip->map_off = xfs_dir2_byte_to_da(mp, - XFS_DIR2_LEAF_OFFSET); - - /* - * Look for holes in the mapping, and eliminate them. Count up - * the valid blocks. - */ - for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { - if (map[i].br_startblock == HOLESTARTBLOCK) { - mip->nmap--; - length = mip->map_valid + mip->nmap - i; - if (length) - memmove(&map[i], &map[i + 1], - sizeof(map[i]) * length); - } else { - mip->map_blocks += map[i].br_blockcount; - i++; - } - } - mip->map_valid += mip->nmap; - } - - /* - * No valid mappings, so no more data blocks. 
- */ - if (!mip->map_valid) { - *curoff = xfs_dir2_da_to_byte(mp, mip->map_off); - goto out; - } - - /* - * Read the directory block starting at the first mapping. - */ - mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_dir3_data_read(NULL, dp, map->br_startoff, - map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); - - /* - * Should just skip over the data block instead of giving up. - */ - if (error) - goto out; /* XXX */ - - /* - * Adjust the current amount of read-ahead: we just read a block that - * was previously ra. - */ - if (mip->ra_current) - mip->ra_current -= mp->m_dirblkfsbs; - - /* - * Do we need more readahead? - */ - blk_start_plug(&plug); - for (mip->ra_index = mip->ra_offset = i = 0; - mip->ra_want > mip->ra_current && i < mip->map_blocks; - i += mp->m_dirblkfsbs) { - ASSERT(mip->ra_index < mip->map_valid); - /* - * Read-ahead a contiguous directory block. - */ - if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_dir3_data_readahead(NULL, dp, - map[mip->ra_index].br_startoff + mip->ra_offset, - XFS_FSB_TO_DADDR(mp, - map[mip->ra_index].br_startblock + - mip->ra_offset)); - mip->ra_current = i; - } - - /* - * Read-ahead a non-contiguous directory block. This doesn't - * use our mapping, but this is a very rare case. - */ - else if (i > mip->ra_current) { - xfs_dir3_data_readahead(NULL, dp, - map[mip->ra_index].br_startoff + - mip->ra_offset, -1); - mip->ra_current = i; - } - - /* - * Advance offset through the mapping table. - */ - for (j = 0; j < mp->m_dirblkfsbs; j++) { - /* - * The rest of this extent but not more than a dir - * block. - */ - length = min_t(int, mp->m_dirblkfsbs, - map[mip->ra_index].br_blockcount - - mip->ra_offset); - j += length; - mip->ra_offset += length; - - /* - * Advance to the next mapping if this one is used up. - */ - if (mip->ra_offset == map[mip->ra_index].br_blockcount) { - mip->ra_offset = 0; - mip->ra_index++; - } - } - } - blk_finish_plug(&plug); - -out: - *bpp = bp; - return error; -} - -/* - * Getdents (readdir) for leaf and node directories. - * This reads the data blocks only, so is the same for both forms. - */ -int /* error */ -xfs_dir2_leaf_getdents( - xfs_inode_t *dp, /* incore directory inode */ - struct dir_context *ctx, - size_t bufsize) -{ - struct xfs_buf *bp = NULL; /* data block buffer */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ - xfs_dir2_data_entry_t *dep; /* data entry */ - xfs_dir2_data_unused_t *dup; /* unused entry */ - int error = 0; /* error return value */ - int length; /* temporary length value */ - xfs_mount_t *mp; /* filesystem mount point */ - int byteoff; /* offset in current block */ - xfs_dir2_off_t curoff; /* current overall offset */ - xfs_dir2_off_t newoff; /* new curoff after new blk */ - char *ptr = NULL; /* pointer to current data */ - struct xfs_dir2_leaf_map_info *map_info; - - /* - * If the offset is at or past the largest allowed value, - * give up right away. - */ - if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) - return 0; - - mp = dp->i_mount; - - /* - * Set up to bmap a number of blocks based on the caller's - * buffer size, the directory block size, and the filesystem - * block size. 
- */ - length = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize); - map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + - (length * sizeof(struct xfs_bmbt_irec)), - KM_SLEEP | KM_NOFS); - map_info->map_size = length; - - /* - * Inside the loop we keep the main offset value as a byte offset - * in the directory file. - */ - curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos); - - /* - * Force this conversion through db so we truncate the offset - * down to get the start of the data block. - */ - map_info->map_off = xfs_dir2_db_to_da(mp, - xfs_dir2_byte_to_db(mp, curoff)); - - /* - * Loop over directory entries until we reach the end offset. - * Get more blocks and readahead as necessary. - */ - while (curoff < XFS_DIR2_LEAF_OFFSET) { - /* - * If we have no buffer, or we're off the end of the - * current buffer, need to get another one. - */ - if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) { - - error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info, - &curoff, &bp); - if (error || !map_info->map_valid) - break; - - /* - * Having done a read, we need to set a new offset. - */ - newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0); - /* - * Start of the current block. - */ - if (curoff < newoff) - curoff = newoff; - /* - * Make sure we're in the right block. - */ - else if (curoff > newoff) - ASSERT(xfs_dir2_byte_to_db(mp, curoff) == - map_info->curdb); - hdr = bp->b_addr; - xfs_dir3_data_check(dp, bp); - /* - * Find our position in the block. - */ - ptr = (char *)xfs_dir3_data_entry_p(hdr); - byteoff = xfs_dir2_byte_to_off(mp, curoff); - /* - * Skip past the header. - */ - if (byteoff == 0) - curoff += xfs_dir3_data_entry_offset(hdr); - /* - * Skip past entries until we reach our offset. - */ - else { - while ((char *)ptr - (char *)hdr < byteoff) { - dup = (xfs_dir2_data_unused_t *)ptr; - - if (be16_to_cpu(dup->freetag) - == XFS_DIR2_DATA_FREE_TAG) { - - length = be16_to_cpu(dup->length); - ptr += length; - continue; - } - dep = (xfs_dir2_data_entry_t *)ptr; - length = - xfs_dir2_data_entsize(dep->namelen); - ptr += length; - } - /* - * Now set our real offset. - */ - curoff = - xfs_dir2_db_off_to_byte(mp, - xfs_dir2_byte_to_db(mp, curoff), - (char *)ptr - (char *)hdr); - if (ptr >= (char *)hdr + mp->m_dirblksize) { - continue; - } - } - } - /* - * We have a pointer to an entry. - * Is it a live one? - */ - dup = (xfs_dir2_data_unused_t *)ptr; - /* - * No, it's unused, skip over it. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - length = be16_to_cpu(dup->length); - ptr += length; - curoff += length; - continue; - } - - dep = (xfs_dir2_data_entry_t *)ptr; - length = xfs_dir2_data_entsize(dep->namelen); - - ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; - if (!dir_emit(ctx, (char *)dep->name, dep->namelen, - be64_to_cpu(dep->inumber), DT_UNKNOWN)) - break; - - /* - * Advance to next entry in the block. - */ - ptr += length; - curoff += length; - /* bufsize may have just been a guess; don't go negative */ - bufsize = bufsize > length ? bufsize - length : 0; - } - - /* - * All done. Set output offset value to current offset. - */ - if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) - ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; - else - ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; - kmem_free(map_info); - if (bp) - xfs_trans_brelse(NULL, bp); - return error; -} - - /* * Log the bests entries indicated from a leaf1 block. 
*/ @@ -1614,6 +1226,7 @@ xfs_dir2_leaf_lookup( * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); + args->filetype = xfs_dir3_dirent_get_ftype(dp->i_mount, dep); error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); xfs_trans_brelse(tp, dbp); xfs_trans_brelse(tp, lbp); @@ -1816,7 +1429,7 @@ xfs_dir2_leaf_removename( */ xfs_dir2_data_make_free(tp, dbp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), - xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan); /* * We just mark the leaf entry stale by putting a null in it. */ @@ -1944,6 +1557,7 @@ xfs_dir2_leaf_replace( * Put the new inode number in, log it. */ dep->inumber = cpu_to_be64(args->inumber); + xfs_dir3_dirent_put_ftype(dp->i_mount, dep, args->filetype); tp = args->trans; xfs_dir2_data_log_entry(tp, dbp, dep); xfs_dir3_leaf_check(dp->i_mount, lbp); @@ -1975,10 +1589,6 @@ xfs_dir2_leaf_search_hash( ents = xfs_dir3_leaf_ents_p(leaf); xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); -#ifndef __KERNEL__ - if (!leafhdr.count) - return 0; -#endif /* * Note, the table cannot be empty, so we have to go through the loop. * Binary search the leaf entries looking for our hash value. diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 2226a00acd15..4c3dba7ffb74 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -30,6 +30,7 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -312,11 +313,13 @@ xfs_dir2_free_log_header( struct xfs_trans *tp, struct xfs_buf *bp) { +#ifdef DEBUG xfs_dir2_free_t *free; /* freespace structure */ free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); +#endif xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1); } @@ -602,7 +605,7 @@ xfs_dir2_leafn_lookup_for_addname( ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); } - length = xfs_dir2_data_entsize(args->namelen); + length = xfs_dir3_data_entsize(mp, args->namelen); /* * Loop over leaf entries with the right hash value. */ @@ -813,6 +816,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_trans_brelse(tp, state->extrablk.bp); args->cmpresult = cmp; args->inumber = be64_to_cpu(dep->inumber); + args->filetype = xfs_dir3_dirent_get_ftype(mp, dep); *indexp = index; state->extravalid = 1; state->extrablk.bp = curbp; @@ -1256,7 +1260,7 @@ xfs_dir2_leafn_remove( longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; xfs_dir2_data_make_free(tp, dbp, off, - xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); + xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan); /* * Rescan the data block freespaces for bestfree. * Log the data block header if needed. @@ -1708,7 +1712,7 @@ xfs_dir2_node_addname_int( dp = args->dp; mp = dp->i_mount; tp = args->trans; - length = xfs_dir2_data_entsize(args->namelen); + length = xfs_dir3_data_entsize(mp, args->namelen); /* * If we came in with a freespace block that means that lookup * found an entry with our hash value. 
This is the freespace @@ -2004,7 +2008,8 @@ xfs_dir2_node_addname_int( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = xfs_dir2_data_entry_tag_p(dep); + xfs_dir3_dirent_put_ftype(mp, dep, args->filetype); + tagp = xfs_dir3_data_entry_tag_p(mp, dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); xfs_dir2_data_log_entry(tp, dbp, dep); /* @@ -2224,6 +2229,7 @@ xfs_dir2_node_replace( * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); + xfs_dir3_dirent_put_ftype(state->mp, dep, args->filetype); xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep); rval = 0; } diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 0511cda4a712..1bad84c40829 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -18,23 +18,26 @@ #ifndef __XFS_DIR2_PRIV_H__ #define __XFS_DIR2_PRIV_H__ +struct dir_context; + /* xfs_dir2.c */ extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); -extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r); -extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); -extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, - struct xfs_buf *bp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); -/* xfs_dir2_block.c */ -extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; +#define S_SHIFT 12 +extern const unsigned char xfs_mode_to_ftype[]; + +extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, + __uint8_t filetype); + +/* xfs_dir2_block.c */ +extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_buf **bpp); extern int xfs_dir2_block_addname(struct xfs_da_args *args); -extern int xfs_dir2_block_getdents(struct xfs_inode *dp, - struct dir_context *ctx); extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); extern int xfs_dir2_block_replace(struct xfs_da_args *args); @@ -48,9 +51,6 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #define xfs_dir3_data_check(dp,bp) #endif -extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; -extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; - extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); @@ -60,27 +60,10 @@ extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup, int *loghead); -extern void xfs_dir2_data_freescan(struct xfs_mount *mp, - struct xfs_dir2_data_hdr *hdr, int *loghead); extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_buf **bpp); -extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_entry *dep); -extern void xfs_dir2_data_log_header(struct xfs_trans *tp, - struct xfs_buf *bp); -extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_unused *dup); -extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp, - xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, - int *needlogp, int 
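The store sequence in xfs_dir2_node_addname_int() above (inumber, namelen, name, filetype, then the tag recording the entry's offset within its block) is the canonical way a dirent is assembled in every format. A self-contained sketch of those five stores with hand-rolled big-endian helpers standing in for cpu_to_be64/cpu_to_be16:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void put_be64(unsigned char *p, uint64_t v)
{
	for (int i = 0; i < 8; i++)
		p[i] = v >> (56 - 8 * i);
}

static void put_be16(unsigned char *p, uint16_t v)
{
	p[0] = v >> 8;
	p[1] = v & 0xff;
}

static int entsize(int namelen)		/* v3 form: +1 filetype byte */
{
	return (9 + namelen + 1 + 2 + 7) & ~7;
}

int main(void)
{
	unsigned char block[128] = {0};
	unsigned char *dep = block + 64;	/* pretend offset in block */
	const char *name = "foo";
	int namelen = 3;

	put_be64(dep, 128);			/* inumber */
	dep[8] = namelen;			/* namelen */
	memcpy(dep + 9, name, namelen);		/* name */
	dep[9 + namelen] = 2;			/* filetype: FT_DIR */
	put_be16(dep + entsize(namelen) - 2,	/* tag = offset in block */
		 (uint16_t)(dep - block));

	printf("entry at %td, size %d, tag %02x%02x\n", dep - block,
	       entsize(namelen), dep[entsize(namelen) - 2],
	       dep[entsize(namelen) - 1]);
	return 0;
}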
*needscanp); -extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, - xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; -extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; - extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, @@ -91,8 +74,6 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args, extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_dir2_leaf_entry *ents, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); -extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx, - size_t bufsize); extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, struct xfs_buf **bpp, __uint16_t magic); extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, @@ -144,18 +125,18 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ -extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); -extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp, - struct xfs_dir2_sf_entry *sfep); extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, int size, xfs_dir2_sf_hdr_t *sfhp); extern int xfs_dir2_sf_addname(struct xfs_da_args *args); extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); -extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); +/* xfs_dir2_readdir.c */ +extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, + size_t bufsize); + #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c new file mode 100644 index 000000000000..8993ec17452c --- /dev/null +++ b/fs/xfs/xfs_dir2_readdir.c @@ -0,0 +1,695 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" + +/* + * Directory file type support functions + */ +static unsigned char xfs_dir3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, + DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, +}; + +unsigned char +xfs_dir3_get_dtype( + struct xfs_mount *mp, + __uint8_t filetype) +{ + if (!xfs_sb_version_hasftype(&mp->m_sb)) + return DT_UNKNOWN; + + if (filetype >= XFS_DIR3_FT_MAX) + return DT_UNKNOWN; + + return xfs_dir3_filetype_table[filetype]; +} +/* + * @mode, if set, indicates that the type field needs to be set up. + * This uses the transformation from file mode to DT_* as defined in linux/fs.h + * for file type specification. This will be propagated into the directory + * structure if appropriate for the given operation and filesystem config. + */ +const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { + [0] = XFS_DIR3_FT_UNKNOWN, + [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, + [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, +}; + +STATIC int +xfs_dir2_sf_getdents( + xfs_inode_t *dp, /* incore directory inode */ + struct dir_context *ctx) +{ + int i; /* shortform entry number */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_dataptr_t off; /* current entry's offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + xfs_dir2_dataptr_t dot_offset; + xfs_dir2_dataptr_t dotdot_offset; + xfs_ino_t ino; + + mp = dp->i_mount; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Give up if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + return XFS_ERROR(EIO); + } + + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) + return 0; + + /* + * Precalculate offsets for . and .. as we will always need them. + * + * XXX(hch): the second argument is sometimes 0 and sometimes + * mp->m_dirdatablk. + */ + dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + XFS_DIR3_DATA_DOT_OFFSET(mp)); + dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + XFS_DIR3_DATA_DOTDOT_OFFSET(mp)); + + /* + * Put . entry unless we're starting past it. + */ + if (ctx->pos <= dot_offset) { + ctx->pos = dot_offset & 0x7fffffff; + if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) + return 0; + } + + /* + * Put .. entry unless we're starting past it. 
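The two tables above translate in opposite directions: xfs_mode_to_ftype is indexed by (mode & S_IFMT) >> 12 when an entry is created, and xfs_dir3_filetype_table maps the stored value back to a DT_* constant for readdir. A round-trip sketch, relying on the DT_* constants (including DT_WHT) as glibc's dirent.h defines them:

#include <stdio.h>
#include <sys/stat.h>
#include <dirent.h>

#define S_SHIFT 12
enum { FT_UNKNOWN, FT_REG_FILE, FT_DIR, FT_CHRDEV, FT_BLKDEV,
       FT_FIFO, FT_SOCK, FT_SYMLINK, FT_WHT, FT_MAX };

static const unsigned char mode_to_ftype[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= FT_DIR,
	[S_IFCHR >> S_SHIFT]	= FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= FT_SYMLINK,
};

static const unsigned char ftype_to_dtype[FT_MAX] = {
	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
	DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
};

int main(void)
{
	mode_t mode = S_IFDIR | 0755;
	unsigned char ft = mode_to_ftype[(mode & S_IFMT) >> S_SHIFT];

	printf("ftype=%u dtype=%u (DT_DIR=%u)\n", ft, ftype_to_dtype[ft],
	       DT_DIR);
	return 0;
}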
+ */ + if (ctx->pos <= dotdot_offset) { + ino = xfs_dir2_sf_get_parent_ino(sfp); + ctx->pos = dotdot_offset & 0x7fffffff; + if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) + return 0; + } + + /* + * Loop while there are more entries and put'ing works. + */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + __uint8_t filetype; + + off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + xfs_dir2_sf_get_offset(sfep)); + + if (ctx->pos > off) { + sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); + continue; + } + + ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep); + filetype = xfs_dir3_sfe_get_ftype(mp, sfp, sfep); + ctx->pos = off & 0x7fffffff; + if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, + xfs_dir3_get_dtype(mp, filetype))) + return 0; + sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); + } + + ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + 0x7fffffff; + return 0; +} + +/* + * Readdir for block directories. + */ +STATIC int +xfs_dir2_block_getdents( + xfs_inode_t *dp, /* incore inode */ + struct dir_context *ctx) +{ + xfs_dir2_data_hdr_t *hdr; /* block header */ + struct xfs_buf *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + char *endptr; /* end of the data entries */ + int error; /* error return value */ + xfs_mount_t *mp; /* filesystem mount point */ + char *ptr; /* current data entry */ + int wantoff; /* starting block offset */ + xfs_off_t cook; + + mp = dp->i_mount; + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) + return 0; + + error = xfs_dir3_block_read(NULL, dp, &bp); + if (error) + return error; + + /* + * Extract the byte offset we start at from the seek pointer. + * We'll skip entries before this. + */ + wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Set up values for the loop. + */ + btp = xfs_dir2_block_tail_p(mp, hdr); + ptr = (char *)xfs_dir3_data_entry_p(hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + while (ptr < endptr) { + __uint8_t filetype; + + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * Unused, skip it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + + /* + * Bump pointer for the next iteration. + */ + ptr += xfs_dir3_data_entsize(mp, dep->namelen); + /* + * The entry is before the desired starting point, skip it. + */ + if ((char *)dep - (char *)hdr < wantoff) + continue; + + cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + (char *)dep - (char *)hdr); + + ctx->pos = cook & 0x7fffffff; + filetype = xfs_dir3_dirent_get_ftype(mp, dep); + /* + * If it didn't fit, set the final offset to here & return. + */ + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(mp, filetype))) { + xfs_trans_brelse(NULL, bp); + return 0; + } + } + + /* + * Reached the end of the block. + * Set the offset to a non-existent block 1 and return. 
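Both getdents loops above build the readdir cookie with xfs_dir2_db_off_to_dataptr() and mask it to 31 bits so it survives as a positive d_off; the final "block + 1, offset 0" cookie marks end-of-directory. A sketch of the pack/unpack idea; the 12-bit offset field here is an assumption for 4k directory blocks, whereas the real split is derived from the mount's directory block geometry:

#include <stdint.h>
#include <stdio.h>

#define DIRBLKLOG 12	/* assumed: 4k dir blocks -> 12 offset bits */

static uint32_t db_off_to_cookie(uint32_t db, uint32_t off)
{
	return ((db << DIRBLKLOG) | off) & 0x7fffffff;
}

static void cookie_to_db_off(uint32_t cookie, uint32_t *db, uint32_t *off)
{
	*db = cookie >> DIRBLKLOG;
	*off = cookie & ((1u << DIRBLKLOG) - 1);
}

int main(void)
{
	uint32_t db, off;
	uint32_t c = db_off_to_cookie(0, 0x60);	/* block 0, entry at 0x60 */

	cookie_to_db_off(c, &db, &off);
	printf("cookie=%#x -> db=%u off=%#x\n", c, db, off);
	return 0;
}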
+ */ + ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + 0x7fffffff; + xfs_trans_brelse(NULL, bp); + return 0; +} + +struct xfs_dir2_leaf_map_info { + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + int nmap; /* mappings to ask xfs_bmapi */ + xfs_dir2_db_t curdb; /* db for current block */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + struct xfs_bmbt_irec map[]; /* map vector for blocks */ +}; + +STATIC int +xfs_dir2_leaf_readbuf( + struct xfs_inode *dp, + size_t bufsize, + struct xfs_dir2_leaf_map_info *mip, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp = *bpp; + struct xfs_bmbt_irec *map = mip->map; + struct blk_plug plug; + int error = 0; + int length; + int i; + int j; + + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + + if (bp) { + xfs_trans_brelse(NULL, bp); + bp = NULL; + mip->map_blocks -= mp->m_dirblkfsbs; + /* + * Loop to get rid of the extents for the + * directory block. + */ + for (i = mp->m_dirblkfsbs; i > 0; ) { + j = min_t(int, map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --mip->map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * mip->map_valid); + i -= j; + } + } + + /* + * Recalculate the readahead blocks wanted. + */ + mip->ra_want = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize) - 1; + ASSERT(mip->ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + mip->ra_want > mip->map_blocks && + mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + mip->nmap = mip->map_size - mip->map_valid; + error = xfs_bmapi_read(dp, mip->map_off, + xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - + mip->map_off, + &map[mip->map_valid], &mip->nmap, 0); + + /* + * Don't know if we should ignore this or try to return an + * error. The trouble with returning errors is that readdir + * will just stop without actually passing the error through. + */ + if (error) + goto out; /* XXX */ + + /* + * If we got all the mappings we asked for, set the final map + * offset based on the last bmap value received. Otherwise, + * we've reached the end. + */ + if (mip->nmap == mip->map_size - mip->map_valid) { + i = mip->map_valid + mip->nmap - 1; + mip->map_off = map[i].br_startoff + map[i].br_blockcount; + } else + mip->map_off = xfs_dir2_byte_to_da(mp, + XFS_DIR2_LEAF_OFFSET); + + /* + * Look for holes in the mapping, and eliminate them. Count up + * the valid blocks. + */ + for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { + if (map[i].br_startblock == HOLESTARTBLOCK) { + mip->nmap--; + length = mip->map_valid + mip->nmap - i; + if (length) + memmove(&map[i], &map[i + 1], + sizeof(map[i]) * length); + } else { + mip->map_blocks += map[i].br_blockcount; + i++; + } + } + mip->map_valid += mip->nmap; + } + + /* + * No valid mappings, so no more data blocks. 
+ */ + if (!mip->map_valid) { + *curoff = xfs_dir2_da_to_byte(mp, mip->map_off); + goto out; + } + + /* + * Read the directory block starting at the first mapping. + */ + mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); + error = xfs_dir3_data_read(NULL, dp, map->br_startoff, + map->br_blockcount >= mp->m_dirblkfsbs ? + XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); + + /* + * Should just skip over the data block instead of giving up. + */ + if (error) + goto out; /* XXX */ + + /* + * Adjust the current amount of read-ahead: we just read a block that + * was previously ra. + */ + if (mip->ra_current) + mip->ra_current -= mp->m_dirblkfsbs; + + /* + * Do we need more readahead? + */ + blk_start_plug(&plug); + for (mip->ra_index = mip->ra_offset = i = 0; + mip->ra_want > mip->ra_current && i < mip->map_blocks; + i += mp->m_dirblkfsbs) { + ASSERT(mip->ra_index < mip->map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > mip->ra_current && + map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { + xfs_dir3_data_readahead(NULL, dp, + map[mip->ra_index].br_startoff + mip->ra_offset, + XFS_FSB_TO_DADDR(mp, + map[mip->ra_index].br_startblock + + mip->ra_offset)); + mip->ra_current = i; + } + + /* + * Read-ahead a non-contiguous directory block. This doesn't + * use our mapping, but this is a very rare case. + */ + else if (i > mip->ra_current) { + xfs_dir3_data_readahead(NULL, dp, + map[mip->ra_index].br_startoff + + mip->ra_offset, -1); + mip->ra_current = i; + } + + /* + * Advance offset through the mapping table. + */ + for (j = 0; j < mp->m_dirblkfsbs; j++) { + /* + * The rest of this extent but not more than a dir + * block. + */ + length = min_t(int, mp->m_dirblkfsbs, + map[mip->ra_index].br_blockcount - + mip->ra_offset); + j += length; + mip->ra_offset += length; + + /* + * Advance to the next mapping if this one is used up. + */ + if (mip->ra_offset == map[mip->ra_index].br_blockcount) { + mip->ra_offset = 0; + mip->ra_index++; + } + } + } + blk_finish_plug(&plug); + +out: + *bpp = bp; + return error; +} + +/* + * Getdents (readdir) for leaf and node directories. + * This reads the data blocks only, so is the same for both forms. + */ +STATIC int +xfs_dir2_leaf_getdents( + xfs_inode_t *dp, /* incore directory inode */ + struct dir_context *ctx, + size_t bufsize) +{ + struct xfs_buf *bp = NULL; /* data block buffer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + int error = 0; /* error return value */ + int length; /* temporary length value */ + xfs_mount_t *mp; /* filesystem mount point */ + int byteoff; /* offset in current block */ + xfs_dir2_off_t curoff; /* current overall offset */ + xfs_dir2_off_t newoff; /* new curoff after new blk */ + char *ptr = NULL; /* pointer to current data */ + struct xfs_dir2_leaf_map_info *map_info; + + /* + * If the offset is at or past the largest allowed value, + * give up right away. + */ + if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) + return 0; + + mp = dp->i_mount; + + /* + * Set up to bmap a number of blocks based on the caller's + * buffer size, the directory block size, and the filesystem + * block size. 
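The trimming loop at the top of xfs_dir2_leaf_readbuf() above consumes one directory block's worth of filesystem blocks from the head of the bmap vector, shifting out any extent that becomes empty. A reduced model of that bookkeeping (struct irec is a cut-down stand-in for xfs_bmbt_irec):

#include <stdio.h>
#include <string.h>

struct irec { long startoff, startblock, blockcount; };

/* consume 'fsbs' blocks from the head of map[]; returns new valid count */
static int trim_map(struct irec *map, int valid, int fsbs)
{
	int i = fsbs;

	while (i > 0 && valid > 0) {
		int j = map->blockcount < i ? (int)map->blockcount : i;

		map->blockcount -= j;
		map->startblock += j;
		map->startoff += j;
		if (!map->blockcount) {
			/* extent used up: pitch it from the table */
			valid--;
			memmove(&map[0], &map[1], sizeof(map[0]) * valid);
		}
		i -= j;
	}
	return valid;
}

int main(void)
{
	struct irec map[3] = { {0, 100, 1}, {1, 500, 4}, {8, 900, 2} };
	int valid = trim_map(map, 3, 2);	/* one 2-fsb directory block */

	printf("valid=%d head: off=%ld block=%ld count=%ld\n", valid,
	       map[0].startoff, map[0].startblock, map[0].blockcount);
	return 0;
}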
+ */ + length = howmany(bufsize + mp->m_dirblksize, + mp->m_sb.sb_blocksize); + map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + + (length * sizeof(struct xfs_bmbt_irec)), + KM_SLEEP | KM_NOFS); + map_info->map_size = length; + + /* + * Inside the loop we keep the main offset value as a byte offset + * in the directory file. + */ + curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos); + + /* + * Force this conversion through db so we truncate the offset + * down to get the start of the data block. + */ + map_info->map_off = xfs_dir2_db_to_da(mp, + xfs_dir2_byte_to_db(mp, curoff)); + + /* + * Loop over directory entries until we reach the end offset. + * Get more blocks and readahead as necessary. + */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + __uint8_t filetype; + + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) { + + error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info, + &curoff, &bp); + if (error || !map_info->map_valid) + break; + + /* + * Having done a read, we need to set a new offset. + */ + newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0); + /* + * Start of the current block. + */ + if (curoff < newoff) + curoff = newoff; + /* + * Make sure we're in the right block. + */ + else if (curoff > newoff) + ASSERT(xfs_dir2_byte_to_db(mp, curoff) == + map_info->curdb); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Find our position in the block. + */ + ptr = (char *)xfs_dir3_data_entry_p(hdr); + byteoff = xfs_dir2_byte_to_off(mp, curoff); + /* + * Skip past the header. + */ + if (byteoff == 0) + curoff += xfs_dir3_data_entry_offset(hdr); + /* + * Skip past entries until we reach our offset. + */ + else { + while ((char *)ptr - (char *)hdr < byteoff) { + dup = (xfs_dir2_data_unused_t *)ptr; + + if (be16_to_cpu(dup->freetag) + == XFS_DIR2_DATA_FREE_TAG) { + + length = be16_to_cpu(dup->length); + ptr += length; + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + length = + xfs_dir3_data_entsize(mp, dep->namelen); + ptr += length; + } + /* + * Now set our real offset. + */ + curoff = + xfs_dir2_db_off_to_byte(mp, + xfs_dir2_byte_to_db(mp, curoff), + (char *)ptr - (char *)hdr); + if (ptr >= (char *)hdr + mp->m_dirblksize) { + continue; + } + } + } + /* + * We have a pointer to an entry. + * Is it a live one? + */ + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * No, it's unused, skip over it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + ptr += length; + curoff += length; + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + length = xfs_dir3_data_entsize(mp, dep->namelen); + filetype = xfs_dir3_dirent_get_ftype(mp, dep); + + ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(mp, filetype))) + break; + + /* + * Advance to next entry in the block. + */ + ptr += length; + curoff += length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; + } + + /* + * All done. Set output offset value to current offset. + */ + if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) + ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; + else + ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + kmem_free(map_info); + if (bp) + xfs_trans_brelse(NULL, bp); + return error; +} + +/* + * Read a directory. 
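The walk in xfs_dir2_leaf_getdents() above treats the block as raw bytes: a leading XFS_DIR2_DATA_FREE_TAG (0xffff) marks an unused region whose 16-bit length follows, and anything else is a live entry advanced by the entry size. A runnable miniature of that loop over a fabricated buffer, using native-endian fields and the v2 entry layout for simplicity:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FREE_TAG 0xffff

static int entsize(int namelen)		/* v2 form, no filetype byte */
{
	return (9 + namelen + 2 + 7) & ~7;
}

int main(void)
{
	/* fabricated block: one 16-byte unused region, then entry "foo" */
	unsigned char buf[64] = {0};
	uint16_t tag = FREE_TAG, len = 16;

	memcpy(buf, &tag, 2);		/* freetag */
	memcpy(buf + 2, &len, 2);	/* length of the unused region */
	buf[16 + 8] = 3;		/* live entry: namelen */
	memcpy(buf + 16 + 9, "foo", 3);

	unsigned char *p = buf, *end = buf + 16 + entsize(3);
	while (p < end) {
		uint16_t ft;

		memcpy(&ft, p, 2);
		if (ft == FREE_TAG) {		/* unused, skip it */
			memcpy(&len, p + 2, 2);
			p += len;
			continue;
		}
		int namelen = p[8];
		printf("entry: %.*s\n", namelen, p + 9);
		p += entsize(namelen);
	}
	return 0;
}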
+ */ +int +xfs_readdir( + xfs_inode_t *dp, + struct dir_context *ctx, + size_t bufsize) +{ + int rval; /* return value */ + int v; /* type-checking value */ + + trace_xfs_readdir(dp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + XFS_STATS_INC(xs_dir_getdents); + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_getdents(dp, ctx); + else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) + ; + else if (v) + rval = xfs_dir2_block_getdents(dp, ctx); + else + rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); + return rval; +} diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 97676a347da1..bb6e2848f473 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -29,8 +29,8 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_error.h" -#include "xfs_dir2.h" #include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_trace.h" @@ -95,7 +95,7 @@ xfs_dir2_sf_get_parent_ino( return xfs_dir2_sf_get_ino(hdr, &hdr->parent); } -static void +void xfs_dir2_sf_put_parent_ino( struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino) @@ -105,31 +105,38 @@ xfs_dir2_sf_put_parent_ino( /* * In short-form directory entries the inode numbers are stored at variable - * offset behind the entry name. The inode numbers may only be accessed - * through the helpers below. + * offset behind the entry name. If the entry stores a filetype value, then it + * sits between the name and the inode number. Hence the inode numbers may only + * be accessed through the helpers below. */ static xfs_dir2_inou_t * -xfs_dir2_sfe_inop( +xfs_dir3_sfe_inop( + struct xfs_mount *mp, struct xfs_dir2_sf_entry *sfep) { - return (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]; + __uint8_t *ptr = &sfep->name[sfep->namelen]; + if (xfs_sb_version_hasftype(&mp->m_sb)) + ptr++; + return (xfs_dir2_inou_t *)ptr; } xfs_ino_t -xfs_dir2_sfe_get_ino( +xfs_dir3_sfe_get_ino( + struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, xfs_dir2_sfe_inop(sfep)); + return xfs_dir2_sf_get_ino(hdr, xfs_dir3_sfe_inop(mp, sfep)); } -static void -xfs_dir2_sfe_put_ino( +void +xfs_dir3_sfe_put_ino( + struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, xfs_dir2_sfe_inop(sfep), ino); + xfs_dir2_sf_put_ino(hdr, xfs_dir3_sfe_inop(mp, sfep), ino); } /* @@ -157,9 +164,16 @@ xfs_dir2_block_sfsize( int namelen; /* total name bytes */ xfs_ino_t parent = 0; /* parent inode number */ int size=0; /* total computed size */ + int has_ftype; mp = dp->i_mount; + /* + * if there is a filetype field, add the extra byte to the namelen + * for each entry that we see. + */ + has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 
1 : 0; + count = i8count = namelen = 0; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -188,9 +202,10 @@ xfs_dir2_block_sfsize( if (!isdot) i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM; #endif + /* take into account the file type field */ if (!isdot && !isdotdot) { count++; - namelen += dep->namelen; + namelen += dep->namelen + has_ftype; } else if (isdotdot) parent = be64_to_cpu(dep->inumber); /* @@ -316,12 +331,14 @@ xfs_dir2_block_to_sf( (xfs_dir2_data_aoff_t) ((char *)dep - (char *)hdr)); memcpy(sfep->name, dep->name, dep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, + xfs_dir3_sfe_put_ino(mp, sfp, sfep, be64_to_cpu(dep->inumber)); + xfs_dir3_sfe_put_ftype(mp, sfp, sfep, + xfs_dir3_dirent_get_ftype(mp, dep)); - sfep = xfs_dir2_sf_nextentry(sfp, sfep); + sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); } - ptr += xfs_dir2_data_entsize(dep->namelen); + ptr += xfs_dir3_data_entsize(mp, dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); xfs_dir2_sf_check(args); @@ -372,7 +389,7 @@ xfs_dir2_sf_addname( /* * Compute entry (and change in) size. */ - add_entsize = xfs_dir2_sf_entsize(sfp, args->namelen); + add_entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen); incr_isize = add_entsize; objchange = 0; #if XFS_BIG_INUMS @@ -466,8 +483,9 @@ xfs_dir2_sf_addname_easy( /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, xfs_dir2_sf_entsize(sfp, args->namelen), - XFS_DATA_FORK); + xfs_idata_realloc(dp, + xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen), + XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. */ @@ -479,7 +497,9 @@ xfs_dir2_sf_addname_easy( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); + xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, args->inumber); + xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, args->filetype); + /* * Update the header and inode. */ @@ -519,11 +539,13 @@ xfs_dir2_sf_addname_hard( xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ + struct xfs_mount *mp; /* * Copy the old directory to the stack buffer. */ dp = args->dp; + mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; @@ -535,13 +557,13 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. 
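In xfs_dir2_block_sfsize() above, the per-entry filetype byte is folded into the name-byte total (namelen += dep->namelen + has_ftype), so the shortform size estimate needs no separate term for it. A sketch of the overall accounting; the 2-byte header prefix and 3-byte per-entry fixed cost are assumptions of this model:

#include <stdio.h>

static int sf_size(int count, int namelen_total, int i8count, int has_ftype)
{
	int inosz = i8count ? 8 : 4;
	int size = 2 + inosz;		/* count + i8count + parent ino */

	size += count * 3;		/* per entry: namelen + offset */
	size += namelen_total + (has_ftype ? count : 0);
	size += count * inosz;		/* per-entry inode number */
	return size;
}

int main(void)
{
	/* two entries ("foo" + "barz"), small inodes, ftype enabled */
	printf("sf size = %d bytes\n", sf_size(2, 7, 0, 1));
	return 0;
}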
*/ - for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount), + for (offset = XFS_DIR3_DATA_FIRST_OFFSET(mp), oldsfep = xfs_dir2_sf_firstentry(oldsfp), - add_datasize = xfs_dir2_data_entsize(args->namelen), + add_datasize = xfs_dir3_data_entsize(mp, args->namelen), eof = (char *)oldsfep == &buf[old_isize]; !eof; - offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen), - oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep), + offset = new_offset + xfs_dir3_data_entsize(mp, oldsfep->namelen), + oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep), eof = (char *)oldsfep == &buf[old_isize]) { new_offset = xfs_dir2_sf_get_offset(oldsfep); if (offset + add_datasize <= new_offset) @@ -570,7 +592,8 @@ xfs_dir2_sf_addname_hard( sfep->namelen = args->namelen; xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); + xfs_dir3_sfe_put_ino(mp, sfp, sfep, args->inumber); + xfs_dir3_sfe_put_ftype(mp, sfp, sfep, args->filetype); sfp->count++; #if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) @@ -580,7 +603,7 @@ xfs_dir2_sf_addname_hard( * If there's more left to copy, do that. */ if (!eof) { - sfep = xfs_dir2_sf_nextentry(sfp, sfep); + sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } kmem_free(buf); @@ -616,7 +639,7 @@ xfs_dir2_sf_addname_pick( mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - size = xfs_dir2_data_entsize(args->namelen); + size = xfs_dir3_data_entsize(mp, args->namelen); offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; @@ -629,8 +652,8 @@ xfs_dir2_sf_addname_pick( if (!holefit) holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); offset = xfs_dir2_sf_get_offset(sfep) + - xfs_dir2_data_entsize(sfep->namelen); - sfep = xfs_dir2_sf_nextentry(sfp, sfep); + xfs_dir3_data_entsize(mp, sfep->namelen); + sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep); } /* * Calculate data bytes used excluding the new entry, if this @@ -684,31 +707,34 @@ xfs_dir2_sf_check( int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_mount *mp; dp = args->dp; + mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount); + offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep)) { ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); - ino = xfs_dir2_sfe_get_ino(sfp, sfep); + ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; offset = xfs_dir2_sf_get_offset(sfep) + - xfs_dir2_data_entsize(sfep->namelen); + xfs_dir3_data_entsize(mp, sfep->namelen); + ASSERT(xfs_dir3_sfe_get_ftype(mp, sfp, sfep) < + XFS_DIR3_FT_MAX); } ASSERT(i8count == sfp->i8count); ASSERT(XFS_BIG_INUMS || i8count == 0); ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); ASSERT(offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + - (uint)sizeof(xfs_dir2_block_tail_t) <= - dp->i_mount->m_dirblksize); + (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dirblksize); } #endif /* DEBUG */ @@ -765,100 +791,6 @@ xfs_dir2_sf_create( return 0; } -int /* error */ -xfs_dir2_sf_getdents( - xfs_inode_t *dp, /* incore directory inode */ - struct 
dir_context *ctx) -{ - int i; /* shortform entry number */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_dataptr_t off; /* current entry's offset */ - xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - xfs_dir2_dataptr_t dot_offset; - xfs_dir2_dataptr_t dotdot_offset; - xfs_ino_t ino; - - mp = dp->i_mount; - - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Give up if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - return XFS_ERROR(EIO); - } - - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); - - /* - * If the block number in the offset is out of range, we're done. - */ - if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) - return 0; - - /* - * Precalculate offsets for . and .. as we will always need them. - * - * XXX(hch): the second argument is sometimes 0 and sometimes - * mp->m_dirdatablk. - */ - dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR3_DATA_DOT_OFFSET(mp)); - dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR3_DATA_DOTDOT_OFFSET(mp)); - - /* - * Put . entry unless we're starting past it. - */ - if (ctx->pos <= dot_offset) { - ctx->pos = dot_offset & 0x7fffffff; - if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) - return 0; - } - - /* - * Put .. entry unless we're starting past it. - */ - if (ctx->pos <= dotdot_offset) { - ino = xfs_dir2_sf_get_parent_ino(sfp); - ctx->pos = dotdot_offset & 0x7fffffff; - if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) - return 0; - } - - /* - * Loop while there are more entries and put'ing works. - */ - sfep = xfs_dir2_sf_firstentry(sfp); - for (i = 0; i < sfp->count; i++) { - off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - xfs_dir2_sf_get_offset(sfep)); - - if (ctx->pos > off) { - sfep = xfs_dir2_sf_nextentry(sfp, sfep); - continue; - } - - ino = xfs_dir2_sfe_get_ino(sfp, sfep); - ctx->pos = off & 0x7fffffff; - if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, - ino, DT_UNKNOWN)) - return 0; - sfep = xfs_dir2_sf_nextentry(sfp, sfep); - } - - ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & - 0x7fffffff; - return 0; -} - /* * Lookup an entry in a shortform directory. * Returns EEXIST if found, ENOENT if not found. @@ -898,6 +830,7 @@ xfs_dir2_sf_lookup( if (args->namelen == 1 && args->name[0] == '.') { args->inumber = dp->i_ino; args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; return XFS_ERROR(EEXIST); } /* @@ -907,6 +840,7 @@ xfs_dir2_sf_lookup( args->name[0] == '.' && args->name[1] == '.') { args->inumber = xfs_dir2_sf_get_parent_ino(sfp); args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; return XFS_ERROR(EEXIST); } /* @@ -914,7 +848,7 @@ xfs_dir2_sf_lookup( */ ci_sfep = NULL; for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) { /* * Compare name and if it's an exact match, return the inode * number. 
If it's the first case-insensitive match, store the @@ -924,7 +858,10 @@ xfs_dir2_sf_lookup( sfep->namelen); if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { args->cmpresult = cmp; - args->inumber = xfs_dir2_sfe_get_ino(sfp, sfep); + args->inumber = xfs_dir3_sfe_get_ino(dp->i_mount, + sfp, sfep); + args->filetype = xfs_dir3_sfe_get_ftype(dp->i_mount, + sfp, sfep); if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); ci_sfep = sfep; @@ -980,10 +917,10 @@ xfs_dir2_sf_removename( * Find the one we're deleting. */ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { - ASSERT(xfs_dir2_sfe_get_ino(sfp, sfep) == + ASSERT(xfs_dir3_sfe_get_ino(dp->i_mount, sfp, sfep) == args->inumber); break; } @@ -997,7 +934,7 @@ xfs_dir2_sf_removename( * Calculate sizes. */ byteoff = (int)((char *)sfep - (char *)sfp); - entsize = xfs_dir2_sf_entsize(sfp, args->namelen); + entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen); newsize = oldsize - entsize; /* * Copy the part if any after the removed entry, sliding it down. @@ -1113,16 +1050,19 @@ xfs_dir2_sf_replace( * Normal entry, look for the name. */ else { - for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); - i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) { if (xfs_da_compname(args, sfep->name, sfep->namelen) == XFS_CMP_EXACT) { #if XFS_BIG_INUMS || defined(DEBUG) - ino = xfs_dir2_sfe_get_ino(sfp, sfep); + ino = xfs_dir3_sfe_get_ino(dp->i_mount, + sfp, sfep); ASSERT(args->inumber != ino); #endif - xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); + xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, + args->inumber); + xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, + args->filetype); break; } } @@ -1189,10 +1129,12 @@ xfs_dir2_sf_toino4( int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + struct xfs_mount *mp; trace_xfs_dir2_sf_toino4(args); dp = args->dp; + mp = dp->i_mount; /* * Copy the old directory to the buffer. @@ -1230,13 +1172,15 @@ xfs_dir2_sf_toino4( for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), - oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { + i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep), + oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, - xfs_dir2_sfe_get_ino(oldsfp, oldsfep)); + xfs_dir3_sfe_put_ino(mp, sfp, sfep, + xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep)); + xfs_dir3_sfe_put_ftype(mp, sfp, sfep, + xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep)); } /* * Clean up the inode. @@ -1264,10 +1208,12 @@ xfs_dir2_sf_toino8( int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + struct xfs_mount *mp; trace_xfs_dir2_sf_toino8(args); dp = args->dp; + mp = dp->i_mount; /* * Copy the old directory to the buffer. 
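
The short-form entry changes in this file all pivot on one layout detail: when the superblock has the filetype feature bit set (xfs_sb_version_hasftype()), a single file type byte sits between each entry's name and its packed inode number, so every accessor has to step over it. A minimal userspace sketch of that pointer arithmetic follows, mirroring the xfs_dir3_sfe_inop() logic above; struct sf_entry and sfe_inop() here are simplified illustrative stand-ins, not the real on-disk xfs_dir2_sf_entry definitions.

#include <stdint.h>
#include <stdio.h>

/*
 * Toy short-form entry: name length, two-byte saved offset, then the
 * name itself, then (optionally) one filetype byte, then the inode
 * number. Illustrative only; the on-disk xfs_dir2_sf_entry differs.
 */
struct sf_entry {
	uint8_t	namelen;	/* length of name[] in bytes */
	uint8_t	offset[2];	/* saved data block offset */
	uint8_t	name[];		/* name, [ftype], inumber follow */
};

/* mirrors xfs_dir3_sfe_inop(): skip the filetype byte when present */
static uint8_t *
sfe_inop(struct sf_entry *sfep, int has_ftype)
{
	uint8_t *ptr = &sfep->name[sfep->namelen];

	if (has_ftype)
		ptr++;
	return ptr;
}

int
main(void)
{
	/* "foo", filetype byte 1, then a one-byte inode number 42 */
	uint8_t buf[] = { 3, 0, 0, 'f', 'o', 'o', 1, 42 };
	struct sf_entry *sfep = (struct sf_entry *)buf;

	printf("inumber at byte %td\n", sfe_inop(sfep, 1) - buf);	/* 7 */
	printf("inumber at byte %td\n", sfe_inop(sfep, 0) - buf);	/* 6 */
	return 0;
}

The same one-byte skip is why xfs_dir3_sf_entsize(), xfs_dir3_sf_nextentry() and the other short-form accessors now take the mount as an argument: only the superblock feature bits say whether the extra byte is present.
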
@@ -1305,13 +1251,15 @@ xfs_dir2_sf_toino8( for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), oldsfep = xfs_dir2_sf_firstentry(oldsfp); i < sfp->count; - i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), - oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { + i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep), + oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - xfs_dir2_sfe_put_ino(sfp, sfep, - xfs_dir2_sfe_get_ino(oldsfp, oldsfep)); + xfs_dir3_sfe_put_ino(mp, sfp, sfep, + xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep)); + xfs_dir3_sfe_put_ftype(mp, sfp, sfep, + xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep)); } /* * Clean up the inode. diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 69cf4fcde03e..45560ee1a4ba 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -16,12 +16,13 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_sb.h" +#include "xfs_format.h" #include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" -#include "xfs_trans.h" #include "xfs_alloc_btree.h" #include "xfs_bmap_btree.h" #include "xfs_ialloc_btree.h" diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0adf27ecf3f1..251c66632e5e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -17,6 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_trans.h" @@ -28,6 +29,7 @@ #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" @@ -710,10 +712,8 @@ xfs_qm_dqread( if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_QM_DQALLOC_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0); if (error) goto error1; cancelflags = XFS_TRANS_RELEASE_LOG_RES; diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 57aa4b03720c..60c6e1f12695 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -17,6 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -43,14 +44,15 @@ static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) /* * Returns the number of iovecs and bytes needed to log the given dquot item. */ -STATIC uint +STATIC void xfs_qm_dquot_logitem_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - /* - * we need only two iovecs, one for the format, one for the real thing - */ - return 2; + *nvecs += 2; + *nbytes += sizeof(struct xfs_dq_logformat) + + sizeof(struct xfs_disk_dquot); } /* @@ -285,11 +287,14 @@ static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) * We only need 1 iovec for a quotaoff item. It just logs the * quotaoff_log_format structure.
*/ -STATIC uint +STATIC void xfs_qm_qoff_logitem_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - return 1; + *nvecs += 1; + *nbytes += sizeof(struct xfs_qoff_logitem); } /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 35d3f5b041dd..1123d93ff795 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -26,7 +26,6 @@ #include "xfs_bmap_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_utils.h" #include "xfs_error.h" #ifdef DEBUG diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index c585bc646395..066df425c14f 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -21,10 +21,11 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_export.h" -#include "xfs_vnodeops.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_inode_item.h" diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 85e9f87a1a7c..86f559f6e5d3 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -147,7 +147,7 @@ xfs_extent_busy_search( * extent. If the overlap covers the beginning, the end, or all of the busy * extent, the overlapping portion can be made unbusy and used for the * allocation. We can't split a busy extent because we can't modify a - * transaction/CIL context busy list, but we can update an entries block + * transaction/CIL context busy list, but we can update an entry's block * number or length. * * Returns true if the extent can safely be reused, or false if the search diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 452920a3f03f..dc53e8febbbe 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -73,11 +73,22 @@ __xfs_efi_release( * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. */ -STATIC uint +static inline int +xfs_efi_item_sizeof( + struct xfs_efi_log_item *efip) +{ + return sizeof(struct xfs_efi_log_format) + + (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void xfs_efi_item_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - return 1; + *nvecs += 1; + *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip)); } /* @@ -93,21 +104,17 @@ xfs_efi_item_format( struct xfs_log_iovec *log_vector) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - uint size; ASSERT(atomic_read(&efip->efi_next_extent) == efip->efi_format.efi_nextents); efip->efi_format.efi_type = XFS_LI_EFI; - - size = sizeof(xfs_efi_log_format_t); - size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); efip->efi_format.efi_size = 1; log_vector->i_addr = &efip->efi_format; - log_vector->i_len = size; + log_vector->i_len = xfs_efi_item_sizeof(efip); log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; - ASSERT(size >= sizeof(xfs_efi_log_format_t)); + ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t)); } @@ -333,11 +340,22 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) * We only need 1 iovec for an efd item. It just logs the efd_log_format * structure. 
*/ -STATIC uint +static inline int +xfs_efd_item_sizeof( + struct xfs_efd_log_item *efdp) +{ + return sizeof(xfs_efd_log_format_t) + + (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void xfs_efd_item_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - return 1; + *nvecs += 1; + *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip)); } /* @@ -353,20 +371,16 @@ xfs_efd_item_format( struct xfs_log_iovec *log_vector) { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); - uint size; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); efdp->efd_format.efd_type = XFS_LI_EFD; - - size = sizeof(xfs_efd_log_format_t); - size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); efdp->efd_format.efd_size = 1; log_vector->i_addr = &efdp->efd_format; - log_vector->i_len = size; + log_vector->i_len = xfs_efd_item_sizeof(efdp); log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; - ASSERT(size >= sizeof(xfs_efd_log_format_t)); + ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t)); } /* diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 432222418c56..0ffbce32d569 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -18,93 +18,11 @@ #ifndef __XFS_EXTFREE_ITEM_H__ #define __XFS_EXTFREE_ITEM_H__ +/* kernel only EFI/EFD definitions */ + struct xfs_mount; struct kmem_zone; -typedef struct xfs_extent { - xfs_dfsbno_t ext_start; - xfs_extlen_t ext_len; -} xfs_extent_t; - -/* - * Since an xfs_extent_t has types (start:64, len: 32) - * there are different alignments on 32 bit and 64 bit kernels. - * So we provide the different variants for use by a - * conversion routine. - */ - -typedef struct xfs_extent_32 { - __uint64_t ext_start; - __uint32_t ext_len; -} __attribute__((packed)) xfs_extent_32_t; - -typedef struct xfs_extent_64 { - __uint64_t ext_start; - __uint32_t ext_len; - __uint32_t ext_pad; -} xfs_extent_64_t; - -/* - * This is the structure used to lay out an efi log item in the - * log. The efi_extents field is a variable size array whose - * size is given by efi_nextents. - */ -typedef struct xfs_efi_log_format { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[1]; /* array of extents to free */ -} xfs_efi_log_format_t; - -typedef struct xfs_efi_log_format_32 { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[1]; /* array of extents to free */ -} __attribute__((packed)) xfs_efi_log_format_32_t; - -typedef struct xfs_efi_log_format_64 { - __uint16_t efi_type; /* efi log item type */ - __uint16_t efi_size; /* size of this item */ - __uint32_t efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[1]; /* array of extents to free */ -} xfs_efi_log_format_64_t; - -/* - * This is the structure used to lay out an efd log item in the - * log. 
The efd_extents array is a variable size array whose - * size is given by efd_nextents; - */ -typedef struct xfs_efd_log_format { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[1]; /* array of extents freed */ -} xfs_efd_log_format_t; - -typedef struct xfs_efd_log_format_32 { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[1]; /* array of extents freed */ -} __attribute__((packed)) xfs_efd_log_format_32_t; - -typedef struct xfs_efd_log_format_64 { - __uint16_t efd_type; /* efd log item type */ - __uint16_t efd_size; /* size of this item */ - __uint32_t efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[1]; /* array of extents freed */ -} xfs_efd_log_format_64_t; - - -#ifdef __KERNEL__ - /* * Max number of extents in fast allocation path. */ @@ -160,6 +78,4 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt); void xfs_efi_item_free(xfs_efi_log_item_t *); -#endif /* __KERNEL__ */ - #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index de3dc98f4e8f..4c749ab543d0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -28,10 +28,11 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_error.h" -#include "xfs_vnodeops.h" #include "xfs_da_btree.h" #include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_ioctl.h" #include "xfs_trace.h" diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 5170306a1009..ce78e654d37b 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -16,18 +16,18 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_inum.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ag.h" -#include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_alloc.h" -#include "xfs_utils.h" #include "xfs_mru_cache.h" #include "xfs_filestream.h" #include "xfs_trace.h" @@ -668,8 +668,8 @@ exit: */ int xfs_filestream_new_ag( - xfs_bmalloca_t *ap, - xfs_agnumber_t *agp) + struct xfs_bmalloca *ap, + xfs_agnumber_t *agp) { int flags, err; xfs_inode_t *ip, *pip = NULL; diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 09dd9af45434..6d61dbee8564 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -18,8 +18,6 @@ #ifndef __XFS_FILESTREAM_H__ #define __XFS_FILESTREAM_H__ -#ifdef __KERNEL__ - struct xfs_mount; struct xfs_inode; struct xfs_perag; @@ -69,6 +67,4 @@ xfs_inode_is_filestream( (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); } -#endif /* __KERNEL__ */ - #endif /* __XFS_FILESTREAM_H__ */ diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h new file mode 100644 index 000000000000..35c08ff54ca0 --- /dev/null +++ b/fs/xfs/xfs_format.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FORMAT_H__ +#define __XFS_FORMAT_H__ + +/* + * XFS On Disk Format Definitions + * + * This header file defines all the on-disk format definitions for + * general XFS objects. Directory and attribute related objects are defined in + * xfs_da_format.h, while log and log item formats are defined in + * xfs_log_format.h. Everything else goes here. + */ + +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; +struct xfs_buf; +struct xfs_ifork; + +/* + * RealTime Device format definitions + */ + +/* Min and max rt extent sizes, specified in bytes */ +#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ +#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ +#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ + +#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) +#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) +#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) +#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) + +/* + * RT Summary and bit manipulation macros. + */ +#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) +#define XFS_SUMOFFSTOBLOCK(mp,s) \ + (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) +#define XFS_SUMPTR(mp,bp,so) \ + ((xfs_suminfo_t *)((bp)->b_addr + \ + (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) + +#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) +#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) +#define XFS_BITTOWORD(mp,bi) \ + ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) + +#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) +#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) + +#define XFS_RTLOBIT(w) xfs_lowbit32(w) +#define XFS_RTHIBIT(w) xfs_highbit32(w) + +#if XFS_BIG_BLKNOS +#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) +#else +#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) +#endif + +/* + * Dquot and dquot block format definitions + */ +#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ +#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ + +/* + * This is the main portion of the on-disk representation of quota + * information for a user. This is the q_core of the xfs_dquot_t that + * is kept in kernel memory. We pad this with some more expansion room + * to construct the on disk structure.
+ */ +typedef struct xfs_disk_dquot { + __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ + __u8 d_version; /* dquot version */ + __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __be32 d_id; /* user,project,group id */ + __be64 d_blk_hardlimit;/* absolute limit on disk blks */ + __be64 d_blk_softlimit;/* preferred limit on disk blks */ + __be64 d_ino_hardlimit;/* maximum # allocated inodes */ + __be64 d_ino_softlimit;/* preferred inode limit */ + __be64 d_bcount; /* disk blocks owned by the user */ + __be64 d_icount; /* inodes owned by the user */ + __be32 d_itimer; /* zero if within inode limits if not, this is when we refuse service */ + __be32 d_btimer; /* similar to above; for disk blocks */ + __be16 d_iwarns; /* warnings issued wrt num inodes */ + __be16 d_bwarns; /* warnings issued wrt disk blocks */ + __be32 d_pad0; /* 64 bit align */ + __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ + __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ + __be64 d_rtbcount; /* realtime blocks owned */ + __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ + __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ + __be16 d_pad; +} xfs_disk_dquot_t; + +/* + * This is what goes on disk. This is separated from the xfs_disk_dquot because + * carrying the unnecessary padding would be a waste of memory. + */ +typedef struct xfs_dqblk { + xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ + char dd_fill[4]; /* filling for posterity */ + + /* + * These two are only present on filesystems with the CRC bits set. + */ + __be32 dd_crc; /* checksum */ + __be64 dd_lsn; /* last modification in log */ + uuid_t dd_uuid; /* location information */ +} xfs_dqblk_t; + +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + +/* + * Remote symlink format and access functions. + */ +#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ + +struct xfs_dsymlink_hdr { + __be32 sl_magic; + __be32 sl_offset; + __be32 sl_bytes; + __be32 sl_crc; + uuid_t sl_uuid; + __be64 sl_owner; + __be64 sl_blkno; + __be64 sl_lsn; +}; + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 3 extents back from + * bmapi when crc headers are taken into account. + */ +#define XFS_SYMLINK_MAPS 3 + +#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_dsymlink_hdr) : 0)) + +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); +int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); + +extern const struct xfs_buf_ops xfs_symlink_buf_ops; + +#endif /* __XFS_FORMAT_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index d04695545397..1edb5cc3e5f4 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -240,7 +240,9 @@ typedef struct xfs_fsop_resblks { /* - * Minimum and maximum sizes need for growth checks + * Minimum and maximum sizes needed for growth checks. + * + * Block counts are in units of filesystem blocks, not basic blocks.
*/ #define XFS_MIN_AG_BLOCKS 64 #define XFS_MIN_LOG_BLOCKS 512ULL @@ -311,6 +313,17 @@ typedef struct xfs_bstat { } xfs_bstat_t; /* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was chosen + * to retain compatibility with "old" filesystems). + */ +static inline __uint32_t +bstat_get_projid(struct xfs_bstat *bs) +{ + return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; +} + +/* * The user-level BulkStat Request interface structure. */ typedef struct xfs_fsop_bulkreq { @@ -344,7 +357,7 @@ typedef struct xfs_error_injection { * Speculative preallocation trimming. */ #define XFS_EOFBLOCKS_VERSION 1 -struct xfs_eofblocks { +struct xfs_fs_eofblocks { __u32 eof_version; __u32 eof_flags; uid_t eof_uid; @@ -450,6 +463,21 @@ typedef struct xfs_handle { + (handle).ha_fid.fid_len) /* + * Structure passed to XFS_IOC_SWAPEXT + */ +typedef struct xfs_swapext +{ + __int64_t sx_version; /* version */ +#define XFS_SX_VERSION 0 + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* length from offset */ + char sx_pad[16]; /* pad space, unused */ + xfs_bstat_t sx_stat; /* stat of target before copy */ +} xfs_swapext_t; + +/* * Flags for going down operation */ #define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ @@ -511,8 +539,14 @@ typedef struct xfs_handle { #define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ + /* XFS_IOC_FREEZE -- FIFREEZE 119 */ /* XFS_IOC_THAW -- FITHAW 120 */ +#ifndef FIFREEZE +#define XFS_IOC_FREEZE _IOWR('X', 119, int) +#define XFS_IOC_THAW _IOWR('X', 120, int) +#endif + #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 614eb0cc3608..e64ee5288b86 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -203,8 +203,9 @@ xfs_growfs_data_private( tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), - XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0); + if (error) { xfs_trans_cancel(tp, 0); return error; } @@ -739,8 +740,7 @@ xfs_fs_log_dummy( int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return error; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 7a0c17d7ec09..ccf2fb143962 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -39,6 +39,7 @@ #include "xfs_cksum.h" #include "xfs_buf_item.h" #include "xfs_icreate_item.h" +#include "xfs_icache.h" /* @@ -506,7 +507,7 @@ xfs_ialloc_next_ag( /* * Select an allocation group to look for a free inode in, based on the parent - * inode and then mode. Return the allocation group buffer. + * inode and the mode. Return the allocation group buffer.
*/ STATIC xfs_agnumber_t xfs_ialloc_ag_select( @@ -728,7 +729,7 @@ xfs_dialloc_ag( error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(j == 1, error0); if (rec.ir_freecount > 0) { /* @@ -1341,7 +1342,7 @@ xfs_imap( xfs_agblock_t cluster_agbno; /* first block in inode cluster */ int error; /* error code */ int offset; /* index of inode in its buffer */ - int offset_agbno; /* blks from chunk start to inode */ + xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ ASSERT(ino != NULLFSINO); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3f90e1ceb8d6..16219b9c6790 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -17,6 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_types.h" #include "xfs_log.h" #include "xfs_log_priv.h" @@ -31,12 +32,12 @@ #include "xfs_dinode.h" #include "xfs_error.h" #include "xfs_filestream.h" -#include "xfs_vnodeops.h" #include "xfs_inode_item.h" #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_fsops.h" #include "xfs_icache.h" +#include "xfs_bmap_util.h" #include <linux/kthread.h> #include <linux/freezer.h> @@ -619,7 +620,7 @@ restart: /* * Background scanning to trim post-EOF preallocated space. This is queued - * based on the 'background_prealloc_discard_period' tunable (5m by default). + * based on the 'speculative_prealloc_lifetime' tunable (5m by default). */ STATIC void xfs_queue_eofblocks( @@ -1203,15 +1204,15 @@ xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UID && - ip->i_d.di_uid != eofb->eof_uid) + if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) return 0; - if (eofb->eof_flags & XFS_EOF_FLAGS_GID && - ip->i_d.di_gid != eofb->eof_gid) + if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) return 0; - if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && + if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && xfs_get_projid(ip) != eofb->eof_prid) return 0; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index a01afbb3909a..8a89f7d791bd 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -21,9 +21,24 @@ struct xfs_mount; struct xfs_perag; +struct xfs_eofblocks { + __u32 eof_flags; + kuid_t eof_uid; + kgid_t eof_gid; + prid_t eof_prid; + __u64 eof_min_file_size; +}; + #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ +/* + * Flags for xfs_iget() + */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_UNTRUSTED 0x2 +#define XFS_IGET_DONTCACHE 0x4 + int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -49,4 +64,39 @@ int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, int flags, void *args), int flags, void *args, int tag); +static inline int +xfs_fs_eofblocks_from_user( + struct xfs_fs_eofblocks *src, + struct xfs_eofblocks *dst) +{ + if (src->eof_version != XFS_EOFBLOCKS_VERSION) + return EINVAL; + + if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) + return EINVAL; + + if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || + memchr_inv(src->pad64, 0, sizeof(src->pad64))) + return EINVAL; + + dst->eof_flags = src->eof_flags; + dst->eof_prid = src->eof_prid; + dst->eof_min_file_size = src->eof_min_file_size; + + dst->eof_uid = INVALID_UID; + if (src->eof_flags & XFS_EOF_FLAGS_UID) { + dst->eof_uid = make_kuid(current_user_ns(), 
src->eof_uid); + if (!uid_valid(dst->eof_uid)) + return EINVAL; + } + + dst->eof_gid = INVALID_GID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) { + dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->eof_gid)) + return EINVAL; + } + return 0; +} + #endif diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 7716a4e7375e..5a5a593994d4 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -20,23 +20,11 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" -#include "xfs_buf_item.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_error.h" #include "xfs_icreate_item.h" @@ -52,11 +40,14 @@ static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) * * We only need one iovec for the icreate log structure. */ -STATIC uint +STATIC void xfs_icreate_item_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - return 1; + *nvecs += 1; + *nbytes += sizeof(struct xfs_icreate_log); } /* diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h index 88ba8aa0bc41..59e89f87c09b 100644 --- a/fs/xfs/xfs_icreate_item.h +++ b/fs/xfs/xfs_icreate_item.h @@ -18,24 +18,6 @@ #ifndef XFS_ICREATE_ITEM_H #define XFS_ICREATE_ITEM_H 1 -/* - * on disk log item structure - * - * Log recovery assumes the first two entries are the type and size and they fit - * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so - * decoding can be done correctly. 
- */ -struct xfs_icreate_log { - __uint16_t icl_type; /* type of log format structure */ - __uint16_t icl_size; /* size of log format structure */ - __be32 icl_ag; /* ag being allocated in */ - __be32 icl_agbno; /* start block of inode range */ - __be32 icl_count; /* number of inodes to initialise */ - __be32 icl_isize; /* size of inodes */ - __be32 icl_length; /* length of extent to initialise */ - __be32 icl_gen; /* inode generation number to use */ -}; - /* in memory log item structure */ struct xfs_icreate_item { struct xfs_log_item ic_item; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bb262c25c8de..e3d75385aa76 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -19,18 +19,23 @@ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" +#include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_attr_sf.h" +#include "xfs_attr.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_buf_item.h" @@ -39,16 +44,15 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_error.h" -#include "xfs_utils.h" #include "xfs_quota.h" #include "xfs_filestream.h" -#include "xfs_vnodeops.h" #include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" -kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; /* @@ -58,9 +62,6 @@ kmem_zone_t *xfs_inode_zone; #define XFS_ITRUNC_MAX_EXTENTS 2 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); -STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); -STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); -STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); /* * helper function to extract extent size hint from inode @@ -310,623 +311,202 @@ xfs_isilocked( } #endif -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} - #ifdef DEBUG +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; +#endif + /* - * Make sure that the extents in the given memory buffer - * are valid. 
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with + * a different value */ -STATIC void -xfs_validate_extents( - xfs_ifork_t *ifp, - int nrecs, - xfs_exntfmt_t fmt) +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) { - xfs_bmbt_irec_t irec; - xfs_bmbt_rec_host_t rec; - int i; + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - rec.l0 = get_unaligned(&ep->l0); - rec.l1 = get_unaligned(&ep->l1); - xfs_bmbt_get_all(&rec, &irec); - if (fmt == XFS_EXTFMT_NOSTATE) - ASSERT(irec.br_state == XFS_EXT_NORM); - } + return lock_mode; } -#else /* DEBUG */ -#define xfs_validate_extents(ifp, nrecs, fmt) -#endif /* DEBUG */ /* - * Check that none of the inode's in the buffer have a next - * unlinked field of 0. + * The following routine will lock n inodes in exclusive mode. + * We assume the caller calls us with the inodes in i_ino order. + * + * We need to detect deadlock where an inode that we lock + * is in the AIL and we start waiting for another inode that is locked + * by a thread in a long running transaction (such as truncate). This can + * result in deadlock since the long running trans might need to wait + * for the inode we just locked in order to push the tail and free space + * in the log. */ -#if defined(DEBUG) void -xfs_inobp_check( - xfs_mount_t *mp, - xfs_buf_t *bp) +xfs_lock_inodes( + xfs_inode_t **ips, + int inodes, + uint lock_mode) { - int i; - int j; - xfs_dinode_t *dip; + int attempts = 0, i, j, try_lock; + xfs_log_item_t *lp; - j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + ASSERT(ips && (inodes >= 2)); /* we need at least two */ - for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); - if (!dip->di_next_unlinked) { - xfs_alert(mp, - "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.", - bp); - ASSERT(dip->di_next_unlinked); - } - } -} -#endif + try_lock = 0; + i = 0; -static void -xfs_inode_buf_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - int i; - int ni; - - /* - * Validate the magic number and version of every inode in the buffer - */ - ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - - dip = (struct xfs_dinode *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - xfs_buf_ioerror(bp, EFSCORRUPTED); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, - mp, dip); -#ifdef DEBUG - xfs_emerg(mp, - "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)bp->b_bn, i, - be16_to_cpu(dip->di_magic)); - ASSERT(0); -#endif - } - } - xfs_inobp_check(mp, bp); -} - - -static void -xfs_inode_buf_read_verify( - struct xfs_buf *bp) -{ - xfs_inode_buf_verify(bp); -} - -static void -xfs_inode_buf_write_verify( - struct xfs_buf *bp) -{ - xfs_inode_buf_verify(bp); -} - -const struct xfs_buf_ops xfs_inode_buf_ops = { - .verify_read = xfs_inode_buf_read_verify, - .verify_write = xfs_inode_buf_write_verify, -}; +again: + for (; i < inodes; i++) { + ASSERT(ips[i]); + if (i && (ips[i] == 
ips[i-1])) /* Already locked */ + continue; -/* - * This routine is called to map an inode to the buffer containing the on-disk - * version of the inode. It returns a pointer to the buffer containing the - * on-disk inode in the bpp parameter, and in the dipp parameter it returns a - * pointer to the on-disk inode within that buffer. - * - * If a non-zero error is returned, then the contents of bpp and dipp are - * undefined. - */ -int -xfs_imap_to_bp( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp, - uint buf_flags, - uint iget_flags) -{ - struct xfs_buf *bp; - int error; + /* + * If try_lock is not set yet, make sure all locked inodes + * are not in the AIL. + * If any are, set try_lock to be used later. + */ - buf_flags |= XBF_UNMAPPED; - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp, - &xfs_inode_buf_ops); - if (error) { - if (error == EAGAIN) { - ASSERT(buf_flags & XBF_TRYLOCK); - return error; + if (!try_lock) { + for (j = (i - 1); j >= 0 && !try_lock; j--) { + lp = (xfs_log_item_t *)ips[j]->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + try_lock++; + } + } } - if (error == EFSCORRUPTED && - (iget_flags & XFS_IGET_UNTRUSTED)) - return XFS_ERROR(EINVAL); - - xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); - return error; - } - - *bpp = bp; - *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); - return 0; -} - -/* - * Move inode type and inode format specific information from the - * on-disk inode to the in-core inode. For fifos, devs, and sockets - * this means set if_rdev to the proper value. For files, directories, - * and symlinks this means to bring in the in-line data or extent - * pointers. For a file in B-tree format, only the root is immediately - * brought in-core. The rest will be in-lined in if_extents when it - * is first referenced (see xfs_iread_extents()). 
- */ -STATIC int -xfs_iformat( - xfs_inode_t *ip, - xfs_dinode_t *dip) -{ - xfs_attr_shortform_t *atp; - int size; - int error = 0; - xfs_fsize_t di_size; - - if (unlikely(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks))) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", - (unsigned long long)ip->i_ino, - (int)(be32_to_cpu(dip->di_nextents) + - be16_to_cpu(dip->di_anextents)), - (unsigned long long) - be64_to_cpu(dip->di_nblocks)); - XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { - xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", - (unsigned long long)ip->i_ino, - dip->di_forkoff); - XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && - !ip->i_mount->m_rtdev_targp)) { - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, has realtime flag set.", - ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", - XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFIFO: - case S_IFCHR: - case S_IFBLK: - case S_IFSOCK: - if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { - XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - ip->i_d.di_size = 0; - ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); - break; + /* + * If any of the previous locks we have locked is in the AIL, + * we must TRY to get the second and subsequent locks. If + * we can't get any, we must release all we have + * and try again. + */ - case S_IFREG: - case S_IFLNK: - case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: + if (try_lock) { + /* try_lock must be 0 if i is 0. */ /* - * no local regular files yet + * try_lock means we have an inode locked + * that is in the AIL. */ - if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (local format for regular file).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(4)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } + ASSERT(i != 0); + if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { + attempts++; + + /* + * Unlock all previous guys and try again. + * xfs_iunlock will try to push the tail + * if the inode is in the AIL. + */ + + for(j = i - 1; j >= 0; j--) { + + /* + * Check to see if we've already + * unlocked this one. + * Not the first one going back, + * and the inode ptr is the same. 
+ */ + if ((j != (i - 1)) && ips[j] == + ips[j+1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } - di_size = be64_to_cpu(dip->di_size); - if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %Ld for local inode).", - (unsigned long long) ip->i_ino, - (long long) di_size); - XFS_CORRUPTION_ERROR("xfs_iformat(5)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif + } + i = 0; + try_lock = 0; + goto again; } - - size = (int)di_size; - error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); - break; - default: - XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + } else { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); } - break; - - default: - XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - if (error) { - return error; } - if (!XFS_DFORK_Q(dip)) - return 0; - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); - - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); - size = be16_to_cpu(atp->hdr.totsize); - - if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad attr fork size %Ld).", - (unsigned long long) ip->i_ino, - (long long) size); - XFS_CORRUPTION_ERROR("xfs_iformat(8)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); - break; - default: - error = XFS_ERROR(EFSCORRUPTED); - break; - } - if (error) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - xfs_idestroy_fork(ip, XFS_DATA_FORK); +#ifdef DEBUG + if (attempts) { + if (attempts < 5) xfs_small_retries++; + else if (attempts < 100) xfs_middle_retries++; + else xfs_lots_retries++; + } else { + xfs_locked_n++; } - return error; +#endif } /* - * The file is in-lined in the on-disk inode. - * If it fits into if_inline_data, then copy - * it there, otherwise allocate a buffer for it - * and copy the data there. Either way, set - * if_data to point at the data. - * If we allocate a buffer for the data, make - * sure that its size is a multiple of 4 and - * record the real size in i_real_bytes. + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. */ -STATIC int -xfs_iformat_local( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork, - int size) +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) { - xfs_ifork_t *ifp; - int real_size; - - /* - * If the size is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. 
- */ - if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d).", - (unsigned long long) ip->i_ino, size, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); - XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - ifp = XFS_IFORK_PTR(ip, whichfork); - real_size = 0; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { - real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } - ifp->if_bytes = size; - ifp->if_real_bytes = real_size; - if (size) - memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFINLINE; - return 0; -} + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; -/* - * The file consists of a set of extents all - * of which fit into the on-disk inode. - * If there are few enough extents to fit into - * the if_inline_ext, then copy them there. - * Otherwise allocate a buffer for them and copy - * them into it. Either way, set if_extents - * to point at the extents. - */ -STATIC int -xfs_iformat_extents( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) -{ - xfs_bmbt_rec_t *dp; - xfs_ifork_t *ifp; - int nex; - int size; - int i; - - ifp = XFS_IFORK_PTR(ip, whichfork); - nex = XFS_DFORK_NEXTENTS(dip, whichfork); - size = nex * (uint)sizeof(xfs_bmbt_rec_t); - - /* - * If the number of extents is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. - */ - if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); - XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - ifp->if_real_bytes = 0; - if (nex == 0) - ifp->if_u1.if_extents = NULL; - else if (nex <= XFS_INLINE_EXTS) - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - else - xfs_iext_add(ifp, 0, nex); - - ifp->if_bytes = size; - if (size) { - dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); - xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); - for (i = 0; i < nex; i++, dp++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - ep->l0 = get_unaligned_be64(&dp->l0); - ep->l1 = get_unaligned_be64(&dp->l1); - } - XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); - if (whichfork != XFS_DATA_FORK || - XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) - if (unlikely(xfs_check_nostate_extents( - ifp, 0, nex))) { - XFS_ERROR_REPORT("xfs_iformat_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - } - ifp->if_flags |= XFS_IFEXTENTS; - return 0; -} + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + ASSERT(ip0->i_ino != ip1->i_ino); -/* - * The file has too many extents to fit into - * the inode, so they are in B-tree format. - * Allocate a buffer for the root of the B-tree - * and copy the root into it. The i_extents - * field will remain NULL until all of the - * extents are read in (when they are needed). 
- */ -STATIC int -xfs_iformat_btree( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_bmdr_block_t *dfp; - xfs_ifork_t *ifp; - /* REFERENCED */ - int nrecs; - int size; - - ifp = XFS_IFORK_PTR(ip, whichfork); - dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(mp, dfp); - nrecs = be16_to_cpu(dfp->bb_numrecs); - - /* - * blow out if -- fork has less extents than can fit in - * fork (fork shouldn't be a btree format), root btree - * block has more records than can fit into the fork, - * or the number of extents is greater than the number of - * blocks. - */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || - XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_warn(mp, "corrupt inode %Lu (btree).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - mp, dip); - return XFS_ERROR(EFSCORRUPTED); - } - - ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); - ASSERT(ifp->if_broot != NULL); - /* - * Copy and convert from the on-disk structure - * to the in-memory structure. - */ - xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), - ifp->if_broot, size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFBROOT; + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; + } - return 0; -} + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); -STATIC void -xfs_dinode_from_disk( - xfs_icdinode_t *to, - xfs_dinode_t *from) -{ - to->di_magic = be16_to_cpu(from->di_magic); - to->di_mode = be16_to_cpu(from->di_mode); - to->di_version = from ->di_version; - to->di_format = from->di_format; - to->di_onlink = be16_to_cpu(from->di_onlink); - to->di_uid = be32_to_cpu(from->di_uid); - to->di_gid = be32_to_cpu(from->di_gid); - to->di_nlink = be32_to_cpu(from->di_nlink); - to->di_projid_lo = be16_to_cpu(from->di_projid_lo); - to->di_projid_hi = be16_to_cpu(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_flushiter = be16_to_cpu(from->di_flushiter); - to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); - to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); - to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); - to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); - to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); - to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); - to->di_size = be64_to_cpu(from->di_size); - to->di_nblocks = be64_to_cpu(from->di_nblocks); - to->di_extsize = be32_to_cpu(from->di_extsize); - to->di_nextents = be32_to_cpu(from->di_nextents); - to->di_anextents = be16_to_cpu(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = be32_to_cpu(from->di_dmevmask); - to->di_dmstate = be16_to_cpu(from->di_dmstate); - to->di_flags = be16_to_cpu(from->di_flags); - to->di_gen = be32_to_cpu(from->di_gen); - - if (to->di_version == 3) { - to->di_changecount = be64_to_cpu(from->di_changecount); - to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); - to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); - to->di_flags2 = be64_to_cpu(from->di_flags2); - to->di_ino = be64_to_cpu(from->di_ino); - to->di_lsn = be64_to_cpu(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); + 
/* + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. + */ + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); } } + void -xfs_dinode_to_disk( - xfs_dinode_t *to, - xfs_icdinode_t *from) +__xfs_iflock( + struct xfs_inode *ip) { - to->di_magic = cpu_to_be16(from->di_magic); - to->di_mode = cpu_to_be16(from->di_mode); - to->di_version = from ->di_version; - to->di_format = from->di_format; - to->di_onlink = cpu_to_be16(from->di_onlink); - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); - to->di_nlink = cpu_to_be32(from->di_nlink); - to->di_projid_lo = cpu_to_be16(from->di_projid_lo); - to->di_projid_hi = cpu_to_be16(from->di_projid_hi); - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); - to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); - to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); - to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); - to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); - to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); - to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); - to->di_size = cpu_to_be64(from->di_size); - to->di_nblocks = cpu_to_be64(from->di_nblocks); - to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); - to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; - to->di_dmevmask = cpu_to_be32(from->di_dmevmask); - to->di_dmstate = cpu_to_be16(from->di_dmstate); - to->di_flags = cpu_to_be16(from->di_flags); - to->di_gen = cpu_to_be32(from->di_gen); - - if (from->di_version == 3) { - to->di_changecount = cpu_to_be64(from->di_changecount); - to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); - to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); - to->di_flags2 = cpu_to_be64(from->di_flags2); - to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); - memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); - uuid_copy(&to->di_uuid, &from->di_uuid); - to->di_flushiter = 0; - } else { - to->di_flushiter = cpu_to_be16(from->di_flushiter); - } + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); } STATIC uint @@ -987,235 +567,50 @@ xfs_dic2xflags( (XFS_DFORK_Q(dip) ? 
XFS_XFLAG_HASATTR : 0);
 }
 
-static bool
-xfs_dinode_verify(
-	struct xfs_mount	*mp,
-	struct xfs_inode	*ip,
-	struct xfs_dinode	*dip)
-{
-	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
-		return false;
-
-	/* only version 3 or greater inodes are extensively verified here */
-	if (dip->di_version < 3)
-		return true;
-
-	if (!xfs_sb_version_hascrc(&mp->m_sb))
-		return false;
-	if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
-			      offsetof(struct xfs_dinode, di_crc)))
-		return false;
-	if (be64_to_cpu(dip->di_ino) != ip->i_ino)
-		return false;
-	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
-		return false;
-	return true;
-}
-
-void
-xfs_dinode_calc_crc(
-	struct xfs_mount	*mp,
-	struct xfs_dinode	*dip)
-{
-	__uint32_t		crc;
-
-	if (dip->di_version < 3)
-		return;
-
-	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
-	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
-			      offsetof(struct xfs_dinode, di_crc));
-	dip->di_crc = xfs_end_cksum(crc);
-}
-
 /*
- * Read the disk inode attributes into the in-core inode structure.
- *
- * For version 5 superblocks, if we are initialising a new inode and we are not
- * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
- * inode core with a random generation number. If we are keeping inodes around,
- * we need to read the inode cluster to get the existing generation number off
- * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
- * format) then log recovery is dependent on the di_flushiter field being
- * initialised from the current on-disk value and hence we must also read the
- * inode off disk.
+ * Looks up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
  */
 int
-xfs_iread(
-	xfs_mount_t	*mp,
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	uint		iget_flags)
+xfs_lookup(
+	xfs_inode_t		*dp,
+	struct xfs_name		*name,
+	xfs_inode_t		**ipp,
+	struct xfs_name		*ci_name)
 {
-	xfs_buf_t	*bp;
-	xfs_dinode_t	*dip;
-	int		error;
-
-	/*
-	 * Fill in the location information in the in-core inode.
-	 */
-	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
-	if (error)
-		return error;
-
-	/* shortcut IO on inode allocation if possible */
-	if ((iget_flags & XFS_IGET_CREATE) &&
-	    xfs_sb_version_hascrc(&mp->m_sb) &&
-	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
-		/* initialise the on-disk inode core */
-		memset(&ip->i_d, 0, sizeof(ip->i_d));
-		ip->i_d.di_magic = XFS_DINODE_MAGIC;
-		ip->i_d.di_gen = prandom_u32();
-		if (xfs_sb_version_hascrc(&mp->m_sb)) {
-			ip->i_d.di_version = 3;
-			ip->i_d.di_ino = ip->i_ino;
-			uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
-		} else
-			ip->i_d.di_version = 2;
-		return 0;
-	}
-
-	/*
-	 * Get pointers to the on-disk inode and the buffer containing it.
- */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); - if (error) - return error; + xfs_ino_t inum; + int error; + uint lock_mode; - /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip, dip)) { - xfs_alert(mp, "%s: validation failed for inode %lld failed", - __func__, ip->i_ino); + trace_xfs_lookup(dp, name); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); - error = XFS_ERROR(EFSCORRUPTED); - goto out_brelse; - } + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); - /* - * If the on-disk inode is already linked to a directory - * entry, copy all of the inode into the in-core inode. - * xfs_iformat() handles copying in the inode format - * specific information. - * Otherwise, just get the truly permanent information. - */ - if (dip->di_mode) { - xfs_dinode_from_disk(&ip->i_d, dip); - error = xfs_iformat(ip, dip); - if (error) { -#ifdef DEBUG - xfs_alert(mp, "%s: xfs_iformat() returned error %d", - __func__, error); -#endif /* DEBUG */ - goto out_brelse; - } - } else { - /* - * Partial initialisation of the in-core inode. Just the bits - * that xfs_ialloc won't overwrite or relies on being correct. - */ - ip->i_d.di_magic = be16_to_cpu(dip->di_magic); - ip->i_d.di_version = dip->di_version; - ip->i_d.di_gen = be32_to_cpu(dip->di_gen); - ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); - - if (dip->di_version == 3) { - ip->i_d.di_ino = be64_to_cpu(dip->di_ino); - uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); - } + lock_mode = xfs_ilock_map_shared(dp); + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); + xfs_iunlock_map_shared(dp, lock_mode); - /* - * Make sure to pull in the mode here as well in - * case the inode is released without being used. - * This ensures that xfs_inactive() will see that - * the inode is already free and not try to mess - * with the uninitialized part of it. - */ - ip->i_d.di_mode = 0; - } - - /* - * The inode format changed when we moved the link count and - * made it 32 bits long. If this is an old format inode, - * convert it in memory to look like a new one. If it gets - * flushed to disk we will convert back before flushing or - * logging it. We zero out the new projid field and the old link - * count field. We'll handle clearing the pad field (the remains - * of the old uuid field) when we actually convert the inode to - * the new format. We don't change the version number so that we - * can distinguish this from a real new format inode. - */ - if (ip->i_d.di_version == 1) { - ip->i_d.di_nlink = ip->i_d.di_onlink; - ip->i_d.di_onlink = 0; - xfs_set_projid(ip, 0); - } + if (error) + goto out; - ip->i_delayed_blks = 0; + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); + if (error) + goto out_free_name; - /* - * Mark the buffer containing the inode as something to keep - * around for a while. This helps to keep recently accessed - * meta-data in-core longer. - */ - xfs_buf_set_ref(bp, XFS_INO_REF); + return 0; - /* - * Use xfs_trans_brelse() to release the buffer containing the on-disk - * inode, because it was acquired with xfs_trans_read_buf() in - * xfs_imap_to_bp() above. If tp is NULL, this is just a normal - * brelse(). If we're within a transaction, then xfs_trans_brelse() - * will only release the buffer if it is not dirty within the - * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be locking the - * new in-core inode before putting it in the cache where other - * processes can find it. 
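
Editor's note: xfs_lookup(), added in the hunks above, hands ownership of the case-insensitive match name to the caller: on a CI match, ci_name->name points at an allocated copy the caller must free. A small user-space sketch of that contract with hypothetical names (demo_ci_name, demo_lookup), not the kernel API:

#include <stdlib.h>
#include <string.h>
#include <strings.h>

struct demo_ci_name {
	char	*name;	/* allocated on a CI match, NULL on exact match */
};

static int demo_lookup(const char *want, const char *ondisk,
		       struct demo_ci_name *ci)
{
	ci->name = NULL;
	if (strcmp(want, ondisk) == 0)
		return 0;			/* exact match */
	if (strcasecmp(want, ondisk) == 0) {
		ci->name = strdup(ondisk);	/* caller must free */
		return ci->name ? 0 : -1;
	}
	return -1;				/* no match at all */
}

A caller that fails after a successful lookup must free ci->name on its error path, which is exactly what the out_free_name label in xfs_lookup() does.
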
Thus we don't have to worry about the inode
-	 * being changed just because we released the buffer.
-	 */
- out_brelse:
-	xfs_trans_brelse(tp, bp);
+out_free_name:
+	if (ci_name)
+		kmem_free(ci_name->name);
+out:
+	*ipp = NULL;
 	return error;
 }
 
 /*
- * Read in extents from a btree-format inode.
- * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
- */
-int
-xfs_iread_extents(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	int		whichfork)
-{
-	int		error;
-	xfs_ifork_t	*ifp;
-	xfs_extnum_t	nextents;
-
-	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
-		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
-				 ip->i_mount);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
-	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-
-	/*
-	 * We know that the size is valid (it's checked in iformat_btree)
-	 */
-	ifp->if_bytes = ifp->if_real_bytes = 0;
-	ifp->if_flags |= XFS_IFEXTENTS;
-	xfs_iext_add(ifp, 0, nextents);
-	error = xfs_bmap_read_extents(tp, ip, whichfork);
-	if (error) {
-		xfs_iext_destroy(ifp);
-		ifp->if_flags &= ~XFS_IFEXTENTS;
-		return error;
-	}
-	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
-	return 0;
-}
-
-/*
  * Allocate an inode on disk and return a copy of its in-core version.
  * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
  * appropriately within the inode.  The uid and gid for the inode are
@@ -1295,8 +690,8 @@ xfs_ialloc(
 	ip->i_d.di_onlink = 0;
 	ip->i_d.di_nlink = nlink;
 	ASSERT(ip->i_d.di_nlink == nlink);
-	ip->i_d.di_uid = current_fsuid();
-	ip->i_d.di_gid = current_fsgid();
+	ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
+	ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
 	xfs_set_projid(ip, prid);
 	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 
@@ -1335,7 +730,7 @@ xfs_ialloc(
 	 */
 	if ((irix_sgid_inherit) &&
 	    (ip->i_d.di_mode & S_ISGID) &&
-	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
+	    (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
 		ip->i_d.di_mode &= ~S_ISGID;
 	}
 
@@ -1467,6 +862,583 @@ xfs_ialloc(
 }
 
 /*
+ * Allocates a new inode from disk and returns a pointer to the
+ * incore copy. This routine will internally commit the current
+ * transaction and allocate a new one if the Space Manager needed
+ * to do an allocation to replenish the inode free-list.
+ *
+ * This routine is designed to be called from xfs_create and
+ * xfs_create_dir.
+ *
+ */
+int
+xfs_dir_ialloc(
+	xfs_trans_t	**tpp,		/* input: current transaction;
+					   output: may be a new transaction. */
+	xfs_inode_t	*dp,		/* directory within which to allocate
+					   the inode. */
+	umode_t		mode,
+	xfs_nlink_t	nlink,
+	xfs_dev_t	rdev,
+	prid_t		prid,		/* project id */
+	int		okalloc,	/* ok to allocate new space */
+	xfs_inode_t	**ipp,		/* pointer to inode; it will be
+					   locked. */
+	int		*committed)
+
+{
+	xfs_trans_t	*tp;
+	xfs_trans_t	*ntp;
+	xfs_inode_t	*ip;
+	xfs_buf_t	*ialloc_context = NULL;
+	int		code;
+	void		*dqinfo;
+	uint		tflags;
+
+	tp = *tpp;
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+	/*
+	 * xfs_ialloc will return a pointer to an incore inode if
+	 * the Space Manager has an available inode on the free
+	 * list. Otherwise, it will do an allocation and replenish
+	 * the freelist.  Since we can only do one allocation per
+	 * transaction without deadlocks, we will need to commit the
+	 * current transaction and start a new one.  We will then
+	 * need to call xfs_ialloc again to get the inode.
+ * + * If xfs_ialloc did an allocation to replenish the freelist, + * it returns the bp containing the head of the freelist as + * ialloc_context. We will hold a lock on it across the + * transaction commit so that no other process can steal + * the inode(s) that we've just allocated. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, + &ialloc_context, &ip); + + /* + * Return an error if we were unable to allocate a new inode. + * This should only happen if we run out of space on disk or + * encounter a disk error. + */ + if (code) { + *ipp = NULL; + return code; + } + if (!ialloc_context && !ip) { + *ipp = NULL; + return XFS_ERROR(ENOSPC); + } + + /* + * If the AGI buffer is non-NULL, then we were unable to get an + * inode in one operation. We need to commit the current + * transaction and call xfs_ialloc() again. It is guaranteed + * to succeed the second time. + */ + if (ialloc_context) { + struct xfs_trans_res tres; + + /* + * Normally, xfs_trans_commit releases all the locks. + * We call bhold to hang on to the ialloc_context across + * the commit. Holding this buffer prevents any other + * processes from doing any allocations in this + * allocation group. + */ + xfs_trans_bhold(tp, ialloc_context); + /* + * Save the log reservation so we can use + * them in the next transaction. + */ + tres.tr_logres = xfs_trans_get_log_res(tp); + tres.tr_logcount = xfs_trans_get_log_count(tp); + + /* + * We want the quota changes to be associated with the next + * transaction, NOT this one. So, detach the dqinfo from this + * and attach it to the next transaction. + */ + dqinfo = NULL; + tflags = 0; + if (tp->t_dqinfo) { + dqinfo = (void *)tp->t_dqinfo; + tp->t_dqinfo = NULL; + tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; + tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); + } + + ntp = xfs_trans_dup(tp); + code = xfs_trans_commit(tp, 0); + tp = ntp; + if (committed != NULL) { + *committed = 1; + } + /* + * If we get an error during the commit processing, + * release the buffer that is still held and return + * to the caller. + */ + if (code) { + xfs_buf_relse(ialloc_context); + if (dqinfo) { + tp->t_dqinfo = dqinfo; + xfs_trans_free_dqinfo(tp); + } + *tpp = ntp; + *ipp = NULL; + return code; + } + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + code = xfs_trans_reserve(tp, &tres, 0, 0); + + /* + * Re-attach the quota info that we detached from prev trx. + */ + if (dqinfo) { + tp->t_dqinfo = dqinfo; + tp->t_flags |= tflags; + } + + if (code) { + xfs_buf_relse(ialloc_context); + *tpp = ntp; + *ipp = NULL; + return code; + } + xfs_trans_bjoin(tp, ialloc_context); + + /* + * Call ialloc again. Since we've locked out all + * other allocations in this allocation group, + * this call should always succeed. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, + okalloc, &ialloc_context, &ip); + + /* + * If we get an error at this point, return to the caller + * so that the current transaction can be aborted. + */ + if (code) { + *tpp = tp; + *ipp = NULL; + return code; + } + ASSERT(!ialloc_context && ip); + + } else { + if (committed != NULL) + *committed = 0; + } + + *ipp = ip; + *tpp = tp; + + return 0; +} + +/* + * Decrement the link count on an inode & log the change. + * If this causes the link count to go to zero, initiate the + * logging activity required to truncate a file. 
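
Editor's note: the xfs_dir_ialloc() body above implements a roll-and-retry shape: if the first allocation attempt had to refill a free list, the refill buffer is held across a transaction commit, re-joined to a fresh transaction, and the allocation retried (the retry is then guaranteed to succeed because no one else can touch the held free list). A self-contained user-space model of just that shape; every name here (demo_*) is hypothetical:

#include <stdio.h>

static int demo_txn = 1;	/* current "transaction" id */
static int demo_freelist;	/* free inodes available for handout */

/* Hand out an inode if one is free; otherwise ask for a refill. */
static void demo_ialloc(int *need_refill)
{
	if (demo_freelist > 0) {
		demo_freelist--;
		*need_refill = 0;
		return;
	}
	*need_refill = 1;
}

static void demo_dir_ialloc(void)
{
	int need_refill;

	demo_ialloc(&need_refill);
	if (need_refill) {
		/* Analogue of: bhold the free-list buffer, commit the
		 * transaction, reserve a new one, bjoin the buffer.
		 * The held "free list" cannot be stolen, so the retry
		 * below cannot fail. */
		demo_freelist = 64;	/* refill */
		demo_txn++;		/* roll to a new transaction */
		demo_ialloc(&need_refill);
	}
}

int main(void)
{
	demo_dir_ialloc();
	printf("allocated in txn %d, %d left on free list\n",
	       demo_txn, demo_freelist);
	return 0;
}
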
+ */ +int /* error */ +xfs_droplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + int error; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT (ip->i_d.di_nlink > 0); + ip->i_d.di_nlink--; + drop_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = 0; + if (ip->i_d.di_nlink == 0) { + /* + * We're dropping the last link to this file. + * Move the on-disk inode to the AGI unlinked list. + * From xfs_inactive() we will pull the inode from + * the list and free it. + */ + error = xfs_iunlink(tp, ip); + } + return error; +} + +/* + * This gets called when the inode's version needs to be changed from 1 to 2. + * Currently this happens when the nlink field overflows the old 16-bit value + * or when chproj is called to change the project for the first time. + * As a side effect the superblock version will also get rev'd + * to contain the NLINK bit. + */ +void +xfs_bump_ino_vers2( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_d.di_version == 1); + + ip->i_d.di_version = 2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + mp = tp->t_mountp; + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasnlink(&mp->m_sb)) { + xfs_sb_version_addnlink(&mp->m_sb); + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, XFS_SB_VERSIONNUM); + } else { + spin_unlock(&mp->m_sb_lock); + } + } + /* Caller must log the inode */ +} + +/* + * Increment the link count on an inode & log the change. + */ +int +xfs_bumplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT(ip->i_d.di_nlink > 0); + ip->i_d.di_nlink++; + inc_nlink(VFS_I(ip)); + if ((ip->i_d.di_version == 1) && + (ip->i_d.di_nlink > XFS_MAXLINK_1)) { + /* + * The inode has increased its number of links beyond + * what can fit in an old format inode. It now needs + * to be converted to a version 2 inode with a 32 bit + * link count. If this is the first inode in the file + * system to do this, then we need to bump the superblock + * version number as well. + */ + xfs_bump_ino_vers2(tp, ip); + } + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_create( + xfs_inode_t *dp, + struct xfs_name *name, + umode_t mode, + xfs_dev_t rdev, + xfs_inode_t **ipp) +{ + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res tres; + uint resblks; + + trace_xfs_create(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = xfs_get_projid(dp); + else + prid = XFS_PROJID_DEFAULT; + + /* + * Make sure that we have allocated dquot(s) on disk. 
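
Editor's note: xfs_bumplink()/xfs_bump_ino_vers2() above convert an inode from the old format to the new one the first time its link count outgrows the old field. A minimal model of that trigger, assuming a 16-bit limit for the old format (hypothetical names, not the on-disk constants):

#include <assert.h>
#include <stdint.h>

#define DEMO_MAXLINK_V1	65535u		/* fits the old 16-bit field */

struct demo_inode {
	int		version;	/* 1 = 16-bit nlink, 2 = 32-bit */
	uint32_t	nlink;
};

static void demo_bumplink(struct demo_inode *ip)
{
	assert(ip->nlink > 0);
	ip->nlink++;
	/* Convert before the old-format count could overflow; in XFS
	 * proper this also flips the superblock NLINK feature bit. */
	if (ip->version == 1 && ip->nlink > DEMO_MAXLINK_V1)
		ip->version = 2;
}
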
+	 */
+	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+					xfs_kgid_to_gid(current_fsgid()), prid,
+					XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+					&udqp, &gdqp, &pdqp);
+	if (error)
+		return error;
+
+	if (is_dir) {
+		rdev = 0;
+		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+		tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
+		tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+	} else {
+		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+		tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
+		tres.tr_logcount = XFS_CREATE_LOG_COUNT;
+		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+	}
+
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+	/*
+	 * Initially assume that the file does not exist and
+	 * reserve the resources for that case.  If that is not
+	 * the case we'll drop the one we have and get a more
+	 * appropriate transaction later.
+	 */
+	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+	error = xfs_trans_reserve(tp, &tres, resblks, 0);
+	if (error == ENOSPC) {
+		/* flush outstanding delalloc blocks and retry */
+		xfs_flush_inodes(mp);
+		error = xfs_trans_reserve(tp, &tres, resblks, 0);
+	}
+	if (error == ENOSPC) {
+		/* No space at all so try a "no-allocation" reservation */
+		resblks = 0;
+		error = xfs_trans_reserve(tp, &tres, 0, 0);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto out_trans_cancel;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+	unlock_dp_on_error = true;
+
+	xfs_bmap_init(&free_list, &first_block);
+
+	/*
+	 * Reserve disk quota and the inode.
+	 */
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+						pdqp, resblks, 1, 0);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_dir_canenter(tp, dp, name, resblks);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * A newly created regular or special file just has one directory
+	 * entry pointing to it, but a directory also has the "." entry
+	 * pointing to itself.
+	 */
+	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
+			       prid, resblks > 0, &ip, &committed);
+	if (error) {
+		if (error == ENOSPC)
+			goto out_trans_cancel;
+		goto out_trans_abort;
+	}
+
+	/*
+	 * Now we join the directory inode to the transaction.  We do not do it
+	 * earlier because xfs_dir_ialloc might commit the previous transaction
+	 * (and release all the locks).  An error from here on will result in
+	 * the transaction cancel unlocking dp so don't do it explicitly in the
+	 * error path.
+	 */
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	unlock_dp_on_error = false;
+
+	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
+					&first_block, &free_list, resblks ?
+					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	if (error) {
+		ASSERT(error != ENOSPC);
+		goto out_trans_abort;
+	}
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	if (is_dir) {
+		error = xfs_dir_init(tp, ip, dp);
+		if (error)
+			goto out_bmap_cancel;
+
+		error = xfs_bumplink(tp, dp);
+		if (error)
+			goto out_bmap_cancel;
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * create transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+		xfs_trans_set_sync(tp);
+
+	/*
+	 * Attach the dquot(s) to the inodes and modify them incore.
+	 * These ids of the inode couldn't have changed since the new
+	 * inode has been locked ever since it was created.
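
Editor's note: xfs_create() above degrades its transaction reservation in steps: full reservation, then a retry after flushing delalloc blocks, then a zero-block "no-allocation" reservation. A self-contained user-space sketch of that fallback shape (demo_* names are hypothetical, and the intermediate flush step is omitted):

#include <errno.h>

static unsigned int demo_free_blocks = 10;

static int demo_reserve(unsigned int blocks)
{
	if (blocks > demo_free_blocks)
		return -ENOSPC;
	demo_free_blocks -= blocks;
	return 0;
}

static int demo_reserve_with_fallback(unsigned int wanted,
				      unsigned int *granted)
{
	int err = demo_reserve(wanted);

	if (err == -ENOSPC) {
		wanted = 0;		/* "no-allocation" reservation */
		err = demo_reserve(wanted);
	}
	if (!err)
		*granted = wanted;	/* 0 tells callers: don't allocate */
	return err;
}

The granted size is remembered (resblks in the hunk above) so later steps know whether they may allocate new space or must operate within existing blocks.
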
+ */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + if (ip) + IRELE(ip); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +int +xfs_link( + xfs_inode_t *tdp, + xfs_inode_t *sip, + struct xfs_name *target_name) +{ + xfs_mount_t *mp = tdp->i_mount; + xfs_trans_t *tp; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int resblks; + + trace_xfs_link(tdp, target_name); + + ASSERT(!S_ISDIR(sip->i_d.di_mode)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(sip, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(tdp, 0); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow hard link + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; + } + + error = xfs_dir_canenter(tp, tdp, target_name, resblks); + if (error) + goto error_return; + + xfs_bmap_init(&free_list, &first_block); + + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto abort_return; + xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); + + error = xfs_bumplink(tp, sip); + if (error) + goto abort_return; + + /* + * If this is a synchronous mount, make sure that the + * link transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto abort_return; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +/* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and * data fork, and does not modify the inode size, which is left to the caller. 
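
Editor's note: xfs_create() and xfs_link() above both unwind failures through a ladder of goto labels, each releasing exactly the state acquired before the failing step, in reverse order. A compilable user-space model of that ladder; malloc/free stand in for transaction and inode lifetimes, and all names are hypothetical:

#include <stdlib.h>

static int demo_create(int fail_at)
{
	char *txn = NULL, *inode = NULL;
	int err = -1;

	txn = malloc(16);		/* "allocate transaction" */
	if (!txn)
		goto out;
	if (fail_at == 1)		/* "reservation failed" */
		goto out_cancel;
	inode = malloc(16);		/* "allocate inode" */
	if (!inode)
		goto out_cancel;
	if (fail_at == 2)		/* "directory insert failed" */
		goto out_release_inode;

	free(txn);			/* really: commit the transaction */
	free(inode);			/* really: hand back to the caller */
	return 0;

out_release_inode:
	free(inode);
out_cancel:
	free(txn);
out:
	return err;
}
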
@@ -1576,10 +1548,7 @@ xfs_itruncate_extents( * reference that we gained in xfs_trans_dup() */ xfs_log_ticket_put(tp->t_ticket); - error = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out; } @@ -1605,6 +1574,271 @@ out_bmap_cancel: goto out; } +int +xfs_release( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + int error; + + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + return 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + if (!XFS_FORCED_SHUTDOWN(mp)) { + int truncated; + + /* + * If we are using filestreams, and we have an unlinked + * file that we are processing the last close on, then nothing + * will be able to reopen and write to this file. Purge this + * inode from the filestreams cache so that it doesn't delay + * teardown of the inode. + */ + if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticeable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { + error = -filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } + } + } + + if (ip->i_d.di_nlink == 0) + return 0; + + if (xfs_can_free_eofblocks(ip, false)) { + + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * outstanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, true); + if (error && error != EAGAIN) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } + return 0; +} + +/* + * xfs_inactive + * + * This is called when the vnode reference count for the vnode + * goes to zero. If the file has been unlinked, then it must + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. 
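
Editor's note: the XFS_ITRUNCATED check in xfs_release() above relies on an atomic read-and-clear (xfs_iflags_test_and_clear) so that only one closer performs the early writeout. A user-space analogue with C11 atomics; the flag bit and names are hypothetical:

#include <stdatomic.h>
#include <stdbool.h>

#define DEMO_ITRUNCATED	(1u << 0)

static atomic_uint demo_iflags;

static bool demo_test_and_clear(unsigned int flag)
{
	/* Atomically clear the flag and report whether it was set. */
	unsigned int old = atomic_fetch_and(&demo_iflags, ~flag);

	return (old & flag) != 0;
}
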
+ */ +int +xfs_inactive( + xfs_inode_t *ip) +{ + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_trans_res *resp; + int error; + int truncate = 0; + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) { + ASSERT(ip->i_df.if_real_bytes == 0); + ASSERT(ip->i_df.if_broot_bytes == 0); + return VN_INACTIVE_CACHE; + } + + mp = ip->i_mount; + + error = 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + goto out; + + if (ip->i_d.di_nlink != 0) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. + */ + if (xfs_can_free_eofblocks(ip, true)) { + error = xfs_free_eofblocks(mp, ip, false); + if (error) + return VN_INACTIVE_CACHE; + } + goto out; + } + + if (S_ISREG(ip->i_d.di_mode) && + (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || + ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + truncate = 1; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return VN_INACTIVE_CACHE; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + resp = (truncate || S_ISLNK(ip->i_d.di_mode)) ? + &M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree; + + error = xfs_trans_reserve(tp, resp, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return VN_INACTIVE_CACHE; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + if (S_ISLNK(ip->i_d.di_mode)) { + error = xfs_inactive_symlink(ip, &tp); + if (error) + goto out_cancel; + } else if (truncate) { + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) + goto out_cancel; + + ASSERT(ip->i_d.di_nextents == 0); + } + + /* + * If there are attributes associated with the file then blow them away + * now. The code calls a routine that recursively deconstructs the + * attribute fork. We need to just commit the current transaction + * because we can't use it for xfs_attr_inactive(). + */ + if (ip->i_d.di_anextents > 0) { + ASSERT(ip->i_d.di_forkoff != 0); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + error = xfs_attr_inactive(ip); + if (error) + goto out; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + ASSERT(ip->i_d.di_anextents == 0); + + /* + * Free the inode. + */ + xfs_bmap_init(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. + */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_notice(mp, "%s: xfs_ifree returned error %d", + __func__, error); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } else { + /* + * Credit the quota account(s). The inode is gone. + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. 
There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + __func__, error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_notice(mp, "%s: xfs_trans_commit returned error %d", + __func__, error); + } + + /* + * Release the dquots held by inode, if any. + */ + xfs_qm_dqdetach(ip); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + return VN_INACTIVE_CACHE; +out_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + goto out_unlock; +} + /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1861,7 +2095,7 @@ xfs_iunlink_remove( } /* - * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * A big issue when freeing the inode cluster is that we _cannot_ skip any * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ @@ -2094,272 +2328,6 @@ xfs_ifree( } /* - * Reallocate the space for if_broot based on the number of records - * being added or deleted as indicated in rec_diff. Move the records - * and pointers in if_broot to fit the new size. When shrinking this - * will eliminate holes between the records and pointers created by - * the caller. When growing this will create holes to be filled in - * by the caller. - * - * The caller must not request to add more records than would fit in - * the on-disk inode root. If the if_broot is currently NULL, then - * if we adding records one will be allocated. The caller must also - * not request that the number of records go below zero, although - * it can go to zero. - * - * ip -- the inode whose if_broot area is changing - * ext_diff -- the change in the number of records, positive or negative, - * requested for the if_broot array. - */ -void -xfs_iroot_realloc( - xfs_inode_t *ip, - int rec_diff, - int whichfork) -{ - struct xfs_mount *mp = ip->i_mount; - int cur_max; - xfs_ifork_t *ifp; - struct xfs_btree_block *new_broot; - int new_max; - size_t new_size; - char *np; - char *op; - - /* - * Handle the degenerate case quietly. - */ - if (rec_diff == 0) { - return; - } - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (rec_diff > 0) { - /* - * If there wasn't any memory allocated before, just - * allocate it now and get out. - */ - if (ifp->if_broot_bytes == 0) { - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); - ifp->if_broot_bytes = (int)new_size; - return; - } - - /* - * If there is already an existing if_broot, then we need - * to realloc() it and shift the pointers to their new - * location. The records don't change location because - * they are kept butted up against the btree block header. 
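
Editor's note: the comment above describes placing an inode whose link count reached zero onto a list in the AGI so that log recovery can free it after a crash. A toy model of that bucketed unlinked chain, assuming a 64-bucket hash and a zero end-of-chain marker (both hypothetical here):

#include <stdint.h>

#define DEMO_UNLINKED_BUCKETS	64
#define DEMO_NULLINO		0	/* end-of-chain marker */

struct demo_inode {
	uint64_t	ino;
	uint64_t	next_unlinked;	/* on-disk chain pointer */
};

static uint64_t demo_bucket[DEMO_UNLINKED_BUCKETS];

static void demo_iunlink(struct demo_inode *ip)
{
	unsigned int b = ip->ino % DEMO_UNLINKED_BUCKETS;

	ip->next_unlinked = demo_bucket[b];	/* chain to the old head */
	demo_bucket[b] = ip->ino;		/* become the bucket head */
}
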
- */ - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), - KM_SLEEP | KM_NOFS); - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - (int)new_size); - ifp->if_broot_bytes = (int)new_size; - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); - memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); - return; - } - - /* - * rec_diff is less than 0. In this case, we are shrinking the - * if_broot buffer. It must already exist. If we go to zero - * records, just get rid of the root and clear the status bit. - */ - ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); - new_max = cur_max + rec_diff; - ASSERT(new_max >= 0); - if (new_max > 0) - new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); - else - new_size = 0; - if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); - /* - * First copy over the btree block header. - */ - memcpy(new_broot, ifp->if_broot, - XFS_BMBT_BLOCK_LEN(ip->i_mount)); - } else { - new_broot = NULL; - ifp->if_flags &= ~XFS_IFBROOT; - } - - /* - * Only copy the records and pointers if there are any. - */ - if (new_max > 0) { - /* - * First copy the records. - */ - op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); - np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); - - /* - * Then copy the pointers. - */ - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, - (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); - } - kmem_free(ifp->if_broot); - ifp->if_broot = new_broot; - ifp->if_broot_bytes = (int)new_size; - if (ifp->if_broot) - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); - return; -} - - -/* - * This is called when the amount of space needed for if_data - * is increased or decreased. The change in size is indicated by - * the number of bytes that need to be added or deleted in the - * byte_diff parameter. - * - * If the amount of space needed has decreased below the size of the - * inline buffer, then switch to using the inline buffer. Otherwise, - * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer - * to what is needed. - * - * ip -- the inode whose if_data area is changing - * byte_diff -- the change in the number of bytes, positive or negative, - * requested for the if_data array. - */ -void -xfs_idata_realloc( - xfs_inode_t *ip, - int byte_diff, - int whichfork) -{ - xfs_ifork_t *ifp; - int new_size; - int real_size; - - if (byte_diff == 0) { - return; - } - - ifp = XFS_IFORK_PTR(ip, whichfork); - new_size = (int)ifp->if_bytes + byte_diff; - ASSERT(new_size >= 0); - - if (new_size == 0) { - if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - kmem_free(ifp->if_u1.if_data); - } - ifp->if_u1.if_data = NULL; - real_size = 0; - } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { - /* - * If the valid extents/data can fit in if_inline_ext/data, - * copy them from the malloc'd vector and free it. 
- */ - if (ifp->if_u1.if_data == NULL) { - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - ASSERT(ifp->if_real_bytes != 0); - memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, - new_size); - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } - real_size = 0; - } else { - /* - * Stuck with malloc/realloc. - * For inline data, the underlying buffer must be - * a multiple of 4 bytes in size so that it can be - * logged and stay on word boundaries. We enforce - * that here. - */ - real_size = roundup(new_size, 4); - if (ifp->if_u1.if_data == NULL) { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, - KM_SLEEP | KM_NOFS); - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - /* - * Only do the realloc if the underlying size - * is really changing. - */ - if (ifp->if_real_bytes != real_size) { - ifp->if_u1.if_data = - kmem_realloc(ifp->if_u1.if_data, - real_size, - ifp->if_real_bytes, - KM_SLEEP | KM_NOFS); - } - } else { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, - KM_SLEEP | KM_NOFS); - memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, - ifp->if_bytes); - } - } - ifp->if_real_bytes = real_size; - ifp->if_bytes = new_size; - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); -} - -void -xfs_idestroy_fork( - xfs_inode_t *ip, - int whichfork) -{ - xfs_ifork_t *ifp; - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (ifp->if_broot != NULL) { - kmem_free(ifp->if_broot); - ifp->if_broot = NULL; - } - - /* - * If the format is local, then we can't have an extents - * array so just look for an inline data array. If we're - * not local then we may or may not have an extents list, - * so check and free it up if we do. - */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && - (ifp->if_u1.if_data != NULL)) { - ASSERT(ifp->if_real_bytes != 0); - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - ifp->if_real_bytes = 0; - } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && - ((ifp->if_flags & XFS_IFEXTIREC) || - ((ifp->if_u1.if_extents != NULL) && - (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { - ASSERT(ifp->if_real_bytes != 0); - xfs_iext_destroy(ifp); - } - ASSERT(ifp->if_u1.if_extents == NULL || - ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); - ASSERT(ifp->if_real_bytes == 0); - if (whichfork == XFS_ATTR_FORK) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - } -} - -/* * This is called to unpin an inode. The caller must have the inode locked * in at least shared mode so that the buffer cannot be subsequently pinned * once someone is waiting for it to be unpinned. @@ -2402,162 +2370,471 @@ xfs_iunpin_wait( __xfs_iunpin_wait(ip); } -/* - * xfs_iextents_copy() - * - * This is called to copy the REAL extents (as opposed to the delayed - * allocation extents) from the inode into the given buffer. It - * returns the number of bytes copied into the buffer. - * - * If there are no delayed allocation extents, then we can just - * memcpy() the extents into the buffer. Otherwise, we need to - * examine each extent in turn and skip those which are delayed. 
- */ int -xfs_iextents_copy( - xfs_inode_t *ip, - xfs_bmbt_rec_t *dp, - int whichfork) +xfs_remove( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t *ip) { - int copied; - int i; - xfs_ifork_t *ifp; - int nrecs; - xfs_fsblock_t start_block; + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp = NULL; + int is_dir = S_ISDIR(ip->i_d.di_mode); + int error = 0; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int link_zero; + uint resblks; + uint log_count; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(ifp->if_bytes > 0); + trace_xfs_remove(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(dp, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(ip, 0); + if (error) + goto std_return; - nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); - ASSERT(nrecs > 0); + if (is_dir) { + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + log_count = XFS_DEFAULT_LOG_COUNT; + } else { + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + log_count = XFS_REMOVE_LOG_COUNT; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* - * There are some delayed allocation extents in the - * inode, so copy the extents one at a time and skip - * the delayed ones. There must be at least one - * non-delayed extent. + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. */ - copied = 0; - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - start_block = xfs_bmbt_get_startblock(ep); - if (isnullstartblock(start_block)) { - /* - * It's a delayed allocation extent, so skip it. - */ - continue; + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + } + if (error) { + ASSERT(error != ENOSPC); + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; } + if (!xfs_dir_isempty(ip)) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + } + + xfs_bmap_init(&free_list, &first_block); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) { + ASSERT(error != ENOENT); + goto out_bmap_cancel; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - /* Translate to on disk format */ - put_unaligned(cpu_to_be64(ep->l0), &dp->l0); - put_unaligned(cpu_to_be64(ep->l1), &dp->l1); - dp++; - copied++; + if (is_dir) { + /* + * Drop the link from ip's "..". + */ + error = xfs_droplink(tp, dp); + if (error) + goto out_bmap_cancel; + + /* + * Drop the "." link from ip to self. + */ + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. 
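
Editor's note: the rmdir validation in xfs_remove() above treats a directory as empty only when its link count is exactly 2 (its own "." entry plus the parent's entry for it) and the directory contains no other names. A minimal sketch of that check with hypothetical names:

#include <errno.h>

struct demo_dir {
	unsigned int	nlink;		/* 2 == "." plus the parent entry */
	unsigned int	nentries;	/* names besides "." and ".." */
};

static int demo_check_rmdir(const struct demo_dir *dp)
{
	/* Subdirectories would raise nlink above 2 via their ".." links,
	 * so either test failing means the directory is not empty. */
	if (dp->nlink != 2 || dp->nentries != 0)
		return -ENOTEMPTY;
	return 0;
}
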
For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); } - ASSERT(copied != 0); - xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); - return (copied * (uint)sizeof(xfs_bmbt_rec_t)); + /* + * Drop the link from dp to ip. + */ + error = xfs_droplink(tp, ip); + if (error) + goto out_bmap_cancel; + + /* + * Determine if this is the last link while + * we are in the transaction. + */ + link_zero = (ip->i_d.di_nlink == 0); + + /* + * If this is a synchronous mount, make sure that the + * remove transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto std_return; + + /* + * If we are using filestreams, kill the stream association. + * If the file is still open it may get a new one but that + * will get killed on last close in xfs_close() so we don't + * have to worry about that. + */ + if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; } /* - * Each of the following cases stores data into the same region - * of the on-disk inode, so only one of them can be valid at - * any given time. While it is possible to have conflicting formats - * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is - * in EXTENTS format, this can only happen when the fork has - * changed formats after being modified but before being flushed. - * In these cases, the format always takes precedence, because the - * format indicates the current state of the fork. + * Enter all inodes for a rename transaction into a sorted array. */ -/*ARGSUSED*/ STATIC void -xfs_iflush_fork( - xfs_inode_t *ip, - xfs_dinode_t *dip, - xfs_inode_log_item_t *iip, - int whichfork, - xfs_buf_t *bp) -{ - char *cp; - xfs_ifork_t *ifp; - xfs_mount_t *mp; - static const short brootflag[2] = - { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; - static const short dataflag[2] = - { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; - static const short extflag[2] = - { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; - - if (!iip) - return; - ifp = XFS_IFORK_PTR(ip, whichfork); - /* - * This can happen if we gave up in iformat in an error path, - * for the attribute fork. - */ - if (!ifp) { - ASSERT(whichfork == XFS_ATTR_FORK); - return; - } - cp = XFS_DFORK_PTR(dip, whichfork); - mp = ip->i_mount; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { - case XFS_DINODE_FMT_LOCAL: - if ((iip->ili_fields & dataflag[whichfork]) && - (ifp->if_bytes > 0)) { - ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); - memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); - } - break; +xfs_sort_for_rename( + xfs_inode_t *dp1, /* in: old (source) directory inode */ + xfs_inode_t *dp2, /* in: new (target) directory inode */ + xfs_inode_t *ip1, /* in: inode of old entry */ + xfs_inode_t *ip2, /* in: inode of new entry, if it + already exists, NULL otherwise. 
*/ + xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ + int *num_inodes) /* out: number of inodes in array */ +{ + xfs_inode_t *temp; + int i, j; - case XFS_DINODE_FMT_EXTENTS: - ASSERT((ifp->if_flags & XFS_IFEXTENTS) || - !(iip->ili_fields & extflag[whichfork])); - if ((iip->ili_fields & extflag[whichfork]) && - (ifp->if_bytes > 0)) { - ASSERT(xfs_iext_get_ext(ifp, 0)); - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); - (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, - whichfork); - } - break; + /* + * i_tab contains a list of pointers to inodes. We initialize + * the table here & we'll sort it. We will then use it to + * order the acquisition of the inode locks. + * + * Note that the table may contain duplicates. e.g., dp1 == dp2. + */ + i_tab[0] = dp1; + i_tab[1] = dp2; + i_tab[2] = ip1; + if (ip2) { + *num_inodes = 4; + i_tab[3] = ip2; + } else { + *num_inodes = 3; + i_tab[3] = NULL; + } - case XFS_DINODE_FMT_BTREE: - if ((iip->ili_fields & brootflag[whichfork]) && - (ifp->if_broot_bytes > 0)) { - ASSERT(ifp->if_broot != NULL); - ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); - xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, - (xfs_bmdr_block_t *)cp, - XFS_DFORK_SIZE(dip, mp, whichfork)); + /* + * Sort the elements via bubble sort. (Remember, there are at + * most 4 elements to sort, so this is adequate.) + */ + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { + temp = i_tab[j]; + i_tab[j] = i_tab[j-1]; + i_tab[j-1] = temp; + } } - break; + } +} + +/* + * xfs_rename + */ +int +xfs_rename( + xfs_inode_t *src_dp, + struct xfs_name *src_name, + xfs_inode_t *src_ip, + xfs_inode_t *target_dp, + struct xfs_name *target_name, + xfs_inode_t *target_ip) +{ + xfs_trans_t *tp = NULL; + xfs_mount_t *mp = src_dp->i_mount; + int new_parent; /* moving to a new dir */ + int src_is_directory; /* src_name is a directory */ + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + xfs_inode_t *inodes[4]; + int spaceres; + int num_inodes; + + trace_xfs_rename(src_dp, target_dp, src_name, target_name); + + new_parent = (src_dp != target_dp); + src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + inodes, &num_inodes); + + xfs_bmap_init(&free_list, &first_block); + tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + if (error == ENOSPC) { + spaceres = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + } + if (error) { + xfs_trans_cancel(tp, 0); + goto std_return; + } + + /* + * Attach the dquots to the inodes + */ + error = xfs_qm_vop_rename_dqattach(inodes); + if (error) { + xfs_trans_cancel(tp, cancel_flags); + goto std_return; + } + + /* + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + */ + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * Join all the inodes to the transaction. From this point on, + * we can rely on either trans_commit or trans_cancel to unlock + * them. 
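
Editor's note: xfs_sort_for_rename() above deliberately uses a bubble sort; with at most four participants (two directories, the source inode, and an optional existing target) anything fancier would be wasted, and the sorted order then fixes the lock acquisition order for the rename. A user-space copy of the idea (demo_* names are hypothetical):

#include <stdint.h>

struct demo_inode {
	uint64_t	ino;
};

static void demo_sort_for_rename(struct demo_inode **tab, int n)
{
	int i, j;

	for (i = 0; i < n; i++) {
		for (j = 1; j < n; j++) {
			if (tab[j]->ino < tab[j - 1]->ino) {
				struct demo_inode *tmp = tab[j];

				tab[j] = tab[j - 1];
				tab[j - 1] = tmp;
			}
		}
	}
}

Duplicates (e.g. source and target directory being the same inode) are harmless: equal keys never swap, and the later lock loop can skip repeats.
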
+ */ + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; + } + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); + if (error) + goto error_return; + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, &first_block, + &free_list, spaceres); + if (error == ENOSPC) + goto error_return; + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - case XFS_DINODE_FMT_DEV: - if (iip->ili_fields & XFS_ILOG_DEV) { - ASSERT(whichfork == XFS_DATA_FORK); - xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); + if (new_parent && src_is_directory) { + error = xfs_bumplink(tp, target_dp); + if (error) + goto abort_return; + } + } else { /* target_ip != NULL */ + /* + * If target exists and it's a directory, check that both + * target and source are directories and that target can be + * destroyed, or that neither is a directory. + */ + if (S_ISDIR(target_ip->i_d.di_mode)) { + /* + * Make sure target dir is empty. + */ + if (!(xfs_dir_isempty(target_ip)) || + (target_ip->i_d.di_nlink > 2)) { + error = XFS_ERROR(EEXIST); + goto error_return; + } } - break; - case XFS_DINODE_FMT_UUID: - if (iip->ili_fields & XFS_ILOG_UUID) { - ASSERT(whichfork == XFS_DATA_FORK); - memcpy(XFS_DFORK_DPTR(dip), - &ip->i_df.if_u2.if_uuid, - sizeof(uuid_t)); + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; } - break; + } /* target_ip != NULL */ - default: - ASSERT(0); - break; + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, + &first_block, &free_list, spaceres); + ASSERT(error != EEXIST); + if (error) + goto abort_return; + } + + /* + * We always want to hit the ctime on the source inode. 
+ * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. + */ + error = xfs_droplink(tp, src_dp); + if (error) + goto abort_return; + } + + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * rename transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + goto std_return; + } + + /* + * trans_commit will unlock src_ip, target_ip & decrement + * the vnode references. + */ + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; } STATIC int @@ -2816,7 +3093,6 @@ abort_out: return error; } - STATIC int xfs_iflush_int( struct xfs_inode *ip, @@ -3004,1072 +3280,3 @@ xfs_iflush_int( corrupt_out: return XFS_ERROR(EFSCORRUPTED); } - -/* - * Return a pointer to the extent record at file index idx. - */ -xfs_bmbt_rec_host_t * -xfs_iext_get_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx) /* index of target extent */ -{ - ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - - if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { - return ifp->if_u1.if_ext_irec->er_extbuf; - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_ext_irec_t *erp; /* irec pointer */ - int erp_idx = 0; /* irec index */ - xfs_extnum_t page_idx = idx; /* ext index in target list */ - - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - return &erp->er_extbuf[page_idx]; - } else if (ifp->if_bytes) { - return &ifp->if_u1.if_extents[idx]; - } else { - return NULL; - } -} - -/* - * Insert new item(s) into the extent records for incore inode - * fork 'ifp'. 'count' new items are inserted at index 'idx'. - */ -void -xfs_iext_insert( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* starting index of new items */ - xfs_extnum_t count, /* number of inserted items */ - xfs_bmbt_irec_t *new, /* items to insert */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? 
ip->i_afp : &ip->i_df; - xfs_extnum_t i; /* extent record index */ - - trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); - - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - xfs_iext_add(ifp, idx, count); - for (i = idx; i < idx + count; i++, new++) - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be increased. The ext_diff parameter stores the - * number of new extents being added and the idx parameter contains - * the extent index where the new extents will be added. If the new - * extents are being appended, then we just need to (re)allocate and - * initialize the space. Otherwise, if the new extents are being - * inserted into the middle of the existing entries, a bit more work - * is required to make room for the new extents to be inserted. The - * caller is responsible for filling in the new extent entries upon - * return. - */ -void -xfs_iext_add( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin adding exts */ - int ext_diff) /* number of extents to add */ -{ - int byte_diff; /* new bytes being added */ - int new_size; /* size of extents after adding */ - xfs_extnum_t nextents; /* number of extents in file */ - - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT((idx >= 0) && (idx <= nextents)); - byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); - new_size = ifp->if_bytes + byte_diff; - /* - * If the new number of extents (nextents + ext_diff) - * fits inside the inode, then continue to use the inline - * extent buffer. - */ - if (nextents + ext_diff <= XFS_INLINE_EXTS) { - if (idx < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], - &ifp->if_u2.if_inline_ext[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); - } - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; - } - /* - * Otherwise use a linear (direct) extent list. - * If the extents are currently inside the inode, - * xfs_iext_realloc_direct will switch us from - * inline to direct extent allocation mode. 
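
To make the three storage regimes that the removed xfs_iext_add() switches between concrete, here is a minimal standalone sketch (illustrative only, not kernel code; it assumes the 16-byte xfs_bmbt_rec_t record size, which gives 256 records per 4k buffer):

#include <stdio.h>

#define XFS_IEXT_BUFSZ  4096
#define EXT_REC_SIZE    16      /* assumed sizeof(xfs_bmbt_rec_t): two 64-bit words */
#define XFS_INLINE_EXTS 2
#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / EXT_REC_SIZE)         /* 256 */

static const char *iext_storage(int nextents)
{
        if (nextents <= XFS_INLINE_EXTS)
                return "inline (if_inline_ext inside the fork)";
        if (nextents <= XFS_LINEAR_EXTS)
                return "direct (one contiguous if_extents buffer)";
        return "indirection array (list of 4k extent pages)";
}

int main(void)
{
        int counts[] = { 1, 2, 3, 256, 257, 100000 };

        for (size_t i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
                printf("%6d extents -> %s\n", counts[i], iext_storage(counts[i]));
        return 0;
}
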
- */ - else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, new_size); - if (idx < nextents) { - memmove(&ifp->if_u1.if_extents[idx + ext_diff], - &ifp->if_u1.if_extents[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); - } - } - /* Indirection array */ - else { - xfs_ext_irec_t *erp; - int erp_idx = 0; - int page_idx = idx; - - ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); - if (ifp->if_flags & XFS_IFEXTIREC) { - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); - } else { - xfs_iext_irec_init(ifp); - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = ifp->if_u1.if_ext_irec; - } - /* Extents fit in target extent page */ - if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { - if (page_idx < erp->er_extcount) { - memmove(&erp->er_extbuf[page_idx + ext_diff], - &erp->er_extbuf[page_idx], - (erp->er_extcount - page_idx) * - sizeof(xfs_bmbt_rec_t)); - memset(&erp->er_extbuf[page_idx], 0, byte_diff); - } - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - } - /* Insert a new extent page */ - else if (erp) { - xfs_iext_add_indirect_multi(ifp, - erp_idx, page_idx, ext_diff); - } - /* - * If extent(s) are being appended to the last page in - * the indirection array and the new extent(s) don't fit - * in the page, then erp is NULL and erp_idx is set to - * the next index needed in the indirection array. - */ - else { - int count = ext_diff; - - while (count) { - erp = xfs_iext_irec_new(ifp, erp_idx); - erp->er_extcount = count; - count -= MIN(count, (int)XFS_LINEAR_EXTS); - if (count) { - erp_idx++; - } - } - } - } - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being added to the indirection - * array and the new extents do not fit in the target extent list. The - * erp_idx parameter contains the irec index for the target extent list - * in the indirection array, and the idx parameter contains the extent - * index within the list. The number of extents being added is stored - * in the count parameter. 
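
The page-splitting arithmetic that xfs_iext_add_indirect_multi() performs below reduces to "top up the current page, then carve the remainder into whole new pages". A standalone sketch of just that loop (invented names, not kernel code; 256 records per page as above):

#include <stdio.h>

#define XFS_LINEAR_EXTS 256     /* records per 4k page, assuming 16-byte records */
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

/* The same arithmetic as the while (ext_cnt) loop below: top up the
 * current page, then carve the remainder into whole new pages. */
static void distribute(int room_in_current_page, int count)
{
        int take = MIN(count, room_in_current_page);

        printf("append %d to current page\n", take);
        count -= take;
        while (count) {
                take = MIN(count, XFS_LINEAR_EXTS);
                printf("new page gets %d\n", take);
                count -= take;
        }
}

int main(void)
{
        distribute(10, 600);    /* prints 10, then 256, 256, 78 */
        return 0;
}
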
- * - * |-------| |-------| - * | | | | idx - number of extents before idx - * | idx | | count | - * | | | | count - number of extents being inserted at idx - * |-------| |-------| - * | count | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_add_indirect_multi( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* target extent irec index */ - xfs_extnum_t idx, /* index within target list */ - int count) /* new extents being added */ -{ - int byte_diff; /* new bytes being added */ - xfs_ext_irec_t *erp; /* pointer to irec entry */ - xfs_extnum_t ext_diff; /* number of extents to add */ - xfs_extnum_t ext_cnt; /* new extents still needed */ - xfs_extnum_t nex2; /* extents after idx + count */ - xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ - int nlists; /* number of irec's (lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex2 = erp->er_extcount - idx; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * Save second part of target extent list - * (all extents past */ - if (nex2) { - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); - memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); - erp->er_extcount -= nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); - memset(&erp->er_extbuf[idx], 0, byte_diff); - } - - /* - * Add the new extents to the end of the target - * list, then allocate new irec record(s) and - * extent buffer(s) as needed to store the rest - * of the new extents. - */ - ext_cnt = count; - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); - if (ext_diff) { - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - while (ext_cnt) { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); - erp->er_extcount = ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - - /* Add nex2 extents back to indirection array */ - if (nex2) { - xfs_extnum_t ext_avail; - int i; - - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - i = 0; - /* - * If nex2 extents fit in the current page, append - * nex2_ep after the new extents. - */ - if (nex2 <= ext_avail) { - i = erp->er_extcount; - } - /* - * Otherwise, check if space is available in the - * next page. - */ - else if ((erp_idx < nlists - 1) && - (nex2 <= (ext_avail = XFS_LINEAR_EXTS - - ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { - erp_idx++; - erp++; - /* Create a hole for nex2 extents */ - memmove(&erp->er_extbuf[nex2], erp->er_extbuf, - erp->er_extcount * sizeof(xfs_bmbt_rec_t)); - } - /* - * Final choice, create a new extent page for - * nex2 extents. - */ - else { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - } - memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); - kmem_free(nex2_ep); - erp->er_extcount += nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); - } -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be decreased. The ext_diff parameter stores the - * number of extents to be removed and the idx parameter contains - * the extent index where the extents will be removed from. - * - * If the amount of space needed has decreased below the linear - * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous - * extent array. 
Otherwise, use kmem_realloc() to adjust the - * size to what is needed. - */ -void -xfs_iext_remove( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff, /* number of extents to remove */ - int state) /* type of extent conversion */ -{ - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - trace_xfs_iext_remove(ip, idx, state, _RET_IP_); - - ASSERT(ext_diff > 0); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_iext_remove_indirect(ifp, idx, ext_diff); - } else if (ifp->if_real_bytes) { - xfs_iext_remove_direct(ifp, idx, ext_diff); - } else { - xfs_iext_remove_inline(ifp, idx, ext_diff); - } - ifp->if_bytes = new_size; -} - -/* - * This removes ext_diff extents from the inline buffer, beginning - * at extent index idx. - */ -void -xfs_iext_remove_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - int nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - ASSERT(idx < XFS_INLINE_EXTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(((nextents - ext_diff) > 0) && - (nextents - ext_diff) < XFS_INLINE_EXTS); - - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx], - &ifp->if_u2.if_inline_ext[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - } else { - memset(&ifp->if_u2.if_inline_ext[idx], 0, - ext_diff * sizeof(xfs_bmbt_rec_t)); - } -} - -/* - * This removes ext_diff extents from a linear (direct) extent list, - * beginning at extent index idx. If the extents are being removed - * from the end of the list (ie. truncate) then we just need to re- - * allocate the list to remove the extra space. Otherwise, if the - * extents are being removed from the middle of the existing extent - * entries, then we first need to move the extent records beginning - * at idx + ext_diff up in the list to overwrite the records being - * removed, then remove the extra space via kmem_realloc. - */ -void -xfs_iext_remove_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - new_size = ifp->if_bytes - - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - return; - } - /* Move extents up in the list (if needed) */ - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u1.if_extents[idx], - &ifp->if_u1.if_extents[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - } - memset(&ifp->if_u1.if_extents[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - /* - * Reallocate the direct extent list. If the extents - * will fit inside the inode then xfs_iext_realloc_direct - * will switch from direct to inline extent allocation - * mode for us. 
- */ - xfs_iext_realloc_direct(ifp, new_size); - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being removed from the - * indirection array and the extents being removed span multiple extent - * buffers. The idx parameter contains the file extent index where we - * want to begin removing extents, and the count parameter contains - * how many extents need to be removed. - * - * |-------| |-------| - * | nex1 | | | nex1 - number of extents before idx - * |-------| | count | - * | | | | count - number of extents being removed at idx - * | count | |-------| - * | | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_remove_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing extents */ - int count) /* number of extents to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int erp_idx = 0; /* indirection array index */ - xfs_extnum_t ext_cnt; /* extents left to remove */ - xfs_extnum_t ext_diff; /* extents to remove in current list */ - xfs_extnum_t nex1; /* number of extents before idx */ - xfs_extnum_t nex2; /* extents after idx + count */ - int page_idx = idx; /* index in target extent list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - ASSERT(erp != NULL); - nex1 = page_idx; - ext_cnt = count; - while (ext_cnt) { - nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); - ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); - /* - * Check for deletion of entire list; - * xfs_iext_irec_remove() updates extent offsets. - */ - if (ext_diff == erp->er_extcount) { - xfs_iext_irec_remove(ifp, erp_idx); - ext_cnt -= ext_diff; - nex1 = 0; - if (ext_cnt) { - ASSERT(erp_idx < ifp->if_real_bytes / - XFS_IEXT_BUFSZ); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex1 = 0; - continue; - } else { - break; - } - } - /* Move extents up (if needed) */ - if (nex2) { - memmove(&erp->er_extbuf[nex1], - &erp->er_extbuf[nex1 + ext_diff], - nex2 * sizeof(xfs_bmbt_rec_t)); - } - /* Zero out rest of page */ - memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - - ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); - /* Update remaining counters */ - erp->er_extcount -= ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); - ext_cnt -= ext_diff; - nex1 = 0; - erp_idx++; - erp++; - } - ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact(ifp); -} - -/* - * Create, destroy, or resize a linear (direct) block of extents. 
- */ -void -xfs_iext_realloc_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new size of extents */ -{ - int rnew_size; /* real new size of extents */ - - rnew_size = new_size; - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || - ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && - (new_size != ifp->if_real_bytes))); - - /* Free extent records */ - if (new_size == 0) { - xfs_iext_destroy(ifp); - } - /* Resize direct extent list and zero any new bytes */ - else if (ifp->if_real_bytes) { - /* Check if extents will fit inside the inode */ - if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { - xfs_iext_direct_to_inline(ifp, new_size / - (uint)sizeof(xfs_bmbt_rec_t)); - ifp->if_bytes = new_size; - return; - } - if (!is_power_of_2(new_size)){ - rnew_size = roundup_pow_of_two(new_size); - } - if (rnew_size != ifp->if_real_bytes) { - ifp->if_u1.if_extents = - kmem_realloc(ifp->if_u1.if_extents, - rnew_size, - ifp->if_real_bytes, KM_NOFS); - } - if (rnew_size > ifp->if_real_bytes) { - memset(&ifp->if_u1.if_extents[ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)], 0, - rnew_size - ifp->if_real_bytes); - } - } - /* - * Switch from the inline extent buffer to a direct - * extent list. Be sure to include the inline extent - * bytes in new_size. - */ - else { - new_size += ifp->if_bytes; - if (!is_power_of_2(new_size)) { - rnew_size = roundup_pow_of_two(new_size); - } - xfs_iext_inline_to_direct(ifp, rnew_size); - } - ifp->if_real_bytes = rnew_size; - ifp->if_bytes = new_size; -} - -/* - * Switch from linear (direct) extent records to inline buffer. - */ -void -xfs_iext_direct_to_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t nextents) /* number of extents in file */ -{ - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(nextents <= XFS_INLINE_EXTS); - /* - * The inline buffer was zeroed when we switched - * from inline to direct extent allocation mode, - * so we don't need to clear it here. - */ - memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, - nextents * sizeof(xfs_bmbt_rec_t)); - kmem_free(ifp->if_u1.if_extents); - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; -} - -/* - * Switch from inline buffer to linear (direct) extent records. - * new_size should already be rounded up to the next power of 2 - * by the caller (when appropriate), so use new_size as it is. - * However, since new_size may be rounded up, we can't update - * if_bytes here. It is the caller's responsibility to update - * if_bytes upon return. - */ -void -xfs_iext_inline_to_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* number of extents in file */ -{ - ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); - memset(ifp->if_u1.if_extents, 0, new_size); - if (ifp->if_bytes) { - memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, - ifp->if_bytes); - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_real_bytes = new_size; -} - -/* - * Resize an extent indirection array to new_size bytes. 
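
The removed xfs_iext_realloc_direct() grows the direct list in power-of-two steps so that repeated single-extent additions do not trigger a reallocation every time. A standalone sketch of the rounding (the kernel uses is_power_of_2()/roundup_pow_of_two(); this loop is an illustrative equivalent, not the kernel implementation):

#include <stdio.h>

/* Round v up to the next power of two (v > 0). */
static unsigned long roundup_pow2(unsigned long v)
{
        unsigned long p = 1;

        while (p < v)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned long sizes[] = { 16, 48, 100, 1024, 1040 };

        for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("request %4lu bytes -> allocate %4lu\n",
                       sizes[i], roundup_pow2(sizes[i]));
        return 0;
}
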
- */ -STATIC void -xfs_iext_realloc_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new indirection array size */ -{ - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); - ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else { - ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) - kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_NOFS); - } -} - -/* - * Switch from indirection array to linear (direct) extent allocations. - */ -STATIC void -xfs_iext_indirect_to_direct( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - int size; /* size of file extents */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nextents <= XFS_LINEAR_EXTS); - size = nextents * sizeof(xfs_bmbt_rec_t); - - xfs_iext_irec_compact_pages(ifp); - ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); - - ep = ifp->if_u1.if_ext_irec->er_extbuf; - kmem_free(ifp->if_u1.if_ext_irec); - ifp->if_flags &= ~XFS_IFEXTIREC; - ifp->if_u1.if_extents = ep; - ifp->if_bytes = size; - if (nextents < XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, size); - } -} - -/* - * Free incore file extents. - */ -void -xfs_iext_destroy( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - if (ifp->if_flags & XFS_IFEXTIREC) { - int erp_idx; - int nlists; - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { - xfs_iext_irec_remove(ifp, erp_idx); - } - ifp->if_flags &= ~XFS_IFEXTIREC; - } else if (ifp->if_real_bytes) { - kmem_free(ifp->if_u1.if_extents); - } else if (ifp->if_bytes) { - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_u1.if_extents = NULL; - ifp->if_real_bytes = 0; - ifp->if_bytes = 0; -} - -/* - * Return a pointer to the extent record for file system block bno. 
- */ -xfs_bmbt_rec_host_t * /* pointer to found extent record */ -xfs_iext_bno_to_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - xfs_extnum_t *idxp) /* index of target extent */ -{ - xfs_bmbt_rec_host_t *base; /* pointer to first extent */ - xfs_filblks_t blockcount = 0; /* number of blocks in extent */ - xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - int high; /* upper boundary in search */ - xfs_extnum_t idx = 0; /* index of target extent */ - int low; /* lower boundary in search */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff = 0; /* start offset of extent */ - - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *idxp = 0; - return NULL; - } - low = 0; - if (ifp->if_flags & XFS_IFEXTIREC) { - /* Find target extent list */ - int erp_idx = 0; - erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); - base = erp->er_extbuf; - high = erp->er_extcount - 1; - } else { - base = ifp->if_u1.if_extents; - high = nextents - 1; - } - /* Binary search extent records */ - while (low <= high) { - idx = (low + high) >> 1; - ep = base + idx; - startoff = xfs_bmbt_get_startoff(ep); - blockcount = xfs_bmbt_get_blockcount(ep); - if (bno < startoff) { - high = idx - 1; - } else if (bno >= startoff + blockcount) { - low = idx + 1; - } else { - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - *idxp = idx; - return ep; - } - } - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - if (bno >= startoff + blockcount) { - if (++idx == nextents) { - ep = NULL; - } else { - ep = xfs_iext_get_ext(ifp, idx); - } - } - *idxp = idx; - return ep; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record for filesystem block bno. Store the index of the - * target irec in *erp_idxp. - */ -xfs_ext_irec_t * /* pointer to found extent record */ -xfs_iext_bno_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - int *erp_idxp) /* irec index of target ext list */ -{ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - xfs_ext_irec_t *erp_next; /* next indirection array entry */ - int erp_idx; /* indirection array index */ - int nlists; /* number of extent irec's (lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; - if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { - high = erp_idx - 1; - } else if (erp_next && bno >= - xfs_bmbt_get_startoff(erp_next->er_extbuf)) { - low = erp_idx + 1; - } else { - break; - } - } - *erp_idxp = erp_idx; - return erp; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record at file extent index *idxp. Store the index of the - * target irec in *erp_idxp and store the page index of the target - * extent record in *idxp. 
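
All three removed lookup helpers here share one binary-search shape: first locate the right 4k page (in the indirection case), then bisect the sorted records inside it. A self-contained sketch of the inner search (simplified types, not kernel code):

#include <stdio.h>

/* Simplified extent record: covers [startoff, startoff + blockcount). */
struct ext { unsigned long startoff, blockcount; };

/* Same shape as the inner loop of the removed xfs_iext_bno_to_ext():
 * returns the index of the extent covering bno, or -1 for a hole. */
static int ext_search(const struct ext *base, int n, unsigned long bno)
{
        int low = 0, high = n - 1;

        while (low <= high) {
                int idx = (low + high) >> 1;

                if (bno < base[idx].startoff)
                        high = idx - 1;
                else if (bno >= base[idx].startoff + base[idx].blockcount)
                        low = idx + 1;
                else
                        return idx;
        }
        return -1;
}

int main(void)
{
        struct ext map[] = { { 0, 4 }, { 8, 2 }, { 20, 16 } };

        printf("%d %d %d\n",
               ext_search(map, 3, 2),   /* 0: inside the first extent */
               ext_search(map, 3, 9),   /* 1: inside the second */
               ext_search(map, 3, 5));  /* -1: hole between extents */
        return 0;
}
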
- */ -xfs_ext_irec_t * -xfs_iext_idx_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t *idxp, /* extent index (file -> page) */ - int *erp_idxp, /* pointer to target irec */ - int realloc) /* new bytes were just added */ -{ - xfs_ext_irec_t *prev; /* pointer to previous irec */ - xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ - int erp_idx; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - xfs_extnum_t page_idx = *idxp; /* extent index in target list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - - /* Binary search extent irec's */ - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - prev = erp_idx > 0 ? erp - 1 : NULL; - if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && - realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { - high = erp_idx - 1; - } else if (page_idx > erp->er_extoff + erp->er_extcount || - (page_idx == erp->er_extoff + erp->er_extcount && - !realloc)) { - low = erp_idx + 1; - } else if (page_idx == erp->er_extoff + erp->er_extcount && - erp->er_extcount == XFS_LINEAR_EXTS) { - ASSERT(realloc); - page_idx = 0; - erp_idx++; - erp = erp_idx < nlists ? erp + 1 : NULL; - break; - } else { - page_idx -= erp->er_extoff; - break; - } - } - *idxp = page_idx; - *erp_idxp = erp_idx; - return(erp); -} - -/* - * Allocate and initialize an indirection array once the space needed - * for incore extents increases above XFS_IEXT_BUFSZ. - */ -void -xfs_iext_irec_init( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nextents <= XFS_LINEAR_EXTS); - - erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); - - if (nextents == 0) { - ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - } else if (!ifp->if_real_bytes) { - xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); - } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { - xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); - } - erp->er_extbuf = ifp->if_u1.if_extents; - erp->er_extcount = nextents; - erp->er_extoff = 0; - - ifp->if_flags |= XFS_IFEXTIREC; - ifp->if_real_bytes = XFS_IEXT_BUFSZ; - ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); - ifp->if_u1.if_ext_irec = erp; - - return; -} - -/* - * Allocate and initialize a new entry in the indirection array. - */ -xfs_ext_irec_t * -xfs_iext_irec_new( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* index for new irec */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* Resize indirection array */ - xfs_iext_realloc_indirect(ifp, ++nlists * - sizeof(xfs_ext_irec_t)); - /* - * Move records down in the array so the - * new page can use erp_idx. 
- */ - erp = ifp->if_u1.if_ext_irec; - for (i = nlists - 1; i > erp_idx; i--) { - memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); - } - ASSERT(i == erp_idx); - - /* Initialize new extent record */ - erp = ifp->if_u1.if_ext_irec; - erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; - memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); - erp[erp_idx].er_extcount = 0; - erp[erp_idx].er_extoff = erp_idx > 0 ? - erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; - return (&erp[erp_idx]); -} - -/* - * Remove a record from the indirection array. - */ -void -xfs_iext_irec_remove( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* irec index to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - if (erp->er_extbuf) { - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, - -erp->er_extcount); - kmem_free(erp->er_extbuf); - } - /* Compact extent records */ - erp = ifp->if_u1.if_ext_irec; - for (i = erp_idx; i < nlists - 1; i++) { - memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); - } - /* - * Manually free the last extent record from the indirection - * array. A call to xfs_iext_realloc_indirect() with a size - * of zero would result in a call to xfs_iext_destroy() which - * would in turn call this function again, creating a nasty - * infinite loop. - */ - if (--nlists) { - xfs_iext_realloc_indirect(ifp, - nlists * sizeof(xfs_ext_irec_t)); - } else { - kmem_free(ifp->if_u1.if_ext_irec); - } - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; -} - -/* - * This is called to clean up large amounts of unused memory allocated - * by the indirection array. Before compacting anything though, verify - * that the indirection array is still needed and switch back to the - * linear extent list (or even the inline buffer) if possible. The - * compaction policy is as follows: - * - * Full Compaction: Extents fit into a single page (or inline buffer) - * Partial Compaction: Extents occupy less than 50% of allocated space - * No Compaction: Extents occupy at least 50% of allocated space - */ -void -xfs_iext_irec_compact( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - - if (nextents == 0) { - xfs_iext_destroy(ifp); - } else if (nextents <= XFS_INLINE_EXTS) { - xfs_iext_indirect_to_direct(ifp); - xfs_iext_direct_to_inline(ifp, nextents); - } else if (nextents <= XFS_LINEAR_EXTS) { - xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { - xfs_iext_irec_compact_pages(ifp); - } -} - -/* - * Combine extents from neighboring extent pages. 
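
The compaction policy quoted above is easy to test in isolation. A standalone sketch of the decision (illustrative constants as before; the nextents == 0 case, where the kernel destroys the fork outright, is omitted):

#include <stdio.h>

#define XFS_INLINE_EXTS 2
#define XFS_LINEAR_EXTS 256     /* per 4k page, assuming 16-byte records */

static const char *compaction(int nextents, int npages)
{
        if (nextents <= XFS_INLINE_EXTS)
                return "full: back to the inline buffer";
        if (nextents <= XFS_LINEAR_EXTS)
                return "full: back to a single direct list";
        if (nextents < (npages * XFS_LINEAR_EXTS) / 2)
                return "partial: merge neighbouring pages";
        return "none: at least 50% of the space is used";
}

int main(void)
{
        printf("%s\n", compaction(2, 1));       /* fits inline */
        printf("%s\n", compaction(300, 4));     /* 300 < 512: partial */
        printf("%s\n", compaction(700, 4));     /* 700 >= 512: none */
        return 0;
}
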
- */ -void -xfs_iext_irec_compact_pages( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ - int erp_idx = 0; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - while (erp_idx < nlists - 1) { - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp + 1; - if (erp_next->er_extcount <= - (XFS_LINEAR_EXTS - erp->er_extcount)) { - memcpy(&erp->er_extbuf[erp->er_extcount], - erp_next->er_extbuf, erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += erp_next->er_extcount; - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - } else { - erp_idx++; - } - } -} - -/* - * This is called to update the er_extoff field in the indirection - * array when extents have been added or removed from one of the - * extent lists. erp_idx contains the irec index to begin updating - * at and ext_diff contains the number of extents that were added - * or removed. - */ -void -xfs_iext_irec_update_extoffs( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* irec index to update */ - int ext_diff) /* number of new extents */ -{ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (i = erp_idx; i < nlists; i++) { - ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; - } -} - -/* - * Test whether it is appropriate to check an inode for and free post EOF - * blocks. The 'force' parameter determines whether we should also consider - * regular files that are marked preallocated or append-only. - */ -bool -xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) -{ - /* prealloc/delalloc exists only on regular files */ - if (!S_ISREG(ip->i_d.di_mode)) - return false; - - /* - * Zero sized files with no cached pages and delalloc blocks will not - * have speculative prealloc/delalloc blocks to remove. - */ - if (VFS_I(ip)->i_size == 0 && - VN_CACHED(VFS_I(ip)) == 0 && - ip->i_delayed_blks == 0) - return false; - - /* If we haven't read in the extent list, then don't do it now. */ - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) - return false; - - /* - * Do not free real preallocated or append-only files unless the file - * has delalloc blocks and we are forced to remove them. - */ - if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) - if (!force || ip->i_delayed_blks == 0) - return false; - - return true; -} - diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b55fd347ab5b..4a91358c1470 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -18,225 +18,15 @@ #ifndef __XFS_INODE_H__ #define __XFS_INODE_H__ -struct posix_acl; -struct xfs_dinode; -struct xfs_inode; - -/* - * Fork identifiers. - */ -#define XFS_DATA_FORK 0 -#define XFS_ATTR_FORK 1 - -/* - * The following xfs_ext_irec_t struct introduces a second (top) level - * to the in-core extent allocation scheme. These structs are allocated - * in a contiguous block, creating an indirection array where each entry - * (irec) contains a pointer to a buffer of in-core extent records which - * it manages. 
Each extent buffer is 4k in size, since 4k is the system - * page size on Linux i386 and systems with larger page sizes don't seem - * to gain much, if anything, by using their native page size as the - * extent buffer size. Also, using 4k extent buffers everywhere provides - * a consistent interface for CXFS across different platforms. - * - * There is currently no limit on the number of irec's (extent lists) - * allowed, so heavily fragmented files may require an indirection array - * which spans multiple system pages of memory. The number of extents - * which would require this amount of contiguous memory is very large - * and should not cause problems in the foreseeable future. However, - * if the memory needed for the contiguous array ever becomes a problem, - * it is possible that a third level of indirection may be required. - */ -typedef struct xfs_ext_irec { - xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ - xfs_extnum_t er_extoff; /* extent offset in file */ - xfs_extnum_t er_extcount; /* number of extents in page/block */ -} xfs_ext_irec_t; +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" /* - * File incore extent information, present for each of data & attr forks. + * Kernel only inode definitions */ -#define XFS_IEXT_BUFSZ 4096 -#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) -#define XFS_INLINE_EXTS 2 -#define XFS_INLINE_DATA 32 -typedef struct xfs_ifork { - int if_bytes; /* bytes in if_u1 */ - int if_real_bytes; /* bytes allocated in if_u1 */ - struct xfs_btree_block *if_broot; /* file's incore btree root */ - short if_broot_bytes; /* bytes allocated for root */ - unsigned char if_flags; /* per-fork flags */ - union { - xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ - xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ - char *if_data; /* inline file data */ - } if_u1; - union { - xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; - /* very small file extents */ - char if_inline_data[XFS_INLINE_DATA]; - /* very small file data */ - xfs_dev_t if_rdev; /* dev number if special */ - uuid_t if_uuid; /* mount point value */ - } if_u2; -} xfs_ifork_t; - -/* - * Inode location information. Stored in the inode and passed to - * xfs_imap_to_bp() to get a buffer and dinode for a given inode. - */ -struct xfs_imap { - xfs_daddr_t im_blkno; /* starting BB of inode chunk */ - ushort im_len; /* length in BBs of inode chunk */ - ushort im_boffset; /* inode offset in block in bytes */ -}; - -/* - * This is the xfs in-core inode structure. - * Most of the on-disk inode is embedded in the i_d field. - * - * The extent pointers/inline file space, however, are managed - * separately. The memory for this information is pointed to by - * the if_u1 unions depending on the type of the data. - * This is used to linearize the array of extents for fast in-core - * access. This is used until the file's number of extents - * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers - * are accessed through the buffer cache. - * - * Other state kept in the in-core inode is used for identification, - * locking, transactional updating, etc of the inode. - * - * Generally, we do not want to hold the i_rlock while holding the - * i_ilock. Hierarchy is i_iolock followed by i_rlock. - * - * xfs_iptr_t contains all the inode fields up to and including the - * i_mnext and i_mprev fields, it is used as a marker in the inode - * chain off the mount structure by xfs_sync calls. 
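
For reference, the 4k extent-buffer sizing in the block being removed works out as follows: an on-disk extent record is two 64-bit words, so each indirection-array page holds 4096 / 16 = 256 records, which is where XFS_LINEAR_EXTS comes from. A two-line standalone check (illustrative struct, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Packed on-disk extent record: 128 bits in two big-endian words. */
struct bmbt_rec { uint64_t l0, l1; };

int main(void)
{
        /* 4096 / 16 = 256 extent records per indirection-array page */
        printf("records per 4k page: %zu\n", 4096 / sizeof(struct bmbt_rec));
        return 0;
}
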
- */ - -typedef struct xfs_ictimestamp { - __int32_t t_sec; /* timestamp seconds */ - __int32_t t_nsec; /* timestamp nanoseconds */ -} xfs_ictimestamp_t; - -/* - * NOTE: This structure must be kept identical to struct xfs_dinode - * in xfs_dinode.h except for the endianness annotations. - */ -typedef struct xfs_icdinode { - __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ - __uint16_t di_mode; /* mode and type of file */ - __int8_t di_version; /* inode version */ - __int8_t di_format; /* format of di_c data */ - __uint16_t di_onlink; /* old number of links to file */ - __uint32_t di_uid; /* owner's user id */ - __uint32_t di_gid; /* owner's group id */ - __uint32_t di_nlink; /* number of links to file */ - __uint16_t di_projid_lo; /* lower part of owner's project id */ - __uint16_t di_projid_hi; /* higher part of owner's project id */ - __uint8_t di_pad[6]; /* unused, zeroed space */ - __uint16_t di_flushiter; /* incremented on flush */ - xfs_ictimestamp_t di_atime; /* time last accessed */ - xfs_ictimestamp_t di_mtime; /* time last modified */ - xfs_ictimestamp_t di_ctime; /* time created/inode modified */ - xfs_fsize_t di_size; /* number of bytes in file */ - xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ - xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ - __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - __int8_t di_aformat; /* format of attr fork's data */ - __uint32_t di_dmevmask; /* DMIG event mask */ - __uint16_t di_dmstate; /* DMIG state info */ - __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ - __uint32_t di_gen; /* generation number */ - - /* di_next_unlinked is the only non-core field in the old dinode */ - xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ - - /* start of the extended dinode, writable fields */ - __uint32_t di_crc; /* CRC of the inode */ - __uint64_t di_changecount; /* number of attribute changes */ - xfs_lsn_t di_lsn; /* flush sequence */ - __uint64_t di_flags2; /* more random flags */ - __uint8_t di_pad2[16]; /* more padding for future expansion */ - - /* fields only written to during inode creation */ - xfs_ictimestamp_t di_crtime; /* time created */ - xfs_ino_t di_ino; /* inode number */ - uuid_t di_uuid; /* UUID of the filesystem */ - - /* structure must be padded to 64 bit alignment */ -} xfs_icdinode_t; - -static inline uint xfs_icdinode_size(int version) -{ - if (version == 3) - return sizeof(struct xfs_icdinode); - return offsetof(struct xfs_icdinode, di_next_unlinked); -} - -/* - * Flags for xfs_ichgtime(). - */ -#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ -#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ - -/* - * Per-fork incore inode flags. - */ -#define XFS_IFINLINE 0x01 /* Inline data is read in */ -#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ -#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ -#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ - -/* - * Fork handling. - */ - -#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) -#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) - -#define XFS_IFORK_PTR(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - &(ip)->i_df : \ - (ip)->i_afp) -#define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? 
\ - XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) -#define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ - XFS_IFORK_BOFF(ip) : \ - 0) -#define XFS_IFORK_SIZE(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_IFORK_DSIZE(ip) : \ - XFS_IFORK_ASIZE(ip)) -#define XFS_IFORK_FORMAT(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_format : \ - (ip)->i_d.di_aformat) -#define XFS_IFORK_FMT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_format = (n)) : \ - ((ip)->i_d.di_aformat = (n))) -#define XFS_IFORK_NEXTENTS(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_nextents : \ - (ip)->i_d.di_anextents) -#define XFS_IFORK_NEXT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_nextents = (n)) : \ - ((ip)->i_d.di_anextents = (n))) -#define XFS_IFORK_MAXEXT(ip, w) \ - (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) - - -#ifdef __KERNEL__ +struct xfs_dinode; +struct xfs_inode; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -525,9 +315,21 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) ((pip)->i_d.di_mode & S_ISGID)) -/* - * xfs_inode.c prototypes. - */ +int xfs_release(struct xfs_inode *ip); +int xfs_inactive(struct xfs_inode *ip); +int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name); +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, + umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); +int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *ip); +int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, + struct xfs_name *target_name); +int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, + struct xfs_inode *src_ip, struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip); + void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); @@ -548,13 +350,28 @@ int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); + void xfs_iunpin_wait(xfs_inode_t *); +#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) + int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_inode **, int *); +int xfs_droplink(struct xfs_trans *, struct xfs_inode *); +int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); +void xfs_bump_ino_vers2(struct xfs_trans *, struct xfs_inode *); + +/* from xfs_file.c */ +int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); +int xfs_iozero(struct xfs_inode *, loff_t, size_t); + + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ @@ -568,65 +385,6 @@ do { \ iput(VFS_I(ip)); \ } while (0) -#endif /* __KERNEL__ */ - -/* - * Flags for xfs_iget() - */ -#define XFS_IGET_CREATE 0x1 -#define XFS_IGET_UNTRUSTED 0x2 -#define XFS_IGET_DONTCACHE 0x4 - -int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, - struct xfs_imap *, struct xfs_dinode **, - struct xfs_buf **, uint, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, uint); -void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); 
-void xfs_dinode_to_disk(struct xfs_dinode *, - struct xfs_icdinode *); -void xfs_idestroy_fork(struct xfs_inode *, int); -void xfs_idata_realloc(struct xfs_inode *, int, int); -void xfs_iroot_realloc(struct xfs_inode *, int, int); -int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); -int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int); - -xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); -void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t, - xfs_bmbt_irec_t *, int); -void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int); -void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int); -void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_realloc_direct(xfs_ifork_t *, int); -void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); -void xfs_iext_inline_to_direct(xfs_ifork_t *, int); -void xfs_iext_destroy(xfs_ifork_t *); -xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *); -xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *); -xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int); -void xfs_iext_irec_init(xfs_ifork_t *); -xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int); -void xfs_iext_irec_remove(xfs_ifork_t *, int); -void xfs_iext_irec_compact(xfs_ifork_t *); -void xfs_iext_irec_compact_pages(xfs_ifork_t *); -void xfs_iext_irec_compact_full(xfs_ifork_t *); -void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); -bool xfs_can_free_eofblocks(struct xfs_inode *, bool); - -#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) - -#if defined(DEBUG) -void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); -#else -#define xfs_inobp_check(mp, bp) -#endif /* DEBUG */ - -extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; -extern struct kmem_zone *xfs_ili_zone; -extern const struct xfs_buf_ops xfs_inode_buf_ops; #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c new file mode 100644 index 000000000000..e011d597f12f --- /dev/null +++ b/fs/xfs/xfs_inode_buf.c @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_icache.h"
+#include "xfs_ialloc.h"
+
+/*
+ * Check that none of the inodes in the buffer have a next
+ * unlinked field of 0.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+	xfs_mount_t	*mp,
+	xfs_buf_t	*bp)
+{
+	int		i;
+	int		j;
+	xfs_dinode_t	*dip;
+
+	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+	for (i = 0; i < j; i++) {
+		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					i * mp->m_sb.sb_inodesize);
+		if (!dip->di_next_unlinked)  {
+			xfs_alert(mp,
+	"Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
+				bp);
+			ASSERT(dip->di_next_unlinked);
+		}
+	}
+}
+#endif
+
+/*
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+ * If the readahead buffer is invalid, we don't want to mark it with an error,
+ * but we do want to clear the DONE status of the buffer so that a followup
+ * read will re-read it from disk. This ensures that we don't get unnecessary
+ * warnings during log recovery and we don't get unnecessary panics on debug
+ * kernels.
+ */
+static void
+xfs_inode_buf_verify(
+	struct xfs_buf	*bp,
+	bool		readahead)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	int		i;
+	int		ni;
+
+	/*
+	 * Validate the magic number and version of every inode in the buffer
+	 */
+	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+	for (i = 0; i < ni; i++) {
+		int		di_ok;
+		xfs_dinode_t	*dip;
+
+		dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+					(i << mp->m_sb.sb_inodelog));
+		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+			XFS_DINODE_GOOD_VERSION(dip->di_version);
+		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+						XFS_ERRTAG_ITOBP_INOTOBP,
+						XFS_RANDOM_ITOBP_INOTOBP))) {
+			if (readahead) {
+				bp->b_flags &= ~XBF_DONE;
+				return;
+			}
+
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
+					     mp, dip);
+#ifdef DEBUG
+			xfs_emerg(mp,
+				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
+				(unsigned long long)bp->b_bn, i,
+				be16_to_cpu(dip->di_magic));
+			ASSERT(0);
+#endif
+		}
+	}
+	xfs_inobp_check(mp, bp);
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp, false);
+}
+
+static void
+xfs_inode_buf_readahead_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp, true);
+}
+
+static void
+xfs_inode_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp, false);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+	.verify_read = xfs_inode_buf_read_verify,
+	.verify_write = xfs_inode_buf_write_verify,
+};
+
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+	.verify_read = xfs_inode_buf_readahead_verify,
+	.verify_write = xfs_inode_buf_write_verify,
+};
+
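
The two ops tables above are an instance of the buffer-verifier pattern: the buffer cache runs .verify_read after I/O completes and .verify_write before the buffer goes to disk, and the readahead variant merely clears the done state instead of latching an error. A stripped-down standalone model of the pattern (all names invented for illustration; 0x494e is the real "IN" dinode magic):

#include <stdbool.h>
#include <stdio.h>

struct buf;
struct buf_ops {
        void (*verify_read)(struct buf *bp);
        void (*verify_write)(struct buf *bp);
};
struct buf {
        const struct buf_ops *ops;
        unsigned short magic;   /* stands in for the on-disk magic number */
        bool done;              /* analogous to XBF_DONE */
};

static void inode_buf_verify(struct buf *bp, bool readahead)
{
        if (bp->magic != 0x494e) {      /* "IN", as in XFS_DINODE_MAGIC */
                if (readahead) {
                        /* not an error: just force a real read later */
                        bp->done = false;
                        return;
                }
                fprintf(stderr, "corrupt inode buffer\n");
        }
}

static void read_verify(struct buf *bp)      { inode_buf_verify(bp, false); }
static void readahead_verify(struct buf *bp) { inode_buf_verify(bp, true); }

static const struct buf_ops inode_buf_ops    = { read_verify, read_verify };
static const struct buf_ops inode_buf_ra_ops = { readahead_verify, read_verify };

int main(void)
{
        struct buf ra = { .ops = &inode_buf_ra_ops, .magic = 0,      .done = true };
        struct buf rd = { .ops = &inode_buf_ops,    .magic = 0x494e, .done = true };

        ra.ops->verify_read(&ra);       /* corrupt + readahead: clears done */
        rd.ops->verify_read(&rd);       /* valid: left alone */
        printf("ra.done=%d rd.done=%d\n", ra.done, rd.done);
        return 0;
}
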
+/*
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode.  It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
+ * pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and dipp are
+ * undefined.
+ */
+int
+xfs_imap_to_bp(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_imap		*imap,
+	struct xfs_dinode	**dipp,
+	struct xfs_buf		**bpp,
+	uint			buf_flags,
+	uint			iget_flags)
+{
+	struct xfs_buf		*bp;
+	int			error;
+
+	buf_flags |= XBF_UNMAPPED;
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+				   (int)imap->im_len, buf_flags, &bp,
+				   &xfs_inode_buf_ops);
+	if (error) {
+		if (error == EAGAIN) {
+			ASSERT(buf_flags & XBF_TRYLOCK);
+			return error;
+		}
+
+		if (error == EFSCORRUPTED &&
+		    (iget_flags & XFS_IGET_UNTRUSTED))
+			return XFS_ERROR(EINVAL);
+
+		xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+			__func__, error);
+		return error;
+	}
+
+	*bpp = bp;
+	*dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+	return 0;
+}
+
+STATIC void
+xfs_dinode_from_disk(
+	xfs_icdinode_t		*to,
+	xfs_dinode_t		*from)
+{
+	to->di_magic = be16_to_cpu(from->di_magic);
+	to->di_mode = be16_to_cpu(from->di_mode);
+	to->di_version = from->di_version;
+	to->di_format = from->di_format;
+	to->di_onlink = be16_to_cpu(from->di_onlink);
+	to->di_uid = be32_to_cpu(from->di_uid);
+	to->di_gid = be32_to_cpu(from->di_gid);
+	to->di_nlink = be32_to_cpu(from->di_nlink);
+	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+	to->di_flushiter = be16_to_cpu(from->di_flushiter);
+	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
+	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
+	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
+	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
+	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
+	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+	to->di_size = be64_to_cpu(from->di_size);
+	to->di_nblocks = be64_to_cpu(from->di_nblocks);
+	to->di_extsize = be32_to_cpu(from->di_extsize);
+	to->di_nextents = be32_to_cpu(from->di_nextents);
+	to->di_anextents = be16_to_cpu(from->di_anextents);
+	to->di_forkoff = from->di_forkoff;
+	to->di_aformat = from->di_aformat;
+	to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
+	to->di_dmstate = be16_to_cpu(from->di_dmstate);
+	to->di_flags = be16_to_cpu(from->di_flags);
+	to->di_gen = be32_to_cpu(from->di_gen);
+
+	if (to->di_version == 3) {
+		to->di_changecount = be64_to_cpu(from->di_changecount);
+		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+		to->di_flags2 = be64_to_cpu(from->di_flags2);
+		to->di_ino = be64_to_cpu(from->di_ino);
+		to->di_lsn = be64_to_cpu(from->di_lsn);
+		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &from->di_uuid);
+	}
+}
+
+void
+xfs_dinode_to_disk(
+	xfs_dinode_t		*to,
+	xfs_icdinode_t		*from)
+{
+	to->di_magic = cpu_to_be16(from->di_magic);
+	to->di_mode = cpu_to_be16(from->di_mode);
+	to->di_version = from->di_version;
+	to->di_format = from->di_format;
+	to->di_onlink = cpu_to_be16(from->di_onlink);
+	to->di_uid = cpu_to_be32(from->di_uid);
+	to->di_gid = cpu_to_be32(from->di_gid);
+	to->di_nlink = cpu_to_be32(from->di_nlink);
+	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
+	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
+	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
+	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+	to->di_size = cpu_to_be64(from->di_size);
+	to->di_nblocks = cpu_to_be64(from->di_nblocks);
+	to->di_extsize = cpu_to_be32(from->di_extsize);
+	to->di_nextents = cpu_to_be32(from->di_nextents);
+	to->di_anextents = cpu_to_be16(from->di_anextents);
+	to->di_forkoff = from->di_forkoff;
+	to->di_aformat = from->di_aformat;
+	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+	to->di_dmstate = cpu_to_be16(from->di_dmstate);
+	to->di_flags = cpu_to_be16(from->di_flags);
+	to->di_gen = cpu_to_be32(from->di_gen);
+
+	if (from->di_version == 3) {
+		to->di_changecount = cpu_to_be64(from->di_changecount);
+		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+		to->di_flags2 = cpu_to_be64(from->di_flags2);
+		to->di_ino = cpu_to_be64(from->di_ino);
+		to->di_lsn = cpu_to_be64(from->di_lsn);
+		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &from->di_uuid);
+		to->di_flushiter = 0;
+	} else {
+		to->di_flushiter = cpu_to_be16(from->di_flushiter);
+	}
+}
+
+static bool
+xfs_dinode_verify(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+		return false;
+
+	/* only version 3 or greater inodes are extensively verified here */
+	if (dip->di_version < 3)
+		return true;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+			      offsetof(struct xfs_dinode, di_crc)))
+		return false;
+	if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+		return false;
+	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	return true;
+}
+
+void
+xfs_dinode_calc_crc(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dip)
+{
+	__uint32_t		crc;
+
+	if (dip->di_version < 3)
+		return;
+
+	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+			      offsetof(struct xfs_dinode, di_crc));
+	dip->di_crc = xfs_end_cksum(crc);
+}
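
xfs_dinode_verify() and xfs_dinode_calc_crc() above both rely on the same convention: the checksum covers the whole on-disk inode with the di_crc field itself treated as a hole. A standalone sketch of that pattern, using a trivial additive checksum as a stand-in for the kernel's CRC32c (toy struct and names, invented for illustration):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_dinode {
        uint16_t magic;
        uint32_t crc;
        uint8_t  payload[24];
};

/* Checksum obj while treating the crc field's bytes as absent. */
static uint32_t cksum_with_hole(const void *obj, size_t len, size_t crc_off)
{
        const uint8_t *p = obj;
        uint32_t sum = 0;

        for (size_t i = 0; i < len; i++) {
                if (i >= crc_off && i < crc_off + sizeof(uint32_t))
                        continue;       /* skip the crc bytes themselves */
                sum = sum * 31 + p[i];
        }
        return sum;
}

int main(void)
{
        struct demo_dinode di;
        size_t off = offsetof(struct demo_dinode, crc);

        memset(&di, 0xab, sizeof(di));                  /* fake inode contents */
        di.crc = cksum_with_hole(&di, sizeof(di), off); /* like calc_crc */

        /* like verify: recompute over the same hole and compare */
        printf("crc ok: %d\n", di.crc == cksum_with_hole(&di, sizeof(di), off));
        return 0;
}
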
+
+/*
+ * Read the disk inode attributes into the in-core inode structure.
+ *
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the
+ * new inode core with a random generation number. If we are keeping inodes
+ * around, we need to read the inode cluster to get the existing generation
+ * number off disk. Further, if we are using version 4 superblocks (i.e. v1/v2
+ * inode format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
+ */
+int
+xfs_iread(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		iget_flags)
+{
+	xfs_buf_t	*bp;
+	xfs_dinode_t	*dip;
+	int		error;
+
+	/*
+	 * Fill in the location information in the in-core inode.
+	 */
+	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+	if (error)
+		return error;
+
+	/* shortcut IO on inode allocation if possible */
+	if ((iget_flags & XFS_IGET_CREATE) &&
+	    xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+		/* initialise the on-disk inode core */
+		memset(&ip->i_d, 0, sizeof(ip->i_d));
+		ip->i_d.di_magic = XFS_DINODE_MAGIC;
+		ip->i_d.di_gen = prandom_u32();
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			ip->i_d.di_version = 3;
+			ip->i_d.di_ino = ip->i_ino;
+			uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+		} else
+			ip->i_d.di_version = 2;
+		return 0;
+	}
+
+	/*
+	 * Get pointers to the on-disk inode and the buffer containing it.
+	 */
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
+	if (error)
+		return error;
+
+	/* even unallocated inodes are verified */
+	if (!xfs_dinode_verify(mp, ip, dip)) {
+		xfs_alert(mp, "%s: validation failed for inode %lld",
+				__func__, ip->i_ino);
+
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto out_brelse;
+	}
+
+	/*
+	 * If the on-disk inode is already linked to a directory
+	 * entry, copy all of the inode into the in-core inode.
+	 * xfs_iformat_fork() handles copying in the inode format
+	 * specific information.
+	 * Otherwise, just get the truly permanent information.
+	 */
+	if (dip->di_mode) {
+		xfs_dinode_from_disk(&ip->i_d, dip);
+		error = xfs_iformat_fork(ip, dip);
+		if (error)  {
+#ifdef DEBUG
+			xfs_alert(mp, "%s: xfs_iformat_fork() returned error %d",
+				__func__, error);
+#endif /* DEBUG */
+			goto out_brelse;
+		}
+	} else {
+		/*
+		 * Partial initialisation of the in-core inode. Just the bits
+		 * that xfs_ialloc won't overwrite or relies on being correct.
+		 */
+		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+		ip->i_d.di_version = dip->di_version;
+		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+
+		if (dip->di_version == 3) {
+			ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+			uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+		}
+
+		/*
+		 * Make sure to pull in the mode here as well in
+		 * case the inode is released without being used.
+		 * This ensures that xfs_inactive() will see that
+		 * the inode is already free and not try to mess
+		 * with the uninitialized part of it.
+		 */
+		ip->i_d.di_mode = 0;
+	}
+
+	/*
+	 * The inode format changed when we moved the link count and
+	 * made it 32 bits long.  If this is an old format inode,
+	 * convert it in memory to look like a new one.  If it gets
+	 * flushed to disk we will convert back before flushing or
+	 * logging it.  We zero out the new projid field and the old link
+	 * count field.  We'll handle clearing the pad field (the remains
+	 * of the old uuid field) when we actually convert the inode to
+	 * the new format. We don't change the version number so that we
+	 * can distinguish this from a real new format inode.
+	 */
+	if (ip->i_d.di_version == 1) {
+		ip->i_d.di_nlink = ip->i_d.di_onlink;
+		ip->i_d.di_onlink = 0;
+		xfs_set_projid(ip, 0);
+	}
+
+	ip->i_delayed_blks = 0;
+
+	/*
+	 * Mark the buffer containing the inode as something to keep
+	 * around for a while.  This helps to keep recently accessed
+	 * meta-data in-core longer.
+	 */
+	xfs_buf_set_ref(bp, XFS_INO_REF);
+
+	/*
+	 * Use xfs_trans_brelse() to release the buffer containing the on-disk
+	 * inode, because it was acquired with xfs_trans_read_buf() in
+	 * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
+	 * brelse().
If we're within a transaction, then xfs_trans_brelse() + * will only release the buffer if it is not dirty within the + * transaction. It will be OK to release the buffer in this case, + * because inodes on disk are never destroyed and we will be locking the + * new in-core inode before putting it in the cache where other + * processes can find it. Thus we don't have to worry about the inode + * being changed just because we released the buffer. + */ + out_brelse: + xfs_trans_brelse(tp, bp); + return error; +} diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h new file mode 100644 index 000000000000..599e6c0ca2a9 --- /dev/null +++ b/fs/xfs/xfs_inode_buf.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_BUF_H__ +#define __XFS_INODE_BUF_H__ + +struct xfs_inode; +struct xfs_dinode; +struct xfs_icdinode; + +/* + * Inode location information. Stored in the inode and passed to + * xfs_imap_to_bp() to get a buffer and dinode for a given inode. + */ +struct xfs_imap { + xfs_daddr_t im_blkno; /* starting BB of inode chunk */ + ushort im_len; /* length in BBs of inode chunk */ + ushort im_boffset; /* inode offset in block in bytes */ +}; + +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +void xfs_dinode_to_disk(struct xfs_dinode *, + struct xfs_icdinode *); + +#if defined(DEBUG) +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +#else +#define xfs_inobp_check(mp, bp) +#endif /* DEBUG */ + +extern const struct xfs_buf_ops xfs_inode_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; + +#endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c new file mode 100644 index 000000000000..02f1083955bb --- /dev/null +++ b/fs/xfs/xfs_inode_fork.c @@ -0,0 +1,1920 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/log2.h> + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_buf_item.h" +#include "xfs_inode_item.h" +#include "xfs_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_filestream.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_icache.h" + +kmem_zone_t *xfs_ifork_zone; + +STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); +STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); +STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + +#ifdef DEBUG +/* + * Make sure that the extents in the given memory buffer + * are valid. + */ +void +xfs_validate_extents( + xfs_ifork_t *ifp, + int nrecs, + xfs_exntfmt_t fmt) +{ + xfs_bmbt_irec_t irec; + xfs_bmbt_rec_host_t rec; + int i; + + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + rec.l0 = get_unaligned(&ep->l0); + rec.l1 = get_unaligned(&ep->l1); + xfs_bmbt_get_all(&rec, &irec); + if (fmt == XFS_EXTFMT_NOSTATE) + ASSERT(irec.br_state == XFS_EXT_NORM); + } +} +#else /* DEBUG */ +#define xfs_validate_extents(ifp, nrecs, fmt) +#endif /* DEBUG */ + + +/* + * Move inode type and inode format specific information from the + * on-disk inode to the in-core inode. For fifos, devs, and sockets + * this means set if_rdev to the proper value. For files, directories, + * and symlinks this means to bring in the in-line data or extent + * pointers. For a file in B-tree format, only the root is immediately + * brought in-core. The rest will be in-lined in if_extents when it + * is first referenced (see xfs_iread_extents()). 
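+ *
+ * (Editorial sketch, not in the original comment, of which in-core
+ * fields end up valid for each on-disk fork format:
+ *
+ *	XFS_DINODE_FMT_DEV	-> if_u2.if_rdev
+ *	XFS_DINODE_FMT_LOCAL	-> if_u1.if_data (inline file/symlink data)
+ *	XFS_DINODE_FMT_EXTENTS	-> if_u1.if_extents
+ *	XFS_DINODE_FMT_BTREE	-> if_broot, extents read in on demand.)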
+ */ +int +xfs_iformat_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip) +{ + xfs_attr_shortform_t *atp; + int size; + int error = 0; + xfs_fsize_t di_size; + + if (unlikely(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks))) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", + (unsigned long long)ip->i_ino, + (int)(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents)), + (unsigned long long) + be64_to_cpu(dip->di_nblocks)); + XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { + xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", + (unsigned long long)ip->i_ino, + dip->di_forkoff); + XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { + XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ip->i_d.di_size = 0; + ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); + break; + + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (dip->di_format) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (local format for regular file).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(4)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + di_size = be64_to_cpu(dip->di_size); + if (unlikely(di_size < 0 || + di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %Ld for local inode).", + (unsigned long long) ip->i_ino, + (long long) di_size); + XFS_CORRUPTION_ERROR("xfs_iformat(5)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + size = (int)di_size; + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + break; + default: + XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + break; + + default: + XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + if (error) { + return error; + } + if (!XFS_DFORK_Q(dip)) + return 0; + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); + size = be16_to_cpu(atp->hdr.totsize); + + if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad attr fork size %Ld).", + (unsigned long long) ip->i_ino, + (long long) size); + XFS_CORRUPTION_ERROR("xfs_iformat(8)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + 
return XFS_ERROR(EFSCORRUPTED); + } + + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + error = XFS_ERROR(EFSCORRUPTED); + break; + } + if (error) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + xfs_idestroy_fork(ip, XFS_DATA_FORK); + } + return error; +} + +/* + * The file is in-lined in the on-disk inode. + * If it fits into if_inline_data, then copy + * it there, otherwise allocate a buffer for it + * and copy the data there. Either way, set + * if_data to point at the data. + * If we allocate a buffer for the data, make + * sure that its size is a multiple of 4 and + * record the real size in i_real_bytes. + */ +STATIC int +xfs_iformat_local( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork, + int size) +{ + xfs_ifork_t *ifp; + int real_size; + + /* + * If the size is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %d for local fork, size = %d).", + (unsigned long long) ip->i_ino, size, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); + XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ifp = XFS_IFORK_PTR(ip, whichfork); + real_size = 0; + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + if (size) + memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFINLINE; + return 0; +} + +/* + * The file consists of a set of extents all + * of which fit into the on-disk inode. + * If there are few enough extents to fit into + * the if_inline_ext, then copy them there. + * Otherwise allocate a buffer for them and copy + * them into it. Either way, set if_extents + * to point at the extents. + */ +STATIC int +xfs_iformat_extents( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmbt_rec_t *dp; + xfs_ifork_t *ifp; + int nex; + int size; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + nex = XFS_DFORK_NEXTENTS(dip, whichfork); + size = nex * (uint)sizeof(xfs_bmbt_rec_t); + + /* + * If the number of extents is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. 
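+	 *
+	 * (Editorial arithmetic sketch: each on-disk record is
+	 * sizeof(xfs_bmbt_rec_t) == 16 bytes, so nex records need
+	 * nex * 16 bytes, which must fit in the fork area returned by
+	 * XFS_DFORK_SIZE().)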
+ */ + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", + (unsigned long long) ip->i_ino, nex); + XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_real_bytes = 0; + if (nex == 0) + ifp->if_u1.if_extents = NULL; + else if (nex <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + else + xfs_iext_add(ifp, 0, nex); + + ifp->if_bytes = size; + if (size) { + dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); + for (i = 0; i < nex; i++, dp++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + ep->l0 = get_unaligned_be64(&dp->l0); + ep->l1 = get_unaligned_be64(&dp->l1); + } + XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); + if (whichfork != XFS_DATA_FORK || + XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) + if (unlikely(xfs_check_nostate_extents( + ifp, 0, nex))) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + } + ifp->if_flags |= XFS_IFEXTENTS; + return 0; +} + +/* + * The file has too many extents to fit into + * the inode, so they are in B-tree format. + * Allocate a buffer for the root of the B-tree + * and copy the root into it. The i_extents + * field will remain NULL until all of the + * extents are read in (when they are needed). + */ +STATIC int +xfs_iformat_btree( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_bmdr_block_t *dfp; + xfs_ifork_t *ifp; + /* REFERENCED */ + int nrecs; + int size; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); + + /* + * blow out if -- fork has less extents than can fit in + * fork (fork shouldn't be a btree format), root btree + * block has more records than can fit into the fork, + * or the number of extents is greater than the number of + * blocks. + */ + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, mp, whichfork) || + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, + mp, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_broot_bytes = size; + ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ASSERT(ifp->if_broot != NULL); + /* + * Copy and convert from the on-disk structure + * to the in-memory structure. + */ + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + ifp->if_broot, size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFBROOT; + + return 0; +} + +/* + * Read in extents from a btree-format inode. + * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. 
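+ *
+ * A hypothetical caller (editorial sketch, not part of this file)
+ * pulls the extents in before walking them:
+ *
+ *	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ *		error = xfs_iread_extents(tp, ip, whichfork);
+ *		if (error)
+ *			return error;
+ *	}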
+ */ +int +xfs_iread_extents( + xfs_trans_t *tp, + xfs_inode_t *ip, + int whichfork) +{ + int error; + xfs_ifork_t *ifp; + xfs_extnum_t nextents; + + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + nextents = XFS_IFORK_NEXTENTS(ip, whichfork); + ifp = XFS_IFORK_PTR(ip, whichfork); + + /* + * We know that the size is valid (it's checked in iformat_btree) + */ + ifp->if_bytes = ifp->if_real_bytes = 0; + ifp->if_flags |= XFS_IFEXTENTS; + xfs_iext_add(ifp, 0, nextents); + error = xfs_bmap_read_extents(tp, ip, whichfork); + if (error) { + xfs_iext_destroy(ifp); + ifp->if_flags &= ~XFS_IFEXTENTS; + return error; + } + xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + return 0; +} +/* + * Reallocate the space for if_broot based on the number of records + * being added or deleted as indicated in rec_diff. Move the records + * and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by + * the caller. When growing this will create holes to be filled in + * by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then + * if we are adding records, one will be allocated. The caller must also + * not request that the number of records go below zero, although + * it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * ext_diff -- the change in the number of records, positive or negative, + * requested for the if_broot array. + */ +void +xfs_iroot_realloc( + xfs_inode_t *ip, + int rec_diff, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + int cur_max; + xfs_ifork_t *ifp; + struct xfs_btree_block *new_broot; + int new_max; + size_t new_size; + char *np; + char *op; + + /* + * Handle the degenerate case quietly. + */ + if (rec_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (rec_diff > 0) { + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (ifp->if_broot_bytes == 0) { + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); + ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + ifp->if_broot_bytes = (int)new_size; + return; + } + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, + XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), + KM_SLEEP | KM_NOFS); + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + (int)new_size); + ifp->if_broot_bytes = (int)new_size; + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); + return; + } + + /* + * rec_diff is less than 0. In this case, we are shrinking the + * if_broot buffer. It must already exist. If we go to zero + * records, just get rid of the root and clear the status bit. 
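+	 *
+	 * (Editorial sketch: a caller converting a btree root holding
+	 * 'numrecs' records back to extents format would call
+	 * xfs_iroot_realloc(ip, -numrecs, whichfork), driving this
+	 * path down to new_size == 0 and clearing XFS_IFBROOT.)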
+ */ + ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + ASSERT(new_max >= 0); + if (new_max > 0) + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + else + new_size = 0; + if (new_size > 0) { + new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + /* + * First copy over the btree block header. + */ + memcpy(new_broot, ifp->if_broot, + XFS_BMBT_BLOCK_LEN(ip->i_mount)); + } else { + new_broot = NULL; + ifp->if_flags &= ~XFS_IFBROOT; + } + + /* + * Only copy the records and pointers if there are any. + */ + if (new_max > 0) { + /* + * First copy the records. + */ + op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); + np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); + memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); + + /* + * Then copy the pointers. + */ + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, + (int)new_size); + memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); + } + kmem_free(ifp->if_broot); + ifp->if_broot = new_broot; + ifp->if_broot_bytes = (int)new_size; + if (ifp->if_broot) + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + return; +} + + +/* + * This is called when the amount of space needed for if_data + * is increased or decreased. The change in size is indicated by + * the number of bytes that need to be added or deleted in the + * byte_diff parameter. + * + * If the amount of space needed has decreased below the size of the + * inline buffer, then switch to using the inline buffer. Otherwise, + * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * to what is needed. + * + * ip -- the inode whose if_data area is changing + * byte_diff -- the change in the number of bytes, positive or negative, + * requested for the if_data array. + */ +void +xfs_idata_realloc( + xfs_inode_t *ip, + int byte_diff, + int whichfork) +{ + xfs_ifork_t *ifp; + int new_size; + int real_size; + + if (byte_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + new_size = (int)ifp->if_bytes + byte_diff; + ASSERT(new_size >= 0); + + if (new_size == 0) { + if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + kmem_free(ifp->if_u1.if_data); + } + ifp->if_u1.if_data = NULL; + real_size = 0; + } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { + /* + * If the valid extents/data can fit in if_inline_ext/data, + * copy them from the malloc'd vector and free it. + */ + if (ifp->if_u1.if_data == NULL) { + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + ASSERT(ifp->if_real_bytes != 0); + memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, + new_size); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } + real_size = 0; + } else { + /* + * Stuck with malloc/realloc. + * For inline data, the underlying buffer must be + * a multiple of 4 bytes in size so that it can be + * logged and stay on word boundaries. We enforce + * that here. + */ + real_size = roundup(new_size, 4); + if (ifp->if_u1.if_data == NULL) { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + /* + * Only do the realloc if the underlying size + * is really changing. 
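+			 * (Editorial sketch: if_real_bytes is always
+			 * roundup(new_size, 4), so growing a local fork
+			 * from 7 to 9 bytes reallocs the 8-byte buffer
+			 * to 12 bytes, while growing from 9 to 11 bytes
+			 * changes nothing here.)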
+ */ + if (ifp->if_real_bytes != real_size) { + ifp->if_u1.if_data = + kmem_realloc(ifp->if_u1.if_data, + real_size, + ifp->if_real_bytes, + KM_SLEEP | KM_NOFS); + } + } else { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, + ifp->if_bytes); + } + } + ifp->if_real_bytes = real_size; + ifp->if_bytes = new_size; + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); +} + +void +xfs_idestroy_fork( + xfs_inode_t *ip, + int whichfork) +{ + xfs_ifork_t *ifp; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (ifp->if_broot != NULL) { + kmem_free(ifp->if_broot); + ifp->if_broot = NULL; + } + + /* + * If the format is local, then we can't have an extents + * array so just look for an inline data array. If we're + * not local then we may or may not have an extents list, + * so check and free it up if we do. + */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && + (ifp->if_u1.if_data != NULL)) { + ASSERT(ifp->if_real_bytes != 0); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + ifp->if_real_bytes = 0; + } + } else if ((ifp->if_flags & XFS_IFEXTENTS) && + ((ifp->if_flags & XFS_IFEXTIREC) || + ((ifp->if_u1.if_extents != NULL) && + (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { + ASSERT(ifp->if_real_bytes != 0); + xfs_iext_destroy(ifp); + } + ASSERT(ifp->if_u1.if_extents == NULL || + ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); + ASSERT(ifp->if_real_bytes == 0); + if (whichfork == XFS_ATTR_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } +} + +/* + * xfs_iextents_copy() + * + * This is called to copy the REAL extents (as opposed to the delayed + * allocation extents) from the inode into the given buffer. It + * returns the number of bytes copied into the buffer. + * + * If there are no delayed allocation extents, then we can just + * memcpy() the extents into the buffer. Otherwise, we need to + * examine each extent in turn and skip those which are delayed. + */ +int +xfs_iextents_copy( + xfs_inode_t *ip, + xfs_bmbt_rec_t *dp, + int whichfork) +{ + int copied; + int i; + xfs_ifork_t *ifp; + int nrecs; + xfs_fsblock_t start_block; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(ifp->if_bytes > 0); + + nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); + ASSERT(nrecs > 0); + + /* + * There are some delayed allocation extents in the + * inode, so copy the extents one at a time and skip + * the delayed ones. There must be at least one + * non-delayed extent. + */ + copied = 0; + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + start_block = xfs_bmbt_get_startblock(ep); + if (isnullstartblock(start_block)) { + /* + * It's a delayed allocation extent, so skip it. + */ + continue; + } + + /* Translate to on disk format */ + put_unaligned_be64(ep->l0, &dp->l0); + put_unaligned_be64(ep->l1, &dp->l1); + dp++; + copied++; + } + ASSERT(copied != 0); + xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); + + return (copied * (uint)sizeof(xfs_bmbt_rec_t)); +} + +/* + * Each of the following cases stores data into the same region + * of the on-disk inode, so only one of them can be valid at + * any given time. While it is possible to have conflicting formats + * and log flags, e.g. 
having XFS_ILOG_?DATA set when the fork is + * in EXTENTS format, this can only happen when the fork has + * changed formats after being modified but before being flushed. + * In these cases, the format always takes precedence, because the + * format indicates the current state of the fork. + */ +void +xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, + xfs_inode_log_item_t *iip, + int whichfork, + xfs_buf_t *bp) +{ + char *cp; + xfs_ifork_t *ifp; + xfs_mount_t *mp; + static const short brootflag[2] = + { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; + static const short dataflag[2] = + { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; + static const short extflag[2] = + { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + + if (!iip) + return; + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * This can happen if we gave up in iformat in an error path, + * for the attribute fork. + */ + if (!ifp) { + ASSERT(whichfork == XFS_ATTR_FORK); + return; + } + cp = XFS_DFORK_PTR(dip, whichfork); + mp = ip->i_mount; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_fields & dataflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + } + break; + + case XFS_DINODE_FMT_EXTENTS: + ASSERT((ifp->if_flags & XFS_IFEXTENTS) || + !(iip->ili_fields & extflag[whichfork])); + if ((iip->ili_fields & extflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(xfs_iext_get_ext(ifp, 0)); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, + whichfork); + } + break; + + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_fields & brootflag[whichfork]) && + (ifp->if_broot_bytes > 0)) { + ASSERT(ifp->if_broot != NULL); + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, + (xfs_bmdr_block_t *)cp, + XFS_DFORK_SIZE(dip, mp, whichfork)); + } + break; + + case XFS_DINODE_FMT_DEV: + if (iip->ili_fields & XFS_ILOG_DEV) { + ASSERT(whichfork == XFS_DATA_FORK); + xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); + } + break; + + case XFS_DINODE_FMT_UUID: + if (iip->ili_fields & XFS_ILOG_UUID) { + ASSERT(whichfork == XFS_DATA_FORK); + memcpy(XFS_DFORK_DPTR(dip), + &ip->i_df.if_u2.if_uuid, + sizeof(uuid_t)); + } + break; + + default: + ASSERT(0); + break; + } +} + +/* + * Return a pointer to the extent record at file index idx. + */ +xfs_bmbt_rec_host_t * +xfs_iext_get_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx) /* index of target extent */ +{ + ASSERT(idx >= 0); + ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + + if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { + return ifp->if_u1.if_ext_irec->er_extbuf; + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_ext_irec_t *erp; /* irec pointer */ + int erp_idx = 0; /* irec index */ + xfs_extnum_t page_idx = idx; /* ext index in target list */ + + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + return &erp->er_extbuf[page_idx]; + } else if (ifp->if_bytes) { + return &ifp->if_u1.if_extents[idx]; + } else { + return NULL; + } +} + +/* + * Insert new item(s) into the extent records for incore inode + * fork 'ifp'. 'count' new items are inserted at index 'idx'. 
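+ *
+ * (Editorial usage sketch: inserting one new record at index 'idx' of
+ * the data fork would be
+ *	xfs_iext_insert(ip, idx, 1, &irec, 0);
+ * with BMAP_ATTRFORK set in 'state' when targeting the attr fork.)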
+ */ +void +xfs_iext_insert( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t i; /* extent record index */ + + trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); + + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_iext_add(ifp, idx, count); + for (i = idx; i < idx + count; i++, new++) + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be increased. The ext_diff parameter stores the + * number of new extents being added and the idx parameter contains + * the extent index where the new extents will be added. If the new + * extents are being appended, then we just need to (re)allocate and + * initialize the space. Otherwise, if the new extents are being + * inserted into the middle of the existing entries, a bit more work + * is required to make room for the new extents to be inserted. The + * caller is responsible for filling in the new extent entries upon + * return. + */ +void +xfs_iext_add( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin adding exts */ + int ext_diff) /* number of extents to add */ +{ + int byte_diff; /* new bytes being added */ + int new_size; /* size of extents after adding */ + xfs_extnum_t nextents; /* number of extents in file */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT((idx >= 0) && (idx <= nextents)); + byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); + new_size = ifp->if_bytes + byte_diff; + /* + * If the new number of extents (nextents + ext_diff) + * fits inside the inode, then continue to use the inline + * extent buffer. + */ + if (nextents + ext_diff <= XFS_INLINE_EXTS) { + if (idx < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], + &ifp->if_u2.if_inline_ext[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); + } + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; + } + /* + * Otherwise use a linear (direct) extent list. + * If the extents are currently inside the inode, + * xfs_iext_realloc_direct will switch us from + * inline to direct extent allocation mode. 
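+	 *
+	 * (Editorial note on the thresholds: XFS_INLINE_EXTS is 2 and
+	 * XFS_LINEAR_EXTS is XFS_IEXT_BUFSZ / sizeof(xfs_bmbt_rec_t),
+	 * i.e. 4096 / 16 = 256 records per extent buffer.)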
+ */ + else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, new_size); + if (idx < nextents) { + memmove(&ifp->if_u1.if_extents[idx + ext_diff], + &ifp->if_u1.if_extents[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); + } + } + /* Indirection array */ + else { + xfs_ext_irec_t *erp; + int erp_idx = 0; + int page_idx = idx; + + ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); + if (ifp->if_flags & XFS_IFEXTIREC) { + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); + } else { + xfs_iext_irec_init(ifp); + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = ifp->if_u1.if_ext_irec; + } + /* Extents fit in target extent page */ + if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { + if (page_idx < erp->er_extcount) { + memmove(&erp->er_extbuf[page_idx + ext_diff], + &erp->er_extbuf[page_idx], + (erp->er_extcount - page_idx) * + sizeof(xfs_bmbt_rec_t)); + memset(&erp->er_extbuf[page_idx], 0, byte_diff); + } + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + } + /* Insert a new extent page */ + else if (erp) { + xfs_iext_add_indirect_multi(ifp, + erp_idx, page_idx, ext_diff); + } + /* + * If extent(s) are being appended to the last page in + * the indirection array and the new extent(s) don't fit + * in the page, then erp is NULL and erp_idx is set to + * the next index needed in the indirection array. + */ + else { + int count = ext_diff; + + while (count) { + erp = xfs_iext_irec_new(ifp, erp_idx); + erp->er_extcount = count; + count -= MIN(count, (int)XFS_LINEAR_EXTS); + if (count) { + erp_idx++; + } + } + } + } + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being added to the indirection + * array and the new extents do not fit in the target extent list. The + * erp_idx parameter contains the irec index for the target extent list + * in the indirection array, and the idx parameter contains the extent + * index within the list. The number of extents being added is stored + * in the count parameter. 
+ * + * |-------| |-------| + * | | | | idx - number of extents before idx + * | idx | | count | + * | | | | count - number of extents being inserted at idx + * |-------| |-------| + * | count | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_add_indirect_multi( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* target extent irec index */ + xfs_extnum_t idx, /* index within target list */ + int count) /* new extents being added */ +{ + int byte_diff; /* new bytes being added */ + xfs_ext_irec_t *erp; /* pointer to irec entry */ + xfs_extnum_t ext_diff; /* number of extents to add */ + xfs_extnum_t ext_cnt; /* new extents still needed */ + xfs_extnum_t nex2; /* extents after idx + count */ + xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ + int nlists; /* number of irec's (lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex2 = erp->er_extcount - idx; + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* + * Save second part of target extent list + * (all extents past the insertion point idx). */ + if (nex2) { + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); + memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); + erp->er_extcount -= nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); + memset(&erp->er_extbuf[idx], 0, byte_diff); + } + + /* + * Add the new extents to the end of the target + * list, then allocate new irec record(s) and + * extent buffer(s) as needed to store the rest + * of the new extents. + */ + ext_cnt = count; + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); + if (ext_diff) { + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + while (ext_cnt) { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); + erp->er_extcount = ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + + /* Add nex2 extents back to indirection array */ + if (nex2) { + xfs_extnum_t ext_avail; + int i; + + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; + i = 0; + /* + * If nex2 extents fit in the current page, append + * nex2_ep after the new extents. + */ + if (nex2 <= ext_avail) { + i = erp->er_extcount; + } + /* + * Otherwise, check if space is available in the + * next page. + */ + else if ((erp_idx < nlists - 1) && + (nex2 <= (ext_avail = XFS_LINEAR_EXTS - + ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { + erp_idx++; + erp++; + /* Create a hole for nex2 extents */ + memmove(&erp->er_extbuf[nex2], erp->er_extbuf, + erp->er_extcount * sizeof(xfs_bmbt_rec_t)); + } + /* + * Final choice, create a new extent page for + * nex2 extents. + */ + else { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + } + memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); + kmem_free(nex2_ep); + erp->er_extcount += nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); + } +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be decreased. The ext_diff parameter stores the + * number of extents to be removed and the idx parameter contains + * the extent index where the extents will be removed from. + * + * If the amount of space needed has decreased below the linear + * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous + * extent array.
Otherwise, use kmem_realloc() to adjust the + * size to what is needed. + */ +void +xfs_iext_remove( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff, /* number of extents to remove */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + trace_xfs_iext_remove(ip, idx, state, _RET_IP_); + + ASSERT(ext_diff > 0); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_iext_remove_indirect(ifp, idx, ext_diff); + } else if (ifp->if_real_bytes) { + xfs_iext_remove_direct(ifp, idx, ext_diff); + } else { + xfs_iext_remove_inline(ifp, idx, ext_diff); + } + ifp->if_bytes = new_size; +} + +/* + * This removes ext_diff extents from the inline buffer, beginning + * at extent index idx. + */ +void +xfs_iext_remove_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + int nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + ASSERT(idx < XFS_INLINE_EXTS); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(((nextents - ext_diff) > 0) && + (nextents - ext_diff) < XFS_INLINE_EXTS); + + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx], + &ifp->if_u2.if_inline_ext[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + } else { + memset(&ifp->if_u2.if_inline_ext[idx], 0, + ext_diff * sizeof(xfs_bmbt_rec_t)); + } +} + +/* + * This removes ext_diff extents from a linear (direct) extent list, + * beginning at extent index idx. If the extents are being removed + * from the end of the list (ie. truncate) then we just need to re- + * allocate the list to remove the extra space. Otherwise, if the + * extents are being removed from the middle of the existing extent + * entries, then we first need to move the extent records beginning + * at idx + ext_diff up in the list to overwrite the records being + * removed, then remove the extra space via kmem_realloc. + */ +void +xfs_iext_remove_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + new_size = ifp->if_bytes - + (ext_diff * sizeof(xfs_bmbt_rec_t)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + return; + } + /* Move extents up in the list (if needed) */ + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u1.if_extents[idx], + &ifp->if_u1.if_extents[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + } + memset(&ifp->if_u1.if_extents[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + /* + * Reallocate the direct extent list. If the extents + * will fit inside the inode then xfs_iext_realloc_direct + * will switch from direct to inline extent allocation + * mode for us. 
+ */ + xfs_iext_realloc_direct(ifp, new_size); + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being removed from the + * indirection array and the extents being removed span multiple extent + * buffers. The idx parameter contains the file extent index where we + * want to begin removing extents, and the count parameter contains + * how many extents need to be removed. + * + * |-------| |-------| + * | nex1 | | | nex1 - number of extents before idx + * |-------| | count | + * | | | | count - number of extents being removed at idx + * | count | |-------| + * | | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_remove_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing extents */ + int count) /* number of extents to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int erp_idx = 0; /* indirection array index */ + xfs_extnum_t ext_cnt; /* extents left to remove */ + xfs_extnum_t ext_diff; /* extents to remove in current list */ + xfs_extnum_t nex1; /* number of extents before idx */ + xfs_extnum_t nex2; /* extents after idx + count */ + int page_idx = idx; /* index in target extent list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + ASSERT(erp != NULL); + nex1 = page_idx; + ext_cnt = count; + while (ext_cnt) { + nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); + ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); + /* + * Check for deletion of entire list; + * xfs_iext_irec_remove() updates extent offsets. + */ + if (ext_diff == erp->er_extcount) { + xfs_iext_irec_remove(ifp, erp_idx); + ext_cnt -= ext_diff; + nex1 = 0; + if (ext_cnt) { + ASSERT(erp_idx < ifp->if_real_bytes / + XFS_IEXT_BUFSZ); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex1 = 0; + continue; + } else { + break; + } + } + /* Move extents up (if needed) */ + if (nex2) { + memmove(&erp->er_extbuf[nex1], + &erp->er_extbuf[nex1 + ext_diff], + nex2 * sizeof(xfs_bmbt_rec_t)); + } + /* Zero out rest of page */ + memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - + ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); + /* Update remaining counters */ + erp->er_extcount -= ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); + ext_cnt -= ext_diff; + nex1 = 0; + erp_idx++; + erp++; + } + ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); + xfs_iext_irec_compact(ifp); +} + +/* + * Create, destroy, or resize a linear (direct) block of extents. 
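+ *
+ * (Editorial sketch: direct list sizes are rounded up to powers of two,
+ * so growing from 24 records (384 bytes, if_real_bytes 512) to 40
+ * records (640 bytes) reallocates to if_real_bytes 1024.)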
+ */ +void +xfs_iext_realloc_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new size of extents */ +{ + int rnew_size; /* real new size of extents */ + + rnew_size = new_size; + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || + ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && + (new_size != ifp->if_real_bytes))); + + /* Free extent records */ + if (new_size == 0) { + xfs_iext_destroy(ifp); + } + /* Resize direct extent list and zero any new bytes */ + else if (ifp->if_real_bytes) { + /* Check if extents will fit inside the inode */ + if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { + xfs_iext_direct_to_inline(ifp, new_size / + (uint)sizeof(xfs_bmbt_rec_t)); + ifp->if_bytes = new_size; + return; + } + if (!is_power_of_2(new_size)){ + rnew_size = roundup_pow_of_two(new_size); + } + if (rnew_size != ifp->if_real_bytes) { + ifp->if_u1.if_extents = + kmem_realloc(ifp->if_u1.if_extents, + rnew_size, + ifp->if_real_bytes, KM_NOFS); + } + if (rnew_size > ifp->if_real_bytes) { + memset(&ifp->if_u1.if_extents[ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)], 0, + rnew_size - ifp->if_real_bytes); + } + } + /* + * Switch from the inline extent buffer to a direct + * extent list. Be sure to include the inline extent + * bytes in new_size. + */ + else { + new_size += ifp->if_bytes; + if (!is_power_of_2(new_size)) { + rnew_size = roundup_pow_of_two(new_size); + } + xfs_iext_inline_to_direct(ifp, rnew_size); + } + ifp->if_real_bytes = rnew_size; + ifp->if_bytes = new_size; +} + +/* + * Switch from linear (direct) extent records to inline buffer. + */ +void +xfs_iext_direct_to_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t nextents) /* number of extents in file */ +{ + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(nextents <= XFS_INLINE_EXTS); + /* + * The inline buffer was zeroed when we switched + * from inline to direct extent allocation mode, + * so we don't need to clear it here. + */ + memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, + nextents * sizeof(xfs_bmbt_rec_t)); + kmem_free(ifp->if_u1.if_extents); + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; +} + +/* + * Switch from inline buffer to linear (direct) extent records. + * new_size should already be rounded up to the next power of 2 + * by the caller (when appropriate), so use new_size as it is. + * However, since new_size may be rounded up, we can't update + * if_bytes here. It is the caller's responsibility to update + * if_bytes upon return. + */ +void +xfs_iext_inline_to_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* number of extents in file */ +{ + ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); + memset(ifp->if_u1.if_extents, 0, new_size); + if (ifp->if_bytes) { + memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, + ifp->if_bytes); + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_real_bytes = new_size; +} + +/* + * Resize an extent indirection array to new_size bytes. 
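+ *
+ * (Editorial usage sketch: xfs_iext_irec_new() below grows the array
+ * one entry at a time via
+ *	xfs_iext_realloc_indirect(ifp, ++nlists * sizeof(xfs_ext_irec_t));
+ * and xfs_iext_irec_remove() shrinks it the same way.)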
+ */ +STATIC void +xfs_iext_realloc_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new indirection array size */ +{ + int nlists; /* number of irec's (ex lists) */ + int size; /* current indirection array size */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + size = nlists * sizeof(xfs_ext_irec_t); + ASSERT(ifp->if_real_bytes); + ASSERT((new_size >= 0) && (new_size != size)); + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else { + ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) + kmem_realloc(ifp->if_u1.if_ext_irec, + new_size, size, KM_NOFS); + } +} + +/* + * Switch from indirection array to linear (direct) extent allocations. + */ +STATIC void +xfs_iext_indirect_to_direct( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + int size; /* size of file extents */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + size = nextents * sizeof(xfs_bmbt_rec_t); + + xfs_iext_irec_compact_pages(ifp); + ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); + + ep = ifp->if_u1.if_ext_irec->er_extbuf; + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; + ifp->if_u1.if_extents = ep; + ifp->if_bytes = size; + if (nextents < XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, size); + } +} + +/* + * Free incore file extents. + */ +void +xfs_iext_destroy( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + if (ifp->if_flags & XFS_IFEXTIREC) { + int erp_idx; + int nlists; + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { + xfs_iext_irec_remove(ifp, erp_idx); + } + ifp->if_flags &= ~XFS_IFEXTIREC; + } else if (ifp->if_real_bytes) { + kmem_free(ifp->if_u1.if_extents); + } else if (ifp->if_bytes) { + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_u1.if_extents = NULL; + ifp->if_real_bytes = 0; + ifp->if_bytes = 0; +} + +/* + * Return a pointer to the extent record for file system block bno. 
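+ *
+ * (Editorial sketch: the lookup
+ *	ep = xfs_iext_bno_to_ext(ifp, bno, &idx);
+ * returns the extent containing bno, or else the first extent past it,
+ * and returns NULL with *idxp at nextents when bno lies beyond the
+ * last extent.)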
+ */ +xfs_bmbt_rec_host_t * /* pointer to found extent record */ +xfs_iext_bno_to_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + xfs_extnum_t *idxp) /* index of target extent */ +{ + xfs_bmbt_rec_host_t *base; /* pointer to first extent */ + xfs_filblks_t blockcount = 0; /* number of blocks in extent */ + xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + int high; /* upper boundary in search */ + xfs_extnum_t idx = 0; /* index of target extent */ + int low; /* lower boundary in search */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_fileoff_t startoff = 0; /* start offset of extent */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *idxp = 0; + return NULL; + } + low = 0; + if (ifp->if_flags & XFS_IFEXTIREC) { + /* Find target extent list */ + int erp_idx = 0; + erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); + base = erp->er_extbuf; + high = erp->er_extcount - 1; + } else { + base = ifp->if_u1.if_extents; + high = nextents - 1; + } + /* Binary search extent records */ + while (low <= high) { + idx = (low + high) >> 1; + ep = base + idx; + startoff = xfs_bmbt_get_startoff(ep); + blockcount = xfs_bmbt_get_blockcount(ep); + if (bno < startoff) { + high = idx - 1; + } else if (bno >= startoff + blockcount) { + low = idx + 1; + } else { + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + *idxp = idx; + return ep; + } + } + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + if (bno >= startoff + blockcount) { + if (++idx == nextents) { + ep = NULL; + } else { + ep = xfs_iext_get_ext(ifp, idx); + } + } + *idxp = idx; + return ep; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record for filesystem block bno. Store the index of the + * target irec in *erp_idxp. + */ +xfs_ext_irec_t * /* pointer to found extent record */ +xfs_iext_bno_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + int *erp_idxp) /* irec index of target ext list */ +{ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + xfs_ext_irec_t *erp_next; /* next indirection array entry */ + int erp_idx; /* indirection array index */ + int nlists; /* number of extent irec's (lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; + if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { + high = erp_idx - 1; + } else if (erp_next && bno >= + xfs_bmbt_get_startoff(erp_next->er_extbuf)) { + low = erp_idx + 1; + } else { + break; + } + } + *erp_idxp = erp_idx; + return erp; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record at file extent index *idxp. Store the index of the + * target irec in *erp_idxp and store the page index of the target + * extent record in *idxp. 
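+ *
+ * (Editorial sketch: xfs_iext_get_ext() above maps a file-wide extent
+ * index onto a buffer slot with
+ *	erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+ *	ep = &erp->er_extbuf[page_idx];
+ * where page_idx starts out as the file-wide index.)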
+ */ +xfs_ext_irec_t * +xfs_iext_idx_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t *idxp, /* extent index (file -> page) */ + int *erp_idxp, /* pointer to target irec */ + int realloc) /* new bytes were just added */ +{ + xfs_ext_irec_t *prev; /* pointer to previous irec */ + xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ + int erp_idx; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + xfs_extnum_t page_idx = *idxp; /* extent index in target list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + ASSERT(page_idx >= 0); + ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + + /* Binary search extent irec's */ + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + prev = erp_idx > 0 ? erp - 1 : NULL; + if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && + realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { + high = erp_idx - 1; + } else if (page_idx > erp->er_extoff + erp->er_extcount || + (page_idx == erp->er_extoff + erp->er_extcount && + !realloc)) { + low = erp_idx + 1; + } else if (page_idx == erp->er_extoff + erp->er_extcount && + erp->er_extcount == XFS_LINEAR_EXTS) { + ASSERT(realloc); + page_idx = 0; + erp_idx++; + erp = erp_idx < nlists ? erp + 1 : NULL; + break; + } else { + page_idx -= erp->er_extoff; + break; + } + } + *idxp = page_idx; + *erp_idxp = erp_idx; + return(erp); +} + +/* + * Allocate and initialize an indirection array once the space needed + * for incore extents increases above XFS_IEXT_BUFSZ. + */ +void +xfs_iext_irec_init( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + + erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); + + if (nextents == 0) { + ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + } else if (!ifp->if_real_bytes) { + xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); + } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { + xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); + } + erp->er_extbuf = ifp->if_u1.if_extents; + erp->er_extcount = nextents; + erp->er_extoff = 0; + + ifp->if_flags |= XFS_IFEXTIREC; + ifp->if_real_bytes = XFS_IEXT_BUFSZ; + ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); + ifp->if_u1.if_ext_irec = erp; + + return; +} + +/* + * Allocate and initialize a new entry in the indirection array. + */ +xfs_ext_irec_t * +xfs_iext_irec_new( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* index for new irec */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* Resize indirection array */ + xfs_iext_realloc_indirect(ifp, ++nlists * + sizeof(xfs_ext_irec_t)); + /* + * Move records down in the array so the + * new page can use erp_idx. 
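+	 * (Editorial sketch: inserting at erp_idx 1 in what was a
+	 * 3-entry array moves old entries 1 and 2 into slots 2 and 3,
+	 * leaving slot 1 free for the new page.)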
+ */ + erp = ifp->if_u1.if_ext_irec; + for (i = nlists - 1; i > erp_idx; i--) { + memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); + } + ASSERT(i == erp_idx); + + /* Initialize new extent record */ + erp = ifp->if_u1.if_ext_irec; + erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; + memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); + erp[erp_idx].er_extcount = 0; + erp[erp_idx].er_extoff = erp_idx > 0 ? + erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; + return (&erp[erp_idx]); +} + +/* + * Remove a record from the indirection array. + */ +void +xfs_iext_irec_remove( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* irec index to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + if (erp->er_extbuf) { + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, + -erp->er_extcount); + kmem_free(erp->er_extbuf); + } + /* Compact extent records */ + erp = ifp->if_u1.if_ext_irec; + for (i = erp_idx; i < nlists - 1; i++) { + memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); + } + /* + * Manually free the last extent record from the indirection + * array. A call to xfs_iext_realloc_indirect() with a size + * of zero would result in a call to xfs_iext_destroy() which + * would in turn call this function again, creating a nasty + * infinite loop. + */ + if (--nlists) { + xfs_iext_realloc_indirect(ifp, + nlists * sizeof(xfs_ext_irec_t)); + } else { + kmem_free(ifp->if_u1.if_ext_irec); + } + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; +} + +/* + * This is called to clean up large amounts of unused memory allocated + * by the indirection array. Before compacting anything though, verify + * that the indirection array is still needed and switch back to the + * linear extent list (or even the inline buffer) if possible. The + * compaction policy is as follows: + * + * Full Compaction: Extents fit into a single page (or inline buffer) + * Partial Compaction: Extents occupy less than 50% of allocated space + * No Compaction: Extents occupy at least 50% of allocated space + */ +void +xfs_iext_irec_compact( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (nextents == 0) { + xfs_iext_destroy(ifp); + } else if (nextents <= XFS_INLINE_EXTS) { + xfs_iext_indirect_to_direct(ifp); + xfs_iext_direct_to_inline(ifp, nextents); + } else if (nextents <= XFS_LINEAR_EXTS) { + xfs_iext_indirect_to_direct(ifp); + } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { + xfs_iext_irec_compact_pages(ifp); + } +} + +/* + * Combine extents from neighboring extent pages. 
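+ *
+ * (Editorial sketch: adjacent pages holding 100 and 120 records merge
+ * into one page of 220, since 100 + 120 <= XFS_LINEAR_EXTS (256); the
+ * emptied page is freed and its irec entry removed.)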
+ */ +void +xfs_iext_irec_compact_pages( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ + int erp_idx = 0; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + while (erp_idx < nlists - 1) { + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp + 1; + if (erp_next->er_extcount <= + (XFS_LINEAR_EXTS - erp->er_extcount)) { + memcpy(&erp->er_extbuf[erp->er_extcount], + erp_next->er_extbuf, erp_next->er_extcount * + sizeof(xfs_bmbt_rec_t)); + erp->er_extcount += erp_next->er_extcount; + /* + * Free page before removing extent record + * so er_extoffs don't get modified in + * xfs_iext_irec_remove. + */ + kmem_free(erp_next->er_extbuf); + erp_next->er_extbuf = NULL; + xfs_iext_irec_remove(ifp, erp_idx + 1); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + } else { + erp_idx++; + } + } +} + +/* + * This is called to update the er_extoff field in the indirection + * array when extents have been added or removed from one of the + * extent lists. erp_idx contains the irec index to begin updating + * at and ext_diff contains the number of extents that were added + * or removed. + */ +void +xfs_iext_irec_update_extoffs( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* irec index to update */ + int ext_diff) /* number of new extents */ +{ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = erp_idx; i < nlists; i++) { + ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; + } +} diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h new file mode 100644 index 000000000000..28661a0d9058 --- /dev/null +++ b/fs/xfs/xfs_inode_fork.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_FORK_H__ +#define __XFS_INODE_FORK_H__ + +struct xfs_inode_log_item; + +/* + * The following xfs_ext_irec_t struct introduces a second (top) level + * to the in-core extent allocation scheme. These structs are allocated + * in a contiguous block, creating an indirection array where each entry + * (irec) contains a pointer to a buffer of in-core extent records which + * it manages. Each extent buffer is 4k in size, since 4k is the system + * page size on Linux i386 and systems with larger page sizes don't seem + * to gain much, if anything, by using their native page size as the + * extent buffer size. Also, using 4k extent buffers everywhere provides + * a consistent interface for CXFS across different platforms. 
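Rough capacity arithmetic for this two-level scheme, as an illustrative aside (the 16-byte record and irec sizes are assumptions for a 64-bit build; neither sizeof() is spelled out in this header):

#include <stdio.h>

int main(void)
{
	long bufsz = 4096;	/* XFS_IEXT_BUFSZ */
	long recsz = 16;	/* assumed sizeof(xfs_bmbt_rec_t) */
	long irecsz = 16;	/* assumed sizeof(xfs_ext_irec_t) */

	long exts_per_buf = bufsz / recsz;	/* 256, i.e. XFS_LINEAR_EXTS */
	long irecs_per_page = bufsz / irecsz;	/* 256 irecs per 4k of array */

	/* about 65536 extents before the indirection array outgrows 4k */
	printf("%ld extents per buffer, %ld per 4k of irecs\n",
	       exts_per_buf, exts_per_buf * irecs_per_page);
	return 0;
}

That is, only files fragmented into tens of thousands of extents push the indirection array itself past one page, which is the point the next paragraph makes.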
+ * + * There is currently no limit on the number of irec's (extent lists) + * allowed, so heavily fragmented files may require an indirection array + * which spans multiple system pages of memory. The number of extents + * which would require this amount of contiguous memory is very large + * and should not cause problems in the foreseeable future. However, + * if the memory needed for the contiguous array ever becomes a problem, + * it is possible that a third level of indirection may be required. + */ +typedef struct xfs_ext_irec { + xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ + xfs_extnum_t er_extoff; /* extent offset in file */ + xfs_extnum_t er_extcount; /* number of extents in page/block */ +} xfs_ext_irec_t; + +/* + * File incore extent information, present for each of data & attr forks. + */ +#define XFS_IEXT_BUFSZ 4096 +#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) +#define XFS_INLINE_EXTS 2 +#define XFS_INLINE_DATA 32 +typedef struct xfs_ifork { + int if_bytes; /* bytes in if_u1 */ + int if_real_bytes; /* bytes allocated in if_u1 */ + struct xfs_btree_block *if_broot; /* file's incore btree root */ + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ + union { + xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ + xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ + char *if_data; /* inline file data */ + } if_u1; + union { + xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; + /* very small file extents */ + char if_inline_data[XFS_INLINE_DATA]; + /* very small file data */ + xfs_dev_t if_rdev; /* dev number if special */ + uuid_t if_uuid; /* mount point value */ + } if_u2; +} xfs_ifork_t; + +/* + * Per-fork incore inode flags. + */ +#define XFS_IFINLINE 0x01 /* Inline data is read in */ +#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ +#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ + +/* + * Fork handling. + */ + +#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) + +#define XFS_IFORK_PTR(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + &(ip)->i_df : \ + (ip)->i_afp) +#define XFS_IFORK_DSIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) +#define XFS_IFORK_ASIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ + XFS_IFORK_BOFF(ip) : \ + 0) +#define XFS_IFORK_SIZE(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_IFORK_DSIZE(ip) : \ + XFS_IFORK_ASIZE(ip)) +#define XFS_IFORK_FORMAT(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_format : \ + (ip)->i_d.di_aformat) +#define XFS_IFORK_FMT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_format = (n)) : \ + ((ip)->i_d.di_aformat = (n))) +#define XFS_IFORK_NEXTENTS(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_nextents : \ + (ip)->i_d.di_anextents) +#define XFS_IFORK_NEXT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? 
\ + ((ip)->i_d.di_nextents = (n)) : \ + ((ip)->i_d.di_anextents = (n))) +#define XFS_IFORK_MAXEXT(ip, w) \ + (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + +int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, + struct xfs_inode_log_item *, int, + struct xfs_buf *); +void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_iroot_realloc(struct xfs_inode *, int, int); +int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); +int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, + int); + +struct xfs_bmbt_rec_host * + xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, + struct xfs_bmbt_irec *, int); +void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_add_indirect_multi(struct xfs_ifork *, int, + xfs_extnum_t, int); +void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int); +void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_realloc_direct(struct xfs_ifork *, int); +void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_inline_to_direct(struct xfs_ifork *, int); +void xfs_iext_destroy(struct xfs_ifork *); +struct xfs_bmbt_rec_host * + xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *); +struct xfs_ext_irec * + xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *); +struct xfs_ext_irec * + xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *, + int); +void xfs_iext_irec_init(struct xfs_ifork *); +struct xfs_ext_irec * + xfs_iext_irec_new(struct xfs_ifork *, int); +void xfs_iext_irec_remove(struct xfs_ifork *, int); +void xfs_iext_irec_compact(struct xfs_ifork *); +void xfs_iext_irec_compact_pages(struct xfs_ifork *); +void xfs_iext_irec_compact_full(struct xfs_ifork *); +void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); + +extern struct kmem_zone *xfs_ifork_zone; + +#endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f76ff52e43c0..378081109844 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -47,32 +47,44 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) * inode core, and possibly one for the inode data/extents/b-tree root * and one for the inode attribute data/extents/b-tree root. 
*/ -STATIC uint +STATIC void xfs_inode_item_size( - struct xfs_log_item *lip) + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - uint nvecs = 2; + + *nvecs += 2; + *nbytes += sizeof(struct xfs_inode_log_format) + + xfs_icdinode_size(ip->i_d.di_version); switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_DEXT) && ip->i_d.di_nextents > 0 && - ip->i_df.if_bytes > 0) - nvecs++; + ip->i_df.if_bytes > 0) { + /* worst case, doesn't subtract delalloc extents */ + *nbytes += XFS_IFORK_DSIZE(ip); + *nvecs += 1; + } break; case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_DBROOT) && - ip->i_df.if_broot_bytes > 0) - nvecs++; + ip->i_df.if_broot_bytes > 0) { + *nbytes += ip->i_df.if_broot_bytes; + *nvecs += 1; + } break; case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && - ip->i_df.if_bytes > 0) - nvecs++; + ip->i_df.if_bytes > 0) { + *nbytes += roundup(ip->i_df.if_bytes, 4); + *nvecs += 1; + } break; case XFS_DINODE_FMT_DEV: @@ -85,7 +97,7 @@ xfs_inode_item_size( } if (!XFS_IFORK_Q(ip)) - return nvecs; + return; /* @@ -95,28 +107,33 @@ xfs_inode_item_size( case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && ip->i_d.di_anextents > 0 && - ip->i_afp->if_bytes > 0) - nvecs++; + ip->i_afp->if_bytes > 0) { + /* worst case, doesn't subtract unused space */ + *nbytes += XFS_IFORK_ASIZE(ip); + *nvecs += 1; + } break; case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_ABROOT) && - ip->i_afp->if_broot_bytes > 0) - nvecs++; + ip->i_afp->if_broot_bytes > 0) { + *nbytes += ip->i_afp->if_broot_bytes; + *nvecs += 1; + } break; case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && - ip->i_afp->if_bytes > 0) - nvecs++; + ip->i_afp->if_bytes > 0) { + *nbytes += roundup(ip->i_afp->if_bytes, 4); + *nvecs += 1; + } break; default: ASSERT(0); break; } - - return nvecs; } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 779812fb3d80..dce4d656768c 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -18,123 +18,13 @@ #ifndef __XFS_INODE_ITEM_H__ #define __XFS_INODE_ITEM_H__ -/* - * This is the structure used to lay out an inode log item in the - * log. The size of the inline data/extents/b-tree root to be logged - * (if any) is indicated in the ilf_dsize field. Changes to this structure - * must be added on to the end. 
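Returning to the xfs_inode_item_size() rework at the top of this hunk: the ->iop_size() contract changes from returning a vector count to accumulating both a vector count and a worst-case byte count into caller-owned totals, so a whole transaction can be sized in one pass. A minimal model of the new calling pattern (hypothetical names, not the real xfs_log_item_ops):

struct item;
typedef void (*size_fn)(struct item *ip, int *nvecs, int *nbytes);

struct item {
	size_fn	size;		/* stands in for ->iop_size() */
	int	payload;	/* worst-case formatted payload, in bytes */
};

static void item_size(struct item *ip, int *nvecs, int *nbytes)
{
	*nvecs += 1;		/* one region for the payload */
	*nbytes += ip->payload;	/* over-estimating is acceptable here */
}

/* Size every dirty item in one pass; the totals later feed the log
 * vector buffer allocation in the commit path. */
static void size_all(struct item **items, int n, int *nvecs, int *nbytes)
{
	int i;

	*nvecs = 0;
	*nbytes = 0;
	for (i = 0; i < n; i++)
		items[i]->size(items[i], nvecs, nbytes);
}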
- */ -typedef struct xfs_inode_log_format { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint64_t ilf_ino; /* inode number */ - union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_t; - -typedef struct xfs_inode_log_format_32 { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint64_t ilf_ino; /* inode number */ - union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ -} __attribute__((packed)) xfs_inode_log_format_32_t; - -typedef struct xfs_inode_log_format_64 { - __uint16_t ilf_type; /* inode log item type */ - __uint16_t ilf_size; /* size of this item */ - __uint32_t ilf_fields; /* flags for fields logged */ - __uint16_t ilf_asize; /* size of attr d/ext/root */ - __uint16_t ilf_dsize; /* size of data/ext/root */ - __uint32_t ilf_pad; /* pad for 64 bit boundary */ - __uint64_t ilf_ino; /* inode number */ - union { - __uint32_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - __int32_t ilf_len; /* len of inode buffer */ - __int32_t ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_64_t; - -/* - * Flags for xfs_trans_log_inode flags field. - */ -#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ -#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ -#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ -#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ -#define XFS_ILOG_DEV 0x010 /* log the dev field */ -#define XFS_ILOG_UUID 0x020 /* log the uuid field */ -#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ -#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ -#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ - - -/* - * The timestamps are dirty, but not necessarily anything else in the inode - * core. Unlike the other fields above this one must never make it to disk - * in the ilf_fields of the inode_log_format, but is purely store in-memory in - * ili_fields in the inode_log_item. - */ -#define XFS_ILOG_TIMESTAMP 0x4000 - -#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ - XFS_ILOG_UUID | XFS_ILOG_ADATA | \ - XFS_ILOG_AEXT | XFS_ILOG_ABROOT) - -#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT) - -#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT) - -#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ - XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ - XFS_ILOG_DEV | XFS_ILOG_UUID | \ - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP) - -static inline int xfs_ilog_fbroot(int w) -{ - return (w == XFS_DATA_FORK ? 
XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); -} - -static inline int xfs_ilog_fext(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); -} - -static inline int xfs_ilog_fdata(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); -} - -#ifdef __KERNEL__ +/* kernel only definitions */ struct xfs_buf; struct xfs_bmbt_rec; struct xfs_inode; struct xfs_mount; - typedef struct xfs_inode_log_item { xfs_log_item_t ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ @@ -151,7 +41,6 @@ typedef struct xfs_inode_log_item { xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; - static inline int xfs_inode_clean(xfs_inode_t *ip) { return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); @@ -165,6 +54,6 @@ extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); -#endif /* __KERNEL__ */ +extern struct kmem_zone *xfs_ili_zone; #endif /* __XFS_INODE_ITEM_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6e2bca5d44d6..bdebc21078d7 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -17,6 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -32,17 +33,16 @@ #include "xfs_error.h" #include "xfs_attr.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_dfrag.h" #include "xfs_fsops.h" -#include "xfs_vnodeops.h" #include "xfs_discard.h" #include "xfs_quota.h" #include "xfs_inode_item.h" #include "xfs_export.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -350,6 +350,40 @@ xfs_readlink_by_handle( return error; } +int +xfs_set_dmattrs( + xfs_inode_t *ip, + u_int evmask, + u_int16_t state) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + ip->i_d.di_dmevmask = evmask; + ip->i_d.di_dmstate = state; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp, 0); + + return error; +} + STATIC int xfs_fssetdm_by_handle( struct file *parfilp, @@ -967,7 +1001,7 @@ xfs_ioctl_setattr( * first do an error checking pass. */ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (code) goto error_return; @@ -981,15 +1015,22 @@ xfs_ioctl_setattr( * to the file owner ID, except in cases where the * CAP_FSETID capability is applicable. */ - if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) { + if (!inode_owner_or_capable(VFS_I(ip))) { code = XFS_ERROR(EPERM); goto error_return; } /* * Do a quota reservation only if projid is actually going to change. + * Only allow changing of projid from init_user_ns since it is a + * non user namespace aware identifier. 
*/ if (mask & FSX_PROJID) { + if (current_user_ns() != &init_user_ns) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && xfs_get_projid(ip) != fa->fsx_projid) { @@ -1103,7 +1144,7 @@ xfs_ioctl_setattr( * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) + !inode_capable(VFS_I(ip), CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* @@ -1328,6 +1369,75 @@ xfs_ioc_getbmapx( return 0; } +int +xfs_ioc_swapext( + xfs_swapext_t *sxp) +{ + xfs_inode_t *ip, *tip; + struct fd f, tmp; + int error = 0; + + /* Pull information for the target fd */ + f = fdget((int)sxp->sx_fdtarget); + if (!f.file) { + error = XFS_ERROR(EINVAL); + goto out; + } + + if (!(f.file->f_mode & FMODE_WRITE) || + !(f.file->f_mode & FMODE_READ) || + (f.file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_file; + } + + tmp = fdget((int)sxp->sx_fdtmp); + if (!tmp.file) { + error = XFS_ERROR(EINVAL); + goto out_put_file; + } + + if (!(tmp.file->f_mode & FMODE_WRITE) || + !(tmp.file->f_mode & FMODE_READ) || + (tmp.file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_tmp_file; + } + + if (IS_SWAPFILE(file_inode(f.file)) || + IS_SWAPFILE(file_inode(tmp.file))) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + ip = XFS_I(file_inode(f.file)); + tip = XFS_I(file_inode(tmp.file)); + + if (ip->i_mount != tip->i_mount) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (ip->i_ino == tip->i_ino) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = XFS_ERROR(EIO); + goto out_put_tmp_file; + } + + error = xfs_swap_extents(ip, tip, sxp); + + out_put_tmp_file: + fdput(tmp); + out_put_file: + fdput(f); + out: + return error; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. 
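An aside on xfs_ioc_swapext() above: its validation ladder (both descriptors open read-write, neither append-only, same filesystem, distinct inodes) can be approximated from userspace with fcntl() and fstat(). Illustrative only; the kernel code checks f_mode, f_flags and the XFS inodes directly:

#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>

/* Return 0 if fd1/fd2 would pass the xfs_ioc_swapext() style checks. */
static int swap_candidates(int fd1, int fd2)
{
	struct stat st1, st2;
	int fl1 = fcntl(fd1, F_GETFL);
	int fl2 = fcntl(fd2, F_GETFL);

	if (fl1 < 0 || fl2 < 0)
		return -EINVAL;		/* bad file descriptor */
	if ((fl1 & O_ACCMODE) != O_RDWR || (fl2 & O_ACCMODE) != O_RDWR)
		return -EBADF;		/* need both read and write */
	if ((fl1 & O_APPEND) || (fl2 & O_APPEND))
		return -EBADF;		/* append-only files may not swap */
	if (fstat(fd1, &st1) || fstat(fd2, &st2))
		return -EINVAL;
	if (st1.st_dev != st2.st_dev)
		return -EINVAL;		/* must be on the same filesystem */
	if (st1.st_ino == st2.st_ino)
		return -EINVAL;		/* cannot swap a file with itself */
	return 0;
}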
@@ -1472,7 +1582,7 @@ xfs_file_ioctl( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_swapext(&sxp); + error = xfs_ioc_swapext(&sxp); mnt_drop_write_file(filp); return -error; } @@ -1610,23 +1720,23 @@ xfs_file_ioctl( return -error; case XFS_IOC_FREE_EOFBLOCKS: { - struct xfs_eofblocks eofb; + struct xfs_fs_eofblocks eofb; + struct xfs_eofblocks keofb; - if (copy_from_user(&eofb, arg, sizeof(eofb))) - return -XFS_ERROR(EFAULT); + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; - if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) - return -XFS_ERROR(EINVAL); + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -XFS_ERROR(EROFS); - if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) - return -XFS_ERROR(EINVAL); + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -XFS_ERROR(EFAULT); - if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || - memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) - return -XFS_ERROR(EINVAL); + error = xfs_fs_eofblocks_from_user(&eofb, &keofb); + if (error) + return -error; - error = xfs_icache_free_eofblocks(mp, &eofb); - return -error; + return -xfs_icache_free_eofblocks(mp, &keofb); } default: diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index d56173b34a2a..77c02c7900b6 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -27,6 +27,10 @@ xfs_ioc_space( unsigned int cmd, xfs_flock64_t *bf); +int +xfs_ioc_swapext( + xfs_swapext_t *sxp); + extern int xfs_find_handle( unsigned int cmd, @@ -82,4 +86,10 @@ xfs_file_compat_ioctl( unsigned int cmd, unsigned long arg); +extern int +xfs_set_dmattrs( + struct xfs_inode *ip, + u_int evmask, + u_int16_t state); + #endif diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c0c66259cc91..d3ab9534307f 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -33,8 +33,6 @@ #include "xfs_inode.h" #include "xfs_itable.h" #include "xfs_error.h" -#include "xfs_dfrag.h" -#include "xfs_vnodeops.h" #include "xfs_fsops.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" @@ -644,7 +642,7 @@ xfs_file_compat_ioctl( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_swapext(&sxp); + error = xfs_ioc_swapext(&sxp); mnt_drop_write_file(filp); return -error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6a7096422295..8d4d49b6fbf3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -17,6 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -32,13 +33,13 @@ #include "xfs_inode_item.h" #include "xfs_btree.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" -#include "xfs_utils.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -187,10 +188,8 @@ xfs_iomap_write_direct( * Allocate and setup the transaction */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, resblks, - XFS_WRITE_LOG_RES(mp), resrtextents, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); /* * Check for running out of space, note: need lock to return */ @@ -698,10 +697,8 @@ xfs_iomap_write_allocate( tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_reserve(tp, nres, - XFS_WRITE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - 
XFS_WRITE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + nres, 0); if (error) { xfs_trans_cancel(tp, 0); return XFS_ERROR(error); @@ -864,10 +861,8 @@ xfs_iomap_write_unwritten( sb_start_intwrite(mp->m_super); tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; - error = xfs_trans_reserve(tp, resblks, - XFS_WRITE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, 0); if (error) { xfs_trans_cancel(tp, 0); return XFS_ERROR(error); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 96dda62d497b..2b8952d9199b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -17,6 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_acl.h" #include "xfs_log.h" #include "xfs_trans.h" @@ -29,16 +30,19 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" #include "xfs_attr.h" #include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_vnodeops.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -87,10 +91,12 @@ xfs_init_security( static void xfs_dentry_to_name( struct xfs_name *namep, - struct dentry *dentry) + struct dentry *dentry, + int mode) { namep->name = dentry->d_name.name; namep->len = dentry->d_name.len; + namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT]; } STATIC void @@ -106,7 +112,7 @@ xfs_cleanup_inode( * xfs_init_security we must back out. * ENOSPC can hit here, among other things. */ - xfs_dentry_to_name(&teardown, dentry); + xfs_dentry_to_name(&teardown, dentry, 0); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); iput(inode); @@ -146,7 +152,7 @@ xfs_vn_mknod( mode &= ~current_umask(); } - xfs_dentry_to_name(&name, dentry); + xfs_dentry_to_name(&name, dentry, mode); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); if (unlikely(error)) goto out_free_acl; @@ -207,7 +213,7 @@ xfs_vn_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&name, dentry); + xfs_dentry_to_name(&name, dentry, 0); error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); if (unlikely(error)) { if (unlikely(error != ENOENT)) @@ -234,7 +240,7 @@ xfs_vn_ci_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&xname, dentry); + xfs_dentry_to_name(&xname, dentry, 0); error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); if (unlikely(error)) { if (unlikely(error != ENOENT)) @@ -269,7 +275,7 @@ xfs_vn_link( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry); + xfs_dentry_to_name(&name, dentry, inode->i_mode); error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) @@ -288,7 +294,7 @@ xfs_vn_unlink( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry); + xfs_dentry_to_name(&name, dentry, 0); error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); if (error) @@ -318,7 +324,7 @@ xfs_vn_symlink( mode = S_IFLNK | (irix_symlink_mode ? 
0777 & ~current_umask() : S_IRWXUGO); - xfs_dentry_to_name(&name, dentry); + xfs_dentry_to_name(&name, dentry, mode); error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) @@ -350,12 +356,12 @@ xfs_vn_rename( struct xfs_name oname; struct xfs_name nname; - xfs_dentry_to_name(&oname, odentry); - xfs_dentry_to_name(&nname, ndentry); + xfs_dentry_to_name(&oname, odentry, 0); + xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), XFS_I(ndir), &nname, new_inode ? - XFS_I(new_inode) : NULL); + XFS_I(new_inode) : NULL); } /* @@ -420,8 +426,8 @@ xfs_vn_getattr( stat->dev = inode->i_sb->s_dev; stat->mode = ip->i_d.di_mode; stat->nlink = ip->i_d.di_nlink; - stat->uid = ip->i_d.di_uid; - stat->gid = ip->i_d.di_gid; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; @@ -485,8 +491,8 @@ xfs_setattr_nonsize( int mask = iattr->ia_valid; xfs_trans_t *tp; int error; - uid_t uid = 0, iuid = 0; - gid_t gid = 0, igid = 0; + kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; struct xfs_dquot *udqp = NULL, *gdqp = NULL; struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; @@ -522,13 +528,13 @@ xfs_setattr_nonsize( uid = iattr->ia_uid; qflags |= XFS_QMOPT_UQUOTA; } else { - uid = ip->i_d.di_uid; + uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { gid = iattr->ia_gid; qflags |= XFS_QMOPT_GQUOTA; } else { - gid = ip->i_d.di_gid; + gid = inode->i_gid; } /* @@ -538,14 +544,16 @@ xfs_setattr_nonsize( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), - qflags, &udqp, &gdqp, NULL); + error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), + xfs_kgid_to_gid(gid), + xfs_get_projid(ip), + qflags, &udqp, &gdqp, NULL); if (error) return error; } tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) goto out_dqrele; @@ -561,8 +569,8 @@ xfs_setattr_nonsize( * while we didn't have the inode locked, inode's dquot(s) * would have changed also. */ - iuid = ip->i_d.di_uid; - igid = ip->i_d.di_gid; + iuid = inode->i_uid; + igid = inode->i_gid; gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; @@ -571,8 +579,8 @@ xfs_setattr_nonsize( * going to change. */ if (XFS_IS_QUOTA_RUNNING(mp) && - ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || - (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { + ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) || + (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) { ASSERT(tp); error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, NULL, capable(CAP_FOWNER) ? @@ -602,17 +610,17 @@ xfs_setattr_nonsize( * Change the ownerships and register quota modifications * in the transaction. 
*/ - if (iuid != uid) { + if (!uid_eq(iuid, uid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - ip->i_d.di_uid = uid; + ip->i_d.di_uid = xfs_kuid_to_uid(uid); inode->i_uid = uid; } - if (igid != gid) { + if (!gid_eq(igid, gid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { ASSERT(!XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); @@ -620,7 +628,7 @@ xfs_setattr_nonsize( olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - ip->i_d.di_gid = gid; + ip->i_d.di_gid = xfs_kgid_to_gid(gid); inode->i_gid = gid; } } @@ -807,9 +815,7 @@ xfs_setattr_size( goto out_unlock; tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out_trans_cancel; @@ -932,7 +938,7 @@ xfs_vn_update_time( trace_xfs_update_time(ip); tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return -error; @@ -1173,8 +1179,8 @@ xfs_setup_inode( inode->i_mode = ip->i_d.di_mode; set_nlink(inode, ip->i_d.di_nlink); - inode->i_uid = ip->i_d.di_uid; - inode->i_gid = ip->i_d.di_gid; + inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); + inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); switch (inode->i_mode & S_IFMT) { case S_IFBLK: diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index ef41c92ce66e..d81fb41205ec 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -27,4 +27,17 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); extern void xfs_setup_inode(struct xfs_inode *); +/* + * Internal setattr interfaces. 
+ */ +#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ +#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if op would block */ +#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ +#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ +#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */ + +extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, + int flags); +extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags); + #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 800f896a6cc4..f9bb590acc0e 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -32,6 +32,38 @@ # define XFS_BIG_INUMS 0 #endif +/* + * Kernel specific type declarations for XFS + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef signed short int __int16_t; +typedef unsigned short int __uint16_t; +typedef signed int __int32_t; +typedef unsigned int __uint32_t; +typedef signed long long int __int64_t; +typedef unsigned long long int __uint64_t; + +typedef __uint32_t inst_t; /* an instruction */ + +typedef __s64 xfs_off_t; /* <file offset> type */ +typedef unsigned long long xfs_ino_t; /* <inode> type */ +typedef __s64 xfs_daddr_t; /* <disk address> type */ +typedef char * xfs_caddr_t; /* <core address> type */ +typedef __u32 xfs_dev_t; +typedef __u32 xfs_nlink_t; + +/* __psint_t is the same size as a pointer */ +#if (BITS_PER_LONG == 32) +typedef __int32_t __psint_t; +typedef __uint32_t __psunsigned_t; +#elif (BITS_PER_LONG == 64) +typedef __int64_t __psint_t; +typedef __uint64_t __psunsigned_t; +#else +#error BITS_PER_LONG must be 32 or 64 +#endif + #include "xfs_types.h" #include "kmem.h" @@ -114,8 +146,6 @@ #define xfs_inherit_sync xfs_params.inherit_sync.val #define xfs_inherit_nodump xfs_params.inherit_nodump.val #define xfs_inherit_noatime xfs_params.inherit_noatim.val -#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val -#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val @@ -159,6 +189,32 @@ #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) +/* Kernel uid/gid conversion. These are used to convert to/from the on disk + * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. + * The conversion here is type only, the value will remain the same since we + * are converting to the init_user_ns. The uid is later mapped to a particular + * user namespace value when crossing the kernel/user boundary. 
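A sketch of the point this comment makes, in kernel terms (these are the stock uidgid.h helpers that the wrappers defined just below are built on): conversion through init_user_ns changes only the type, so a round trip preserves the raw on-disk value, and typed ids must be compared with uid_eq() rather than '==':

#include <linux/types.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

static bool uid_round_trip_ok(u32 disk_uid)
{
	kuid_t kuid = make_kuid(&init_user_ns, disk_uid);

	/* The raw value survives the round trip through init_user_ns... */
	if (from_kuid(&init_user_ns, kuid) != disk_uid)
		return false;
	/* ...and kuid_t is opaque, so equality goes through uid_eq(). */
	return uid_eq(kuid, make_kuid(&init_user_ns, disk_uid));
}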
+ */ +static inline __uint32_t xfs_kuid_to_uid(kuid_t uid) +{ + return from_kuid(&init_user_ns, uid); +} + +static inline kuid_t xfs_uid_to_kuid(__uint32_t uid) +{ + return make_kuid(&init_user_ns, uid); +} + +static inline __uint32_t xfs_kgid_to_gid(kgid_t gid) +{ + return from_kgid(&init_user_ns, gid); +} + +static inline kgid_t xfs_gid_to_kgid(__uint32_t gid) +{ + return make_kgid(&init_user_ns, gid); +} + /* * Various platform dependent calls that don't fit anywhere else */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d852a2b3e1fd..5372d58ef93a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -614,7 +614,8 @@ xfs_log_mount( xfs_daddr_t blk_offset, int num_bblks) { - int error; + int error = 0; + int min_logfsbs; if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) xfs_notice(mp, "Mounting Filesystem"); @@ -631,6 +632,50 @@ xfs_log_mount( } /* + * Validate the given log space and drop a critical message via syslog + * if the log size is too small that would lead to some unexpected + * situations in transaction log space reservation stage. + * + * Note: we can't just reject the mount if the validation fails. This + * would mean that people would have to downgrade their kernel just to + * remedy the situation as there is no way to grow the log (short of + * black magic surgery with xfs_db). + * + * We can, however, reject mounts for CRC format filesystems, as the + * mkfs binary being used to make the filesystem should never create a + * filesystem with a log that is too small. + */ + min_logfsbs = xfs_log_calc_minimum_size(mp); + + if (mp->m_sb.sb_logblocks < min_logfsbs) { + xfs_warn(mp, + "Log size %d blocks too small, minimum size is %d blocks", + mp->m_sb.sb_logblocks, min_logfsbs); + error = EINVAL; + } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_warn(mp, + "Log size %d blocks too large, maximum size is %lld blocks", + mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); + error = EINVAL; + } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size %lld bytes too large, maximum size is %lld bytes", + XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), + XFS_MAX_LOG_BYTES); + error = EINVAL; + } + if (error) { + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); + ASSERT(0); + goto out_free_log; + } + xfs_crit(mp, +"Log size out of supported range. Continuing onwards, but if log hangs are\n" +"experienced then please report this message in the bug report."); + } + + /* * Initialize the AIL now we have a log. */ error = xfs_trans_ail_init(mp); @@ -720,7 +765,7 @@ xfs_log_mount_finish(xfs_mount_t *mp) * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). * We just write the magic number now since that particular field isn't - * currently architecture converted and "nUmount" is a bit foo. + * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ @@ -1941,7 +1986,7 @@ xlog_print_tic_res( xfs_alert_tag(mp, XFS_PTAG_LOGRES, "xlog_write: reservation ran out. Need to up reservation"); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); } /* @@ -2044,7 +2089,7 @@ xlog_write_setup_ophdr( * Set up the parameters of the region copy into the log. 
This has * to handle region write split across multiple log buffers - this * state is kept external to this function so that this code can - * can be written in an obvious, self documenting manner. + * be written in an obvious, self documenting manner. */ static int xlog_write_setup_copy( @@ -3391,24 +3436,17 @@ xfs_log_ticket_get( } /* - * Allocate and initialise a new log ticket. + * Figure out the total log space unit (in bytes) that would be + * required for a log ticket. */ -struct xlog_ticket * -xlog_ticket_alloc( - struct xlog *log, - int unit_bytes, - int cnt, - char client, - bool permanent, - xfs_km_flags_t alloc_flags) +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) { - struct xlog_ticket *tic; - uint num_headers; - int iclog_space; - - tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); - if (!tic) - return NULL; + struct xlog *log = mp->m_log; + int iclog_space; + uint num_headers; /* * Permanent reservations have up to 'cnt'-1 active log operations @@ -3483,20 +3521,43 @@ xlog_ticket_alloc( unit_bytes += log->l_iclog_hsize; /* for roundoff padding for transaction data and one for commit record */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1) { + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { /* log su roundoff */ - unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; + unit_bytes += 2 * mp->m_sb.sb_logsunit; } else { /* BB roundoff */ - unit_bytes += 2*BBSIZE; + unit_bytes += 2 * BBSIZE; } + return unit_bytes; +} + +/* + * Allocate and initialise a new log ticket. + */ +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int cnt, + char client, + bool permanent, + xfs_km_flags_t alloc_flags) +{ + struct xlog_ticket *tic; + int unit_res; + + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); + if (!tic) + return NULL; + + unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + atomic_set(&tic->t_ref, 1); tic->t_task = current; INIT_LIST_HEAD(&tic->t_queue); - tic->t_unit_res = unit_bytes; - tic->t_curr_res = unit_bytes; + tic->t_unit_res = unit_res; + tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index fb630e496c12..1c458487f000 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -18,14 +18,30 @@ #ifndef __XFS_LOG_H__ #define __XFS_LOG_H__ -/* get lsn fields */ -#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) -#define BLOCK_LSN(lsn) ((uint)(lsn)) +#include "xfs_log_format.h" -/* this is used in a spot where we might otherwise double-endian-flip */ -#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) +struct xfs_log_vec { + struct xfs_log_vec *lv_next; /* next lv in build list */ + int lv_niovecs; /* number of iovecs in lv */ + struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_buf_len; /* size of formatted buffer */ + int lv_size; /* size of allocated lv */ +}; + +#define XFS_LOG_VEC_ORDERED (-1) + +/* + * Structure used to pass callback function and the function's argument + * to the log manager. 
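The xfs_log_callback structure declared just below is an intrusive, singly linked completion list. How such a chain gets consumed deserves a sketch (illustrative; the real walk lives in the log I/O completion path, not in this header):

#include <stddef.h>

typedef struct xfs_log_callback {
	struct xfs_log_callback	*cb_next;
	void			(*cb_func)(void *, int);
	void			*cb_arg;
} xfs_log_callback_t;

/* Walk a callback chain, invoking each entry with the abort flag.
 * Each entry is detached before its function runs, so a callback is
 * free to requeue itself elsewhere. */
static void run_callbacks(xfs_log_callback_t *cb, int aborted)
{
	xfs_log_callback_t *next;

	for (; cb != NULL; cb = next) {
		next = cb->cb_next;
		cb->cb_next = NULL;
		cb->cb_func(cb->cb_arg, aborted);
	}
}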
+ */ +typedef struct xfs_log_callback { + struct xfs_log_callback *cb_next; + void (*cb_func)(void *, int); + void *cb_arg; +} xfs_log_callback_t; -#ifdef __KERNEL__ /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number @@ -59,67 +75,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) */ #define XFS_LOG_SYNC 0x1 -#endif /* __KERNEL__ */ - - -/* Log Clients */ -#define XFS_TRANSACTION 0x69 -#define XFS_VOLUME 0x2 -#define XFS_LOG 0xaa - - -/* Region types for iovec's i_type */ -#define XLOG_REG_TYPE_BFORMAT 1 -#define XLOG_REG_TYPE_BCHUNK 2 -#define XLOG_REG_TYPE_EFI_FORMAT 3 -#define XLOG_REG_TYPE_EFD_FORMAT 4 -#define XLOG_REG_TYPE_IFORMAT 5 -#define XLOG_REG_TYPE_ICORE 6 -#define XLOG_REG_TYPE_IEXT 7 -#define XLOG_REG_TYPE_IBROOT 8 -#define XLOG_REG_TYPE_ILOCAL 9 -#define XLOG_REG_TYPE_IATTR_EXT 10 -#define XLOG_REG_TYPE_IATTR_BROOT 11 -#define XLOG_REG_TYPE_IATTR_LOCAL 12 -#define XLOG_REG_TYPE_QFORMAT 13 -#define XLOG_REG_TYPE_DQUOT 14 -#define XLOG_REG_TYPE_QUOTAOFF 15 -#define XLOG_REG_TYPE_LRHEADER 16 -#define XLOG_REG_TYPE_UNMOUNT 17 -#define XLOG_REG_TYPE_COMMIT 18 -#define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_ICREATE 20 -#define XLOG_REG_TYPE_MAX 20 - -typedef struct xfs_log_iovec { - void *i_addr; /* beginning address of region */ - int i_len; /* length in bytes of region */ - uint i_type; /* type of region */ -} xfs_log_iovec_t; - -struct xfs_log_vec { - struct xfs_log_vec *lv_next; /* next lv in build list */ - int lv_niovecs; /* number of iovecs in lv */ - struct xfs_log_iovec *lv_iovecp; /* iovec array */ - struct xfs_log_item *lv_item; /* owner */ - char *lv_buf; /* formatted buffer */ - int lv_buf_len; /* size of formatted buffer */ -}; - -#define XFS_LOG_VEC_ORDERED (-1) - -/* - * Structure used to pass callback function and the function's argument - * to the log manager. - */ -typedef struct xfs_log_callback { - struct xfs_log_callback *cb_next; - void (*cb_func)(void *, int); - void *cb_arg; -} xfs_log_callback_t; - - -#ifdef __KERNEL__ /* Log manager interfaces */ struct xfs_mount; struct xlog_in_core; @@ -188,5 +143,4 @@ void xfs_log_work_queue(struct xfs_mount *mp); void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); -#endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 02b9cf3f8252..cfe97973ba36 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -80,6 +80,83 @@ xlog_cil_init_post_recovery( log->l_curr_block); } +STATIC int +xlog_cil_lv_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + int index; + char *ptr; + + /* format new vectors into array */ + lip->li_ops->iop_format(lip, lv->lv_iovecp); + + /* copy data into existing array */ + ptr = lv->lv_buf; + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + + memcpy(ptr, vec->i_addr, vec->i_len); + vec->i_addr = ptr; + ptr += vec->i_len; + } + + /* + * some size calculations for log vectors over-estimate, so the caller + * doesn't know the amount of space actually used by the item. Return + * the byte count to the caller so they can check and store it + * appropriately. + */ + return ptr - lv->lv_buf; +} + +/* + * Prepare the log item for insertion into the CIL. Calculate the difference in + * log space and vectors it will consume, and if it is a new item pin it as + * well. 
+ */ +STATIC void +xfs_cil_prepare_item( + struct xlog *log, + struct xfs_log_vec *lv, + struct xfs_log_vec *old_lv, + int *diff_len, + int *diff_iovecs) +{ + /* Account for the new LV being passed in */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + *diff_len += lv->lv_buf_len; + *diff_iovecs += lv->lv_niovecs; + } + + /* + * If there is no old LV, this is the first time we've seen the item in + * this CIL context and so we need to pin it. If we are replacing the + * old_lv, then remove the space it accounts for and free it. + */ + if (!old_lv) + lv->lv_item->li_ops->iop_pin(lv->lv_item); + else if (old_lv != lv) { + ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); + + *diff_len -= old_lv->lv_buf_len; + *diff_iovecs -= old_lv->lv_niovecs; + kmem_free(old_lv); + } + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + /* + * If this is the first time the item is being committed to the + * CIL, store the sequence number on the log item so we can + * tell in future commits whether this is the first checkpoint + * the item is being committed into. + */ + if (!lv->lv_item->li_seq) + lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; +} + /* * Format log item into a flat buffers * @@ -106,35 +183,39 @@ xlog_cil_init_post_recovery( * format the regions into the iclog as though they are being formatted * directly out of the objects themselves. */ -static struct xfs_log_vec * -xlog_cil_prepare_log_vecs( - struct xfs_trans *tp) +static void +xlog_cil_insert_format_items( + struct xlog *log, + struct xfs_trans *tp, + int *diff_len, + int *diff_iovecs) { struct xfs_log_item_desc *lidp; - struct xfs_log_vec *lv = NULL; - struct xfs_log_vec *ret_lv = NULL; /* Bail out if we didn't find a log item. */ if (list_empty(&tp->t_items)) { ASSERT(0); - return NULL; + return; } list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_vec *new_lv; - void *ptr; - int index; - int len = 0; - uint niovecs; + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + struct xfs_log_vec *old_lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + /* Skip items that do not have any vectors for writing */ - niovecs = IOP_SIZE(lidp->lid_item); if (!niovecs) continue; @@ -146,109 +227,63 @@ xlog_cil_prepare_log_vecs( if (niovecs == XFS_LOG_VEC_ORDERED) { ordered = true; niovecs = 0; + nbytes = 0; } - new_lv = kmem_zalloc(sizeof(*new_lv) + - niovecs * sizeof(struct xfs_log_iovec), - KM_SLEEP|KM_NOFS); - - new_lv->lv_item = lidp->lid_item; - new_lv->lv_niovecs = niovecs; - if (ordered) { - /* track as an ordered logvec */ - new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED; - goto next; - } - - /* The allocated iovec region lies beyond the log vector. 
*/ - new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + /* grab the old item if it exists for reservation accounting */ + old_lv = lip->li_lv; - /* build the vector array and calculate it's length */ - IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); - for (index = 0; index < new_lv->lv_niovecs; index++) - len += new_lv->lv_iovecp[index].i_len; + /* calc buffer size */ + buf_size = sizeof(struct xfs_log_vec) + nbytes + + niovecs * sizeof(struct xfs_log_iovec); - new_lv->lv_buf_len = len; - new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len, - KM_SLEEP|KM_NOFS); - ptr = new_lv->lv_buf; + /* compare to existing item size */ + if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv; + lv->lv_next = NULL; - for (index = 0; index < new_lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index]; + if (ordered) + goto insert; - memcpy(ptr, vec->i_addr, vec->i_len); - vec->i_addr = ptr; - ptr += vec->i_len; - } - ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); - -next: - if (!ret_lv) - ret_lv = new_lv; - else - lv->lv_next = new_lv; - lv = new_lv; - } - - return ret_lv; -} - -/* - * Prepare the log item for insertion into the CIL. Calculate the difference in - * log space and vectors it will consume, and if it is a new item pin it as - * well. - */ -STATIC void -xfs_cil_prepare_item( - struct xlog *log, - struct xfs_log_vec *lv, - int *len, - int *diff_iovecs) -{ - struct xfs_log_vec *old = lv->lv_item->li_lv; + /* + * set the item up as though it is a new insertion so + * that the space reservation accounting is correct. + */ + *diff_iovecs -= lv->lv_niovecs; + *diff_len -= lv->lv_buf_len; - if (old) { - /* existing lv on log item, space used is a delta */ - ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) || - old->lv_buf_len == XFS_LOG_VEC_ORDERED); + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + lv->lv_buf = (char *)lv + buf_size - nbytes; - /* - * If the new item is ordered, keep the old one that is already - * tracking dirty or ordered regions - */ - if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { - ASSERT(!lv->lv_buf); - kmem_free(lv); - return; + lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); + goto insert; } - *len += lv->lv_buf_len - old->lv_buf_len; - *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; - kmem_free(old->lv_buf); - kmem_free(old); - } else { - /* new lv, must pin the log item */ - ASSERT(!lv->lv_item->li_lv); - - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { - *len += lv->lv_buf_len; - *diff_iovecs += lv->lv_niovecs; + /* allocate new data chunk */ + lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + lv->lv_item = lip; + lv->lv_size = buf_size; + lv->lv_niovecs = niovecs; + if (ordered) { + /* track as an ordered logvec */ + ASSERT(lip->li_lv == NULL); + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto insert; } - IOP_PIN(lv->lv_item); - } + /* The allocated iovec region lies beyond the log vector. */ + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; - /* attach new log vector to log item */ - lv->lv_item->li_lv = lv; + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf = (char *)lv + buf_size - nbytes; - /* - * If this is the first time the item is being committed to the - * CIL, store the sequence number on the log item so we can - * tell in future commits whether this is the first checkpoint - * the item is being committed into. 
- */ - if (!lv->lv_item->li_seq) - lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; + lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); +insert: + ASSERT(lv->lv_buf_len <= nbytes); + xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); + } } /* @@ -261,53 +296,47 @@ xfs_cil_prepare_item( static void xlog_cil_insert_items( struct xlog *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *ticket) + struct xfs_trans *tp) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; - struct xfs_log_vec *lv; + struct xfs_log_item_desc *lidp; int len = 0; int diff_iovecs = 0; int iclog_space; - ASSERT(log_vector); + ASSERT(tp); /* - * Do all the accounting aggregation and switching of log vectors - * around in a separate loop to the insertion of items into the CIL. - * Then we can do a separate loop to update the CIL within a single - * lock/unlock pair. This reduces the number of round trips on the CIL - * lock from O(nr_logvectors) to O(1) and greatly reduces the overall - * hold time for the transaction commit. - * - * If this is the first time the item is being placed into the CIL in - * this context, pin it so it can't be written to disk until the CIL is - * flushed to the iclog and the iclog written to disk. - * * We can do this safely because the context can't checkpoint until we * are done so it doesn't matter exactly how we update the CIL. */ + xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); + + /* + * Now (re-)position everything modified at the tail of the CIL. + * We do this here so we only need to take the CIL lock once during + * the transaction commit. + */ spin_lock(&cil->xc_cil_lock); - for (lv = log_vector; lv; ) { - struct xfs_log_vec *next = lv->lv_next; + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; - ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil)); - lv->lv_next = NULL; + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; - /* - * xfs_cil_prepare_item() may free the lv, so move the item on - * the CIL first. - */ - list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); - xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); - lv = next; + list_move_tail(&lip->li_cil, &cil->xc_cil); } /* account for space used by new iovec headers */ len += diff_iovecs * sizeof(xlog_op_header_t); ctx->nvecs += diff_iovecs; + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) + list_splice_init(&tp->t_busy, &ctx->busy_extents); + /* * Now transfer enough transaction reservation to the context ticket * for the checkpoint. The context ticket is special - the unit @@ -316,10 +345,8 @@ xlog_cil_insert_items( * during the transaction commit. */ if (ctx->ticket->t_curr_res == 0) { - /* first commit in checkpoint, steal the header reservation */ - ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; - ticket->t_curr_res -= ctx->ticket->t_unit_res; + tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res; } /* do we need space for more log record headers? 
*/ @@ -333,10 +360,10 @@ xlog_cil_insert_items( hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); ctx->ticket->t_unit_res += hdrs; ctx->ticket->t_curr_res += hdrs; - ticket->t_curr_res -= hdrs; - ASSERT(ticket->t_curr_res >= len); + tp->t_ticket->t_curr_res -= hdrs; + ASSERT(tp->t_ticket->t_curr_res >= len); } - ticket->t_curr_res -= len; + tp->t_ticket->t_curr_res -= len; ctx->space_used += len; spin_unlock(&cil->xc_cil_lock); @@ -350,7 +377,6 @@ xlog_cil_free_logvec( for (lv = log_vector; lv; ) { struct xfs_log_vec *next = lv->lv_next; - kmem_free(lv->lv_buf); kmem_free(lv); lv = next; } @@ -376,9 +402,9 @@ xlog_cil_committed( xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); - spin_lock(&ctx->cil->xc_cil_lock); + spin_lock(&ctx->cil->xc_push_lock); list_del(&ctx->committing); - spin_unlock(&ctx->cil->xc_cil_lock); + spin_unlock(&ctx->cil->xc_push_lock); xlog_cil_free_logvec(ctx->lv_chain); @@ -433,7 +459,7 @@ xlog_cil_push( down_write(&cil->xc_ctx_lock); ctx = cil->xc_ctx; - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); push_seq = cil->xc_push_seq; ASSERT(push_seq <= ctx->sequence); @@ -444,10 +470,10 @@ xlog_cil_push( */ if (list_empty(&cil->xc_cil)) { cil->xc_push_seq = 0; - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); goto out_skip; } - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); /* check for a previously pushed seqeunce */ @@ -515,9 +541,9 @@ xlog_cil_push( * that higher sequences will wait for us to write out a commit record * before they do. */ - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); list_add(&ctx->committing, &cil->xc_committing); - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); /* @@ -552,7 +578,7 @@ xlog_cil_push( * order the commit records so replay will get them in the right order. */ restart: - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); list_for_each_entry(new_ctx, &cil->xc_committing, committing) { /* * Higher sequences will wait for this one so skip them. @@ -565,11 +591,11 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); goto restart; } } - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); /* xfs_log_done always frees the ticket on error. */ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); @@ -588,10 +614,10 @@ restart: * callbacks to the iclog we can assign the commit LSN to the context * and wake up anyone who is waiting for the commit to complete. */ - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); ctx->commit_lsn = commit_lsn; wake_up_all(&cil->xc_commit_wait); - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); /* release the hounds! */ return xfs_log_release_iclog(log->l_mp, commit_iclog); @@ -644,12 +670,12 @@ xlog_cil_push_background( if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) return; - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); } - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); } @@ -672,14 +698,14 @@ xlog_cil_push_foreground( * If the CIL is empty or we've already pushed the sequence then * there's no work we need to do. 
*/ - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); return; } cil->xc_push_seq = push_seq; - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); /* do the push now */ xlog_cil_push(log); @@ -706,43 +732,25 @@ xfs_log_commit_cil( int flags) { struct xlog *log = mp->m_log; + struct xfs_cil *cil = log->l_cilp; int log_flags = 0; - struct xfs_log_vec *log_vector; if (flags & XFS_TRANS_RELEASE_LOG_RES) log_flags = XFS_LOG_REL_PERM_RESERV; - /* - * Do all the hard work of formatting items (including memory - * allocation) outside the CIL context lock. This prevents stalling CIL - * pushes when we are low on memory and a transaction commit spends a - * lot of time in memory reclaim. - */ - log_vector = xlog_cil_prepare_log_vecs(tp); - if (!log_vector) - return ENOMEM; - /* lock out background commit */ - down_read(&log->l_cilp->xc_ctx_lock); - if (commit_lsn) - *commit_lsn = log->l_cilp->xc_ctx->sequence; + down_read(&cil->xc_ctx_lock); - /* xlog_cil_insert_items() destroys log_vector list */ - xlog_cil_insert_items(log, log_vector, tp->t_ticket); + xlog_cil_insert_items(log, tp); /* check we didn't blow the reservation */ if (tp->t_ticket->t_curr_res < 0) - xlog_print_tic_res(log->l_mp, tp->t_ticket); + xlog_print_tic_res(mp, tp->t_ticket); - /* attach the transaction to the CIL if it has any busy extents */ - if (!list_empty(&tp->t_busy)) { - spin_lock(&log->l_cilp->xc_cil_lock); - list_splice_init(&tp->t_busy, - &log->l_cilp->xc_ctx->busy_extents); - spin_unlock(&log->l_cilp->xc_cil_lock); - } + tp->t_commit_lsn = cil->xc_ctx->sequence; + if (commit_lsn) + *commit_lsn = tp->t_commit_lsn; - tp->t_commit_lsn = *commit_lsn; xfs_log_done(mp, tp->t_ticket, NULL, log_flags); xfs_trans_unreserve_and_mod_sb(tp); @@ -757,11 +765,11 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ - xfs_trans_free_items(tp, *commit_lsn, 0); + xfs_trans_free_items(tp, tp->t_commit_lsn, 0); xlog_cil_push_background(log); - up_read(&log->l_cilp->xc_ctx_lock); + up_read(&cil->xc_ctx_lock); return 0; } @@ -800,7 +808,7 @@ xlog_cil_force_lsn( * on commits for those as well. */ restart: - spin_lock(&cil->xc_cil_lock); + spin_lock(&cil->xc_push_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { if (ctx->sequence > sequence) continue; @@ -809,7 +817,7 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ - xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); goto restart; } if (ctx->sequence != sequence) @@ -817,7 +825,7 @@ restart: /* found it! */ commit_lsn = ctx->commit_lsn; } - spin_unlock(&cil->xc_cil_lock); + spin_unlock(&cil->xc_push_lock); return commit_lsn; } @@ -875,6 +883,7 @@ xlog_cil_init( INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); + spin_lock_init(&cil->xc_push_lock); init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_commit_wait); diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h new file mode 100644 index 000000000000..31e3a06c4644 --- /dev/null +++ b/fs/xfs/xfs_log_format.h @@ -0,0 +1,852 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_FORMAT_H__ +#define __XFS_LOG_FORMAT_H__ + +struct xfs_mount; +struct xfs_trans_res; + +/* + * On-disk Log Format definitions. + * + * This file contains all the on-disk format definitions used within the log. It + * includes the physical log structure itself, as well as all the log item + * format structures that are written into the log and interpreted by log + * recovery. We start with the physical log format definitions, and then work + * through all the log item definitions and everything they encode into the + * log. + */ +typedef __uint32_t xlog_tid_t; + +#define XLOG_MIN_ICLOGS 2 +#define XLOG_MAX_ICLOGS 8 +#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ +#define XLOG_VERSION_1 1 +#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ +#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) +#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ +#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ +#define XLOG_MAX_RECORD_BSIZE (256*1024) +#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ +#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ +#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ +#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ + (log)->l_mp->m_sb.sb_logsunit) +#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) + +#define XLOG_HEADER_SIZE 512 + +/* Minimum number of transactions that must fit in the log (defined by mkfs) */ +#define XFS_MIN_LOG_FACTOR 3 + +#define XLOG_REC_SHIFT(log) \ + BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) +#define XLOG_TOTAL_REC_SHIFT(log) \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ?
\ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) + +/* get lsn fields */ +#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) +#define BLOCK_LSN(lsn) ((uint)(lsn)) + +/* this is used in a spot where we might otherwise double-endian-flip */ +#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) + +static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) +{ + return ((xfs_lsn_t)cycle << 32) | block; +} + +static inline uint xlog_get_cycle(char *ptr) +{ + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + return be32_to_cpu(*((__be32 *)ptr + 1)); + else + return be32_to_cpu(*(__be32 *)ptr); +} + +/* Log Clients */ +#define XFS_TRANSACTION 0x69 +#define XFS_VOLUME 0x2 +#define XFS_LOG 0xaa + +#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ + +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_MAX 20 + +/* + * Flags to log operation header + * + * The first write of a new transaction will be preceded with a start + * record, XLOG_START_TRANS. Once a transaction is committed, a commit + * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into + * the remainder of the current active in-core log, it is split up into + * multiple regions. Each partial region will be marked with a + * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. 
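+ * + * For example (illustrative): a region too large for the space remaining in + * the current iclog is written as several partial regions, each flagged + * XLOG_CONTINUE_TRANS except the last, which carries XLOG_END_TRANS; the + * transaction as a whole is still bracketed by its XLOG_START_TRANS and + * XLOG_COMMIT_TRANS records.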
+ * + */ +#define XLOG_START_TRANS 0x01 /* Start a new transaction */ +#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ +#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ +#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ +#define XLOG_END_TRANS 0x10 /* End a continued transaction */ +#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ + + +typedef struct xlog_op_header { + __be32 oh_tid; /* transaction id of operation : 4 b */ + __be32 oh_len; /* bytes in data region : 4 b */ + __u8 oh_clientid; /* who sent me this : 1 b */ + __u8 oh_flags; /* : 1 b */ + __u16 oh_res2; /* 32 bit align : 2 b */ +} xlog_op_header_t; + +/* valid values for h_fmt */ +#define XLOG_FMT_UNKNOWN 0 +#define XLOG_FMT_LINUX_LE 1 +#define XLOG_FMT_LINUX_BE 2 +#define XLOG_FMT_IRIX_BE 3 + +/* our fmt */ +#ifdef XFS_NATIVE_HOST +#define XLOG_FMT XLOG_FMT_LINUX_BE +#else +#define XLOG_FMT XLOG_FMT_LINUX_LE +#endif + +typedef struct xlog_rec_header { + __be32 h_magicno; /* log record (LR) identifier : 4 */ + __be32 h_cycle; /* write cycle of log : 4 */ + __be32 h_version; /* LR version : 4 */ + __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ + __be64 h_lsn; /* lsn of this LR : 8 */ + __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ + __le32 h_crc; /* crc of log record : 4 */ + __be32 h_prev_block; /* block number to previous LR : 4 */ + __be32 h_num_logops; /* number of log operations in this LR : 4 */ + __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + /* new fields */ + __be32 h_fmt; /* format of log record : 4 */ + uuid_t h_fs_uuid; /* uuid of FS : 16 */ + __be32 h_size; /* iclog size : 4 */ +} xlog_rec_header_t; + +typedef struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log : 4 */ + __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ +} xlog_rec_ext_header_t; + +/* + * Quite misnamed, because this union lays out the actual on-disk log buffer. + */ +typedef union xlog_in_core2 { + xlog_rec_header_t hic_header; + xlog_rec_ext_header_t hic_xheader; + char hic_sector[XLOG_HEADER_SIZE]; +} xlog_in_core_2_t; + +/* not an on-disk structure, but needed by log recovery in userspace */ +typedef struct xfs_log_iovec { + void *i_addr; /* beginning address of region */ + int i_len; /* length in bytes of region */ + uint i_type; /* type of region */ +} xfs_log_iovec_t; + + +/* + * Transaction Header definitions. + * + * This is the structure written in the log at the head of every transaction. It + * identifies the type and id of the transaction, and contains the number of + * items logged by the transaction so we know how many to expect during + * recovery. + * + * Do not change the below structure without redoing the code in + * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). + */ +typedef struct xfs_trans_header { + uint th_magic; /* magic number */ + uint th_type; /* transaction type */ + __int32_t th_tid; /* transaction id (unused) */ + uint th_num_items; /* num items logged by trans */ +} xfs_trans_header_t; + +#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ + +/* + * Log item types. 
+ */ +#define XFS_LI_EFI 0x1236 +#define XFS_LI_EFD 0x1237 +#define XFS_LI_IUNLINK 0x1238 +#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ +#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ +#define XFS_LI_DQUOT 0x123d +#define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_ICREATE 0x123f + +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ + { XFS_LI_ICREATE, "XFS_LI_ICREATE" } + +/* + * Transaction types. Used to distinguish types of buffers. + */ +#define XFS_TRANS_SETATTR_NOT_SIZE 1 +#define XFS_TRANS_SETATTR_SIZE 2 +#define XFS_TRANS_INACTIVE 3 +#define XFS_TRANS_CREATE 4 +#define XFS_TRANS_CREATE_TRUNC 5 +#define XFS_TRANS_TRUNCATE_FILE 6 +#define XFS_TRANS_REMOVE 7 +#define XFS_TRANS_LINK 8 +#define XFS_TRANS_RENAME 9 +#define XFS_TRANS_MKDIR 10 +#define XFS_TRANS_RMDIR 11 +#define XFS_TRANS_SYMLINK 12 +#define XFS_TRANS_SET_DMATTRS 13 +#define XFS_TRANS_GROWFS 14 +#define XFS_TRANS_STRAT_WRITE 15 +#define XFS_TRANS_DIOSTRAT 16 +/* 17 was XFS_TRANS_WRITE_SYNC */ +#define XFS_TRANS_WRITEID 18 +#define XFS_TRANS_ADDAFORK 19 +#define XFS_TRANS_ATTRINVAL 20 +#define XFS_TRANS_ATRUNCATE 21 +#define XFS_TRANS_ATTR_SET 22 +#define XFS_TRANS_ATTR_RM 23 +#define XFS_TRANS_ATTR_FLAG 24 +#define XFS_TRANS_CLEAR_AGI_BUCKET 25 +#define XFS_TRANS_QM_SBCHANGE 26 +/* + * Dummy entries since we use the transaction type to index into the + * trans_type[] in xlog_recover_print_trans_head() + */ +#define XFS_TRANS_DUMMY1 27 +#define XFS_TRANS_DUMMY2 28 +#define XFS_TRANS_QM_QUOTAOFF 29 +#define XFS_TRANS_QM_DQALLOC 30 +#define XFS_TRANS_QM_SETQLIM 31 +#define XFS_TRANS_QM_DQCLUSTER 32 +#define XFS_TRANS_QM_QINOCREATE 33 +#define XFS_TRANS_QM_QUOTAOFF_END 34 +#define XFS_TRANS_SB_UNIT 35 +#define XFS_TRANS_FSYNC_TS 36 +#define XFS_TRANS_GROWFSRT_ALLOC 37 +#define XFS_TRANS_GROWFSRT_ZERO 38 +#define XFS_TRANS_GROWFSRT_FREE 39 +#define XFS_TRANS_SWAPEXT 40 +#define XFS_TRANS_SB_COUNT 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_ICREATE 43 +#define XFS_TRANS_TYPE_MAX 43 +/* new transaction types need to be reflected in xfs_logprint(8) */ + +#define XFS_TRANS_TYPES \ + { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ + { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ + { XFS_TRANS_INACTIVE, "INACTIVE" }, \ + { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ + { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ + { XFS_TRANS_REMOVE, "REMOVE" }, \ + { XFS_TRANS_LINK, "LINK" }, \ + { XFS_TRANS_RENAME, "RENAME" }, \ + { XFS_TRANS_MKDIR, "MKDIR" }, \ + { XFS_TRANS_RMDIR, "RMDIR" }, \ + { XFS_TRANS_SYMLINK, "SYMLINK" }, \ + { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ + { XFS_TRANS_GROWFS, "GROWFS" }, \ + { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ + { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ + { XFS_TRANS_WRITEID, "WRITEID" }, \ + { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ + { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ + { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ + { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ + { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ + { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ + { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ + { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ + { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ + { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ + { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ + { 
XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ + { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ + { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ + { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ + { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ + { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ + { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ + { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ + { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ + { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } + +/* + * This structure is used to track log items associated with + * a transaction. It points to the log item and keeps some + * flags to track the state of the log item. It also tracks + * the amount of space needed to log the item it describes + * once we get to commit processing (see xfs_trans_commit()). + */ +struct xfs_log_item_desc { + struct xfs_log_item *lid_item; + struct list_head lid_trans; + unsigned char lid_flags; +}; + +#define XFS_LID_DIRTY 0x1 + +/* + * Values for t_flags. + */ +#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ +#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ +#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ +#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ +#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ +#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ + +/* + * Values for call flags parameter. + */ +#define XFS_TRANS_RELEASE_LOG_RES 0x4 +#define XFS_TRANS_ABORT 0x8 + +/* + * Field values for xfs_trans_mod_sb. + */ +#define XFS_TRANS_SB_ICOUNT 0x00000001 +#define XFS_TRANS_SB_IFREE 0x00000002 +#define XFS_TRANS_SB_FDBLOCKS 0x00000004 +#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 +#define XFS_TRANS_SB_FREXTENTS 0x00000010 +#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 +#define XFS_TRANS_SB_DBLOCKS 0x00000040 +#define XFS_TRANS_SB_AGCOUNT 0x00000080 +#define XFS_TRANS_SB_IMAXPCT 0x00000100 +#define XFS_TRANS_SB_REXTSIZE 0x00000200 +#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 +#define XFS_TRANS_SB_RBLOCKS 0x00000800 +#define XFS_TRANS_SB_REXTENTS 0x00001000 +#define XFS_TRANS_SB_REXTSLOG 0x00002000 + +/* + * Here we centralize the specification of XFS meta-data buffer + * reference count values. This determines how hard the buffer + * cache tries to hold onto the buffer. + */ +#define XFS_AGF_REF 4 +#define XFS_AGI_REF 4 +#define XFS_AGFL_REF 3 +#define XFS_INO_BTREE_REF 3 +#define XFS_ALLOC_BTREE_REF 2 +#define XFS_BMAP_BTREE_REF 2 +#define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 +#define XFS_ATTR_BTREE_REF 1 +#define XFS_DQUOT_REF 1 + +/* + * Flags for xfs_trans_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ + + +/* + * Inode Log Item Format definitions. + * + * This is the structure used to lay out an inode log item in the + * log. The size of the inline data/extents/b-tree root to be logged + * (if any) is indicated in the ilf_dsize field. Changes to this structure + * must be added on to the end.
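+ * + * Editorial note: the _32 and _64 variants below exist because the mix of + * 16/32/64 bit fields gives this structure different padding on 32 bit and + * 64 bit kernels; recovery converts whichever layout it finds to the native + * one (see xfs_inode_item_format_convert() in the recovery code).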
+ */ +typedef struct xfs_inode_log_format { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_t; + +typedef struct xfs_inode_log_format_32 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} __attribute__((packed)) xfs_inode_log_format_32_t; + +typedef struct xfs_inode_log_format_64 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_64_t; + +/* + * Flags for xfs_trans_log_inode flags field. + */ +#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ +#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ +#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ +#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ +#define XFS_ILOG_DEV 0x010 /* log the dev field */ +#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ +#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ +#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ + + +/* + * The timestamps are dirty, but not necessarily anything else in the inode + * core. Unlike the other fields above, this one must never make it to disk + * in the ilf_fields of the inode_log_format, but is purely stored in memory in + * ili_fields in the inode_log_item. + */ +#define XFS_ILOG_TIMESTAMP 0x4000 + +#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ + XFS_ILOG_UUID | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT) + +#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT) + +#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ + XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ + XFS_ILOG_DEV | XFS_ILOG_UUID | \ + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP) + +static inline int xfs_ilog_fbroot(int w) +{ + return (w == XFS_DATA_FORK ?
XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); +} + +static inline int xfs_ilog_fext(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); +} + +static inline int xfs_ilog_fdata(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); +} + +/* + * Incore version of the on-disk inode core structures. We log this directly + * into the journal in host CPU format (for better or worse) and as such it + * directly mirrors the xfs_dinode structure as it must contain all the same + * information. + */ +typedef struct xfs_ictimestamp { + __int32_t t_sec; /* timestamp seconds */ + __int32_t t_nsec; /* timestamp nanoseconds */ +} xfs_ictimestamp_t; + +/* + * NOTE: This structure must be kept identical to struct xfs_dinode + * in xfs_dinode.h except for the endianness annotations. + */ +typedef struct xfs_icdinode { + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + __uint8_t di_pad[6]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ + xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_icdinode_t; + +static inline uint xfs_icdinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_icdinode); + return offsetof(struct xfs_icdinode, di_next_unlinked); +} + +/* + * Buffer Log Format definitions + * + * These are the physical dirty bitmap definitions for the log format structure. + */ +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 +#define BIT_TO_WORD_SHIFT 5 +#define NBWORD (NBBY * sizeof(unsigned int)) + +/* + * This flag indicates that the buffer contains on disk inodes + * and requires special recovery handling.
+ */ +#define XFS_BLF_INODE_BUF (1<<0) + +/* + * This flag indicates that the buffer should not be replayed + * during recovery because its blocks are being freed. + */ +#define XFS_BLF_CANCEL (1<<1) + +/* + * This flag indicates that the buffer contains on disk + * user or group dquots and may require special recovery handling. + */ +#define XFS_BLF_UDQUOT_BUF (1<<2) +#define XFS_BLF_PDQUOT_BUF (1<<3) +#define XFS_BLF_GDQUOT_BUF (1<<4) + +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + */ +#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* used size of data bitmap in words */ + unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ +} xfs_buf_log_format_t; + +/* + * All buffers now need to tell recovery where the magic number + * is so that it can verify and calculate the CRCs on the buffer correctly + * once the changes have been replayed into the buffer. + * + * The type value is held in the upper 5 bits of the blf_flags field, which is + * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. + */ +#define XFS_BLFT_BITS 5 +#define XFS_BLFT_SHIFT 11 +#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) + +enum xfs_blft { + XFS_BLFT_UNKNOWN_BUF = 0, + XFS_BLFT_UDQUOT_BUF, + XFS_BLFT_PDQUOT_BUF, + XFS_BLFT_GDQUOT_BUF, + XFS_BLFT_BTREE_BUF, + XFS_BLFT_AGF_BUF, + XFS_BLFT_AGFL_BUF, + XFS_BLFT_AGI_BUF, + XFS_BLFT_DINO_BUF, + XFS_BLFT_SYMLINK_BUF, + XFS_BLFT_DIR_BLOCK_BUF, + XFS_BLFT_DIR_DATA_BUF, + XFS_BLFT_DIR_FREE_BUF, + XFS_BLFT_DIR_LEAF1_BUF, + XFS_BLFT_DIR_LEAFN_BUF, + XFS_BLFT_DA_NODE_BUF, + XFS_BLFT_ATTR_LEAF_BUF, + XFS_BLFT_ATTR_RMT_BUF, + XFS_BLFT_SB_BUF, + XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), +}; + +static inline void +xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) +{ + ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); + blf->blf_flags &= ~XFS_BLFT_MASK; + blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); +} + +static inline __uint16_t +xfs_blft_from_flags(struct xfs_buf_log_format *blf) +{ + return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; +} + +/* + * EFI/EFD log format definitions + */ +typedef struct xfs_extent { + xfs_dfsbno_t ext_start; + xfs_extlen_t ext_len; +} xfs_extent_t; + +/* + * Since an xfs_extent_t has types (start:64, len: 32) + * there are different alignments on 32 bit and 64 bit kernels. + * So we provide the different variants for use by a + * conversion routine. + */ +typedef struct xfs_extent_32 { + __uint64_t ext_start; + __uint32_t ext_len; +} __attribute__((packed)) xfs_extent_32_t; + +typedef struct xfs_extent_64 { + __uint64_t ext_start; + __uint32_t ext_len; + __uint32_t ext_pad; +} xfs_extent_64_t; + +/* + * This is the structure used to lay out an efi log item in the + * log. The efi_extents field is a variable size array whose + * size is given by efi_nextents. 
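+ * + * Illustrative sizing note: because efi_extents is declared with a single + * element, an EFI carrying N extents is logged as + * sizeof(xfs_efi_log_format_t) + (N - 1) * sizeof(xfs_extent_t) bytes.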
+ */ +typedef struct xfs_efi_log_format { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_t; + +typedef struct xfs_efi_log_format_32 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_32_t efi_extents[1]; /* array of extents to free */ +} __attribute__((packed)) xfs_efi_log_format_32_t; + +typedef struct xfs_efi_log_format_64 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_64_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_64_t; + +/* + * This is the structure used to lay out an efd log item in the + * log. The efd_extents array is a variable size array whose + * size is given by efd_nextents; + */ +typedef struct xfs_efd_log_format { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_t; + +typedef struct xfs_efd_log_format_32 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_32_t efd_extents[1]; /* array of extents freed */ +} __attribute__((packed)) xfs_efd_log_format_32_t; + +typedef struct xfs_efd_log_format_64 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_64_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_64_t; + +/* + * Dquot Log format definitions. + * + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + */ +typedef struct xfs_dq_logformat { + __uint16_t qlf_type; /* dquot log item type */ + __uint16_t qlf_size; /* size of this item */ + xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ + __int64_t qlf_blkno; /* blkno of dquot buffer */ + __int32_t qlf_len; /* len of dquot buffer */ + __uint32_t qlf_boffset; /* off of dquot in buffer */ +} xfs_dq_logformat_t; + +/* + * log format struct for QUOTAOFF records. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer + * to the first and ensures that the first logitem is taken out of the AIL + * only when the last one is securely committed. + */ +typedef struct xfs_qoff_logformat { + unsigned short qf_type; /* quotaoff log item type */ + unsigned short qf_size; /* size of this item */ + unsigned int qf_flags; /* USR and/or GRP */ + char qf_pad[12]; /* padding for future */ +} xfs_qoff_logformat_t; + + +/* + * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. 
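+ * + * As a reading aid (editorial note): the *_ACCT flags mean accounting is on, + * the *_ENFD flags mean limits are enforced, and the *_CHKD flags mean + * quotacheck has been run for that quota type.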
+ */ +#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ +#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ +#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ +#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ +#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ +#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ +#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ + +/* + * Conversion to and from the combined OQUOTA flag (if necessary) + * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk() + */ +#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */ +#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */ +#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */ +#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */ + +#define XFS_ALL_QUOTA_ACCT \ + (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) +#define XFS_ALL_QUOTA_ENFD \ + (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD \ + (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\ + XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\ + XFS_PQUOTA_CHKD) + +/* + * Inode create log item structure + * + * Log recovery assumes the first two entries are the type and size and they fit + * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so + * decoding can be done correctly. + */ +struct xfs_icreate_log { + __uint16_t icl_type; /* type of log format structure */ + __uint16_t icl_size; /* size of log format structure */ + __be32 icl_ag; /* ag being allocated in */ + __be32 icl_agbno; /* start block of inode range */ + __be32 icl_count; /* number of inodes to initialise */ + __be32 icl_isize; /* size of inodes */ + __be32 icl_length; /* length of extent to initialise */ + __be32 icl_gen; /* inode generation number to use */ +}; + +int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); +int xfs_log_calc_minimum_size(struct xfs_mount *); + + +#endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b9ea262dd1c2..136654b9400d 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -24,51 +24,13 @@ struct xlog_ticket; struct xfs_mount; /* - * Macros, structures, prototypes for internal log manager use. + * Flags for log structure */ - -#define XLOG_MIN_ICLOGS 2 -#define XLOG_MAX_ICLOGS 8 -#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ -#define XLOG_VERSION_1 1 -#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ -#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) -#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ -#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ -#define XLOG_MAX_RECORD_BSIZE (256*1024) -#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ -#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ -#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ -#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ -#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ - (log)->l_mp->m_sb.sb_logsunit) -#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) - -#define XLOG_HEADER_SIZE 512 - -#define XLOG_REC_SHIFT(log) \ - BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 
\ - XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) -#define XLOG_TOTAL_REC_SHIFT(log) \ - BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ - XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) - -static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) -{ - return ((xfs_lsn_t)cycle << 32) | block; -} - -static inline uint xlog_get_cycle(char *ptr) -{ - if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) - return be32_to_cpu(*((__be32 *)ptr + 1)); - else - return be32_to_cpu(*(__be32 *)ptr); -} - -#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) - -#ifdef __KERNEL__ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ /* * get client id from packed copy. @@ -101,28 +63,8 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ #define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ #define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ -#endif /* __KERNEL__ */ /* - * Flags to log operation header - * - * The first write of a new transaction will be preceded with a start - * record, XLOG_START_TRANS. Once a transaction is committed, a commit - * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into - * the remainder of the current active in-core log, it is split up into - * multiple regions. Each partial region will be marked with a - * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. - * - */ -#define XLOG_START_TRANS 0x01 /* Start a new transaction */ -#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ -#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ -#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ -#define XLOG_END_TRANS 0x10 /* End a continued transaction */ -#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ - -#ifdef __KERNEL__ -/* * Flags to log ticket */ #define XLOG_TIC_INITED 0x1 /* has been initialized */ @@ -132,22 +74,6 @@ static inline uint xlog_get_client_id(__be32 i) { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } -#endif /* __KERNEL__ */ - -#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ - -/* - * Flags for log structure - */ -#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ -#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ -#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being - shutdown */ -#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ - -typedef __uint32_t xlog_tid_t; - -#ifdef __KERNEL__ /* * Below are states for covering allocation transactions. 
* By covering, we mean changing the h_tail_lsn in the last on-disk @@ -223,7 +149,6 @@ typedef __uint32_t xlog_tid_t; #define XLOG_COVER_OPS 5 - /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15 @@ -258,64 +183,6 @@ typedef struct xlog_ticket { xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ } xlog_ticket_t; -#endif - - -typedef struct xlog_op_header { - __be32 oh_tid; /* transaction id of operation : 4 b */ - __be32 oh_len; /* bytes in data region : 4 b */ - __u8 oh_clientid; /* who sent me this : 1 b */ - __u8 oh_flags; /* : 1 b */ - __u16 oh_res2; /* 32 bit align : 2 b */ -} xlog_op_header_t; - - -/* valid values for h_fmt */ -#define XLOG_FMT_UNKNOWN 0 -#define XLOG_FMT_LINUX_LE 1 -#define XLOG_FMT_LINUX_BE 2 -#define XLOG_FMT_IRIX_BE 3 - -/* our fmt */ -#ifdef XFS_NATIVE_HOST -#define XLOG_FMT XLOG_FMT_LINUX_BE -#else -#define XLOG_FMT XLOG_FMT_LINUX_LE -#endif - -typedef struct xlog_rec_header { - __be32 h_magicno; /* log record (LR) identifier : 4 */ - __be32 h_cycle; /* write cycle of log : 4 */ - __be32 h_version; /* LR version : 4 */ - __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ - __be64 h_lsn; /* lsn of this LR : 8 */ - __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - __le32 h_crc; /* crc of log record : 4 */ - __be32 h_prev_block; /* block number to previous LR : 4 */ - __be32 h_num_logops; /* number of log operations in this LR : 4 */ - __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; - /* new fields */ - __be32 h_fmt; /* format of log record : 4 */ - uuid_t h_fs_uuid; /* uuid of FS : 16 */ - __be32 h_size; /* iclog size : 4 */ -} xlog_rec_header_t; - -typedef struct xlog_rec_ext_header { - __be32 xh_cycle; /* write cycle of log : 4 */ - __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ -} xlog_rec_ext_header_t; - -#ifdef __KERNEL__ - -/* - * Quite misnamed, because this union lays out the actual on-disk log buffer. - */ -typedef union xlog_in_core2 { - xlog_rec_header_t hic_header; - xlog_rec_ext_header_t hic_xheader; - char hic_sector[XLOG_HEADER_SIZE]; -} xlog_in_core_2_t; - /* * - A log record header is 512 bytes. There is plenty of room to grow the * xlog_rec_header_t into the reserved space. @@ -411,14 +278,17 @@ struct xfs_cil { struct xlog *xc_log; struct list_head xc_cil; spinlock_t xc_cil_lock; + + struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; struct xfs_cil_ctx *xc_ctx; - struct rw_semaphore xc_ctx_lock; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; + xfs_lsn_t xc_push_seq; struct list_head xc_committing; wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; struct work_struct xc_push_work; - xfs_lsn_t xc_push_seq; -}; +} ____cacheline_aligned_in_smp; /* * The amount of log space we allow the CIL to aggregate is difficult to size. 
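/*
 * Editorial sketch, not part of the patch: the ____cacheline_aligned_in_smp
 * annotations in struct xfs_cil above place each lock (and the fields it
 * protects) on its own cacheline, so commit traffic on xc_cil_lock does not
 * false-share with push/wait traffic on xc_push_lock. A minimal userspace
 * analogue, assuming a 64-byte cacheline:
 */
#include <pthread.h>

#define CACHELINE 64

struct cil_like {
	/* hot on every transaction commit */
	pthread_spinlock_t cil_lock __attribute__((aligned(CACHELINE)));
	/* taken shared by committers, exclusive to swap the context */
	pthread_rwlock_t ctx_lock __attribute__((aligned(CACHELINE)));
	/* only touched by the push, force and commit-wait paths */
	pthread_spinlock_t push_lock __attribute__((aligned(CACHELINE)));
};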
@@ -686,6 +556,5 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) schedule(); remove_wait_queue(wq, &wait); } -#endif /* __KERNEL__ */ #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 7681b19aa5dc..7c0c1fdc728b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -17,7 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" @@ -41,7 +41,6 @@ #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" #include "xfs_quota.h" -#include "xfs_utils.h" #include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -51,10 +50,12 @@ #include "xfs_symlink.h" #include "xfs_da_btree.h" #include "xfs_dir2_format.h" -#include "xfs_dir2_priv.h" +#include "xfs_dir2.h" #include "xfs_attr_leaf.h" #include "xfs_attr_remote.h" +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) + STATIC int xlog_find_zeroed( struct xlog *, @@ -607,7 +608,7 @@ out: /* * Head is defined to be the point of the log where the next log write - * write could go. This means that incomplete LR writes at the end are + * could go. This means that incomplete LR writes at the end are * eliminated when calculating the head. We aren't guaranteed that previous * LR have complete transactions. We only know that a cycle number of * current cycle number -1 won't be present in the log if we start writing @@ -963,6 +964,7 @@ xlog_find_tail( } if (!found) { xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + xlog_put_bp(bp); ASSERT(0); return XFS_ERROR(EIO); } @@ -1144,7 +1146,8 @@ xlog_find_zeroed( */ xfs_warn(log->l_mp, "Log inconsistent or not a log (last==0, first!=1)"); - return XFS_ERROR(EINVAL); + error = XFS_ERROR(EINVAL); + goto bp_err; } /* we have a partially zeroed log */ @@ -1766,19 +1769,11 @@ xlog_recover_buffer_pass1( /* * Check to see whether the buffer being recovered has a corresponding - * entry in the buffer cancel record table. If it does then return 1 - * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement - * the refcount on the entry in the table and remove it from the table - * if this is the last reference. - * - * We remove the cancel record from the table when we encounter its - * last occurrence in the log so that if the same buffer is re-used - * again after its last cancellation we actually replay the changes - * made at that point. + * entry in the buffer cancel record table. If it is, return the cancel + * buffer structure to the caller. */ -STATIC int -xlog_check_buffer_cancelled( +STATIC struct xfs_buf_cancel * +xlog_peek_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, @@ -1787,22 +1782,16 @@ xlog_check_buffer_cancelled( struct list_head *bucket; struct xfs_buf_cancel *bcp; - if (log->l_buf_cancel_table == NULL) { - /* - * There is nothing in the table built in pass one, - * so this buffer must not be cancelled. - */ + if (!log->l_buf_cancel_table) { + /* empty table means no cancelled buffers in the log */ ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; + return NULL; } - /* - * Search for an entry in the cancel table that matches our buffer. 
- */ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); list_for_each_entry(bcp, bucket, bc_list) { if (bcp->bc_blkno == blkno && bcp->bc_len == len) - goto found; + return bcp; } /* @@ -1810,9 +1799,32 @@ xlog_check_buffer_cancelled( * that the buffer is NOT cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; + return NULL; +} + +/* + * If the buffer is being cancelled then return 1 so that it will be cancelled, + * otherwise return 0. If the buffer is actually a buffer cancel item + * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the + * table and remove it from the table if this is the last reference. + * + * We remove the cancel record from the table when we encounter its last + * occurrence in the log so that if the same buffer is re-used again after its + * last cancellation we actually replay the changes made at that point. + */ +STATIC int +xlog_check_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + struct xfs_buf_cancel *bcp; + + bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); + if (!bcp) + return 0; -found: /* * We've got a match, so return 1 so that the recovery of this buffer * is cancelled. If this buffer is actually a buffer cancel log @@ -1947,6 +1959,104 @@ xlog_recover_do_inode_buffer( } /* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * we extract the LSN of the existing object in the buffer based on its current + * magic number. If we don't recognise the magic number in the buffer, then + * return an LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer.
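+ * + * Illustrative case: a block last logged as part of a btree, then freed and + * reallocated as a directory data block before the crash, may now carry + * XFS_DIR3_DATA_MAGIC; its embedded LSN still answers the only question that + * matters here - whether the on-disk object is newer than the change being + * replayed.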
+ */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + void *blk = bp->b_addr; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + return be64_to_cpu( + ((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn); + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + return be64_to_cpu( + ((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn); + case XFS_AGF_MAGIC: + return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + case XFS_AGFL_MAGIC: + return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + case XFS_AGI_MAGIC: + return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + case XFS_SYMLINK_MAGIC: + return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + case XFS_ATTR3_RMT_MAGIC: + return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + case XFS_SB_MAGIC: + return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn); + default: + break; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + default: + break; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A noted complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. + */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: @@ -1955,7 +2065,7 @@ xlog_recover_do_inode_buffer( * inside a struct xfs_da_blkinfo at the start of the buffer. */ static void -xlog_recovery_validate_buf_type( +xlog_recover_validate_buf_type( struct xfs_mount *mp, struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) @@ -2234,7 +2344,7 @@ xlog_recover_do_reg_buffer( * just avoid the verification stage for non-crc filesystems */ if (xfs_sb_version_hascrc(&mp->m_sb)) - xlog_recovery_validate_buf_type(mp, bp, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f); } /* @@ -2366,7 +2476,7 @@ xfs_qm_dqcheck( /* * Perform a dquot buffer recovery. - * Simple algorithm: if we have found a QUOTAOFF logitem of the same type + * Simple algorithm: if we have found a QUOTAOFF log item of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. * Else, treat it as a regular buffer and do recovery.
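* (Editorial note: this works because QUOTAOFF log items are processed in recovery pass 1, so the quotaoff state is already known when dquot buffers are replayed in pass 2 - see the l_quotaoffs_flag checks below.)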
*/ @@ -2425,20 +2535,22 @@ xlog_recover_do_dquot_buffer( * over the log during recovery. During the first we build a table of * those buffers which have been cancelled, and during the second we * only replay those buffers which do not have corresponding cancel - * records in the table. See xlog_recover_do_buffer_pass[1,2] above + * records in the table. See xlog_recover_buffer_pass[1,2] above * for more details on the implementation of the table of cancel records. */ STATIC int xlog_recover_buffer_pass2( struct xlog *log, struct list_head *buffer_list, - struct xlog_recover_item *item) + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; uint buf_flags; + xfs_lsn_t lsn; /* * In this pass we only want to recover all the buffers which have @@ -2463,10 +2575,17 @@ xlog_recover_buffer_pass2( error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); - xfs_buf_relse(bp); - return error; + goto out_release; } + /* + * recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) + goto out_release; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (buf_f->blf_flags & @@ -2476,7 +2595,7 @@ xlog_recover_buffer_pass2( xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) - return XFS_ERROR(error); + goto out_release; /* * Perform delayed write on the buffer. Asynchronous writes will be @@ -2505,6 +2624,7 @@ xlog_recover_buffer_pass2( xfs_buf_delwri_queue(bp, buffer_list); } +out_release: xfs_buf_relse(bp); return error; } @@ -2513,7 +2633,8 @@ STATIC int xlog_recover_inode_pass2( struct xlog *log, struct list_head *buffer_list, - struct xlog_recover_item *item) + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_inode_log_format_t *in_f; xfs_mount_t *mp = log->l_mp; @@ -2593,6 +2714,20 @@ xlog_recover_inode_pass2( } /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } + } + + /* * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes * are transactional and if ordering is necessary we can determine that * more accurately by the LSN field in the V3 inode core. Don't trust @@ -2781,6 +2916,8 @@ write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); + +out_release: xfs_buf_relse(bp); error: if (need_free) @@ -2822,7 +2959,8 @@ STATIC int xlog_recover_dquot_pass2( struct xlog *log, struct list_head *buffer_list, - struct xlog_recover_item *item) + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; @@ -2896,6 +3034,19 @@ xlog_recover_dquot_pass2( return XFS_ERROR(EIO); } + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. 
+ */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + memcpy(ddq, recddq, item->ri_buf[1].i_len); if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), @@ -2906,9 +3057,10 @@ xlog_recover_dquot_pass2( ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); - xfs_buf_relse(bp); - return (0); +out_release: + xfs_buf_relse(bp); + return 0; } /* @@ -3116,6 +3268,106 @@ xlog_recover_free_trans( kmem_free(trans); } +STATIC void +xlog_recover_buffer_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + + if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + return; + } + + xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, + buf_f->blf_len, NULL); +} + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_inode_log_format ilf_buf; + struct xfs_inode_log_format *ilfp; + struct xfs_mount *mp = log->l_mp; + int error; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + ilfp = item->ri_buf[0].i_addr; + } else { + ilfp = &ilf_buf; + memset(ilfp, 0, sizeof(*ilfp)); + error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); + if (error) + return; + } + + if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, + ilfp->ilf_len, &xfs_inode_buf_ra_ops); +} + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); +} + +STATIC void +xlog_recover_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + xlog_recover_buffer_ra_pass2(log, item); + break; + case XFS_LI_INODE: + xlog_recover_inode_ra_pass2(log, item); + break; + case XFS_LI_DQUOT: + xlog_recover_dquot_ra_pass2(log, item); + break; + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_QUOTAOFF: + default: + break; + } +} + STATIC int xlog_recover_commit_pass1( struct xlog *log, @@ -3155,15 +3407,18 @@ xlog_recover_commit_pass2( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, buffer_list, item); + return xlog_recover_buffer_pass2(log, buffer_list, item, + trans->r_lsn); case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, buffer_list, item); + return xlog_recover_inode_pass2(log, buffer_list, item, + trans->r_lsn); case XFS_LI_EFI: return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, buffer_list, 
item); + return xlog_recover_dquot_pass2(log, buffer_list, item, + trans->r_lsn); case XFS_LI_ICREATE: return xlog_recover_do_icreate_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: @@ -3177,6 +3432,26 @@ xlog_recover_commit_pass2( } } +STATIC int +xlog_recover_items_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct list_head *item_list) +{ + struct xlog_recover_item *item; + int error = 0; + + list_for_each_entry(item, item_list, ri_list) { + error = xlog_recover_commit_pass2(log, trans, + buffer_list, item); + if (error) + return error; + } + + return error; +} + /* * Perform the transaction. * @@ -3189,9 +3464,16 @@ xlog_recover_commit_trans( struct xlog_recover *trans, int pass) { - int error = 0, error2; - xlog_recover_item_t *item; - LIST_HEAD (buffer_list); + int error = 0; + int error2; + int items_queued = 0; + struct xlog_recover_item *item; + struct xlog_recover_item *next; + LIST_HEAD (buffer_list); + LIST_HEAD (ra_list); + LIST_HEAD (done_list); + + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 hlist_del(&trans->r_list); @@ -3199,14 +3481,22 @@ xlog_recover_commit_trans( if (error) return error; - list_for_each_entry(item, &trans->r_itemq, ri_list) { + list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { switch (pass) { case XLOG_RECOVER_PASS1: error = xlog_recover_commit_pass1(log, trans, item); break; case XLOG_RECOVER_PASS2: - error = xlog_recover_commit_pass2(log, trans, - &buffer_list, item); + xlog_recover_ra_pass2(log, item); + list_move_tail(&item->ri_list, &ra_list); + items_queued++; + if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + items_queued = 0; + } + break; default: ASSERT(0); @@ -3216,9 +3506,19 @@ xlog_recover_commit_trans( goto out; } +out: + if (!list_empty(&ra_list)) { + if (!error) + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + } + + if (!list_empty(&done_list)) + list_splice_init(&done_list, &trans->r_itemq); + xlog_recover_free_trans(trans); -out: error2 = xfs_buf_delwri_submit(&buffer_list); return error ? error : error2; } @@ -3376,7 +3676,7 @@ xlog_recover_process_efi( } tp = xfs_trans_alloc(mp, 0); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto abort_error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); @@ -3482,8 +3782,7 @@ xlog_recover_clear_agi_bucket( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), - 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); if (error) goto out_abort; diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c new file mode 100644 index 000000000000..bbcec0bbc12d --- /dev/null +++ b/fs/xfs/xfs_log_rlimit.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2013 Jie Liu. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_da_btree.h" +#include "xfs_attr_leaf.h" + +/* + * Calculate the maximum length in bytes that would be required for a local + * attribute value as large attributes out of line are not logged. + */ +STATIC int +xfs_log_calc_max_attrsetm_res( + struct xfs_mount *mp) +{ + int size; + int nblks; + + size = xfs_attr_leaf_entsize_local_max(mp->m_sb.sb_blocksize) - + MAXNAMELEN - 1; + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + nblks += XFS_B_TO_FSB(mp, size); + nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); + + return M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * nblks; +} + +/* + * Iterate over the log space reservation table to figure out and return + * the maximum one in terms of the pre-calculated values which were done + * at mount time. + */ +STATIC void +xfs_log_get_max_trans_res( + struct xfs_mount *mp, + struct xfs_trans_res *max_resp) +{ + struct xfs_trans_res *resp; + struct xfs_trans_res *end_resp; + int log_space = 0; + int attr_space; + + attr_space = xfs_log_calc_max_attrsetm_res(mp); + + resp = (struct xfs_trans_res *)M_RES(mp); + end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); + for (; resp < end_resp; resp++) { + int tmp = resp->tr_logcount > 1 ? + resp->tr_logres * resp->tr_logcount : + resp->tr_logres; + if (log_space < tmp) { + log_space = tmp; + *max_resp = *resp; /* struct copy */ + } + } + + if (attr_space > log_space) { + *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + max_resp->tr_logres = attr_space; + } +} + +/* + * Calculate the minimum valid log size for the given superblock configuration. + * Used to calculate the minimum log size at mkfs time, and to determine if + * the log is large enough or not at mount time. Returns the minimum size in + * filesystem block size units. + */ +int +xfs_log_calc_minimum_size( + struct xfs_mount *mp) +{ + struct xfs_trans_res tres = {0}; + int max_logres; + int min_logblks = 0; + int lsunit = 0; + + xfs_log_get_max_trans_res(mp, &tres); + + max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres); + if (tres.tr_logcount > 1) + max_logres *= tres.tr_logcount; + + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + lsunit = BTOBB(mp->m_sb.sb_logsunit); + + /* + * Two factors should be taken into account for calculating the minimum + * log space. + * 1) The fundamental limitation is that no single transaction can be + * larger than half size of the log. + * + * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR + * define, which is set to 3. That means we can definitely fit + * maximally sized 2 transactions in the log. We'll use this same + * value here. + * + * 2) If the lsunit option is specified, a transaction requires 2 LSU + * for the reservation because there are two log writes that can + * require padding - the transaction data and the commit record which + * are written separately and both can require padding to the LSU. 
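xfs_log_get_max_trans_res() above walks M_RES(mp) as if it were a flat array of struct xfs_trans_res and keeps the entry with the largest total footprint, where permanent reservations count as tr_logres * tr_logcount. The same scan over a plain array, with a simplified two-field res type standing in for the real structure:

struct res {
	int	logres;		/* space for a single transaction */
	int	logcount;	/* >1 for permanent reservations */
};

static struct res
max_trans_res(const struct res *tbl, int n)
{
	struct res	best = { 0, 0 };
	int		space = 0;
	int		i;

	for (i = 0; i < n; i++) {
		int	total = tbl[i].logcount > 1 ?
				tbl[i].logres * tbl[i].logcount :
				tbl[i].logres;

		if (total > space) {
			space = total;
			best = tbl[i];	/* struct copy, as above */
		}
	}
	return best;
}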
+ * Consider that we can have an active CIL reservation holding 2*LSU, + * but the CIL is not over a push threshold, in this case, if we + * don't have enough log space for at one new transaction, which + * includes another 2*LSU in the reservation, we will run into dead + * loop situation in log space grant procedure. i.e. + * xlog_grant_head_wait(). + * + * Hence the log size needs to be able to contain two maximally sized + * and padded transactions, which is (2 * (2 * LSU + maxlres)). + * + * Also, the log size should be a multiple of the log stripe unit, round + * it up to lsunit boundary if lsunit is specified. + */ + if (lsunit) { + min_logblks = roundup_64(BTOBB(max_logres), lsunit) + + 2 * lsunit; + } else + min_logblks = BTOBB(max_logres) + 2 * BBSIZE; + min_logblks *= XFS_MIN_LOG_FACTOR; + + return XFS_BB_TO_FSB(mp, min_logblks); +} diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2b0ba3581656..5dcc68019d1b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -17,7 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" @@ -25,8 +25,10 @@ #include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" @@ -40,7 +42,6 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" -#include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_cksum.h" @@ -59,69 +60,6 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); #define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) #endif -static const struct { - short offset; - short type; /* 0 = integer - * 1 = binary / string (no translation) - */ -} xfs_sb_info[] = { - { offsetof(xfs_sb_t, sb_magicnum), 0 }, - { offsetof(xfs_sb_t, sb_blocksize), 0 }, - { offsetof(xfs_sb_t, sb_dblocks), 0 }, - { offsetof(xfs_sb_t, sb_rblocks), 0 }, - { offsetof(xfs_sb_t, sb_rextents), 0 }, - { offsetof(xfs_sb_t, sb_uuid), 1 }, - { offsetof(xfs_sb_t, sb_logstart), 0 }, - { offsetof(xfs_sb_t, sb_rootino), 0 }, - { offsetof(xfs_sb_t, sb_rbmino), 0 }, - { offsetof(xfs_sb_t, sb_rsumino), 0 }, - { offsetof(xfs_sb_t, sb_rextsize), 0 }, - { offsetof(xfs_sb_t, sb_agblocks), 0 }, - { offsetof(xfs_sb_t, sb_agcount), 0 }, - { offsetof(xfs_sb_t, sb_rbmblocks), 0 }, - { offsetof(xfs_sb_t, sb_logblocks), 0 }, - { offsetof(xfs_sb_t, sb_versionnum), 0 }, - { offsetof(xfs_sb_t, sb_sectsize), 0 }, - { offsetof(xfs_sb_t, sb_inodesize), 0 }, - { offsetof(xfs_sb_t, sb_inopblock), 0 }, - { offsetof(xfs_sb_t, sb_fname[0]), 1 }, - { offsetof(xfs_sb_t, sb_blocklog), 0 }, - { offsetof(xfs_sb_t, sb_sectlog), 0 }, - { offsetof(xfs_sb_t, sb_inodelog), 0 }, - { offsetof(xfs_sb_t, sb_inopblog), 0 }, - { offsetof(xfs_sb_t, sb_agblklog), 0 }, - { offsetof(xfs_sb_t, sb_rextslog), 0 }, - { offsetof(xfs_sb_t, sb_inprogress), 0 }, - { offsetof(xfs_sb_t, sb_imax_pct), 0 }, - { offsetof(xfs_sb_t, sb_icount), 0 }, - { offsetof(xfs_sb_t, sb_ifree), 0 }, - { offsetof(xfs_sb_t, sb_fdblocks), 0 }, - { offsetof(xfs_sb_t, sb_frextents), 0 }, - { offsetof(xfs_sb_t, sb_uquotino), 0 }, - { offsetof(xfs_sb_t, sb_gquotino), 0 }, - { offsetof(xfs_sb_t, sb_qflags), 0 }, - { offsetof(xfs_sb_t, sb_flags), 0 }, - { offsetof(xfs_sb_t, sb_shared_vn), 0 }, - { offsetof(xfs_sb_t, sb_inoalignmt), 0 }, - { 
offsetof(xfs_sb_t, sb_unit), 0 }, - { offsetof(xfs_sb_t, sb_width), 0 }, - { offsetof(xfs_sb_t, sb_dirblklog), 0 }, - { offsetof(xfs_sb_t, sb_logsectlog), 0 }, - { offsetof(xfs_sb_t, sb_logsectsize),0 }, - { offsetof(xfs_sb_t, sb_logsunit), 0 }, - { offsetof(xfs_sb_t, sb_features2), 0 }, - { offsetof(xfs_sb_t, sb_bad_features2), 0 }, - { offsetof(xfs_sb_t, sb_features_compat), 0 }, - { offsetof(xfs_sb_t, sb_features_ro_compat), 0 }, - { offsetof(xfs_sb_t, sb_features_incompat), 0 }, - { offsetof(xfs_sb_t, sb_features_log_incompat), 0 }, - { offsetof(xfs_sb_t, sb_crc), 0 }, - { offsetof(xfs_sb_t, sb_pad), 0 }, - { offsetof(xfs_sb_t, sb_pquotino), 0 }, - { offsetof(xfs_sb_t, sb_lsn), 0 }, - { sizeof(xfs_sb_t), 0 } -}; - static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; @@ -197,64 +135,6 @@ xfs_uuid_unmount( } -/* - * Reference counting access wrappers to the perag structures. - * Because we never free per-ag structures, the only thing we - * have to protect against changes is the tree structure itself. - */ -struct xfs_perag * -xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ref = 0; - - rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - ASSERT(atomic_read(&pag->pag_ref) >= 0); - ref = atomic_inc_return(&pag->pag_ref); - } - rcu_read_unlock(); - trace_xfs_perag_get(mp, agno, ref, _RET_IP_); - return pag; -} - -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_get_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - int tag) -{ - struct xfs_perag *pag; - int found; - int ref; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - ref = atomic_inc_return(&pag->pag_ref); - rcu_read_unlock(); - trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); - return pag; -} - -void -xfs_perag_put(struct xfs_perag *pag) -{ - int ref; - - ASSERT(atomic_read(&pag->pag_ref) > 0); - ref = atomic_dec_return(&pag->pag_ref); - trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); -} - STATIC void __xfs_free_perag( struct rcu_head *head) @@ -307,184 +187,6 @@ xfs_sb_validate_fsb_count( return 0; } -/* - * Check the validity of the SB found. - */ -STATIC int -xfs_mount_validate_sb( - xfs_mount_t *mp, - xfs_sb_t *sbp, - bool check_inprogress, - bool check_version) -{ - - /* - * If the log device and data device have the - * same device number, the log is internal. - * Consequently, the sb_logstart should be non-zero. If - * we have a zero sb_logstart in this case, we may be trying to mount - * a volume filesystem in a non-volume manner. - */ - if (sbp->sb_magicnum != XFS_SB_MAGIC) { - xfs_warn(mp, "bad magic number"); - return XFS_ERROR(EWRONGFS); - } - - - if (!xfs_sb_good_version(sbp)) { - xfs_warn(mp, "bad version"); - return XFS_ERROR(EWRONGFS); - } - - if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) && - (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | - XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) { - xfs_notice(mp, -"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n"); - return XFS_ERROR(EFSCORRUPTED); - } - - /* - * Version 5 superblock feature mask validation. Reject combinations the - * kernel cannot support up front before checking anything else. For - * write validation, we don't need to check feature masks. 
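The version-5 feature-mask checks being removed from xfs_mount.c here encode a three-tier policy: unknown compat bits are harmless, unknown ro-compat bits are only tolerable on a read-only mount, and unknown incompat bits refuse the mount outright. A hedged sketch of just that decision, with made-up parameter names:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

static int
check_unknown_features(uint32_t compat, uint32_t ro_compat,
		       uint32_t incompat, bool mounting_readonly)
{
	(void)compat;		/* fully safe either way: warn at most */
	if (ro_compat && !mounting_readonly)
		return -EINVAL;	/* writing could corrupt features we don't know */
	if (incompat)
		return -EINVAL;	/* even read-only access is unsafe */
	return 0;
}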
- */ - if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { - xfs_alert(mp, -"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" -"Use of these features in this kernel is at your own risk!"); - - if (xfs_sb_has_compat_feature(sbp, - XFS_SB_FEAT_COMPAT_UNKNOWN)) { - xfs_warn(mp, -"Superblock has unknown compatible features (0x%x) enabled.\n" -"Using a more recent kernel is recommended.", - (sbp->sb_features_compat & - XFS_SB_FEAT_COMPAT_UNKNOWN)); - } - - if (xfs_sb_has_ro_compat_feature(sbp, - XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { - xfs_alert(mp, -"Superblock has unknown read-only compatible features (0x%x) enabled.", - (sbp->sb_features_ro_compat & - XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_warn(mp, -"Attempted to mount read-only compatible filesystem read-write.\n" -"Filesystem can only be safely mounted read only."); - return XFS_ERROR(EINVAL); - } - } - if (xfs_sb_has_incompat_feature(sbp, - XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { - xfs_warn(mp, -"Superblock has unknown incompatible features (0x%x) enabled.\n" -"Filesystem can not be safely mounted by this kernel.", - (sbp->sb_features_incompat & - XFS_SB_FEAT_INCOMPAT_UNKNOWN)); - return XFS_ERROR(EINVAL); - } - } - - if (unlikely( - sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - xfs_warn(mp, - "filesystem is marked as having an external log; " - "specify logdev on the mount command line."); - return XFS_ERROR(EINVAL); - } - - if (unlikely( - sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - xfs_warn(mp, - "filesystem is marked as having an internal log; " - "do not specify logdev on the mount command line."); - return XFS_ERROR(EINVAL); - } - - /* - * More sanity checking. Most of these were stolen directly from - * xfs_repair. - */ - if (unlikely( - sbp->sb_agcount <= 0 || - sbp->sb_sectsize < XFS_MIN_SECTORSIZE || - sbp->sb_sectsize > XFS_MAX_SECTORSIZE || - sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || - sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || - sbp->sb_sectsize != (1 << sbp->sb_sectlog) || - sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || - sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || - sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || - sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || - sbp->sb_blocksize != (1 << sbp->sb_blocklog) || - sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || - sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || - sbp->sb_inodelog < XFS_DINODE_MIN_LOG || - sbp->sb_inodelog > XFS_DINODE_MAX_LOG || - sbp->sb_inodesize != (1 << sbp->sb_inodelog) || - (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || - (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || - (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || - sbp->sb_dblocks == 0 || - sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || - sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { - XFS_CORRUPTION_ERROR("SB sanity check failed", - XFS_ERRLEVEL_LOW, mp, sbp); - return XFS_ERROR(EFSCORRUPTED); - } - - /* - * Until this is fixed only page-sized or smaller data blocks work. - */ - if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - xfs_warn(mp, - "File system with blocksize %d bytes. " - "Only pagesize (%ld) or less will currently work.", - sbp->sb_blocksize, PAGE_SIZE); - return XFS_ERROR(ENOSYS); - } - - /* - * Currently only very few inode sizes are supported. 
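Most of the geometry checks in the removed block pair a size field with its log2 counterpart and demand exact power-of-two agreement (sbp->sb_sectsize != (1 << sbp->sb_sectlog) and friends). The pattern in isolation, assuming only the two on-disk fields and their legal log2 range:

#include <stdbool.h>
#include <stdint.h>

/* true iff size is exactly the power of two its log2 field claims */
static bool
size_matches_log2(uint32_t size, uint8_t log2field,
		  uint8_t minlog, uint8_t maxlog)
{
	if (log2field < minlog || log2field > maxlog)
		return false;
	return size == (1u << log2field);
}

Cross-checking both representations means a single corrupted field cannot quietly skew every shift and mask derived from it at mount time.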
- */ - switch (sbp->sb_inodesize) { - case 256: - case 512: - case 1024: - case 2048: - break; - default: - xfs_warn(mp, "inode size of %d bytes not supported", - sbp->sb_inodesize); - return XFS_ERROR(ENOSYS); - } - - if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || - xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { - xfs_warn(mp, - "file system too large to be mounted on this system."); - return XFS_ERROR(EFBIG); - } - - if (check_inprogress && sbp->sb_inprogress) { - xfs_warn(mp, "Offline file system operation in progress!"); - return XFS_ERROR(EFSCORRUPTED); - } - - /* - * Version 1 directory format has never worked on Linux. - */ - if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { - xfs_warn(mp, "file system using version 1 directory format"); - return XFS_ERROR(ENOSYS); - } - - return 0; -} - int xfs_initialize_perag( xfs_mount_t *mp, @@ -569,283 +271,15 @@ out_unwind: return error; } -static void -xfs_sb_quota_from_disk(struct xfs_sb *sbp) -{ - if (sbp->sb_qflags & XFS_OQUOTA_ENFD) - sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? - XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; - if (sbp->sb_qflags & XFS_OQUOTA_CHKD) - sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? - XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; - sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); -} - -void -xfs_sb_from_disk( - struct xfs_sb *to, - xfs_dsb_t *from) -{ - to->sb_magicnum = be32_to_cpu(from->sb_magicnum); - to->sb_blocksize = be32_to_cpu(from->sb_blocksize); - to->sb_dblocks = be64_to_cpu(from->sb_dblocks); - to->sb_rblocks = be64_to_cpu(from->sb_rblocks); - to->sb_rextents = be64_to_cpu(from->sb_rextents); - memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); - to->sb_logstart = be64_to_cpu(from->sb_logstart); - to->sb_rootino = be64_to_cpu(from->sb_rootino); - to->sb_rbmino = be64_to_cpu(from->sb_rbmino); - to->sb_rsumino = be64_to_cpu(from->sb_rsumino); - to->sb_rextsize = be32_to_cpu(from->sb_rextsize); - to->sb_agblocks = be32_to_cpu(from->sb_agblocks); - to->sb_agcount = be32_to_cpu(from->sb_agcount); - to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); - to->sb_logblocks = be32_to_cpu(from->sb_logblocks); - to->sb_versionnum = be16_to_cpu(from->sb_versionnum); - to->sb_sectsize = be16_to_cpu(from->sb_sectsize); - to->sb_inodesize = be16_to_cpu(from->sb_inodesize); - to->sb_inopblock = be16_to_cpu(from->sb_inopblock); - memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); - to->sb_blocklog = from->sb_blocklog; - to->sb_sectlog = from->sb_sectlog; - to->sb_inodelog = from->sb_inodelog; - to->sb_inopblog = from->sb_inopblog; - to->sb_agblklog = from->sb_agblklog; - to->sb_rextslog = from->sb_rextslog; - to->sb_inprogress = from->sb_inprogress; - to->sb_imax_pct = from->sb_imax_pct; - to->sb_icount = be64_to_cpu(from->sb_icount); - to->sb_ifree = be64_to_cpu(from->sb_ifree); - to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); - to->sb_frextents = be64_to_cpu(from->sb_frextents); - to->sb_uquotino = be64_to_cpu(from->sb_uquotino); - to->sb_gquotino = be64_to_cpu(from->sb_gquotino); - to->sb_qflags = be16_to_cpu(from->sb_qflags); - to->sb_flags = from->sb_flags; - to->sb_shared_vn = from->sb_shared_vn; - to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); - to->sb_unit = be32_to_cpu(from->sb_unit); - to->sb_width = be32_to_cpu(from->sb_width); - to->sb_dirblklog = from->sb_dirblklog; - to->sb_logsectlog = from->sb_logsectlog; - to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); - to->sb_logsunit = be32_to_cpu(from->sb_logsunit); - to->sb_features2 = 
be32_to_cpu(from->sb_features2); - to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); - to->sb_features_compat = be32_to_cpu(from->sb_features_compat); - to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); - to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); - to->sb_features_log_incompat = - be32_to_cpu(from->sb_features_log_incompat); - to->sb_pad = 0; - to->sb_pquotino = be64_to_cpu(from->sb_pquotino); - to->sb_lsn = be64_to_cpu(from->sb_lsn); -} - -static inline void -xfs_sb_quota_to_disk( - xfs_dsb_t *to, - xfs_sb_t *from, - __int64_t *fields) -{ - __uint16_t qflags = from->sb_qflags; - - if (*fields & XFS_SB_QFLAGS) { - /* - * The in-core version of sb_qflags do not have - * XFS_OQUOTA_* flags, whereas the on-disk version - * does. So, convert incore XFS_{PG}QUOTA_* flags - * to on-disk XFS_OQUOTA_* flags. - */ - qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | - XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); - - if (from->sb_qflags & - (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) - qflags |= XFS_OQUOTA_ENFD; - if (from->sb_qflags & - (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) - qflags |= XFS_OQUOTA_CHKD; - to->sb_qflags = cpu_to_be16(qflags); - *fields &= ~XFS_SB_QFLAGS; - } -} - -/* - * Copy in core superblock to ondisk one. - * - * The fields argument is mask of superblock fields to copy. - */ -void -xfs_sb_to_disk( - xfs_dsb_t *to, - xfs_sb_t *from, - __int64_t fields) -{ - xfs_caddr_t to_ptr = (xfs_caddr_t)to; - xfs_caddr_t from_ptr = (xfs_caddr_t)from; - xfs_sb_field_t f; - int first; - int size; - - ASSERT(fields); - if (!fields) - return; - - xfs_sb_quota_to_disk(to, from, &fields); - while (fields) { - f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); - first = xfs_sb_info[f].offset; - size = xfs_sb_info[f + 1].offset - first; - - ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); - - if (size == 1 || xfs_sb_info[f].type == 1) { - memcpy(to_ptr + first, from_ptr + first, size); - } else { - switch (size) { - case 2: - *(__be16 *)(to_ptr + first) = - cpu_to_be16(*(__u16 *)(from_ptr + first)); - break; - case 4: - *(__be32 *)(to_ptr + first) = - cpu_to_be32(*(__u32 *)(from_ptr + first)); - break; - case 8: - *(__be64 *)(to_ptr + first) = - cpu_to_be64(*(__u64 *)(from_ptr + first)); - break; - default: - ASSERT(0); - } - } - - fields &= ~(1LL << f); - } -} - -static int -xfs_sb_verify( - struct xfs_buf *bp, - bool check_version) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_sb sb; - - xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); - - /* - * Only check the in progress field for the primary superblock as - * mkfs.xfs doesn't clear it from secondary superblocks. - */ - return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, - check_version); -} - -/* - * If the superblock has the CRC feature bit set or the CRC field is non-null, - * check that the CRC is valid. We check the CRC field is non-null because a - * single bit error could clear the feature bit and unused parts of the - * superblock are supposed to be zero. Hence a non-null crc field indicates that - * we've potentially lost a feature bit and we should check it anyway. 
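The CRC rationale above is deliberately belt-and-braces: a v5 version number or a non-zero crc field each independently trigger verification, so one flipped bit cannot silently disable checksumming. The decision reduced to a sketch over already-decoded fields (constants spelled out from the on-disk format, with XFS_SB_VERSION_NUMBITS masking the low version bits):

#include <stdbool.h>
#include <stdint.h>

static bool
sb_crc_check_needed(uint32_t magicnum, uint16_t versionnum, uint32_t crc)
{
	const uint32_t	SB_MAGIC = 0x58465342;		/* "XFSB" */
	const uint16_t	VERSION_NUMBITS = 0x000f;
	const uint16_t	VERSION_5 = 5;

	if (magicnum != SB_MAGIC)
		return false;	/* not an XFS superblock: quiet probe path */
	return (versionnum & VERSION_NUMBITS) == VERSION_5 || crc != 0;
}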
- */ -static void -xfs_sb_read_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); - int error; - - /* - * open code the version check to avoid needing to convert the entire - * superblock from disk order just to check the version number - */ - if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && - (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == - XFS_SB_VERSION_5) || - dsb->sb_crc != 0)) { - - if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize), - offsetof(struct xfs_sb, sb_crc))) { - error = EFSCORRUPTED; - goto out_error; - } - } - error = xfs_sb_verify(bp, true); - -out_error: - if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, error); - } -} - -/* - * We may be probed for a filesystem match, so we may not want to emit - * messages when the superblock buffer is not actually an XFS superblock. - * If we find an XFS superblock, the run a normal, noisy mount because we are - * really going to mount it and want to know about errors. - */ -static void -xfs_sb_quiet_read_verify( - struct xfs_buf *bp) -{ - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); - - - if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { - /* XFS filesystem, verify noisily! */ - xfs_sb_read_verify(bp); - return; - } - /* quietly fail */ - xfs_buf_ioerror(bp, EWRONGFS); -} - -static void -xfs_sb_write_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; - int error; - - error = xfs_sb_verify(bp, false); - if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, error); - return; - } - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - if (bip) - XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); - - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_sb, sb_crc)); -} - -const struct xfs_buf_ops xfs_sb_buf_ops = { - .verify_read = xfs_sb_read_verify, - .verify_write = xfs_sb_write_verify, -}; - -static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { - .verify_read = xfs_sb_quiet_read_verify, - .verify_write = xfs_sb_write_verify, -}; - /* * xfs_readsb * * Does the initial read of the superblock. */ int -xfs_readsb(xfs_mount_t *mp, int flags) +xfs_readsb( + struct xfs_mount *mp, + int flags) { unsigned int sector_size; struct xfs_buf *bp; @@ -884,8 +318,8 @@ reread: * Initialize the mount structure from the superblock. */ xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); - xfs_sb_quota_from_disk(&mp->m_sb); + /* * We must be able to do sector-sized and sector-aligned IO. 
*/ @@ -922,107 +356,6 @@ release_buf: return error; } - -/* - * xfs_mount_common - * - * Mount initialization code establishing various mount - * fields from the superblock associated with the given - * mount structure - */ -STATIC void -xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) -{ - mp->m_agfrotor = mp->m_agirotor = 0; - spin_lock_init(&mp->m_agirotor_lock); - mp->m_maxagi = mp->m_sb.sb_agcount; - mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; - mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; - mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; - mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; - mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; - mp->m_blockmask = sbp->sb_blocksize - 1; - mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; - mp->m_blockwmask = mp->m_blockwsize - 1; - - mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); - mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; - mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; - - mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); - mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; - mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; - - mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); - mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; - mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; - - mp->m_bsize = XFS_FSB_TO_BB(mp, 1); - mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, - sbp->sb_inopblock); - mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; -} - -/* - * xfs_initialize_perag_data - * - * Read in each per-ag structure so we can count up the number of - * allocated inodes, free inodes and used filesystem blocks as this - * information is no longer persistent in the superblock. Once we have - * this information, write it into the in-core superblock structure. - */ -STATIC int -xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) -{ - xfs_agnumber_t index; - xfs_perag_t *pag; - xfs_sb_t *sbp = &mp->m_sb; - uint64_t ifree = 0; - uint64_t ialloc = 0; - uint64_t bfree = 0; - uint64_t bfreelst = 0; - uint64_t btree = 0; - int error; - - for (index = 0; index < agcount; index++) { - /* - * read the agf, then the agi. This gets us - * all the information we need and populates the - * per-ag structures for us. - */ - error = xfs_alloc_pagf_init(mp, NULL, index, 0); - if (error) - return error; - - error = xfs_ialloc_pagi_init(mp, NULL, index); - if (error) - return error; - pag = xfs_perag_get(mp, index); - ifree += pag->pagi_freecount; - ialloc += pag->pagi_count; - bfree += pag->pagf_freeblks; - bfreelst += pag->pagf_flcount; - btree += pag->pagf_btreeblks; - xfs_perag_put(pag); - } - /* - * Overwrite incore superblock counters with just-read data - */ - spin_lock(&mp->m_sb_lock); - sbp->sb_ifree = ifree; - sbp->sb_icount = ialloc; - sbp->sb_fdblocks = bfree + bfreelst + btree; - spin_unlock(&mp->m_sb_lock); - - /* Fixup the per-cpu counters as well. */ - xfs_icsb_reinit_counters(mp); - - return 0; -} - /* * Update alignment values based on mount options and sb values */ @@ -1194,7 +527,7 @@ xfs_set_inoalignment(xfs_mount_t *mp) } /* - * Check that the data (and log if separate) are an ok size. + * Check that the data (and log if separate) is an ok size. 
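xfs_check_sizes(), referenced just above and unchanged by this patch, proves the superblock's claimed size is backed by real storage by reading the last block the geometry says must exist. The idea in free-standing form, with a hypothetical read_block() callback doing the actual I/O:

#include <errno.h>
#include <stdint.h>

/* returns 0 if block (nblocks - 1) is readable, else an error */
static int
check_device_size(uint64_t nblocks, int (*read_block)(uint64_t blkno))
{
	if (nblocks == 0)
		return -EINVAL;
	/* a read of the final block fails fast on a truncated device */
	return read_block(nblocks - 1);
}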
*/ STATIC int xfs_check_sizes(xfs_mount_t *mp) @@ -1264,8 +597,7 @@ xfs_mount_reset_sbqflags( return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0); if (error) { xfs_trans_cancel(tp, 0); xfs_alert(mp, "%s: Superblock update failed!", __func__); @@ -1315,7 +647,7 @@ xfs_mountfs( uint quotaflags = 0; int error = 0; - xfs_mount_common(mp, sbp); + xfs_sb_mount_common(mp, sbp); /* * Check for a mismatched features2 values. Older kernels @@ -1400,7 +732,7 @@ xfs_mountfs( xfs_set_inoalignment(mp); /* - * Check that the data (and log if separate) are an ok size. + * Check that the data (and log if separate) is an ok size. */ error = xfs_check_sizes(mp); if (error) @@ -1738,8 +1070,7 @@ xfs_log_sbcount(xfs_mount_t *mp) return 0; tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -1752,49 +1083,7 @@ xfs_log_sbcount(xfs_mount_t *mp) } /* - * xfs_mod_sb() can be used to copy arbitrary changes to the - * in-core superblock into the superblock buffer to be logged. - * It does not provide the higher level of locking that is - * needed to protect the in-core superblock from concurrent - * access. - */ -void -xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) -{ - xfs_buf_t *bp; - int first; - int last; - xfs_mount_t *mp; - xfs_sb_field_t f; - - ASSERT(fields); - if (!fields) - return; - mp = tp->t_mountp; - bp = xfs_trans_getsb(tp, mp, 0); - first = sizeof(xfs_sb_t); - last = 0; - - /* translate/copy */ - - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); - - /* find modified range */ - f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - last = xfs_sb_info[f + 1].offset - 1; - - f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - first = xfs_sb_info[f].offset; - - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); - xfs_trans_log_buf(tp, bp, first, last); -} - - -/* - * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply + * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply * a delta to a specified field in the in-core superblock. Simply * switch on the field indicated and apply the delta to that field. 
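xfs_mod_incore_sb_unlocked(), whose comment continues below, applies signed deltas to unsigned in-core counters and must reject any delta that would drive a counter negative. The core pattern, separated from the big per-field switch:

#include <errno.h>
#include <stdint.h>

/* apply a signed delta to an unsigned counter, refusing underflow */
static int
mod_counter(uint64_t *counter, int64_t delta)
{
	int64_t	res = (int64_t)*counter + delta;

	if (res < 0)
		return -ENOSPC;	/* counter left untouched on failure */
	*counter = (uint64_t)res;
	return 0;
}

(The cast assumes the counter fits in 63 bits, which holds for block and inode counts in practice.)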
* Fields are not allowed to dip below zero, so if the delta would @@ -2101,8 +1390,7 @@ xfs_mount_log_sb( XFS_SB_VERSIONNUM)); tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); - error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -2260,12 +1548,6 @@ xfs_icsb_init_counters( if (mp->m_sb_cnts == NULL) return -ENOMEM; -#ifdef CONFIG_HOTPLUG_CPU - mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; - mp->m_icsb_notifier.priority = 0; - register_hotcpu_notifier(&mp->m_icsb_notifier); -#endif /* CONFIG_HOTPLUG_CPU */ - for_each_online_cpu(i) { cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); @@ -2278,6 +1560,13 @@ xfs_icsb_init_counters( * initial balance kicks us off correctly */ mp->m_icsb_counters = -1; + +#ifdef CONFIG_HOTPLUG_CPU + mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; + mp->m_icsb_notifier.priority = 0; + register_hotcpu_notifier(&mp->m_icsb_notifier); +#endif /* CONFIG_HOTPLUG_CPU */ + return 0; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 4e374d4a9189..1fa0584b5627 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,45 +18,7 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ -typedef struct xfs_trans_reservations { - uint tr_write; /* extent alloc trans */ - uint tr_itruncate; /* truncate trans */ - uint tr_rename; /* rename trans */ - uint tr_link; /* link trans */ - uint tr_remove; /* unlink trans */ - uint tr_symlink; /* symlink trans */ - uint tr_create; /* create trans */ - uint tr_mkdir; /* mkdir trans */ - uint tr_ifree; /* inode free trans */ - uint tr_ichange; /* inode update trans */ - uint tr_growdata; /* fs data section grow trans */ - uint tr_swrite; /* sync write inode trans */ - uint tr_addafork; /* cvt inode to attributed trans */ - uint tr_writeid; /* write setuid/setgid file */ - uint tr_attrinval; /* attr fork buffer invalidation */ - uint tr_attrsetm; /* set/create an attribute at mount time */ - uint tr_attrsetrt; /* set/create an attribute at runtime */ - uint tr_attrrm; /* remove an attribute */ - uint tr_clearagi; /* clear bad agi unlinked ino bucket */ - uint tr_growrtalloc; /* grow realtime allocations */ - uint tr_growrtzero; /* grow realtime zeroing */ - uint tr_growrtfree; /* grow realtime freeing */ - uint tr_qm_sbchange; /* change quota flags */ - uint tr_qm_setqlim; /* adjust quota limits */ - uint tr_qm_dqalloc; /* allocate quota on disk */ - uint tr_qm_quotaoff; /* turn quota off */ - uint tr_qm_equotaoff;/* end of turn quota off */ - uint tr_sb; /* modify superblock */ -} xfs_trans_reservations_t; - -#ifndef __KERNEL__ - -#define xfs_daddr_to_agno(mp,d) \ - ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) -#define xfs_daddr_to_agbno(mp,d) \ - ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) - -#else /* __KERNEL__ */ +#ifdef __KERNEL__ struct xlog; struct xfs_inode; @@ -174,7 +136,7 @@ typedef struct xfs_mount { int m_ialloc_blks; /* blocks in inode allocation */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ - xfs_trans_reservations_t m_reservations;/* precomputed res values */ + struct xfs_trans_resv m_resv; /* precomputed res values */ __uint64_t m_maxicount; /* maximum inode count */ __uint64_t m_resblks; /* total reserved blocks */ __uint64_t m_resblks_avail;/* available reserved blocks */ @@ -330,14 +292,6 @@ 
xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* - * perag get/put wrappers for ref counting - */ -struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); -struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, - int tag); -void xfs_perag_put(struct xfs_perag *pag); - -/* * Per-cpu superblock locking functions */ #ifdef HAVE_PERCPU_SB @@ -366,9 +320,63 @@ typedef struct xfs_mod_sb { int64_t msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; +/* + * Per-ag incore structure, copies of information in agf and agi, to improve the + * performance of allocation group selection. This is defined for the kernel + * only, and hence is defined here instead of in xfs_ag.h. You need the struct + * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree), + * so this doesn't introduce any strange header file dependencies. + */ +typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + __uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + __uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. 
+ * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + struct mutex pag_ici_reclaim_lock; /* serialisation point */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ + struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; + int pagb_count; /* pagb slots in use */ +} xfs_perag_t; + extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); +extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi); extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); @@ -387,13 +395,4 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ -extern void xfs_sb_calc_crc(struct xfs_buf *); -extern void xfs_mod_sb(struct xfs_trans *, __int64_t); -extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, - xfs_agnumber_t *); -extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); -extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); - -extern const struct xfs_buf_ops xfs_sb_buf_ops; - #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d320794d03ce..6218a0aeeeea 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -17,6 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_trans.h" @@ -37,7 +38,6 @@ #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" -#include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -834,21 +834,52 @@ xfs_qm_qino_alloc( int error; int committed; + *ip = NULL; + /* + * With superblock that doesn't have separate pquotino, we + * share an inode between gquota and pquota. If the on-disk + * superblock has GQUOTA and the filesystem is now mounted + * with PQUOTA, just use sb_gquotino for sb_pquotino and + * vice-versa. 
+ */ + if (!xfs_sb_version_has_pquotino(&mp->m_sb) && + (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) { + xfs_ino_t ino = NULLFSINO; + + if ((flags & XFS_QMOPT_PQUOTA) && + (mp->m_sb.sb_gquotino != NULLFSINO)) { + ino = mp->m_sb.sb_gquotino; + ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); + } else if ((flags & XFS_QMOPT_GQUOTA) && + (mp->m_sb.sb_pquotino != NULLFSINO)) { + ino = mp->m_sb.sb_pquotino; + ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); + } + if (ino != NULLFSINO) { + error = xfs_iget(mp, NULL, ino, 0, 0, ip); + if (error) + return error; + mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; + } + } + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); - if ((error = xfs_trans_reserve(tp, - XFS_QM_QINOCREATE_SPACE_RES(mp), - XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_CREATE_LOG_COUNT))) { + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, + XFS_QM_QINOCREATE_SPACE_RES(mp), 0); + if (error) { xfs_trans_cancel(tp, 0); return error; } - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); - if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); - return error; + if (!*ip) { + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, + &committed); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + return error; + } } /* @@ -860,21 +891,25 @@ xfs_qm_qino_alloc( if (flags & XFS_QMOPT_SBVERSION) { ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == - (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); + XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) == + (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | + XFS_SB_QFLAGS)); xfs_sb_version_addquota(&mp->m_sb); mp->m_sb.sb_uquotino = NULLFSINO; mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; - /* qflags will get updated _after_ quotacheck */ - mp->m_sb.sb_qflags = 0; + /* qflags will get updated fully _after_ quotacheck */ + mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT; } if (flags & XFS_QMOPT_UQUOTA) mp->m_sb.sb_uquotino = (*ip)->i_ino; - else + else if (flags & XFS_QMOPT_GQUOTA) mp->m_sb.sb_gquotino = (*ip)->i_ino; + else + mp->m_sb.sb_pquotino = (*ip)->i_ino; spin_unlock(&mp->m_sb_lock); xfs_mod_sb(tp, sbfields); @@ -1484,11 +1519,10 @@ xfs_qm_init_quotainos( if (error) goto error_rele; } - /* XXX: Use gquotino for now */ if (XFS_IS_PQUOTA_ON(mp) && - mp->m_sb.sb_gquotino != NULLFSINO) { - ASSERT(mp->m_sb.sb_gquotino > 0); - error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + mp->m_sb.sb_pquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_pquotino > 0); + error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, 0, 0, &pip); if (error) goto error_rele; @@ -1496,7 +1530,8 @@ xfs_qm_init_quotainos( } else { flags |= XFS_QMOPT_SBVERSION; sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_QFLAGS); + XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | + XFS_SB_QFLAGS); } /* @@ -1524,9 +1559,8 @@ xfs_qm_init_quotainos( flags &= ~XFS_QMOPT_SBVERSION; } if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { - /* XXX: Use XFS_SB_GQUOTINO for now */ error = xfs_qm_qino_alloc(mp, &pip, - sbflags | XFS_SB_GQUOTINO, + sbflags | XFS_SB_PQUOTINO, flags | XFS_QMOPT_PQUOTA); if (error) goto error_rele; @@ -1704,8 +1738,7 @@ xfs_qm_write_sb_changes( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), - 0, 0, 
XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -1734,8 +1767,8 @@ xfs_qm_write_sb_changes( int xfs_qm_vop_dqalloc( struct xfs_inode *ip, - uid_t uid, - gid_t gid, + xfs_dqid_t uid, + xfs_dqid_t gid, prid_t prid, uint flags, struct xfs_dquot **O_udqpp, @@ -1782,7 +1815,7 @@ xfs_qm_vop_dqalloc( * holding ilock. */ xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, + error = xfs_qm_dqget(mp, NULL, uid, XFS_DQ_USER, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, @@ -1809,7 +1842,7 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, + error = xfs_qm_dqget(mp, NULL, gid, XFS_DQ_GROUP, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, @@ -1943,7 +1976,7 @@ xfs_qm_vop_chown_reserve( XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { + ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 579d6a02a5b6..670cd4464070 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -160,6 +160,8 @@ extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, struct fs_disk_quota *); extern int xfs_qm_scall_getqstat(struct xfs_mount *, struct fs_quota_stat *); +extern int xfs_qm_scall_getqstatv(struct xfs_mount *, + struct fs_quota_statv *); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 437a52d91f6d..3af50ccdfac1 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -17,6 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index e4f8b2d6f38b..8174aad0b388 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -20,6 +20,7 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_trans.h" @@ -37,7 +38,6 @@ #include "xfs_error.h" #include "xfs_attr.h" #include "xfs_buf_item.h" -#include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -247,9 +247,7 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_IOLOCK_EXCL); tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { xfs_trans_cancel(tp, 0); xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -296,8 +294,10 @@ xfs_qm_scall_trunc_qfiles( if (flags & XFS_DQ_USER) error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); - if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) + if (flags & XFS_DQ_GROUP) error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (flags & XFS_DQ_PROJ) + error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); return error ? error : error2; } @@ -404,6 +404,7 @@ xfs_qm_scall_quotaon( /* * Return quota status information, such as uquota-off, enforcements, etc. + * for Q_XGETQSTAT command. 
*/ int xfs_qm_scall_getqstat( @@ -413,8 +414,10 @@ xfs_qm_scall_getqstat( struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_inode *uip = NULL; struct xfs_inode *gip = NULL; + struct xfs_inode *pip = NULL; bool tempuqip = false; bool tempgqip = false; + bool temppqip = false; memset(out, 0, sizeof(fs_quota_stat_t)); @@ -424,16 +427,106 @@ xfs_qm_scall_getqstat( out->qs_gquota.qfs_ino = NULLFSINO; return (0); } + + out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & + (XFS_ALL_QUOTA_ACCT| + XFS_ALL_QUOTA_ENFD)); + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; + pip = q->qi_pquotaip; + } + if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip) == 0) + tempuqip = true; + } + if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip) == 0) + tempgqip = true; + } + /* + * Q_XGETQSTAT doesn't have room for both group and project quotas. + * So, allow the project quota values to be copied out only if + * there is no group quota information available. + */ + if (!gip) { + if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, + 0, 0, &pip) == 0) + temppqip = true; + } + } else + pip = NULL; + if (uip) { + out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; + out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; + out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; + if (tempuqip) + IRELE(uip); + } + + if (gip) { + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; + if (tempgqip) + IRELE(gip); + } + if (pip) { + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = pip->i_d.di_nextents; + if (temppqip) + IRELE(pip); + } + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; + } + return 0; +} + +/* + * Return quota status information, such as uquota-off, enforcements, etc. + * for Q_XGETQSTATV command, to support separate project quota field. 
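The older fs_quota_stat ABI has a single slot shared by group and project quota, so the Q_XGETQSTAT path above copies project values into qs_gquota only when no group quota inode exists; the Q_XGETQSTATV variant below gets a dedicated qs_pquota instead. The precedence rule, with hypothetical names:

enum slot_owner { SLOT_EMPTY, SLOT_GROUP, SLOT_PROJECT };

/* old ABI: group wins the shared slot, project only fills a vacancy */
static enum slot_owner
fill_gquota_slot(int have_group, int have_project)
{
	if (have_group)
		return SLOT_GROUP;
	if (have_project)
		return SLOT_PROJECT;
	return SLOT_EMPTY;
}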
+ */ +int +xfs_qm_scall_getqstatv( + struct xfs_mount *mp, + struct fs_quota_statv *out) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + struct xfs_inode *pip = NULL; + bool tempuqip = false; + bool tempgqip = false; + bool temppqip = false; + + if (!xfs_sb_version_hasquota(&mp->m_sb)) { + out->qs_uquota.qfs_ino = NULLFSINO; + out->qs_gquota.qfs_ino = NULLFSINO; + out->qs_pquota.qfs_ino = NULLFSINO; + return (0); + } + out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & (XFS_ALL_QUOTA_ACCT| XFS_ALL_QUOTA_ENFD)); - out->qs_pad = 0; out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino; if (q) { uip = q->qi_uquotaip; gip = q->qi_gquotaip; + pip = q->qi_pquotaip; } if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, @@ -445,18 +538,30 @@ xfs_qm_scall_getqstat( 0, 0, &gip) == 0) tempgqip = true; } + if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, + 0, 0, &pip) == 0) + temppqip = true; + } if (uip) { out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; if (tempuqip) IRELE(uip); } + if (gip) { out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; if (tempgqip) IRELE(gip); } + if (pip) { + out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks; + out->qs_pquota.qfs_nextents = pip->i_d.di_nextents; + if (temppqip) + IRELE(pip); + } if (q) { out->qs_incoredqs = q->qi_dquots; out->qs_btimelimit = q->qi_btimelimit; @@ -515,8 +620,7 @@ xfs_qm_scall_setqlim( xfs_dqunlock(dqp); tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); if (error) { xfs_trans_cancel(tp, 0); goto out_rele; @@ -650,8 +754,7 @@ xfs_qm_log_quotaoff_end( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return (error); @@ -684,8 +787,7 @@ xfs_qm_log_quotaoff( uint oldsbqflag=0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); if (error) goto error0; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index b14f42c714b6..e7d84d2d8683 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -18,267 +18,14 @@ #ifndef __XFS_QUOTA_H__ #define __XFS_QUOTA_H__ -struct xfs_trans; - -/* - * The ondisk form of a dquot structure. - */ -#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ -#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ - -/* - * uid_t and gid_t are hard-coded to 32 bits in the inode. - * Hence, an 'id' in a dquot is 32 bits.. - */ -typedef __uint32_t xfs_dqid_t; - -/* - * Even though users may not have quota limits occupying all 64-bits, - * they may need 64-bit accounting. Hence, 64-bit quota-counters, - * and quota-limits. This is a waste in the common case, but hey ... - */ -typedef __uint64_t xfs_qcnt_t; -typedef __uint16_t xfs_qwarncnt_t; - -/* - * This is the main portion of the on-disk representation of quota - * information for a user. 
This is the q_core of the xfs_dquot_t that - * is kept in kernel memory. We pad this with some more expansion room - * to construct the on disk structure. - */ -typedef struct xfs_disk_dquot { - __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ - __u8 d_version; /* dquot version */ - __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ - __be32 d_id; /* user,project,group id */ - __be64 d_blk_hardlimit;/* absolute limit on disk blks */ - __be64 d_blk_softlimit;/* preferred limit on disk blks */ - __be64 d_ino_hardlimit;/* maximum # allocated inodes */ - __be64 d_ino_softlimit;/* preferred inode limit */ - __be64 d_bcount; /* disk blocks owned by the user */ - __be64 d_icount; /* inodes owned by the user */ - __be32 d_itimer; /* zero if within inode limits if not, - this is when we refuse service */ - __be32 d_btimer; /* similar to above; for disk blocks */ - __be16 d_iwarns; /* warnings issued wrt num inodes */ - __be16 d_bwarns; /* warnings issued wrt disk blocks */ - __be32 d_pad0; /* 64 bit align */ - __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ - __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ - __be64 d_rtbcount; /* realtime blocks owned */ - __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ - __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ - __be16 d_pad; -} xfs_disk_dquot_t; - -/* - * This is what goes on disk. This is separated from the xfs_disk_dquot because - * carrying the unnecessary padding would be a waste of memory. - */ -typedef struct xfs_dqblk { - xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ - char dd_fill[4]; /* filling for posterity */ - - /* - * These two are only present on filesystems with the CRC bits set. - */ - __be32 dd_crc; /* checksum */ - __be64 dd_lsn; /* last modification in log */ - uuid_t dd_uuid; /* location information */ -} xfs_dqblk_t; - -#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) - -/* - * flags for q_flags field in the dquot. - */ -#define XFS_DQ_USER 0x0001 /* a user quota */ -#define XFS_DQ_PROJ 0x0002 /* project quota */ -#define XFS_DQ_GROUP 0x0004 /* a group quota */ -#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ - -#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) - -#define XFS_DQ_FLAGS \ - { XFS_DQ_USER, "USER" }, \ - { XFS_DQ_PROJ, "PROJ" }, \ - { XFS_DQ_GROUP, "GROUP" }, \ - { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_FREEING, "FREEING" } - -/* - * We have the possibility of all three quota types being active at once, and - * hence free space modification requires modification of all three current - * dquots in a single transaction. For this case we need to have a reservation - * of at least 3 dquots. - * - * However, a chmod operation can change both UID and GID in a single - * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be - * modified. Hence for this case we need to reserve space for at least 4 dquots. - * - * And in the worst case, there's a rename operation that can be modifying up to - * 4 inodes with dquots attached to them. In reality, the only inodes that can - * have their dquots modified are the source and destination directory inodes - * due to directory name creation and removal. That can require space allocation - * and/or freeing on both directory inodes, and hence all three dquots on each - * inode can be modified. And if the directories are world writeable, all the - * dquots can be unique and so 6 dquots can be modified.... 
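The reservation comment above counts dquots rather than bytes: up to three quota types per inode, and in the rename worst case two directory inodes carrying fully distinct dquot sets, hence the factor of 6 in XFS_DQUOT_LOGRES. Spelled out:

/* worst case named above: rename across world-writable directories */
enum {
	DQ_TYPES_PER_INODE	= 3,	/* user, group, project */
	RENAME_DIR_INODES	= 2,	/* source and destination dirs */
	MAX_DQUOTS_PER_TRANS	= DQ_TYPES_PER_INODE * RENAME_DIR_INODES,
};

Each of those six dquots is logged as a log-format header plus a disk dquot, which is exactly the pair the macro multiplies by 6.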
- * - * And, of course, we also need to take into account the dquot log format item - * used to describe each dquot. - */ -#define XFS_DQUOT_LOGRES(mp) \ - ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) - -/* - * These are the structures used to lay out dquots and quotaoff - * records on the log. Quite similar to those of inodes. - */ - -/* - * log format struct for dquots. - * The first two fields must be the type and size fitting into - * 32 bits : log_recovery code assumes that. - */ -typedef struct xfs_dq_logformat { - __uint16_t qlf_type; /* dquot log item type */ - __uint16_t qlf_size; /* size of this item */ - xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ - __int64_t qlf_blkno; /* blkno of dquot buffer */ - __int32_t qlf_len; /* len of dquot buffer */ - __uint32_t qlf_boffset; /* off of dquot in buffer */ -} xfs_dq_logformat_t; - -/* - * log format struct for QUOTAOFF records. - * The first two fields must be the type and size fitting into - * 32 bits : log_recovery code assumes that. - * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer - * to the first and ensures that the first logitem is taken out of the AIL - * only when the last one is securely committed. - */ -typedef struct xfs_qoff_logformat { - unsigned short qf_type; /* quotaoff log item type */ - unsigned short qf_size; /* size of this item */ - unsigned int qf_flags; /* USR and/or GRP */ - char qf_pad[12]; /* padding for future */ -} xfs_qoff_logformat_t; - - -/* - * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. - */ -#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ -#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ -#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ -#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ -#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ -#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ -#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ - -/* - * Conversion to and from the combined OQUOTA flag (if necessary) - * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk() - */ -#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */ -#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */ -#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */ -#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */ - -/* - * Quota Accounting/Enforcement flags - */ -#define XFS_ALL_QUOTA_ACCT \ - (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) -#define XFS_ALL_QUOTA_ENFD \ - (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD) -#define XFS_ALL_QUOTA_CHKD \ - (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD) - -#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) -#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) -#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) -#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) -#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) -#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) -#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) - -/* - * Incore only flags for quotaoff - these bits get cleared when quota(s) - * are in the process of getting turned off. These flags are in m_qflags but - * never in sb_qflags. 
- */ -#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ -#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ -#define XFS_ALL_QUOTA_ACTIVE \ - (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) +#include "xfs_quota_defs.h" /* - * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees - * quota will be not be switched off as long as that inode lock is held. + * Kernel only quota definitions and functions */ -#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) -#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) -#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) -/* - * Flags to tell various functions what to do. Not all of these are meaningful - * to a single function. None of these XFS_QMOPT_* flags are meant to have - * persistent values (ie. their values can and will change between versions) - */ -#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ -#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ -#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ -#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ -#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ - -/* - * flags to xfs_trans_mod_dquot to indicate which field needs to be - * modified. - */ -#define XFS_QMOPT_RES_REGBLKS 0x0010000 -#define XFS_QMOPT_RES_RTBLKS 0x0020000 -#define XFS_QMOPT_BCOUNT 0x0040000 -#define XFS_QMOPT_ICOUNT 0x0080000 -#define XFS_QMOPT_RTBCOUNT 0x0100000 -#define XFS_QMOPT_DELBCOUNT 0x0200000 -#define XFS_QMOPT_DELRTBCOUNT 0x0400000 -#define XFS_QMOPT_RES_INOS 0x0800000 - -/* - * flags for dqalloc. - */ -#define XFS_QMOPT_INHERIT 0x1000000 - -/* - * flags to xfs_trans_mod_dquot. - */ -#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS -#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS -#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS -#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT -#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT -#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT -#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT -#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT - - -#define XFS_QMOPT_QUOTALL \ - (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) -#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) +struct xfs_trans; -#ifdef __KERNEL__ /* * This check is done typically without holding the inode lock; * that may seem racy, but it is harmless in the context that it is used. @@ -301,13 +48,6 @@ typedef struct xfs_qoff_logformat { (XFS_IS_PQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) -#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ - XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\ - XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\ - XFS_PQUOTA_CHKD) - - /* * The structure kept inside the xfs_trans_t keep track of dquot changes * within a transaction and apply them later. 
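The hunk above replaces the quota definitions that used to live in xfs_quota.h with a single include of the new shared header, leaving only kernel-private declarations behind. A minimal sketch of how the retained predicates are meant to be used, assuming only the macros moved into xfs_quota_defs.h and the existing xfs_qm_dqattach() entry point (the helper name and structure here are illustrative, not part of the patch):

	/*
	 * Hypothetical helper: check the mount-wide quota state before
	 * attaching dquots to an inode. XFS_IS_QUOTA_RUNNING() tests the
	 * accounting (ACCT) flags; XFS_IS_QUOTA_ON() tests the ACTIVE
	 * flags that quotaoff clears, so holding the inode lock keeps
	 * the result stable.
	 */
	static int example_attach_dquots(struct xfs_mount *mp, struct xfs_inode *ip)
	{
		if (!XFS_IS_QUOTA_RUNNING(mp))	/* no accounting enabled */
			return 0;
		if (!XFS_IS_QUOTA_ON(mp))	/* quotaoff in progress */
			return 0;
		return xfs_qm_dqattach(ip, 0);	/* attach user/group/project dquots */
	}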
@@ -340,8 +80,9 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, long, long, uint); -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint, - struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **); +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, + prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, + struct xfs_dquot **); extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *); extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); @@ -362,9 +103,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *); #else static inline int -xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, - uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp, - struct xfs_dquot **pdqp) +xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, + prid_t prid, uint flags, struct xfs_dquot **udqp, + struct xfs_dquot **gdqp, struct xfs_dquot **pdqp) { *udqp = NULL; *gdqp = NULL; @@ -415,5 +156,4 @@ extern int xfs_mount_reset_sbqflags(struct xfs_mount *); extern const struct xfs_buf_ops xfs_dquot_buf_ops; -#endif /* __KERNEL__ */ #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h new file mode 100644 index 000000000000..e6b0d6e1f4f2 --- /dev/null +++ b/fs/xfs/xfs_quota_defs.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_DEFS_H__ +#define __XFS_QUOTA_DEFS_H__ + +/* + * Quota definitions shared between user and kernel source trees. + */ + +/* + * Even though users may not have quota limits occupying all 64-bits, + * they may need 64-bit accounting. Hence, 64-bit quota-counters, + * and quota-limits. This is a waste in the common case, but hey ... + */ +typedef __uint64_t xfs_qcnt_t; +typedef __uint16_t xfs_qwarncnt_t; + +/* + * flags for q_flags field in the dquot. + */ +#define XFS_DQ_USER 0x0001 /* a user quota */ +#define XFS_DQ_PROJ 0x0002 /* project quota */ +#define XFS_DQ_GROUP 0x0004 /* a group quota */ +#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ +#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ + +#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) + +#define XFS_DQ_FLAGS \ { XFS_DQ_USER, "USER" }, \ { XFS_DQ_PROJ, "PROJ" }, \ { XFS_DQ_GROUP, "GROUP" }, \ { XFS_DQ_DIRTY, "DIRTY" }, \ { XFS_DQ_FREEING, "FREEING" } + +/* + * We have the possibility of all three quota types being active at once, and + * hence free space modification requires modification of all three current + * dquots in a single transaction. For this case we need to have a reservation + * of at least 3 dquots.
+ * + * However, a chmod operation can change both UID and GID in a single + * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be + * modified. Hence for this case we need to reserve space for at least 4 dquots. + * + * And in the worst case, there's a rename operation that can be modifying up to + * 4 inodes with dquots attached to them. In reality, the only inodes that can + * have their dquots modified are the source and destination directory inodes + * due to directory name creation and removal. That can require space allocation + * and/or freeing on both directory inodes, and hence all three dquots on each + * inode can be modified. And if the directories are world writeable, all the + * dquots can be unique and so 6 dquots can be modified.... + * + * And, of course, we also need to take into account the dquot log format item + * used to describe each dquot. + */ +#define XFS_DQUOT_LOGRES(mp) \ + ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) + +#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) +#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) +#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) + +/* + * Incore only flags for quotaoff - these bits get cleared when quota(s) + * are in the process of getting turned off. These flags are in m_qflags but + * never in sb_qflags. + */ +#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ +#define XFS_ALL_QUOTA_ACTIVE \ + (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) + +/* + * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees + * quota will not be switched off as long as that inode lock is held. + */ +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ + XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) + +/* + * Flags to tell various functions what to do. Not all of these are meaningful + * to a single function. None of these XFS_QMOPT_* flags are meant to have + * persistent values (i.e. their values can and will change between versions) + */ +#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ +#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ +#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ +#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ + +/* + * flags to xfs_trans_mod_dquot to indicate which field needs to be + * modified.
+ */ +#define XFS_QMOPT_RES_REGBLKS 0x0010000 +#define XFS_QMOPT_RES_RTBLKS 0x0020000 +#define XFS_QMOPT_BCOUNT 0x0040000 +#define XFS_QMOPT_ICOUNT 0x0080000 +#define XFS_QMOPT_RTBCOUNT 0x0100000 +#define XFS_QMOPT_DELBCOUNT 0x0200000 +#define XFS_QMOPT_DELRTBCOUNT 0x0400000 +#define XFS_QMOPT_RES_INOS 0x0800000 + +/* + * flags for dqalloc. + */ +#define XFS_QMOPT_INHERIT 0x1000000 + +/* + * flags to xfs_trans_mod_dquot. + */ +#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS +#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS +#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS +#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT +#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT +#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT +#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT +#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT + + +#define XFS_QMOPT_QUOTALL \ + (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) +#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + +#endif /* __XFS_QUOTA_DEFS_H__ */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 20e30f93b0c7..1326d81596c2 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -16,8 +16,10 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_sb.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" #include "xfs_log.h" +#include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" @@ -54,6 +56,18 @@ xfs_fs_get_xstate( } STATIC int +xfs_fs_get_xstatev( + struct super_block *sb, + struct fs_quota_statv *fqs) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_qm_scall_getqstatv(mp, fqs); +} + +STATIC int xfs_fs_set_xstate( struct super_block *sb, unsigned int uflags, @@ -133,6 +147,7 @@ xfs_fs_set_dqblk( } const struct quotactl_ops xfs_quotactl_operations = { + .get_xstatev = xfs_fs_get_xstatev, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, .get_dqblk = xfs_fs_get_dqblk, diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c deleted file mode 100644 index 30ff5f401d28..000000000000 --- a/fs/xfs/xfs_rename.c +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_trans_space.h" -#include "xfs_vnodeops.h" -#include "xfs_trace.h" - - -/* - * Enter all inodes for a rename transaction into a sorted array.
- */ -STATIC void -xfs_sort_for_rename( - xfs_inode_t *dp1, /* in: old (source) directory inode */ - xfs_inode_t *dp2, /* in: new (target) directory inode */ - xfs_inode_t *ip1, /* in: inode of old entry */ - xfs_inode_t *ip2, /* in: inode of new entry, if it - already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ - int *num_inodes) /* out: number of inodes in array */ -{ - xfs_inode_t *temp; - int i, j; - - /* - * i_tab contains a list of pointers to inodes. We initialize - * the table here & we'll sort it. We will then use it to - * order the acquisition of the inode locks. - * - * Note that the table may contain duplicates. e.g., dp1 == dp2. - */ - i_tab[0] = dp1; - i_tab[1] = dp2; - i_tab[2] = ip1; - if (ip2) { - *num_inodes = 4; - i_tab[3] = ip2; - } else { - *num_inodes = 3; - i_tab[3] = NULL; - } - - /* - * Sort the elements via bubble sort. (Remember, there are at - * most 4 elements to sort, so this is adequate.) - */ - for (i = 0; i < *num_inodes; i++) { - for (j = 1; j < *num_inodes; j++) { - if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - temp = i_tab[j]; - i_tab[j] = i_tab[j-1]; - i_tab[j-1] = temp; - } - } - } -} - -/* - * xfs_rename - */ -int -xfs_rename( - xfs_inode_t *src_dp, - struct xfs_name *src_name, - xfs_inode_t *src_ip, - xfs_inode_t *target_dp, - struct xfs_name *target_name, - xfs_inode_t *target_ip) -{ - xfs_trans_t *tp = NULL; - xfs_mount_t *mp = src_dp->i_mount; - int new_parent; /* moving to a new dir */ - int src_is_directory; /* src_name is a directory */ - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - xfs_inode_t *inodes[4]; - int spaceres; - int num_inodes; - - trace_xfs_rename(src_dp, target_dp, src_name, target_name); - - new_parent = (src_dp != target_dp); - src_is_directory = S_ISDIR(src_ip->i_d.di_mode); - - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, - inodes, &num_inodes); - - xfs_bmap_init(&free_list, &first_block); - tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); - if (error == ENOSPC) { - spaceres = 0; - error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); - } - if (error) { - xfs_trans_cancel(tp, 0); - goto std_return; - } - - /* - * Attach the dquots to the inodes - */ - error = xfs_qm_vop_rename_dqattach(inodes); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } - - /* - * Lock all the participating inodes. Depending upon whether - * the target_name exists in the target directory, and - * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. - */ - xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); - - /* - * Join all the inodes to the transaction. From this point on, - * we can rely on either trans_commit or trans_cancel to unlock - * them. - */ - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); - if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); - if (target_ip) - xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); - - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. 
- */ - if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { - error = XFS_ERROR(EXDEV); - goto error_return; - } - - /* - * Set up the target. - */ - if (target_ip == NULL) { - /* - * If there's no space reservation, check the entry will - * fit before actually inserting it. - */ - error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); - if (error) - goto error_return; - /* - * If target does not exist and the rename crosses - * directories, adjust the target directory link count - * to account for the ".." reference from the new entry. - */ - error = xfs_dir_createname(tp, target_dp, target_name, - src_ip->i_ino, &first_block, - &free_list, spaceres); - if (error == ENOSPC) - goto error_return; - if (error) - goto abort_return; - - xfs_trans_ichgtime(tp, target_dp, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - if (new_parent && src_is_directory) { - error = xfs_bumplink(tp, target_dp); - if (error) - goto abort_return; - } - } else { /* target_ip != NULL */ - /* - * If target exists and it's a directory, check that both - * target and source are directories and that target can be - * destroyed, or that neither is a directory. - */ - if (S_ISDIR(target_ip->i_d.di_mode)) { - /* - * Make sure target dir is empty. - */ - if (!(xfs_dir_isempty(target_ip)) || - (target_ip->i_d.di_nlink > 2)) { - error = XFS_ERROR(EEXIST); - goto error_return; - } - } - - /* - * Link the source inode under the target name. - * If the source inode is a directory and we are moving - * it across directories, its ".." entry will be - * inconsistent until we replace that down below. - * - * In case there is already an entry with the same - * name at the destination directory, remove it first. - */ - error = xfs_dir_replace(tp, target_dp, target_name, - src_ip->i_ino, - &first_block, &free_list, spaceres); - if (error) - goto abort_return; - - xfs_trans_ichgtime(tp, target_dp, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* - * Decrement the link count on the target since the target - * dir no longer points to it. - */ - error = xfs_droplink(tp, target_ip); - if (error) - goto abort_return; - - if (src_is_directory) { - /* - * Drop the link from the old "." entry. - */ - error = xfs_droplink(tp, target_ip); - if (error) - goto abort_return; - } - } /* target_ip != NULL */ - - /* - * Remove the source. - */ - if (new_parent && src_is_directory) { - /* - * Rewrite the ".." entry to point to the new - * directory. - */ - error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, - target_dp->i_ino, - &first_block, &free_list, spaceres); - ASSERT(error != EEXIST); - if (error) - goto abort_return; - } - - /* - * We always want to hit the ctime on the source inode. - * - * This isn't strictly required by the standards since the source - * inode isn't really being changed, but old unix file systems did - * it and some incremental backup programs won't work without it. - */ - xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); - - /* - * Adjust the link count on src_dp. This is necessary when - * renaming a directory, either within one parent when - * the target existed, or across two parent directories. - */ - if (src_is_directory && (new_parent || target_ip != NULL)) { - - /* - * Decrement link count on src_directory since the - * entry that's moved no longer points to it. 
- */ - error = xfs_droplink(tp, src_dp); - if (error) - goto abort_return; - } - - error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, - &first_block, &free_list, spaceres); - if (error) - goto abort_return; - - xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); - if (new_parent) - xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - - /* - * If this is a synchronous mount, make sure that the - * rename transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - goto std_return; - } - - /* - * trans_commit will unlock src_ip, target_ip & decrement - * the vnode references. - */ - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, cancel_flags); - std_return: - return error; -} diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 98dc670d3ee0..6f9e63c9fc26 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -17,25 +17,24 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_rtalloc.h" #include "xfs_fsops.h" #include "xfs_error.h" #include "xfs_inode_item.h" #include "xfs_trans_space.h" -#include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_buf.h" #include "xfs_icache.h" @@ -101,10 +100,9 @@ xfs_growfs_rt_alloc( /* * Reserve space & log for one extent added to the file. */ - if ((error = xfs_trans_reserve(tp, resblks, - XFS_GROWRTALLOC_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_DEFAULT_PERM_LOG_COUNT))) + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, + resblks, 0); + if (error) goto error_cancel; cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* @@ -147,8 +145,9 @@ xfs_growfs_rt_alloc( /* * Reserve log for one block zeroing. */ - if ((error = xfs_trans_reserve(tp, 0, - XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, + 0, 0); + if (error) goto error_cancel; /* * Lock the bitmap inode. @@ -736,8 +735,8 @@ xfs_rtallocate_range( { xfs_rtblock_t end; /* end of the allocated extent */ int error; /* error value */ - xfs_rtblock_t postblock; /* first block allocated > end */ - xfs_rtblock_t preblock; /* first block allocated < start */ + xfs_rtblock_t postblock = 0; /* first block allocated > end */ + xfs_rtblock_t preblock = 0; /* first block allocated < start */ end = start + len - 1; /* @@ -1958,8 +1957,9 @@ xfs_growfs_rt( * Start a transaction, get the log reservation. */ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); - if ((error = xfs_trans_reserve(tp, 0, - XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree, + 0, 0); + if (error) goto error_cancel; /* * Lock out other callers by grabbing the bitmap inode lock. 
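The xfs_trans_reserve() rework visible in the xfs_rtalloc.c hunks above repeats one mechanical pattern across this series: the old call took the log reservation, permanence flags and log count as separate arguments computed from XFS_*_LOG_RES macros, while the new call takes a precomputed struct xfs_trans_res from the M_RES(mp) reservation table. A condensed before/after sketch using the names from the xfs_growfs_rt_alloc() hunk (error handling elided):

	/* before: reservation parameters passed piecemeal */
	error = xfs_trans_reserve(tp, resblks, XFS_GROWRTALLOC_LOG_RES(mp), 0,
				  XFS_TRANS_PERM_LOG_RES,
				  XFS_DEFAULT_PERM_LOG_COUNT);

	/* after: one struct xfs_trans_res carries logres, logcount and flags */
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, resblks, 0);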
@@ -2148,7 +2148,7 @@ xfs_rtfree_extent( ASSERT(mp->m_rbmip->i_itemp != NULL); ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); -#if defined(__KERNEL__) && defined(DEBUG) +#ifdef DEBUG /* * Check to see that this whole range is currently allocated. */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index f7f3a359c1c5..b2a1a24c0e2f 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -18,58 +18,11 @@ #ifndef __XFS_RTALLOC_H__ #define __XFS_RTALLOC_H__ +/* kernel only definitions and functions */ + struct xfs_mount; struct xfs_trans; -/* Min and max rt extent sizes, specified in bytes */ -#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ -#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ -#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ - -/* - * Constants for bit manipulations. - */ -#define XFS_NBBYLOG 3 /* log2(NBBY) */ -#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ -#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) -#define XFS_NBWORD (1 << XFS_NBWORDLOG) -#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) - -#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) -#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) -#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) -#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) - -/* - * Summary and bit manipulation macros. - */ -#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) -#define XFS_SUMOFFSTOBLOCK(mp,s) \ - (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) -#define XFS_SUMPTR(mp,bp,so) \ - ((xfs_suminfo_t *)((bp)->b_addr + \ - (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) - -#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) -#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) -#define XFS_BITTOWORD(mp,bi) \ - ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) - -#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) -#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) - -#define XFS_RTLOBIT(w) xfs_lowbit32(w) -#define XFS_RTHIBIT(w) xfs_highbit32(w) - -#if XFS_BIG_BLKNOS -#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) -#else -#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) -#endif - - -#ifdef __KERNEL__ - #ifdef CONFIG_XFS_RT /* * Function prototypes for exported functions. @@ -161,6 +114,4 @@ xfs_rtmount_init( # define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ -#endif /* __KERNEL__ */ - #endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c new file mode 100644 index 000000000000..a5b59d92eb70 --- /dev/null +++ b/fs/xfs/xfs_sb.c @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_fsops.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" + +/* + * Physical superblock buffer manipulations. Shared with libxfs in userspace. + */ + +static const struct { + short offset; + short type; /* 0 = integer + * 1 = binary / string (no translation) + */ +} xfs_sb_info[] = { + { offsetof(xfs_sb_t, sb_magicnum), 0 }, + { offsetof(xfs_sb_t, sb_blocksize), 0 }, + { offsetof(xfs_sb_t, sb_dblocks), 0 }, + { offsetof(xfs_sb_t, sb_rblocks), 0 }, + { offsetof(xfs_sb_t, sb_rextents), 0 }, + { offsetof(xfs_sb_t, sb_uuid), 1 }, + { offsetof(xfs_sb_t, sb_logstart), 0 }, + { offsetof(xfs_sb_t, sb_rootino), 0 }, + { offsetof(xfs_sb_t, sb_rbmino), 0 }, + { offsetof(xfs_sb_t, sb_rsumino), 0 }, + { offsetof(xfs_sb_t, sb_rextsize), 0 }, + { offsetof(xfs_sb_t, sb_agblocks), 0 }, + { offsetof(xfs_sb_t, sb_agcount), 0 }, + { offsetof(xfs_sb_t, sb_rbmblocks), 0 }, + { offsetof(xfs_sb_t, sb_logblocks), 0 }, + { offsetof(xfs_sb_t, sb_versionnum), 0 }, + { offsetof(xfs_sb_t, sb_sectsize), 0 }, + { offsetof(xfs_sb_t, sb_inodesize), 0 }, + { offsetof(xfs_sb_t, sb_inopblock), 0 }, + { offsetof(xfs_sb_t, sb_fname[0]), 1 }, + { offsetof(xfs_sb_t, sb_blocklog), 0 }, + { offsetof(xfs_sb_t, sb_sectlog), 0 }, + { offsetof(xfs_sb_t, sb_inodelog), 0 }, + { offsetof(xfs_sb_t, sb_inopblog), 0 }, + { offsetof(xfs_sb_t, sb_agblklog), 0 }, + { offsetof(xfs_sb_t, sb_rextslog), 0 }, + { offsetof(xfs_sb_t, sb_inprogress), 0 }, + { offsetof(xfs_sb_t, sb_imax_pct), 0 }, + { offsetof(xfs_sb_t, sb_icount), 0 }, + { offsetof(xfs_sb_t, sb_ifree), 0 }, + { offsetof(xfs_sb_t, sb_fdblocks), 0 }, + { offsetof(xfs_sb_t, sb_frextents), 0 }, + { offsetof(xfs_sb_t, sb_uquotino), 0 }, + { offsetof(xfs_sb_t, sb_gquotino), 0 }, + { offsetof(xfs_sb_t, sb_qflags), 0 }, + { offsetof(xfs_sb_t, sb_flags), 0 }, + { offsetof(xfs_sb_t, sb_shared_vn), 0 }, + { offsetof(xfs_sb_t, sb_inoalignmt), 0 }, + { offsetof(xfs_sb_t, sb_unit), 0 }, + { offsetof(xfs_sb_t, sb_width), 0 }, + { offsetof(xfs_sb_t, sb_dirblklog), 0 }, + { offsetof(xfs_sb_t, sb_logsectlog), 0 }, + { offsetof(xfs_sb_t, sb_logsectsize), 0 }, + { offsetof(xfs_sb_t, sb_logsunit), 0 }, + { offsetof(xfs_sb_t, sb_features2), 0 }, + { offsetof(xfs_sb_t, sb_bad_features2), 0 }, + { offsetof(xfs_sb_t, sb_features_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_ro_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_incompat), 0 }, + { offsetof(xfs_sb_t, sb_features_log_incompat), 0 }, + { offsetof(xfs_sb_t, sb_crc), 0 }, + { offsetof(xfs_sb_t, sb_pad), 0 }, + { offsetof(xfs_sb_t, sb_pquotino), 0 }, + { offsetof(xfs_sb_t, sb_lsn), 0 }, + { sizeof(xfs_sb_t), 0 } +}; + +/* + * Reference counting access wrappers to the 
perag structures. + * Because we never free per-ag structures, the only thing we + * have to protect against changes is the tree structure itself. + */ +struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put( + struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +/* + * Check the validity of the SB found. + */ +STATIC int +xfs_mount_validate_sb( + xfs_mount_t *mp, + xfs_sb_t *sbp, + bool check_inprogress, + bool check_version) +{ + + /* + * If the log device and data device have the + * same device number, the log is internal. + * Consequently, the sb_logstart should be non-zero. If + * we have a zero sb_logstart in this case, we may be trying to mount + * a volume filesystem in a non-volume manner. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + xfs_warn(mp, "bad magic number"); + return XFS_ERROR(EWRONGFS); + } + + + if (!xfs_sb_good_version(sbp)) { + xfs_warn(mp, "bad version"); + return XFS_ERROR(EWRONGFS); + } + + /* + * Version 5 superblock feature mask validation. Reject combinations the + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. + */ + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + xfs_alert(mp, +"Version 5 superblock detected. 
This kernel has EXPERIMENTAL support enabled!\n" +"Use of these features in this kernel is at your own risk!"); + + if (xfs_sb_has_compat_feature(sbp, + XFS_SB_FEAT_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown compatible features (0x%x) enabled.\n" +"Using a more recent kernel is recommended.", + (sbp->sb_features_compat & + XFS_SB_FEAT_COMPAT_UNKNOWN)); + } + + if (xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_alert(mp, +"Superblock has unknown read-only compatible features (0x%x) enabled.", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, +"Attempted to mount read-only compatible filesystem read-write.\n" +"Filesystem can only be safely mounted read only."); + return XFS_ERROR(EINVAL); + } + } + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown incompatible features (0x%x) enabled.\n" +"Filesystem cannot be safely mounted by this kernel.", + (sbp->sb_features_incompat & + XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + return XFS_ERROR(EINVAL); + } + } + + if (xfs_sb_version_has_pquotino(sbp)) { + if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { + xfs_notice(mp, + "Version 5 superblock has XFS_OQUOTA bits.\n"); + return XFS_ERROR(EFSCORRUPTED); + } + } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | + XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { + xfs_notice(mp, +"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.\n"); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely( + sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an external log; " + "specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + if (unlikely( + sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an internal log; " + "do not specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + /* + * More sanity checking. Most of these were stolen directly from + * xfs_repair. + */ + if (unlikely( + sbp->sb_agcount <= 0 || + sbp->sb_sectsize < XFS_MIN_SECTORSIZE || + sbp->sb_sectsize > XFS_MAX_SECTORSIZE || + sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || + sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || + sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || + sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || + sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || + sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || + sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || + sbp->sb_inodelog < XFS_DINODE_MIN_LOG || + sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || + (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || + (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || + sbp->sb_dblocks == 0 || + sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || + sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { + XFS_CORRUPTION_ERROR("SB sanity check failed", + XFS_ERRLEVEL_LOW, mp, sbp); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Until this is fixed only page-sized or smaller data blocks work.
+ */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + xfs_warn(mp, + "File system with blocksize %d bytes. " + "Only pagesize (%ld) or less will currently work.", + sbp->sb_blocksize, PAGE_SIZE); + return XFS_ERROR(ENOSYS); + } + + /* + * Currently only very few inode sizes are supported. + */ + switch (sbp->sb_inodesize) { + case 256: + case 512: + case 1024: + case 2048: + break; + default: + xfs_warn(mp, "inode size of %d bytes not supported", + sbp->sb_inodesize); + return XFS_ERROR(ENOSYS); + } + + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || + xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { + xfs_warn(mp, + "file system too large to be mounted on this system."); + return XFS_ERROR(EFBIG); + } + + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Version 1 directory format has never worked on Linux. + */ + if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { + xfs_warn(mp, "file system using version 1 directory format"); + return XFS_ERROR(ENOSYS); + } + + return 0; +} + +void +xfs_sb_quota_from_disk(struct xfs_sb *sbp) +{ + /* + * older mkfs doesn't initialize quota inodes to NULLFSINO. This + * leaves two different in-core values meaning that a quota inode + * is invalid: 0 and NULLFSINO. Normalize them to the single value + * NULLFSINO. + * + * Note that this change affects only the in-core values. These + * values are not written back to disk unless any quota information + * is written to the disk. Even in that case, the sb_pquotino field + * is not written to disk unless the superblock supports pquotino. + */ + if (sbp->sb_uquotino == 0) + sbp->sb_uquotino = NULLFSINO; + if (sbp->sb_gquotino == 0) + sbp->sb_gquotino = NULLFSINO; + if (sbp->sb_pquotino == 0) + sbp->sb_pquotino = NULLFSINO; + + /* + * We need to do these manipulations only if we are working + * with an older version of the on-disk superblock. + */ + if (xfs_sb_version_has_pquotino(sbp)) + return; + + if (sbp->sb_qflags & XFS_OQUOTA_ENFD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; + if (sbp->sb_qflags & XFS_OQUOTA_CHKD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; + sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); + + if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + /* + * In older versions of the superblock, the on-disk superblock + * only has sb_gquotino, while the in-core superblock has both + * sb_gquotino and sb_pquotino. But only one of them is + * supported at any point in time. So, if PQUOTA is set in the + * disk superblock, copy over sb_gquotino to sb_pquotino.
+ */ + sbp->sb_pquotino = sbp->sb_gquotino; + sbp->sb_gquotino = NULLFSINO; + } +} + +void +xfs_sb_from_disk( + struct xfs_sb *to, + xfs_dsb_t *from) +{ + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); + to->sb_blocksize = be32_to_cpu(from->sb_blocksize); + to->sb_dblocks = be64_to_cpu(from->sb_dblocks); + to->sb_rblocks = be64_to_cpu(from->sb_rblocks); + to->sb_rextents = be64_to_cpu(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = be64_to_cpu(from->sb_logstart); + to->sb_rootino = be64_to_cpu(from->sb_rootino); + to->sb_rbmino = be64_to_cpu(from->sb_rbmino); + to->sb_rsumino = be64_to_cpu(from->sb_rsumino); + to->sb_rextsize = be32_to_cpu(from->sb_rextsize); + to->sb_agblocks = be32_to_cpu(from->sb_agblocks); + to->sb_agcount = be32_to_cpu(from->sb_agcount); + to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); + to->sb_logblocks = be32_to_cpu(from->sb_logblocks); + to->sb_versionnum = be16_to_cpu(from->sb_versionnum); + to->sb_sectsize = be16_to_cpu(from->sb_sectsize); + to->sb_inodesize = be16_to_cpu(from->sb_inodesize); + to->sb_inopblock = be16_to_cpu(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = be64_to_cpu(from->sb_icount); + to->sb_ifree = be64_to_cpu(from->sb_ifree); + to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); + to->sb_frextents = be64_to_cpu(from->sb_frextents); + to->sb_uquotino = be64_to_cpu(from->sb_uquotino); + to->sb_gquotino = be64_to_cpu(from->sb_gquotino); + to->sb_qflags = be16_to_cpu(from->sb_qflags); + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); + to->sb_unit = be32_to_cpu(from->sb_unit); + to->sb_width = be32_to_cpu(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); + to->sb_logsunit = be32_to_cpu(from->sb_logsunit); + to->sb_features2 = be32_to_cpu(from->sb_features2); + to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); + to->sb_features_compat = be32_to_cpu(from->sb_features_compat); + to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); + to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); + to->sb_features_log_incompat = + be32_to_cpu(from->sb_features_log_incompat); + to->sb_pad = 0; + to->sb_pquotino = be64_to_cpu(from->sb_pquotino); + to->sb_lsn = be64_to_cpu(from->sb_lsn); +} + +static inline void +xfs_sb_quota_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t *fields) +{ + __uint16_t qflags = from->sb_qflags; + + /* + * We need to do these manipulations only if we are working + * with an older version of the on-disk superblock. + */ + if (xfs_sb_version_has_pquotino(from)) + return; + + if (*fields & XFS_SB_QFLAGS) { + /* + * The in-core version of sb_qflags does not have + * XFS_OQUOTA_* flags, whereas the on-disk version + * does. So, convert incore XFS_{PG}QUOTA_* flags + * to on-disk XFS_OQUOTA_* flags.
+ */ + qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | + XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); + + if (from->sb_qflags & + (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) + qflags |= XFS_OQUOTA_ENFD; + if (from->sb_qflags & + (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) + qflags |= XFS_OQUOTA_CHKD; + to->sb_qflags = cpu_to_be16(qflags); + *fields &= ~XFS_SB_QFLAGS; + } + + /* + * GQUOTINO and PQUOTINO cannot be used together in versions + * of superblock that do not have pquotino. from->sb_flags + * tells us which quota is active and should be copied to + * disk. + */ + if ((*fields & XFS_SB_GQUOTINO) && + (from->sb_qflags & XFS_GQUOTA_ACCT)) + to->sb_gquotino = cpu_to_be64(from->sb_gquotino); + else if ((*fields & XFS_SB_PQUOTINO) && + (from->sb_qflags & XFS_PQUOTA_ACCT)) + to->sb_gquotino = cpu_to_be64(from->sb_pquotino); + + *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); +} + +/* + * Copy in core superblock to ondisk one. + * + * The fields argument is mask of superblock fields to copy. + */ +void +xfs_sb_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t fields) +{ + xfs_caddr_t to_ptr = (xfs_caddr_t)to; + xfs_caddr_t from_ptr = (xfs_caddr_t)from; + xfs_sb_field_t f; + int first; + int size; + + ASSERT(fields); + if (!fields) + return; + + xfs_sb_quota_to_disk(to, from, &fields); + while (fields) { + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + first = xfs_sb_info[f].offset; + size = xfs_sb_info[f + 1].offset - first; + + ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); + + if (size == 1 || xfs_sb_info[f].type == 1) { + memcpy(to_ptr + first, from_ptr + first, size); + } else { + switch (size) { + case 2: + *(__be16 *)(to_ptr + first) = + cpu_to_be16(*(__u16 *)(from_ptr + first)); + break; + case 4: + *(__be32 *)(to_ptr + first) = + cpu_to_be32(*(__u32 *)(from_ptr + first)); + break; + case 8: + *(__be64 *)(to_ptr + first) = + cpu_to_be64(*(__u64 *)(from_ptr + first)); + break; + default: + ASSERT(0); + } + } + + fields &= ~(1LL << f); + } +} + +static int +xfs_sb_verify( + struct xfs_buf *bp, + bool check_version) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); +} + +/* + * If the superblock has the CRC feature bit set or the CRC field is non-null, + * check that the CRC is valid. We check the CRC field is non-null because a + * single bit error could clear the feature bit and unused parts of the + * superblock are supposed to be zero. Hence a non-null crc field indicates that + * we've potentially lost a feature bit and we should check it anyway. 
+ */ +static void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + int error; + + /* + * open code the version check to avoid needing to convert the entire + * superblock from disk order just to check the version number + */ + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && + (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == + XFS_SB_VERSION_5) || + dsb->sb_crc != 0)) { + + if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize), + offsetof(struct xfs_sb, sb_crc))) { + error = EFSCORRUPTED; + goto out_error; + } + } + error = xfs_sb_verify(bp, true); + +out_error: + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + mp, bp->b_addr); + xfs_buf_ioerror(bp, error); + } +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, then run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +static void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + + + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { + /* XFS filesystem, verify noisily! */ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, EWRONGFS); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + int error; + + error = xfs_sb_verify(bp, false); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + mp, bp->b_addr); + xfs_buf_ioerror(bp, error); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_sb, sb_crc)); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +/* + * xfs_mount_common + * + * Mount initialization code establishing various mount + * fields from the superblock associated with the given + * mount structure + */ +void +xfs_sb_mount_common( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + mp->m_agfrotor = mp->m_agirotor = 0; + spin_lock_init(&mp->m_agirotor_lock); + mp->m_maxagi = mp->m_sb.sb_agcount; + mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; + mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; + mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; + mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; + mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; + mp->m_blockmask = sbp->sb_blocksize - 1; + mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; + mp->m_blockwmask = mp->m_blockwsize - 1; + + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; + mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; + + mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; + mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; + + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); + 
mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; + mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); + mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + sbp->sb_inopblock); + mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +int +xfs_initialize_perag_data( + struct xfs_mount *mp, + xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + int error; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the information we need and populates the + * per-ag structures for us. + */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = xfs_perag_get(mp, index); + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + /* + * Overwrite incore superblock counters with just-read data + */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = bfree + bfreelst + btree; + spin_unlock(&mp->m_sb_lock); + + /* Fixup the per-cpu counters as well. */ + xfs_icsb_reinit_counters(mp); + + return 0; +} + +/* + * xfs_mod_sb() can be used to copy arbitrary changes to the + * in-core superblock into the superblock buffer to be logged. + * It does not provide the higher level of locking that is + * needed to protect the in-core superblock from concurrent + * access. 
+ */ +void +xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) +{ + xfs_buf_t *bp; + int first; + int last; + xfs_mount_t *mp; + xfs_sb_field_t f; + + ASSERT(fields); + if (!fields) + return; + mp = tp->t_mountp; + bp = xfs_trans_getsb(tp, mp, 0); + first = sizeof(xfs_sb_t); + last = 0; + + /* translate/copy */ + + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); + + /* find modified range */ + f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + last = xfs_sb_info[f + 1].offset - 1; + + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + first = xfs_sb_info[f].offset; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(tp, bp, first, last); +} diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 78f9e70b80c7..6835b44f850e 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -26,6 +26,7 @@ struct xfs_buf; struct xfs_mount; +struct xfs_trans; #define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ #define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ @@ -83,11 +84,13 @@ struct xfs_mount; #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ #define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ +#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ #define XFS_SB_VERSION2_OKREALFBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ XFS_SB_VERSION2_ATTR2BIT | \ - XFS_SB_VERSION2_PROJID32BIT) + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_FTYPE) #define XFS_SB_VERSION2_OKSASHFBITS \ (0) #define XFS_SB_VERSION2_OKREALBITS \ @@ -354,15 +357,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp) (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) return 0; -#ifdef __KERNEL__ if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) return 0; -#else - if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) && - sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) - return 0; -#endif - return 1; } if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) @@ -554,12 +550,13 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); } -static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) +static inline void xfs_sb_version_addprojid32bit(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; + sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT; } - /* * Extended v5 superblock feature masks. These are to be used for new v5 * superblock features only. 
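The new xfs_sb_version_addprojid32bit() helper shown above sets the MOREBITS version flag and mirrors the feature into both sb_features2 and sb_bad_features2, keeping superblocks written by kernels with the features2 offset bug self-consistent. A hedged sketch of a caller-side upgrade under a transaction (the XFS_SB_* field masks are assumed to be declared in this header; the transaction setup around it is not shown):

	/* upgrade an older superblock to 32-bit project IDs, then log it */
	if (!xfs_sb_version_hasprojid32bit(&mp->m_sb)) {
		spin_lock(&mp->m_sb_lock);
		xfs_sb_version_addprojid32bit(&mp->m_sb);
		spin_unlock(&mp->m_sb_lock);
		xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2 |
			       XFS_SB_BAD_FEATURES2);
	}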
@@ -598,7 +595,10 @@ xfs_sb_has_ro_compat_feature( return (sbp->sb_features_ro_compat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_ALL \ + (XFS_SB_FEAT_INCOMPAT_FTYPE) + #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool xfs_sb_has_incompat_feature( @@ -618,16 +618,39 @@ xfs_sb_has_incompat_log_feature( return (sbp->sb_features_log_incompat & feature) != 0; } -static inline bool -xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +/* + * V5 superblock specific feature checks + */ +static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) { - return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); } /* * end of superblock version macros */ +static inline bool +xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +{ + return (ino == sbp->sb_uquotino || + ino == sbp->sb_gquotino || + ino == sbp->sb_pquotino); +} + #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) #define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) @@ -660,4 +683,23 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) #define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) +/* + * perag get/put wrappers for ref counting + */ +extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); +extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, + int tag); +extern void xfs_perag_put(struct xfs_perag *pag); +extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); + +extern void xfs_sb_calc_crc(struct xfs_buf *); +extern void xfs_mod_sb(struct xfs_trans *, __int64_t); +extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); +extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); + +extern const struct xfs_buf_ops xfs_sb_buf_ops; +extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; + #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1d68ffcdeaa7..979a77d4b87d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -17,12 +17,12 @@ */ #include "xfs.h" +#include "xfs_format.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" #include "xfs_quota.h" #include "xfs_mount.h" @@ -40,12 +40,12 @@ #include "xfs_fsops.h" #include "xfs_attr.h" #include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_vnodeops.h" #include "xfs_log_priv.h" #include "xfs_trans_priv.h" #include "xfs_filestream.h" #include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" @@ -421,12 +421,6 @@ xfs_parseargs( } #endif - if ((mp->m_qflags & (XFS_GQUOTA_ACCT | 
XFS_GQUOTA_ACTIVE)) && - (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { - xfs_warn(mp, "cannot mount with both project and group quota"); - return EINVAL; - } - if ((dsunit && !dswidth) || (!dsunit && dswidth)) { xfs_warn(mp, "sunit and swidth must be specified together"); return EINVAL; @@ -556,14 +550,13 @@ xfs_showargs( else if (mp->m_qflags & XFS_UQUOTA_ACCT) seq_puts(m, "," MNTOPT_UQUOTANOENF); - /* Either project or group quotas can be active, not both */ - if (mp->m_qflags & XFS_PQUOTA_ACCT) { if (mp->m_qflags & XFS_PQUOTA_ENFD) seq_puts(m, "," MNTOPT_PRJQUOTA); else seq_puts(m, "," MNTOPT_PQUOTANOENF); - } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { + } + if (mp->m_qflags & XFS_GQUOTA_ACCT) { if (mp->m_qflags & XFS_GQUOTA_ENFD) seq_puts(m, "," MNTOPT_GRPQUOTA); else @@ -870,17 +863,17 @@ xfs_init_mount_workqueues( goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_NON_REENTRANT, 0, mp->m_fsname); + 0, 0, mp->m_fsname); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - WQ_NON_REENTRANT, 0, mp->m_fsname); + 0, 0, mp->m_fsname); if (!mp->m_log_workqueue) goto out_destroy_reclaim; mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_NON_REENTRANT, 0, mp->m_fsname); + 0, 0, mp->m_fsname); if (!mp->m_eofblocks_workqueue) goto out_destroy_log; @@ -1396,6 +1389,14 @@ xfs_finish_flags( return XFS_ERROR(EROFS); } + if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && + (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) && + !xfs_sb_version_has_pquotino(&mp->m_sb)) { + xfs_warn(mp, + "Super block does not support project and group quota together"); + return XFS_ERROR(EINVAL); + } + return 0; } diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index f4895b662fcb..2f2a7c005be2 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -18,200 +18,29 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_mount.h" #include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2.h" #include "xfs_bmap_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" #include "xfs_error.h" #include "xfs_quota.h" -#include "xfs_utils.h" #include "xfs_trans_space.h" -#include "xfs_log_priv.h" #include "xfs_trace.h" #include "xfs_symlink.h" -#include "xfs_cksum.h" -#include "xfs_buf_item.h" - - -/* - * Each contiguous block has a header, so it is not just a simple pathlen - * to FSB conversion. 
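The conversion just below is a plain ceiling division once the per-block header is subtracted. A worked number, assuming a 4096-byte block and the 56-byte struct xfs_dsymlink_hdr that v5 (CRC) filesystems prepend to each remote symlink block; on non-CRC filesystems buflen is simply the block size:

#include <stdio.h>

int main(void)
{
	int blocksize = 4096;
	int hdr = 56;			/* sizeof(struct xfs_dsymlink_hdr) */
	int buflen = blocksize - hdr;	/* usable path bytes per block */
	int pathlen = 1024;		/* MAXPATHLEN-sized symlink target */

	printf("%d block(s)\n", (pathlen + buflen - 1) / buflen);	/* 1 */
	return 0;
}

At the minimum 512-byte block size the same division yields 3 blocks for a 1024-byte path, which is where the XFS_SYMLINK_MAPS value of 3 in xfs_symlink.h comes from.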
- */ -int -xfs_symlink_blocks( - struct xfs_mount *mp, - int pathlen) -{ - int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - - return (pathlen + buflen - 1) / buflen; -} - -static int -xfs_symlink_hdr_set( - struct xfs_mount *mp, - xfs_ino_t ino, - uint32_t offset, - uint32_t size, - struct xfs_buf *bp) -{ - struct xfs_dsymlink_hdr *dsl = bp->b_addr; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return 0; - - dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); - dsl->sl_offset = cpu_to_be32(offset); - dsl->sl_bytes = cpu_to_be32(size); - uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); - dsl->sl_owner = cpu_to_be64(ino); - dsl->sl_blkno = cpu_to_be64(bp->b_bn); - bp->b_ops = &xfs_symlink_buf_ops; - - return sizeof(struct xfs_dsymlink_hdr); -} - -/* - * Checking of the symlink header is split into two parts. the verifier does - * CRC, location and bounds checking, the unpacking function checks the path - * parameters and owner. - */ -bool -xfs_symlink_hdr_ok( - struct xfs_mount *mp, - xfs_ino_t ino, - uint32_t offset, - uint32_t size, - struct xfs_buf *bp) -{ - struct xfs_dsymlink_hdr *dsl = bp->b_addr; - - if (offset != be32_to_cpu(dsl->sl_offset)) - return false; - if (size != be32_to_cpu(dsl->sl_bytes)) - return false; - if (ino != be64_to_cpu(dsl->sl_owner)) - return false; - - /* ok */ - return true; -} - -static bool -xfs_symlink_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dsymlink_hdr *dsl = bp->b_addr; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) - return false; - if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) - return false; - if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) - return false; - if (be32_to_cpu(dsl->sl_offset) + - be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) - return false; - if (dsl->sl_owner == 0) - return false; - - return true; -} - -static void -xfs_symlink_read_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - - /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_dsymlink_hdr, sl_crc)) || - !xfs_symlink_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } -} - -static void -xfs_symlink_write_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; - - /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - if (!xfs_symlink_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - return; - } - - if (bip) { - struct xfs_dsymlink_hdr *dsl = bp->b_addr; - dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); - } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_dsymlink_hdr, sl_crc)); -} - -const struct xfs_buf_ops xfs_symlink_buf_ops = { - .verify_read = xfs_symlink_read_verify, - .verify_write = xfs_symlink_write_verify, -}; - -void -xfs_symlink_local_to_remote( - struct xfs_trans *tp, - struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp) -{ - struct xfs_mount *mp = ip->i_mount; - char *buf; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) { - bp->b_ops = NULL; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); - return; - } - - /* - * As this symlink fits in an inode literal area, it must also fit in - * the 
smallest buffer the filesystem supports. - */ - ASSERT(BBTOB(bp->b_length) >= - ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); - - bp->b_ops = &xfs_symlink_buf_ops; - - buf = bp->b_addr; - buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); - memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); -} /* ----- Kernel only functions below ----- */ STATIC int @@ -386,8 +215,11 @@ xfs_symlink( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); + error = xfs_qm_vop_dqalloc(dp, + xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); if (error) goto std_return; @@ -402,12 +234,10 @@ xfs_symlink( else fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); if (error == ENOSPC && fs_blocks == 0) { resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); } if (error) { cancel_flags = 0; @@ -710,8 +540,8 @@ xfs_inactive_symlink_rmt( * Put an itruncate log reservation in the new transaction * for our caller. */ - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); goto error0; } diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index 374394880c01..99338ba666ac 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -17,50 +17,11 @@ #ifndef __XFS_SYMLINK_H #define __XFS_SYMLINK_H 1 -struct xfs_mount; -struct xfs_trans; -struct xfs_inode; -struct xfs_buf; -struct xfs_ifork; -struct xfs_name; - -#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ - -struct xfs_dsymlink_hdr { - __be32 sl_magic; - __be32 sl_offset; - __be32 sl_bytes; - __be32 sl_crc; - uuid_t sl_uuid; - __be64 sl_owner; - __be64 sl_blkno; - __be64 sl_lsn; -}; - -/* - * The maximum pathlen is 1024 bytes. Since the minimum file system - * blocksize is 512 bytes, we can get a max of 3 extents back from - * bmapi when crc headers are taken into account. - */ -#define XFS_SYMLINK_MAPS 3 - -#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? 
\
-			 sizeof(struct xfs_dsymlink_hdr) : 0))
-
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
-			struct xfs_inode *ip, struct xfs_ifork *ifp);
-
-extern const struct xfs_buf_ops xfs_symlink_buf_ops;
-
-#ifdef __KERNEL__
+/* Kernel only symlink definitions */

 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, umode_t mode, struct xfs_inode **ipp);
 int xfs_readlink(struct xfs_inode *ip, char *link);
 int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
-#endif /* __KERNEL__ */

 #endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
new file mode 100644
index 000000000000..01c85e3f6470
--- /dev/null
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+	struct xfs_mount *mp,
+	int		pathlen)
+{
+	int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+	return (pathlen + buflen - 1) / buflen;
+}
+
+int
+xfs_symlink_hdr_set(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return 0;
+
+	dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+	dsl->sl_offset = cpu_to_be32(offset);
+	dsl->sl_bytes = cpu_to_be32(size);
+	uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+	dsl->sl_owner = cpu_to_be64(ino);
+	dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+	bp->b_ops = &xfs_symlink_buf_ops;
+
+	return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts. The verifier does
+ * CRC, location and bounds checking, the unpacking function checks the path
+ * parameters and owner.
+ */ +bool +xfs_symlink_hdr_ok( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (offset != be32_to_cpu(dsl->sl_offset)) + return false; + if (size != be32_to_cpu(dsl->sl_bytes)) + return false; + if (ino != be64_to_cpu(dsl->sl_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_symlink_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + return false; + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + return false; + if (be32_to_cpu(dsl->sl_offset) + + be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + return false; + if (dsl->sl_owner == 0) + return false; + + return true; +} + +static void +xfs_symlink_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_dsymlink_hdr, sl_crc)) || + !xfs_symlink_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_symlink_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_symlink_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (bip) { + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_dsymlink_hdr, sl_crc)); +} + +const struct xfs_buf_ops xfs_symlink_buf_ops = { + .verify_read = xfs_symlink_read_verify, + .verify_write = xfs_symlink_write_verify, +}; + +void +xfs_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + struct xfs_mount *mp = ip->i_mount; + char *buf; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + return; + } + + /* + * As this symlink fits in an inode literal area, it must also fit in + * the smallest buffer the filesystem supports. 
+ */ + ASSERT(BBTOB(bp->b_length) >= + ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); + + bp->b_ops = &xfs_symlink_buf_ops; + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); + memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); +} diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index b6e3897c1d9f..5d7b3e40705f 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -18,6 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" +#include "xfs_format.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 35a229981354..5411e01ab452 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -18,7 +18,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -49,629 +49,6 @@ kmem_zone_t *xfs_trans_zone; kmem_zone_t *xfs_log_item_desc_zone; /* - * A buffer has a format structure overhead in the log in addition - * to the data, so we need to take this into account when reserving - * space in a transaction for a buffer. Round the space required up - * to a multiple of 128 bytes so that we don't change the historical - * reservation that has been used for this overhead. - */ -STATIC uint -xfs_buf_log_overhead(void) -{ - return round_up(sizeof(struct xlog_op_header) + - sizeof(struct xfs_buf_log_format), 128); -} - -/* - * Calculate out transaction log reservation per item in bytes. - * - * The nbufs argument is used to indicate the number of items that - * will be changed in a transaction. size is used to tell how many - * bytes should be reserved per item. - */ -STATIC uint -xfs_calc_buf_res( - uint nbufs, - uint size) -{ - return nbufs * (size + xfs_buf_log_overhead()); -} - -/* - * Various log reservation values. - * - * These are based on the size of the file system block because that is what - * most transactions manipulate. Each adds in an additional 128 bytes per - * item logged to try to account for the overhead of the transaction mechanism. - * - * Note: Most of the reservations underestimate the number of allocation - * groups into which they could free extents in the xfs_bmap_finish() call. - * This is because the number in the worst case is quite high and quite - * unusual. In order to fix this we need to change xfs_bmap_finish() to free - * extents in only a single AG at a time. This will require changes to the - * EFI code as well, however, so that the EFI for the extents not freed is - * logged again in each transaction. See SGI PV #261917. - * - * Reservation functions here avoid a huge stack in xfs_trans_init due to - * register overflow from temporaries in the calculations. - */ - - -/* - * In a write transaction we can allocate a maximum of 2 - * extents. 
This gives: - * the inode getting the new extents: inode size - * the inode's bmap btree: max depth * block size - * the agfs of the ags from which the extents are allocated: 2 * sector - * the superblock free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: - * the agfs of the ags containing the blocks: 2 * sector size - * the agfls of the ags containing the blocks: 2 * sector size - * the super block free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_write_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), - XFS_FSB_TO_B(mp, 1)))); -} - -/* - * In truncating a file we free up to two extents at once. We can modify: - * the inode being truncated: inode size - * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -STATIC uint -xfs_calc_itruncate_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(5, 0) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + - mp->m_in_maxlevels, 0))); -} - -/* - * In renaming a files we can modify: - * the four inodes involved: 4 * inode size - * the two directory btrees: 2 * (max depth + v2) * dir block size - * the two directory bmap btrees: 2 * max depth * block size - * And the bmap_finish transaction can free dir and bmap blocks (two sets - * of bmap blocks) giving: - * the agf for the ags in which the blocks live: 3 * sector size - * the agfl for the ags in which the blocks live: 3 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_rename_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), - XFS_FSB_TO_B(mp, 1)))); -} - -/* - * For creating a link to an inode: - * the parent directory inode: inode size - * the linked inode: inode size - * the directory btree could split: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - 
* And the bmap_finish transaction can free some bmap blocks giving: - * the agf for the ag in which the blocks live: sector size - * the agfl for the ag in which the blocks live: sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_link_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)))); -} - -/* - * For removing a directory entry we can modify: - * the parent directory inode: inode size - * the removed inode: inode size - * the directory btree could join: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - * And the bmap_finish transaction can free the dir and bmap blocks giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_remove_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), - XFS_FSB_TO_B(mp, 1)))); -} - -/* - * For create, break it in to the two cases that the transaction - * covers. We start with the modify case - allocation done by modification - * of the state of existing inodes - and the allocation case. 
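Because a create transaction hits either the worst-case modify path or the worst-case allocation path, never both at once, the two legs described above are combined with MAX() rather than added, plus the dquot reservation on top. A reduced sketch of that shape with placeholder sizes (the real values come from the xfs_calc_buf_res() sums in this file):

#include <stdio.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))

/* placeholder legs, not real XFS geometry */
static unsigned int create_resv_modify(void) { return 4096; }	/* existing inodes + dir blocks */
static unsigned int create_resv_alloc(void)  { return 16384; }	/* new inode chunk + btrees */

int main(void)
{
	unsigned int dquot_logres = 512;	/* placeholder for XFS_DQUOT_LOGRES(mp) */

	printf("%u bytes\n", dquot_logres +
	       MAX(create_resv_alloc(), create_resv_modify()));	/* 16896 */
	return 0;
}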
- */ - -/* - * For create we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: block size - * the superblock for the nlink flag: sector size - * the directory btree: (max depth + v2) * dir block size - * the directory inode's bmap btree: (max depth + v2) * block size - */ -STATIC uint -xfs_calc_create_resv_modify( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - (uint)XFS_FSB_TO_B(mp, 1) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); -} - -/* - * For create we can allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -STATIC uint -xfs_calc_create_resv_alloc( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - mp->m_sb.sb_sectsize + - xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); -} - -STATIC uint -__xfs_calc_create_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX(xfs_calc_create_resv_alloc(mp), - xfs_calc_create_resv_modify(mp)); -} - -/* - * For icreate we can allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the superblock for the nlink flag: sector size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -STATIC uint -xfs_calc_icreate_resv_alloc( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - mp->m_sb.sb_sectsize + - xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); -} - -STATIC uint -xfs_calc_icreate_reservation(xfs_mount_t *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX(xfs_calc_icreate_resv_alloc(mp), - xfs_calc_create_resv_modify(mp)); -} - -STATIC uint -xfs_calc_create_reservation( - struct xfs_mount *mp) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return xfs_calc_icreate_reservation(mp); - return __xfs_calc_create_reservation(mp); - -} - -/* - * Making a new directory is the same as creating a new file. - */ -STATIC uint -xfs_calc_mkdir_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_create_reservation(mp); -} - - -/* - * Making a new symplink is the same as creating a new file, but - * with the added blocks for remote symlink data which can be up to 1kB in - * length (MAXPATHLEN). 
- */ -STATIC uint -xfs_calc_symlink_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_create_reservation(mp) + - xfs_calc_buf_res(1, MAXPATHLEN); -} - -/* - * In freeing an inode we can modify: - * the inode being freed: inode size - * the super block free inode counter: sector size - * the agi hash list and counters: sector size - * the inode btree entry: block size - * the on disk inode before ours in the agi hash list: inode cluster size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -STATIC uint -xfs_calc_ifree_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), - XFS_INODE_CLUSTER_SIZE(mp)) + - xfs_calc_buf_res(1, 0) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + - mp->m_in_maxlevels, 0) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); -} - -/* - * When only changing the inode we log the inode and possibly the superblock - * We also add a bit of slop for the transaction stuff. - */ -STATIC uint -xfs_calc_ichange_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - 512; - -} - -/* - * Growing the data section of the filesystem. - * superblock - * agi and agf - * allocation btrees - */ -STATIC uint -xfs_calc_growdata_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); -} - -/* - * Growing the rt section of the filesystem. - * In the first set of transactions (ALLOC) we allocate space to the - * bitmap or summary files. - * superblock: sector size - * agf of the ag from which the extent is allocated: sector size - * bmap btree for bitmap/summary inode: max depth * blocksize - * bitmap/summary inode: inode size - * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize - */ -STATIC uint -xfs_calc_growrtalloc_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); -} - -/* - * Growing the rt section of the filesystem. - * In the second set of transactions (ZERO) we zero the new metadata blocks. - * one bitmap/summary block: blocksize - */ -STATIC uint -xfs_calc_growrtzero_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize); -} - -/* - * Growing the rt section of the filesystem. - * In the third set of transactions (FREE) we update metadata without - * allocating any new blocks. - * superblock: sector size - * bitmap inode: inode size - * summary inode: inode size - * one bitmap block: blocksize - * summary blocks: new summary size - */ -STATIC uint -xfs_calc_growrtfree_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + - xfs_calc_buf_res(1, mp->m_rsumsize); -} - -/* - * Logging the inode modification timestamp on a synchronous write. 
- * inode - */ -STATIC uint -xfs_calc_swrite_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); -} - -/* - * Logging the inode mode bits when writing a setuid/setgid file - * inode - */ -STATIC uint -xfs_calc_writeid_reservation(xfs_mount_t *mp) -{ - return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); -} - -/* - * Converting the inode from non-attributed to attributed. - * the inode being converted: inode size - * agf block and superblock (for block allocation) - * the new block (directory sized) - * bmap blocks for the new directory block - * allocation btrees - */ -STATIC uint -xfs_calc_addafork_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(1, mp->m_dirblksize) + - xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); -} - -/* - * Removing the attribute fork of a file - * the inode being truncated: inode size - * the inode's bmap btree: max depth * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_attrinval_reservation( - struct xfs_mount *mp) -{ - return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), - XFS_FSB_TO_B(mp, 1)))); -} - -/* - * Setting an attribute at mount time. - * the inode getting the attribute - * the superblock for allocations - * the agfs extents are allocated from - * the attribute btree * max depth - * the inode allocation btree - * Since attribute transaction space is dependent on the size of the attribute, - * the calculation is done partially at mount time and partially at runtime(see - * below). - */ -STATIC uint -xfs_calc_attrsetm_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); -} - -/* - * Setting an attribute at runtime, transaction space unit per block. - * the superblock for allocations: sector size - * the inode bmap btree could join or split: max depth * block size - * Since the runtime attribute transaction space is dependent on the total - * blocks needed for the 1st bmap, here we calculate out the space unit for - * one block so that the caller could figure out the total space according - * to the attibute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp). - */ -STATIC uint -xfs_calc_attrsetrt_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), - XFS_FSB_TO_B(mp, 1)); -} - -/* - * Removing an attribute. 
- * the inode: inode size - * the attribute btree could join: max depth * block size - * the inode bmap btree could join or split: max depth * block size - * And the bmap_finish transaction can free the attr blocks freed giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -STATIC uint -xfs_calc_attrrm_reservation( - struct xfs_mount *mp) -{ - return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, - XFS_FSB_TO_B(mp, 1)) + - (uint)XFS_FSB_TO_B(mp, - XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), - XFS_FSB_TO_B(mp, 1)))); -} - -/* - * Clearing a bad agino number in an agi hash bucket. - */ -STATIC uint -xfs_calc_clear_agi_bucket_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); -} - -/* - * Clearing the quotaflags in the superblock. - * the super block for changing quota flags: sector size - */ -STATIC uint -xfs_calc_qm_sbchange_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); -} - -/* - * Adjusting quota limits. - * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) - */ -STATIC uint -xfs_calc_qm_setqlim_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); -} - -/* - * Allocating quota on disk if needed. - * the write transaction log space: XFS_WRITE_LOG_RES(mp) - * the unit of quota allocation: one system block size - */ -STATIC uint -xfs_calc_qm_dqalloc_reservation( - struct xfs_mount *mp) -{ - return XFS_WRITE_LOG_RES(mp) + - xfs_calc_buf_res(1, - XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); -} - -/* - * Turning off quotas. - * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 - * the superblock for the quota flags: sector size - */ -STATIC uint -xfs_calc_qm_quotaoff_reservation( - struct xfs_mount *mp) -{ - return sizeof(struct xfs_qoff_logitem) * 2 + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); -} - -/* - * End of turning off quotas. - * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 - */ -STATIC uint -xfs_calc_qm_quotaoff_end_reservation( - struct xfs_mount *mp) -{ - return sizeof(struct xfs_qoff_logitem) * 2; -} - -/* - * Syncing the incore super block changes to disk. - * the super block to reflect the changes: sector size - */ -STATIC uint -xfs_calc_sb_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); -} - -/* * Initialize the precomputed transaction reservation values * in the mount structure. 
*/ @@ -679,36 +56,7 @@ void xfs_trans_init( struct xfs_mount *mp) { - struct xfs_trans_reservations *resp = &mp->m_reservations; - - resp->tr_write = xfs_calc_write_reservation(mp); - resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); - resp->tr_rename = xfs_calc_rename_reservation(mp); - resp->tr_link = xfs_calc_link_reservation(mp); - resp->tr_remove = xfs_calc_remove_reservation(mp); - resp->tr_symlink = xfs_calc_symlink_reservation(mp); - resp->tr_create = xfs_calc_create_reservation(mp); - resp->tr_mkdir = xfs_calc_mkdir_reservation(mp); - resp->tr_ifree = xfs_calc_ifree_reservation(mp); - resp->tr_ichange = xfs_calc_ichange_reservation(mp); - resp->tr_growdata = xfs_calc_growdata_reservation(mp); - resp->tr_swrite = xfs_calc_swrite_reservation(mp); - resp->tr_writeid = xfs_calc_writeid_reservation(mp); - resp->tr_addafork = xfs_calc_addafork_reservation(mp); - resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); - resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp); - resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp); - resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); - resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); - resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); - resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); - resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); - resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp); - resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp); - resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp); - resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp); - resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp); - resp->tr_sb = xfs_calc_sb_reservation(mp); + xfs_trans_resv_calc(mp, M_RES(mp)); } /* @@ -744,7 +92,7 @@ _xfs_trans_alloc( atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone, memflags); - tp->t_magic = XFS_TRANS_MAGIC; + tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_type = type; tp->t_mountp = mp; INIT_LIST_HEAD(&tp->t_items); @@ -789,7 +137,7 @@ xfs_trans_dup( /* * Initialize the new transaction structure. */ - ntp->t_magic = XFS_TRANS_MAGIC; + ntp->t_magic = XFS_TRANS_HEADER_MAGIC; ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; INIT_LIST_HEAD(&ntp->t_items); @@ -832,12 +180,10 @@ xfs_trans_dup( */ int xfs_trans_reserve( - xfs_trans_t *tp, - uint blocks, - uint logspace, - uint rtextents, - uint flags, - uint logcount) + struct xfs_trans *tp, + struct xfs_trans_res *resp, + uint blocks, + uint rtextents) { int error = 0; int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; @@ -863,13 +209,15 @@ xfs_trans_reserve( /* * Reserve the log space needed for this transaction. 
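The signature change in this hunk folds the (logspace, flags, logcount) triple into a single struct xfs_trans_res taken from the precomputed M_RES(mp) table, so reservation unit, count and permanence always travel together. A user-space analogue of the shape of that change, with illustrative names and numbers:

#include <stdbool.h>
#include <stdio.h>

#define TRANS_PERM_LOG_RES	0x04

struct trans_res {
	unsigned int	tr_logres;	/* log space per iteration, bytes */
	int		tr_logcount;	/* permanent reservation count */
	int		tr_logflags;	/* e.g. TRANS_PERM_LOG_RES */
};

static int trans_reserve(const struct trans_res *resp,
			 unsigned int blocks, unsigned int rtextents)
{
	bool permanent = (resp->tr_logflags & TRANS_PERM_LOG_RES) != 0;

	printf("reserve %u bytes x%d (%s), %u blocks, %u rtextents\n",
	       resp->tr_logres, resp->tr_logcount,
	       permanent ? "permanent" : "one-shot", blocks, rtextents);
	return 0;
}

int main(void)
{
	/* stands in for M_RES(mp)->tr_symlink */
	struct trans_res tr_symlink = { 122880, 3, TRANS_PERM_LOG_RES };

	return trans_reserve(&tr_symlink, 16, 0);
}

Callers can no longer pass a log count without the matching permanent flag, which is exactly the mismatch the old five-argument form invited.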
*/ - if (logspace > 0) { + if (resp->tr_logres > 0) { bool permanent = false; - ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace); - ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount); + ASSERT(tp->t_log_res == 0 || + tp->t_log_res == resp->tr_logres); + ASSERT(tp->t_log_count == 0 || + tp->t_log_count == resp->tr_logcount); - if (flags & XFS_TRANS_PERM_LOG_RES) { + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { tp->t_flags |= XFS_TRANS_PERM_LOG_RES; permanent = true; } else { @@ -878,20 +226,21 @@ xfs_trans_reserve( } if (tp->t_ticket != NULL) { - ASSERT(flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); } else { - error = xfs_log_reserve(tp->t_mountp, logspace, - logcount, &tp->t_ticket, - XFS_TRANSACTION, permanent, - tp->t_type); + error = xfs_log_reserve(tp->t_mountp, + resp->tr_logres, + resp->tr_logcount, + &tp->t_ticket, XFS_TRANSACTION, + permanent, tp->t_type); } if (error) goto undo_blocks; - tp->t_log_res = logspace; - tp->t_log_count = logcount; + tp->t_log_res = resp->tr_logres; + tp->t_log_count = resp->tr_logcount; } /* @@ -916,10 +265,10 @@ xfs_trans_reserve( * reservations which have already been performed. */ undo_log: - if (logspace > 0) { + if (resp->tr_logres > 0) { int log_flags; - if (flags & XFS_TRANS_PERM_LOG_RES) { + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { log_flags = XFS_LOG_REL_PERM_RESERV; } else { log_flags = 0; @@ -1367,10 +716,10 @@ xfs_trans_free_items( lip->li_desc = NULL; if (commit_lsn != NULLCOMMITLSN) - IOP_COMMITTING(lip, commit_lsn); + lip->li_ops->iop_committing(lip, commit_lsn); if (flags & XFS_TRANS_ABORT) lip->li_flags |= XFS_LI_ABORTED; - IOP_UNLOCK(lip); + lip->li_ops->iop_unlock(lip); xfs_trans_free_item_desc(lidp); } @@ -1390,8 +739,11 @@ xfs_log_item_batch_insert( /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); - for (i = 0; i < nr_items; i++) - IOP_UNPIN(log_items[i], 0); + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + + lip->li_ops->iop_unpin(lip, 0); + } } /* @@ -1401,11 +753,11 @@ xfs_log_item_batch_insert( * * If we are called with the aborted flag set, it is because a log write during * a CIL checkpoint commit has failed. In this case, all the items in the - * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which + * checkpoint have already gone through iop_commited and iop_unlock, which * means that checkpoint commit abort handling is treated exactly the same * as an iclog write error even though we haven't started any IO yet. Hence in - * this case all we need to do is IOP_COMMITTED processing, followed by an - * IOP_UNPIN(aborted) call. + * this case all we need to do is iop_committed processing, followed by an + * iop_unpin(aborted) call. * * The AIL cursor is used to optimise the insert process. 
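The other systematic change in this file replaces the IOP_*() macros with direct calls through the item's ops vector, e.g. IOP_UNPIN(lip, 0) becomes lip->li_ops->iop_unpin(lip, 0). The macros only hid an ordinary function-pointer table; a minimal standalone model of that pattern, with illustrative types:

#include <stdio.h>

struct log_item;

struct item_ops {
	void	(*iop_unpin)(struct log_item *lip, int remove);
};

struct log_item {
	const struct item_ops	*li_ops;
};

static void buf_item_unpin(struct log_item *lip, int remove)
{
	printf("unpin buf item%s\n", remove ? " (remove)" : "");
}

static const struct item_ops buf_item_ops = {
	.iop_unpin	= buf_item_unpin,
};

int main(void)
{
	struct log_item item = { .li_ops = &buf_item_ops };

	/* previously spelled IOP_UNPIN(&item, 0) */
	item.li_ops->iop_unpin(&item, 0);
	return 0;
}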
If commit_lsn is not * at the end of the AIL, the insert cursor avoids the need to walk @@ -1438,7 +790,7 @@ xfs_trans_committed_bulk( if (aborted) lip->li_flags |= XFS_LI_ABORTED; - item_lsn = IOP_COMMITTED(lip, commit_lsn); + item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); /* item_lsn of -1 means the item needs no further processing */ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) @@ -1450,7 +802,7 @@ xfs_trans_committed_bulk( */ if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); - IOP_UNPIN(lip, 1); + lip->li_ops->iop_unpin(lip, 1); continue; } @@ -1468,7 +820,7 @@ xfs_trans_committed_bulk( xfs_trans_ail_update(ailp, lip, item_lsn); else spin_unlock(&ailp->xa_lock); - IOP_UNPIN(lip, 0); + lip->li_ops->iop_unpin(lip, 0); continue; } @@ -1666,7 +1018,7 @@ xfs_trans_roll( struct xfs_inode *dp) { struct xfs_trans *trans; - unsigned int logres, count; + struct xfs_trans_res tres; int error; /* @@ -1678,8 +1030,8 @@ xfs_trans_roll( /* * Copy the critical parameters from one trans to the next. */ - logres = trans->t_log_res; - count = trans->t_log_count; + tres.tr_logres = trans->t_log_res; + tres.tr_logcount = trans->t_log_count; *tpp = xfs_trans_dup(trans); /* @@ -1710,8 +1062,8 @@ xfs_trans_roll( * across this call, or that anything that is locked be logged in * the prior and the next transactions. */ - error = xfs_trans_reserve(trans, 0, logres, 0, - XFS_TRANS_PERM_LOG_RES, count); + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(trans, &tres, 0, 0); /* * Ensure that the inode is in the new transaction and locked. */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 2b4946393e30..09cf40b89e8c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -20,285 +20,9 @@ struct xfs_log_item; -/* - * This is the structure written in the log at the head of - * every transaction. It identifies the type and id of the - * transaction, and contains the number of items logged by - * the transaction so we know how many to expect during recovery. - * - * Do not change the below structure without redoing the code in - * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). - */ -typedef struct xfs_trans_header { - uint th_magic; /* magic number */ - uint th_type; /* transaction type */ - __int32_t th_tid; /* transaction id (unused) */ - uint th_num_items; /* num items logged by trans */ -} xfs_trans_header_t; - -#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ - -/* - * Log item types. - */ -#define XFS_LI_EFI 0x1236 -#define XFS_LI_EFD 0x1237 -#define XFS_LI_IUNLINK 0x1238 -#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ -#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ -#define XFS_LI_DQUOT 0x123d -#define XFS_LI_QUOTAOFF 0x123e -#define XFS_LI_ICREATE 0x123f - -#define XFS_LI_TYPE_DESC \ - { XFS_LI_EFI, "XFS_LI_EFI" }, \ - { XFS_LI_EFD, "XFS_LI_EFD" }, \ - { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ - { XFS_LI_INODE, "XFS_LI_INODE" }, \ - { XFS_LI_BUF, "XFS_LI_BUF" }, \ - { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ - { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" } - -/* - * Transaction types. Used to distinguish types of buffers. 
- */ -#define XFS_TRANS_SETATTR_NOT_SIZE 1 -#define XFS_TRANS_SETATTR_SIZE 2 -#define XFS_TRANS_INACTIVE 3 -#define XFS_TRANS_CREATE 4 -#define XFS_TRANS_CREATE_TRUNC 5 -#define XFS_TRANS_TRUNCATE_FILE 6 -#define XFS_TRANS_REMOVE 7 -#define XFS_TRANS_LINK 8 -#define XFS_TRANS_RENAME 9 -#define XFS_TRANS_MKDIR 10 -#define XFS_TRANS_RMDIR 11 -#define XFS_TRANS_SYMLINK 12 -#define XFS_TRANS_SET_DMATTRS 13 -#define XFS_TRANS_GROWFS 14 -#define XFS_TRANS_STRAT_WRITE 15 -#define XFS_TRANS_DIOSTRAT 16 -/* 17 was XFS_TRANS_WRITE_SYNC */ -#define XFS_TRANS_WRITEID 18 -#define XFS_TRANS_ADDAFORK 19 -#define XFS_TRANS_ATTRINVAL 20 -#define XFS_TRANS_ATRUNCATE 21 -#define XFS_TRANS_ATTR_SET 22 -#define XFS_TRANS_ATTR_RM 23 -#define XFS_TRANS_ATTR_FLAG 24 -#define XFS_TRANS_CLEAR_AGI_BUCKET 25 -#define XFS_TRANS_QM_SBCHANGE 26 -/* - * Dummy entries since we use the transaction type to index into the - * trans_type[] in xlog_recover_print_trans_head() - */ -#define XFS_TRANS_DUMMY1 27 -#define XFS_TRANS_DUMMY2 28 -#define XFS_TRANS_QM_QUOTAOFF 29 -#define XFS_TRANS_QM_DQALLOC 30 -#define XFS_TRANS_QM_SETQLIM 31 -#define XFS_TRANS_QM_DQCLUSTER 32 -#define XFS_TRANS_QM_QINOCREATE 33 -#define XFS_TRANS_QM_QUOTAOFF_END 34 -#define XFS_TRANS_SB_UNIT 35 -#define XFS_TRANS_FSYNC_TS 36 -#define XFS_TRANS_GROWFSRT_ALLOC 37 -#define XFS_TRANS_GROWFSRT_ZERO 38 -#define XFS_TRANS_GROWFSRT_FREE 39 -#define XFS_TRANS_SWAPEXT 40 -#define XFS_TRANS_SB_COUNT 41 -#define XFS_TRANS_CHECKPOINT 42 -#define XFS_TRANS_ICREATE 43 -#define XFS_TRANS_TYPE_MAX 43 -/* new transaction types need to be reflected in xfs_logprint(8) */ - -#define XFS_TRANS_TYPES \ - { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ - { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ - { XFS_TRANS_INACTIVE, "INACTIVE" }, \ - { XFS_TRANS_CREATE, "CREATE" }, \ - { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ - { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ - { XFS_TRANS_REMOVE, "REMOVE" }, \ - { XFS_TRANS_LINK, "LINK" }, \ - { XFS_TRANS_RENAME, "RENAME" }, \ - { XFS_TRANS_MKDIR, "MKDIR" }, \ - { XFS_TRANS_RMDIR, "RMDIR" }, \ - { XFS_TRANS_SYMLINK, "SYMLINK" }, \ - { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ - { XFS_TRANS_GROWFS, "GROWFS" }, \ - { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ - { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ - { XFS_TRANS_WRITEID, "WRITEID" }, \ - { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ - { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ - { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ - { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ - { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ - { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ - { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ - { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ - { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ - { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ - { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ - { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ - { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ - { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ - { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ - { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ - { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ - { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ - { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ - { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ - { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ - { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ - { XFS_TRANS_DUMMY1, "DUMMY1" }, \ - { XFS_TRANS_DUMMY2, "DUMMY2" }, \ - { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } - -/* - * This structure is used to track log items associated with - * a transaction. 
It points to the log item and keeps some - * flags to track the state of the log item. It also tracks - * the amount of space needed to log the item it describes - * once we get to commit processing (see xfs_trans_commit()). - */ -struct xfs_log_item_desc { - struct xfs_log_item *lid_item; - struct list_head lid_trans; - unsigned char lid_flags; -}; - -#define XFS_LID_DIRTY 0x1 - -#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ -/* - * Values for t_flags. - */ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer - count in superblock */ - -/* - * Values for call flags parameter. - */ -#define XFS_TRANS_RELEASE_LOG_RES 0x4 -#define XFS_TRANS_ABORT 0x8 - -/* - * Field values for xfs_trans_mod_sb. - */ -#define XFS_TRANS_SB_ICOUNT 0x00000001 -#define XFS_TRANS_SB_IFREE 0x00000002 -#define XFS_TRANS_SB_FDBLOCKS 0x00000004 -#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 -#define XFS_TRANS_SB_FREXTENTS 0x00000010 -#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 -#define XFS_TRANS_SB_DBLOCKS 0x00000040 -#define XFS_TRANS_SB_AGCOUNT 0x00000080 -#define XFS_TRANS_SB_IMAXPCT 0x00000100 -#define XFS_TRANS_SB_REXTSIZE 0x00000200 -#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 -#define XFS_TRANS_SB_RBLOCKS 0x00000800 -#define XFS_TRANS_SB_REXTENTS 0x00001000 -#define XFS_TRANS_SB_REXTSLOG 0x00002000 - - -/* - * Per-extent log reservation for the allocation btree changes - * involved in freeing or allocating an extent. - * 2 trees * (2 blocks/level * max depth - 1) - */ -#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) - -/* - * Per-directory log reservation for any directory change. - * dir blocks: (1 btree block per level + data block + free block) - * bmap btree: (levels + 2) * max depth - * v2 directory blocks can be fragmented below the dirblksize down to the fsb - * size, so account for that in the DAENTER macros. 
- */ -#define XFS_DIROP_LOG_COUNT(mp) \ - (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ - XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) - +#include "xfs_trans_resv.h" -#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) -#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) -#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) -#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) -#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) -#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) -#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) -#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) -#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) -#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) -#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) -#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) -#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) -#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) -#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) -/* - * Logging the inode timestamps on an fsync -- same as SWRITE - * as long as SWRITE logs the entire inode core - */ -#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) -#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) -#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) -#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) -#define XFS_ATTRSETM_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetm) -#define XFS_ATTRSETRT_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetrt) -#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) -#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) -#define XFS_QM_SBCHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_qm_sbchange) -#define XFS_QM_SETQLIM_LOG_RES(mp) ((mp)->m_reservations.tr_qm_setqlim) -#define XFS_QM_DQALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_qm_dqalloc) -#define XFS_QM_QUOTAOFF_LOG_RES(mp) ((mp)->m_reservations.tr_qm_quotaoff) -#define XFS_QM_QUOTAOFF_END_LOG_RES(mp) ((mp)->m_reservations.tr_qm_equotaoff) -#define XFS_SB_LOG_RES(mp) ((mp)->m_reservations.tr_sb) - -/* - * Various log count values. - */ -#define XFS_DEFAULT_LOG_COUNT 1 -#define XFS_DEFAULT_PERM_LOG_COUNT 2 -#define XFS_ITRUNCATE_LOG_COUNT 2 -#define XFS_INACTIVE_LOG_COUNT 2 -#define XFS_CREATE_LOG_COUNT 2 -#define XFS_MKDIR_LOG_COUNT 3 -#define XFS_SYMLINK_LOG_COUNT 3 -#define XFS_REMOVE_LOG_COUNT 2 -#define XFS_LINK_LOG_COUNT 2 -#define XFS_RENAME_LOG_COUNT 2 -#define XFS_WRITE_LOG_COUNT 2 -#define XFS_ADDAFORK_LOG_COUNT 2 -#define XFS_ATTRINVAL_LOG_COUNT 1 -#define XFS_ATTRSET_LOG_COUNT 3 -#define XFS_ATTRRM_LOG_COUNT 3 - -/* - * Here we centralize the specification of XFS meta-data buffer - * reference count values. This determine how hard the buffer - * cache tries to hold onto the buffer. 
- */ -#define XFS_AGF_REF 4 -#define XFS_AGI_REF 4 -#define XFS_AGFL_REF 3 -#define XFS_INO_BTREE_REF 3 -#define XFS_ALLOC_BTREE_REF 2 -#define XFS_BMAP_BTREE_REF 2 -#define XFS_DIR_BTREE_REF 2 -#define XFS_INO_REF 2 -#define XFS_ATTR_BTREE_REF 1 -#define XFS_DQUOT_REF 1 - -#ifdef __KERNEL__ +/* kernel only transaction subsystem defines */ struct xfs_buf; struct xfs_buftarg; @@ -310,6 +34,7 @@ struct xfs_log_iovec; struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; +struct xfs_trans_res; struct xfs_dquot_acct; struct xfs_busy_extent; @@ -342,7 +67,7 @@ typedef struct xfs_log_item { { XFS_LI_ABORTED, "ABORTED" } struct xfs_item_ops { - uint (*iop_size)(xfs_log_item_t *); + void (*iop_size)(xfs_log_item_t *, int *, int *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); void (*iop_unpin)(xfs_log_item_t *, int remove); @@ -352,17 +77,8 @@ struct xfs_item_ops { void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); }; -#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) -#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) -#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) -#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list) -#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) -#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) -#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) - /* - * Return values for the IOP_PUSH() routines. + * Return values for the iop_push() routines. */ #define XFS_ITEM_SUCCESS 0 #define XFS_ITEM_PINNED 1 @@ -446,7 +162,7 @@ typedef struct xfs_trans { xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); -int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, +int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, uint, uint); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -528,9 +244,4 @@ void xfs_trans_ail_destroy(struct xfs_mount *); extern kmem_zone_t *xfs_trans_zone; extern kmem_zone_t *xfs_log_item_desc_zone; -#endif /* __KERNEL__ */ - -void xfs_trans_init(struct xfs_mount *); -int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); - #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 0eda7254305f..21c6d7ddbc06 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -61,20 +61,6 @@ xfs_ail_check( #endif /* DEBUG */ /* - * Return a pointer to the first item in the AIL. If the AIL is empty, then - * return NULL. - */ -xfs_log_item_t * -xfs_ail_min( - struct xfs_ail *ailp) -{ - if (list_empty(&ailp->xa_ail)) - return NULL; - - return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); -} - - /* * Return a pointer to the last item in the AIL. If the AIL is empty, then * return NULL. */ @@ -393,11 +379,11 @@ xfsaild_push( int lock_result; /* - * Note that IOP_PUSH may unlock and reacquire the AIL lock. We + * Note that iop_push may unlock and reacquire the AIL lock. We * rely on the AIL cursor implementation to be able to deal with * the dropped lock. 
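Just below, xfsaild_push() switches on those iop_push() return codes. A compilable model of the dispatch: XFS_ITEM_SUCCESS and XFS_ITEM_PINNED appear in the header hunk above, the LOCKED and FLUSHING values are assumed from the same header, and the statistics, buffer batching and back-off of the real loop are omitted:

#include <stdio.h>

#define XFS_ITEM_SUCCESS	0
#define XFS_ITEM_PINNED		1
#define XFS_ITEM_LOCKED		2
#define XFS_ITEM_FLUSHING	3

static void handle_push_result(int lock_result, int *stuck, int *log_flush)
{
	switch (lock_result) {
	case XFS_ITEM_SUCCESS:
		break;			/* item queued for writeback */
	case XFS_ITEM_PINNED:
		(*log_flush)++;		/* needs a log force to unpin */
		break;
	case XFS_ITEM_LOCKED:
		(*stuck)++;		/* someone else holds the item */
		break;
	case XFS_ITEM_FLUSHING:
		(*stuck)++;		/* already under IO, revisit later */
		break;
	}
}

int main(void)
{
	int stuck = 0, log_flush = 0;

	handle_push_result(XFS_ITEM_PINNED, &stuck, &log_flush);
	handle_push_result(XFS_ITEM_LOCKED, &stuck, &log_flush);
	printf("stuck=%d log_flush=%d\n", stuck, log_flush);	/* 1 1 */
	return 0;
}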
*/ - lock_result = IOP_PUSH(lip, &ailp->xa_buf_list); + lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index aa5a04b844d6..8c75b8f67270 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -505,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t *tp, /* * Mark the buffer as not needing to be unlocked when the buf item's - * IOP_UNLOCK() routine is called. The buffer must already be locked + * iop_unlock() routine is called. The buffer must already be locked * and associated with the given transaction. */ /* ARGSUSED */ diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 61407a847b86..54ee3c5dee76 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -17,6 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 53b7c9b0f8f7..c52def0b441c 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -25,6 +25,9 @@ struct xfs_trans; struct xfs_ail; struct xfs_log_vec; + +void xfs_trans_init(struct xfs_mount *); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, @@ -83,6 +86,18 @@ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->xa_lock); +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. + */ +static inline struct xfs_log_item * +xfs_ail_min( + struct xfs_ail *ailp) +{ + return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item, + li_ail); +} + static inline void xfs_trans_ail_update( struct xfs_ail *ailp, diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c new file mode 100644 index 000000000000..a65a3cc40610 --- /dev/null +++ b/fs/xfs/xfs_trans_resv.c @@ -0,0 +1,803 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_trans_resv.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer.  Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+	return round_up(sizeof(struct xlog_op_header) +
+			sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate the transaction log reservation per item in bytes.
+ *
+ * The nbufs argument is used to indicate the number of items that
+ * will be changed in a transaction.  The size argument gives the
+ * number of bytes to reserve per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+	uint		nbufs,
+	uint		size)
+{
+	return nbufs * (size + xfs_buf_log_overhead());
+}
+
+/*
+ * Logging inodes is really tricksy. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, then
+ * the amount of information we actually log is greater than the size of the
+ * inode on disk. Hence we need an inode reservation function that calculates
+ * all this correctly. So, we log:
+ *
+ * - log op headers for object
+ * - inode log format object
+ * - the entire inode contents (core + 2 forks)
+ * - two bmap btree block headers
+ */
+STATIC uint
+xfs_calc_inode_res(
+	struct xfs_mount	*mp,
+	uint			ninodes)
+{
+	return ninodes * (sizeof(struct xlog_op_header) +
+			  sizeof(struct xfs_inode_log_format) +
+			  mp->m_sb.sb_inodesize +
+			  2 * XFS_BMBT_BLOCK_LEN(mp));
+}
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_write_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+				      XFS_FSB_TO_B(mp, 1)) +
+		     xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 1) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+				      XFS_FSB_TO_B(mp, 1)) +
+		     xfs_calc_buf_res(5, 0) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				      XFS_FSB_TO_B(mp, 1)) +
+		     xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+				      mp->m_in_maxlevels, 0)));
+}
+
+/*
+ * In renaming files we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *	of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 4) +
+		     xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 2) +
+		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((xfs_calc_inode_res(mp, 2) +
+		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For create, break it into the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
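
The reservation helpers above are plain arithmetic: a fixed, 128-byte-rounded log overhead per buffer, multiplied out per item, with each transaction taking the worst case of its two legs. A runnable user-space model of that arithmetic, using made-up structure and device sizes (OP_HDR_SIZE, BUF_FMT_SIZE, the leg buffer counts) in place of the real on-disk geometry:

	#include <stdio.h>

	/* stand-ins; the kernel uses sizeof(struct xlog_op_header) etc. */
	#define OP_HDR_SIZE	12
	#define BUF_FMT_SIZE	24
	#define SECTSIZE	512
	#define BLOCKSIZE	4096

	#define MAX(a, b)	((a) > (b) ? (a) : (b))

	static unsigned int round_up_128(unsigned int x)
	{
		return (x + 127) & ~127u;
	}

	/* models xfs_buf_log_overhead(): format overhead, rounded to 128 */
	static unsigned int buf_log_overhead(void)
	{
		return round_up_128(OP_HDR_SIZE + BUF_FMT_SIZE);
	}

	/* models xfs_calc_buf_res(): nbufs items of 'size' bytes each */
	static unsigned int calc_buf_res(unsigned int nbufs, unsigned int size)
	{
		return nbufs * (size + buf_log_overhead());
	}

	int main(void)
	{
		/* two legs of a remove-style reservation; keep the worst */
		unsigned int modify = calc_buf_res(2, BLOCKSIZE);  /* inode leg */
		unsigned int free   = calc_buf_res(5, SECTSIZE) +  /* agf/agfl/sb */
				      calc_buf_res(8, BLOCKSIZE);  /* alloc btrees */

		printf("reservation = %u bytes\n", MAX(modify, free));
		return 0;
	}
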
+
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_inode_res(mp, 2) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		(uint)XFS_FSB_TO_B(mp, 1) +
+		xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * For create we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_alloc(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		mp->m_sb.sb_sectsize +
+		xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX(xfs_calc_create_resv_alloc(mp),
+		    xfs_calc_create_resv_modify(mp));
+}
+
+/*
+ * For icreate we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		mp->m_sb.sb_sectsize +
+		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX(xfs_calc_icreate_resv_alloc(mp),
+		    xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+	struct xfs_mount	*mp)
+{
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return xfs_calc_icreate_reservation(mp);
+	return __xfs_calc_create_reservation(mp);
+
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_create_reservation(mp);
+}
+
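
The create path above computes both legs and keeps the maximum, with CRC-enabled (v5) filesystems taking the leaner icreate allocation leg, since their inode clusters are initialised without logging the inode buffers. A small runnable sketch of that selection; the byte counts here are invented placeholders for the calculations above:

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX(a, b)	((a) > (b) ? (a) : (b))

	/* illustrative byte counts only */
	static unsigned int create_resv_modify(void) { return 6000; }
	static unsigned int create_resv_alloc(void)  { return 9000; }
	/* icreate leg: no inode-cluster buffers logged */
	static unsigned int icreate_resv_alloc(void) { return 7000; }

	static unsigned int calc_create_reservation(bool has_crc)
	{
		unsigned int alloc = has_crc ? icreate_resv_alloc()
					     : create_resv_alloc();
		return MAX(alloc, create_resv_modify());
	}

	int main(void)
	{
		printf("v4: %u bytes, v5 (crc): %u bytes\n",
		       calc_create_reservation(false),
		       calc_create_reservation(true));
		return 0;
	}
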
+
+/*
+ * Making a new symlink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_create_reservation(mp) +
+	       xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
+		MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
+		    XFS_INODE_CLUSTER_SIZE(mp)) +
+		xfs_calc_buf_res(1, 0) +
+		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+				 mp->m_in_maxlevels, 0) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock.
+ * We also add a bit of slop for the transaction stuff.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+
+}
+
+/*
+ * Growing the data section of the filesystem.
+ *	superblock
+ *	agi and agf
+ *	allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *	superblock: sector size
+ *	agf of the ag from which the extent is allocated: sector size
+ *	bmap btree for bitmap/summary inode: max depth * blocksize
+ *	bitmap/summary inode: inode size
+ *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *	one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *	superblock: sector size
+ *	bitmap inode: inode size
+ *	summary inode: inode size
+ *	one bitmap block: blocksize
+ *	summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_inode_res(mp, 2) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+		xfs_calc_buf_res(1, mp->m_rsumsize);
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *	inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *	inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ *	the inode being converted: inode size
+ *	agf block and superblock (for block allocation)
+ *	the new block (directory sized)
+ *	bmap blocks for the new directory block
+ *	allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, mp->m_dirblksize) +
+		xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+	struct xfs_mount	*mp)
+{
+	return MAX((xfs_calc_inode_res(mp, 1) +
+		    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+				     XFS_FSB_TO_B(mp, 1))),
+		   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+				     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Setting an attribute at mount time.
+ *	the inode getting the attribute
+ *	the superblock for allocations
+ *	the agfs extents are allocated from
+ *	the attribute btree * max depth
+ *	the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime (see
+ * below).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_DQUOT_LOGRES(mp) +
+		xfs_calc_inode_res(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ *	the superblock for allocations: sector size
+ *	the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate out the space unit for
+ * one block so that the caller could figure out the total space according
+ * to the attribute extent length in blocks by:
+ *	ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing an attribute.
+ * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrrm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, + XFS_FSB_TO_B(mp, 1)) + + (uint)XFS_FSB_TO_B(mp, + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * Clearing a bad agino number in an agi hash bucket. + */ +STATIC uint +xfs_calc_clear_agi_bucket_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Clearing the quotaflags in the superblock. + * the super block for changing quota flags: sector size + */ +STATIC uint +xfs_calc_qm_sbchange_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Adjusting quota limits. + * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + */ +STATIC uint +xfs_calc_qm_setqlim_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); +} + +/* + * Allocating quota on disk if needed. + * the write transaction log space: M_RES(mp)->tr_write.tr_logres + * the unit of quota allocation: one system block size + */ +STATIC uint +xfs_calc_qm_dqalloc_reservation( + struct xfs_mount *mp) +{ + ASSERT(M_RES(mp)->tr_write.tr_logres); + return M_RES(mp)->tr_write.tr_logres + + xfs_calc_buf_res(1, + XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); +} + +/* + * Turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the superblock for the quota flags: sector size + */ +STATIC uint +xfs_calc_qm_quotaoff_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2 + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * End of turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + */ +STATIC uint +xfs_calc_qm_quotaoff_end_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2; +} + +/* + * Syncing the incore super block changes to disk. + * the super block to reflect the changes: sector size + */ +STATIC uint +xfs_calc_sb_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +void +xfs_trans_resv_calc( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* + * The following transactions are logged in physical format and + * require a permanent reservation on space. 
+ */ + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); + resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT; + resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_link.tr_logres = xfs_calc_link_reservation(mp); + resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT; + resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp); + resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT; + resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp); + resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; + resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_create.tr_logres = xfs_calc_create_reservation(mp); + resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; + resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); + resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; + resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp); + resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT; + resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp); + resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT; + resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp); + resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT; + resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp); + resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT; + resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp); + resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT; + resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp); + resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; + resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + /* + * The following transactions are logged in logical format with + * a default log count. 
+ */
+	resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
+	resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
+	resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
+	resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_qm_equotaoff.tr_logres =
+		xfs_calc_qm_quotaoff_end_reservation(mp);
+	resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
+	resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+	/* The following transactions are logged in logical format */
+	resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+	resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+	resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
+	resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+	resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+	resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+	resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+	resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+	resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+}
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
new file mode 100644
index 000000000000..de7de9aaad8a
--- /dev/null
+++ b/fs/xfs/xfs_trans_resv.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_TRANS_RESV_H__
+#define	__XFS_TRANS_RESV_H__
+
+struct xfs_mount;
+
+/*
+ * structure for maintaining pre-calculated transaction reservations.
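
Callers are expected to consume this table through M_RES() rather than recompute log sizes per transaction, matching the reworked xfs_trans_reserve() prototype seen earlier in this diff. The following runnable user-space model sketches that shape; the structure names, field values, and the trans_reserve() helper are placeholders, not the kernel's:

	#include <stdio.h>

	/* models struct xfs_trans_res / struct xfs_trans_resv */
	struct trans_res {
		unsigned int	tr_logres;	/* log space in bytes */
		int		tr_logcount;	/* log ops per ticket */
		int		tr_logflags;	/* permanent reservation? */
	};

	#define PERM_LOG_RES	0x1
	#define WRITE_LOG_COUNT	2

	struct trans_resv {
		struct trans_res tr_write;
		struct trans_res tr_sb;
	};

	/* models xfs_trans_resv_calc(): fill the table once at mount time */
	static void resv_calc(struct trans_resv *resp)
	{
		resp->tr_write.tr_logres = 65536;	/* placeholder */
		resp->tr_write.tr_logcount = WRITE_LOG_COUNT;
		resp->tr_write.tr_logflags |= PERM_LOG_RES;

		resp->tr_sb.tr_logres = 512;		/* placeholder */
		resp->tr_sb.tr_logcount = 1;
	}

	/* models the new reserve call: a table entry, not raw numbers */
	static int trans_reserve(const struct trans_res *res,
				 unsigned int blocks, unsigned int rtextents)
	{
		printf("reserve %u log bytes x%d%s, %u blocks, %u rtexts\n",
		       res->tr_logres, res->tr_logcount,
		       (res->tr_logflags & PERM_LOG_RES) ? " (perm)" : "",
		       blocks, rtextents);
		return 0;
	}

	int main(void)
	{
		struct trans_resv resv = { { 0 } };

		resv_calc(&resv);
		return trans_reserve(&resv.tr_write, 16, 0);
	}
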
+ */ +struct xfs_trans_res { + uint tr_logres; /* log space unit in bytes per log ticket */ + int tr_logcount; /* number of log operations per log ticket */ + int tr_logflags; /* log flags, currently only used for indicating + * a reservation request is permanent or not */ +}; + +struct xfs_trans_resv { + struct xfs_trans_res tr_write; /* extent alloc trans */ + struct xfs_trans_res tr_itruncate; /* truncate trans */ + struct xfs_trans_res tr_rename; /* rename trans */ + struct xfs_trans_res tr_link; /* link trans */ + struct xfs_trans_res tr_remove; /* unlink trans */ + struct xfs_trans_res tr_symlink; /* symlink trans */ + struct xfs_trans_res tr_create; /* create trans */ + struct xfs_trans_res tr_mkdir; /* mkdir trans */ + struct xfs_trans_res tr_ifree; /* inode free trans */ + struct xfs_trans_res tr_ichange; /* inode update trans */ + struct xfs_trans_res tr_growdata; /* fs data section grow trans */ + struct xfs_trans_res tr_swrite; /* sync write inode trans */ + struct xfs_trans_res tr_addafork; /* add inode attr fork trans */ + struct xfs_trans_res tr_writeid; /* write setuid/setgid file */ + struct xfs_trans_res tr_attrinval; /* attr fork buffer + * invalidation */ + struct xfs_trans_res tr_attrsetm; /* set/create an attribute at + * mount time */ + struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at + * runtime */ + struct xfs_trans_res tr_attrrm; /* remove an attribute */ + struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */ + struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */ + struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */ + struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ + struct xfs_trans_res tr_qm_sbchange; /* change quota flags */ + struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ + struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ + struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ + struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */ + struct xfs_trans_res tr_sb; /* modify superblock */ + struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ +}; + +/* shorthand way of accessing reservation structure */ +#define M_RES(mp) (&(mp)->m_resv) + +/* + * Per-extent log reservation for the allocation btree changes + * involved in freeing or allocating an extent. + * 2 trees * (2 blocks/level * max depth - 1) * block size + */ +#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) +#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ + ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + +/* + * Per-directory log reservation for any directory change. + * dir blocks: (1 btree block per level + data block + free block) * dblock size + * bmap btree: (levels + 2) * max depth * block size + * v2 directory blocks can be fragmented below the dirblksize down to the fsb + * size, so account for that in the DAENTER macros. + */ +#define XFS_DIROP_LOG_RES(mp) \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) +#define XFS_DIROP_LOG_COUNT(mp) \ + (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ + XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) + +/* + * Various log count values. 
+ */
+#define	XFS_DEFAULT_LOG_COUNT		1
+#define	XFS_DEFAULT_PERM_LOG_COUNT	2
+#define	XFS_ITRUNCATE_LOG_COUNT		2
+#define	XFS_INACTIVE_LOG_COUNT		2
+#define	XFS_CREATE_LOG_COUNT		2
+#define	XFS_MKDIR_LOG_COUNT		3
+#define	XFS_SYMLINK_LOG_COUNT		3
+#define	XFS_REMOVE_LOG_COUNT		2
+#define	XFS_LINK_LOG_COUNT		2
+#define	XFS_RENAME_LOG_COUNT		2
+#define	XFS_WRITE_LOG_COUNT		2
+#define	XFS_ADDAFORK_LOG_COUNT		2
+#define	XFS_ATTRINVAL_LOG_COUNT		1
+#define	XFS_ATTRSET_LOG_COUNT		3
+#define	XFS_ATTRRM_LOG_COUNT		3
+
+void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+
+#endif	/* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 61ba1cfa974c..82bbc34d54a3 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -18,42 +18,7 @@
 #ifndef __XFS_TYPES_H__
 #define __XFS_TYPES_H__
 
-#ifdef __KERNEL__
-
-/*
- * Additional type declarations for XFS
- */
-typedef signed char		__int8_t;
-typedef unsigned char		__uint8_t;
-typedef signed short int	__int16_t;
-typedef unsigned short int	__uint16_t;
-typedef signed int		__int32_t;
-typedef unsigned int		__uint32_t;
-typedef signed long long int	__int64_t;
-typedef unsigned long long int	__uint64_t;
-
-typedef __uint32_t	prid_t;		/* project ID */
-typedef __uint32_t	inst_t;		/* an instruction */
-
-typedef __s64		xfs_off_t;	/* <file offset> type */
-typedef unsigned long long	xfs_ino_t;	/* <inode> type */
-typedef __s64		xfs_daddr_t;	/* <disk address> type */
-typedef char *		xfs_caddr_t;	/* <core address> type */
-typedef __u32		xfs_dev_t;
-typedef __u32		xfs_nlink_t;
-
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
-#endif	/* __KERNEL__ */
+typedef __uint32_t	prid_t;		/* project ID */
 
 typedef __uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
 typedef	__uint32_t	xfs_agino_t;	/* inode # within allocation grp */
@@ -146,6 +111,12 @@ typedef __uint64_t	xfs_filblks_t;	/* number of blocks in a file */
 #define XFS_MAX_SECTORSIZE	(1 << XFS_MAX_SECTORSIZE_LOG)
 
 /*
+ * Inode fork identifiers.
+ */
+#define	XFS_DATA_FORK	0
+#define	XFS_ATTR_FORK	1
+
+/*
  * Min numbers of data/attr fork btree root pointers.
  */
 #define MINDBTPTRS	3
@@ -169,6 +140,23 @@ typedef enum {
 struct xfs_name {
 	const unsigned char	*name;
 	int			len;
+	int			type;
 };
 
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits.
+ */
+typedef __uint32_t	xfs_dqid_t;
+
+/*
+ * Constants for bit manipulations.
+ */
+#define	XFS_NBBYLOG	3		/* log2(NBBY) */
+#define	XFS_WORDLOG	2		/* log2(sizeof(xfs_rtword_t)) */
+#define	XFS_NBWORDLOG	(XFS_NBBYLOG + XFS_WORDLOG)
+#define	XFS_NBWORD	(1 << XFS_NBWORDLOG)
+#define	XFS_WORDMASK	((1 << XFS_WORDLOG) - 1)
+
+
 #endif	/* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
deleted file mode 100644
index 0025c78ac03c..000000000000
--- a/fs/xfs/xfs_utils.c
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_itable.h"
-#include "xfs_utils.h"
-
-
-/*
- * Allocates a new inode from disk and returns a pointer to the
- * incore copy. This routine will internally commit the current
- * transaction and allocate a new one if the Space Manager needed
- * to do an allocation to replenish the inode free-list.
- *
- * This routine is designed to be called from xfs_create and
- * xfs_create_dir.
- *
- */
-int
-xfs_dir_ialloc(
-	xfs_trans_t	**tpp,		/* input: current transaction;
-					   output: may be a new transaction. */
-	xfs_inode_t	*dp,		/* directory in which to allocate
-					   the inode. */
-	umode_t		mode,
-	xfs_nlink_t	nlink,
-	xfs_dev_t	rdev,
-	prid_t		prid,		/* project id */
-	int		okalloc,	/* ok to allocate new space */
-	xfs_inode_t	**ipp,		/* pointer to inode; it will be
-					   locked. */
-	int		*committed)
-
-{
-	xfs_trans_t	*tp;
-	xfs_trans_t	*ntp;
-	xfs_inode_t	*ip;
-	xfs_buf_t	*ialloc_context = NULL;
-	int		code;
-	uint		log_res;
-	uint		log_count;
-	void		*dqinfo;
-	uint		tflags;
-
-	tp = *tpp;
-	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-
-	/*
-	 * xfs_ialloc will return a pointer to an incore inode if
-	 * the Space Manager has an available inode on the free
-	 * list. Otherwise, it will do an allocation and replenish
-	 * the freelist.  Since we can only do one allocation per
-	 * transaction without deadlocks, we will need to commit the
-	 * current transaction and start a new one.  We will then
-	 * need to call xfs_ialloc again to get the inode.
-	 *
-	 * If xfs_ialloc did an allocation to replenish the freelist,
-	 * it returns the bp containing the head of the freelist as
-	 * ialloc_context. We will hold a lock on it across the
-	 * transaction commit so that no other process can steal
-	 * the inode(s) that we've just allocated.
-	 */
-	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
-			  &ialloc_context, &ip);
-
-	/*
-	 * Return an error if we were unable to allocate a new inode.
-	 * This should only happen if we run out of space on disk or
-	 * encounter a disk error.
-	 */
-	if (code) {
-		*ipp = NULL;
-		return code;
-	}
-	if (!ialloc_context && !ip) {
-		*ipp = NULL;
-		return XFS_ERROR(ENOSPC);
-	}
-
-	/*
-	 * If the AGI buffer is non-NULL, then we were unable to get an
-	 * inode in one operation.  We need to commit the current
-	 * transaction and call xfs_ialloc() again.  It is guaranteed
-	 * to succeed the second time.
-	 */
-	if (ialloc_context) {
-		/*
-		 * Normally, xfs_trans_commit releases all the locks.
-		 * We call bhold to hang on to the ialloc_context across
-		 * the commit.  Holding this buffer prevents any other
-		 * processes from doing any allocations in this
-		 * allocation group.
-		 */
-		xfs_trans_bhold(tp, ialloc_context);
-		/*
-		 * Save the log reservation and count so we can use
-		 * them in the next transaction.
-		 */
-		log_res = xfs_trans_get_log_res(tp);
-		log_count = xfs_trans_get_log_count(tp);
-
-		/*
-		 * We want the quota changes to be associated with the next
-		 * transaction, NOT this one. So, detach the dqinfo from this
-		 * and attach it to the next transaction.
-		 */
-		dqinfo = NULL;
-		tflags = 0;
-		if (tp->t_dqinfo) {
-			dqinfo = (void *)tp->t_dqinfo;
-			tp->t_dqinfo = NULL;
-			tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
-			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
-		}
-
-		ntp = xfs_trans_dup(tp);
-		code = xfs_trans_commit(tp, 0);
-		tp = ntp;
-		if (committed != NULL) {
-			*committed = 1;
-		}
-		/*
-		 * If we get an error during the commit processing,
-		 * release the buffer that is still held and return
-		 * to the caller.
-		 */
-		if (code) {
-			xfs_buf_relse(ialloc_context);
-			if (dqinfo) {
-				tp->t_dqinfo = dqinfo;
-				xfs_trans_free_dqinfo(tp);
-			}
-			*tpp = ntp;
-			*ipp = NULL;
-			return code;
-		}
-
-		/*
-		 * transaction commit worked ok so we can drop the extra ticket
-		 * reference that we gained in xfs_trans_dup()
-		 */
-		xfs_log_ticket_put(tp->t_ticket);
-		code = xfs_trans_reserve(tp, 0, log_res, 0,
-					 XFS_TRANS_PERM_LOG_RES, log_count);
-		/*
-		 * Re-attach the quota info that we detached from prev trx.
-		 */
-		if (dqinfo) {
-			tp->t_dqinfo = dqinfo;
-			tp->t_flags |= tflags;
-		}
-
-		if (code) {
-			xfs_buf_relse(ialloc_context);
-			*tpp = ntp;
-			*ipp = NULL;
-			return code;
-		}
-		xfs_trans_bjoin(tp, ialloc_context);
-
-		/*
-		 * Call ialloc again. Since we've locked out all
-		 * other allocations in this allocation group,
-		 * this call should always succeed.
-		 */
-		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
-				  okalloc, &ialloc_context, &ip);
-
-		/*
-		 * If we get an error at this point, return to the caller
-		 * so that the current transaction can be aborted.
-		 */
-		if (code) {
-			*tpp = tp;
-			*ipp = NULL;
-			return code;
-		}
-		ASSERT(!ialloc_context && ip);
-
-	} else {
-		if (committed != NULL)
-			*committed = 0;
-	}
-
-	*ipp = ip;
-	*tpp = tp;
-
-	return 0;
-}
-
-/*
- * Decrement the link count on an inode & log the change.
- * If this causes the link count to go to zero, initiate the
- * logging activity required to truncate a file.
- */
-int				/* error */
-xfs_droplink(
-	xfs_trans_t *tp,
-	xfs_inode_t *ip)
-{
-	int	error;
-
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-	ASSERT (ip->i_d.di_nlink > 0);
-	ip->i_d.di_nlink--;
-	drop_nlink(VFS_I(ip));
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-	error = 0;
-	if (ip->i_d.di_nlink == 0) {
-		/*
-		 * We're dropping the last link to this file.
-		 * Move the on-disk inode to the AGI unlinked list.
-		 * From xfs_inactive() we will pull the inode from
-		 * the list and free it.
-		 */
-		error = xfs_iunlink(tp, ip);
-	}
-	return error;
-}
-
-/*
- * This gets called when the inode's version needs to be changed from 1 to 2.
- * Currently this happens when the nlink field overflows the old 16-bit value
- * or when chproj is called to change the project for the first time.
- * As a side effect the superblock version will also get rev'd
- * to contain the NLINK bit.
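
The version-bump helper that follows uses a check, lock, re-check sequence so only the first caller ever modifies and logs the superblock feature bit. The same shape as a runnable user-space sketch, with a pthread mutex standing in for m_sb_lock and a bool standing in for the feature-bit test:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t sb_lock = PTHREAD_MUTEX_INITIALIZER;
	static bool sb_has_nlink;	/* models xfs_sb_version_hasnlink() */

	static void add_nlink_feature(void)
	{
		if (!sb_has_nlink) {		/* cheap unlocked check */
			pthread_mutex_lock(&sb_lock);
			if (!sb_has_nlink) {	/* re-check under the lock */
				sb_has_nlink = true;
				pthread_mutex_unlock(&sb_lock);
				printf("feature bit set, sb updated once\n");
			} else {
				pthread_mutex_unlock(&sb_lock);
			}
		}
	}

	int main(void)
	{
		add_nlink_feature();	/* performs the update */
		add_nlink_feature();	/* sees the bit already set */
		return 0;
	}
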
- */ -void -xfs_bump_ino_vers2( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - xfs_mount_t *mp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_d.di_version == 1); - - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - mp = tp->t_mountp; - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - xfs_sb_version_addnlink(&mp->m_sb); - spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, XFS_SB_VERSIONNUM); - } else { - spin_unlock(&mp->m_sb_lock); - } - } - /* Caller must log the inode */ -} - -/* - * Increment the link count on an inode & log the change. - */ -int -xfs_bumplink( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - - ASSERT(ip->i_d.di_nlink > 0); - ip->i_d.di_nlink++; - inc_nlink(VFS_I(ip)); - if ((ip->i_d.di_version == 1) && - (ip->i_d.di_nlink > XFS_MAXLINK_1)) { - /* - * The inode has increased its number of links beyond - * what can fit in an old format inode. It now needs - * to be converted to a version 2 inode with a 32 bit - * link count. If this is the first inode in the file - * system to do this, then we need to bump the superblock - * version number as well. - */ - xfs_bump_ino_vers2(tp, ip); - } - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return 0; -} diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h deleted file mode 100644 index 5eeab4690cfe..000000000000 --- a/fs/xfs/xfs_utils.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_UTILS_H__ -#define __XFS_UTILS_H__ - -extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t, - xfs_dev_t, prid_t, int, xfs_inode_t **, int *); -extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); -extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); -extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); - -#endif /* __XFS_UTILS_H__ */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c deleted file mode 100644 index dc730ac272be..000000000000 --- a/fs/xfs/xfs_vnodeops.c +++ /dev/null @@ -1,1870 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * Copyright (c) 2012 Red Hat, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_ialloc.h" -#include "xfs_alloc.h" -#include "xfs_bmap.h" -#include "xfs_acl.h" -#include "xfs_attr.h" -#include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_rtalloc.h" -#include "xfs_trans_space.h" -#include "xfs_log_priv.h" -#include "xfs_filestream.h" -#include "xfs_vnodeops.h" -#include "xfs_trace.h" -#include "xfs_icache.h" -#include "xfs_symlink.h" - - -/* - * This is called by xfs_inactive to free any blocks beyond eof - * when the link count isn't zero and by xfs_dm_punch_hole() when - * punching a hole to EOF. - */ -int -xfs_free_eofblocks( - xfs_mount_t *mp, - xfs_inode_t *ip, - bool need_iolock) -{ - xfs_trans_t *tp; - int error; - xfs_fileoff_t end_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t map_len; - int nimaps; - xfs_bmbt_irec_t imap; - - /* - * Figure out if there are any blocks beyond the end - * of the file. If not, then there is nothing to do. - */ - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); - last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (last_fsb <= end_fsb) - return 0; - map_len = last_fsb - end_fsb; - - nimaps = 1; - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!error && (nimaps != 0) && - (imap.br_startblock != HOLESTARTBLOCK || - ip->i_delayed_blks)) { - /* - * Attach the dquots to the inode up front. - */ - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; - - /* - * There are blocks after the end of file. - * Free them up now by truncating the file to - * its current size. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp, 0); - return EAGAIN; - } - } - - error = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - /* - * Do not update the on-disk file size. If we update the - * on-disk file size and then the system crashes before the - * contents of the file are flushed to disk then the files - * may be full of holes (ie NULL files bug). - */ - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, - XFS_ISIZE(ip)); - if (error) { - /* - * If we get an error at this point we simply don't - * bother truncating the file. 
-		 */
-		xfs_trans_cancel(tp,
-				 (XFS_TRANS_RELEASE_LOG_RES |
-				  XFS_TRANS_ABORT));
-	} else {
-		error = xfs_trans_commit(tp,
-					XFS_TRANS_RELEASE_LOG_RES);
-		if (!error)
-			xfs_inode_clear_eofblocks_tag(ip);
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	if (need_iolock)
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	}
-	return error;
-}
-
-int
-xfs_release(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		error;
-
-	if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
-		return 0;
-
-	/* If this is a read-only mount, don't do this (would generate I/O) */
-	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return 0;
-
-	if (!XFS_FORCED_SHUTDOWN(mp)) {
-		int truncated;
-
-		/*
-		 * If we are using filestreams, and we have an unlinked
-		 * file that we are processing the last close on, then nothing
-		 * will be able to reopen and write to this file. Purge this
-		 * inode from the filestreams cache so that it doesn't delay
-		 * teardown of the inode.
-		 */
-		if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
-			xfs_filestream_deassociate(ip);
-
-		/*
-		 * If we previously truncated this file and removed old data
-		 * in the process, we want to initiate "early" writeout on
-		 * the last close.  This is an attempt to combat the notorious
-		 * NULL files problem which is particularly noticeable from a
-		 * truncate down, buffered (re-)write (delalloc), followed by
-		 * a crash.  What we are effectively doing here is
-		 * significantly reducing the time window where we'd otherwise
-		 * be exposed to that problem.
-		 */
-		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
-		if (truncated) {
-			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
-			if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
-				error = -filemap_flush(VFS_I(ip)->i_mapping);
-				if (error)
-					return error;
-			}
-		}
-	}
-
-	if (ip->i_d.di_nlink == 0)
-		return 0;
-
-	if (xfs_can_free_eofblocks(ip, false)) {
-
-		/*
-		 * If we can't get the iolock just skip truncating the blocks
-		 * past EOF because we could deadlock with the mmap_sem
-		 * otherwise. We'll get another chance to drop them once the
-		 * last reference to the inode is dropped, so we'll never leak
-		 * blocks permanently.
-		 *
-		 * Further, if the inode is being opened, written and closed
-		 * frequently and we have delayed allocation blocks outstanding
-		 * (e.g. streaming writes from the NFS server), truncating the
-		 * blocks past EOF will cause fragmentation to occur.
-		 *
-		 * In this case don't do the truncation, either, but we have to
-		 * be careful how we detect this case. Blocks beyond EOF show
-		 * up as i_delayed_blks even when the inode is clean, so we
-		 * need to truncate them away first before checking for a dirty
-		 * release. Hence on the first dirty close we will still remove
-		 * the speculative allocation, but after that we will leave it
-		 * in place.
-		 */
-		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
-			return 0;
-
-		error = xfs_free_eofblocks(mp, ip, true);
-		if (error && error != EAGAIN)
-			return error;
-
-		/* delalloc blocks after truncation means it really is dirty */
-		if (ip->i_delayed_blks)
-			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
-	}
-	return 0;
-}
-
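
xfs_release() above deliberately skips trimming speculative post-EOF blocks when the iolock cannot be taken, rather than risk deadlocking against mmap_sem; the trim is retried when the last reference is dropped. A runnable user-space sketch of that trylock-or-skip shape (a pthread mutex standing in for the iolock):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t iolock = PTHREAD_MUTEX_INITIALIZER;

	/* models the optional cleanup: do it only if the lock is free */
	static int trim_eofblocks_if_idle(void)
	{
		if (pthread_mutex_trylock(&iolock) != 0)
			return 0;	/* contended: skip, retry later */

		/* trim the speculative post-EOF blocks here */
		pthread_mutex_unlock(&iolock);
		return 1;
	}

	int main(void)
	{
		printf("idle: trimmed=%d\n", trim_eofblocks_if_idle());

		pthread_mutex_lock(&iolock);	/* simulate a writer */
		printf("contended: trimmed=%d\n", trim_eofblocks_if_idle());
		pthread_mutex_unlock(&iolock);
		return 0;
	}
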
-/*
- * xfs_inactive
- *
- * This is called when the reference count for the vnode
- * goes to zero.  If the file has been unlinked, then it must
- * now be truncated.  Also, we clear all of the read-ahead state
- * kept for the inode here since the file is now closed.
- */
-int
-xfs_inactive(
-	xfs_inode_t	*ip)
-{
-	xfs_bmap_free_t	free_list;
-	xfs_fsblock_t	first_block;
-	int		committed;
-	xfs_trans_t	*tp;
-	xfs_mount_t	*mp;
-	int		error;
-	int		truncate = 0;
-
-	/*
-	 * If the inode is already free, then there can be nothing
-	 * to clean up here.
-	 */
-	if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
-		ASSERT(ip->i_df.if_real_bytes == 0);
-		ASSERT(ip->i_df.if_broot_bytes == 0);
-		return VN_INACTIVE_CACHE;
-	}
-
-	mp = ip->i_mount;
-
-	error = 0;
-
-	/* If this is a read-only mount, don't do this (would generate I/O) */
-	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		goto out;
-
-	if (ip->i_d.di_nlink != 0) {
-		/*
-		 * force is true because we are evicting an inode from the
-		 * cache. Post-eof blocks must be freed, lest we end up with
-		 * broken free space accounting.
-		 */
-		if (xfs_can_free_eofblocks(ip, true)) {
-			error = xfs_free_eofblocks(mp, ip, false);
-			if (error)
-				return VN_INACTIVE_CACHE;
-		}
-		goto out;
-	}
-
-	if (S_ISREG(ip->i_d.di_mode) &&
-	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
-	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
-		truncate = 1;
-
-	error = xfs_qm_dqattach(ip, 0);
-	if (error)
-		return VN_INACTIVE_CACHE;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-	error = xfs_trans_reserve(tp, 0,
-			(truncate || S_ISLNK(ip->i_d.di_mode)) ?
-				XFS_ITRUNCATE_LOG_RES(mp) :
-				XFS_IFREE_LOG_RES(mp),
-			0,
-			XFS_TRANS_PERM_LOG_RES,
-			XFS_ITRUNCATE_LOG_COUNT);
-	if (error) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		xfs_trans_cancel(tp, 0);
-		return VN_INACTIVE_CACHE;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, 0);
-
-	if (S_ISLNK(ip->i_d.di_mode)) {
-		error = xfs_inactive_symlink(ip, &tp);
-		if (error)
-			goto out_cancel;
-	} else if (truncate) {
-		ip->i_d.di_size = 0;
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
-		if (error)
-			goto out_cancel;
-
-		ASSERT(ip->i_d.di_nextents == 0);
-	}
-
-	/*
-	 * If there are attributes associated with the file then blow them away
-	 * now.  The code calls a routine that recursively deconstructs the
-	 * attribute fork. We need to just commit the current transaction
-	 * because we can't use it for xfs_attr_inactive().
-	 */
-	if (ip->i_d.di_anextents > 0) {
-		ASSERT(ip->i_d.di_forkoff != 0);
-
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-		if (error)
-			goto out_unlock;
-
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-		error = xfs_attr_inactive(ip);
-		if (error)
-			goto out;
-
-		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-		error = xfs_trans_reserve(tp, 0,
-					  XFS_IFREE_LOG_RES(mp),
-					  0, XFS_TRANS_PERM_LOG_RES,
-					  XFS_INACTIVE_LOG_COUNT);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			goto out;
-		}
-
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, 0);
-	}
-
-	if (ip->i_afp)
-		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-	ASSERT(ip->i_d.di_anextents == 0);
-
-	/*
-	 * Free the inode.
-	 */
-	xfs_bmap_init(&free_list, &first_block);
-	error = xfs_ifree(tp, ip, &free_list);
-	if (error) {
-		/*
-		 * If we fail to free the inode, shut down.  The cancel
-		 * might do that; we need to make sure.  Otherwise the
-		 * inode might be lost for a long time or forever.
-		 */
-		if (!XFS_FORCED_SHUTDOWN(mp)) {
-			xfs_notice(mp, "%s: xfs_ifree returned error %d",
-				__func__, error);
-			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-		}
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-	} else {
-		/*
-		 * Credit the quota account(s). The inode is gone.
-		 */
-		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
-
-		/*
-		 * Just ignore errors at this point.  There is nothing we can
-		 * do except to try to keep going. Make sure it's not a silent
-		 * error.
-		 */
-		error = xfs_bmap_finish(&tp, &free_list, &committed);
-		if (error)
-			xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
-				__func__, error);
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-		if (error)
-			xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
-				__func__, error);
-	}
-
-	/*
-	 * Release the dquots held by inode, if any.
-	 */
-	xfs_qm_dqdetach(ip);
-out_unlock:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out:
-	return VN_INACTIVE_CACHE;
-out_cancel:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-	goto out_unlock;
-}
-
-/*
- * Looks up an inode from "name". If ci_name is not NULL, then a CI match
- * is allowed, otherwise it has to be an exact match. If a CI match is found,
- * ci_name->name will point to the actual name (caller must free) or
- * will be set to NULL if an exact match is found.
- */
-int
-xfs_lookup(
-	xfs_inode_t		*dp,
-	struct xfs_name		*name,
-	xfs_inode_t		**ipp,
-	struct xfs_name		*ci_name)
-{
-	xfs_ino_t		inum;
-	int			error;
-	uint			lock_mode;
-
-	trace_xfs_lookup(dp, name);
-
-	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return XFS_ERROR(EIO);
-
-	lock_mode = xfs_ilock_map_shared(dp);
-	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
-	xfs_iunlock_map_shared(dp, lock_mode);
-
-	if (error)
-		goto out;
-
-	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
-	if (error)
-		goto out_free_name;
-
-	return 0;
-
-out_free_name:
-	if (ci_name)
-		kmem_free(ci_name->name);
-out:
-	*ipp = NULL;
-	return error;
-}
-
-int
-xfs_create(
-	xfs_inode_t		*dp,
-	struct xfs_name		*name,
-	umode_t			mode,
-	xfs_dev_t		rdev,
-	xfs_inode_t		**ipp)
-{
-	int			is_dir = S_ISDIR(mode);
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_inode	*ip = NULL;
-	struct xfs_trans	*tp = NULL;
-	int			error;
-	xfs_bmap_free_t		free_list;
-	xfs_fsblock_t		first_block;
-	bool			unlock_dp_on_error = false;
-	uint			cancel_flags;
-	int			committed;
-	prid_t			prid;
-	struct xfs_dquot	*udqp = NULL;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
-	uint			resblks;
-	uint			log_res;
-	uint			log_count;
-
-	trace_xfs_create(dp, name);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = xfs_get_projid(dp);
-	else
-		prid = XFS_PROJID_DEFAULT;
-
-	/*
-	 * Make sure that we have allocated dquot(s) on disk.
-	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-					XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-					&udqp, &gdqp, &pdqp);
-	if (error)
-		return error;
-
-	if (is_dir) {
-		rdev = 0;
-		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
-		log_res = XFS_MKDIR_LOG_RES(mp);
-		log_count = XFS_MKDIR_LOG_COUNT;
-		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
-	} else {
-		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
-		log_res = XFS_CREATE_LOG_RES(mp);
-		log_count = XFS_CREATE_LOG_COUNT;
-		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
-	}
-
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
-	/*
-	 * Initially assume that the file does not exist and
-	 * reserve the resources for that case.  If that is not
-	 * the case we'll drop the one we have and get a more
-	 * appropriate transaction later.
-
-int
-xfs_create(
-	xfs_inode_t		*dp,
-	struct xfs_name		*name,
-	umode_t			mode,
-	xfs_dev_t		rdev,
-	xfs_inode_t		**ipp)
-{
-	int			is_dir = S_ISDIR(mode);
-	struct xfs_mount	*mp = dp->i_mount;
-	struct xfs_inode	*ip = NULL;
-	struct xfs_trans	*tp = NULL;
-	int			error;
-	xfs_bmap_free_t		free_list;
-	xfs_fsblock_t		first_block;
-	bool			unlock_dp_on_error = false;
-	uint			cancel_flags;
-	int			committed;
-	prid_t			prid;
-	struct xfs_dquot	*udqp = NULL;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
-	uint			resblks;
-	uint			log_res;
-	uint			log_count;
-
-	trace_xfs_create(dp, name);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = xfs_get_projid(dp);
-	else
-		prid = XFS_PROJID_DEFAULT;
-
-	/*
-	 * Make sure that we have allocated dquot(s) on disk.
-	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-					XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-					&udqp, &gdqp, &pdqp);
-	if (error)
-		return error;
-
-	if (is_dir) {
-		rdev = 0;
-		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
-		log_res = XFS_MKDIR_LOG_RES(mp);
-		log_count = XFS_MKDIR_LOG_COUNT;
-		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
-	} else {
-		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
-		log_res = XFS_CREATE_LOG_RES(mp);
-		log_count = XFS_CREATE_LOG_COUNT;
-		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
-	}
-
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
-	/*
-	 * Initially assume that the file does not exist and
-	 * reserve the resources for that case. If that is not
-	 * the case we'll drop the one we have and get a more
-	 * appropriate transaction later.
-	 */
-	error = xfs_trans_reserve(tp, resblks, log_res, 0,
-			XFS_TRANS_PERM_LOG_RES, log_count);
-	if (error == ENOSPC) {
-		/* flush outstanding delalloc blocks and retry */
-		xfs_flush_inodes(mp);
-		error = xfs_trans_reserve(tp, resblks, log_res, 0,
-				XFS_TRANS_PERM_LOG_RES, log_count);
-	}
-	if (error == ENOSPC) {
-		/* No space at all so try a "no-allocation" reservation */
-		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, log_res, 0,
-				XFS_TRANS_PERM_LOG_RES, log_count);
-	}
-	if (error) {
-		cancel_flags = 0;
-		goto out_trans_cancel;
-	}
-
-	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-	unlock_dp_on_error = true;
-
-	xfs_bmap_init(&free_list, &first_block);
-
-	/*
-	 * Reserve disk quota and the inode.
-	 */
-	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
-						pdqp, resblks, 1, 0);
-	if (error)
-		goto out_trans_cancel;
-
-	error = xfs_dir_canenter(tp, dp, name, resblks);
-	if (error)
-		goto out_trans_cancel;
-
-	/*
-	 * A newly created regular or special file just has one directory
-	 * entry pointing to it, but a directory also has the "." entry
-	 * pointing to itself.
-	 */
-	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
-			       prid, resblks > 0, &ip, &committed);
-	if (error) {
-		if (error == ENOSPC)
-			goto out_trans_cancel;
-		goto out_trans_abort;
-	}
-
-	/*
-	 * Now we join the directory inode to the transaction. We do not do it
-	 * earlier because xfs_dir_ialloc might commit the previous transaction
-	 * (and release all the locks). An error from here on will result in
-	 * the transaction cancel unlocking dp so don't do it explicitly in the
-	 * error path.
-	 */
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-	unlock_dp_on_error = false;
-
-	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
-					&first_block, &free_list, resblks ?
-					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
-	if (error) {
-		ASSERT(error != ENOSPC);
-		goto out_trans_abort;
-	}
-	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
-	if (is_dir) {
-		error = xfs_dir_init(tp, ip, dp);
-		if (error)
-			goto out_bmap_cancel;
-
-		error = xfs_bumplink(tp, dp);
-		if (error)
-			goto out_bmap_cancel;
-	}
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * create transaction goes to disk before returning to
-	 * the user.
-	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
-		xfs_trans_set_sync(tp);
-
-	/*
-	 * Attach the dquot(s) to the inodes and modify them incore.
-	 * The ids of the inode couldn't have changed since the new
-	 * inode has been locked ever since it was created.
-	 */
-	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
-
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error)
-		goto out_bmap_cancel;
-
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (error)
-		goto out_release_inode;
-
-	xfs_qm_dqrele(udqp);
-	xfs_qm_dqrele(gdqp);
-	xfs_qm_dqrele(pdqp);
-
-	*ipp = ip;
-	return 0;
-
- out_bmap_cancel:
-	xfs_bmap_cancel(&free_list);
- out_trans_abort:
-	cancel_flags |= XFS_TRANS_ABORT;
- out_trans_cancel:
-	xfs_trans_cancel(tp, cancel_flags);
- out_release_inode:
-	/*
-	 * Wait until after the current transaction is aborted to
-	 * release the inode. This prevents recursive transactions
-	 * and deadlocks from xfs_inactive.
-	 */
-	if (ip)
-		IRELE(ip);
-
-	xfs_qm_dqrele(udqp);
-	xfs_qm_dqrele(gdqp);
-	xfs_qm_dqrele(pdqp);
-
-	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	return error;
-}
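xfs_create() above degrades its reservation in steps: the full block count, one retry after flushing delayed allocations, then a zero-block "no-allocation" reservation. A toy userspace sketch of that fallback ladder; reserve() and flush_delalloc() are illustrative stand-ins for xfs_trans_reserve() and xfs_flush_inodes(), not their real signatures:

#include <errno.h>
#include <stdio.h>

static unsigned int free_blocks = 4;	/* pretend the fs is nearly full */

static int reserve(unsigned int blocks)
{
	if (blocks > free_blocks)
		return ENOSPC;
	free_blocks -= blocks;
	return 0;
}

static void flush_delalloc(void)
{
	free_blocks += 2;		/* flushing may free a little space */
}

/*
 * The xfs_create() ladder: full reservation, one retry after flushing
 * delayed allocations, then a "no-allocation" reservation of 0 blocks.
 */
static int reserve_with_fallback(unsigned int *blocks)
{
	int error = reserve(*blocks);

	if (error == ENOSPC) {
		flush_delalloc();
		error = reserve(*blocks);
	}
	if (error == ENOSPC) {
		*blocks = 0;		/* caller must avoid new allocations */
		error = reserve(0);
	}
	return error;
}

int main(void)
{
	unsigned int blocks = 8;
	int error = reserve_with_fallback(&blocks);

	printf("error=%d, granted=%u\n", error, blocks);
	return 0;
}

The point of the last rung is that a create can often still succeed without allocating new blocks (the directory may have free space in an existing block), so total failure is deferred as long as possible.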
-
-#ifdef DEBUG
-int xfs_locked_n;
-int xfs_small_retries;
-int xfs_middle_retries;
-int xfs_lots_retries;
-int xfs_lock_delays;
-#endif
-
-/*
- * Bump the subclass so xfs_lock_inodes() acquires each lock with
- * a different value
- */
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
-{
-	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
-	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
-		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
-
-	return lock_mode;
-}
-
-/*
- * The following routine will lock n inodes in exclusive mode.
- * We assume the caller calls us with the inodes in i_ino order.
- *
- * We need to detect deadlock where an inode that we lock
- * is in the AIL and we start waiting for another inode that is locked
- * by a thread in a long running transaction (such as truncate). This can
- * result in deadlock since the long running trans might need to wait
- * for the inode we just locked in order to push the tail and free space
- * in the log.
- */
-void
-xfs_lock_inodes(
-	xfs_inode_t	**ips,
-	int		inodes,
-	uint		lock_mode)
-{
-	int		attempts = 0, i, j, try_lock;
-	xfs_log_item_t	*lp;
-
-	ASSERT(ips && (inodes >= 2)); /* we need at least two */
-
-	try_lock = 0;
-	i = 0;
-
-again:
-	for (; i < inodes; i++) {
-		ASSERT(ips[i]);
-
-		if (i && (ips[i] == ips[i-1]))	/* Already locked */
-			continue;
-
-		/*
-		 * If try_lock is not set yet, make sure all locked inodes
-		 * are not in the AIL.
-		 * If any are, set try_lock to be used later.
-		 */
-
-		if (!try_lock) {
-			for (j = (i - 1); j >= 0 && !try_lock; j--) {
-				lp = (xfs_log_item_t *)ips[j]->i_itemp;
-				if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
-					try_lock++;
-				}
-			}
-		}
-
-		/*
-		 * If any of the inodes we have already locked is in the AIL,
-		 * we must TRY to get the second and subsequent locks. If
-		 * we can't get any, we must release all we have
-		 * and try again.
-		 */
-
-		if (try_lock) {
-			/* try_lock must be 0 if i is 0. */
-			/*
-			 * try_lock means we have an inode locked
-			 * that is in the AIL.
-			 */
-			ASSERT(i != 0);
-			if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
-				attempts++;
-
-				/*
-				 * Unlock all previous guys and try again.
-				 * xfs_iunlock will try to push the tail
-				 * if the inode is in the AIL.
-				 */
-
-				for (j = i - 1; j >= 0; j--) {
-
-					/*
-					 * Check to see if we've already
-					 * unlocked this one.
-					 * Not the first one going back,
-					 * and the inode ptr is the same.
-					 */
-					if ((j != (i - 1)) && ips[j] ==
-								ips[j+1])
-						continue;
-
-					xfs_iunlock(ips[j], lock_mode);
-				}
-
-				if ((attempts % 5) == 0) {
-					delay(1); /* Don't just spin the CPU */
-#ifdef DEBUG
-					xfs_lock_delays++;
-#endif
-				}
-				i = 0;
-				try_lock = 0;
-				goto again;
-			}
-		} else {
-			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
-		}
-	}
-
-#ifdef DEBUG
-	if (attempts) {
-		if (attempts < 5) xfs_small_retries++;
-		else if (attempts < 100) xfs_middle_retries++;
-		else xfs_lots_retries++;
-	} else {
-		xfs_locked_n++;
-	}
-#endif
-}
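The same deadlock-avoidance shape exists outside the kernel: impose a global lock order, trylock the later lock, and on failure drop everything and retry so the other holder can make progress. A pthreads sketch, ordering by inode number as xfs_lock_inodes() assumes (struct toy_inode and lock_two() are illustrative, not XFS code):

#include <pthread.h>
#include <sched.h>
#include <stdint.h>

struct toy_inode {
	uint64_t	ino;
	pthread_mutex_t	lock;
};

/* Lock two "inodes" in ino order, backing off like xfs_lock_two_inodes(). */
static void lock_two(struct toy_inode *a, struct toy_inode *b)
{
	struct toy_inode *tmp;

	if (a->ino > b->ino) {		/* impose a global order first */
		tmp = a; a = b; b = tmp;
	}

	for (;;) {
		pthread_mutex_lock(&a->lock);
		if (pthread_mutex_trylock(&b->lock) == 0)
			return;		/* got both, in order */
		/*
		 * Couldn't get the second: release the first and retry so
		 * whoever holds b can make progress (the analogue of letting
		 * xfs_iunlock() push the AIL tail).
		 */
		pthread_mutex_unlock(&a->lock);
		sched_yield();
	}
}

int main(void)
{
	struct toy_inode x = { 1, PTHREAD_MUTEX_INITIALIZER };
	struct toy_inode y = { 2, PTHREAD_MUTEX_INITIALIZER };

	lock_two(&y, &x);	/* argument order doesn't matter */
	pthread_mutex_unlock(&x.lock);
	pthread_mutex_unlock(&y.lock);
	return 0;
}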
-
-/*
- * xfs_lock_two_inodes() can only be used to lock one type of lock
- * at a time - the iolock or the ilock, but not both at once. If
- * we lock both at once, lockdep will report false positives saying
- * we have violated locking orders.
- */
-void
-xfs_lock_two_inodes(
-	xfs_inode_t	*ip0,
-	xfs_inode_t	*ip1,
-	uint		lock_mode)
-{
-	xfs_inode_t	*temp;
-	int		attempts = 0;
-	xfs_log_item_t	*lp;
-
-	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
-	ASSERT(ip0->i_ino != ip1->i_ino);
-
-	if (ip0->i_ino > ip1->i_ino) {
-		temp = ip0;
-		ip0 = ip1;
-		ip1 = temp;
-	}
-
- again:
-	xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
-
-	/*
-	 * If the first lock we have locked is in the AIL, we must TRY to get
-	 * the second lock. If we can't get it, we must release the first one
-	 * and try again.
-	 */
-	lp = (xfs_log_item_t *)ip0->i_itemp;
-	if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
-		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
-			xfs_iunlock(ip0, lock_mode);
-			if ((++attempts % 5) == 0)
-				delay(1); /* Don't just spin the CPU */
-			goto again;
-		}
-	} else {
-		xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
-	}
-}
-
-int
-xfs_remove(
-	xfs_inode_t		*dp,
-	struct xfs_name		*name,
-	xfs_inode_t		*ip)
-{
-	xfs_mount_t		*mp = dp->i_mount;
-	xfs_trans_t		*tp = NULL;
-	int			is_dir = S_ISDIR(ip->i_d.di_mode);
-	int			error = 0;
-	xfs_bmap_free_t		free_list;
-	xfs_fsblock_t		first_block;
-	int			cancel_flags;
-	int			committed;
-	int			link_zero;
-	uint			resblks;
-	uint			log_count;
-
-	trace_xfs_remove(dp, name);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	error = xfs_qm_dqattach(dp, 0);
-	if (error)
-		goto std_return;
-
-	error = xfs_qm_dqattach(ip, 0);
-	if (error)
-		goto std_return;
-
-	if (is_dir) {
-		tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
-		log_count = XFS_DEFAULT_LOG_COUNT;
-	} else {
-		tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-		log_count = XFS_REMOVE_LOG_COUNT;
-	}
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
-	/*
-	 * We try to get the real space reservation first,
-	 * allowing for directory btree deletion(s) implying
-	 * possible bmap insert(s).  If we can't get the space
-	 * reservation then we use 0 instead, and avoid the bmap
-	 * btree insert(s) in the directory code by, if the bmap
-	 * insert tries to happen, instead trimming the LAST
-	 * block from the directory.
-	 */
-	resblks = XFS_REMOVE_SPACE_RES(mp);
-	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
-				  XFS_TRANS_PERM_LOG_RES, log_count);
-	if (error == ENOSPC) {
-		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
-					  XFS_TRANS_PERM_LOG_RES, log_count);
-	}
-	if (error) {
-		ASSERT(error != ENOSPC);
-		cancel_flags = 0;
-		goto out_trans_cancel;
-	}
-
-	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-	/*
-	 * If we're removing a directory perform some additional validation.
-	 */
-	if (is_dir) {
-		ASSERT(ip->i_d.di_nlink >= 2);
-		if (ip->i_d.di_nlink != 2) {
-			error = XFS_ERROR(ENOTEMPTY);
-			goto out_trans_cancel;
-		}
-		if (!xfs_dir_isempty(ip)) {
-			error = XFS_ERROR(ENOTEMPTY);
-			goto out_trans_cancel;
-		}
-	}
-
-	xfs_bmap_init(&free_list, &first_block);
-	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
-					&first_block, &free_list, resblks);
-	if (error) {
-		ASSERT(error != ENOENT);
-		goto out_bmap_cancel;
-	}
-	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-	if (is_dir) {
-		/*
-		 * Drop the link from ip's "..".
-		 */
-		error = xfs_droplink(tp, dp);
-		if (error)
-			goto out_bmap_cancel;
-
-		/*
-		 * Drop the "." link from ip to self.
- */
-		error = xfs_droplink(tp, ip);
-		if (error)
-			goto out_bmap_cancel;
-	} else {
-		/*
-		 * When removing a non-directory we need to log the parent
-		 * inode here.  For a directory this is done implicitly
-		 * by the xfs_droplink call for the ".." entry.
-		 */
-		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-	}
-
-	/*
-	 * Drop the link from dp to ip.
-	 */
-	error = xfs_droplink(tp, ip);
-	if (error)
-		goto out_bmap_cancel;
-
-	/*
-	 * Determine if this is the last link while
-	 * we are in the transaction.
-	 */
-	link_zero = (ip->i_d.di_nlink == 0);
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * remove transaction goes to disk before returning to
-	 * the user.
-	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
-		xfs_trans_set_sync(tp);
-
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error)
-		goto out_bmap_cancel;
-
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (error)
-		goto std_return;
-
-	/*
-	 * If we are using filestreams, kill the stream association.
-	 * If the file is still open it may get a new one but that
-	 * will get killed on last close in xfs_close() so we don't
-	 * have to worry about that.
-	 */
-	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
-		xfs_filestream_deassociate(ip);
-
-	return 0;
-
- out_bmap_cancel:
-	xfs_bmap_cancel(&free_list);
-	cancel_flags |= XFS_TRANS_ABORT;
- out_trans_cancel:
-	xfs_trans_cancel(tp, cancel_flags);
- std_return:
-	return error;
-}
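The di_nlink != 2 shortcut in xfs_remove() above works because an empty directory holds exactly two links: its own "." and the entry in its parent (each subdirectory's ".." adds one more, while plain files add none, hence the separate emptiness scan). A small sketch of that invariant in isolation; struct toy_dir and can_rmdir() are hypothetical:

#include <errno.h>
#include <stdio.h>

/* Toy directory: link count plus a count of real (non "."/"..") entries. */
struct toy_dir {
	unsigned int	nlink;		/* "." + parent entry + one per subdir */
	unsigned int	entries;	/* names other than "." and ".." */
};

/*
 * Mirror the xfs_remove() checks: nlink > 2 alone proves a subdirectory
 * exists; nlink == 2 still requires a scan because regular files do not
 * contribute to the directory's link count.
 */
static int can_rmdir(const struct toy_dir *d)
{
	if (d->nlink != 2)
		return ENOTEMPTY;	/* has at least one subdirectory */
	if (d->entries != 0)
		return ENOTEMPTY;	/* has files, even though nlink == 2 */
	return 0;
}

int main(void)
{
	struct toy_dir with_subdir = { 3, 1 };
	struct toy_dir with_file = { 2, 1 };
	struct toy_dir empty = { 2, 0 };

	printf("%d %d %d\n", can_rmdir(&with_subdir),
	       can_rmdir(&with_file), can_rmdir(&empty));
	return 0;
}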
-
-int
-xfs_link(
-	xfs_inode_t		*tdp,
-	xfs_inode_t		*sip,
-	struct xfs_name		*target_name)
-{
-	xfs_mount_t		*mp = tdp->i_mount;
-	xfs_trans_t		*tp;
-	int			error;
-	xfs_bmap_free_t		free_list;
-	xfs_fsblock_t		first_block;
-	int			cancel_flags;
-	int			committed;
-	int			resblks;
-
-	trace_xfs_link(tdp, target_name);
-
-	ASSERT(!S_ISDIR(sip->i_d.di_mode));
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	error = xfs_qm_dqattach(sip, 0);
-	if (error)
-		goto std_return;
-
-	error = xfs_qm_dqattach(tdp, 0);
-	if (error)
-		goto std_return;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
-	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
-	if (error == ENOSPC) {
-		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
-				XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
-	}
-	if (error) {
-		cancel_flags = 0;
-		goto error_return;
-	}
-
-	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
-
-	/*
-	 * If we are using project inheritance, we only allow hard link
-	 * creation in our tree when the project IDs are the same; else
-	 * the tree quota mechanism could be circumvented.
-	 */
-	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
-		error = XFS_ERROR(EXDEV);
-		goto error_return;
-	}
-
-	error = xfs_dir_canenter(tp, tdp, target_name, resblks);
-	if (error)
-		goto error_return;
-
-	xfs_bmap_init(&free_list, &first_block);
-
-	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
-					&first_block, &free_list, resblks);
-	if (error)
-		goto abort_return;
-	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
-
-	error = xfs_bumplink(tp, sip);
-	if (error)
-		goto abort_return;
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * link transaction goes to disk before returning to
-	 * the user.
-	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-		xfs_trans_set_sync(tp);
-	}
-
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error) {
-		xfs_bmap_cancel(&free_list);
-		goto abort_return;
-	}
-
-	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
- abort_return:
-	cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-	xfs_trans_cancel(tp, cancel_flags);
- std_return:
-	return error;
-}
-
-int
-xfs_set_dmattrs(
-	xfs_inode_t	*ip,
-	u_int		evmask,
-	u_int16_t	state)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_trans_t	*tp;
-	int		error;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
-	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-	ip->i_d.di_dmevmask = evmask;
-	ip->i_d.di_dmstate  = state;
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	error = xfs_trans_commit(tp, 0);
-
-	return error;
-}
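The EXDEV check in xfs_link() above enforces directory-tree quotas: a hard link must not let a file's blocks escape the project that is charged for them. A sketch of just that rule; the flag constant and field names are illustrative stand-ins for XFS_DIFLAG_PROJINHERIT and the on-disk project ID:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PROJINHERIT	0x1	/* stands in for XFS_DIFLAG_PROJINHERIT */

struct toy_inode {
	unsigned int	flags;
	uint32_t	projid;
};

/*
 * Mirror the xfs_link() rule: if the target directory propagates its
 * project ID to new children, refuse to link in a file from another
 * project; otherwise that file's usage would dodge the tree quota.
 */
static int check_link_projid(const struct toy_inode *dir,
			     const struct toy_inode *src)
{
	if ((dir->flags & TOY_PROJINHERIT) && dir->projid != src->projid)
		return EXDEV;	/* reported as a cross-device link */
	return 0;
}

int main(void)
{
	struct toy_inode dir = { TOY_PROJINHERIT, 42 };
	struct toy_inode same = { 0, 42 };
	struct toy_inode other = { 0, 7 };

	printf("%d %d\n", check_link_projid(&dir, &same),
	       check_link_projid(&dir, &other));
	return 0;
}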
-
-/*
- * xfs_alloc_file_space()
- *	This routine allocates disk space for the given file.
- *
- *	If alloc_type == 0, this request is for an ALLOCSP type
- *	request which will change the file size.  In this case, no
- *	DMAPI event will be generated by the call.  A TRUNCATE event
- *	will be generated later by xfs_setattr.
- *
- *	If alloc_type != 0, this request is for a RESVSP type
- *	request, and a DMAPI DM_EVENT_WRITE will be generated if the
- *	lower block boundary byte address is less than the file's
- *	length.
- *
- * RETURNS:
- *	0 on success
- *	errno on error
- *
- */
-STATIC int
-xfs_alloc_file_space(
-	xfs_inode_t		*ip,
-	xfs_off_t		offset,
-	xfs_off_t		len,
-	int			alloc_type,
-	int			attr_flags)
-{
-	xfs_mount_t		*mp = ip->i_mount;
-	xfs_off_t		count;
-	xfs_filblks_t		allocated_fsb;
-	xfs_filblks_t		allocatesize_fsb;
-	xfs_extlen_t		extsz, temp;
-	xfs_fileoff_t		startoffset_fsb;
-	xfs_fsblock_t		firstfsb;
-	int			nimaps;
-	int			quota_flag;
-	int			rt;
-	xfs_trans_t		*tp;
-	xfs_bmbt_irec_t		imaps[1], *imapp;
-	xfs_bmap_free_t		free_list;
-	uint			qblocks, resblks, resrtextents;
-	int			committed;
-	int			error;
-
-	trace_xfs_alloc_file_space(ip);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	error = xfs_qm_dqattach(ip, 0);
-	if (error)
-		return error;
-
-	if (len <= 0)
-		return XFS_ERROR(EINVAL);
-
-	rt = XFS_IS_REALTIME_INODE(ip);
-	extsz = xfs_get_extsz_hint(ip);
-
-	count = len;
-	imapp = &imaps[0];
-	nimaps = 1;
-	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
-	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
-
-	/*
-	 * Allocate file space until done or until there is an error
-	 */
-	while (allocatesize_fsb && !error) {
-		xfs_fileoff_t	s, e;
-
-		/*
-		 * Determine space reservations for data/realtime.
-		 */
-		if (unlikely(extsz)) {
-			s = startoffset_fsb;
-			do_div(s, extsz);
-			s *= extsz;
-			e = startoffset_fsb + allocatesize_fsb;
-			if ((temp = do_mod(startoffset_fsb, extsz)))
-				e += temp;
-			if ((temp = do_mod(e, extsz)))
-				e += extsz - temp;
-		} else {
-			s = 0;
-			e = allocatesize_fsb;
-		}
-
-		/*
-		 * The transaction reservation is limited to a 32-bit block
-		 * count, hence we need to limit the number of blocks we are
-		 * trying to reserve to avoid an overflow. We can't allocate
-		 * more than @nimaps extents, and an extent is limited on disk
-		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
-		 */
-		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
-		if (unlikely(rt)) {
-			resrtextents = qblocks = resblks;
-			resrtextents /= mp->m_sb.sb_rextsize;
-			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-			quota_flag = XFS_QMOPT_RES_RTBLKS;
-		} else {
-			resrtextents = 0;
-			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
-			quota_flag = XFS_QMOPT_RES_REGBLKS;
-		}
-
-		/*
-		 * Allocate and setup the transaction.
-		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-		error = xfs_trans_reserve(tp, resblks,
-					  XFS_WRITE_LOG_RES(mp), resrtextents,
-					  XFS_TRANS_PERM_LOG_RES,
-					  XFS_WRITE_LOG_COUNT);
-		/*
-		 * Check for running out of space
-		 */
-		if (error) {
-			/*
-			 * Free the transaction structure.
- */
-			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-			xfs_trans_cancel(tp, 0);
-			break;
-		}
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
-						      0, quota_flag);
-		if (error)
-			goto error1;
-
-		xfs_trans_ijoin(tp, ip, 0);
-
-		xfs_bmap_init(&free_list, &firstfsb);
-		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
-					allocatesize_fsb, alloc_type, &firstfsb,
-					0, imapp, &nimaps, &free_list);
-		if (error) {
-			goto error0;
-		}
-
-		/*
-		 * Complete the transaction
-		 */
-		error = xfs_bmap_finish(&tp, &free_list, &committed);
-		if (error) {
-			goto error0;
-		}
-
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		if (error) {
-			break;
-		}
-
-		allocated_fsb = imapp->br_blockcount;
-
-		if (nimaps == 0) {
-			error = XFS_ERROR(ENOSPC);
-			break;
-		}
-
-		startoffset_fsb += allocated_fsb;
-		allocatesize_fsb -= allocated_fsb;
-	}
-
-	return error;
-
-error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
-	xfs_bmap_cancel(&free_list);
-	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
-
-error1:	/* Just cancel transaction */
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
-}
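The s/e computation at the top of the allocation loop above widens the reservation window to extent-size-hint boundaries: the start is rounded down, and the start's misalignment is folded into the end before it is rounded up. A plain-integer sketch of the same arithmetic (standing in for the kernel's do_div()/do_mod(); widen_to_extsz() is a hypothetical name):

#include <stdint.h>
#include <stdio.h>

/*
 * Mirror the xfs_alloc_file_space() reservation math: with an extent
 * size hint, reserve for the request widened to extsz-aligned
 * boundaries. Over-reserving slightly is harmless; it is only a bound.
 */
static void widen_to_extsz(uint64_t start_fsb, uint64_t count_fsb,
			   uint64_t extsz, uint64_t *s, uint64_t *e)
{
	if (extsz) {
		*s = (start_fsb / extsz) * extsz;	/* round start down */
		*e = start_fsb + count_fsb;
		if (start_fsb % extsz)			/* fold in the start */
			*e += start_fsb % extsz;	/* misalignment */
		if (*e % extsz)
			*e += extsz - (*e % extsz);	/* round end up */
	} else {
		*s = 0;
		*e = count_fsb;
	}
}

int main(void)
{
	uint64_t s, e;

	/* a 3-block request at block 10 with an 8-block hint: reserve [8,16) */
	widen_to_extsz(10, 3, 8, &s, &e);
	printf("reserve blocks [%llu, %llu)\n",
	       (unsigned long long)s, (unsigned long long)e);
	return 0;
}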
-
-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
-	xfs_inode_t		*ip,
-	xfs_off_t		startoff,
-	xfs_off_t		endoff)
-{
-	xfs_bmbt_irec_t		imap;
-	xfs_fileoff_t		offset_fsb;
-	xfs_off_t		lastoffset;
-	xfs_off_t		offset;
-	xfs_buf_t		*bp;
-	xfs_mount_t		*mp = ip->i_mount;
-	int			nimap;
-	int			error = 0;
-
-	/*
-	 * Avoid doing I/O beyond eof - it's not necessary
-	 * since nothing can read beyond eof.  The space will
-	 * be zeroed when the file is extended anyway.
-	 */
-	if (startoff >= XFS_ISIZE(ip))
-		return 0;
-
-	if (endoff > XFS_ISIZE(ip))
-		endoff = XFS_ISIZE(ip);
-
-	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
-					mp->m_rtdev_targp : mp->m_ddev_targp,
-				  BTOBB(mp->m_sb.sb_blocksize), 0);
-	if (!bp)
-		return XFS_ERROR(ENOMEM);
-
-	xfs_buf_unlock(bp);
-
-	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
-		offset_fsb = XFS_B_TO_FSBT(mp, offset);
-		nimap = 1;
-		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
-		if (error || nimap < 1)
-			break;
-		ASSERT(imap.br_blockcount >= 1);
-		ASSERT(imap.br_startoff == offset_fsb);
-		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
-		if (lastoffset > endoff)
-			lastoffset = endoff;
-		if (imap.br_startblock == HOLESTARTBLOCK)
-			continue;
-		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-		if (imap.br_state == XFS_EXT_UNWRITTEN)
-			continue;
-		XFS_BUF_UNDONE(bp);
-		XFS_BUF_UNWRITE(bp);
-		XFS_BUF_READ(bp);
-		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
-		xfsbdstrat(mp, bp);
-		error = xfs_buf_iowait(bp);
-		if (error) {
-			xfs_buf_ioerror_alert(bp,
-					"xfs_zero_remaining_bytes(read)");
-			break;
-		}
-		memset(bp->b_addr +
-			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
-		      0, lastoffset - offset + 1);
-		XFS_BUF_UNDONE(bp);
-		XFS_BUF_UNREAD(bp);
-		XFS_BUF_WRITE(bp);
-		xfsbdstrat(mp, bp);
-		error = xfs_buf_iowait(bp);
-		if (error) {
-			xfs_buf_ioerror_alert(bp,
-					"xfs_zero_remaining_bytes(write)");
-			break;
-		}
-	}
-	xfs_buf_free(bp);
-	return error;
-}
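As the comment above says, this function exists because freeing rounds the start up and the end down to block boundaries, and the shaved-off edge bytes must be zeroed by hand. A sketch of just the range math for a power-of-two block size; split_free_range() is a hypothetical name and the printed actions stand in for the real unmap and zero calls:

#include <stdint.h>
#include <stdio.h>

/*
 * Given a byte range to free, compute the block-aligned middle that can
 * be unmapped and the edge ranges that must be zeroed instead: the split
 * performed before xfs_zero_remaining_bytes() is called.
 */
static void split_free_range(uint64_t offset, uint64_t len, uint64_t bsize)
{
	uint64_t start_up = (offset + bsize - 1) & ~(bsize - 1); /* round up */
	uint64_t end_down = (offset + len) & ~(bsize - 1);	 /* round down */

	if (end_down <= start_up) {
		/* no whole block inside the range: zero it all */
		printf("zero  [%llu, %llu]\n",
		       (unsigned long long)offset,
		       (unsigned long long)(offset + len - 1));
		return;
	}
	if (offset < start_up)		/* leading partial block */
		printf("zero  [%llu, %llu]\n",
		       (unsigned long long)offset,
		       (unsigned long long)(start_up - 1));
	printf("unmap [%llu, %llu)\n",
	       (unsigned long long)start_up, (unsigned long long)end_down);
	if (end_down < offset + len)	/* trailing partial block */
		printf("zero  [%llu, %llu]\n",
		       (unsigned long long)end_down,
		       (unsigned long long)(offset + len - 1));
}

int main(void)
{
	/* e.g. punch 10000 bytes at offset 1000 with 4096-byte blocks */
	split_free_range(1000, 10000, 4096);
	return 0;
}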
-
-/*
- * xfs_free_file_space()
- *	This routine frees disk space for the given file.
- *
- *	This routine is only called by xfs_change_file_space
- *	for an UNRESVSP type call.
- *
- * RETURNS:
- *	0 on success
- *	errno on error
- *
- */
-STATIC int
-xfs_free_file_space(
-	xfs_inode_t		*ip,
-	xfs_off_t		offset,
-	xfs_off_t		len,
-	int			attr_flags)
-{
-	int			committed;
-	int			done;
-	xfs_fileoff_t		endoffset_fsb;
-	int			error;
-	xfs_fsblock_t		firstfsb;
-	xfs_bmap_free_t		free_list;
-	xfs_bmbt_irec_t		imap;
-	xfs_off_t		ioffset;
-	xfs_extlen_t		mod = 0;
-	xfs_mount_t		*mp;
-	int			nimap;
-	uint			resblks;
-	xfs_off_t		rounding;
-	int			rt;
-	xfs_fileoff_t		startoffset_fsb;
-	xfs_trans_t		*tp;
-	int			need_iolock = 1;
-
-	mp = ip->i_mount;
-
-	trace_xfs_free_file_space(ip);
-
-	error = xfs_qm_dqattach(ip, 0);
-	if (error)
-		return error;
-
-	error = 0;
-	if (len <= 0)	/* if nothing being freed */
-		return error;
-	rt = XFS_IS_REALTIME_INODE(ip);
-	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
-	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-
-	if (attr_flags & XFS_ATTR_NOLOCK)
-		need_iolock = 0;
-	if (need_iolock) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		/* wait for the completion of any pending DIOs */
-		inode_dio_wait(VFS_I(ip));
-	}
-
-	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-	ioffset = offset & ~(rounding - 1);
-	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-					      ioffset, -1);
-	if (error)
-		goto out_unlock_iolock;
-	truncate_pagecache_range(VFS_I(ip), ioffset, -1);
-
-	/*
-	 * Need to zero the stuff we're not freeing, on disk.
-	 * If it's a realtime file & can't use unwritten extents then we
-	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
-	 * will take care of it for us.
-	 */
-	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-		nimap = 1;
-		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
-					&imap, &nimap, 0);
-		if (error)
-			goto out_unlock_iolock;
-		ASSERT(nimap == 0 || nimap == 1);
-		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-			xfs_daddr_t	block;
-
-			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-			block = imap.br_startblock;
-			mod = do_div(block, mp->m_sb.sb_rextsize);
-			if (mod)
-				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
-		}
-		nimap = 1;
-		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
-					&imap, &nimap, 0);
-		if (error)
-			goto out_unlock_iolock;
-		ASSERT(nimap == 0 || nimap == 1);
-		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-			mod++;
-			if (mod && (mod != mp->m_sb.sb_rextsize))
-				endoffset_fsb -= mod;
-		}
-	}
-	if ((done = (endoffset_fsb <= startoffset_fsb)))
-		/*
-		 * One contiguous piece to clear
-		 */
-		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
-	else {
-		/*
-		 * Some full blocks, possibly two pieces to clear
-		 */
-		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
-			error = xfs_zero_remaining_bytes(ip, offset,
-				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
-		if (!error &&
-		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
-			error = xfs_zero_remaining_bytes(ip,
-				XFS_FSB_TO_B(mp, endoffset_fsb),
-				offset + len - 1);
-	}
-
-	/*
-	 * free file space until done or until there is an error
-	 */
-	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-	while (!error && !done) {
-
-		/*
-		 * allocate and setup the transaction. Allow this
-		 * transaction to dip into the reserve blocks to ensure
-		 * the freeing of the space succeeds at ENOSPC.
-		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-		tp->t_flags |= XFS_TRANS_RESERVE;
-		error = xfs_trans_reserve(tp,
-					  resblks,
-					  XFS_WRITE_LOG_RES(mp),
-					  0,
-					  XFS_TRANS_PERM_LOG_RES,
-					  XFS_WRITE_LOG_COUNT);
-
-		/*
-		 * check for running out of space
-		 */
-		if (error) {
-			/*
-			 * Free the transaction structure.
-			 */
-			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-			xfs_trans_cancel(tp, 0);
-			break;
-		}
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = xfs_trans_reserve_quota(tp, mp,
-				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
-				resblks, 0, XFS_QMOPT_RES_REGBLKS);
-		if (error)
-			goto error1;
-
-		xfs_trans_ijoin(tp, ip, 0);
-
-		/*
-		 * issue the bunmapi() call to free the blocks
-		 */
-		xfs_bmap_init(&free_list, &firstfsb);
-		error = xfs_bunmapi(tp, ip, startoffset_fsb,
-				  endoffset_fsb - startoffset_fsb,
-				  0, 2, &firstfsb, &free_list, &done);
-		if (error) {
-			goto error0;
-		}
-
-		/*
-		 * complete the transaction
-		 */
-		error = xfs_bmap_finish(&tp, &free_list, &committed);
-		if (error) {
-			goto error0;
-		}
-
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
-
- out_unlock_iolock:
-	if (need_iolock)
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	return error;
-
- error0:
-	xfs_bmap_cancel(&free_list);
- error1:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-	xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
-		    XFS_ILOCK_EXCL);
-	return error;
-}
-
-
-STATIC int
-xfs_zero_file_space(
-	struct xfs_inode	*ip,
-	xfs_off_t		offset,
-	xfs_off_t		len,
-	int			attr_flags)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	uint			granularity;
-	xfs_off_t		start_boundary;
-	xfs_off_t		end_boundary;
-	int			error;
-
-	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-
-	/*
-	 * Round the range of extents we are going to convert inwards.  If the
-	 * offset is aligned, then it doesn't get changed so we zero from the
-	 * start of the block offset points to.
-	 */
-	start_boundary = round_up(offset, granularity);
-	end_boundary = round_down(offset + len, granularity);
-
-	ASSERT(start_boundary >= offset);
-	ASSERT(end_boundary <= offset + len);
-
-	if (!(attr_flags & XFS_ATTR_NOLOCK))
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-	if (start_boundary < end_boundary - 1) {
-		/* punch out the page cache over the conversion range */
-		truncate_pagecache_range(VFS_I(ip), start_boundary,
-					 end_boundary - 1);
-		/* convert the blocks */
-		error = xfs_alloc_file_space(ip, start_boundary,
-					end_boundary - start_boundary - 1,
-					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
-					attr_flags);
-		if (error)
-			goto out_unlock;
-
-		/* We've handled the interior of the range, now for the edges */
-		if (start_boundary != offset)
-			error = xfs_iozero(ip, offset, start_boundary - offset);
-		if (error)
-			goto out_unlock;
-
-		if (end_boundary != offset + len)
-			error = xfs_iozero(ip, end_boundary,
-					   offset + len - end_boundary);
-
-	} else {
-		/*
-		 * It's either a sub-granularity range, or the range spans
-		 * only parts of two adjacent blocks.
-		 */
-		error = xfs_iozero(ip, offset, len);
-	}
-
-out_unlock:
-	if (!(attr_flags & XFS_ATTR_NOLOCK))
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	return error;
-
-}
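xfs_zero_file_space() rounds inwards: whole granules in the middle are converted to unwritten extents (cheap, no data I/O), and only the sub-granule edges get zeroes written through the page cache. A sketch of that split, mirroring the round_up()/round_down() above; split_zero_range() is a hypothetical name and the printed actions stand in for the conversion and xfs_iozero() calls:

#include <stdint.h>
#include <stdio.h>

#define ROUND_UP(x, g)		((((x) + (g) - 1) / (g)) * (g))
#define ROUND_DOWN(x, g)	(((x) / (g)) * (g))

/*
 * Convert the aligned interior to unwritten extents and write zeroes
 * only over the unaligned edges; tiny or straddling ranges fall back
 * to writing zeroes outright.
 */
static void split_zero_range(uint64_t offset, uint64_t len, uint64_t gran)
{
	uint64_t start = ROUND_UP(offset, gran);	/* first whole granule */
	uint64_t end = ROUND_DOWN(offset + len, gran);	/* past the last one */

	if (start >= end) {
		printf("write zeroes [%llu, %llu)\n",
		       (unsigned long long)offset,
		       (unsigned long long)(offset + len));
		return;
	}
	if (offset < start)
		printf("write zeroes [%llu, %llu)\n",
		       (unsigned long long)offset, (unsigned long long)start);
	printf("convert to unwritten [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)end);
	if (end < offset + len)
		printf("write zeroes [%llu, %llu)\n",
		       (unsigned long long)end,
		       (unsigned long long)(offset + len));
}

int main(void)
{
	/* granularity = max(block size, page size), e.g. 64k here */
	split_zero_range(3000, 200000, 65536);
	return 0;
}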
-
-/*
- * xfs_change_file_space()
- *	This routine allocates or frees disk space for the given file.
- *	The user specified parameters are checked for alignment and size
- *	limitations.
- *
- * RETURNS:
- *	0 on success
- *	errno on error
- *
- */
-int
-xfs_change_file_space(
-	xfs_inode_t	*ip,
-	int		cmd,
-	xfs_flock64_t	*bf,
-	xfs_off_t	offset,
-	int		attr_flags)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		clrprealloc;
-	int		error;
-	xfs_fsize_t	fsize;
-	int		setprealloc;
-	xfs_off_t	startoffset;
-	xfs_trans_t	*tp;
-	struct iattr	iattr;
-
-	if (!S_ISREG(ip->i_d.di_mode))
-		return XFS_ERROR(EINVAL);
-
-	switch (bf->l_whence) {
-	case 0: /*SEEK_SET*/
-		break;
-	case 1: /*SEEK_CUR*/
-		bf->l_start += offset;
-		break;
-	case 2: /*SEEK_END*/
-		bf->l_start += XFS_ISIZE(ip);
-		break;
-	default:
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * length of <= 0 for resv/unresv/zero is invalid.  length for
-	 * alloc/free is ignored completely and we have no idea what userspace
-	 * might have set it to, so set it to zero to allow range
-	 * checks to pass.
-	 */
-	switch (cmd) {
-	case XFS_IOC_ZERO_RANGE:
-	case XFS_IOC_RESVSP:
-	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP:
-	case XFS_IOC_UNRESVSP64:
-		if (bf->l_len <= 0)
-			return XFS_ERROR(EINVAL);
-		break;
-	default:
-		bf->l_len = 0;
-		break;
-	}
-
-	if (bf->l_start < 0 ||
-	    bf->l_start > mp->m_super->s_maxbytes ||
-	    bf->l_start + bf->l_len < 0 ||
-	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
-		return XFS_ERROR(EINVAL);
-
-	bf->l_whence = 0;
-
-	startoffset = bf->l_start;
-	fsize = XFS_ISIZE(ip);
-
-	setprealloc = clrprealloc = 0;
-	switch (cmd) {
-	case XFS_IOC_ZERO_RANGE:
-		error = xfs_zero_file_space(ip, startoffset, bf->l_len,
-					    attr_flags);
-		if (error)
-			return error;
-		setprealloc = 1;
-		break;
-
-	case XFS_IOC_RESVSP:
-	case XFS_IOC_RESVSP64:
-		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-					     XFS_BMAPI_PREALLOC, attr_flags);
-		if (error)
-			return error;
-		setprealloc = 1;
-		break;
-
-	case XFS_IOC_UNRESVSP:
-	case XFS_IOC_UNRESVSP64:
-		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
-						attr_flags)))
-			return error;
-		break;
-
-	case XFS_IOC_ALLOCSP:
-	case XFS_IOC_ALLOCSP64:
-	case XFS_IOC_FREESP:
-	case XFS_IOC_FREESP64:
-		/*
-		 * These operations actually do IO when extending the file, but
-		 * the allocation is done separately to the zeroing that is
-		 * done. This set of operations needs to be serialised against
-		 * other IO operations, such as truncate and buffered IO. We
-		 * need to take the IOLOCK here to serialise the allocation and
-		 * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
-		 * truncate, direct IO) from racing against the transient
-		 * allocated but not written state we can have here.
-		 */
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		if (startoffset > fsize) {
-			error = xfs_alloc_file_space(ip, fsize,
-					startoffset - fsize, 0,
-					attr_flags | XFS_ATTR_NOLOCK);
-			if (error) {
-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-				break;
-			}
-		}
-
-		iattr.ia_valid = ATTR_SIZE;
-		iattr.ia_size = startoffset;
-
-		error = xfs_setattr_size(ip, &iattr,
-					 attr_flags | XFS_ATTR_NOLOCK);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
-		if (error)
-			return error;
-
-		clrprealloc = 1;
-		break;
-
-	default:
-		ASSERT(0);
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * update the inode timestamp, mode, and prealloc flag bits
-	 */
-	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-
-	if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
-				      0, 0, 0))) {
-		/* ASSERT(0); */
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-	if ((attr_flags & XFS_ATTR_DMI) == 0) {
-		ip->i_d.di_mode &= ~S_ISUID;
-
-		/*
-		 * Note that we don't have to worry about mandatory
-		 * file locking being disabled here because we only
-		 * clear the S_ISGID bit if the Group execute bit is
-		 * on, but if it was on then mandatory locking wouldn't
-		 * have been enabled.
- */
-		if (ip->i_d.di_mode & S_IXGRP)
-			ip->i_d.di_mode &= ~S_ISGID;
-
-		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	}
-	if (setprealloc)
-		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-	else if (clrprealloc)
-		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	if (attr_flags & XFS_ATTR_SYNC)
-		xfs_trans_set_sync(tp);
-	return xfs_trans_commit(tp, 0);
-}
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
deleted file mode 100644
index 38c67c34d73f..000000000000
--- a/fs/xfs/xfs_vnodeops.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef _XFS_VNODEOPS_H
-#define _XFS_VNODEOPS_H 1
-
-struct attrlist_cursor_kern;
-struct file;
-struct iattr;
-struct inode;
-struct iovec;
-struct kiocb;
-struct pipe_inode_info;
-struct uio;
-struct xfs_inode;
-
-
-int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
-int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
-#define XFS_ATTR_DMI		0x01	/* invocation from a DMI function */
-#define XFS_ATTR_NONBLOCK	0x02	/* return EAGAIN if operation would block */
-#define XFS_ATTR_NOLOCK		0x04	/* Don't grab any conflicting locks */
-#define XFS_ATTR_NOACL		0x08	/* Don't call xfs_acl_chmod */
-#define XFS_ATTR_SYNC		0x10	/* synchronous operation required */
-
-int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_release(struct xfs_inode *ip);
-int xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
-		struct xfs_inode **ipp, struct xfs_name *ci_name);
-int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode,
-		xfs_dev_t rdev, struct xfs_inode **ipp);
-int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
-		struct xfs_inode *ip);
-int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
-		struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
-int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-		const char *target_path, umode_t mode, struct xfs_inode **ipp);
-int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_change_file_space(struct xfs_inode *ip, int cmd,
-		xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
-int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
-		struct xfs_inode *src_ip, struct xfs_inode *target_dp,
-		struct xfs_name *target_name, struct xfs_inode *target_ip);
-int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
-		unsigned char *value, int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
-		unsigned char *value, int valuelen, int flags);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
-int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
-		int flags, struct attrlist_cursor_kern *cursor);
-
-int xfs_iozero(struct xfs_inode *, loff_t, size_t);
-int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
-
-#endif	/* _XFS_VNODEOPS_H */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 87d3e03878c8..e01f35ea76ba 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -17,13 +17,13 @@
  */
 
 #include "xfs.h"
+#include "xfs_log_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_acl.h"
-#include "xfs_vnodeops.h"
 
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>