summaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_exchrange.c
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2024-04-15 23:54:14 +0200
committerDarrick J. Wong <djwong@kernel.org>2024-04-15 23:54:14 +0200
commit9a64d9b3109d01cca0b83c1d36538b7a37c5284e (patch)
treec33eacfdac02eb3c34365fa448aaf7b2fab437fe /fs/xfs/xfs_exchrange.c
parentvfs: export remap and write check helpers (diff)
downloadlinux-9a64d9b3109d01cca0b83c1d36538b7a37c5284e.tar.xz
linux-9a64d9b3109d01cca0b83c1d36538b7a37c5284e.zip
xfs: introduce new file range exchange ioctl
Introduce a new ioctl to handle exchanging ranges of bytes between files. The goal here is to perform the exchange atomically with respect to applications -- either they see the file contents before the exchange or they see that A-B is now B-A, even if the kernel crashes. My original goal with all this code was to make it so that online repair can build a replacement directory or xattr structure in a temporary file and commit the repair by atomically exchanging all the data blocks between the two files. However, I needed a way to test this mechanism thoroughly, so I've been evolving an ioctl interface since then. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs/xfs/xfs_exchrange.c')
-rw-r--r--fs/xfs/xfs_exchrange.c339
1 files changed, 339 insertions, 0 deletions
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
new file mode 100644
index 000000000000..4cd824e47f75
--- /dev/null
+++ b/fs/xfs/xfs_exchrange.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_exchrange.h"
+#include <linux/fsnotify.h>
+
+/*
+ * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
+ * This part deals with struct file objects and byte ranges and does not deal
+ * with XFS-specific data structures such as xfs_inodes and block ranges. This
+ * separation may some day facilitate porting to another filesystem.
+ *
+ * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
+ * file1 with the same number of bytes starting at fxr.file2_offset in file2.
+ * Implementations must call xfs_exchange_range_prep to prepare the two
+ * files prior to taking locks; and they must update the inode change and mod
+ * times of both files as part of the metadata update. The timestamp update
+ * and freshness checks must be done atomically as part of the data exchange
+ * operation to ensure correctness of the freshness check.
+ * xfs_exchange_range_finish must be called after the operation completes
+ * successfully but before locks are dropped.
+ */
+
+/* Verify that we have security clearance to perform this operation. */
+static int
+xfs_exchange_range_verify_area(
+ struct xfs_exchrange *fxr)
+{
+ int ret;
+
+ ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
+ true);
+ if (ret)
+ return ret;
+
+ return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
+ true);
+}
+
+/*
+ * Performs necessary checks before doing a range exchange, having stabilized
+ * mutable inode attributes via i_rwsem.
+ */
+static inline int
+xfs_exchange_range_checks(
+ struct xfs_exchrange *fxr,
+ unsigned int alloc_unit)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ uint64_t allocmask = alloc_unit - 1;
+ int64_t test_len;
+ uint64_t blen;
+ loff_t size1, size2, tmp;
+ int error;
+
+ /* Don't touch certain kinds of inodes */
+ if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
+ return -EPERM;
+ if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
+ return -ETXTBSY;
+
+ size1 = i_size_read(inode1);
+ size2 = i_size_read(inode2);
+
+ /* Ranges cannot start after EOF. */
+ if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
+ return -EINVAL;
+
+ /*
+ * If the caller said to exchange to EOF, we set the length of the
+ * request large enough to cover everything to the end of both files.
+ */
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+ fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
+ size2 - fxr->file2_offset);
+
+ error = xfs_exchange_range_verify_area(fxr);
+ if (error)
+ return error;
+ }
+
+ /*
+ * The start of both ranges must be aligned to the file allocation
+ * unit.
+ */
+ if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
+ !IS_ALIGNED(fxr->file2_offset, alloc_unit))
+ return -EINVAL;
+
+ /* Ensure offsets don't wrap. */
+ if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
+ check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
+ return -EINVAL;
+
+ /*
+ * We require both ranges to end within EOF, unless we're exchanging
+ * to EOF.
+ */
+ if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
+ (fxr->file1_offset + fxr->length > size1 ||
+ fxr->file2_offset + fxr->length > size2))
+ return -EINVAL;
+
+ /*
+ * Make sure we don't hit any file size limits. If we hit any size
+ * limits such that test_length was adjusted, we abort the whole
+ * operation.
+ */
+ test_len = fxr->length;
+ error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
+ &test_len);
+ if (error)
+ return error;
+ error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
+ &test_len);
+ if (error)
+ return error;
+ if (test_len != fxr->length)
+ return -EINVAL;
+
+ /*
+ * If the user wanted us to exchange up to the infile's EOF, round up
+ * to the next allocation unit boundary for this check. Do the same
+ * for the outfile.
+ *
+ * Otherwise, reject the range length if it's not aligned to an
+ * allocation unit.
+ */
+ if (fxr->file1_offset + fxr->length == size1)
+ blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
+ else if (fxr->file2_offset + fxr->length == size2)
+ blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
+ else if (!IS_ALIGNED(fxr->length, alloc_unit))
+ return -EINVAL;
+ else
+ blen = fxr->length;
+
+ /* Don't allow overlapped exchanges within the same file. */
+ if (inode1 == inode2 &&
+ fxr->file2_offset + blen > fxr->file1_offset &&
+ fxr->file1_offset + blen > fxr->file2_offset)
+ return -EINVAL;
+
+ /*
+ * Ensure that we don't exchange a partial EOF block into the middle of
+ * another file.
+ */
+ if ((fxr->length & allocmask) == 0)
+ return 0;
+
+ blen = fxr->length;
+ if (fxr->file2_offset + blen < size2)
+ blen &= ~allocmask;
+
+ if (fxr->file1_offset + blen < size1)
+ blen &= ~allocmask;
+
+ return blen == fxr->length ? 0 : -EINVAL;
+}
+
+/*
+ * Check that the two inodes are eligible for range exchanges, the ranges make
+ * sense, and then flush all dirty data. Caller must ensure that the inodes
+ * have been locked against any other modifications.
+ */
+static inline int
+xfs_exchange_range_prep(
+ struct xfs_exchrange *fxr,
+ unsigned int alloc_unit)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ bool same_inode = (inode1 == inode2);
+ int error;
+
+ /* Check that we don't violate system file offset limits. */
+ error = xfs_exchange_range_checks(fxr, alloc_unit);
+ if (error || fxr->length == 0)
+ return error;
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode1);
+ if (!same_inode)
+ inode_dio_wait(inode2);
+
+ error = filemap_write_and_wait_range(inode1->i_mapping,
+ fxr->file1_offset,
+ fxr->file1_offset + fxr->length - 1);
+ if (error)
+ return error;
+
+ error = filemap_write_and_wait_range(inode2->i_mapping,
+ fxr->file2_offset,
+ fxr->file2_offset + fxr->length - 1);
+ if (error)
+ return error;
+
+ /*
+ * If the files or inodes involved require synchronous writes, amend
+ * the request to force the filesystem to flush all data and metadata
+ * to disk after the operation completes.
+ */
+ if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
+ IS_SYNC(inode1) || IS_SYNC(inode2))
+ fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
+
+ return 0;
+}
+
+/*
+ * Finish a range exchange operation, if it was successful. Caller must ensure
+ * that the inodes are still locked against any other modifications.
+ */
+static inline int
+xfs_exchange_range_finish(
+ struct xfs_exchrange *fxr)
+{
+ int error;
+
+ error = file_remove_privs(fxr->file1);
+ if (error)
+ return error;
+ if (file_inode(fxr->file1) == file_inode(fxr->file2))
+ return 0;
+
+ return file_remove_privs(fxr->file2);
+}
+
+/* Exchange parts of two files. */
+static int
+xfs_exchange_range(
+ struct xfs_exchrange *fxr)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ int ret;
+
+ BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
+ XFS_EXCHANGE_RANGE_PRIV_FLAGS);
+
+ /* Both files must be on the same mount/filesystem. */
+ if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
+ return -EXDEV;
+
+ if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ return -EINVAL;
+
+ /* Userspace requests only honored for regular files. */
+ if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+ return -EINVAL;
+
+ /* Both files must be opened for read and write. */
+ if (!(fxr->file1->f_mode & FMODE_READ) ||
+ !(fxr->file1->f_mode & FMODE_WRITE) ||
+ !(fxr->file2->f_mode & FMODE_READ) ||
+ !(fxr->file2->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ /* Neither file can be opened append-only. */
+ if ((fxr->file1->f_flags & O_APPEND) ||
+ (fxr->file2->f_flags & O_APPEND))
+ return -EBADF;
+
+ /*
+ * If we're not exchanging to EOF, we can check the areas before
+ * stabilizing both files' i_size.
+ */
+ if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
+ ret = xfs_exchange_range_verify_area(fxr);
+ if (ret)
+ return ret;
+ }
+
+ /* Update cmtime if the fd/inode don't forbid it. */
+ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
+ fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
+ if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
+ fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
+
+ file_start_write(fxr->file2);
+ ret = -EOPNOTSUPP; /* XXX call out to lower level code */
+ file_end_write(fxr->file2);
+ if (ret)
+ return ret;
+
+ fsnotify_modify(fxr->file1);
+ if (fxr->file2 != fxr->file1)
+ fsnotify_modify(fxr->file2);
+ return 0;
+}
+
+/* Collect exchange-range arguments from userspace. */
+long
+xfs_ioc_exchange_range(
+ struct file *file,
+ struct xfs_exchange_range __user *argp)
+{
+ struct xfs_exchrange fxr = {
+ .file2 = file,
+ };
+ struct xfs_exchange_range args;
+ struct fd file1;
+ int error;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
+ return -EINVAL;
+ if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ return -EINVAL;
+
+ fxr.file1_offset = args.file1_offset;
+ fxr.file2_offset = args.file2_offset;
+ fxr.length = args.length;
+ fxr.flags = args.flags;
+
+ file1 = fdget(args.file1_fd);
+ if (!file1.file)
+ return -EBADF;
+ fxr.file1 = file1.file;
+
+ error = xfs_exchange_range(&fxr);
+ fdput(file1);
+ return error;
+}