diff options
author | Lennart Poettering <lennart@poettering.net> | 2024-11-11 17:09:06 +0100 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2025-01-23 21:48:02 +0100 |
commit | 88252ca88932b733ead989b6c5cece22ea37941b (patch) | |
tree | 5cec95af773a704227684d2d619cf2245765a5ec /src/nspawn/nspawn.c | |
parent | dissect-image: add client side API wrapper for MountDirectory() varlink call (diff) | |
download | systemd-88252ca88932b733ead989b6c5cece22ea37941b.tar.xz systemd-88252ca88932b733ead989b6c5cece22ea37941b.zip |
nspawn: allow to run unpriv from dir
This simply calls into mountfsd to acquire the root mount and uses it as
root for the container.
Note that this also makes one more change: previously we ran containers
directly off their backing directory. Except when we didn't, and there
were a variety of exceptions: if we had no privs, if we ran off a disk
image, if the directory was the host's root dir, and some others.
This simplifies the logic a bit: we now always create a temporary
directory in /tmp/ and bind mount everything there, in all code paths.
After all, in order to control propagation we need to turn the root
into a mount point anyway, hence we might as well do it in one place
for all cases.
Diffstat (limited to 'src/nspawn/nspawn.c')
-rw-r--r-- | src/nspawn/nspawn.c | 128 |
1 files changed, 66 insertions, 62 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 5ab04b121a..b7622b8db4 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -3934,6 +3934,7 @@ static DissectImageFlags determine_dissect_image_flags(void) { static int outer_child( Barrier *barrier, const char *directory, + int mount_fd, DissectedImage *dissected_image, int fd_outer_socket, int fd_inner_socket, @@ -3987,7 +3988,23 @@ static int outer_child( if (r < 0) return r; - if (dissected_image) { + /* Put the root dir into the target directory now. One of three mechanisms is provided: either we + * have a single mount fd (typically unprivileged --directory= mode) or we have a fully dissected + * image (--image= mode), or we have a regular path. */ + if (mount_fd >= 0) { + assert(arg_directory); + assert(!arg_image); + + if (move_mount(mount_fd, "", AT_FDCWD, directory, MOVE_MOUNT_F_EMPTY_PATH) < 0) + return log_error_errno(errno, "Failed to attach root directory: %m"); + + mount_fd = safe_close(mount_fd); + log_debug("Successfully attached root directory to '%s'.", directory); + + } else if (dissected_image) { + assert(!arg_directory); + assert(arg_image); + /* If we are operating on a disk image, then mount its root directory now, but leave out the * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest, * but then with the uid shift known. That way we can mount VFAT file systems shifted to the @@ -4004,6 +4021,13 @@ static int outer_child( (arg_start_mode == START_BOOT ? 
DISSECT_IMAGE_VALIDATE_OS : 0)); if (r < 0) return r; + } else { + assert(arg_directory); + assert(!arg_image); + + r = mount_nofollow_verbose(LOG_ERR, arg_directory, directory, /* fstype= */ NULL, MS_BIND|MS_REC, /* options= */ NULL); + if (r < 0) + return r; } r = determine_uid_shift(directory); @@ -4047,27 +4071,6 @@ static int outer_child( "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); } - if (path_equal(directory, "/")) { - /* If the directory we shall boot is the host, let's operate on a bind mount at a different - * place, so that we can make changes to its mount structure (for example, to implement - * --volatile=) without this interfering with our ability to access files such as - * /etc/localtime to copy into the container. Note that we use a fixed place for this - * (instead of a temporary directory, since we are living in our own mount namespace here - * already, and thus don't need to be afraid of colliding with anyone else's mounts). */ - (void) mkdir_p("/run/systemd/nspawn-root", 0755); - - r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL); - if (r < 0) - return r; - - directory = "/run/systemd/nspawn-root"; - } - - /* Make sure we always have a mount that we can move to root later on. */ - r = make_mount_point(directory); - if (r < 0) - return r; - /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host * mount namespace. 
For the directory we are going to run our container let's turn this off, so that * we'll live in our own little world from now on, and propagation from the host may only happen via @@ -5161,6 +5164,8 @@ static int load_oci_bundle(void) { } static int run_container( + const char *directory, + int mount_fd, DissectedImage *dissected_image, int userns_fd, FDSet *fds, @@ -5298,7 +5303,8 @@ static int run_container( (void) reset_signal_mask(); r = outer_child(&barrier, - arg_directory, + directory, + mount_fd, dissected_image, fd_outer_socket_pair[1], fd_inner_socket_pair[1], @@ -5915,14 +5921,14 @@ static int cant_be_in_netns(void) { } static int run(int argc, char *argv[]) { - bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false; - _cleanup_close_ int master = -EBADF, userns_fd = -EBADF; + bool remove_directory = false, remove_image = false, veth_created = false; + _cleanup_close_ int master = -EBADF, userns_fd = -EBADF, mount_fd = -EBADF; _cleanup_fdset_free_ FDSet *fds = NULL; int r, n_fd_passed, ret = EXIT_SUCCESS; char veth_name[IFNAMSIZ] = ""; struct ExposeArgs expose_args = {}; _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT; - char tmprootdir[] = "/tmp/nspawn-root-XXXXXX"; + _cleanup_(rmdir_and_freep) char *rootdir = NULL; _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL; _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL; _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL; @@ -6023,14 +6029,25 @@ static int run(int argc, char *argv[]) { if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */ arg_quiet = true; - if (arg_directory) { - assert(!arg_image); + if (!arg_privileged) { + /* if we are unprivileged, let's allocate a 64K userns first */ - if (!arg_privileged) { - r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is 
currently not supported if called without privileges."); + _cleanup_free_ char *userns_name = NULL; + if (asprintf(&userns_name, "nspawn-" PID_FMT "-%s", getpid_cached(), arg_machine) < 0) { + r = log_oom(); goto finish; } + userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000)); + if (userns_fd < 0) { + r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m"); + goto finish; + } + } + + if (arg_directory) { + assert(!arg_image); + /* Safety precaution: let's not allow running images from the live host OS image, as long as * /var from the host will propagate into container dynamically (because bad things happen if * two systems write to the same /var). Let's allow it for the special cases where /var is @@ -6200,6 +6217,15 @@ static int run(int argc, char *argv[]) { } } + if (!arg_privileged) { + r = mountfsd_mount_directory( + arg_directory, + userns_fd, + determine_dissect_image_flags(), + &mount_fd); + if (r < 0) + goto finish; + } } else { DissectImageFlags dissect_image_flags = determine_dissect_image_flags(); @@ -6274,19 +6300,6 @@ static int run(int argc, char *argv[]) { dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE; } - if (!mkdtemp(tmprootdir)) { - r = log_error_errno(errno, "Failed to create temporary directory: %m"); - goto finish; - } - - remove_tmprootdir = true; - - arg_directory = strdup(tmprootdir); - if (!arg_directory) { - r = log_oom(); - goto finish; - } - if (arg_privileged) { r = loop_device_make_by_path( arg_image, @@ -6339,19 +6352,6 @@ static int run(int argc, char *argv[]) { if (r < 0) goto finish; } else { - _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine); - if (!userns_name) { - r = log_oom(); - goto finish; - } - - /* if we are unprivileged, let's allocate a 64K userns first */ - userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000)); - if (userns_fd < 0) { - r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m"); - 
goto finish; - } - r = mountfsd_mount_image( arg_image, userns_fd, @@ -6370,7 +6370,14 @@ static int run(int argc, char *argv[]) { arg_architecture = dissected_image_architecture(dissected_image); } - r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts); + /* Create a temporary place to mount stuff. */ + r = mkdtemp_malloc("/tmp/nspawn-root-XXXXXX", &rootdir); + if (r < 0) { + log_error_errno(r, "Failed to create temporary directory: %m"); + goto finish; + } + + r = custom_mount_prepare_all(rootdir, arg_custom_mounts, arg_n_custom_mounts); if (r < 0) goto finish; @@ -6405,6 +6412,8 @@ static int run(int argc, char *argv[]) { } for (;;) { r = run_container( + rootdir, + mount_fd, dissected_image, userns_fd, fds, @@ -6447,11 +6456,6 @@ finish: log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image); } - if (remove_tmprootdir) { - if (rmdir(tmprootdir) < 0) - log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir); - } - if (arg_machine && arg_privileged) { const char *p; |