summaryrefslogtreecommitdiffstats
path: root/src/nspawn/nspawn.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nspawn/nspawn.c')
-rw-r--r-- src/nspawn/nspawn.c 409
1 files changed, 257 insertions, 152 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 7ceda0b30c..4c054b2dbb 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -107,6 +107,7 @@
#include "sysctl-util.h"
#include "terminal-util.h"
#include "tmpfile-util.h"
+#include "uid-classification.h"
#include "umask-util.h"
#include "unit-name.h"
#include "user-util.h"
@@ -139,7 +140,7 @@ static char *arg_hostname = NULL; /* The name the payload sees by default */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static char *arg_slice = NULL;
-static bool arg_private_network = false;
+static bool arg_private_network; /* initialized depending on arg_privileged in run() */
static bool arg_read_only = false;
static StartMode arg_start_mode = START_PID1;
static bool arg_ephemeral = false;
@@ -197,7 +198,7 @@ static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
static sd_bus_message *arg_property_message = NULL;
-static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
+static UserNamespaceMode arg_userns_mode; /* initialized depending on arg_privileged in run() */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
static int arg_kill_signal = 0;
@@ -369,7 +370,7 @@ static int help(void) {
" the service unit nspawn is running in\n"
"\n%3$sUser Namespacing:%4$s\n"
" --private-users=no Run without user namespacing\n"
- " --private-users=yes|pick|identity\n"
+ " --private-users=yes|pick|identity|managed\n"
" Run within user namespace, autoselect UID/GID range\n"
" --private-users=UIDBASE[:NUIDS]\n"
" Similar, but with user configured UID/GID range\n"
@@ -518,7 +519,7 @@ static int detect_unified_cgroup_hierarchy_from_environment(void) {
static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
int r;
- if (!arg_privileged) {
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
/* We only support the unified mode when running unprivileged */
arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
return 0;
@@ -1257,6 +1258,11 @@ static int parse_argv(int argc, char *argv[]) {
arg_userns_mode = USER_NAMESPACE_FIXED;
arg_uid_shift = 0;
arg_uid_range = UINT32_C(0x10000);
+ } else if (streq(optarg, "managed")) {
+ /* managed: User namespace on, and acquire it from systemd-nsresourced */
+ arg_userns_mode = USER_NAMESPACE_MANAGED;
+ arg_uid_shift = UID_INVALID;
+ arg_uid_range = UINT32_C(0x10000);
} else {
/* anything else: User namespacing on, UID range is explicitly configured */
r = parse_userns_uid_range(optarg, &arg_uid_shift, &arg_uid_range);
@@ -1271,9 +1277,8 @@ static int parse_argv(int argc, char *argv[]) {
case 'U':
if (userns_supported()) {
- arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
- * implied by USER_NAMESPACE_PICK
- * further down. */
+ /* Note that arg_userns_ownership is implied by USER_NAMESPACE_PICK further down. */
+ arg_userns_mode = arg_privileged ? USER_NAMESPACE_PICK : USER_NAMESPACE_MANAGED;
arg_uid_shift = UID_INVALID;
arg_uid_range = UINT32_C(0x10000);
@@ -1656,14 +1661,23 @@ static int parse_argv(int argc, char *argv[]) {
static int verify_arguments(void) {
int r;
- SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged);
+ SET_FLAG(arg_mount_settings, MOUNT_UNMANAGED, arg_userns_mode != USER_NAMESPACE_MANAGED);
- if (!arg_privileged) {
- if (!arg_private_network) {
- log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespaces requires network namespacing.");
- arg_private_network = true;
- }
- }
+ /* We can mount selinuxfs only if we are privileged and can do so before userns. In managed mode we
+ * have to enter the userns earlier, hence cannot do that. */
+ /* SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged); */
+ SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_userns_mode != USER_NAMESPACE_MANAGED);
+
+ SET_FLAG(arg_mount_settings, MOUNT_USE_USERNS, arg_userns_mode != USER_NAMESPACE_NO);
+
+ if (arg_private_network)
+ SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, arg_private_network);
+
+ if (!arg_privileged && arg_userns_mode != USER_NAMESPACE_MANAGED)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unprivileged operation requires managed user namespaces, as otherwise no UID range can be acquired.");
+
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED && !arg_private_network)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Managed user namespace operation requires private networking, as otherwise /sys/ may not be mounted.");
if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
/* If we are running the stub init in the container, we don't need to look at what the init
@@ -1684,12 +1698,6 @@ static int verify_arguments(void) {
arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
}
- if (arg_userns_mode != USER_NAMESPACE_NO)
- arg_mount_settings |= MOUNT_USE_USERNS;
-
- if (arg_private_network)
- arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
-
if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
!(arg_clone_ns_flags & CLONE_NEWUTS)) {
arg_register = false;
@@ -1699,8 +1707,7 @@ static int verify_arguments(void) {
if (arg_userns_ownership < 0)
arg_userns_ownership =
- arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
- USER_NAMESPACE_OWNERSHIP_OFF;
+ IN_SET(arg_userns_mode, USER_NAMESPACE_PICK, USER_NAMESPACE_MANAGED) ? USER_NAMESPACE_OWNERSHIP_AUTO : USER_NAMESPACE_OWNERSHIP_OFF;
if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
arg_kill_signal = SIGRTMIN+3;
@@ -1809,10 +1816,18 @@ static int verify_network_interfaces_initialized(void) {
return 0;
}
+static int in_child_chown(void) {
+ /* Returns true when chown()ing inodes we create inside the outer child is required. Basically, we
+ * need the chowning when we implement userns ourselves (PICK or FIXED mode). If userns is off we don't need to chown(),
+ * obviously. And if we are in managed mode we already entered the userns, and hence don't need to
+ * manually chown either. */
+ return IN_SET(arg_userns_mode, USER_NAMESPACE_PICK, USER_NAMESPACE_FIXED);
+}
+
static int userns_chown_at(int fd, const char *fname, uid_t uid, gid_t gid, int flags) {
assert(fd >= 0 || fd == AT_FDCWD);
- if (arg_userns_mode == USER_NAMESPACE_NO)
+ if (!in_child_chown())
return 0;
if (uid == UID_INVALID && gid == GID_INVALID)
@@ -2295,18 +2310,24 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno
if (r < 0)
return log_error_errno(r, "Failed to create directory %s: %m", parent);
- if (mknod(to, st.st_mode, st.st_rdev) < 0) {
- r = -errno; /* Save the original error code. */
+ r = RET_NERRNO(mknod(to, st.st_mode, st.st_rdev));
+ if (r < 0) {
/* Explicitly warn the user when /dev/ is already populated. */
if (r == -EEXIST)
log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest);
+
/* If arg_uid_shift != 0, then we cannot fall back to use bind mount. */
- if (arg_uid_shift != 0) {
+ if (!(arg_userns_mode == USER_NAMESPACE_NO ||
+ (arg_userns_mode == USER_NAMESPACE_FIXED && arg_uid_shift == 0))) {
if (ignore_mknod_failure) {
log_debug_errno(r, "Failed to mknod(%s), ignoring: %m", to);
return 0;
}
- return log_error_errno(r, "Failed to mknod(%s): %m", to);
+
+ if (arg_userns_mode != USER_NAMESPACE_MANAGED || !ERRNO_IS_NEG_PRIVILEGE(r))
+ return log_error_errno(r, "Failed to mknod(%s): %m", to);
+
+ log_debug_errno(r, "Failed to create device node '%s' and running in managed mode, resorting to bind mount: %m", to);
}
/* Some systems abusively restrict mknod but allow bind mounts. */
@@ -2402,7 +2423,7 @@ static int make_extra_nodes(const char *dest) {
return 0;
}
-static int setup_pts(const char *dest) {
+static int setup_pts(const char *dest, uid_t chown_uid) {
_cleanup_free_ char *options = NULL;
const char *p;
int r;
@@ -2411,13 +2432,13 @@ static int setup_pts(const char *dest) {
if (arg_selinux_apifs_context)
(void) asprintf(&options,
"newinstance,ptmxmode=0666,mode=" STRINGIFY(TTY_MODE) ",gid=" GID_FMT ",context=\"%s\"",
- arg_uid_shift + TTY_GID,
+ chown_uid + TTY_GID,
arg_selinux_apifs_context);
else
#endif
(void) asprintf(&options,
"newinstance,ptmxmode=0666,mode=" STRINGIFY(TTY_MODE) ",gid=" GID_FMT,
- arg_uid_shift + TTY_GID);
+ chown_uid + TTY_GID);
if (!options)
return log_oom();
@@ -2854,7 +2875,9 @@ static int reset_audit_loginuid(void) {
if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
return 0;
- if (!arg_privileged)
+ /* If we are in managed userns mode, then we are already in our userns and cannot reset the
+ * loginuid anyway, hence don't bother. */
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED)
return 0;
r = read_virtual_file("/proc/self/loginuid", SIZE_MAX, &p, /* ret_size= */ NULL);
@@ -2886,8 +2909,8 @@ static int mount_tunnel_dig(const char *root) {
const char *p, *q;
int r;
- if (!arg_privileged) {
- log_debug("Not digging mount tunnel, because running unprivileged.");
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+ log_debug("Not digging mount tunnel, because running in managed user namespace mode.");
return 0;
}
@@ -2919,8 +2942,8 @@ static int mount_tunnel_dig(const char *root) {
static int mount_tunnel_open(void) {
int r;
- if (!arg_privileged) {
- log_debug("Not opening up mount tunnel, because running unprivileged.");
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+ log_debug("Not opening up mount tunnel, because running in managed user namespace mode.");
return 0;
}
@@ -3267,6 +3290,12 @@ static int chase_and_update(char **p, unsigned flags) {
static int determine_uid_shift(const char *directory) {
assert(directory);
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+ /* In managed mode we should already know the UID shift */
+ assert(uid_is_valid(arg_uid_shift));
+ return 0;
+ }
+
if (arg_userns_mode == USER_NAMESPACE_NO) {
arg_uid_shift = 0;
return 0;
@@ -3437,7 +3466,7 @@ static int inner_child(
return r;
}
- r = mount_all(NULL,
+ r = mount_all(/* dest= */ NULL,
arg_mount_settings | MOUNT_IN_USERNS,
arg_uid_shift,
arg_selinux_apifs_context);
@@ -3447,7 +3476,7 @@ static int inner_child(
if (!arg_network_namespace_path && arg_private_network) {
_cleanup_close_ int netns_fd = -EBADF;
- if (arg_privileged)
+ if (arg_userns_mode != USER_NAMESPACE_MANAGED)
if (unshare(CLONE_NEWNET) < 0)
return log_error_errno(errno, "Failed to unshare network namespace: %m");
@@ -3463,8 +3492,8 @@ static int inner_child(
(void) barrier_place(barrier); /* #3 */
}
- if (arg_privileged) {
- r = mount_sysfs(NULL, arg_mount_settings);
+ if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
+ r = mount_sysfs(NULL, arg_mount_settings | MOUNT_IN_USERNS);
if (r < 0)
return r;
}
@@ -3817,8 +3846,8 @@ static int setup_unix_export_dir_outside(char **ret) {
assert(ret);
- if (!arg_privileged) {
- log_debug("Not digging socket tunnel, because running unprivileged.");
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+ log_debug("Not digging socket tunnel, because running in managed user namespace mode.");
return 0;
}
@@ -3874,7 +3903,7 @@ static int setup_unix_export_host_inside(const char *directory, const char *unix
assert(directory);
- if (!arg_privileged)
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED)
return 0;
assert(unix_export_path);
@@ -3928,12 +3957,15 @@ static DissectImageFlags determine_dissect_image_flags(void) {
DISSECT_IMAGE_PIN_PARTITION_DEVICES |
(arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) |
DISSECT_IMAGE_ALLOW_USERSPACE_VERITY |
- (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0);
+ (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0) |
+ ((arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_FOREIGN) ? DISSECT_IMAGE_FOREIGN_UID :
+ (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO) ? DISSECT_IMAGE_IDENTITY_UID : 0);
}
static int outer_child(
Barrier *barrier,
const char *directory,
+ int mount_fd,
DissectedImage *dissected_image,
int fd_outer_socket,
int fd_inner_socket,
@@ -3952,9 +3984,9 @@ static int outer_child(
/* This is the "outer" child process, i.e the one forked off by the container manager itself. Its
* namespace situation is:
*
- * - CLONE_NEWNS : already has its own (created by clone() if arg_privileged, or unshare() if !arg_unprivileged)
- * - CLONE_NEWUSER : if arg_privileged: still in the host's
- * if !arg_privileged: already has its own (created by nsresource_allocate_userns()->setns(userns_fd))
+ * - CLONE_NEWUSER : if not in USER_NAMESPACE_MANAGED mode: still in the host's
+ * if USER_NAMESPACE_MANAGED mode: already has its own (created by nsresource_allocate_userns()->setns(userns_fd))
+ * - CLONE_NEWNS : already has its own (created by clone() if not USER_NAMESPACE_MANAGED, or unshare() otherwise)
* - CLONE_NEWPID : still in the host's
* - CLONE_NEWUTS : still in the host's
* - CLONE_NEWIPC : still in the host's
@@ -3987,7 +4019,23 @@ static int outer_child(
if (r < 0)
return r;
- if (dissected_image) {
+ /* Put the root dir into the target directory now. One of three mechanisms is provided: either we
+ * have a single mount fd (typically unprivileged --directory= mode) or we have a fully dissected
+ * image (--image= mode), or we have a regular path. */
+ if (mount_fd >= 0) {
+ assert(arg_directory);
+ assert(!arg_image);
+
+ if (move_mount(mount_fd, "", AT_FDCWD, directory, MOVE_MOUNT_F_EMPTY_PATH) < 0)
+ return log_error_errno(errno, "Failed to attach root directory: %m");
+
+ mount_fd = safe_close(mount_fd);
+ log_debug("Successfully attached root directory to '%s'.", directory);
+
+ } else if (dissected_image) {
+ assert(!arg_directory);
+ assert(arg_image);
+
/* If we are operating on a disk image, then mount its root directory now, but leave out the
* rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
* but then with the uid shift known. That way we can mount VFAT file systems shifted to the
@@ -4004,12 +4052,31 @@ static int outer_child(
(arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
if (r < 0)
return r;
+ } else {
+ assert(arg_directory);
+ assert(!arg_image);
+
+ r = mount_nofollow_verbose(LOG_ERR, arg_directory, directory, /* fstype= */ NULL, MS_BIND|MS_REC, /* options= */ NULL);
+ if (r < 0)
+ return r;
}
r = determine_uid_shift(directory);
if (r < 0)
return r;
+ /* If we do userns on our own, we need to chown() all files ourselves first. Otherwise (userns is
+ * off, or we are in managed mode and hence already have the userns applied) we don't need to chown
+ * anything. */
+ uid_t chown_uid, chown_range;
+ if (in_child_chown()) {
+ chown_uid = arg_uid_shift;
+ chown_range = arg_uid_range;
+ } else {
+ chown_uid = 0;
+ chown_range = UINT32_C(0x10000);
+ }
+
if (arg_userns_mode != USER_NAMESPACE_NO) {
_cleanup_close_ int mntns_fd = -EBADF;
@@ -4041,33 +4108,15 @@ static int outer_child(
if (l != sizeof(arg_uid_shift))
return log_error_errno(SYNTHETIC_ERRNO(EIO),
"Short read while receiving UID shift.");
+
+ if (in_child_chown())
+ chown_uid = arg_uid_shift;
}
log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
"Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
}
- if (path_equal(directory, "/")) {
- /* If the directory we shall boot is the host, let's operate on a bind mount at a different
- * place, so that we can make changes to its mount structure (for example, to implement
- * --volatile=) without this interfering with our ability to access files such as
- * /etc/localtime to copy into the container. Note that we use a fixed place for this
- * (instead of a temporary directory, since we are living in our own mount namespace here
- * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
- (void) mkdir_p("/run/systemd/nspawn-root", 0755);
-
- r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
- if (r < 0)
- return r;
-
- directory = "/run/systemd/nspawn-root";
- }
-
- /* Make sure we always have a mount that we can move to root later on. */
- r = make_mount_point(directory);
- if (r < 0)
- return r;
-
/* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
* mount namespace. For the directory we are going to run our container let's turn this off, so that
* we'll live in our own little world from now on, and propagation from the host may only happen via
@@ -4086,7 +4135,7 @@ static int outer_child(
r = setup_volatile_mode(
directory,
arg_volatile_mode,
- arg_uid_shift,
+ chown_uid,
arg_selinux_apifs_context);
if (r < 0)
return r;
@@ -4094,8 +4143,8 @@ static int outer_child(
r = bind_user_prepare(
directory,
arg_bind_user,
- arg_uid_shift,
- arg_uid_range,
+ chown_uid,
+ chown_range,
&arg_custom_mounts, &arg_n_custom_mounts,
&bind_user_context);
if (r < 0)
@@ -4126,17 +4175,47 @@ static int outer_child(
directory,
arg_custom_mounts,
arg_n_custom_mounts,
- arg_uid_shift,
- arg_uid_range,
+ chown_uid,
+ chown_range,
arg_selinux_apifs_context,
MOUNT_ROOT_ONLY);
if (r < 0)
return r;
- if (arg_userns_mode != USER_NAMESPACE_NO &&
- IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
- arg_uid_shift != 0) {
+ if (!IN_SET(arg_userns_mode, USER_NAMESPACE_NO, USER_NAMESPACE_MANAGED) &&
+ IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_FOREIGN, USER_NAMESPACE_OWNERSHIP_AUTO) &&
+ chown_uid != 0) {
_cleanup_strv_free_ char **dirs = NULL;
+ RemountIdmapping mapping;
+
+ switch (arg_userns_ownership) {
+ case USER_NAMESPACE_OWNERSHIP_MAP:
+ mapping = REMOUNT_IDMAPPING_HOST_ROOT;
+ break;
+
+ case USER_NAMESPACE_OWNERSHIP_FOREIGN:
+ mapping = REMOUNT_IDMAPPING_FOREIGN_WITH_HOST_ROOT;
+ break;
+
+ case USER_NAMESPACE_OWNERSHIP_AUTO: {
+ struct stat st;
+
+ if (lstat(directory, &st) < 0)
+ return log_error_errno(errno, "Failed to stat() container root directory '%s': %m", directory);
+
+ r = stat_verify_directory(&st);
+ if (r < 0)
+ return log_error_errno(r, "Container root directory '%s' is not a directory: %m", directory);
+
+ mapping = uid_is_foreign(st.st_uid) ?
+ REMOUNT_IDMAPPING_FOREIGN_WITH_HOST_ROOT :
+ REMOUNT_IDMAPPING_HOST_ROOT;
+ break;
+ }
+
+ default:
+ assert_not_reached();
+ }
if (arg_volatile_mode != VOLATILE_YES) {
r = strv_extend(&dirs, directory);
@@ -4155,7 +4234,13 @@ static int outer_child(
return log_oom();
}
- r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
+ r = remount_idmap(
+ dirs,
+ chown_uid,
+ chown_range,
+ /* host_owner= */ UID_INVALID,
+ /* dest_owner= */ UID_INVALID,
+ mapping);
if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
/* This might fail because the kernel or file system doesn't support idmapping. We
* can't really distinguish this nicely, nor do we have any guarantees about the
@@ -4177,7 +4262,7 @@ static int outer_child(
r = setup_volatile_mode_after_remount_idmap(
directory,
arg_volatile_mode,
- arg_uid_shift,
+ chown_uid,
arg_selinux_apifs_context);
if (r < 0)
return r;
@@ -4187,8 +4272,8 @@ static int outer_child(
r = dissected_image_mount_and_warn(
dissected_image,
directory,
- arg_uid_shift,
- arg_uid_range,
+ chown_uid,
+ chown_range,
/* userns_fd= */ -EBADF,
determine_dissect_image_flags()|
DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
@@ -4212,11 +4297,11 @@ static int outer_child(
"Short write while sending cgroup mode.");
}
- r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
+ r = recursive_chown(directory, chown_uid, chown_range);
if (r < 0)
return r;
- r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
+ r = base_filesystem_create(directory, chown_uid, (gid_t) chown_uid);
if (r < 0)
return r;
@@ -4229,7 +4314,7 @@ static int outer_child(
r = mount_all(directory,
arg_mount_settings,
- arg_uid_shift,
+ chown_uid,
arg_selinux_apifs_context);
if (r < 0)
return r;
@@ -4247,16 +4332,16 @@ static int outer_child(
if (r < 0)
return r;
- (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
+ (void) dev_setup(directory, chown_uid, chown_uid);
p = prefix_roota(directory, "/run/host");
- (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
+ (void) make_inaccessible_nodes(p, chown_uid, chown_uid);
r = setup_unix_export_host_inside(directory, unix_export_path);
if (r < 0)
return r;
- r = setup_pts(directory);
+ r = setup_pts(directory, chown_uid);
if (r < 0)
return r;
@@ -4280,8 +4365,8 @@ static int outer_child(
directory,
arg_custom_mounts,
arg_n_custom_mounts,
- arg_uid_shift,
- arg_uid_range,
+ chown_uid,
+ chown_range,
arg_selinux_apifs_context,
MOUNT_NON_ROOT_ONLY);
if (r < 0)
@@ -4316,8 +4401,8 @@ static int outer_child(
directory,
arg_unified_cgroup_hierarchy,
arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
+ chown_uid,
+ chown_range,
arg_selinux_apifs_context,
false);
if (r < 0)
@@ -4333,7 +4418,7 @@ static int outer_child(
* (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */
_cleanup_close_ int notify_fd = -EBADF;
- if (arg_privileged) {
+ if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
/* Mark everything as shared so our mounts get propagated down. This is required to make new
* bind mounts available in systemd services inside the container that create a new mount
* namespace. See https://github.com/systemd/systemd/issues/3860 Further submounts (such as
@@ -4376,8 +4461,8 @@ static int outer_child(
pid = raw_clone(SIGCHLD|CLONE_NEWNS|
arg_clone_ns_flags |
- (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0) |
- ((arg_private_network && !arg_privileged) ? CLONE_NEWNET : 0));
+ (IN_SET(arg_userns_mode, USER_NAMESPACE_FIXED, USER_NAMESPACE_PICK) ? CLONE_NEWUSER : 0) |
+ ((arg_private_network && arg_userns_mode == USER_NAMESPACE_MANAGED) ? CLONE_NEWNET : 0));
if (pid < 0)
return log_error_errno(errno, "Failed to fork inner child: %m");
if (pid == 0) {
@@ -4396,9 +4481,10 @@ static int outer_child(
return log_error_errno(r, "Failed to join network namespace: %m");
}
- if (!arg_privileged) {
- /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them
- * inside the inner namespaces, but before we switch root. Hence do so here. */
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+ /* In managed user namespace operation, sysfs + procfs are special, we'll have to
+ * mount them inside the inner namespaces, but before we switch root. Hence do so
+ * here. */
_cleanup_free_ char *j = path_join(directory, "/proc");
if (!j)
return log_oom();
@@ -5161,6 +5247,8 @@ static int load_oci_bundle(void) {
}
static int run_container(
+ const char *directory,
+ int mount_fd,
DissectedImage *dissected_image,
int userns_fd,
FDSet *fds,
@@ -5248,9 +5336,8 @@ static int run_container(
"Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
}
- if (arg_privileged) {
+ if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
assert(userns_fd < 0);
-
/* If we have no user namespace then we'll clone and create a new mount namespace right-away. */
*pid = raw_clone(SIGCHLD|CLONE_NEWNS);
@@ -5260,7 +5347,6 @@ static int run_container(
", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
} else {
assert(userns_fd >= 0);
-
/* If we have a user namespace then we'll clone() first, and then join the user namespace,
* and then open the mount namespace, so that it is owned by the user namespace */
@@ -5298,7 +5384,8 @@ static int run_container(
(void) reset_signal_mask();
r = outer_child(&barrier,
- arg_directory,
+ directory,
+ mount_fd,
dissected_image,
fd_outer_socket_pair[1],
fd_inner_socket_pair[1],
@@ -5416,9 +5503,11 @@ static int run_container(
if (!barrier_place_and_sync(&barrier)) /* #1 */
return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
- r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
- if (r < 0)
- return r;
+ if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
+ r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
+ if (r < 0)
+ return r;
+ }
(void) barrier_place(&barrier); /* #2 */
}
@@ -5442,7 +5531,7 @@ static int run_container(
return r;
if (arg_network_veth) {
- if (arg_privileged) {
+ if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
r = setup_veth(arg_machine, *pid, veth_name,
arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
if (r < 0)
@@ -5580,7 +5669,7 @@ static int run_container(
arg_unified_cgroup_hierarchy,
arg_uid_shift,
userns_fd,
- arg_privileged);
+ arg_userns_mode);
if (r < 0)
return r;
@@ -5622,7 +5711,7 @@ static int run_container(
if (!barrier_sync(&barrier)) /* #5.1 */
return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
- if (arg_userns_mode != USER_NAMESPACE_NO) {
+ if (!IN_SET(arg_userns_mode, USER_NAMESPACE_NO, USER_NAMESPACE_MANAGED)) {
r = wipe_fully_visible_api_fs(mntns_fd);
if (r < 0)
return r;
@@ -5749,7 +5838,7 @@ static int run_container(
fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
- if (arg_private_network && arg_privileged) {
+ if (arg_private_network && arg_userns_mode != USER_NAMESPACE_MANAGED) {
r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces);
if (r < 0)
return r;
@@ -5914,15 +6003,25 @@ static int cant_be_in_netns(void) {
return 0;
}
+static void initialize_defaults(void) {
+ arg_privileged = getuid() == 0; /* privileged here means: our real UID is 0 */
+
+ /* If running unprivileged, default to managed user namespaces (systemd-nsresourced operation); otherwise default to no user namespacing */
+ arg_userns_mode = arg_privileged ? USER_NAMESPACE_NO : USER_NAMESPACE_MANAGED;
+
+ /* Imply private networking for unprivileged operation, since the kernel otherwise refuses mounting sysfs */
+ arg_private_network = !arg_privileged;
+}
+
static int run(int argc, char *argv[]) {
- bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
- _cleanup_close_ int master = -EBADF, userns_fd = -EBADF;
+ bool remove_directory = false, remove_image = false, veth_created = false;
+ _cleanup_close_ int master = -EBADF, userns_fd = -EBADF, mount_fd = -EBADF;
_cleanup_fdset_free_ FDSet *fds = NULL;
int r, n_fd_passed, ret = EXIT_SUCCESS;
char veth_name[IFNAMSIZ] = "";
struct ExposeArgs expose_args = {};
_cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
- char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
+ _cleanup_(rmdir_and_freep) char *rootdir = NULL;
_cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
_cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
_cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
@@ -5930,7 +6029,7 @@ static int run(int argc, char *argv[]) {
log_setup();
- arg_privileged = getuid() == 0;
+ initialize_defaults();
r = parse_argv(argc, argv);
if (r <= 0)
@@ -5987,14 +6086,14 @@ static int run(int argc, char *argv[]) {
/* Reapply environment settings. */
(void) detect_unified_cgroup_hierarchy_from_environment();
- if (!arg_privileged) {
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
r = cg_all_unified();
if (r < 0) {
log_error_errno(r, "Failed to determine if we are in unified cgroupv2 mode: %m");
goto finish;
}
if (r == 0)
- return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unprivileged operation only supported in unified cgroupv2 mode.");
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Managed user namespace operation only supported in unified cgroupv2 mode.");
}
/* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
@@ -6023,14 +6122,33 @@ static int run(int argc, char *argv[]) {
if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
arg_quiet = true;
- if (arg_directory) {
- assert(!arg_image);
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+ /* Let's allocate a 64K userns first, if managed mode is chosen */
+
+ _cleanup_free_ char *userns_name = NULL;
+ if (asprintf(&userns_name, "nspawn-" PID_FMT "-%s", getpid_cached(), arg_machine) < 0) {
+ r = log_oom();
+ goto finish;
+ }
+
+ userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
+ if (userns_fd < 0) {
+ r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
+ goto finish;
+ }
- if (!arg_privileged) {
- r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges.");
+ r = userns_get_base_uid(userns_fd, &arg_uid_shift, /* ret_gid= */ NULL);
+ if (r < 0) {
+ log_error_errno(r, "Failed to determine UID shift from userns: %m");
goto finish;
}
+ arg_uid_range = UINT32_C(0x10000);
+ }
+
+ if (arg_directory) {
+ assert(!arg_image);
+
/* Safety precaution: let's not allow running images from the live host OS image, as long as
* /var from the host will propagate into container dynamically (because bad things happen if
* two systems write to the same /var). Let's allow it for the special cases where /var is
@@ -6200,6 +6318,15 @@ static int run(int argc, char *argv[]) {
}
}
+ if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+ r = mountfsd_mount_directory(
+ arg_directory,
+ userns_fd,
+ determine_dissect_image_flags(),
+ &mount_fd);
+ if (r < 0)
+ goto finish;
+ }
} else {
DissectImageFlags dissect_image_flags =
determine_dissect_image_flags();
@@ -6274,20 +6401,7 @@ static int run(int argc, char *argv[]) {
dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
}
- if (!mkdtemp(tmprootdir)) {
- r = log_error_errno(errno, "Failed to create temporary directory: %m");
- goto finish;
- }
-
- remove_tmprootdir = true;
-
- arg_directory = strdup(tmprootdir);
- if (!arg_directory) {
- r = log_oom();
- goto finish;
- }
-
- if (arg_privileged) {
+ if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
r = loop_device_make_by_path(
arg_image,
arg_read_only ? O_RDONLY : O_RDWR,
@@ -6339,19 +6453,6 @@ static int run(int argc, char *argv[]) {
if (r < 0)
goto finish;
} else {
- _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine);
- if (!userns_name) {
- r = log_oom();
- goto finish;
- }
-
- /* if we are unprivileged, let's allocate a 64K userns first */
- userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
- if (userns_fd < 0) {
- r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
- goto finish;
- }
-
r = mountfsd_mount_image(
arg_image,
userns_fd,
@@ -6370,7 +6471,14 @@ static int run(int argc, char *argv[]) {
arg_architecture = dissected_image_architecture(dissected_image);
}
- r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
+ /* Create a temporary place to mount stuff. */
+ r = mkdtemp_malloc("/tmp/nspawn-root-XXXXXX", &rootdir);
+ if (r < 0) {
+ log_error_errno(r, "Failed to create temporary directory: %m");
+ goto finish;
+ }
+
+ r = custom_mount_prepare_all(rootdir, arg_custom_mounts, arg_n_custom_mounts);
if (r < 0)
goto finish;
@@ -6405,6 +6513,8 @@ static int run(int argc, char *argv[]) {
}
for (;;) {
r = run_container(
+ rootdir,
+ mount_fd,
dissected_image,
userns_fd,
fds,
@@ -6447,12 +6557,7 @@ finish:
log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
}
- if (remove_tmprootdir) {
- if (rmdir(tmprootdir) < 0)
- log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
- }
-
- if (arg_machine && arg_privileged) {
+ if (arg_machine && arg_userns_mode != USER_NAMESPACE_MANAGED) {
const char *p;
p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
@@ -6466,7 +6571,7 @@ finish:
expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET, &expose_args.address4);
expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
- if (arg_privileged) {
+ if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
if (veth_created)
(void) remove_veth_links(veth_name, arg_network_veth_extra);
(void) remove_bridge(arg_network_zone);