7 files changed, 340 insertions, 221 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 4f28b4a225..4ee21c0779 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -18,6 +18,7 @@
 #include "rm-rf.h"
 #include "string-util.h"
 #include "strv.h"
+#include "tmpfile-util.h"
 #include "user-util.h"
 
 static int chown_cgroup_path(const char *path, uid_t uid_shift) {
@@ -48,8 +49,9 @@ static int chown_cgroup_path(const char *path, uid_t uid_shift) {
 }
 
 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) {
+        _cleanup_(rmdir_and_freep) char *tree = NULL;
         _cleanup_free_ char *cgroup = NULL;
-        char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
+        char pid_string[DECIMAL_STR_MAX(pid) + 1];
         bool undo_mount = false;
         const char *fn;
         int r, unified_controller;
@@ -70,8 +72,9 @@ int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift) {
                 return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
 
         /* In order to access the unified hierarchy we need to mount it */
-        if (!mkdtemp(tree))
-                return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
+        r = mkdtemp_malloc("/tmp/unifiedXXXXXX", &tree);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate temporary mount point for unified hierarchy: %m");
 
         if (unified_controller > 0)
                 r = mount_nofollow_verbose(LOG_ERR, "cgroup", tree, "cgroup",
@@ -107,7 +110,6 @@ finish:
         if (undo_mount)
                 (void) umount_verbose(LOG_ERR, tree, UMOUNT_NOFOLLOW);
 
-        (void) rmdir(tree);
         return r;
 }
 
@@ -117,7 +119,7 @@ int create_subcgroup(
                 CGroupUnified unified_requested,
                 uid_t uid_shift,
                 int userns_fd,
-                bool privileged) {
+                UserNamespaceMode userns_mode) {
 
         _cleanup_free_ char *cgroup = NULL, *payload = NULL;
         CGroupMask supported;
@@ -161,14 +163,14 @@ int create_subcgroup(
         if (!payload)
                 return log_oom();
 
-        if (privileged)
+        if (userns_mode != USER_NAMESPACE_MANAGED)
                 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
         else
                 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, payload);
         if (r < 0)
                 return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
 
-        if (privileged) {
+        if (userns_mode != USER_NAMESPACE_MANAGED) {
                 _cleanup_free_ char *fs = NULL;
                 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, payload, NULL, &fs);
                 if (r < 0)
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index 7e2cd53ddc..8f039ffb28 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -5,9 +5,10 @@
 #include <sys/types.h>
 
 #include "cgroup-util.h"
+#include "nspawn-settings.h"
 
 int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
-int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested, uid_t uid_shift, int userns_fd, bool privileged);
+int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested, uid_t uid_shift, int userns_fd, UserNamespaceMode userns_mode);
 
 int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
 int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index d5370c22b9..552d629a18 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -127,18 +127,15 @@ static char *resolve_source_path(const char *dest, const char *source) {
 }
 
 static int allocate_temporary_source(CustomMount *m) {
+        int r;
+
         assert(m);
         assert(!m->source);
         assert(!m->rm_rf_tmpdir);
 
-        m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
-        if (!m->rm_rf_tmpdir)
-                return log_oom();
-
-        if (!mkdtemp(m->rm_rf_tmpdir)) {
-                m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
-                return log_error_errno(errno, "Failed to acquire temporary directory: %m");
-        }
+        r = mkdtemp_malloc("/var/tmp/nspawn-temp-XXXXXX", &m->rm_rf_tmpdir);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire temporary directory: %m");
 
         m->source = path_join(m->rm_rf_tmpdir, "src");
         if (!m->source)
@@ -153,7 +150,7 @@ static int allocate_temporary_source(CustomMount *m) {
 int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
         int r;
 
-        /* Prepare all custom mounts. This will make source we know all temporary directories. This is called in the
+        /* Prepare all custom mounts. This will make sure we know all temporary directories. This is called in the
          * parent process, so that we know the temporary directories to remove on exit before we fork off the
          * children. */
 
@@ -162,9 +159,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
         /* Order the custom mounts, and make sure we have a working directory */
         typesafe_qsort(l, n, custom_mount_compare);
 
-        for (size_t i = 0; i < n; i++) {
-                CustomMount *m = l + i;
-
+        FOREACH_ARRAY(m, l, n) {
                 /* /proc we mount in the inner child, i.e. when we acquired CLONE_NEWPID. All other mounts we mount
                  * already in the outer child, so that the mounts are already established before CLONE_NEWPID and in
                  * particular CLONE_NEWUSER. This also means any custom mounts below /proc also need to be mounted in
@@ -593,17 +588,17 @@ int mount_all(const char *dest,
 
                 /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing when we are privileged) */
                 { "tmpfs",                  "/tmp",                         "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
-                  MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
+                  MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR|MOUNT_USRQUOTA_GRACEFUL },
                 { "tmpfs",                  "/sys",                         "tmpfs", "mode=0555" TMPFS_LIMITS_SYS,     MS_NOSUID|MS_NOEXEC|MS_NODEV,
-                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_PRIVILEGED },
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR|MOUNT_UNMANAGED },
                 { "sysfs",                  "/sys",                         "sysfs", NULL,                             SYS_DEFAULT_MOUNT_FLAGS,
-                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_PRIVILEGED },    /* skipped if above was mounted */
+                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR|MOUNT_UNMANAGED },    /* skipped if above was mounted */
                 { "sysfs",                  "/sys",                         "sysfs", NULL,                             MS_NOSUID|MS_NOEXEC|MS_NODEV,
-                  MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PRIVILEGED },                          /* skipped if above was mounted */
+                  MOUNT_FATAL|MOUNT_MKDIR|MOUNT_UNMANAGED },                          /* skipped if above was mounted */
                 { "tmpfs",                  "/dev",                         "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME,
                   MOUNT_FATAL|MOUNT_MKDIR },
                 { "tmpfs",                  "/dev/shm",                     "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
-                  MOUNT_FATAL|MOUNT_MKDIR },
+                  MOUNT_FATAL|MOUNT_MKDIR|MOUNT_USRQUOTA_GRACEFUL },
                 { "tmpfs",                  "/run",                         "tmpfs", "mode=0755" TMPFS_LIMITS_RUN,     MS_NOSUID|MS_NODEV|MS_STRICTATIME,
                   MOUNT_FATAL|MOUNT_MKDIR },
                 { "/run/host",              "/run/host",                    NULL,    NULL,                             MS_BIND,
@@ -622,9 +617,9 @@ int mount_all(const char *dest,
                 { "/sys/fs/selinux",        "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND,
                   MOUNT_MKDIR|MOUNT_PRIVILEGED },  /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
                 { NULL,                     "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
-                  MOUNT_PRIVILEGED },              /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
+                  MOUNT_UNMANAGED|MOUNT_PRIVILEGED },  /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
                 { NULL,                     "/sys/fs/selinux",              NULL,    NULL,                             MS_PRIVATE,
-                  MOUNT_PRIVILEGED },              /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
+                  MOUNT_UNMANAGED|MOUNT_PRIVILEGED },  /* Turn off propagation (we only want that for the mount propagation tunnel dir) */
 #endif
         };
 
@@ -633,6 +628,7 @@ int mount_all(const char *dest,
         bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO);
         bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS);
         bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP);
+        bool unmanaged = FLAGS_SET(mount_settings, MOUNT_UNMANAGED);
         bool privileged = FLAGS_SET(mount_settings, MOUNT_PRIVILEGED);
         int r;
 
@@ -641,8 +637,9 @@ int mount_all(const char *dest,
                 bool fatal = FLAGS_SET(m->mount_settings, MOUNT_FATAL);
                 const char *o;
 
-                /* If we are not privileged but the entry is marked as privileged and to be mounted outside the user namespace, then skip it */
-                if (!privileged && FLAGS_SET(m->mount_settings, MOUNT_PRIVILEGED) && !FLAGS_SET(m->mount_settings, MOUNT_IN_USERNS))
+                /* If we are in managed user namespace mode but the entry is marked for mount outside of
+                 * managed user namespace mode, and to be mounted outside the user namespace, then skip it */
+                if (!unmanaged && FLAGS_SET(m->mount_settings, MOUNT_UNMANAGED) && !FLAGS_SET(m->mount_settings, MOUNT_IN_USERNS))
                         continue;
 
                 if (in_userns != FLAGS_SET(m->mount_settings, MOUNT_IN_USERNS))
@@ -657,6 +654,9 @@ int mount_all(const char *dest,
                 if (!tmpfs_tmp && FLAGS_SET(m->mount_settings, MOUNT_APPLY_TMPFS_TMP))
                         continue;
 
+                if (!privileged && FLAGS_SET(m->mount_settings, MOUNT_PRIVILEGED))
+                        continue;
+
                 r = chase(m->where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where, NULL);
                 if (r < 0)
                         return log_error_errno(r, "Failed to resolve %s%s: %m", strempty(dest), m->where);
@@ -711,6 +711,23 @@ int mount_all(const char *dest,
                                 o = options;
                 }
 
+                if (FLAGS_SET(m->mount_settings, MOUNT_USRQUOTA_GRACEFUL)) {
+                        r = mount_option_supported(m->type, /* key= */ "usrquota", /* value= */ NULL);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to determine if '%s' supports 'usrquota', assuming it doesn't: %m", m->type);
+                        else if (r == 0)
+                                log_debug("Kernel doesn't support 'usrquota' on '%s', not including in mount options for '%s'.", m->type, m->where);
+                        else {
+                                _cleanup_free_ char *joined = NULL;
+
+                                if (!strextend_with_separator(&joined, ",", o ?: POINTER_MAX, "usrquota"))
+                                        return log_oom();
+
+                                free_and_replace(options, joined);
+                                o = options;
+                        }
+                }
+
                 if (FLAGS_SET(m->mount_settings, MOUNT_PREFIX_ROOT)) {
                         /* Optionally prefix the mount source with the root dir. This is useful in bind
                          * mounts to be created within the container image before we transition into it. Note
@@ -1101,7 +1118,7 @@ static int setup_volatile_state_after_remount_idmap(const char *directory, uid_t
 
 static int setup_volatile_yes(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
         bool tmpfs_mounted = false, bind_mounted = false;
-        char template[] = "/tmp/nspawn-volatile-XXXXXX";
+        _cleanup_(rmdir_and_freep) char *template = NULL;
         _cleanup_free_ char *buf = NULL, *bindir = NULL;
         const char *f, *t, *options;
         struct stat st;
@@ -1130,8 +1147,9 @@ static int setup_volatile_yes(const char *directory, uid_t uid_shift, const char
                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                        "Error starting image: if --volatile=yes is used /bin must be a symlink (for merged /usr support) or non-existent (in which case a symlink is created automatically).");
 
-        if (!mkdtemp(template))
-                return log_error_errno(errno, "Failed to create temporary directory: %m");
+        r = mkdtemp_malloc("/tmp/nspawn-volatile-XXXXXX", &template);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create temporary directory: %m");
 
         options = "mode=0755" TMPFS_LIMITS_ROOTFS;
         r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
@@ -1182,13 +1200,12 @@ fail:
         if (tmpfs_mounted)
                 (void) umount_verbose(LOG_ERR, template, UMOUNT_NOFOLLOW);
 
-        (void) rmdir(template);
         return r;
 }
 
 static int setup_volatile_overlay(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
         _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL;
-        char template[] = "/tmp/nspawn-volatile-XXXXXX";
+        _cleanup_(rmdir_and_freep) char *template = NULL;
         const char *upper, *work, *options;
         bool tmpfs_mounted = false;
         int r;
@@ -1197,8 +1214,9 @@ static int setup_volatile_overlay(const char *directory, uid_t uid_shift, const
 
         /* --volatile=overlay means we mount an overlayfs to the root dir. */
 
-        if (!mkdtemp(template))
-                return log_error_errno(errno, "Failed to create temporary directory: %m");
+        r = mkdtemp_malloc("/tmp/nspawn-volatile-XXXXXX", &template);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create temporary directory: %m");
 
         options = "mode=0755" TMPFS_LIMITS_ROOTFS;
         r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
@@ -1243,7 +1261,6 @@ finish:
         if (tmpfs_mounted)
                 (void) umount_verbose(LOG_ERR, template, UMOUNT_NOFOLLOW);
 
-        (void) rmdir(template);
         return r;
 }
 
@@ -1322,8 +1339,7 @@ int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s
 int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
         _cleanup_free_ char *directory_pivot_root_new = NULL;
         _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
-        char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
-        bool remove_pivot_tmp = false;
+        _cleanup_(rmdir_and_freep) char *pivot_tmp = NULL;
         int r;
 
         assert(directory);
@@ -1364,43 +1380,33 @@ int setup_pivot_root(const char *directory, const char *pivot_root_new, const ch
         /* Remount directory_pivot_root_new to make it movable. */
         r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
         if (r < 0)
-                goto done;
+                return r;
 
         if (pivot_root_old) {
-                if (!mkdtemp(pivot_tmp)) {
-                        r = log_error_errno(errno, "Failed to create temporary directory: %m");
-                        goto done;
-                }
+                r = mkdtemp_malloc("/tmp/nspawn-pivot-XXXXXX", &pivot_tmp);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create temporary directory: %m");
 
-                remove_pivot_tmp = true;
                 pivot_tmp_pivot_root_old = path_join(pivot_tmp, pivot_root_old);
-                if (!pivot_tmp_pivot_root_old) {
-                        r = log_oom();
-                        goto done;
-                }
+                if (!pivot_tmp_pivot_root_old)
+                        return log_oom();
 
                 r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
                 if (r < 0)
-                        goto done;
+                        return r;
 
                 r = mount_nofollow_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
                 if (r < 0)
-                        goto done;
+                        return r;
 
                 r = mount_nofollow_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
-                if (r < 0)
-                        goto done;
-        } else {
+        } else
                 r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
-                if (r < 0)
-                        goto done;
-        }
 
-done:
-        if (remove_pivot_tmp)
-                (void) rmdir(pivot_tmp);
+        if (r < 0)
+                return r;
 
-        return r;
+        return 0;
 }
 
 #define NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS "/run/host/proc"
diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h
index 53aa993d6a..87b3b91c43 100644
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@@ -20,7 +20,9 @@ typedef enum MountSettingsMask {
         MOUNT_TOUCH              = 1 << 9, /* if set, touch file to mount over first */
         MOUNT_PREFIX_ROOT        = 1 << 10,/* if set, prefix the source path with the container's root directory */
         MOUNT_FOLLOW_SYMLINKS    = 1 << 11,/* if set, we'll follow symlinks for the mount target */
-        MOUNT_PRIVILEGED         = 1 << 12,/* if set, we'll only mount this in the outer child if we are running in privileged mode */
+        MOUNT_UNMANAGED          = 1 << 12,/* if set, we'll only mount this in the outer child if we are not running in managed user namespace mode */
+        MOUNT_PRIVILEGED         = 1 << 13,/* if set, we'll only mount this if we have full privileges */
+        MOUNT_USRQUOTA_GRACEFUL  = 1 << 14,/* if set, append "usrquota" to mount options if kernel tmpfs supports that */
 } MountSettingsMask;
 
 typedef enum CustomMountType {
diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c
index 7842d93c34..2d883e2196 100644
--- a/src/nspawn/nspawn-settings.c
+++ b/src/nspawn/nspawn-settings.c
@@ -930,10 +930,11 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(timezone_mode, TimezoneMode, TIMEZONE_AU
 DEFINE_CONFIG_PARSE_ENUM(config_parse_userns_ownership, user_namespace_ownership, UserNamespaceOwnership);
 
 static const char *const user_namespace_ownership_table[_USER_NAMESPACE_OWNERSHIP_MAX] = {
-        [USER_NAMESPACE_OWNERSHIP_OFF]   = "off",
-        [USER_NAMESPACE_OWNERSHIP_CHOWN] = "chown",
-        [USER_NAMESPACE_OWNERSHIP_MAP]   = "map",
-        [USER_NAMESPACE_OWNERSHIP_AUTO]  = "auto",
+        [USER_NAMESPACE_OWNERSHIP_OFF]     = "off",
+        [USER_NAMESPACE_OWNERSHIP_CHOWN]   = "chown",
+        [USER_NAMESPACE_OWNERSHIP_MAP]     = "map",
+        [USER_NAMESPACE_OWNERSHIP_FOREIGN] = "foreign",
+        [USER_NAMESPACE_OWNERSHIP_AUTO]    = "auto",
 };
 
 /* Note: while "yes" maps to "auto" here, we don't really document that, in order to make things clearer and less confusing to users. */
diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h
index 135b3dbb0a..0b30506391 100644
--- a/src/nspawn/nspawn-settings.h
+++ b/src/nspawn/nspawn-settings.h
@@ -29,14 +29,16 @@ typedef enum UserNamespaceMode {
         USER_NAMESPACE_NO,
         USER_NAMESPACE_FIXED,
         USER_NAMESPACE_PICK,
+        USER_NAMESPACE_MANAGED,
         _USER_NAMESPACE_MODE_MAX,
         _USER_NAMESPACE_MODE_INVALID = -EINVAL,
 } UserNamespaceMode;
 
 typedef enum UserNamespaceOwnership {
-        USER_NAMESPACE_OWNERSHIP_OFF,
-        USER_NAMESPACE_OWNERSHIP_CHOWN,
-        USER_NAMESPACE_OWNERSHIP_MAP,
+        USER_NAMESPACE_OWNERSHIP_OFF,     /* do not change ownership */
+        USER_NAMESPACE_OWNERSHIP_CHOWN,   /* chown to target range */
+        USER_NAMESPACE_OWNERSHIP_MAP,     /* map from 0x00000000…0x0000FFFF range to target range */
+        USER_NAMESPACE_OWNERSHIP_FOREIGN, /* map from 0x7FFE0000…0x7FFEFFFF range to target range */
         USER_NAMESPACE_OWNERSHIP_AUTO,
         _USER_NAMESPACE_OWNERSHIP_MAX,
         _USER_NAMESPACE_OWNERSHIP_INVALID = -1,
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 7ceda0b30c..4c054b2dbb 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -107,6 +107,7 @@
 #include "sysctl-util.h"
 #include "terminal-util.h"
 #include "tmpfile-util.h"
+#include "uid-classification.h"
 #include "umask-util.h"
 #include "unit-name.h"
 #include "user-util.h"
@@ -139,7 +140,7 @@ static char *arg_hostname = NULL;    /* The name the payload sees by default */
 static const char *arg_selinux_context = NULL;
 static const char *arg_selinux_apifs_context = NULL;
 static char *arg_slice = NULL;
-static bool arg_private_network = false;
+static bool arg_private_network; /* initialized depending on arg_privileged in run() */
 static bool arg_read_only = false;
 static StartMode arg_start_mode = START_PID1;
 static bool arg_ephemeral = false;
@@ -197,7 +198,7 @@ static VolatileMode arg_volatile_mode = VOLATILE_NO;
 static ExposePort *arg_expose_ports = NULL;
 static char **arg_property = NULL;
 static sd_bus_message *arg_property_message = NULL;
-static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
+static UserNamespaceMode arg_userns_mode; /* initialized depending on arg_privileged in run() */
 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
 static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
 static int arg_kill_signal = 0;
@@ -369,7 +370,7 @@ static int help(void) {
                "                            the service unit nspawn is running in\n"
                "\n%3$sUser Namespacing:%4$s\n"
                "     --private-users=no     Run without user namespacing\n"
-               "     --private-users=yes|pick|identity\n"
+               "     --private-users=yes|pick|identity|managed\n"
                "                            Run within user namespace, autoselect UID/GID range\n"
                "     --private-users=UIDBASE[:NUIDS]\n"
                "                            Similar, but with user configured UID/GID range\n"
@@ -518,7 +519,7 @@ static int detect_unified_cgroup_hierarchy_from_environment(void) {
 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
         int r;
 
-        if (!arg_privileged) {
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
                 /* We only support the unified mode when running unprivileged */
                 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
                 return 0;
@@ -1257,6 +1258,11 @@ static int parse_argv(int argc, char *argv[]) {
                                 arg_userns_mode = USER_NAMESPACE_FIXED;
                                 arg_uid_shift = 0;
                                 arg_uid_range = UINT32_C(0x10000);
+                        } else if (streq(optarg, "managed")) {
+                                /* managed: User namespace on, and acquire it from systemd-nsresourced */
+                                arg_userns_mode = USER_NAMESPACE_MANAGED;
+                                arg_uid_shift = UID_INVALID;
+                                arg_uid_range = UINT32_C(0x10000);
                         } else {
                                 /* anything else: User namespacing on, UID range is explicitly configured */
                                 r = parse_userns_uid_range(optarg, &arg_uid_shift, &arg_uid_range);
@@ -1271,9 +1277,8 @@ static int parse_argv(int argc, char *argv[]) {
 
                 case 'U':
                         if (userns_supported()) {
-                                arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
-                                                                        * implied by USER_NAMESPACE_PICK
-                                                                        * further down. */
+                                /* Note that arg_userns_ownership is implied by USER_NAMESPACE_PICK and USER_NAMESPACE_MANAGED further down. */
+                                arg_userns_mode = arg_privileged ? USER_NAMESPACE_PICK : USER_NAMESPACE_MANAGED;
                                 arg_uid_shift = UID_INVALID;
                                 arg_uid_range = UINT32_C(0x10000);
 
@@ -1656,14 +1661,23 @@ static int parse_argv(int argc, char *argv[]) {
 static int verify_arguments(void) {
         int r;
 
-        SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged);
+        SET_FLAG(arg_mount_settings, MOUNT_UNMANAGED, arg_userns_mode != USER_NAMESPACE_MANAGED);
 
-        if (!arg_privileged) {
-                if (!arg_private_network) {
-                        log_notice("Automatically implying --private-network, since mounting /sys/ in an unprivileged user namespaces requires network namespacing.");
-                        arg_private_network = true;
-                }
-        }
+        /* We can mount selinuxfs only if we are privileged and can do so before userns. In managed mode we
+         * have to enter the userns earlier, hence cannot do that. */
+        /* SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_privileged); */
+        SET_FLAG(arg_mount_settings, MOUNT_PRIVILEGED, arg_userns_mode != USER_NAMESPACE_MANAGED);
+
+        SET_FLAG(arg_mount_settings, MOUNT_USE_USERNS, arg_userns_mode != USER_NAMESPACE_NO);
+
+        if (arg_private_network)
+                SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, arg_private_network);
+
+        if (!arg_privileged && arg_userns_mode != USER_NAMESPACE_MANAGED)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unprivileged operation requires managed user namespaces, as otherwise no UID range can be acquired.");
+
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED && !arg_private_network)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Managed user namespace operation requires private networking, as otherwise /sys/ may not be mounted.");
 
         if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
                 /* If we are running the stub init in the container, we don't need to look at what the init
@@ -1684,12 +1698,6 @@ static int verify_arguments(void) {
                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
         }
 
-        if (arg_userns_mode != USER_NAMESPACE_NO)
-                arg_mount_settings |= MOUNT_USE_USERNS;
-
-        if (arg_private_network)
-                arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
-
         if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
             !(arg_clone_ns_flags & CLONE_NEWUTS)) {
                 arg_register = false;
@@ -1699,8 +1707,7 @@ static int verify_arguments(void) {
 
         if (arg_userns_ownership < 0)
                 arg_userns_ownership =
-                        arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
-                                                                 USER_NAMESPACE_OWNERSHIP_OFF;
+                        IN_SET(arg_userns_mode, USER_NAMESPACE_PICK, USER_NAMESPACE_MANAGED) ? USER_NAMESPACE_OWNERSHIP_AUTO : USER_NAMESPACE_OWNERSHIP_OFF;
 
         if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
                 arg_kill_signal = SIGRTMIN+3;
@@ -1809,10 +1816,18 @@ static int verify_network_interfaces_initialized(void) {
         return 0;
 }
 
+static int in_child_chown(void) {
+        /* Returns true when chown()ing inodes we create inside the outer child is required. Basically, we
+         * need the chowning when we implement userns ourselves. If userns is off we don't need to chown(),
+         * obviously. And if we are in managed mode we already entered the userns, and hence don't need to
+         * manually chown either. */
+        return IN_SET(arg_userns_mode, USER_NAMESPACE_PICK, USER_NAMESPACE_FIXED);
+}
+
 static int userns_chown_at(int fd, const char *fname, uid_t uid, gid_t gid, int flags) {
         assert(fd >= 0 || fd == AT_FDCWD);
 
-        if (arg_userns_mode == USER_NAMESPACE_NO)
+        if (!in_child_chown())
                 return 0;
 
         if (uid == UID_INVALID && gid == GID_INVALID)
@@ -2295,18 +2310,24 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno
         if (r < 0)
                 return log_error_errno(r, "Failed to create directory %s: %m", parent);
 
-        if (mknod(to, st.st_mode, st.st_rdev) < 0) {
-                r = -errno; /* Save the original error code. */
+        r = RET_NERRNO(mknod(to, st.st_mode, st.st_rdev));
+        if (r < 0) {
                 /* Explicitly warn the user when /dev/ is already populated. */
                 if (r == -EEXIST)
                         log_notice("%s/dev/ is pre-mounted and pre-populated. If a pre-mounted /dev/ is provided it needs to be an unpopulated file system.", dest);
+
                 /* If arg_uid_shift != 0, then we cannot fall back to use bind mount. */
-                if (arg_uid_shift != 0) {
+                if (!(arg_userns_mode == USER_NAMESPACE_NO ||
+                      (arg_userns_mode == USER_NAMESPACE_FIXED && arg_uid_shift == 0))) {
                         if (ignore_mknod_failure) {
                                 log_debug_errno(r, "Failed to mknod(%s), ignoring: %m", to);
                                 return 0;
                         }
-                        return log_error_errno(r, "Failed to mknod(%s): %m", to);
+
+                        if (arg_userns_mode != USER_NAMESPACE_MANAGED || !ERRNO_IS_NEG_PRIVILEGE(r))
+                                return log_error_errno(r, "Failed to mknod(%s): %m", to);
+
+                        log_debug_errno(r, "Failed to create device node '%s' and running in managed mode, resorting to bind mount: %m", to);
                 }
 
                 /* Some systems abusively restrict mknod but allow bind mounts. */
@@ -2402,7 +2423,7 @@ static int make_extra_nodes(const char *dest) {
         return 0;
 }
 
-static int setup_pts(const char *dest) {
+static int setup_pts(const char *dest, uid_t chown_uid) {
         _cleanup_free_ char *options = NULL;
         const char *p;
         int r;
@@ -2411,13 +2432,13 @@ static int setup_pts(const char *dest) {
         if (arg_selinux_apifs_context)
                 (void) asprintf(&options,
                                 "newinstance,ptmxmode=0666,mode=" STRINGIFY(TTY_MODE) ",gid=" GID_FMT ",context=\"%s\"",
-                                arg_uid_shift + TTY_GID,
+                                chown_uid + TTY_GID,
                                 arg_selinux_apifs_context);
         else
 #endif
                 (void) asprintf(&options,
                                 "newinstance,ptmxmode=0666,mode=" STRINGIFY(TTY_MODE) ",gid=" GID_FMT,
-                                arg_uid_shift + TTY_GID);
+                                chown_uid + TTY_GID);
 
         if (!options)
                 return log_oom();
@@ -2854,7 +2875,9 @@ static int reset_audit_loginuid(void) {
         if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
                 return 0;
 
-        if (!arg_privileged)
+        /* If we are in managed userns mode, then we are already in our userns and cannot reset the
+         * loginuid anyway, hence don't bother. */
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED)
                 return 0;
 
         r = read_virtual_file("/proc/self/loginuid", SIZE_MAX, &p, /* ret_size= */ NULL);
@@ -2886,8 +2909,8 @@ static int mount_tunnel_dig(const char *root) {
         const char *p, *q;
         int r;
 
-        if (!arg_privileged) {
-                log_debug("Not digging mount tunnel, because running unprivileged.");
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                log_debug("Not digging mount tunnel, because running in managed user namespace mode.");
                 return 0;
         }
 
@@ -2919,8 +2942,8 @@ static int mount_tunnel_dig(const char *root) {
 static int mount_tunnel_open(void) {
         int r;
 
-        if (!arg_privileged) {
-                log_debug("Not opening up mount tunnel, because running unprivileged.");
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                log_debug("Not opening up mount tunnel, because running in managed user namespace mode.");
                 return 0;
         }
 
@@ -3267,6 +3290,12 @@ static int chase_and_update(char **p, unsigned flags) {
 static int determine_uid_shift(const char *directory) {
         assert(directory);
 
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                /* In managed mode we should already know the UID shift */
+                assert(uid_is_valid(arg_uid_shift));
+                return 0;
+        }
+
         if (arg_userns_mode == USER_NAMESPACE_NO) {
                 arg_uid_shift = 0;
                 return 0;
@@ -3437,7 +3466,7 @@ static int inner_child(
                         return r;
         }
 
-        r = mount_all(NULL,
+        r = mount_all(/* dest= */ NULL,
                       arg_mount_settings | MOUNT_IN_USERNS,
                       arg_uid_shift,
                       arg_selinux_apifs_context);
@@ -3447,7 +3476,7 @@ static int inner_child(
         if (!arg_network_namespace_path && arg_private_network) {
                 _cleanup_close_ int netns_fd = -EBADF;
 
-                if (arg_privileged)
+                if (arg_userns_mode != USER_NAMESPACE_MANAGED)
                         if (unshare(CLONE_NEWNET) < 0)
                                 return log_error_errno(errno, "Failed to unshare network namespace: %m");
 
@@ -3463,8 +3492,8 @@ static int inner_child(
                 (void) barrier_place(barrier); /* #3 */
         }
 
-        if (arg_privileged) {
-                r = mount_sysfs(NULL, arg_mount_settings);
+        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
+                r = mount_sysfs(NULL, arg_mount_settings | MOUNT_IN_USERNS);
                 if (r < 0)
                         return r;
         }
@@ -3817,8 +3846,8 @@ static int setup_unix_export_dir_outside(char **ret) {
 
         assert(ret);
 
-        if (!arg_privileged) {
-                log_debug("Not digging socket tunnel, because running unprivileged.");
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                log_debug("Not digging socket tunnel, because running in managed user namespace mode.");
                 return 0;
         }
 
@@ -3874,7 +3903,7 @@ static int setup_unix_export_host_inside(const char *directory, const char *unix
 
         assert(directory);
 
-        if (!arg_privileged)
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED)
                 return 0;
 
         assert(unix_export_path);
@@ -3928,12 +3957,15 @@ static DissectImageFlags determine_dissect_image_flags(void) {
                 DISSECT_IMAGE_PIN_PARTITION_DEVICES |
                 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS) |
                 DISSECT_IMAGE_ALLOW_USERSPACE_VERITY |
-                (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0);
+                (arg_console_mode == CONSOLE_INTERACTIVE ? DISSECT_IMAGE_ALLOW_INTERACTIVE_AUTH : 0) |
+                ((arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_FOREIGN) ? DISSECT_IMAGE_FOREIGN_UID :
+                 (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO) ? DISSECT_IMAGE_IDENTITY_UID : 0);
 }
 
 static int outer_child(
                 Barrier *barrier,
                 const char *directory,
+                int mount_fd,
                 DissectedImage *dissected_image,
                 int fd_outer_socket,
                 int fd_inner_socket,
@@ -3952,9 +3984,9 @@ static int outer_child(
         /* This is the "outer" child process, i.e the one forked off by the container manager itself.  Its
          * namespace situation is:
          *
-         *  - CLONE_NEWNS   : already has its own (created by clone() if arg_privileged, or unshare() if !arg_unprivileged)
-         *  - CLONE_NEWUSER : if  arg_privileged: still in the host's
-         *                    if !arg_privileged: already has its own (created by nsresource_allocate_userns()->setns(userns_fd))
+         *  - CLONE_NEWUSER : if not in USER_NAMESPACE_MANAGED mode: still in the host's
+         *                    if USER_NAMESPACE_MANAGED mode: already has its own (created by nsresource_allocate_userns()->setns(userns_fd))
+         *  - CLONE_NEWNS   : already has its own (created by clone() if not USER_NAMESPACE_MANAGED, or unshare() otherwise)
          *  - CLONE_NEWPID  : still in the host's
          *  - CLONE_NEWUTS  : still in the host's
          *  - CLONE_NEWIPC  : still in the host's
@@ -3987,7 +4019,23 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        if (dissected_image) {
+        /* Put the root dir into the target directory now. One of three mechanisms is provided: either we
+         * have a single mount fd (typically unprivileged --directory= mode) or we have a fully dissected
+         * image (--image= mode), or we have a regular path. */
+        if (mount_fd >= 0) {
+                assert(arg_directory);
+                assert(!arg_image);
+
+                if (move_mount(mount_fd, "", AT_FDCWD, directory, MOVE_MOUNT_F_EMPTY_PATH) < 0)
+                        return log_error_errno(errno, "Failed to attach root directory: %m");
+
+                mount_fd = safe_close(mount_fd);
+                log_debug("Successfully attached root directory to '%s'.", directory);
+
+        } else if (dissected_image) {
+                assert(!arg_directory);
+                assert(arg_image);
+
                 /* If we are operating on a disk image, then mount its root directory now, but leave out the
                  * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
                  * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
@@ -4004,12 +4052,31 @@ static int outer_child(
                                 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
                 if (r < 0)
                         return r;
+        } else {
+                assert(arg_directory);
+                assert(!arg_image);
+
+                r = mount_nofollow_verbose(LOG_ERR, arg_directory, directory, /* fstype= */ NULL, MS_BIND|MS_REC, /* options= */ NULL);
+                if (r < 0)
+                        return r;
         }
 
         r = determine_uid_shift(directory);
         if (r < 0)
                 return r;
 
+        /* If we set up the userns on our own, we need to chown() all files ourselves beforehand. Otherwise
+         * (userns is off, or we are in managed mode and already have the userns applied) we don't need to
+         * chown anything. */
+        uid_t chown_uid, chown_range;
+        if (in_child_chown()) {
+                chown_uid = arg_uid_shift;
+                chown_range = arg_uid_range;
+        } else {
+                chown_uid = 0;
+                chown_range = UINT32_C(0x10000);
+        }
+
         if (arg_userns_mode != USER_NAMESPACE_NO) {
                 _cleanup_close_ int mntns_fd = -EBADF;
 
@@ -4041,33 +4108,15 @@ static int outer_child(
                         if (l != sizeof(arg_uid_shift))
                                 return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                                        "Short read while receiving UID shift.");
+
+                        if (in_child_chown())
+                                chown_uid = arg_uid_shift;
                 }
 
                 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
                          "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
         }
 
-        if (path_equal(directory, "/")) {
-                /* If the directory we shall boot is the host, let's operate on a bind mount at a different
-                 * place, so that we can make changes to its mount structure (for example, to implement
-                 * --volatile=) without this interfering with our ability to access files such as
-                 * /etc/localtime to copy into the container. Note that we use a fixed place for this
-                 * (instead of a temporary directory, since we are living in our own mount namespace here
-                 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
-                (void) mkdir_p("/run/systemd/nspawn-root", 0755);
-
-                r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
-                if (r < 0)
-                        return r;
-
-                directory = "/run/systemd/nspawn-root";
-        }
-
-        /* Make sure we always have a mount that we can move to root later on. */
-        r = make_mount_point(directory);
-        if (r < 0)
-                return r;
-
         /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
          * mount namespace. For the directory we are going to run our container let's turn this off, so that
          * we'll live in our own little world from now on, and propagation from the host may only happen via
@@ -4086,7 +4135,7 @@ static int outer_child(
         r = setup_volatile_mode(
                         directory,
                         arg_volatile_mode,
-                        arg_uid_shift,
+                        chown_uid,
                         arg_selinux_apifs_context);
         if (r < 0)
                 return r;
@@ -4094,8 +4143,8 @@ static int outer_child(
         r = bind_user_prepare(
                         directory,
                         arg_bind_user,
-                        arg_uid_shift,
-                        arg_uid_range,
+                        chown_uid,
+                        chown_range,
                         &arg_custom_mounts, &arg_n_custom_mounts,
                         &bind_user_context);
         if (r < 0)
@@ -4126,17 +4175,47 @@ static int outer_child(
                         directory,
                         arg_custom_mounts,
                         arg_n_custom_mounts,
-                        arg_uid_shift,
-                        arg_uid_range,
+                        chown_uid,
+                        chown_range,
                         arg_selinux_apifs_context,
                         MOUNT_ROOT_ONLY);
         if (r < 0)
                 return r;
 
-        if (arg_userns_mode != USER_NAMESPACE_NO &&
-            IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
-            arg_uid_shift != 0) {
+        if (!IN_SET(arg_userns_mode, USER_NAMESPACE_NO, USER_NAMESPACE_MANAGED) &&
+            IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_FOREIGN, USER_NAMESPACE_OWNERSHIP_AUTO) &&
+            chown_uid != 0) {
                 _cleanup_strv_free_ char **dirs = NULL;
+                RemountIdmapping mapping;
+
+                switch (arg_userns_ownership) {
+                case USER_NAMESPACE_OWNERSHIP_MAP:
+                        mapping = REMOUNT_IDMAPPING_HOST_ROOT;
+                        break;
+
+                case USER_NAMESPACE_OWNERSHIP_FOREIGN:
+                        mapping = REMOUNT_IDMAPPING_FOREIGN_WITH_HOST_ROOT;
+                        break;
+
+                case USER_NAMESPACE_OWNERSHIP_AUTO: {
+                        struct stat st;
+
+                        if (lstat(directory, &st) < 0)
+                                return log_error_errno(errno, "Failed to stat() container root directory '%s': %m", directory);
+
+                        r = stat_verify_directory(&st);
+                        if (r < 0)
+                                return log_error_errno(r, "Container root directory '%s' is not a directory: %m", directory);
+
+                        mapping = uid_is_foreign(st.st_uid) ?
+                                REMOUNT_IDMAPPING_FOREIGN_WITH_HOST_ROOT :
+                                REMOUNT_IDMAPPING_HOST_ROOT;
+                        break;
+                }
+
+                default:
+                        assert_not_reached();
+                }
 
                 if (arg_volatile_mode != VOLATILE_YES) {
                         r = strv_extend(&dirs, directory);
@@ -4155,7 +4234,13 @@ static int outer_child(
                                 return log_oom();
                 }
 
-                r = remount_idmap(dirs, arg_uid_shift, arg_uid_range, UID_INVALID, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
+                r = remount_idmap(
+                                dirs,
+                                chown_uid,
+                                chown_range,
+                                /* host_owner= */ UID_INVALID,
+                                /* dest_owner= */ UID_INVALID,
+                                mapping);
                 if (r == -EINVAL || ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
                         /* This might fail because the kernel or file system doesn't support idmapping. We
                          * can't really distinguish this nicely, nor do we have any guarantees about the
@@ -4177,7 +4262,7 @@ static int outer_child(
         r = setup_volatile_mode_after_remount_idmap(
                         directory,
                         arg_volatile_mode,
-                        arg_uid_shift,
+                        chown_uid,
                         arg_selinux_apifs_context);
         if (r < 0)
                 return r;
@@ -4187,8 +4272,8 @@ static int outer_child(
                 r = dissected_image_mount_and_warn(
                                 dissected_image,
                                 directory,
-                                arg_uid_shift,
-                                arg_uid_range,
+                                chown_uid,
+                                chown_range,
                                 /* userns_fd= */ -EBADF,
                                 determine_dissect_image_flags()|
                                 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
@@ -4212,11 +4297,11 @@ static int outer_child(
                                                "Short write while sending cgroup mode.");
         }
 
-        r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
+        r = recursive_chown(directory, chown_uid, chown_range);
         if (r < 0)
                 return r;
 
-        r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
+        r = base_filesystem_create(directory, chown_uid, (gid_t) chown_uid);
         if (r < 0)
                 return r;
 
@@ -4229,7 +4314,7 @@ static int outer_child(
 
         r = mount_all(directory,
                       arg_mount_settings,
-                      arg_uid_shift,
+                      chown_uid,
                       arg_selinux_apifs_context);
         if (r < 0)
                 return r;
@@ -4247,16 +4332,16 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
+        (void) dev_setup(directory, chown_uid, chown_uid);
 
         p = prefix_roota(directory, "/run/host");
-        (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
+        (void) make_inaccessible_nodes(p, chown_uid, chown_uid);
 
         r = setup_unix_export_host_inside(directory, unix_export_path);
         if (r < 0)
                 return r;
 
-        r = setup_pts(directory);
+        r = setup_pts(directory, chown_uid);
         if (r < 0)
                 return r;
 
@@ -4280,8 +4365,8 @@ static int outer_child(
                         directory,
                         arg_custom_mounts,
                         arg_n_custom_mounts,
-                        arg_uid_shift,
-                        arg_uid_range,
+                        chown_uid,
+                        chown_range,
                         arg_selinux_apifs_context,
                         MOUNT_NON_ROOT_ONLY);
         if (r < 0)
@@ -4316,8 +4401,8 @@ static int outer_child(
                                 directory,
                                 arg_unified_cgroup_hierarchy,
                                 arg_userns_mode != USER_NAMESPACE_NO,
-                                arg_uid_shift,
-                                arg_uid_range,
+                                chown_uid,
+                                chown_range,
                                 arg_selinux_apifs_context,
                                 false);
                 if (r < 0)
@@ -4333,7 +4418,7 @@ static int outer_child(
          * (and fork for it) for which we then mount sysfs/procfs, and only then switch root. */
 
         _cleanup_close_ int notify_fd = -EBADF;
-        if (arg_privileged) {
+        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 /* Mark everything as shared so our mounts get propagated down. This is required to make new
                  * bind mounts available in systemd services inside the container that create a new mount
                  * namespace.  See https://github.com/systemd/systemd/issues/3860 Further submounts (such as
@@ -4376,8 +4461,8 @@ static int outer_child(
 
         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
                         arg_clone_ns_flags |
-                        (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0) |
-                        ((arg_private_network && !arg_privileged) ? CLONE_NEWNET : 0));
+                        (IN_SET(arg_userns_mode, USER_NAMESPACE_FIXED, USER_NAMESPACE_PICK) ? CLONE_NEWUSER : 0) |
+                        ((arg_private_network && arg_userns_mode == USER_NAMESPACE_MANAGED) ? CLONE_NEWNET : 0));
         if (pid < 0)
                 return log_error_errno(errno, "Failed to fork inner child: %m");
         if (pid == 0) {
@@ -4396,9 +4481,10 @@ static int outer_child(
                                 return log_error_errno(r, "Failed to join network namespace: %m");
                 }
 
-                if (!arg_privileged) {
-                        /* In unprivileged operation, sysfs + procfs are special, we'll have to mount them
-                         * inside the inner namespaces, but before we switch root. Hence do so here. */
+                if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                        /* In managed user namespace operation, sysfs + procfs are special, we'll have to
+                         * mount them inside the inner namespaces, but before we switch root. Hence do so
+                         * here. */
                         _cleanup_free_ char *j = path_join(directory, "/proc");
                         if (!j)
                                 return log_oom();
@@ -5161,6 +5247,8 @@ static int load_oci_bundle(void) {
 }
 
 static int run_container(
+                const char *directory,
+                int mount_fd,
                 DissectedImage *dissected_image,
                 int userns_fd,
                 FDSet *fds,
@@ -5248,9 +5336,8 @@ static int run_container(
                                                "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
         }
 
-        if (arg_privileged) {
+        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 assert(userns_fd < 0);
-
                 /* If we have no user namespace then we'll clone and create a new mount namespace right-away. */
 
                 *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
@@ -5260,7 +5347,6 @@ static int run_container(
                                                ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
         } else {
                 assert(userns_fd >= 0);
-
                 /* If we have a user namespace then we'll clone() first, and then join the user namespace,
                  * and then open the mount namespace, so that it is owned by the user namespace */
 
@@ -5298,7 +5384,8 @@ static int run_container(
                 (void) reset_signal_mask();
 
                 r = outer_child(&barrier,
-                                arg_directory,
+                                directory,
+                                mount_fd,
                                 dissected_image,
                                 fd_outer_socket_pair[1],
                                 fd_inner_socket_pair[1],
@@ -5416,9 +5503,11 @@ static int run_container(
                 if (!barrier_place_and_sync(&barrier)) /* #1 */
                         return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
-                r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
-                if (r < 0)
-                        return r;
+                if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
+                        r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
+                        if (r < 0)
+                                return r;
+                }
 
                 (void) barrier_place(&barrier); /* #2 */
         }
@@ -5442,7 +5531,7 @@ static int run_container(
                         return r;
 
                 if (arg_network_veth) {
-                        if (arg_privileged) {
+                        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                                 r = setup_veth(arg_machine, *pid, veth_name,
                                                arg_network_bridge || arg_network_zone, &arg_network_provided_mac);
                                 if (r < 0)
@@ -5580,7 +5669,7 @@ static int run_container(
                         arg_unified_cgroup_hierarchy,
                         arg_uid_shift,
                         userns_fd,
-                        arg_privileged);
+                        arg_userns_mode);
         if (r < 0)
                 return r;
 
@@ -5622,7 +5711,7 @@ static int run_container(
         if (!barrier_sync(&barrier)) /* #5.1 */
                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
 
-        if (arg_userns_mode != USER_NAMESPACE_NO) {
+        if (!IN_SET(arg_userns_mode, USER_NAMESPACE_NO, USER_NAMESPACE_MANAGED)) {
                 r = wipe_fully_visible_api_fs(mntns_fd);
                 if (r < 0)
                         return r;
@@ -5749,7 +5838,7 @@ static int run_container(
 
         fd_kmsg_fifo = safe_close(fd_kmsg_fifo);
 
-        if (arg_private_network && arg_privileged) {
+        if (arg_private_network && arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 r = move_back_network_interfaces(child_netns_fd, arg_network_interfaces);
                 if (r < 0)
                         return r;
@@ -5914,15 +6003,25 @@ static int cant_be_in_netns(void) {
         return 0;
 }
 
+static void initialize_defaults(void) {
+        arg_privileged = getuid() == 0;
+
+        /* If running unprivileged, default to managed user namespace mode (systemd-nsresourced operation) */
+        arg_userns_mode = arg_privileged ? USER_NAMESPACE_NO : USER_NAMESPACE_MANAGED;
+
+        /* Imply private networking for unprivileged operation, since the kernel otherwise refuses mounting sysfs */
+        arg_private_network = !arg_privileged;
+}
+
 static int run(int argc, char *argv[]) {
-        bool remove_directory = false, remove_image = false, veth_created = false, remove_tmprootdir = false;
-        _cleanup_close_ int master = -EBADF, userns_fd = -EBADF;
+        bool remove_directory = false, remove_image = false, veth_created = false;
+        _cleanup_close_ int master = -EBADF, userns_fd = -EBADF, mount_fd = -EBADF;
         _cleanup_fdset_free_ FDSet *fds = NULL;
         int r, n_fd_passed, ret = EXIT_SUCCESS;
         char veth_name[IFNAMSIZ] = "";
         struct ExposeArgs expose_args = {};
         _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
-        char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
+        _cleanup_(rmdir_and_freep) char *rootdir = NULL;
         _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
         _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
@@ -5930,7 +6029,7 @@ static int run(int argc, char *argv[]) {
 
         log_setup();
 
-        arg_privileged = getuid() == 0;
+        initialize_defaults();
 
         r = parse_argv(argc, argv);
         if (r <= 0)
@@ -5987,14 +6086,14 @@ static int run(int argc, char *argv[]) {
         /* Reapply environment settings. */
         (void) detect_unified_cgroup_hierarchy_from_environment();
 
-        if (!arg_privileged) {
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
                 r = cg_all_unified();
                 if (r < 0) {
                         log_error_errno(r, "Failed to determine if we are in unified cgroupv2 mode: %m");
                         goto finish;
                 }
                 if (r == 0)
-                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unprivileged operation only supported in unified cgroupv2 mode.");
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Managed user namespace operation only supported in unified cgroupv2 mode.");
         }
 
         /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
@@ -6023,14 +6122,33 @@ static int run(int argc, char *argv[]) {
         if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
                 arg_quiet = true;
 
-        if (arg_directory) {
-                assert(!arg_image);
+        if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                /* Let's allocate a 64K userns first, if managed mode is chosen */
+
+                _cleanup_free_ char *userns_name = NULL;
+                if (asprintf(&userns_name, "nspawn-" PID_FMT "-%s", getpid_cached(), arg_machine) < 0) {
+                        r = log_oom();
+                        goto finish;
+                }
+
+                userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
+                if (userns_fd < 0) {
+                        r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
+                        goto finish;
+                }
 
-                if (!arg_privileged) {
-                        r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invoking container from plain directory tree is currently not supported if called without privileges.");
+                r = userns_get_base_uid(userns_fd, &arg_uid_shift, /* ret_gid= */ NULL);
+                if (r < 0) {
+                        log_error_errno(r, "Failed to determine UID shift from userns: %m");
                         goto finish;
                 }
 
+                arg_uid_range = UINT32_C(0x10000);
+        }
+
+        if (arg_directory) {
+                assert(!arg_image);
+
                 /* Safety precaution: let's not allow running images from the live host OS image, as long as
                  * /var from the host will propagate into container dynamically (because bad things happen if
                  * two systems write to the same /var). Let's allow it for the special cases where /var is
@@ -6200,6 +6318,15 @@ static int run(int argc, char *argv[]) {
                         }
                 }
 
+                if (arg_userns_mode == USER_NAMESPACE_MANAGED) {
+                        r = mountfsd_mount_directory(
+                                        arg_directory,
+                                        userns_fd,
+                                        determine_dissect_image_flags(),
+                                        &mount_fd);
+                        if (r < 0)
+                                goto finish;
+                }
         } else {
                 DissectImageFlags dissect_image_flags =
                         determine_dissect_image_flags();
@@ -6274,20 +6401,7 @@ static int run(int argc, char *argv[]) {
                                 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
                 }
 
-                if (!mkdtemp(tmprootdir)) {
-                        r = log_error_errno(errno, "Failed to create temporary directory: %m");
-                        goto finish;
-                }
-
-                remove_tmprootdir = true;
-
-                arg_directory = strdup(tmprootdir);
-                if (!arg_directory) {
-                        r = log_oom();
-                        goto finish;
-                }
-
-                if (arg_privileged) {
+                if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                         r = loop_device_make_by_path(
                                         arg_image,
                                         arg_read_only ? O_RDONLY : O_RDWR,
@@ -6339,19 +6453,6 @@ static int run(int argc, char *argv[]) {
                         if (r < 0)
                                 goto finish;
                 } else {
-                        _cleanup_free_ char *userns_name = strjoin("nspawn-", arg_machine);
-                        if (!userns_name) {
-                                r = log_oom();
-                                goto finish;
-                        }
-
-                        /* if we are unprivileged, let's allocate a 64K userns first */
-                        userns_fd = nsresource_allocate_userns(userns_name, UINT64_C(0x10000));
-                        if (userns_fd < 0) {
-                                r = log_error_errno(userns_fd, "Failed to allocate user namespace with 64K users: %m");
-                                goto finish;
-                        }
-
                         r = mountfsd_mount_image(
                                         arg_image,
                                         userns_fd,
@@ -6370,7 +6471,14 @@ static int run(int argc, char *argv[]) {
                         arg_architecture = dissected_image_architecture(dissected_image);
         }
 
-        r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
+        /* Create a temporary place to mount stuff. */
+        r = mkdtemp_malloc("/tmp/nspawn-root-XXXXXX", &rootdir);
+        if (r < 0) {
+                log_error_errno(r, "Failed to create temporary directory: %m");
+                goto finish;
+        }
+
+        r = custom_mount_prepare_all(rootdir, arg_custom_mounts, arg_n_custom_mounts);
         if (r < 0)
                 goto finish;
 
@@ -6405,6 +6513,8 @@ static int run(int argc, char *argv[]) {
         }
         for (;;) {
                 r = run_container(
+                                rootdir,
+                                mount_fd,
                                 dissected_image,
                                 userns_fd,
                                 fds,
@@ -6447,12 +6557,7 @@ finish:
                         log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
         }
 
-        if (remove_tmprootdir) {
-                if (rmdir(tmprootdir) < 0)
-                        log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
-        }
-
-        if (arg_machine && arg_privileged) {
+        if (arg_machine && arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 const char *p;
 
                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
@@ -6466,7 +6571,7 @@ finish:
         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET,  &expose_args.address4);
         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
 
-        if (arg_privileged) {
+        if (arg_userns_mode != USER_NAMESPACE_MANAGED) {
                 if (veth_created)
                         (void) remove_veth_links(veth_name, arg_network_veth_extra);
                 (void) remove_bridge(arg_network_zone);