summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaan De Meyer <daan.j.demeyer@gmail.com>2023-06-06 17:44:09 +0200
committerDaan De Meyer <daan.j.demeyer@gmail.com>2023-06-21 12:48:46 +0200
commit9c0c670125ff065c22c12bb82f1f20c7b2c8c46d (patch)
treece51381fbcc6844f0f480db1aeb55ec7dfd97281
parenttmpfiles: Add note to man page about guaranteed cleanup for files/directories (diff)
downloadsystemd-9c0c670125ff065c22c12bb82f1f20c7b2c8c46d.tar.xz
systemd-9c0c670125ff065c22c12bb82f1f20c7b2c8c46d.zip
core: Add RootEphemeral= setting
This setting allows services to run in an ephemeral copy of the root directory or root image. To make sure the ephemeral copies are always cleaned up, we add a tmpfiles snippet to unconditionally clean up /var/lib/systemd/ephemeral-trees. To prevent in-use ephemeral copies from being cleaned up by tmpfiles, we use the newly added COPY_LOCK_BSD and BTRFS_SNAPSHOT_LOCK_BSD flags to take a BSD lock on the ephemeral copies, which instructs tmpfiles not to touch those ephemeral copies as long as the BSD lock is held.
-rw-r--r--man/org.freedesktop.systemd1.xml24
-rw-r--r--man/systemd.exec.xml20
-rw-r--r--src/core/dbus-execute.c4
-rw-r--r--src/core/execute.c174
-rw-r--r--src/core/execute.h11
-rw-r--r--src/core/load-fragment-gperf.gperf.in1
-rw-r--r--src/core/service.c2
-rw-r--r--src/core/unit.c2
-rw-r--r--src/shared/bus-unit-util.c3
-rw-r--r--test/fuzz/fuzz-unit-file/directives-all.service1
-rw-r--r--tmpfiles.d/systemd.conf.in4
11 files changed, 208 insertions, 38 deletions
diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml
index b50ddb95e9..560ae252e3 100644
--- a/man/org.freedesktop.systemd1.xml
+++ b/man/org.freedesktop.systemd1.xml
@@ -2950,6 +2950,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b RootEphemeral = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -3547,6 +3549,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property RootHashSignaturePath is not documented!-->
+ <!--property RootEphemeral is not documented!-->
+
<!--property OOMScoreAdjust is not documented!-->
<!--property CoredumpFilter is not documented!-->
@@ -4189,6 +4193,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
@@ -4972,6 +4978,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b RootEphemeral = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -5581,6 +5589,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property RootHashSignaturePath is not documented!-->
+ <!--property RootEphemeral is not documented!-->
+
<!--property OOMScoreAdjust is not documented!-->
<!--property CoredumpFilter is not documented!-->
@@ -6203,6 +6213,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
@@ -6861,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b RootEphemeral = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -7398,6 +7412,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property RootHashSignaturePath is not documented!-->
+ <!--property RootEphemeral is not documented!-->
+
<!--property OOMScoreAdjust is not documented!-->
<!--property CoredumpFilter is not documented!-->
@@ -7938,6 +7954,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
@@ -8723,6 +8741,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s RootVerity = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly b RootEphemeral = ...;
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as ExtensionDirectories = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(sba(ss)) ExtensionImages = [...];
@@ -9246,6 +9266,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property RootHashSignaturePath is not documented!-->
+ <!--property RootEphemeral is not documented!-->
+
<!--property OOMScoreAdjust is not documented!-->
<!--property CoredumpFilter is not documented!-->
@@ -9772,6 +9794,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/>
diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 938a3c87a9..84eda5c584 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -201,6 +201,26 @@
</varlistentry>
<varlistentry>
+ <term><varname>RootEphemeral=</varname></term>
+
+ <listitem><para>Takes a boolean argument. If enabled, executed processes will run in an ephemeral
+ copy of the root directory or root image. The ephemeral copy is placed in
+ <filename>/var/lib/systemd/ephemeral-trees/</filename> while the service is active and is cleaned up
+ when the service is stopped or restarted. If <varname>RootDirectory=</varname> is used and the root
+ directory is a subvolume, the ephemeral copy will be created by making a snapshot of the subvolume.
+ </para>
+
+ <para>To make sure ephemeral copies can be made efficiently, the root directory or root image
+ should be located on the same filesystem as <filename>/var/lib/systemd/ephemeral-trees/</filename>.
+ When using <varname>RootEphemeral=</varname> with root directories, btrfs should be used as the
+ filesystem and the root directory should ideally be a subvolume which <command>systemd</command> can
+ snapshot to make the ephemeral copy. For root images, a filesystem with support for reflinks should
+ be used to ensure an efficient ephemeral copy.</para>
+
+ <xi:include href="system-only.xml" xpointer="singular"/></listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><varname>RootHash=</varname></term>
<listitem><para>Takes a data integrity (dm-verity) root hash specified in hexadecimal, or the path to a file
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c
index 04070a7f1e..80a035ab90 100644
--- a/src/core/dbus-execute.c
+++ b/src/core/dbus-execute.c
@@ -1231,6 +1231,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1865,6 +1866,9 @@ int bus_exec_context_set_transient_property(
if (streq(name, "RootDirectory"))
return bus_set_transient_path(u, name, &c->root_directory, message, flags, error);
+ if (streq(name, "RootEphemeral"))
+ return bus_set_transient_bool(u, name, &c->root_ephemeral, message, flags, error);
+
if (streq(name, "SyslogIdentifier"))
return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error);
diff --git a/src/core/execute.c b/src/core/execute.c
index 204c5a1f8c..b7fe922c7a 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -15,6 +15,8 @@
#include <unistd.h>
#include <utmpx.h>
+#include <linux/fs.h> /* Must be included after <sys/mount.h> */
+
#if HAVE_PAM
#include <security/pam_appl.h>
#endif
@@ -43,8 +45,10 @@
#include "async.h"
#include "barrier.h"
#include "bpf-lsm.h"
+#include "btrfs-util.h"
#include "cap-list.h"
#include "capability-util.h"
+#include "chattr-util.h"
#include "cgroup-setup.h"
#include "chase.h"
#include "chown-recursive.h"
@@ -66,6 +70,7 @@
#include "io-util.h"
#include "ioprio-util.h"
#include "label-util.h"
+#include "lock-util.h"
#include "log.h"
#include "macro.h"
#include "manager.h"
@@ -2170,6 +2175,10 @@ bool exec_needs_network_namespace(const ExecContext *context) {
return context->private_network || context->network_namespace_path;
}
+static bool exec_needs_ephemeral(const ExecContext *context) {
+ return (context->root_image || context->root_directory) && context->root_ephemeral;
+}
+
static bool exec_needs_ipc_namespace(const ExecContext *context) {
assert(context);
@@ -3823,6 +3832,63 @@ static bool insist_on_sandboxing(
return false;
}
+/* Creates the ephemeral copy of the root image or root directory for this runtime, unless that was already
+ * done by an earlier invocation.
+ *
+ * The copy is placed at runtime->ephemeral_copy: copy_file() is used for RootImage= and
+ * btrfs_subvol_snapshot_at() for RootDirectory=. Both are passed a *_LOCK_BSD flag, and the fd they return
+ * is queued on runtime->ephemeral_storage_socket. A queued fd doubles as the "already set up" marker that
+ * is peeked at on entry.
+ *
+ * Returns 0 if there was nothing to do (no runtime, no ephemeral copy requested, or already set up), 1 if
+ * the copy was created, and a negative errno-style value on failure. */
+static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        if (!runtime || !runtime->ephemeral_copy)
+                return 0;
+
+        /* Serialize the check-and-create sequence below against concurrent invocations. */
+        r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
+
+        CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
+
+        /* MSG_PEEK: only look at the queued fd, leave it on the socket for whoever checks next. */
+        fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
+        if (fd >= 0)
+                /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
+                return 0;
+
+        if (fd != -EAGAIN)
+                return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
+
+        log_debug("Making ephemeral snapshot of %s to %s",
+                  context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+        if (context->root_image)
+                fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
+                               COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
+        else
+                fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
+                                              AT_FDCWD, runtime->ephemeral_copy,
+                                              BTRFS_SNAPSHOT_FALLBACK_COPY |
+                                              BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+                                              BTRFS_SNAPSHOT_RECURSIVE |
+                                              BTRFS_SNAPSHOT_LOCK_BSD);
+        if (fd < 0)
+                return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
+                                       context->root_image ?: context->root_directory, runtime->ephemeral_copy);
+
+        if (context->root_image) {
+                /* A root image might be subject to lots of random writes so let's try to disable COW on it
+                 * which tends to not perform well in combination with lots of random writes.
+                 *
+                 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
+                 * copy, but we at least want to make the intention clear.
+                 */
+                r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
+                if (r < 0)
+                        /* Log the chattr_fd() error held in r; fd is a valid descriptor here, not an errno. */
+                        log_debug_errno(r, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
+        }
+
+        /* Park the fd on the socket; our local copy is closed on return via _cleanup_close_. */
+        r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
+
+        return 1;
+}
+
+
static int verity_settings_prepare(
VeritySettings *verity,
const char *root_image,
@@ -3884,7 +3950,7 @@ static int apply_mount_namespace(
ExecCommandFlags command_flags,
const ExecContext *context,
const ExecParameters *params,
- const ExecRuntime *runtime,
+ ExecRuntime *runtime,
const char *memory_pressure_path,
char **error_path) {
@@ -3906,10 +3972,14 @@ static int apply_mount_namespace(
CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
if (params->flags & EXEC_APPLY_CHROOT) {
- root_image = context->root_image;
+ r = setup_ephemeral(context, runtime);
+ if (r < 0)
+ return r;
- if (!root_image)
- root_dir = context->root_directory;
+ if (context->root_image)
+ root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
+ else
+ root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
}
r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
@@ -4090,6 +4160,7 @@ static int apply_mount_namespace(
static int apply_working_directory(
const ExecContext *context,
const ExecParameters *params,
+ ExecRuntime *runtime,
const char *home,
int *exit_status) {
@@ -4113,7 +4184,7 @@ static int apply_working_directory(
if (params->flags & EXEC_APPLY_CHROOT)
d = wd;
else
- d = prefix_roota(context->root_directory, wd);
+ d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
if (chdir(d) < 0 && !context->working_directory_missing_ok) {
*exit_status = EXIT_CHDIR;
@@ -4126,6 +4197,7 @@ static int apply_working_directory(
static int apply_root_directory(
const ExecContext *context,
const ExecParameters *params,
+ ExecRuntime *runtime,
const bool needs_mount_ns,
int *exit_status) {
@@ -4134,7 +4206,7 @@ static int apply_root_directory(
if (params->flags & EXEC_APPLY_CHROOT)
if (!needs_mount_ns && context->root_directory)
- if (chroot(context->root_directory) < 0) {
+ if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
*exit_status = EXIT_CHROOT;
return -errno;
}
@@ -4271,7 +4343,7 @@ static int close_remaining_fds(
const int *fds, size_t n_fds) {
size_t n_dont_close = 0;
- int dont_close[n_fds + 12];
+ int dont_close[n_fds + 14];
assert(params);
@@ -4289,6 +4361,9 @@ static int close_remaining_fds(
n_dont_close += n_fds;
}
+ if (runtime)
+ append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
+
if (runtime && runtime->shared) {
append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
@@ -5584,7 +5659,7 @@ static int exec_child(
}
/* chroot to root directory first, before we lose the ability to chroot */
- r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
+ r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
if (r < 0)
return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
@@ -5610,7 +5685,7 @@ static int exec_child(
/* Apply working directory here, because the working directory might be on NFS and only the user running
* this service might have the correct privilege to change to the working directory */
- r = apply_working_directory(context, params, home, exit_status);
+ r = apply_working_directory(context, params, runtime, home, exit_status);
if (r < 0)
return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
@@ -6422,6 +6497,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
"%sUMask: %04o\n"
"%sWorkingDirectory: %s\n"
"%sRootDirectory: %s\n"
+ "%sRootEphemeral: %s\n"
"%sNonBlocking: %s\n"
"%sPrivateTmp: %s\n"
"%sPrivateDevices: %s\n"
@@ -6446,6 +6522,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, c->umask,
prefix, empty_to_root(c->working_directory),
prefix, empty_to_root(c->root_directory),
+ prefix, yes_no(c->root_ephemeral),
prefix, yes_no(c->non_blocking),
prefix, yes_no(c->private_tmp),
prefix, yes_no(c->private_devices),
@@ -7241,13 +7318,30 @@ int exec_command_append(ExecCommand *c, const char *path, ...) {
return 0;
}
-static void *remove_tmpdir_thread(void *p) {
+static void *rm_rf_thread(void *p) {
_cleanup_free_ char *path = p;
- (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
+ (void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
return NULL;
}
+static void asynchronous_rm_rf(char **path) {
+ int r;
+
+ assert(path);
+
+ if (!*path || streq(*path, RUN_SYSTEMD_EMPTY))
+ return;
+
+ log_debug("Spawning thread to nuke %s", *path);
+
+ r = asynchronous_job(rm_rf_thread, *path);
+ if (r < 0)
+ log_warning_errno(r, "Failed to nuke %s: %m", *path);
+ else
+ *path = NULL;
+}
+
static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
if (!rt)
return NULL;
@@ -7267,8 +7361,6 @@ DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_ru
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
- int r;
-
if (!rt)
return NULL;
@@ -7278,25 +7370,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
if (rt->n_ref > 0)
return NULL;
- if (rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
- log_debug("Spawning thread to nuke %s", rt->tmp_dir);
-
- r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
- if (r < 0)
- log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
- else
- rt->tmp_dir = NULL;
- }
-
- if (rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
- log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
-
- r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
- if (r < 0)
- log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
- else
- rt->var_tmp_dir = NULL;
- }
+ asynchronous_rm_rf(&rt->tmp_dir);
+ asynchronous_rm_rf(&rt->var_tmp_dir);
return exec_shared_runtime_free(rt);
}
@@ -7731,16 +7806,39 @@ void exec_shared_runtime_vacuum(Manager *m) {
}
}
-int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret) {
+int exec_runtime_make(
+ const Unit *unit,
+ const ExecContext *context,
+ ExecSharedRuntime *shared,
+ DynamicCreds *creds,
+ ExecRuntime **ret) {
+ _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
+ _cleanup_free_ char *ephemeral = NULL;
_cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
+ int r;
+ assert(unit);
+ assert(context);
assert(ret);
- if (!shared && !creds) {
+ if (!shared && !creds && !exec_needs_ephemeral(context)) {
*ret = NULL;
return 0;
}
+ if (exec_needs_ephemeral(context)) {
+ r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
+ if (r < 0)
+ return r;
+
+ r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
+ if (r < 0)
+ return r;
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
+ return -errno;
+ }
+
rt = new(ExecRuntime, 1);
if (!rt)
return -ENOMEM;
@@ -7748,6 +7846,9 @@ int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntim
*rt = (ExecRuntime) {
.shared = shared,
.dynamic_creds = creds,
+ .ephemeral_copy = TAKE_PTR(ephemeral),
+ .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
+ .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
};
*ret = TAKE_PTR(rt);
@@ -7760,6 +7861,11 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
exec_shared_runtime_unref(rt->shared);
dynamic_creds_unref(rt->dynamic_creds);
+
+ asynchronous_rm_rf(&rt->ephemeral_copy);
+
+ free(rt->ephemeral_copy);
+ safe_close_pair(rt->ephemeral_storage_socket);
return mfree(rt);
}
diff --git a/src/core/execute.h b/src/core/execute.h
index 953dc9e7f7..ee73fb6367 100644
--- a/src/core/execute.h
+++ b/src/core/execute.h
@@ -129,6 +129,14 @@ struct ExecSharedRuntime {
struct ExecRuntime {
ExecSharedRuntime *shared;
DynamicCreds *dynamic_creds;
+
+ /* The path to the ephemeral snapshot of the root directory or root image if one was requested. */
+ char *ephemeral_copy;
+
+ /* An AF_UNIX socket pair that receives the locked file descriptor referring to the ephemeral copy of
+ * the root directory or root image. The lock prevents tmpfiles from removing the ephemeral snapshot
+ * until we're done using it. */
+ int ephemeral_storage_socket[2];
};
typedef enum ExecDirectoryType {
@@ -195,6 +203,7 @@ struct ExecContext {
void *root_hash, *root_hash_sig;
size_t root_hash_size, root_hash_sig_size;
LIST_HEAD(MountOptions, root_image_options);
+ bool root_ephemeral;
bool working_directory_missing_ok:1;
bool working_directory_home:1;
@@ -506,7 +515,7 @@ int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char
int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds);
void exec_shared_runtime_vacuum(Manager *m);
-int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
+int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret);
ExecRuntime* exec_runtime_free(ExecRuntime *rt);
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free);
ExecRuntime* exec_runtime_destroy(ExecRuntime *rt);
diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in
index ae318dae89..382b60ea90 100644
--- a/src/core/load-fragment-gperf.gperf.in
+++ b/src/core/load-fragment-gperf.gperf.in
@@ -10,6 +10,7 @@
{{type}}.RootHash, config_parse_exec_root_hash, 0, offsetof({{type}}, exec_context)
{{type}}.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof({{type}}, exec_context)
{{type}}.RootVerity, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_verity)
+{{type}}.RootEphemeral, config_parse_bool, 0, offsetof({{type}}, exec_context.root_ephemeral)
{{type}}.ExtensionDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.extension_directories)
{{type}}.ExtensionImages, config_parse_extension_images, 0, offsetof({{type}}, exec_context)
{{type}}.ExtensionImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.extension_image_policy)
diff --git a/src/core/service.c b/src/core/service.c
index cecdd3bf50..146b892e46 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -2029,7 +2029,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart)
/* Reset NotifyAccess override */
s->notify_access_override = _NOTIFY_ACCESS_INVALID;
- /* We want fresh tmpdirs in case service is started again immediately */
+ /* We want fresh tmpdirs and ephemeral snapshots in case the service is started again immediately. */
s->exec_runtime = exec_runtime_destroy(s->exec_runtime);
/* Also, remove the runtime directory */
diff --git a/src/core/unit.c b/src/core/unit.c
index 570234c8f4..f51b5687f8 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -5012,7 +5012,7 @@ int unit_setup_exec_runtime(Unit *u) {
return r;
}
- r = exec_runtime_make(esr, dcreds, rt);
+ r = exec_runtime_make(u, ec, esr, dcreds, rt);
if (r < 0)
return r;
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
index 6e93d0ca43..cc287feb8e 100644
--- a/src/shared/bus-unit-util.c
+++ b/src/shared/bus-unit-util.c
@@ -992,7 +992,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
"LockPersonality",
"ProtectHostname",
"MemoryKSM",
- "RestrictSUIDSGID"))
+ "RestrictSUIDSGID",
+ "RootEphemeral"))
return bus_append_parse_boolean(m, field, eq);
if (STR_IN_SET(field, "ReadWriteDirectories",
diff --git a/test/fuzz/fuzz-unit-file/directives-all.service b/test/fuzz/fuzz-unit-file/directives-all.service
index 818fb28dbf..4bdc48a59b 100644
--- a/test/fuzz/fuzz-unit-file/directives-all.service
+++ b/test/fuzz/fuzz-unit-file/directives-all.service
@@ -222,6 +222,7 @@ RootImage=
RootHash=
RootHashSignature=
RootVerity=
+RootEphemeral=
ExtensionDirectories=
ExtensionImages=
RuntimeMaxSec=
diff --git a/tmpfiles.d/systemd.conf.in b/tmpfiles.d/systemd.conf.in
index fa838d8d06..3781c579e0 100644
--- a/tmpfiles.d/systemd.conf.in
+++ b/tmpfiles.d/systemd.conf.in
@@ -60,6 +60,10 @@ a+ /var/log/journal/%m/system.journal - - - - group:wheel:r--
d /var/lib/systemd 0755 root root -
d /var/lib/systemd/coredump 0755 root root 3d
+# Files and directories in /var/lib/systemd/ephemeral-trees are locked by pid 1 to prevent tmpfiles from
+# removing them while they are in use. tmpfiles is told to unconditionally clean up anything in
+# /var/lib/systemd/ephemeral-trees that isn't locked.
+d /var/lib/systemd/ephemeral-trees 0755 root root 0
d /var/lib/private 0700 root root -
d /var/log/private 0700 root root -