diff options
author | Daan De Meyer <daan.j.demeyer@gmail.com> | 2023-06-06 17:44:09 +0200 |
---|---|---|
committer | Daan De Meyer <daan.j.demeyer@gmail.com> | 2023-06-21 12:48:46 +0200 |
commit | 9c0c670125ff065c22c12bb82f1f20c7b2c8c46d (patch) | |
tree | ce51381fbcc6844f0f480db1aeb55ec7dfd97281 | |
parent | tmpfiles: Add note to man page about guaranteed cleanup for files/directories (diff) | |
download | systemd-9c0c670125ff065c22c12bb82f1f20c7b2c8c46d.tar.xz systemd-9c0c670125ff065c22c12bb82f1f20c7b2c8c46d.zip |
core: Add RootEphemeral= setting
This setting allows services to run in an ephemeral copy of the root
directory or root image. To make sure the ephemeral copies are always
cleaned up, we add a tmpfiles snippet to unconditionally clean up
/var/lib/systemd/ephemeral-trees. To prevent in-use ephemeral copies
from being cleaned up by tmpfiles, we use the newly added COPY_LOCK_BSD
and BTRFS_SNAPSHOT_LOCK_BSD flags to take a BSD lock on the ephemeral
copies, which instructs tmpfiles not to touch those ephemeral copies as
long as the BSD lock is held.
-rw-r--r-- | man/org.freedesktop.systemd1.xml | 24 | ||||
-rw-r--r-- | man/systemd.exec.xml | 20 | ||||
-rw-r--r-- | src/core/dbus-execute.c | 4 | ||||
-rw-r--r-- | src/core/execute.c | 174 | ||||
-rw-r--r-- | src/core/execute.h | 11 | ||||
-rw-r--r-- | src/core/load-fragment-gperf.gperf.in | 1 | ||||
-rw-r--r-- | src/core/service.c | 2 | ||||
-rw-r--r-- | src/core/unit.c | 2 | ||||
-rw-r--r-- | src/shared/bus-unit-util.c | 3 | ||||
-rw-r--r-- | test/fuzz/fuzz-unit-file/directives-all.service | 1 | ||||
-rw-r--r-- | tmpfiles.d/systemd.conf.in | 4 |
11 files changed, 208 insertions, 38 deletions
diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index b50ddb95e9..560ae252e3 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2950,6 +2950,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s RootVerity = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b RootEphemeral = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -3547,6 +3549,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { <!--property RootHashSignaturePath is not documented!--> + <!--property RootEphemeral is not documented!--> + <!--property OOMScoreAdjust is not documented!--> <!--property CoredumpFilter is not documented!--> @@ -4189,6 +4193,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { <variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/> + <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/> + <variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/> <variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/> @@ -4972,6 +4978,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s RootVerity = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b RootEphemeral = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -5581,6 +5589,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { <!--property RootHashSignaturePath is not 
documented!--> + <!--property RootEphemeral is not documented!--> + <!--property OOMScoreAdjust is not documented!--> <!--property CoredumpFilter is not documented!--> @@ -6203,6 +6213,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { <variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/> + <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/> + <variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/> <variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/> @@ -6861,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s RootVerity = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b RootEphemeral = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -7398,6 +7412,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { <!--property RootHashSignaturePath is not documented!--> + <!--property RootEphemeral is not documented!--> + <!--property OOMScoreAdjust is not documented!--> <!--property CoredumpFilter is not documented!--> @@ -7938,6 +7954,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { <variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/> + <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/> + <variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/> <variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/> @@ -8723,6 +8741,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s RootVerity = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b RootEphemeral 
= ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as ExtensionDirectories = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly a(sba(ss)) ExtensionImages = [...]; @@ -9246,6 +9266,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { <!--property RootHashSignaturePath is not documented!--> + <!--property RootEphemeral is not documented!--> + <!--property OOMScoreAdjust is not documented!--> <!--property CoredumpFilter is not documented!--> @@ -9772,6 +9794,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { <variablelist class="dbus-property" generated="True" extra-ref="RootVerity"/> + <variablelist class="dbus-property" generated="True" extra-ref="RootEphemeral"/> + <variablelist class="dbus-property" generated="True" extra-ref="ExtensionDirectories"/> <variablelist class="dbus-property" generated="True" extra-ref="ExtensionImages"/> diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 938a3c87a9..84eda5c584 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -201,6 +201,26 @@ </varlistentry> <varlistentry> + <term><varname>RootEphemeral=</varname></term> + + <listitem><para>Takes a boolean argument. If enabled, executed processes will run in an ephemeral + copy of the root directory or root image. The ephemeral copy is placed in + <filename>/var/lib/systemd/ephemeral-trees/</filename> while the service is active and is cleaned up + when the service is stopped or restarted. If <varname>RootDirectory=</varname> is used and the root + directory is a subvolume, the ephemeral copy will be created by making a snapshot of the subvolume. + </para> + + <para>To make sure making ephemeral copies can be made efficiently, the root directory or root image + should be located on the same filesystem as <filename>/var/lib/systemd/ephemeral-trees/</filename>. 
+ When using <varname>RootEphemeral=</varname> with root directories, btrfs should be used as the + filesystem and the root directory should ideally be a subvolume which <command>systemd</command> can + snapshot to make the ephemeral copy. For root images, a filesystem with support for reflinks should + be used to ensure an efficient ephemeral copy.</para> + + <xi:include href="system-only.xml" xpointer="singular"/></listitem> + </varlistentry> + + <varlistentry> <term><varname>RootHash=</varname></term> <listitem><para>Takes a data integrity (dm-verity) root hash specified in hexadecimal, or the path to a file diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 04070a7f1e..80a035ab90 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -1231,6 +1231,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("RootHashSignature", "ay", property_get_root_hash_sig, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RootHashSignaturePath", "s", NULL, offsetof(ExecContext, root_hash_sig_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RootVerity", "s", NULL, offsetof(ExecContext, root_verity), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("RootEphemeral", "b", bus_property_get_bool, offsetof(ExecContext, root_ephemeral), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ExtensionDirectories", "as", NULL, offsetof(ExecContext, extension_directories), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ExtensionImages", "a(sba(ss))", property_get_extension_images, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MountImages", "a(ssba(ss))", property_get_mount_images, 0, SD_BUS_VTABLE_PROPERTY_CONST), @@ -1865,6 +1866,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "RootDirectory")) return bus_set_transient_path(u, name, &c->root_directory, message, flags, error); + if (streq(name, "RootEphemeral")) + return bus_set_transient_bool(u, name, &c->root_ephemeral, message, flags, error); + if (streq(name, 
"SyslogIdentifier")) return bus_set_transient_string(u, name, &c->syslog_identifier, message, flags, error); diff --git a/src/core/execute.c b/src/core/execute.c index 204c5a1f8c..b7fe922c7a 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -15,6 +15,8 @@ #include <unistd.h> #include <utmpx.h> +#include <linux/fs.h> /* Must be included after <sys/mount.h> */ + #if HAVE_PAM #include <security/pam_appl.h> #endif @@ -43,8 +45,10 @@ #include "async.h" #include "barrier.h" #include "bpf-lsm.h" +#include "btrfs-util.h" #include "cap-list.h" #include "capability-util.h" +#include "chattr-util.h" #include "cgroup-setup.h" #include "chase.h" #include "chown-recursive.h" @@ -66,6 +70,7 @@ #include "io-util.h" #include "ioprio-util.h" #include "label-util.h" +#include "lock-util.h" #include "log.h" #include "macro.h" #include "manager.h" @@ -2170,6 +2175,10 @@ bool exec_needs_network_namespace(const ExecContext *context) { return context->private_network || context->network_namespace_path; } +static bool exec_needs_ephemeral(const ExecContext *context) { + return (context->root_image || context->root_directory) && context->root_ephemeral; +} + static bool exec_needs_ipc_namespace(const ExecContext *context) { assert(context); @@ -3823,6 +3832,63 @@ static bool insist_on_sandboxing( return false; } +static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) { + _cleanup_close_ int fd = -EBADF; + int r; + + if (!runtime || !runtime->ephemeral_copy) + return 0; + + r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX); + if (r < 0) + return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m"); + + CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]); + + fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT); + if (fd >= 0) + /* We got an fd! That means ephemeral has already been set up, so nothing to do here. 
*/ + return 0; + + if (fd != -EAGAIN) + return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m"); + + log_debug("Making ephemeral snapshot of %s to %s", + context->root_image ?: context->root_directory, runtime->ephemeral_copy); + + if (context->root_image) + fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600, + COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME); + else + fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory, + AT_FDCWD, runtime->ephemeral_copy, + BTRFS_SNAPSHOT_FALLBACK_COPY | + BTRFS_SNAPSHOT_FALLBACK_DIRECTORY | + BTRFS_SNAPSHOT_RECURSIVE | + BTRFS_SNAPSHOT_LOCK_BSD); + if (fd < 0) + return log_debug_errno(fd, "Failed to snapshot %s to %s: %m", + context->root_image ?: context->root_directory, runtime->ephemeral_copy); + + if (context->root_image) { + /* A root image might be subject to lots of random writes so let's try to disable COW on it + * which tends to not perform well in combination with lots of random writes. + * + * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed + * copy, but we at least want to make the intention clear. 
+ */ + r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL); + if (r < 0) + log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy); + } + + r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT); + if (r < 0) + return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m"); + + return 1; +} + static int verity_settings_prepare( VeritySettings *verity, const char *root_image, @@ -3884,7 +3950,7 @@ static int apply_mount_namespace( ExecCommandFlags command_flags, const ExecContext *context, const ExecParameters *params, - const ExecRuntime *runtime, + ExecRuntime *runtime, const char *memory_pressure_path, char **error_path) { @@ -3906,10 +3972,14 @@ static int apply_mount_namespace( CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many); if (params->flags & EXEC_APPLY_CHROOT) { - root_image = context->root_image; + r = setup_ephemeral(context, runtime); + if (r < 0) + return r; - if (!root_image) - root_dir = context->root_directory; + if (context->root_image) + root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image; + else + root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory; } r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories); @@ -4090,6 +4160,7 @@ static int apply_mount_namespace( static int apply_working_directory( const ExecContext *context, const ExecParameters *params, + ExecRuntime *runtime, const char *home, int *exit_status) { @@ -4113,7 +4184,7 @@ static int apply_working_directory( if (params->flags & EXEC_APPLY_CHROOT) d = wd; else - d = prefix_roota(context->root_directory, wd); + d = prefix_roota((runtime ? 
runtime->ephemeral_copy : NULL) ?: context->root_directory, wd); if (chdir(d) < 0 && !context->working_directory_missing_ok) { *exit_status = EXIT_CHDIR; @@ -4126,6 +4197,7 @@ static int apply_working_directory( static int apply_root_directory( const ExecContext *context, const ExecParameters *params, + ExecRuntime *runtime, const bool needs_mount_ns, int *exit_status) { @@ -4134,7 +4206,7 @@ static int apply_root_directory( if (params->flags & EXEC_APPLY_CHROOT) if (!needs_mount_ns && context->root_directory) - if (chroot(context->root_directory) < 0) { + if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) { *exit_status = EXIT_CHROOT; return -errno; } @@ -4271,7 +4343,7 @@ static int close_remaining_fds( const int *fds, size_t n_fds) { size_t n_dont_close = 0; - int dont_close[n_fds + 12]; + int dont_close[n_fds + 14]; assert(params); @@ -4289,6 +4361,9 @@ static int close_remaining_fds( n_dont_close += n_fds; } + if (runtime) + append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket); + if (runtime && runtime->shared) { append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket); append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket); @@ -5584,7 +5659,7 @@ static int exec_child( } /* chroot to root directory first, before we lose the ability to chroot */ - r = apply_root_directory(context, params, needs_mount_namespace, exit_status); + r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status); if (r < 0) return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m"); @@ -5610,7 +5685,7 @@ static int exec_child( /* Apply working directory here, because the working directory might be on NFS and only the user running * this service might have the correct privilege to change to the working directory */ - r = apply_working_directory(context, params, home, exit_status); + r = 
apply_working_directory(context, params, runtime, home, exit_status); if (r < 0) return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m"); @@ -6422,6 +6497,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { "%sUMask: %04o\n" "%sWorkingDirectory: %s\n" "%sRootDirectory: %s\n" + "%sRootEphemeral: %s\n" "%sNonBlocking: %s\n" "%sPrivateTmp: %s\n" "%sPrivateDevices: %s\n" @@ -6446,6 +6522,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { prefix, c->umask, prefix, empty_to_root(c->working_directory), prefix, empty_to_root(c->root_directory), + prefix, yes_no(c->root_ephemeral), prefix, yes_no(c->non_blocking), prefix, yes_no(c->private_tmp), prefix, yes_no(c->private_devices), @@ -7241,13 +7318,30 @@ int exec_command_append(ExecCommand *c, const char *path, ...) { return 0; } -static void *remove_tmpdir_thread(void *p) { +static void *rm_rf_thread(void *p) { _cleanup_free_ char *path = p; - (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL); + (void) rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL); return NULL; } +static void asynchronous_rm_rf(char **path) { + int r; + + assert(path); + + if (!*path || streq(*path, RUN_SYSTEMD_EMPTY)) + return; + + log_debug("Spawning thread to nuke %s", *path); + + r = asynchronous_job(rm_rf_thread, *path); + if (r < 0) + log_warning_errno(r, "Failed to nuke %s: %m", *path); + else + *path = NULL; +} + static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) { if (!rt) return NULL; @@ -7267,8 +7361,6 @@ DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_ru DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free); ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) { - int r; - if (!rt) return NULL; @@ -7278,25 +7370,8 @@ ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) { if (rt->n_ref > 0) return NULL; - if (rt->tmp_dir && 
!streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) { - log_debug("Spawning thread to nuke %s", rt->tmp_dir); - - r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir); - if (r < 0) - log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir); - else - rt->tmp_dir = NULL; - } - - if (rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) { - log_debug("Spawning thread to nuke %s", rt->var_tmp_dir); - - r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir); - if (r < 0) - log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir); - else - rt->var_tmp_dir = NULL; - } + asynchronous_rm_rf(&rt->tmp_dir); + asynchronous_rm_rf(&rt->var_tmp_dir); return exec_shared_runtime_free(rt); } @@ -7731,16 +7806,39 @@ void exec_shared_runtime_vacuum(Manager *m) { } } -int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret) { +int exec_runtime_make( + const Unit *unit, + const ExecContext *context, + ExecSharedRuntime *shared, + DynamicCreds *creds, + ExecRuntime **ret) { + _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF; + _cleanup_free_ char *ephemeral = NULL; _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL; + int r; + assert(unit); + assert(context); assert(ret); - if (!shared && !creds) { + if (!shared && !creds && !exec_needs_ephemeral(context)) { *ret = NULL; return 0; } + if (exec_needs_ephemeral(context)) { + r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755); + if (r < 0) + return r; + + r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral); + if (r < 0) + return r; + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0) + return -errno; + } + rt = new(ExecRuntime, 1); if (!rt) return -ENOMEM; @@ -7748,6 +7846,9 @@ int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntim *rt = (ExecRuntime) { .shared = shared, .dynamic_creds = creds, + .ephemeral_copy = TAKE_PTR(ephemeral), + .ephemeral_storage_socket[0] = 
TAKE_FD(ephemeral_storage_socket[0]), + .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]), }; *ret = TAKE_PTR(rt); @@ -7760,6 +7861,11 @@ ExecRuntime* exec_runtime_free(ExecRuntime *rt) { exec_shared_runtime_unref(rt->shared); dynamic_creds_unref(rt->dynamic_creds); + + asynchronous_rm_rf(&rt->ephemeral_copy); + + free(rt->ephemeral_copy); + safe_close_pair(rt->ephemeral_storage_socket); return mfree(rt); } diff --git a/src/core/execute.h b/src/core/execute.h index 953dc9e7f7..ee73fb6367 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -129,6 +129,14 @@ struct ExecSharedRuntime { struct ExecRuntime { ExecSharedRuntime *shared; DynamicCreds *dynamic_creds; + + /* The path to the ephemeral snapshot of the root directory or root image if one was requested. */ + char *ephemeral_copy; + + /* An AF_UNIX socket pair that receives the locked file descriptor referring to the ephemeral copy of + * the root directory or root image. The lock prevents tmpfiles from removing the ephemeral snapshot + * until we're done using it. 
*/ + int ephemeral_storage_socket[2]; }; typedef enum ExecDirectoryType { @@ -195,6 +203,7 @@ struct ExecContext { void *root_hash, *root_hash_sig; size_t root_hash_size, root_hash_sig_size; LIST_HEAD(MountOptions, root_image_options); + bool root_ephemeral; bool working_directory_missing_ok:1; bool working_directory_home:1; @@ -506,7 +515,7 @@ int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds); void exec_shared_runtime_vacuum(Manager *m); -int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret); +int exec_runtime_make(const Unit *unit, const ExecContext *context, ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret); ExecRuntime* exec_runtime_free(ExecRuntime *rt); DEFINE_TRIVIAL_CLEANUP_FUNC(ExecRuntime*, exec_runtime_free); ExecRuntime* exec_runtime_destroy(ExecRuntime *rt); diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index ae318dae89..382b60ea90 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -10,6 +10,7 @@ {{type}}.RootHash, config_parse_exec_root_hash, 0, offsetof({{type}}, exec_context) {{type}}.RootHashSignature, config_parse_exec_root_hash_sig, 0, offsetof({{type}}, exec_context) {{type}}.RootVerity, config_parse_unit_path_printf, true, offsetof({{type}}, exec_context.root_verity) +{{type}}.RootEphemeral, config_parse_bool, 0, offsetof({{type}}, exec_context.root_ephemeral) {{type}}.ExtensionDirectories, config_parse_namespace_path_strv, 0, offsetof({{type}}, exec_context.extension_directories) {{type}}.ExtensionImages, config_parse_extension_images, 0, offsetof({{type}}, exec_context) {{type}}.ExtensionImagePolicy, config_parse_image_policy, 0, offsetof({{type}}, exec_context.extension_image_policy) diff --git a/src/core/service.c b/src/core/service.c index cecdd3bf50..146b892e46 100644 --- 
a/src/core/service.c +++ b/src/core/service.c @@ -2029,7 +2029,7 @@ static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart) /* Reset NotifyAccess override */ s->notify_access_override = _NOTIFY_ACCESS_INVALID; - /* We want fresh tmpdirs in case service is started again immediately */ + /* We want fresh tmpdirs and ephemeral snapshots in case the service is started again immediately. */ s->exec_runtime = exec_runtime_destroy(s->exec_runtime); /* Also, remove the runtime directory */ diff --git a/src/core/unit.c b/src/core/unit.c index 570234c8f4..f51b5687f8 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -5012,7 +5012,7 @@ int unit_setup_exec_runtime(Unit *u) { return r; } - r = exec_runtime_make(esr, dcreds, rt); + r = exec_runtime_make(u, ec, esr, dcreds, rt); if (r < 0) return r; diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 6e93d0ca43..cc287feb8e 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -992,7 +992,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con "LockPersonality", "ProtectHostname", "MemoryKSM", - "RestrictSUIDSGID")) + "RestrictSUIDSGID", + "RootEphemeral")) return bus_append_parse_boolean(m, field, eq); if (STR_IN_SET(field, "ReadWriteDirectories", diff --git a/test/fuzz/fuzz-unit-file/directives-all.service b/test/fuzz/fuzz-unit-file/directives-all.service index 818fb28dbf..4bdc48a59b 100644 --- a/test/fuzz/fuzz-unit-file/directives-all.service +++ b/test/fuzz/fuzz-unit-file/directives-all.service @@ -222,6 +222,7 @@ RootImage= RootHash= RootHashSignature= RootVerity= +RootEphemeral= ExtensionDirectories= ExtensionImages= RuntimeMaxSec= diff --git a/tmpfiles.d/systemd.conf.in b/tmpfiles.d/systemd.conf.in index fa838d8d06..3781c579e0 100644 --- a/tmpfiles.d/systemd.conf.in +++ b/tmpfiles.d/systemd.conf.in @@ -60,6 +60,10 @@ a+ /var/log/journal/%m/system.journal - - - - group:wheel:r-- d /var/lib/systemd 0755 root root - d 
/var/lib/systemd/coredump 0755 root root 3d +# Files and directories in /var/lib/systemd/ephemeral-trees are locked by pid 1 to prevent tmpfiles from +# removing them, and tmpfiles is told to unconditionally clean up anything in +# /var/lib/systemd/ephemeral-trees that isn't locked. +d /var/lib/systemd/ephemeral-trees 0755 root root 0 d /var/lib/private 0700 root root - d /var/log/private 0700 root root - |