diff options
author | Roman Gushchin <roman.gushchin@linux.dev> | 2024-06-25 02:58:58 +0200 |
---|---|---|
committer | Andrew Morton <akpm@linux-foundation.org> | 2024-07-05 03:05:52 +0200 |
commit | 66d60c428b23c5b171218985b6880958fb7edbe3 (patch) | |
tree | 7db78ef6a73daa7c5326206b8f20326ae0ae3a32 /mm/memcontrol-v1.c | |
parent | mm: memcg: rename charge move-related functions (diff) | |
download | linux-66d60c428b23c5b171218985b6880958fb7edbe3.tar.xz linux-66d60c428b23c5b171218985b6880958fb7edbe3.zip |
mm: memcg: move legacy memcg event code into memcontrol-v1.c
Cgroup v1's memory controller contains a pretty complicated event
notifications mechanism which is not used on cgroup v2. Let's move the
corresponding code into memcontrol-v1.c.
Please, note, that mem_cgroup_event_ratelimit() remains in memcontrol.c,
otherwise it would require exporting too many details on memcg stats
outside of memcontrol.c.
Link: https://lkml.kernel.org/r/20240625005906.106920-7-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol-v1.c')
-rw-r--r-- | mm/memcontrol-v1.c | 653 |
1 files changed, 653 insertions, 0 deletions
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index c25e038ac874..4b2290ceace6 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -6,6 +6,10 @@ #include <linux/pagewalk.h> #include <linux/backing-dev.h> #include <linux/swap_cgroup.h> +#include <linux/eventfd.h> +#include <linux/poll.h> +#include <linux/sort.h> +#include <linux/file.h> #include "internal.h" #include "swap.h" @@ -60,6 +64,54 @@ static struct move_charge_struct { .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), }; +/* for OOM */ +struct mem_cgroup_eventfd_list { + struct list_head list; + struct eventfd_ctx *eventfd; +}; + +/* + * cgroup_event represents events which userspace want to receive. + */ +struct mem_cgroup_event { + /* + * memcg which the event belongs to. + */ + struct mem_cgroup *memcg; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removing. This callback must be set, + * if you want provide notification functionality. + */ + void (*unregister_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd); + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_entry_t wait; + struct work_struct remove; +}; + +extern spinlock_t memcg_oom_lock; + static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, struct mem_cgroup_tree_per_node *mctz, unsigned long new_usage_in_excess) @@ -1306,6 +1358,607 @@ void memcg1_move_task(void) } #endif +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) +{ + struct mem_cgroup_threshold_ary *t; + unsigned long usage; + int i; + + rcu_read_lock(); + if (!swap) + t = rcu_dereference(memcg->thresholds.primary); + else + t = rcu_dereference(memcg->memsw_thresholds.primary); + + if (!t) + goto unlock; + + usage = mem_cgroup_usage(memcg, swap); + + /* + * current_threshold points to threshold just below or equal to usage. + * If it's not true, a threshold was crossed after last + * call of __mem_cgroup_threshold(). + */ + i = t->current_threshold; + + /* + * Iterate backward over array of thresholds starting from + * current_threshold and check if a threshold is crossed. + * If none of thresholds below usage is crossed, we read + * only one element of the array here. + */ + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) + eventfd_signal(t->entries[i].eventfd); + + /* i = current_threshold + 1 */ + i++; + + /* + * Iterate forward over array of thresholds starting from + * current_threshold+1 and check if a threshold is crossed. + * If none of thresholds above usage is crossed, we read + * only one element of the array here. + */ + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) + eventfd_signal(t->entries[i].eventfd); + + /* Update current_threshold */ + t->current_threshold = i - 1; +unlock: + rcu_read_unlock(); +} + +static void mem_cgroup_threshold(struct mem_cgroup *memcg) +{ + while (memcg) { + __mem_cgroup_threshold(memcg, false); + if (do_memsw_account()) + __mem_cgroup_threshold(memcg, true); + + memcg = parent_mem_cgroup(memcg); + } +} + +/* + * Check events in order. + * + */ +void memcg_check_events(struct mem_cgroup *memcg, int nid) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return; + + /* threshold event is triggered in finer grain than soft limit */ + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit; + + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); + mem_cgroup_threshold(memcg); + if (unlikely(do_softlimit)) + memcg1_update_tree(memcg, nid); + } +} + +static int compare_thresholds(const void *a, const void *b) +{ + const struct mem_cgroup_threshold *_a = a; + const struct mem_cgroup_threshold *_b = b; + + if (_a->threshold > _b->threshold) + return 1; + + if (_a->threshold < _b->threshold) + return -1; + + return 0; +} + +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) +{ + struct mem_cgroup_eventfd_list *ev; + + spin_lock(&memcg_oom_lock); + + list_for_each_entry(ev, &memcg->oom_notify, list) + eventfd_signal(ev->eventfd); + + spin_unlock(&memcg_oom_lock); + return 0; +} + +void mem_cgroup_oom_notify(struct mem_cgroup *memcg) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) + mem_cgroup_oom_notify_cb(iter); +} + +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) +{ + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + unsigned long threshold; + unsigned long usage; + int i, size, ret; + + ret = page_counter_memparse(args, "-1", &threshold); + if (ret) + return ret; + + mutex_lock(&memcg->thresholds_lock); + + if (type == _MEM) { + thresholds = &memcg->thresholds; + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; + usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + + /* Check if a threshold crossed before adding a new one */ + if (thresholds->primary) + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + size = thresholds->primary ? thresholds->primary->size + 1 : 1; + + /* Allocate memory for new array of thresholds */ + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto unlock; + } + new->size = size; + + /* Copy thresholds (if any) to new array */ + if (thresholds->primary) + memcpy(new->entries, thresholds->primary->entries, + flex_array_size(new, entries, size - 1)); + + /* Add new threshold */ + new->entries[size - 1].eventfd = eventfd; + new->entries[size - 1].threshold = threshold; + + /* Sort thresholds. Registering of new threshold isn't time-critical */ + sort(new->entries, size, sizeof(*new->entries), + compare_thresholds, NULL); + + /* Find current threshold */ + new->current_threshold = -1; + for (i = 0; i < size; i++) { + if (new->entries[i].threshold <= usage) { + /* + * new->current_threshold will not be used until + * rcu_assign_pointer(), so it's safe to increment + * it here. + */ + ++new->current_threshold; + } else + break; + } + + /* Free old spare buffer and save old primary buffer as spare */ + kfree(thresholds->spare); + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + +unlock: + mutex_unlock(&memcg->thresholds_lock); + + return ret; +} + +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, enum res_type type) +{ + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + unsigned long usage; + int i, j, size, entries; + + mutex_lock(&memcg->thresholds_lock); + + if (type == _MEM) { + thresholds = &memcg->thresholds; + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; + usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + + if (!thresholds->primary) + goto unlock; + + /* Check if a threshold crossed before removing */ + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + /* Calculate new number of threshold */ + size = entries = 0; + for (i = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd != eventfd) + size++; + else + entries++; + } + + new = thresholds->spare; + + /* If no items related to eventfd have been cleared, nothing to do */ + if (!entries) + goto unlock; + + /* Set thresholds array to NULL if we don't have thresholds */ + if (!size) { + kfree(new); + new = NULL; + goto swap_buffers; + } + + new->size = size; + + /* Copy thresholds and find current threshold */ + new->current_threshold = -1; + for (i = 0, j = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd == eventfd) + continue; + + new->entries[j] = thresholds->primary->entries[i]; + if (new->entries[j].threshold <= usage) { + /* + * new->current_threshold will not be used + * until rcu_assign_pointer(), so it's safe to increment + * it here. + */ + ++new->current_threshold; + } + j++; + } + +swap_buffers: + /* Swap primary and spare array */ + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } +unlock: + mutex_unlock(&memcg->thresholds_lock); +} + +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); +} + +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup_eventfd_list *event; + + event = kmalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + spin_lock(&memcg_oom_lock); + + event->eventfd = eventfd; + list_add(&event->list, &memcg->oom_notify); + + /* already in OOM ? */ + if (memcg->under_oom) + eventfd_signal(eventfd); + spin_unlock(&memcg_oom_lock); + + return 0; +} + +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + struct mem_cgroup_eventfd_list *ev, *tmp; + + spin_lock(&memcg_oom_lock); + + list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { + if (ev->eventfd == eventfd) { + list_del(&ev->list); + kfree(ev); + } + } + + spin_unlock(&memcg_oom_lock); +} + +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + +/* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void memcg_event_remove(struct work_struct *work) +{ + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); + struct mem_cgroup *memcg = event->memcg; + + remove_wait_queue(event->wqh, &event->wait); + + event->unregister_event(memcg, event->eventfd); + + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(&memcg->css); +} + +/* + * Gets called on EPOLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, + int sync, void *key) +{ + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); + struct mem_cgroup *memcg = event->memcg; + __poll_t flags = key_to_poll(key); + + if (flags & EPOLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&memcg->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); + } + + return 0; +} + +static void memcg_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * DO NOT USE IN NEW FILES. + * + * Parse input and register new cgroup event handler. + * + * Input must be in format '<event_fd> <control_fd> <args>'. + * Interpretation of args is defined by control file implementation. + */ +ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + struct dentry *cdentry; + const char *name; + char *endp; + int ret; + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return -EOPNOTSUPP; + + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buf = endp + 1; + + cfd = simple_strtoul(buf, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buf = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + event->memcg = memcg; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process need read permission on control file */ + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = file_permission(cfile.file, MAY_READ); + if (ret < 0) + goto out_put_cfile; + + /* + * The control file must be a regular cgroup1 file. As a regular cgroup + * file can't be renamed, it's safe to access its name afterwards. + */ + cdentry = cfile.file->f_path.dentry; + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { + ret = -EINVAL; + goto out_put_cfile; + } + + /* + * Determine the event callbacks and set them in @event. This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. + */ + name = cdentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + + /* + * Verify @cfile should belong to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. + */ + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, + &memory_cgrp_subsys); + ret = -EINVAL; + if (IS_ERR(cfile_css)) + goto out_put_cfile; + if (cfile_css != css) { + css_put(cfile_css); + goto out_put_cfile; + } + + ret = event->register_event(memcg, event->eventfd, buf); + if (ret) + goto out_put_css; + + vfs_poll(efile.file, &event->pt); + + spin_lock_irq(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock_irq(&memcg->event_list_lock); + + fdput(cfile); + fdput(efile); + + return nbytes; + +out_put_css: + css_put(css); +out_put_cfile: + fdput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fdput(efile); +out_kfree: + kfree(event); + + return ret; +} + +void memcg1_css_offline(struct mem_cgroup *memcg) +{ + struct mem_cgroup_event *event, *tmp; + + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. + */ + spin_lock_irq(&memcg->event_list_lock); + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock_irq(&memcg->event_list_lock); +} + static int __init memcg1_init(void) { int node; |