Message-ID: <Znpsy2rISbnT4Ffi@tiehlicka>
Date: Tue, 25 Jun 2024 09:07:55 +0200
From: Michal Hocko <mhocko@...e.com>
To: Roman Gushchin <roman.gushchin@...ux.dev>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Johannes Weiner <hannes@...xchg.org>,
Shakeel Butt <shakeel.butt@...ux.dev>,
Muchun Song <muchun.song@...ux.dev>, linux-kernel@...r.kernel.org,
cgroups@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCH v2 06/14] mm: memcg: move legacy memcg event code into
memcontrol-v1.c
On Mon 24-06-24 17:58:58, Roman Gushchin wrote:
> Cgroup v1's memory controller contains a pretty complicated
> event notification mechanism which is not used on cgroup v2.
> Let's move the corresponding code into memcontrol-v1.c.
>
> Please note that mem_cgroup_event_ratelimit() remains in
> memcontrol.c; moving it would require exporting too many
> details of memcg stats outside of memcontrol.c.
>
> Signed-off-by: Roman Gushchin <roman.gushchin@...ux.dev>
Acked-by: Michal Hocko <mhocko@...e.com>
> ---
> include/linux/memcontrol.h | 12 -
> mm/memcontrol-v1.c | 653 +++++++++++++++++++++++++++++++++++
> mm/memcontrol-v1.h | 51 +++
> mm/memcontrol.c | 687 +------------------------------------
> 4 files changed, 709 insertions(+), 694 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 83c8327455d8..588179d29849 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -69,18 +69,6 @@ struct mem_cgroup_id {
> refcount_t ref;
> };
>
> -/*
> - * Per memcg event counter is incremented at every pagein/pageout. With THP,
> - * it will be incremented by the number of pages. This counter is used
> - * to trigger some periodic events. This is straightforward and better
> - * than using jiffies etc. to handle periodic memcg event.
> - */
> -enum mem_cgroup_events_target {
> - MEM_CGROUP_TARGET_THRESH,
> - MEM_CGROUP_TARGET_SOFTLIMIT,
> - MEM_CGROUP_NTARGETS,
> -};
> -
> struct memcg_vmstats_percpu;
> struct memcg_vmstats;
> struct lruvec_stats_percpu;
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index c25e038ac874..4b2290ceace6 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -6,6 +6,10 @@
> #include <linux/pagewalk.h>
> #include <linux/backing-dev.h>
> #include <linux/swap_cgroup.h>
> +#include <linux/eventfd.h>
> +#include <linux/poll.h>
> +#include <linux/sort.h>
> +#include <linux/file.h>
>
> #include "internal.h"
> #include "swap.h"
> @@ -60,6 +64,54 @@ static struct move_charge_struct {
> .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
> };
>
> +/* for OOM */
> +struct mem_cgroup_eventfd_list {
> + struct list_head list;
> + struct eventfd_ctx *eventfd;
> +};
> +
> +/*
> + * cgroup_event represents events which userspace want to receive.
> + */
> +struct mem_cgroup_event {
> + /*
> + * memcg which the event belongs to.
> + */
> + struct mem_cgroup *memcg;
> + /*
> + * eventfd to signal userspace about the event.
> + */
> + struct eventfd_ctx *eventfd;
> + /*
> + * Each of these stored in a list by the cgroup.
> + */
> + struct list_head list;
> + /*
> + * register_event() callback will be used to add new userspace
> + * waiter for changes related to this event. Use eventfd_signal()
> + * on eventfd to send notification to userspace.
> + */
> + int (*register_event)(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd, const char *args);
> + /*
> + * unregister_event() callback will be called when userspace closes
> + * the eventfd or on cgroup removing. This callback must be set,
> + * if you want provide notification functionality.
> + */
> + void (*unregister_event)(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd);
> + /*
> + * All fields below needed to unregister event when
> + * userspace closes eventfd.
> + */
> + poll_table pt;
> + wait_queue_head_t *wqh;
> + wait_queue_entry_t wait;
> + struct work_struct remove;
> +};
> +
> +extern spinlock_t memcg_oom_lock;
> +
> static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
> struct mem_cgroup_tree_per_node *mctz,
> unsigned long new_usage_in_excess)
> @@ -1306,6 +1358,607 @@ void memcg1_move_task(void)
> }
> #endif
>
> +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
> +{
> + struct mem_cgroup_threshold_ary *t;
> + unsigned long usage;
> + int i;
> +
> + rcu_read_lock();
> + if (!swap)
> + t = rcu_dereference(memcg->thresholds.primary);
> + else
> + t = rcu_dereference(memcg->memsw_thresholds.primary);
> +
> + if (!t)
> + goto unlock;
> +
> + usage = mem_cgroup_usage(memcg, swap);
> +
> + /*
> + * current_threshold points to threshold just below or equal to usage.
> + * If it's not true, a threshold was crossed after last
> + * call of __mem_cgroup_threshold().
> + */
> + i = t->current_threshold;
> +
> + /*
> + * Iterate backward over array of thresholds starting from
> + * current_threshold and check if a threshold is crossed.
> + * If none of thresholds below usage is crossed, we read
> + * only one element of the array here.
> + */
> + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
> + eventfd_signal(t->entries[i].eventfd);
> +
> + /* i = current_threshold + 1 */
> + i++;
> +
> + /*
> + * Iterate forward over array of thresholds starting from
> + * current_threshold+1 and check if a threshold is crossed.
> + * If none of thresholds above usage is crossed, we read
> + * only one element of the array here.
> + */
> + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
> + eventfd_signal(t->entries[i].eventfd);
> +
> + /* Update current_threshold */
> + t->current_threshold = i - 1;
> +unlock:
> + rcu_read_unlock();
> +}
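
(Just to make the two walks above concrete, with made-up numbers: given
sorted thresholds {8M, 16M, 32M}, current_threshold pointing at the 16M
entry and usage having dropped to 4M, the backward pass signals the 16M
and 8M eventfds and leaves current_threshold at -1. Had usage instead
grown to 40M, the forward pass would signal the 32M eventfd and move
current_threshold to the 32M entry.)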
> +
> +static void mem_cgroup_threshold(struct mem_cgroup *memcg)
> +{
> + while (memcg) {
> + __mem_cgroup_threshold(memcg, false);
> + if (do_memsw_account())
> + __mem_cgroup_threshold(memcg, true);
> +
> + memcg = parent_mem_cgroup(memcg);
> + }
> +}
> +
> +/*
> + * Check events in order.
> + *
> + */
> +void memcg_check_events(struct mem_cgroup *memcg, int nid)
> +{
> + if (IS_ENABLED(CONFIG_PREEMPT_RT))
> + return;
> +
> + /* threshold event is triggered in finer grain than soft limit */
> + if (unlikely(mem_cgroup_event_ratelimit(memcg,
> + MEM_CGROUP_TARGET_THRESH))) {
> + bool do_softlimit;
> +
> + do_softlimit = mem_cgroup_event_ratelimit(memcg,
> + MEM_CGROUP_TARGET_SOFTLIMIT);
> + mem_cgroup_threshold(memcg);
> + if (unlikely(do_softlimit))
> + memcg1_update_tree(memcg, nid);
> + }
> +}
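
(For reference, with the existing THRESHOLDS_EVENTS_TARGET and
SOFTLIMIT_EVENTS_TARGET of 128 and 1024 -- which stay in memcontrol.c
next to mem_cgroup_event_ratelimit() -- this roughly means the threshold
check fires about every 128 charged/uncharged pages per CPU and the soft
limit tree update about every 1024.)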
> +
> +static int compare_thresholds(const void *a, const void *b)
> +{
> + const struct mem_cgroup_threshold *_a = a;
> + const struct mem_cgroup_threshold *_b = b;
> +
> + if (_a->threshold > _b->threshold)
> + return 1;
> +
> + if (_a->threshold < _b->threshold)
> + return -1;
> +
> + return 0;
> +}
> +
> +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
> +{
> + struct mem_cgroup_eventfd_list *ev;
> +
> + spin_lock(&memcg_oom_lock);
> +
> + list_for_each_entry(ev, &memcg->oom_notify, list)
> + eventfd_signal(ev->eventfd);
> +
> + spin_unlock(&memcg_oom_lock);
> + return 0;
> +}
> +
> +void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
> +{
> + struct mem_cgroup *iter;
> +
> + for_each_mem_cgroup_tree(iter, memcg)
> + mem_cgroup_oom_notify_cb(iter);
> +}
> +
> +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd, const char *args, enum res_type type)
> +{
> + struct mem_cgroup_thresholds *thresholds;
> + struct mem_cgroup_threshold_ary *new;
> + unsigned long threshold;
> + unsigned long usage;
> + int i, size, ret;
> +
> + ret = page_counter_memparse(args, "-1", &threshold);
> + if (ret)
> + return ret;
> +
> + mutex_lock(&memcg->thresholds_lock);
> +
> + if (type == _MEM) {
> + thresholds = &memcg->thresholds;
> + usage = mem_cgroup_usage(memcg, false);
> + } else if (type == _MEMSWAP) {
> + thresholds = &memcg->memsw_thresholds;
> + usage = mem_cgroup_usage(memcg, true);
> + } else
> + BUG();
> +
> + /* Check if a threshold crossed before adding a new one */
> + if (thresholds->primary)
> + __mem_cgroup_threshold(memcg, type == _MEMSWAP);
> +
> + size = thresholds->primary ? thresholds->primary->size + 1 : 1;
> +
> + /* Allocate memory for new array of thresholds */
> + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
> + if (!new) {
> + ret = -ENOMEM;
> + goto unlock;
> + }
> + new->size = size;
> +
> + /* Copy thresholds (if any) to new array */
> + if (thresholds->primary)
> + memcpy(new->entries, thresholds->primary->entries,
> + flex_array_size(new, entries, size - 1));
> +
> + /* Add new threshold */
> + new->entries[size - 1].eventfd = eventfd;
> + new->entries[size - 1].threshold = threshold;
> +
> + /* Sort thresholds. Registering of new threshold isn't time-critical */
> + sort(new->entries, size, sizeof(*new->entries),
> + compare_thresholds, NULL);
> +
> + /* Find current threshold */
> + new->current_threshold = -1;
> + for (i = 0; i < size; i++) {
> + if (new->entries[i].threshold <= usage) {
> + /*
> + * new->current_threshold will not be used until
> + * rcu_assign_pointer(), so it's safe to increment
> + * it here.
> + */
> + ++new->current_threshold;
> + } else
> + break;
> + }
> +
> + /* Free old spare buffer and save old primary buffer as spare */
> + kfree(thresholds->spare);
> + thresholds->spare = thresholds->primary;
> +
> + rcu_assign_pointer(thresholds->primary, new);
> +
> + /* To be sure that nobody uses thresholds */
> + synchronize_rcu();
> +
> +unlock:
> + mutex_unlock(&memcg->thresholds_lock);
> +
> + return ret;
> +}
> +
> +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd, const char *args)
> +{
> + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
> +}
> +
> +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd, const char *args)
> +{
> + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
> +}
> +
> +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd, enum res_type type)
> +{
> + struct mem_cgroup_thresholds *thresholds;
> + struct mem_cgroup_threshold_ary *new;
> + unsigned long usage;
> + int i, j, size, entries;
> +
> + mutex_lock(&memcg->thresholds_lock);
> +
> + if (type == _MEM) {
> + thresholds = &memcg->thresholds;
> + usage = mem_cgroup_usage(memcg, false);
> + } else if (type == _MEMSWAP) {
> + thresholds = &memcg->memsw_thresholds;
> + usage = mem_cgroup_usage(memcg, true);
> + } else
> + BUG();
> +
> + if (!thresholds->primary)
> + goto unlock;
> +
> + /* Check if a threshold crossed before removing */
> + __mem_cgroup_threshold(memcg, type == _MEMSWAP);
> +
> + /* Calculate new number of threshold */
> + size = entries = 0;
> + for (i = 0; i < thresholds->primary->size; i++) {
> + if (thresholds->primary->entries[i].eventfd != eventfd)
> + size++;
> + else
> + entries++;
> + }
> +
> + new = thresholds->spare;
> +
> + /* If no items related to eventfd have been cleared, nothing to do */
> + if (!entries)
> + goto unlock;
> +
> + /* Set thresholds array to NULL if we don't have thresholds */
> + if (!size) {
> + kfree(new);
> + new = NULL;
> + goto swap_buffers;
> + }
> +
> + new->size = size;
> +
> + /* Copy thresholds and find current threshold */
> + new->current_threshold = -1;
> + for (i = 0, j = 0; i < thresholds->primary->size; i++) {
> + if (thresholds->primary->entries[i].eventfd == eventfd)
> + continue;
> +
> + new->entries[j] = thresholds->primary->entries[i];
> + if (new->entries[j].threshold <= usage) {
> + /*
> + * new->current_threshold will not be used
> + * until rcu_assign_pointer(), so it's safe to increment
> + * it here.
> + */
> + ++new->current_threshold;
> + }
> + j++;
> + }
> +
> +swap_buffers:
> + /* Swap primary and spare array */
> + thresholds->spare = thresholds->primary;
> +
> + rcu_assign_pointer(thresholds->primary, new);
> +
> + /* To be sure that nobody uses thresholds */
> + synchronize_rcu();
> +
> + /* If all events are unregistered, free the spare array */
> + if (!new) {
> + kfree(thresholds->spare);
> + thresholds->spare = NULL;
> + }
> +unlock:
> + mutex_unlock(&memcg->thresholds_lock);
> +}
> +
> +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd)
> +{
> + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
> +}
> +
> +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd)
> +{
> + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
> +}
> +
> +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd, const char *args)
> +{
> + struct mem_cgroup_eventfd_list *event;
> +
> + event = kmalloc(sizeof(*event), GFP_KERNEL);
> + if (!event)
> + return -ENOMEM;
> +
> + spin_lock(&memcg_oom_lock);
> +
> + event->eventfd = eventfd;
> + list_add(&event->list, &memcg->oom_notify);
> +
> + /* already in OOM ? */
> + if (memcg->under_oom)
> + eventfd_signal(eventfd);
> + spin_unlock(&memcg_oom_lock);
> +
> + return 0;
> +}
> +
> +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
> + struct eventfd_ctx *eventfd)
> +{
> + struct mem_cgroup_eventfd_list *ev, *tmp;
> +
> + spin_lock(&memcg_oom_lock);
> +
> + list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
> + if (ev->eventfd == eventfd) {
> + list_del(&ev->list);
> + kfree(ev);
> + }
> + }
> +
> + spin_unlock(&memcg_oom_lock);
> +}
> +
> +/*
> + * DO NOT USE IN NEW FILES.
> + *
> + * "cgroup.event_control" implementation.
> + *
> + * This is way over-engineered. It tries to support fully configurable
> + * events for each user. Such level of flexibility is completely
> + * unnecessary especially in the light of the planned unified hierarchy.
> + *
> + * Please deprecate this and replace with something simpler if at all
> + * possible.
> + */
> +
> +/*
> + * Unregister event and free resources.
> + *
> + * Gets called from workqueue.
> + */
> +static void memcg_event_remove(struct work_struct *work)
> +{
> + struct mem_cgroup_event *event =
> + container_of(work, struct mem_cgroup_event, remove);
> + struct mem_cgroup *memcg = event->memcg;
> +
> + remove_wait_queue(event->wqh, &event->wait);
> +
> + event->unregister_event(memcg, event->eventfd);
> +
> + /* Notify userspace the event is going away. */
> + eventfd_signal(event->eventfd);
> +
> + eventfd_ctx_put(event->eventfd);
> + kfree(event);
> + css_put(&memcg->css);
> +}
> +
> +/*
> + * Gets called on EPOLLHUP on eventfd when user closes it.
> + *
> + * Called with wqh->lock held and interrupts disabled.
> + */
> +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
> + int sync, void *key)
> +{
> + struct mem_cgroup_event *event =
> + container_of(wait, struct mem_cgroup_event, wait);
> + struct mem_cgroup *memcg = event->memcg;
> + __poll_t flags = key_to_poll(key);
> +
> + if (flags & EPOLLHUP) {
> + /*
> + * If the event has been detached at cgroup removal, we
> + * can simply return knowing the other side will cleanup
> + * for us.
> + *
> + * We can't race against event freeing since the other
> + * side will require wqh->lock via remove_wait_queue(),
> + * which we hold.
> + */
> + spin_lock(&memcg->event_list_lock);
> + if (!list_empty(&event->list)) {
> + list_del_init(&event->list);
> + /*
> + * We are in atomic context, but cgroup_event_remove()
> + * may sleep, so we have to call it in workqueue.
> + */
> + schedule_work(&event->remove);
> + }
> + spin_unlock(&memcg->event_list_lock);
> + }
> +
> + return 0;
> +}
> +
> +static void memcg_event_ptable_queue_proc(struct file *file,
> + wait_queue_head_t *wqh, poll_table *pt)
> +{
> + struct mem_cgroup_event *event =
> + container_of(pt, struct mem_cgroup_event, pt);
> +
> + event->wqh = wqh;
> + add_wait_queue(wqh, &event->wait);
> +}
> +
> +/*
> + * DO NOT USE IN NEW FILES.
> + *
> + * Parse input and register new cgroup event handler.
> + *
> + * Input must be in format '<event_fd> <control_fd> <args>'.
> + * Interpretation of args is defined by control file implementation.
> + */
> +ssize_t memcg_write_event_control(struct kernfs_open_file *of,
> + char *buf, size_t nbytes, loff_t off)
> +{
> + struct cgroup_subsys_state *css = of_css(of);
> + struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> + struct mem_cgroup_event *event;
> + struct cgroup_subsys_state *cfile_css;
> + unsigned int efd, cfd;
> + struct fd efile;
> + struct fd cfile;
> + struct dentry *cdentry;
> + const char *name;
> + char *endp;
> + int ret;
> +
> + if (IS_ENABLED(CONFIG_PREEMPT_RT))
> + return -EOPNOTSUPP;
> +
> + buf = strstrip(buf);
> +
> + efd = simple_strtoul(buf, &endp, 10);
> + if (*endp != ' ')
> + return -EINVAL;
> + buf = endp + 1;
> +
> + cfd = simple_strtoul(buf, &endp, 10);
> + if ((*endp != ' ') && (*endp != '\0'))
> + return -EINVAL;
> + buf = endp + 1;
> +
> + event = kzalloc(sizeof(*event), GFP_KERNEL);
> + if (!event)
> + return -ENOMEM;
> +
> + event->memcg = memcg;
> + INIT_LIST_HEAD(&event->list);
> + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
> + init_waitqueue_func_entry(&event->wait, memcg_event_wake);
> + INIT_WORK(&event->remove, memcg_event_remove);
> +
> + efile = fdget(efd);
> + if (!efile.file) {
> + ret = -EBADF;
> + goto out_kfree;
> + }
> +
> + event->eventfd = eventfd_ctx_fileget(efile.file);
> + if (IS_ERR(event->eventfd)) {
> + ret = PTR_ERR(event->eventfd);
> + goto out_put_efile;
> + }
> +
> + cfile = fdget(cfd);
> + if (!cfile.file) {
> + ret = -EBADF;
> + goto out_put_eventfd;
> + }
> +
> + /* the process need read permission on control file */
> + /* AV: shouldn't we check that it's been opened for read instead? */
> + ret = file_permission(cfile.file, MAY_READ);
> + if (ret < 0)
> + goto out_put_cfile;
> +
> + /*
> + * The control file must be a regular cgroup1 file. As a regular cgroup
> + * file can't be renamed, it's safe to access its name afterwards.
> + */
> + cdentry = cfile.file->f_path.dentry;
> + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
> + ret = -EINVAL;
> + goto out_put_cfile;
> + }
> +
> + /*
> + * Determine the event callbacks and set them in @event. This used
> + * to be done via struct cftype but cgroup core no longer knows
> + * about these events. The following is crude but the whole thing
> + * is for compatibility anyway.
> + *
> + * DO NOT ADD NEW FILES.
> + */
> + name = cdentry->d_name.name;
> +
> + if (!strcmp(name, "memory.usage_in_bytes")) {
> + event->register_event = mem_cgroup_usage_register_event;
> + event->unregister_event = mem_cgroup_usage_unregister_event;
> + } else if (!strcmp(name, "memory.oom_control")) {
> + event->register_event = mem_cgroup_oom_register_event;
> + event->unregister_event = mem_cgroup_oom_unregister_event;
> + } else if (!strcmp(name, "memory.pressure_level")) {
> + event->register_event = vmpressure_register_event;
> + event->unregister_event = vmpressure_unregister_event;
> + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
> + event->register_event = memsw_cgroup_usage_register_event;
> + event->unregister_event = memsw_cgroup_usage_unregister_event;
> + } else {
> + ret = -EINVAL;
> + goto out_put_cfile;
> + }
> +
> + /*
> + * Verify @cfile should belong to @css. Also, remaining events are
> + * automatically removed on cgroup destruction but the removal is
> + * asynchronous, so take an extra ref on @css.
> + */
> + cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
> + &memory_cgrp_subsys);
> + ret = -EINVAL;
> + if (IS_ERR(cfile_css))
> + goto out_put_cfile;
> + if (cfile_css != css) {
> + css_put(cfile_css);
> + goto out_put_cfile;
> + }
> +
> + ret = event->register_event(memcg, event->eventfd, buf);
> + if (ret)
> + goto out_put_css;
> +
> + vfs_poll(efile.file, &event->pt);
> +
> + spin_lock_irq(&memcg->event_list_lock);
> + list_add(&event->list, &memcg->event_list);
> + spin_unlock_irq(&memcg->event_list_lock);
> +
> + fdput(cfile);
> + fdput(efile);
> +
> + return nbytes;
> +
> +out_put_css:
> + css_put(css);
> +out_put_cfile:
> + fdput(cfile);
> +out_put_eventfd:
> + eventfd_ctx_put(event->eventfd);
> +out_put_efile:
> + fdput(efile);
> +out_kfree:
> + kfree(event);
> +
> + return ret;
> +}
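
As an aside for anyone reading this in the archive who has not used the
v1 interface being moved here: the '<event_fd> <control_fd> <args>'
format parsed above is driven from userspace roughly like the untested
sketch below (the cgroup path and the 4M threshold are made up for
illustration, error handling omitted). The write lands in
memcg_write_event_control() and, for memory.usage_in_bytes, ends up in
__mem_cgroup_usage_register_event().

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	uint64_t cnt;
	/* Assumed v1 mount point and group name. */
	int efd = eventfd(0, 0);
	int cfd = open("/sys/fs/cgroup/memory/mygroup/memory.usage_in_bytes",
		       O_RDONLY);
	int ctl = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control",
		       O_WRONLY);

	/* "<event_fd> <control_fd> <args>", here a 4M usage threshold. */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 4ULL << 20);
	write(ctl, buf, strlen(buf));

	/* Blocks until __mem_cgroup_threshold() signals the eventfd. */
	read(efd, &cnt, sizeof(cnt));
	return 0;
}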
> +
> +void memcg1_css_offline(struct mem_cgroup *memcg)
> +{
> + struct mem_cgroup_event *event, *tmp;
> +
> + /*
> + * Unregister events and notify userspace.
> + * Notify userspace about cgroup removing only after rmdir of cgroup
> + * directory to avoid race between userspace and kernelspace.
> + */
> + spin_lock_irq(&memcg->event_list_lock);
> + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
> + list_del_init(&event->list);
> + schedule_work(&event->remove);
> + }
> + spin_unlock_irq(&memcg->event_list_lock);
> +}
> +
> static int __init memcg1_init(void)
> {
> int node;
> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
> index d377c0be9880..524a2c76ffc9 100644
> --- a/mm/memcontrol-v1.h
> +++ b/mm/memcontrol-v1.h
> @@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
> int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
> struct cftype *cft, u64 val);
>
> +/*
> + * Per memcg event counter is incremented at every pagein/pageout. With THP,
> + * it will be incremented by the number of pages. This counter is used
> + * to trigger some periodic events. This is straightforward and better
> + * than using jiffies etc. to handle periodic memcg event.
> + */
> +enum mem_cgroup_events_target {
> + MEM_CGROUP_TARGET_THRESH,
> + MEM_CGROUP_TARGET_SOFTLIMIT,
> + MEM_CGROUP_NTARGETS,
> +};
> +
> +/* Whether legacy memory+swap accounting is active */
> +static bool do_memsw_account(void)
> +{
> + return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
> +}
> +
> +/*
> + * Iteration constructs for visiting all cgroups (under a tree). If
> + * loops are exited prematurely (break), mem_cgroup_iter_break() must
> + * be used for reference counting.
> + */
> +#define for_each_mem_cgroup_tree(iter, root) \
> + for (iter = mem_cgroup_iter(root, NULL, NULL); \
> + iter != NULL; \
> + iter = mem_cgroup_iter(root, iter, NULL))
> +
> +#define for_each_mem_cgroup(iter) \
> + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
> + iter != NULL; \
> + iter = mem_cgroup_iter(NULL, iter, NULL))
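
And just to spell out the reference-counting rule from the comment above
as a pattern (should_stop() is a made-up predicate, purely illustrative):

static void walk_until_match(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (should_stop(iter)) {
			/* Drop the css reference the iterator still holds. */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
}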
> +
> +void memcg1_css_offline(struct mem_cgroup *memcg);
> +
> +/* for encoding cft->private value on file */
> +enum res_type {
> + _MEM,
> + _MEMSWAP,
> + _KMEM,
> + _TCP,
> +};
> +
> +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> + enum mem_cgroup_events_target target);
> +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
> +void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
> +ssize_t memcg_write_event_control(struct kernfs_open_file *of,
> + char *buf, size_t nbytes, loff_t off);
> +
> +
> #endif /* __MM_MEMCONTROL_V1_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index da2c0fa0de1b..bd4b26a73596 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -46,9 +46,6 @@
> #include <linux/slab.h>
> #include <linux/swapops.h>
> #include <linux/spinlock.h>
> -#include <linux/eventfd.h>
> -#include <linux/poll.h>
> -#include <linux/sort.h>
> #include <linux/fs.h>
> #include <linux/seq_file.h>
> #include <linux/parser.h>
> @@ -59,7 +56,6 @@
> #include <linux/cpu.h>
> #include <linux/oom.h>
> #include <linux/lockdep.h>
> -#include <linux/file.h>
> #include <linux/resume_user_mode.h>
> #include <linux/psi.h>
> #include <linux/seq_buf.h>
> @@ -97,91 +93,13 @@ static bool cgroup_memory_nobpf __ro_after_init;
> static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
> #endif
>
> -/* Whether legacy memory+swap accounting is active */
> -static bool do_memsw_account(void)
> -{
> - return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
> -}
> -
> #define THRESHOLDS_EVENTS_TARGET 128
> #define SOFTLIMIT_EVENTS_TARGET 1024
>
> -/* for OOM */
> -struct mem_cgroup_eventfd_list {
> - struct list_head list;
> - struct eventfd_ctx *eventfd;
> -};
> -
> -/*
> - * cgroup_event represents events which userspace want to receive.
> - */
> -struct mem_cgroup_event {
> - /*
> - * memcg which the event belongs to.
> - */
> - struct mem_cgroup *memcg;
> - /*
> - * eventfd to signal userspace about the event.
> - */
> - struct eventfd_ctx *eventfd;
> - /*
> - * Each of these stored in a list by the cgroup.
> - */
> - struct list_head list;
> - /*
> - * register_event() callback will be used to add new userspace
> - * waiter for changes related to this event. Use eventfd_signal()
> - * on eventfd to send notification to userspace.
> - */
> - int (*register_event)(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd, const char *args);
> - /*
> - * unregister_event() callback will be called when userspace closes
> - * the eventfd or on cgroup removing. This callback must be set,
> - * if you want provide notification functionality.
> - */
> - void (*unregister_event)(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd);
> - /*
> - * All fields below needed to unregister event when
> - * userspace closes eventfd.
> - */
> - poll_table pt;
> - wait_queue_head_t *wqh;
> - wait_queue_entry_t wait;
> - struct work_struct remove;
> -};
> -
> -static void mem_cgroup_threshold(struct mem_cgroup *memcg);
> -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
> -
> -/* for encoding cft->private value on file */
> -enum res_type {
> - _MEM,
> - _MEMSWAP,
> - _KMEM,
> - _TCP,
> -};
> -
> #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
> #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
> #define MEMFILE_ATTR(val) ((val) & 0xffff)
>
> -/*
> - * Iteration constructs for visiting all cgroups (under a tree). If
> - * loops are exited prematurely (break), mem_cgroup_iter_break() must
> - * be used for reference counting.
> - */
> -#define for_each_mem_cgroup_tree(iter, root) \
> - for (iter = mem_cgroup_iter(root, NULL, NULL); \
> - iter != NULL; \
> - iter = mem_cgroup_iter(root, iter, NULL))
> -
> -#define for_each_mem_cgroup(iter) \
> - for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
> - iter != NULL; \
> - iter = mem_cgroup_iter(NULL, iter, NULL))
> -
> static inline bool task_is_dying(void)
> {
> return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
> @@ -940,8 +858,8 @@ void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
> __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
> }
>
> -static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> - enum mem_cgroup_events_target target)
> +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> + enum mem_cgroup_events_target target)
> {
> unsigned long val, next;
>
> @@ -965,28 +883,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> return false;
> }
>
> -/*
> - * Check events in order.
> - *
> - */
> -void memcg_check_events(struct mem_cgroup *memcg, int nid)
> -{
> - if (IS_ENABLED(CONFIG_PREEMPT_RT))
> - return;
> -
> - /* threshold event is triggered in finer grain than soft limit */
> - if (unlikely(mem_cgroup_event_ratelimit(memcg,
> - MEM_CGROUP_TARGET_THRESH))) {
> - bool do_softlimit;
> -
> - do_softlimit = mem_cgroup_event_ratelimit(memcg,
> - MEM_CGROUP_TARGET_SOFTLIMIT);
> - mem_cgroup_threshold(memcg);
> - if (unlikely(do_softlimit))
> - memcg1_update_tree(memcg, nid);
> - }
> -}
> -
> struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
> {
> /*
> @@ -1726,7 +1622,7 @@ static struct lockdep_map memcg_oom_lock_dep_map = {
> };
> #endif
>
> -static DEFINE_SPINLOCK(memcg_oom_lock);
> +DEFINE_SPINLOCK(memcg_oom_lock);
>
> /*
> * Check OOM-Killer is already running under our hierarchy.
> @@ -3545,7 +3441,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
> return -EINVAL;
> }
>
> -static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
> +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
> {
> unsigned long val;
>
> @@ -4046,331 +3942,6 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
> return 0;
> }
>
> -static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
> -{
> - struct mem_cgroup_threshold_ary *t;
> - unsigned long usage;
> - int i;
> -
> - rcu_read_lock();
> - if (!swap)
> - t = rcu_dereference(memcg->thresholds.primary);
> - else
> - t = rcu_dereference(memcg->memsw_thresholds.primary);
> -
> - if (!t)
> - goto unlock;
> -
> - usage = mem_cgroup_usage(memcg, swap);
> -
> - /*
> - * current_threshold points to threshold just below or equal to usage.
> - * If it's not true, a threshold was crossed after last
> - * call of __mem_cgroup_threshold().
> - */
> - i = t->current_threshold;
> -
> - /*
> - * Iterate backward over array of thresholds starting from
> - * current_threshold and check if a threshold is crossed.
> - * If none of thresholds below usage is crossed, we read
> - * only one element of the array here.
> - */
> - for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
> - eventfd_signal(t->entries[i].eventfd);
> -
> - /* i = current_threshold + 1 */
> - i++;
> -
> - /*
> - * Iterate forward over array of thresholds starting from
> - * current_threshold+1 and check if a threshold is crossed.
> - * If none of thresholds above usage is crossed, we read
> - * only one element of the array here.
> - */
> - for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
> - eventfd_signal(t->entries[i].eventfd);
> -
> - /* Update current_threshold */
> - t->current_threshold = i - 1;
> -unlock:
> - rcu_read_unlock();
> -}
> -
> -static void mem_cgroup_threshold(struct mem_cgroup *memcg)
> -{
> - while (memcg) {
> - __mem_cgroup_threshold(memcg, false);
> - if (do_memsw_account())
> - __mem_cgroup_threshold(memcg, true);
> -
> - memcg = parent_mem_cgroup(memcg);
> - }
> -}
> -
> -static int compare_thresholds(const void *a, const void *b)
> -{
> - const struct mem_cgroup_threshold *_a = a;
> - const struct mem_cgroup_threshold *_b = b;
> -
> - if (_a->threshold > _b->threshold)
> - return 1;
> -
> - if (_a->threshold < _b->threshold)
> - return -1;
> -
> - return 0;
> -}
> -
> -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
> -{
> - struct mem_cgroup_eventfd_list *ev;
> -
> - spin_lock(&memcg_oom_lock);
> -
> - list_for_each_entry(ev, &memcg->oom_notify, list)
> - eventfd_signal(ev->eventfd);
> -
> - spin_unlock(&memcg_oom_lock);
> - return 0;
> -}
> -
> -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
> -{
> - struct mem_cgroup *iter;
> -
> - for_each_mem_cgroup_tree(iter, memcg)
> - mem_cgroup_oom_notify_cb(iter);
> -}
> -
> -static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd, const char *args, enum res_type type)
> -{
> - struct mem_cgroup_thresholds *thresholds;
> - struct mem_cgroup_threshold_ary *new;
> - unsigned long threshold;
> - unsigned long usage;
> - int i, size, ret;
> -
> - ret = page_counter_memparse(args, "-1", &threshold);
> - if (ret)
> - return ret;
> -
> - mutex_lock(&memcg->thresholds_lock);
> -
> - if (type == _MEM) {
> - thresholds = &memcg->thresholds;
> - usage = mem_cgroup_usage(memcg, false);
> - } else if (type == _MEMSWAP) {
> - thresholds = &memcg->memsw_thresholds;
> - usage = mem_cgroup_usage(memcg, true);
> - } else
> - BUG();
> -
> - /* Check if a threshold crossed before adding a new one */
> - if (thresholds->primary)
> - __mem_cgroup_threshold(memcg, type == _MEMSWAP);
> -
> - size = thresholds->primary ? thresholds->primary->size + 1 : 1;
> -
> - /* Allocate memory for new array of thresholds */
> - new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
> - if (!new) {
> - ret = -ENOMEM;
> - goto unlock;
> - }
> - new->size = size;
> -
> - /* Copy thresholds (if any) to new array */
> - if (thresholds->primary)
> - memcpy(new->entries, thresholds->primary->entries,
> - flex_array_size(new, entries, size - 1));
> -
> - /* Add new threshold */
> - new->entries[size - 1].eventfd = eventfd;
> - new->entries[size - 1].threshold = threshold;
> -
> - /* Sort thresholds. Registering of new threshold isn't time-critical */
> - sort(new->entries, size, sizeof(*new->entries),
> - compare_thresholds, NULL);
> -
> - /* Find current threshold */
> - new->current_threshold = -1;
> - for (i = 0; i < size; i++) {
> - if (new->entries[i].threshold <= usage) {
> - /*
> - * new->current_threshold will not be used until
> - * rcu_assign_pointer(), so it's safe to increment
> - * it here.
> - */
> - ++new->current_threshold;
> - } else
> - break;
> - }
> -
> - /* Free old spare buffer and save old primary buffer as spare */
> - kfree(thresholds->spare);
> - thresholds->spare = thresholds->primary;
> -
> - rcu_assign_pointer(thresholds->primary, new);
> -
> - /* To be sure that nobody uses thresholds */
> - synchronize_rcu();
> -
> -unlock:
> - mutex_unlock(&memcg->thresholds_lock);
> -
> - return ret;
> -}
> -
> -static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd, const char *args)
> -{
> - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
> -}
> -
> -static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd, const char *args)
> -{
> - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
> -}
> -
> -static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd, enum res_type type)
> -{
> - struct mem_cgroup_thresholds *thresholds;
> - struct mem_cgroup_threshold_ary *new;
> - unsigned long usage;
> - int i, j, size, entries;
> -
> - mutex_lock(&memcg->thresholds_lock);
> -
> - if (type == _MEM) {
> - thresholds = &memcg->thresholds;
> - usage = mem_cgroup_usage(memcg, false);
> - } else if (type == _MEMSWAP) {
> - thresholds = &memcg->memsw_thresholds;
> - usage = mem_cgroup_usage(memcg, true);
> - } else
> - BUG();
> -
> - if (!thresholds->primary)
> - goto unlock;
> -
> - /* Check if a threshold crossed before removing */
> - __mem_cgroup_threshold(memcg, type == _MEMSWAP);
> -
> - /* Calculate new number of threshold */
> - size = entries = 0;
> - for (i = 0; i < thresholds->primary->size; i++) {
> - if (thresholds->primary->entries[i].eventfd != eventfd)
> - size++;
> - else
> - entries++;
> - }
> -
> - new = thresholds->spare;
> -
> - /* If no items related to eventfd have been cleared, nothing to do */
> - if (!entries)
> - goto unlock;
> -
> - /* Set thresholds array to NULL if we don't have thresholds */
> - if (!size) {
> - kfree(new);
> - new = NULL;
> - goto swap_buffers;
> - }
> -
> - new->size = size;
> -
> - /* Copy thresholds and find current threshold */
> - new->current_threshold = -1;
> - for (i = 0, j = 0; i < thresholds->primary->size; i++) {
> - if (thresholds->primary->entries[i].eventfd == eventfd)
> - continue;
> -
> - new->entries[j] = thresholds->primary->entries[i];
> - if (new->entries[j].threshold <= usage) {
> - /*
> - * new->current_threshold will not be used
> - * until rcu_assign_pointer(), so it's safe to increment
> - * it here.
> - */
> - ++new->current_threshold;
> - }
> - j++;
> - }
> -
> -swap_buffers:
> - /* Swap primary and spare array */
> - thresholds->spare = thresholds->primary;
> -
> - rcu_assign_pointer(thresholds->primary, new);
> -
> - /* To be sure that nobody uses thresholds */
> - synchronize_rcu();
> -
> - /* If all events are unregistered, free the spare array */
> - if (!new) {
> - kfree(thresholds->spare);
> - thresholds->spare = NULL;
> - }
> -unlock:
> - mutex_unlock(&memcg->thresholds_lock);
> -}
> -
> -static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd)
> -{
> - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
> -}
> -
> -static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd)
> -{
> - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
> -}
> -
> -static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd, const char *args)
> -{
> - struct mem_cgroup_eventfd_list *event;
> -
> - event = kmalloc(sizeof(*event), GFP_KERNEL);
> - if (!event)
> - return -ENOMEM;
> -
> - spin_lock(&memcg_oom_lock);
> -
> - event->eventfd = eventfd;
> - list_add(&event->list, &memcg->oom_notify);
> -
> - /* already in OOM ? */
> - if (memcg->under_oom)
> - eventfd_signal(eventfd);
> - spin_unlock(&memcg_oom_lock);
> -
> - return 0;
> -}
> -
> -static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
> - struct eventfd_ctx *eventfd)
> -{
> - struct mem_cgroup_eventfd_list *ev, *tmp;
> -
> - spin_lock(&memcg_oom_lock);
> -
> - list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
> - if (ev->eventfd == eventfd) {
> - list_del(&ev->list);
> - kfree(ev);
> - }
> - }
> -
> - spin_unlock(&memcg_oom_lock);
> -}
> -
> static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
> @@ -4611,243 +4182,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
>
> #endif /* CONFIG_CGROUP_WRITEBACK */
>
> -/*
> - * DO NOT USE IN NEW FILES.
> - *
> - * "cgroup.event_control" implementation.
> - *
> - * This is way over-engineered. It tries to support fully configurable
> - * events for each user. Such level of flexibility is completely
> - * unnecessary especially in the light of the planned unified hierarchy.
> - *
> - * Please deprecate this and replace with something simpler if at all
> - * possible.
> - */
> -
> -/*
> - * Unregister event and free resources.
> - *
> - * Gets called from workqueue.
> - */
> -static void memcg_event_remove(struct work_struct *work)
> -{
> - struct mem_cgroup_event *event =
> - container_of(work, struct mem_cgroup_event, remove);
> - struct mem_cgroup *memcg = event->memcg;
> -
> - remove_wait_queue(event->wqh, &event->wait);
> -
> - event->unregister_event(memcg, event->eventfd);
> -
> - /* Notify userspace the event is going away. */
> - eventfd_signal(event->eventfd);
> -
> - eventfd_ctx_put(event->eventfd);
> - kfree(event);
> - css_put(&memcg->css);
> -}
> -
> -/*
> - * Gets called on EPOLLHUP on eventfd when user closes it.
> - *
> - * Called with wqh->lock held and interrupts disabled.
> - */
> -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
> - int sync, void *key)
> -{
> - struct mem_cgroup_event *event =
> - container_of(wait, struct mem_cgroup_event, wait);
> - struct mem_cgroup *memcg = event->memcg;
> - __poll_t flags = key_to_poll(key);
> -
> - if (flags & EPOLLHUP) {
> - /*
> - * If the event has been detached at cgroup removal, we
> - * can simply return knowing the other side will cleanup
> - * for us.
> - *
> - * We can't race against event freeing since the other
> - * side will require wqh->lock via remove_wait_queue(),
> - * which we hold.
> - */
> - spin_lock(&memcg->event_list_lock);
> - if (!list_empty(&event->list)) {
> - list_del_init(&event->list);
> - /*
> - * We are in atomic context, but cgroup_event_remove()
> - * may sleep, so we have to call it in workqueue.
> - */
> - schedule_work(&event->remove);
> - }
> - spin_unlock(&memcg->event_list_lock);
> - }
> -
> - return 0;
> -}
> -
> -static void memcg_event_ptable_queue_proc(struct file *file,
> - wait_queue_head_t *wqh, poll_table *pt)
> -{
> - struct mem_cgroup_event *event =
> - container_of(pt, struct mem_cgroup_event, pt);
> -
> - event->wqh = wqh;
> - add_wait_queue(wqh, &event->wait);
> -}
> -
> -/*
> - * DO NOT USE IN NEW FILES.
> - *
> - * Parse input and register new cgroup event handler.
> - *
> - * Input must be in format '<event_fd> <control_fd> <args>'.
> - * Interpretation of args is defined by control file implementation.
> - */
> -static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
> - char *buf, size_t nbytes, loff_t off)
> -{
> - struct cgroup_subsys_state *css = of_css(of);
> - struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> - struct mem_cgroup_event *event;
> - struct cgroup_subsys_state *cfile_css;
> - unsigned int efd, cfd;
> - struct fd efile;
> - struct fd cfile;
> - struct dentry *cdentry;
> - const char *name;
> - char *endp;
> - int ret;
> -
> - if (IS_ENABLED(CONFIG_PREEMPT_RT))
> - return -EOPNOTSUPP;
> -
> - buf = strstrip(buf);
> -
> - efd = simple_strtoul(buf, &endp, 10);
> - if (*endp != ' ')
> - return -EINVAL;
> - buf = endp + 1;
> -
> - cfd = simple_strtoul(buf, &endp, 10);
> - if ((*endp != ' ') && (*endp != '\0'))
> - return -EINVAL;
> - buf = endp + 1;
> -
> - event = kzalloc(sizeof(*event), GFP_KERNEL);
> - if (!event)
> - return -ENOMEM;
> -
> - event->memcg = memcg;
> - INIT_LIST_HEAD(&event->list);
> - init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
> - init_waitqueue_func_entry(&event->wait, memcg_event_wake);
> - INIT_WORK(&event->remove, memcg_event_remove);
> -
> - efile = fdget(efd);
> - if (!efile.file) {
> - ret = -EBADF;
> - goto out_kfree;
> - }
> -
> - event->eventfd = eventfd_ctx_fileget(efile.file);
> - if (IS_ERR(event->eventfd)) {
> - ret = PTR_ERR(event->eventfd);
> - goto out_put_efile;
> - }
> -
> - cfile = fdget(cfd);
> - if (!cfile.file) {
> - ret = -EBADF;
> - goto out_put_eventfd;
> - }
> -
> - /* the process need read permission on control file */
> - /* AV: shouldn't we check that it's been opened for read instead? */
> - ret = file_permission(cfile.file, MAY_READ);
> - if (ret < 0)
> - goto out_put_cfile;
> -
> - /*
> - * The control file must be a regular cgroup1 file. As a regular cgroup
> - * file can't be renamed, it's safe to access its name afterwards.
> - */
> - cdentry = cfile.file->f_path.dentry;
> - if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
> - ret = -EINVAL;
> - goto out_put_cfile;
> - }
> -
> - /*
> - * Determine the event callbacks and set them in @event. This used
> - * to be done via struct cftype but cgroup core no longer knows
> - * about these events. The following is crude but the whole thing
> - * is for compatibility anyway.
> - *
> - * DO NOT ADD NEW FILES.
> - */
> - name = cdentry->d_name.name;
> -
> - if (!strcmp(name, "memory.usage_in_bytes")) {
> - event->register_event = mem_cgroup_usage_register_event;
> - event->unregister_event = mem_cgroup_usage_unregister_event;
> - } else if (!strcmp(name, "memory.oom_control")) {
> - event->register_event = mem_cgroup_oom_register_event;
> - event->unregister_event = mem_cgroup_oom_unregister_event;
> - } else if (!strcmp(name, "memory.pressure_level")) {
> - event->register_event = vmpressure_register_event;
> - event->unregister_event = vmpressure_unregister_event;
> - } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
> - event->register_event = memsw_cgroup_usage_register_event;
> - event->unregister_event = memsw_cgroup_usage_unregister_event;
> - } else {
> - ret = -EINVAL;
> - goto out_put_cfile;
> - }
> -
> - /*
> - * Verify @cfile should belong to @css. Also, remaining events are
> - * automatically removed on cgroup destruction but the removal is
> - * asynchronous, so take an extra ref on @css.
> - */
> - cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
> - &memory_cgrp_subsys);
> - ret = -EINVAL;
> - if (IS_ERR(cfile_css))
> - goto out_put_cfile;
> - if (cfile_css != css) {
> - css_put(cfile_css);
> - goto out_put_cfile;
> - }
> -
> - ret = event->register_event(memcg, event->eventfd, buf);
> - if (ret)
> - goto out_put_css;
> -
> - vfs_poll(efile.file, &event->pt);
> -
> - spin_lock_irq(&memcg->event_list_lock);
> - list_add(&event->list, &memcg->event_list);
> - spin_unlock_irq(&memcg->event_list_lock);
> -
> - fdput(cfile);
> - fdput(efile);
> -
> - return nbytes;
> -
> -out_put_css:
> - css_put(css);
> -out_put_cfile:
> - fdput(cfile);
> -out_put_eventfd:
> - eventfd_ctx_put(event->eventfd);
> -out_put_efile:
> - fdput(efile);
> -out_kfree:
> - kfree(event);
> -
> - return ret;
> -}
> -
> #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG)
> static int mem_cgroup_slab_show(struct seq_file *m, void *p)
> {
> @@ -5314,19 +4648,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
> static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> - struct mem_cgroup_event *event, *tmp;
>
> - /*
> - * Unregister events and notify userspace.
> - * Notify userspace about cgroup removing only after rmdir of cgroup
> - * directory to avoid race between userspace and kernelspace.
> - */
> - spin_lock_irq(&memcg->event_list_lock);
> - list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
> - list_del_init(&event->list);
> - schedule_work(&event->remove);
> - }
> - spin_unlock_irq(&memcg->event_list_lock);
> + memcg1_css_offline(memcg);
>
> page_counter_set_min(&memcg->memory, 0);
> page_counter_set_low(&memcg->memory, 0);
> --
> 2.45.2
--
Michal Hocko
SUSE Labs