linux-kernel - Re: [PATCH v2 06/14] mm: memcg: move legacy memcg event code into memcontrol-v1.c

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Znpsy2rISbnT4Ffi@tiehlicka>
Date: Tue, 25 Jun 2024 09:07:55 +0200
From: Michal Hocko <mhocko@...e.com>
To: Roman Gushchin <roman.gushchin@...ux.dev>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
	Johannes Weiner <hannes@...xchg.org>,
	Shakeel Butt <shakeel.butt@...ux.dev>,
	Muchun Song <muchun.song@...ux.dev>, linux-kernel@...r.kernel.org,
	cgroups@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCH v2 06/14] mm: memcg: move legacy memcg event code into
 memcontrol-v1.c

On Mon 24-06-24 17:58:58, Roman Gushchin wrote:
> Cgroup v1's memory controller contains a pretty complicated
> event notifications mechanism which is not used on cgroup v2.
> Let's move the corresponding code into memcontrol-v1.c.
> 
> Please, note, that mem_cgroup_event_ratelimit() remains in
> memcontrol.c, otherwise it would require exporting too many
> details on memcg stats outside of memcontrol.c.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@...ux.dev>

Acked-by: Michal Hocko <mhocko@...e.com>

> ---
>  include/linux/memcontrol.h |  12 -
>  mm/memcontrol-v1.c         | 653 +++++++++++++++++++++++++++++++++++
>  mm/memcontrol-v1.h         |  51 +++
>  mm/memcontrol.c            | 687 +------------------------------------
>  4 files changed, 709 insertions(+), 694 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 83c8327455d8..588179d29849 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -69,18 +69,6 @@ struct mem_cgroup_id {
>  	refcount_t ref;
>  };
>  
> -/*
> - * Per memcg event counter is incremented at every pagein/pageout. With THP,
> - * it will be incremented by the number of pages. This counter is used
> - * to trigger some periodic events. This is straightforward and better
> - * than using jiffies etc. to handle periodic memcg event.
> - */
> -enum mem_cgroup_events_target {
> -	MEM_CGROUP_TARGET_THRESH,
> -	MEM_CGROUP_TARGET_SOFTLIMIT,
> -	MEM_CGROUP_NTARGETS,
> -};
> -
>  struct memcg_vmstats_percpu;
>  struct memcg_vmstats;
>  struct lruvec_stats_percpu;
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index c25e038ac874..4b2290ceace6 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -6,6 +6,10 @@
>  #include <linux/pagewalk.h>
>  #include <linux/backing-dev.h>
>  #include <linux/swap_cgroup.h>
> +#include <linux/eventfd.h>
> +#include <linux/poll.h>
> +#include <linux/sort.h>
> +#include <linux/file.h>
>  
>  #include "internal.h"
>  #include "swap.h"
> @@ -60,6 +64,54 @@ static struct move_charge_struct {
>  	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
>  };
>  
> +/* for OOM */
> +struct mem_cgroup_eventfd_list {
> +	struct list_head list;
> +	struct eventfd_ctx *eventfd;
> +};
> +
> +/*
> + * cgroup_event represents events which userspace want to receive.
> + */
> +struct mem_cgroup_event {
> +	/*
> +	 * memcg which the event belongs to.
> +	 */
> +	struct mem_cgroup *memcg;
> +	/*
> +	 * eventfd to signal userspace about the event.
> +	 */
> +	struct eventfd_ctx *eventfd;
> +	/*
> +	 * Each of these stored in a list by the cgroup.
> +	 */
> +	struct list_head list;
> +	/*
> +	 * register_event() callback will be used to add new userspace
> +	 * waiter for changes related to this event.  Use eventfd_signal()
> +	 * on eventfd to send notification to userspace.
> +	 */
> +	int (*register_event)(struct mem_cgroup *memcg,
> +			      struct eventfd_ctx *eventfd, const char *args);
> +	/*
> +	 * unregister_event() callback will be called when userspace closes
> +	 * the eventfd or on cgroup removing.  This callback must be set,
> +	 * if you want provide notification functionality.
> +	 */
> +	void (*unregister_event)(struct mem_cgroup *memcg,
> +				 struct eventfd_ctx *eventfd);
> +	/*
> +	 * All fields below needed to unregister event when
> +	 * userspace closes eventfd.
> +	 */
> +	poll_table pt;
> +	wait_queue_head_t *wqh;
> +	wait_queue_entry_t wait;
> +	struct work_struct remove;
> +};
> +
> +extern spinlock_t memcg_oom_lock;
> +
>  static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
>  					 struct mem_cgroup_tree_per_node *mctz,
>  					 unsigned long new_usage_in_excess)
> @@ -1306,6 +1358,607 @@ void memcg1_move_task(void)
>  }
>  #endif
>  
> +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
> +{
> +	struct mem_cgroup_threshold_ary *t;
> +	unsigned long usage;
> +	int i;
> +
> +	rcu_read_lock();
> +	if (!swap)
> +		t = rcu_dereference(memcg->thresholds.primary);
> +	else
> +		t = rcu_dereference(memcg->memsw_thresholds.primary);
> +
> +	if (!t)
> +		goto unlock;
> +
> +	usage = mem_cgroup_usage(memcg, swap);
> +
> +	/*
> +	 * current_threshold points to threshold just below or equal to usage.
> +	 * If it's not true, a threshold was crossed after last
> +	 * call of __mem_cgroup_threshold().
> +	 */
> +	i = t->current_threshold;
> +
> +	/*
> +	 * Iterate backward over array of thresholds starting from
> +	 * current_threshold and check if a threshold is crossed.
> +	 * If none of thresholds below usage is crossed, we read
> +	 * only one element of the array here.
> +	 */
> +	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
> +		eventfd_signal(t->entries[i].eventfd);
> +
> +	/* i = current_threshold + 1 */
> +	i++;
> +
> +	/*
> +	 * Iterate forward over array of thresholds starting from
> +	 * current_threshold+1 and check if a threshold is crossed.
> +	 * If none of thresholds above usage is crossed, we read
> +	 * only one element of the array here.
> +	 */
> +	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
> +		eventfd_signal(t->entries[i].eventfd);
> +
> +	/* Update current_threshold */
> +	t->current_threshold = i - 1;
> +unlock:
> +	rcu_read_unlock();
> +}
> +
> +static void mem_cgroup_threshold(struct mem_cgroup *memcg)
> +{
> +	while (memcg) {
> +		__mem_cgroup_threshold(memcg, false);
> +		if (do_memsw_account())
> +			__mem_cgroup_threshold(memcg, true);
> +
> +		memcg = parent_mem_cgroup(memcg);
> +	}
> +}
> +
> +/*
> + * Check events in order.
> + *
> + */
> +void memcg_check_events(struct mem_cgroup *memcg, int nid)
> +{
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +		return;
> +
> +	/* threshold event is triggered in finer grain than soft limit */
> +	if (unlikely(mem_cgroup_event_ratelimit(memcg,
> +						MEM_CGROUP_TARGET_THRESH))) {
> +		bool do_softlimit;
> +
> +		do_softlimit = mem_cgroup_event_ratelimit(memcg,
> +						MEM_CGROUP_TARGET_SOFTLIMIT);
> +		mem_cgroup_threshold(memcg);
> +		if (unlikely(do_softlimit))
> +			memcg1_update_tree(memcg, nid);
> +	}
> +}
> +
> +static int compare_thresholds(const void *a, const void *b)
> +{
> +	const struct mem_cgroup_threshold *_a = a;
> +	const struct mem_cgroup_threshold *_b = b;
> +
> +	if (_a->threshold > _b->threshold)
> +		return 1;
> +
> +	if (_a->threshold < _b->threshold)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup_eventfd_list *ev;
> +
> +	spin_lock(&memcg_oom_lock);
> +
> +	list_for_each_entry(ev, &memcg->oom_notify, list)
> +		eventfd_signal(ev->eventfd);
> +
> +	spin_unlock(&memcg_oom_lock);
> +	return 0;
> +}
> +
> +void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup *iter;
> +
> +	for_each_mem_cgroup_tree(iter, memcg)
> +		mem_cgroup_oom_notify_cb(iter);
> +}
> +
> +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
> +{
> +	struct mem_cgroup_thresholds *thresholds;
> +	struct mem_cgroup_threshold_ary *new;
> +	unsigned long threshold;
> +	unsigned long usage;
> +	int i, size, ret;
> +
> +	ret = page_counter_memparse(args, "-1", &threshold);
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&memcg->thresholds_lock);
> +
> +	if (type == _MEM) {
> +		thresholds = &memcg->thresholds;
> +		usage = mem_cgroup_usage(memcg, false);
> +	} else if (type == _MEMSWAP) {
> +		thresholds = &memcg->memsw_thresholds;
> +		usage = mem_cgroup_usage(memcg, true);
> +	} else
> +		BUG();
> +
> +	/* Check if a threshold crossed before adding a new one */
> +	if (thresholds->primary)
> +		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
> +
> +	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
> +
> +	/* Allocate memory for new array of thresholds */
> +	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
> +	if (!new) {
> +		ret = -ENOMEM;
> +		goto unlock;
> +	}
> +	new->size = size;
> +
> +	/* Copy thresholds (if any) to new array */
> +	if (thresholds->primary)
> +		memcpy(new->entries, thresholds->primary->entries,
> +		       flex_array_size(new, entries, size - 1));
> +
> +	/* Add new threshold */
> +	new->entries[size - 1].eventfd = eventfd;
> +	new->entries[size - 1].threshold = threshold;
> +
> +	/* Sort thresholds. Registering of new threshold isn't time-critical */
> +	sort(new->entries, size, sizeof(*new->entries),
> +			compare_thresholds, NULL);
> +
> +	/* Find current threshold */
> +	new->current_threshold = -1;
> +	for (i = 0; i < size; i++) {
> +		if (new->entries[i].threshold <= usage) {
> +			/*
> +			 * new->current_threshold will not be used until
> +			 * rcu_assign_pointer(), so it's safe to increment
> +			 * it here.
> +			 */
> +			++new->current_threshold;
> +		} else
> +			break;
> +	}
> +
> +	/* Free old spare buffer and save old primary buffer as spare */
> +	kfree(thresholds->spare);
> +	thresholds->spare = thresholds->primary;
> +
> +	rcu_assign_pointer(thresholds->primary, new);
> +
> +	/* To be sure that nobody uses thresholds */
> +	synchronize_rcu();
> +
> +unlock:
> +	mutex_unlock(&memcg->thresholds_lock);
> +
> +	return ret;
> +}
> +
> +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, const char *args)
> +{
> +	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
> +}
> +
> +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, const char *args)
> +{
> +	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
> +}
> +
> +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, enum res_type type)
> +{
> +	struct mem_cgroup_thresholds *thresholds;
> +	struct mem_cgroup_threshold_ary *new;
> +	unsigned long usage;
> +	int i, j, size, entries;
> +
> +	mutex_lock(&memcg->thresholds_lock);
> +
> +	if (type == _MEM) {
> +		thresholds = &memcg->thresholds;
> +		usage = mem_cgroup_usage(memcg, false);
> +	} else if (type == _MEMSWAP) {
> +		thresholds = &memcg->memsw_thresholds;
> +		usage = mem_cgroup_usage(memcg, true);
> +	} else
> +		BUG();
> +
> +	if (!thresholds->primary)
> +		goto unlock;
> +
> +	/* Check if a threshold crossed before removing */
> +	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
> +
> +	/* Calculate new number of threshold */
> +	size = entries = 0;
> +	for (i = 0; i < thresholds->primary->size; i++) {
> +		if (thresholds->primary->entries[i].eventfd != eventfd)
> +			size++;
> +		else
> +			entries++;
> +	}
> +
> +	new = thresholds->spare;
> +
> +	/* If no items related to eventfd have been cleared, nothing to do */
> +	if (!entries)
> +		goto unlock;
> +
> +	/* Set thresholds array to NULL if we don't have thresholds */
> +	if (!size) {
> +		kfree(new);
> +		new = NULL;
> +		goto swap_buffers;
> +	}
> +
> +	new->size = size;
> +
> +	/* Copy thresholds and find current threshold */
> +	new->current_threshold = -1;
> +	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
> +		if (thresholds->primary->entries[i].eventfd == eventfd)
> +			continue;
> +
> +		new->entries[j] = thresholds->primary->entries[i];
> +		if (new->entries[j].threshold <= usage) {
> +			/*
> +			 * new->current_threshold will not be used
> +			 * until rcu_assign_pointer(), so it's safe to increment
> +			 * it here.
> +			 */
> +			++new->current_threshold;
> +		}
> +		j++;
> +	}
> +
> +swap_buffers:
> +	/* Swap primary and spare array */
> +	thresholds->spare = thresholds->primary;
> +
> +	rcu_assign_pointer(thresholds->primary, new);
> +
> +	/* To be sure that nobody uses thresholds */
> +	synchronize_rcu();
> +
> +	/* If all events are unregistered, free the spare array */
> +	if (!new) {
> +		kfree(thresholds->spare);
> +		thresholds->spare = NULL;
> +	}
> +unlock:
> +	mutex_unlock(&memcg->thresholds_lock);
> +}
> +
> +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd)
> +{
> +	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
> +}
> +
> +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd)
> +{
> +	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
> +}
> +
> +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd, const char *args)
> +{
> +	struct mem_cgroup_eventfd_list *event;
> +
> +	event = kmalloc(sizeof(*event),	GFP_KERNEL);
> +	if (!event)
> +		return -ENOMEM;
> +
> +	spin_lock(&memcg_oom_lock);
> +
> +	event->eventfd = eventfd;
> +	list_add(&event->list, &memcg->oom_notify);
> +
> +	/* already in OOM ? */
> +	if (memcg->under_oom)
> +		eventfd_signal(eventfd);
> +	spin_unlock(&memcg_oom_lock);
> +
> +	return 0;
> +}
> +
> +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
> +	struct eventfd_ctx *eventfd)
> +{
> +	struct mem_cgroup_eventfd_list *ev, *tmp;
> +
> +	spin_lock(&memcg_oom_lock);
> +
> +	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
> +		if (ev->eventfd == eventfd) {
> +			list_del(&ev->list);
> +			kfree(ev);
> +		}
> +	}
> +
> +	spin_unlock(&memcg_oom_lock);
> +}
> +
> +/*
> + * DO NOT USE IN NEW FILES.
> + *
> + * "cgroup.event_control" implementation.
> + *
> + * This is way over-engineered.  It tries to support fully configurable
> + * events for each user.  Such level of flexibility is completely
> + * unnecessary especially in the light of the planned unified hierarchy.
> + *
> + * Please deprecate this and replace with something simpler if at all
> + * possible.
> + */
> +
> +/*
> + * Unregister event and free resources.
> + *
> + * Gets called from workqueue.
> + */
> +static void memcg_event_remove(struct work_struct *work)
> +{
> +	struct mem_cgroup_event *event =
> +		container_of(work, struct mem_cgroup_event, remove);
> +	struct mem_cgroup *memcg = event->memcg;
> +
> +	remove_wait_queue(event->wqh, &event->wait);
> +
> +	event->unregister_event(memcg, event->eventfd);
> +
> +	/* Notify userspace the event is going away. */
> +	eventfd_signal(event->eventfd);
> +
> +	eventfd_ctx_put(event->eventfd);
> +	kfree(event);
> +	css_put(&memcg->css);
> +}
> +
> +/*
> + * Gets called on EPOLLHUP on eventfd when user closes it.
> + *
> + * Called with wqh->lock held and interrupts disabled.
> + */
> +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
> +			    int sync, void *key)
> +{
> +	struct mem_cgroup_event *event =
> +		container_of(wait, struct mem_cgroup_event, wait);
> +	struct mem_cgroup *memcg = event->memcg;
> +	__poll_t flags = key_to_poll(key);
> +
> +	if (flags & EPOLLHUP) {
> +		/*
> +		 * If the event has been detached at cgroup removal, we
> +		 * can simply return knowing the other side will cleanup
> +		 * for us.
> +		 *
> +		 * We can't race against event freeing since the other
> +		 * side will require wqh->lock via remove_wait_queue(),
> +		 * which we hold.
> +		 */
> +		spin_lock(&memcg->event_list_lock);
> +		if (!list_empty(&event->list)) {
> +			list_del_init(&event->list);
> +			/*
> +			 * We are in atomic context, but cgroup_event_remove()
> +			 * may sleep, so we have to call it in workqueue.
> +			 */
> +			schedule_work(&event->remove);
> +		}
> +		spin_unlock(&memcg->event_list_lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static void memcg_event_ptable_queue_proc(struct file *file,
> +		wait_queue_head_t *wqh, poll_table *pt)
> +{
> +	struct mem_cgroup_event *event =
> +		container_of(pt, struct mem_cgroup_event, pt);
> +
> +	event->wqh = wqh;
> +	add_wait_queue(wqh, &event->wait);
> +}
> +
> +/*
> + * DO NOT USE IN NEW FILES.
> + *
> + * Parse input and register new cgroup event handler.
> + *
> + * Input must be in format '<event_fd> <control_fd> <args>'.
> + * Interpretation of args is defined by control file implementation.
> + */
> +ssize_t memcg_write_event_control(struct kernfs_open_file *of,
> +				  char *buf, size_t nbytes, loff_t off)
> +{
> +	struct cgroup_subsys_state *css = of_css(of);
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +	struct mem_cgroup_event *event;
> +	struct cgroup_subsys_state *cfile_css;
> +	unsigned int efd, cfd;
> +	struct fd efile;
> +	struct fd cfile;
> +	struct dentry *cdentry;
> +	const char *name;
> +	char *endp;
> +	int ret;
> +
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +		return -EOPNOTSUPP;
> +
> +	buf = strstrip(buf);
> +
> +	efd = simple_strtoul(buf, &endp, 10);
> +	if (*endp != ' ')
> +		return -EINVAL;
> +	buf = endp + 1;
> +
> +	cfd = simple_strtoul(buf, &endp, 10);
> +	if ((*endp != ' ') && (*endp != '\0'))
> +		return -EINVAL;
> +	buf = endp + 1;
> +
> +	event = kzalloc(sizeof(*event), GFP_KERNEL);
> +	if (!event)
> +		return -ENOMEM;
> +
> +	event->memcg = memcg;
> +	INIT_LIST_HEAD(&event->list);
> +	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
> +	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
> +	INIT_WORK(&event->remove, memcg_event_remove);
> +
> +	efile = fdget(efd);
> +	if (!efile.file) {
> +		ret = -EBADF;
> +		goto out_kfree;
> +	}
> +
> +	event->eventfd = eventfd_ctx_fileget(efile.file);
> +	if (IS_ERR(event->eventfd)) {
> +		ret = PTR_ERR(event->eventfd);
> +		goto out_put_efile;
> +	}
> +
> +	cfile = fdget(cfd);
> +	if (!cfile.file) {
> +		ret = -EBADF;
> +		goto out_put_eventfd;
> +	}
> +
> +	/* the process need read permission on control file */
> +	/* AV: shouldn't we check that it's been opened for read instead? */
> +	ret = file_permission(cfile.file, MAY_READ);
> +	if (ret < 0)
> +		goto out_put_cfile;
> +
> +	/*
> +	 * The control file must be a regular cgroup1 file. As a regular cgroup
> +	 * file can't be renamed, it's safe to access its name afterwards.
> +	 */
> +	cdentry = cfile.file->f_path.dentry;
> +	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
> +		ret = -EINVAL;
> +		goto out_put_cfile;
> +	}
> +
> +	/*
> +	 * Determine the event callbacks and set them in @event.  This used
> +	 * to be done via struct cftype but cgroup core no longer knows
> +	 * about these events.  The following is crude but the whole thing
> +	 * is for compatibility anyway.
> +	 *
> +	 * DO NOT ADD NEW FILES.
> +	 */
> +	name = cdentry->d_name.name;
> +
> +	if (!strcmp(name, "memory.usage_in_bytes")) {
> +		event->register_event = mem_cgroup_usage_register_event;
> +		event->unregister_event = mem_cgroup_usage_unregister_event;
> +	} else if (!strcmp(name, "memory.oom_control")) {
> +		event->register_event = mem_cgroup_oom_register_event;
> +		event->unregister_event = mem_cgroup_oom_unregister_event;
> +	} else if (!strcmp(name, "memory.pressure_level")) {
> +		event->register_event = vmpressure_register_event;
> +		event->unregister_event = vmpressure_unregister_event;
> +	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
> +		event->register_event = memsw_cgroup_usage_register_event;
> +		event->unregister_event = memsw_cgroup_usage_unregister_event;
> +	} else {
> +		ret = -EINVAL;
> +		goto out_put_cfile;
> +	}
> +
> +	/*
> +	 * Verify @cfile should belong to @css.  Also, remaining events are
> +	 * automatically removed on cgroup destruction but the removal is
> +	 * asynchronous, so take an extra ref on @css.
> +	 */
> +	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
> +					       &memory_cgrp_subsys);
> +	ret = -EINVAL;
> +	if (IS_ERR(cfile_css))
> +		goto out_put_cfile;
> +	if (cfile_css != css) {
> +		css_put(cfile_css);
> +		goto out_put_cfile;
> +	}
> +
> +	ret = event->register_event(memcg, event->eventfd, buf);
> +	if (ret)
> +		goto out_put_css;
> +
> +	vfs_poll(efile.file, &event->pt);
> +
> +	spin_lock_irq(&memcg->event_list_lock);
> +	list_add(&event->list, &memcg->event_list);
> +	spin_unlock_irq(&memcg->event_list_lock);
> +
> +	fdput(cfile);
> +	fdput(efile);
> +
> +	return nbytes;
> +
> +out_put_css:
> +	css_put(css);
> +out_put_cfile:
> +	fdput(cfile);
> +out_put_eventfd:
> +	eventfd_ctx_put(event->eventfd);
> +out_put_efile:
> +	fdput(efile);
> +out_kfree:
> +	kfree(event);
> +
> +	return ret;
> +}
> +
> +void memcg1_css_offline(struct mem_cgroup *memcg)
> +{
> +	struct mem_cgroup_event *event, *tmp;
> +
> +	/*
> +	 * Unregister events and notify userspace.
> +	 * Notify userspace about cgroup removing only after rmdir of cgroup
> +	 * directory to avoid race between userspace and kernelspace.
> +	 */
> +	spin_lock_irq(&memcg->event_list_lock);
> +	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
> +		list_del_init(&event->list);
> +		schedule_work(&event->remove);
> +	}
> +	spin_unlock_irq(&memcg->event_list_lock);
> +}
> +
>  static int __init memcg1_init(void)
>  {
>  	int node;
> diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
> index d377c0be9880..524a2c76ffc9 100644
> --- a/mm/memcontrol-v1.h
> +++ b/mm/memcontrol-v1.h
> @@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
>  int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
>  				 struct cftype *cft, u64 val);
>  
> +/*
> + * Per memcg event counter is incremented at every pagein/pageout. With THP,
> + * it will be incremented by the number of pages. This counter is used
> + * to trigger some periodic events. This is straightforward and better
> + * than using jiffies etc. to handle periodic memcg event.
> + */
> +enum mem_cgroup_events_target {
> +	MEM_CGROUP_TARGET_THRESH,
> +	MEM_CGROUP_TARGET_SOFTLIMIT,
> +	MEM_CGROUP_NTARGETS,
> +};
> +
> +/* Whether legacy memory+swap accounting is active */
> +static bool do_memsw_account(void)
> +{
> +	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
> +}
> +
> +/*
> + * Iteration constructs for visiting all cgroups (under a tree).  If
> + * loops are exited prematurely (break), mem_cgroup_iter_break() must
> + * be used for reference counting.
> + */
> +#define for_each_mem_cgroup_tree(iter, root)		\
> +	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
> +	     iter != NULL;				\
> +	     iter = mem_cgroup_iter(root, iter, NULL))
> +
> +#define for_each_mem_cgroup(iter)			\
> +	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
> +	     iter != NULL;				\
> +	     iter = mem_cgroup_iter(NULL, iter, NULL))
> +
> +void memcg1_css_offline(struct mem_cgroup *memcg);
> +
> +/* for encoding cft->private value on file */
> +enum res_type {
> +	_MEM,
> +	_MEMSWAP,
> +	_KMEM,
> +	_TCP,
> +};
> +
> +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> +				enum mem_cgroup_events_target target);
> +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
> +void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
> +ssize_t memcg_write_event_control(struct kernfs_open_file *of,
> +				  char *buf, size_t nbytes, loff_t off);
> +
> +
>  #endif	/* __MM_MEMCONTROL_V1_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index da2c0fa0de1b..bd4b26a73596 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -46,9 +46,6 @@
>  #include <linux/slab.h>
>  #include <linux/swapops.h>
>  #include <linux/spinlock.h>
> -#include <linux/eventfd.h>
> -#include <linux/poll.h>
> -#include <linux/sort.h>
>  #include <linux/fs.h>
>  #include <linux/seq_file.h>
>  #include <linux/parser.h>
> @@ -59,7 +56,6 @@
>  #include <linux/cpu.h>
>  #include <linux/oom.h>
>  #include <linux/lockdep.h>
> -#include <linux/file.h>
>  #include <linux/resume_user_mode.h>
>  #include <linux/psi.h>
>  #include <linux/seq_buf.h>
> @@ -97,91 +93,13 @@ static bool cgroup_memory_nobpf __ro_after_init;
>  static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
>  #endif
>  
> -/* Whether legacy memory+swap accounting is active */
> -static bool do_memsw_account(void)
> -{
> -	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
> -}
> -
>  #define THRESHOLDS_EVENTS_TARGET 128
>  #define SOFTLIMIT_EVENTS_TARGET 1024
>  
> -/* for OOM */
> -struct mem_cgroup_eventfd_list {
> -	struct list_head list;
> -	struct eventfd_ctx *eventfd;
> -};
> -
> -/*
> - * cgroup_event represents events which userspace want to receive.
> - */
> -struct mem_cgroup_event {
> -	/*
> -	 * memcg which the event belongs to.
> -	 */
> -	struct mem_cgroup *memcg;
> -	/*
> -	 * eventfd to signal userspace about the event.
> -	 */
> -	struct eventfd_ctx *eventfd;
> -	/*
> -	 * Each of these stored in a list by the cgroup.
> -	 */
> -	struct list_head list;
> -	/*
> -	 * register_event() callback will be used to add new userspace
> -	 * waiter for changes related to this event.  Use eventfd_signal()
> -	 * on eventfd to send notification to userspace.
> -	 */
> -	int (*register_event)(struct mem_cgroup *memcg,
> -			      struct eventfd_ctx *eventfd, const char *args);
> -	/*
> -	 * unregister_event() callback will be called when userspace closes
> -	 * the eventfd or on cgroup removing.  This callback must be set,
> -	 * if you want provide notification functionality.
> -	 */
> -	void (*unregister_event)(struct mem_cgroup *memcg,
> -				 struct eventfd_ctx *eventfd);
> -	/*
> -	 * All fields below needed to unregister event when
> -	 * userspace closes eventfd.
> -	 */
> -	poll_table pt;
> -	wait_queue_head_t *wqh;
> -	wait_queue_entry_t wait;
> -	struct work_struct remove;
> -};
> -
> -static void mem_cgroup_threshold(struct mem_cgroup *memcg);
> -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
> -
> -/* for encoding cft->private value on file */
> -enum res_type {
> -	_MEM,
> -	_MEMSWAP,
> -	_KMEM,
> -	_TCP,
> -};
> -
>  #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
>  #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
>  #define MEMFILE_ATTR(val)	((val) & 0xffff)
>  
> -/*
> - * Iteration constructs for visiting all cgroups (under a tree).  If
> - * loops are exited prematurely (break), mem_cgroup_iter_break() must
> - * be used for reference counting.
> - */
> -#define for_each_mem_cgroup_tree(iter, root)		\
> -	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
> -	     iter != NULL;				\
> -	     iter = mem_cgroup_iter(root, iter, NULL))
> -
> -#define for_each_mem_cgroup(iter)			\
> -	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
> -	     iter != NULL;				\
> -	     iter = mem_cgroup_iter(NULL, iter, NULL))
> -
>  static inline bool task_is_dying(void)
>  {
>  	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
> @@ -940,8 +858,8 @@ void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
>  	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
>  }
>  
> -static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> -				       enum mem_cgroup_events_target target)
> +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> +				enum mem_cgroup_events_target target)
>  {
>  	unsigned long val, next;
>  
> @@ -965,28 +883,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
>  	return false;
>  }
>  
> -/*
> - * Check events in order.
> - *
> - */
> -void memcg_check_events(struct mem_cgroup *memcg, int nid)
> -{
> -	if (IS_ENABLED(CONFIG_PREEMPT_RT))
> -		return;
> -
> -	/* threshold event is triggered in finer grain than soft limit */
> -	if (unlikely(mem_cgroup_event_ratelimit(memcg,
> -						MEM_CGROUP_TARGET_THRESH))) {
> -		bool do_softlimit;
> -
> -		do_softlimit = mem_cgroup_event_ratelimit(memcg,
> -						MEM_CGROUP_TARGET_SOFTLIMIT);
> -		mem_cgroup_threshold(memcg);
> -		if (unlikely(do_softlimit))
> -			memcg1_update_tree(memcg, nid);
> -	}
> -}
> -
>  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
>  {
>  	/*
> @@ -1726,7 +1622,7 @@ static struct lockdep_map memcg_oom_lock_dep_map = {
>  };
>  #endif
>  
> -static DEFINE_SPINLOCK(memcg_oom_lock);
> +DEFINE_SPINLOCK(memcg_oom_lock);
>  
>  /*
>   * Check OOM-Killer is already running under our hierarchy.
> @@ -3545,7 +3441,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
>  	return -EINVAL;
>  }
>  
> -static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
> +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
>  {
>  	unsigned long val;
>  
> @@ -4046,331 +3942,6 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
>  	return 0;
>  }
>  
> -static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
> -{
> -	struct mem_cgroup_threshold_ary *t;
> -	unsigned long usage;
> -	int i;
> -
> -	rcu_read_lock();
> -	if (!swap)
> -		t = rcu_dereference(memcg->thresholds.primary);
> -	else
> -		t = rcu_dereference(memcg->memsw_thresholds.primary);
> -
> -	if (!t)
> -		goto unlock;
> -
> -	usage = mem_cgroup_usage(memcg, swap);
> -
> -	/*
> -	 * current_threshold points to threshold just below or equal to usage.
> -	 * If it's not true, a threshold was crossed after last
> -	 * call of __mem_cgroup_threshold().
> -	 */
> -	i = t->current_threshold;
> -
> -	/*
> -	 * Iterate backward over array of thresholds starting from
> -	 * current_threshold and check if a threshold is crossed.
> -	 * If none of thresholds below usage is crossed, we read
> -	 * only one element of the array here.
> -	 */
> -	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
> -		eventfd_signal(t->entries[i].eventfd);
> -
> -	/* i = current_threshold + 1 */
> -	i++;
> -
> -	/*
> -	 * Iterate forward over array of thresholds starting from
> -	 * current_threshold+1 and check if a threshold is crossed.
> -	 * If none of thresholds above usage is crossed, we read
> -	 * only one element of the array here.
> -	 */
> -	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
> -		eventfd_signal(t->entries[i].eventfd);
> -
> -	/* Update current_threshold */
> -	t->current_threshold = i - 1;
> -unlock:
> -	rcu_read_unlock();
> -}
> -
> -static void mem_cgroup_threshold(struct mem_cgroup *memcg)
> -{
> -	while (memcg) {
> -		__mem_cgroup_threshold(memcg, false);
> -		if (do_memsw_account())
> -			__mem_cgroup_threshold(memcg, true);
> -
> -		memcg = parent_mem_cgroup(memcg);
> -	}
> -}
> -
> -static int compare_thresholds(const void *a, const void *b)
> -{
> -	const struct mem_cgroup_threshold *_a = a;
> -	const struct mem_cgroup_threshold *_b = b;
> -
> -	if (_a->threshold > _b->threshold)
> -		return 1;
> -
> -	if (_a->threshold < _b->threshold)
> -		return -1;
> -
> -	return 0;
> -}
> -
> -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup_eventfd_list *ev;
> -
> -	spin_lock(&memcg_oom_lock);
> -
> -	list_for_each_entry(ev, &memcg->oom_notify, list)
> -		eventfd_signal(ev->eventfd);
> -
> -	spin_unlock(&memcg_oom_lock);
> -	return 0;
> -}
> -
> -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
> -{
> -	struct mem_cgroup *iter;
> -
> -	for_each_mem_cgroup_tree(iter, memcg)
> -		mem_cgroup_oom_notify_cb(iter);
> -}
> -
> -static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
> -{
> -	struct mem_cgroup_thresholds *thresholds;
> -	struct mem_cgroup_threshold_ary *new;
> -	unsigned long threshold;
> -	unsigned long usage;
> -	int i, size, ret;
> -
> -	ret = page_counter_memparse(args, "-1", &threshold);
> -	if (ret)
> -		return ret;
> -
> -	mutex_lock(&memcg->thresholds_lock);
> -
> -	if (type == _MEM) {
> -		thresholds = &memcg->thresholds;
> -		usage = mem_cgroup_usage(memcg, false);
> -	} else if (type == _MEMSWAP) {
> -		thresholds = &memcg->memsw_thresholds;
> -		usage = mem_cgroup_usage(memcg, true);
> -	} else
> -		BUG();
> -
> -	/* Check if a threshold crossed before adding a new one */
> -	if (thresholds->primary)
> -		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
> -
> -	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
> -
> -	/* Allocate memory for new array of thresholds */
> -	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
> -	if (!new) {
> -		ret = -ENOMEM;
> -		goto unlock;
> -	}
> -	new->size = size;
> -
> -	/* Copy thresholds (if any) to new array */
> -	if (thresholds->primary)
> -		memcpy(new->entries, thresholds->primary->entries,
> -		       flex_array_size(new, entries, size - 1));
> -
> -	/* Add new threshold */
> -	new->entries[size - 1].eventfd = eventfd;
> -	new->entries[size - 1].threshold = threshold;
> -
> -	/* Sort thresholds. Registering of new threshold isn't time-critical */
> -	sort(new->entries, size, sizeof(*new->entries),
> -			compare_thresholds, NULL);
> -
> -	/* Find current threshold */
> -	new->current_threshold = -1;
> -	for (i = 0; i < size; i++) {
> -		if (new->entries[i].threshold <= usage) {
> -			/*
> -			 * new->current_threshold will not be used until
> -			 * rcu_assign_pointer(), so it's safe to increment
> -			 * it here.
> -			 */
> -			++new->current_threshold;
> -		} else
> -			break;
> -	}
> -
> -	/* Free old spare buffer and save old primary buffer as spare */
> -	kfree(thresholds->spare);
> -	thresholds->spare = thresholds->primary;
> -
> -	rcu_assign_pointer(thresholds->primary, new);
> -
> -	/* To be sure that nobody uses thresholds */
> -	synchronize_rcu();
> -
> -unlock:
> -	mutex_unlock(&memcg->thresholds_lock);
> -
> -	return ret;
> -}
> -
> -static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, const char *args)
> -{
> -	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
> -}
> -
> -static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, const char *args)
> -{
> -	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
> -}
> -
> -static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, enum res_type type)
> -{
> -	struct mem_cgroup_thresholds *thresholds;
> -	struct mem_cgroup_threshold_ary *new;
> -	unsigned long usage;
> -	int i, j, size, entries;
> -
> -	mutex_lock(&memcg->thresholds_lock);
> -
> -	if (type == _MEM) {
> -		thresholds = &memcg->thresholds;
> -		usage = mem_cgroup_usage(memcg, false);
> -	} else if (type == _MEMSWAP) {
> -		thresholds = &memcg->memsw_thresholds;
> -		usage = mem_cgroup_usage(memcg, true);
> -	} else
> -		BUG();
> -
> -	if (!thresholds->primary)
> -		goto unlock;
> -
> -	/* Check if a threshold crossed before removing */
> -	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
> -
> -	/* Calculate new number of threshold */
> -	size = entries = 0;
> -	for (i = 0; i < thresholds->primary->size; i++) {
> -		if (thresholds->primary->entries[i].eventfd != eventfd)
> -			size++;
> -		else
> -			entries++;
> -	}
> -
> -	new = thresholds->spare;
> -
> -	/* If no items related to eventfd have been cleared, nothing to do */
> -	if (!entries)
> -		goto unlock;
> -
> -	/* Set thresholds array to NULL if we don't have thresholds */
> -	if (!size) {
> -		kfree(new);
> -		new = NULL;
> -		goto swap_buffers;
> -	}
> -
> -	new->size = size;
> -
> -	/* Copy thresholds and find current threshold */
> -	new->current_threshold = -1;
> -	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
> -		if (thresholds->primary->entries[i].eventfd == eventfd)
> -			continue;
> -
> -		new->entries[j] = thresholds->primary->entries[i];
> -		if (new->entries[j].threshold <= usage) {
> -			/*
> -			 * new->current_threshold will not be used
> -			 * until rcu_assign_pointer(), so it's safe to increment
> -			 * it here.
> -			 */
> -			++new->current_threshold;
> -		}
> -		j++;
> -	}
> -
> -swap_buffers:
> -	/* Swap primary and spare array */
> -	thresholds->spare = thresholds->primary;
> -
> -	rcu_assign_pointer(thresholds->primary, new);
> -
> -	/* To be sure that nobody uses thresholds */
> -	synchronize_rcu();
> -
> -	/* If all events are unregistered, free the spare array */
> -	if (!new) {
> -		kfree(thresholds->spare);
> -		thresholds->spare = NULL;
> -	}
> -unlock:
> -	mutex_unlock(&memcg->thresholds_lock);
> -}
> -
> -static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd)
> -{
> -	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
> -}
> -
> -static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd)
> -{
> -	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
> -}
> -
> -static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd, const char *args)
> -{
> -	struct mem_cgroup_eventfd_list *event;
> -
> -	event = kmalloc(sizeof(*event),	GFP_KERNEL);
> -	if (!event)
> -		return -ENOMEM;
> -
> -	spin_lock(&memcg_oom_lock);
> -
> -	event->eventfd = eventfd;
> -	list_add(&event->list, &memcg->oom_notify);
> -
> -	/* already in OOM ? */
> -	if (memcg->under_oom)
> -		eventfd_signal(eventfd);
> -	spin_unlock(&memcg_oom_lock);
> -
> -	return 0;
> -}
> -
> -static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
> -	struct eventfd_ctx *eventfd)
> -{
> -	struct mem_cgroup_eventfd_list *ev, *tmp;
> -
> -	spin_lock(&memcg_oom_lock);
> -
> -	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
> -		if (ev->eventfd == eventfd) {
> -			list_del(&ev->list);
> -			kfree(ev);
> -		}
> -	}
> -
> -	spin_unlock(&memcg_oom_lock);
> -}
> -
>  static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
> @@ -4611,243 +4182,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
>  
>  #endif	/* CONFIG_CGROUP_WRITEBACK */
>  
> -/*
> - * DO NOT USE IN NEW FILES.
> - *
> - * "cgroup.event_control" implementation.
> - *
> - * This is way over-engineered.  It tries to support fully configurable
> - * events for each user.  Such level of flexibility is completely
> - * unnecessary especially in the light of the planned unified hierarchy.
> - *
> - * Please deprecate this and replace with something simpler if at all
> - * possible.
> - */
> -
> -/*
> - * Unregister event and free resources.
> - *
> - * Gets called from workqueue.
> - */
> -static void memcg_event_remove(struct work_struct *work)
> -{
> -	struct mem_cgroup_event *event =
> -		container_of(work, struct mem_cgroup_event, remove);
> -	struct mem_cgroup *memcg = event->memcg;
> -
> -	remove_wait_queue(event->wqh, &event->wait);
> -
> -	event->unregister_event(memcg, event->eventfd);
> -
> -	/* Notify userspace the event is going away. */
> -	eventfd_signal(event->eventfd);
> -
> -	eventfd_ctx_put(event->eventfd);
> -	kfree(event);
> -	css_put(&memcg->css);
> -}
> -
> -/*
> - * Gets called on EPOLLHUP on eventfd when user closes it.
> - *
> - * Called with wqh->lock held and interrupts disabled.
> - */
> -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
> -			    int sync, void *key)
> -{
> -	struct mem_cgroup_event *event =
> -		container_of(wait, struct mem_cgroup_event, wait);
> -	struct mem_cgroup *memcg = event->memcg;
> -	__poll_t flags = key_to_poll(key);
> -
> -	if (flags & EPOLLHUP) {
> -		/*
> -		 * If the event has been detached at cgroup removal, we
> -		 * can simply return knowing the other side will cleanup
> -		 * for us.
> -		 *
> -		 * We can't race against event freeing since the other
> -		 * side will require wqh->lock via remove_wait_queue(),
> -		 * which we hold.
> -		 */
> -		spin_lock(&memcg->event_list_lock);
> -		if (!list_empty(&event->list)) {
> -			list_del_init(&event->list);
> -			/*
> -			 * We are in atomic context, but cgroup_event_remove()
> -			 * may sleep, so we have to call it in workqueue.
> -			 */
> -			schedule_work(&event->remove);
> -		}
> -		spin_unlock(&memcg->event_list_lock);
> -	}
> -
> -	return 0;
> -}
> -
> -static void memcg_event_ptable_queue_proc(struct file *file,
> -		wait_queue_head_t *wqh, poll_table *pt)
> -{
> -	struct mem_cgroup_event *event =
> -		container_of(pt, struct mem_cgroup_event, pt);
> -
> -	event->wqh = wqh;
> -	add_wait_queue(wqh, &event->wait);
> -}
> -
> -/*
> - * DO NOT USE IN NEW FILES.
> - *
> - * Parse input and register new cgroup event handler.
> - *
> - * Input must be in format '<event_fd> <control_fd> <args>'.
> - * Interpretation of args is defined by control file implementation.
> - */
> -static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
> -					 char *buf, size_t nbytes, loff_t off)
> -{
> -	struct cgroup_subsys_state *css = of_css(of);
> -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> -	struct mem_cgroup_event *event;
> -	struct cgroup_subsys_state *cfile_css;
> -	unsigned int efd, cfd;
> -	struct fd efile;
> -	struct fd cfile;
> -	struct dentry *cdentry;
> -	const char *name;
> -	char *endp;
> -	int ret;
> -
> -	if (IS_ENABLED(CONFIG_PREEMPT_RT))
> -		return -EOPNOTSUPP;
> -
> -	buf = strstrip(buf);
> -
> -	efd = simple_strtoul(buf, &endp, 10);
> -	if (*endp != ' ')
> -		return -EINVAL;
> -	buf = endp + 1;
> -
> -	cfd = simple_strtoul(buf, &endp, 10);
> -	if ((*endp != ' ') && (*endp != '\0'))
> -		return -EINVAL;
> -	buf = endp + 1;
> -
> -	event = kzalloc(sizeof(*event), GFP_KERNEL);
> -	if (!event)
> -		return -ENOMEM;
> -
> -	event->memcg = memcg;
> -	INIT_LIST_HEAD(&event->list);
> -	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
> -	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
> -	INIT_WORK(&event->remove, memcg_event_remove);
> -
> -	efile = fdget(efd);
> -	if (!efile.file) {
> -		ret = -EBADF;
> -		goto out_kfree;
> -	}
> -
> -	event->eventfd = eventfd_ctx_fileget(efile.file);
> -	if (IS_ERR(event->eventfd)) {
> -		ret = PTR_ERR(event->eventfd);
> -		goto out_put_efile;
> -	}
> -
> -	cfile = fdget(cfd);
> -	if (!cfile.file) {
> -		ret = -EBADF;
> -		goto out_put_eventfd;
> -	}
> -
> -	/* the process need read permission on control file */
> -	/* AV: shouldn't we check that it's been opened for read instead? */
> -	ret = file_permission(cfile.file, MAY_READ);
> -	if (ret < 0)
> -		goto out_put_cfile;
> -
> -	/*
> -	 * The control file must be a regular cgroup1 file. As a regular cgroup
> -	 * file can't be renamed, it's safe to access its name afterwards.
> -	 */
> -	cdentry = cfile.file->f_path.dentry;
> -	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
> -		ret = -EINVAL;
> -		goto out_put_cfile;
> -	}
> -
> -	/*
> -	 * Determine the event callbacks and set them in @event.  This used
> -	 * to be done via struct cftype but cgroup core no longer knows
> -	 * about these events.  The following is crude but the whole thing
> -	 * is for compatibility anyway.
> -	 *
> -	 * DO NOT ADD NEW FILES.
> -	 */
> -	name = cdentry->d_name.name;
> -
> -	if (!strcmp(name, "memory.usage_in_bytes")) {
> -		event->register_event = mem_cgroup_usage_register_event;
> -		event->unregister_event = mem_cgroup_usage_unregister_event;
> -	} else if (!strcmp(name, "memory.oom_control")) {
> -		event->register_event = mem_cgroup_oom_register_event;
> -		event->unregister_event = mem_cgroup_oom_unregister_event;
> -	} else if (!strcmp(name, "memory.pressure_level")) {
> -		event->register_event = vmpressure_register_event;
> -		event->unregister_event = vmpressure_unregister_event;
> -	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
> -		event->register_event = memsw_cgroup_usage_register_event;
> -		event->unregister_event = memsw_cgroup_usage_unregister_event;
> -	} else {
> -		ret = -EINVAL;
> -		goto out_put_cfile;
> -	}
> -
> -	/*
> -	 * Verify @cfile should belong to @css.  Also, remaining events are
> -	 * automatically removed on cgroup destruction but the removal is
> -	 * asynchronous, so take an extra ref on @css.
> -	 */
> -	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
> -					       &memory_cgrp_subsys);
> -	ret = -EINVAL;
> -	if (IS_ERR(cfile_css))
> -		goto out_put_cfile;
> -	if (cfile_css != css) {
> -		css_put(cfile_css);
> -		goto out_put_cfile;
> -	}
> -
> -	ret = event->register_event(memcg, event->eventfd, buf);
> -	if (ret)
> -		goto out_put_css;
> -
> -	vfs_poll(efile.file, &event->pt);
> -
> -	spin_lock_irq(&memcg->event_list_lock);
> -	list_add(&event->list, &memcg->event_list);
> -	spin_unlock_irq(&memcg->event_list_lock);
> -
> -	fdput(cfile);
> -	fdput(efile);
> -
> -	return nbytes;
> -
> -out_put_css:
> -	css_put(css);
> -out_put_cfile:
> -	fdput(cfile);
> -out_put_eventfd:
> -	eventfd_ctx_put(event->eventfd);
> -out_put_efile:
> -	fdput(efile);
> -out_kfree:
> -	kfree(event);
> -
> -	return ret;
> -}
> -
>  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG)
>  static int mem_cgroup_slab_show(struct seq_file *m, void *p)
>  {
> @@ -5314,19 +4648,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
>  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> -	struct mem_cgroup_event *event, *tmp;
>  
> -	/*
> -	 * Unregister events and notify userspace.
> -	 * Notify userspace about cgroup removing only after rmdir of cgroup
> -	 * directory to avoid race between userspace and kernelspace.
> -	 */
> -	spin_lock_irq(&memcg->event_list_lock);
> -	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
> -		list_del_init(&event->list);
> -		schedule_work(&event->remove);
> -	}
> -	spin_unlock_irq(&memcg->event_list_lock);
> +	memcg1_css_offline(memcg);
>  
>  	page_counter_set_min(&memcg->memory, 0);
>  	page_counter_set_low(&memcg->memory, 0);
> -- 
> 2.45.2

-- 
Michal Hocko
SUSE Labs