lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260127024421.494929-8-roman.gushchin@linux.dev>
Date: Mon, 26 Jan 2026 18:44:10 -0800
From: Roman Gushchin <roman.gushchin@...ux.dev>
To: bpf@...r.kernel.org
Cc: Michal Hocko <mhocko@...e.com>,
	Alexei Starovoitov <ast@...nel.org>,
	Matt Bobrowski <mattbobrowski@...gle.com>,
	Shakeel Butt <shakeel.butt@...ux.dev>,
	JP Kobryn <inwardvessel@...il.com>,
	linux-kernel@...r.kernel.org,
	linux-mm@...ck.org,
	Suren Baghdasaryan <surenb@...gle.com>,
	Johannes Weiner <hannes@...xchg.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Roman Gushchin <roman.gushchin@...ux.dev>
Subject: [PATCH bpf-next v3 07/17] mm: introduce BPF OOM struct ops

Introduce a bpf struct ops for implementing custom OOM handling
policies.

It's possible to load one bpf_oom_ops for the system and one
bpf_oom_ops for every memory cgroup. In case of a memcg OOM, the
cgroup tree is traversed from the OOM'ing memcg up to the root and
corresponding BPF OOM handlers are executed until some memory is
freed. If no memory is freed, the kernel OOM killer is invoked.

The struct ops provides the bpf_handle_out_of_memory() callback,
which is expected to return 1 if it was able to free some memory and 0
otherwise. If 1 is returned, the kernel also checks the bpf_memory_freed
field of the oom_control structure, which is expected to be set by
kfuncs suitable for releasing memory (which will be introduced later
in the patch series). If both are set, OOM is considered handled,
otherwise the next OOM handler in the chain is executed: e.g. BPF OOM
attached to the parent cgroup or the kernel OOM killer.

The bpf_handle_out_of_memory() callback program is sleepable to allow
using iterators, e.g. cgroup iterators. The callback receives struct
oom_control as an argument, so it can determine the scope of the OOM
event: whether this is a memcg-wide or system-wide OOM. It also receives
bpf_struct_ops_link as the second argument, so it can detect the
cgroup level at which this specific instance is attached.

The bpf_handle_out_of_memory() callback is executed just before the
kernel victim task selection algorithm, so all heuristics and sysctls
like sysctl_panic_on_oom and
sysctl_oom_kill_allocating_task are respected.

The struct ops has the name field, which allows defining a custom
name for the implemented policy. It's printed in the OOM report
("oom bpf handler: <name>") only if a bpf handler is invoked.

Signed-off-by: Roman Gushchin <roman.gushchin@...ux.dev>
---
 MAINTAINERS                     |   2 +
 include/linux/bpf-cgroup-defs.h |   3 +
 include/linux/bpf.h             |   1 +
 include/linux/bpf_oom.h         |  46 ++++++++
 include/linux/oom.h             |   8 ++
 kernel/bpf/bpf_struct_ops.c     |  12 +-
 mm/Makefile                     |   2 +-
 mm/bpf_oom.c                    | 192 ++++++++++++++++++++++++++++++++
 mm/oom_kill.c                   |  19 ++++
 9 files changed, 282 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/bpf_oom.h
 create mode 100644 mm/bpf_oom.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 491d567f7dc8..53465570c1e5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4807,7 +4807,9 @@ M:	Shakeel Butt <shakeel.butt@...ux.dev>
 L:	bpf@...r.kernel.org
 L:	linux-mm@...ck.org
 S:	Maintained
+F:	include/linux/bpf_oom.h
 F:	mm/bpf_memcontrol.c
+F:	mm/bpf_oom.c
 
 BPF [MISC]
 L:	bpf@...r.kernel.org
diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 6c5e37190dad..52395834ce13 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -74,6 +74,9 @@ struct cgroup_bpf {
 	/* list of bpf struct ops links */
 	struct list_head struct_ops_links;
 
+	/* BPF OOM struct ops link */
+	struct bpf_struct_ops_link __rcu *bpf_oom_link;
+
 	/* reference counter used to detach bpf programs after cgroup removal */
 	struct percpu_ref refcnt;
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 391888eb257c..a5cee5a657b0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3944,6 +3944,7 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog)
 int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
 			   const char **linep, int *nump);
 struct bpf_prog *bpf_prog_find_from_stack(void);
+void *bpf_struct_ops_data(struct bpf_map *map);
 
 int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog);
 int bpf_insn_array_ready(struct bpf_map *map);
diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
new file mode 100644
index 000000000000..c81133145c50
--- /dev/null
+++ b/include/linux/bpf_oom.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_OOM_H
+#define __BPF_OOM_H
+
+struct oom_control;
+
+#define BPF_OOM_NAME_MAX_LEN 64
+
+struct bpf_oom_ops {
+	/**
+	 * @handle_out_of_memory: Out of memory bpf handler, called before
+	 * the in-kernel OOM killer.
+	 * @oc: OOM control structure
+	 * @st_link: struct ops link
+	 *
+	 * Should return 1 if some memory was freed up, otherwise
+	 * the in-kernel OOM killer is invoked.
+	 */
+	int (*handle_out_of_memory)(struct oom_control *oc,
+				    struct bpf_struct_ops_link *st_link);
+
+	/**
+	 * @name: BPF OOM policy name
+	 */
+	char name[BPF_OOM_NAME_MAX_LEN];
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+/**
+ * bpf_handle_oom - handle out of memory condition using bpf
+ * @oc: OOM control structure
+ *
+ * Return: true if some memory was freed.
+ */
+bool bpf_handle_oom(struct oom_control *oc);
+
+#else /* CONFIG_BPF_SYSCALL */
+static inline bool bpf_handle_oom(struct oom_control *oc)
+{
+	return false;
+}
+
+#endif /* CONFIG_BPF_SYSCALL */
+
+#endif /* __BPF_OOM_H */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7b02bc1d0a7e..c2dce336bcb4 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -51,6 +51,14 @@ struct oom_control {
 
 	/* Used to print the constraint info. */
 	enum oom_constraint constraint;
+
+#ifdef CONFIG_BPF_SYSCALL
+	/* Used by the bpf oom implementation to mark the forward progress */
+	bool bpf_memory_freed;
+
+	/* Handler name */
+	const char *bpf_handler_name;
+#endif
 };
 
 extern struct mutex oom_lock;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 2e361e22cfa0..6285a6d56b98 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1009,7 +1009,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 	 * in the tramopline image to finish before releasing
 	 * the trampoline image.
 	 */
-	synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+	synchronize_rcu_mult(call_rcu, call_rcu_tasks, call_rcu_tasks_trace);
 
 	__bpf_struct_ops_map_free(map);
 }
@@ -1226,7 +1226,8 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
 	if (st_link->cgroup)
 		cgroup_bpf_detach_struct_ops(st_link->cgroup, st_link);
 
-	kfree(st_link);
+	synchronize_rcu_tasks_trace();
+	kfree_rcu(st_link, link.rcu);
 }
 
 static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
@@ -1535,3 +1536,10 @@ void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map
 
 	info->btf_vmlinux_id = btf_obj_id(st_map->btf);
 }
+
+void *bpf_struct_ops_data(struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+	return &st_map->kvalue.data;
+}
diff --git a/mm/Makefile b/mm/Makefile
index bf46fe31dc14..e939525ba01b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -107,7 +107,7 @@ ifdef CONFIG_SWAP
 obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif
 ifdef CONFIG_BPF_SYSCALL
-obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
+obj-$(CONFIG_MEMCG) += bpf_memcontrol.o bpf_oom.o
 endif
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_GUP_TEST) += gup_test.o
diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c
new file mode 100644
index 000000000000..ea70be6e2c26
--- /dev/null
+++ b/mm/bpf_oom.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * BPF-driven OOM killer customization
+ *
+ * Author: Roman Gushchin <roman.gushchin@...ux.dev>
+ */
+
+#include <linux/bpf.h>
+#include <linux/oom.h>
+#include <linux/bpf_oom.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
+#include <linux/memcontrol.h>
+#include <linux/uaccess.h>
+
+static int bpf_ops_handle_oom(struct bpf_oom_ops *bpf_oom_ops,
+			      struct bpf_struct_ops_link *st_link,
+			      struct oom_control *oc)
+{
+	int ret;
+
+	oc->bpf_handler_name = &bpf_oom_ops->name[0];
+	oc->bpf_memory_freed = false;
+	pagefault_disable();
+	ret = bpf_oom_ops->handle_out_of_memory(oc, st_link);
+	pagefault_enable();
+	oc->bpf_handler_name = NULL;
+
+	return ret;
+}
+
+bool bpf_handle_oom(struct oom_control *oc)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_oom_ops *bpf_oom_ops;
+	struct mem_cgroup *memcg;
+	struct bpf_map *map;
+	int ret = 0;
+
+	/*
+	 * System-wide OOMs are handled by the struct ops attached
+	 * to the root memory cgroup
+	 */
+	memcg = oc->memcg ? oc->memcg : root_mem_cgroup;
+
+	rcu_read_lock_trace();
+
+	/* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		st_link = rcu_dereference_check(memcg->css.cgroup->bpf.bpf_oom_link,
+						rcu_read_lock_trace_held());
+		if (!st_link)
+			continue;
+
+		map = rcu_dereference_check((st_link->map),
+					    rcu_read_lock_trace_held());
+		if (!map)
+			continue;
+
+		/* Call BPF OOM handler */
+		bpf_oom_ops = bpf_struct_ops_data(map);
+		ret = bpf_ops_handle_oom(bpf_oom_ops, st_link, oc);
+		if (ret && oc->bpf_memory_freed)
+			break;
+		ret = 0;
+	}
+
+	rcu_read_unlock_trace();
+
+	return ret && oc->bpf_memory_freed;
+}
+
+static int __handle_out_of_memory(struct oom_control *oc,
+				  struct bpf_struct_ops_link *st_link)
+{
+	return 0;
+}
+
+static struct bpf_oom_ops __bpf_oom_ops = {
+	.handle_out_of_memory = __handle_out_of_memory,
+};
+
+static const struct bpf_func_proto *
+bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return tracing_prog_func_proto(func_id, prog);
+}
+
+static bool bpf_oom_ops_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					const struct bpf_prog *prog,
+					struct bpf_insn_access_aux *info)
+{
+	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_verifier_ops bpf_oom_verifier_ops = {
+	.get_func_proto = bpf_oom_func_proto,
+	.is_valid_access = bpf_oom_ops_is_valid_access,
+};
+
+static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link = (struct bpf_struct_ops_link *)link;
+	struct cgroup *cgrp;
+
+	/* The link is not yet fully initialized, but cgroup should be set */
+	if (!link)
+		return -EOPNOTSUPP;
+
+	cgrp = st_link->cgroup;
+	if (!cgrp)
+		return -EINVAL;
+
+	if (cmpxchg(&cgrp->bpf.bpf_oom_link, NULL, st_link))
+		return -EEXIST;
+
+	return 0;
+}
+
+static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link = (struct bpf_struct_ops_link *)link;
+	struct cgroup *cgrp;
+
+	if (!link)
+		return;
+
+	cgrp = st_link->cgroup;
+	if (!cgrp)
+		return;
+
+	WARN_ON(cmpxchg(&cgrp->bpf.bpf_oom_link, st_link, NULL) != st_link);
+}
+
+static int bpf_oom_ops_check_member(const struct btf_type *t,
+				    const struct btf_member *member,
+				    const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct bpf_oom_ops, handle_out_of_memory):
+		if (!prog)
+			return -EINVAL;
+		break;
+	}
+
+	return 0;
+}
+
+static int bpf_oom_ops_init_member(const struct btf_type *t,
+				   const struct btf_member *member,
+				   void *kdata, const void *udata)
+{
+	const struct bpf_oom_ops *uops = udata;
+	struct bpf_oom_ops *ops = kdata;
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct bpf_oom_ops, name):
+		if (uops->name[0])
+			strscpy_pad(ops->name, uops->name, sizeof(ops->name));
+		else
+			strscpy_pad(ops->name, "bpf_defined_policy");
+		return 1;
+	}
+	return 0;
+}
+
+static int bpf_oom_ops_init(struct btf *btf)
+{
+	return 0;
+}
+
+static struct bpf_struct_ops bpf_oom_bpf_ops = {
+	.verifier_ops = &bpf_oom_verifier_ops,
+	.reg = bpf_oom_ops_reg,
+	.unreg = bpf_oom_ops_unreg,
+	.check_member = bpf_oom_ops_check_member,
+	.init_member = bpf_oom_ops_init_member,
+	.init = bpf_oom_ops_init,
+	.name = "bpf_oom_ops",
+	.owner = THIS_MODULE,
+	.cfi_stubs = &__bpf_oom_ops
+};
+
+static int __init bpf_oom_struct_ops_init(void)
+{
+	return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops);
+}
+late_initcall(bpf_oom_struct_ops_init);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5eb11fbba704..44bbcf033804 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -45,6 +45,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/cred.h>
 #include <linux/nmi.h>
+#include <linux/bpf_oom.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -246,6 +247,15 @@ static const char * const oom_constraint_text[] = {
 	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
 };
 
+static const char *oom_handler_name(struct oom_control *oc)
+{
+#ifdef CONFIG_BPF_SYSCALL
+	if (oc->bpf_handler_name)
+		return oc->bpf_handler_name;
+#endif
+	return NULL;
+}
+
 /*
  * Determine the type of allocation constraint.
  */
@@ -461,6 +471,8 @@ static void dump_header(struct oom_control *oc)
 	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
 		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
 			current->signal->oom_score_adj);
+	if (oom_handler_name(oc))
+		pr_warn("oom bpf handler: %s\n", oom_handler_name(oc));
 	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
 		pr_warn("COMPACTION is disabled!!!\n");
 
@@ -1168,6 +1180,13 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	/*
+	 * Let bpf handle the OOM first. If it was able to free up some memory,
+	 * bail out. Otherwise fall back to the kernel OOM killer.
+	 */
+	if (bpf_handle_oom(oc))
+		return true;
+
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
-- 
2.52.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ