[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9f072e53f79ceaea43e3730476494517e453530a.1769157382.git.zhuhui@kylinos.cn>
Date: Fri, 23 Jan 2026 17:00:15 +0800
From: Hui Zhu <hui.zhu@...ux.dev>
To: Andrew Morton <akpm@...ux-foundation.org>,
Johannes Weiner <hannes@...xchg.org>,
Michal Hocko <mhocko@...nel.org>,
Roman Gushchin <roman.gushchin@...ux.dev>,
Shakeel Butt <shakeel.butt@...ux.dev>,
Muchun Song <muchun.song@...ux.dev>,
Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
Andrii Nakryiko <andrii@...nel.org>,
Martin KaFai Lau <martin.lau@...ux.dev>,
Eduard Zingerman <eddyz87@...il.com>,
Song Liu <song@...nel.org>,
Yonghong Song <yonghong.song@...ux.dev>,
John Fastabend <john.fastabend@...il.com>,
KP Singh <kpsingh@...nel.org>,
Stanislav Fomichev <sdf@...ichev.me>,
Hao Luo <haoluo@...gle.com>,
Jiri Olsa <jolsa@...nel.org>,
Shuah Khan <shuah@...nel.org>,
Peter Zijlstra <peterz@...radead.org>,
Miguel Ojeda <ojeda@...nel.org>,
Nathan Chancellor <nathan@...nel.org>,
Kees Cook <kees@...nel.org>,
Tejun Heo <tj@...nel.org>,
Jeff Xu <jeffxu@...omium.org>,
mkoutny@...e.com,
Jan Hendrik Farr <kernel@...rr.cc>,
Christian Brauner <brauner@...nel.org>,
Randy Dunlap <rdunlap@...radead.org>,
Brian Gerst <brgerst@...il.com>,
Masahiro Yamada <masahiroy@...nel.org>,
davem@...emloft.net,
Jakub Kicinski <kuba@...nel.org>,
Jesper Dangaard Brouer <hawk@...nel.org>,
JP Kobryn <inwardvessel@...il.com>,
Willem de Bruijn <willemb@...gle.com>,
Jason Xing <kerneljasonxing@...il.com>,
Paul Chaignon <paul.chaignon@...il.com>,
Anton Protopopov <a.s.protopopov@...il.com>,
Amery Hung <ameryhung@...il.com>,
Chen Ridong <chenridong@...weicloud.com>,
Lance Yang <lance.yang@...ux.dev>,
Jiayuan Chen <jiayuan.chen@...ux.dev>,
linux-kernel@...r.kernel.org,
linux-mm@...ck.org,
cgroups@...r.kernel.org,
bpf@...r.kernel.org,
netdev@...r.kernel.org,
linux-kselftest@...r.kernel.org
Cc: Hui Zhu <zhuhui@...inos.cn>,
Geliang Tang <geliang@...nel.org>
Subject: [RFC PATCH bpf-next v3 10/12] mm/bpf: Add BPF_F_ALLOW_OVERRIDE support for memcg_bpf_ops
From: Hui Zhu <zhuhui@...inos.cn>
To allow for more flexible attachment policies in nested cgroup
hierarchies, this patch introduces support for the
`BPF_F_ALLOW_OVERRIDE` flag for `memcg_bpf_ops`.
When a `memcg_bpf_ops` is attached to a cgroup with this flag, it
permits child cgroups to attach their own, different `memcg_bpf_ops`,
overriding the parent's inherited program. Without this flag,
attaching a BPF program to a cgroup that already has one (either
directly or via inheritance) fails with -EBUSY.
The implementation involves:
- Adding a `bpf_ops_flags` field to `struct mem_cgroup`.
- During registration (`bpf_memcg_ops_reg`), rejecting any flag other
than `BPF_F_ALLOW_OVERRIDE`, and refusing to attach over existing
ops unless they are the ones inherited from the parent and carry
that flag.
- During unregistration (`bpf_memcg_ops_unreg`), restoring the
parent's BPF program and flags on every cgroup in the subtree that
is still using the detached ops.
- Ensuring flags are inherited by child cgroups during online events.
This change enables complex, multi-level policy enforcement where
different subtrees of the cgroup hierarchy can have distinct memory
management BPF programs.
Signed-off-by: Geliang Tang <geliang@...nel.org>
Signed-off-by: Hui Zhu <zhuhui@...inos.cn>
---
include/linux/memcontrol.h | 1 +
mm/bpf_memcontrol.c | 77 ++++++++++++++++++++++++--------------
2 files changed, 49 insertions(+), 29 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d71e86b85ba7..a37b78d3853d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -354,6 +354,7 @@ struct mem_cgroup {
#ifdef CONFIG_BPF_SYSCALL
struct memcg_bpf_ops *bpf_ops;
+ u32 bpf_ops_flags;
#endif
struct mem_cgroup_per_node *nodeinfo[];
diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
index 3eae1af49519..d6126b94f521 100644
--- a/mm/bpf_memcontrol.c
+++ b/mm/bpf_memcontrol.c
@@ -213,6 +213,7 @@ void memcontrol_bpf_online(struct mem_cgroup *memcg)
goto out;
WRITE_ONCE(memcg->bpf_ops, ops);
+ memcg->bpf_ops_flags = parent_memcg->bpf_ops_flags;
/*
* If the BPF program implements it, call the online handler to
@@ -340,29 +341,6 @@ static int bpf_memcg_ops_init_member(const struct btf_type *t,
return 0;
}
-/**
- * clean_memcg_bpf_ops - Detach BPF programs from a cgroup hierarchy.
- * @memcg: The root of the cgroup hierarchy to clean.
- * @ops: The specific ops struct to detach. If NULL, detach any ops.
- *
- * Iterates through all descendant cgroups of @memcg (including itself)
- * and clears their bpf_ops pointer. This is used when a BPF program
- * is detached or if attachment fails midway.
- */
-static void clean_memcg_bpf_ops(struct mem_cgroup *memcg,
- struct memcg_bpf_ops *ops)
-{
- struct mem_cgroup *iter = NULL;
-
- while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
- if (ops) {
- if (!WARN_ON(READ_ONCE(memcg->bpf_ops) != ops))
- WRITE_ONCE(memcg->bpf_ops, NULL);
- } else
- WRITE_ONCE(iter->bpf_ops, NULL);
- }
-}
-
static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link)
{
struct bpf_struct_ops_link *ops_link
@@ -371,21 +349,44 @@ static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link)
struct mem_cgroup *memcg, *iter = NULL;
int err = 0;
+ if (ops_link->flags & ~BPF_F_ALLOW_OVERRIDE) {
+ pr_err("attach only support BPF_F_ALLOW_OVERRIDE\n");
+ return -EOPNOTSUPP;
+ }
+
memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
if (IS_ERR_OR_NULL(memcg))
return PTR_ERR(memcg);
cgroup_lock();
+
+ if (READ_ONCE(memcg->bpf_ops)) {
+ /* Check if bpf_ops of the parent is BPF_F_ALLOW_OVERRIDE. */
+ if (memcg->bpf_ops_flags & BPF_F_ALLOW_OVERRIDE) {
+ iter = parent_mem_cgroup(memcg);
+
+ if (!iter)
+ goto busy_out;
+ if (READ_ONCE(iter->bpf_ops) !=
+ READ_ONCE(memcg->bpf_ops))
+ goto busy_out;
+ } else {
+busy_out:
+ err = -EBUSY;
+ goto unlock_out;
+ }
+ }
+
while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
if (READ_ONCE(iter->bpf_ops)) {
- mem_cgroup_iter_break(memcg, iter);
- err = -EBUSY;
- break;
+ /* cannot override existing bpf_ops of sub-cgroup. */
+ continue;
}
WRITE_ONCE(iter->bpf_ops, ops);
+ iter->bpf_ops_flags = ops_link->flags;
}
- if (err)
- clean_memcg_bpf_ops(memcg, NULL);
+
+unlock_out:
cgroup_unlock();
mem_cgroup_put(memcg);
@@ -399,13 +400,31 @@ static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link)
= container_of(link, struct bpf_struct_ops_link, link);
struct memcg_bpf_ops *ops = kdata;
struct mem_cgroup *memcg;
+ struct mem_cgroup *iter;
+ struct memcg_bpf_ops *parent_bpf_ops = NULL;
+ u32 parent_bpf_ops_flags = 0;
memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id);
if (IS_ERR_OR_NULL(memcg))
goto out;
cgroup_lock();
- clean_memcg_bpf_ops(memcg, ops);
+
+ /* Get the parent bpf_ops and bpf_ops_flags */
+ iter = parent_mem_cgroup(memcg);
+ if (iter) {
+ parent_bpf_ops = READ_ONCE(iter->bpf_ops);
+ parent_bpf_ops_flags = iter->bpf_ops_flags;
+ }
+
+ iter = NULL;
+ while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
+ if (READ_ONCE(iter->bpf_ops) == ops) {
+ WRITE_ONCE(iter->bpf_ops, parent_bpf_ops);
+ iter->bpf_ops_flags = parent_bpf_ops_flags;
+ }
+ }
+
cgroup_unlock();
mem_cgroup_put(memcg);
--
2.43.0
Powered by blists - more mailing lists