[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250920005931.2753828-25-tj@kernel.org>
Date: Fri, 19 Sep 2025 14:58:47 -1000
From: Tejun Heo <tj@...nel.org>
To: void@...ifault.com,
arighi@...dia.com,
multics69@...il.com
Cc: linux-kernel@...r.kernel.org,
sched-ext@...ts.linux.dev,
memxor@...il.com,
bpf@...r.kernel.org,
Tejun Heo <tj@...nel.org>
Subject: [PATCH 24/46] HACK_NOT_FOR_UPSTREAM: BPF: Implement prog grouping hack
Hopefully, we can have something better instead.
NOT_SIGNED_OFF
---
include/linux/bpf.h | 5 +++++
include/linux/sched.h | 2 ++
kernel/bpf/syscall.c | 23 +++++++++++++++++++++++
kernel/sched/ext.c | 36 ++++++++++++++++++++++++++++++++++++
tools/sched_ext/scx_qmap.c | 13 +++++++++++++
5 files changed, 79 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cc700925b802..5101ae3ba2b6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1581,6 +1581,11 @@ struct bpf_stream_stage {
struct bpf_prog_aux {
atomic64_t refcnt;
+
+ /* XXX - See kernel/sched/ext.c::scx_sub_enable() */
+ u64 priv_user;
+ void *priv;
+
u32 used_map_cnt;
u32 used_btf_cnt;
u32 max_ctx_offset;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b272382673d..576aed48beb2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1596,6 +1596,8 @@ struct task_struct {
struct bpf_local_storage __rcu *bpf_storage;
/* Used for BPF run context */
struct bpf_run_ctx *bpf_ctx;
+ /* XXX - See kernel/sched/ext.c::scx_sub_enable() */
+ u64 bpf_prog_aux_priv;
#endif
/* Used by BPF for per-TASK xdp storage */
struct bpf_net_context *bpf_net_context;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0fbfa8532c39..e85dbe7fe5ce 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2761,6 +2761,27 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
}
+static int prog_aux_priv_param_set(const char *input, const struct kernel_param *kp)
+{
+ return kstrtoull(input, 0, &current->bpf_prog_aux_priv);
+}
+
+static int prog_aux_priv_param_get(char *buf, const struct kernel_param *kp)
+{
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", current->bpf_prog_aux_priv);
+}
+
+static const struct kernel_param_ops prog_aux_priv_param_ops = {
+ .set = prog_aux_priv_param_set,
+ .get = prog_aux_priv_param_get,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "bpf."
+module_param_cb(prog_aux_priv, &prog_aux_priv_param_ops, NULL, 0664);
+MODULE_PARM_DESC("prog_aux_priv",
+ "Set prog->aux->priv to this value for all BPF programs loaded by %current");
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt
@@ -2898,6 +2919,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
prog->expected_attach_type = attr->expected_attach_type;
prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
+ /* XXX - See kernel/sched/ext.c::scx_sub_enable() */
+ prog->aux->priv_user = current->bpf_prog_aux_priv;
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5eb1d6919595..a0251442b8ac 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4116,6 +4116,24 @@ static void scx_sub_disable(struct scx_sched *sch)
if (sch->ops.exit)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
+
+ /*
+ * XXX - NULL prog->aux->priv is interpreted as scx_root, so use an
+ * ERR_PTR value to mark the associated progs dead. Note that this is
+ * racy as e.g. a tracepoint program associated with a scheduler which
+ * hasn't finished scx_sub_enable() yet may end up affecting scx_root
+ * inadvertently. Plug the hole when this hack is replaced with a proper
+ * BPF construct.
+ */
+ u32 prog_id = 0;
+ struct bpf_prog *prog;
+ while ((prog = bpf_prog_get_curr_or_next(&prog_id))) {
+ if (prog->aux->priv == sch)
+ RCU_INIT_POINTER(prog->aux->priv, ERR_PTR(-ENODEV));
+ bpf_prog_put(prog);
+ prog_id++;
+ }
+
kobject_del(&sch->kobj);
}
#else /* CONFIG_EXT_SUB_SCHED */
@@ -5148,6 +5166,24 @@ static int scx_sub_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_disable;
}
+ /*
+ * XXX - We want all BPF programs loaded together with this scheduler
+ * instance to point to this scheduler instance. BPF currently doesn't
+ * have such feature so work around with a hack. The loading userspace
+ * thread sets %current->bpf_prog_aux_priv to the associated cgroup ID
+ * which gets transferred to prog->aux->priv_user in bpf_prog_load().
+ * Here, we can find all progs that have the matching cgroup ID and set
+ * their prog->aux->priv to $sch.
+ */
+ u32 prog_id = 0;
+ struct bpf_prog *prog;
+ while ((prog = bpf_prog_get_curr_or_next(&prog_id))) {
+ if (prog->aux->priv_user == cgroup_id(cgrp))
+ rcu_assign_pointer(prog->aux->priv, sch);
+ bpf_prog_put(prog);
+ prog_id++;
+ }
+
if (sch->ops.init) {
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
if (ret) {
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 5d762d10f4db..cefc439c9e4a 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -99,12 +99,25 @@ int main(int argc, char **argv)
break;
case 'c': {
struct stat st;
+ int fd, len;
+ char buf[19];
if (stat(optarg, &st) < 0) {
perror("stat");
return 1;
}
skel->struct_ops.qmap_ops->sub_cgroup_id = st.st_ino;
skel->rodata->sub_cgroup_id = st.st_ino;
+ fd = open("/sys/module/bpf/parameters/prog_aux_priv", O_RDWR);
+ if (fd < 0) {
+ perror("open(\"/sys/module/bpf/parameters/prog_aux_priv\")");
+ return 1;
+ }
+ len = snprintf(buf, sizeof(buf), "0x%lx", st.st_ino);
+ if (write(fd, buf, len) != len) {
+ perror("write(\"/sys/module/bpf/parameters/prog_aux_priv\")");
+ return 1;
+ }
+ close(fd);
break;
}
case 'd':
--
2.51.0
Powered by blists - more mailing lists