Message-ID: <20250423234542.1890867-4-tj@kernel.org>
Date: Wed, 23 Apr 2025 13:44:41 -1000
From: Tejun Heo <tj@...nel.org>
To: David Vernet <void@...ifault.com>,
Andrea Righi <arighi@...dia.com>,
Changwoo Min <changwoo@...lia.com>,
linux-kernel@...r.kernel.org
Cc: Tejun Heo <tj@...nel.org>
Subject: [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
To prepare for supporting multiple schedulers, allocate scx_sched
dynamically. scx_sched->kobj is now an embedded field and the kobj's
lifetime determines the lifetime of the containing scx_sched.

- The enable path is updated so that kobj init and addition are performed
  later.

- scx_sched freeing is initiated in scx_kobj_release() and also goes through
  an rcu_work so that scx_root can be accessed from an unsynchronized path,
  scx_disable().

No behavior changes intended.
Signed-off-by: Tejun Heo <tj@...nel.org>
---
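For reference, a minimal sketch of the embedded-kobject lifetime scheme the
patch switches to (my_obj and the my_obj_* helpers are made-up illustration
names, not part of this patch): the kobject is embedded in the dynamically
allocated object and the final kobject_put() invokes the release callback,
which defers the kfree() through an rcu_work so that RCU readers which may
still hold a pointer are drained first.

#include <linux/kobject.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_obj {
	struct kobject	kobj;		/* embedded; owns the object's lifetime */
	struct rcu_work	rcu_work;	/* defers kfree() past an RCU grace period */
};

static void my_obj_free_rcu_work(struct work_struct *work)
{
	struct rcu_work *rcu_work = to_rcu_work(work);
	struct my_obj *obj = container_of(rcu_work, struct my_obj, rcu_work);

	kfree(obj);
}

static void my_obj_kobj_release(struct kobject *kobj)
{
	struct my_obj *obj = container_of(kobj, struct my_obj, kobj);

	/* last reference dropped: free only after an RCU grace period */
	INIT_RCU_WORK(&obj->rcu_work, my_obj_free_rcu_work);
	queue_rcu_work(system_unbound_wq, &obj->rcu_work);
}

static const struct kobj_type my_obj_ktype = {
	.release	= my_obj_kobj_release,
};

static struct my_obj *my_obj_create(struct kobject *parent)
{
	struct my_obj *obj;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj)
		return NULL;

	if (kobject_init_and_add(&obj->kobj, &my_obj_ktype, parent, "my_obj")) {
		kobject_put(&obj->kobj);	/* ->release frees @obj */
		return NULL;
	}
	return obj;
}

static void my_obj_destroy(struct my_obj *obj)
{
	kobject_del(&obj->kobj);	/* remove from sysfs eagerly, as in the disable path */
	kobject_put(&obj->kobj);	/* final put, as in bpf_scx_unreg() */
}

Deferring the free through queue_rcu_work() on system_unbound_wq is what
allows a path holding only rcu_read_lock() to keep using the object without
taking a kobject reference.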
kernel/sched/ext.c | 151 ++++++++++++++++++++++++++-------------------
1 file changed, 86 insertions(+), 65 deletions(-)
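Building on the struct above, a sketch of the RCU-protected root pointer
handling that permits the unsynchronized scx_disable() path (again, my_root
and the my_root_* helpers are hypothetical; in the patch the clearing
additionally happens inside cpus_read_lock() because of handle_hotplug()):

#include <linux/rcupdate.h>

static struct my_obj __rcu *my_root;

/* publish, e.g. from the enable path */
static void my_root_install(struct my_obj *obj)
{
	rcu_assign_pointer(my_root, obj);	/* pairs with rcu_dereference() below */
}

/* unpublish, e.g. from the disable worker */
static void my_root_clear(void)
{
	RCU_INIT_POINTER(my_root, NULL);
	/* freeing remains deferred by the rcu_work in the release callback */
}

/* unsynchronized reader, analogous to scx_disable() */
static void my_root_poke(void)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = rcu_dereference(my_root);
	if (obj) {
		/* @obj is guaranteed to stay allocated until rcu_read_unlock() */
	}
	rcu_read_unlock();
}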
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ad392890d2dd..612232c66d13 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -772,7 +772,8 @@ struct scx_sched {
atomic_t exit_kind;
struct scx_exit_info *exit_info;
- struct kobject *kobj;
+ struct kobject kobj;
+ struct rcu_work rcu_work;
};
enum scx_wake_flags {
@@ -933,11 +934,7 @@ enum scx_ops_state {
#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
-static struct scx_sched __scx_root = {
- .exit_kind = ATOMIC_INIT(SCX_EXIT_DONE),
-};
-
-static struct scx_sched *scx_root = &__scx_root;
+static struct scx_sched __rcu *scx_root;
/*
* During exit, a task may schedule after losing its PIDs. When disabling the
@@ -4417,9 +4414,23 @@ static const struct attribute_group scx_global_attr_group = {
.attrs = scx_global_attrs,
};
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+ struct rcu_work *rcu_work = to_rcu_work(work);
+ struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+
+ free_exit_info(sch->exit_info);
+ kfree(sch);
+}
+
static void scx_kobj_release(struct kobject *kobj)
{
- kfree(kobj);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+ queue_rcu_work(system_unbound_wq, &sch->rcu_work);
}
static ssize_t scx_attr_ops_show(struct kobject *kobj,
@@ -4709,14 +4720,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
static void scx_disable_workfn(struct kthread_work *work)
{
- struct scx_exit_info *ei = scx_root->exit_info;
+ struct scx_sched *sch = scx_root;
+ struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
int kind, cpu;
- kind = atomic_read(&scx_root->exit_kind);
+ kind = atomic_read(&sch->exit_kind);
while (true) {
/*
* NONE indicates that a new scx_ops has been registered since
@@ -4725,7 +4737,7 @@ static void scx_disable_workfn(struct kthread_work *work)
*/
if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
return;
- if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
+ if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
break;
}
ei->kind = kind;
@@ -4740,7 +4752,7 @@ static void scx_disable_workfn(struct kthread_work *work)
break;
case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n",
- scx_root->exit_info->msg);
+ sch->exit_info->msg);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
goto done;
default:
@@ -4807,41 +4819,43 @@ static void scx_disable_workfn(struct kthread_work *work)
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_enabled);
- bitmap_zero(scx_root->has_op, SCX_OPI_END);
+ bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_root->ops.name, ei->reason);
+ sch->ops.name, ei->reason);
if (ei->msg[0] != '\0')
- pr_err("sched_ext: %s: %s\n",
- scx_root->ops.name, ei->msg);
+ pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
} else {
pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_root->ops.name, ei->reason);
+ sch->ops.name, ei->reason);
}
- if (scx_root->ops.exit)
+ if (sch->ops.exit)
SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
/*
- * Delete the kobject from the hierarchy eagerly in addition to just
- * dropping a reference. Otherwise, if the object is deleted
- * asynchronously, sysfs could observe an object of the same name still
- * in the hierarchy when another scheduler is loaded.
+ * scx_root clearing must be inside cpus_read_lock(). See
+ * handle_hotplug().
*/
- kobject_del(scx_root->kobj);
- kobject_put(scx_root->kobj);
- scx_root->kobj = NULL;
+ cpus_read_lock();
+ RCU_INIT_POINTER(scx_root, NULL);
+ cpus_read_unlock();
- memset(&scx_root->ops, 0, sizeof(scx_root->ops));
+ /*
+ * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+ * could observe an object of the same name still in the hierarchy when
+ * the next scheduler is loaded.
+ */
+ kobject_del(&sch->kobj);
rhashtable_walk_enter(&dsq_hash, &rht_iter);
do {
@@ -4858,9 +4872,6 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
- free_exit_info(scx_root->exit_info);
- scx_root->exit_info = NULL;
-
mutex_unlock(&scx_enable_mutex);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4885,13 +4896,18 @@ static void schedule_scx_disable_work(void)
static void scx_disable(enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
kind = SCX_EXIT_ERROR;
- atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
-
- schedule_scx_disable_work();
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+ schedule_scx_disable_work();
+ }
+ rcu_read_unlock();
}
static void dump_newline(struct seq_buf *s)
@@ -5288,6 +5304,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
+ struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
@@ -5351,33 +5368,32 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock;
}
- scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
- if (!scx_root->kobj) {
+ sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+ if (!sch) {
ret = -ENOMEM;
goto err_unlock;
}
- scx_root->kobj->kset = scx_kset;
- ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
- if (ret < 0)
- goto err;
-
- scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
- if (!scx_root->exit_info) {
+ sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!sch->exit_info) {
ret = -ENOMEM;
- goto err_del;
+ goto err_free;
}
+ sch->kobj.kset = scx_kset;
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err_free;
+
+ atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+ sch->ops = *ops;
+
/*
- * Set scx_ops, transition to ENABLING and clear exit info to arm the
- * disable path. Failure triggers full disabling from here on.
+ * Transition to ENABLING and clear exit info to arm the disable path.
+ * Failure triggers full disabling from here on.
*/
- scx_root->ops = *ops;
-
WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
-
- atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
- scx_root->warned_zero_slice = false;
+ WARN_ON_ONCE(scx_root);
atomic_long_set(&scx_nr_rejected, 0);
@@ -5390,9 +5406,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
cpus_read_lock();
+ /*
+ * Make the scheduler instance visible. Must be inside cpus_read_lock().
+ * See handle_hotplug().
+ */
+ rcu_assign_pointer(scx_root, sch);
+
scx_idle_enable(ops);
- if (scx_root->ops.init) {
+ if (sch->ops.init) {
ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
if (ret) {
ret = ops_sanitize_err("init", ret);
@@ -5404,7 +5426,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
- set_bit(i, scx_root->has_op);
+ set_bit(i, sch->has_op);
check_hotplug_seq(ops);
scx_idle_update_selcpu_topology(ops);
@@ -5445,10 +5467,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
- set_bit(i, scx_root->has_op);
+ set_bit(i, sch->has_op);
- if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
- scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
+ if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+ sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -5547,7 +5569,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_bypass(false);
if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
- WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
+ WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
goto err_disable;
}
@@ -5555,23 +5577,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable(&__scx_switched_all);
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
- scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
- kobject_uevent(scx_root->kobj, KOBJ_ADD);
+ sch->ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
mutex_unlock(&scx_enable_mutex);
atomic_long_inc(&scx_enable_seq);
return 0;
-err_del:
- kobject_del(scx_root->kobj);
-err:
- kobject_put(scx_root->kobj);
- scx_root->kobj = NULL;
- if (scx_root->exit_info) {
- free_exit_info(scx_root->exit_info);
- scx_root->exit_info = NULL;
- }
+err_free:
+ if (sch->exit_info)
+ free_exit_info(sch->exit_info);
+ kfree(sch);
err_unlock:
mutex_unlock(&scx_enable_mutex);
return ret;
@@ -5593,6 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
scx_error("scx_enable() failed (%d)", ret);
kthread_flush_work(&scx_disable_work);
+ kobject_put(&sch->kobj);
return 0;
}
@@ -5741,8 +5759,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
+ struct scx_sched *sch = scx_root;
+
scx_disable(SCX_EXIT_UNREG);
kthread_flush_work(&scx_disable_work);
+ kobject_put(&sch->kobj);
}
static int bpf_scx_init(struct btf *btf)
--
2.49.0