Message-ID: <aAtgdcpNW6Rj4m_f@gpd3>
Date: Fri, 25 Apr 2025 12:14:13 +0200
From: Andrea Righi <arighi@...dia.com>
To: Tejun Heo <tj@...nel.org>
Cc: David Vernet <void@...ifault.com>, Changwoo Min <changwoo@...lia.com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
Hi Tejun,
On Wed, Apr 23, 2025 at 01:44:41PM -1000, Tejun Heo wrote:
> To prepare for supporting multiple schedulers, make scx_sched allocated
> dynamically. scx_sched->kobj is now an embedded field and the kobj's
> lifetime determines the lifetime of the containing scx_sched.
>
> - Enable path is updated so that kobj init and addition are performed later.
>
> - scx_sched freeing is initiated in scx_kobj_release() and also goes through
> an rcu_work so that scx_root can be accessed from an unsynchronized path -
> scx_disable().
>
> No behavior changes intended.
>
> Signed-off-by: Tejun Heo <tj@...nel.org>
> ---
> kernel/sched/ext.c | 151 ++++++++++++++++++++++++++-------------------
> 1 file changed, 86 insertions(+), 65 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index ad392890d2dd..612232c66d13 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -772,7 +772,8 @@ struct scx_sched {
> atomic_t exit_kind;
> struct scx_exit_info *exit_info;
>
> - struct kobject *kobj;
> + struct kobject kobj;
> + struct rcu_work rcu_work;
> };
>
> enum scx_wake_flags {
> @@ -933,11 +934,7 @@ enum scx_ops_state {
> #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
> #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
>
> -static struct scx_sched __scx_root = {
> - .exit_kind = ATOMIC_INIT(SCX_EXIT_DONE),
> -};
> -
> -static struct scx_sched *scx_root = &__scx_root;
> +static struct scx_sched __rcu *scx_root;
>
> /*
> * During exit, a task may schedule after losing its PIDs. When disabling the
> @@ -4417,9 +4414,23 @@ static const struct attribute_group scx_global_attr_group = {
> .attrs = scx_global_attrs,
> };
>
> +static void free_exit_info(struct scx_exit_info *ei);
> +
> +static void scx_sched_free_rcu_work(struct work_struct *work)
> +{
> + struct rcu_work *rcu_work = to_rcu_work(work);
> + struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
> +
> + free_exit_info(sch->exit_info);
> + kfree(sch);
> +}
> +
> static void scx_kobj_release(struct kobject *kobj)
> {
> - kfree(kobj);
> + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
> +
> + INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
> + queue_rcu_work(system_unbound_wq, &sch->rcu_work);
> }
>
> static ssize_t scx_attr_ops_show(struct kobject *kobj,
> @@ -4709,14 +4720,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
>
> static void scx_disable_workfn(struct kthread_work *work)
> {
> - struct scx_exit_info *ei = scx_root->exit_info;
> + struct scx_sched *sch = scx_root;
> + struct scx_exit_info *ei = sch->exit_info;
> struct scx_task_iter sti;
> struct task_struct *p;
> struct rhashtable_iter rht_iter;
> struct scx_dispatch_q *dsq;
> int kind, cpu;
>
> - kind = atomic_read(&scx_root->exit_kind);
> + kind = atomic_read(&sch->exit_kind);
> while (true) {
> /*
> * NONE indicates that a new scx_ops has been registered since
> @@ -4725,7 +4737,7 @@ static void scx_disable_workfn(struct kthread_work *work)
> */
> if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
> return;
> - if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
> + if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
> break;
> }
> ei->kind = kind;
> @@ -4740,7 +4752,7 @@ static void scx_disable_workfn(struct kthread_work *work)
> break;
> case SCX_DISABLED:
> pr_warn("sched_ext: ops error detected without ops (%s)\n",
> - scx_root->exit_info->msg);
> + sch->exit_info->msg);
> WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
> goto done;
> default:
> @@ -4807,41 +4819,43 @@ static void scx_disable_workfn(struct kthread_work *work)
>
> /* no task is on scx, turn off all the switches and flush in-progress calls */
> static_branch_disable(&__scx_enabled);
> - bitmap_zero(scx_root->has_op, SCX_OPI_END);
> + bitmap_zero(sch->has_op, SCX_OPI_END);
> scx_idle_disable();
> synchronize_rcu();
>
> if (ei->kind >= SCX_EXIT_ERROR) {
> pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
> - scx_root->ops.name, ei->reason);
> + sch->ops.name, ei->reason);
>
> if (ei->msg[0] != '\0')
> - pr_err("sched_ext: %s: %s\n",
> - scx_root->ops.name, ei->msg);
> + pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
> #ifdef CONFIG_STACKTRACE
> stack_trace_print(ei->bt, ei->bt_len, 2);
> #endif
> } else {
> pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
> - scx_root->ops.name, ei->reason);
> + sch->ops.name, ei->reason);
> }
>
> - if (scx_root->ops.exit)
> + if (sch->ops.exit)
> SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
>
> cancel_delayed_work_sync(&scx_watchdog_work);
>
> /*
> - * Delete the kobject from the hierarchy eagerly in addition to just
> - * dropping a reference. Otherwise, if the object is deleted
> - * asynchronously, sysfs could observe an object of the same name still
> - * in the hierarchy when another scheduler is loaded.
> + * scx_root clearing must be inside cpus_read_lock(). See
> + * handle_hotplug().
> */
> - kobject_del(scx_root->kobj);
> - kobject_put(scx_root->kobj);
> - scx_root->kobj = NULL;
> + cpus_read_lock();
> + RCU_INIT_POINTER(scx_root, NULL);
> + cpus_read_unlock();
>
> - memset(&scx_root->ops, 0, sizeof(scx_root->ops));
> + /*
> + * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
> + * could observe an object of the same name still in the hierarchy when
> + * the next scheduler is loaded.
> + */
> + kobject_del(&sch->kobj);
>
> rhashtable_walk_enter(&dsq_hash, &rht_iter);
> do {
> @@ -4858,9 +4872,6 @@ static void scx_disable_workfn(struct kthread_work *work)
> scx_dsp_ctx = NULL;
> scx_dsp_max_batch = 0;
>
> - free_exit_info(scx_root->exit_info);
> - scx_root->exit_info = NULL;
> -
> mutex_unlock(&scx_enable_mutex);
>
> WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
> @@ -4885,13 +4896,18 @@ static void schedule_scx_disable_work(void)
> static void scx_disable(enum scx_exit_kind kind)
> {
> int none = SCX_EXIT_NONE;
> + struct scx_sched *sch;
>
> if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
> kind = SCX_EXIT_ERROR;
>
> - atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
> -
> - schedule_scx_disable_work();
> + rcu_read_lock();
> + sch = rcu_dereference(scx_root);
> + if (sch) {
> + atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
> + schedule_scx_disable_work();
> + }
> + rcu_read_unlock();
> }
>
> static void dump_newline(struct seq_buf *s)
> @@ -5288,6 +5304,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
>
> static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> {
> + struct scx_sched *sch;
> struct scx_task_iter sti;
> struct task_struct *p;
> unsigned long timeout;
> @@ -5351,33 +5368,32 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> goto err_unlock;
> }
>
> - scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
> - if (!scx_root->kobj) {
> + sch = kzalloc(sizeof(*sch), GFP_KERNEL);
> + if (!sch) {
> ret = -ENOMEM;
> goto err_unlock;
> }
>
> - scx_root->kobj->kset = scx_kset;
> - ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
> - if (ret < 0)
> - goto err;
> -
> - scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
> - if (!scx_root->exit_info) {
> + sch->exit_info = alloc_exit_info(ops->exit_dump_len);
> + if (!sch->exit_info) {
> ret = -ENOMEM;
> - goto err_del;
> + goto err_free;
> }
>
> + sch->kobj.kset = scx_kset;
> + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
> + if (ret < 0)
> + goto err_free;
> +
> + atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
> + sch->ops = *ops;
> +
> /*
> - * Set scx_ops, transition to ENABLING and clear exit info to arm the
> - * disable path. Failure triggers full disabling from here on.
> + * Transition to ENABLING and clear exit info to arm the disable path.
> + * Failure triggers full disabling from here on.
> */
> - scx_root->ops = *ops;
> -
> WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
> -
> - atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
> - scx_root->warned_zero_slice = false;
> + WARN_ON_ONCE(scx_root);
>
> atomic_long_set(&scx_nr_rejected, 0);
>
> @@ -5390,9 +5406,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> */
> cpus_read_lock();
>
> + /*
> + * Make the scheduler instance visible. Must be inside cpus_read_lock().
> + * See handle_hotplug().
> + */
> + rcu_assign_pointer(scx_root, sch);
> +
> scx_idle_enable(ops);
>
> - if (scx_root->ops.init) {
> + if (sch->ops.init) {
> ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
> if (ret) {
> ret = ops_sanitize_err("init", ret);
> @@ -5404,7 +5426,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
> if (((void (**)(void))ops)[i])
> - set_bit(i, scx_root->has_op);
> + set_bit(i, sch->has_op);
>
> check_hotplug_seq(ops);
> scx_idle_update_selcpu_topology(ops);
> @@ -5445,10 +5467,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
> if (((void (**)(void))ops)[i])
> - set_bit(i, scx_root->has_op);
> + set_bit(i, sch->has_op);
>
> - if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
> - scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
> + if (sch->ops.cpu_acquire || sch->ops.cpu_release)
> + sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
>
> /*
> * Lock out forks, cgroup on/offlining and moves before opening the
> @@ -5547,7 +5569,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> scx_bypass(false);
>
> if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
> - WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
> + WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
> goto err_disable;
> }
>
> @@ -5555,23 +5577,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> static_branch_enable(&__scx_switched_all);
>
> pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
> - scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
> - kobject_uevent(scx_root->kobj, KOBJ_ADD);
> + sch->ops.name, scx_switched_all() ? "" : " (partial)");
> + kobject_uevent(&sch->kobj, KOBJ_ADD);
> mutex_unlock(&scx_enable_mutex);
>
> atomic_long_inc(&scx_enable_seq);
>
> return 0;
>
> -err_del:
> - kobject_del(scx_root->kobj);
> -err:
> - kobject_put(scx_root->kobj);
> - scx_root->kobj = NULL;
> - if (scx_root->exit_info) {
> - free_exit_info(scx_root->exit_info);
> - scx_root->exit_info = NULL;
> - }
> +err_free:
> + if (sch->exit_info)
> + free_exit_info(sch->exit_info);
> + kfree(sch);
> err_unlock:
> mutex_unlock(&scx_enable_mutex);
> return ret;
> @@ -5593,6 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> */
> scx_error("scx_enable() failed (%d)", ret);
> kthread_flush_work(&scx_disable_work);
> + kobject_put(&sch->kobj);
> return 0;
> }
>
> @@ -5741,8 +5759,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
>
> static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
> {
> + struct scx_sched *sch = scx_root;
> +
> scx_disable(SCX_EXIT_UNREG);
> kthread_flush_work(&scx_disable_work);
> + kobject_put(&sch->kobj);
> }
We probably need to check that sch != NULL here; with a buggy rustland I was able to trigger the following oops (presumably the ops error kicks off the disable path, which clears scx_root before the BPF link is released and bpf_scx_unreg() runs):
[ 5.048913] sched_ext: rustland: invalid CPU -16 from ops.select_cpu()
[ 5.048984] ops_cpu_valid+0x4a/0x60
[ 5.049039] select_task_rq_scx+0x10f/0x200
[ 5.049100] try_to_wake_up+0x17a/0x890
[ 5.049149] ep_autoremove_wake_function+0x12/0x40
[ 5.049211] __wake_up_common+0x7f/0xc0
[ 5.049259] __wake_up+0x36/0x60
[ 5.049306] ep_poll_callback+0x265/0x320
[ 5.049354] __wake_up_common+0x7f/0xc0
[ 5.049401] __wake_up+0x36/0x60
[ 5.049448] __send_signal_locked+0x71e/0x740
[ 5.049508] group_send_sig_info+0xf3/0x1b0
[ 5.049567] kill_pid_info_type+0x79/0x1a0
[ 5.049627] kill_proc_info+0x5d/0x110
[ 5.049674] __x64_sys_kill+0x91/0xc0
[ 5.049789] do_syscall_64+0xbb/0x1d0
[ 5.049855] entry_SYSCALL_64_after_hwframe+0x77/0x7f
[ 5.050315] BUG: kernel NULL pointer dereference, address: 00000000000003b0
[ 5.050386] #PF: supervisor read access in kernel mode
[ 5.050439] #PF: error_code(0x0000) - not-present page
[ 5.050488] PGD 0 P4D 0
[ 5.050523] Oops: Oops: 0000 [#1] SMP NOPTI
[ 5.050571] CPU: 5 UID: 0 PID: 284 Comm: scx_rustland Not tainted 6.14.0-virtme #27 PREEMPT(full)
[ 5.050670] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[ 5.050782] RIP: 0010:kthread_flush_work+0x60/0x140
[ 5.050847] Code: 80 00 00 00 31 c0 48 8d 7c 24 18 48 89 24 24 48 89 64 24 08 48 c7 44 24 10 30 e6 58 b0 f3 48 ab 48 8d 7c 24 30 e8 40 6c 05 00 <48> 8b 6b 18 48 85 ed 74 69 4c 8d 65 08 4c 89 e7 e8 0b 0c d3 00 48
[ 5.051021] RSP: 0018:ffffad01816f7e08 EFLAGS: 00010246
[ 5.051066] RAX: 0000000000000000 RBX: 0000000000000398 RCX: 0000000000000000
[ 5.051131] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffb2f34d80
[ 5.051196] RBP: ffff9b140268e800 R08: 0000000000000002 R09: 0000000000000000
[ 5.051260] R10: 0000000000000001 R11: 0000000000000000 R12: ffff9b14804c1ea0
[ 5.051325] R13: ffff9b1401b6cc20 R14: ffff9b1403965728 R15: 0000000000000000
[ 5.051393] FS: 00007f1ff0550800(0000) GS:ffff9b14cbdb3000(0000) knlGS:0000000000000000
[ 5.051463] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5.051516] CR2: 00000000000003b0 CR3: 000000000553e002 CR4: 0000000000772ef0
[ 5.051582] PKRU: 55555554
[ 5.051606] Call Trace:
[ 5.051634] <TASK>
[ 5.051665] ? __pfx_kthread_flush_work_fn+0x10/0x10
[ 5.051726] bpf_scx_unreg+0x27/0x40
[ 5.051773] bpf_struct_ops_map_link_dealloc+0x36/0x50
[ 5.051824] bpf_link_release+0x18/0x20
[ 5.051863] __fput+0xf8/0x2c0
[ 5.051905] __x64_sys_close+0x3d/0x80
[ 5.051943] do_syscall_64+0xbb/0x1d0
[ 5.051983] entry_SYSCALL_64_after_hwframe+0x77/0x7f
Changing bpf_scx_unreg() as follows fixed the bug for me:
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d963aa5c99e1a..0e52a8dbd593e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5752,11 +5752,17 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
scx_disable(SCX_EXIT_UNREG);
- kthread_flush_work(&sch->disable_work);
- kobject_put(&sch->kobj);
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ kthread_flush_work(&sch->disable_work);
+ kobject_put(&sch->kobj);
+ }
+ rcu_read_unlock();
}
static int bpf_scx_init(struct btf *btf)
Thanks,
-Andrea