Message-ID: <aAtgdcpNW6Rj4m_f@gpd3>
Date: Fri, 25 Apr 2025 12:14:13 +0200
From: Andrea Righi <arighi@...dia.com>
To: Tejun Heo <tj@...nel.org>
Cc: David Vernet <void@...ifault.com>, Changwoo Min <changwoo@...lia.com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
Hi Tejun,
On Wed, Apr 23, 2025 at 01:44:41PM -1000, Tejun Heo wrote:
> To prepare for supporting multiple schedulers, make scx_sched allocated
> dynamically. scx_sched->kobj is now an embedded field and the kobj's
> lifetime determines the lifetime of the containing scx_sched.
>
> - Enable path is updated so that kobj init and addition are performed later.
>
> - scx_sched freeing is initiated in scx_kobj_release() and also goes through
> an rcu_work so that scx_root can be accessed from an unsynchronized path -
> scx_disable().
>
> No behavior changes intended.
>
> Signed-off-by: Tejun Heo <tj@...nel.org>
> ---
> kernel/sched/ext.c | 151 ++++++++++++++++++++++++++-------------------
> 1 file changed, 86 insertions(+), 65 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index ad392890d2dd..612232c66d13 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -772,7 +772,8 @@ struct scx_sched {
> atomic_t exit_kind;
> struct scx_exit_info *exit_info;
>
> - struct kobject *kobj;
> + struct kobject kobj;
> + struct rcu_work rcu_work;
> };
>
> enum scx_wake_flags {
> @@ -933,11 +934,7 @@ enum scx_ops_state {
> #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
> #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
>
> -static struct scx_sched __scx_root = {
> - .exit_kind = ATOMIC_INIT(SCX_EXIT_DONE),
> -};
> -
> -static struct scx_sched *scx_root = &__scx_root;
> +static struct scx_sched __rcu *scx_root;
>
> /*
> * During exit, a task may schedule after losing its PIDs. When disabling the
> @@ -4417,9 +4414,23 @@ static const struct attribute_group scx_global_attr_group = {
> .attrs = scx_global_attrs,
> };
>
> +static void free_exit_info(struct scx_exit_info *ei);
> +
> +static void scx_sched_free_rcu_work(struct work_struct *work)
> +{
> + struct rcu_work *rcu_work = to_rcu_work(work);
> + struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
> +
> + free_exit_info(sch->exit_info);
> + kfree(sch);
> +}
> +
> static void scx_kobj_release(struct kobject *kobj)
> {
> - kfree(kobj);
> + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
> +
> + INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
> + queue_rcu_work(system_unbound_wq, &sch->rcu_work);
> }
>
> static ssize_t scx_attr_ops_show(struct kobject *kobj,
> @@ -4709,14 +4720,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
>
> static void scx_disable_workfn(struct kthread_work *work)
> {
> - struct scx_exit_info *ei = scx_root->exit_info;
> + struct scx_sched *sch = scx_root;
> + struct scx_exit_info *ei = sch->exit_info;
> struct scx_task_iter sti;
> struct task_struct *p;
> struct rhashtable_iter rht_iter;
> struct scx_dispatch_q *dsq;
> int kind, cpu;
>
> - kind = atomic_read(&scx_root->exit_kind);
> + kind = atomic_read(&sch->exit_kind);
> while (true) {
> /*
> * NONE indicates that a new scx_ops has been registered since
> @@ -4725,7 +4737,7 @@ static void scx_disable_workfn(struct kthread_work *work)
> */
> if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
> return;
> - if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
> + if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
> break;
> }
> ei->kind = kind;
> @@ -4740,7 +4752,7 @@ static void scx_disable_workfn(struct kthread_work *work)
> break;
> case SCX_DISABLED:
> pr_warn("sched_ext: ops error detected without ops (%s)\n",
> - scx_root->exit_info->msg);
> + sch->exit_info->msg);
> WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
> goto done;
> default:
> @@ -4807,41 +4819,43 @@ static void scx_disable_workfn(struct kthread_work *work)
>
> /* no task is on scx, turn off all the switches and flush in-progress calls */
> static_branch_disable(&__scx_enabled);
> - bitmap_zero(scx_root->has_op, SCX_OPI_END);
> + bitmap_zero(sch->has_op, SCX_OPI_END);
> scx_idle_disable();
> synchronize_rcu();
>
> if (ei->kind >= SCX_EXIT_ERROR) {
> pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
> - scx_root->ops.name, ei->reason);
> + sch->ops.name, ei->reason);
>
> if (ei->msg[0] != '\0')
> - pr_err("sched_ext: %s: %s\n",
> - scx_root->ops.name, ei->msg);
> + pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
> #ifdef CONFIG_STACKTRACE
> stack_trace_print(ei->bt, ei->bt_len, 2);
> #endif
> } else {
> pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
> - scx_root->ops.name, ei->reason);
> + sch->ops.name, ei->reason);
> }
>
> - if (scx_root->ops.exit)
> + if (sch->ops.exit)
> SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
>
> cancel_delayed_work_sync(&scx_watchdog_work);
>
> /*
> - * Delete the kobject from the hierarchy eagerly in addition to just
> - * dropping a reference. Otherwise, if the object is deleted
> - * asynchronously, sysfs could observe an object of the same name still
> - * in the hierarchy when another scheduler is loaded.
> + * scx_root clearing must be inside cpus_read_lock(). See
> + * handle_hotplug().
> */
> - kobject_del(scx_root->kobj);
> - kobject_put(scx_root->kobj);
> - scx_root->kobj = NULL;
> + cpus_read_lock();
> + RCU_INIT_POINTER(scx_root, NULL);
> + cpus_read_unlock();
>
> - memset(&scx_root->ops, 0, sizeof(scx_root->ops));
> + /*
> + * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
> + * could observe an object of the same name still in the hierarchy when
> + * the next scheduler is loaded.
> + */
> + kobject_del(&sch->kobj);
>
> rhashtable_walk_enter(&dsq_hash, &rht_iter);
> do {
> @@ -4858,9 +4872,6 @@ static void scx_disable_workfn(struct kthread_work *work)
> scx_dsp_ctx = NULL;
> scx_dsp_max_batch = 0;
>
> - free_exit_info(scx_root->exit_info);
> - scx_root->exit_info = NULL;
> -
> mutex_unlock(&scx_enable_mutex);
>
> WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
> @@ -4885,13 +4896,18 @@ static void schedule_scx_disable_work(void)
> static void scx_disable(enum scx_exit_kind kind)
> {
> int none = SCX_EXIT_NONE;
> + struct scx_sched *sch;
>
> if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
> kind = SCX_EXIT_ERROR;
>
> - atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
> -
> - schedule_scx_disable_work();
> + rcu_read_lock();
> + sch = rcu_dereference(scx_root);
> + if (sch) {
> + atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
> + schedule_scx_disable_work();
> + }
> + rcu_read_unlock();
> }
>
> static void dump_newline(struct seq_buf *s)
> @@ -5288,6 +5304,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
>
> static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> {
> + struct scx_sched *sch;
> struct scx_task_iter sti;
> struct task_struct *p;
> unsigned long timeout;
> @@ -5351,33 +5368,32 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> goto err_unlock;
> }
>
> - scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
> - if (!scx_root->kobj) {
> + sch = kzalloc(sizeof(*sch), GFP_KERNEL);
> + if (!sch) {
> ret = -ENOMEM;
> goto err_unlock;
> }
>
> - scx_root->kobj->kset = scx_kset;
> - ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
> - if (ret < 0)
> - goto err;
> -
> - scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
> - if (!scx_root->exit_info) {
> + sch->exit_info = alloc_exit_info(ops->exit_dump_len);
> + if (!sch->exit_info) {
> ret = -ENOMEM;
> - goto err_del;
> + goto err_free;
> }
>
> + sch->kobj.kset = scx_kset;
> + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
> + if (ret < 0)
> + goto err_free;
> +
> + atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
> + sch->ops = *ops;
> +
> /*
> - * Set scx_ops, transition to ENABLING and clear exit info to arm the
> - * disable path. Failure triggers full disabling from here on.
> + * Transition to ENABLING and clear exit info to arm the disable path.
> + * Failure triggers full disabling from here on.
> */
> - scx_root->ops = *ops;
> -
> WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
> -
> - atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
> - scx_root->warned_zero_slice = false;
> + WARN_ON_ONCE(scx_root);
>
> atomic_long_set(&scx_nr_rejected, 0);
>
> @@ -5390,9 +5406,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> */
> cpus_read_lock();
>
> + /*
> + * Make the scheduler instance visible. Must be inside cpus_read_lock().
> + * See handle_hotplug().
> + */
> + rcu_assign_pointer(scx_root, sch);
> +
> scx_idle_enable(ops);
>
> - if (scx_root->ops.init) {
> + if (sch->ops.init) {
> ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
> if (ret) {
> ret = ops_sanitize_err("init", ret);
> @@ -5404,7 +5426,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
> if (((void (**)(void))ops)[i])
> - set_bit(i, scx_root->has_op);
> + set_bit(i, sch->has_op);
>
> check_hotplug_seq(ops);
> scx_idle_update_selcpu_topology(ops);
> @@ -5445,10 +5467,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
> if (((void (**)(void))ops)[i])
> - set_bit(i, scx_root->has_op);
> + set_bit(i, sch->has_op);
>
> - if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
> - scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
> + if (sch->ops.cpu_acquire || sch->ops.cpu_release)
> + sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
>
> /*
> * Lock out forks, cgroup on/offlining and moves before opening the
> @@ -5547,7 +5569,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> scx_bypass(false);
>
> if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
> - WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
> + WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
> goto err_disable;
> }
>
> @@ -5555,23 +5577,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> static_branch_enable(&__scx_switched_all);
>
> pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
> - scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
> - kobject_uevent(scx_root->kobj, KOBJ_ADD);
> + sch->ops.name, scx_switched_all() ? "" : " (partial)");
> + kobject_uevent(&sch->kobj, KOBJ_ADD);
> mutex_unlock(&scx_enable_mutex);
>
> atomic_long_inc(&scx_enable_seq);
>
> return 0;
>
> -err_del:
> - kobject_del(scx_root->kobj);
> -err:
> - kobject_put(scx_root->kobj);
> - scx_root->kobj = NULL;
> - if (scx_root->exit_info) {
> - free_exit_info(scx_root->exit_info);
> - scx_root->exit_info = NULL;
> - }
> +err_free:
> + if (sch->exit_info)
> + free_exit_info(sch->exit_info);
> + kfree(sch);
> err_unlock:
> mutex_unlock(&scx_enable_mutex);
> return ret;
> @@ -5593,6 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> */
> scx_error("scx_enable() failed (%d)", ret);
> kthread_flush_work(&scx_disable_work);
> + kobject_put(&sch->kobj);
> return 0;
> }
>
> @@ -5741,8 +5759,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
>
> static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
> {
> + struct scx_sched *sch = scx_root;
> +
> scx_disable(SCX_EXIT_UNREG);
> kthread_flush_work(&scx_disable_work);
> + kobject_put(&sch->kobj);
> }
We probably need to check that sch != NULL here; with a buggy rustland I was able to trigger the following oops (presumably the ops error kicks off the disable path, which clears scx_root before the BPF link is released and bpf_scx_unreg() runs):
[ 5.048913] sched_ext: rustland: invalid CPU -16 from ops.select_cpu()
[ 5.048984] ops_cpu_valid+0x4a/0x60
[ 5.049039] select_task_rq_scx+0x10f/0x200
[ 5.049100] try_to_wake_up+0x17a/0x890
[ 5.049149] ep_autoremove_wake_function+0x12/0x40
[ 5.049211] __wake_up_common+0x7f/0xc0
[ 5.049259] __wake_up+0x36/0x60
[ 5.049306] ep_poll_callback+0x265/0x320
[ 5.049354] __wake_up_common+0x7f/0xc0
[ 5.049401] __wake_up+0x36/0x60
[ 5.049448] __send_signal_locked+0x71e/0x740
[ 5.049508] group_send_sig_info+0xf3/0x1b0
[ 5.049567] kill_pid_info_type+0x79/0x1a0
[ 5.049627] kill_proc_info+0x5d/0x110
[ 5.049674] __x64_sys_kill+0x91/0xc0
[ 5.049789] do_syscall_64+0xbb/0x1d0
[ 5.049855] entry_SYSCALL_64_after_hwframe+0x77/0x7f
[ 5.050315] BUG: kernel NULL pointer dereference, address: 00000000000003b0
[ 5.050386] #PF: supervisor read access in kernel mode
[ 5.050439] #PF: error_code(0x0000) - not-present page
[ 5.050488] PGD 0 P4D 0
[ 5.050523] Oops: Oops: 0000 [#1] SMP NOPTI
[ 5.050571] CPU: 5 UID: 0 PID: 284 Comm: scx_rustland Not tainted 6.14.0-virtme #27 PREEMPT(full)
[ 5.050670] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[ 5.050782] RIP: 0010:kthread_flush_work+0x60/0x140
[ 5.050847] Code: 80 00 00 00 31 c0 48 8d 7c 24 18 48 89 24 24 48 89 64 24 08 48 c7 44 24 10 30 e6 58 b0 f3 48 ab 48 8d 7c 24 30 e8 40 6c 05 00 <48> 8b 6b 18 48 85 ed 74 69 4c 8d 65 08 4c 89 e7 e8 0b 0c d3 00 48
[ 5.051021] RSP: 0018:ffffad01816f7e08 EFLAGS: 00010246
[ 5.051066] RAX: 0000000000000000 RBX: 0000000000000398 RCX: 0000000000000000
[ 5.051131] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffb2f34d80
[ 5.051196] RBP: ffff9b140268e800 R08: 0000000000000002 R09: 0000000000000000
[ 5.051260] R10: 0000000000000001 R11: 0000000000000000 R12: ffff9b14804c1ea0
[ 5.051325] R13: ffff9b1401b6cc20 R14: ffff9b1403965728 R15: 0000000000000000
[ 5.051393] FS: 00007f1ff0550800(0000) GS:ffff9b14cbdb3000(0000) knlGS:0000000000000000
[ 5.051463] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5.051516] CR2: 00000000000003b0 CR3: 000000000553e002 CR4: 0000000000772ef0
[ 5.051582] PKRU: 55555554
[ 5.051606] Call Trace:
[ 5.051634] <TASK>
[ 5.051665] ? __pfx_kthread_flush_work_fn+0x10/0x10
[ 5.051726] bpf_scx_unreg+0x27/0x40
[ 5.051773] bpf_struct_ops_map_link_dealloc+0x36/0x50
[ 5.051824] bpf_link_release+0x18/0x20
[ 5.051863] __fput+0xf8/0x2c0
[ 5.051905] __x64_sys_close+0x3d/0x80
[ 5.051943] do_syscall_64+0xbb/0x1d0
[ 5.051983] entry_SYSCALL_64_after_hwframe+0x77/0x7f
Changing bpf_scx_unreg() as follows fixed the bug for me:
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d963aa5c99e1a..0e52a8dbd593e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5752,11 +5752,17 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
scx_disable(SCX_EXIT_UNREG);
- kthread_flush_work(&sch->disable_work);
- kobject_put(&sch->kobj);
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ kthread_flush_work(&sch->disable_work);
+ kobject_put(&sch->kobj);
+ }
+ rcu_read_unlock();
}
static int bpf_scx_init(struct btf *btf)
Thanks,
-Andrea