Message-ID: <aOAvosirYCLYdPrp@gpd4>
Date: Fri, 3 Oct 2025 22:18:42 +0200
From: Andrea Righi <arighi@...dia.com>
To: Ryan Newton <rrnewton@...il.com>
Cc: linux-kernel@...r.kernel.org, tj@...nel.org, newton@...a.com
Subject: Re: [PATCH v2 1/3] sched_ext: Add lockless peek operation for DSQs
Hi Ryan,
On Fri, Oct 03, 2025 at 03:54:06PM -0400, Ryan Newton wrote:
> From: Ryan Newton <newton@...a.com>
>
> The builtin DSQ data structures are meant to be used by a wide range
> of sched_ext schedulers with different demands: they might be
> low-contention per-CPU queues, or highly contended shared queues.
> Unfortunately, DSQs have a coarse-grained lock around the whole data
> structure. Without going all the way to a lock-free, more scalable
> implementation, a small step we can take to reduce lock contention is
> to allow a lockless, small-fixed-cost peek at the head of the queue.
>
> This change allows certain custom SCX schedulers to cheaply peek at
> queues, e.g. during load balancing, before locking them. The cost is a
> few extra memory operations to update the pointer each time the DSQ is
> modified, including a memory barrier on ARM so the write appears
> correctly ordered.
>
> This commit adds a first_task pointer field which is updated
> atomically when the DSQ is modified, and allows any thread to peek at
> the head of the queue without holding the lock.
>
> Signed-off-by: Ryan Newton <newton@...a.com>
> ---
> include/linux/sched/ext.h | 1 +
> kernel/sched/ext.c | 59 ++++++++++++++++++++++++
> tools/sched_ext/include/scx/common.bpf.h | 1 +
> tools/sched_ext/include/scx/compat.bpf.h | 19 ++++++++
> 4 files changed, 80 insertions(+)
>
> diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
> index d82b7a9b0658..81478d4ae782 100644
> --- a/include/linux/sched/ext.h
> +++ b/include/linux/sched/ext.h
> @@ -58,6 +58,7 @@ enum scx_dsq_id_flags {
> */
> struct scx_dispatch_q {
> raw_spinlock_t lock;
> + struct task_struct __rcu *first_task; /* lockless peek at head */
> struct list_head list; /* tasks in dispatch order */
> struct rb_root priq; /* used to order by p->scx.dsq_vtime */
> u32 nr;
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 2b0e88206d07..ea0fe4019eca 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -885,6 +885,27 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
> __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
> }
>
> +/* Called while holding dsq->lock; does nothing on builtin DSQs. */
> +static inline void dsq_set_first_task(struct scx_dispatch_q *dsq,
> + struct task_struct *p)
> +{
> + if (dsq->id & SCX_DSQ_FLAG_BUILTIN)
> + return;
> + rcu_assign_pointer(dsq->first_task, p);
Do we need this helper? Maybe we can just do this in the caller:
if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
rcu_assign_pointer(dsq->first_task, p);
> +}
> +
> +/* Called while holding dsq->lock; does nothing on builtin DSQs. */
> +static void dsq_update_first_task(struct scx_dispatch_q *dsq)
> +{
> + struct task_struct *first_task;
> +
> + if ((dsq->id & SCX_DSQ_FLAG_BUILTIN))
nit: remove the extra ().
> + return;
> +
> + first_task = nldsq_next_task(dsq, NULL, false);
> + rcu_assign_pointer(dsq->first_task, first_task);
> +}
> +
> static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
> struct task_struct *p, u64 enq_flags)
> {
> @@ -959,6 +980,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
> list_add_tail(&p->scx.dsq_list.node, &dsq->list);
> }
>
> + /* even the add_tail code path may have changed the first element */
> + dsq_update_first_task(dsq);
> +
> /* seq records the order tasks are queued, used by BPF DSQ iterator */
> dsq->seq++;
> p->scx.dsq_seq = dsq->seq;
> @@ -1013,6 +1037,7 @@ static void task_unlink_from_dsq(struct task_struct *p,
>
> list_del_init(&p->scx.dsq_list.node);
> dsq_mod_nr(dsq, -1);
> + dsq_update_first_task(dsq);
Same here:
if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
dsq_update_first_task(dsq);
And remove the check inside dsq_update_first_task() - feel free to ignore
this, it's just a style nit.
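IOW, dsq_update_first_task() would reduce to something like this
(untested):

static void dsq_update_first_task(struct scx_dispatch_q *dsq)
{
        struct task_struct *first_task;

        first_task = nldsq_next_task(dsq, NULL, false);
        rcu_assign_pointer(dsq->first_task, first_task);
}

with both callers doing the SCX_DSQ_FLAG_BUILTIN check themselves.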
> }
>
> static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
> @@ -6084,6 +6109,39 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
> kit->dsq = NULL;
> }
>
> +/**
> + * scx_bpf_dsq_peek - Lockless peek at the first element.
> + * @dsq_id: DSQ to examine.
> + *
> + * Read the first element in the DSQ. This is semantically equivalent to using
> + * the DSQ iterator, but is lock-free.
> + *
> + * Returns the first task, or NULL if the queue is empty or an internal
> + * error occurred.
> + */
> +__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
> +{
> + struct scx_sched *sch;
> + struct scx_dispatch_q *dsq;
> +
> + sch = rcu_dereference(scx_root);
> + /* Rather than return ERR_PTR(-ENODEV), we follow the precedent of
> + * other functions and let the peek fail but we continue on.
> + */
Well, if sch == NULL the root scheduler doesn't exist, so it's not too
unreasonable to return NULL (meaning that the DSQ is empty).
What I'm saying is that we should either return ENODEV (but then we need
to force the caller to handle that) or maybe just drop this comment;
otherwise I think it's a bit confusing to mention ENODEV without
returning it.
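IOW, if we stick with returning NULL, just:

        sch = rcu_dereference(scx_root);
        if (unlikely(!sch))
                return NULL;

without the comment.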
> + if (unlikely(!sch))
> + return NULL;
> +
> + dsq = find_user_dsq(sch, dsq_id);
> + if (unlikely(!dsq)) {
> + scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
> + return NULL;
> + }
> + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN))) {
> + scx_error(NULL, "peek disallowed on builtin DSQ 0x%llx", dsq_id);
This should be sch instead of NULL, right?
Moreover, I think we can move this check before find_user_dsq(), so we
avoid a find_user_dsq() call that is bound to fail.
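IOW, something like this (untested):

        if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) {
                scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id);
                return NULL;
        }

        dsq = find_user_dsq(sch, dsq_id);
        if (unlikely(!dsq)) {
                scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
                return NULL;
        }

Note that the check can use dsq_id directly, since the builtin flag is
encoded in the DSQ id itself.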
> + return NULL;
> + }
> + return rcu_dereference(dsq->first_task);
> +}
> +
> __bpf_kfunc_end_defs();
>
> static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
> @@ -6641,6 +6699,7 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
> BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
> BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
> BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
> +BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
> BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
> BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
> BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
> diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
> index 06e2551033cb..fbf3e7f9526c 100644
> --- a/tools/sched_ext/include/scx/common.bpf.h
> +++ b/tools/sched_ext/include/scx/common.bpf.h
> @@ -75,6 +75,7 @@ u32 scx_bpf_reenqueue_local(void) __ksym;
> void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
> s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
> void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
> +struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
> int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
> struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
> void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
> diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
> index dd9144624dc9..97b10c184b2c 100644
> --- a/tools/sched_ext/include/scx/compat.bpf.h
> +++ b/tools/sched_ext/include/scx/compat.bpf.h
> @@ -130,6 +130,25 @@ int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym
> false; \
> })
>
> +
> +/*
> + * v6.19: Introduce lockless peek API for user DSQs.
> + *
> + * Preserve the following helper until v6.21.
> + */
> +static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
> +{
> + struct task_struct *p = NULL;
> + struct bpf_iter_scx_dsq it;
> +
> + if (bpf_ksym_exists(scx_bpf_dsq_peek))
> + return scx_bpf_dsq_peek(dsq_id);
> + if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0))
> + p = bpf_iter_scx_dsq_next(&it);
> + bpf_iter_scx_dsq_destroy(&it);
> + return p;
> +}
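A bit unrelated, but it may be worth adding a usage example to the docs
or selftests at some point. What I'd expect is something like this in a
dispatch path (SHARED_DSQ and the ops name are made up, untested):

        void BPF_STRUCT_OPS(foo_dispatch, s32 cpu, struct task_struct *prev)
        {
                /* Lockless check, so we skip the DSQ lock entirely when empty */
                if (!__COMPAT_scx_bpf_dsq_peek(SHARED_DSQ))
                        return;

                scx_bpf_dsq_move_to_local(SHARED_DSQ);
        }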
> +
> /**
> * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
> * in a compatible way. We will preserve this __COMPAT helper until v6.16.
> --
> 2.51.0
>
Thanks,
-Andrea