[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251003195408.675527-2-rrnewton@gmail.com>
Date: Fri, 3 Oct 2025 15:54:06 -0400
From: Ryan Newton <rrnewton@...il.com>
To: linux-kernel@...r.kernel.org
Cc: tj@...nel.org,
arighi@...dia.com,
rrnewton@...il.com,
newton@...a.com
Subject: [PATCH v2 1/3] sched_ext: Add lockless peek operation for DSQs
From: Ryan Newton <newton@...a.com>
The builtin DSQ queue data structures are meant to be used by a wide
range of different sched_ext schedulers with different demands on these
data structures. They might be per-cpu with low-contention, or
high-contention shared queues. Unfortunately, DSQs have a coarse-grained
lock around the whole data structure. Without going all the way to a
lock-free, more scalable implementation, a small step we can take to
reduce lock contention is to allow a lockless, small-fixed-cost peek at
the head of the queue.
This change allows certain custom SCX schedulers to cheaply peek at
queues, e.g. during load balancing, before locking them. But it
represents a few extra memory operations to update the pointer each
time the DSQ is modified, including a memory barrier on ARM so the write
appears correctly ordered.
This commit adds a first_task pointer field which is updated
atomically when the DSQ is modified, and allows any thread to peek at
the head of the queue without holding the lock.
Signed-off-by: Ryan Newton <newton@...a.com>
---
include/linux/sched/ext.h | 1 +
kernel/sched/ext.c | 59 ++++++++++++++++++++++++
tools/sched_ext/include/scx/common.bpf.h | 1 +
tools/sched_ext/include/scx/compat.bpf.h | 19 ++++++++
4 files changed, 80 insertions(+)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index d82b7a9b0658..81478d4ae782 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -58,6 +58,7 @@ enum scx_dsq_id_flags {
*/
struct scx_dispatch_q {
raw_spinlock_t lock;
+ struct task_struct __rcu *first_task; /* lockless peek at head */
struct list_head list; /* tasks in dispatch order */
struct rb_root priq; /* used to order by p->scx.dsq_vtime */
u32 nr;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2b0e88206d07..ea0fe4019eca 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -885,6 +885,27 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
+/* while holding dsq->lock and does nothing on builtin DSQs */
+static inline void dsq_set_first_task(struct scx_dispatch_q *dsq,
+ struct task_struct *p)
+{
+ if (dsq->id & SCX_DSQ_FLAG_BUILTIN)
+ return;
+ rcu_assign_pointer(dsq->first_task, p);
+}
+
+/* while holding dsq->lock and does nothing on builtin DSQs */
+static void dsq_update_first_task(struct scx_dispatch_q *dsq)
+{
+ struct task_struct *first_task;
+
+ if ((dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ return;
+
+ first_task = nldsq_next_task(dsq, NULL, false);
+ rcu_assign_pointer(dsq->first_task, first_task);
+}
+
static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
struct task_struct *p, u64 enq_flags)
{
@@ -959,6 +980,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
list_add_tail(&p->scx.dsq_list.node, &dsq->list);
}
+ /* even the add_tail code path may have changed the first element */
+ dsq_update_first_task(dsq);
+
/* seq records the order tasks are queued, used by BPF DSQ iterator */
dsq->seq++;
p->scx.dsq_seq = dsq->seq;
@@ -1013,6 +1037,7 @@ static void task_unlink_from_dsq(struct task_struct *p,
list_del_init(&p->scx.dsq_list.node);
dsq_mod_nr(dsq, -1);
+ dsq_update_first_task(dsq);
}
static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
@@ -6084,6 +6109,39 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
kit->dsq = NULL;
}
+/**
+ * scx_bpf_dsq_peek - Lockless peek at the first element.
+ * @dsq_id: DSQ to examine.
+ *
+ * Read the first element in the DSQ. This is semantically equivalent to using
+ * the DSQ iterator, but is lockfree.
+ *
+ * Returns the pointer, or NULL indicates an empty queue OR internal error.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
+{
+ struct scx_sched *sch;
+ struct scx_dispatch_q *dsq;
+
+ sch = rcu_dereference(scx_root);
+ /* Rather than return ERR_PTR(-ENODEV), we follow the precedent of
+ * other functions and let the peek fail but we continue on.
+ */
+ if (unlikely(!sch))
+ return NULL;
+
+ dsq = find_user_dsq(sch, dsq_id);
+ if (unlikely(!dsq)) {
+ scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
+ return NULL;
+ }
+ if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN))) {
+ scx_error(NULL, "peek disallowed on builtin DSQ 0x%llx", dsq_id);
+ return NULL;
+ }
+ return rcu_dereference(dsq->first_task);
+}
+
__bpf_kfunc_end_defs();
static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
@@ -6641,6 +6699,7 @@ BTF_KFUNCS_START(scx_kfunc_ids_any)
BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 06e2551033cb..fbf3e7f9526c 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -75,6 +75,7 @@ u32 scx_bpf_reenqueue_local(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index dd9144624dc9..97b10c184b2c 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -130,6 +130,25 @@ int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym
false; \
})
+
+/*
+ * v6.19: Introduce lockless peek API for user DSQs.
+ *
+ * Preserve the following macro until v6.21.
+ */
+static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
+{
+ struct task_struct *p = NULL;
+ struct bpf_iter_scx_dsq it;
+
+ if (bpf_ksym_exists(scx_bpf_dsq_peek))
+ return scx_bpf_dsq_peek(dsq_id);
+ if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0))
+ p = bpf_iter_scx_dsq_next(&it);
+ bpf_iter_scx_dsq_destroy(&it);
+ return p;
+}
+
/**
* __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
* in a compatible way. We will preserve this __COMPAT helper until v6.16.
--
2.51.0
Powered by blists - more mailing lists