Message-ID: <Z60p755gE1aDiimC@slm.duckdns.org>
Date: Wed, 12 Feb 2025 13:08:31 -1000
From: Tejun Heo <tj@...nel.org>
To: Peter Zijlstra <peterz@...radead.org>,
David Vernet <void@...ifault.com>, Andrea Righi <arighi@...dia.com>,
Changwoo Min <changwoo@...lia.com>
Cc: Neel Natu <neelnatu@...gle.com>, Barret Rhoden <brho@...gle.com>,
linux-kernel@...r.kernel.org, kernel-team@...a.com,
sched-ext@...a.com
Subject: [PATCH sched_ext/for-6.15] sched_ext: Implement
SCX_OPS_ALLOW_QUEUED_WAKEUP
A task wakeup can either be processed on the waker's CPU or bounced to the
wakee's previous CPU using an IPI (ttwu_queue). Bouncing to the wakee's CPU
avoids the waker's CPU locking and accessing the wakee's rq, which can be
expensive across cache and node boundaries.
When the ttwu_queue path is taken, select_task_rq() and thus ops.select_cpu()
are skipped. This confused some BPF schedulers: there was no good way for a
BPF scheduler to tell whether idle CPU selection had been skipped, and
ops.enqueue() couldn't insert tasks into foreign local DSQs. As the
performance difference on machines with simple topologies was also minimal,
sched_ext disabled ttwu_queue.
However, this optimization makes a noticeable difference on more complex
topologies, and a BPF scheduler now has an easy way to tell whether
ops.select_cpu() was skipped since 9b671793c7d9 ("sched_ext, scx_qmap: Add
and use SCX_ENQ_CPU_SELECTED") and can insert tasks into foreign local DSQs
since 5b26f7b920f7 ("sched_ext: Allow SCX_DSQ_LOCAL_ON for direct
dispatches").
Implement SCX_OPS_ALLOW_QUEUED_WAKEUP, which allows a BPF scheduler to opt
into the ttwu_queue optimization.
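For illustration, a BPF scheduler would opt in by setting the flag in its
sched_ext_ops definition, e.g. (a sketch; the qwake_* names are hypothetical
and other callbacks are omitted):

	SEC(".struct_ops.link")
	struct sched_ext_ops qwake_ops = {
		.enqueue	= (void *)qwake_enqueue,
		/* allow the ttwu_queue wakeup path to skip ops.select_cpu() */
		.flags		= SCX_OPS_ALLOW_QUEUED_WAKEUP,
		.name		= "qwake",
	};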
Signed-off-by: Tejun Heo <tj@...nel.org>
Reported-by: Neel Natu <neelnatu@...gle.com>
Reported-by: Barret Rhoden <brho@...gle.com>
Cc: Peter Zijlstra (Intel) <peterz@...radead.org>
---
kernel/sched/core.c | 9 ++-------
kernel/sched/ext.c | 30 ++++++++++++++++++++++++------
kernel/sched/ext.h | 10 ++++++++++
3 files changed, 36 insertions(+), 13 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3921,13 +3921,8 @@ bool cpus_share_resources(int this_cpu,
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
- /*
- * The BPF scheduler may depend on select_task_rq() being invoked during
- * wakeups. In addition, @p may end up executing on a different CPU
- * regardless of what happens in the wakeup path making the ttwu_queue
- * optimization less meaningful. Skip if on SCX.
- */
- if (task_on_scx(p))
+ /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
+ if (!scx_allow_ttwu_queue(p))
return false;
/*
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -96,7 +96,7 @@ enum scx_ops_flags {
/*
* Keep built-in idle tracking even if ops.update_idle() is implemented.
*/
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
/*
* By default, if there are no other task to run on the CPU, ext core
@@ -104,7 +104,7 @@ enum scx_ops_flags {
* flag is specified, such tasks are passed to ops.enqueue() with
* %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
*/
- SCX_OPS_ENQ_LAST = 1LLU << 1,
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
/*
* An exiting task may schedule after PF_EXITING is set. In such cases,
@@ -117,13 +117,13 @@ enum scx_ops_flags {
* depend on pid lookups and wants to handle these tasks directly, the
* following flag can be used.
*/
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
/*
* If set, only tasks with policy set to SCHED_EXT are attached to
* sched_ext. If clear, SCHED_NORMAL tasks are also included.
*/
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
/*
* A migration disabled task can only execute on its current CPU. By
@@ -136,7 +136,21 @@ enum scx_ops_flags {
* current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
* and thus may disagree with cpumask_weight(p->cpus_ptr).
*/
- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is an optimization during wakeups which
+ * bypasses ops.select_cpu() and invokes ops.enqueue() on the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. As the BPF scheduler may depend on ops.select_cpu() being
+ * invoked during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
/*
* CPU cgroup support flags
@@ -147,6 +161,7 @@ enum scx_ops_flags {
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
SCX_OPS_SWITCH_PARTIAL |
SCX_OPS_HAS_CGROUP_WEIGHT,
};
@@ -897,6 +912,7 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_a
static struct sched_ext_ops scx_ops;
static bool scx_warned_zero_slice;
+DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
@@ -4717,6 +4733,7 @@ static void scx_ops_disable_workfn(struc
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
static_branch_disable(&scx_has_op[i]);
+ static_branch_disable(&scx_ops_allow_queued_wakeup);
static_branch_disable(&scx_ops_enq_last);
static_branch_disable(&scx_ops_enq_exiting);
static_branch_disable(&scx_ops_enq_migration_disabled);
@@ -5348,9 +5365,10 @@ static int scx_ops_enable(struct sched_e
if (((void (**)(void))ops)[i])
static_branch_enable(&scx_has_op[i]);
+ if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ static_branch_enable(&scx_ops_allow_queued_wakeup);
if (ops->flags & SCX_OPS_ENQ_LAST)
static_branch_enable(&scx_ops_enq_last);
-
if (ops->flags & SCX_OPS_ENQ_EXITING)
static_branch_enable(&scx_ops_enq_exiting);
if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,6 +8,8 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
+DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -34,6 +36,13 @@ static inline bool task_on_scx(const str
return scx_enabled() && p->sched_class == &ext_sched_class;
}
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+ return !scx_enabled() ||
+ static_branch_likely(&scx_ops_allow_queued_wakeup) ||
+ p->sched_class != &ext_sched_class;
+}
+
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
@@ -52,6 +61,7 @@ static inline void scx_rq_activate(struc
static inline void scx_rq_deactivate(struct rq *rq) {}
static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
static inline bool task_on_scx(const struct task_struct *p) { return false; }
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; }
static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */