[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250306182544.128649-3-arighi@nvidia.com>
Date: Thu, 6 Mar 2025 19:18:05 +0100
From: Andrea Righi <arighi@...dia.com>
To: Tejun Heo <tj@...nel.org>,
David Vernet <void@...ifault.com>,
Changwoo Min <changwoo@...lia.com>
Cc: bpf@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: [PATCH 2/4] sched_ext: idle: Introduce the concept of preferred CPUs
Many scx schedulers define their own concept of scheduling domains to
represent topology characteristics, such as heterogeneous architectures
(e.g., big.LITTLE, P-cores/E-cores), or to categorize tasks based on
specific properties (e.g., setting the soft-affinity of certain tasks to
a subset of CPUs).
Currently, there is no mechanism to share these domains with the
built-in idle CPU selection policy. As a result, schedulers often
implement their own idle CPU selection policies, which are typically
similar to one another, leading to a lot of code duplication.
To address this, introduce the concept of preferred domain (represented
as a cpumask) that can be used by the BPF schedulers to apply the
built-in idle CPU selection policy to a subset of preferred CPUs.
With this concept the idle CPU selection policy becomes the following:
- always prioritize CPUs from fully idle SMT cores (if SMT is enabled),
- select the same CPU if it's idle and in the preferred domain,
- select an idle CPU within the same LLC domain, if the LLC domain is a
subset of the preferred domain,
- select an idle CPU within the same node, if the node domain is a
subset of the preferred domain,
- select an idle CPU within the preferred domain,
- select any idle CPU usable by the task.
Moreover, introduce the new idle flag %SCX_PICK_IDLE_IN_PREF, that
enforces strict selection within the preferred domain. Without this
flag, the preferred domain is treated as a soft constraint: idle CPUs
outside the preferred domain can be considered if the preferred domain
is fully busy.
If the preferred domain is empty or NULL, the behavior of the built-in
idle CPU selection policy remains unchanged.
This only introduces the core concept of preferred domain. This
functionality will be exposed through a dedicated kfunc in a separate
patch.
Signed-off-by: Andrea Righi <arighi@...dia.com>
---
kernel/sched/ext.c | 3 +-
kernel/sched/ext_idle.c | 142 ++++++++++++++++++++-------
kernel/sched/ext_idle.h | 3 +-
tools/sched_ext/include/scx/compat.h | 1 +
4 files changed, 111 insertions(+), 38 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5cd878bbd0e39..a28ddd7655ba8 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -807,6 +807,7 @@ enum scx_deq_flags {
enum scx_pick_idle_cpu_flags {
SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
+ SCX_PICK_IDLE_IN_PREF = 1LLU << 2, /* pick a CPU in the preferred domain */
};
enum scx_kick_flags {
@@ -3396,7 +3397,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
bool found;
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0, &found);
+ cpu = scx_select_cpu_dfl(p, NULL, prev_cpu, wake_flags, 0, &found);
p->scx.selected_cpu = cpu;
if (found) {
p->scx.slice = SCX_SLICE_DFL;
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 16981456ec1ed..9b002e109404b 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -46,6 +46,11 @@ static struct scx_idle_cpus scx_idle_global_masks;
*/
static struct scx_idle_cpus **scx_idle_node_masks;
+/*
+ * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
+ */
+static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
+
/*
* Return the idle masks associated to a target @node.
*
@@ -403,52 +408,80 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* branch prediction optimizations.
*
* 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
+ * - if the above conditions aren't met, pick a CPU that shares the same
+ * LLC, if the LLC domain is a subset of @preferred_cpus, to maintain
+ * cache locality.
*
* 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
+ * - choose a CPU from the same NUMA node, if the node domain is a subset
+ * of @preferred_cpus, to reduce memory access latency.
+ *
+ * 5. Pick a CPU within @preferred_cpus.
*
- * 5. Pick any idle CPU usable by the task.
+ * 6. Pick any idle CPU usable by the task.
*
* Step 3 and 4 are performed only if the system has, respectively, multiple
* LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
- * scx_selcpu_topo_numa).
+ * scx_selcpu_topo_numa) and their domains don't overlap.
+ *
+ * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always
+ * begin in @prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ *
+ * Return the picked CPU with *@...nd set, indicating whether the picked
+ * CPU is currently idle, or a negative value otherwise.
*
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
*/
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags, bool *found)
+s32 scx_select_cpu_dfl(struct task_struct *p, const struct cpumask *preferred_cpus,
+ s32 prev_cpu, u64 wake_flags, u64 flags, bool *found)
{
const struct cpumask *llc_cpus = NULL;
const struct cpumask *numa_cpus = NULL;
- int node = scx_cpu_node_if_enabled(prev_cpu);
+ int node;
s32 cpu;
*found = false;
+ /*
+ * If @prev_cpu is not in the preferred domain, try to assign a new
+ * arbitrary CPU in the preferred domain.
+ */
+ if (preferred_cpus && !cpumask_test_cpu(prev_cpu, preferred_cpus)) {
+ cpu = cpumask_any_and_distribute(p->cpus_ptr, preferred_cpus);
+ if (cpu < nr_cpu_ids) {
+ prev_cpu = cpu;
+ node = scx_cpu_node_if_enabled(prev_cpu);
+ }
+ } else {
+ node = scx_cpu_node_if_enabled(prev_cpu);
+ }
+
/*
* This is necessary to protect llc_cpus.
*/
rcu_read_lock();
/*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
- *
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
+ * Consider node/LLC scheduling domains only if the preferred
+ * cpumask contains all the CPUs of each particular domain and if
+ * the domains don't overlap.
*/
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
+ const struct cpumask *cpus = numa_span(prev_cpu);
+ const struct cpumask *pref = preferred_cpus ?: p->cpus_ptr;
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
- llc_cpus = llc_span(prev_cpu);
+ if (!cpumask_equal(cpus, pref) && cpumask_subset(cpus, pref))
+ numa_cpus = cpus;
+ }
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ const struct cpumask *cpus = llc_span(prev_cpu);
+ const struct cpumask *pref = preferred_cpus ?: p->cpus_ptr;
+
+ if (!cpumask_equal(cpus, pref) && cpumask_subset(cpus, pref))
+ llc_cpus = cpus;
}
/*
@@ -486,7 +519,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
!cpumask_empty(idle_cpumask(waker_node)->cpu)) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (cpumask_test_cpu(cpu, preferred_cpus ?: p->cpus_ptr))
goto cpu_found;
}
}
@@ -523,6 +556,20 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
goto cpu_found;
}
+ /*
+ * Search for any full-idle core in the preferred domain.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always
+ * begin in prev_cpu's node and proceed to other nodes in
+ * order of increasing distance.
+ */
+ if (preferred_cpus) {
+ cpu = scx_pick_idle_cpu(preferred_cpus, node, flags | SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
/*
* Search for any full-idle core usable by the task.
*
@@ -531,9 +578,11 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* begin in prev_cpu's node and proceed to other nodes in
* order of increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
+ if (!(flags & SCX_PICK_IDLE_IN_PREF)) {
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
/*
* Give up if we're strictly looking for a full-idle SMT
@@ -571,6 +620,20 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
goto cpu_found;
}
+ /*
+ * Search for any idle CPU in the preferred domain.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin
+ * in prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ */
+ if (preferred_cpus) {
+ cpu = scx_pick_idle_cpu(preferred_cpus, node, flags);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
/*
* Search for any idle CPU usable by the task.
*
@@ -579,9 +642,11 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* in prev_cpu's node and proceed to other nodes in order of
* increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
- if (cpu >= 0)
- goto cpu_found;
+ if (!(flags & SCX_PICK_IDLE_IN_PREF)) {
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
cpu = prev_cpu;
goto out_unlock;
@@ -599,7 +664,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
*/
void scx_idle_init_masks(void)
{
- int node;
+ int i;
/* Allocate global idle cpumasks */
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
@@ -610,14 +675,19 @@ void scx_idle_init_masks(void)
sizeof(*scx_idle_node_masks), GFP_KERNEL);
BUG_ON(!scx_idle_node_masks);
- for_each_node(node) {
- scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
- GFP_KERNEL, node);
- BUG_ON(!scx_idle_node_masks[node]);
+ for_each_node(i) {
+ scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, i);
+ BUG_ON(!scx_idle_node_masks[i]);
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i));
}
+
+ /* Allocate local per-cpu idle cpumasks */
+ for_each_possible_cpu(i)
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
}
static void update_builtin_idle(int cpu, bool idle)
@@ -829,7 +899,7 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
goto prev_cpu;
#ifdef CONFIG_SMP
- return scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0, is_idle);
+ return scx_select_cpu_dfl(p, NULL, prev_cpu, wake_flags, 0, is_idle);
#endif
prev_cpu:
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 5c1db6b315f7a..386bde7e8ee3e 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -27,7 +27,8 @@ static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node
}
#endif /* CONFIG_SMP */
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags, bool *found);
+s32 scx_select_cpu_dfl(struct task_struct *p, const struct cpumask *preferred_cpus,
+ s32 prev_cpu, u64 wake_flags, u64 flags, bool *found);
void scx_idle_enable(struct sched_ext_ops *ops);
void scx_idle_disable(void);
int scx_idle_init(void);
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 35c67c5174ac0..f9c06079b3a86 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -120,6 +120,7 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
#define SCX_PICK_IDLE_CORE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_CORE)
#define SCX_PICK_IDLE_IN_NODE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_IN_NODE)
+#define SCX_PICK_IDLE_IN_PREF SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_IN_PREF)
static inline long scx_hotplug_seq(void)
{
--
2.48.1
Powered by blists - more mailing lists