[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250321221454.298202-4-arighi@nvidia.com>
Date: Fri, 21 Mar 2025 23:10:49 +0100
From: Andrea Righi <arighi@...dia.com>
To: Tejun Heo <tj@...nel.org>,
David Vernet <void@...ifault.com>,
Changwoo Min <changwoo@...lia.com>
Cc: Joel Fernandes <joelagnelf@...dia.com>,
linux-kernel@...r.kernel.org
Subject: [PATCH 3/6] sched_ext: idle: Accept an arbitrary cpumask in scx_select_cpu_dfl()
Many scx schedulers implement their own hard or soft-affinity rules
to support topology characteristics, such as heterogeneous architectures
(e.g., big.LITTLE, P-cores/E-cores), or to categorize tasks based on
specific properties (e.g., running certain tasks only in a subset of
CPUs).
Currently, there is no mechanism that allows to use the built-in idle
CPU selection policy to an arbitrary subset of CPUs. As a result,
schedulers often implement their own idle CPU selection policies, which
are typically similar to one another, leading to a lot of code
duplication.
To address this, modify scx_select_cpu_dfl() to accept an arbitrary
cpumask, that can be used by the BPF schedulers to apply the existent
built-in idle CPU selection policy to a subset of allowed CPUs.
With this concept the idle CPU selection policy becomes the following:
- always prioritize CPUs from fully idle SMT cores (if SMT is enabled),
- select the same CPU if it's idle and in the allowed CPUs,
- select an idle CPU within the same LLC, if the LLC cpumask is a
subset of the allowed CPUs,
- select an idle CPU within the same node, if the node cpumask is a
subset of the allowed CPUs,
- select an idle CPU within the allowed CPUs.
This functionality will be exposed through a dedicated kfunc in a
separate patch.
Signed-off-by: Andrea Righi <arighi@...dia.com>
---
kernel/sched/ext_idle.c | 68 +++++++++++++++++++++++++++++++++--------
1 file changed, 56 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 2dcd758681170..faed4f89f95e9 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -49,6 +49,7 @@ static struct scx_idle_cpus **scx_idle_node_masks;
/*
* Local per-CPU cpumasks (used to generate temporary idle cpumasks).
*/
+static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
@@ -417,13 +418,15 @@ static inline bool task_affinity_all(const struct task_struct *p)
* branch prediction optimizations.
*
* 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
+ * - if the above conditions aren't met, pick a CPU that shares the same
+ * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
+ * cache locality.
*
* 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
+ * - choose a CPU from the same NUMA node, if the node cpumask is a
+ * subset of @cpus_allowed, to reduce memory access latency.
*
- * 5. Pick any idle CPU usable by the task.
+ * 5. Pick any idle CPU within the @cpus_allowed domain.
*
* Step 3 and 4 are performed only if the system has, respectively,
* multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
@@ -442,9 +445,43 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags)
{
const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
- int node = scx_cpu_node_if_enabled(prev_cpu);
+ const struct cpumask *allowed = p->cpus_ptr;
+ int node;
s32 cpu;
+ preempt_disable();
+
+ /*
+ * Determine the subset of CPUs usable by @p within @cpus_allowed.
+ */
+ if (cpus_allowed != p->cpus_ptr) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask);
+
+ if (task_affinity_all(p)) {
+ allowed = cpus_allowed;
+ } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) {
+ allowed = local_cpus;
+ } else {
+ cpu = -EBUSY;
+ goto out_enable;
+ }
+ }
+
+ /*
+ * If @prev_cpu is not in the allowed domain, try to assign a new
+ * arbitrary CPU usable by the task in the allowed domain.
+ */
+ if (!cpumask_test_cpu(prev_cpu, allowed)) {
+ cpu = cpumask_any_and_distribute(p->cpus_ptr, allowed);
+ if (cpu < nr_cpu_ids) {
+ prev_cpu = cpu;
+ } else {
+ cpu = -EBUSY;
+ goto out_enable;
+ }
+ }
+ node = scx_cpu_node_if_enabled(prev_cpu);
+
/*
* This is necessary to protect llc_cpus.
*/
@@ -453,14 +490,17 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
/*
* Determine the subset of CPUs that the task can use in its
* current LLC and node.
+ *
+ * If the task can run on all CPUs, use the node and LLC cpumasks
+ * directly.
*/
if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask);
const struct cpumask *cpus = numa_span(prev_cpu);
- if (cpus_allowed == p->cpus_ptr && task_affinity_all(p))
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
numa_cpus = cpus;
- else if (cpus && cpumask_and(local_cpus, cpus_allowed, cpus))
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
numa_cpus = local_cpus;
}
@@ -468,9 +508,9 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask);
const struct cpumask *cpus = llc_span(prev_cpu);
- if (cpus_allowed == p->cpus_ptr && task_affinity_all(p))
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
llc_cpus = cpus;
- else if (cpus && cpumask_and(local_cpus, cpus_allowed, cpus))
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
llc_cpus = local_cpus;
}
@@ -509,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
!cpumask_empty(idle_cpumask(waker_node)->cpu)) {
- if (cpumask_test_cpu(cpu, cpus_allowed))
+ if (cpumask_test_cpu(cpu, allowed))
goto out_unlock;
}
}
@@ -554,7 +594,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
* begin in prev_cpu's node and proceed to other nodes in
* order of increasing distance.
*/
- cpu = scx_pick_idle_cpu(cpus_allowed, node, flags | SCX_PICK_IDLE_CORE);
+ cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto out_unlock;
@@ -602,12 +642,14 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
* in prev_cpu's node and proceed to other nodes in order of
* increasing distance.
*/
- cpu = scx_pick_idle_cpu(cpus_allowed, node, flags);
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
if (cpu >= 0)
goto out_unlock;
out_unlock:
rcu_read_unlock();
+out_enable:
+ preempt_enable();
return cpu;
}
@@ -639,6 +681,8 @@ void scx_idle_init_masks(void)
/* Allocate local per-cpu idle cpumasks */
for_each_possible_cpu(i) {
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
GFP_KERNEL, cpu_to_node(i)));
BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
--
2.48.1
Powered by blists - more mailing lists