Message-ID: <20241024083615.64948-1-arighi@nvidia.com>
Date: Thu, 24 Oct 2024 10:36:15 +0200
From: Andrea Righi <arighi@...dia.com>
To: Tejun Heo <tj@...nel.org>,
David Vernet <void@...ifault.com>
Cc: linux-kernel@...r.kernel.org
Subject: [PATCH] sched_ext: Introduce NUMA awareness to the default idle selection policy

Similarly to commit dfa4ed29b18c ("sched_ext: Introduce LLC awareness to
the default idle selection policy"), extend the built-in idle CPU
selection policy to also prioritize CPUs within the same NUMA node.

With this change applied, the built-in CPU idle selection policy follows
this logic:

 - always prioritize CPUs from fully idle SMT cores,
 - select the same CPU if possible,
 - select a CPU within the same LLC domain,
 - select a CPU within the same NUMA node.

Note that LLC and NUMA awareness optimizations are only applied when
CONFIG_SCHED_MC is enabled.
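To make the resulting search order easier to follow, here is a condensed
sketch of what scx_select_cpu_dfl() ends up doing with this change. It is
an illustration only, not the patched code: the WAKE_SYNC fast path and the
direct reuse of prev_cpu are reduced to a comment, pick_idle_cpu_sketch()
is a made-up name, and scx_domain(), scx_pick_idle_cpu() and the SCX_*
constants are the helpers introduced or used in the diff below.

static s32 pick_idle_cpu_sketch(struct task_struct *p, s32 prev_cpu)
{
	const struct cpumask *llc_cpus = scx_domain(p, prev_cpu, SCX_DOM_LLC);
	const struct cpumask *numa_cpus = scx_domain(p, prev_cpu, SCX_DOM_NUMA);
	s32 cpu;

	/* (the real code also tries prev_cpu itself first at each step) */

	/* 1) fully idle SMT cores: same LLC, then same NUMA node, then any */
	if (sched_smt_active()) {
		if (llc_cpus && (cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE)) >= 0)
			return cpu;
		if (numa_cpus && (cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE)) >= 0)
			return cpu;
		if ((cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE)) >= 0)
			return cpu;
	}

	/* 2) any idle CPU: same LLC, then same NUMA node, then any */
	if (llc_cpus && (cpu = scx_pick_idle_cpu(llc_cpus, 0)) >= 0)
		return cpu;
	if (numa_cpus && (cpu = scx_pick_idle_cpu(numa_cpus, 0)) >= 0)
		return cpu;

	return scx_pick_idle_cpu(p->cpus_ptr, 0);
}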
Signed-off-by: Andrea Righi <arighi@...dia.com>
---
kernel/sched/ext.c | 97 ++++++++++++++++++++++++++++++++++++++++------
1 file changed, 85 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d7ae816db6f2..cdc6094893db 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3124,31 +3124,85 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
goto retry;
}
+/*
+ * Scheduling domain types used for idle CPU selection.
+ */
+enum scx_domain_type {
+ SCX_DOM_LLC, /* Use the same last-level cache (LLC) */
+ SCX_DOM_NUMA, /* Use the same NUMA node */
+};
+
#ifdef CONFIG_SCHED_MC
/*
- * Return the cpumask of CPUs usable by task @p in the same LLC domain of @cpu,
- * or NULL if the LLC domain cannot be determined.
+ * Return the cpumask of CPUs usable by task @p in the same domain of @cpu, or
+ * NULL if the domain cannot be determined.
*/
-static const struct cpumask *llc_domain(const struct task_struct *p, s32 cpu)
+static const struct cpumask *
+scx_domain(const struct task_struct *p, s32 cpu, enum scx_domain_type type)
{
- struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu));
- const struct cpumask *llc_cpus = sd ? sched_domain_span(sd) : NULL;
+ struct sched_domain *sd;
/*
- * Return the LLC domain only if the task is allowed to run on all
- * CPUs.
- */
- return p->nr_cpus_allowed == nr_cpu_ids ? llc_cpus : NULL;
+ * Determine the scheduling domain only if the task is allowed to run
+ * on all CPUs.
+ *
+ * This is done primarily for efficiency, as it avoids the overhead of
+ * updating a cpumask every time we need to select an idle CPU (which
+ * can be costly in large SMP systems), but it also aligns logically:
+ * if a task's scheduling domain is restricted by user-space (through
+ * CPU affinity), the task will simply use the flat scheduling domain
+ * defined by user-space.
+ */
+ if (p->nr_cpus_allowed < nr_cpu_ids)
+ return NULL;
+
+ switch (type) {
+ case SCX_DOM_LLC:
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ break;
+ case SCX_DOM_NUMA:
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ sd = NULL;
+ }
+ if (!sd)
+ return NULL;
+
+ return sched_domain_span(sd);
}
#else /* CONFIG_SCHED_MC */
-static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu)
+static const struct cpumask *
+scx_domain(const struct task_struct *p, s32 cpu, enum scx_domain_type type)
{
return NULL;
}
#endif /* CONFIG_SCHED_MC */
/*
- * Built-in cpu idle selection policy.
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ * - always prioritize CPUs from fully idle cores (both logical CPUs are
+ * idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same LLC
+ * to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA Node:
+ * - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * In most architectures the NUMA domain and LLC domain overlap. In this case,
+ * making an additional attempt to find an idle CPU within the same domain
+ * might seem redundant, but the overhead is minimal and it can be beneficial,
+ * as it increases the chance of selecting a close CPU that may have just
+ * become idle.
*
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
@@ -3156,7 +3210,8 @@ static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu)
static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *found)
{
- const struct cpumask *llc_cpus = llc_domain(p, prev_cpu);
+ const struct cpumask *llc_cpus = scx_domain(p, prev_cpu, SCX_DOM_LLC);
+ const struct cpumask *numa_cpus = scx_domain(p, prev_cpu, SCX_DOM_NUMA);
s32 cpu;
*found = false;
@@ -3226,6 +3281,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
goto cpu_found;
}
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
/*
* Search for any full idle core usable by the task.
*/
@@ -3251,6 +3315,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
goto cpu_found;
}
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
/*
* Search for any idle CPU usable by the task.
*/
--
2.47.0
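As a usage note (not part of the patch): a BPF scheduler that wants the
built-in policy described above simply defers to it from ops.select_cpu(),
along the lines of the in-tree scx_simple example. example_select_cpu is a
hypothetical name here, and BPF_STRUCT_OPS() plus the scx_bpf_* kfuncs come
from the scx common headers in tools/sched_ext.

s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	/*
	 * Let the kernel's default idle selection policy (the one extended
	 * by this patch) pick a CPU; is_idle reports whether the returned
	 * CPU was idle and has been claimed for this task.
	 */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		/* Idle CPU found: dispatch directly to its local DSQ. */
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}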