Message-ID: <xhsmh343e43fd.mognet@vschneid-thinkpadt14sgen2i.remote.csb>
Date: Fri, 06 Feb 2026 10:38:46 +0100
From: Valentin Schneider <vschneid@...hat.com>
To: K Prateek Nayak <kprateek.nayak@....com>, Ingo Molnar
<mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>, Juri Lelli
<juri.lelli@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>,
linux-kernel@...r.kernel.org
Cc: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
<mgorman@...e.de>, Chen Yu <yu.c.chen@...el.com>, Shrikanth Hegde
<sshegde@...ux.ibm.com>, "Gautham R. Shenoy" <gautham.shenoy@....com>
Subject: Re: [PATCH v3 3/8] sched/topology: Switch to assigning "sd->shared"
from s_data
On 06/02/26 10:50, K Prateek Nayak wrote:
> Hello Valentin,
>
> On 2/5/2026 10:23 PM, Valentin Schneider wrote:
>> On 20/01/26 11:32, K Prateek Nayak wrote:
>>> @@ -2655,8 +2655,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>>> unsigned int imb_span = 1;
>>>
>>> for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
>>> + struct sched_domain *parent = sd->parent;
>>> struct sched_domain *child = sd->child;
>>>
>>> + /* Attach sd->shared to the topmost SD_SHARE_LLC domain. */
>>> + if ((sd->flags & SD_SHARE_LLC) &&
>>> + (!parent || !(parent->flags & SD_SHARE_LLC))) {
>>> + int llc_id = cpumask_first(sched_domain_span(sd));
>>> +
>>> + sd->shared = *per_cpu_ptr(d.sds, llc_id);
>>> + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
>>> + atomic_inc(&sd->shared->ref);
>>> + }
>>> +
>>
>> We now have two ifs looking for the highest_flag_domain(i, SD_SHARE_LLC),
>> but given this needs to write sd->imb_numa_nr for every SD, I couldn't
>> factorize this into something that looked sane :(
>
> Yeah! The "imb_numa_nr" computation cares about the "sd_llc" *after* we've
> crossed it, while the "sd->shared" assignment cares about when we are *at*
> the sd_llc.
>
> Since we have to assign the "sd->shared" before claim_allocations(),
> I couldn't find a better spot to assign it.
>
> That said, the "imb_numa_nr" calculation can be modified to use the "sd_llc"
> and its "parent". I'll let you be the judge of whether the following is
> better or worse ;-)
>
> (Only build tested)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index ac268da91778..e98bb812de35 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2614,13 +2614,23 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> unsigned int imb_span = 1;
>
> for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
> - struct sched_domain *child = sd->child;
> + struct sched_domain *parent = sd->parent;
>
> - if (!(sd->flags & SD_SHARE_LLC) && child &&
> - (child->flags & SD_SHARE_LLC)) {
> - struct sched_domain __rcu *top_p;
> + /* Topmost SD_SHARE_LLC domain. */
> + if ((sd->flags & SD_SHARE_LLC) &&
> + (!parent || !(parent->flags & SD_SHARE_LLC))) {
> + int sd_id = cpumask_first(sched_domain_span(sd));
> + struct sched_domain *top_p;
> unsigned int nr_llcs;
>
> + sd->shared = *per_cpu_ptr(d.sds, sd_id);
> + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> + atomic_inc(&sd->shared->ref);
> +
> + /* No SD_NUMA domains. */
> + if (!parent)
> + break;
> +
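
Right, and for the ordering constraint: IIUC the ref has to be taken in
this loop because claim_allocations() only preserves the per-CPU storage
whose ->ref has already been bumped -- anything left unreferenced gets
freed by __free_domain_allocs() at the end of build_sched_domains().
Assuming the series keeps the same pattern for d.sds as mainline uses for
sdd->sds, the relevant bit is roughly (upstream, trimmed):

  static void claim_allocations(int cpu, struct sched_domain *sd)
  {
          struct sd_data *sdd = sd->private;
          ...
          /* Only keep the shared storage something took a ref on */
          if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
                  *per_cpu_ptr(sdd->sds, cpu) = NULL;
          ...
  }

so the atomic_inc(&sd->shared->ref) indeed can't be deferred past that
point.
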
AIUI we currently write sd->imb_numa_nr for all SDs, but it's only useful
for the SD_NUMA ones... How about the following, lightly tested:
---
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index afb2c26efb4e5..03db45658f6bd 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2575,6 +2575,51 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
return true;
}
+static inline void adjust_numa_imbalance(struct sched_domain *sd_llc,
+ unsigned int *imb, unsigned int *imb_span)
+{
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 12.5% of the node. This is
+ * arbitrary cutoff based two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
+ */
+ struct sched_domain *top_p;
+ unsigned int nr_llcs;
+
+ WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
+ WARN_ON(!sd_llc->parent);
+
+ nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
+ if (nr_llcs == 1)
+ *imb = sd_llc->parent->span_weight >> 3;
+ else
+ *imb = nr_llcs;
+ *imb = max(1U, *imb);
+ sd_llc->parent->imb_numa_nr = *imb;
+
+ /* Set span based on the first NUMA domain. */
+ top_p = sd_llc->parent->parent;
+ while (top_p && !(top_p->flags & SD_NUMA)) {
+ top_p = top_p->parent;
+ }
+ *imb_span = top_p ? top_p->span_weight : sd_llc->parent->span_weight;
+}
+
/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
@@ -2640,63 +2685,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
unsigned int imb = 0;
unsigned int imb_span = 1;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- struct sched_domain *parent = sd->parent;
-
- /* Topmost SD_SHARE_LLC domain. */
- if ((sd->flags & SD_SHARE_LLC) &&
- (!parent || !(parent->flags & SD_SHARE_LLC))) {
- int sd_id = cpumask_first(sched_domain_span(sd));
- struct sched_domain *top_p;
- unsigned int nr_llcs;
-
- sd->shared = *per_cpu_ptr(d.sds, sd_id);
- atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
- atomic_inc(&sd->shared->ref);
-
- /* No SD_NUMA domains. */
- if (!parent)
- break;
-
- /*
- * For a single LLC per node, allow an
- * imbalance up to 12.5% of the node. This is
- * arbitrary cutoff based two factors -- SMT and
- * memory channels. For SMT-2, the intent is to
- * avoid premature sharing of HT resources but
- * SMT-4 or SMT-8 *may* benefit from a different
- * cutoff. For memory channels, this is a very
- * rough estimate of how many channels may be
- * active and is based on recent CPUs with
- * many cores.
- *
- * For multiple LLCs, allow an imbalance
- * until multiple tasks would share an LLC
- * on one node while LLCs on another node
- * remain idle. This assumes that there are
- * enough logical CPUs per LLC to avoid SMT
- * factors and that there is a correlation
- * between LLCs and memory channels.
- */
- nr_llcs = parent->span_weight / sd->span_weight;
- if (nr_llcs == 1)
- imb = sd->span_weight >> 3;
- else
- imb = nr_llcs;
- imb = max(1U, imb);
- sd->imb_numa_nr = imb;
-
- /* Set span based on the first NUMA domain. */
- top_p = parent;
- while (top_p && !(top_p->flags & SD_NUMA)) {
- top_p = top_p->parent;
- }
- imb_span = top_p ? top_p->span_weight : parent->span_weight;
- } else {
- int factor = max(1U, (sd->span_weight / imb_span));
+ sd = *per_cpu_ptr(d.sd, i);
+ /* First, find the topmost SD_SHARE_LLC domain */
+ while (sd && sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+ sd = sd->parent;
- sd->imb_numa_nr = imb * factor;
- }
+ if (sd->flags & SD_SHARE_LLC) {
+ int sd_id = cpumask_first(sched_domain_span(sd));
+
+ sd->shared = *per_cpu_ptr(d.sds, sd_id);
+ atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+ atomic_inc(&sd->shared->ref);
+ }
+
+ /* Special case the first parent of the topmost SD_SHARE_LLC domain. */
+ if ((sd->flags & SD_SHARE_LLC) && sd->parent) {
+ adjust_numa_imbalance(sd, &imb, &imb_span);
+ sd = sd->parent->parent;
+ }
+
+ /* Update the upper remainder of the topology */
+ while (sd) {
+ int factor = max(1U, (sd->span_weight / imb_span));
+ sd->imb_numa_nr = imb * factor;
+ sd = sd->parent;
}
}
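
As a sanity check on the numbers (hypothetical topology, I haven't run
this on real hardware): take 4 NUMA nodes of 64 CPUs, 4 LLCs of 16 CPUs
per node, a first NUMA level spanning 2 nodes (128 CPUs) and a top NUMA
level spanning all 4 (256 CPUs). AFAICT the above then gives:

  adjust_numa_imbalance() on the topmost 16-CPU LLC domain:
    nr_llcs = 64 / 16 = 4                        -> imb = 4
    node-level domain (64 CPUs):                    imb_numa_nr = 4
    imb_span = 128 (first SD_NUMA domain up the chain)

  remaining while loop:
    NUMA domain, 128 CPUs: factor = max(1, 128 / 128) = 1 -> imb_numa_nr = 4
    NUMA domain, 256 CPUs: factor = max(1, 256 / 128) = 2 -> imb_numa_nr = 8

which matches what the current per-SD loop ends up with for the SD_NUMA
domains on that topology, so the restructuring shouldn't change the
resulting imbalance values where they matter.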