Message-ID: <xhsmh343e43fd.mognet@vschneid-thinkpadt14sgen2i.remote.csb>
Date: Fri, 06 Feb 2026 10:38:46 +0100
From: Valentin Schneider <vschneid@...hat.com>
To: K Prateek Nayak <kprateek.nayak@....com>, Ingo Molnar
 <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>, Juri Lelli
 <juri.lelli@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>,
 linux-kernel@...r.kernel.org
Cc: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
 <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
 <mgorman@...e.de>, Chen Yu <yu.c.chen@...el.com>, Shrikanth Hegde
 <sshegde@...ux.ibm.com>, "Gautham R. Shenoy" <gautham.shenoy@....com>
Subject: Re: [PATCH v3 3/8] sched/topology: Switch to assigning "sd->shared"
 from s_data

On 06/02/26 10:50, K Prateek Nayak wrote:
> Hello Valentin,
>
> On 2/5/2026 10:23 PM, Valentin Schneider wrote:
>> On 20/01/26 11:32, K Prateek Nayak wrote:
>>> @@ -2655,8 +2655,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>>>               unsigned int imb_span = 1;
>>>
>>>               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
>>> +			struct sched_domain *parent = sd->parent;
>>>                       struct sched_domain *child = sd->child;
>>>
>>> +			/* Attach sd->shared to the topmost SD_SHARE_LLC domain. */
>>> +			if ((sd->flags & SD_SHARE_LLC) &&
>>> +			    (!parent || !(parent->flags & SD_SHARE_LLC))) {
>>> +				int llc_id = cpumask_first(sched_domain_span(sd));
>>> +
>>> +				sd->shared = *per_cpu_ptr(d.sds, llc_id);
>>> +				atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
>>> +				atomic_inc(&sd->shared->ref);
>>> +			}
>>> +
>>
>> We now have two ifs looking for the highest_flag_domain(i, SD_SHARE_LLC),
>> but given this needs to write sd->imb_numa_nr for every SD, I couldn't
>> factorize it into something that looked sane :(
>
> Yeah! The "imb_numa_nr" calculation cares about the "sd_llc" *after* we've
> crossed it, whereas the "sd->shared" assignment happens while we are *at*
> the sd_llc.
>
> Since we have to assign the "sd->shared" before claim_allocations(),
> I couldn't find a better spot to assign it.
>
> That said, the "imb_numa_nr" calculation can be modified to use the "sd_llc"
> and its "parent". I'll let you be the judge of whether the following is
> better or worse ;-)
>
>   (Only build tested)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index ac268da91778..e98bb812de35 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2614,13 +2614,23 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>               unsigned int imb_span = 1;
>
>               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
> -			struct sched_domain *child = sd->child;
> +			struct sched_domain *parent = sd->parent;
>
> -			if (!(sd->flags & SD_SHARE_LLC) && child &&
> -			    (child->flags & SD_SHARE_LLC)) {
> -				struct sched_domain __rcu *top_p;
> +			/* Topmost SD_SHARE_LLC domain. */
> +			if ((sd->flags & SD_SHARE_LLC) &&
> +			    (!parent || !(parent->flags & SD_SHARE_LLC))) {
> +				int sd_id = cpumask_first(sched_domain_span(sd));
> +				struct sched_domain *top_p;
>                               unsigned int nr_llcs;
>
> +				sd->shared = *per_cpu_ptr(d.sds, sd_id);
> +				atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
> +				atomic_inc(&sd->shared->ref);
> +
> +				/* No SD_NUMA domains. */
> +				if (!parent)
> +					break;
> +

AIUI we currently write sd->imb_numa_nr for all SDs, but it's only useful
for the SD_NUMA ones... How about the following, lightly tested:
---
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index afb2c26efb4e5..03db45658f6bd 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2575,6 +2575,51 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 	return true;
 }
 
+static inline void adjust_numa_imbalance(struct sched_domain *sd_llc,
+					 unsigned int *imb, unsigned int *imb_span)
+{
+	/*
+	 * For a single LLC per node, allow an
+	 * imbalance up to 12.5% of the node. This is
+	 * arbitrary cutoff based two factors -- SMT and
+	 * memory channels. For SMT-2, the intent is to
+	 * avoid premature sharing of HT resources but
+	 * SMT-4 or SMT-8 *may* benefit from a different
+	 * cutoff. For memory channels, this is a very
+	 * rough estimate of how many channels may be
+	 * active and is based on recent CPUs with
+	 * many cores.
+	 *
+	 * For multiple LLCs, allow an imbalance
+	 * until multiple tasks would share an LLC
+	 * on one node while LLCs on another node
+	 * remain idle. This assumes that there are
+	 * enough logical CPUs per LLC to avoid SMT
+	 * factors and that there is a correlation
+	 * between LLCs and memory channels.
+	 */
+	struct sched_domain *top_p;
+	unsigned int nr_llcs;
+
+	WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
+	WARN_ON(!sd_llc->parent);
+
+	nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
+	if (nr_llcs == 1)
+		*imb = sd_llc->parent->span_weight >> 3;
+	else
+		*imb = nr_llcs;
+	*imb = max(1U, *imb);
+	sd_llc->parent->imb_numa_nr = *imb;
+
+	/* Set span based on the first NUMA domain. */
+	top_p = sd_llc->parent->parent;
+	while (top_p && !(top_p->flags & SD_NUMA)) {
+		top_p = top_p->parent;
+	}
+	*imb_span = top_p ? top_p->span_weight : sd_llc->parent->span_weight;
+}
+
 /*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
@@ -2640,63 +2685,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		unsigned int imb = 0;
 		unsigned int imb_span = 1;
 
-		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-			struct sched_domain *parent = sd->parent;
-
-			/* Topmost SD_SHARE_LLC domain. */
-			if ((sd->flags & SD_SHARE_LLC) &&
-			    (!parent || !(parent->flags & SD_SHARE_LLC))) {
-				int sd_id = cpumask_first(sched_domain_span(sd));
-				struct sched_domain *top_p;
-				unsigned int nr_llcs;
-
-				sd->shared = *per_cpu_ptr(d.sds, sd_id);
-				atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
-				atomic_inc(&sd->shared->ref);
-
-				/* No SD_NUMA domains. */
-				if (!parent)
-					break;
-
-				/*
-				 * For a single LLC per node, allow an
-				 * imbalance up to 12.5% of the node. This is
-				 * arbitrary cutoff based two factors -- SMT and
-				 * memory channels. For SMT-2, the intent is to
-				 * avoid premature sharing of HT resources but
-				 * SMT-4 or SMT-8 *may* benefit from a different
-				 * cutoff. For memory channels, this is a very
-				 * rough estimate of how many channels may be
-				 * active and is based on recent CPUs with
-				 * many cores.
-				 *
-				 * For multiple LLCs, allow an imbalance
-				 * until multiple tasks would share an LLC
-				 * on one node while LLCs on another node
-				 * remain idle. This assumes that there are
-				 * enough logical CPUs per LLC to avoid SMT
-				 * factors and that there is a correlation
-				 * between LLCs and memory channels.
-				 */
-				nr_llcs = parent->span_weight / sd->span_weight;
-				if (nr_llcs == 1)
-					imb = sd->span_weight >> 3;
-				else
-					imb = nr_llcs;
-				imb = max(1U, imb);
-				sd->imb_numa_nr = imb;
-
-				/* Set span based on the first NUMA domain. */
-				top_p = parent;
-				while (top_p && !(top_p->flags & SD_NUMA)) {
-					top_p = top_p->parent;
-				}
-				imb_span = top_p ? top_p->span_weight : parent->span_weight;
-			} else {
-				int factor = max(1U, (sd->span_weight / imb_span));
+		sd = *per_cpu_ptr(d.sd, i);
+		/* First, find the topmost SD_SHARE_LLC domain */
+		while (sd && sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+			sd = sd->parent;
 
-				sd->imb_numa_nr = imb * factor;
-			}
+		if (sd->flags & SD_SHARE_LLC) {
+			int sd_id = cpumask_first(sched_domain_span(sd));
+
+			sd->shared = *per_cpu_ptr(d.sds, sd_id);
+			atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+			atomic_inc(&sd->shared->ref);
+		}
+
+		/* Special case the first parent of the topmost SD_SHARE_LLC domain. */
+		if ((sd->flags & SD_SHARE_LLC) && sd->parent) {
+			adjust_numa_imbalance(sd, &imb, &imb_span);
+			sd = sd->parent->parent;
+		}
+
+		/* Update the upper remainder of the topology */
+		while (sd) {
+			int factor = max(1U, (sd->span_weight / imb_span));
+			sd->imb_numa_nr = imb * factor;
+			sd = sd->parent;
 		}
 	}
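
To put numbers on it (a made-up topology, not taken from any particular
machine): 2 nodes, 64 CPUs per node, 8 LLCs of 8 CPUs each, i.e. an
MC(8) -> PKG(64) -> NUMA(128) hierarchy. The topmost SD_SHARE_LLC domain is
MC, so adjust_numa_imbalance() would give:

  nr_llcs          = 64 / 8 = 8
  imb              = 8            (multiple LLCs per node)
  PKG->imb_numa_nr = 8
  imb_span         = 128          (span of the first SD_NUMA parent)

and the trailing loop then only visits the NUMA domain:

  factor            = max(1, 128 / 128) = 1
  NUMA->imb_numa_nr  = 8 * 1 = 8

which should end up matching what the current code computes for those two
levels, minus the (unused) zero writes to the SD_SHARE_LLC domains below.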
 

