Message-ID: <20260119035727.2867477-1-txpeng@tencent.com>
Date: Mon, 19 Jan 2026 11:57:27 +0800
From: Tianxiang Peng <luminosity1999@...il.com>
To: mingo@...hat.com,
peterz@...radead.org,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com
Cc: linux-kernel@...r.kernel.org,
flyingpeng@...cent.com,
Tianxiang Peng <txpeng@...cent.com>
Subject: [PATCH] sched/topology: Delay imb_numa_nr calculation until after domain degeneration

Currently, imb_numa_nr is calculated in build_sched_domains(), before
the sched_domain degeneration performed in cpu_attach_domain(). The
resulting values may therefore be based on a transient topology that
no longer exists once redundant domains have been removed.

This was observed on our Kunpeng 920 systems (4 NUMA nodes, 80 cores
per node, 8 cores per cluster), where the initial PKG domain has the
same span as the MC domain and is subsequently removed by degeneration.

Observed topology data on Kunpeng 920 (order: child -> parent):

[Before Patch]
  (before degeneration)
    Domains:      CLS(8) -> MC(80) -> PKG(80)
    Flags:        [LLC]     [LLC]     [!LLC]
    imb_numa_nr:  0         0         10
  (after degeneration)
    Domains:      CLS(8) -> MC(80) -> NUMA(160)
    Flags:        [LLC]     [LLC]     [!LLC]
    imb_numa_nr:  0         0         10

[After Patch]
  (before degeneration)
    Domains:      CLS(8) -> MC(80) -> PKG(80)
    Flags:        [LLC]     [LLC]     [!LLC]
  (after degeneration)
    Domains:      CLS(8) -> MC(80) -> NUMA(160)
    Flags:        [LLC]     [LLC]     [!LLC]
    imb_numa_nr:  0         0         2

Move the calculation from build_sched_domains() into
cpu_attach_domain(), after domain degeneration, so that imb_numa_nr
always reflects the effective topology.
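
For reference, the values above follow directly from the existing
heuristic: with a single LLC per node the allowed imbalance is
span_weight >> 3 (12.5% of the node), with multiple LLCs it is the
number of LLCs. Below is a minimal user-space sketch of that heuristic
(illustrative helper name, not part of the patch), evaluated with the
span weights seen here:

/* Hypothetical stand-alone sketch of the imb_numa_nr heuristic. */
#include <stdio.h>

static unsigned int calc_imb_numa_nr(unsigned int parent_span,
				     unsigned int llc_child_span)
{
	unsigned int nr_llcs = parent_span / llc_child_span;
	unsigned int imb;

	if (nr_llcs == 1)	/* single LLC per node: 12.5% of the node */
		imb = parent_span >> 3;
	else			/* multiple LLCs: one per LLC */
		imb = nr_llcs;

	return imb > 1 ? imb : 1;
}

int main(void)
{
	/* Before the patch: transient PKG(80) over MC(80) -> 10 */
	printf("PKG(80)/MC(80):   %u\n", calc_imb_numa_nr(80, 80));
	/* After the patch: effective NUMA(160) over MC(80) -> 2 */
	printf("NUMA(160)/MC(80): %u\n", calc_imb_numa_nr(160, 80));
	return 0;
}
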
Signed-off-by: Tianxiang Peng <txpeng@...cent.com>
Reviewed-by: Hao Peng <flyingpeng@...cent.com>
---
 kernel/sched/topology.c | 115 ++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 58 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..e8774e587f15 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -717,6 +717,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
+	unsigned int imb = 0;
+	unsigned int imb_span = 1;
 
 	/* Remove the sched domains which do not contribute to scheduling. */
 	for (tmp = sd; tmp; ) {
@@ -764,6 +766,61 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 		}
 	}
 
+	/*
+	 * Calculate an allowed NUMA imbalance such that LLCs do not get
+	 * imbalanced.
+	 * Perform this calculation after domain degeneration so that
+	 * sd->imb_numa_nr reflects the final effective topology.
+	 */
+	for (tmp = sd; tmp; tmp = tmp->parent) {
+		struct sched_domain *child = tmp->child;
+
+		if (!(tmp->flags & SD_SHARE_LLC) && child &&
+		    (child->flags & SD_SHARE_LLC)) {
+			struct sched_domain __rcu *top_p;
+			unsigned int nr_llcs;
+
+			/*
+			 * For a single LLC per node, allow an
+			 * imbalance up to 12.5% of the node. This is
+			 * arbitrary cutoff based two factors -- SMT and
+			 * memory channels. For SMT-2, the intent is to
+			 * avoid premature sharing of HT resources but
+			 * SMT-4 or SMT-8 *may* benefit from a different
+			 * cutoff. For memory channels, this is a very
+			 * rough estimate of how many channels may be
+			 * active and is based on recent CPUs with
+			 * many cores.
+			 *
+			 * For multiple LLCs, allow an imbalance
+			 * until multiple tasks would share an LLC
+			 * on one node while LLCs on another node
+			 * remain idle. This assumes that there are
+			 * enough logical CPUs per LLC to avoid SMT
+			 * factors and that there is a correlation
+			 * between LLCs and memory channels.
+			 */
+			nr_llcs = tmp->span_weight / child->span_weight;
+			if (nr_llcs == 1)
+				imb = tmp->span_weight >> 3;
+			else
+				imb = nr_llcs;
+			imb = max(1U, imb);
+			tmp->imb_numa_nr = imb;
+
+			/* Set span based on the first NUMA domain. */
+			top_p = tmp->parent;
+			while (top_p && !(top_p->flags & SD_NUMA)) {
+				top_p = top_p->parent;
+			}
+			imb_span = top_p ? top_p->span_weight : tmp->span_weight;
+		} else {
+			int factor = max(1U, (tmp->span_weight / imb_span));
+
+			tmp->imb_numa_nr = imb * factor;
+		}
+	}
+
 	sched_domain_debug(sd, cpu);
 
 	rq_attach_root(rq, rd);
@@ -2600,64 +2657,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
-	/*
-	 * Calculate an allowed NUMA imbalance such that LLCs do not get
-	 * imbalanced.
-	 */
-	for_each_cpu(i, cpu_map) {
-		unsigned int imb = 0;
-		unsigned int imb_span = 1;
-
-		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-			struct sched_domain *child = sd->child;
-
-			if (!(sd->flags & SD_SHARE_LLC) && child &&
-			    (child->flags & SD_SHARE_LLC)) {
-				struct sched_domain __rcu *top_p;
-				unsigned int nr_llcs;
-
-				/*
-				 * For a single LLC per node, allow an
-				 * imbalance up to 12.5% of the node. This is
-				 * arbitrary cutoff based two factors -- SMT and
-				 * memory channels. For SMT-2, the intent is to
-				 * avoid premature sharing of HT resources but
-				 * SMT-4 or SMT-8 *may* benefit from a different
-				 * cutoff. For memory channels, this is a very
-				 * rough estimate of how many channels may be
-				 * active and is based on recent CPUs with
-				 * many cores.
-				 *
-				 * For multiple LLCs, allow an imbalance
-				 * until multiple tasks would share an LLC
-				 * on one node while LLCs on another node
-				 * remain idle. This assumes that there are
-				 * enough logical CPUs per LLC to avoid SMT
-				 * factors and that there is a correlation
-				 * between LLCs and memory channels.
-				 */
-				nr_llcs = sd->span_weight / child->span_weight;
-				if (nr_llcs == 1)
-					imb = sd->span_weight >> 3;
-				else
-					imb = nr_llcs;
-				imb = max(1U, imb);
-				sd->imb_numa_nr = imb;
-
-				/* Set span based on the first NUMA domain. */
-				top_p = sd->parent;
-				while (top_p && !(top_p->flags & SD_NUMA)) {
-					top_p = top_p->parent;
-				}
-				imb_span = top_p ? top_p->span_weight : sd->span_weight;
-			} else {
-				int factor = max(1U, (sd->span_weight / imb_span));
-
-				sd->imb_numa_nr = imb * factor;
-			}
-		}
-	}
-
 	/* Calculate CPU capacity for physical packages and nodes */
 	for (i = nr_cpumask_bits-1; i >= 0; i--) {
 		if (!cpumask_test_cpu(i, cpu_map))
--
2.43.5