Message-ID: <20260129031948.2384178-1-realwujing@gmail.com>
Date: Wed, 28 Jan 2026 22:19:43 -0500
From: Qiliang Yuan <realwujing@...il.com>
To: Ingo Molnar <mingo@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@...hat.com>,
	Vincent Guittot <vincent.guittot@...aro.org>
Cc: Qiliang Yuan <realwujing@...il.com>,
	Qiliang Yuan <yuanql9@...natelecom.cn>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Valentin Schneider <vschneid@...hat.com>,
	linux-kernel@...r.kernel.org
Subject: [PATCH v4] sched/fair: Cache NUMA node statistics to avoid O(N) scanning

Optimize update_numa_stats() by reducing the algorithmic complexity of
node statistics aggregation from O(N_cpus_per_node) to O(1) for the source
node.

Currently, update_numa_stats() performs an O(N) scan across all CPUs
in a node to aggregate load, utilization and runnable statistics, and
the NUMA balancing path repeats this traversal for every node it
evaluates during task placement. On large systems this per-CPU
runqueue walk incurs significant overhead.

Introduce a per-node statistics cache in the root_domain and refresh
it from the load-balancing hot path in update_sg_lb_stats(). Let
update_numa_stats() consume these pre-aggregated stats for the source
node, turning the O(N) scan into an O(1) lookup whenever the cached
data is recent, and falling back to the full scan otherwise. Protect
the cached fields with a seqcount/spinlock pair so readers never
observe a torn, partially updated snapshot. Amortize the cost of data
collection over work the load balancer already does and reduce
scheduling overhead in the NUMA placement hot path.

Signed-off-by: Qiliang Yuan <realwujing@...il.com>
Signed-off-by: Qiliang Yuan <yuanql9@...natelecom.cn>
---
v4:
 - Implement seqlock (seqcount_t + spinlock_t) to ensure multi-variable consistency and prevent tearing.
 - Move cache to 'root_domain' with dynamic allocation to handle partitioning and save memory on non-NUMA.
 - Add #ifdef CONFIG_NUMA guards and improve code robustness.
v3:
 - Optimize update_numa_stats() to O(1) for source node by caching stats during load balancing.
 - Introduce a global cache (later refined in v4).
v2:
 - Use cached stats from the first group in the hierarchy.
v1:
 - Initial implementation of NUMA stats caching.
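
For reviewers, a condensed standalone sketch of the consistency
protocol the patch relies on. The struct and helper names
(numa_stats_snapshot, snapshot_publish, snapshot_read) are
illustrative only and do not exist in the patch; the real code inlines
this logic into update_sg_lb_stats() and update_numa_stats() and
caches more fields than shown here. Initialization via
spin_lock_init()/seqcount_init() happens in init_rootdomain() in the
patch and is omitted from the sketch.

#include <linux/jiffies.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>

struct numa_stats_snapshot {
	spinlock_t	lock;		/* serializes writers only */
	seqcount_t	seq;		/* lets readers detect concurrent writes */
	unsigned long	load;
	unsigned long	capacity;
	unsigned long	last_update;	/* jiffies at last refresh */
};

/* Writer side, cf. the update_sg_lb_stats() hunk below. */
static void snapshot_publish(struct numa_stats_snapshot *s,
			     unsigned long load, unsigned long capacity)
{
	spin_lock(&s->lock);
	write_seqcount_begin(&s->seq);
	s->load = load;
	s->capacity = capacity;
	s->last_update = jiffies;
	write_seqcount_end(&s->seq);
	spin_unlock(&s->lock);
}

/*
 * Reader side, cf. the update_numa_stats() hunk below.  Returns true
 * and fills @load/@capacity only when a consistent snapshot newer than
 * 10ms was read; on staleness (or a torn read of last_update, which at
 * worst looks stale) the caller keeps doing the per-CPU scan.
 */
static bool snapshot_read(struct numa_stats_snapshot *s,
			  unsigned long *load, unsigned long *capacity)
{
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&s->seq);
		if (!time_before(jiffies,
				 s->last_update + msecs_to_jiffies(10)))
			return false;
		*load = s->load;
		*capacity = s->capacity;
	} while (read_seqcount_retry(&s->seq, seq));

	return true;
}

The spinlock only serializes concurrent writers from parallel
load-balancing passes; readers never take it, so the NUMA placement
path stays lock-free and simply retries if it races with a writer. As
in the patch, the plain seqcount_t write side relies on the
surrounding scheduler context running with preemption disabled.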

 kernel/sched/fair.c     | 64 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h    | 17 +++++++++++
 kernel/sched/topology.c | 20 +++++++++++++
 3 files changed, 101 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e71302282671..eafe1cfd7ff6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -48,6 +48,7 @@
 #include <linux/psi.h>
 #include <linux/ratelimit.h>
 #include <linux/task_work.h>
+#include <linux/seqlock.h>
 #include <linux/rbtree_augmented.h>
 
 #include <asm/switch_to.h>
@@ -2104,6 +2105,43 @@ static void update_numa_stats(struct task_numa_env *env,
 	ns->idle_cpu = -1;
 
 	rcu_read_lock();
+	/*
+	 * Algorithmic Optimization: Avoid O(N) scan by using cached stats.
+	 * Only applicable for the source node where we don't need to find
+	 * an idle CPU. This skip is safe because we only need the totals
+	 * for the source node's load, util, and nr_running to check for
+	 * relative imbalance.
+	 */
+	if (!find_idle && nid == env->src_nid) {
+		struct root_domain *rd = rcu_dereference(cpu_rq(env->src_cpu)->rd);
+		struct numa_stats_cache *cache;
+		unsigned int seq;
+
+		if (rd && rd->node_stats_cache) {
+			cache = &rd->node_stats_cache[nid];
+			do {
+				seq = read_seqcount_begin(&cache->seq);
+				if (!time_before(jiffies, cache->last_update + msecs_to_jiffies(10))) {
+					ns->compute_capacity = 0;
+					break;
+				}
+
+				ns->load = cache->load;
+				ns->runnable = cache->runnable;
+				ns->util = cache->util;
+				ns->nr_running = cache->nr_running;
+				ns->compute_capacity = cache->capacity;
+			} while (read_seqcount_retry(&cache->seq, seq));
+
+			if (ns->compute_capacity)
+				goto skip_scan;
+
+			/* Reset stats before O(N) scan if cache was stale or torn */
+			memset(ns, 0, sizeof(*ns));
+			ns->idle_cpu = -1;
+		}
+	}
+
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
 		struct rq *rq = cpu_rq(cpu);
 
@@ -2124,6 +2162,8 @@ static void update_numa_stats(struct task_numa_env *env,
 			idle_core = numa_idle_core(idle_core, cpu);
 		}
 	}
+
+skip_scan:
 	rcu_read_unlock();
 
 	ns->weight = cpumask_weight(cpumask_of_node(nid));
@@ -10488,6 +10528,30 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	if (sgs->group_type == group_overloaded)
 		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
 				sgs->group_capacity;
+
+	/* Algorithmic Optimization: Cache node stats for O(1) NUMA lookups */
+#ifdef CONFIG_NUMA
+	if (env->sd->flags & SD_NUMA) {
+		int nid = cpu_to_node(cpumask_first(sched_group_span(group)));
+		struct root_domain *rd = env->dst_rq->rd;
+		struct numa_stats_cache *cache;
+
+		if (rd && rd->node_stats_cache) {
+			cache = &rd->node_stats_cache[nid];
+
+			spin_lock(&cache->lock);
+			write_seqcount_begin(&cache->seq);
+			cache->nr_running = sgs->sum_h_nr_running;
+			cache->load = sgs->group_load;
+			cache->util = sgs->group_util;
+			cache->runnable = sgs->group_runnable;
+			cache->capacity = sgs->group_capacity;
+			cache->last_update = jiffies;
+			write_seqcount_end(&cache->seq);
+			spin_unlock(&cache->lock);
+		}
+	}
+#endif
 }
 
 /**
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5..643e6c743fc8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -974,6 +974,19 @@ struct perf_domain {
 	struct rcu_head rcu;
 };
 
+#ifdef CONFIG_NUMA
+struct numa_stats_cache {
+	spinlock_t lock;
+	seqcount_t seq;
+	unsigned long load;
+	unsigned long runnable;
+	unsigned long util;
+	unsigned long capacity;
+	unsigned int nr_running;
+	unsigned long last_update;
+} ____cacheline_aligned;
+#endif
+
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
@@ -1042,6 +1055,10 @@ struct root_domain {
 	 * CPUs of the rd. Protected by RCU.
 	 */
 	struct perf_domain __rcu *pd;
+
+#ifdef CONFIG_NUMA
+	struct numa_stats_cache *node_stats_cache;
+#endif
 };
 
 extern void init_defrootdomain(void);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..094a4f197af3 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -5,6 +5,7 @@
 
 #include <linux/sched/isolation.h>
 #include <linux/bsearch.h>
+#include <linux/slab.h>
 #include "sched.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
@@ -466,6 +467,9 @@ static void free_rootdomain(struct rcu_head *rcu)
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
 	free_pd(rd->pd);
+#ifdef CONFIG_NUMA
+	kfree(rd->node_stats_cache);
+#endif
 	kfree(rd);
 }
 
@@ -551,8 +555,24 @@ static int init_rootdomain(struct root_domain *rd)
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
+
+#ifdef CONFIG_NUMA
+	rd->node_stats_cache = kcalloc(nr_node_ids, sizeof(struct numa_stats_cache),
+				      GFP_KERNEL);
+	if (!rd->node_stats_cache)
+		goto free_cpupri;
+
+	int i;
+	for (i = 0; i < nr_node_ids; i++) {
+		spin_lock_init(&rd->node_stats_cache[i].lock);
+		seqcount_init(&rd->node_stats_cache[i].seq);
+	}
+#endif
+
 	return 0;
 
+free_cpupri:
+	cpupri_cleanup(&rd->cpupri);
 free_cpudl:
 	cpudl_cleanup(&rd->cpudl);
 free_rto_mask:
-- 
2.51.0

