Message-ID: <20260129031948.2384178-1-realwujing@gmail.com>
Date: Wed, 28 Jan 2026 22:19:43 -0500
From: Qiliang Yuan <realwujing@...il.com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>
Cc: Qiliang Yuan <realwujing@...il.com>,
Qiliang Yuan <yuanql9@...natelecom.cn>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
linux-kernel@...r.kernel.org
Subject: [PATCH v4] sched/fair: Cache NUMA node statistics to avoid O(N) scanning
Optimize update_numa_stats() by reducing the algorithmic complexity of
node statistics aggregation from O(N_cpus_per_node) to O(1) for the source
node.
Currently, the NUMA balancing path performs an O(N) scan across all CPUs
in a node to aggregate load, utilization, and runnable statistics. On
large-scale systems, this repetitive traversal of per-CPU runqueues
incurs significant overhead.
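For readers unfamiliar with that path, the stand-alone C model below
sketches the shape of the per-node summation it performs. It is only an
illustration under simplified assumptions: the struct and function names
are invented and do not exist in the kernel.

/*
 * Illustrative model only (not kernel code): every call walks all CPUs
 * of a node, i.e. O(N_cpus_per_node) work per lookup.
 */
#include <stdio.h>

struct cpu_stats {
	unsigned long load;
	unsigned long util;
	unsigned long runnable;
	unsigned int nr_running;
};

struct node_totals {
	unsigned long load;
	unsigned long util;
	unsigned long runnable;
	unsigned int nr_running;
};

static struct node_totals sum_node_stats(const struct cpu_stats *cpus, int ncpus)
{
	struct node_totals t = { 0 };

	for (int i = 0; i < ncpus; i++) {
		t.load       += cpus[i].load;
		t.util       += cpus[i].util;
		t.runnable   += cpus[i].runnable;
		t.nr_running += cpus[i].nr_running;
	}
	return t;
}

int main(void)
{
	struct cpu_stats node0[] = {
		{ 1024, 300, 350, 2 },
		{  512, 150, 160, 1 },
		{    0,   0,   0, 0 },
		{ 2048, 700, 720, 3 },
	};
	struct node_totals t = sum_node_stats(node0, 4);

	printf("load=%lu util=%lu runnable=%lu nr_running=%u\n",
	       t.load, t.util, t.runnable, t.nr_running);
	return 0;
}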
Introduce a per-node statistics cache within the 'root_domain' and update
it during the load-balancing hot path (update_sg_lb_stats). Leverage these
pre-aggregated stats to allow update_numa_stats() to bypass the O(N) scan
for the source node, achieving O(1) complexity. Maintain consistency
through a seqlock mechanism to prevent data tearing during concurrent
updates. This amortizes the cost of data collection across the
load-balancing path and significantly reduces scheduling overhead in
the NUMA placement hot path.
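To illustrate the consistency scheme, here is a minimal user-space
sketch of the seqcount read/retry pattern with a writer-side mutex.
The names (node_cache_model, cache_publish(), cache_snapshot()) are
invented for the example, C11 fences stand in for the kernel's
seqcount_t helpers, and only two fields are read back for brevity.

/* Minimal user-space sketch of the seqcount pattern; build with -pthread. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct node_cache_model {
	pthread_mutex_t lock;	/* serializes writers (kernel: spinlock_t) */
	atomic_uint seq;	/* even: stable, odd: write in progress */
	/* payload uses relaxed atomics so concurrent access is well-defined */
	atomic_ulong load, util, runnable, capacity;
	atomic_uint nr_running;
};

/* Writer: bump seq to odd, update every field, bump seq back to even. */
static void cache_publish(struct node_cache_model *c, unsigned long load,
			  unsigned long util, unsigned long runnable,
			  unsigned long capacity, unsigned int nr_running)
{
	pthread_mutex_lock(&c->lock);
	atomic_fetch_add_explicit(&c->seq, 1, memory_order_relaxed);	/* odd */
	atomic_thread_fence(memory_order_release);
	atomic_store_explicit(&c->load, load, memory_order_relaxed);
	atomic_store_explicit(&c->util, util, memory_order_relaxed);
	atomic_store_explicit(&c->runnable, runnable, memory_order_relaxed);
	atomic_store_explicit(&c->capacity, capacity, memory_order_relaxed);
	atomic_store_explicit(&c->nr_running, nr_running, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	atomic_fetch_add_explicit(&c->seq, 1, memory_order_relaxed);	/* even */
	pthread_mutex_unlock(&c->lock);
}

/* Reader: lock-free; retry until an untorn snapshot is observed. */
static void cache_snapshot(struct node_cache_model *c,
			   unsigned long *load, unsigned int *nr_running)
{
	unsigned int seq1, seq2;

	do {
		seq1 = atomic_load_explicit(&c->seq, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);
		*load = atomic_load_explicit(&c->load, memory_order_relaxed);
		*nr_running = atomic_load_explicit(&c->nr_running,
						   memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);
		seq2 = atomic_load_explicit(&c->seq, memory_order_relaxed);
	} while ((seq1 & 1) || seq1 != seq2);
}

int main(void)
{
	struct node_cache_model c = { .lock = PTHREAD_MUTEX_INITIALIZER };
	unsigned long load;
	unsigned int nr;

	cache_publish(&c, 2048, 700, 900, 1024, 3);
	cache_snapshot(&c, &load, &nr);
	printf("load=%lu nr_running=%u\n", load, nr);
	return 0;
}

The patch pairs this pattern with a jiffies-based freshness check so
that a cache entry older than roughly 10ms is ignored and the existing
O(N) scan runs instead.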
Signed-off-by: Qiliang Yuan <realwujing@...il.com>
Signed-off-by: Qiliang Yuan <yuanql9@...natelecom.cn>
---
v4:
- Implement seqlock (seqcount_t + spinlock_t) to ensure multi-variable consistency and prevent tearing.
- Move cache to 'root_domain' with dynamic allocation to handle partitioning and save memory on non-NUMA.
- Add #ifdef CONFIG_NUMA guards and improve code robustness.
v3:
- Optimize update_numa_stats() to O(1) for source node by caching stats during load balancing.
- Introduce a global cache (later refined in v4).
v2:
- Use cached stats from the first group in the hierarchy.
v1:
- Initial implementation of NUMA stats caching.
kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 17 +++++++++++
kernel/sched/topology.c | 20 +++++++++++++
3 files changed, 101 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e71302282671..eafe1cfd7ff6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -48,6 +48,7 @@
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/seqlock.h>
#include <linux/rbtree_augmented.h>
#include <asm/switch_to.h>
@@ -2104,6 +2105,43 @@ static void update_numa_stats(struct task_numa_env *env,
ns->idle_cpu = -1;
rcu_read_lock();
+ /*
+ * Algorithmic Optimization: Avoid O(N) scan by using cached stats.
+ * Only applicable for the source node where we don't need to find
+ * an idle CPU. This skip is safe because we only need the totals
+ * for the source node's load, util, and nr_running to check for
+ * relative imbalance.
+ */
+ if (!find_idle && nid == env->src_nid) {
+ struct root_domain *rd = rcu_dereference(cpu_rq(env->src_cpu)->rd);
+ struct numa_stats_cache *cache;
+ unsigned int seq;
+
+ if (rd && rd->node_stats_cache) {
+ cache = &rd->node_stats_cache[nid];
+ do {
+ seq = read_seqcount_begin(&cache->seq);
+ if (!time_before(jiffies, cache->last_update + msecs_to_jiffies(10))) {
+ ns->compute_capacity = 0;
+ break;
+ }
+
+ ns->load = cache->load;
+ ns->runnable = cache->runnable;
+ ns->util = cache->util;
+ ns->nr_running = cache->nr_running;
+ ns->compute_capacity = cache->capacity;
+ } while (read_seqcount_retry(&cache->seq, seq));
+
+ if (ns->compute_capacity)
+ goto skip_scan;
+
+ /* Reset stats before the O(N) scan if the cache was stale or unpopulated */
+ memset(ns, 0, sizeof(*ns));
+ ns->idle_cpu = -1;
+ }
+ }
+
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
@@ -2124,6 +2162,8 @@ static void update_numa_stats(struct task_numa_env *env,
idle_core = numa_idle_core(idle_core, cpu);
}
}
+
+skip_scan:
rcu_read_unlock();
ns->weight = cpumask_weight(cpumask_of_node(nid));
@@ -10488,6 +10528,30 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
sgs->group_capacity;
+
+ /* Algorithmic Optimization: Cache node stats for O(1) NUMA lookups */
+#ifdef CONFIG_NUMA
+ if (env->sd->flags & SD_NUMA) {
+ int nid = cpu_to_node(cpumask_first(sched_group_span(group)));
+ struct root_domain *rd = env->dst_rq->rd;
+ struct numa_stats_cache *cache;
+
+ if (rd && rd->node_stats_cache) {
+ cache = &rd->node_stats_cache[nid];
+
+ spin_lock(&cache->lock);
+ write_seqcount_begin(&cache->seq);
+ cache->nr_running = sgs->sum_h_nr_running;
+ cache->load = sgs->group_load;
+ cache->util = sgs->group_util;
+ cache->runnable = sgs->group_runnable;
+ cache->capacity = sgs->group_capacity;
+ cache->last_update = jiffies;
+ write_seqcount_end(&cache->seq);
+ spin_unlock(&cache->lock);
+ }
+ }
+#endif
}
/**
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5..643e6c743fc8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -974,6 +974,19 @@ struct perf_domain {
struct rcu_head rcu;
};
+#ifdef CONFIG_NUMA
+struct numa_stats_cache {
+ spinlock_t lock;
+ seqcount_t seq;
+ unsigned long load;
+ unsigned long runnable;
+ unsigned long util;
+ unsigned long capacity;
+ unsigned int nr_running;
+ unsigned long last_update;
+} ____cacheline_aligned;
+#endif
+
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -1042,6 +1055,10 @@ struct root_domain {
* CPUs of the rd. Protected by RCU.
*/
struct perf_domain __rcu *pd;
+
+#ifdef CONFIG_NUMA
+ struct numa_stats_cache *node_stats_cache;
+#endif
};
extern void init_defrootdomain(void);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..094a4f197af3 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -5,6 +5,7 @@
#include <linux/sched/isolation.h>
#include <linux/bsearch.h>
+#include <linux/slab.h>
#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
@@ -466,6 +467,9 @@ static void free_rootdomain(struct rcu_head *rcu)
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
free_pd(rd->pd);
+#ifdef CONFIG_NUMA
+ kfree(rd->node_stats_cache);
+#endif
kfree(rd);
}
@@ -551,8 +555,24 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0)
goto free_cpudl;
+
+#ifdef CONFIG_NUMA
+ rd->node_stats_cache = kcalloc(nr_node_ids, sizeof(struct numa_stats_cache),
+ GFP_KERNEL);
+ if (!rd->node_stats_cache)
+ goto free_cpupri;
+
+ int i;
+ for (i = 0; i < nr_node_ids; i++) {
+ spin_lock_init(&rd->node_stats_cache[i].lock);
+ seqcount_init(&rd->node_stats_cache[i].seq);
+ }
+#endif
+
return 0;
+free_cpupri:
+ cpupri_cleanup(&rd->cpupri);
free_cpudl:
cpudl_cleanup(&rd->cpudl);
free_rto_mask:
--
2.51.0