[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251208092744.32737-13-kprateek.nayak@amd.com>
Date: Mon, 8 Dec 2025 09:26:59 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
<tglx@...utronix.de>
CC: <linux-kernel@...r.kernel.org>, Dietmar Eggemann
<dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>, Ben Segall
<bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, Valentin Schneider
<vschneid@...hat.com>, K Prateek Nayak <kprateek.nayak@....com>, "Gautham R.
Shenoy" <gautham.shenoy@....com>, Swapnil Sapkal <swapnil.sapkal@....com>,
Shrikanth Hegde <sshegde@...ux.ibm.com>, Chen Yu <yu.c.chen@...el.com>
Subject: [RESEND RFC PATCH v2 13/29] sched/fair: Account idle cpus instead of busy cpus in sd->shared
Switch to keeping track of "sd->shared->nr_idle_cpus" instead of
"nr_busy_cpus". Since the previous commit corrected the "sd->nohz_idle"
state during sched domain rebuild, the nr_idle_cpus will reflect the
correct number of idle CPUs.
The idle CPUs accounting will be used for nohz idle balance in the
subsequent commits.
Races are possible during hotplug / cpuset where "nr_idle_cpus" might
be incorrectly accounted if the CPU enters or exits nohz idle state
between the read of "rq->nohz_tick_stopped" and the subsequent update of
"sd->nohz_idle" in the hotplug path, but these inaccuracies are transient
and will be corrected when the CPU enters idle or receives a tick.
CPU0 (hotplug) CPU1 (exits nohz idle)
============== ======================
online()
if (rq->nohz_tick_stopped) /* True */
... rq->nohz_tick_stopped = 0
... set_cpu_sd_state_busy()
...
set_cpu_sd_state_idle()
These situations are rare and should not have any long-term effect on
the nohz idle balancing since there isn't a case where a nohz idle CPU
is not set on the mask - either the hotplug thread sees that
"rq->nohz_tick_stopped" is set or the CPU going idle sees the updated
sched_domain hierarchy.
After the conversion, all the bits that use "nr_idle_cpus" are already
guarded behind CONFIG_NO_HZ_COMMON which makes it convenient to put the
declaration behind CONFIG_NO_HZ_COMMON as well.
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
include/linux/sched/topology.h | 4 +++-
kernel/sched/fair.c | 10 +++++-----
kernel/sched/topology.c | 1 -
3 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fc3d89160513..15c61aed1b5c 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -65,7 +65,9 @@ struct sched_group;
struct sched_domain_shared {
atomic_t ref;
- atomic_t nr_busy_cpus;
+#ifdef CONFIG_NO_HZ_COMMON
+ atomic_t nr_idle_cpus;
+#endif
int has_idle_cores;
int nr_idle_scan;
};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index de9e81eeb93d..fef3826a258f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12533,7 +12533,7 @@ static void nohz_balancer_kick(struct rq *rq)
* the others are - so just get a NOHZ balance going if it looks
* like this LLC domain has tasks we could move.
*/
- nr_busy = atomic_read(&sds->nr_busy_cpus);
+ nr_busy = per_cpu(sd_llc_size, cpu) - atomic_read(&sds->nr_idle_cpus);
if (nr_busy > 1) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
@@ -12562,7 +12562,7 @@ static void set_cpu_sd_state_busy(int cpu)
if (!xchg(&sd->nohz_idle, 0))
return;
- atomic_inc(&sd->shared->nr_busy_cpus);
+ atomic_dec(&sd->shared->nr_idle_cpus);
}
void nohz_balance_exit_idle(struct rq *rq)
@@ -12592,7 +12592,7 @@ static void set_cpu_sd_state_idle(int cpu)
if (xchg(&sd->nohz_idle, 1))
return;
- atomic_dec(&sd->shared->nr_busy_cpus);
+ atomic_inc(&sd->shared->nr_idle_cpus);
}
static void cpu_sd_exit_nohz_balance(struct rq *rq)
@@ -13075,7 +13075,7 @@ static void rq_online_fair(struct rq *rq)
update_runtime_enabled(rq);
- /* Fixup nr_busy_cpus and nohz stats. */
+ /* Fixup nr_idle_cpus and nohz stats. */
cpu_sd_reenter_nohz_balance(rq);
}
@@ -13089,7 +13089,7 @@ static void rq_offline_fair(struct rq *rq)
/* Ensure that we remove rq contribution to group share: */
clear_tg_offline_cfs_rqs(rq);
- /* Fixup nr_busy_cpus and nohz stats. */
+ /* Fixup nr_idle_cpus and nohz stats. */
cpu_sd_exit_nohz_balance(rq);
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a212ae52cdac..6b14c7db3e35 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2656,7 +2656,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
int llc_id = cpumask_first(sched_domain_span(sd));
sd->shared = *per_cpu_ptr(d.sds, llc_id);
- atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
atomic_inc(&sd->shared->ref);
}
--
2.43.0
Powered by blists - more mailing lists