Message-ID: <20250904041516.3046-8-kprateek.nayak@amd.com>
Date: Thu, 4 Sep 2025 04:15:03 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
<tglx@...utronix.de>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, K Prateek Nayak
<kprateek.nayak@....com>, "Gautham R. Shenoy" <gautham.shenoy@....com>,
Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 07/19] sched/fair: Account idle cpus instead of busy cpus in sd->shared
Switch to tracking "sd->shared->nr_idle_cpus" instead of
"nr_busy_cpus". Since the previous commit corrected the "sd->nohz_idle"
state during sched domain rebuild, "nr_idle_cpus" now reflects the
correct number of idle CPUs.

The idle CPU accounting will be used for nohz idle balancing in
subsequent commits.

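Consumers that still need a busy count can derive it from the LLC size,
as the nohz_balancer_kick() hunk below does. A minimal sketch of that
inversion, using the existing per-CPU "sd_llc_size" and "sd_llc_shared"
variables:

	struct sched_domain_shared *sds;
	int nr_busy;

	/* Under rcu_read_lock() */
	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
	if (sds) {
		/* Busy CPUs in the LLC = LLC span weight - idle CPUs */
		nr_busy = per_cpu(sd_llc_size, cpu) -
			  atomic_read(&sds->nr_idle_cpus);
	}
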
Races are possible during hotplug / cpuset operations where
"nr_idle_cpus" might be accounted incorrectly if a CPU enters or exits
nohz idle state between the read of "rq->nohz_tick_stopped" and the
subsequent update of "sd->nohz_idle" in the hotplug path, but these
inaccuracies are transient and will be corrected when the CPU next
enters idle or receives a tick:

    CPU0 (hotplug)                       CPU1 (exits nohz idle)
    ==============                       ======================
    online()
    if (rq->nohz_tick_stopped) /* True */
        ...                              rq->nohz_tick_stopped = 0
        ...                              set_cpu_sd_state_busy()
        ...
    set_cpu_sd_state_idle()

These situations are rare and should not have any long-term effect on
nohz idle balancing since there isn't a case where a nohz idle CPU is
left off the mask - either the hotplug thread sees that
"rq->nohz_tick_stopped" is set, or the CPU going idle sees the updated
sched_domain hierarchy.

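The accounting stays consistent across such races because the counter
is only adjusted behind an xchg() of "sd->nohz_idle", so each
idle <-> busy transition is counted exactly once regardless of which
path observes it. A condensed sketch of the two update paths as
modified by this patch:

	/* Idle -> busy: only the path that flips nohz_idle 1 -> 0 decrements */
	if (!xchg(&sd->nohz_idle, 0))
		return;
	atomic_dec(&sd->shared->nr_idle_cpus);

	/* Busy -> idle: only the path that flips nohz_idle 0 -> 1 increments */
	if (xchg(&sd->nohz_idle, 1))
		return;
	atomic_inc(&sd->shared->nr_idle_cpus);
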
After the conversion, all the code that uses "nr_idle_cpus" is already
guarded by CONFIG_NO_HZ_COMMON, which makes it convenient to put the
declaration behind CONFIG_NO_HZ_COMMON as well.

Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
 include/linux/sched/topology.h |  4 +++-
 kernel/sched/fair.c            | 10 +++++-----
 kernel/sched/topology.c        |  1 -
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d816911de435..2f0d8ecea427 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -65,7 +65,9 @@ struct sched_group;
 
 struct sched_domain_shared {
 	atomic_t	ref;
-	atomic_t	nr_busy_cpus;
+#ifdef CONFIG_NO_HZ_COMMON
+	atomic_t	nr_idle_cpus;
+#endif
 	int		has_idle_cores;
 	int		nr_idle_scan;
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 61e1b4deb3e8..dee0ded7f40d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12429,7 +12429,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * the others are - so just get a NOHZ balance going if it looks
 		 * like this LLC domain has tasks we could move.
 		 */
-		nr_busy = atomic_read(&sds->nr_busy_cpus);
+		nr_busy = per_cpu(sd_llc_size, cpu) - atomic_read(&sds->nr_idle_cpus);
 		if (nr_busy > 1) {
 			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
@@ -12458,7 +12458,7 @@ static void set_cpu_sd_state_busy(int cpu)
 	if (!xchg(&sd->nohz_idle, 0))
 		return;
 
-	atomic_inc(&sd->shared->nr_busy_cpus);
+	atomic_dec(&sd->shared->nr_idle_cpus);
 }
 
 void nohz_balance_exit_idle(struct rq *rq)
@@ -12488,7 +12488,7 @@ static void set_cpu_sd_state_idle(int cpu)
 	if (xchg(&sd->nohz_idle, 1))
 		return;
 
-	atomic_dec(&sd->shared->nr_busy_cpus);
+	atomic_inc(&sd->shared->nr_idle_cpus);
 }
 
 static void cpu_sd_exit_nohz_balance(struct rq *rq)
@@ -12955,7 +12955,7 @@ static void rq_online_fair(struct rq *rq)
 
 	update_runtime_enabled(rq);
 
-	/* Fixup nr_busy_cpus and nohz stats. */
+	/* Fixup nr_idle_cpus and nohz stats. */
 	cpu_sd_reenter_nohz_balance(rq);
 }
@@ -12969,7 +12969,7 @@ static void rq_offline_fair(struct rq *rq)
 	/* Ensure that we remove rq contribution to group share: */
 	clear_tg_offline_cfs_rqs(rq);
 
-	/* Fixup nr_busy_cpus and nohz stats. */
+	/* Fixup nr_idle_cpus and nohz stats. */
 	cpu_sd_exit_nohz_balance(rq);
 }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a059641e12e5..0b0257937a97 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2581,7 +2581,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 			int llc_id = cpumask_first(sched_domain_span(sd));
 
 			sd->shared = *per_cpu_ptr(d.sds, llc_id);
-			atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
 			atomic_inc(&sd->shared->ref);
 		}
--
2.34.1