Message-ID: <2f9165fe-55f2-4919-be01-e9d2cdd1f960@amd.com>
Date: Tue, 23 Dec 2025 11:01:40 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Tim Chen <tim.c.chen@...ux.intel.com>, Peter Zijlstra
<peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>, "Gautham R . Shenoy"
<gautham.shenoy@....com>, Vincent Guittot <vincent.guittot@...aro.org>
CC: Juri Lelli <juri.lelli@...hat.com>, Dietmar Eggemann
<dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>, Ben Segall
<bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, Valentin Schneider
<vschneid@...hat.com>, Madadi Vineeth Reddy <vineethr@...ux.ibm.com>, "Hillf
Danton" <hdanton@...a.com>, Shrikanth Hegde <sshegde@...ux.ibm.com>,
"Jianyong Wu" <jianyong.wu@...look.com>, Yangyu Chen <cyy@...self.name>,
Tingyin Duan <tingyin.duan@...il.com>, Vern Hao <vernhao@...cent.com>, Vern
Hao <haoxing990@...il.com>, Len Brown <len.brown@...el.com>, Aubrey Li
<aubrey.li@...el.com>, Zhao Liu <zhao1.liu@...el.com>, Chen Yu
<yu.chen.surf@...il.com>, Chen Yu <yu.c.chen@...el.com>, Adam Li
<adamli@...amperecomputing.com>, Aaron Lu <ziqianlu@...edance.com>, Tim Chen
<tim.c.chen@...el.com>, <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v2 04/23] sched/cache: Make LLC id continuous

Hello Tim, Chenyu,

On 12/4/2025 4:37 AM, Tim Chen wrote:
> +/*
> + * Assign continuous llc id for the CPU, and return
> + * the assigned llc id.
> + */
> +static int update_llc_id(struct sched_domain *sd,
> +			 int cpu)
> +{
> +	int id = per_cpu(sd_llc_id, cpu), i;
> +
> +	if (id >= 0)
> +		return id;
> +
> +	if (sd) {
> +		/* Look for any assigned id and reuse it.*/
> +		for_each_cpu(i, sched_domain_span(sd)) {
> +			id = per_cpu(sd_llc_id, i);
> +
> +			if (id >= 0) {
> +				per_cpu(sd_llc_id, cpu) = id;
> +				return id;
> +			}
> +		}
> +	}

I don't really like tying this down to the sched_domain span, since
partitioning and other weirdness can cause the max_llcs count to go
unnecessarily high. The tl->mask() (from sched_domain_topology_level)
should give the mask covering all online CPUs without being constrained
by cpusets.
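
To make the concern concrete, here is a toy userspace model (not kernel
code; the masks and the partition layout are made up) of why keying the
id off the domain span can over-count while tl->mask() does not:

#include <stdio.h>
#include <stdint.h>

/* Toy model: CPUs 0-7 share one physical LLC (all values are made up). */
#define LLC_MASK	0xffULL		/* what tl->mask() would report   */
#define PARTITION_A	0x0fULL		/* cpuset partition with CPUs 0-3 */
#define PARTITION_B	0xf0ULL		/* cpuset partition with CPUs 4-7 */

/* sched_domain_span() is clipped to the partition's cpu_map. */
static uint64_t domain_span(uint64_t llc_mask, uint64_t cpu_map)
{
	return llc_mask & cpu_map;
}

int main(void)
{
	uint64_t span_a = domain_span(LLC_MASK, PARTITION_A);
	uint64_t span_b = domain_span(LLC_MASK, PARTITION_B);

	/* Two distinct spans -> two llc ids if the span is the key. */
	printf("span A: %#llx, span B: %#llx\n",
	       (unsigned long long)span_a, (unsigned long long)span_b);

	/* One topology-level mask -> one llc id for the physical LLC. */
	printf("tl->mask() for any CPU of this LLC: %#llx\n",
	       (unsigned long long)LLC_MASK);

	return 0;
}

Same physical LLC, but the two spans never intersect, so an id scheme
that only looks inside sched_domain_span(sd) can never notice that the
other partition already claimed an id for this cache.
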
How about something like:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b17d8e3cb55..c19b1c4e6472 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8270,6 +8270,18 @@ static void cpuset_cpu_active(void)
 static void cpuset_cpu_inactive(unsigned int cpu)
 {
 	if (!cpuhp_tasks_frozen) {
+		/*
+		 * This is necessary since offline CPUs are
+		 * taken out of the tl->mask() and a newly
+		 * onlined CPU in same LLC will not realize
+		 * whether it should reuse the LLC ID owned
+		 * by an offline CPU without knowing the
+		 * LLC association.
+		 *
+		 * Safe to release the reference if this is
+		 * the last CPU in the LLC going offline.
+		 */
+		sched_domain_free_llc_id(cpu);
 		cpuset_update_active_cpus();
 	} else {
 		num_cpus_frozen++;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..1378a1cfad18 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -631,6 +631,7 @@ void update_sched_domain_debugfs(void)
 			i++;
 		}
 
+		debugfs_create_u32("llc_id", 0444, d_cpu, (u32 *)per_cpu_ptr(&sd_llc_id, cpu));
 		__cpumask_clear_cpu(cpu, sd_sysctl_cpus);
 	}
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3ceaa9dc9a9e..69fad88b57d8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2142,6 +2142,7 @@ extern int group_balance_cpu(struct sched_group *sg);
 
 extern void update_sched_domain_debugfs(void);
 extern void dirty_sched_domain_sysctl(int cpu);
+void sched_domain_free_llc_id(int cpu);
 
 extern int sched_update_scaling(void);
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..d6e134767f30 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -20,6 +20,46 @@ void sched_domains_mutex_unlock(void)
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_llc_id_alloc_mask;
+DEFINE_PER_CPU(int, sd_llc_id) = -1;
+static int max_llcs = 0;
+
+static inline int sched_domain_alloc_llc_id(void)
+{
+	int llc_id;
+
+	lockdep_assert_held(&sched_domains_mutex);
+
+	llc_id = cpumask_first_zero(sched_llc_id_alloc_mask);
+	BUG_ON((unsigned int)llc_id >= nr_cpumask_bits);
+	cpumask_set_cpu(llc_id, sched_llc_id_alloc_mask);
+	++max_llcs;
+
+	return llc_id;
+}
+
+void sched_domain_free_llc_id(int cpu)
+{
+	int i, llc_id = per_cpu(sd_llc_id, cpu);
+	bool found = false;
+
+	lockdep_assert_cpus_held(); /* For cpu_active_mask. */
+	guard(mutex)(&sched_domains_mutex);
+
+	per_cpu(sd_llc_id, cpu) = -1;
+	for_each_cpu(i, cpu_active_mask) {
+		if (per_cpu(sd_llc_id, i) == llc_id) {
+			found = true;
+			break;
+		}
+	}
+
+	/* Allow future hotplugs to claim this ID */
+	if (!found) {
+		cpumask_clear_cpu(llc_id, sched_llc_id_alloc_mask);
+		--max_llcs;
+	}
+}
 
 static int __init sched_debug_setup(char *str)
 {
@@ -658,7 +698,6 @@ static void destroy_sched_domains(struct sched_domain *sd)
  */
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(int, sd_share_id);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
@@ -684,7 +723,6 @@ static void update_top_cache_domain(int cpu)
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
-	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
 	sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -2567,10 +2605,35 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 
 	/* Set up domains for CPUs specified by the cpu_map: */
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain_topology_level *tl;
+		struct sched_domain_topology_level *tl, *tl_llc = NULL;
+		bool done = false;
 
 		sd = NULL;
 		for_each_sd_topology(tl) {
+			int flags = 0;
+
+			if (tl->sd_flags)
+				flags = (*tl->sd_flags)();
+
+			if (flags & SD_SHARE_LLC) {
+				tl_llc = tl;
+
+				/*
+				 * Entire cpu_map has been covered. We are
+				 * traversing only to find the highest
+				 * SD_SHARE_LLC level.
+				 */
+				if (done)
+					continue;
+			}
+
+			/*
+			 * Since SD_SHARE_LLC is SDF_SHARED_CHILD, we can
+			 * safely break out if the entire cpu_map has been
+			 * covered by a child domain.
+			 */
+			if (done)
+				break;
 
 			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
 
@@ -2579,7 +2642,41 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
-				break;
+				done = true;
+		}
+
+		/* First time visiting this CPU. Assign the llc_id. */
+		if (per_cpu(sd_llc_id, i) == -1) {
+			int j, llc_id = -1;
+
+			/*
+			 * In case there are no SD_SHARE_LLC domains,
+			 * each CPU gets its own llc_id. Find the first
+			 * free bit on the mask and use it.
+			 */
+			if (!tl_llc) {
+				per_cpu(sd_llc_id, i) = sched_domain_alloc_llc_id();
+				continue;
+			}
+
+			/*
+			 * Visit all the CPUs of the LLC irrespective of the
+			 * partition constraints and find if any of them have
+			 * a valid llc_id.
+			 */
+			for_each_cpu(j, tl_llc->mask(tl, i)) {
+				llc_id = per_cpu(sd_llc_id, j);
+
+				/* Found a valid llc_id for CPU's LLC. */
+				if (llc_id != -1)
+					break;
+			}
+
+			/* Valid llc_id not found. Allocate a new one. */
+			if (llc_id == -1)
+				llc_id = sched_domain_alloc_llc_id();
+
+			per_cpu(sd_llc_id, i) = llc_id;
 		}
 	}
 
@@ -2759,6 +2856,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
 	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+	zalloc_cpumask_var(&sched_llc_id_alloc_mask, GFP_KERNEL);
 	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
 	arch_update_cpu_topology();
---
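
For reference, the allocate/reuse/free life cycle above boils down to a
small bitmap id allocator. A minimal userspace sketch of that pattern
(the names, the 64-id limit and the 2-CPU LLC are illustrative, not the
kernel interfaces):

#include <stdio.h>
#include <stdint.h>

#define MAX_IDS		64	/* illustrative limit */
#define NR_CPUS		8

static uint64_t id_alloc_mask;	/* models sched_llc_id_alloc_mask */
static int cpu_llc_id[NR_CPUS] = { [0 ... NR_CPUS - 1] = -1 };

/* Claim the lowest free bit, like cpumask_first_zero() + cpumask_set_cpu(). */
static int alloc_llc_id(void)
{
	for (int id = 0; id < MAX_IDS; id++) {
		if (!(id_alloc_mask & (1ULL << id))) {
			id_alloc_mask |= 1ULL << id;
			return id;
		}
	}
	return -1;
}

/* Release the id only when no other CPU still carries it. */
static void free_llc_id(int cpu)
{
	int id = cpu_llc_id[cpu];

	if (id < 0)
		return;

	cpu_llc_id[cpu] = -1;
	for (int i = 0; i < NR_CPUS; i++)
		if (cpu_llc_id[i] == id)
			return;

	id_alloc_mask &= ~(1ULL << id);
}

int main(void)
{
	/* CPUs 0 and 1 share an LLC: CPU1 reuses CPU0's id. */
	cpu_llc_id[0] = alloc_llc_id();
	cpu_llc_id[1] = cpu_llc_id[0];

	free_llc_id(0);	/* CPU1 still holds the id -> keep the bit    */
	printf("mask after CPU0 goes offline: %#llx\n",
	       (unsigned long long)id_alloc_mask);

	free_llc_id(1);	/* last holder gone -> id can be reclaimed    */
	printf("mask after CPU1 goes offline: %#llx\n",
	       (unsigned long long)id_alloc_mask);

	return 0;
}
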
AFAICT, "sd_llc_id" isn't compared across different partitions, so CPUs
that belong to the same physical LLC but end up in different partitions
sharing the same "sd_llc_id" shouldn't be a problem.

Thoughts?

--
Thanks and Regards,
Prateek