Message-Id: <63091f7ca7bb473fbc176af86a87d27a07a6e149.1764801860.git.tim.c.chen@linux.intel.com>
Date: Wed,  3 Dec 2025 15:07:26 -0800
From: Tim Chen <tim.c.chen@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	K Prateek Nayak <kprateek.nayak@....com>,
	"Gautham R . Shenoy" <gautham.shenoy@....com>,
	Vincent Guittot <vincent.guittot@...aro.org>
Cc: Tim Chen <tim.c.chen@...ux.intel.com>,
	Juri Lelli <juri.lelli@...hat.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Valentin Schneider <vschneid@...hat.com>,
	Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
	Hillf Danton <hdanton@...a.com>,
	Shrikanth Hegde <sshegde@...ux.ibm.com>,
	Jianyong Wu <jianyong.wu@...look.com>,
	Yangyu Chen <cyy@...self.name>,
	Tingyin Duan <tingyin.duan@...il.com>,
	Vern Hao <vernhao@...cent.com>,
	Vern Hao <haoxing990@...il.com>,
	Len Brown <len.brown@...el.com>,
	Aubrey Li <aubrey.li@...el.com>,
	Zhao Liu <zhao1.liu@...el.com>,
	Chen Yu <yu.chen.surf@...il.com>,
	Chen Yu <yu.c.chen@...el.com>,
	Adam Li <adamli@...amperecomputing.com>,
	Aaron Lu <ziqianlu@...edance.com>,
	Tim Chen <tim.c.chen@...el.com>,
	linux-kernel@...r.kernel.org
Subject: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter

Each runqueue is assigned an array where each element tracks
the number of tasks preferring a given LLC, indexed from 0 to
max_llcs - 1.

For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on
this runqueue which prefer to run within LLC3.
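
For illustration only, the per-task accounting that a later patch in
this series is expected to perform could look roughly like the sketch
below. The p->preferred_llc field and the enqueue/dequeue hook points
are assumptions made here for the example; they are not introduced by
this patch.

	/*
	 * Illustrative sketch (not part of this patch): account a
	 * task's LLC preference when it is enqueued or dequeued.
	 * "p->preferred_llc" is an assumed field holding the index
	 * of the LLC the task prefers, or -1 if it has none.
	 */
	static void account_llc_pref(struct rq *rq, struct task_struct *p, int delta)
	{
		int llc = p->preferred_llc;	/* assumed field */

		if (!rq->nr_pref_llc || llc < 0 || llc >= max_llcs)
			return;

		/* delta is +1 on enqueue, -1 on dequeue */
		rq->nr_pref_llc[llc] += delta;
	}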

The load balancer can use this information to identify busy
runqueues and migrate tasks to their preferred LLC domains (see
the sketch after this paragraph). The array is reallocated at
runtime if the number of LLCs grows due to CPU hotplug. Only
extending the buffer (never shrinking it) is supported, to keep
the implementation simple.
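
As a rough idea of how a balancer could consult these counters (the
helper name and the policy are hypothetical; the actual usage arrives
later in the series):

	/*
	 * Hypothetical helper: does @rq hold any tasks that would
	 * rather run in LLC @llc? A balancer pulling work toward
	 * @llc could prefer such runqueues as migration sources.
	 */
	static bool rq_has_pref_llc_tasks(struct rq *rq, int llc)
	{
		if (!rq->nr_pref_llc || llc < 0 || llc >= max_llcs)
			return false;

		return rq->nr_pref_llc[llc] > 0;
	}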

This patch only introduces the buffer allocation mechanism; the
statistics themselves are calculated in a subsequent patch.

Co-developed-by: Chen Yu <yu.c.chen@...el.com>
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
---

Notes:
    v1->v2:
        Remove static allocation of the per-runqueue LLC preference arrays.
        Size each array to the actual number of online LLCs. (Peter Zijlstra, Madadi Vineeth Reddy)

 kernel/sched/core.c     |   1 +
 kernel/sched/sched.h    |   1 +
 kernel/sched/topology.c | 117 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 118 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 48626c81ba8e..ce533dc485f5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8800,6 +8800,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SCHED_CACHE
 		raw_spin_lock_init(&rq->cpu_epoch_lock);
 		rq->cpu_epoch_next = jiffies;
+		rq->nr_pref_llc = NULL;
 #endif
 
 		zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ee8b70647835..8f2a779825e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1129,6 +1129,7 @@ struct rq {
 #ifdef CONFIG_SCHED_CACHE
 	unsigned int		nr_pref_llc_running;
 	unsigned int		nr_llc_running;
+	unsigned int		*nr_pref_llc;
 #endif
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned long		last_blocked_load_update_tick;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f25d950ab015..d583399fc6a1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -17,8 +17,121 @@ void sched_domains_mutex_unlock(void)
 	mutex_unlock(&sched_domains_mutex);
 }
 
+/* max number of LLCs detected while (re)building sched domains */
+static int new_max_llcs;
+/* max number of LLCs the per-rq buffers are currently sized for */
 int max_llcs;
 
+#ifdef CONFIG_SCHED_CACHE
+
+static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
+{
+	unsigned int *new = NULL;
+
+	new = kcalloc(new_max_llcs, sizeof(unsigned int),
+		      GFP_KERNEL | __GFP_NOWARN);
+
+	if (!new) {
+		*gc = NULL;
+	} else {
+		/*
+		 * Place old entry in garbage collector
+		 * for later disposal.
+		 */
+		*gc = old;
+	}
+	return new;
+}
+
+static void populate_new_pref_llcs(unsigned int *old, unsigned int *new)
+{
+	int i;
+
+	if (!old)
+		return;
+
+	for (i = 0; i < max_llcs; i++)
+		new[i] = old[i];
+}
+
+static int resize_llc_pref(void)
+{
+	unsigned int *__percpu *tmp_llc_pref;
+	int i, ret = 0;
+
+	if (new_max_llcs <= max_llcs)
+		return 0;
+
+	/*
+	 * Allocate temp percpu pointer for old llc_pref,
+	 * which will be released after switching to the
+	 * new buffer.
+	 */
+	tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
+	if (!tmp_llc_pref)
+		return -ENOMEM;
+
+	for_each_present_cpu(i)
+		*per_cpu_ptr(tmp_llc_pref, i) = NULL;
+
+	/*
+	 * Resize the per rq nr_pref_llc buffer and
+	 * switch to this new buffer.
+	 */
+	for_each_present_cpu(i) {
+		struct rq_flags rf;
+		unsigned int *new;
+		struct rq *rq;
+
+		rq = cpu_rq(i);
+		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
+		if (!new) {
+			ret = -ENOMEM;
+
+			goto release_old;
+		}
+
+		/*
+		 * Locking rq ensures that rq->nr_pref_llc values
+		 * don't change with new task enqueue/dequeue
+		 * when we repopulate the newly enlarged array.
+		 */
+		rq_lock_irqsave(rq, &rf);
+		populate_new_pref_llcs(rq->nr_pref_llc, new);
+		rq->nr_pref_llc = new;
+		rq_unlock_irqrestore(rq, &rf);
+	}
+
+release_old:
+	/*
+	 * Load balancing runs under rcu_read_lock() and may still
+	 * be referencing the old nr_pref_llc[] buffers. Wait for
+	 * any load balancing that started before or during the
+	 * resize to finish before freeing those buffers.
+	 */
+	synchronize_rcu();
+	for_each_present_cpu(i)
+		kfree(*per_cpu_ptr(tmp_llc_pref, i));
+
+	free_percpu(tmp_llc_pref);
+
+	/* all allocations succeeded: publish the new size */
+	if (!ret)
+		max_llcs = new_max_llcs;
+
+	return ret;
+}
+
+#else
+
+static int resize_llc_pref(void)
+{
+	max_llcs = new_max_llcs;
+	return 0;
+}
+
+#endif
+
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
@@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd,
 	 *
 	 * For both cases, we want to increase the number of LLCs.
 	 */
-	per_cpu(sd_llc_id, cpu) = max_llcs++;
+	per_cpu(sd_llc_id, cpu) = new_max_llcs++;
 
 	return per_cpu(sd_llc_id, cpu);
 }
@@ -2674,6 +2787,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	if (has_cluster)
 		static_branch_inc_cpuslocked(&sched_cluster_active);
 
+	resize_llc_pref();
+
 	if (rq && sched_debug_verbose)
 		pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
 
-- 
2.32.0

