Message-Id: <22f5c52b3e904bd782c43bc4bfc6fcd4b447ea54.1750268218.git.tim.c.chen@linux.intel.com>
Date: Wed, 18 Jun 2025 11:27:54 -0700
From: Tim Chen <tim.c.chen@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>
Cc: Chen Yu <yu.c.chen@...el.com>,
Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Tim Chen <tim.c.chen@...el.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Libo Chen <libo.chen@...cle.com>,
Abel Wu <wuyun.abel@...edance.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>,
Len Brown <len.brown@...el.com>,
linux-kernel@...r.kernel.org
Subject: [RFC patch v3 06/20] sched: Save the per LLC utilization for better cache aware scheduling
From: Chen Yu <yu.c.chen@...el.com>
When a system gets busy and a process's preferred LLC is saturated
by too many threads of that process, tasks are migrated frequently
within that LLC. The resulting migration latency degrades performance.
Ideally, task aggregation should be inhibited when the task's preferred
LLC is overloaded, which requires a metric indicating whether the LLC
is busy.
Store the per-LLC utilization calculated via periodic load
balancing. These statistics will be used in subsequent patches to
determine whether tasks should be aggregated to their preferred LLC.
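To illustrate how a later patch in this series might consume the saved
statistics, here is a minimal sketch (not part of this patch). The
helper name llc_has_headroom() and the 80% busy threshold are
illustrative assumptions only; the sketch builds on the get_llc_stats()
helper added to fair.c below:

static bool llc_has_headroom(int cpu)
{
	unsigned long util, cap;
	bool ok;

	/* get_llc_stats() expects rcu_read_lock() to be held. */
	rcu_read_lock();
	ok = get_llc_stats(cpu, &util, &cap);
	rcu_read_unlock();

	/* No LLC statistics available: do not inhibit aggregation. */
	if (!ok)
		return true;

	/* Treat the LLC as busy once utilization exceeds ~80% of capacity. */
	return util * 100 < cap * 80;
}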
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
---
include/linux/sched/topology.h | 3 ++
kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++++++++
2 files changed, 56 insertions(+)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7b4301b7235f..b3115bc1cbc0 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -78,6 +78,9 @@ struct sched_domain_shared {
atomic_t nr_busy_cpus;
int has_idle_cores;
int nr_idle_scan;
+#ifdef CONFIG_SCHED_CACHE
+ unsigned long util_avg;
+#endif
};
struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fb2322c5d9e..02f104414b9a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8806,6 +8806,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
#ifdef CONFIG_SCHED_CACHE
static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle);
+/* Expected to be called with rcu_read_lock() held. */
+static bool get_llc_stats(int cpu, unsigned long *util,
+ unsigned long *cap)
+{
+ struct sched_domain_shared *sd_share;
+
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (!sd_share)
+ return false;
+
+ *util = READ_ONCE(sd_share->util_avg);
+ *cap = per_cpu(sd_llc_size, cpu) * SCHED_CAPACITY_SCALE;
+
+ return true;
+}
+
static int select_cache_cpu(struct task_struct *p, int prev_cpu)
{
struct mm_struct *mm = p->mm;
@@ -10646,6 +10662,42 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
return check_cpu_capacity(rq, sd);
}
+#ifdef CONFIG_SCHED_CACHE
+/*
+ * Save this sched group's statistics for later use:
+ * task wakeup and load balancing can make better
+ * decisions based on these statistics.
+ */
+static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ /* Find the sched domain that spans this group. */
+ struct sched_domain *sd = env->sd->child;
+ struct sched_domain_shared *sd_share;
+
+ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ /* Only care about the sched domain that spans exactly one LLC. */
+ if (!sd || !(sd->flags & SD_SHARE_LLC) ||
+ !sd->parent || (sd->parent->flags & SD_SHARE_LLC))
+ return;
+
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared,
+ cpumask_first(sched_group_span(group))));
+ if (!sd_share)
+ return;
+
+ if (likely(READ_ONCE(sd_share->util_avg) != sgs->group_util))
+ WRITE_ONCE(sd_share->util_avg, sgs->group_util);
+}
+#else
+static inline void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+}
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -10735,6 +10787,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+ update_sg_if_llc(env, sgs, group);
/* Computing avg_load makes sense only when group is overloaded */
if (sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
--
2.32.0