Message-Id: <d6830774db8e260af4da728c44e9f899376a0ea6.1760206683.git.tim.c.chen@linux.intel.com>
Date: Sat, 11 Oct 2025 11:24:40 -0700
From: Tim Chen <tim.c.chen@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>
Cc: Chen Yu <yu.c.chen@...el.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>,
Shrikanth Hegde <sshegde@...ux.ibm.com>,
Jianyong Wu <jianyong.wu@...look.com>,
Yangyu Chen <cyy@...self.name>,
Tingyin Duan <tingyin.duan@...il.com>,
Vern Hao <vernhao@...cent.com>,
Len Brown <len.brown@...el.com>,
Tim Chen <tim.c.chen@...ux.intel.com>,
Aubrey Li <aubrey.li@...el.com>,
Zhao Liu <zhao1.liu@...el.com>,
Chen Yu <yu.chen.surf@...il.com>,
Libo Chen <libo.chen@...cle.com>,
Adam Li <adamli@...amperecomputing.com>,
Tim Chen <tim.c.chen@...el.com>,
linux-kernel@...r.kernel.org
Subject: [PATCH 03/19] sched/fair: Introduce helper functions to enforce LLC migration policy
From: Chen Yu <yu.c.chen@...el.com>
Cache-aware scheduling aggregates threads onto their preferred LLC,
mainly through load balancing. When the preferred LLC becomes
saturated, more threads are still placed there, increasing latency.
A mechanism is needed to limit aggregation so that the preferred LLC
does not become overloaded.
Introduce helper functions can_migrate_llc() and
can_migrate_llc_task() to enforce the LLC migration policy:
1. Aggregate a task to its preferred LLC if both the source and
   destination LLCs are not too busy (<50% utilization, tunable),
   or if doing so will not leave the preferred LLC much more
   loaded than the non-preferred one (>20% utilization
   difference, tunable, similar to imbalance_pct of the LLC domain).
2. Allow moving a task from an overloaded preferred LLC to a
   non-preferred LLC if this will not leave the non-preferred LLC
   imbalanced enough to trigger a later migration back.
3. If both LLCs are too busy, let generic load balancing spread
   the tasks.
This hysteresis prevents tasks from bouncing back and forth between
the preferred LLC and other LLCs: the threshold for migrating a task
out of its preferred LLC is higher than the threshold for migrating
it in, as the worked example below illustrates.
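As a worked illustration (hypothetical numbers; assume an LLC capacity
of 1024 and a task utilization small enough to ignore), take a
preferred LLC at util 532 (~52%) and a non-preferred LLC at util 460
(~45%) with the default llc_overload_pct=50 and llc_imb_pct=20. The
non-preferred LLC is below 50%, so the "both LLCs busy" bailout does
not apply:
  Pull into the preferred LLC (to_pref, dst=532, src=460):
    !fits_llc_capacity(532, 1024)  /* 53200 >= 51200: dst is busy   */
    util_greater(532, 460)         /* 53200 > 55200 is false        */
    -> not forbidden, mig_llc: aggregation is allowed
  Push out of the preferred LLC (!to_pref, src=532, dst=460):
    fits_llc_capacity(532, 1024)   /* false: src is not idle enough */
    !util_greater(532, 460)        /* true: imbalance below 20%     */
    -> mig_forbid: the task stays on the preferred LLC
The same utilization gap thus allows pulling a task in but refuses
pushing one out, which is the hysteresis described above.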
Since aggregation tends to make the preferred LLC busier than others,
the imbalance tolerance is controlled by llc_imb_pct. If set to 0,
tasks may still aggregate to the preferred LLC as long as it is
not more utilized than the source LLC, preserving the preference.
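For instance, with llc_imb_pct set to 0 the bias in util_greater()
vanishes:
  util_greater(dst_util, src_util)
    => dst_util * 100 > src_util * (100 + 0)
    => dst_util > src_util
so pulling a task into a busy preferred LLC is refused only when the
move would make it strictly more utilized than the source LLC.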
Co-developed-by: Tim Chen <tim.c.chen@...ux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
---
kernel/sched/debug.c | 4 ++
kernel/sched/fair.c | 145 +++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 5 ++
3 files changed, 154 insertions(+)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 02e16b70a790..57bb04ebbf96 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -523,6 +523,10 @@ static __init int sched_init_debug(void)
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_SCHED_CACHE
+ debugfs_create_u32("llc_overload_pct", 0644, debugfs_sched, &llc_overload_pct);
+ debugfs_create_u32("llc_imb_pct", 0644, debugfs_sched, &llc_imb_pct);
+#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1ebb0d99a906..cd080468ddc9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1205,6 +1205,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
#define EPOCH_PERIOD (HZ / 100) /* 10 ms */
#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */
+__read_mostly unsigned int llc_overload_pct = 50;
+__read_mostly unsigned int llc_imb_pct = 20;
+
static int llc_id(int cpu)
{
if (cpu < 0)
@@ -9560,6 +9563,27 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_
}
#ifdef CONFIG_SCHED_CACHE
+/*
+ * The margin used when comparing LLC utilization against LLC capacity.
+ * Parameter llc_overload_pct sets the load level below which active
+ * LLC aggregation is performed.
+ * Derived from fits_capacity().
+ *
+ * (default: 50%)
+ */
+#define fits_llc_capacity(util, max) \
+ ((util) * 100 < (max) * llc_overload_pct)
+
+/*
+ * The margin used when comparing utilization:
+ * is 'util1' noticeably greater than 'util2'?
+ * Derived from capacity_greater().
+ * The bias is a percentage.
+ */
+/* Allow dst util to exceed src util by up to llc_imb_pct percent. */
+#define util_greater(util1, util2) \
+ ((util1) * 100 > (util2) * (100 + llc_imb_pct))
+
/* Called from load balancing paths with rcu_read_lock held */
static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
unsigned long *cap)
@@ -9575,6 +9599,127 @@ static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util,
return true;
}
+
+/*
+ * Decision matrix based on LLC utilization, used to decide
+ * whether tasks can be aggregated across LLCs.
+ *
+ * By default, 50% is the threshold to treat the LLC as busy,
+ * and 20% is the utilization imbalance percentage to decide
+ * if the preferred LLC is busier than the non-preferred LLC.
+ *
+ * 1. moving towards the preferred LLC, dst is the preferred
+ * LLC, src is not:
+ *
+ * src \ dst 30% 40% 50% 60%
+ * 30% Y Y Y N
+ * 40% Y Y Y Y
+ * 50% Y Y G G
+ * 60% Y Y G G
+ *
+ * 2. moving out of the preferred LLC, src is the preferred
+ * LLC, dst is not:
+ *
+ * src \ dst 30% 40% 50% 60%
+ * 30% N N N N
+ * 40% N N N N
+ * 50% N N G G
+ * 60% Y N G G
+ *
+ * src : src_util
+ * dst : dst_util
+ * Y : Yes, migrate
+ * N : No, do not migrate
+ * G : let the Generic load balance to even the load.
+ *
+ * The intention is that if both LLCs are quite busy, cache-aware
+ * load balancing should not be performed and generic load balancing
+ * should take effect. However, if one is busy and the other is not,
+ * the preferred LLC capacity (50%) and imbalance criteria (20%) are
+ * considered to determine whether LLC aggregation should be
+ * performed to bias the load towards the preferred LLC.
+ */
+
+/* Migration decision; the three states are mutually exclusive. */
+enum llc_mig {
+ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */
+ mig_llc, /* Y: Do LLC preference based migration */
+ mig_unrestricted /* G: Don't restrict generic load balance migration */
+};
+
+static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu,
+ unsigned long tsk_util,
+ bool to_pref)
+{
+ unsigned long src_util, dst_util, src_cap, dst_cap;
+
+ if (!get_llc_stats(src_cpu, &src_util, &src_cap) ||
+ !get_llc_stats(dst_cpu, &dst_util, &dst_cap))
+ return mig_unrestricted;
+
+ if (!fits_llc_capacity(dst_util, dst_cap) &&
+ !fits_llc_capacity(src_util, src_cap))
+ return mig_unrestricted;
+
+ src_util = src_util < tsk_util ? 0 : src_util - tsk_util;
+ dst_util = dst_util + tsk_util;
+ if (to_pref) {
+ /*
+ * llc_imb_pct is the imbalance allowed between the
+ * preferred LLC and a non-preferred LLC.
+ * Don't migrate if it would leave the preferred LLC
+ * too heavily loaded and much busier than the source,
+ * in which case the migration would increase the
+ * imbalance too much.
+ */
+ if (!fits_llc_capacity(dst_util, dst_cap) &&
+ util_greater(dst_util, src_util))
+ return mig_forbid;
+ } else {
+ /*
+ * Don't migrate if it would leave the preferred LLC
+ * too idle, or if the migration would bring the
+ * non-preferred LLC within llc_imb_pct percent of the
+ * preferred LLC, which would trigger a later migration
+ * back to the preferred LLC.
+ */
+ if (fits_llc_capacity(src_util, src_cap) ||
+ !util_greater(src_util, dst_util))
+ return mig_forbid;
+ }
+ return mig_llc;
+}
+
+/*
+ * Check if task p can migrate from src_cpu to dst_cpu
+ * in terms of cache-aware load balancing.
+ */
+static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ bool to_pref;
+ int cpu;
+
+ mm = p->mm;
+ if (!mm)
+ return mig_unrestricted;
+
+ cpu = mm->mm_sched_cpu;
+ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
+ return mig_unrestricted;
+
+ if (cpus_share_cache(dst_cpu, cpu))
+ to_pref = true;
+ else if (cpus_share_cache(src_cpu, cpu))
+ to_pref = false;
+ else
+ return mig_unrestricted;
+
+ return can_migrate_llc(src_cpu, dst_cpu,
+ task_util(p), to_pref);
+}
+
#else
static inline bool get_llc_stats(int cpu, unsigned long *util,
unsigned long *cap)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2ded8d3d0ecc..a52c96064b36 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2797,6 +2797,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
+#ifdef CONFIG_SCHED_CACHE
+extern unsigned int llc_overload_pct;
+extern unsigned int llc_imb_pct;
+#endif
+
#ifdef CONFIG_SCHED_HRTICK
/*
--
2.32.0