linux-kernel - [RFC PATCH v4 27/28] sched: Allow the user space to tune the scale factor for RSS comparison

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <81c197882b7c9f4325a5cb32f8a9d1e1fc900297.1754712565.git.tim.c.chen@linux.intel.com>
Date: Sat,  9 Aug 2025 13:09:02 +0800
From: Chen Yu <yu.c.chen@...el.com>
To: Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	K Prateek Nayak <kprateek.nayak@....com>,
	"Gautham R . Shenoy" <gautham.shenoy@....com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>,
	Juri Lelli <juri.lelli@...hat.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Valentin Schneider <vschneid@...hat.com>,
	Libo Chen <libo.chen@...cle.com>,
	Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
	Hillf Danton <hdanton@...a.com>,
	Shrikanth Hegde <sshegde@...ux.ibm.com>,
	Jianyong Wu <jianyong.wu@...look.com>,
	Yangyu Chen <cyy@...self.name>,
	Tingyin Duan <tingyin.duan@...il.com>,
	Vern Hao <vernhao@...cent.com>,
	Len Brown <len.brown@...el.com>,
	Tim Chen <tim.c.chen@...ux.intel.com>,
	Aubrey Li <aubrey.li@...el.com>,
	Zhao Liu <zhao1.liu@...el.com>,
	Chen Yu <yu.chen.surf@...il.com>,
	Chen Yu <yu.c.chen@...el.com>,
	linux-kernel@...r.kernel.org
Subject: [RFC PATCH v4 27/28] sched: Allow the user space to tune the scale factor for RSS comparison

sched_cache compares the process's resident pages with
the size of the LLC to determine whether task aggregation
on the preferred LLC might cause cache contention. If the
former is larger than the latter, skip cache-aware task
aggregation. However, some workloads with large resident
pages have a small memory footprint; such workloads could
benefit from cache-aware scheduling. The kernel lacks an
efficient mechanism to track the task's memory footprint
(yes, we have resctrl, but it is for user-space query,
and not process scope), so it is up to userspace to pass
this hint to the kernel.

Introduce /sys/kernel/debug/sched/llc_ignore_rss
to control the extent to which users ignore the RSS
restriction. This value ranges from 0 to 100. A value of
0 means that the user disables the cache aware scheduling.
1 means if a process's RSS is larger than the LLC size,
cache-aware scheduling will be skipped. 100 means cache
aware scheduling is always enabled regardless of RSS size.
N (between 1 and 99) means turn off cache aware scheduling
when RSS is greater than (1 + (N-1) * 256) * LLC size.

For example, suppose the L3 size is 32MB. If the
sysctl_sched_cache_ignore_rss is 1: When the RSS is larger
than 32MB, the process is regarded as exceeding the LLC capacity.
If the sysctl_sched_cache_ignore_rss is 99: When the RSS is
larger than 784GB, the process is regarded as exceeding the
LLC capacity (please refer to the code):
784GB = (1 + (99 - 1) * 256) * 32MB

Additionally, the number of SMT siblings per core is now taken into
account for the aggregation cap: the effective cap is
sysctl_sched_cache_aggr_cap (exposed as the llc_aggr_cap debugfs file)
divided by the number of SMT siblings in the core. This inhibits task
aggregation from cache-aware scheduling on systems with a high number
of SMT siblings per core, like Power 10 and Power 11.

Reported-by: K Prateek Nayak <kprateek.nayak@....com>
Reported-by: Madadi Vineeth Reddy <vineethr@...ux.ibm.com>
Reported-by: Shrikanth Hegde <sshegde@...ux.ibm.com>
Co-developed-by: Tim Chen <tim.c.chen@...ux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
---
 kernel/sched/debug.c | 82 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/fair.c  | 10 ++++--
 kernel/sched/sched.h |  3 +-
 3 files changed, 90 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7a9ec03704b9..6676fc2a8c08 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -166,6 +166,83 @@ static const struct file_operations sched_feat_fops = {
 	.release	= single_release,
 };
 
+#ifdef CONFIG_SCHED_CACHE
+#define SCHED_CACHE_CREATE_CONTROL(name, val)			  \
+static int sysctl_sched_cache_##name = val;			  \
+static ssize_t sched_cache_write_##name(struct file *filp,	  \
+					const char __user *ubuf,  \
+					size_t cnt, loff_t *ppos) \
+{								  \
+	char buf[16];						  \
+	unsigned int percent;					  \
+	if (cnt > 15)						  \
+		cnt = 15;					  \
+	if (copy_from_user(&buf, ubuf, cnt))			  \
+		return -EFAULT;					  \
+	buf[cnt] = '\0';					  \
+	if (kstrtouint(buf, 10, &percent))			  \
+		return -EINVAL;					  \
+	if (percent > 100)					  \
+		return -EINVAL;					  \
+	sysctl_sched_cache_##name = percent;			  \
+	*ppos += cnt;						  \
+	return cnt;						  \
+}								  \
+static int sched_cache_show_##name(struct seq_file *m, void *v)	  \
+{								  \
+	seq_printf(m, "%d\n", sysctl_sched_cache_##name);	  \
+	return 0;						  \
+}								  \
+static int sched_cache_open_##name(struct inode *inode,		  \
+				   struct file *filp)		  \
+{								  \
+	return single_open(filp, sched_cache_show_##name, NULL);  \
+}								  \
+static const struct file_operations sched_cache_fops_##name = {	  \
+	.open		= sched_cache_open_##name,		  \
+	.write		= sched_cache_write_##name,		  \
+	.read		= seq_read,				  \
+	.llseek		= seq_lseek,				  \
+	.release	= single_release,			  \
+}
+
+SCHED_CACHE_CREATE_CONTROL(ignore_rss, 1);
+int get_sched_cache_rss_scale(void)
+{
+	if (!sysctl_sched_cache_ignore_rss)
+		return 0;
+
+	if (sysctl_sched_cache_ignore_rss >= 100)
+		return INT_MAX;
+	/*
+	 * Suppose the L3 size is 32MB. If the
+	 * sysctl_sched_cache_ignore_rss is 1:
+	 * When the RSS is larger than 32MB,
+	 * the process is regarded as exceeding
+	 * the LLC capacity. If the
+	 * sysctl_sched_cache_ignore_rss is 99:
+	 * When the RSS is larger than 784GB,
+	 * the process is regarded as exceeding
+	 * the LLC capacity:
+	 * 784GB = (1 + (99 - 1) * 256) * 32MB
+	 */
+	return (1 + (sysctl_sched_cache_ignore_rss - 1) * 256);
+}
+
+SCHED_CACHE_CREATE_CONTROL(aggr_cap, 50);
+int get_sched_cache_cap_scale(void)
+{
+	int smt_nr = 1;
+
+#ifdef CONFIG_SCHED_SMT
+	if (sched_smt_active())
+		smt_nr =
+			cpumask_weight(cpu_smt_mask(raw_smp_processor_id()));
+#endif
+	return (sysctl_sched_cache_aggr_cap / smt_nr);
+}
+#endif /* SCHED_CACHE */
+
 #ifdef CONFIG_SMP
 
 static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
@@ -533,10 +610,13 @@ static __init int sched_init_debug(void)
 #endif
 
 #ifdef CONFIG_SCHED_CACHE
-	debugfs_create_u32("llc_aggr_cap", 0644, debugfs_sched, &sysctl_llc_aggr_cap);
 	debugfs_create_u32("llc_aggr_imb", 0644, debugfs_sched, &sysctl_llc_aggr_imb);
 	debugfs_create_u32("llc_period", 0644, debugfs_sched, &sysctl_llc_period);
 	debugfs_create_u32("llc_old", 0644, debugfs_sched, &sysctl_llc_old);
+	debugfs_create_file("llc_aggr_cap", 0644, debugfs_sched, NULL,
+			    &sched_cache_fops_aggr_cap);
+	debugfs_create_file("llc_ignore_rss", 0644, debugfs_sched, NULL,
+			    &sched_cache_fops_ignore_rss);
 #endif
 	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cbda7dad1305..018825f04063 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1211,6 +1211,7 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
 	struct cacheinfo *l3_leaf;
 	unsigned long rss;
 	unsigned int llc;
+	int scale;
 
 	/*
 	 * get_cpu_cacheinfo_level() can not be used
@@ -1230,7 +1231,11 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
 	rss = get_mm_counter(mm, MM_ANONPAGES) +
 		get_mm_counter(mm, MM_SHMEMPAGES);
 
-	return (llc <= (rss * PAGE_SIZE));
+	scale = get_sched_cache_rss_scale();
+	if (scale == INT_MAX)
+		return false;
+
+	return ((llc * scale) <= (rss * PAGE_SIZE));
 }
 
 static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
@@ -9037,7 +9042,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 static long __migrate_degrades_locality(struct task_struct *p,
 					int src_cpu, int dst_cpu,
 					bool idle);
-__read_mostly unsigned int sysctl_llc_aggr_cap       = 50;
 __read_mostly unsigned int sysctl_llc_aggr_imb       = 20;
 
 /*
@@ -9049,7 +9053,7 @@ __read_mostly unsigned int sysctl_llc_aggr_imb       = 20;
  * (default: ~50%)
  */
 #define fits_llc_capacity(util, max)	\
-	((util) * 100 < (max) * sysctl_llc_aggr_cap)
+	((util) * 100 < (max) * get_sched_cache_cap_scale())
 
 /*
  * The margin used when comparing utilization.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d752d64d4acd..eaeca4e77ead 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2855,11 +2855,12 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_numa_balancing_hot_threshold;
 
 #ifdef CONFIG_SCHED_CACHE
-extern unsigned int sysctl_llc_aggr_cap;
 extern unsigned int sysctl_llc_aggr_imb;
 extern struct static_key_false sched_cache_present;
 extern unsigned int sysctl_llc_period;
 extern unsigned int sysctl_llc_old;
+int get_sched_cache_rss_scale(void);
+int get_sched_cache_cap_scale(void);
 #endif
 
 #ifdef CONFIG_SCHED_HRTICK
-- 
2.25.1