Message-ID: <8c33a6b3-4ff6-4cf0-855e-a9940b80c831@gmail.com>
Date: Fri, 19 Dec 2025 12:14:23 +0800
From: Vern Hao <haoxing990@...il.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>,
 Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
 K Prateek Nayak <kprateek.nayak@....com>,
 "Gautham R . Shenoy" <gautham.shenoy@....com>,
 Vincent Guittot <vincent.guittot@...aro.org>
Cc: Chen Yu <yu.c.chen@...el.com>, Juri Lelli <juri.lelli@...hat.com>,
 Dietmar Eggemann <dietmar.eggemann@....com>,
 Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
 Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
 Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
 Hillf Danton <hdanton@...a.com>, Shrikanth Hegde <sshegde@...ux.ibm.com>,
 Jianyong Wu <jianyong.wu@...look.com>, Yangyu Chen <cyy@...self.name>,
 Tingyin Duan <tingyin.duan@...il.com>, Vern Hao <vernhao@...cent.com>,
 Len Brown <len.brown@...el.com>, Aubrey Li <aubrey.li@...el.com>,
 Zhao Liu <zhao1.liu@...el.com>, Chen Yu <yu.chen.surf@...il.com>,
 Adam Li <adamli@...amperecomputing.com>, Aaron Lu <ziqianlu@...edance.com>,
 Tim Chen <tim.c.chen@...el.com>, linux-kernel@...r.kernel.org,
 Vern Hao <haoxing990@...il.com>
Subject: Re: [PATCH v2 20/23] sched/cache: Add user control to adjust the
 parameters of cache-aware scheduling


On 2025/12/4 07:07, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@...el.com>
>
> Introduce a set of debugfs knobs to control the enabling of
> and parameters for cache-aware load balancing.
>
> (1) llc_enabled
> llc_enabled acts as the primary switch - users can toggle it to
> enable or disable cache-aware load balancing.
>
> (2) llc_aggr_tolerance
> With sched_cache enabled, the scheduler uses a process's RSS as a
> proxy for its LLC footprint to determine if aggregating tasks on the
> preferred LLC could cause cache contention. If RSS exceeds the LLC
> size, aggregation is skipped. Some workloads with large RSS but small
> actual memory footprints may still benefit from aggregation. Since
> the kernel cannot efficiently track per-task cache usage (resctrl is
> user-space only), userspace can provide a more accurate hint.
>
> Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let
> users control how strictly RSS limits aggregation. Values range from
> 0 to 100:
>
>    - 0: Cache-aware scheduling is disabled.
>    - 1: Strict; tasks with RSS larger than LLC size are skipped.
>    - 100: Aggressive; tasks are aggregated regardless of RSS.
>
> For example, with a 32MB L3 cache:
>
>    - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped.
>    - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped
>      (784GB = (1 + (99 - 1) * 256) * 32MB).
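
(Side note, not an objection: I double-checked the 784GB number above with
the small userspace snippet below. It only replays the formula from the
changelog for the two documented tolerance values; nothing here comes from
the kernel side.)

#include <stdio.h>

int main(void)
{
	/* 32MB L3 and the two tolerance values from the example above */
	const unsigned long long llc_size = 32ULL << 20;
	const unsigned int tols[] = { 1, 99 };

	for (int i = 0; i < 2; i++) {
		/* scale = 1 + (tolerance - 1) * 256, as documented */
		unsigned long long scale = 1 + (tols[i] - 1) * 256ULL;

		/* tolerance=1 prints 32 MB, tolerance=99 prints 802848 MB (~784GB) */
		printf("llc_aggr_tolerance=%u -> RSS threshold %llu MB\n",
		       tols[i], (scale * llc_size) >> 20);
	}
	return 0;
}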
>
> Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls
> how strictly the number of active threads is considered when doing
> cache-aware load balancing. The number of SMTs is also considered.
> High SMT counts reduce the aggregation capacity, preventing excessive
> task aggregation on SMT-heavy systems like Power10/Power11.
>
> For example, with 8 Cores/16 CPUs in a L3:
>
>    - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped.
>    - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped
>      785 = (1 + (99 - 1) * 8).
>
> (3) llc_epoch_period/llc_epoch_affinity_timeout
> In addition, llc_epoch_period and llc_epoch_affinity_timeout are also
> made tunable.
>
> Suggested-by: K Prateek Nayak <kprateek.nayak@....com>
> Suggested-by: Madadi Vineeth Reddy <vineethr@...ux.ibm.com>
> Suggested-by: Shrikanth Hegde <sshegde@...ux.ibm.com>
> Suggested-by: Tingyin Duan <tingyin.duan@...il.com>
> Co-developed-by: Tim Chen <tim.c.chen@...ux.intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
> Signed-off-by: Chen Yu <yu.c.chen@...el.com>
> ---
>
> Notes:
>      v1->v2: Remove the smt_nr check in fits_llc_capacity().
>              (Aaron Lu)
>
>   include/linux/sched.h   |  4 ++-
>   kernel/sched/debug.c    | 62 ++++++++++++++++++++++++++++++++++++++++
>   kernel/sched/fair.c     | 63 ++++++++++++++++++++++++++++++++++++-----
>   kernel/sched/sched.h    |  5 ++++
>   kernel/sched/topology.c | 54 +++++++++++++++++++++++++++++++++--
>   5 files changed, 178 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 466ba8b7398c..95bf080bbbf0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2436,9 +2436,11 @@ extern void migrate_enable(void);
>   DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>   
>   #ifdef CONFIG_SCHED_CACHE
> +DECLARE_STATIC_KEY_FALSE(sched_cache_on);
> +
>   static inline bool sched_cache_enabled(void)
>   {
> -	return false;
> +	return static_branch_unlikely(&sched_cache_on);
>   }
>   #endif
>   
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 02e16b70a790..cde324672103 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = {
>   	.release	= single_release,
>   };
>   
> +#ifdef CONFIG_SCHED_CACHE
> +#define SCHED_CACHE_CREATE_CONTROL(name, max)			  \
> +static ssize_t sched_cache_write_##name(struct file *filp,	  \
> +					const char __user *ubuf,  \
> +					size_t cnt, loff_t *ppos) \
> +{								  \
> +	char buf[16];						  \
> +	unsigned int val;					  \
> +	if (cnt > 15)						  \
> +		cnt = 15;					  \
> +	if (copy_from_user(&buf, ubuf, cnt))			  \
> +		return -EFAULT;					  \
> +	buf[cnt] = '\0';					  \
> +	if (kstrtouint(buf, 10, &val))				  \
> +		return -EINVAL;					  \
> +	if (val > (max))						  \
> +		return -EINVAL;					  \
> +	llc_##name = val;					  \
> +	if (!strcmp(#name, "enabled"))				  \
> +		sched_cache_set(false);				  \
> +	*ppos += cnt;						  \
> +	return cnt;						  \
> +}								  \
> +static int sched_cache_show_##name(struct seq_file *m, void *v)	  \
> +{								  \
> +	seq_printf(m, "%d\n", llc_##name);			  \
> +	return 0;						  \
> +}								  \
> +static int sched_cache_open_##name(struct inode *inode,		  \
> +				   struct file *filp)		  \
> +{								  \
> +	return single_open(filp, sched_cache_show_##name, NULL);  \
> +}								  \
> +static const struct file_operations sched_cache_fops_##name = {	  \
> +	.open		= sched_cache_open_##name,		  \
> +	.write		= sched_cache_write_##name,		  \
> +	.read		= seq_read,				  \
> +	.llseek		= seq_lseek,				  \
> +	.release	= single_release,			  \
> +}
> +
> +SCHED_CACHE_CREATE_CONTROL(overload_pct, 100);
> +SCHED_CACHE_CREATE_CONTROL(imb_pct, 100);
> +SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100);
> +SCHED_CACHE_CREATE_CONTROL(enabled, 1);
> +#endif /* SCHED_CACHE */
> +
>   static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
>   				   size_t cnt, loff_t *ppos)
>   {
> @@ -523,6 +570,21 @@ static __init int sched_init_debug(void)
>   	debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
>   #endif /* CONFIG_NUMA_BALANCING */
>   
> +#ifdef CONFIG_SCHED_CACHE
> +	debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL,
> +			    &sched_cache_fops_overload_pct);
> +	debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL,
> +			    &sched_cache_fops_imb_pct);
> +	debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL,
> +			    &sched_cache_fops_aggr_tolerance);
> +	debugfs_create_file("llc_enabled", 0644, debugfs_sched, NULL,
> +			    &sched_cache_fops_enabled);
> +	debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched,
> +			   &llc_epoch_period);
> +	debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched,
> +			   &llc_epoch_affinity_timeout);
> +#endif
> +
>   	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
>   
>   	debugfs_fair_server_init();
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 424ec601cfdf..a2e2d6742481 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1207,6 +1207,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
>   
>   __read_mostly unsigned int llc_overload_pct       = 50;
>   __read_mostly unsigned int llc_imb_pct            = 20;
> +__read_mostly unsigned int llc_aggr_tolerance     = 1;
> +__read_mostly unsigned int llc_epoch_period       = EPOCH_PERIOD;
> +__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT;
>   
>   static int llc_id(int cpu)
>   {
> @@ -1223,11 +1226,22 @@ static int llc_id(int cpu)
>   	return llc;
>   }
>   
> +static inline int get_sched_cache_scale(int mul)
> +{
> +	if (!llc_aggr_tolerance)
> +		return 0;
> +
> +	if (llc_aggr_tolerance == 100)
The range of llc_aggr_tolerance is [0, 100], so isn't there a small bug
here? Maybe check if (llc_aggr_tolerance >= 100) instead; see the sketch
below the function.

Also, if llc_aggr_tolerance = 0 the function returns 0, which makes
exceed_llc_capacity() and exceed_llc_nr() always return true. That seems
inconsistent with having llc_enabled=1 set at the same time.

> +		return INT_MAX;
> +
> +	return (1 + (llc_aggr_tolerance - 1) * mul);
> +}
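
Something like the following is what I had in mind, just a sketch on top of
this patch (untested):

static inline int get_sched_cache_scale(int mul)
{
	if (!llc_aggr_tolerance)
		return 0;

	/* treat the documented maximum (and anything above it) as "no limit" */
	if (llc_aggr_tolerance >= 100)
		return INT_MAX;

	return 1 + (llc_aggr_tolerance - 1) * mul;
}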
> +
>   static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
>   {
> +	unsigned int llc, scale;
>   	struct cacheinfo *ci;
>   	unsigned long rss;
> -	unsigned int llc;
>   
>   	/*
>   	 * get_cpu_cacheinfo_level() can not be used
> @@ -1252,19 +1266,54 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
>   	rss = get_mm_counter(mm, MM_ANONPAGES) +
>   		get_mm_counter(mm, MM_SHMEMPAGES);
>   
> -	return (llc <= (rss * PAGE_SIZE));
> +	/*
> +	 * Scale the LLC size by 256*llc_aggr_tolerance
> +	 * and compare it to the task's RSS size.
> +	 *
> +	 * Suppose the L3 size is 32MB. If the
> +	 * llc_aggr_tolerance is 1:
> +	 * When the RSS is larger than 32MB, the process
> +	 * is regarded as exceeding the LLC capacity. If
> +	 * the llc_aggr_tolerance is 99:
> +	 * When the RSS is larger than 784GB, the process
> +	 * is regarded as exceeding the LLC capacity because:
> +	 * 784GB = (1 + (99 - 1) * 256) * 32MB
> +	 */
> +	scale = get_sched_cache_scale(256);
> +	if (scale == INT_MAX)
> +		return false;
> +
> +	return ((llc * scale) <= (rss * PAGE_SIZE));
>   }
>   
>   static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
>   {
> -	int smt_nr = 1;
> +	int smt_nr = 1, scale;
>   
>   #ifdef CONFIG_SCHED_SMT
>   	if (sched_smt_active())
>   		smt_nr = cpumask_weight(cpu_smt_mask(cpu));
>   #endif
> +	/*
> +	 * Scale the Core number in a LLC by llc_aggr_tolerance
> +	 * and compare it to the task's active threads.
> +	 *
> +	 * Suppose the number of Cores in LLC is 8.
> +	 * Every core has 2 SMTs.
> +	 * If the llc_aggr_tolerance is 1: When the
> +	 * nr_running is larger than 8, the process
> +	 * is regarded as exceeding the LLC capacity.
> +	 * If the llc_aggr_tolerance is 99:
> +	 * When the nr_running is larger than 785,
> +	 * the process is regarded as exceeding
> +	 * the LLC capacity:
> +	 * 785 = 1 + (99 - 1) * 8
> +	 */
> +	scale = get_sched_cache_scale(1);
> +	if (scale == INT_MAX)
> +		return false;
>   
> -	return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu));
> +	return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu)));
>   }
>   
>   static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
> @@ -1350,9 +1399,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
>   	long delta = now - rq->cpu_epoch_next;
>   
>   	if (delta > 0) {
> -		n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
> +		n = (delta + llc_epoch_period - 1) / llc_epoch_period;
>   		rq->cpu_epoch += n;
> -		rq->cpu_epoch_next += n * EPOCH_PERIOD;
> +		rq->cpu_epoch_next += n * llc_epoch_period;
>   		__shr_u64(&rq->cpu_runtime, n);
>   	}
>   
> @@ -1412,7 +1461,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>   	 * has only 1 thread, or has too many active threads, invalidate
>   	 * its preferred state.
>   	 */
> -	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
> +	if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout ||
>   	    get_nr_threads(p) <= 1 ||
>   	    exceed_llc_nr(mm, cpu_of(rq)) ||
>   	    exceed_llc_capacity(mm, cpu_of(rq))) {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 40798a06e058..15d126bd3728 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2852,6 +2852,11 @@ extern unsigned int sysctl_numa_balancing_hot_threshold;
>   #ifdef CONFIG_SCHED_CACHE
>   extern unsigned int llc_overload_pct;
>   extern unsigned int llc_imb_pct;
> +extern unsigned int llc_aggr_tolerance;
> +extern unsigned int llc_epoch_period;
> +extern unsigned int llc_epoch_affinity_timeout;
> +extern unsigned int llc_enabled;
> +void sched_cache_set(bool locked);
>   #endif
>   
>   #ifdef CONFIG_SCHED_HRTICK
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9799e3a9a609..818599ddaaef 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -26,6 +26,49 @@ int max_llcs;
>   
>   static bool sched_cache_present;
>   
> +unsigned int llc_enabled = 1;
> +DEFINE_STATIC_KEY_FALSE(sched_cache_on);
> +
> +/*
> + * Enable/disable cache aware scheduling according to
> + * user input and the presence of hardware support.
> + */
> +static void _sched_cache_set(bool enable, bool locked)
> +{
> +	if (enable) {
> +		if (locked)
> +			static_branch_enable_cpuslocked(&sched_cache_on);
> +		else
> +			static_branch_enable(&sched_cache_on);
> +	} else {
> +		if (locked)
> +			static_branch_disable_cpuslocked(&sched_cache_on);
> +		else
> +			static_branch_disable(&sched_cache_on);
> +	}
> +}
> +
> +void sched_cache_set(bool locked)
> +{
> +	/* hardware does not support */
> +	if (!sched_cache_present) {
> +		if (static_branch_likely(&sched_cache_on))
> +			_sched_cache_set(false, locked);
> +
> +		return;
> +	}
> +
> +	/* user wants it or not ?*/
> +	if (llc_enabled) {
> +		if (!static_branch_likely(&sched_cache_on))
> +			_sched_cache_set(true, locked);
> +
> +	} else {
> +		if (static_branch_likely(&sched_cache_on))
> +			_sched_cache_set(false, locked);
> +	}
> +}
> +
>   static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc)
>   {
>   	unsigned int *new = NULL;
> @@ -70,8 +113,12 @@ static int resize_llc_pref(bool has_multi_llcs)
>   	 * new buffer.
>   	 */
>   	tmp_llc_pref = alloc_percpu_noprof(unsigned int *);
> -	if (!tmp_llc_pref)
> -		return -ENOMEM;
> +	if (!tmp_llc_pref) {
> +		sched_cache_present = false;
> +		ret = -ENOMEM;
> +
> +		goto out;
> +	}
>   
>   	for_each_present_cpu(i)
>   		*per_cpu_ptr(tmp_llc_pref, i) = NULL;
> @@ -89,6 +136,7 @@ static int resize_llc_pref(bool has_multi_llcs)
>   		new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i));
>   		if (!new) {
>   			ret = -ENOMEM;
> +			sched_cache_present = false;
>   
>   			goto release_old;
>   		}
> @@ -126,6 +174,8 @@ static int resize_llc_pref(bool has_multi_llcs)
>   	if (!ret)
>   		max_llcs = new_max_llcs;
>   
> +out:
> +	sched_cache_set(true);
>   	return ret;
>   }
>   
