Message-ID: <bea56432-09ec-496f-a95d-c04a779bbcf2@gmail.com>
Date: Wed, 17 Dec 2025 18:04:06 +0800
From: Vern Hao <haoxing990@...il.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>,
Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>,
Vincent Guittot <vincent.guittot@...aro.org>
Cc: Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>, Shrikanth Hegde <sshegde@...ux.ibm.com>,
Jianyong Wu <jianyong.wu@...look.com>, Yangyu Chen <cyy@...self.name>,
Tingyin Duan <tingyin.duan@...il.com>, Vern Hao <vernhao@...cent.com>,
Len Brown <len.brown@...el.com>, Aubrey Li <aubrey.li@...el.com>,
Zhao Liu <zhao1.liu@...el.com>, Chen Yu <yu.chen.surf@...il.com>,
Chen Yu <yu.c.chen@...el.com>, Adam Li <adamli@...amperecomputing.com>,
Aaron Lu <ziqianlu@...edance.com>, Tim Chen <tim.c.chen@...el.com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 06/23] sched/cache: Track LLC-preferred tasks per
runqueue
On 2025/12/4 07:07, Tim Chen wrote:
> For each runqueue, track the number of tasks with an LLC preference
> and how many of them are running on their preferred LLC. This mirrors
> nr_numa_running and nr_preferred_running for NUMA balancing, and will
> be used by cache-aware load balancing in later patches.
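
Just to confirm my reading of the bookkeeping introduced here (my own
pseudo-code summary, not taken from the patch itself):

	/*
	 * Invariants as I understand them:
	 *   rq->nr_llc_running      == nr of runnable tasks on this rq
	 *                              with p->preferred_llc >= 0
	 *   rq->nr_pref_llc_running == nr of those tasks for which
	 *                              p->preferred_llc == task_llc(p),
	 *                              i.e. already on their preferred LLC
	 */

Please correct me if I misread it.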
>
> Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
> ---
>
> Notes:
>     v1->v2: Invoke task_of() once and reuse its result afterwards.
>             (Peter Zijlstra)
>             Remove hacky reset_llc_stats() and introduce sched_llc_active flag
>             to properly pair enqueue/dequeue statistics update (Peter Zijlstra, K Prateek Nayak)
>
>  include/linux/sched.h |  2 ++
>  init/init_task.c      |  1 +
>  kernel/sched/core.c   |  5 ++++
>  kernel/sched/fair.c   | 60 ++++++++++++++++++++++++++++++++++++++++---
>  kernel/sched/sched.h  |  6 +++++
>  5 files changed, 71 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 1ad46220cd04..466ba8b7398c 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1408,6 +1408,8 @@ struct task_struct {
>  
>  #ifdef CONFIG_SCHED_CACHE
>  	struct callback_head		cache_work;
> +	/* whether p is currently accounted in a rq's preferred LLC stats */
> +	bool				sched_llc_active;
>  	int				preferred_llc;
>  #endif
>
> diff --git a/init/init_task.c b/init/init_task.c
> index 44bae72b5b7d..ee78837b0aa2 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -192,6 +192,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
>  	.numa_faults	= NULL,
>  #endif
>  #ifdef CONFIG_SCHED_CACHE
> +	.sched_llc_active = false,
>  	.preferred_llc	= -1,
>  #endif
>  #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e8bdf03a4b7f..48626c81ba8e 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -531,6 +531,11 @@ void __trace_set_current_state(int state_value)
>  }
>  EXPORT_SYMBOL(__trace_set_current_state);
>  
> +int task_llc(const struct task_struct *p)
> +{
> +	return per_cpu(sd_llc_id, task_cpu(p));
> +}
> +
> /*
> * Serialization rules:
> *
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 10cec83f65d5..d46a70a9d9fb 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1223,6 +1223,43 @@ static int llc_id(int cpu)
>  	return llc;
>  }
>  
> +static void account_llc_enqueue(struct rq *rq, struct task_struct *p)
> +{
> +	int pref_llc;
> +
> +	if (!sched_cache_enabled())
> +		return;
> +
> +	pref_llc = p->preferred_llc;
> +	if (pref_llc < 0)
> +		return;
> +
> +	rq->nr_llc_running++;
> +	rq->nr_pref_llc_running += (pref_llc == task_llc(p));
> +	p->sched_llc_active = true;
> +}
> +
> +static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
> +{
> +	int pref_llc;
> +
> +	/*
> +	 * Borrow the uc_se->active trick from uclamp_rq_inc_id()/
> +	 * uclamp_rq_dec_id() to avoid unbalanced updates of the
> +	 * rq statistics.
> +	 */
> +	if (unlikely(!p->sched_llc_active))
> +		return;
> +
> +	pref_llc = p->preferred_llc;
> +	if (pref_llc < 0)
> +		return;
> +
> +	rq->nr_llc_running--;
> +	rq->nr_pref_llc_running -= (pref_llc == task_llc(p));
> +	p->sched_llc_active = false;
> +}
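
The sched_llc_active flag looks good to me. IIUC it closes the same
window as uclamp's uc_se->active, something like this (hypothetical
sequence, just my understanding):

	/*
	 * 1. sched_cache disabled: account_llc_enqueue() returns early,
	 *    the stats are not incremented and the flag stays false.
	 * 2. sched_cache gets enabled while the task is still queued.
	 * 3. account_llc_dequeue() now bails on !p->sched_llc_active
	 *    instead of decrementing stats that were never incremented.
	 */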
> +
>  void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>  {
>  	unsigned long epoch;
> @@ -1294,6 +1331,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch
>  	return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
>  }
>  
> +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p);
> +
>  static inline
>  void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  {
> @@ -1346,8 +1385,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  #endif
>  	}
>
> -	if (p->preferred_llc != mm_sched_llc)
> +	/* task not on rq accounted later in account_entity_enqueue() */
> +	if (task_running_on_cpu(rq->cpu, p) &&
> +	    p->preferred_llc != mm_sched_llc) {
After this patch, the code here ends up looking like:

#ifdef CONFIG_NUMA_BALANCING
		/*
		 * Don't assign preferred LLC if it
		 * conflicts with NUMA balancing.
		 */
		if (p->numa_preferred_nid >= 0 &&
		    cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid)
			mm_sched_llc = -1;
#endif
	}
	/* task not on rq accounted later in account_entity_enqueue() */
	if (task_running_on_cpu(rq->cpu, p) &&
	    p->preferred_llc != mm_sched_llc) {
		account_llc_dequeue(rq, p);
		p->preferred_llc = mm_sched_llc;
		account_llc_enqueue(rq, p);
	}
I am a little concerned that there might be cases where both
p->preferred_llc and mm_sched_llc are equal to -1 at this point.
Is it necessary to add a check here?
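
Something like the below is the kind of explicit check I mean
(untested sketch on top of this patch; it may well be redundant given
the != test above, hence the question):

	/* both unset: nothing to account, skip the dequeue/enqueue pair */
	if (p->preferred_llc < 0 && mm_sched_llc < 0)
		return;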
> +		account_llc_dequeue(rq, p);
>  		p->preferred_llc = mm_sched_llc;
> +		account_llc_enqueue(rq, p);
> +	}
>  }
>
> static void task_tick_cache(struct rq *rq, struct task_struct *p)
> @@ -1475,6 +1519,10 @@ void init_sched_mm(struct task_struct *p) { }
>
> static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
>
> +static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {}
> +
> +static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {}
> +
> #endif
>
> /*
> @@ -3965,9 +4013,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  	update_load_add(&cfs_rq->load, se->load.weight);
>  	if (entity_is_task(se)) {
> +		struct task_struct *p = task_of(se);
>  		struct rq *rq = rq_of(cfs_rq);
>  
> -		account_numa_enqueue(rq, task_of(se));
> +		account_numa_enqueue(rq, p);
> +		account_llc_enqueue(rq, p);
>  		list_add(&se->group_node, &rq->cfs_tasks);
>  	}
>  	cfs_rq->nr_queued++;
> @@ -3978,7 +4028,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  	update_load_sub(&cfs_rq->load, se->load.weight);
>  	if (entity_is_task(se)) {
> -		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
> +		struct task_struct *p = task_of(se);
> +		struct rq *rq = rq_of(cfs_rq);
> +
> +		account_numa_dequeue(rq, p);
> +		account_llc_dequeue(rq, p);
>  		list_del_init(&se->group_node);
>  	}
>  	cfs_rq->nr_queued--;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 728737641847..ee8b70647835 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1126,6 +1126,10 @@ struct rq {
>  	unsigned int		nr_preferred_running;
>  	unsigned int		numa_migrate_on;
>  #endif
> +#ifdef CONFIG_SCHED_CACHE
> +	unsigned int		nr_pref_llc_running;
> +	unsigned int		nr_llc_running;
> +#endif
>  #ifdef CONFIG_NO_HZ_COMMON
>  	unsigned long		last_blocked_load_update_tick;
>  	unsigned int		has_blocked_load;
> @@ -1980,6 +1984,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p)
>  
>  #endif /* !CONFIG_NUMA_BALANCING */
>  
> +int task_llc(const struct task_struct *p);
> +
>  static inline void
>  queue_balance_callback(struct rq *rq,
>  		       struct balance_callback *head,