[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <398a83d7-0a8c-42cb-af66-5974582cc2ae@linux.ibm.com>
Date: Fri, 4 Jul 2025 01:03:13 +0530
From: Shrikanth Hegde <sshegde@...ux.ibm.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>,
Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>
Cc: Chen Yu <yu.c.chen@...el.com>, Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
Tim Chen <tim.c.chen@...el.com>,
Vincent Guittot
<vincent.guittot@...aro.org>,
Libo Chen <libo.chen@...cle.com>, Abel Wu <wuyun.abel@...edance.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>, Len Brown <len.brown@...el.com>,
linux-kernel@...r.kernel.org
Subject: Re: [RFC patch v3 02/20] sched: Several fixes for cache aware
scheduling
On 6/18/25 23:57, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@...el.com>
>
> 1. Fix compile error on percpu allocation.
> 2. Enqueue to the target CPU rather than the current CPU.
> 3. NULL LLC sched domain check(Libo Chen).
> 4. Introduce sched feature SCHED_CACHE to control cache aware scheduling
> 5. Fix unsigned occupancy initialization to -1.
> 6. If there is only 1 thread in the process, no need to enable cache
> awareness
> 7. Add __maybe_unused to __migrate_degrades_locality() to
> avoid compile warnings.
>
> Signed-off-by: Chen Yu <yu.c.chen@...el.com>
> ---
> include/linux/mm_types.h | 4 ++--
> kernel/sched/fair.c | 27 ++++++++++++++++-----------
> kernel/sched/features.h | 1 +
> 3 files changed, 19 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 013291c6aaa2..9de4a0a13c4d 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1411,11 +1411,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
> #endif /* CONFIG_SCHED_MM_CID */
>
> #ifdef CONFIG_SCHED_CACHE
> -extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched);
> +extern void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched);
>
> static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
> {
> - struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
> + struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
> if (!pcpu_sched)
> return -ENOMEM;
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index df7d4a324fbe..89db97f8ef02 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1175,7 +1175,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
> #define EPOCH_PERIOD (HZ/100) /* 10 ms */
> #define EPOCH_OLD 5 /* 50 ms */
>
> -void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
> +void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
> {
> unsigned long epoch;
> int i;
> @@ -1186,7 +1186,7 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
>
> pcpu_sched->runtime = 0;
> pcpu_sched->epoch = epoch = rq->cpu_epoch;
> - pcpu_sched->occ = -1;
> + pcpu_sched->occ = 0;
> }
>
> raw_spin_lock_init(&mm->mm_sched_lock);
> @@ -1254,7 +1254,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> if (!mm || !mm->pcpu_sched)
> return;
>
> - pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
> + pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq));
>
> scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
> __update_mm_sched(rq, pcpu_sched);
> @@ -1264,12 +1264,14 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> }
>
> /*
> - * If this task hasn't hit task_cache_work() for a while, invalidate
> + * If this task hasn't hit task_cache_work() for a while, or it
> + * has only 1 thread, invalidate
> * it's preferred state.
> */
> - if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
> + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD ||
> + get_nr_threads(p) <= 1) {
> mm->mm_sched_cpu = -1;
> - pcpu_sched->occ = -1;
> + pcpu_sched->occ = 0;
> }
> }
>
> @@ -1286,9 +1288,6 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
>
> guard(raw_spinlock)(&mm->mm_sched_lock);
>
> - if (mm->mm_sched_epoch == rq->cpu_epoch)
> - return;
> -
> if (work->next == work) {
> task_work_add(p, work, TWA_RESUME);
> WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> @@ -1322,6 +1321,9 @@ static void task_cache_work(struct callback_head *work)
> unsigned long occ, m_occ = 0, a_occ = 0;
> int m_cpu = -1, nr = 0, i;
>
> + if (!sd)
> + continue;
> +
> for_each_cpu(i, sched_domain_span(sd)) {
> occ = fraction_mm_sched(cpu_rq(i),
> per_cpu_ptr(mm->pcpu_sched, i));
> @@ -8801,6 +8803,9 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu)
> struct mm_struct *mm = p->mm;
> int cpu;
>
> + if (!sched_feat(SCHED_CACHE))
> + return prev_cpu;
> +
> if (!mm || p->nr_cpus_allowed == 1)
> return prev_cpu;
>
> @@ -9555,7 +9560,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
> return 0;
>
> #ifdef CONFIG_SCHED_CACHE
> - if (p->mm && p->mm->pcpu_sched) {
> + if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) {
> /*
> * XXX things like Skylake have non-inclusive L3 and might not
> * like this L3 centric view. What to do about L2 stickyness ?
> @@ -9633,7 +9638,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
> }
>
> #else
> -static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> +static __maybe_unused long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> {
> return 0;
> }
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 3c12d9f93331..d2af7bfd36bf 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
> */
> SCHED_FEAT(SIS_UTIL, true)
>
> +SCHED_FEAT(SCHED_CACHE, true)
Having both SCHED_FEAT and CONFIG_SCHED_CACHE seems like overkill.
Is it really necessary to have both?
Also, given the complexity it brings and only a workloads which spawns threads
which have data sharing among them benefit, it could be false by default.
> /*
> * Issue a WARN when we do multiple update_rq_clock() calls
> * in a single rq->lock section. Default disabled because the
Powered by blists - more mailing lists