Message-Id: <660bc36a8aacc6ba55fbcf8b0f9f05b6326e69ce.1745199017.git.yu.c.chen@intel.com>
Date: Mon, 21 Apr 2025 11:24:41 +0800
From: Chen Yu <yu.c.chen@...el.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>
Cc: Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Tim Chen <tim.c.chen@...el.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Libo Chen <libo.chen@...cle.com>,
Abel Wu <wuyun.abel@...edance.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>,
linux-kernel@...r.kernel.org,
Chen Yu <yu.c.chen@...el.com>
Subject: [RFC PATCH 2/5] sched: Several fixes for cache aware scheduling
1. Fix the compile errors on per-CPU allocation: the pointers that
receive alloc_percpu() results need the __percpu annotation (see the
note after the --- marker below).
2. Account a task's occupancy on the target CPU instead of the current
CPU when it is enqueued; otherwise the per-CPU occupancy statistics get
corrupted (see the sketch after this list).
3. Fix a NULL pointer dereference when a CPU has no LLC sched domain
(Libo Chen).
4. Avoid a duplicated epoch check in task_tick_cache().
5. Introduce the sched feature SCHED_CACHE to control cache-aware
scheduling (usage note below).
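A minimal sketch of the fix in item 2; the body of account_mm_sched()
is abbreviated here and only the pointer lookup changes:

	/*
	 * account_mm_sched() can run against a remote runqueue: at
	 * enqueue time rq may belong to the target CPU while we execute
	 * on the waking CPU. this_cpu_ptr() would then update the
	 * executing CPU's slot and corrupt the occupancy statistics, so
	 * resolve the slot of the CPU that owns rq instead:
	 */
	pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq));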
TBD, suggested in the previous version (Abel Wu): move cache_work from
per-task to per-mm_struct, and take the actual CPU capacity into
account in fraction_mm_sched().
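For item 5, both halves of the feature plumbing are in the diff below;
condensed here:

	/* kernel/sched/features.h: the feature bit, enabled by default */
	SCHED_FEAT(SCHED_CACHE, true)

	/* call sites bail out early when the feature is disabled */
	if (!sched_feat(SCHED_CACHE))
		return prev_cpu;

With CONFIG_SCHED_DEBUG the bit can also be flipped at run time,
typically via "echo NO_SCHED_CACHE > /sys/kernel/debug/sched/features"
(the exact debugfs path varies across kernel versions), which makes it
cheap to A/B test the cache-aware path.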
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
---
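Note on item 1: alloc_percpu() and friends return pointers annotated
with the __percpu address space, so the declarations that receive them
must carry the same annotation or the build trips over the percpu
accessors. A minimal sketch, using a hypothetical struct foo rather
than code from this patch:

	struct foo { u64 runtime; };

	struct foo __percpu *pcpu = alloc_percpu(struct foo);

	if (!pcpu)
		return -ENOMEM;

	/* per_cpu_ptr() converts the __percpu cookie into a plain pointer */
	per_cpu_ptr(pcpu, 0)->runtime = 0;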
include/linux/mm_types.h | 4 ++--
kernel/sched/fair.c | 15 +++++++++------
kernel/sched/features.h | 1 +
3 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 013291c6aaa2..9de4a0a13c4d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1411,11 +1411,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
#endif /* CONFIG_SCHED_MM_CID */
#ifdef CONFIG_SCHED_CACHE
-extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched);
+extern void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched);
static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
{
- struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
+ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
if (!pcpu_sched)
return -ENOMEM;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 23ea35dbd381..22b5830e7e4e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1175,7 +1175,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
#define EPOCH_PERIOD (HZ/100) /* 10 ms */
#define EPOCH_OLD 5 /* 50 ms */
-void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
+void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
{
unsigned long epoch;
int i;
@@ -1254,7 +1254,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
if (!mm || !mm->pcpu_sched)
return;
- pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
+ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq));
scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
__update_mm_sched(rq, pcpu_sched);
@@ -1286,9 +1286,6 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
guard(raw_spinlock)(&mm->mm_sched_lock);
- if (mm->mm_sched_epoch == rq->cpu_epoch)
- return;
-
if (work->next == work) {
task_work_add(p, work, TWA_RESUME);
WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
@@ -1322,6 +1319,9 @@ static void task_cache_work(struct callback_head *work)
unsigned long occ, m_occ = 0, a_occ = 0;
int m_cpu = -1, nr = 0, i;
+ if (!sd)
+ continue;
+
for_each_cpu(i, sched_domain_span(sd)) {
occ = fraction_mm_sched(cpu_rq(i),
per_cpu_ptr(mm->pcpu_sched, i));
@@ -8796,6 +8796,9 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu)
struct mm_struct *mm = p->mm;
int cpu;
+ if (!sched_feat(SCHED_CACHE))
+ return prev_cpu;
+
if (!mm || p->nr_cpus_allowed == 1)
return prev_cpu;
@@ -9550,7 +9553,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
return 0;
#ifdef CONFIG_SCHED_CACHE
- if (p->mm && p->mm->pcpu_sched) {
+ if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) {
/*
* XXX things like Skylake have non-inclusive L3 and might not
* like this L3 centric view. What to do about L2 stickyness ?
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 3c12d9f93331..d2af7bfd36bf 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/
SCHED_FEAT(SIS_UTIL, true)
+SCHED_FEAT(SCHED_CACHE, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
* in a single rq->lock section. Default disabled because the
--
2.25.1