linux-kernel - [PATCH v3 20/21] -- DO NOT APPLY!!! -- sched/cache/debug: Display the per LLC occupancy for each process via proc fs

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <09c48847deeb9d2c1c7de1f2799cc128cd2e866e.1770760558.git.tim.c.chen@linux.intel.com>
Date: Tue, 10 Feb 2026 14:19:00 -0800
From: Tim Chen <tim.c.chen@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	K Prateek Nayak <kprateek.nayak@....com>,
	"Gautham R . Shenoy" <gautham.shenoy@....com>,
	Vincent Guittot <vincent.guittot@...aro.org>
Cc: Chen Yu <yu.c.chen@...el.com>,
	Juri Lelli <juri.lelli@...hat.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Valentin Schneider <vschneid@...hat.com>,
	Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
	Hillf Danton <hdanton@...a.com>,
	Shrikanth Hegde <sshegde@...ux.ibm.com>,
	Jianyong Wu <jianyong.wu@...look.com>,
	Yangyu Chen <cyy@...self.name>,
	Tingyin Duan <tingyin.duan@...il.com>,
	Vern Hao <vernhao@...cent.com>,
	Vern Hao <haoxing990@...il.com>,
	Len Brown <len.brown@...el.com>,
	Tim Chen <tim.c.chen@...ux.intel.com>,
	Aubrey Li <aubrey.li@...el.com>,
	Zhao Liu <zhao1.liu@...el.com>,
	Chen Yu <yu.chen.surf@...il.com>,
	Adam Li <adamli@...amperecomputing.com>,
	Aaron Lu <ziqianlu@...edance.com>,
	Tim Chen <tim.c.chen@...el.com>,
	Josh Don <joshdon@...gle.com>,
	Gavin Guo <gavinguo@...lia.com>,
	Qais Yousef <qyousef@...alina.io>,
	Libo Chen <libchen@...estorage.com>,
	linux-kernel@...r.kernel.org
Subject: [PATCH v3 20/21] -- DO NOT APPLY!!! -- sched/cache/debug: Display the per LLC occupancy for each process via proc fs

From: Chen Yu <yu.c.chen@...el.com>

Debug patch only.

Show the per-LLC occupancy in /proc/{PID}/schedstat, with each column
corresponding to one LLC. This can be used to verify if the cache-aware
load balancer works as expected by aggregating threads onto dedicated LLCs.

Suppose there are 2 LLCs and the sampling duration is 10 seconds:

Enable the cache aware load balance:
0 12281  <--- LLC0 residency delta is 0, LLC1 is 12 seconds
0 18881
0 16217

disable the cache aware load balance:
6497 15802
9299 5435
17811 8278

Co-developed-by: Aaron Lu <ziqianlu@...edance.com>
Signed-off-by: Aaron Lu <ziqianlu@...edance.com>
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
---

Notes:
    v2->v3:
    Enhance the informational output by printing the task's
    preferred LLC. (Aaron Lu)

 fs/proc/base.c           | 31 +++++++++++++++++++++++++
 include/linux/mm_types.h | 17 +++++++++++---
 include/linux/sched.h    |  6 +++++
 kernel/sched/fair.c      | 50 ++++++++++++++++++++++++++++++++++++----
 4 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4eec684baca9..76b49e80af1a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -518,6 +518,37 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
 		   (unsigned long long)task->se.sum_exec_runtime,
 		   (unsigned long long)task->sched_info.run_delay,
 		   task->sched_info.pcount);
+#ifdef CONFIG_SCHED_CACHE
+	if (sched_cache_inuse()) {
+		struct mm_struct *mm = task->mm;
+		u64 *llc_runtime;
+		int mm_sched_llc;
+
+		if (!mm)
+			return 0;
+
+		llc_runtime = kcalloc(max_llcs, sizeof(u64), GFP_KERNEL);
+		if (!llc_runtime)
+			return 0;
+
+		if (get_mm_per_llc_runtime(task, llc_runtime))
+			goto out;
+
+		if (mm->sc_stat.cpu == -1)
+			mm_sched_llc = -1;
+		else
+			mm_sched_llc = llc_id(mm->sc_stat.cpu);
+
+		for (int i = 0; i < max_llcs; i++)
+			seq_printf(m, "%s%s%llu ",
+				   i == task->preferred_llc ? "*" : "",
+				   i == mm_sched_llc ? "?" : "",
+				   llc_runtime[i]);
+		seq_puts(m, "\n");
+out:
+		kfree(llc_runtime);
+	}
+#endif
 
 	return 0;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 777a48523aa6..2b8d0ec032e8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1523,17 +1523,26 @@ static inline unsigned int mm_cid_size(void)
 
 #ifdef CONFIG_SCHED_CACHE
 void mm_init_sched(struct mm_struct *mm,
-		   struct sched_cache_time __percpu *pcpu_sched);
+		   struct sched_cache_time __percpu *pcpu_sched,
+		   struct sched_cache_time __percpu *pcpu_time);
 
 static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
 {
 	struct sched_cache_time __percpu *pcpu_sched =
-		alloc_percpu_noprof(struct sched_cache_time);
+		alloc_percpu_noprof(struct sched_cache_time),
+		*pcpu_time;
 
 	if (!pcpu_sched)
 		return -ENOMEM;
 
-	mm_init_sched(mm, pcpu_sched);
+	pcpu_time = alloc_percpu_noprof(struct sched_cache_time);
+	if (!pcpu_time) {
+		free_percpu(pcpu_sched);
+		return -ENOMEM;
+	}
+
+	mm_init_sched(mm, pcpu_sched, pcpu_time);
+
 	return 0;
 }
 
@@ -1542,7 +1551,9 @@ static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
 static inline void mm_destroy_sched(struct mm_struct *mm)
 {
 	free_percpu(mm->sc_stat.pcpu_sched);
+	free_percpu(mm->sc_stat.pcpu_time);
 	mm->sc_stat.pcpu_sched = NULL;
+	mm->sc_stat.pcpu_time = NULL;
 }
 #else /* !CONFIG_SCHED_CACHE */
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 511c9b263386..4236cacbb409 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2344,12 +2344,18 @@ struct sched_cache_time {
 
 struct sched_cache_stat {
 	struct sched_cache_time __percpu *pcpu_sched;
+	struct sched_cache_time __percpu *pcpu_time;
 	raw_spinlock_t lock;
 	unsigned long epoch;
 	u64 nr_running_avg;
 	int cpu;
 } ____cacheline_aligned_in_smp;
 
+int get_mm_per_llc_runtime(struct task_struct *p, u64 *buf);
+bool sched_cache_inuse(void);
+extern int max_llcs;
+int llc_id(int cpu);
+
 #else
 
 struct sched_cache_stat { };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da4291ace24c..25cee3dd767c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1197,7 +1197,12 @@ __read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEO
 __read_mostly unsigned int llc_imb_pct     = 20;
 __read_mostly unsigned int llc_overaggr_pct     = 50;
 
-static int llc_id(int cpu)
+bool sched_cache_inuse(void)
+{
+	return sched_cache_enabled();
+}
+
+int llc_id(int cpu)
 {
 	if (cpu < 0)
 		return -1;
@@ -1365,17 +1370,20 @@ static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
 }
 
 void mm_init_sched(struct mm_struct *mm,
-		   struct sched_cache_time __percpu *_pcpu_sched)
+		   struct sched_cache_time __percpu *_pcpu_sched,
+		   struct sched_cache_time __percpu *_pcpu_time)
 {
 	unsigned long epoch;
 	int i;
 
 	for_each_possible_cpu(i) {
 		struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
+		struct sched_cache_time *pcpu_time = per_cpu_ptr(_pcpu_time, i);
 		struct rq *rq = cpu_rq(i);
 
 		pcpu_sched->runtime = 0;
 		pcpu_sched->epoch = rq->cpu_epoch;
+		pcpu_time->runtime = 0;
 		epoch = rq->cpu_epoch;
 	}
 
@@ -1389,6 +1397,8 @@ void mm_init_sched(struct mm_struct *mm,
 	 * the readers may get invalid mm_sched_epoch, etc.
 	 */
 	smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched);
+	/* barrier */
+	smp_store_release(&mm->sc_stat.pcpu_time, _pcpu_time);
 }
 
 /* because why would C be fully specified */
@@ -1474,7 +1484,8 @@ static unsigned int task_running_on_cpu(int cpu, struct task_struct *p);
 static inline
 void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 {
-	struct sched_cache_time *pcpu_sched;
+	struct sched_cache_time *pcpu_sched,
+		*pcpu_time;
 	struct mm_struct *mm = p->mm;
 	int mm_sched_llc = -1;
 	unsigned long epoch;
@@ -1488,14 +1499,18 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
 	 * init_task, kthreads and user thread created
 	 * by user_mode_thread() don't have mm.
 	 */
-	if (!mm || !mm->sc_stat.pcpu_sched)
+	if (!mm || !mm->sc_stat.pcpu_sched ||
+	    !mm->sc_stat.pcpu_time)
 		return;
 
 	pcpu_sched = per_cpu_ptr(p->mm->sc_stat.pcpu_sched, cpu_of(rq));
+	pcpu_time = per_cpu_ptr(p->mm->sc_stat.pcpu_time, cpu_of(rq));
 
 	scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
 		__update_mm_sched(rq, pcpu_sched);
 		pcpu_sched->runtime += delta_exec;
+		/* pure runtime without decay */
+		pcpu_time->runtime += delta_exec;
 		rq->cpu_runtime += delta_exec;
 		epoch = rq->cpu_epoch;
 	}
@@ -1676,6 +1691,33 @@ void init_sched_mm(struct task_struct *p)
 	work->next = work;
 }
 
+/* p->pi_lock is hold */
+int get_mm_per_llc_runtime(struct task_struct *p, u64 *buf)
+{
+	struct sched_cache_time *pcpu_time;
+	struct mm_struct *mm = p->mm;
+	int cpu;
+
+	if (!mm)
+		return -EINVAL;
+
+	rcu_read_lock();
+	for_each_online_cpu(cpu) {
+		int llc = llc_id(cpu);
+		u64 runtime_ms;
+
+		if (!valid_llc_id(llc))
+			continue;
+
+		pcpu_time = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu);
+		runtime_ms = div_u64(pcpu_time->runtime, NSEC_PER_MSEC);
+		buf[llc] += runtime_ms;
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
 #else
 
 static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
-- 
2.32.0