[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <09c48847deeb9d2c1c7de1f2799cc128cd2e866e.1770760558.git.tim.c.chen@linux.intel.com>
Date: Tue, 10 Feb 2026 14:19:00 -0800
From: Tim Chen <tim.c.chen@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>,
Vincent Guittot <vincent.guittot@...aro.org>
Cc: Chen Yu <yu.c.chen@...el.com>,
Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>,
Shrikanth Hegde <sshegde@...ux.ibm.com>,
Jianyong Wu <jianyong.wu@...look.com>,
Yangyu Chen <cyy@...self.name>,
Tingyin Duan <tingyin.duan@...il.com>,
Vern Hao <vernhao@...cent.com>,
Vern Hao <haoxing990@...il.com>,
Len Brown <len.brown@...el.com>,
Tim Chen <tim.c.chen@...ux.intel.com>,
Aubrey Li <aubrey.li@...el.com>,
Zhao Liu <zhao1.liu@...el.com>,
Chen Yu <yu.chen.surf@...il.com>,
Adam Li <adamli@...amperecomputing.com>,
Aaron Lu <ziqianlu@...edance.com>,
Tim Chen <tim.c.chen@...el.com>,
Josh Don <joshdon@...gle.com>,
Gavin Guo <gavinguo@...lia.com>,
Qais Yousef <qyousef@...alina.io>,
Libo Chen <libchen@...estorage.com>,
linux-kernel@...r.kernel.org
Subject: [PATCH v3 20/21] -- DO NOT APPLY!!! -- sched/cache/debug: Display the per LLC occupancy for each process via proc fs
From: Chen Yu <yu.c.chen@...el.com>
Debug patch only.
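Show the per-LLC occupancy in /proc/{PID}/schedstat, with each column
corresponding to one LLC. Values are the accumulated runtime in
milliseconds. A column prefixed with '*' is the task's preferred LLC, and
a column prefixed with '?' is the LLC of mm->sc_stat.cpu. This can be used
to verify whether the cache-aware load balancer works as expected by
aggregating threads onto dedicated LLCs.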
Suppose there are 2 LLCs and the sampling duration is 10 seconds:
With cache-aware load balancing enabled:
0 12281 <--- LLC0 residency delta is 0 ms, LLC1 is 12281 ms (about 12 seconds)
0 18881
0 16217
With cache-aware load balancing disabled:
6497 15802
9299 5435
17811 8278
Co-developed-by: Aaron Lu <ziqianlu@...edance.com>
Signed-off-by: Aaron Lu <ziqianlu@...edance.com>
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
---
Notes:
v2->v3:
Enhance the informational output by printing the task's
preferred LLC. (Aaron Lu)
fs/proc/base.c | 31 +++++++++++++++++++++++++
include/linux/mm_types.h | 17 +++++++++++---
include/linux/sched.h | 6 +++++
kernel/sched/fair.c | 50 ++++++++++++++++++++++++++++++++++++----
4 files changed, 97 insertions(+), 7 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4eec684baca9..76b49e80af1a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -518,6 +518,37 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
(unsigned long long)task->se.sum_exec_runtime,
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
+#ifdef CONFIG_SCHED_CACHE
+ if (sched_cache_inuse()) {
+ struct mm_struct *mm = task->mm;
+ u64 *llc_runtime;
+ int mm_sched_llc;
+
+ if (!mm)
+ return 0;
+
+ llc_runtime = kcalloc(max_llcs, sizeof(u64), GFP_KERNEL);
+ if (!llc_runtime)
+ return 0;
+
+ if (get_mm_per_llc_runtime(task, llc_runtime))
+ goto out;
+
+ if (mm->sc_stat.cpu == -1)
+ mm_sched_llc = -1;
+ else
+ mm_sched_llc = llc_id(mm->sc_stat.cpu);
+
+ for (int i = 0; i < max_llcs; i++)
+ seq_printf(m, "%s%s%llu ",
+ i == task->preferred_llc ? "*" : "",
+ i == mm_sched_llc ? "?" : "",
+ llc_runtime[i]);
+ seq_puts(m, "\n");
+out:
+ kfree(llc_runtime);
+ }
+#endif
return 0;
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 777a48523aa6..2b8d0ec032e8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1523,17 +1523,26 @@ static inline unsigned int mm_cid_size(void)
#ifdef CONFIG_SCHED_CACHE
void mm_init_sched(struct mm_struct *mm,
- struct sched_cache_time __percpu *pcpu_sched);
+ struct sched_cache_time __percpu *pcpu_sched,
+ struct sched_cache_time __percpu *pcpu_time);
static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
{
struct sched_cache_time __percpu *pcpu_sched =
- alloc_percpu_noprof(struct sched_cache_time);
+ alloc_percpu_noprof(struct sched_cache_time),
+ *pcpu_time;
if (!pcpu_sched)
return -ENOMEM;
- mm_init_sched(mm, pcpu_sched);
+ pcpu_time = alloc_percpu_noprof(struct sched_cache_time);
+ if (!pcpu_time) {
+ free_percpu(pcpu_sched);
+ return -ENOMEM;
+ }
+
+ mm_init_sched(mm, pcpu_sched, pcpu_time);
+
return 0;
}
@@ -1542,7 +1551,9 @@ static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
static inline void mm_destroy_sched(struct mm_struct *mm)
{
free_percpu(mm->sc_stat.pcpu_sched);
+ free_percpu(mm->sc_stat.pcpu_time);
mm->sc_stat.pcpu_sched = NULL;
+ mm->sc_stat.pcpu_time = NULL;
}
#else /* !CONFIG_SCHED_CACHE */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 511c9b263386..4236cacbb409 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2344,12 +2344,18 @@ struct sched_cache_time {
struct sched_cache_stat {
struct sched_cache_time __percpu *pcpu_sched;
+ struct sched_cache_time __percpu *pcpu_time;
raw_spinlock_t lock;
unsigned long epoch;
u64 nr_running_avg;
int cpu;
} ____cacheline_aligned_in_smp;
+int get_mm_per_llc_runtime(struct task_struct *p, u64 *buf);
+bool sched_cache_inuse(void);
+extern int max_llcs;
+int llc_id(int cpu);
+
#else
struct sched_cache_stat { };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da4291ace24c..25cee3dd767c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1197,7 +1197,12 @@ __read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEO
__read_mostly unsigned int llc_imb_pct = 20;
__read_mostly unsigned int llc_overaggr_pct = 50;
-static int llc_id(int cpu)
+bool sched_cache_inuse(void)
+{
+ return sched_cache_enabled(); /* wrapper declared in linux/sched.h; called from fs/proc/base.c */
+}
+
+int llc_id(int cpu)
{
if (cpu < 0)
return -1;
@@ -1365,17 +1370,20 @@ static void account_llc_dequeue(struct rq *rq, struct task_struct *p)
}
void mm_init_sched(struct mm_struct *mm,
- struct sched_cache_time __percpu *_pcpu_sched)
+ struct sched_cache_time __percpu *_pcpu_sched,
+ struct sched_cache_time __percpu *_pcpu_time)
{
unsigned long epoch;
int i;
for_each_possible_cpu(i) {
struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
+ struct sched_cache_time *pcpu_time = per_cpu_ptr(_pcpu_time, i);
struct rq *rq = cpu_rq(i);
pcpu_sched->runtime = 0;
pcpu_sched->epoch = rq->cpu_epoch;
+ pcpu_time->runtime = 0;
epoch = rq->cpu_epoch;
}
@@ -1389,6 +1397,8 @@ void mm_init_sched(struct mm_struct *mm,
* the readers may get invalid mm_sched_epoch, etc.
*/
smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched);
+ /* barrier */
+ smp_store_release(&mm->sc_stat.pcpu_time, _pcpu_time);
}
/* because why would C be fully specified */
@@ -1474,7 +1484,8 @@ static unsigned int task_running_on_cpu(int cpu, struct task_struct *p);
static inline
void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
{
- struct sched_cache_time *pcpu_sched;
+ struct sched_cache_time *pcpu_sched,
+ *pcpu_time;
struct mm_struct *mm = p->mm;
int mm_sched_llc = -1;
unsigned long epoch;
@@ -1488,14 +1499,18 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
* init_task, kthreads and user thread created
* by user_mode_thread() don't have mm.
*/
- if (!mm || !mm->sc_stat.pcpu_sched)
+ if (!mm || !mm->sc_stat.pcpu_sched ||
+ !mm->sc_stat.pcpu_time)
return;
pcpu_sched = per_cpu_ptr(p->mm->sc_stat.pcpu_sched, cpu_of(rq));
+ pcpu_time = per_cpu_ptr(p->mm->sc_stat.pcpu_time, cpu_of(rq));
scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
__update_mm_sched(rq, pcpu_sched);
pcpu_sched->runtime += delta_exec;
+ /* pure runtime without decay */
+ pcpu_time->runtime += delta_exec;
rq->cpu_runtime += delta_exec;
epoch = rq->cpu_epoch;
}
@@ -1676,6 +1691,33 @@ void init_sched_mm(struct task_struct *p)
work->next = work;
}
+/* Accumulate p->mm's undecayed runtime (ms) into buf[], per LLC; 0 or -EINVAL. */
+int get_mm_per_llc_runtime(struct task_struct *p, u64 *buf)
+{
+	struct sched_cache_time *pcpu_time;
+	struct mm_struct *mm = p->mm;
+	int cpu;
+
+	if (!mm || !mm->sc_stat.pcpu_time)
+		return -EINVAL;
+
+	rcu_read_lock();
+	for_each_online_cpu(cpu) {
+		int llc = llc_id(cpu);
+		u64 runtime_ms;
+
+		if (!valid_llc_id(llc))
+			continue;
+		/* Read the undecayed counter (pcpu_time), not pcpu_sched. */
+		pcpu_time = per_cpu_ptr(mm->sc_stat.pcpu_time, cpu);
+		runtime_ms = div_u64(pcpu_time->runtime, NSEC_PER_MSEC);
+		buf[llc] += runtime_ms;	/* caller passes a zeroed array of max_llcs entries */
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
#else
static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
--
2.32.0
Powered by blists - more mailing lists