Message-Id: <3e3622a5b2129b56741989f15a8debabec064de9.1754712565.git.tim.c.chen@linux.intel.com>
Date: Sat, 9 Aug 2025 13:09:17 +0800
From: Chen Yu <yu.c.chen@...el.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>,
Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Libo Chen <libo.chen@...cle.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>,
Shrikanth Hegde <sshegde@...ux.ibm.com>,
Jianyong Wu <jianyong.wu@...look.com>,
Yangyu Chen <cyy@...self.name>,
Tingyin Duan <tingyin.duan@...il.com>,
Vern Hao <vernhao@...cent.com>,
Len Brown <len.brown@...el.com>,
Tim Chen <tim.c.chen@...ux.intel.com>,
Aubrey Li <aubrey.li@...el.com>,
Zhao Liu <zhao1.liu@...el.com>,
Chen Yu <yu.chen.surf@...il.com>,
Chen Yu <yu.c.chen@...el.com>,
linux-kernel@...r.kernel.org
Subject: [RFC PATCH v4 28/28] sched: Add ftrace to track cache aware load balance and hottest CPU changes
Introduce three trace events:

1. The time spent scanning CPUs and calculating occupancy in each
   sample period. This event can be used to track the overhead of
   cache-aware scheduling.

2. The switch of a process's mm_sched_cpu to a new cache-hot CPU.
   This event can be used to track whether there is any abnormal
   bouncing of mm_sched_cpu.

3. The migration of a task between CPUs during load balancing.
   This event can be used to track whether cache-aware load
   balancing behaves as expected.

All these events can be used with bpftrace to gain a basic
understanding of whether cache-aware scheduling is effective.
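For example, here is a minimal sketch of how the events could be
consumed. The tracefs paths and bpftrace one-liner below are
illustrative only; they assume the events are exposed under the
"sched" tracepoint category (they are added to
include/trace/events/sched.h in this patch), and the map names
(@scan_ns, @switch, @attach) are arbitrary:

  # Watch the raw events with ftrace:
  cd /sys/kernel/tracing
  echo 1 > events/sched/sched_scan_cost/enable
  echo 1 > events/sched/sched_cache_work/enable
  echo 1 > events/sched/sched_attach_task/enable
  cat trace_pipe

  # Or summarize them with bpftrace: a histogram of the per-period
  # scan cost, plus counts of mm_sched_cpu switches and load-balance
  # attachments keyed by LLC pair:
  bpftrace -e '
  tracepoint:sched:sched_scan_cost { @scan_ns = hist(args->cost); }
  tracepoint:sched:sched_cache_work {
          @switch[args->pref_llc, args->new_llc] = count();
  }
  tracepoint:sched:sched_attach_task {
          @attach[args->pref_llc, args->attach_llc] = count();
  }'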
Suggested-by: Shrikanth Hegde <sshegde@...ux.ibm.com>
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
---
include/trace/events/sched.h | 93 ++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 25 ++++++++--
2 files changed, 113 insertions(+), 5 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 4e6b2910cec3..398180c18946 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -10,6 +10,99 @@
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
+TRACE_EVENT(sched_scan_cost,
+
+ TP_PROTO(struct task_struct *t, u64 cost, int nr,
+ u64 old_running, u64 new_running),
+
+ TP_ARGS(t, cost, nr, old_running, new_running),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( u64, cost )
+ __field( int, nr )
+ __field( u64, old_running )
+ __field( u64, new_running )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->cost = cost;
+ __entry->nr = nr;
+ __entry->old_running = old_running;
+ __entry->new_running = new_running;
+ ),
+
+ TP_printk("comm=%s pid=%d cost=%llu nr=%d old_r=%lld new_r=%lld",
+ __entry->comm, __entry->pid,
+ __entry->cost, __entry->nr,
+ __entry->old_running, __entry->new_running)
+);
+
+TRACE_EVENT(sched_cache_work,
+
+ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc,
+ int new_cpu, int new_llc),
+
+ TP_ARGS(t, pref_cpu, pref_llc, new_cpu, new_llc),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, pref_cpu )
+ __field( int, pref_llc )
+ __field( int, new_cpu )
+ __field( int, new_llc )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->pref_cpu = pref_cpu;
+ __entry->pref_llc = pref_llc;
+ __entry->new_cpu = new_cpu;
+ __entry->new_llc = new_llc;
+ ),
+
+ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d",
+ __entry->comm, __entry->pid,
+ __entry->pref_cpu, __entry->pref_llc,
+ __entry->new_cpu, __entry->new_llc)
+);
+
+TRACE_EVENT(sched_attach_task,
+
+ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc,
+ int attach_cpu, int attach_llc),
+
+ TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, pref_cpu )
+ __field( int, pref_llc )
+ __field( int, attach_cpu )
+ __field( int, attach_llc )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->pref_cpu = pref_cpu;
+ __entry->pref_llc = pref_llc;
+ __entry->attach_cpu = attach_cpu;
+ __entry->attach_llc = attach_llc;
+ ),
+
+ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d",
+ __entry->comm, __entry->pid,
+ __entry->pref_cpu, __entry->pref_llc,
+ __entry->attach_cpu, __entry->attach_llc)
+);
+
/*
* Tracepoint for calling kthread_stop, performed to end a kthread:
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 018825f04063..cb2c33ee0d92 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1467,8 +1467,9 @@ static void __no_profile task_cache_work(struct callback_head *work)
unsigned long last_m_a_occ = 0;
int cpu, m_a_cpu = -1, cache_cpu,
pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(),
- nr_running = 0;
+ nr_running = 0, nr_scan = 0;
cpumask_var_t cpus;
+ u64 t0, scan_cost = 0;
WARN_ON_ONCE(work != &p->cache_work);
@@ -1499,6 +1500,7 @@ static void __no_profile task_cache_work(struct callback_head *work)
pref_nid = p->numa_preferred_nid;
#endif
+ t0 = sched_clock_cpu(curr_cpu);
scoped_guard (cpus_read_lock) {
get_scan_cpumasks(cpus, cache_cpu,
pref_nid, curr_cpu);
@@ -1521,6 +1523,7 @@ static void __no_profile task_cache_work(struct callback_head *work)
m_cpu = i;
}
nr++;
+ nr_scan++;
rcu_read_lock();
cur = rcu_dereference(cpu_rq(i)->curr);
@@ -1529,8 +1532,8 @@ static void __no_profile task_cache_work(struct callback_head *work)
nr_running++;
rcu_read_unlock();
- trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
- per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
+ //trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
+ // per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
}
// a_occ /= nr;
@@ -1541,8 +1544,8 @@ static void __no_profile task_cache_work(struct callback_head *work)
if (llc_id(cpu) == llc_id(mm->mm_sched_cpu))
last_m_a_occ = a_occ;
- trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
- per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
+ //trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
+ // per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
for_each_cpu(i, sched_domain_span(sd)) {
/* XXX threshold ? */
@@ -1553,12 +1556,17 @@ static void __no_profile task_cache_work(struct callback_head *work)
}
}
+ scan_cost = sched_clock_cpu(curr_cpu) - t0;
+
if (m_a_occ > (2 * last_m_a_occ)) {
/* avoid the bouncing of mm_sched_cpu */
+ trace_sched_cache_work(p, mm->mm_sched_cpu, llc_id(mm->mm_sched_cpu),
+ m_a_cpu, llc_id(m_a_cpu));
mm->mm_sched_cpu = m_a_cpu;
}
update_avg(&mm->nr_running_avg, nr_running);
+ trace_sched_scan_cost(p, scan_cost, nr_scan, mm->nr_running_avg, nr_running);
free_cpumask_var(cpus);
}
@@ -10443,6 +10451,13 @@ static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
+#ifdef CONFIG_SCHED_CACHE
+ if (p->mm)
+ trace_sched_attach_task(p,
+ p->mm->mm_sched_cpu,
+ p->mm->mm_sched_cpu != -1 ? llc_id(p->mm->mm_sched_cpu) : -1,
+ cpu_of(rq), llc_id(cpu_of(rq)));
+#endif
WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
wakeup_preempt(rq, p, 0);
--
2.25.1