Message-Id: <3e3622a5b2129b56741989f15a8debabec064de9.1754712565.git.tim.c.chen@linux.intel.com>
Date: Sat, 9 Aug 2025 13:09:17 +0800
From: Chen Yu <yu.c.chen@...el.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>,
Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Libo Chen <libo.chen@...cle.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>,
Shrikanth Hegde <sshegde@...ux.ibm.com>,
Jianyong Wu <jianyong.wu@...look.com>,
Yangyu Chen <cyy@...self.name>,
Tingyin Duan <tingyin.duan@...il.com>,
Vern Hao <vernhao@...cent.com>,
Len Brown <len.brown@...el.com>,
Tim Chen <tim.c.chen@...ux.intel.com>,
Aubrey Li <aubrey.li@...el.com>,
Zhao Liu <zhao1.liu@...el.com>,
Chen Yu <yu.chen.surf@...il.com>,
Chen Yu <yu.c.chen@...el.com>,
linux-kernel@...r.kernel.org
Subject: [RFC PATCH v4 28/28] sched: Add ftrace to track cache aware load balance and hottest CPU changes
Introduce three trace events:

1. The time spent scanning CPUs and calculating occupancy in each
   sample period. This event can be used to track the overhead of
   cache-aware scheduling.

2. The switch of a process's mm_sched_cpu to a new cache-hot CPU.
   This event can be used to track whether there is any abnormal
   bouncing of mm_sched_cpu.

3. The migration of a task between CPUs during load balancing.
   This event can be used to track whether cache-aware load
   balancing behaves as expected.

All these events can be used with bpftrace to gain a basic
understanding of whether cache-aware scheduling is effective.
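For example, here is a minimal sketch of how the events could be
consumed. The tracefs paths and bpftrace one-liner below are
illustrative only; they assume the events are exposed under the
"sched" tracepoint category (they are added to
include/trace/events/sched.h in this patch), and the map names
(@scan_ns, @switch, @attach) are arbitrary:

  # Watch the raw events with ftrace:
  cd /sys/kernel/tracing
  echo 1 > events/sched/sched_scan_cost/enable
  echo 1 > events/sched/sched_cache_work/enable
  echo 1 > events/sched/sched_attach_task/enable
  cat trace_pipe

  # Or summarize them with bpftrace: a histogram of the per-period
  # scan cost, plus counts of mm_sched_cpu switches and load-balance
  # attachments keyed by LLC pair:
  bpftrace -e '
  tracepoint:sched:sched_scan_cost { @scan_ns = hist(args->cost); }
  tracepoint:sched:sched_cache_work {
          @switch[args->pref_llc, args->new_llc] = count();
  }
  tracepoint:sched:sched_attach_task {
          @attach[args->pref_llc, args->attach_llc] = count();
  }'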
Suggested-by: Shrikanth Hegde <sshegde@...ux.ibm.com>
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
---
include/trace/events/sched.h | 93 ++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 25 ++++++++--
2 files changed, 113 insertions(+), 5 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 4e6b2910cec3..398180c18946 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -10,6 +10,99 @@
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
+TRACE_EVENT(sched_scan_cost,
+
+ TP_PROTO(struct task_struct *t, u64 cost, int nr,
+ u64 old_running, u64 new_running),
+
+ TP_ARGS(t, cost, nr, old_running, new_running),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( u64, cost )
+ __field( int, nr )
+ __field( u64, old_running )
+ __field( u64, new_running )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->cost = cost;
+ __entry->nr = nr;
+ __entry->old_running = old_running;
+ __entry->new_running = new_running;
+ ),
+
+ TP_printk("comm=%s pid=%d cost=%llu nr=%d old_r=%lld new_r=%lld",
+ __entry->comm, __entry->pid,
+ __entry->cost, __entry->nr,
+ __entry->old_running, __entry->new_running)
+);
+
+TRACE_EVENT(sched_cache_work,
+
+ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc,
+ int new_cpu, int new_llc),
+
+ TP_ARGS(t, pref_cpu, pref_llc, new_cpu, new_llc),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, pref_cpu )
+ __field( int, pref_llc )
+ __field( int, new_cpu )
+ __field( int, new_llc )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->pref_cpu = pref_cpu;
+ __entry->pref_llc = pref_llc;
+ __entry->new_cpu = new_cpu;
+ __entry->new_llc = new_llc;
+ ),
+
+ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d",
+ __entry->comm, __entry->pid,
+ __entry->pref_cpu, __entry->pref_llc,
+ __entry->new_cpu, __entry->new_llc)
+);
+
+TRACE_EVENT(sched_attach_task,
+
+ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc,
+ int attach_cpu, int attach_llc),
+
+ TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, pref_cpu )
+ __field( int, pref_llc )
+ __field( int, attach_cpu )
+ __field( int, attach_llc )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+ __entry->pid = t->pid;
+ __entry->pref_cpu = pref_cpu;
+ __entry->pref_llc = pref_llc;
+ __entry->attach_cpu = attach_cpu;
+ __entry->attach_llc = attach_llc;
+ ),
+
+ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d",
+ __entry->comm, __entry->pid,
+ __entry->pref_cpu, __entry->pref_llc,
+ __entry->attach_cpu, __entry->attach_llc)
+);
+
/*
* Tracepoint for calling kthread_stop, performed to end a kthread:
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 018825f04063..cb2c33ee0d92 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1467,8 +1467,9 @@ static void __no_profile task_cache_work(struct callback_head *work)
unsigned long last_m_a_occ = 0;
int cpu, m_a_cpu = -1, cache_cpu,
pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(),
- nr_running = 0;
+ nr_running = 0, nr_scan = 0;
cpumask_var_t cpus;
+ u64 t0, scan_cost = 0;
WARN_ON_ONCE(work != &p->cache_work);
@@ -1499,6 +1500,7 @@ static void __no_profile task_cache_work(struct callback_head *work)
pref_nid = p->numa_preferred_nid;
#endif
+ t0 = sched_clock_cpu(curr_cpu);
scoped_guard (cpus_read_lock) {
get_scan_cpumasks(cpus, cache_cpu,
pref_nid, curr_cpu);
@@ -1521,6 +1523,7 @@ static void __no_profile task_cache_work(struct callback_head *work)
m_cpu = i;
}
nr++;
+ nr_scan++;
rcu_read_lock();
cur = rcu_dereference(cpu_rq(i)->curr);
@@ -1529,8 +1532,8 @@ static void __no_profile task_cache_work(struct callback_head *work)
nr_running++;
rcu_read_unlock();
- trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
- per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
+ //trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n",
+ // per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr);
}
// a_occ /= nr;
@@ -1541,8 +1544,8 @@ static void __no_profile task_cache_work(struct callback_head *work)
if (llc_id(cpu) == llc_id(mm->mm_sched_cpu))
last_m_a_occ = a_occ;
- trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
- per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
+ //trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n",
+ // per_cpu(sd_llc_id, cpu), a_occ, m_a_occ);
for_each_cpu(i, sched_domain_span(sd)) {
/* XXX threshold ? */
@@ -1553,12 +1556,17 @@ static void __no_profile task_cache_work(struct callback_head *work)
}
}
+ scan_cost = sched_clock_cpu(curr_cpu) - t0;
+
if (m_a_occ > (2 * last_m_a_occ)) {
/* avoid the bouncing of mm_sched_cpu */
+ trace_sched_cache_work(p, mm->mm_sched_cpu, llc_id(mm->mm_sched_cpu),
+ m_a_cpu, llc_id(m_a_cpu));
mm->mm_sched_cpu = m_a_cpu;
}
update_avg(&mm->nr_running_avg, nr_running);
+ trace_sched_scan_cost(p, scan_cost, nr_scan, mm->nr_running_avg, nr_running);
free_cpumask_var(cpus);
}
@@ -10443,6 +10451,13 @@ static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
+#ifdef CONFIG_SCHED_CACHE
+ if (p->mm)
+ trace_sched_attach_task(p,
+ p->mm->mm_sched_cpu,
+ p->mm->mm_sched_cpu != -1 ? llc_id(p->mm->mm_sched_cpu) : -1,
+ cpu_of(rq), llc_id(cpu_of(rq)));
+#endif
WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
wakeup_preempt(rq, p, 0);
--
2.25.1