Message-ID: <20250819092739.GA31177@didi-ThinkCentre-M930t-N000>
Date: Tue, 19 Aug 2025 17:27:39 +0800
From: Tio Zhang <tiozhang@...iglobal.com>
To: <akpm@...ux-foundation.org>, <wang.yaxin@....com.cn>,
<fan.yu9@....com.cn>, <corbet@....net>, <bsingharora@...il.com>,
<yang.yang29@....com.cn>
CC: <linux-kernel@...r.kernel.org>, <linux-doc@...r.kernel.org>,
<mingo@...hat.com>, <peterz@...radead.org>, <juri.lelli@...hat.com>,
<vincent.guittot@...aro.org>, <dietmar.eggemann@....com>,
<rostedt@...dmis.org>, <bsegall@...gle.com>, <mgorman@...e.de>,
<vschneid@...hat.com>, <jiang.kun2@....com.cn>, <xu.xin16@....com.cn>,
<wang.yong12@....com.cn>, <tiozhang@...iglobal.com>, <zyhtheonly@...il.com>,
<zyhtheonly@...h.net>
Subject: [PATCH] delayacct/sched: add SOFTIRQ delay
Introduce SOFTIRQ delay, so that softirq time can be reported
separately as SOFTIRQ delay and hardirq time derived as
{IRQ - SOFTIRQ} delay.

A typical scenario is tasks being delayed by the network: if they
are delayed by rx net packets, i.e., net_rx_action(), SOFTIRQ delay
is almost the same as IRQ delay; if they are delayed by, e.g., a bad
driver or broken hardware, SOFTIRQ delay stays close to 0 while IRQ
delay remains large.

Example tool usage can be found in
Documentation/accounting/delay-accounting.rst
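
To make the split concrete, here is a minimal userspace sketch
(illustrative only, not part of this patch) that derives the
hardirq-only share from the two counters. It assumes a struct
taskstats already fetched over the taskstats netlink interface, as
tools/accounting/getdelays.c does:

	#include <linux/taskstats.h>

	/*
	 * Illustrative helper: hardirq-only delay derived as
	 * {IRQ - SOFTIRQ}, both reported in nanoseconds.
	 */
	static __u64 hardirq_delay_ns(const struct taskstats *t)
	{
		/* soft_delay_total only exists from version 17 on */
		if (t->version < 17)
			return t->irq_delay_total;
		/* IRQ delay includes SOFTIRQ delay, so subtract it */
		return t->irq_delay_total - t->soft_delay_total;
	}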
Signed-off-by: Tio Zhang <tiozhang@...iglobal.com>
---
 Documentation/accounting/delay-accounting.rst |  5 ++++-
 include/linux/delayacct.h                     | 18 ++++++++++------
 include/uapi/linux/taskstats.h                |  9 +++++++-
 kernel/delayacct.c                            |  9 +++++++-
 kernel/sched/core.c                           | 14 +++++++++----
 kernel/sched/cputime.c                        | 21 ++++++++++++++-----
 kernel/sched/sched.h                          |  6 +++++-
 tools/accounting/getdelays.c                  |  7 +++++++
 8 files changed, 70 insertions(+), 19 deletions(-)
diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 8ccc5af5ea1e..b6453723fbac 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -17,6 +17,7 @@ e) thrashing
f) direct compact
g) write-protect copy
h) IRQ/SOFTIRQ
+i) SOFTIRQ
and makes these statistics available to userspace through
the taskstats interface.
@@ -50,7 +51,7 @@ this structure. See
for a description of the fields pertaining to delay accounting.
It will generally be in the form of counters returning the cumulative
delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page
-cache, direct compact, write-protect copy, IRQ/SOFTIRQ etc.
+cache, direct compact, write-protect copy, IRQ/SOFTIRQ, SOFTIRQ etc.
Taking the difference of two successive readings of a given
counter (say cpu_delay_total) for a task will give the delay
@@ -123,6 +124,8 @@ Get sum and peak of delays, since system boot, for all pids with tgid 242::
156 11215873 0.072ms 0.207403ms 0.033913ms
IRQ count delay total delay average delay max delay min
0 0 0.000ms 0.000000ms 0.000000ms
+ SOFTIRQ count delay total delay average delay max delay min
+ 0 0 0.000ms 0.000000ms 0.000000ms
Get IO accounting for pid 1, it works only with -p::
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 800dcc360db2..b73d777d7a96 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -62,13 +62,18 @@ struct task_delay_info {
u64 irq_delay_max;
u64 irq_delay_min;
- u64 irq_delay; /* wait for IRQ/SOFTIRQ */
+ u64 irq_delay; /* wait for IRQ/SOFTIRQ */
+
+ u64 soft_delay_max;
+ u64 soft_delay_min;
+ u64 soft_delay; /* wait for SOFTIRQ */
u32 freepages_count; /* total count of memory reclaim */
u32 thrashing_count; /* total count of thrash waits */
u32 compact_count; /* total count of memory compact */
u32 wpcopy_count; /* total count of write-protect copy */
- u32 irq_count; /* total count of IRQ/SOFTIRQ */
+ u32 irq_count; /* total count of IRQ/SOFTIRQ */
+ u32 soft_count; /* total count of SOFTIRQ */
};
#endif
@@ -98,7 +103,7 @@ extern void __delayacct_compact_start(void);
extern void __delayacct_compact_end(void);
extern void __delayacct_wpcopy_start(void);
extern void __delayacct_wpcopy_end(void);
-extern void __delayacct_irq(struct task_struct *task, u32 delta);
+extern void __delayacct_irq(struct task_struct *task, u32 delta, u32 delta_soft);
static inline void delayacct_tsk_init(struct task_struct *tsk)
{
@@ -233,13 +238,14 @@ static inline void delayacct_wpcopy_end(void)
__delayacct_wpcopy_end();
}
-static inline void delayacct_irq(struct task_struct *task, u32 delta)
+static inline void delayacct_irq(struct task_struct *task, u32 delta,
+ u32 delta_soft)
{
if (!static_branch_unlikely(&delayacct_key))
return;
if (task->delays)
- __delayacct_irq(task, delta);
+ __delayacct_irq(task, delta, delta_soft);
}
#else
@@ -280,7 +286,7 @@ static inline void delayacct_wpcopy_start(void)
{}
static inline void delayacct_wpcopy_end(void)
{}
-static inline void delayacct_irq(struct task_struct *task, u32 delta)
+static inline void delayacct_irq(struct task_struct *task, u32 delta, u32 delta_soft)
{}
#endif /* CONFIG_TASK_DELAY_ACCT */
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 5929030d4e8b..23307f88e255 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
*/
-#define TASKSTATS_VERSION 16
+#define TASKSTATS_VERSION 17
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
* in linux/sched.h */
@@ -230,6 +230,13 @@ struct taskstats {
__u64 irq_delay_max;
__u64 irq_delay_min;
+
+ /* v17: Delay waiting for SOFTIRQ */
+ __u64 soft_count;
+ __u64 soft_delay_total;
+
+ __u64 soft_delay_max;
+ __u64 soft_delay_min;
};
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 30e7912ebb0d..15f88ca0c0e6 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -189,6 +189,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
UPDATE_DELAY(compact);
UPDATE_DELAY(wpcopy);
UPDATE_DELAY(irq);
+ UPDATE_DELAY(soft);
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -289,7 +290,7 @@ void __delayacct_wpcopy_end(void)
¤t->delays->wpcopy_delay_min);
}
-void __delayacct_irq(struct task_struct *task, u32 delta)
+void __delayacct_irq(struct task_struct *task, u32 delta, u32 delta_soft)
{
unsigned long flags;
@@ -300,6 +301,12 @@ void __delayacct_irq(struct task_struct *task, u32 delta)
task->delays->irq_delay_max = delta;
if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
task->delays->irq_delay_min = delta;
+ task->delays->soft_delay += delta_soft;
+ task->delays->soft_count++;
+ if (delta_soft > task->delays->soft_delay_max)
+ task->delays->soft_delay_max = delta_soft;
+ if (delta_soft && (!task->delays->soft_delay_min || delta_soft < task->delays->soft_delay_min))
+ task->delays->soft_delay_min = delta_soft;
raw_spin_unlock_irqrestore(&task->delays->lock, flags);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be00629f0ba4..30ba2e312356 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,11 +773,12 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
* In theory, the compile should just see 0 here, and optimize out the call
* to sched_rt_avg_update. But I don't trust it...
*/
- s64 __maybe_unused steal = 0, irq_delta = 0;
+ s64 __maybe_unused steal = 0, irq_delta = 0, soft_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
if (irqtime_enabled()) {
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ irq_delta = irq_time_read(cpu_of(rq), &soft_delta) - rq->prev_irq_time;
+ soft_delta -= rq->prev_soft_time;
/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
@@ -794,12 +795,17 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
* the current rq->clock timestamp, except that would require using
* atomic ops.
*/
- if (irq_delta > delta)
+ if (soft_delta > delta) { /* IRQ includes SOFTIRQ */
+ soft_delta = delta;
irq_delta = delta;
+ } else if (irq_delta > delta) {
+ irq_delta = delta;
+ }
rq->prev_irq_time += irq_delta;
+ rq->prev_soft_time += soft_delta;
delta -= irq_delta;
- delayacct_irq(rq->curr, irq_delta);
+ delayacct_irq(rq->curr, irq_delta, soft_delta);
}
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7097de2c8cda..17467f1f3e72 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -38,13 +38,14 @@ void disable_sched_clock_irqtime(void)
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
- enum cpu_usage_stat idx)
+ u64 delta_soft, enum cpu_usage_stat idx)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
u64_stats_update_begin(&irqtime->sync);
cpustat[idx] += delta;
irqtime->total += delta;
+ irqtime->total_soft += delta_soft;
irqtime->tick_delta += delta;
u64_stats_update_end(&irqtime->sync);
}
@@ -57,17 +58,27 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
unsigned int pc;
- s64 delta;
+ s64 delta, delta_soft = 0, cpu_clock;
int cpu;
if (!irqtime_enabled())
return;
cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
+ cpu_clock = sched_clock_cpu(cpu);
+ delta = cpu_clock - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
pc = irq_count() - offset;
+ /*
+ * We only account softirq time when we are called by
+ * account_softirq_enter{,exit}
+ */
+ if ((offset & SOFTIRQ_OFFSET) || (pc & SOFTIRQ_OFFSET)) {
+ delta_soft = cpu_clock - irqtime->soft_start_time;
+ irqtime->soft_start_time += delta_soft;
+ }
+
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -75,9 +86,9 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
* that do not consume any time, but still wants to run.
*/
if (pc & HARDIRQ_MASK)
- irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ irqtime_account_delta(irqtime, delta, delta_soft, CPUTIME_IRQ);
else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
- irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+ irqtime_account_delta(irqtime, delta, delta_soft, CPUTIME_SOFTIRQ);
}
static u64 irqtime_tick_accounted(u64 maxtime)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be9745d104f7..b263cb046cfa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1219,6 +1219,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 prev_soft_time;
u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
@@ -3135,8 +3136,10 @@ static inline void sched_core_tick(struct rq *rq) { }
struct irqtime {
u64 total;
+ u64 total_soft;
u64 tick_delta;
u64 irq_start_time;
+ u64 soft_start_time;
struct u64_stats_sync sync;
};
@@ -3153,7 +3156,7 @@ static inline int irqtime_enabled(void)
* Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime
* and never move forward.
*/
-static inline u64 irq_time_read(int cpu)
+static inline u64 irq_time_read(int cpu, u64 *total_soft)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
unsigned int seq;
@@ -3162,6 +3165,7 @@ static inline u64 irq_time_read(int cpu)
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
total = irqtime->total;
+ *total_soft = irqtime->total_soft;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 21cb3c3d1331..7299cb60aa33 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -205,6 +205,7 @@ static int get_family_id(int sd)
* version >= 13 - supports WPCOPY statistics
* version >= 14 - supports IRQ statistics
* version >= 16 - supports *_max and *_min delay statistics
+ * version >= 17 - supports SOFTIRQ statistics
*
* Always verify version before accessing version-dependent fields
* to maintain backward compatibility.
@@ -296,6 +297,12 @@ static void print_delayacct(struct taskstats *t)
irq_count, irq_delay_total,
irq_delay_max, irq_delay_min);
}
+
+ if (t->version >= 17) {
+ PRINT_FILED_DELAY("SOFTIRQ", t->version, t,
+ soft_count, soft_delay_total,
+ soft_delay_max, soft_delay_min);
+ }
}
static void task_context_switch_counts(struct taskstats *t)
--
2.39.3 (Apple Git-145)
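
As a usage sketch (assumed invocation mirroring the documentation
example above; output values elided), the updated getdelays prints
the new row next to the existing IRQ row:

	# ./getdelays -d -t 242
	...
	IRQ	count	delay total	delay average	delay max	delay min
	...
	SOFTIRQ	count	delay total	delay average	delay max	delay min
	...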