Message-ID: <20250819092739.GA31177@didi-ThinkCentre-M930t-N000>
Date: Tue, 19 Aug 2025 17:27:39 +0800
From: Tio Zhang <tiozhang@...iglobal.com>
To: <akpm@...ux-foundation.org>, <wang.yaxin@....com.cn>,
<fan.yu9@....com.cn>, <corbet@....net>, <bsingharora@...il.com>,
<yang.yang29@....com.cn>
CC: <linux-kernel@...r.kernel.org>, <linux-doc@...r.kernel.org>,
<mingo@...hat.com>, <peterz@...radead.org>, <juri.lelli@...hat.com>,
<vincent.guittot@...aro.org>, <dietmar.eggemann@....com>,
<rostedt@...dmis.org>, <bsegall@...gle.com>, <mgorman@...e.de>,
<vschneid@...hat.com>, <jiang.kun2@....com.cn>, <xu.xin16@....com.cn>,
<wang.yong12@....com.cn>, <tiozhang@...iglobal.com>, <zyhtheonly@...il.com>,
<zyhtheonly@...h.net>
Subject: [PATCH] delayacct/sched: add SOFTIRQ delay
Introduce SOFTIRQ delay, so that softirq time can be reported
separately as SOFTIRQ delay and hardirq time derived as
{IRQ - SOFTIRQ} delay.

A typical scenario is tasks being delayed by the network: if they
are delayed by rx net packets, i.e., net_rx_action(), SOFTIRQ delay
is almost the same as IRQ delay; if they are delayed by, e.g., a bad
driver or broken hardware, SOFTIRQ delay stays close to 0 while IRQ
delay remains large.

Example tool usage can be found in
Documentation/accounting/delay-accounting.rst
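
To make the split concrete, here is a minimal userspace sketch
(illustrative only, not part of this patch) that derives the
hardirq-only share from the two counters. It assumes a struct
taskstats already fetched over the taskstats netlink interface, as
tools/accounting/getdelays.c does:

	#include <linux/taskstats.h>

	/*
	 * Illustrative helper: hardirq-only delay derived as
	 * {IRQ - SOFTIRQ}, both reported in nanoseconds.
	 */
	static __u64 hardirq_delay_ns(const struct taskstats *t)
	{
		/* soft_delay_total only exists from version 17 on */
		if (t->version < 17)
			return t->irq_delay_total;
		/* IRQ delay includes SOFTIRQ delay, so subtract it */
		return t->irq_delay_total - t->soft_delay_total;
	}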
Signed-off-by: Tio Zhang <tiozhang@...iglobal.com>
---
 Documentation/accounting/delay-accounting.rst |  5 ++++-
 include/linux/delayacct.h                     | 18 ++++++++++------
 include/uapi/linux/taskstats.h                |  9 +++++++-
 kernel/delayacct.c                            |  9 +++++++-
 kernel/sched/core.c                           | 14 +++++++++----
 kernel/sched/cputime.c                        | 21 ++++++++++++++-----
 kernel/sched/sched.h                          |  6 +++++-
 tools/accounting/getdelays.c                  |  7 +++++++
 8 files changed, 70 insertions(+), 19 deletions(-)
diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 8ccc5af5ea1e..b6453723fbac 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -17,6 +17,7 @@ e) thrashing
f) direct compact
g) write-protect copy
h) IRQ/SOFTIRQ
+i) SOFTIRQ
and makes these statistics available to userspace through
the taskstats interface.
@@ -50,7 +51,7 @@ this structure. See
for a description of the fields pertaining to delay accounting.
It will generally be in the form of counters returning the cumulative
delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page
-cache, direct compact, write-protect copy, IRQ/SOFTIRQ etc.
+cache, direct compact, write-protect copy, IRQ/SOFTIRQ, SOFTIRQ etc.
Taking the difference of two successive readings of a given
counter (say cpu_delay_total) for a task will give the delay
@@ -123,6 +124,8 @@ Get sum and peak of delays, since system boot, for all pids with tgid 242::
156 11215873 0.072ms 0.207403ms 0.033913ms
IRQ count delay total delay average delay max delay min
0 0 0.000ms 0.000000ms 0.000000ms
+ SOFTIRQ count delay total delay average delay max delay min
+ 0 0 0.000ms 0.000000ms 0.000000ms
Get IO accounting for pid 1, it works only with -p::
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 800dcc360db2..b73d777d7a96 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -62,13 +62,18 @@ struct task_delay_info {
u64 irq_delay_max;
u64 irq_delay_min;
- u64 irq_delay; /* wait for IRQ/SOFTIRQ */
+ u64 irq_delay; /* wait for IRQ/SOFTIRQ */
+
+ u64 soft_delay_max;
+ u64 soft_delay_min;
+ u64 soft_delay; /* wait for SOFTIRQ */
u32 freepages_count; /* total count of memory reclaim */
u32 thrashing_count; /* total count of thrash waits */
u32 compact_count; /* total count of memory compact */
u32 wpcopy_count; /* total count of write-protect copy */
- u32 irq_count; /* total count of IRQ/SOFTIRQ */
+ u32 irq_count; /* total count of IRQ/SOFTIRQ */
+ u32 soft_count; /* total count of SOFTIRQ */
};
#endif
@@ -98,7 +103,7 @@ extern void __delayacct_compact_start(void);
extern void __delayacct_compact_end(void);
extern void __delayacct_wpcopy_start(void);
extern void __delayacct_wpcopy_end(void);
-extern void __delayacct_irq(struct task_struct *task, u32 delta);
+extern void __delayacct_irq(struct task_struct *task, u32 delta, u32 delta_soft);
static inline void delayacct_tsk_init(struct task_struct *tsk)
{
@@ -233,13 +238,14 @@ static inline void delayacct_wpcopy_end(void)
__delayacct_wpcopy_end();
}
-static inline void delayacct_irq(struct task_struct *task, u32 delta)
+static inline void delayacct_irq(struct task_struct *task, u32 delta,
+ u32 delta_soft)
{
if (!static_branch_unlikely(&delayacct_key))
return;
if (task->delays)
- __delayacct_irq(task, delta);
+ __delayacct_irq(task, delta, delta_soft);
}
#else
@@ -280,7 +286,7 @@ static inline void delayacct_wpcopy_start(void)
{}
static inline void delayacct_wpcopy_end(void)
{}
-static inline void delayacct_irq(struct task_struct *task, u32 delta)
+static inline void delayacct_irq(struct task_struct *task, u32 delta, u32 delta_soft)
{}
#endif /* CONFIG_TASK_DELAY_ACCT */
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 5929030d4e8b..23307f88e255 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
*/
-#define TASKSTATS_VERSION 16
+#define TASKSTATS_VERSION 17
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
* in linux/sched.h */
@@ -230,6 +230,13 @@ struct taskstats {
__u64 irq_delay_max;
__u64 irq_delay_min;
+
+ /* v17: Delay waiting for SOFTIRQ */
+ __u64 soft_count;
+ __u64 soft_delay_total;
+
+ __u64 soft_delay_max;
+ __u64 soft_delay_min;
};
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 30e7912ebb0d..15f88ca0c0e6 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -189,6 +189,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
UPDATE_DELAY(compact);
UPDATE_DELAY(wpcopy);
UPDATE_DELAY(irq);
+ UPDATE_DELAY(soft);
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -289,7 +290,7 @@ void __delayacct_wpcopy_end(void)
¤t->delays->wpcopy_delay_min);
}
-void __delayacct_irq(struct task_struct *task, u32 delta)
+void __delayacct_irq(struct task_struct *task, u32 delta, u32 delta_soft)
{
unsigned long flags;
@@ -300,6 +301,12 @@ void __delayacct_irq(struct task_struct *task, u32 delta)
task->delays->irq_delay_max = delta;
if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
task->delays->irq_delay_min = delta;
+ task->delays->soft_delay += delta_soft;
+ task->delays->soft_count++;
+ if (delta_soft > task->delays->soft_delay_max)
+ task->delays->soft_delay_max = delta_soft;
+ if (delta_soft && (!task->delays->soft_delay_min || delta_soft < task->delays->soft_delay_min))
+ task->delays->soft_delay_min = delta_soft;
raw_spin_unlock_irqrestore(&task->delays->lock, flags);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be00629f0ba4..30ba2e312356 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,11 +773,12 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
* In theory, the compile should just see 0 here, and optimize out the call
* to sched_rt_avg_update. But I don't trust it...
*/
- s64 __maybe_unused steal = 0, irq_delta = 0;
+ s64 __maybe_unused steal = 0, irq_delta = 0, soft_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
if (irqtime_enabled()) {
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ irq_delta = irq_time_read(cpu_of(rq), &soft_delta) - rq->prev_irq_time;
+ soft_delta -= rq->prev_soft_time;
/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
@@ -794,12 +795,17 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
* the current rq->clock timestamp, except that would require using
* atomic ops.
*/
- if (irq_delta > delta)
+ if (soft_delta > delta) { /* IRQ includes SOFTIRQ */
+ soft_delta = delta;
irq_delta = delta;
+ } else if (irq_delta > delta) {
+ irq_delta = delta;
+ }
rq->prev_irq_time += irq_delta;
+ rq->prev_soft_time += soft_delta;
delta -= irq_delta;
- delayacct_irq(rq->curr, irq_delta);
+ delayacct_irq(rq->curr, irq_delta, soft_delta);
}
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7097de2c8cda..17467f1f3e72 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -38,13 +38,14 @@ void disable_sched_clock_irqtime(void)
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
- enum cpu_usage_stat idx)
+ u64 delta_soft, enum cpu_usage_stat idx)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
u64_stats_update_begin(&irqtime->sync);
cpustat[idx] += delta;
irqtime->total += delta;
+ irqtime->total_soft += delta_soft;
irqtime->tick_delta += delta;
u64_stats_update_end(&irqtime->sync);
}
@@ -57,17 +58,27 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
unsigned int pc;
- s64 delta;
+ s64 delta, delta_soft = 0, cpu_clock;
int cpu;
if (!irqtime_enabled())
return;
cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
+ cpu_clock = sched_clock_cpu(cpu);
+ delta = cpu_clock - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
pc = irq_count() - offset;
+ /*
+ * We only account softirq time when we are called by
+ * account_softirq_enter{,exit}
+ */
+ if ((offset & SOFTIRQ_OFFSET) || (pc & SOFTIRQ_OFFSET)) {
+ delta_soft = cpu_clock - irqtime->soft_start_time;
+ irqtime->soft_start_time += delta_soft;
+ }
+
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -75,9 +86,9 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
* that do not consume any time, but still wants to run.
*/
if (pc & HARDIRQ_MASK)
- irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ irqtime_account_delta(irqtime, delta, delta_soft, CPUTIME_IRQ);
else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
- irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+ irqtime_account_delta(irqtime, delta, delta_soft, CPUTIME_SOFTIRQ);
}
static u64 irqtime_tick_accounted(u64 maxtime)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be9745d104f7..b263cb046cfa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1219,6 +1219,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 prev_soft_time;
u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
@@ -3135,8 +3136,10 @@ static inline void sched_core_tick(struct rq *rq) { }
struct irqtime {
u64 total;
+ u64 total_soft;
u64 tick_delta;
u64 irq_start_time;
+ u64 soft_start_time;
struct u64_stats_sync sync;
};
@@ -3153,7 +3156,7 @@ static inline int irqtime_enabled(void)
* Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime
* and never move forward.
*/
-static inline u64 irq_time_read(int cpu)
+static inline u64 irq_time_read(int cpu, u64 *total_soft)
{
struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
unsigned int seq;
@@ -3162,6 +3165,7 @@ static inline u64 irq_time_read(int cpu)
do {
seq = __u64_stats_fetch_begin(&irqtime->sync);
total = irqtime->total;
+ *total_soft = irqtime->total_soft;
} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
return total;
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 21cb3c3d1331..7299cb60aa33 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -205,6 +205,7 @@ static int get_family_id(int sd)
* version >= 13 - supports WPCOPY statistics
* version >= 14 - supports IRQ statistics
* version >= 16 - supports *_max and *_min delay statistics
+ * version >= 17 - supports SOFTIRQ statistics
*
* Always verify version before accessing version-dependent fields
* to maintain backward compatibility.
@@ -296,6 +297,12 @@ static void print_delayacct(struct taskstats *t)
irq_count, irq_delay_total,
irq_delay_max, irq_delay_min);
}
+
+ if (t->version >= 17) {
+ PRINT_FILED_DELAY("SOFTIRQ", t->version, t,
+ soft_count, soft_delay_total,
+ soft_delay_max, soft_delay_min);
+ }
}
static void task_context_switch_counts(struct taskstats *t)
--
2.39.3 (Apple Git-145)
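
As a usage sketch (assumed invocation mirroring the documentation
example above; output values elided), the updated getdelays prints
the new row next to the existing IRQ row:

	# ./getdelays -d -t 242
	...
	IRQ	count	delay total	delay average	delay max	delay min
	...
	SOFTIRQ	count	delay total	delay average	delay max	delay min
	...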