[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <202601141934036563c-MXYWMY4VX0grNNmdA_@zte.com.cn>
Date: Wed, 14 Jan 2026 19:34:03 +0800 (CST)
From: <wang.yaxin@....com.cn>
To: <akpm@...ux-foundation.org>, <fan.yu9@....com.cn>,
<yang.yang29@....com.cn>
Cc: <corbet@....net>, <linux-kernel@...r.kernel.org>,
<linux-doc@...r.kernel.org>, <xu.xin16@....com.cn>
Subject: [PATCH] delayacct: add timestamp of delay max
From: Wang Yaxin <wang.yaxin@....com.cn>
Problem
=======
Commit 658eb5ab916d ("delayacct: add delay max to record delay peak")
introduced the delay max for getdelays, which records abnormal latency
peaks and helps us understand the magnitude of such delays. However,
the peak latency value alone is insufficient for effective root cause
analysis. Without the precise timestamp of when the peak occurred, we
still lack the critical context needed to correlate it with other
system events.
Solution
========
To address this, we need to additionally record a precise timestamp
when the maximum latency occurs. By correlating this timestamp with
system logs and monitoring metrics, we can identify processes with
abnormal resource usage at the same moment, which can help us to
pinpoint root causes.
Use Case
========
bash-4.4# ./getdelays -d -t 200
print delayacct stats ON
TGID 200
CPU count real total virtual total delay total delay average delay max delay min max timestamp
45 176000000 181535534 1429077 0.032ms 0.418387ms 0.124835ms 2026-01-13 12:38:39
IO count delay total delay average delay max delay min max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
SWAP count delay total delay average delay max delay min max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
RECLAIM count delay total delay average delay max delay min max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
THRASHING count delay total delay average delay max delay min max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
COMPACT count delay total delay average delay max delay min max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
WPCOPY count delay total delay average delay max delay min max timestamp
154 14315567 0.093ms 3.450139ms 0.018359ms 2026-01-13 12:38:47
IRQ count delay total delay average delay max delay min max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
bash-4.4#
Signed-off-by: Wang Yaxin <wang.yaxin@....com.cn>
---
include/linux/delayacct.h | 8 ++
include/linux/sched.h | 5 +
include/uapi/linux/taskstats.h | 22 ++++-
kernel/delayacct.c | 27 ++++--
kernel/sched/stats.h | 8 +-
tools/accounting/getdelays.c | 171 +++++++++++++++++++++++++++------
6 files changed, 203 insertions(+), 38 deletions(-)
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 800dcc360db2..ecb06f16d22c 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -69,6 +69,14 @@ struct task_delay_info {
u32 compact_count; /* total count of memory compact */
u32 wpcopy_count; /* total count of write-protect copy */
u32 irq_count; /* total count of IRQ/SOFTIRQ */
+
+ struct timespec64 blkio_delay_max_ts;
+ struct timespec64 swapin_delay_max_ts;
+ struct timespec64 freepages_delay_max_ts;
+ struct timespec64 thrashing_delay_max_ts;
+ struct timespec64 compact_delay_max_ts;
+ struct timespec64 wpcopy_delay_max_ts;
+ struct timespec64 irq_delay_max_ts;
};
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c403f572bac8..ee83e4646ebd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -49,6 +49,7 @@
#include <linux/tracepoint-defs.h>
#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
+#include <linux/time64.h>
#ifndef COMPILE_OFFSETS
#include <generated/rq-offsets.h>
#endif
@@ -87,6 +88,7 @@ struct task_delay_info;
struct task_group;
struct task_struct;
struct user_event_mm;
+struct timespec64;
#include <linux/sched/ext.h>
@@ -435,6 +437,9 @@ struct sched_info {
/* When were we last queued to run? */
unsigned long long last_queued;
+ /* Timestamp of max time spent waiting on a runqueue: */
+ struct timespec64 max_run_delay_ts;
+
#endif /* CONFIG_SCHED_INFO */
};
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 5929030d4e8b..1b31e8e14d2f 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -18,6 +18,16 @@
#define _LINUX_TASKSTATS_H
#include <linux/types.h>
+#ifdef __KERNEL__
+#include <linux/time64.h>
+#else
+#ifndef _LINUX_TIME64_H
+struct timespec64 {
+ __s64 tv_sec; /* seconds */
+ long tv_nsec; /* nanoseconds */
+};
+#endif
+#endif
/* Format for per-task data returned to userland when
* - a task exits
@@ -34,7 +44,7 @@
*/
-#define TASKSTATS_VERSION 16
+#define TASKSTATS_VERSION 17
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
* in linux/sched.h */
@@ -230,6 +240,16 @@ struct taskstats {
__u64 irq_delay_max;
__u64 irq_delay_min;
+
+ /*v17: delay max timestamp record*/
+ struct timespec64 cpu_delay_max_ts;
+ struct timespec64 blkio_delay_max_ts;
+ struct timespec64 swapin_delay_max_ts;
+ struct timespec64 freepages_delay_max_ts;
+ struct timespec64 thrashing_delay_max_ts;
+ struct timespec64 compact_delay_max_ts;
+ struct timespec64 wpcopy_delay_max_ts;
+ struct timespec64 irq_delay_max_ts;
};
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 30e7912ebb0d..143c5b9b67ea 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -18,6 +18,7 @@
do { \
d->type##_delay_max = tsk->delays->type##_delay_max; \
d->type##_delay_min = tsk->delays->type##_delay_min; \
+ d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \
tmp = d->type##_delay_total + tsk->delays->type##_delay; \
d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
d->type##_count += tsk->delays->type##_count; \
@@ -104,7 +105,8 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumulator (@total) and @count
*/
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count,
+ u64 *max, u64 *min, struct timespec64 *ts)
{
s64 ns = local_clock() - *start;
unsigned long flags;
@@ -113,8 +115,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou
raw_spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- if (ns > *max)
+ if (ns > *max) {
*max = ns;
+ ktime_get_real_ts64(ts);
+ }
if (*min == 0 || ns < *min)
*min = ns;
raw_spin_unlock_irqrestore(lock, flags);
@@ -137,7 +141,8 @@ void __delayacct_blkio_end(struct task_struct *p)
&p->delays->blkio_delay,
&p->delays->blkio_count,
&p->delays->blkio_delay_max,
- &p->delays->blkio_delay_min);
+ &p->delays->blkio_delay_min,
+ &p->delays->blkio_delay_max_ts);
}
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -170,6 +175,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->cpu_delay_max = tsk->sched_info.max_run_delay;
d->cpu_delay_min = tsk->sched_info.min_run_delay;
+ d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts;
tmp = (s64)d->cpu_delay_total + t2;
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
tmp = (s64)d->cpu_run_virtual_total + t3;
@@ -217,7 +223,8 @@ void __delayacct_freepages_end(void)
¤t->delays->freepages_delay,
¤t->delays->freepages_count,
¤t->delays->freepages_delay_max,
- ¤t->delays->freepages_delay_min);
+ ¤t->delays->freepages_delay_min,
+ ¤t->delays->freepages_delay_max_ts);
}
void __delayacct_thrashing_start(bool *in_thrashing)
@@ -241,7 +248,8 @@ void __delayacct_thrashing_end(bool *in_thrashing)
¤t->delays->thrashing_delay,
¤t->delays->thrashing_count,
¤t->delays->thrashing_delay_max,
- ¤t->delays->thrashing_delay_min);
+ ¤t->delays->thrashing_delay_min,
+ ¤t->delays->thrashing_delay_max_ts);
}
void __delayacct_swapin_start(void)
@@ -256,7 +264,8 @@ void __delayacct_swapin_end(void)
¤t->delays->swapin_delay,
¤t->delays->swapin_count,
¤t->delays->swapin_delay_max,
- ¤t->delays->swapin_delay_min);
+ ¤t->delays->swapin_delay_min,
+ ¤t->delays->swapin_delay_max_ts);
}
void __delayacct_compact_start(void)
@@ -271,7 +280,8 @@ void __delayacct_compact_end(void)
¤t->delays->compact_delay,
¤t->delays->compact_count,
¤t->delays->compact_delay_max,
- ¤t->delays->compact_delay_min);
+ ¤t->delays->compact_delay_min,
+ ¤t->delays->compact_delay_max_ts);
}
void __delayacct_wpcopy_start(void)
@@ -286,7 +296,8 @@ void __delayacct_wpcopy_end(void)
¤t->delays->wpcopy_delay,
¤t->delays->wpcopy_count,
¤t->delays->wpcopy_delay_max,
- ¤t->delays->wpcopy_delay_min);
+ ¤t->delays->wpcopy_delay_min,
+ ¤t->delays->wpcopy_delay_max_ts);
}
void __delayacct_irq(struct task_struct *task, u32 delta)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c903f1a42891..a612cf253c87 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -253,8 +253,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
delta = rq_clock(rq) - t->sched_info.last_queued;
t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
- if (delta > t->sched_info.max_run_delay)
+ if (delta > t->sched_info.max_run_delay) {
t->sched_info.max_run_delay = delta;
+ ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+ }
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
rq_sched_info_dequeue(rq, delta);
@@ -278,8 +280,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
- if (delta > t->sched_info.max_run_delay)
+ if (delta > t->sched_info.max_run_delay) {
t->sched_info.max_run_delay = delta;
+ ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+ }
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 21cb3c3d1331..fdf4d30bcf24 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -24,6 +24,7 @@
#include <sys/socket.h>
#include <sys/wait.h>
#include <signal.h>
+#include <time.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>
@@ -194,6 +195,36 @@ static int get_family_id(int sd)
#define average_ms(t, c) (t / 1000000ULL / (c ? c : 1))
#define delay_ms(t) (t / 1000000ULL)
+/*
+ * Format timespec64 to human readable string (YYYY-MM-DD HH:MM:SS)
+ * Returns formatted string or "N/A" if timestamp is zero
+ */
+static const char *format_timespec64(struct timespec64 *ts)
+{
+ static char buffer[32];
+ struct tm *tm_info;
+ time_t time_sec;
+
+ /* Check if timestamp is zero (not set) */
+ if (ts->tv_sec == 0 && ts->tv_nsec == 0)
+ return "N/A";
+
+ time_sec = (time_t)ts->tv_sec;
+ tm_info = localtime(&time_sec);
+ if (!tm_info)
+ return "N/A";
+
+ snprintf(buffer, sizeof(buffer), "%04d-%02d-%02d %02d:%02d:%02d",
+ tm_info->tm_year + 1900,
+ tm_info->tm_mon + 1,
+ tm_info->tm_mday,
+ tm_info->tm_hour,
+ tm_info->tm_min,
+ tm_info->tm_sec);
+
+ return buffer;
+}
+
/*
* Version compatibility note:
* Field availability depends on taskstats version (t->version),
@@ -205,13 +236,28 @@ static int get_family_id(int sd)
* version >= 13 - supports WPCOPY statistics
* version >= 14 - supports IRQ statistics
* version >= 16 - supports *_max and *_min delay statistics
+ * version >= 17 - supports delay max timestamp statistics
*
* Always verify version before accessing version-dependent fields
* to maintain backward compatibility.
*/
#define PRINT_CPU_DELAY(version, t) \
do { \
- if (version >= 16) { \
+ if (version >= 17) { \
+ printf("%-10s%15s%15s%15s%15s%15s%15s%15s%20s\n", \
+ "CPU", "count", "real total", "virtual total", \
+ "delay total", "delay average", "delay max", \
+ "delay min", "max timestamp"); \
+ printf(" %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms%20s\n", \
+ (unsigned long long)(t)->cpu_count, \
+ (unsigned long long)(t)->cpu_run_real_total, \
+ (unsigned long long)(t)->cpu_run_virtual_total, \
+ (unsigned long long)(t)->cpu_delay_total, \
+ average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \
+ delay_ms((double)(t)->cpu_delay_max), \
+ delay_ms((double)(t)->cpu_delay_min), \
+ format_timespec64(&(t)->cpu_delay_max_ts)); \
+ } else if (version >= 16) { \
printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \
"CPU", "count", "real total", "virtual total", \
"delay total", "delay average", "delay max", "delay min"); \
@@ -257,44 +303,115 @@ static int get_family_id(int sd)
} \
} while (0)
+#define PRINT_FILED_DELAY_WITH_TS(name, version, t, count, total, max, min, max_ts) \
+ do { \
+ if (version >= 17) { \
+ printf("%-10s%15s%15s%15s%15s%15s%20s\n", \
+ name, "count", "delay total", "delay average", \
+ "delay max", "delay min", "max timestamp"); \
+ printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms%20s\n", \
+ (unsigned long long)(t)->count, \
+ (unsigned long long)(t)->total, \
+ average_ms((double)(t)->total, (t)->count), \
+ delay_ms((double)(t)->max), \
+ delay_ms((double)(t)->min), \
+ format_timespec64(&(t)->max_ts)); \
+ } else if (version >= 16) { \
+ printf("%-10s%15s%15s%15s%15s%15s\n", \
+ name, "count", "delay total", "delay average", \
+ "delay max", "delay min"); \
+ printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \
+ (unsigned long long)(t)->count, \
+ (unsigned long long)(t)->total, \
+ average_ms((double)(t)->total, (t)->count), \
+ delay_ms((double)(t)->max), \
+ delay_ms((double)(t)->min)); \
+ } else { \
+ printf("%-10s%15s%15s%15s\n", \
+ name, "count", "delay total", "delay average"); \
+ printf(" %15llu%15llu%15.3fms\n", \
+ (unsigned long long)(t)->count, \
+ (unsigned long long)(t)->total, \
+ average_ms((double)(t)->total, (t)->count)); \
+ } \
+ } while (0)
+
static void print_delayacct(struct taskstats *t)
{
printf("\n\n");
PRINT_CPU_DELAY(t->version, t);
- PRINT_FILED_DELAY("IO", t->version, t,
- blkio_count, blkio_delay_total,
- blkio_delay_max, blkio_delay_min);
+ /* Use new macro with timestamp support for version >= 17 */
+ if (t->version >= 17) {
+ PRINT_FILED_DELAY_WITH_TS("IO", t->version, t,
+ blkio_count, blkio_delay_total,
+ blkio_delay_max, blkio_delay_min, blkio_delay_max_ts);
- PRINT_FILED_DELAY("SWAP", t->version, t,
- swapin_count, swapin_delay_total,
- swapin_delay_max, swapin_delay_min);
+ PRINT_FILED_DELAY_WITH_TS("SWAP", t->version, t,
+ swapin_count, swapin_delay_total,
+ swapin_delay_max, swapin_delay_min, swapin_delay_max_ts);
- PRINT_FILED_DELAY("RECLAIM", t->version, t,
- freepages_count, freepages_delay_total,
- freepages_delay_max, freepages_delay_min);
+ PRINT_FILED_DELAY_WITH_TS("RECLAIM", t->version, t,
+ freepages_count, freepages_delay_total,
+ freepages_delay_max, freepages_delay_min, freepages_delay_max_ts);
- PRINT_FILED_DELAY("THRASHING", t->version, t,
- thrashing_count, thrashing_delay_total,
- thrashing_delay_max, thrashing_delay_min);
+ PRINT_FILED_DELAY_WITH_TS("THRASHING", t->version, t,
+ thrashing_count, thrashing_delay_total,
+ thrashing_delay_max, thrashing_delay_min, thrashing_delay_max_ts);
- if (t->version >= 11) {
- PRINT_FILED_DELAY("COMPACT", t->version, t,
- compact_count, compact_delay_total,
- compact_delay_max, compact_delay_min);
- }
+ if (t->version >= 11) {
+ PRINT_FILED_DELAY_WITH_TS("COMPACT", t->version, t,
+ compact_count, compact_delay_total,
+ compact_delay_max, compact_delay_min, compact_delay_max_ts);
+ }
- if (t->version >= 13) {
- PRINT_FILED_DELAY("WPCOPY", t->version, t,
- wpcopy_count, wpcopy_delay_total,
- wpcopy_delay_max, wpcopy_delay_min);
- }
+ if (t->version >= 13) {
+ PRINT_FILED_DELAY_WITH_TS("WPCOPY", t->version, t,
+ wpcopy_count, wpcopy_delay_total,
+ wpcopy_delay_max, wpcopy_delay_min, wpcopy_delay_max_ts);
+ }
- if (t->version >= 14) {
- PRINT_FILED_DELAY("IRQ", t->version, t,
- irq_count, irq_delay_total,
- irq_delay_max, irq_delay_min);
+ if (t->version >= 14) {
+ PRINT_FILED_DELAY_WITH_TS("IRQ", t->version, t,
+ irq_count, irq_delay_total,
+ irq_delay_max, irq_delay_min, irq_delay_max_ts);
+ }
+ } else {
+ /* Use original macro for older versions */
+ PRINT_FILED_DELAY("IO", t->version, t,
+ blkio_count, blkio_delay_total,
+ blkio_delay_max, blkio_delay_min);
+
+ PRINT_FILED_DELAY("SWAP", t->version, t,
+ swapin_count, swapin_delay_total,
+ swapin_delay_max, swapin_delay_min);
+
+ PRINT_FILED_DELAY("RECLAIM", t->version, t,
+ freepages_count, freepages_delay_total,
+ freepages_delay_max, freepages_delay_min);
+
+ PRINT_FILED_DELAY("THRASHING", t->version, t,
+ thrashing_count, thrashing_delay_total,
+ thrashing_delay_max, thrashing_delay_min);
+
+ if (t->version >= 11) {
+ PRINT_FILED_DELAY("COMPACT", t->version, t,
+ compact_count, compact_delay_total,
+ compact_delay_max, compact_delay_min);
+ }
+
+ if (t->version >= 13) {
+ PRINT_FILED_DELAY("WPCOPY", t->version, t,
+ wpcopy_count, wpcopy_delay_total,
+ wpcopy_delay_max, wpcopy_delay_min);
+ }
+
+ if (t->version >= 14) {
+ PRINT_FILED_DELAY("IRQ", t->version, t,
+ irq_count, irq_delay_total,
+ irq_delay_max, irq_delay_min);
+ }
}
}
--
2.25.1
Powered by blists - more mailing lists