[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250630124721.18232-1-wladislav.wiebe@nokia.com>
Date: Mon, 30 Jun 2025 14:46:44 +0200
From: Wladislav Wiebe <wladislav.wiebe@...ia.com>
To: anna-maria@...utronix.de,
frederic@...nel.org,
mingo@...nel.org,
tglx@...utronix.de
Cc: akpm@...ux-foundation.org,
bigeasy@...utronix.de,
peterz@...radead.org,
linux-kernel@...r.kernel.org,
wladislav.wiebe@...ia.com
Subject: [PATCH] irq: add support for warning on long-running IRQ handlers
Introduce a new option CONFIG_IRQ_LATENCY_WARN that enables warnings when
IRQ handlers take an unusually long time to execute.
When triggered, the warning includes the CPU, IRQ number, handler address,
name, and execution duration, for example:
[CPU0] latency on IRQ[787:bad_irq_handler+0x1/0x34 [bad_irq]], took: 5 jiffies (~50 ms)
To keep runtime overhead minimal, this implementation uses a jiffies-based
timing mechanism. While coarse, it is sufficient to detect problematic IRQs.
A warning is triggered only when IRQs are disabled on one CPU long enough to
stall jiffies updates and the elapsed time reaches MAX_STALLED_JIFFIES.
Optionally, the reporting threshold can be adjusted by adding
extra jiffies via CONFIG_IRQ_LATENCY_WARN_THRESHOLD.
This approach avoids relying on high-resolution timers and aims for negligible
impact during normal operation.
Signed-off-by: Wladislav Wiebe <wladislav.wiebe@...ia.com>
---
include/linux/interrupt.h | 25 +++++++++++++++++++++++++
include/linux/tick.h | 2 ++
kernel/irq/handle.c | 2 ++
kernel/time/tick-sched.c | 2 --
lib/Kconfig.debug | 29 +++++++++++++++++++++++++++++
5 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 51b6484c0493..d3c5f920c4ea 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -15,6 +15,7 @@
#include <linux/cpumask_types.h>
#include <linux/workqueue.h>
#include <linux/jump_label.h>
+#include <linux/tick.h>
#include <linux/atomic.h>
#include <asm/ptrace.h>
@@ -881,4 +882,28 @@ extern int arch_early_irq_init(void);
#define __softirq_entry __section(".softirqentry.text")
+#ifdef CONFIG_IRQ_LATENCY_WARN
+static inline void warn_on_irq_latency(struct irqaction *action, unsigned int irq,
+ unsigned long jiffies_start)
+{
+ unsigned long delta = jiffies - jiffies_start;
+
+ /*
+ * Warn about long IRQ handler latency only if jiffies are reliable.
+ * The reporting condition can be met only when at least two CPUs
+ * have active ticks: jiffies updates are stalled on this CPU until
+ * MAX_STALLED_JIFFIES is reached and another CPU with an active tick
+ * forces an update.
+ */
+ if (unlikely(delta >= (MAX_STALLED_JIFFIES + CONFIG_IRQ_LATENCY_WARN_THRESHOLD))) {
+ pr_warn_ratelimited("[CPU%d] latency on IRQ[%u:%pS], took: %lu jiffies (~%u ms)\n",
+ smp_processor_id(), irq, action->handler,
+ delta, jiffies_to_msecs(delta));
+ }
+}
+#else
+static inline void warn_on_irq_latency(struct irqaction *action, unsigned int irq,
+ unsigned long jiffies_start) { }
+#endif
+
#endif
diff --git a/include/linux/tick.h b/include/linux/tick.h
index ac76ae9fa36d..543bd96b0653 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -14,6 +14,8 @@
#include <linux/rcupdate.h>
#include <linux/static_key.h>
+#define MAX_STALLED_JIFFIES 5
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS
extern void __init tick_init(void);
/* Should be core only, but ARM BL switcher requires it */
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 9489f93b3db3..273aebd71d8d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -145,6 +145,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
record_irq_time(desc);
for_each_action_of_desc(desc, action) {
+ unsigned long __maybe_unused jiffies_start = jiffies;
irqreturn_t res;
/*
@@ -156,6 +157,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
trace_irq_handler_entry(irq, action);
res = action->handler(irq, action->dev_id);
+ warn_on_irq_latency(action, irq, jiffies_start);
trace_irq_handler_exit(irq, action, res);
if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c527b421c865..5daee2bb3a18 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -201,8 +201,6 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts,
ts->flags &= ~flag;
}
-#define MAX_STALLED_JIFFIES 5
-
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int tick_cpu, cpu = smp_processor_id();
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ebe33181b6e6..88566adf4381 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1856,6 +1856,35 @@ config LATENCYTOP
Enable this option if you want to use the LatencyTOP tool
to find out which userspace is blocking on what kernel operations.
+config IRQ_LATENCY_WARN
+ bool "Warn on IRQ latency"
+ depends on NR_CPUS >= 2
+ default n
+ help
+ Enable this option to receive warnings when IRQ handlers take too long.
+
+ To keep overhead very low, this implementation uses jiffies-based
+ timing - which is coarse, but sufficient to detect problematic IRQs.
+
+ The minimum possible threshold is fixed at MAX_STALLED_JIFFIES,
+ because the reporting condition can be met only when at least two CPUs
+ have active ticks. The reporting threshold can be raised by setting
+ additional jiffies in CONFIG_IRQ_LATENCY_WARN_THRESHOLD.
+
+ The warning includes the affected CPU, IRQ number, handler address,
+ name, and the duration it took to execute.
+
+ Say Y if you want to identify problematic IRQs in the system.
+
+config IRQ_LATENCY_WARN_THRESHOLD
+ int "IRQ latency warning threshold in jiffies"
+ depends on IRQ_LATENCY_WARN
+ range 0 10000
+ default 0
+ help
+ Set the latency threshold (in jiffies) for the IRQ warning messages.
+ Note that this value is added on top of MAX_STALLED_JIFFIES.
+
config DEBUG_CGROUP_REF
bool "Disable inlining of cgroup css reference count functions"
depends on DEBUG_KERNEL
--
2.39.3.dirty
Powered by blists - more mailing lists