linux-kernel - [PATCH 1/2] workqueue: add time-based panic for stalls

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite for Android: free password hash cracker in your pocket

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-Id: <20260206-wqstall_panic_time-v1-1-f2a21d5d87a4@debian.org>
Date: Fri, 06 Feb 2026 03:18:01 -0800
From: Breno Leitao <leitao@...ian.org>
To: Tejun Heo <tj@...nel.org>, Lai Jiangshan <jiangshanlai@...il.com>
Cc: linux-kernel@...r.kernel.org, osandov@...ndov.com, rneu@...a.com, 
 Breno Leitao <leitao@...ian.org>, kernel-team@...a.com
Subject: [PATCH 1/2] workqueue: add time-based panic for stalls

Add a new module parameter 'panic_on_stall_time' that triggers a panic
when a workqueue stall persists for longer than the specified duration
in seconds.

Unlike 'panic_on_stall', which counts accumulated stall events, this
parameter triggers based on the duration of a single continuous stall.
This is useful for catching truly stuck workqueues rather than an
accumulation of transient stalls.

Usage:
  workqueue.panic_on_stall_time=120

This would panic if any workqueue pool has been stalled for 120 seconds
or more.

The stall duration is measured from the workqueue pool's last progress
timestamp (pool_ts), which accounts for legitimate system stalls.

Signed-off-by: Breno Leitao <leitao@...ian.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  8 ++++++++
 kernel/workqueue.c                              | 22 ++++++++++++++++++----
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1058f2a6d6a8c..a2953cf6c4038 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -8373,6 +8373,14 @@ Kernel parameters
 
 			The default is 0, which disables the panic on stall.
 
+	workqueue.panic_on_stall_time=<uint>
+			Panic when a workqueue stall has been continuous for
+			the specified number of seconds. Unlike panic_on_stall
+			which counts accumulated stall events, this triggers
+			based on the duration of a single continuous stall.
+
+			The default is 0, which disables the time-based panic.
+
 	workqueue.cpu_intensive_thresh_us=
 			Per-cpu work items which run for longer than this
 			threshold are automatically considered CPU intensive
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 253311af47c6d..6f63899dd6317 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -7508,6 +7508,10 @@ static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
 static unsigned int wq_panic_on_stall;
 module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);
 
+static unsigned int wq_panic_on_stall_time;
+module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
+MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");
+
 /*
  * Show workers that might prevent the processing of pending work items.
  * The only candidates are CPU-bound workers in the running state.
@@ -7559,7 +7563,12 @@ static void show_cpu_pools_hogs(void)
 	rcu_read_unlock();
 }
 
-static void panic_on_wq_watchdog(void)
+/*
+ * Trigger a panic in two scenarios: when the total number of stalls
+ * reaches wq_panic_on_stall, or when a single stall lasts at least
+ * wq_panic_on_stall_time seconds.
+ */
+static void panic_on_wq_watchdog(unsigned int stall_time_sec)
 {
 	static unsigned int wq_stall;
 
@@ -7567,6 +7576,8 @@ static void panic_on_wq_watchdog(void)
 		wq_stall++;
 		BUG_ON(wq_stall >= wq_panic_on_stall);
 	}
+
+	BUG_ON(wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time);
 }
 
 static void wq_watchdog_reset_touched(void)
@@ -7581,10 +7592,12 @@ static void wq_watchdog_reset_touched(void)
 static void wq_watchdog_timer_fn(struct timer_list *unused)
 {
 	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+	unsigned int max_stall_time = 0;
 	bool lockup_detected = false;
 	bool cpu_pool_stall = false;
 	unsigned long now = jiffies;
 	struct worker_pool *pool;
+	unsigned int stall_time;
 	int pi;
 
 	if (!thresh)
@@ -7618,14 +7631,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 		/* did we stall? */
 		if (time_after(now, ts + thresh)) {
 			lockup_detected = true;
+			stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
+			max_stall_time = max(max_stall_time, stall_time);
 			if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
 				pool->cpu_stall = true;
 				cpu_pool_stall = true;
 			}
 			pr_emerg("BUG: workqueue lockup - pool");
 			pr_cont_pool_info(pool);
-			pr_cont(" stuck for %us!\n",
-				jiffies_to_msecs(now - pool_ts) / 1000);
+			pr_cont(" stuck for %us!\n", stall_time);
 		}
 
 
@@ -7638,7 +7652,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
 		show_cpu_pools_hogs();
 
 	if (lockup_detected)
-		panic_on_wq_watchdog();
+		panic_on_wq_watchdog(max_stall_time);
 
 	wq_watchdog_reset_touched();
 	mod_timer(&wq_watchdog_timer, jiffies + thresh);

-- 
2.47.3