Message-ID: <gmjp3pipxnng6vlnqri5nhcgju5vkmxkwi3z4rxixaiovontnh@nq74fa3xplcr>
Date: Thu, 18 Dec 2025 22:09:57 -0500
From: Aaron Tomlin <atomlin@...mlin.com>
To: Petr Mladek <pmladek@...e.com>
Cc: Lance Yang <lance.yang@...ux.dev>, sean@...e.io,
linux-kernel@...r.kernel.org, mhiramat@...nel.org, akpm@...ux-foundation.org,
gregkh@...uxfoundation.org
Subject: Re: [PATCH v3 2/2] hung_task: Enable runtime reset of
hung_task_detect_count
On Wed, Dec 17, 2025 at 02:09:14PM +0100, Petr Mladek wrote:
> The counter is used to decide how many hung tasks were found
> by a single check, see:
>
> static void check_hung_uninterruptible_tasks(unsigned long timeout)
> {
> [...]
> unsigned long prev_detect_count = sysctl_hung_task_detect_count;
> [...]
> for_each_process_thread(g, t) {
> [...]
> check_hung_task(t, timeout, prev_detect_count);
> }
> [...]
> if (!(sysctl_hung_task_detect_count - prev_detect_count))
> return;
>
> if (need_warning || hung_task_call_panic) {
> si_mask |= SYS_INFO_LOCKS;
>
> if (sysctl_hung_task_all_cpu_backtrace)
> si_mask |= SYS_INFO_ALL_BT;
> }
>
> sys_info(si_mask);
>
> if (hung_task_call_panic)
> panic("hung_task: blocked tasks");
> }
>
> Any race might cause false positives and even panic() !!!
> And the race window is rather big (checking all processes).
>
> This race can't be prevented by an atomic read/write.
> IMHO, the only solution would be to add some locking,
> e.g. mutex and take it around the entire
> check_hung_uninterruptible_tasks().
>
> On one hand, it might work. The code is called from
> a task context...
>
> But adding a lock into a lockup-detector code triggers
> some warning bells in my head.
>
> Honestly, I am not sure if it is worth it. I understand
> the motivation for the reset. But IMHO, even more important
> is to make sure that the watchdog works as expected.
Hi Petr,
Thank you for the feedback regarding the potential for false positives and
erroneous panics. You are absolutely correct that the race window is
substantial, given that it spans the entire duration of a full
process-thread iteration.
I believe we can resolve this race and prevent the integer underflow,
without the overhead or risk of a mutex, by switching the counter to an
atomic type and slightly adjusting the delta logic.
Specifically, by converting sysctl_hung_task_detect_count to an
atomic_long_t, we ensure that the increment and reset operations are
indivisible at the CPU level. To handle the "reset mid-scan" issue you
highlighted, we can modify the delta calculation to be "reset-aware":
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 01ce46a107b0..74725dc1efd0 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -17,6 +17,7 @@
#include <linux/export.h>
#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
+#include <linux/atomic.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
#include <linux/sched/signal.h>
@@ -36,7 +37,7 @@ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Total number of tasks detected as hung since boot:
*/
-static unsigned long __read_mostly sysctl_hung_task_detect_count;
+static atomic_long_t sysctl_hung_task_detect_count = ATOMIC_LONG_INIT(0);
/*
* Limit number of tasks checked in a batch.
@@ -253,6 +254,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout,
unsigned long prev_detect_count)
{
unsigned long total_hung_task;
+ unsigned long current_detect;
if (!task_is_hung(t, timeout))
return;
@@ -261,9 +263,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout,
* This counter tracks the total number of tasks detected as hung
* since boot.
*/
- sysctl_hung_task_detect_count++;
+ atomic_long_inc(&sysctl_hung_task_detect_count);
+
+ current_detect = atomic_long_read(&sysctl_hung_task_detect_count);
+ if (current_detect >= prev_detect_count)
+ total_hung_task = current_detect - prev_detect_count;
+ else
+ total_hung_task = current_detect;
- total_hung_task = sysctl_hung_task_detect_count - prev_detect_count;
trace_sched_process_hang(t);
if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) {
@@ -322,7 +329,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
int max_count = sysctl_hung_task_check_count;
unsigned long last_break = jiffies;
struct task_struct *g, *t;
- unsigned long prev_detect_count = sysctl_hung_task_detect_count;
+ unsigned long prev_detect_count = atomic_long_read(&sysctl_hung_task_detect_count);
int need_warning = sysctl_hung_task_warnings;
unsigned long si_mask = hung_task_si_mask;
@@ -350,7 +357,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
unlock:
rcu_read_unlock();
- if (!(sysctl_hung_task_detect_count - prev_detect_count))
+ if (!(atomic_long_read(&sysctl_hung_task_detect_count) - prev_detect_count))
return;
if (need_warning || hung_task_call_panic) {
@@ -391,10 +398,15 @@ static long hung_timeout_jiffies(unsigned long last_checked,
static int proc_dohung_task_detect_count(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- if (!write)
- return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (!write) {
+ unsigned long count = atomic_long_read(&sysctl_hung_task_detect_count);
+ struct ctl_table proxy_table = *table;
+
+ proxy_table.data = &count;
+ return proc_doulongvec_minmax(&proxy_table, write, buffer, lenp, ppos);
+ }
- WRITE_ONCE(sysctl_hung_task_detect_count, 0);
+ atomic_long_set(&sysctl_hung_task_detect_count, 0);
*ppos += *lenp;
return 0;
@@ -480,7 +492,6 @@ static const struct ctl_table hung_task_sysctls[] = {
},
{
.procname = "hung_task_detect_count",
- .data = &sysctl_hung_task_detect_count,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dohung_task_detect_count,
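For clarity, here is a standalone userspace sketch (not part of the patch,
names and values are made up for illustration) of the "reset-aware" delta
used above:

#include <stdio.h>

static unsigned long reset_aware_delta(unsigned long current_detect,
				       unsigned long prev_detect_count)
{
	/*
	 * A mid-scan reset drops current_detect below prev_detect_count;
	 * the raw subtraction would underflow to a huge value.  Fall back
	 * to the post-reset count instead.
	 */
	if (current_detect >= prev_detect_count)
		return current_detect - prev_detect_count;
	return current_detect;
}

int main(void)
{
	/* Normal case: counter went 7 -> 10 during the scan, delta is 3. */
	printf("%lu\n", reset_aware_delta(10, 7));

	/* Reset mid-scan: counter went 7 -> 0 -> 2, delta is 2, not ~ULONG_MAX. */
	printf("%lu\n", reset_aware_delta(2, 7));

	return 0;
}

The idea is simply that a reset arriving mid-scan makes the current value
drop below the snapshot taken at the start of the scan, and falling back to
the post-reset count avoids the underflowed delta that could otherwise trip
the panic threshold.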
Please note that this updated approach has not been tested yet, but I
believe it addresses the core concerns while maintaining the architectural
integrity of the detector.
Kind regards,
--
Aaron Tomlin