lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220728024327.1202-1-thunder.leizhen@huawei.com>
Date:   Thu, 28 Jul 2022 10:43:27 +0800
From:   Zhen Lei <thunder.leizhen@...wei.com>
To:     "Paul E . McKenney" <paulmck@...nel.org>,
        Frederic Weisbecker <frederic@...nel.org>,
        Neeraj Upadhyay <quic_neeraju@...cinc.com>,
        "Josh Triplett" <josh@...htriplett.org>,
        Steven Rostedt <rostedt@...dmis.org>,
        Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
        Lai Jiangshan <jiangshanlai@...il.com>,
        Joel Fernandes <joel@...lfernandes.org>, <rcu@...r.kernel.org>,
        <linux-kernel@...r.kernel.org>
CC:     Zhen Lei <thunder.leizhen@...wei.com>
Subject: [PATCH v2] rcu: Display registers of self-detected stall as far as possible

For architectures that do not support NMI interrupts, registers is not
printed when rcu stall is self-detected. However, this information is
useful for analyzing the root cause of the fault. Fortunately, the rcu
stall is always detected in the tick interrupt handler. So we can take
it through get_irq_regs() and display it through show_regs(). Further,
show_regs() unwind the call trace based on 'regs', the worthless call
trace associated with tick handling will be omitted, this helps us to
focus more on the problem.

This is an example on arm64:
[   27.501721] rcu: INFO: rcu_preempt self-detected stall on CPU
[   27.502238] rcu:     0-....: (1250 ticks this GP) idle=4f7/1/0x4000000000000000 softirq=2594/2594 fqs=619
[   27.502632]  (t=1251 jiffies g=2989 q=29 ncpus=4)
[   27.503845] CPU: 0 PID: 306 Comm: test0 Not tainted 5.19.0-rc7-00009-g1c1a6c29ff99-dirty #46
[   27.504732] Hardware name: linux,dummy-virt (DT)
[   27.504947] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   27.504998] pc : arch_counter_read+0x18/0x24
[   27.505301] lr : arch_counter_read+0x18/0x24
[   27.505328] sp : ffff80000b29bdf0
[   27.505345] x29: ffff80000b29bdf0 x28: 0000000000000000 x27: 0000000000000000
[   27.505475] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000
[   27.505553] x23: 0000000000001f40 x22: ffff800009849c48 x21: 000000065f871ae0
[   27.505627] x20: 00000000000025ec x19: ffff80000a6eb300 x18: ffffffffffffffff
[   27.505654] x17: 0000000000000001 x16: 0000000000000000 x15: ffff80000a6d0296
[   27.505681] x14: ffffffffffffffff x13: ffff80000a29bc18 x12: 0000000000000426
[   27.505709] x11: 0000000000000162 x10: ffff80000a2f3c18 x9 : ffff80000a29bc18
[   27.505736] x8 : 00000000ffffefff x7 : ffff80000a2f3c18 x6 : 00000000759bd013
[   27.505761] x5 : 01ffffffffffffff x4 : 0002dc6c00000000 x3 : 0000000000000017
[   27.505787] x2 : 00000000000025ec x1 : ffff80000b29bdf0 x0 : 0000000075a30653
[   27.505937] Call trace:
[   27.506002]  arch_counter_read+0x18/0x24
[   27.506171]  ktime_get+0x48/0xa0
[   27.506207]  test_task+0x70/0xf0
[   27.506227]  kthread+0x10c/0x110
[   27.506243]  ret_from_fork+0x10/0x20

The old output is as follows:
[   27.944550] rcu: INFO: rcu_preempt self-detected stall on CPU
[   27.944980] rcu:     0-....: (1249 ticks this GP) idle=cbb/1/0x4000000000000000 softirq=2610/2610 fqs=614
[   27.945407]  (t=1251 jiffies g=2681 q=28 ncpus=4)
[   27.945731] Task dump for CPU 0:
[   27.945844] task:test0           state:R  running task     stack:    0 pid:  306 ppid:     2 flags:0x0000000a
[   27.946073] Call trace:
[   27.946151]  dump_backtrace.part.0+0xc8/0xd4
[   27.946378]  show_stack+0x18/0x70
[   27.946405]  sched_show_task+0x150/0x180
[   27.946427]  dump_cpu_task+0x44/0x54
[   27.947193]  rcu_dump_cpu_stacks+0xec/0x130
[   27.947212]  rcu_sched_clock_irq+0xb18/0xef0
[   27.947231]  update_process_times+0x68/0xac
[   27.947248]  tick_sched_handle+0x34/0x60
[   27.947266]  tick_sched_timer+0x4c/0xa4
[   27.947281]  __hrtimer_run_queues+0x178/0x360
[   27.947295]  hrtimer_interrupt+0xe8/0x244
[   27.947309]  arch_timer_handler_virt+0x38/0x4c
[   27.947326]  handle_percpu_devid_irq+0x88/0x230
[   27.947342]  generic_handle_domain_irq+0x2c/0x44
[   27.947357]  gic_handle_irq+0x44/0xc4
[   27.947376]  call_on_irq_stack+0x2c/0x54
[   27.947415]  do_interrupt_handler+0x80/0x94
[   27.947431]  el1_interrupt+0x34/0x70
[   27.947447]  el1h_64_irq_handler+0x18/0x24
[   27.947462]  el1h_64_irq+0x64/0x68                       <--- the above backtrace is worthless
[   27.947474]  arch_counter_read+0x18/0x24
[   27.947487]  ktime_get+0x48/0xa0
[   27.947501]  test_task+0x70/0xf0
[   27.947520]  kthread+0x10c/0x110
[   27.947538]  ret_from_fork+0x10/0x20

Signed-off-by: Zhen Lei <thunder.leizhen@...wei.com>
---
 kernel/rcu/tree_stall.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

v1 --> v2:
Fix C99 build warning:
kernel/rcu/tree_stall.h:358:10: error: call to undeclared function 'get_irq_regs';
ISO C99 and later do not support implicit function declarations

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index a001e1e7a99269c..23bfd755c3f6e5b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kvm_para.h>
+#include <asm/irq_regs.h>
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -350,6 +351,21 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
 }
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
+static void rcu_dump_cpu_task(int cpu)
+{
+	if (cpu == smp_processor_id() && in_irq()) {
+		struct pt_regs *regs;
+
+		regs = get_irq_regs();
+		if (regs) {
+			show_regs(regs);
+			return;
+		}
+	}
+
+	dump_cpu_task(cpu);
+}
+
 /*
  * Dump stacks of all tasks running on stalled CPUs.  First try using
  * NMIs, but fall back to manual remote stack tracing on architectures
@@ -369,7 +385,7 @@ static void rcu_dump_cpu_stacks(void)
 				if (cpu_is_offline(cpu))
 					pr_err("Offline CPU %d blocking current GP.\n", cpu);
 				else if (!trigger_single_cpu_backtrace(cpu))
-					dump_cpu_task(cpu);
+					rcu_dump_cpu_task(cpu);
 			}
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
-- 
2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ