Message-ID: <20250220155722.2Z2a-3z0@linutronix.de>
Date: Thu, 20 Feb 2025 16:57:22 +0100
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: Oliver Sang <oliver.sang@...el.com>
Cc: oe-lkp@...ts.linux.dev, lkp@...el.com, Petr Pavlu <petr.pavlu@...e.com>,
"H. Peter Anvin" <hpa@...or.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>,
Ingo Molnar <mingo@...hat.com>,
Josh Poimboeuf <jpoimboe@...nel.org>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>, linux-kernel@...r.kernel.org,
kasan-dev@...glegroups.com, linux-hardening@...r.kernel.org,
Andrey Ryabinin <ryabinin.a.a@...il.com>,
Kees Cook <kees@...nel.org>,
Alexander Potapenko <glider@...gle.com>,
Andrey Konovalov <andreyknvl@...il.com>,
Dmitry Vyukov <dvyukov@...gle.com>, Marco Elver <elver@...gle.com>,
Vincenzo Frascino <vincenzo.frascino@....com>
Subject: Re: [linux-next:master] [x86] 66fbf67705:
kernel-selftests.kvm.hardware_disable_test.fail
On 2025-02-20 15:04:43 [+0800], Oliver Sang wrote:
> hi, Sebastian,
Hi Oliver,
+ UBSAN & KASAN + STACK unwind people. The commit in question is
e9d25b42bde5a ("x86: Use RCU in all users of __module_address().")
in -next.
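For the people just added to Cc: the piece of that commit which matters
here is in arch/x86/kernel/unwind_orc.c. Per unwound frame the ORC code
now enters an RCU read section instead of the plain preempt-disabled
section it had before, roughly like this (a sketch for orientation, not
the literal hunk):

bool unwind_next_frame(struct unwind_state *state)
{
	...
	if (unwind_done(state))
		return false;

	/* Don't let modules unload while we're reading their ORC data. */
	guard(rcu)();	/* previously a preempt_disable()/preempt_enable() pair */
	...
}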
> just FYI. we rebuild the kernel, and run the tests more times upon this commit
> and its parent, still see the issue persistent:
>
…
>
> f985e39203090cc6 66fbf677051818b9b5339fa8bfe
> ---------------- ---------------------------
>        fail:runs  %reproduction    fail:runs
>            |            |              |
>              :20       105%         20:20    kernel-selftests.kvm.hardware_disable_test.fail
>          %stddev    %change        %stddev
>              \          |              \
>       580.92         +17.4%      682.06      kernel-selftests.time.elapsed_time
>       580.92         +17.4%      682.06      kernel-selftests.time.elapsed_time.max
>       550.23         +13.1%      622.20      kernel-selftests.time.system_time
this is +~100 secs?
> as above, the time spent is also longer, though it's not only for
> kvm.hardware_disable_test (the time is for whole kernel-selftests.kvm)
…
> it seems to us that the commit really causes some slow down and it happens to
> make kvm.hardware_disable_test timeout on the older machine for our original
I made it slower, so why is that? Let me take a look. On my HW:
#1 patched (as of 66fbf677051818b9b5339fa8bfe)
| ~# time ./hardware_disable_test
| Random seed: 0x6b8b4567
|
| real 0m43.242s
| user 0m0.635s
| sys 0m18.292s
#2 use preempt_disable instead of rcu_read_lock() in unwind_orc.c (=revert)
| ~# time ./hardware_disable_test
| Random seed: 0x6b8b4567
|
| real 0m30.212s
| user 0m0.448s
| sys 0m12.939s
#3 replace preempt_disable with __rcu_read_lock() [slim without debug]
| ~# time ./hardware_disable_test
| Random seed: 0x6b8b4567
|
| real 0m29.953s
| user 0m0.436s
| sys 0m12.789s
#4 replace preempt_disable with __rcu_read_lock() + lockdep [RCU watching
test is missing]
| ~# time ./hardware_disable_test
| Random seed: 0x6b8b4567
|
| real 0m41.497s
| user 0m0.639s
| sys 0m17.112s
#5 Using rcu_read_lock() only if orc_module_find() is invoked.
Lost the output but it was more or less at the #2 level, meaning the test
does not look up modules often enough for it to matter (a rough sketch of
such a variant follows after the numbers).
#6 -CONFIG_UBSAN -CONFIG_KASAN +revert
| ~# time ./hardware_disable_test
| Random seed: 0x6b8b4567
|
| real 0m9.318s
| user 0m0.207s
| sys 0m3.395s
#7 -CONFIG_UBSAN -CONFIG_KASAN with RCU
| ~# time ./hardware_disable_test
| Random seed: 0x6b8b4567
|
| real 0m9.249s
| user 0m0.196s
| sys 0m3.332s
#8 -CONFIG_UBSAN -CONFIG_KASAN -LOCKDEP
| ~# time ./hardware_disable_test
| Random seed: 0x6b8b4567
|
| real 0m4.416s
| user 0m0.120s
| sys 0m1.426s
#9 -CONFIG_UBSAN -CONFIG_KASAN -LOCKDEP +revert
| ~# time ./hardware_disable_test
| Random seed: 0x6b8b4567
|
| real 0m4.391s
| user 0m0.137s
| sys 0m1.415s
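Roughly, the #5 variant could look like the sketch below (not the exact
hack I ran; __unwind_next_frame() stands in for the current function body
with the guard(rcu)() dropped):

bool unwind_next_frame(struct unwind_state *state)
{
	bool in_module, ret;

	if (unwind_done(state))
		return false;

	/*
	 * Only pay for the RCU read section when the frame's ip may sit in
	 * a module, i.e. when orc_find() can end up in orc_module_find()
	 * and module ORC data gets dereferenced afterwards.
	 */
	in_module = !core_kernel_text(state->ip);
	if (in_module)
		rcu_read_lock();

	ret = __unwind_next_frame(state);

	if (in_module)
		rcu_read_unlock();
	return ret;
}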
Now, based on this:
The RCU read section increased the runtime (on my hardware) for the test
from 30 to 43 seconds, which is roughly +43%.
This is due to the lockdep annotation within rcu_read_lock() and
rcu_read_unlock(), which does not exist in preempt_disable(). After
disabling UBSAN + KASAN, the lockdep annotation no longer has a measurable
effect. My guess is that UBSAN/KASAN is responsible for countless
backtraces while enabled. Those backtraces seem to be limited to the core
kernel.
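For reference, this is roughly what rcu_read_lock() expands to here
(paraphrased from include/linux/rcupdate.h for the !PREEMPT_RCU case;
rcu_read_unlock() mirrors it), while preempt_disable() is only a
preempt-count increment plus a compiler barrier:

static __always_inline void rcu_read_lock(void)
{
	__rcu_read_lock();			/* preempt_disable() for !PREEMPT_RCU */
	__acquire(RCU);				/* sparse annotation, no code */
	rcu_lock_acquire(&rcu_lock_map);	/* lockdep: a full lock_acquire() */
	RCU_LOCKDEP_WARN(!rcu_is_watching(),
			 "rcu_read_lock() used illegally while idle");
}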
How much do we care here? Is this something that makes the UBSAN + KASAN
folks uncomfortable? Or is lockdep slowing things down anyway?
If we do care, we could either move the RCU section down (as in #5) so it
is not taken that often, or go the other direction and move it up. Moving
it up, I got this:
| ~# time ./hardware_disable_test
| Random seed: 0x6b8b4567
|
| real 0m32.618s
| user 0m0.537s
| sys 0m13.942s
which is almost back at the pre-change level, using the hunk below, after
figuring out that most callers come from arch_stack_walk().
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 7cede4dc21f0..f20e3613942f 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -42,6 +42,7 @@ struct unwind_state {
 void __unwind_start(struct unwind_state *state, struct task_struct *task,
 		    struct pt_regs *regs, unsigned long *first_frame);
 bool unwind_next_frame(struct unwind_state *state);
+bool unwind_next_frame_unlocked(struct unwind_state *state);
 unsigned long unwind_get_return_address(struct unwind_state *state);
 unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index ee117fcf46ed..4df346b11f1e 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -21,8 +21,9 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
 	if (regs && !consume_entry(cookie, regs->ip))
 		return;
 
+	guard(rcu)();
 	for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
-	     unwind_next_frame(&state)) {
+	     unwind_next_frame_unlocked(&state)) {
 		addr = unwind_get_return_address(&state);
 		if (!addr || !consume_entry(cookie, addr))
 			break;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 977ee75e047c..402779b3e90a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -465,7 +465,7 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off,
 	return false;
 }
 
-bool unwind_next_frame(struct unwind_state *state)
+bool unwind_next_frame_unlocked(struct unwind_state *state)
 {
 	unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
 	enum stack_type prev_type = state->stack_info.type;
@@ -475,9 +475,6 @@ bool unwind_next_frame(struct unwind_state *state)
 	if (unwind_done(state))
 		return false;
 
-	/* Don't let modules unload while we're reading their ORC data. */
-	guard(rcu)();
-
 	/* End-of-stack check for user tasks: */
 	if (state->regs && user_mode(state->regs))
 		goto the_end;
@@ -678,6 +675,13 @@ bool unwind_next_frame(struct unwind_state *state)
 	state->stack_info.type = STACK_TYPE_UNKNOWN;
 	return false;
 }
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+	/* Don't let modules unload while we're reading their ORC data. */
+	guard(rcu)();
+	return unwind_next_frame_unlocked(state);
+}
 EXPORT_SYMBOL_GPL(unwind_next_frame);
 
 void __unwind_start(struct unwind_state *state, struct task_struct *task,
Sebastian