linux-kernel - Re: BUG: scheduling while atomic with PREEMPT

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aa28ef09763eeefd54d4c26fb01599fd5197b265.camel@web.de>
Date: Sat, 14 Jun 2025 14:01:54 +0200
From: Bert Karwatzki <spasswolf@....de>
To: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Cc: linux-kernel@...r.kernel.org, linux-next@...r.kernel.org, 
	bpf@...r.kernel.org, linux-rt-users@...r.kernel.org, 
	linux-rt-devel@...ts.linux.dev, Thomas Gleixner <tglx@...utronix.de>,
 Alexei Starovoitov <alexei.starovoitov@...il.com>, spasswolf@....de, Steven
 Rostedt <rostedt@...dmis.org>
Subject: Re: BUG: scheduling while atomic with PREEMPT_RT=y and bpf selftests

Am Montag, dem 09.06.2025 um 13:37 +0200 schrieb Bert Karwatzki:
> Am Sonntag, dem 08.06.2025 um 17:53 +0200 schrieb Bert Karwatzki:
> > Am Sonntag, dem 08.06.2025 um 10:45 +0200 schrieb Bert Karwatzki:
> > > Am Donnerstag, dem 05.06.2025 um 14:51 +0200 schrieb Sebastian Andrzej Siewior:
> > > > On 2025-06-05 08:48:38 [-0400], Steven Rostedt wrote:
> > > > > On Thu,  5 Jun 2025 11:19:03 +0200
> > > > > Bert Karwatzki <spasswolf@....de> wrote:
> > > > > 
> > > > > > > This patch seems to create so much output that the original error message and
> > > > > > backtrace often get lost, so I needed several runs to get a meaningful message
> > > > > > when running
> > > > > 
> > > > > Are you familiar with preempt count tracing?
> > > > 
> > > > I have an initial set of patches to tackle this problem, I'm going to
> > > > send them after the merge window.
> > > > 
> > > > Sebastian
> > > 
> > > I've found the reason for the "mysterious" increase of preempt_count:
> > > 
> > > [   70.821750] [   T2746] bpf_link_settle calling fd_install() preemt_count = 0
> > > [   70.821751] [   T2746] preempt_count_add 5898: preempt_count = 0x0 counter = 0x1b232c
> > > [   70.821752] [   T2746] preempt_count_add 5900: preempt_count = 0x1 counter = 0x1b232d
> > > [   70.821754] [   T2746] preempt_count_sub 5966: preempt_count = 0x1 counter = 0x1b232e
> > > [   70.821755] [   T2746] preempt_count_sub 5968: preempt_count = 0x0 counter = 0x1b232f
> > > [   70.821761] [   T2746] __bpf_trace_sys_enter 18: preempt_count = 0x0
> > > [   70.821762] [   T2746] __bpf_trace_sys_enter 18: preempt_count = 0x1
> > > [   70.821764] [   T2746] __bpf_trace_run: preempt_count = 1
> > > [   70.821765] [   T2746] bpf_prog_run: preempt_count = 1
> > > [   70.821766] [   T2746] __bpf_prog_run: preempt_count = 1
> > > 
> > > It's caused by this macro from include/trace/bpf_probe.h (with my pr_err()):
> > > 
> > > #define __BPF_DECLARE_TRACE_SYSCALL(call, proto, args) \
> > > static notrace void \
> > > __bpf_trace_##call(void *__data, proto) \
> > > { \
> > >  might_fault(); \
> > >  if (!strcmp(get_current()->comm, "test_progs")) \
> > >  pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count());\
> > >  preempt_disable_notrace(); \
> > >  if (!strcmp(get_current()->comm, "test_progs")) \
> > >  pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count());\
> > >  CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \
> > >  preempt_enable_notrace(); \
> > > }
> > > 
> > > The preempt_{en,dis}able_notrace were introduced in
> > > commit 4aadde89d81f ("tracing/bpf: disable preemption in syscall probe")
> > > This commit is present in v6.14 and v6.15, but the bug already appears in
> > > v6.12, so in that case preemption is disabled somewhere else.
> > > 
> > > Bert Karwatzki
> > 
> > After reading this 
> > https://lore.kernel.org/bpf/CAADnVQJf535hwud5XtQKStOge9=pYVYWSiq_8Q2YAvN5rba==A@mail.gmail.com/
> > I tried using migrate_{en,disable} like this (in v6.15)
> > 
> > diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
> > index 183fa2aa2935..49257cb90209 100644
> > --- a/include/trace/bpf_probe.h
> > +++ b/include/trace/bpf_probe.h
> > @@ -58,9 +58,9 @@ static notrace void							\
> >  __bpf_trace_##call(void *__data, proto)					\
> >  {									\
> >  	might_fault();							\
> > -	preempt_disable_notrace();					\
> > +	migrate_disable();					\
> >  	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args));	\
> > -	preempt_enable_notrace();					\
> > +	migrate_enable();					\
> >  }
> >  
> >  #undef DECLARE_EVENT_SYSCALL_CLASS
> > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > index 187dc37d61d4..ec0326405fc3 100644
> > --- a/kernel/trace/bpf_trace.c
> > +++ b/kernel/trace/bpf_trace.c
> > @@ -2350,7 +2350,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
> >  	struct bpf_run_ctx *old_run_ctx;
> >  	struct bpf_trace_run_ctx run_ctx;
> >  
> > -	cant_sleep();
> > +	cant_migrate();
> >  	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
> >  		bpf_prog_inc_misses_counter(prog);
> >  		goto out;
> > diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
> > index e1fba28e4a86..7cfb9473a526 100644
> > --- a/tools/testing/selftests/bpf/progs/dynptr_success.c
> > +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
> > @@ -7,6 +7,7 @@
> >  #include <bpf/bpf_helpers.h>
> >  #include <bpf/bpf_tracing.h>
> >  #include "bpf_misc.h"
> > +#include "bpf_kfuncs.h"
> >  #include "errno.h"
> >  
> >  char _license[] SEC("license") = "GPL";
> > 
> > 
> > This fixes the warnings when running the bpf cgroup examples:
> > 
> > ./test_progs -a "cgrp_local_storage/cgrp1*"
> > 
> > but I still get a warning from another example (I don't know which, yet):
> > 
> > Bert Karwatzki
> 
> Another of the bpf selftests that gives a warning with PREEMPT_RT=y (for taking a
> spinlock with preemption disabled) is
> 
> $ ./test_progs -a wq
> 
> giving this warning:
> 
> [ T3576] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
> [ T3576] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 3576, name: test_progs
> [ T3576] preempt_count: 1, expected: 0
> [ T3576] RCU nest depth: 3, expected: 3
> [ T3576] 6 locks held by test_progs/3576:
> [ T3576]  #0: ffffffffa1131300 (rcu_read_lock){....}-{1:3}, at: bpf_test_timer_enter+0x1e/0xc0
> [ T3576]  #1: ffffffffa109acc0 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x29/0x1c0
> [ T3576]  #2: ffff997b0e7d78b8 ((softirq_ctrl.lock)){+.+.}-{3:3}, at: __local_bh_disable_ip+0xc8/0x1c0
> [ T3576]  #3: ffffffffa1131300 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0xf0/0x190
> [ T3576]  #4: ffffffffa1131300 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0x29/0x1c0
> [ T3576]  #5: ffff997b0e7f4588 ((&c->lock)){+.+.}-{3:3}, at: ___slab_alloc+0x68/0xde0
> [ T3576] irq event stamp: 247437
> [ T3576] hardirqs last  enabled at (247435): [<ffffffffa05b5fa7>] _raw_spin_unlock_irqrestore+0x57/0x80
> [ T3576] hardirqs last disabled at (247437): [<ffffffff9fbbc57b>] __bpf_async_init+0xdb/0x310
> [ T3576] softirqs last  enabled at (241464): [<ffffffff9f98a2e1>] __local_bh_enable_ip+0x111/0x180
> [ T3576] softirqs last disabled at (247436): [<ffffffffa036688c>] bpf_test_run+0x10c/0x350
> [ T3576] CPU: 7 UID: 0 PID: 3576 Comm: test_progs Tainted: G           O        6.15.0-bpf-00003-g5197b534e6ad #4 PREEMPT_{RT,(full)} 
> [ T3576] Tainted: [O]=OOT_MODULE
> [ T3576] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.10F 11/11/2024
> [ T3576] Call Trace:
> [ T3576]  <TASK>
> [ T3576]  dump_stack_lvl+0x6d/0xb0
> [ T3576]  __might_resched.cold+0xe1/0xf3
> [ T3576]  rt_spin_lock+0x5f/0x190
> [ T3576]  ? ___slab_alloc+0x68/0xde0
> [ T3576]  ? srso_alias_return_thunk+0x5/0xfbef5
> [ T3576]  ? __lock_acquire+0x45f/0x2a70
> [ T3576]  ___slab_alloc+0x68/0xde0
> [ T3576]  ? bpf_map_kmalloc_node+0x72/0x220
> [ T3576]  ? srso_alias_return_thunk+0x5/0xfbef5
> [ T3576]  ? lock_acquire+0xbe/0x2e0
> [ T3576]  ? bpf_map_get_memcg.isra.0+0x182/0x310
> [ T3576]  ? srso_alias_return_thunk+0x5/0xfbef5
> [ T3576]  ? find_held_lock+0x2b/0x80
> [ T3576]  ? bpf_map_get_memcg.isra.0+0x8d/0x310
> [ T3576]  ? bpf_map_kmalloc_node+0x72/0x220
> [ T3576]  __kmalloc_node_noprof+0xee/0x490
> [ T3576]  bpf_map_kmalloc_node+0x72/0x220
> [ T3576]  __bpf_async_init+0x107/0x310
> [ T3576]  bpf_prog_aa38f9274c0318a2_test_call_array_sleepable+0xb3/0x10e
> [ T3576]  bpf_test_run+0x1ef/0x350
> [ T3576]  ? bpf_test_run+0x10c/0x350
> [ T3576]  ? migrate_enable+0x115/0x160
> [ T3576]  ? kmem_cache_alloc_noprof+0x210/0x2b0
> [ T3576]  bpf_prog_test_run_skb+0x37b/0x7c0
> [ T3576]  ? fput+0x3f/0x90
> [ T3576]  __sys_bpf+0xd33/0x26d0
> [ T3576]  ? srso_alias_return_thunk+0x5/0xfbef5
> [ T3576]  __x64_sys_bpf+0x21/0x30
> [ T3576]  do_syscall_64+0x72/0xfa0
> [ T3576]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [ T3576] RIP: 0033:0x7f1c8e2a6779
> [ T3576] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff
> 73 01 c3 48 8b 0d 4f 86 0d 00 f7 d8 64 89 01 48
> [ T3576] RSP: 002b:00007fff8ef7b4d8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141
> [ T3576] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f1c8e2a6779
> [ T3576] RDX: 0000000000000050 RSI: 00007fff8ef7b510 RDI: 000000000000000a
> [ T3576] RBP: 00007fff8ef7b4f0 R08: 00000000ffffffff R09: 00007fff8ef7b510
> [ T3576] R10: 0000000000000064 R11: 0000000000000202 R12: 0000000000000000
> [ T3576] R13: 00007fff8ef7c038 R14: 00007f1c8e8db000 R15: 000055d507eb3890
> [ T3576]  </TASK>
> 
> 
> Here the problem is in __bpf_spin_lock() which calls arch_spin_lock()
> with preemption disabled:
> 
> static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
> {
> 	arch_spinlock_t *l = (void *)lock;
> 	union {
> 		__u32 val;
> 		arch_spinlock_t lock;
> 	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
> 
> 	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
> 	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
> 	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
> 	if (!strcmp(get_current()->comm, "test_progs"))
> 		pr_err("%s: calling preempt_disable()\n", __func__);
> 	preempt_disable();
> 	arch_spin_lock(l);
> }
> 
> The call to preempt_disable here was introduced in commit
> 5861d1e8dbc4 ("bpf: Allow bpf_spin_{lock,unlock} in sleepable progs").
> 
> 
> Bert Karwatzki

As a quick fix for this problem I moved the __bpf_spin_lock_irqsave() call to after the
allocation in __bpf_async_init() (can this leak memory?):

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e3a2662f4e33..94fcd8c8661c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1263,19 +1263,16 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
                return -EINVAL;
        }
 
-       __bpf_spin_lock_irqsave(&async->lock);
        t = async->timer;
-       if (t) {
-               ret = -EBUSY;
-               goto out;
-       }
+       if (t)
+               return -EBUSY;
 
        /* allocate hrtimer via map_kmalloc to use memcg accounting */
        cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
-       if (!cb) {
-               ret = -ENOMEM;
-               goto out;
-       }
+       if (!cb)
+               return -ENOMEM;
+
+       __bpf_spin_lock_irqsave(&async->lock);
 
        switch (type) {
        case BPF_ASYNC_TYPE_TIMER:
@@ -1315,7 +1312,6 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
                kfree(cb);
                ret = -EPERM;
        }
-out:
        __bpf_spin_unlock_irqrestore(&async->lock);
        return ret;
 }

With this change, these bpf example programs run without giving a warning in dmesg:

./test_progs -a timer -a timer_mim -a free_timer

But running ./test_progs -a timer_lockup gives this error:


[  127.373597] [      C1] ============================================
[  127.373598] [      C1] WARNING: possible recursive locking detected
[  127.373601] [      C1] 6.15.0-bpf-00006-g31cf22212ed9 #41 Tainted: G           O       
[  127.373602] [      C1] --------------------------------------------
[  127.373603] [      C1] ktimers/1/85 is trying to acquire lock:
[  127.373605] [      C1] ffff98f62e61c1b8 (&base->softirq_expiry_lock){+...}-{3:3}, at: hrtimer_cancel_wait_running+0x4d/0x80
[  127.373614] [      C1] 
                          but task is already holding lock:
[  127.373615] [      C1] ffff98f62e65c1b8 (&base->softirq_expiry_lock){+...}-{3:3}, at: hrtimer_run_softirq+0x37/0x100
[  127.373621] [      C1] 
                          other info that might help us debug this:
[  127.373621] [      C1]  Possible unsafe locking scenario:

[  127.373622] [      C1]        CPU0
[  127.373623] [      C1]        ----
[  127.373624] [      C1]   lock(&base->softirq_expiry_lock);
[  127.373626] [      C1]   lock(&base->softirq_expiry_lock);
[  127.373627] [      C1] 
                           *** DEADLOCK ***

[  127.373628] [      C1]  May be due to missing lock nesting notation

[  127.373629] [      C1] 8 locks held by ktimers/1/85:
[  127.373630] [      C1]  #0: ffffffffa7a9acc0 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x29/0x1c0
[  127.373636] [      C1]  #1: ffff98f62e6578b8 ((softirq_ctrl.lock)){+.+.}-{3:3}, at: __local_bh_disable_ip+0xc8/0x1c0
[  127.373641] [      C1]  #2: ffffffffa7b31300 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0xf0/0x190
[  127.373648] [      C1]  #3: ffffffffa7b31300 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0x29/0x1c0
[  127.373653] [      C1]  #4: ffff98f62e65c1b8 (&base->softirq_expiry_lock){+...}-{3:3}, at: hrtimer_run_softirq+0x37/0x100
[  127.373658] [      C1]  #5: ffffffffa7b31300 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0xf0/0x190
[  127.373663] [      C1]  #6: ffffffffa7b31300 (rcu_read_lock){....}-{1:3}, at: bpf_timer_cancel+0x42/0x2e0
[  127.373668] [      C1]  #7: ffffffffa7a9acc0 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x29/0x1c0
[  127.373673] [      C1] 
                          stack backtrace:
[  127.373675] [      C1] CPU: 1 UID: 0 PID: 85 Comm: ktimers/1 Tainted: G           O        6.15.0-bpf-00006-g31cf22212ed9 #41 PREEMPT_{RT,(full)} 
[  127.373679] [      C1] Tainted: [O]=OOT_MODULE
[  127.373680] [      C1] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.10F 11/11/2024
[  127.373681] [      C1] Call Trace:
[  127.373683] [      C1]  <TASK>
[  127.373684] [      C1]  dump_stack_lvl+0x6d/0xb0
[  127.373689] [      C1]  print_deadlock_bug.cold+0xbd/0xca
[  127.373693] [      C1]  __lock_acquire+0x1390/0x2a70
[  127.373699] [      C1]  ? __lock_acquire+0x45f/0x2a70
[  127.373703] [      C1]  lock_acquire+0xbe/0x2e0
[  127.373706] [      C1]  ? hrtimer_cancel_wait_running+0x4d/0x80
[  127.373711] [      C1]  ? hrtimer_cancel_wait_running+0x39/0x80
[  127.373714] [      C1]  rt_spin_lock+0x3d/0x190
[  127.373716] [      C1]  ? hrtimer_cancel_wait_running+0x4d/0x80
[  127.373718] [      C1]  ? __local_bh_disable_ip+0x48/0x1c0
[  127.373720] [      C1]  ? __local_bh_disable_ip+0x29/0x1c0
[  127.373722] [      C1]  ? hrtimer_cancel_wait_running+0x39/0x80
[  127.373724] [      C1]  hrtimer_cancel_wait_running+0x4d/0x80
[  127.373727] [      C1]  hrtimer_cancel+0x34/0x50
[  127.373730] [      C1]  bpf_timer_cancel+0x1fd/0x2e0
[  127.373734] [      C1]  bpf_prog_c55f7d3cdccd3222_timer_cb2+0x59/0x6e
[  127.373737] [      C1]  ? 0xffffffffc014d2a4
[  127.373754] [      C1]  bpf_timer_cb+0x74/0x140
[  127.373757] [      C1]  ? __pfx_bpf_timer_cb+0x10/0x10
[  127.373760] [      C1]  __hrtimer_run_queues+0x1b3/0x430
[  127.373763] [      C1]  ? srso_alias_return_thunk+0x5/0xfbef5
[  127.373768] [      C1]  hrtimer_run_softirq+0x9d/0x100
[  127.373771] [      C1]  handle_softirqs.isra.0+0xb0/0x3e0
[  127.373775] [      C1]  ? smpboot_thread_fn+0x25/0x2c0
[  127.373779] [      C1]  ? __pfx_smpboot_thread_fn+0x10/0x10
[  127.373782] [      C1]  run_ktimerd+0x40/0xa0
[  127.373784] [      C1]  smpboot_thread_fn+0x143/0x2c0
[  127.373788] [      C1]  kthread+0x11c/0x210
[  127.373791] [      C1]  ? __pfx_kthread+0x10/0x10
[  127.373793] [      C1]  ret_from_fork+0x34/0x50
[  127.373796] [      C1]  ? __pfx_kthread+0x10/0x10
[  127.373799] [      C1]  ret_from_fork_asm+0x1a/0x30
[  127.373806] [      C1]  </TASK>


Bert Karwatzki