Message-ID: <7937d287a3ff24ce7c7e3eb2cd18788521975511.camel@web.de>
Date: Mon, 16 Jun 2025 00:12:49 +0200
From: Bert Karwatzki <spasswolf@....de>
To: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Cc: linux-kernel@...r.kernel.org, linux-next@...r.kernel.org, 
	bpf@...r.kernel.org, linux-rt-users@...r.kernel.org, 
	linux-rt-devel@...ts.linux.dev, Thomas Gleixner <tglx@...utronix.de>,
 Alexei Starovoitov <alexei.starovoitov@...il.com>, Steven Rostedt
 <rostedt@...dmis.org>, spasswolf@....de
Subject: Re: BUG: scheduling while atomic with PREEMPT_RT=y and bpf selftests

These three patches fix all the dmesg warnings (seen with CONFIG_LOCKDEP) when running the
bpf test_progs, and they do not cause deadlocks without CONFIG_LOCKDEP.

The warning when running the cgroup examples 

[ T2916] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
[ T2916] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 2916, name: test_progs
[ T2916] preempt_count: 1, expected: 0
[ T2916] RCU nest depth: 2, expected: 2
[...]
[ T2916]  __might_resched.cold+0xfa/0x135
[ T2916]  rt_spin_lock+0x5f/0x190
[ T2916]  ? task_get_cgroup1+0xe8/0x340
[ T2916]  task_get_cgroup1+0xe8/0x340
[ T2916]  bpf_task_get_cgroup1+0xe/0x20
[ T2916]  bpf_prog_8d22669ef1ee8049_on_enter+0x62/0x1d4
[ T2916]  bpf_trace_run2+0xd3/0x260

is fixed by this:

diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index 183fa2aa2935..49257cb90209 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -58,9 +58,9 @@ static notrace void \
 __bpf_trace_##call(void *__data, proto) \
 { \
 might_fault(); \
- preempt_disable_notrace(); \
+ migrate_disable(); \
 CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \
- preempt_enable_notrace(); \
+ migrate_enable(); \
 }
 
 #undef DECLARE_EVENT_SYSCALL_CLASS
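
For context, a rough sketch of why this change helps on PREEMPT_RT (my reading, not taken
from the patch; demo_lock is made up): preempt_disable_notrace() puts the tracepoint handler
into atomic context, so any spinlock_t taken by the attached BPF program (here via
task_get_cgroup1()) maps to the sleeping rt_spin_lock() and trips __might_resched().
migrate_disable() only pins the task to the current CPU and leaves it preemptible, so
sleeping locks stay legal:

	/* Illustrative only, not code from the patch. On PREEMPT_RT a spinlock_t
	 * is a sleeping lock, so it must not be taken with preemption disabled. */
	static DEFINE_SPINLOCK(demo_lock);	/* hypothetical lock */

	static void demo_old_way(void)
	{
		preempt_disable_notrace();
		spin_lock(&demo_lock);	/* -> rt_spin_lock(), triggers __might_resched() */
		spin_unlock(&demo_lock);
		preempt_enable_notrace();
	}

	static void demo_new_way(void)
	{
		migrate_disable();	/* stays on this CPU, but remains preemptible */
		spin_lock(&demo_lock);	/* sleeping lock is fine here */
		spin_unlock(&demo_lock);
		migrate_enable();
	}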


The warning when running the (wq, timer, free_timer, timer_mim, timer_lockup) examples

[ T4696] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
[ T4696] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 4696, name: test_progs
[ T4696] preempt_count: 1, expected: 0
[ T4696] RCU nest depth: 0, expected: 0
[...]
[ T4696]  __kmalloc_node_noprof+0xee/0x490
[ T4696]  bpf_map_kmalloc_node+0x72/0x220
[ T4696]  __bpf_async_init+0x107/0x310
[ T4696]  bpf_timer_init+0x33/0x40
[ T4696]  bpf_prog_7e15f1bc7d1d26d0_start_cb+0x5d/0x91
[ T4696]  bpf_prog_d85f43676fabf521_start_timer+0x65/0x8a
[ T4696]  bpf_prog_test_run_syscall+0x103/0x250

is fixed by this (though this might leak memory due to the allocation now
happening before the lock is taken(?)):

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e3a2662f4e33..94fcd8c8661c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1263,19 +1263,16 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 return -EINVAL;
 }
 
- __bpf_spin_lock_irqsave(&async->lock);
 t = async->timer;
- if (t) {
- ret = -EBUSY;
- goto out;
- }
+ if (t)
+ return -EBUSY;
 
 /* allocate hrtimer via map_kmalloc to use memcg accounting */
 cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
- if (!cb) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!cb)
+ return -ENOMEM;
+
+ __bpf_spin_lock_irqsave(&async->lock);
 
 switch (type) {
 case BPF_ASYNC_TYPE_TIMER:
@@ -1315,7 +1312,6 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 kfree(cb);
 ret = -EPERM;
 }
-out:
 __bpf_spin_unlock_irqrestore(&async->lock);
 return ret;
 }
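
To spell out the leak concern from the parenthetical above, a hypothetical interleaving
with the check now done without the lock (a sketch only, not something observed):

	/* Hypothetical race between two contexts A and B entering
	 * __bpf_async_init() for the same map value:
	 *   A: async->timer == NULL, passes the unlocked check
	 *   B: async->timer == NULL, passes the unlocked check
	 *   A: cb = bpf_map_kmalloc_node(...)
	 *   B: cb = bpf_map_kmalloc_node(...)
	 *   A: takes async->lock, installs its cb as the timer
	 *   B: takes async->lock; unless async->timer is re-checked under the
	 *      lock, B either leaks its cb or replaces A's timer, leaking that
	 *      allocation instead.
	 */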

In addition to these, there is a deadlock warning from lockdep when running the
timer_lockup example:

[  127.373597] [      C1] ============================================
[  127.373598] [      C1] WARNING: possible recursive locking detected
[  127.373601] [      C1] 6.15.0-bpf-00006-g31cf22212ed9 #41 Tainted: G           O      
[  127.373602] [      C1] --------------------------------------------
[  127.373603] [      C1] ktimers/1/85 is trying to acquire lock:
[  127.373605] [      C1] ffff98f62e61c1b8 (&base->softirq_expiry_lock){+...}-{3:3}, at: hrtimer_cancel_wait_running+0x4d/0x80
[  127.373614] [      C1]
                          but task is already holding lock:
[  127.373615] [      C1] ffff98f62e65c1b8 (&base->softirq_expiry_lock){+...}-{3:3}, at: hrtimer_run_softirq+0x37/0x100
[  127.373621] [      C1]
                          other info that might help us debug this:
[  127.373621] [      C1]  Possible unsafe locking scenario:

[  127.373622] [      C1]        CPU0
[  127.373623] [      C1]        ----
[  127.373624] [      C1]   lock(&base->softirq_expiry_lock);
[  127.373626] [      C1]   lock(&base->softirq_expiry_lock);
[  127.373627] [      C1]
                           *** DEADLOCK ***

[  127.373628] [      C1]  May be due to missing lock nesting notation

which can be fixed (?) by reintroducing spin_lock_bh_nested()
(which was introduced in v4.0 and removed in v4.11 ...); see also the note after the patch below:

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index d3561c4a080e..b6635466b35f 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -219,6 +219,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define raw_spin_lock_nested(lock, subclass) \
 _raw_spin_lock_nested(lock, subclass)
+# define raw_spin_lock_bh_nested(lock, subclass) \
+ _raw_spin_lock_bh_nested(lock, subclass)
 
 # define raw_spin_lock_nest_lock(lock, nest_lock) \
 do { \
@@ -233,6 +235,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
  */
 # define raw_spin_lock_nested(lock, subclass) \
 _raw_spin_lock(((void)(subclass), (lock)))
+# define raw_spin_lock_bh_nested(lock, subclass) \
+ _raw_spin_lock_bh(((void)(subclass), (lock)))
 # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock)
 #endif
 
@@ -366,6 +370,11 @@ do { \
 raw_spin_lock_nested(spinlock_check(lock), subclass); \
 } while (0)
 
+#define spin_lock_bh_nested(lock, subclass) \
+do { \
+ raw_spin_lock_bh_nested(spinlock_check(lock), subclass); \
+} while (0)
+
 #define spin_lock_nest_lock(lock, nest_lock) \
 do { \
 raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \
@@ -552,6 +561,10 @@ DEFINE_LOCK_GUARD_1(raw_spinlock_bh, raw_spinlock_t,
 raw_spin_lock_bh(_T->lock),
 raw_spin_unlock_bh(_T->lock))
 
+DEFINE_LOCK_GUARD_1(raw_spinlock_bh_nested, raw_spinlock_t,
+ raw_spin_lock_bh_nested(_T->lock, SINGLE_DEPTH_NESTING),
+ raw_spin_unlock_bh(_T->lock))
+
 DEFINE_LOCK_GUARD_1_COND(raw_spinlock_bh, _try, raw_spin_trylock_bh(_T->lock))
 
 DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 9ecb0ab504e3..791867efad04 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -22,6 +22,8 @@ int in_lock_functions(unsigned long addr);
 void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
 __acquires(lock);
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+ __acquires(lock);
 void __lockfunc
 _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map)
 __acquires(lock);
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
index 819aeba1c87e..a42887a0c846 100644
--- a/include/linux/spinlock_api_up.h
+++ b/include/linux/spinlock_api_up.h
@@ -57,6 +57,7 @@
 
 #define _raw_spin_lock(lock) __LOCK(lock)
 #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock)
+#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock)
 #define _raw_read_lock(lock) __LOCK(lock)
 #define _raw_write_lock(lock) __LOCK(lock)
 #define _raw_write_lock_nested(lock, subclass) __LOCK(lock)
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index f6499c37157d..b89dc9e272cc 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -88,6 +88,13 @@ static __always_inline void spin_lock_bh(spinlock_t *lock)
 rt_spin_lock(lock);
 }
 
+static __always_inline void spin_lock_bh_nested(spinlock_t *lock, int subclass)
+{
+ /* Investigate: Drop bh when blocking ? */
+ local_bh_disable();
+ rt_spin_lock_nested(lock, subclass);
+}
+
 static __always_inline void spin_lock_irq(spinlock_t *lock)
 {
 rt_spin_lock(lock);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 7685defd7c52..c4aa80441a1d 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -380,6 +380,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
 }
 EXPORT_SYMBOL(_raw_spin_lock_nested);
 
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
+
 unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
 int subclass)
 {
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index db1e11b45de6..5b76072c4838 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -58,7 +58,6 @@ void __sched rt_spin_lock(spinlock_t *lock) __acquires(RCU)
 }
 EXPORT_SYMBOL(rt_spin_lock);
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
 void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
 {
 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
@@ -66,6 +65,7 @@ void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
 }
 EXPORT_SYMBOL(rt_spin_lock_nested);
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
 void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
 struct lockdep_map *nest_lock)
 {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 30899a8cc52c..d62455f62fc4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1457,7 +1457,7 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer)
 * unlikely and just causes another wait loop.
 */
 atomic_inc(&base->cpu_base->timer_waiters);
- spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
+ spin_lock_bh_nested(&base->cpu_base->softirq_expiry_lock, SINGLE_DEPTH_NESTING);
 atomic_dec(&base->cpu_base->timer_waiters);
 spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
 }
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index e1fba28e4a86..7cfb9473a526 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -7,6 +7,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
+#include "bpf_kfuncs.h"
 #include "errno.h"
 
 char _license[] SEC("license") = "GPL";
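
The idea of the SINGLE_DEPTH_NESTING annotation in the hrtimer.c hunk is to tell lockdep
that the second acquisition is a deliberate second nesting level: both softirq_expiry_lock
instances share one lockdep class, so taking the lock of another CPU's base in
hrtimer_cancel_wait_running() while ktimers already holds its own base's lock looks
recursive to lockdep. Roughly (illustrative only, made-up base pointers, and
spin_lock_bh_nested() only exists with the patch above):

	/* Sketch of the annotated nesting, not code from the patch. */
	spin_lock_bh(&this_cpu_base->softirq_expiry_lock);	/* nesting level 0 */
	spin_lock_bh_nested(&other_cpu_base->softirq_expiry_lock,
			    SINGLE_DEPTH_NESTING);		/* level 1, no recursion splat */
	spin_unlock_bh(&other_cpu_base->softirq_expiry_lock);
	spin_unlock_bh(&this_cpu_base->softirq_expiry_lock);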


Bert Karwatzki
