Message-ID: <20250605091904.5853-1-spasswolf@web.de>
Date: Thu, 5 Jun 2025 11:19:03 +0200
From: Bert Karwatzki <spasswolf@....de>
To: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Cc: Bert Karwatzki <spasswolf@....de>,
linux-kernel@...r.kernel.org,
linux-next@...r.kernel.org,
bpf@...r.kernel.org,
linux-rt-users@...r.kernel.org,
linux-rt-devel@...ts.linux.dev,
Thomas Gleixner <tglx@...utronix.de>,
Alexei Starovoitov <alexei.starovoitov@...il.com>
Subject: Re: BUG: scheduling while atomic with PREEMPT_RT=y and bpf selftests
To debug the issue I've created a patch that monitors preempt_count_{sub,add}
(except for the calls made from printk, to avoid infinite recursion):
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b25d278409b..37992b164282 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2219,6 +2219,8 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
u32 ret = 1;
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s: ctx = %px preempt_count = %u\n", __func__, ctx, preempt_count());
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5cf4d35d83e..ac9ddd1b9bac 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -699,6 +699,8 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
bpf_dispatcher_fn dfunc)
{
u32 ret;
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s: preempt_count = %u\n", __func__, preempt_count());
cant_migrate();
if (static_branch_unlikely(&bpf_stats_enabled_key)) {
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index b0af8d4ef6e6..e1d5d1e00860 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -194,12 +194,17 @@ static __always_inline unsigned char interrupt_context_level(void)
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
extern void preempt_count_add(int val);
extern void preempt_count_sub(int val);
+extern void preempt_count_add_printk(int val);
+extern void preempt_count_sub_printk(int val);
#define preempt_count_dec_and_test() \
({ preempt_count_sub(1); should_resched(0); })
+#define preempt_count_dec_and_test_printk() \
+ ({ preempt_count_sub_printk(1); should_resched(0); })
#else
#define preempt_count_add(val) __preempt_count_add(val)
#define preempt_count_sub(val) __preempt_count_sub(val)
#define preempt_count_dec_and_test() __preempt_count_dec_and_test()
+#define preempt_count_dec_and_test_printk() __preempt_count_dec_and_test_printk()
#endif
#define __preempt_count_inc() __preempt_count_add(1)
@@ -208,8 +213,20 @@ extern void preempt_count_sub(int val);
#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)
+#define preempt_count_inc_printk() preempt_count_add_printk(1)
+#define preempt_count_dec_printk() preempt_count_sub_printk(1)
+
#ifdef CONFIG_PREEMPT_COUNT
+/* We can't monitor preempt_count changes in printk or we go into
+ * an infinite recursion loop!
+ */
+#define preempt_disable_printk() \
+do { \
+ preempt_count_inc_printk(); \
+ barrier(); \
+} while (0)
+
#define preempt_disable() \
do { \
preempt_count_inc(); \
@@ -227,6 +244,13 @@ do { \
#define preemptible() (preempt_count() == 0 && !irqs_disabled())
#ifdef CONFIG_PREEMPTION
+#define preempt_enable_printk() \
+do { \
+ barrier(); \
+ if (unlikely(preempt_count_dec_and_test_printk())) \
+ __preempt_schedule(); \
+} while (0)
+
#define preempt_enable() \
do { \
barrier(); \
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c1113b74e1e2..555c01c7f983 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -25,6 +25,8 @@
#include <linux/kasan.h>
#include <linux/bpf_verifier.h>
+#include <asm/preempt.h>
+
#include "../../lib/kstrtox.h"
/* If kernel subsystem is allowing eBPF programs to call this function,
@@ -2596,6 +2598,9 @@ const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
__bpf_kfunc struct cgroup *
bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
{
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s: calling task_get_cgroup1 preempt_count = 0x%x\n", __func__, preempt_count());
+
struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
if (IS_ERR(cgrp))
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dd5304c6ac3c..137478a0b8f6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3339,6 +3339,8 @@ int bpf_link_settle(struct bpf_link_primer *primer)
primer->link->id = primer->id;
spin_unlock_bh(&link_idr_lock);
/* make bpf_link fetchable by FD */
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s calling fd_install() preemt_count = %u\n", __func__, preempt_count());
fd_install(primer->fd, primer->file);
/* pass through installed FD */
return primer->fd;
@@ -5798,6 +5800,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s: preempt_count = %u\n", __func__, preempt_count());
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
@@ -5986,6 +5990,8 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
struct bpf_prog * __maybe_unused prog;
struct bpf_tramp_run_ctx __maybe_unused run_ctx;
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s: preempt_count = %u\n", __func__, preempt_count());
switch (cmd) {
#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index fa24c032ed6f..001632155bfe 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1295,6 +1295,8 @@ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
struct cgroup *cgrp = ERR_PTR(-ENOENT);
struct cgroup_root *root;
unsigned long flags;
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s: preempt_count = 0x%x\n", __func__, preempt_count());
rcu_read_lock();
for_each_root(root) {
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index fd12efcc4aed..cce689bc6280 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -1634,7 +1634,7 @@ void nbcon_cpu_emergency_enter(void)
{
unsigned int *cpu_emergency_nesting;
- preempt_disable();
+ preempt_disable_printk();
cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
(*cpu_emergency_nesting)++;
@@ -1654,7 +1654,7 @@ void nbcon_cpu_emergency_exit(void)
if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0))
(*cpu_emergency_nesting)--;
- preempt_enable();
+ preempt_enable_printk();
}
/**
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1eea80d0648e..d070220a817b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2439,7 +2439,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* this context can return as soon as possible. Hopefully
* another printk() caller will take over the printing.
*/
- preempt_disable();
+ preempt_disable_printk();
/*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers. With the
@@ -2448,7 +2448,7 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (console_trylock_spinning())
console_unlock();
- preempt_enable();
+ preempt_enable_printk();
}
if (ft.legacy_offload)
@@ -4536,7 +4536,7 @@ static void __wake_up_klogd(int val)
if (!printk_percpu_data_ready())
return;
- preempt_disable();
+ preempt_disable_printk();
/*
* Guarantee any new records can be seen by tasks preparing to wait
* before this context checks if the wait queue is empty.
@@ -4553,7 +4553,7 @@ static void __wake_up_klogd(int val)
this_cpu_or(printk_pending, val);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
- preempt_enable();
+ preempt_enable_printk();
}
/**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dce50fa57471..41426f8f806f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5857,7 +5857,9 @@ static inline void preempt_latency_start(int val)
}
}
-void preempt_count_add(int val)
+/* Non-printing version of preempt_count_add() for use in
+ * printk to avoid infinite recursion. */
+void preempt_count_add_printk(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -5876,6 +5878,34 @@ void preempt_count_add(int val)
#endif
preempt_latency_start(val);
}
+EXPORT_SYMBOL(preempt_count_add_printk);
+NOKPROBE_SYMBOL(preempt_count_add_printk);
+
+void preempt_count_add(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
+#endif
+ if (!strcmp(get_current()->comm, "test_progs")) {
+ pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count());
+ __preempt_count_add(val);
+ pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count());
+ } else {
+ __preempt_count_add(val);
+ }
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Spinlock count overflowing soon?
+ */
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
+#endif
+ preempt_latency_start(val);
+}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);
@@ -5889,7 +5919,7 @@ static inline void preempt_latency_stop(int val)
trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
}
-void preempt_count_sub(int val)
+void preempt_count_sub_printk(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -5908,6 +5938,34 @@ void preempt_count_sub(int val)
preempt_latency_stop(val);
__preempt_count_sub(val);
}
+EXPORT_SYMBOL(preempt_count_sub_printk);
+NOKPROBE_SYMBOL(preempt_count_sub_printk);
+
+void preempt_count_sub(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
+ /*
+ * Is the spinlock portion underflowing?
+ */
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+#endif
+
+ preempt_latency_stop(val);
+ if (!strcmp(get_current()->comm, "test_progs")) {
+ pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count());
+ __preempt_count_sub(val);
+ pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count());
+ } else {
+ __preempt_count_sub(val);
+ }
+}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b0eb721fcfb5..762f8b429af3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -35,6 +35,9 @@
#include "trace_probe.h"
#include "trace.h"
+#include <asm/preempt.h>
+#include <linux/sched.h>
+
#define CREATE_TRACE_POINTS
#include "bpf_trace.h"
@@ -110,6 +113,8 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s: call = %px ctx = %px preempt_count = %u\n", __func__, call, ctx, preempt_count());
cant_sleep();
@@ -766,6 +771,8 @@ const struct bpf_func_proto bpf_get_current_task_proto = {
BPF_CALL_0(bpf_get_current_task_btf)
{
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s: preempt_count = 0x%x", __func__, preempt_count());
return (unsigned long) current;
}
@@ -2244,6 +2251,8 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
struct bpf_prog *prog = link->link.prog;
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
+ if (!strcmp(get_current()->comm, "test_progs"))
+ pr_err("%s: preempt_count = %u\n", __func__, preempt_count());
cant_sleep();
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index a0391f9da2d4..0d179a1192ca 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -7,6 +7,7 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_misc.h"
+#include "bpf_kfuncs.h"
#include "errno.h"
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index e6c248e3ae54..e9e918cdf31f 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -385,7 +385,7 @@ int bpf_testmod_fentry_ok;
noinline ssize_t
bpf_testmod_test_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
struct bpf_testmod_test_read_ctx ctx = {
@@ -465,7 +465,7 @@ ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO);
noinline ssize_t
bpf_testmod_test_write(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
struct bpf_testmod_test_write_ctx ctx = {
@@ -567,7 +567,7 @@ static void testmod_unregister_uprobe(void)
static ssize_t
bpf_testmod_uprobe_write(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
unsigned long offset = 0;
This patch seems to create so much output that the original error message and
backtrace often get lost, so I needed several runs to get a meaningful message
when running:
$ ./test_progs -a cgrp_local_storage/cgrp1_tp_btf
Here's the crucial part of the message, which shows an unexplained increase
in preempt_count:
[ 99.467001] [ T2857] bpf_link_settle calling fd_install() preemt_count = 0
The next two messages are from fd_install() calling rcu_read_lock_sched() (I know
that from an earlier version of the debug patch, which also monitored fd_install()):
[ 99.467003] [ T2857] preempt_count_add 5894: preempt_count = 0x0
[ 99.467004] [ T2857] preempt_count_add 5896: preempt_count = 0x1
The next two messages are from fd_install() calling rcu_read_unlock_sched():
[ 99.467006] [ T2857] preempt_count_sub 5962: preempt_count = 0x1
[ 99.467007] [ T2857] preempt_count_sub 5964: preempt_count = 0x0
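For reference, fd_install() brackets its fast path with rcu_read_lock_sched()/
rcu_read_unlock_sched(), and with CONFIG_DEBUG_PREEMPT those go through the
out-of-line preempt_count_add()/preempt_count_sub() that the patch monitors.
A simplified sketch of the upstream include/linux/rcupdate.h definitions
(the RCU_LOCKDEP_WARN() checks omitted):

static inline void rcu_read_lock_sched(void)
{
	preempt_disable();	/* preempt_count_inc() -> preempt_count_add(1) */
	__acquire(RCU_SCHED);
	rcu_lock_acquire(&rcu_sched_lock_map);
}

static inline void rcu_read_unlock_sched(void)
{
	rcu_lock_release(&rcu_sched_lock_map);
	__release(RCU_SCHED);
	preempt_enable();	/* preempt_count_dec_and_test() -> preempt_count_sub(1) */
}

This accounts for the 0x0 -> 0x1 -> 0x0 round trip in the four lines above.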
Here we are after fd_install(), which exited with preempt_count = 0, yet when
entering __bpf_trace_run() preempt_count is already 1, without any monitored call
to preempt_count_add() in between. So who or what is increasing preempt_count here?
[ 99.467014] [ T2857] __bpf_trace_run: preempt_count = 1
[ 99.467015] [ T2857] __bpf_prog_run: preempt_count = 1
[ 99.467017] [ T2857] ____bpf_get_current_task_btf: preempt_count = 0x1
[ 99.467018] [ T2857] bpf_task_get_cgroup1: calling task_get_cgroup1 preempt_count = 0x1
[ 99.467019] [ T2857] task_get_cgroup1: preempt_count = 0x1
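One caveat about the monitoring itself: the patch only hooks the out-of-line
preempt_count_add()/preempt_count_sub() in kernel/sched/core.c, so anything that
changes the count through the raw inline helpers never produces a line in the log.
A sketch of two such paths from include/linux/preempt.h (assuming CONFIG_PREEMPTION):

#define preempt_disable_notrace() \
do { \
	__preempt_count_inc(); \
	barrier(); \
} while (0)

#define preempt_enable_notrace() \
do { \
	barrier(); \
	if (unlikely(__preempt_count_dec_and_test())) \
		__preempt_schedule_notrace(); \
} while (0)

So a preempt_count increment between fd_install() and __bpf_trace_run() would be
invisible here if it happened via one of the _notrace() variants.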
[ 99.467022] [ T2857] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
[ 99.467024] [ T2857] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 2857, name: test_progs
[ 99.467025] [ T2857] preempt_count: 1, expected: 0
[ 99.467027] [ T2857] RCU nest depth: 2, expected: 2
[ 99.467028] [ T2857] 4 locks held by test_progs/2857:
[ 99.467029] [ T2857] #0: ffffffffa6932de0 (rcu_read_lock_trace){....}-{0:0}, at: syscall_trace_enter+0x18e/0x260
[ 99.467040] [ T2857] #1: ffffffffa6933880 (rcu_read_lock){....}-{1:3}, at: bpf_trace_run2+0xa6/0x2a0
[ 99.467048] [ T2857] #2: ffffffffa6933880 (rcu_read_lock){....}-{1:3}, at: task_get_cgroup1+0x4c/0x360
[ 99.467054] [ T2857] #3: ffffffffa6956878 (css_set_lock){+.+.}-{3:3}, at: task_get_cgroup1+0x109/0x360
The "Preemption disabled at:" message is not useful here because it just comes
from printk() (which enables preemption again). In the usual case (no monitoring)
we get fd_install as the "Preemption disable at:" message, but as can be seen above
fd_install exits with preempt_count = 0.
[ 99.467060] [ T2857] Preemption disabled at:
[ 99.467060] [ T2857] [<ffffffffa503a823>] __wake_up_klogd.part.0+0x13/0xa0
[ 99.467066] [ T2857] CPU: 13 UID: 0 PID: 2857 Comm: test_progs Tainted: G O 6.15.0-rc7-next-20250523-cgroupbpfbug-00024-gc1d60c8159da #66 PREEMPT_{RT,(full)}
[ 99.467070] [ T2857] Tainted: [O]=OOT_MODULE
[ 99.467071] [ T2857] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.10F 11/11/2024
[ 99.467072] [ T2857] Call Trace:
[ 99.467075] [ T2857] <TASK>
[ 99.467077] [ T2857] dump_stack_lvl+0x6d/0xb0
[ 99.467083] [ T2857] __might_resched.cold+0xfa/0x135
[ 99.467089] [ T2857] rt_spin_lock+0x5f/0x190
[ 99.467092] [ T2857] ? task_get_cgroup1+0x109/0x360
[ 99.467098] [ T2857] task_get_cgroup1+0x109/0x360
[ 99.467102] [ T2857] bpf_task_get_cgroup1+0x3d/0x50
[ 99.467109] [ T2857] bpf_prog_8d22669ef1ee8049_on_enter+0x62/0x1d4
[ 99.467115] [ T2857] bpf_trace_run2+0x108/0x2a0
[ 99.467119] [ T2857] ? srso_alias_return_thunk+0x5/0xfbef5
[ 99.467126] [ T2857] __bpf_trace_sys_enter+0x37/0x60
[ 99.467131] [ T2857] syscall_trace_enter+0x1c7/0x260
[ 99.467136] [ T2857] do_syscall_64+0x395/0xfa0
[ 99.467139] [ T2857] ? srso_alias_return_thunk+0x5/0xfbef5
[ 99.467142] [ T2857] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 99.467142] [ T2857] preempt_count_add 5894: preempt_count = 0x1
[ 99.467142] [ T2857] preempt_count_add 5896: preempt_count = 0x2
[ 99.467142] [ T2857] preempt_count_sub 5962: preempt_count = 0x2
[ 99.467142] [ T2857] preempt_count_sub 5964: preempt_count = 0x1
[ 99.467142] [ T2857] RIP: 0033:0x7f23eb72c779
[ 99.467142] [ T2857] preempt_count_add 5894: preempt_count = 0x1
[ 99.467142] [ T2857] preempt_count_add 5896: preempt_count = 0x2
[ 99.467142] [ T2857] preempt_count_sub 5962: preempt_count = 0x2
[ 99.467142] [ T2857] preempt_count_sub 5964: preempt_count = 0x1
[ 99.467142] [ T2857] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4f 86 0d 00 f7 d8 64 89 01 48
[ 99.467142] [ T2857] RSP: 002b:00007ffd8b5d7f38 EFLAGS: 00000202 ORIG_RAX: 0000000000000141
[ 99.467142] [ T2857] RAX: ffffffffffffffda RBX: 00007ffd8b5d86c8 RCX: 00007f23eb72c779
[ 99.467142] [ T2857] RDX: 0000000000000040 RSI: 00007ffd8b5d7fb0 RDI: 000000000000001c
[ 99.467142] [ T2857] RBP: 00007ffd8b5d7f50 R08: 0000000000000001 R09: 00007ffd8b5d7fb0
[ 99.467142] [ T2857] R10: 00007f23eb805ac0 R11: 0000000000000202 R12: 0000000000000000
[ 99.467142] [ T2857] R13: 00007ffd8b5d86e8 R14: 00007f23ebd61000 R15: 0000563e4fd2d890
[ 99.467142] [ T2857] </TASK>
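As a side note, the address printed after "Preemption disabled at:" is simply
whatever last took preempt_count from 0 to 1, recorded in
current->preempt_disable_ip by preempt_latency_start(), which both
preempt_count_add() and the new preempt_count_add_printk() above still call.
Roughly, from kernel/sched/core.c (with CONFIG_DEBUG_PREEMPT):

static inline void preempt_latency_start(int val)
{
	if (preempt_count() == val) {
		unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT
		current->preempt_disable_ip = ip;	/* printed later by __might_resched() */
#endif
		trace_preempt_off(CALLER_ADDR0, ip);
	}
}

That is why the extra preempt_disable_printk()/preempt_enable_printk() cycles in
the printk path can end up pointing the message at __wake_up_klogd() instead of
the caller that actually left preempt_count elevated.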
Bert Karwatzki