Message-ID: <87fs3qnnh4.fsf@toke.dk>
Date: Thu, 07 Sep 2023 15:04:39 +0200
From: Toke Høiland-Jørgensen <toke@...nel.org>
To: Kumar Kartikeya Dwivedi <memxor@...il.com>
Cc: Hsin-Wei Hung <hsinweih@....edu>, Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>, Andrii Nakryiko
<andrii@...nel.org>, Martin KaFai Lau <kafai@...com>, Song Liu
<songliubraving@...com>, Yonghong Song <yhs@...com>, John Fastabend
<john.fastabend@...il.com>, KP Singh <kpsingh@...nel.org>,
netdev@...r.kernel.org, bpf@...r.kernel.org, Arnaldo Carvalho de Melo
<acme@...nel.org>
Subject: Re: Possible deadlock in bpf queue map
Kumar Kartikeya Dwivedi <memxor@...il.com> writes:
> On Thu, 7 Sept 2023 at 12:26, Toke Høiland-Jørgensen <toke@...nel.org> wrote:
>>
>> +Arnaldo
>>
>> > Hi,
>> >
>> > Our bpf fuzzer, a customized Syzkaller, triggered a lockdep warning in
>> > the bpf queue map on v5.15. Since queue_stack_maps.c has seen no major
>> > changes since v5.15, we believe the issue still exists in the latest
>> > kernel. The bug triggers only occasionally, and we suspect the eBPF
>> > program below is one of those involved. The lockdep warning is attached
>> > at the end.
>> >
>> > #define DEFINE_BPF_MAP_NO_KEY(the_map, TypeOfMap, MapFlags, TypeOfValue, MaxEntries) \
>> > struct { \
>> > __uint(type, TypeOfMap); \
>> > __uint(map_flags, (MapFlags)); \
>> > __uint(max_entries, (MaxEntries)); \
>> > __type(value, TypeOfValue); \
>> > } the_map SEC(".maps");
>> >
>> > DEFINE_BPF_MAP_NO_KEY(map_0, BPF_MAP_TYPE_QUEUE, 0 | BPF_F_WRONLY, struct_0, 162);
>> > SEC("perf_event")
>> > int func(struct bpf_perf_event_data *ctx) {
>> > char v0[96] = {};
>> > uint64_t v1 = 0;
>> > v1 = bpf_map_pop_elem(&map_0, v0);
>> > return 163819661;
>> > }
>> >
>> >
>> > The program is attached to the following perf event.
>> >
>> > struct perf_event_attr attr_type_hw = {
>> > .type = PERF_TYPE_HARDWARE,
>> > .config = PERF_COUNT_HW_CPU_CYCLES,
>> > .sample_freq = 50,
>> > .inherit = 1,
>> > .freq = 1,
>> > };
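For reference, here is a minimal userspace sketch of how such a program
could be attached (the object file name "fuzz_prog.bpf.o" and the program
name "func" are placeholders, not taken from the report; error handling
assumes libbpf >= 1.0, which returns NULL on failure):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct perf_event_attr attr = {
		.size = sizeof(attr),
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.sample_freq = 50,
		.freq = 1,
		.inherit = 1,
	};
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;
	int pfd;

	obj = bpf_object__open_file("fuzz_prog.bpf.o", NULL);
	if (!obj || bpf_object__load(obj))
		return 1;

	prog = bpf_object__find_program_by_name(obj, "func");
	if (!prog)
		return 1;

	/* pid = 0: sample the current task; cpu = -1: on any CPU */
	pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (pfd < 0)
		return 1;

	link = bpf_program__attach_perf_event(prog, pfd);
	if (!link)
		return 1;

	pause();	/* keep the event and the program alive */
	return 0;
}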
>> >
>> > ================================
>> > WARNING: inconsistent lock state
>> > 5.15.26+ #2 Not tainted
>> > --------------------------------
>> > inconsistent {INITIAL USE} -> {IN-NMI} usage.
>> > syz-executor.5/19749 [HC1[1]:SC0[0]:HE0:SE1] takes:
>> > ffff88804c9fc198 (&qs->lock){..-.}-{2:2}, at: __queue_map_get+0x31/0x250
>> > {INITIAL USE} state was registered at:
>> > lock_acquire+0x1a3/0x4b0
>> > _raw_spin_lock_irqsave+0x48/0x60
>> > __queue_map_get+0x31/0x250
>> > bpf_prog_577904e86c81dead_func+0x12/0x4b4
>> > trace_call_bpf+0x262/0x5d0
>> > perf_trace_run_bpf_submit+0x91/0x1c0
>> > perf_trace_sched_switch+0x46c/0x700
>> > __schedule+0x11b5/0x24a0
>> > schedule+0xd4/0x270
>> > futex_wait_queue_me+0x25f/0x520
>> > futex_wait+0x1e0/0x5f0
>> > do_futex+0x395/0x1890
>> > __x64_sys_futex+0x1cb/0x480
>> > do_syscall_64+0x3b/0xc0
>> > entry_SYSCALL_64_after_hwframe+0x44/0xae
>> > irq event stamp: 13640
>> > hardirqs last enabled at (13639): [<ffffffff95eb2bf4>] _raw_spin_unlock_irq+0x24/0x40
>> > hardirqs last disabled at (13640): [<ffffffff95eb2d4d>] _raw_spin_lock_irqsave+0x5d/0x60
>> > softirqs last enabled at (13464): [<ffffffff93e26de5>] __sys_bpf+0x3e15/0x4e80
>> > softirqs last disabled at (13462): [<ffffffff93e26da3>] __sys_bpf+0x3dd3/0x4e80
>> >
>> > other info that might help us debug this:
>> > Possible unsafe locking scenario:
>> >
>> > CPU0
>> > ----
>> > lock(&qs->lock);
>> > <Interrupt>
>> > lock(&qs->lock);
>>
>> Hmm, so that lock() uses raw_spin_lock_irqsave(), which *should*
>> disable interrupts entirely for the critical section. But I guess a
>> perf hardware event can still trigger, since it is delivered as an
>> NMI (hence the {IN-NMI} state above)? That seems like it could wreak
>> havoc with lots of things, not just this queue map function?
>>
>> No idea how to protect against this, though. Hoping Arnaldo knows? :)
>>
>
> The locking should probably be protected by a per-CPU integer counter,
> incremented before the lock is taken and decremented after it is
> released. If the counter is already non-zero, -EBUSY should be
> returned. This is similar to what htab_lock_bucket protects against
> in hashtab.c.
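For reference, the guard being referred to looks roughly like this (a
paraphrased sketch of the hashtab.c idea, not verbatim kernel code; the
queue/stack map adaptation is in the patch below, and unlock simply
reverses the steps):

static inline int map_lock_guarded(raw_spinlock_t *lock,
				   int __percpu *map_locked,
				   unsigned long *pflags)
{
	unsigned long flags;

	preempt_disable();
	/* A value != 1 after the increment means this CPU is already
	 * inside the map code (e.g. re-entered from an NMI) and may
	 * hold the lock, so taking it again would deadlock.
	 */
	if (unlikely(__this_cpu_inc_return(*map_locked) != 1)) {
		__this_cpu_dec(*map_locked);
		preempt_enable();
		return -EBUSY;
	}

	raw_spin_lock_irqsave(lock, flags);
	*pflags = flags;
	return 0;
}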
Ah, neat! Okay, seems straight-forward enough to replicate. Hsin, could
you please check if the patch below gets rid of the splat?
-Toke
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8d2ddcb7566b..f96945311eec 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -16,6 +16,7 @@
struct bpf_queue_stack {
struct bpf_map map;
raw_spinlock_t lock;
+ int __percpu *map_locked;
u32 head, tail;
u32 size; /* max_entries + 1 */
@@ -66,6 +67,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_queue_stack *qs;
u64 size, queue_size;
+ int err = -ENOMEM;
size = (u64) attr->max_entries + 1;
queue_size = sizeof(*qs) + size * attr->value_size;
@@ -80,7 +82,18 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&qs->lock);
+ qs->map_locked = bpf_map_alloc_percpu(&qs->map,
+ sizeof(*qs->map_locked),
+ sizeof(*qs->map_locked),
+ GFP_USER);
+ if (!qs->map_locked)
+ goto free_map;
+
return &qs->map;
+
+free_map:
+ bpf_map_area_free(qs);
+ return ERR_PTR(err);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -88,9 +101,37 @@ static void queue_stack_map_free(struct bpf_map *map)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
+ free_percpu(qs->map_locked);
bpf_map_area_free(qs);
}
+static inline int queue_stack_map_lock(struct bpf_queue_stack *qs,
+ unsigned long *pflags)
+{
+ unsigned long flags;
+
+ preempt_disable();
+ if (unlikely(__this_cpu_inc_return(*qs->map_locked) != 1)) {
+ __this_cpu_dec(*qs->map_locked);
+ preempt_enable();
+ return -EBUSY;
+ }
+
+ raw_spin_lock_irqsave(&qs->lock, flags);
+ *pflags = flags;
+
+ return 0;
+}
+
+
+static inline void queue_stack_map_unlock(struct bpf_queue_stack *qs,
+ unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&qs->lock, flags);
+ __this_cpu_dec(*qs->map_locked);
+ preempt_enable();
+}
+
static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
@@ -98,7 +139,9 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
int err = 0;
void *ptr;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ err = queue_stack_map_lock(qs, &flags);
+ if (err)
+ return err;
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -115,7 +158,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
}
out:
- raw_spin_unlock_irqrestore(&qs->lock, flags);
+ queue_stack_map_unlock(qs, flags);
return err;
}
@@ -128,7 +171,9 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
void *ptr;
u32 index;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ err = queue_stack_map_lock(qs, &flags);
+ if (err)
+ return err;
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -147,7 +192,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
qs->head = index;
out:
- raw_spin_unlock_irqrestore(&qs->lock, flags);
+ queue_stack_map_unlock(qs, flags);
return err;
}
@@ -193,7 +238,9 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
if (flags & BPF_NOEXIST || flags > BPF_EXIST)
return -EINVAL;
- raw_spin_lock_irqsave(&qs->lock, irq_flags);
+ err = queue_stack_map_lock(qs, &irq_flags);
+ if (err)
+ return err;
if (queue_stack_map_is_full(qs)) {
if (!replace) {
@@ -212,7 +259,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
qs->head = 0;
out:
- raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+ queue_stack_map_unlock(qs, irq_flags);
return err;
}