Message-ID: <71a03505-6ec4-8f1d-09c6-fff78f4880d0@huaweicloud.com>
Date: Fri, 8 Sep 2023 08:56:47 +0800
From: Hou Tao <houtao@...weicloud.com>
To: Toke Høiland-Jørgensen <toke@...nel.org>,
Kumar Kartikeya Dwivedi <memxor@...il.com>
Cc: Hsin-Wei Hung <hsinweih@....edu>, Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>, Andrii Nakryiko <andrii@...nel.org>,
Martin KaFai Lau <kafai@...com>, Song Liu <songliubraving@...com>,
Yonghong Song <yhs@...com>, John Fastabend <john.fastabend@...il.com>,
KP Singh <kpsingh@...nel.org>, netdev@...r.kernel.org, bpf@...r.kernel.org,
Arnaldo Carvalho de Melo <acme@...nel.org>
Subject: Re: Possible deadlock in bpf queue map

Hi,
On 9/7/2023 9:04 PM, Toke Høiland-Jørgensen wrote:
> Kumar Kartikeya Dwivedi <memxor@...il.com> writes:
>
>> On Thu, 7 Sept 2023 at 12:26, Toke Høiland-Jørgensen <toke@...nel.org> wrote:
>>> +Arnaldo
>>>
>>>> Hi,
>>>>
>>>> Our bpf fuzzer, a customized Syzkaller, triggered a lockdep warning in
>>>> the bpf queue map on v5.15. Since queue_stack_maps.c has had no major
>>>> changes since v5.15, we think the issue should still exist in the latest
>>>> kernel. The bug can be triggered occasionally, and we suspect the
>>>> following eBPF program to be the one involved. We have also attached the
>>>> lockdep warning at the end.
>>>>
>>>> #define DEFINE_BPF_MAP_NO_KEY(the_map, TypeOfMap, MapFlags, TypeOfValue, MaxEntries) \
>>>>         struct {                                   \
>>>>                 __uint(type, TypeOfMap);           \
>>>>                 __uint(map_flags, (MapFlags));     \
>>>>                 __uint(max_entries, (MaxEntries)); \
>>>>                 __type(value, TypeOfValue);        \
>>>>         } the_map SEC(".maps");
>>>>
>>>> DEFINE_BPF_MAP_NO_KEY(map_0, BPF_MAP_TYPE_QUEUE, 0 | BPF_F_WRONLY,
>>>>                       struct_0, 162);
>>>>
>>>> SEC("perf_event")
>>>> int func(struct bpf_perf_event_data *ctx) {
>>>>         char v0[96] = {};
>>>>         uint64_t v1 = 0;
>>>>
>>>>         v1 = bpf_map_pop_elem(&map_0, v0);
>>>>         return 163819661;
>>>> }
>>>>
>>>>
>>>> The program is attached to the following perf event.
>>>>
>>>> struct perf_event_attr attr_type_hw = {
>>>>         .type = PERF_TYPE_HARDWARE,
>>>>         .config = PERF_COUNT_HW_CPU_CYCLES,
>>>>         .sample_freq = 50,
>>>>         .inherit = 1,
>>>>         .freq = 1,
>>>> };
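
For context, here is a minimal, hedged sketch of how a program might be
attached to an event with these attributes; the helper name and the prog_fd
parameter are hypothetical, since the fuzzer's actual loader is not part of
this report:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical helper: attach an already-loaded BPF program (prog_fd)
 * to a hardware cycle-counter event with the attributes quoted above.
 */
static int attach_prog_to_perf(int prog_fd)
{
        struct perf_event_attr attr = {
                .size = sizeof(struct perf_event_attr),
                .type = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .sample_freq = 50,
                .inherit = 1,
                .freq = 1,
        };
        int pfd;

        /* pid = 0, cpu = -1: profile the calling task on any CPU */
        pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (pfd < 0)
                return -1;

        if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0 ||
            ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
                close(pfd);
                return -1;
        }
        return pfd;
}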
>>>>
>>>> ================================
>>>> WARNING: inconsistent lock state
>>>> 5.15.26+ #2 Not tainted
>>>> --------------------------------
>>>> inconsistent {INITIAL USE} -> {IN-NMI} usage.
>>>> syz-executor.5/19749 [HC1[1]:SC0[0]:HE0:SE1] takes:
>>>> ffff88804c9fc198 (&qs->lock){..-.}-{2:2}, at: __queue_map_get+0x31/0x250
>>>> {INITIAL USE} state was registered at:
>>>> lock_acquire+0x1a3/0x4b0
>>>> _raw_spin_lock_irqsave+0x48/0x60
>>>> __queue_map_get+0x31/0x250
>>>> bpf_prog_577904e86c81dead_func+0x12/0x4b4
>>>> trace_call_bpf+0x262/0x5d0
>>>> perf_trace_run_bpf_submit+0x91/0x1c0
>>>> perf_trace_sched_switch+0x46c/0x700
>>>> __schedule+0x11b5/0x24a0
>>>> schedule+0xd4/0x270
>>>> futex_wait_queue_me+0x25f/0x520
>>>> futex_wait+0x1e0/0x5f0
>>>> do_futex+0x395/0x1890
>>>> __x64_sys_futex+0x1cb/0x480
>>>> do_syscall_64+0x3b/0xc0
>>>> entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>> irq event stamp: 13640
>>>> hardirqs last enabled at (13639): [<ffffffff95eb2bf4>] _raw_spin_unlock_irq+0x24/0x40
>>>> hardirqs last disabled at (13640): [<ffffffff95eb2d4d>] _raw_spin_lock_irqsave+0x5d/0x60
>>>> softirqs last enabled at (13464): [<ffffffff93e26de5>] __sys_bpf+0x3e15/0x4e80
>>>> softirqs last disabled at (13462): [<ffffffff93e26da3>] __sys_bpf+0x3dd3/0x4e80
>>>>
>>>> other info that might help us debug this:
>>>> Possible unsafe locking scenario:
>>>>
>>>> CPU0
>>>> ----
>>>> lock(&qs->lock);
>>>> <Interrupt>
>>>> lock(&qs->lock);
>>> Hmm, so that lock() uses raw_spin_lock_irqsave(), which *should* be
>>> disabling interrupts entirely for the critical section. But I guess a
>>> Perf hardware event can still trigger? Which seems like it would
>>> potentially wreak havoc with lots of things, not just this queue map
>>> function?
>>>
>>> No idea how to protect against this, though. Hoping Arnaldo knows? :)
>>>
It seems my reply from last night was dropped by the mailing list.
>> The locking should probably be protected by a percpu integer counter,
>> incremented and decremented before and after the lock is taken,
>> respectively. If it is already non-zero, then -EBUSY should be
>> returned. It is similar to what htab_lock_bucket protects against in
>> hashtab.c.
> Ah, neat! Okay, seems straightforward enough to replicate. Hsin, could
> you please check if the patch below gets rid of the splat?
The fix could prevent the potential deadlock, but I think the lockdep
warning will still be there, because lockdep only considers trylock safe
under NMI context. So using raw_spin_trylock() for the NMI context would
fix both the potential deadlock and the lockdep splat.
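
For illustration, a minimal sketch of that idea (assuming the existing
struct bpf_queue_stack and its qs->lock from queue_stack_maps.c; this is
not the patch quoted below), where the lock helper falls back to a trylock
when called from NMI context:

/* Hedged sketch only: take qs->lock, but never spin on it from NMI
 * context, where the lock may already be held by the interrupted
 * context on this CPU.
 */
static inline int queue_stack_map_lock(struct bpf_queue_stack *qs,
                                       unsigned long *pflags)
{
        unsigned long flags;

        if (in_nmi()) {
                if (!raw_spin_trylock_irqsave(&qs->lock, flags))
                        return -EBUSY;
        } else {
                raw_spin_lock_irqsave(&qs->lock, flags);
        }

        *pflags = flags;
        return 0;
}

static inline void queue_stack_map_unlock(struct bpf_queue_stack *qs,
                                          unsigned long flags)
{
        raw_spin_unlock_irqrestore(&qs->lock, flags);
}

Callers would then have to handle -EBUSY, just as they do with the
per-CPU counter approach in the patch below.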
>
> -Toke
>
>
> diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
> index 8d2ddcb7566b..f96945311eec 100644
> --- a/kernel/bpf/queue_stack_maps.c
> +++ b/kernel/bpf/queue_stack_maps.c
> @@ -16,6 +16,7 @@
> struct bpf_queue_stack {
> struct bpf_map map;
> raw_spinlock_t lock;
> + int __percpu *map_locked;
> u32 head, tail;
> u32 size; /* max_entries + 1 */
>
> @@ -66,6 +67,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
> int numa_node = bpf_map_attr_numa_node(attr);
> struct bpf_queue_stack *qs;
> u64 size, queue_size;
> + int err = -ENOMEM;
>
> size = (u64) attr->max_entries + 1;
> queue_size = sizeof(*qs) + size * attr->value_size;
> @@ -80,7 +82,18 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
>
> raw_spin_lock_init(&qs->lock);
>
> + qs->map_locked = bpf_map_alloc_percpu(&qs->map,
> + sizeof(*qs->map_locked),
> + sizeof(*qs->map_locked),
> + GFP_USER);
> + if (!qs->map_locked)
> + goto free_map;
> +
> return &qs->map;
> +
> +free_map:
> + bpf_map_area_free(qs);
> + return ERR_PTR(err);
> }
>
> /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
> @@ -88,9 +101,37 @@ static void queue_stack_map_free(struct bpf_map *map)
> {
> struct bpf_queue_stack *qs = bpf_queue_stack(map);
>
> + free_percpu(qs->map_locked);
> bpf_map_area_free(qs);
> }
>
> +static inline int queue_stack_map_lock(struct bpf_queue_stack *qs,
> + unsigned long *pflags)
> +{
> + unsigned long flags;
> +
> + preempt_disable();
> + if (unlikely(__this_cpu_inc_return(*qs->map_locked) != 1)) {
> + __this_cpu_dec(*qs->map_locked);
> + preempt_enable();
> + return -EBUSY;
> + }
> +
> + raw_spin_lock_irqsave(&qs->lock, flags);
> + *pflags = flags;
> +
> + return 0;
> +}
> +
> +
> +static inline void queue_stack_map_unlock(struct bpf_queue_stack *qs,
> + unsigned long flags)
> +{
> + raw_spin_unlock_irqrestore(&qs->lock, flags);
> + __this_cpu_dec(*qs->map_locked);
> + preempt_enable();
> +}
> +
> static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
> {
> struct bpf_queue_stack *qs = bpf_queue_stack(map);
> @@ -98,7 +139,9 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
> int err = 0;
> void *ptr;
>
> - raw_spin_lock_irqsave(&qs->lock, flags);
> + err = queue_stack_map_lock(qs, &flags);
> + if (err)
> + return err;
>
> if (queue_stack_map_is_empty(qs)) {
> memset(value, 0, qs->map.value_size);
> @@ -115,7 +158,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
> }
>
> out:
> - raw_spin_unlock_irqrestore(&qs->lock, flags);
> + queue_stack_map_unlock(qs, flags);
> return err;
> }
>
> @@ -128,7 +171,9 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
> void *ptr;
> u32 index;
>
> - raw_spin_lock_irqsave(&qs->lock, flags);
> + err = queue_stack_map_lock(qs, &flags);
> + if (err)
> + return err;
>
> if (queue_stack_map_is_empty(qs)) {
> memset(value, 0, qs->map.value_size);
> @@ -147,7 +192,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
> qs->head = index;
>
> out:
> - raw_spin_unlock_irqrestore(&qs->lock, flags);
> + queue_stack_map_unlock(qs, flags);
> return err;
> }
>
> @@ -193,7 +238,9 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
> if (flags & BPF_NOEXIST || flags > BPF_EXIST)
> return -EINVAL;
>
> - raw_spin_lock_irqsave(&qs->lock, irq_flags);
> + err = queue_stack_map_lock(qs, &irq_flags);
> + if (err)
> + return err;
>
> if (queue_stack_map_is_full(qs)) {
> if (!replace) {
> @@ -212,7 +259,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
> qs->head = 0;
>
> out:
> - raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
> + queue_stack_map_unlock(qs, irq_flags);
> return err;
> }
>
>
> .