Message-ID: <71a03505-6ec4-8f1d-09c6-fff78f4880d0@huaweicloud.com>
Date: Fri, 8 Sep 2023 08:56:47 +0800
From: Hou Tao <houtao@...weicloud.com>
To: Toke Høiland-Jørgensen <toke@...nel.org>,
Kumar Kartikeya Dwivedi <memxor@...il.com>
Cc: Hsin-Wei Hung <hsinweih@....edu>, Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>, Andrii Nakryiko <andrii@...nel.org>,
Martin KaFai Lau <kafai@...com>, Song Liu <songliubraving@...com>,
Yonghong Song <yhs@...com>, John Fastabend <john.fastabend@...il.com>,
KP Singh <kpsingh@...nel.org>, netdev@...r.kernel.org, bpf@...r.kernel.org,
Arnaldo Carvalho de Melo <acme@...nel.org>
Subject: Re: Possible deadlock in bpf queue map

Hi,
On 9/7/2023 9:04 PM, Toke Høiland-Jørgensen wrote:
> Kumar Kartikeya Dwivedi <memxor@...il.com> writes:
>
>> On Thu, 7 Sept 2023 at 12:26, Toke Høiland-Jørgensen <toke@...nel.org> wrote:
>>> +Arnaldo
>>>
>>>> Hi,
>>>>
>>>> Our bpf fuzzer, a customized Syzkaller, triggered a lockdep warning in
>>>> the bpf queue map on v5.15. Since queue_stack_maps.c has had no major
>>>> changes since v5.15, we think the issue should still exist in the latest
>>>> kernel. The bug can be triggered occasionally, and we suspect the
>>>> following eBPF program to be the one involved. We have also attached the
>>>> lockdep warning at the end.
>>>>
>>>> #define DEFINE_BPF_MAP_NO_KEY(the_map, TypeOfMap, MapFlags, TypeOfValue, MaxEntries) \
>>>>         struct {                                   \
>>>>                 __uint(type, TypeOfMap);           \
>>>>                 __uint(map_flags, (MapFlags));     \
>>>>                 __uint(max_entries, (MaxEntries)); \
>>>>                 __type(value, TypeOfValue);        \
>>>>         } the_map SEC(".maps");
>>>>
>>>> DEFINE_BPF_MAP_NO_KEY(map_0, BPF_MAP_TYPE_QUEUE, 0 | BPF_F_WRONLY,
>>>>                       struct_0, 162);
>>>>
>>>> SEC("perf_event")
>>>> int func(struct bpf_perf_event_data *ctx) {
>>>>         char v0[96] = {};
>>>>         uint64_t v1 = 0;
>>>>
>>>>         v1 = bpf_map_pop_elem(&map_0, v0);
>>>>         return 163819661;
>>>> }
>>>>
>>>>
>>>> The program is attached to the following perf event.
>>>>
>>>> struct perf_event_attr attr_type_hw = {
>>>>         .type = PERF_TYPE_HARDWARE,
>>>>         .config = PERF_COUNT_HW_CPU_CYCLES,
>>>>         .sample_freq = 50,
>>>>         .inherit = 1,
>>>>         .freq = 1,
>>>> };
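
For context, here is a minimal, hedged sketch of how a program might be
attached to an event with these attributes; the helper name and the prog_fd
parameter are hypothetical, since the fuzzer's actual loader is not part of
this report:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical helper: attach an already-loaded BPF program (prog_fd)
 * to a hardware cycle-counter event with the attributes quoted above.
 */
static int attach_prog_to_perf(int prog_fd)
{
        struct perf_event_attr attr = {
                .size = sizeof(struct perf_event_attr),
                .type = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .sample_freq = 50,
                .inherit = 1,
                .freq = 1,
        };
        int pfd;

        /* pid = 0, cpu = -1: profile the calling task on any CPU */
        pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (pfd < 0)
                return -1;

        if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0 ||
            ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
                close(pfd);
                return -1;
        }
        return pfd;
}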
>>>>
>>>> ================================
>>>> WARNING: inconsistent lock state
>>>> 5.15.26+ #2 Not tainted
>>>> --------------------------------
>>>> inconsistent {INITIAL USE} -> {IN-NMI} usage.
>>>> syz-executor.5/19749 [HC1[1]:SC0[0]:HE0:SE1] takes:
>>>> ffff88804c9fc198 (&qs->lock){..-.}-{2:2}, at: __queue_map_get+0x31/0x250
>>>> {INITIAL USE} state was registered at:
>>>> lock_acquire+0x1a3/0x4b0
>>>> _raw_spin_lock_irqsave+0x48/0x60
>>>> __queue_map_get+0x31/0x250
>>>> bpf_prog_577904e86c81dead_func+0x12/0x4b4
>>>> trace_call_bpf+0x262/0x5d0
>>>> perf_trace_run_bpf_submit+0x91/0x1c0
>>>> perf_trace_sched_switch+0x46c/0x700
>>>> __schedule+0x11b5/0x24a0
>>>> schedule+0xd4/0x270
>>>> futex_wait_queue_me+0x25f/0x520
>>>> futex_wait+0x1e0/0x5f0
>>>> do_futex+0x395/0x1890
>>>> __x64_sys_futex+0x1cb/0x480
>>>> do_syscall_64+0x3b/0xc0
>>>> entry_SYSCALL_64_after_hwframe+0x44/0xae
>>>> irq event stamp: 13640
>>>> hardirqs last enabled at (13639): [<ffffffff95eb2bf4>] _raw_spin_unlock_irq+0x24/0x40
>>>> hardirqs last disabled at (13640): [<ffffffff95eb2d4d>] _raw_spin_lock_irqsave+0x5d/0x60
>>>> softirqs last enabled at (13464): [<ffffffff93e26de5>] __sys_bpf+0x3e15/0x4e80
>>>> softirqs last disabled at (13462): [<ffffffff93e26da3>] __sys_bpf+0x3dd3/0x4e80
>>>>
>>>> other info that might help us debug this:
>>>> Possible unsafe locking scenario:
>>>>
>>>> CPU0
>>>> ----
>>>> lock(&qs->lock);
>>>> <Interrupt>
>>>> lock(&qs->lock);
>>> Hmm, so that lock() uses raw_spin_lock_irqsave(), which *should* be
>>> disabling interrupts entirely for the critical section. But I guess a
>>> Perf hardware event can still trigger? Which seems like it would
>>> potentially wreak havoc with lots of things, not just this queue map
>>> function?
>>>
>>> No idea how to protect against this, though. Hoping Arnaldo knows? :)
>>>
It seems my reply from last night was dropped by the mailing list.
>> The locking should probably be protected by a percpu integer counter,
>> incremented and decremented before and after the lock is taken,
>> respectively. If it is already non-zero, then -EBUSY should be
>> returned. It is similar to what htab_lock_bucket protects against in
>> hashtab.c.
> Ah, neat! Okay, seems straightforward enough to replicate. Hsin, could
> you please check if the patch below gets rid of the splat?
The fix could prevent the potential deadlock, but I think the lockdep
warning will still be there, because lockdep only considers trylock safe
under NMI context. So using raw_spin_trylock() for the NMI context would
fix both the potential deadlock and the lockdep splat.
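
For illustration, a minimal sketch of that idea (assuming the existing
struct bpf_queue_stack and its qs->lock from queue_stack_maps.c; this is
not the patch quoted below), where the lock helper falls back to a trylock
when called from NMI context:

/* Hedged sketch only: take qs->lock, but never spin on it from NMI
 * context, where the lock may already be held by the interrupted
 * context on this CPU.
 */
static inline int queue_stack_map_lock(struct bpf_queue_stack *qs,
                                       unsigned long *pflags)
{
        unsigned long flags;

        if (in_nmi()) {
                if (!raw_spin_trylock_irqsave(&qs->lock, flags))
                        return -EBUSY;
        } else {
                raw_spin_lock_irqsave(&qs->lock, flags);
        }

        *pflags = flags;
        return 0;
}

static inline void queue_stack_map_unlock(struct bpf_queue_stack *qs,
                                          unsigned long flags)
{
        raw_spin_unlock_irqrestore(&qs->lock, flags);
}

Callers would then have to handle -EBUSY, just as they do with the
per-CPU counter approach in the patch below.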
>
> -Toke
>
>
> diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
> index 8d2ddcb7566b..f96945311eec 100644
> --- a/kernel/bpf/queue_stack_maps.c
> +++ b/kernel/bpf/queue_stack_maps.c
> @@ -16,6 +16,7 @@
> struct bpf_queue_stack {
> struct bpf_map map;
> raw_spinlock_t lock;
> + int __percpu *map_locked;
> u32 head, tail;
> u32 size; /* max_entries + 1 */
>
> @@ -66,6 +67,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
> int numa_node = bpf_map_attr_numa_node(attr);
> struct bpf_queue_stack *qs;
> u64 size, queue_size;
> + int err = -ENOMEM;
>
> size = (u64) attr->max_entries + 1;
> queue_size = sizeof(*qs) + size * attr->value_size;
> @@ -80,7 +82,18 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
>
> raw_spin_lock_init(&qs->lock);
>
> + qs->map_locked = bpf_map_alloc_percpu(&qs->map,
> + sizeof(*qs->map_locked),
> + sizeof(*qs->map_locked),
> + GFP_USER);
> + if (!qs->map_locked)
> + goto free_map;
> +
> return &qs->map;
> +
> +free_map:
> + bpf_map_area_free(qs);
> + return ERR_PTR(err);
> }
>
> /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
> @@ -88,9 +101,37 @@ static void queue_stack_map_free(struct bpf_map *map)
> {
> struct bpf_queue_stack *qs = bpf_queue_stack(map);
>
> + free_percpu(qs->map_locked);
> bpf_map_area_free(qs);
> }
>
> +static inline int queue_stack_map_lock(struct bpf_queue_stack *qs,
> + unsigned long *pflags)
> +{
> + unsigned long flags;
> +
> + preempt_disable();
> + if (unlikely(__this_cpu_inc_return(*qs->map_locked) != 1)) {
> + __this_cpu_dec(*qs->map_locked);
> + preempt_enable();
> + return -EBUSY;
> + }
> +
> + raw_spin_lock_irqsave(&qs->lock, flags);
> + *pflags = flags;
> +
> + return 0;
> +}
> +
> +
> +static inline void queue_stack_map_unlock(struct bpf_queue_stack *qs,
> + unsigned long flags)
> +{
> + raw_spin_unlock_irqrestore(&qs->lock, flags);
> + __this_cpu_dec(*qs->map_locked);
> + preempt_enable();
> +}
> +
> static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
> {
> struct bpf_queue_stack *qs = bpf_queue_stack(map);
> @@ -98,7 +139,9 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
> int err = 0;
> void *ptr;
>
> - raw_spin_lock_irqsave(&qs->lock, flags);
> + err = queue_stack_map_lock(qs, &flags);
> + if (err)
> + return err;
>
> if (queue_stack_map_is_empty(qs)) {
> memset(value, 0, qs->map.value_size);
> @@ -115,7 +158,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
> }
>
> out:
> - raw_spin_unlock_irqrestore(&qs->lock, flags);
> + queue_stack_map_unlock(qs, flags);
> return err;
> }
>
> @@ -128,7 +171,9 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
> void *ptr;
> u32 index;
>
> - raw_spin_lock_irqsave(&qs->lock, flags);
> + err = queue_stack_map_lock(qs, &flags);
> + if (err)
> + return err;
>
> if (queue_stack_map_is_empty(qs)) {
> memset(value, 0, qs->map.value_size);
> @@ -147,7 +192,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
> qs->head = index;
>
> out:
> - raw_spin_unlock_irqrestore(&qs->lock, flags);
> + queue_stack_map_unlock(qs, flags);
> return err;
> }
>
> @@ -193,7 +238,9 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
> if (flags & BPF_NOEXIST || flags > BPF_EXIST)
> return -EINVAL;
>
> - raw_spin_lock_irqsave(&qs->lock, irq_flags);
> + err = queue_stack_map_lock(qs, &irq_flags);
> + if (err)
> + return err;
>
> if (queue_stack_map_is_full(qs)) {
> if (!replace) {
> @@ -212,7 +259,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
> qs->head = 0;
>
> out:
> - raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
> + queue_stack_map_unlock(qs, irq_flags);
> return err;
> }
>
>
> .