[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d7ec13dc-a7e7-9381-9728-9157454cadc9@fb.com>
Date: Tue, 18 Feb 2020 15:55:02 -0800
From: Yonghong Song <yhs@...com>
To: Hillf Danton <hdanton@...a.com>,
syzbot <syzbot+122b5421d14e68f29cd1@...kaller.appspotmail.com>
CC: <andriin@...com>, <ast@...nel.org>, <bpf@...r.kernel.org>,
<daniel@...earbox.net>, <kafai@...com>,
<linux-kernel@...r.kernel.org>, <netdev@...r.kernel.org>,
<songliubraving@...com>, <syzkaller-bugs@...glegroups.com>
Subject: Re: possible deadlock in bpf_lru_push_free
On 2/18/20 9:44 AM, Yonghong Song wrote:
>
>
> On 2/16/20 9:23 PM, Hillf Danton wrote:
>>
>> On Sun, 16 Feb 2020 04:17:09 -0800
>>> syzbot has found a reproducer for the following crash on:
>>>
>>> HEAD commit: 2019fc96 Merge
>>> git://git.kernel.org/pub/scm/linux/kernel/g..
>>> git tree: net
>>> console output:
>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_log.txt-3Fx-3D1358bb11e00000&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=zrgWcBnddWkMWG2zm-9nC8EwvHMsuqw_-EEXwl23XLg&e=
>>>
>>> kernel config:
>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_.config-3Fx-3D735296e4dd620b10&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=kbT6Yw89JDoIWSQtlLJ7sjyNoP2Ulud27GNorna1zQk&e=
>>>
>>> dashboard link:
>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_bug-3Fextid-3D122b5421d14e68f29cd1&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=U3pdUmrcroaeNsJ9DgFbTlvftQUCUcJ1CW_0NxS8yGA&e=
>>>
>>> compiler: gcc (GCC) 9.0.0 20181231 (experimental)
>>> syz repro:
>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_repro.syz-3Fx-3D14b67d6ee00000&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=TuSfjosRFQW3ArpQwikTtx-dgLLBSMgJfVKtUltqQBM&e=
>>>
>>>
>>> IMPORTANT: if you fix the bug, please add the following tag to the
>>> commit:
>>> Reported-by: syzbot+122b5421d14e68f29cd1@...kaller.appspotmail.com
>>>
>>> ======================================================
>>> WARNING: possible circular locking dependency detected
>>> 5.6.0-rc1-syzkaller #0 Not tainted
>>> ------------------------------------------------------
>>> syz-executor.4/13544 is trying to acquire lock:
>>> ffffe8ffffcba0b8 (&loc_l->lock){....}, at: bpf_common_lru_push_free
>>> kernel/bpf/bpf_lru_list.c:516 [inline]
>>> ffffe8ffffcba0b8 (&loc_l->lock){....}, at:
>>> bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
>>>
>>> but task is already holding lock:
>>> ffff888094985960 (&htab->buckets[i].lock){....}, at:
>>> __htab_map_lookup_and_delete_batch+0x617/0x1540
>>> kernel/bpf/hashtab.c:1322
>>>
>>> which lock already depends on the new lock.
>>>
>>>
>>> the existing dependency chain (in reverse order) is:
>>>
>>> -> #2 (&htab->buckets[i].lock){....}:
>>> __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110
>>> [inline]
>>> _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
>>> htab_lru_map_delete_node+0xce/0x2f0 kernel/bpf/hashtab.c:593
>>> __bpf_lru_list_shrink_inactive kernel/bpf/bpf_lru_list.c:220
>>> [inline]
>>> __bpf_lru_list_shrink+0xf9/0x470 kernel/bpf/bpf_lru_list.c:266
>>> bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:340
>>> [inline]
>>> bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline]
>>> bpf_lru_pop_free+0x87c/0x1670 kernel/bpf/bpf_lru_list.c:499
>>> prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132
>>> __htab_lru_percpu_map_update_elem+0x67e/0xa90
>>> kernel/bpf/hashtab.c:1069
>>> bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585
>>> bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181
>>> generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319
>>> bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
>>> __do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460
>>> __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
>>> __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
>>> do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
>>> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>>
>>> -> #1 (&l->lock){....}:
>>> __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
>>> _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151
>>> bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:325
>>> [inline]
>>> bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline]
>>> bpf_lru_pop_free+0x67f/0x1670 kernel/bpf/bpf_lru_list.c:499
>>> prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132
>>> __htab_lru_percpu_map_update_elem+0x67e/0xa90
>>> kernel/bpf/hashtab.c:1069
>>> bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585
>>> bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181
>>> generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319
>>> bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
>>> __do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460
>>> __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
>>> __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
>>> do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
>>> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>>
>>> -> #0 (&loc_l->lock){....}:
>>> check_prev_add kernel/locking/lockdep.c:2475 [inline]
>>> check_prevs_add kernel/locking/lockdep.c:2580 [inline]
>>> validate_chain kernel/locking/lockdep.c:2970 [inline]
>>> __lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954
>>> lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484
>>> __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110
>>> [inline]
>>> _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
>>> bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline]
>>> bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
>>> __htab_map_lookup_and_delete_batch+0x8d4/0x1540
>>> kernel/bpf/hashtab.c:1374
>>> htab_lru_map_lookup_and_delete_batch+0x34/0x40
>>> kernel/bpf/hashtab.c:1491
>>> bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
>>> __do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456
>>> __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
>>> __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
>>> do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
>>> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>>
>>> other info that might help us debug this:
>>>
>>> Chain exists of:
>>> &loc_l->lock --> &l->lock --> &htab->buckets[i].lock
>>>
>>> Possible unsafe locking scenario:
>>>
>>> CPU0 CPU1
>>> ---- ----
>>> lock(&htab->buckets[i].lock);
>>> lock(&l->lock);
>>> lock(&htab->buckets[i].lock);
>>> lock(&loc_l->lock);
>>>
>>> *** DEADLOCK ***
>>>
>>> 2 locks held by syz-executor.4/13544:
>>> #0: ffffffff89bac240 (rcu_read_lock){....}, at:
>>> __htab_map_lookup_and_delete_batch+0x54b/0x1540
>>> kernel/bpf/hashtab.c:1308
>>> #1: ffff888094985960 (&htab->buckets[i].lock){....}, at:
>>> __htab_map_lookup_and_delete_batch+0x617/0x1540
>>> kernel/bpf/hashtab.c:1322
>>>
>>> stack backtrace:
>>> CPU: 0 PID: 13544 Comm: syz-executor.4 Not tainted
>>> 5.6.0-rc1-syzkaller #0
>>> Hardware name: Google Google Compute Engine/Google Compute Engine,
>>> BIOS Google 01/01/2011
>>> Call Trace:
>>> __dump_stack lib/dump_stack.c:77 [inline]
>>> dump_stack+0x197/0x210 lib/dump_stack.c:118
>>> print_circular_bug.isra.0.cold+0x163/0x172
>>> kernel/locking/lockdep.c:1684
>>> check_noncircular+0x32e/0x3e0 kernel/locking/lockdep.c:1808
>>> check_prev_add kernel/locking/lockdep.c:2475 [inline]
>>> check_prevs_add kernel/locking/lockdep.c:2580 [inline]
>>> validate_chain kernel/locking/lockdep.c:2970 [inline]
>>> __lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954
>>> lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484
>>> __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>>> _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
>>> bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline]
>>> bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
>>> __htab_map_lookup_and_delete_batch+0x8d4/0x1540
>>> kernel/bpf/hashtab.c:1374
>>> htab_lru_map_lookup_and_delete_batch+0x34/0x40
>>> kernel/bpf/hashtab.c:1491
>>> bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
>>> __do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456
>>> __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
>>> __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
>>> do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
>>> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>>
>> Reclaim hash table elememt outside bucket lock.
>
> Thanks for the following patch. Yes, we do have an potential issue
> with the above deadlock if LRU hash map is not preallocated.
>
> I am not a RCU expert, but maybe you could you help clarify
> one thing below?
>
>>
>> --- a/kernel/bpf/hashtab.c
>> +++ b/kernel/bpf/hashtab.c
>> @@ -1259,6 +1259,7 @@ __htab_map_lookup_and_delete_batch(struc
>> u64 elem_map_flags, map_flags;
>> struct hlist_nulls_head *head;
>> struct hlist_nulls_node *n;
>> + struct hlist_nulls_node *node_to_free = NULL;
>> unsigned long flags;
>> struct htab_elem *l;
>> struct bucket *b;
>> @@ -1370,9 +1371,10 @@ again_nocopy:
>> }
>> if (do_delete) {
>> hlist_nulls_del_rcu(&l->hash_node);
>> - if (is_lru_map)
>> - bpf_lru_push_free(&htab->lru, &l->lru_node);
>> - else
>> + if (is_lru_map) {
>> + l->hash_node.next = node_to_free;
>> + node_to_free = &l->hash_node;
>
> Here, we change "next" pointer. How does this may impact the existing
> parallel map lookup which does not need to take bucket pointer?
Thanks for Martin for explanation! I think changing l->hash_node.next is
unsafe here as another thread may execute on a different cpu and
traverse the same list. It will see hash_node.next = NULL and it is
unexpected.
How about the following patch?
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2d182c4ee9d9..246ef0f2e985 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -56,6 +56,7 @@ struct htab_elem {
union {
struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
+ struct htab_elem *link;
};
};
};
@@ -1256,6 +1257,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map
*map,
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size;
+ struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
@@ -1370,9 +1372,14 @@ __htab_map_lookup_and_delete_batch(struct bpf_map
*map,
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
- if (is_lru_map)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
- else
+ if (is_lru_map) {
+ /* l->hnode overlaps with *
l->hash_node.pprev
+ * in memory. l->hash_node.pprev has been
+ * poisoned and nobody should access it.
+ */
+ l->link = node_to_free;
+ node_to_free = l;
+ } else
free_htab_elem(htab, l);
}
dst_key += key_size;
@@ -1380,6 +1387,13 @@ __htab_map_lookup_and_delete_batch(struct bpf_map
*map,
}
raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ while (node_to_free) {
+ l = node_to_free;
+ node_to_free = node_to_free->link;
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ }
+
/* If we are not copying data, we can go to next bucket and avoid
* unlocking the rcu.
*/
Powered by blists - more mailing lists