[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180622152144.24502.93661.stgit@john-Precision-Tower-5810>
Date: Fri, 22 Jun 2018 08:21:44 -0700
From: John Fastabend <john.fastabend@...il.com>
To: john.fastabend@...il.com, ast@...nel.org, daniel@...earbox.net,
kafai@...com
Cc: netdev@...r.kernel.org
Subject: [bpf PATCH v3 3/4] bpf: sockhash fix omitted bucket lock in
sock_close
First in tcp_close, reduce scope of sk_callback_lock() the lock is
only needed for protecting maps list the ingress and cork
lists are protected by sock lock. Having the lock in wider scope is
harmless but may confuse the reader who may infer it is in fact
needed.
Next, in sock_hash_delete_elem() the pattern is as follows,
sock_hash_delete_elem()
[...]
spin_lock(bucket_lock)
l = lookup_elem_raw()
if (l)
hlist_del_rcu()
write_lock(sk_callback_lock)
.... destroy psock ...
write_unlock(sk_callback_lock)
spin_unlock(bucket_lock)
The ordering is necessary because we only know the {p}sock after
dereferencing the hash table which we can't do unless we have the
bucket lock held. Once we have the bucket lock and the psock element
it is deleted from the hashmap to ensure any other path doing a lookup
will fail. Finally, the refcnt is decremented and if zero the psock
is destroyed.
In parallel with the above (or free'ing the map) a tcp close event
may trigger tcp_close(). Which at the moment omits the bucket lock
altogether (oops!) where the flow looks like this,
bpf_tcp_close()
[...]
write_lock(sk_callback_lock)
for each psock->maps // list of maps this sock is part of
hlist_del_rcu(ref_hash_node);
.... destroy psock ...
write_unlock(sk_callback_lock)
Obviously, and demonstrated by syzbot, this is broken because
we can have multiple threads deleting entries via hlist_del_rcu().
To fix this we might be tempted to wrap the hlist operation in a
bucket lock but that would create a lock inversion problem. In
summary to follow locking rules the psocks maps list needs the
sk_callback_lock but we need the bucket lock to do the hlist_del_rcu.
To resolve the lock inversion problem pop the head of the maps list
repeatedly and remove the reference until no more are left. If a
delete happens in parallel from the BPF API that is OK as well because
it will do a similar action, lookup the lock in the map/hash, delete
it from the map/hash, and dec the refcnt. We check for this case
before doing a destroy on the psock to ensure we don't have two
threads tearing down a psock. The new logic is as follows,
bpf_tcp_close()
e = psock_map_pop(psock->maps) // done with sk_callback_lock
bucket_lock() // lock hash list bucket
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
//only get here if elmnt was not already removed
hlist_del_rcu()
... destroy psock...
}
bucket_unlock()
And finally for all the above to work add missing sk_callback_lock
around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
delete and update may corrupt maps list. Then add RCU annotations and
use rcu_dereference/rcu_assign_pointer to manage values relying on
RCU so that the object is not free'd from sock_hash_free() while it
is being referenced in bpf_tcp_close().
(As an aside the sk_callback_lock serves two purposes. The
first, is to update the sock callbacks sk_data_ready, sk_write_space,
etc. The second is to protect the psock 'maps' list. The 'maps' list
is used to (as shown above) to delete all map/hash references to a
sock when the sock is closed)
Reported-by: syzbot+0ce137753c78f7b6acc1@...kaller.appspotmail.com
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@...il.com>
---
kernel/bpf/sockmap.c | 120 +++++++++++++++++++++++++++++++++++---------------
1 file changed, 84 insertions(+), 36 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 69b26af..333427b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -72,6 +72,7 @@ struct bpf_htab {
u32 n_buckets;
u32 elem_size;
struct bpf_sock_progs progs;
+ struct rcu_head rcu;
};
struct htab_elem {
@@ -89,8 +90,8 @@ enum smap_psock_state {
struct smap_psock_map_entry {
struct list_head list;
struct sock **entry;
- struct htab_elem *hash_link;
- struct bpf_htab *htab;
+ struct htab_elem __rcu *hash_link;
+ struct bpf_htab __rcu *htab;
};
struct smap_psock {
@@ -258,16 +259,54 @@ static void bpf_tcp_release(struct sock *sk)
rcu_read_unlock();
}
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+ u32 hash, void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node) {
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+ }
+
+ return NULL;
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
atomic_dec(&htab->count);
kfree_rcu(l, rcu);
}
+static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
+ struct smap_psock *psock)
+{
+ struct smap_psock_map_entry *e;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ e = list_first_entry_or_null(&psock->maps,
+ struct smap_psock_map_entry,
+ list);
+ if (e)
+ list_del(&e->list);
+ write_unlock_bh(&sk->sk_callback_lock);
+ return e;
+}
+
static void bpf_tcp_close(struct sock *sk, long timeout)
{
void (*close_fun)(struct sock *sk, long timeout);
- struct smap_psock_map_entry *e, *tmp;
+ struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp;
struct smap_psock *psock;
struct sock *osk;
@@ -286,7 +325,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
*/
close_fun = psock->save_close;
- write_lock_bh(&sk->sk_callback_lock);
if (psock->cork) {
free_start_sg(psock->sock, psock->cork);
kfree(psock->cork);
@@ -299,20 +337,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
kfree(md);
}
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+ e = psock_map_pop(sk, psock);
+ while (e) {
if (e->entry) {
osk = cmpxchg(e->entry, sk, NULL);
if (osk == sk) {
- list_del(&e->list);
smap_release_sock(psock, sk);
}
} else {
- hlist_del_rcu(&e->hash_link->hash_node);
- smap_release_sock(psock, e->hash_link->sk);
- free_htab_elem(e->htab, e->hash_link);
+ struct htab_elem *link = rcu_dereference(e->hash_link);
+ struct bpf_htab *htab = rcu_dereference(e->htab);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ struct bucket *b;
+
+ b = __select_bucket(htab, link->hash);
+ head = &b->head;
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head,
+ link->hash, link->key,
+ htab->map.key_size);
+ /* If another thread deleted this object skip deletion.
+ * The refcnt on psock may or may not be zero.
+ */
+ if (l) {
+ hlist_del_rcu(&link->hash_node);
+ smap_release_sock(psock, link->sk);
+ free_htab_elem(htab, link);
+ }
+ raw_spin_unlock_bh(&b->lock);
}
+ e = psock_map_pop(sk, psock);
}
- write_unlock_bh(&sk->sk_callback_lock);
rcu_read_unlock();
close_fun(sk, timeout);
}
@@ -1618,7 +1674,7 @@ static void smap_list_hash_remove(struct smap_psock *psock,
struct smap_psock_map_entry *e, *tmp;
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- struct htab_elem *c = e->hash_link;
+ struct htab_elem *c = rcu_dereference(e->hash_link);
if (c == hash_link)
list_del(&e->list);
@@ -2090,14 +2146,13 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
return ERR_PTR(err);
}
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+static void __bpf_htab_free(struct rcu_head *rcu)
{
- return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
+ struct bpf_htab *htab;
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
- return &__select_bucket(htab, hash)->head;
+ htab = container_of(rcu, struct bpf_htab, rcu);
+ bpf_map_area_free(htab->buckets);
+ kfree(htab);
}
static void sock_hash_free(struct bpf_map *map)
@@ -2116,10 +2171,13 @@ static void sock_hash_free(struct bpf_map *map)
*/
rcu_read_lock();
for (i = 0; i < htab->n_buckets; i++) {
- struct hlist_head *head = select_bucket(htab, i);
+ struct bucket *b = __select_bucket(htab, i);
+ struct hlist_head *head;
struct hlist_node *n;
struct htab_elem *l;
+ raw_spin_lock_bh(&b->lock);
+ head = &b->head;
hlist_for_each_entry_safe(l, n, head, hash_node) {
struct sock *sock = l->sk;
struct smap_psock *psock;
@@ -2137,12 +2195,12 @@ static void sock_hash_free(struct bpf_map *map)
smap_release_sock(psock, sock);
}
write_unlock_bh(&sock->sk_callback_lock);
- kfree(l);
+ free_htab_elem(htab, l);
}
+ raw_spin_unlock_bh(&b->lock);
}
rcu_read_unlock();
- bpf_map_area_free(htab->buckets);
- kfree(htab);
+ call_rcu(&htab->rcu, __bpf_htab_free);
}
static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
@@ -2169,19 +2227,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
return l_new;
}
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
- u32 hash, void *key, u32 key_size)
-{
- struct htab_elem *l;
-
- hlist_for_each_entry_rcu(l, head, hash_node) {
- if (l->hash == hash && !memcmp(&l->key, key, key_size))
- return l;
- }
-
- return NULL;
-}
-
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
return jhash(key, key_len, 0);
@@ -2301,8 +2346,9 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
goto bucket_err;
}
- e->hash_link = l_new;
- e->htab = container_of(map, struct bpf_htab, map);
+ rcu_assign_pointer(e->hash_link, l_new);
+ rcu_assign_pointer(e->htab,
+ container_of(map, struct bpf_htab, map));
list_add_tail(&e->list, &psock->maps);
/* add new element to the head of the list, so that
@@ -2313,8 +2359,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
psock = smap_psock_sk(l_old->sk);
hlist_del_rcu(&l_old->hash_node);
+ write_lock_bh(&l_old->sk->sk_callback_lock);
smap_list_hash_remove(psock, l_old);
smap_release_sock(psock, l_old->sk);
+ write_unlock_bh(&l_old->sk->sk_callback_lock);
free_htab_elem(htab, l_old);
}
raw_spin_unlock_bh(&b->lock);
Powered by blists - more mailing lists