lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20180623074538.5e42dlerj4dz6q25@kafai-mbp.dhcp.thefacebook.com>
Date:   Sat, 23 Jun 2018 00:45:38 -0700
From:   Martin KaFai Lau <kafai@...com>
To:     John Fastabend <john.fastabend@...il.com>
CC:     <ast@...nel.org>, <daniel@...earbox.net>, <netdev@...r.kernel.org>
Subject: Re: [bpf PATCH v3 3/4] bpf: sockhash fix omitted bucket lock in
 sock_close

On Fri, Jun 22, 2018 at 08:21:44AM -0700, John Fastabend wrote:
> First in tcp_close, reduce scope of sk_callback_lock() the lock is
> only needed for protecting maps list the ingress and cork
> lists are protected by sock lock. Having the lock in wider scope is
> harmless but may confuse the reader who may infer it is in fact
> needed.
> 
> Next, in sock_hash_delete_elem() the pattern is as follows,
> 
>   sock_hash_delete_elem()
>      [...]
>      spin_lock(bucket_lock)
>      l = lookup_elem_raw()
>      if (l)
>         hlist_del_rcu()
>         write_lock(sk_callback_lock)
>          .... destroy psock ...
>         write_unlock(sk_callback_lock)
>      spin_unlock(bucket_lock)
> 
> The ordering is necessary because we only know the {p}sock after
> dereferencing the hash table which we can't do unless we have the
> bucket lock held. Once we have the bucket lock and the psock element
> it is deleted from the hashmap to ensure any other path doing a lookup
> will fail. Finally, the refcnt is decremented and if zero the psock
> is destroyed.
> 
> In parallel with the above (or free'ing the map) a tcp close event
> may trigger tcp_close(). Which at the moment omits the bucket lock
> altogether (oops!) where the flow looks like this,
> 
>   bpf_tcp_close()
>      [...]
>      write_lock(sk_callback_lock)
>      for each psock->maps // list of maps this sock is part of
>          hlist_del_rcu(ref_hash_node);
>          .... destroy psock ...
>      write_unlock(sk_callback_lock)
> 
> Obviously, and demonstrated by syzbot, this is broken because
> we can have multiple threads deleting entries via hlist_del_rcu().
> 
> To fix this we might be tempted to wrap the hlist operation in a
> bucket lock but that would create a lock inversion problem. In
> summary to follow locking rules the psocks maps list needs the
> sk_callback_lock but we need the bucket lock to do the hlist_del_rcu.
> To resolve the lock inversion problem pop the head of the maps list
> repeatedly and remove the reference until no more are left. If a
> delete happens in parallel from the BPF API that is OK as well because
> it will do a similar action, lookup the lock in the map/hash, delete
> it from the map/hash, and dec the refcnt. We check for this case
> before doing a destroy on the psock to ensure we don't have two
> threads tearing down a psock. The new logic is as follows,
> 
>   bpf_tcp_close()
>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
>   bucket_lock() // lock hash list bucket
>   l = lookup_elem_raw(head, hash, key, key_size);
>   if (l) {
>      //only get here if elmnt was not already removed
>      hlist_del_rcu()
>      ... destroy psock...
>   }
>   bucket_unlock()
> 
> And finally for all the above to work add missing sk_callback_lock
> around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
> delete and update may corrupt maps list. Then add RCU annotations and
> use rcu_dereference/rcu_assign_pointer to manage values relying on
> RCU so that the object is not free'd from sock_hash_free() while it
> is being referenced in bpf_tcp_close().
> 
> (As an aside the sk_callback_lock serves two purposes. The
>  first, is to update the sock callbacks sk_data_ready, sk_write_space,
>  etc. The second is to protect the psock 'maps' list. The 'maps' list
>  is used to (as shown above) to delete all map/hash references to a
>  sock when the sock is closed)
> 
> Reported-by: syzbot+0ce137753c78f7b6acc1@...kaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend <john.fastabend@...il.com>
> ---
>  kernel/bpf/sockmap.c |  120 +++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 84 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 69b26af..333427b 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -72,6 +72,7 @@ struct bpf_htab {
>  	u32 n_buckets;
>  	u32 elem_size;
>  	struct bpf_sock_progs progs;
> +	struct rcu_head rcu;
>  };
>  
>  struct htab_elem {
> @@ -89,8 +90,8 @@ enum smap_psock_state {
>  struct smap_psock_map_entry {
>  	struct list_head list;
>  	struct sock **entry;
> -	struct htab_elem *hash_link;
> -	struct bpf_htab *htab;
> +	struct htab_elem __rcu *hash_link;
> +	struct bpf_htab __rcu *htab;
>  };
>  
>  struct smap_psock {
> @@ -258,16 +259,54 @@ static void bpf_tcp_release(struct sock *sk)
>  	rcu_read_unlock();
>  }
>  
> +static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> +					 u32 hash, void *key, u32 key_size)
> +{
> +	struct htab_elem *l;
> +
> +	hlist_for_each_entry_rcu(l, head, hash_node) {
> +		if (l->hash == hash && !memcmp(&l->key, key, key_size))
> +			return l;
> +	}
> +
> +	return NULL;
> +}
> +
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +	return &htab->buckets[hash & (htab->n_buckets - 1)];
> +}
> +
> +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +	return &__select_bucket(htab, hash)->head;
> +}
> +
>  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  {
>  	atomic_dec(&htab->count);
>  	kfree_rcu(l, rcu);
>  }
>  
> +static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
> +						  struct smap_psock *psock)
> +{
> +	struct smap_psock_map_entry *e;
> +
> +	write_lock_bh(&sk->sk_callback_lock);
> +	e = list_first_entry_or_null(&psock->maps,
> +				     struct smap_psock_map_entry,
> +				     list);
> +	if (e)
> +		list_del(&e->list);
> +	write_unlock_bh(&sk->sk_callback_lock);
> +	return e;
> +}
> +
>  static void bpf_tcp_close(struct sock *sk, long timeout)
>  {
>  	void (*close_fun)(struct sock *sk, long timeout);
> -	struct smap_psock_map_entry *e, *tmp;
> +	struct smap_psock_map_entry *e;
>  	struct sk_msg_buff *md, *mtmp;
>  	struct smap_psock *psock;
>  	struct sock *osk;
> @@ -286,7 +325,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>  	 */
>  	close_fun = psock->save_close;
>  
> -	write_lock_bh(&sk->sk_callback_lock);
>  	if (psock->cork) {
>  		free_start_sg(psock->sock, psock->cork);
>  		kfree(psock->cork);
> @@ -299,20 +337,38 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>  		kfree(md);
>  	}
>  
> -	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> +	e = psock_map_pop(sk, psock);
> +	while (e) {
>  		if (e->entry) {
>  			osk = cmpxchg(e->entry, sk, NULL);
>  			if (osk == sk) {
> -				list_del(&e->list);
>  				smap_release_sock(psock, sk);
>  			}
>  		} else {
> -			hlist_del_rcu(&e->hash_link->hash_node);
> -			smap_release_sock(psock, e->hash_link->sk);
> -			free_htab_elem(e->htab, e->hash_link);
> +			struct htab_elem *link = rcu_dereference(e->hash_link);
> +			struct bpf_htab *htab = rcu_dereference(e->htab);
> +			struct hlist_head *head;
> +			struct htab_elem *l;
> +			struct bucket *b;
> +
> +			b = __select_bucket(htab, link->hash);
> +			head = &b->head;
> +			raw_spin_lock_bh(&b->lock);
> +			l = lookup_elem_raw(head,
> +					    link->hash, link->key,
> +					    htab->map.key_size);
> +			/* If another thread deleted this object skip deletion.
> +			 * The refcnt on psock may or may not be zero.
> +			 */
> +			if (l) {
> +				hlist_del_rcu(&link->hash_node);
> +				smap_release_sock(psock, link->sk);
> +				free_htab_elem(htab, link);
> +			}
> +			raw_spin_unlock_bh(&b->lock);
>  		}
> +		e = psock_map_pop(sk, psock);
>  	}
> -	write_unlock_bh(&sk->sk_callback_lock);
>  	rcu_read_unlock();
>  	close_fun(sk, timeout);
>  }
> @@ -1618,7 +1674,7 @@ static void smap_list_hash_remove(struct smap_psock *psock,
>  	struct smap_psock_map_entry *e, *tmp;
>  
>  	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> -		struct htab_elem *c = e->hash_link;
> +		struct htab_elem *c = rcu_dereference(e->hash_link);
>  
>  		if (c == hash_link)
>  			list_del(&e->list);
> @@ -2090,14 +2146,13 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
>  	return ERR_PTR(err);
>  }
>  
> -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +static void __bpf_htab_free(struct rcu_head *rcu)
>  {
> -	return &htab->buckets[hash & (htab->n_buckets - 1)];
> -}
> +	struct bpf_htab *htab;
>  
> -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> -{
> -	return &__select_bucket(htab, hash)->head;
> +	htab = container_of(rcu, struct bpf_htab, rcu);
> +	bpf_map_area_free(htab->buckets);
> +	kfree(htab);
>  }
>  
>  static void sock_hash_free(struct bpf_map *map)
> @@ -2116,10 +2171,13 @@ static void sock_hash_free(struct bpf_map *map)
>  	 */
>  	rcu_read_lock();
>  	for (i = 0; i < htab->n_buckets; i++) {
> -		struct hlist_head *head = select_bucket(htab, i);
> +		struct bucket *b = __select_bucket(htab, i);
> +		struct hlist_head *head;
>  		struct hlist_node *n;
>  		struct htab_elem *l;
>  
> +		raw_spin_lock_bh(&b->lock);
> +		head = &b->head;
>  		hlist_for_each_entry_safe(l, n, head, hash_node) {
>  			struct sock *sock = l->sk;
>  			struct smap_psock *psock;
> @@ -2137,12 +2195,12 @@ static void sock_hash_free(struct bpf_map *map)
>  				smap_release_sock(psock, sock);
>  			}
>  			write_unlock_bh(&sock->sk_callback_lock);
> -			kfree(l);
> +			free_htab_elem(htab, l);
>  		}
> +		raw_spin_unlock_bh(&b->lock);
>  	}
>  	rcu_read_unlock();
> -	bpf_map_area_free(htab->buckets);
> -	kfree(htab);
> +	call_rcu(&htab->rcu, __bpf_htab_free);
>  }
>  
>  static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
> @@ -2169,19 +2227,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
>  	return l_new;
>  }
>  
> -static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> -					 u32 hash, void *key, u32 key_size)
> -{
> -	struct htab_elem *l;
> -
> -	hlist_for_each_entry_rcu(l, head, hash_node) {
> -		if (l->hash == hash && !memcmp(&l->key, key, key_size))
> -			return l;
> -	}
> -
> -	return NULL;
> -}
> -
>  static inline u32 htab_map_hash(const void *key, u32 key_len)
>  {
>  	return jhash(key, key_len, 0);
> @@ -2301,8 +2346,9 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>  		goto bucket_err;
>  	}
>  
> -	e->hash_link = l_new;
> -	e->htab = container_of(map, struct bpf_htab, map);
> +	rcu_assign_pointer(e->hash_link, l_new);
> +	rcu_assign_pointer(e->htab,
> +			   container_of(map, struct bpf_htab, map));
>  	list_add_tail(&e->list, &psock->maps);
Is sock->sk_callback_lock held here?

Others LGTM.

>  
>  	/* add new element to the head of the list, so that
> @@ -2313,8 +2359,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>  		psock = smap_psock_sk(l_old->sk);
>  
>  		hlist_del_rcu(&l_old->hash_node);
> +		write_lock_bh(&l_old->sk->sk_callback_lock);
>  		smap_list_hash_remove(psock, l_old);
>  		smap_release_sock(psock, l_old->sk);
> +		write_unlock_bh(&l_old->sk->sk_callback_lock);
>  		free_htab_elem(htab, l_old);
>  	}
>  	raw_spin_unlock_bh(&b->lock);
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ