netdev - Re: [PATCH v2 bpf-next 6/9] bpf: introduce BPF_F

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190119105654.1ed0e7ba@cakuba.netronome.com>
Date:   Sat, 19 Jan 2019 10:56:54 -0800
From:   Jakub Kicinski <jakub.kicinski@...ronome.com>
To:     Alexei Starovoitov <ast@...nel.org>
Cc:     <davem@...emloft.net>, <daniel@...earbox.net>,
        <netdev@...r.kernel.org>, <kernel-team@...com>
Subject: Re: [PATCH v2 bpf-next 6/9] bpf: introduce BPF_F_LOCK flag

On Thu, 17 Jan 2019 22:50:53 -0800, Alexei Starovoitov wrote:
> Introduce BPF_F_LOCK flag for map_lookup and map_update syscall commands
> and for map_update() helper function.
> In all these cases take a lock of existing element (which was provided
> in BTF description) before copying (in or out) the rest of map value.
> 
> Implementation details that are part of uapi:
> 
> Array:
> The array map takes the element lock for lookup/update.
> 
> Hash:
> hash map also takes the lock for lookup/update and tries to avoid the bucket lock.
> If old element exists it takes the element lock and updates the element in place.
> If element doesn't exist it allocates new one and inserts into hash table
> while holding the bucket lock.
> In rare case the hashmap has to take both the bucket lock and the element lock
> to update old value in place.
> 
> Cgroup local storage:
> It is similar to array. update in place and lookup are done with lock taken.
> 
> Signed-off-by: Alexei Starovoitov <ast@...nel.org>

> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 48a41bf65e1b..6397b12c130e 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -809,11 +809,11 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
>  static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
>  		       u64 map_flags)
>  {
> -	if (l_old && map_flags == BPF_NOEXIST)
> +	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
>  		/* elem already exists */
>  		return -EEXIST;
>  
> -	if (!l_old && map_flags == BPF_EXIST)
> +	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
>  		/* elem doesn't exist, cannot update it */
>  		return -ENOENT;
>  
> @@ -832,7 +832,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	u32 key_size, hash;
>  	int ret;
>  
> -	if (unlikely(map_flags > BPF_EXIST))
> +	if (unlikely(map_flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
>  		/* unknown flags */
>  		return -EINVAL;

Perhaps we should check for:

	hweight(map_flags & (BPF_EXIST | BPF_NOEXIST)) < 2 

right now this > BPF_EXIST is a little hacky, as it depends on the fact,
that BPF_EXIST is 2..  If I read the code correctly  BPF_EXIST |
BPF_NOEXIST will be treated as BPF_NOEXIST.  Not sure that matters.

> @@ -845,6 +845,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	b = __select_bucket(htab, hash);
>  	head = &b->head;
>  
> +	if (unlikely(map_flags & BPF_F_LOCK)) {
> +		if (unlikely(!map_value_has_spin_lock(map)))
> +			return -EINVAL;
> +		/* find an element without taking the bucket lock */
> +		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
> +					      htab->n_buckets);
> +		ret = check_flags(htab, l_old, map_flags);
> +		if (ret)
> +			return ret;
> +		if (l_old) {
> +			/* grab the element lock and update value in place */
> +			copy_map_value_locked(map,
> +					      l_old->key + round_up(key_size, 8),
> +					      value, false);
> +			return 0;
> +		}
> +		/* fall through, grab the bucket lock and lookup again.
> +		 * 99.9% chance that the element won't be found,
> +		 * but second lookup under lock has to be done.
> +		 */
> +	}
> +
>  	/* bpf_map_update_elem() can be called in_irq() */
>  	raw_spin_lock_irqsave(&b->lock, flags);
>  
> @@ -854,6 +876,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	if (ret)
>  		goto err;
>  
> +	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
> +		/* first lookup without the bucket lock didn't find the element,
> +		 * but second lookup with the bucket lock found it.
> +		 * This case is highly unlikely, but has to be dealt with:
> +		 * grab the element lock in addition to the bucket lock
> +		 * and update element in place
> +		 */
> +		copy_map_value_locked(map,
> +				      l_old->key + round_up(key_size, 8),
> +				      value, false);
> +		ret = 0;
> +		goto err;
> +	}
> +
>  	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
>  				l_old);
>  	if (IS_ERR(l_new)) {

> diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> index 0295427f06e2..6b572e2de7fb 100644
> --- a/kernel/bpf/local_storage.c
> +++ b/kernel/bpf/local_storage.c
> @@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
>  	struct bpf_cgroup_storage *storage;
>  	struct bpf_storage_buffer *new;
>  
> -	if (flags != BPF_ANY && flags != BPF_EXIST)
> +	if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
> +		return -EINVAL;
> +
> +	if (unlikely(flags & BPF_NOEXIST))
> +		return -EINVAL;

nit: for arrays we have a different return code with BPF_NOEXIST, but
     here its both EINVAL, so perhaps just drop the BPF_NOEXIST from
     the mask?

> +	if (unlikely((flags & BPF_F_LOCK) &&
> +		     !map_value_has_spin_lock(map)))
>  		return -EINVAL;
>  
>  	storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
> @@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
>  	if (!storage)
>  		return -ENOENT;
>  
> +	if (flags & BPF_F_LOCK) {
> +		copy_map_value_locked(map, storage->buf->data, value, false);
> +		return 0;
> +	}
> +
>  	new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
>  			   map->value_size,
>  			   __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index b29e6dc44650..ed55f03ba586 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -682,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
>  }
>  
>  /* last field in 'union bpf_attr' used by this command */
> -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
> +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
>  
>  static int map_lookup_elem(union bpf_attr *attr)
>  {
> @@ -698,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr)
>  	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
>  		return -EINVAL;
>  
> +	if (attr->flags & ~BPF_F_LOCK)
> +		return -EINVAL;
> +
>  	f = fdget(ufd);
>  	map = __bpf_map_get(f);
>  	if (IS_ERR(map))
> @@ -708,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr)
>  		goto err_put;
>  	}
>  
> +	if ((attr->flags & BPF_F_LOCK) &&
> +	    !map_value_has_spin_lock(map)) {
> +		err = -EINVAL;
> +		goto err_put;
> +	}
> +
>  	key = __bpf_copy_key(ukey, map->key_size);
>  	if (IS_ERR(key)) {
>  		err = PTR_ERR(key);
> @@ -758,7 +767,16 @@ static int map_lookup_elem(union bpf_attr *attr)
>  			err = -ENOENT;
>  		} else {
>  			err = 0;
> -			copy_map_value(map, value, ptr);
> +			if (attr->flags & BPF_F_LOCK) {
> +				/* lock 'ptr' elem and copy
> +				 * everything but the lock
> +				 */
> +				copy_map_value_locked(map, value, ptr, true);
> +				/* mask lock, since value was kmalloced */
> +				check_and_init_map_lock(map, value);
> +			} else {
> +				copy_map_value(map, value, ptr);
> +			}
>  		}
>  		rcu_read_unlock();
>  	}
> @@ -818,6 +836,12 @@ static int map_update_elem(union bpf_attr *attr)
>  		goto err_put;
>  	}
>  
> +	if ((attr->flags & BPF_F_LOCK) &&
> +	    !map_value_has_spin_lock(map)) {

nit: do we need this check in syscall.c for update?  Just wondering.

> +		err = -EINVAL;
> +		goto err_put;
> +	}
> +
>  	key = __bpf_copy_key(ukey, map->key_size);
>  	if (IS_ERR(key)) {
>  		err = PTR_ERR(key);