[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190119105654.1ed0e7ba@cakuba.netronome.com>
Date: Sat, 19 Jan 2019 10:56:54 -0800
From: Jakub Kicinski <jakub.kicinski@...ronome.com>
To: Alexei Starovoitov <ast@...nel.org>
Cc: <davem@...emloft.net>, <daniel@...earbox.net>,
<netdev@...r.kernel.org>, <kernel-team@...com>
Subject: Re: [PATCH v2 bpf-next 6/9] bpf: introduce BPF_F_LOCK flag
On Thu, 17 Jan 2019 22:50:53 -0800, Alexei Starovoitov wrote:
> Introduce BPF_F_LOCK flag for map_lookup and map_update syscall commands
> and for map_update() helper function.
> In all these cases take a lock of existing element (which was provided
> in BTF description) before copying (in or out) the rest of map value.
>
> Implementation details that are part of uapi:
>
> Array:
> The array map takes the element lock for lookup/update.
>
> Hash:
> hash map also takes the lock for lookup/update and tries to avoid the bucket lock.
> If old element exists it takes the element lock and updates the element in place.
> If element doesn't exist it allocates new one and inserts into hash table
> while holding the bucket lock.
> In rare case the hashmap has to take both the bucket lock and the element lock
> to update old value in place.
>
> Cgroup local storage:
> It is similar to array. update in place and lookup are done with lock taken.
>
> Signed-off-by: Alexei Starovoitov <ast@...nel.org>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 48a41bf65e1b..6397b12c130e 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -809,11 +809,11 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
> static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
> u64 map_flags)
> {
> - if (l_old && map_flags == BPF_NOEXIST)
> + if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
> /* elem already exists */
> return -EEXIST;
>
> - if (!l_old && map_flags == BPF_EXIST)
> + if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
> /* elem doesn't exist, cannot update it */
> return -ENOENT;
>
> @@ -832,7 +832,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> u32 key_size, hash;
> int ret;
>
> - if (unlikely(map_flags > BPF_EXIST))
> + if (unlikely(map_flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
> /* unknown flags */
> return -EINVAL;
Perhaps we should check for:
	hweight(map_flags & (BPF_EXIST | BPF_NOEXIST)) < 2
right now this > BPF_EXIST is a little hacky, as it depends on the fact
that BPF_EXIST is 2. If I read the code correctly, BPF_EXIST |
BPF_NOEXIST will be treated as BPF_NOEXIST. Not sure that matters.
> @@ -845,6 +845,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> b = __select_bucket(htab, hash);
> head = &b->head;
>
> + if (unlikely(map_flags & BPF_F_LOCK)) {
> + if (unlikely(!map_value_has_spin_lock(map)))
> + return -EINVAL;
> + /* find an element without taking the bucket lock */
> + l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
> + htab->n_buckets);
> + ret = check_flags(htab, l_old, map_flags);
> + if (ret)
> + return ret;
> + if (l_old) {
> + /* grab the element lock and update value in place */
> + copy_map_value_locked(map,
> + l_old->key + round_up(key_size, 8),
> + value, false);
> + return 0;
> + }
> + /* fall through, grab the bucket lock and lookup again.
> + * 99.9% chance that the element won't be found,
> + * but second lookup under lock has to be done.
> + */
> + }
> +
> /* bpf_map_update_elem() can be called in_irq() */
> raw_spin_lock_irqsave(&b->lock, flags);
>
> @@ -854,6 +876,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> if (ret)
> goto err;
>
> + if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
> + /* first lookup without the bucket lock didn't find the element,
> + * but second lookup with the bucket lock found it.
> + * This case is highly unlikely, but has to be dealt with:
> + * grab the element lock in addition to the bucket lock
> + * and update element in place
> + */
> + copy_map_value_locked(map,
> + l_old->key + round_up(key_size, 8),
> + value, false);
> + ret = 0;
> + goto err;
> + }
> +
> l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
> l_old);
> if (IS_ERR(l_new)) {
> diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> index 0295427f06e2..6b572e2de7fb 100644
> --- a/kernel/bpf/local_storage.c
> +++ b/kernel/bpf/local_storage.c
> @@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
> struct bpf_cgroup_storage *storage;
> struct bpf_storage_buffer *new;
>
> - if (flags != BPF_ANY && flags != BPF_EXIST)
> + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
> + return -EINVAL;
> +
> + if (unlikely(flags & BPF_NOEXIST))
> + return -EINVAL;
nit: for arrays we have a different return code with BPF_NOEXIST, but
here it's EINVAL in both cases, so perhaps just drop BPF_NOEXIST from
the mask?
> + if (unlikely((flags & BPF_F_LOCK) &&
> + !map_value_has_spin_lock(map)))
> return -EINVAL;
>
> storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
> @@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
> if (!storage)
> return -ENOENT;
>
> + if (flags & BPF_F_LOCK) {
> + copy_map_value_locked(map, storage->buf->data, value, false);
> + return 0;
> + }
> +
> new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
> map->value_size,
> __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index b29e6dc44650..ed55f03ba586 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -682,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
> }
>
> /* last field in 'union bpf_attr' used by this command */
> -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
> +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
>
> static int map_lookup_elem(union bpf_attr *attr)
> {
> @@ -698,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr)
> if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
> return -EINVAL;
>
> + if (attr->flags & ~BPF_F_LOCK)
> + return -EINVAL;
> +
> f = fdget(ufd);
> map = __bpf_map_get(f);
> if (IS_ERR(map))
> @@ -708,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr)
> goto err_put;
> }
>
> + if ((attr->flags & BPF_F_LOCK) &&
> + !map_value_has_spin_lock(map)) {
> + err = -EINVAL;
> + goto err_put;
> + }
> +
> key = __bpf_copy_key(ukey, map->key_size);
> if (IS_ERR(key)) {
> err = PTR_ERR(key);
> @@ -758,7 +767,16 @@ static int map_lookup_elem(union bpf_attr *attr)
> err = -ENOENT;
> } else {
> err = 0;
> - copy_map_value(map, value, ptr);
> + if (attr->flags & BPF_F_LOCK) {
> + /* lock 'ptr' elem and copy
> + * everything but the lock
> + */
> + copy_map_value_locked(map, value, ptr, true);
> + /* mask lock, since value was kmalloced */
> + check_and_init_map_lock(map, value);
> + } else {
> + copy_map_value(map, value, ptr);
> + }
> }
> rcu_read_unlock();
> }
> @@ -818,6 +836,12 @@ static int map_update_elem(union bpf_attr *attr)
> goto err_put;
> }
>
> + if ((attr->flags & BPF_F_LOCK) &&
> + !map_value_has_spin_lock(map)) {
nit: do we need this check in syscall.c for update? Just wondering.
> + err = -EINVAL;
> + goto err_put;
> + }
> +
> key = __bpf_copy_key(ukey, map->key_size);
> if (IS_ERR(key)) {
> err = PTR_ERR(key);
Powered by blists - more mailing lists