Message-ID: <8ba54068-0a0f-479e-e4d7-42bfb7f8ed55@fb.com>
Date: Tue, 7 Jan 2020 18:18:16 +0000
From: Yonghong Song <yhs@...com>
To: Brian Vazquez <brianvv.kernel@...il.com>
CC: Brian Vazquez <brianvv@...gle.com>,
Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
"David S . Miller" <davem@...emloft.net>,
Stanislav Fomichev <sdf@...gle.com>,
Petar Penkov <ppenkov@...gle.com>,
Willem de Bruijn <willemb@...gle.com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"bpf@...r.kernel.org" <bpf@...r.kernel.org>
Subject: Re: [PATCH v3 bpf-next 06/11] bpf: add batch ops to all htab bpf map
On 1/6/20 11:02 PM, Brian Vazquez wrote:
> On Fri, Dec 13, 2019 at 12:58 PM Yonghong Song <yhs@...com> wrote:
>>
>>
>>
>> On 12/11/19 2:33 PM, Brian Vazquez wrote:
>>> From: Yonghong Song <yhs@...com>
>>>
>>> htab can't use the generic batch support due to some problematic
>>> behaviours inherent to the data structure, i.e. while iterating the
>>> bpf map a concurrent program might delete the next entry that the
>>> batch was about to use; in that case there's no easy way to retrieve
>>> the next entry. The issue has been discussed multiple times (see [1]
>>> and [2]).
>>>
>>> The only way hmap can be traversed without the problem described
>>> above is by making sure that the map is traversed one whole bucket
>>> at a time. This commit implements that strict requirement for hmap;
>>> the implementation follows the same interaction as the generic
>>> support, with some exceptions:
>>>
>>> - If the keys/values buffers are not big enough to hold a whole
>>> bucket, ENOSPC will be returned.
>>> - out_batch contains the index of the next bucket in the iteration,
>>> not the next key, but this is transparent to the user since the user
>>> should never use out_batch for anything other than bpf batch
>>> syscalls.
>>>
>>> Note that only the lookup and lookup_and_delete batch ops require the
>>> hmap-specific implementation; the update/delete batch ops can use the
>>> generic ones.
>>>
>>> [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@google.com/
>>> [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@fb.com/
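
For context, the intended usage from userspace looks roughly like the
untested sketch below. BPF_MAP_LOOKUP_BATCH and the batch fields of
union bpf_attr come from the earlier patches in this series; the local
variable names are only for illustration.

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* keys must hold buf_entries * key_size bytes, values must hold
 * buf_entries * value_size bytes.
 */
static int dump_map(int map_fd, void *keys, void *values, __u32 buf_entries)
{
	union bpf_attr attr;
	__u32 batch;	/* opaque cursor; for htab it is the next bucket */
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.batch.map_fd = map_fd;
	attr.batch.keys = (__u64)(unsigned long)keys;
	attr.batch.values = (__u64)(unsigned long)values;
	attr.batch.out_batch = (__u64)(unsigned long)&batch;
	attr.batch.in_batch = 0;	/* NULL means: start from the beginning */

	for (;;) {
		attr.batch.count = buf_entries;	/* room in keys/values */
		err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr,
			      sizeof(attr));

		/* attr.batch.count now says how many entries were copied;
		 * consume them even when the call reports ENOENT.
		 */
		if (err && errno == ENOENT)
			return 0;	/* last bucket traversed, done */
		if (err)
			return -errno;	/* e.g. ENOSPC: buffers too small for a bucket */

		/* continue from the cursor the kernel wrote via out_batch */
		attr.batch.in_batch = (__u64)(unsigned long)&batch;
	}
}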
>>>
>>> Signed-off-by: Yonghong Song <yhs@...com>
>>> Signed-off-by: Brian Vazquez <brianvv@...gle.com>
>>> ---
>>> kernel/bpf/hashtab.c | 242 +++++++++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 242 insertions(+)
>>>
>>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>>> index 22066a62c8c97..fac107bdaf9ec 100644
>>> --- a/kernel/bpf/hashtab.c
>>> +++ b/kernel/bpf/hashtab.c
>>> @@ -17,6 +17,17 @@
>>> (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
>>> BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
>>>
>>> +#define BATCH_OPS(_name) \
>>> + .map_lookup_batch = \
>>> + _name##_map_lookup_batch, \
>>> + .map_lookup_and_delete_batch = \
>>> + _name##_map_lookup_and_delete_batch, \
>>> + .map_update_batch = \
>>> + generic_map_update_batch, \
>>> + .map_delete_batch = \
>>> + generic_map_delete_batch
>>> +
>>> +
>>> struct bucket {
>>> struct hlist_nulls_head head;
>>> raw_spinlock_t lock;
>>> @@ -1232,6 +1243,233 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
>>> rcu_read_unlock();
>>> }
>>>
>>> +static int
>>> +__htab_map_lookup_and_delete_batch(struct bpf_map *map,
>>> + const union bpf_attr *attr,
>>> + union bpf_attr __user *uattr,
>>> + bool do_delete, bool is_lru_map,
>>> + bool is_percpu)
>>> +{
>>> + struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
>>> + u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
>>> + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
>>> + void __user *uvalues = u64_to_user_ptr(attr->batch.values);
>>> + void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
>>> + void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
>>> + u64 elem_map_flags, map_flags;
>>> + struct hlist_nulls_head *head;
>>> + u32 batch, max_count, size;
>>> + struct hlist_nulls_node *n;
>>> + unsigned long flags;
>>> + struct htab_elem *l;
>>> + struct bucket *b;
>>> + int ret = 0;
>>> +
>>> + max_count = attr->batch.count;
>>> + if (!max_count)
>>> + return 0;
>>> +
>>> + elem_map_flags = attr->batch.elem_flags;
>>> + if ((elem_map_flags & ~BPF_F_LOCK) ||
>>> + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
>>> + return -EINVAL;
>>> +
>>> + map_flags = attr->batch.flags;
>>> + if (map_flags)
>>> + return -EINVAL;
>>> +
>>> + batch = 0;
>>> + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
>>> + return -EFAULT;
>>> +
>>> + if (batch >= htab->n_buckets)
>>> + return -ENOENT;
>>> +
>>> + /* We cannot do copy_from_user or copy_to_user inside
>>> + * the rcu_read_lock. Allocate enough space here.
>>> + */
>>> + key_size = htab->map.key_size;
>>> + roundup_key_size = round_up(htab->map.key_size, 8);
>>> + value_size = htab->map.value_size;
>>> + size = round_up(value_size, 8);
>>> + if (is_percpu)
>>> + value_size = size * num_possible_cpus();
>>> + keys = kvmalloc(key_size, GFP_USER | __GFP_NOWARN);
>>> + values = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
>>> + if (!keys || !values) {
>>> + ret = -ENOMEM;
>>> + goto out;
>>> + }
>>> +
>>> + dst_key = keys;
>>> + dst_val = values;
>>> + total = 0;
>>> +
>>> + preempt_disable();
>>> + this_cpu_inc(bpf_prog_active);
>>> + rcu_read_lock();
>>> +
>>> +again:
>>> + b = &htab->buckets[batch];
>>> + head = &b->head;
>>> + raw_spin_lock_irqsave(&b->lock, flags);
>>> +
>>> + bucket_cnt = 0;
>>> + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
>>> + bucket_cnt++;
>>> +
>>> + if (bucket_cnt > (max_count - total)) {
>>> + if (total == 0)
>>> + ret = -ENOSPC;
>>> + goto after_loop;
>>> + }
>>> +
>>> + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
>>> + memcpy(dst_key, l->key, key_size);
>>> +
>>> + if (is_percpu) {
>>> + int off = 0, cpu;
>>> + void __percpu *pptr;
>>> +
>>> + pptr = htab_elem_get_ptr(l, map->key_size);
>>> + for_each_possible_cpu(cpu) {
>>> + bpf_long_memcpy(dst_val + off,
>>> + per_cpu_ptr(pptr, cpu), size);
>>> + off += size;
>>> + }
>>> + } else {
>>> + value = l->key + roundup_key_size;
>>> + if (elem_map_flags & BPF_F_LOCK)
>>> + copy_map_value_locked(map, dst_val, value,
>>> + true);
>>> + else
>>> + copy_map_value(map, dst_val, value);
>>> + check_and_init_map_lock(map, dst_val);
>>> + }
>>> + if (do_delete) {
>>> + hlist_nulls_del_rcu(&l->hash_node);
>>> + if (is_lru_map)
>>> + bpf_lru_push_free(&htab->lru, &l->lru_node);
>>> + else
>>> + free_htab_elem(htab, l);
>>> + }
>>> + if (copy_to_user(ukeys + total * key_size, keys, key_size) ||
>>> + copy_to_user(uvalues + total * value_size, values,
>>> + value_size)) {
>>
>> We cannot do copy_to_user inside an atomic region where irqs are
>> disabled with raw_spin_lock_irqsave(). We could do the following:
>>   . we kmalloc memory before preempt_disable(), sized according to
>>     the current number of elements in the bucket.
>>   . inside the raw_spin_lock_irqsave() region, we copy into that
>>     kernel memory.
>>   . inside the raw_spin_lock_irqsave() region, if the bucket size
>>     has changed, we retry a few times with a larger allocation
>>     before giving up.
>> Do you think this may work?
>
> Yes, it does.
>
> What should be the initial size of the allocated memory,
> max_entries/2? Do you see any issue if we just kmalloc the entire
> buffer?
Allocating max_entries/2 or the entire buffer risks allocating too much
memory from the system, which may not be a good thing in a production
system. That is why I proposed allocating the memory at the bucket
level. For a reasonably balanced hash table, this should not cause
large memory pressure on the host. What do you think?
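
Roughly what I have in mind, for a single bucket (completely untested,
error handling and the percpu case omitted, variable names just for
illustration; b, l, n, flags, batch, key_size, value_size are the same
as in the patch):

	u32 bucket_cnt, alloc_cnt = 0;
	int retries = 3, ret = 0;
	void *keys = NULL, *values = NULL;

alloc:
	/* estimate the bucket occupancy outside of the atomic section */
	rcu_read_lock();
	b = &htab->buckets[batch];
	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, &b->head, hash_node)
		bucket_cnt++;
	rcu_read_unlock();

	if (bucket_cnt > alloc_cnt) {
		kvfree(keys);
		kvfree(values);
		keys = kvmalloc_array(bucket_cnt, key_size,
				      GFP_USER | __GFP_NOWARN);
		values = kvmalloc_array(bucket_cnt, value_size,
					GFP_USER | __GFP_NOWARN);
		if (!keys || !values) {
			ret = -ENOMEM;
			goto done;
		}
		alloc_cnt = bucket_cnt;
	}

	preempt_disable();
	this_cpu_inc(bpf_prog_active);
	rcu_read_lock();
	raw_spin_lock_irqsave(&b->lock, flags);

	/* re-count under the bucket lock: the bucket may have grown */
	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, &b->head, hash_node)
		bucket_cnt++;
	if (bucket_cnt > alloc_cnt) {
		raw_spin_unlock_irqrestore(&b->lock, flags);
		rcu_read_unlock();
		this_cpu_dec(bpf_prog_active);
		preempt_enable();
		if (retries--)
			goto alloc;	/* grow the buffers and try again */
		ret = -EBUSY;		/* or whatever error we settle on */
		goto done;
	}

	/* copy the whole bucket into keys/values (kernel memory only),
	 * and do the deletes if requested, as in the current patch
	 */

	raw_spin_unlock_irqrestore(&b->lock, flags);
	rcu_read_unlock();
	this_cpu_dec(bpf_prog_active);
	preempt_enable();

	/* only here, outside the atomic section, copy_to_user() this
	 * bucket's keys/values and advance to the next bucket
	 */
done:
	kvfree(keys);
	kvfree(values);
	return ret;

That keeps copy_to_user() out of the irq-disabled region and bounds
the allocation by the size of a single bucket.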
>
>>
>>> + ret = -EFAULT;
>>> + goto after_loop;
>>> + }
>>> + total++;
>>> + }
>>> +
>>> + batch++;
>>> + if (batch >= htab->n_buckets) {
>>> + ret = -ENOENT;
>>> + goto after_loop;
>>> + }
>>> +
>>> + raw_spin_unlock_irqrestore(&b->lock, flags);
>>> + goto again;
>>> +
>>> +after_loop:
>>> + raw_spin_unlock_irqrestore(&b->lock, flags);
>>> +
>>> + rcu_read_unlock();
>>> + this_cpu_dec(bpf_prog_active);
>>> + preempt_enable();
>>> +
>>> + if (ret && ret != -ENOENT)
>>> + goto out;
>>> +
>>> + /* copy data back to user */
>>> + ubatch = u64_to_user_ptr(attr->batch.out_batch);
>>> + if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
>>> + put_user(total, &uattr->batch.count))
>>> + ret = -EFAULT;
>>> +
>>> +out:
>>> + kvfree(keys);
>>> + kvfree(values);
>>> + return ret;
>>> +}
>>> +
>> [...]