Message-ID: <20190301170831.6ae29baa@cakuba.netronome.com>
Date: Fri, 1 Mar 2019 17:08:31 -0800
From: Jakub Kicinski <jakub.kicinski@...ronome.com>
To: Toke Høiland-Jørgensen <toke@...hat.com>
Cc: David Miller <davem@...emloft.net>, netdev@...r.kernel.org,
Jesper Dangaard Brouer <brouer@...hat.com>,
Daniel Borkmann <daniel@...earbox.net>,
Alexei Starovoitov <ast@...nel.org>
Subject: Re: [PATCH net-next v3 1/3] xdp: Refactor devmap code in
preparation for subsequent additions
On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
> The subsequent commits introducing default maps and a hash-based ifindex
> devmap require a bit of refactoring of the devmap code. Perform this first
> so the subsequent commits become easier to read.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 191b79948424..1037fc08c504 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -75,6 +75,7 @@ struct bpf_dtab {
> struct bpf_dtab_netdev **netdev_map;
> unsigned long __percpu *flush_needed;
> struct list_head list;
> + struct rcu_head rcu;
I think the RCU change may warrant a separate commit or at least a
mention in the commit message ;)
> };
>
> static DEFINE_SPINLOCK(dev_map_lock);
> @@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
> return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
> }
>
> -static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
> +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> + bool check_memlock)
> {
> - struct bpf_dtab *dtab;
> - int err = -EINVAL;
> u64 cost;
> -
> - if (!capable(CAP_NET_ADMIN))
> - return ERR_PTR(-EPERM);
> -
> - /* check sanity of attributes */
> - if (attr->max_entries == 0 || attr->key_size != 4 ||
> - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> - return ERR_PTR(-EINVAL);
Perhaps consider moving these checks to a map_alloc_check callback?
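Untested sketch of what I have in mind (dev_map_alloc_check is a
made-up name; it'd get wired up via .map_alloc_check in dev_map_ops so
the core runs it before map_alloc is even called):

static int dev_map_alloc_check(union bpf_attr *attr)
{
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
		return -EINVAL;

	return 0;
}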
> - dtab = kzalloc(sizeof(*dtab), GFP_USER);
> - if (!dtab)
> - return ERR_PTR(-ENOMEM);
> + int err;
>
> bpf_map_init_from_attr(&dtab->map, attr);
>
> @@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
> cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
> cost += dev_map_bitmap_size(attr) * num_possible_cpus();
> if (cost >= U32_MAX - PAGE_SIZE)
> - goto free_dtab;
> + return -EINVAL;
>
> dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>
> - /* if map size is larger than memlock limit, reject it early */
> - err = bpf_map_precharge_memlock(dtab->map.pages);
> - if (err)
> - goto free_dtab;
> -
> - err = -ENOMEM;
> + if (check_memlock) {
> + /* if map size is larger than memlock limit, reject it early */
> + err = bpf_map_precharge_memlock(dtab->map.pages);
> + if (err)
> + return -EINVAL;
> + }
>
> /* A per cpu bitfield with a bit per possible net device */
> dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
> __alignof__(unsigned long),
> GFP_KERNEL | __GFP_NOWARN);
> if (!dtab->flush_needed)
> - goto free_dtab;
> + goto err_alloc;
>
> dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
> sizeof(struct bpf_dtab_netdev *),
> dtab->map.numa_node);
> if (!dtab->netdev_map)
> - goto free_dtab;
> + goto err_map;
>
> - spin_lock(&dev_map_lock);
> - list_add_tail_rcu(&dtab->list, &dev_map_list);
> - spin_unlock(&dev_map_lock);
> + return 0;
>
> - return &dtab->map;
> -free_dtab:
> + err_map:
Error labels should be named after what the goto does, i.e. the first
action performed at the label, not after what failed at the jump site.
So the original naming was correct, and this change makes it less
good :) Also, why the leading space before the labels?
> free_percpu(dtab->flush_needed);
> - kfree(dtab);
> - return ERR_PTR(err);
> + err_alloc:
and there's no need for this one at all, the jump site can just return
-ENOMEM directly.
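I.e. something like this (untested sketch):

	if (!dtab->flush_needed)
		return -ENOMEM;

	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
					      sizeof(struct bpf_dtab_netdev *),
					      dtab->map.numa_node);
	if (!dtab->netdev_map)
		goto free_percpu;

	return 0;

free_percpu:
	free_percpu(dtab->flush_needed);
	return -ENOMEM;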
> + return -ENOMEM;
> }
>
> -static void dev_map_free(struct bpf_map *map)
> +static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
> {
> - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> - int i, cpu;
> + struct bpf_dtab *dtab;
> + int err;
>
> - /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> - * so the programs (can be more than one that used this map) were
> - * disconnected from events. Wait for outstanding critical sections in
> - * these programs to complete. The rcu critical section only guarantees
> - * no further reads against netdev_map. It does __not__ ensure pending
> - * flush operations (if any) are complete.
> - */
> + if (!capable(CAP_NET_ADMIN))
> + return ERR_PTR(-EPERM);
> +
> + /* check sanity of attributes */
> + if (attr->max_entries == 0 || attr->key_size != 4 ||
> + attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> + return ERR_PTR(-EINVAL);
> +
> + dtab = kzalloc(sizeof(*dtab), GFP_USER);
> + if (!dtab)
> + return ERR_PTR(-ENOMEM);
> +
> + err = dev_map_init_map(dtab, attr, true);
> + if (err) {
> + kfree(dtab);
> + return ERR_PTR(err);
> + }
>
> spin_lock(&dev_map_lock);
> - list_del_rcu(&dtab->list);
> + list_add_tail_rcu(&dtab->list, &dev_map_list);
> spin_unlock(&dev_map_lock);
>
> - bpf_clear_redirect_map(map);
> - synchronize_rcu();
> + return &dtab->map;
> +}
> +
> +static void __dev_map_free(struct rcu_head *rcu)
> +{
> + struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
> + int i, cpu;
>
> /* To ensure all pending flush operations have completed wait for flush
> * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
> @@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)
There is a cond_resched() here; I don't think you can call
cond_resched() from an RCU callback, those run from softirq context
where sleeping isn't allowed.
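If the wait loop has to stay, one option might be to bounce the actual
freeing from the RCU callback to a workqueue, so it runs in process
context where cond_resched() is fine. Untested sketch, with free_work
being a new field in struct bpf_dtab:

static void dev_map_free_work(struct work_struct *work)
{
	struct bpf_dtab *dtab = container_of(work, struct bpf_dtab,
					     free_work);

	/* process context: waiting for the per-cpu flush bitmaps to
	 * clear and calling cond_resched() is fine here
	 */

	/* ... existing wait + teardown body moves here ... */

	kfree(dtab);
}

static void __dev_map_free(struct rcu_head *rcu)
{
	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);

	INIT_WORK(&dtab->free_work, dev_map_free_work);
	schedule_work(&dtab->free_work);	/* safe from softirq */
}

Or just keeping the synchronize_rcu() in dev_map_free() may be simpler.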
> kfree(dtab);
> }
>
> +static void dev_map_free(struct bpf_map *map)
> +{
> + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +
> + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> + * so the programs (can be more than one that used this map) were
> + * disconnected from events. Wait for outstanding critical sections in
> + * these programs to complete. The rcu critical section only guarantees
> + * no further reads against netdev_map. It does __not__ ensure pending
> + * flush operations (if any) are complete.
> + */
> +
> + spin_lock(&dev_map_lock);
> + list_del_rcu(&dtab->list);
> + spin_unlock(&dev_map_lock);
> +
> + bpf_clear_redirect_map(map);
> + call_rcu(&dtab->rcu, __dev_map_free);
> +}
> +
> static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> {
> struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);