lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190301170831.6ae29baa@cakuba.netronome.com>
Date:   Fri, 1 Mar 2019 17:08:31 -0800
From:   Jakub Kicinski <jakub.kicinski@...ronome.com>
To:     Toke Høiland-Jørgensen <toke@...hat.com>
Cc:     David Miller <davem@...emloft.net>, netdev@...r.kernel.org,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Daniel Borkmann <daniel@...earbox.net>,
        Alexei Starovoitov <ast@...nel.org>
Subject: Re: [PATCH net-next v3 1/3] xdp: Refactor devmap code in
 preparation for subsequent additions

On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
> The subsequent commits introducing default maps and a hash-based ifindex
> devmap require a bit of refactoring of the devmap code. Perform this first
> so the subsequent commits become easier to read.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>

> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 191b79948424..1037fc08c504 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -75,6 +75,7 @@ struct bpf_dtab {
>  	struct bpf_dtab_netdev **netdev_map;
>  	unsigned long __percpu *flush_needed;
>  	struct list_head list;
> +	struct rcu_head rcu;

I think the RCU change may warrant a separate commit or at least a
mention in the commit message ;)

>  };
>  
>  static DEFINE_SPINLOCK(dev_map_lock);
> @@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>  	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
>  }
>  
> -static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
> +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
> +			    bool check_memlock)
>  {
> -	struct bpf_dtab *dtab;
> -	int err = -EINVAL;
>  	u64 cost;
> -
> -	if (!capable(CAP_NET_ADMIN))
> -		return ERR_PTR(-EPERM);
> -
> -	/* check sanity of attributes */
> -	if (attr->max_entries == 0 || attr->key_size != 4 ||
> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> -		return ERR_PTR(-EINVAL);

perhaps consider moving these to a map_alloc_check callback?

> -	dtab = kzalloc(sizeof(*dtab), GFP_USER);
> -	if (!dtab)
> -		return ERR_PTR(-ENOMEM);
> +	int err;
>  
>  	bpf_map_init_from_attr(&dtab->map, attr);
>  
> @@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>  	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
>  	if (cost >= U32_MAX - PAGE_SIZE)
> -		goto free_dtab;
> +		return -EINVAL;
>  
>  	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>  
> -	/* if map size is larger than memlock limit, reject it early */
> -	err = bpf_map_precharge_memlock(dtab->map.pages);
> -	if (err)
> -		goto free_dtab;
> -
> -	err = -ENOMEM;
> +	if (check_memlock) {
> +		/* if map size is larger than memlock limit, reject it early */
> +		err = bpf_map_precharge_memlock(dtab->map.pages);
> +		if (err)
> +			return -EINVAL;
> +	}
>  
>  	/* A per cpu bitfield with a bit per possible net device */
>  	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
>  						__alignof__(unsigned long),
>  						GFP_KERNEL | __GFP_NOWARN);
>  	if (!dtab->flush_needed)
> -		goto free_dtab;
> +		goto err_alloc;
>  
>  	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
>  					      sizeof(struct bpf_dtab_netdev *),
>  					      dtab->map.numa_node);
>  	if (!dtab->netdev_map)
> -		goto free_dtab;
> +		goto err_map;
>  
> -	spin_lock(&dev_map_lock);
> -	list_add_tail_rcu(&dtab->list, &dev_map_list);
> -	spin_unlock(&dev_map_lock);
> +	return 0;
>  
> -	return &dtab->map;
> -free_dtab:
> + err_map:

The label should name the first action.  So it was correct, you're
making it less good :)  Also why the space?

>  	free_percpu(dtab->flush_needed);
> -	kfree(dtab);
> -	return ERR_PTR(err);
> + err_alloc:

and no need for this one

> +	return -ENOMEM;
>  }
>  
> -static void dev_map_free(struct bpf_map *map)
> +static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  {
> -	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> -	int i, cpu;
> +	struct bpf_dtab *dtab;
> +	int err;
>  
> -	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> -	 * so the programs (can be more than one that used this map) were
> -	 * disconnected from events. Wait for outstanding critical sections in
> -	 * these programs to complete. The rcu critical section only guarantees
> -	 * no further reads against netdev_map. It does __not__ ensure pending
> -	 * flush operations (if any) are complete.
> -	 */
> +	if (!capable(CAP_NET_ADMIN))
> +		return ERR_PTR(-EPERM);
> +
> +	/* check sanity of attributes */
> +	if (attr->max_entries == 0 || attr->key_size != 4 ||
> +	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> +		return ERR_PTR(-EINVAL);
> +
> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);
> +	if (!dtab)
> +		return ERR_PTR(-ENOMEM);
> +
> +	err = dev_map_init_map(dtab, attr, true);
> +	if (err) {
> +		kfree(dtab);
> +		return ERR_PTR(err);
> +	}
>  
>  	spin_lock(&dev_map_lock);
> -	list_del_rcu(&dtab->list);
> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
>  	spin_unlock(&dev_map_lock);
>  
> -	bpf_clear_redirect_map(map);
> -	synchronize_rcu();
> +	return &dtab->map;
> +}
> +
> +static void __dev_map_free(struct rcu_head *rcu)
> +{
> +	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
> +	int i, cpu;
>  
>  	/* To ensure all pending flush operations have completed wait for flush
>  	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
> @@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)

There is a cond_resched() here, I don't think you can call
cond_resched() from an RCU callback.

>  	kfree(dtab);
>  }
>  
> +static void dev_map_free(struct bpf_map *map)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +
> +	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
> +	 * so the programs (can be more than one that used this map) were
> +	 * disconnected from events. Wait for outstanding critical sections in
> +	 * these programs to complete. The rcu critical section only guarantees
> +	 * no further reads against netdev_map. It does __not__ ensure pending
> +	 * flush operations (if any) are complete.
> +	 */
> +
> +	spin_lock(&dev_map_lock);
> +	list_del_rcu(&dtab->list);
> +	spin_unlock(&dev_map_lock);
> +
> +	bpf_clear_redirect_map(map);
> +	call_rcu(&dtab->rcu, __dev_map_free);
> +}
> +
>  static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>  {
>  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ