lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Mon, 04 Mar 2019 13:47:47 +0100
From:   Toke Høiland-Jørgensen <toke@...hat.com>
To:     Jakub Kicinski <jakub.kicinski@...ronome.com>
Cc:     David Miller <davem@...emloft.net>, netdev@...r.kernel.org,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Daniel Borkmann <daniel@...earbox.net>,
        Alexei Starovoitov <ast@...nel.org>
Subject: Re: [PATCH net-next v3 1/3] xdp: Refactor devmap code in preparation for subsequent additions

Jakub Kicinski <jakub.kicinski@...ronome.com> writes:

> On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
>> The subsequent commits introducing default maps and a hash-based ifindex
>> devmap require a bit of refactoring of the devmap code. Perform this first
>> so the subsequent commits become easier to read.
>> 
>> Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>
>
>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>> index 191b79948424..1037fc08c504 100644
>> --- a/kernel/bpf/devmap.c
>> +++ b/kernel/bpf/devmap.c
>> @@ -75,6 +75,7 @@ struct bpf_dtab {
>>  	struct bpf_dtab_netdev **netdev_map;
>>  	unsigned long __percpu *flush_needed;
>>  	struct list_head list;
>> +	struct rcu_head rcu;
>
> I think the RCU change may warrant a separate commit or at least a
> mention in the commit message ;)

Heh, fair point.

>>  };
>>  
>>  static DEFINE_SPINLOCK(dev_map_lock);
>> @@ -85,23 +86,11 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>>  	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
>>  }
>>  
>> -static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>> +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
>> +			    bool check_memlock)
>>  {
>> -	struct bpf_dtab *dtab;
>> -	int err = -EINVAL;
>>  	u64 cost;
>> -
>> -	if (!capable(CAP_NET_ADMIN))
>> -		return ERR_PTR(-EPERM);
>> -
>> -	/* check sanity of attributes */
>> -	if (attr->max_entries == 0 || attr->key_size != 4 ||
>> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>> -		return ERR_PTR(-EINVAL);
>
> perhaps consider moving these to a map_alloc_check callback?

The reason I kept them here is that these checks are only needed
strictly when the parameters come from userspace. In the other
allocation paths, the attrs are hard-coded to valid values, so the only
thing having the check would do is guard against future changes to one
part of the file being out of sync with the other part. I can certainly
add the check, it just seemed a bit excessive...

>> -	dtab = kzalloc(sizeof(*dtab), GFP_USER);
>> -	if (!dtab)
>> -		return ERR_PTR(-ENOMEM);
>> +	int err;
>>  
>>  	bpf_map_init_from_attr(&dtab->map, attr);
>>  
>> @@ -109,60 +98,72 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>>  	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
>>  	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
>>  	if (cost >= U32_MAX - PAGE_SIZE)
>> -		goto free_dtab;
>> +		return -EINVAL;
>>  
>>  	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
>>  
>> -	/* if map size is larger than memlock limit, reject it early */
>> -	err = bpf_map_precharge_memlock(dtab->map.pages);
>> -	if (err)
>> -		goto free_dtab;
>> -
>> -	err = -ENOMEM;
>> +	if (check_memlock) {
>> +		/* if map size is larger than memlock limit, reject it early */
>> +		err = bpf_map_precharge_memlock(dtab->map.pages);
>> +		if (err)
>> +			return -EINVAL;
>> +	}
>>  
>>  	/* A per cpu bitfield with a bit per possible net device */
>>  	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
>>  						__alignof__(unsigned long),
>>  						GFP_KERNEL | __GFP_NOWARN);
>>  	if (!dtab->flush_needed)
>> -		goto free_dtab;
>> +		goto err_alloc;
>>  
>>  	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
>>  					      sizeof(struct bpf_dtab_netdev *),
>>  					      dtab->map.numa_node);
>>  	if (!dtab->netdev_map)
>> -		goto free_dtab;
>> +		goto err_map;
>>  
>> -	spin_lock(&dev_map_lock);
>> -	list_add_tail_rcu(&dtab->list, &dev_map_list);
>> -	spin_unlock(&dev_map_lock);
>> +	return 0;
>>  
>> -	return &dtab->map;
>> -free_dtab:
>> + err_map:
>
> The label should name the first action.  So it was correct, you're
> making it less good :)

Heh, yeah, guess you're right.

> Also why the space?

Because the mighty GNU ordained it (i.e., Emacs added those). Will fix.

>>  	free_percpu(dtab->flush_needed);
>> -	kfree(dtab);
>> -	return ERR_PTR(err);
>> + err_alloc:
>
> and no need for this one

Right, guess I can just rely on the NULL check in the free functions;
would make things simpler in patch 3 as well :)

>> +	return -ENOMEM;
>>  }
>>  
>> -static void dev_map_free(struct bpf_map *map)
>> +static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>>  {
>> -	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>> -	int i, cpu;
>> +	struct bpf_dtab *dtab;
>> +	int err;
>>  
>> -	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
>> -	 * so the programs (can be more than one that used this map) were
>> -	 * disconnected from events. Wait for outstanding critical sections in
>> -	 * these programs to complete. The rcu critical section only guarantees
>> -	 * no further reads against netdev_map. It does __not__ ensure pending
>> -	 * flush operations (if any) are complete.
>> -	 */
>> +	if (!capable(CAP_NET_ADMIN))
>> +		return ERR_PTR(-EPERM);
>> +
>> +	/* check sanity of attributes */
>> +	if (attr->max_entries == 0 || attr->key_size != 4 ||
>> +	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>> +		return ERR_PTR(-EINVAL);
>> +
>> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);
>> +	if (!dtab)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	err = dev_map_init_map(dtab, attr, true);
>> +	if (err) {
>> +		kfree(dtab);
>> +		return ERR_PTR(err);
>> +	}
>>  
>>  	spin_lock(&dev_map_lock);
>> -	list_del_rcu(&dtab->list);
>> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
>>  	spin_unlock(&dev_map_lock);
>>  
>> -	bpf_clear_redirect_map(map);
>> -	synchronize_rcu();
>> +	return &dtab->map;
>> +}
>> +
>> +static void __dev_map_free(struct rcu_head *rcu)
>> +{
>> +	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);
>> +	int i, cpu;
>>  
>>  	/* To ensure all pending flush operations have completed wait for flush
>>  	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
>> @@ -192,6 +193,26 @@ static void dev_map_free(struct bpf_map *map)
>
> There is a cond_resched() here, I don't think you can call
> cond_resched() from an RCU callback.

Hmm, bugger. I thought I could just use call_rcu() as semantically
equivalent (but slightly slower) to just calling synchronize_rcu()
inside the function.

In an earlier version I had a namespace enter/exit notifier in devmap.c
as well, to react to new namespaces. And that notifier has a comment
about avoiding calls to synchronize_rcu(). But since this version
doesn't actually need that, maybe I can just keep using direct calls and
synchronize_rcu() and avoid the callback? I'm a bit worried about adding
both synchronize_rcu() and cond_resched() as a possible side effect of
every call to bpf_prog_put(), though; so maybe it's better to move the
cleanup somewhere it's actually safe to call cond_resched(); what would
that be, a workqueue?

-Toke

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ