Message-ID: <877edelukk.fsf@toke.dk>
Date:   Mon, 04 Mar 2019 12:58:51 +0100
From:   Toke Høiland-Jørgensen <toke@...hat.com>
To:     Jakub Kicinski <jakub.kicinski@...ronome.com>
Cc:     David Miller <davem@...emloft.net>, netdev@...r.kernel.org,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Daniel Borkmann <daniel@...earbox.net>,
        Alexei Starovoitov <ast@...nel.org>
Subject: Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP_REDIRECT to a device

Hi Jakub,

Thanks for the review! A few comments/questions below.

> On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
>> An XDP program can redirect packets between interfaces using either the
>> xdp_redirect() helper or the xdp_redirect_map() helper. Apart from the
>> flexibility of updating maps from userspace, the redirect_map() helper also
>> uses the map structure to batch packets, which results in a significant
>> (around 50%) performance boost. However, the xdp_redirect() API is simpler
>> if one just wants to redirect to another interface, which means people tend
>> to use this interface and then wonder why they get worse performance
>> than expected.
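
(As an aside, for anyone skimming along: the difference on the program
side is just which helper gets called. A minimal sketch, samples/bpf
style; the map definition, section names and the ifindex value 3 are
purely illustrative:

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* devmap that userspace has to keep populated for the _map variant */
struct bpf_map_def SEC("maps") tx_devmap = {
	.type        = BPF_MAP_TYPE_DEVMAP,
	.key_size    = sizeof(__u32),
	.value_size  = sizeof(__u32),
	.max_entries = 64,
};

SEC("xdp")
int redir_plain(struct xdp_md *ctx)
{
	return bpf_redirect(3, 0);             /* simple, but unbatched today */
}

SEC("xdp")
int redir_map(struct xdp_md *ctx)
{
	return bpf_redirect_map(&tx_devmap, 3, 0); /* batched via the map */
}

The map variant needs userspace to keep tx_devmap up to date, which is
the flexibility/complexity trade-off described above.)
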
>> 
>> This patch seeks to close this performance difference between the two APIs.
>> It achieves this by changing xdp_redirect() to use a hidden devmap for
>> looking up destination interfaces, thus gaining the batching benefit with
>> no visible difference from the user API point of view.
>> 
>> A hidden per-namespace map is allocated when an XDP program that uses the
>> non-map xdp_redirect() helper is first loaded. This map is populated with
>> all available interfaces in its namespace, and kept up to date as
>> interfaces come and go. Once allocated, the map is kept around until the
>> namespace is removed.
>> 
>> The hidden map uses the ifindex as map key, which means it is limited to
>> ifindexes smaller than the map size of 64. A later patch introduces a new
>> map type to lift this restriction.
>> 
>> Performance numbers:
>> 
>> Before patch:
>> xdp_redirect:     5426035 pkt/s
>> xdp_redirect_map: 8412754 pkt/s
>> 
>> After patch:
>> xdp_redirect:     8314702 pkt/s
>> xdp_redirect_map: 8411854 pkt/s
>> 
>> This corresponds to a 53% increase in xdp_redirect performance, or a
>> reduction in per-packet processing time by 64 nanoseconds.
>> 
>> Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>
>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index de18227b3d95..060c5850af3b 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -25,6 +25,7 @@ struct sock;
>>  struct seq_file;
>>  struct btf;
>>  struct btf_type;
>> +struct net;
>>  
>>  /* map is generic key/value storage optionally accesible by eBPF programs */
>>  struct bpf_map_ops {
>> @@ -533,6 +534,7 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
>>  extern const struct bpf_verifier_ops xdp_analyzer_ops;
>>  
>>  struct bpf_prog *bpf_prog_get(u32 ufd);
>> +struct bpf_prog *bpf_prog_get_by_id(u32 id);
>>  struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
>>  				       bool attach_drv);
>>  struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
>> @@ -612,6 +614,11 @@ struct xdp_buff;
>>  struct sk_buff;
>>  
>>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
>> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
>> +int dev_map_ensure_default_map(struct net *net);
>> +void dev_map_put_default_map(struct net *net);
>> +int dev_map_inc_redirect_count(void);
>> +void dev_map_dec_redirect_count(void);
>>  void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
>>  void __dev_map_flush(struct bpf_map *map);
>>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>> @@ -641,6 +648,11 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
>>  	return ERR_PTR(-EOPNOTSUPP);
>>  }
>>  
>> +static inline struct bpf_prog *bpf_prog_get_by_id(u32 id)
>> +{
>> +	return ERR_PTR(-EOPNOTSUPP);
>> +}
>> +
>>  static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
>>  						     enum bpf_prog_type type,
>>  						     bool attach_drv)
>> @@ -693,6 +705,29 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
>>  	return NULL;
>>  }
>>  
>> +static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
>> +{
>> +	return NULL;
>> +}
>> +
>> +static inline int dev_map_ensure_default_map(struct net *net)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline void dev_map_put_default_map(struct net *net)
>> +{
>> +}
>> +
>> +static inline int dev_map_inc_redirect_count(void)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline void dev_map_dec_redirect_count(void)
>> +{
>> +}
>> +
>>  static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
>>  {
>>  }
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 95e2d7ebdf21..dd6bbbab32f7 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -507,6 +507,8 @@ struct bpf_prog {
>>  				gpl_compatible:1, /* Is filter GPL compatible? */
>>  				cb_access:1,	/* Is control block accessed? */
>>  				dst_needed:1,	/* Do we need dst entry? */
>> +				redirect_needed:1,	/* Does program need access to xdp_redirect? */
>> +				redirect_used:1,	/* Does program use xdp_redirect? */
>>  				blinded:1,	/* Was blinded */
>>  				is_func:1,	/* program is a bpf function */
>>  				kprobe_override:1, /* Do we override a kprobe? */
>> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
>> index a68ced28d8f4..6706ecc25d8f 100644
>> --- a/include/net/net_namespace.h
>> +++ b/include/net/net_namespace.h
>> @@ -162,7 +162,7 @@ struct net {
>>  #if IS_ENABLED(CONFIG_CAN)
>>  	struct netns_can	can;
>>  #endif
>> -#ifdef CONFIG_XDP_SOCKETS
>> +#ifdef CONFIG_BPF_SYSCALL
>>  	struct netns_xdp	xdp;
>>  #endif
>>  	struct sock		*diag_nlsk;
>> diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
>> index e5734261ba0a..4935dfe1cf43 100644
>> --- a/include/net/netns/xdp.h
>> +++ b/include/net/netns/xdp.h
>> @@ -4,10 +4,21 @@
>>  
>>  #include <linux/rculist.h>
>>  #include <linux/mutex.h>
>> +#include <linux/atomic.h>
>> +
>> +struct bpf_dtab;
>> +
>> +struct bpf_dtab_container {
>> +	struct bpf_dtab __rcu *dtab;
>> +	atomic_t refcnt;
>
> refcount_t?  I'm not sure what the rules for non-performance-critical
> stuff are..

I based the use of atomic_t on what bpf/syscall.c does. The problem with
using refcount_t is that this would be a slightly weird refcount: it's
not strictly tied to the object lifetime, since the objects are only
allocated while *both* the global and the per-namespace refcnts are >0.
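
To make the lifetime rules concrete, the intent is roughly this
(condensed sketch, not the actual patch code; locking, error handling
and the allocation itself are elided, and maybe_alloc_default_map() is a
made-up name for illustration):

static atomic_t global_redirect_use;   /* loaded progs using redirect */

struct bpf_dtab_container {
	struct bpf_dtab __rcu *dtab;
	atomic_t refcnt;               /* attached progs in this netns */
};

/* The dtab only exists while *both* counts are positive, so neither
 * count maps 1:1 onto the object lifetime the way refcount_t assumes.
 */
static void maybe_alloc_default_map(struct bpf_dtab_container *cont)
{
	if (atomic_read(&global_redirect_use) > 0 &&
	    atomic_read(&cont->refcnt) > 0 &&
	    !rcu_access_pointer(cont->dtab)) {
		/* allocate and rcu_assign_pointer() the hidden devmap */
	}
}
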


[... snip things that I'll just fix ...]
>
>> +static void __dev_map_release_default_map(struct bpf_dtab_container *cont)
>> +{
>> +	struct bpf_dtab *dtab = NULL;
>> +
>> +	lockdep_assert_held(&dev_map_lock);
>> +
>> +	dtab = rcu_dereference(cont->dtab);
>> +	if (dtab) {
>
> How can it be NULL if it's refcounted? hm..

See above; it's not actually the object that is refcounted. The count
tracks the number of active XDP programs in the namespace, and it has to
be kept even while no map is allocated, so we know to allocate the
devmap when a REDIRECT program is loaded.

>> +		list_del_rcu(&dtab->list);
>> +		rcu_assign_pointer(cont->dtab, NULL);
>> +		bpf_clear_redirect_map(&dtab->map);
>> +		call_rcu(&dtab->rcu, __dev_map_free);
>> +	}
>> +}
>> +
>> +void dev_map_put_default_map(struct net *net)
>> +{
>> +	if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) {
>> +		spin_lock(&dev_map_lock);
>> +		__dev_map_release_default_map(&net->xdp.default_map);
>> +		spin_unlock(&dev_map_lock);
>> +	}
>> +}
>> +
>> +static int __init_default_map(struct bpf_dtab_container *cont)
>> +{
>> +	struct net *net = bpf_default_map_to_net(cont);
>> +	struct bpf_dtab *dtab, *old_dtab;
>> +	int size = DEV_MAP_DEFAULT_SIZE;
>> +	struct net_device *netdev;
>> +	union bpf_attr attr = {};
>> +	u32 idx;
>> +	int err;
>> +
>> +	lockdep_assert_held(&dev_map_lock);
>> +
>> +	if (!atomic_read(&global_redirect_use))
>> +		return 0;
>> +
>> +	for_each_netdev(net, netdev)
>> +		if (netdev->ifindex >= size)
>> +			size <<= 1;
>> +
>> +	old_dtab = rcu_dereference(cont->dtab);
>> +	if (old_dtab && old_dtab->map.max_entries == size)
>> +		return 0;
>> +
>> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);
>
> You are calling this with a spin lock held :(

Yeah, and not just this one; there are more allocations in
dev_map_init_map(). I guess I need to think the concurrency bits through
some more; it's made a bit tricky by the refcounts living outside the
objects they protect.

Basically, what I'm trying to protect against is the case where the
global refcount goes to zero and then immediately back to one (or vice
versa), which results in a deallocation immediately followed by an
allocation. I was worried about the allocation and deallocation racing
with each other, because there's a window between the refcnt check and
the point where the (de)allocation actually begins. It's not quite clear
to me whether RCU is enough to protect against this... Thoughts?
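
Concretely, the interleaving I'm worried about is something like:

/* CPU0 (last prog detached)        CPU1 (new prog attached)
 *
 * atomic_dec_and_test() == true
 *                                  atomic_inc(): refcnt is 1 again
 *                                  sees cont->dtab != NULL,
 *                                  so skips the allocation
 * spin_lock(&dev_map_lock)
 * frees dtab, NULLs the pointer
 * spin_unlock(&dev_map_lock)
 *                                  -> namespace left without a map
 */
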

>> +	if (!dtab)
>> +		return -ENOMEM;
>> +
>> +	attr.map_type = BPF_MAP_TYPE_DEVMAP;
>> +	attr.max_entries = size;
>> +	attr.value_size = 4;
>> +	attr.key_size = 4;
>> +
>> +	err = dev_map_init_map(dtab, &attr, false);
>> +	if (err) {
>> +		kfree(dtab);
>> +		return err;
>> +	}
>> +
>> +	for_each_netdev(net, netdev) {
>> +		idx = netdev->ifindex;
>> +		err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0);
>> +		if (err) {
>> +			__dev_map_free(&dtab->rcu);
>
> This map wasn't visible yet, so probably no need to go through RCU? 
> Not that it matters much for an error path..

It's not actually going through call_rcu() on this path; it's just
calling a function that takes a struct rcu_head as its parameter, to
avoid another wrapper function :)
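
I.e., __dev_map_free() is written against the rcu_head embedded in the
dtab, so the same function can serve as the call_rcu() callback and be
called directly on error paths. Roughly (simplified sketch; the real
function tears down more state):

static void __dev_map_free(struct rcu_head *rcu)
{
	struct bpf_dtab *dtab = container_of(rcu, struct bpf_dtab, rcu);

	/* ... free the map's internal state ... */
	kfree(dtab);
}
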

>> +			return err;
>> +		}
>> +	}
>> +
>> +	rcu_assign_pointer(cont->dtab, dtab);
>> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
>> +
>> +	if (old_dtab) {
>> +		list_del_rcu(&old_dtab->list);
>> +		bpf_clear_redirect_map(&old_dtab->map);
>> +		call_rcu(&old_dtab->rcu, __dev_map_free);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 1b9496c41383..f1b2f01e7ca1 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -7609,6 +7609,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>>  			prog->dst_needed = 1;
>>  		if (insn->imm == BPF_FUNC_get_prandom_u32)
>>  			bpf_user_rnd_init_once();
>> +		if (insn->imm == BPF_FUNC_redirect) {
>> +			prog->redirect_needed = true;
>
> nit: this field is a bitfield not boolean
>
>> +			if (!prog->redirect_used) {
>> +				int err = dev_map_inc_redirect_count();
>
> Don't call complex functions as variable init, please
>
>> +				if (err)
>> +					return err;
>> +				prog->redirect_used = true;
>> +			}
>> +		}
>> +
>>  		if (insn->imm == BPF_FUNC_override_return)
>>  			prog->kprobe_override = 1;
>>  		if (insn->imm == BPF_FUNC_tail_call) {
>> @@ -7618,6 +7629,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>>  			 * the program array.
>>  			 */
>>  			prog->cb_access = 1;
>> +			prog->redirect_needed = true;
>>  			env->prog->aux->stack_depth = MAX_BPF_STACK;
>>  			env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
>>  
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 2b67f2aa59dd..1df20d529026 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -7990,6 +7990,21 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
>>  	return xdp.prog_id;
>>  }
>>  
>> +static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev)
>> +{
>> +	struct bpf_prog *prog;
>> +	u32 prog_id;
>> +
>> +	prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
>
> if (!prog_id)
> 	return NULL;
>
> ...
>
>> +	if (prog_id) {
>> +		prog = bpf_prog_get_by_id(prog_id);
>> +		if (!IS_ERR(prog))
>> +			return prog;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>>  static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
>>  			   struct netlink_ext_ack *extack, u32 flags,
>>  			   struct bpf_prog *prog)
>> @@ -8024,9 +8039,18 @@ static void dev_xdp_uninstall(struct net_device *dev)
>>  	memset(&xdp, 0, sizeof(xdp));
>>  	xdp.command = XDP_QUERY_PROG;
>>  	WARN_ON(ndo_bpf(dev, &xdp));
>> -	if (xdp.prog_id)
>> +	if (xdp.prog_id) {
>> +		struct bpf_prog *prog = bpf_prog_get_by_id(xdp.prog_id);
>> +
>> +		if (!IS_ERR(prog)) {
>> +			if (prog->redirect_needed)
>> +				dev_map_put_default_map(dev_net(dev));
>> +			bpf_prog_put(prog);
>> +		}
>> +
>>  		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
>>  					NULL));
>> +	}
>>  
>>  	/* Remove HW offload */
>>  	memset(&xdp, 0, sizeof(xdp));
>> @@ -8091,6 +8115,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
>>  			bpf_prog_put(prog);
>>  			return -EINVAL;
>>  		}
>> +
>> +		if (!offload && bpf_op == ops->ndo_bpf &&
>> +		    prog->redirect_needed) {
>> +			err = dev_map_ensure_default_map(dev_net(dev));
>> +			if (err) {
>> +				NL_SET_ERR_MSG(extack, "unable to allocate default map for xdp_redirect()");
>> +				return err;
>> +			}
>> +		}
>> +	} else {
>> +		struct bpf_prog *old_prog = dev_xdp_get_prog(dev);
>> +
>> +		if (old_prog) {
>> +			if (old_prog->redirect_needed)
>> +				dev_map_put_default_map(dev_net(dev));
>
> I think you should hold the ref to this program and only release the
> map _after_ new program was installed.
>
> You also have to do this for replacing a program with a different one.

Ah, right, good points.
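
Something like this in dev_change_xdp_fd(), I guess (untested sketch,
condensed; the point being to keep the old program alive across the
install and only drop the map once the new program is in place):

	struct bpf_prog *old_prog = dev_xdp_get_prog(dev);

	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);

	if (!err && old_prog && old_prog->redirect_needed &&
	    (!prog || !prog->redirect_needed))
		dev_map_put_default_map(dev_net(dev));

	if (old_prog)
		bpf_prog_put(old_prog);
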

> Could you please add tests for the replacement combinations?

Sure. But, erm, where? selftests? :)

[... snip a few more things that I'll just fix ...]

-Toke
