netdev - Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for XDP

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190301180945.1152c6de@cakuba.netronome.com>
Date:   Fri, 1 Mar 2019 18:09:45 -0800
From:   Jakub Kicinski <jakub.kicinski@...ronome.com>
To:     Toke Høiland-Jørgensen <toke@...hat.com>
Cc:     David Miller <davem@...emloft.net>, netdev@...r.kernel.org,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Daniel Borkmann <daniel@...earbox.net>,
        Alexei Starovoitov <ast@...nel.org>
Subject: Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for
 XDP_REDIRECT to a device

On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
> An XDP program can redirect packets between interfaces using either the
> xdp_redirect() helper or the xdp_redirect_map() helper. Apart from the
> flexibility of updating maps from userspace, the redirect_map() helper also
> uses the map structure to batch packets, which results in a significant
> (around 50%) performance boost. However, the xdp_redirect() API is simpler
> if one just wants to redirect to another interface, which means people tend
> to use this interface and then wonder why they getter worse performance
> than expected.
> 
> This patch seeks to close this performance difference between the two APIs.
> It achieves this by changing xdp_redirect() to use a hidden devmap for
> looking up destination interfaces, thus gaining the batching benefit with
> no visible difference from the user API point of view.
> 
> A hidden per-namespace map is allocated when an XDP program that uses the
> non-map xdp_redirect() helper is first loaded. This map is populated with
> all available interfaces in its namespace, and kept up to date as
> interfaces come and go. Once allocated, the map is kept around until the
> namespace is removed.
> 
> The hidden map uses the ifindex as map key, which means they are limited to
> ifindexes smaller than the map size of 64. A later patch introduces a new
> map type to lift this restriction.
> 
> Performance numbers:
> 
> Before patch:
> xdp_redirect:     5426035 pkt/s
> xdp_redirect_map: 8412754 pkt/s
> 
> After patch:
> xdp_redirect:     8314702 pkt/s
> xdp_redirect_map: 8411854 pkt/s
> 
> This corresponds to a 53% increase in xdp_redirect performance, or a
> reduction in per-packet processing time by 64 nanoseconds.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index de18227b3d95..060c5850af3b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -25,6 +25,7 @@ struct sock;
>  struct seq_file;
>  struct btf;
>  struct btf_type;
> +struct net;
>  
>  /* map is generic key/value storage optionally accesible by eBPF programs */
>  struct bpf_map_ops {
> @@ -533,6 +534,7 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
>  extern const struct bpf_verifier_ops xdp_analyzer_ops;
>  
>  struct bpf_prog *bpf_prog_get(u32 ufd);
> +struct bpf_prog *bpf_prog_get_by_id(u32 id);
>  struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
>  				       bool attach_drv);
>  struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
> @@ -612,6 +614,11 @@ struct xdp_buff;
>  struct sk_buff;
>  
>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
> +int dev_map_ensure_default_map(struct net *net);
> +void dev_map_put_default_map(struct net *net);
> +int dev_map_inc_redirect_count(void);
> +void dev_map_dec_redirect_count(void);
>  void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
>  void __dev_map_flush(struct bpf_map *map);
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> @@ -641,6 +648,11 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
>  	return ERR_PTR(-EOPNOTSUPP);
>  }
>  
> +static inline struct bpf_prog *bpf_prog_get_by_id(u32 id)
> +{
> +	return ERR_PTR(-EOPNOTSUPP);
> +}
> +
>  static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
>  						     enum bpf_prog_type type,
>  						     bool attach_drv)
> @@ -693,6 +705,29 @@ static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
>  	return NULL;
>  }
>  
> +static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
> +{
> +	return NULL;
> +}
> +
> +static inline int dev_map_ensure_default_map(struct net *net)
> +{
> +	return 0;
> +}
> +
> +static inline void dev_map_put_default_map(struct net *net)
> +{
> +}
> +
> +static inline int dev_map_inc_redirect_count(void)
> +{
> +	return 0;
> +}
> +
> +static inline void dev_map_dec_redirect_count(void)
> +{
> +}
> +
>  static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
>  {
>  }
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 95e2d7ebdf21..dd6bbbab32f7 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -507,6 +507,8 @@ struct bpf_prog {
>  				gpl_compatible:1, /* Is filter GPL compatible? */
>  				cb_access:1,	/* Is control block accessed? */
>  				dst_needed:1,	/* Do we need dst entry? */
> +				redirect_needed:1,	/* Does program need access to xdp_redirect? */
> +				redirect_used:1,	/* Does program use xdp_redirect? */
>  				blinded:1,	/* Was blinded */
>  				is_func:1,	/* program is a bpf function */
>  				kprobe_override:1, /* Do we override a kprobe? */
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index a68ced28d8f4..6706ecc25d8f 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -162,7 +162,7 @@ struct net {
>  #if IS_ENABLED(CONFIG_CAN)
>  	struct netns_can	can;
>  #endif
> -#ifdef CONFIG_XDP_SOCKETS
> +#ifdef CONFIG_BPF_SYSCALL
>  	struct netns_xdp	xdp;
>  #endif
>  	struct sock		*diag_nlsk;
> diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
> index e5734261ba0a..4935dfe1cf43 100644
> --- a/include/net/netns/xdp.h
> +++ b/include/net/netns/xdp.h
> @@ -4,10 +4,21 @@
>  
>  #include <linux/rculist.h>
>  #include <linux/mutex.h>
> +#include <linux/atomic.h>
> +
> +struct bpf_dtab;
> +
> +struct bpf_dtab_container {
> +	struct bpf_dtab __rcu *dtab;
> +	atomic_t refcnt;

refcount_t ?  I'm not sure what the rules are for non-performance
critical stuff are..

> +};
>  
>  struct netns_xdp {
> +#ifdef CONFIG_XDP_SOCKETS
>  	struct mutex		lock;
>  	struct hlist_head	list;
> +#endif
> +	struct bpf_dtab_container default_map;
>  };
>  
>  #endif /* __NETNS_XDP_H__ */
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 1037fc08c504..e55707e62b60 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -56,6 +56,9 @@
>  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>  
>  #define DEV_MAP_BULK_SIZE 16
> +#define DEV_MAP_DEFAULT_SIZE 8
> +#define BPF_MAX_REFCNT 32768
> +
>  struct xdp_bulk_queue {
>  	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
>  	struct net_device *dev_rx;
> @@ -80,6 +83,7 @@ struct bpf_dtab {
>  
>  static DEFINE_SPINLOCK(dev_map_lock);
>  static LIST_HEAD(dev_map_list);
> +static atomic_t global_redirect_use = {};

REFCOUNT_INIT(0) or ATOMIC_INIT(0)

>  static u64 dev_map_bitmap_size(const union bpf_attr *attr)
>  {
> @@ -332,6 +336,18 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
>  	return obj;
>  }
>  
> +/* This is only being called from xdp_do_redirect() if the xdp_redirect helper
> + * is used; the default map is allocated on XDP program load if the helper is
> + * used, so will always be available at this point.
> + */

Yet it does check for NULL..?

> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
> +{
> +	struct net *net = dev_net(dev);
> +	struct bpf_dtab *dtab = rcu_dereference(net->xdp.default_map.dtab);

This init is long and break rxmas tree rules, separate line perhaps?

> +	return dtab ? &dtab->map : NULL;
> +}
> +
>  /* Runs under RCU-read-side, plus in softirq under NAPI protection.
>   * Thus, safe percpu variable access.
>   */
> @@ -533,14 +549,210 @@ const struct bpf_map_ops dev_map_ops = {
>  	.map_check_btf = map_check_no_btf,
>  };
>  
> +static inline struct net *bpf_default_map_to_net(struct bpf_dtab_container *cont)
> +{
> +	struct netns_xdp *xdp = container_of(cont, struct netns_xdp, default_map);
> +
> +	return container_of(xdp, struct net, xdp);
> +}
> +
> +static void __dev_map_release_default_map(struct bpf_dtab_container *cont)
> +{
> +	struct bpf_dtab *dtab = NULL;
> +
> +	lockdep_assert_held(&dev_map_lock);
> +
> +	dtab = rcu_dereference(cont->dtab);
> +	if (dtab) {

How can it be NULL if it's refcounted? hm..

> +		list_del_rcu(&dtab->list);
> +		rcu_assign_pointer(cont->dtab, NULL);
> +		bpf_clear_redirect_map(&dtab->map);
> +		call_rcu(&dtab->rcu, __dev_map_free);
> +	}
> +}
> +
> +void dev_map_put_default_map(struct net *net)
> +{
> +	if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) {
> +		spin_lock(&dev_map_lock);
> +		__dev_map_release_default_map(&net->xdp.default_map);
> +		spin_unlock(&dev_map_lock);
> +	}
> +}
> +
> +static int __init_default_map(struct bpf_dtab_container *cont)
> +{
> +	struct net *net = bpf_default_map_to_net(cont);
> +	struct bpf_dtab *dtab, *old_dtab;
> +	int size = DEV_MAP_DEFAULT_SIZE;
> +	struct net_device *netdev;
> +	union bpf_attr attr = {};
> +	u32 idx;
> +	int err;
> +
> +	lockdep_assert_held(&dev_map_lock);
> +
> +	if (!atomic_read(&global_redirect_use))
> +		return 0;
> +
> +	for_each_netdev(net, netdev)
> +		if (netdev->ifindex >= size)
> +			size <<= 1;
> +
> +	old_dtab = rcu_dereference(cont->dtab);
> +	if (old_dtab && old_dtab->map.max_entries == size)
> +		return 0;
> +
> +	dtab = kzalloc(sizeof(*dtab), GFP_USER);

You are calling this with a spin lock held :(

> +	if (!dtab)
> +		return -ENOMEM;
> +
> +	attr.map_type = BPF_MAP_TYPE_DEVMAP;
> +	attr.max_entries = size;
> +	attr.value_size = 4;
> +	attr.key_size = 4;
> +
> +	err = dev_map_init_map(dtab, &attr, false);
> +	if (err) {
> +		kfree(dtab);
> +		return err;
> +	}
> +
> +	for_each_netdev(net, netdev) {
> +		idx = netdev->ifindex;
> +		err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0);
> +		if (err) {
> +			__dev_map_free(&dtab->rcu);

This map wasn't visible yet, so probably no need to go through RCU? 
Not that it matters much for an error path..

> +			return err;
> +		}
> +	}
> +
> +	rcu_assign_pointer(cont->dtab, dtab);
> +	list_add_tail_rcu(&dtab->list, &dev_map_list);
> +
> +	if (old_dtab) {
> +		list_del_rcu(&old_dtab->list);
> +		bpf_clear_redirect_map(&old_dtab->map);
> +		call_rcu(&old_dtab->rcu, __dev_map_free);
> +	}
> +
> +	return 0;
> +}
> +

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 1b9496c41383..f1b2f01e7ca1 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7609,6 +7609,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>  			prog->dst_needed = 1;
>  		if (insn->imm == BPF_FUNC_get_prandom_u32)
>  			bpf_user_rnd_init_once();
> +		if (insn->imm == BPF_FUNC_redirect) {
> +			prog->redirect_needed = true;

nit: this field is a bitfield not boolean

> +			if (!prog->redirect_used) {
> +				int err = dev_map_inc_redirect_count();

Don't call complex functions as variable init, please

> +				if (err)
> +					return err;
> +				prog->redirect_used = true;
> +			}
> +		}
> +
>  		if (insn->imm == BPF_FUNC_override_return)
>  			prog->kprobe_override = 1;
>  		if (insn->imm == BPF_FUNC_tail_call) {
> @@ -7618,6 +7629,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
>  			 * the program array.
>  			 */
>  			prog->cb_access = 1;
> +			prog->redirect_needed = true;
>  			env->prog->aux->stack_depth = MAX_BPF_STACK;
>  			env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
>  
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 2b67f2aa59dd..1df20d529026 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -7990,6 +7990,21 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
>  	return xdp.prog_id;
>  }
>  
> +static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev)
> +{
> +	struct bpf_prog *prog;
> +	u32 prog_id;
> +
> +	prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);

if (!prog_id)
	return NULL;

...

> +	if (prog_id) {
> +		prog = bpf_prog_get_by_id(prog_id);
> +		if (!IS_ERR(prog))
> +			return prog;
> +	}
> +
> +	return NULL;
> +}
> +
>  static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
>  			   struct netlink_ext_ack *extack, u32 flags,
>  			   struct bpf_prog *prog)
> @@ -8024,9 +8039,18 @@ static void dev_xdp_uninstall(struct net_device *dev)
>  	memset(&xdp, 0, sizeof(xdp));
>  	xdp.command = XDP_QUERY_PROG;
>  	WARN_ON(ndo_bpf(dev, &xdp));
> -	if (xdp.prog_id)
> +	if (xdp.prog_id) {
> +		struct bpf_prog *prog = bpf_prog_get_by_id(xdp.prog_id);
> +
> +		if (!IS_ERR(prog)) {
> +			if (prog->redirect_needed)
> +				dev_map_put_default_map(dev_net(dev));
> +			bpf_prog_put(prog);
> +		}
> +
>  		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
>  					NULL));
> +	}
>  
>  	/* Remove HW offload */
>  	memset(&xdp, 0, sizeof(xdp));
> @@ -8091,6 +8115,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
>  			bpf_prog_put(prog);
>  			return -EINVAL;
>  		}
> +
> +		if (!offload && bpf_op == ops->ndo_bpf &&
> +		    prog->redirect_needed) {
> +			err = dev_map_ensure_default_map(dev_net(dev));
> +			if (err) {
> +				NL_SET_ERR_MSG(extack, "unable to allocate default map for xdp_redirect()");
> +				return err;
> +			}
> +		}
> +	} else {
> +		struct bpf_prog *old_prog = dev_xdp_get_prog(dev);
> +
> +		if (old_prog) {
> +			if (old_prog->redirect_needed)
> +				dev_map_put_default_map(dev_net(dev));

I think you should hold the ref to this program and only release the
map _after_ new program was installed.

You also have to do this for replacing a program with a different one.

Could you please add tests for the replacement combinations?

> +			bpf_prog_put(old_prog);
> +		}
>  	}

>  	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
> @@ -9333,6 +9374,7 @@ EXPORT_SYMBOL(unregister_netdev);
>  int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
>  {
>  	int err, new_nsid, new_ifindex;
> +	struct bpf_prog *prog = NULL;
>  
>  	ASSERT_RTNL();
>  
> @@ -9350,6 +9392,15 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	if (net_eq(dev_net(dev), net))
>  		goto out;
>  
> +	prog = dev_xdp_get_prog(dev);
> +	if (prog) {
> +		if (prog->redirect_needed)
> +			err = dev_map_ensure_default_map(net);
> +
> +		if (err)
> +			goto out;
> +	}

prog = dev_xdp_get_prog(dev);
if (prog && prog->redirect_needed) {
	err = dev_map_ensure_default_map(net);
	if (err)
		goto out;
}

>  	/* Pick the destination device name, and ensure
>  	 * we can use it in the destination network namespace.
>  	 */
> @@ -9388,6 +9439,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
>  	rcu_barrier();
>  
> +	if (prog && prog->redirect_needed)
> +		dev_map_put_default_map(dev_net(dev));
> +
>  	new_nsid = peernet2id_alloc(dev_net(dev), net);
>  	/* If there is an ifindex conflict assign a new one */
>  	if (__dev_get_by_index(net, dev->ifindex))
> @@ -9435,6 +9489,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>  	synchronize_net();
>  	err = 0;
>  out:
> +	if (prog)
> +		bpf_prog_put(prog);
> +
>  	return err;
>  }
>  EXPORT_SYMBOL_GPL(dev_change_net_namespace);