[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190301180945.1152c6de@cakuba.netronome.com>
Date: Fri, 1 Mar 2019 18:09:45 -0800
From: Jakub Kicinski <jakub.kicinski@...ronome.com>
To: Toke Høiland-Jørgensen <toke@...hat.com>
Cc: David Miller <davem@...emloft.net>, netdev@...r.kernel.org,
Jesper Dangaard Brouer <brouer@...hat.com>,
Daniel Borkmann <daniel@...earbox.net>,
Alexei Starovoitov <ast@...nel.org>
Subject: Re: [PATCH net-next v3 2/3] xdp: Always use a devmap for
XDP_REDIRECT to a device
On Fri, 01 Mar 2019 15:12:30 +0100, Toke Høiland-Jørgensen wrote:
> An XDP program can redirect packets between interfaces using either the
> xdp_redirect() helper or the xdp_redirect_map() helper. Apart from the
> flexibility of updating maps from userspace, the redirect_map() helper also
> uses the map structure to batch packets, which results in a significant
> (around 50%) performance boost. However, the xdp_redirect() API is simpler
> if one just wants to redirect to another interface, which means people tend
> to use this interface and then wonder why they getter worse performance
> than expected.
>
> This patch seeks to close this performance difference between the two APIs.
> It achieves this by changing xdp_redirect() to use a hidden devmap for
> looking up destination interfaces, thus gaining the batching benefit with
> no visible difference from the user API point of view.
>
> A hidden per-namespace map is allocated when an XDP program that uses the
> non-map xdp_redirect() helper is first loaded. This map is populated with
> all available interfaces in its namespace, and kept up to date as
> interfaces come and go. Once allocated, the map is kept around until the
> namespace is removed.
>
> The hidden map uses the ifindex as map key, which means they are limited to
> ifindexes smaller than the map size of 64. A later patch introduces a new
> map type to lift this restriction.
>
> Performance numbers:
>
> Before patch:
> xdp_redirect: 5426035 pkt/s
> xdp_redirect_map: 8412754 pkt/s
>
> After patch:
> xdp_redirect: 8314702 pkt/s
> xdp_redirect_map: 8411854 pkt/s
>
> This corresponds to a 53% increase in xdp_redirect performance, or a
> reduction in per-packet processing time by 64 nanoseconds.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@...hat.com>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index de18227b3d95..060c5850af3b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -25,6 +25,7 @@ struct sock;
> struct seq_file;
> struct btf;
> struct btf_type;
> +struct net;
>
> /* map is generic key/value storage optionally accesible by eBPF programs */
> struct bpf_map_ops {
> @@ -533,6 +534,7 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
> extern const struct bpf_verifier_ops xdp_analyzer_ops;
>
> struct bpf_prog *bpf_prog_get(u32 ufd);
> +struct bpf_prog *bpf_prog_get_by_id(u32 id);
> struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
> bool attach_drv);
> struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
> @@ -612,6 +614,11 @@ struct xdp_buff;
> struct sk_buff;
>
> struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev);
> +int dev_map_ensure_default_map(struct net *net);
> +void dev_map_put_default_map(struct net *net);
> +int dev_map_inc_redirect_count(void);
> +void dev_map_dec_redirect_count(void);
> void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
> void __dev_map_flush(struct bpf_map *map);
> int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> @@ -641,6 +648,11 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
> return ERR_PTR(-EOPNOTSUPP);
> }
>
> +static inline struct bpf_prog *bpf_prog_get_by_id(u32 id)
> +{
> + return ERR_PTR(-EOPNOTSUPP);
> +}
> +
> static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
> enum bpf_prog_type type,
> bool attach_drv)
> @@ -693,6 +705,29 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map,
> return NULL;
> }
>
> +static inline struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
> +{
> + return NULL;
> +}
> +
> +static inline int dev_map_ensure_default_map(struct net *net)
> +{
> + return 0;
> +}
> +
> +static inline void dev_map_put_default_map(struct net *net)
> +{
> +}
> +
> +static inline int dev_map_inc_redirect_count(void)
> +{
> + return 0;
> +}
> +
> +static inline void dev_map_dec_redirect_count(void)
> +{
> +}
> +
> static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
> {
> }
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 95e2d7ebdf21..dd6bbbab32f7 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -507,6 +507,8 @@ struct bpf_prog {
> gpl_compatible:1, /* Is filter GPL compatible? */
> cb_access:1, /* Is control block accessed? */
> dst_needed:1, /* Do we need dst entry? */
> + redirect_needed:1, /* Does program need access to xdp_redirect? */
> + redirect_used:1, /* Does program use xdp_redirect? */
> blinded:1, /* Was blinded */
> is_func:1, /* program is a bpf function */
> kprobe_override:1, /* Do we override a kprobe? */
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index a68ced28d8f4..6706ecc25d8f 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -162,7 +162,7 @@ struct net {
> #if IS_ENABLED(CONFIG_CAN)
> struct netns_can can;
> #endif
> -#ifdef CONFIG_XDP_SOCKETS
> +#ifdef CONFIG_BPF_SYSCALL
> struct netns_xdp xdp;
> #endif
> struct sock *diag_nlsk;
> diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h
> index e5734261ba0a..4935dfe1cf43 100644
> --- a/include/net/netns/xdp.h
> +++ b/include/net/netns/xdp.h
> @@ -4,10 +4,21 @@
>
> #include <linux/rculist.h>
> #include <linux/mutex.h>
> +#include <linux/atomic.h>
> +
> +struct bpf_dtab;
> +
> +struct bpf_dtab_container {
> + struct bpf_dtab __rcu *dtab;
> + atomic_t refcnt;
refcount_t ? I'm not sure what the rules are for non-performance
critical stuff are..
> +};
>
> struct netns_xdp {
> +#ifdef CONFIG_XDP_SOCKETS
> struct mutex lock;
> struct hlist_head list;
> +#endif
> + struct bpf_dtab_container default_map;
> };
>
> #endif /* __NETNS_XDP_H__ */
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 1037fc08c504..e55707e62b60 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -56,6 +56,9 @@
> (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>
> #define DEV_MAP_BULK_SIZE 16
> +#define DEV_MAP_DEFAULT_SIZE 8
> +#define BPF_MAX_REFCNT 32768
> +
> struct xdp_bulk_queue {
> struct xdp_frame *q[DEV_MAP_BULK_SIZE];
> struct net_device *dev_rx;
> @@ -80,6 +83,7 @@ struct bpf_dtab {
>
> static DEFINE_SPINLOCK(dev_map_lock);
> static LIST_HEAD(dev_map_list);
> +static atomic_t global_redirect_use = {};
REFCOUNT_INIT(0) or ATOMIC_INIT(0)
> static u64 dev_map_bitmap_size(const union bpf_attr *attr)
> {
> @@ -332,6 +336,18 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
> return obj;
> }
>
> +/* This is only being called from xdp_do_redirect() if the xdp_redirect helper
> + * is used; the default map is allocated on XDP program load if the helper is
> + * used, so will always be available at this point.
> + */
Yet it does check for NULL..?
> +struct bpf_map *__dev_map_get_default_map(struct net_device *dev)
> +{
> + struct net *net = dev_net(dev);
> + struct bpf_dtab *dtab = rcu_dereference(net->xdp.default_map.dtab);
This init is long and break rxmas tree rules, separate line perhaps?
> + return dtab ? &dtab->map : NULL;
> +}
> +
> /* Runs under RCU-read-side, plus in softirq under NAPI protection.
> * Thus, safe percpu variable access.
> */
> @@ -533,14 +549,210 @@ const struct bpf_map_ops dev_map_ops = {
> .map_check_btf = map_check_no_btf,
> };
>
> +static inline struct net *bpf_default_map_to_net(struct bpf_dtab_container *cont)
> +{
> + struct netns_xdp *xdp = container_of(cont, struct netns_xdp, default_map);
> +
> + return container_of(xdp, struct net, xdp);
> +}
> +
> +static void __dev_map_release_default_map(struct bpf_dtab_container *cont)
> +{
> + struct bpf_dtab *dtab = NULL;
> +
> + lockdep_assert_held(&dev_map_lock);
> +
> + dtab = rcu_dereference(cont->dtab);
> + if (dtab) {
How can it be NULL if it's refcounted? hm..
> + list_del_rcu(&dtab->list);
> + rcu_assign_pointer(cont->dtab, NULL);
> + bpf_clear_redirect_map(&dtab->map);
> + call_rcu(&dtab->rcu, __dev_map_free);
> + }
> +}
> +
> +void dev_map_put_default_map(struct net *net)
> +{
> + if (atomic_dec_and_test(&net->xdp.default_map.refcnt)) {
> + spin_lock(&dev_map_lock);
> + __dev_map_release_default_map(&net->xdp.default_map);
> + spin_unlock(&dev_map_lock);
> + }
> +}
> +
> +static int __init_default_map(struct bpf_dtab_container *cont)
> +{
> + struct net *net = bpf_default_map_to_net(cont);
> + struct bpf_dtab *dtab, *old_dtab;
> + int size = DEV_MAP_DEFAULT_SIZE;
> + struct net_device *netdev;
> + union bpf_attr attr = {};
> + u32 idx;
> + int err;
> +
> + lockdep_assert_held(&dev_map_lock);
> +
> + if (!atomic_read(&global_redirect_use))
> + return 0;
> +
> + for_each_netdev(net, netdev)
> + if (netdev->ifindex >= size)
> + size <<= 1;
> +
> + old_dtab = rcu_dereference(cont->dtab);
> + if (old_dtab && old_dtab->map.max_entries == size)
> + return 0;
> +
> + dtab = kzalloc(sizeof(*dtab), GFP_USER);
You are calling this with a spin lock held :(
> + if (!dtab)
> + return -ENOMEM;
> +
> + attr.map_type = BPF_MAP_TYPE_DEVMAP;
> + attr.max_entries = size;
> + attr.value_size = 4;
> + attr.key_size = 4;
> +
> + err = dev_map_init_map(dtab, &attr, false);
> + if (err) {
> + kfree(dtab);
> + return err;
> + }
> +
> + for_each_netdev(net, netdev) {
> + idx = netdev->ifindex;
> + err = __dev_map_update_elem(net, &dtab->map, &idx, &idx, 0);
> + if (err) {
> + __dev_map_free(&dtab->rcu);
This map wasn't visible yet, so probably no need to go through RCU?
Not that it matters much for an error path..
> + return err;
> + }
> + }
> +
> + rcu_assign_pointer(cont->dtab, dtab);
> + list_add_tail_rcu(&dtab->list, &dev_map_list);
> +
> + if (old_dtab) {
> + list_del_rcu(&old_dtab->list);
> + bpf_clear_redirect_map(&old_dtab->map);
> + call_rcu(&old_dtab->rcu, __dev_map_free);
> + }
> +
> + return 0;
> +}
> +
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 1b9496c41383..f1b2f01e7ca1 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7609,6 +7609,17 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
> prog->dst_needed = 1;
> if (insn->imm == BPF_FUNC_get_prandom_u32)
> bpf_user_rnd_init_once();
> + if (insn->imm == BPF_FUNC_redirect) {
> + prog->redirect_needed = true;
nit: this field is a bitfield not boolean
> + if (!prog->redirect_used) {
> + int err = dev_map_inc_redirect_count();
Don't call complex functions as variable init, please
> + if (err)
> + return err;
> + prog->redirect_used = true;
> + }
> + }
> +
> if (insn->imm == BPF_FUNC_override_return)
> prog->kprobe_override = 1;
> if (insn->imm == BPF_FUNC_tail_call) {
> @@ -7618,6 +7629,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
> * the program array.
> */
> prog->cb_access = 1;
> + prog->redirect_needed = true;
> env->prog->aux->stack_depth = MAX_BPF_STACK;
> env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 2b67f2aa59dd..1df20d529026 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -7990,6 +7990,21 @@ u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
> return xdp.prog_id;
> }
>
> +static struct bpf_prog *dev_xdp_get_prog(struct net_device *dev)
> +{
> + struct bpf_prog *prog;
> + u32 prog_id;
> +
> + prog_id = __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
if (!prog_id)
return NULL;
...
> + if (prog_id) {
> + prog = bpf_prog_get_by_id(prog_id);
> + if (!IS_ERR(prog))
> + return prog;
> + }
> +
> + return NULL;
> +}
> +
> static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
> struct netlink_ext_ack *extack, u32 flags,
> struct bpf_prog *prog)
> @@ -8024,9 +8039,18 @@ static void dev_xdp_uninstall(struct net_device *dev)
> memset(&xdp, 0, sizeof(xdp));
> xdp.command = XDP_QUERY_PROG;
> WARN_ON(ndo_bpf(dev, &xdp));
> - if (xdp.prog_id)
> + if (xdp.prog_id) {
> + struct bpf_prog *prog = bpf_prog_get_by_id(xdp.prog_id);
> +
> + if (!IS_ERR(prog)) {
> + if (prog->redirect_needed)
> + dev_map_put_default_map(dev_net(dev));
> + bpf_prog_put(prog);
> + }
> +
> WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
> NULL));
> + }
>
> /* Remove HW offload */
> memset(&xdp, 0, sizeof(xdp));
> @@ -8091,6 +8115,23 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
> bpf_prog_put(prog);
> return -EINVAL;
> }
> +
> + if (!offload && bpf_op == ops->ndo_bpf &&
> + prog->redirect_needed) {
> + err = dev_map_ensure_default_map(dev_net(dev));
> + if (err) {
> + NL_SET_ERR_MSG(extack, "unable to allocate default map for xdp_redirect()");
> + return err;
> + }
> + }
> + } else {
> + struct bpf_prog *old_prog = dev_xdp_get_prog(dev);
> +
> + if (old_prog) {
> + if (old_prog->redirect_needed)
> + dev_map_put_default_map(dev_net(dev));
I think you should hold the ref to this program and only release the
map _after_ new program was installed.
You also have to do this for replacing a program with a different one.
Could you please add tests for the replacement combinations?
> + bpf_prog_put(old_prog);
> + }
> }
> err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
> @@ -9333,6 +9374,7 @@ EXPORT_SYMBOL(unregister_netdev);
> int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
> {
> int err, new_nsid, new_ifindex;
> + struct bpf_prog *prog = NULL;
>
> ASSERT_RTNL();
>
> @@ -9350,6 +9392,15 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
> if (net_eq(dev_net(dev), net))
> goto out;
>
> + prog = dev_xdp_get_prog(dev);
> + if (prog) {
> + if (prog->redirect_needed)
> + err = dev_map_ensure_default_map(net);
> +
> + if (err)
> + goto out;
> + }
prog = dev_xdp_get_prog(dev);
if (prog && prog->redirect_needed) {
err = dev_map_ensure_default_map(net);
if (err)
goto out;
}
> /* Pick the destination device name, and ensure
> * we can use it in the destination network namespace.
> */
> @@ -9388,6 +9439,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
> call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
> rcu_barrier();
>
> + if (prog && prog->redirect_needed)
> + dev_map_put_default_map(dev_net(dev));
> +
> new_nsid = peernet2id_alloc(dev_net(dev), net);
> /* If there is an ifindex conflict assign a new one */
> if (__dev_get_by_index(net, dev->ifindex))
> @@ -9435,6 +9489,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
> synchronize_net();
> err = 0;
> out:
> + if (prog)
> + bpf_prog_put(prog);
> +
> return err;
> }
> EXPORT_SYMBOL_GPL(dev_change_net_namespace);
Powered by blists - more mailing lists