[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87r1kec7ih.fsf@toke.dk>
Date: Wed, 17 Mar 2021 13:03:02 +0100
From: Toke Høiland-Jørgensen <toke@...hat.com>
To: Hangbin Liu <liuhangbin@...il.com>, bpf@...r.kernel.org
Cc: netdev@...r.kernel.org, Jiri Benc <jbenc@...hat.com>,
Jesper Dangaard Brouer <brouer@...hat.com>,
Eelco Chaudron <echaudro@...hat.com>, ast@...nel.org,
Daniel Borkmann <daniel@...earbox.net>,
Lorenzo Bianconi <lorenzo.bianconi@...hat.com>,
David Ahern <dsahern@...il.com>,
Andrii Nakryiko <andrii.nakryiko@...il.com>,
Alexei Starovoitov <alexei.starovoitov@...il.com>,
John Fastabend <john.fastabend@...il.com>,
Maciej Fijalkowski <maciej.fijalkowski@...el.com>,
Hangbin Liu <liuhangbin@...il.com>
Subject: Re: [PATCHv2 bpf-next 2/4] xdp: extend xdp_redirect_map with
broadcast support
Hangbin Liu <liuhangbin@...il.com> writes:
> This patch adds two flags BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS to extend
> xdp_redirect_map for broadcast support.
>
> Keep the general data path in net/core/filter.c and the native data
> path in kernel/bpf/devmap.c so we can use direct calls to get better
> performance.
>
> Here is the performance result by using xdp_redirect_{map, map_multi} in
> sample/bpf and send pkts via pktgen cmd:
> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
>
> There is some performance drop as we need to loop over the map and fetch
> each interface.
>
> Version | Test | Generic | Native
> 5.11 | redirect_map i40e->i40e | 1.9M | 9.3M
> 5.11 | redirect_map i40e->veth | 1.5M | 11.2M
> 5.11 + patch | redirect_map i40e->i40e | 1.9M | 9.6M
> 5.11 + patch | redirect_map i40e->veth | 1.5M | 11.9M
> 5.11 + patch | redirect_map_multi i40e->i40e | 1.5M | 7.7M
> 5.11 + patch | redirect_map_multi i40e->veth | 1.2M | 9.1M
> 5.11 + patch | redirect_map_multi i40e->mlx4+veth | 0.9M | 3.2M
>
> v2: fix flag renaming issue in v1
>
> Signed-off-by: Hangbin Liu <liuhangbin@...il.com>
FYI, this no longer applies to bpf-next due to Björn's refactor in
commit: ee75aef23afe ("bpf, xdp: Restructure redirect actions")
Also, two small nits below:
> ---
> include/linux/bpf.h | 16 +++++
> include/net/xdp.h | 1 +
> include/uapi/linux/bpf.h | 17 ++++-
> kernel/bpf/devmap.c | 119 +++++++++++++++++++++++++++++++++
> net/core/filter.c | 74 ++++++++++++++++++--
> net/core/xdp.c | 29 ++++++++
> tools/include/uapi/linux/bpf.h | 17 ++++-
> 7 files changed, 262 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index c931bc97019d..bb07ccd170f2 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1458,6 +1458,9 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
> struct net_device *dev_rx);
> int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> struct net_device *dev_rx);
> +bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex);
> +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
> + struct bpf_map *map, bool exclude_ingress);
> int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
> struct bpf_prog *xdp_prog);
> bool dev_map_can_have_prog(struct bpf_map *map);
> @@ -1630,6 +1633,19 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> return 0;
> }
>
> +static inline
> +bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex)
> +{
> + return false;
> +}
> +
> +static inline
> +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
> + struct bpf_map *map, bool exclude_ingress)
> +{
> + return 0;
> +}
> +
> struct sk_buff;
>
> static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index a5bc214a49d9..5533f0ab2afc 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
> struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
> struct net_device *dev);
> int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
> +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
>
> static inline
> void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 2d3036e292a9..5982ceb217dc 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2508,8 +2508,12 @@ union bpf_attr {
> * The lower two bits of *flags* are used as the return code if
> * the map lookup fails. This is so that the return value can be
> * one of the XDP program return codes up to **XDP_TX**, as chosen
> - * by the caller. Any higher bits in the *flags* argument must be
> - * unset.
> + * by the caller. The higher bits of *flags* can be set to
> + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
> + *
> + * With BPF_F_BROADCAST the packet will be broadcast to all the
> + * interfaces in the map. With BPF_F_EXCLUDE_INGRESS the ingress
> + * interface will be excluded when broadcasting.
> *
> * See also **bpf_redirect**\ (), which only supports redirecting
> * to an ifindex, but doesn't require a map to do so.
> @@ -5004,6 +5008,15 @@ enum {
> BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
> };
>
> +/* Flags for bpf_redirect_map helper */
> +enum {
> + BPF_F_BROADCAST = (1ULL << 3),
> + BPF_F_EXCLUDE_INGRESS = (1ULL << 4),
> +};
> +
> +#define BPF_F_ACTION_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
> +#define BPF_F_REDIR_MASK (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
> +
> #define __bpf_md_ptr(type, name) \
> union { \
> type name; \
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index f80cf5036d39..ad616a043d2a 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -519,6 +519,125 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
> }
>
> +/* Use direct call in fast path instead of map->ops->map_get_next_key() */
> +static int devmap_get_next_key(struct bpf_map *map, void *key, void *next_key)
> +{
> + switch (map->map_type) {
> + case BPF_MAP_TYPE_DEVMAP:
> + return dev_map_get_next_key(map, key, next_key);
> + case BPF_MAP_TYPE_DEVMAP_HASH:
> + return dev_map_hash_get_next_key(map, key, next_key);
> + default:
> + break;
> + }
> +
> + return -ENOENT;
> +}
> +
> +bool dst_dev_is_ingress(struct bpf_dtab_netdev *dst, int ifindex)
> +{
> + return dst->dev->ifindex == ifindex;
> +}
> +
> +static struct bpf_dtab_netdev *devmap_get_next_obj(struct xdp_buff *xdp,
> + struct bpf_map *map,
> + u32 *key, u32 *next_key,
> + int ex_ifindex)
> +{
> + struct bpf_dtab_netdev *obj;
> + struct net_device *dev;
> + u32 *tmp_key = key;
why is tmp_key needed? you're not using key for anything else, so you
could just substitute that for all of the uses of tmp_key below?
> + u32 index;
> + int err;
> +
> + err = devmap_get_next_key(map, tmp_key, next_key);
> + if (err)
> + return NULL;
> +
> + /* When using dev map hash, we could restart the hashtab traversal
> + * in case the key has been updated/removed in the mean time.
> + * So we may end up potentially looping due to traversal restarts
> + * from first elem.
> + *
> + * Let's use map's max_entries to limit the loop number.
> + */
> + for (index = 0; index < map->max_entries; index++) {
> + switch (map->map_type) {
> + case BPF_MAP_TYPE_DEVMAP:
> + obj = __dev_map_lookup_elem(map, *next_key);
> + break;
> + case BPF_MAP_TYPE_DEVMAP_HASH:
> + obj = __dev_map_hash_lookup_elem(map, *next_key);
> + break;
> + default:
> + break;
> + }
> +
> + if (!obj || dst_dev_is_ingress(obj, ex_ifindex))
> + goto find_next;
> +
> + dev = obj->dev;
> +
> + if (!dev->netdev_ops->ndo_xdp_xmit)
> + goto find_next;
> +
> + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
> + if (unlikely(err))
> + goto find_next;
> +
> + return obj;
> +
> +find_next:
> + tmp_key = next_key;
> + err = devmap_get_next_key(map, tmp_key, next_key);
> + if (err)
> + break;
> + }
> +
> + return NULL;
> +}
> +
> +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
> + struct bpf_map *map, bool exclude_ingress)
> +{
> + struct bpf_dtab_netdev *obj = NULL, *next_obj = NULL;
> + struct xdp_frame *xdpf, *nxdpf;
> + int ex_ifindex;
> + u32 key, next_key;
Out of reverse-xmas-tree order...
-Toke
Powered by blists - more mailing lists