[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180427164303.qxodjc63i6pf75q6@kafai-mbp>
Date: Fri, 27 Apr 2018 09:43:05 -0700
From: Martin KaFai Lau <kafai@...com>
To: David Ahern <dsahern@...il.com>
CC: <netdev@...r.kernel.org>, <borkmann@...earbox.net>,
<ast@...nel.org>, <shm@...ulusnetworks.com>,
<roopa@...ulusnetworks.com>, <brouer@...hat.com>, <toke@...e.dk>,
<john.fastabend@...il.com>
Subject: Re: [RFC bpf-next 8/9] bpf: Provide helper to do lookups in kernel
FIB table
On Wed, Apr 25, 2018 at 11:34:48AM -0700, David Ahern wrote:
> Provide a helper for doing a FIB and neighbor lookup in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet continues up the stack.
>
> If it is to be forwarded, the forwarding can be done directly if the
> neighbor is already known. If the neighbor does not exist, the first
> few packets go up the stack for neighbor resolution. Once resolved, the
> xdp program provides the fast path.
>
> On successful lookup the nexthop dmac, current device smac and egress
> device index are returned.
>
> The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
> are implemented in this patch. The API includes layer 4 parameters if
> the XDP program chooses to do deep packet inspection to allow compare
> against ACLs implemented as FIB rules.
>
> Header rewrite is left to the XDP program.
>
> The lookup takes 2 flags:
> - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
> straight to the table associated with the device (expert setting for
> those looking to maximize throughput)
>
> - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
> Default is an ingress lookup.
>
> Initial performance numbers collected by Jesper, forwarded packets/sec:
>
>        Full stack    XDP FIB lookup    XDP Direct lookup
> IPv4   1,947,969     7,074,156         7,415,333
> IPv6   1,728,000     6,165,504         7,262,720
>
>
> Signed-off-by: David Ahern <dsahern@...il.com>
> ---
> include/uapi/linux/bpf.h | 68 +++++++++++++-
> net/core/filter.c | 233 +++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 300 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e6679393b687..82601c132b9f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -10,6 +10,8 @@
>
> #include <linux/types.h>
> #include <linux/bpf_common.h>
> +#include <linux/if_ether.h>
> +#include <linux/in6.h>
>
> /* Extended instruction set based on top of classic BPF */
>
> @@ -783,6 +785,17 @@ union bpf_attr {
> * @size: size of 'struct bpf_xfrm_state'
> * @flags: room for future extensions
> * Return: 0 on success or negative error
> + *
> + * int bpf_fib_lookup(ctx, params, plen, flags)
> + * Do a FIB lookup based on given parameters
> + * @ctx: pointer to context of type xdp_md
> + * @params: pointer to bpf_fib_lookup
> + * @plen: size of params argument
> + * @flags: u32 bitmask of BPF_FIB_LOOKUP_* flags
> + * Return: egress device index if packet is to be forwarded,
> + * 0 for local delivery (anything that needs to be handled
> + * by the full stack), or negative on error.
> + * If index is > 0, output data in bpf_fib_lookup is set
> */
> #define __BPF_FUNC_MAPPER(FN) \
> FN(unspec), \
> @@ -851,7 +864,9 @@ union bpf_attr {
> FN(msg_pull_data), \
> FN(bind), \
> FN(xdp_adjust_tail), \
> - FN(skb_get_xfrm_state),
> + FN(skb_get_xfrm_state), \
> + FN(fib_lookup), \
> +
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> * function eBPF program intends to call
> @@ -1255,4 +1270,55 @@ struct bpf_raw_tracepoint_args {
> __u64 args[0];
> };
>
> +/* DIRECT: Skip the FIB rules and go to FIB table associated with device
> + * OUTPUT: Do lookup from egress perspective; default is ingress
> + */
> +#define BPF_FIB_LOOKUP_DIRECT BIT(0)
> +#define BPF_FIB_LOOKUP_OUTPUT BIT(1)
> +
> +struct bpf_fib_lookup {
> + /* input */
> + __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */
> +
> + /* set if lookup is to consider L4 data - e.g., FIB rules */
> + __u8 l4_protocol;
> + __be16 sport;
> + __be16 dport;
> +
> + /* total length of packet from network header - used for MTU check */
> + __u16 tot_len;
> + __u32 ifindex; /* L3 device index for lookup */
> +
> + union {
> + /* inputs to lookup */
> + __u8 tos; /* AF_INET */
> + __be32 flowlabel; /* AF_INET6 */
> +
> + /* output: metric of fib result */
> + __u32 rt_metric;
> + };
> +
> + union {
> + __be32 mpls_in;
> + __be32 ipv4_src;
> + struct in6_addr ipv6_src;
> + };
> +
> + /* input to bpf_fib_lookup, *dst is destination address.
> + * output: bpf_fib_lookup sets to gateway address
> + */
> + union {
> + /* return for MPLS lookups */
> + __be32 mpls_out[4]; /* support up to 4 labels */
> + __be32 ipv4_dst;
> + struct in6_addr ipv6_dst;
> + };
> +
> + /* output */
> + __be16 h_vlan_proto;
> + __be16 h_vlan_TCI;
> + __u8 smac[ETH_ALEN];
> + __u8 dmac[ETH_ALEN];
> +};
> +
> #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8e45c6c7ab08..37602b2fb94a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -59,6 +59,10 @@
> #include <net/tcp.h>
> #include <net/xfrm.h>
> #include <linux/bpf_trace.h>
> +#include <linux/inetdevice.h>
> +#include <net/ip_fib.h>
> +#include <net/flow.h>
> +#include <net/arp.h>
>
> /**
> * sk_filter_trim_cap - run a packet through a socket filter
> @@ -3787,6 +3791,231 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
> };
> #endif
>
> +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
> +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
> + const struct neighbour *neigh,
> + const struct net_device *dev)
> +{
> + memcpy(params->dmac, neigh->ha, ETH_ALEN);
> + memcpy(params->smac, dev->dev_addr, ETH_ALEN);
> + params->h_vlan_TCI = 0;
> + params->h_vlan_proto = 0;
> +
> + return dev->ifindex;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_INET)
> +static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx,
> + struct bpf_fib_lookup *params, u32 flags)
> +{
> + struct net *net = dev_net(ctx->rxq->dev);
> + struct in_device *in_dev;
> + struct neighbour *neigh;
> + struct net_device *dev;
> + struct fib_result res;
> + struct fib_nh *nh;
> + struct flowi4 fl4;
> + int err;
> +
> + dev = dev_get_by_index_rcu(net, params->ifindex);
> + if (unlikely(!dev))
> + return -ENODEV;
> +
> + /* verify forwarding is enabled on this interface */
> + in_dev = __in_dev_get_rcu(dev);
> + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> + return 0;
> +
> + if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> + fl4.flowi4_iif = 1;
> + fl4.flowi4_oif = params->ifindex;
> + } else {
> + fl4.flowi4_iif = params->ifindex;
> + fl4.flowi4_oif = 0;
> + }
> + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
> + fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
> + fl4.flowi4_flags = 0;
> +
> + fl4.flowi4_proto = params->l4_protocol;
> + fl4.daddr = params->ipv4_dst;
> + fl4.saddr = params->ipv4_src;
> + fl4.fl4_sport = params->sport;
> + fl4.fl4_dport = params->dport;
> +
> + if (flags & BPF_FIB_LOOKUP_DIRECT) {
> + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> + struct fib_table *tb;
> +
> + tb = fib_get_table(net, tbid);
> + if (unlikely(!tb))
> + return 0;
> +
> + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
> + } else {
> + fl4.flowi4_mark = 0;
> + fl4.flowi4_secid = 0;
> + fl4.flowi4_tun_key.tun_id = 0;
> + fl4.flowi4_uid = sock_net_uid(net, NULL);
> +
> + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
> + }
> +
> + if (err || res.type != RTN_UNICAST)
> + return 0;
> +
> + if (res.fi->fib_nhs > 1)
> + fib_select_path(net, &res, &fl4, NULL);
> +
> + nh = &res.fi->fib_nh[res.nh_sel];
> +
> + /* do not handle lwt encaps right now */
> + if (nh->nh_lwtstate)
> + return 0;
> +
> + dev = nh->nh_dev;
> + if (unlikely(!dev))
> + return 0;
> +
> + if (nh->nh_gw)
> + params->ipv4_dst = nh->nh_gw;
> +
> + params->rt_metric = res.fi->fib_priority;
> +
> + /* xdp and cls_bpf programs are run in RCU-bh so
> + * rcu_read_lock_bh is not needed here
> + */
> + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> + if (neigh)
> + return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> + return 0;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,
> + struct bpf_fib_lookup *params, u32 flags)
> +{
> + struct net *net = dev_net(ctx->rxq->dev);
> + struct neighbour *neigh;
> + struct net_device *dev;
> + struct fib6_info *f6i;
> + struct flowi6 fl6;
> + int strict = 0;
> + int oif;
> +
> + /* link local addresses are never forwarded */
> + if (rt6_need_strict(&params->ipv6_dst) ||
> + rt6_need_strict(&params->ipv6_src))
> + return 0;
> +
> + dev = dev_get_by_index_rcu(net, params->ifindex);
> + if (unlikely(!dev))
> + return -ENODEV;
> +
> + if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> + fl6.flowi6_iif = 1;
Is the 1 here meant to be LOOPBACK_IFINDEX?
> + oif = fl6.flowi6_oif = params->ifindex;
> + } else {
> + oif = fl6.flowi6_iif = params->ifindex;
> + fl6.flowi6_oif = 0;
> + strict = RT6_LOOKUP_F_HAS_SADDR;
> + }
> + fl6.flowlabel = params->flowlabel;
> + fl6.flowi6_scope = 0;
> + fl6.flowi6_flags = 0;
> + fl6.mp_hash = 0;
> +
> + fl6.flowi6_proto = params->l4_protocol;
> + fl6.daddr = params->ipv6_dst;
> + fl6.saddr = params->ipv6_src;
> + fl6.fl6_sport = params->sport;
> + fl6.fl6_dport = params->dport;
> +
> + if (flags & BPF_FIB_LOOKUP_DIRECT) {
> + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> + struct fib6_table *tb;
> +
> + tb = ipv6_stub->fib6_get_table(net, tbid);
> + if (unlikely(!tb))
> + return 0;
> +
> + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
> + } else {
> + fl6.flowi6_mark = 0;
> + fl6.flowi6_secid = 0;
> + fl6.flowi6_tun_key.tun_id = 0;
> + fl6.flowi6_uid = sock_net_uid(net, NULL);
> +
> + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
> + }
> +
> + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> + return 0;
> +
> + if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> + f6i->fib6_type != RTN_UNICAST))
> + return 0;
> +
> + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
> + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> + fl6.flowi6_oif, NULL,
> + strict);
> +
> + if (f6i->fib6_nh.nh_lwtstate)
> + return 0;
> +
> + if (f6i->fib6_flags & RTF_GATEWAY)
> + params->ipv6_dst = f6i->fib6_nh.nh_gw;
> +
> + dev = f6i->fib6_nh.nh_dev;
> + params->rt_metric = f6i->fib6_metric;
> +
> + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> + * not needed here. Can not use __ipv6_neigh_lookup_noref here
> + * because we need to get nd_tbl via the stub
> + */
> + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
> + ndisc_hashfn, &params->ipv6_dst, dev);
> + if (neigh)
> + return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> + return 0;
> +}
> +#endif
> +
> +BPF_CALL_4(bpf_fib_lookup, struct xdp_buff *, ctx,
> + struct bpf_fib_lookup *, params, int, plen, u32, flags)
> +{
> + if (plen < sizeof(*params))
> + return -EINVAL;
> +
> + switch (params->family) {
> +#if IS_ENABLED(CONFIG_INET)
> + case AF_INET:
> + return bpf_ipv4_fib_lookup(ctx, params, flags);
> +#endif
> +#if IS_ENABLED(CONFIG_IPV6)
> + case AF_INET6:
> + return bpf_ipv6_fib_lookup(ctx, params, flags);
> +#endif
> + }
> + return -ENOTSUPP;
> +}
> +
> +static const struct bpf_func_proto bpf_fib_lookup_proto = {
> + .func = bpf_fib_lookup,
> + .gpl_only = true,
> + .pkt_access = true,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_CTX,
> + .arg2_type = ARG_PTR_TO_MEM,
> + .arg3_type = ARG_CONST_SIZE,
> + .arg4_type = ARG_ANYTHING,
> +};
> +
> static const struct bpf_func_proto *
> bpf_base_func_proto(enum bpf_func_id func_id)
> {
> @@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> return &bpf_get_socket_cookie_proto;
> case BPF_FUNC_get_socket_uid:
> return &bpf_get_socket_uid_proto;
> + case BPF_FUNC_fib_lookup:
> + return &bpf_fib_lookup_proto;
> default:
> return bpf_base_func_proto(func_id);
> }
> @@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> return &bpf_xdp_redirect_map_proto;
> case BPF_FUNC_xdp_adjust_tail:
> return &bpf_xdp_adjust_tail_proto;
> + case BPF_FUNC_fib_lookup:
> + return &bpf_fib_lookup_proto;
> default:
> return bpf_base_func_proto(func_id);
> }
> --
> 2.11.0
>
Powered by blists - more mailing lists