[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9488a57d-5559-b69f-631e-71cada1c1d2d@iogearbox.net>
Date: Wed, 25 Apr 2018 21:55:29 +0200
From: Daniel Borkmann <daniel@...earbox.net>
To: David Ahern <dsahern@...il.com>, netdev@...r.kernel.org,
borkmann@...earbox.net, ast@...nel.org
Cc: shm@...ulusnetworks.com, roopa@...ulusnetworks.com,
brouer@...hat.com, toke@...e.dk, john.fastabend@...il.com
Subject: Re: [RFC bpf-next 8/9] bpf: Provide helper to do lookups in kernel
FIB table
On 04/25/2018 08:34 PM, David Ahern wrote:
> Provide a helper for doing a FIB and neighbor lookup in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet continues up the stack.
>
> If it is to be forwarded, the forwarding can be done directly if the
> neighbor is already known. If the neighbor does not exist, the first
> few packets go up the stack for neighbor resolution. Once resolved, the
> xdp program provides the fast path.
>
> On successful lookup the nexthop dmac, current device smac and egress
> device index are returned.
>
> The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
> are implemented in this patch. The API includes layer 4 parameters if
> the XDP program chooses to do deep packet inspection to allow compare
> against ACLs implemented as FIB rules.
>
> Header rewrite is left to the XDP program.
>
> The lookup takes 2 flags:
> - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
> straight to the table associated with the device (expert setting for
> those looking to maximize throughput)
>
> - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
> Default is an ingress lookup.
>
> Initial performance numbers collected by Jesper, forwarded packets/sec:
>
> Full stack XDP FIB lookup XDP Direct lookup
> IPv4 1,947,969 7,074,156 7,415,333
> IPv6 1,728,000 6,165,504 7,262,720
>
>
> Signed-off-by: David Ahern <dsahern@...il.com>
> ---
> include/uapi/linux/bpf.h | 68 +++++++++++++-
> net/core/filter.c | 233 +++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 300 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e6679393b687..82601c132b9f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -10,6 +10,8 @@
>
> #include <linux/types.h>
> #include <linux/bpf_common.h>
> +#include <linux/if_ether.h>
> +#include <linux/in6.h>
>
> /* Extended instruction set based on top of classic BPF */
>
> @@ -783,6 +785,17 @@ union bpf_attr {
> * @size: size of 'struct bpf_xfrm_state'
> * @flags: room for future extensions
> * Return: 0 on success or negative error
> + *
> + * int bpf_fib_lookup(ctx, params, plen, flags)
> + * Do a FIB lookup based on given parameters
> + * @ctx: pointer to context of type xdp_md
Nit: I would just say "pointer to context" here, since the helper is used with both xdp and skb.
> + * @params: pointer to bpf_fib_lookup
> + * @plen: size of params argument
> + * @flags: u32 bitmask of BPF_FIB_LOOKUP_* flags
> + * Return: egress device index if packet is to be forwarded,
> + * 0 for local delivery (anything that needs to be handled
> + * by the full stack), or negative on error.
> + * If index is > 0, output data in bpf_fib_lookup is set
> */
> #define __BPF_FUNC_MAPPER(FN) \
> FN(unspec), \
> @@ -851,7 +864,9 @@ union bpf_attr {
> FN(msg_pull_data), \
> FN(bind), \
> FN(xdp_adjust_tail), \
> - FN(skb_get_xfrm_state),
> + FN(skb_get_xfrm_state), \
> + FN(fib_lookup), \
> +
>
Nit: stray trailing '\' and double newline here.
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> * function eBPF program intends to call
[...]
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8e45c6c7ab08..37602b2fb94a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -59,6 +59,10 @@
> #include <net/tcp.h>
> #include <net/xfrm.h>
> #include <linux/bpf_trace.h>
> +#include <linux/inetdevice.h>
> +#include <net/ip_fib.h>
> +#include <net/flow.h>
> +#include <net/arp.h>
>
> /**
> * sk_filter_trim_cap - run a packet through a socket filter
> @@ -3787,6 +3791,231 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
> };
> #endif
>
> +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
> +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
> + const struct neighbour *neigh,
> + const struct net_device *dev)
> +{
> + memcpy(params->dmac, neigh->ha, ETH_ALEN);
> + memcpy(params->smac, dev->dev_addr, ETH_ALEN);
> + params->h_vlan_TCI = 0;
> + params->h_vlan_proto = 0;
> +
> + return dev->ifindex;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_INET)
> +static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx,
Instead of passing xdp_buff here, just pass the netdev pointer. See below
for why this is needed.
> + struct bpf_fib_lookup *params, u32 flags)
> +{
> + struct net *net = dev_net(ctx->rxq->dev);
> + struct in_device *in_dev;
> + struct neighbour *neigh;
> + struct net_device *dev;
> + struct fib_result res;
> + struct fib_nh *nh;
> + struct flowi4 fl4;
> + int err;
> +
> + dev = dev_get_by_index_rcu(net, params->ifindex);
> + if (unlikely(!dev))
> + return -ENODEV;
> +
> + /* verify forwarding is enabled on this interface */
> + in_dev = __in_dev_get_rcu(dev);
> + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> + return 0;
> +
> + if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> + fl4.flowi4_iif = 1;
> + fl4.flowi4_oif = params->ifindex;
> + } else {
> + fl4.flowi4_iif = params->ifindex;
> + fl4.flowi4_oif = 0;
> + }
> + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
> + fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
> + fl4.flowi4_flags = 0;
> +
> + fl4.flowi4_proto = params->l4_protocol;
> + fl4.daddr = params->ipv4_dst;
> + fl4.saddr = params->ipv4_src;
> + fl4.fl4_sport = params->sport;
> + fl4.fl4_dport = params->dport;
> +
> + if (flags & BPF_FIB_LOOKUP_DIRECT) {
> + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> + struct fib_table *tb;
> +
> + tb = fib_get_table(net, tbid);
> + if (unlikely(!tb))
> + return 0;
> +
> + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
> + } else {
> + fl4.flowi4_mark = 0;
> + fl4.flowi4_secid = 0;
> + fl4.flowi4_tun_key.tun_id = 0;
> + fl4.flowi4_uid = sock_net_uid(net, NULL);
> +
> + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
> + }
> +
> + if (err || res.type != RTN_UNICAST)
> + return 0;
> +
> + if (res.fi->fib_nhs > 1)
> + fib_select_path(net, &res, &fl4, NULL);
> +
> + nh = &res.fi->fib_nh[res.nh_sel];
> +
> + /* do not handle lwt encaps right now */
> + if (nh->nh_lwtstate)
> + return 0;
> +
> + dev = nh->nh_dev;
> + if (unlikely(!dev))
> + return 0;
> +
> + if (nh->nh_gw)
> + params->ipv4_dst = nh->nh_gw;
> +
> + params->rt_metric = res.fi->fib_priority;
> +
> + /* xdp and cls_bpf programs are run in RCU-bh so
> + * rcu_read_lock_bh is not needed here
> + */
> + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> + if (neigh)
> + return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> + return 0;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,
Same here.
> + struct bpf_fib_lookup *params, u32 flags)
> +{
> + struct net *net = dev_net(ctx->rxq->dev);
> + struct neighbour *neigh;
> + struct net_device *dev;
> + struct fib6_info *f6i;
> + struct flowi6 fl6;
> + int strict = 0;
> + int oif;
> +
> + /* link local addresses are never forwarded */
> + if (rt6_need_strict(&params->ipv6_dst) ||
> + rt6_need_strict(&params->ipv6_src))
> + return 0;
> +
> + dev = dev_get_by_index_rcu(net, params->ifindex);
> + if (unlikely(!dev))
> + return -ENODEV;
> +
> + if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> + fl6.flowi6_iif = 1;
> + oif = fl6.flowi6_oif = params->ifindex;
> + } else {
> + oif = fl6.flowi6_iif = params->ifindex;
> + fl6.flowi6_oif = 0;
> + strict = RT6_LOOKUP_F_HAS_SADDR;
> + }
> + fl6.flowlabel = params->flowlabel;
> + fl6.flowi6_scope = 0;
> + fl6.flowi6_flags = 0;
> + fl6.mp_hash = 0;
> +
> + fl6.flowi6_proto = params->l4_protocol;
> + fl6.daddr = params->ipv6_dst;
> + fl6.saddr = params->ipv6_src;
> + fl6.fl6_sport = params->sport;
> + fl6.fl6_dport = params->dport;
> +
> + if (flags & BPF_FIB_LOOKUP_DIRECT) {
> + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> + struct fib6_table *tb;
> +
> + tb = ipv6_stub->fib6_get_table(net, tbid);
> + if (unlikely(!tb))
> + return 0;
> +
> + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
> + } else {
> + fl6.flowi6_mark = 0;
> + fl6.flowi6_secid = 0;
> + fl6.flowi6_tun_key.tun_id = 0;
> + fl6.flowi6_uid = sock_net_uid(net, NULL);
> +
> + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
> + }
> +
> + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> + return 0;
> +
> + if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> + f6i->fib6_type != RTN_UNICAST))
> + return 0;
> +
> + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
> + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> + fl6.flowi6_oif, NULL,
> + strict);
> +
> + if (f6i->fib6_nh.nh_lwtstate)
> + return 0;
> +
> + if (f6i->fib6_flags & RTF_GATEWAY)
> + params->ipv6_dst = f6i->fib6_nh.nh_gw;
> +
> + dev = f6i->fib6_nh.nh_dev;
> + params->rt_metric = f6i->fib6_metric;
> +
> + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> + * not needed here. Can not use __ipv6_neigh_lookup_noref here
> + * because we need to get nd_tbl via the stub
> + */
> + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
> + ndisc_hashfn, &params->ipv6_dst, dev);
> + if (neigh)
> + return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> + return 0;
> +}
> +#endif
> +
> +BPF_CALL_4(bpf_fib_lookup, struct xdp_buff *, ctx,
> + struct bpf_fib_lookup *, params, int, plen, u32, flags)
> +{
> + if (plen < sizeof(*params))
> + return -EINVAL;
> +
> + switch (params->family) {
> +#if IS_ENABLED(CONFIG_INET)
> + case AF_INET:
> + return bpf_ipv4_fib_lookup(ctx, params, flags);
> +#endif
> +#if IS_ENABLED(CONFIG_IPV6)
> + case AF_INET6:
> + return bpf_ipv6_fib_lookup(ctx, params, flags);
> +#endif
> + }
> + return -ENOTSUPP;
> +}
> +
> +static const struct bpf_func_proto bpf_fib_lookup_proto = {
> + .func = bpf_fib_lookup,
> + .gpl_only = true,
> + .pkt_access = true,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_CTX,
> + .arg2_type = ARG_PTR_TO_MEM,
> + .arg3_type = ARG_CONST_SIZE,
> + .arg4_type = ARG_ANYTHING,
> +};
> +
> static const struct bpf_func_proto *
> bpf_base_func_proto(enum bpf_func_id func_id)
> {
> @@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> return &bpf_get_socket_cookie_proto;
> case BPF_FUNC_get_socket_uid:
> return &bpf_get_socket_uid_proto;
> + case BPF_FUNC_fib_lookup:
> + return &bpf_fib_lookup_proto;
This part doesn't belong in sk_filter_func_proto(); it belongs in
tc_cls_act_func_proto() instead.
> default:
> return bpf_base_func_proto(func_id);
> }
> @@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> return &bpf_xdp_redirect_map_proto;
> case BPF_FUNC_xdp_adjust_tail:
> return &bpf_xdp_adjust_tail_proto;
> + case BPF_FUNC_fib_lookup:
> + return &bpf_fib_lookup_proto;
Basically, you're using the very same bpf_fib_lookup_proto for
both XDP and skb. In the skb case, you're reusing the two functions
bpf_ipv{4,6}_fib_lookup(), so when you fetch the netdev pointer to
retrieve the netns, you'll crash at dev_net(ctx->rxq->dev), since
that is XDP-only and not part of the skb metadata.
Therefore, as mentioned, pass the netdev to bpf_ipv{4,6}_fib_lookup()
to make it generic, and add bpf_xdp_fib_lookup_proto and
bpf_skb_fib_lookup_proto, each returned under the case BPF_FUNC_fib_lookup
in the respective *func_proto(), but using the proper prototype for
its context. That is, both BPF_CALL_4() helper implementations reuse
bpf_ipv{4,6}_fib_lookup().
> default:
> return bpf_base_func_proto(func_id);
> }
>
Powered by blists - more mailing lists