netdev - Re: [RFC bpf-next 8/9] bpf: Provide helper to do lookups in kernel FIB table

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9488a57d-5559-b69f-631e-71cada1c1d2d@iogearbox.net>
Date:   Wed, 25 Apr 2018 21:55:29 +0200
From:   Daniel Borkmann <daniel@...earbox.net>
To:     David Ahern <dsahern@...il.com>, netdev@...r.kernel.org,
        borkmann@...earbox.net, ast@...nel.org
Cc:     shm@...ulusnetworks.com, roopa@...ulusnetworks.com,
        brouer@...hat.com, toke@...e.dk, john.fastabend@...il.com
Subject: Re: [RFC bpf-next 8/9] bpf: Provide helper to do lookups in kernel
 FIB table

On 04/25/2018 08:34 PM, David Ahern wrote:
> Provide a helper for doing a FIB and neighbor lookup in the kernel
> tables from an XDP program. The helper provides a fastpath for forwarding
> packets. If the packet is a local delivery or for any reason is not a
> simple lookup and forward, the packet continues up the stack.
> 
> If it is to be forwarded, the forwarding can be done directly if the
> neighbor is already known. If the neighbor does not exist, the first
> few packets go up the stack for neighbor resolution. Once resolved, the
> xdp program provides the fast path.
> 
> On successful lookup the nexthop dmac, current device smac and egress
> device index are returned.
> 
> The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
> are implemented in this patch. The API includes layer 4 parameters if
> the XDP program chooses to do deep packet inspection to allow compare
> against ACLs implemented as FIB rules.
> 
> Header rewrite is left to the XDP program.
> 
> The lookup takes 2 flags:
> - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
>   straight to the table associated with the device (expert setting for
>   those looking to maximize throughput)
> 
> - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
>   Default is an ingress lookup.
> 
> Initial performance numbers collected by Jesper, forwarded packets/sec:
> 
>        Full stack    XDP FIB lookup    XDP Direct lookup
> IPv4   1,947,969       7,074,156          7,415,333
> IPv6   1,728,000       6,165,504          7,262,720
> 
> 
> Signed-off-by: David Ahern <dsahern@...il.com>
> ---
>  include/uapi/linux/bpf.h |  68 +++++++++++++-
>  net/core/filter.c        | 233 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 300 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e6679393b687..82601c132b9f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -10,6 +10,8 @@
>  
>  #include <linux/types.h>
>  #include <linux/bpf_common.h>
> +#include <linux/if_ether.h>
> +#include <linux/in6.h>
>  
>  /* Extended instruction set based on top of classic BPF */
>  
> @@ -783,6 +785,17 @@ union bpf_attr {
>   *     @size: size of 'struct bpf_xfrm_state'
>   *     @flags: room for future extensions
>   *     Return: 0 on success or negative error
> + *
> + * int bpf_fib_lookup(ctx, params, plen, flags)
> + *     Do a FIB lookup based on given parameters
> + *     @ctx:     pointer to context of type xdp_md

Nit: I would just say "pointer to context" here, since the helper is used with both xdp and skb.

> + *     @params:  pointer to bpf_fib_lookup
> + *     @plen:    size of params argument
> + *     @flags:   u32 bitmask of BPF_FIB_LOOKUP_* flags
> + *     Return: egress device index if packet is to be forwarded,
> + *             0 for local delivery (anything that needs to be handled
> + *             by the full stack), or negative on error.
> + *             If index is > 0, output data in bpf_fib_lookup is set
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -851,7 +864,9 @@ union bpf_attr {
>  	FN(msg_pull_data),		\
>  	FN(bind),			\
>  	FN(xdp_adjust_tail),		\
> -	FN(skb_get_xfrm_state),
> +	FN(skb_get_xfrm_state),		\
> +	FN(fib_lookup),			\
> +
>  

Nit: trailing '\' and double newline

>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
[...]

> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8e45c6c7ab08..37602b2fb94a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -59,6 +59,10 @@
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
>  #include <linux/bpf_trace.h>
> +#include <linux/inetdevice.h>
> +#include <net/ip_fib.h>
> +#include <net/flow.h>
> +#include <net/arp.h>
>  
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -3787,6 +3791,231 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>  };
>  #endif
>  
> +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
> +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
> +				  const struct neighbour *neigh,
> +				  const struct net_device *dev)
> +{
> +	memcpy(params->dmac, neigh->ha, ETH_ALEN);
> +	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
> +	params->h_vlan_TCI = 0;
> +	params->h_vlan_proto = 0;
> +
> +	return dev->ifindex;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_INET)
> +static int bpf_ipv4_fib_lookup(struct xdp_buff *ctx,

Instead of passing xdp_buff here, just pass the netdev pointer. See below for
why this is needed.

> +			       struct bpf_fib_lookup *params, u32 flags)
> +{
> +	struct net *net = dev_net(ctx->rxq->dev);
> +	struct in_device *in_dev;
> +	struct neighbour *neigh;
> +	struct net_device *dev;
> +	struct fib_result res;
> +	struct fib_nh *nh;
> +	struct flowi4 fl4;
> +	int err;
> +
> +	dev = dev_get_by_index_rcu(net, params->ifindex);
> +	if (unlikely(!dev))
> +		return -ENODEV;
> +
> +	/* verify forwarding is enabled on this interface */
> +	in_dev = __in_dev_get_rcu(dev);
> +	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> +		return 0;
> +
> +	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +		fl4.flowi4_iif = 1;
> +		fl4.flowi4_oif = params->ifindex;
> +	} else {
> +		fl4.flowi4_iif = params->ifindex;
> +		fl4.flowi4_oif = 0;
> +	}
> +	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
> +	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
> +	fl4.flowi4_flags = 0;
> +
> +	fl4.flowi4_proto = params->l4_protocol;
> +	fl4.daddr = params->ipv4_dst;
> +	fl4.saddr = params->ipv4_src;
> +	fl4.fl4_sport = params->sport;
> +	fl4.fl4_dport = params->dport;
> +
> +	if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +		struct fib_table *tb;
> +
> +		tb = fib_get_table(net, tbid);
> +		if (unlikely(!tb))
> +			return 0;
> +
> +		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
> +	} else {
> +		fl4.flowi4_mark = 0;
> +		fl4.flowi4_secid = 0;
> +		fl4.flowi4_tun_key.tun_id = 0;
> +		fl4.flowi4_uid = sock_net_uid(net, NULL);
> +
> +		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
> +	}
> +
> +	if (err || res.type != RTN_UNICAST)
> +		return 0;
> +
> +	if (res.fi->fib_nhs > 1)
> +		fib_select_path(net, &res, &fl4, NULL);
> +
> +	nh = &res.fi->fib_nh[res.nh_sel];
> +
> +	/* do not handle lwt encaps right now */
> +	if (nh->nh_lwtstate)
> +		return 0;
> +
> +	dev = nh->nh_dev;
> +	if (unlikely(!dev))
> +		return 0;
> +
> +	if (nh->nh_gw)
> +		params->ipv4_dst = nh->nh_gw;
> +
> +	params->rt_metric = res.fi->fib_priority;
> +
> +	/* xdp and cls_bpf programs are run in RCU-bh so
> +	 * rcu_read_lock_bh is not needed here
> +	 */
> +	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> +	if (neigh)
> +		return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +	return 0;
> +}
> +#endif
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +static int bpf_ipv6_fib_lookup(struct xdp_buff *ctx,

Same here.

> +			       struct bpf_fib_lookup *params, u32 flags)
> +{
> +	struct net *net = dev_net(ctx->rxq->dev);
> +	struct neighbour *neigh;
> +	struct net_device *dev;
> +	struct fib6_info *f6i;
> +	struct flowi6 fl6;
> +	int strict = 0;
> +	int oif;
> +
> +	/* link local addresses are never forwarded */
> +	if (rt6_need_strict(&params->ipv6_dst) ||
> +	    rt6_need_strict(&params->ipv6_src))
> +		return 0;
> +
> +	dev = dev_get_by_index_rcu(net, params->ifindex);
> +	if (unlikely(!dev))
> +		return -ENODEV;
> +
> +	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
> +		fl6.flowi6_iif = 1;
> +		oif = fl6.flowi6_oif = params->ifindex;
> +	} else {
> +		oif = fl6.flowi6_iif = params->ifindex;
> +		fl6.flowi6_oif = 0;
> +		strict = RT6_LOOKUP_F_HAS_SADDR;
> +	}
> +	fl6.flowlabel = params->flowlabel;
> +	fl6.flowi6_scope = 0;
> +	fl6.flowi6_flags = 0;
> +	fl6.mp_hash = 0;
> +
> +	fl6.flowi6_proto = params->l4_protocol;
> +	fl6.daddr = params->ipv6_dst;
> +	fl6.saddr = params->ipv6_src;
> +	fl6.fl6_sport = params->sport;
> +	fl6.fl6_dport = params->dport;
> +
> +	if (flags & BPF_FIB_LOOKUP_DIRECT) {
> +		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
> +		struct fib6_table *tb;
> +
> +		tb = ipv6_stub->fib6_get_table(net, tbid);
> +		if (unlikely(!tb))
> +			return 0;
> +
> +		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
> +	} else {
> +		fl6.flowi6_mark = 0;
> +		fl6.flowi6_secid = 0;
> +		fl6.flowi6_tun_key.tun_id = 0;
> +		fl6.flowi6_uid = sock_net_uid(net, NULL);
> +
> +		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
> +	}
> +
> +	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> +		return 0;
> +
> +	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> +	    f6i->fib6_type != RTN_UNICAST))
> +		return 0;
> +
> +	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
> +		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> +						       fl6.flowi6_oif, NULL,
> +						       strict);
> +
> +	if (f6i->fib6_nh.nh_lwtstate)
> +		return 0;
> +
> +	if (f6i->fib6_flags & RTF_GATEWAY)
> +		params->ipv6_dst = f6i->fib6_nh.nh_gw;
> +
> +	dev = f6i->fib6_nh.nh_dev;
> +	params->rt_metric = f6i->fib6_metric;
> +
> +	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> +	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
> +	 * because we need to get nd_tbl via the stub
> +	 */
> +	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
> +				      ndisc_hashfn, &params->ipv6_dst, dev);
> +	if (neigh)
> +		return bpf_fib_set_fwd_params(params, neigh, dev);
> +
> +	return 0;
> +}
> +#endif
> +
> +BPF_CALL_4(bpf_fib_lookup, struct xdp_buff *, ctx,
> +	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
> +{
> +	if (plen < sizeof(*params))
> +		return -EINVAL;
> +
> +	switch (params->family) {
> +#if IS_ENABLED(CONFIG_INET)
> +	case AF_INET:
> +		return bpf_ipv4_fib_lookup(ctx, params, flags);
> +#endif
> +#if IS_ENABLED(CONFIG_IPV6)
> +	case AF_INET6:
> +		return bpf_ipv6_fib_lookup(ctx, params, flags);
> +#endif
> +	}
> +	return -ENOTSUPP;
> +}
> +
> +static const struct bpf_func_proto bpf_fib_lookup_proto = {
> +	.func		= bpf_fib_lookup,
> +	.gpl_only	= true,
> +	.pkt_access	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type      = ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_PTR_TO_MEM,
> +	.arg3_type      = ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  bpf_base_func_proto(enum bpf_func_id func_id)
>  {
> @@ -3861,6 +4090,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_get_socket_cookie_proto;
>  	case BPF_FUNC_get_socket_uid:
>  		return &bpf_get_socket_uid_proto;
> +	case BPF_FUNC_fib_lookup:
> +		return &bpf_fib_lookup_proto;

This part doesn't belong in sk_filter_func_proto(); it belongs in
tc_cls_act_func_proto() instead.

>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> @@ -3957,6 +4188,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_xdp_redirect_map_proto;
>  	case BPF_FUNC_xdp_adjust_tail:
>  		return &bpf_xdp_adjust_tail_proto;
> +	case BPF_FUNC_fib_lookup:
> +		return &bpf_fib_lookup_proto;

Basically, you're using the very same bpf_fib_lookup_proto for
both XDP and skb. In the skb case, you're reusing the two functions
bpf_ipv{4,6}_fib_lookup(), so when you get the netdev pointer for
retrieving the netns, you'll crash at dev_net(ctx->rxq->dev), since
ctx->rxq exists only in the XDP context and is not part of the skb meta data.

Therefore, as mentioned, pass the netdev to bpf_ipv{4,6}_fib_lookup()
to keep it generic, and add bpf_xdp_fib_lookup_proto and
bpf_skb_fib_lookup_proto, each returned under the case BPF_FUNC_fib_lookup
in the respective *func_proto(), using the proper prototype for
its context. That is, both reuse bpf_ipv{4,6}_fib_lookup()
from their respective BPF_CALL_4() helper implementations.

>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
>