netdev - Re: [PATCH bpf-next v2 2/3] bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <50573342-5ca2-6001-8e5d-e205702adcdb@gmail.com>
Date:   Mon, 28 Jan 2019 13:31:46 -0700
From:   David Ahern <dsahern@...il.com>
To:     Peter Oskolkov <posk@...gle.com>,
        Alexei Starovoitov <ast@...nel.org>,
        Daniel Borkmann <daniel@...earbox.net>, netdev@...r.kernel.org
Cc:     Peter Oskolkov <posk.devel@...il.com>,
        Willem de Bruijn <willemb@...gle.com>
Subject: Re: [PATCH bpf-next v2 2/3] bpf: implement BPF_LWT_ENCAP_IP mode in
 bpf_lwt_push_encap

On 1/24/19 12:34 PM, Peter Oskolkov wrote:
> This patch implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
> BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
> and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
> to packets (e.g. IP/GRE, GUE, IPIP).
> 
> This is useful when thousands of different short-lived flows should be
> encapped, each with different and dynamically determined destination.
> Although lwtunnels can be used in some of these scenarios, the ability
> to dynamically generate encap headers adds more flexibility, e.g.
> when routing depends on the state of the host (reflected in global bpf
> maps).
> 
> Signed-off-by: Peter Oskolkov <posk@...gle.com>
> ---
>  include/net/lwtunnel.h |   3 +
>  net/core/filter.c      |   3 +-
>  net/core/lwt_bpf.c     | 142 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 147 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
> index 33fd9ba7e0e5..f0973eca8036 100644
> --- a/include/net/lwtunnel.h
> +++ b/include/net/lwtunnel.h
> @@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
>  int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
>  int lwtunnel_input(struct sk_buff *skb);
>  int lwtunnel_xmit(struct sk_buff *skb);
> +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
> +			  bool ingress);
>  
>  static inline void lwtunnel_set_redirect(struct dst_entry *dst)
>  {
> @@ -138,6 +140,7 @@ static inline void lwtunnel_set_redirect(struct dst_entry *dst)
>  		dst->input = lwtunnel_input;
>  	}
>  }
> +
>  #else
>  
>  static inline void lwtstate_free(struct lwtunnel_state *lws)
> diff --git a/net/core/filter.c b/net/core/filter.c
> index fd3ae092d3d7..81d18660c38b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -73,6 +73,7 @@
>  #include <linux/seg6_local.h>
>  #include <net/seg6.h>
>  #include <net/seg6_local.h>
> +#include <net/lwtunnel.h>
>  
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -4796,7 +4797,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
>  static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
>  			     bool ingress)
>  {
> -	return -EINVAL;  /* Implemented in the next patch. */
> +	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
>  }
>  
>  BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
> diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
> index 3e85437f7106..a3f79bff3776 100644
> --- a/net/core/lwt_bpf.c
> +++ b/net/core/lwt_bpf.c
> @@ -16,6 +16,7 @@
>  #include <linux/types.h>
>  #include <linux/bpf.h>
>  #include <net/lwtunnel.h>
> +#include <net/ip6_route.h>
>  
>  struct bpf_lwt_prog {
>  	struct bpf_prog *prog;
> @@ -55,6 +56,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
>  
>  	switch (ret) {
>  	case BPF_OK:
> +	case BPF_LWT_REROUTE:
>  		break;
>  
>  	case BPF_REDIRECT:
> @@ -97,6 +99,8 @@ static int bpf_input(struct sk_buff *skb)
>  		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
>  		if (ret < 0)
>  			return ret;
> +		if (ret == BPF_LWT_REROUTE)
> +			return dst_input(skb);
>  	}
>  
>  	if (unlikely(!dst->lwtstate->orig_input)) {
> @@ -168,6 +172,13 @@ static int bpf_xmit(struct sk_buff *skb)
>  			return LWTUNNEL_XMIT_CONTINUE;
>  		case BPF_REDIRECT:
>  			return LWTUNNEL_XMIT_DONE;
> +		case BPF_LWT_REROUTE:
> +			ret = dst_output(dev_net(skb_dst(skb)->dev),
> +					 skb->sk, skb);
> +			if (unlikely(ret))
> +				return ret;
> +			/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
> +			return LWTUNNEL_XMIT_DONE;
>  		default:
>  			return ret;
>  		}
> @@ -389,6 +400,137 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
>  	.owner		= THIS_MODULE,
>  };
>  
> +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
> +{
> +	struct dst_entry *dst = NULL;
> +	struct iphdr *iph;
> +	bool ipv4;
> +	int err;
> +
> +	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
> +		return -EINVAL;
> +
> +	/* validate protocol and length */
> +	iph = (struct iphdr *)hdr;
> +	if (iph->version == 4) {
> +		ipv4 = true;
> +		if (iph->ihl * 4 > len)
> +			return -EINVAL;
> +	} else if (iph->version == 6) {
> +		ipv4 = false;
> +		if (unlikely(len < sizeof(struct ipv6hdr)))
> +			return -EINVAL;
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	/* allocate enough space for the encap headers + L2 hdr */
> +	if (ingress) {
> +		err = skb_cow_head(skb, len + skb->mac_len);
> +		if (unlikely(err))
> +			return err;
> +	} else {
> +		/* ip_route_input_noref below does route lookup and dst
> +		 * drop/set for ingress. There is no similar function for
> +		 * egress, so we need to do route lookup and replace skb's
> +		 * dst in this function.
> +		 */
> +		struct sock *sk;
> +		struct net *net;
> +
> +		sk = sk_to_full_sk(skb->sk);
> +		if (sk)
> +			net = sock_net(sk);
> +		else
> +			net = dev_net(skb_dst(skb)->dev);

This delta gets the VRF tests to pass too. Also, you should always be
able to get net from the device.

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 526b7cfc6d52..79feebd6da34 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -436,20 +436,24 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
                 * egress, so we need to do route lookup and replace skb's
                 * dst in this function.
                 */
+               struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+               int oif = l3mdev ? l3mdev->ifindex : 0;
                struct sock *sk;
                struct net *net;

                sk = sk_to_full_sk(skb->sk);
-               if (sk)
+               if (sk) {
+                       if (sk->sk_bound_dev_if)
+                               oif = sk->sk_bound_dev_if;
                        net = sock_net(sk);
-               else
+               } else
                        net = dev_net(skb_dst(skb)->dev);

                if (ipv4) {
                        struct flowi4 fl4 = {0};
                        struct rtable *rt;

-                       fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
+                       fl4.flowi4_oif = oif;
                        fl4.flowi4_mark = skb->mark;
                        fl4.flowi4_uid = sock_net_uid(net, sk);
                        fl4.flowi4_tos = RT_TOS(iph->tos);
@@ -466,7 +470,7 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
                        struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
                        struct flowi6 fl6 = {0};

-                       fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
+                       fl6.flowi6_oif = oif;
                        fl6.flowi6_mark = skb->mark;
                        fl6.flowi6_uid = sock_net_uid(net, sk);
                        fl6.flowlabel = ip6_flowinfo(iph6);

> +
> +		if (ipv4) {
> +			struct flowi4 fl4 = {0};
> +			struct rtable *rt;
> +
> +			fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
> +			fl4.flowi4_mark = skb->mark;
> +			fl4.flowi4_uid = sock_net_uid(net, sk);
> +			fl4.flowi4_tos = RT_TOS(iph->tos);
> +			fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
> +			fl4.flowi4_proto = iph->protocol;
> +			fl4.daddr = iph->daddr;
> +			fl4.saddr = iph->saddr;
> +
> +			rt = ip_route_output_key(net, &fl4);
> +			if (IS_ERR(rt) || rt->dst.error)
> +				return -EINVAL;
> +			dst = &rt->dst;
> +		} else {
> +			struct ipv6hdr *iph6 = (struct ipv6hdr *)hdr;
> +			struct flowi6 fl6 = {0};
> +
> +			fl6.flowi6_oif = sk ? sk->sk_bound_dev_if : 0;
> +			fl6.flowi6_mark = skb->mark;
> +			fl6.flowi6_uid = sock_net_uid(net, sk);
> +			fl6.flowlabel = ip6_flowinfo(iph6);
> +			fl6.flowi6_proto = iph6->nexthdr;
> +			fl6.daddr = iph6->daddr;
> +			fl6.saddr = iph6->saddr;
> +
> +			dst = ip6_route_output(net, skb->sk, &fl6);
> +			if (IS_ERR(dst) || dst->error)
> +				return -EINVAL;
> +		}
> +
> +		err = skb_cow_head(skb, len + LL_RESERVED_SPACE(dst->dev));
> +		if (unlikely(err))
> +			return err;
> +	}
> +
> +	/* push the encap headers and fix pointers */
> +	skb_reset_inner_headers(skb);
> +	skb->encapsulation = 1;
> +	skb_push(skb, len);
> +	if (ingress)
> +		skb_postpush_rcsum(skb, iph, len);
> +	skb_reset_network_header(skb);
> +	iph = ip_hdr(skb);
> +	memcpy(iph, hdr, len);

Calling it iph and using ip_hdr() seems wrong given that hdr can also be
an IPv6 header. Why not just use skb_network_header()?

> +	bpf_compute_data_pointers(skb);
> +
> +	/* final skb touches + routing */
> +	if (ipv4) {
> +		skb->protocol = htons(ETH_P_IP);
> +		if (iph->ihl * 4 < len)
> +			skb_set_transport_header(skb, iph->ihl * 4);
> +
> +		if (!iph->check)
> +			iph->check = ip_fast_csum((unsigned char *)iph,
> +						  iph->ihl);
> +
> +		if (ingress) {
> +			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
> +						   iph->tos, skb_dst(skb)->dev);
> +			if (err)
> +				return err;
> +		} else {
> +			skb_dst_drop(skb);
> +			skb_dst_set(skb, dst);
> +		}
> +	} else {
> +		skb->protocol = htons(ETH_P_IPV6);
> +		if (sizeof(struct ipv6hdr) < len)
> +			skb_set_transport_header(skb, sizeof(struct ipv6hdr));
> +
> +		if (ingress) {
> +			ip6_route_input(skb);
> +			if (skb_dst(skb)->error)
> +				return skb_dst(skb)->error;
> +		} else {
> +			skb_dst_drop(skb);
> +			skb_dst_set(skb, dst);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int __init bpf_lwt_init(void)
>  {
>  	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
>