netdev - Re: [PATCHv2 bpf-next 07/11] bpf: Add helper to retrieve socket in BPF

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d2b803a4-509f-6d7f-1177-f3ccc1f875eb@iogearbox.net>
Date:   Mon, 24 Sep 2018 14:12:30 +0200
From:   Daniel Borkmann <daniel@...earbox.net>
To:     Joe Stringer <joe@...d.net.nz>, ast@...nel.org
Cc:     netdev@...r.kernel.org, john.fastabend@...il.com, tgraf@...g.ch,
        kafai@...com, nitin.hande@...il.com, mauricio.vasquez@...ito.it
Subject: Re: [PATCHv2 bpf-next 07/11] bpf: Add helper to retrieve socket in
 BPF

Hi Joe,

couple of comments inline:

On 09/21/2018 07:10 PM, Joe Stringer wrote:
> This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
> bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
> socket listening on this host, and returns a socket pointer which the
> BPF program can then access to determine, for instance, whether to
> forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
> socket, so when a BPF program makes use of this function, it must
> subsequently pass the returned pointer into the newly added sk_release()
> to return the reference.
> 
> By way of example, the following pseudocode would filter inbound
> connections at XDP if there is no corresponding service listening for
> the traffic:
> 
>   struct bpf_sock_tuple tuple;
>   struct bpf_sock_ops *sk;
> 
>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>   sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
>   if (!sk) {
>     // Couldn't find a socket listening for this traffic. Drop.
>     return TC_ACT_SHOT;
>   }
>   bpf_sk_release(sk, 0);
>   return TC_ACT_OK;
> 
> Signed-off-by: Joe Stringer <joe@...d.net.nz>
> 
> ---
> 
> v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer
>     Limit netns_id field to 32 bits
>     Fix compile error with CONFIG_IPV6 enabled
>     Allow direct packet access from helper
> ---
>  include/uapi/linux/bpf.h                  |  57 ++++++++-
>  kernel/bpf/verifier.c                     |   8 +-
>  net/core/filter.c                         | 149 ++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h            |  57 ++++++++-
>  tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
>  5 files changed, 280 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index aa5ccd2385ed..620adbb09a94 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2143,6 +2143,41 @@ union bpf_attr {
>   *		request in the skb.
>   *	Return
>   *		0 on success, or a negative error in case of failure.
> + *
> + * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)

Nit: could you add proper signature like in other cases that are documented?

> + * 	Decription
> + * 		Look for TCP socket matching 'tuple'. The return value must
> + * 		be checked, and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@netns: network namespace id
> + * 		@flags: flags value

Should probably say in all three cases that it's unused right now and reserved
for future.

> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
> + * 	Decription
> + * 		Look for UDP socket matching 'tuple'. The return value must
> + * 		be checked, and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@netns: network namespace id
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + * 	Description
> + * 		Release the reference held by 'sock'.
> + * 		@sock: Pointer reference to release. Must be found via
> + * 		       bpf_sk_lookup_xxx().
> + * 		@flags: flags value
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -2228,7 +2263,10 @@ union bpf_attr {
>  	FN(get_current_cgroup_id),	\
>  	FN(get_local_storage),		\
>  	FN(sk_select_reuseport),	\
> -	FN(skb_ancestor_cgroup_id),
> +	FN(skb_ancestor_cgroup_id),	\
> +	FN(sk_lookup_tcp),		\
> +	FN(sk_lookup_udp),		\
> +	FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2398,6 +2436,23 @@ struct bpf_sock {
>  				 */
>  };
>  
> +struct bpf_sock_tuple {
> +	union {
> +		struct {
> +			__be32 saddr;
> +			__be32 daddr;
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv4;
> +		struct {
> +			__be32 saddr[4];
> +			__be32 daddr[4];
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv6;
> +	};
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 16818508b225..7b7fa94aba58 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
>   * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
>   * passes through a NULL-check conditional. For the branch wherein the state is
>   * changed to CONST_IMM, the verifier releases the reference.
> + *
> + * For each helper function that allocates a reference, such as
> + * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
> + * bpf_sk_release(). When a reference type passes into the release function,
> + * the verifier also releases the reference. If any unchecked or unreleased
> + * reference remains at the end of the program, the verifier rejects it.
>   */
>  
>  /* verifier_state + insn_idx are pushed to stack when branch is encountered */
> @@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
>   */
>  static bool is_release_function(enum bpf_func_id func_id)
>  {
> -	return false;
> +	return func_id == BPF_FUNC_sk_release;
>  }
>  
>  /* string representation of 'enum bpf_reg_type' */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 43f81883f31d..f0715e2e7b07 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -58,13 +58,17 @@
>  #include <net/busy_poll.h>
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
> +#include <net/udp.h>
>  #include <linux/bpf_trace.h>
>  #include <net/xdp_sock.h>
>  #include <linux/inetdevice.h>
> +#include <net/inet_hashtables.h>
> +#include <net/inet6_hashtables.h>
>  #include <net/ip_fib.h>
>  #include <net/flow.h>
>  #include <net/arp.h>
>  #include <net/ipv6.h>
> +#include <net/net_namespace.h>
>  #include <linux/seg6_local.h>
>  #include <net/seg6.h>
>  #include <net/seg6_local.h>
> @@ -4812,6 +4816,139 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
>  };
>  #endif /* CONFIG_IPV6_SEG6_BPF */
>  
> +struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
> +		       struct sk_buff *skb, u8 family, u8 proto)
> +{
> +	int dif = skb->dev->ifindex;
> +	bool refcounted = false;
> +	struct sock *sk = NULL;
> +
> +	if (family == AF_INET) {
> +		__be32 src4 = tuple->ipv4.saddr;
> +		__be32 dst4 = tuple->ipv4.daddr;
> +		int sdif = inet_sdif(skb);
> +
> +		if (proto == IPPROTO_TCP)
> +			sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
> +					   src4, tuple->ipv4.sport,
> +					   dst4, tuple->ipv4.dport,
> +					   dif, sdif, &refcounted);
> +		else
> +			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
> +					       dst4, tuple->ipv4.dport,
> +					       dif, sdif, &udp_table, skb);
> +#if IS_ENABLED(CONFIG_IPV6)
> +	} else {
> +		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
> +		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
> +		int sdif = inet6_sdif(skb);
> +
> +		if (proto == IPPROTO_TCP)
> +			sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
> +					    src6, tuple->ipv6.sport,
> +					    dst6, tuple->ipv6.dport,
> +					    dif, sdif, &refcounted);
> +		else
> +			sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport,
> +					       dst6, tuple->ipv6.dport,
> +					       dif, sdif, &udp_table, skb);
> +#endif
> +	}
> +
> +	if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
> +		WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
> +		sk = NULL;
> +	}
> +	return sk;
> +}
> +
> +/* bpf_sk_lookup performs the core lookup for different types of sockets,
> + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
> + * Returns the socket as an 'unsigned long' to simplify the casting in the
> + * callers to satisfy BPF_CALL declarations.
> + */
> +static unsigned long
> +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> +	      u8 proto, u64 netns_id, u64 flags)
> +{
> +	struct net *caller_net = dev_net(skb->dev);

For sk_skb programs, are we *always* guaranteed to have a skb->dev assigned?

This definitely holds true for tc programs, but afaik not for sk_skb ones where
you enable the helpers below, so this would result in a NULL ptr dereference.

> +	struct sock *sk = NULL;
> +	u8 family = AF_UNSPEC;
> +	struct net *net;
> +
> +	family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
> +	if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
> +		goto out;
> +
> +	if (netns_id)
> +		net = get_net_ns_by_id(caller_net, netns_id);
> +	else
> +		net = caller_net;
> +	if (unlikely(!net))
> +		goto out;
> +	sk = sk_lookup(net, tuple, skb, family, proto);
> +	put_net(net);
> +
> +	if (sk)
> +		sk = sk_to_full_sk(sk);
> +out:
> +	return (unsigned long) sk;
> +}
> +
> +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
> +	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
> +{
> +	return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
> +}
> +
> +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
> +	.func		= bpf_sk_lookup_tcp,
> +	.gpl_only	= false,
> +	.pkt_access	= true,
> +	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_PTR_TO_MEM,
> +	.arg3_type	= ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
> +	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
> +{
> +	return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
> +}
> +
> +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
> +	.func		= bpf_sk_lookup_udp,
> +	.gpl_only	= false,
> +	.pkt_access	= true,
> +	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_PTR_TO_MEM,
> +	.arg3_type	= ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
> +{
> +	if (!sock_flag(sk, SOCK_RCU_FREE))
> +		sock_gen_put(sk);
> +
> +	if (unlikely(flags))
> +		return -EINVAL;

I guess it's probably okay to leave here, though I'm wondering whether it's
worth to have a flags part in general in this helper. We need to release the
reference in any case beforehands anyway as we otherwise leak it. Personally,
I'd probably that arg here.

> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_sk_release_proto = {
> +	.func		= bpf_sk_release,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_SOCKET,
> +	.arg2_type	= ARG_ANYTHING,
> +};
> +
>  bool bpf_helper_changes_pkt_data(void *func)
>  {
>  	if (func == bpf_skb_vlan_push ||
> @@ -5018,6 +5155,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  	case BPF_FUNC_skb_ancestor_cgroup_id:
>  		return &bpf_skb_ancestor_cgroup_id_proto;
>  #endif
> +	case BPF_FUNC_sk_lookup_tcp:
> +		return &bpf_sk_lookup_tcp_proto;
> +	case BPF_FUNC_sk_lookup_udp:
> +		return &bpf_sk_lookup_udp_proto;
> +	case BPF_FUNC_sk_release:
> +		return &bpf_sk_release_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> @@ -5118,6 +5261,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_sk_redirect_hash_proto;
>  	case BPF_FUNC_get_local_storage:
>  		return &bpf_get_local_storage_proto;
> +	case BPF_FUNC_sk_lookup_tcp:
> +		return &bpf_sk_lookup_tcp_proto;
> +	case BPF_FUNC_sk_lookup_udp:
> +		return &bpf_sk_lookup_udp_proto;
> +	case BPF_FUNC_sk_release:
> +		return &bpf_sk_release_proto;

(See comment above.)

>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index aa5ccd2385ed..620adbb09a94 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -2143,6 +2143,41 @@ union bpf_attr {
>   *		request in the skb.
>   *	Return
>   *		0 on success, or a negative error in case of failure.
> + *
> + * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)
> + * 	Decription
> + * 		Look for TCP socket matching 'tuple'. The return value must
> + * 		be checked, and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@netns: network namespace id
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
> + * 	Decription
> + * 		Look for UDP socket matching 'tuple'. The return value must
> + * 		be checked, and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@netns: network namespace id
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + * 	Description
> + * 		Release the reference held by 'sock'.
> + * 		@sock: Pointer reference to release. Must be found via
> + * 		       bpf_sk_lookup_xxx().
> + * 		@flags: flags value
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -2228,7 +2263,10 @@ union bpf_attr {
>  	FN(get_current_cgroup_id),	\
>  	FN(get_local_storage),		\
>  	FN(sk_select_reuseport),	\
> -	FN(skb_ancestor_cgroup_id),
> +	FN(skb_ancestor_cgroup_id),	\
> +	FN(sk_lookup_tcp),		\
> +	FN(sk_lookup_udp),		\
> +	FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2398,6 +2436,23 @@ struct bpf_sock {
>  				 */
>  };
>  
> +struct bpf_sock_tuple {
> +	union {
> +		struct {
> +			__be32 saddr;
> +			__be32 daddr;
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv4;
> +		struct {
> +			__be32 saddr[4];
> +			__be32 daddr[4];
> +			__be16 sport;
> +			__be16 dport;
> +		} ipv6;
> +	};
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
> index e4be7730222d..88ce00c3aa0f 100644
> --- a/tools/testing/selftests/bpf/bpf_helpers.h
> +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> @@ -143,6 +143,18 @@ static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) =
>  	(void *) BPF_FUNC_skb_cgroup_id;
>  static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) =
>  	(void *) BPF_FUNC_skb_ancestor_cgroup_id;
> +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx,
> +					     struct bpf_sock_tuple *tuple,
> +					     int size, unsigned int netns_id,
> +					     unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_lookup_tcp;
> +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
> +					     struct bpf_sock_tuple *tuple,
> +					     int size, unsigned int netns_id,
> +					     unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_lookup_udp;
> +static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_release;
>  
>  /* llvm builtin functions that eBPF C program may use to
>   * emit BPF_LD_ABS and BPF_LD_IND instructions
>