lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 14 Mar 2018 15:55:49 +0100
From:   Daniel Borkmann <daniel@...earbox.net>
To:     Alexei Starovoitov <ast@...nel.org>, davem@...emloft.net
Cc:     netdev@...r.kernel.org, kernel-team@...com
Subject: Re: [PATCH RFC bpf-next 1/6] bpf: Hooks for sys_bind

On 03/14/2018 03:37 PM, Daniel Borkmann wrote:
> On 03/14/2018 04:39 AM, Alexei Starovoitov wrote:
> [...]
>> +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr)			       \
>> +	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
>> +
>> +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr)			       \
>> +	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
>> +
>>  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
>>  ({									       \
>>  	int __ret = 0;							       \
>> @@ -135,6 +154,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
>>  #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
>>  #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
>>  #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
>> +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
>> +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
>>  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
>>  #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
>>  
>> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
>> index 19b8349a3809..eefd877f8e68 100644
>> --- a/include/linux/bpf_types.h
>> +++ b/include/linux/bpf_types.h
>> @@ -8,6 +8,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act)
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
>> +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET4_BIND, cg_inet4_bind)
>> +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET6_BIND, cg_inet6_bind)
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index fdb691b520c0..fe469320feab 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -1001,6 +1001,16 @@ static inline int bpf_tell_extensions(void)
>>  	return SKF_AD_MAX;
>>  }
>>  
>> +struct bpf_sock_addr_kern {
>> +	struct sock *sk;
>> +	struct sockaddr *uaddr;
>> +	/* Temporary "register" to make indirect stores to nested structures
>> +	 * defined above. We need three registers to make such a store, but
>> +	 * only two (src and dst) are available at convert_ctx_access time
>> +	 */
>> +	u64 tmp_reg;
>> +};
>> +
>>  struct bpf_sock_ops_kern {
>>  	struct	sock *sk;
>>  	u32	op;
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 2a66769e5875..78628a3f3cd8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -133,6 +133,8 @@ enum bpf_prog_type {
>>  	BPF_PROG_TYPE_SOCK_OPS,
>>  	BPF_PROG_TYPE_SK_SKB,
>>  	BPF_PROG_TYPE_CGROUP_DEVICE,
>> +	BPF_PROG_TYPE_CGROUP_INET4_BIND,
>> +	BPF_PROG_TYPE_CGROUP_INET6_BIND,
> 
> Could those all be merged into BPF_PROG_TYPE_SOCK_OPS? I'm slowly getting
> confused by the many sock_*/sk_* prog types we have. The attach hook could
> still be something like BPF_CGROUP_BIND/BPF_CGROUP_CONNECT. Potentially
> storing some prog-type specific void *private_data in prog's aux during
> verification could be a way (similarly as you mention) which can later be
> retrieved at attach time to reject with -ENOTSUPP or such.
> 
>>  };
>>  
>>  enum bpf_attach_type {
>> @@ -143,6 +145,8 @@ enum bpf_attach_type {
>>  	BPF_SK_SKB_STREAM_PARSER,
>>  	BPF_SK_SKB_STREAM_VERDICT,
>>  	BPF_CGROUP_DEVICE,
>> +	BPF_CGROUP_INET4_BIND,
>> +	BPF_CGROUP_INET6_BIND,
> 
> Binding to v4 mapped v6 address would work as well, right? Can't this be
> squashed into one attach type as mentioned?
> 
>>  	__MAX_BPF_ATTACH_TYPE
>>  };
>>  
>> @@ -953,6 +957,26 @@ struct bpf_map_info {
>>  	__u64 netns_ino;
>>  } __attribute__((aligned(8)));
>>  
>> +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
>> + * by user and intended to be used by socket (e.g. to bind to, depends on
>> + * attach type).
>> + */
>> +struct bpf_sock_addr {
>> +	__u32 user_family;	/* Allows 4-byte read, but no write. */
>> +	__u32 user_ip4;		/* Allows 1,2,4-byte read and 4-byte write.
>> +				 * Stored in network byte order.
>> +				 */
>> +	__u32 user_ip6[4];	/* Allows 1,2,4-byte read and 4-byte write.
>> +				 * Stored in network byte order.
>> +				 */
>> +	__u32 user_port;	/* Allows 4-byte read and write.
>> +				 * Stored in network byte order
>> +				 */
>> +	__u32 family;		/* Allows 4-byte read, but no write */
>> +	__u32 type;		/* Allows 4-byte read, but no write */
>> +	__u32 protocol;		/* Allows 4-byte read, but no write */
> 
> I recall bind to prefix came up from time to time in BPF context in the sense
> to let the app itself be more flexible to bind to BPF prog. Do you see also app
> to be able to add a BPF prog into the array itself?
> 
>> +};
>> +
>>  /* User bpf_sock_ops struct to access socket values and specify request ops
>>   * and their replies.
>>   * Some of these fields are in network (bigendian) byte order and may need
>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> index c1c0b60d3f2f..78ef086a7c2d 100644
>> --- a/kernel/bpf/cgroup.c
>> +++ b/kernel/bpf/cgroup.c
>> @@ -495,6 +495,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
>>  EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
>>  
>>  /**
>> + * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and a
>> + *                                       sockaddr provided by user
>> + * @sk: sock struct that will use sockaddr
>> + * @uaddr: sockaddr struct provided by user
>> + * @type: The type of program to be executed
>> + *
>> + * socket is expected to be of type INET or INET6.
>> + *
>> + * This function will return %-EPERM if an attached program is found and
>> + * returned value != 1 during execution. In all other cases, 0 is returned.
>> + */
>> +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
>> +				      struct sockaddr *uaddr,
>> +				      enum bpf_attach_type type)
>> +{
>> +	struct bpf_sock_addr_kern ctx = {
>> +		.sk = sk,
>> +		.uaddr = uaddr,
>> +	};
>> +	struct cgroup *cgrp;
>> +	int ret;
>> +
>> +	/* Check socket family since not all sockets represent network
>> +	 * endpoint (e.g. AF_UNIX).
>> +	 */
>> +	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
>> +		return 0;
>> +
>> +	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
>> +	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
>> +
>> +	return ret == 1 ? 0 : -EPERM;
> 
> Semantics may be a little bit strange, though this would perhaps be at the risk
> of the orchestrator(s) (?). Basically when you run through the prog array, then
> the last attached program in that array has the final /real/ say to which address
> to bind/connect to; all the others decisions can freely be overridden, so this
> is dependent on the order the BPF progs how they were attached. I think we don't
> have this case for context in multi-prog tracing, cgroup/inet (only filtering)
> and cgroup/dev. Although in cgroup/sock_ops for the tcp/BPF hooks progs can already
> override the sock_ops.reply (and sk_txhash which may be less relevant) field from
> the ctx, so whatever one prog is assumed to reply back to the caller, another one
> could override it. Wouldn't it make more sense to just have a single prog instead
> to avoid this override/ordering issue?

Sigh, scratch that last thought ... lack of coffee, it all depends on where in the
cgroup hierarchy you are to be able to override of course.

Thanks,
Daniel

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ