[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180314033934.3502167-7-ast@kernel.org>
Date: Tue, 13 Mar 2018 20:39:34 -0700
From: Alexei Starovoitov <ast@...nel.org>
To: <davem@...emloft.net>
CC: <daniel@...earbox.net>, <netdev@...r.kernel.org>,
<kernel-team@...com>
Subject: [PATCH RFC bpf-next 6/6] bpf: Post-hooks for sys_bind
From: Andrey Ignatov <rdna@...com>
"Post-hooks" are hooks that are called right before returning from
sys_bind. At this time IP and port are already allocated and no further
changes to `struct sock` can happen before returning from sys_bind but
BPF program has a chance to inspect the socket and change sys_bind
result.
Specifically it can e.g. inspect what port was allocated and if it
doesn't satisfy some policy, BPF program can force sys_bind to release
that port and return an error to user.
Another example of usage is recording the IP:port pair to some map to
use it in later calls to sys_connect. E.g. if some TCP server inside
cgroup was bound to some IP:port and then some TCP client inside same
cgroup is trying to connect to 127.0.0.1:port then BPF hook for
sys_connect can override the destination and connect application to
IP:port instead of 127.0.0.1:port. That helps forcing all applications
inside a cgroup to use desired IP and not break those applications if
they user e.g. localhost to communicate between each other.
== Implementation details ==
Post-hooks are implemented as two new prog types
`BPF_PROG_TYPE_CGROUP_INET4_POST_BIND` and
`BPF_PROG_TYPE_CGROUP_INET6_POST_BIND` and corresponding attach types
`BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND`.
Separate prog types for IPv4 and IPv6 are introduced to avoid access to
IPv6 field in `struct sock` from `inet_bind()` and to IPv4 field from
`inet6_bind()` since those fields might not make sense in such cases.
`BPF_PROG_TYPE_CGROUP_SOCK` prog type is not reused because it provides
write access to some `struct sock` fields, but socket must not be
changed in post-hooks for sys_bind.
Signed-off-by: Andrey Ignatov <rdna@...com>
Signed-off-by: Alexei Starovoitov <ast@...nel.org>
---
include/linux/bpf-cgroup.h | 16 ++++-
include/linux/bpf_types.h | 2 +
include/uapi/linux/bpf.h | 13 ++++
kernel/bpf/syscall.c | 14 ++++
kernel/bpf/verifier.c | 2 +
net/core/filter.c | 170 ++++++++++++++++++++++++++++++++++++++++-----
net/ipv4/af_inet.c | 3 +-
net/ipv6/af_inet6.c | 3 +-
8 files changed, 202 insertions(+), 21 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 6b5c25ef1482..693c542632e3 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
__ret; \
})
-#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
+#define BPF_CGROUP_RUN_SK_PROG(sk, type) \
({ \
int __ret = 0; \
if (cgroup_bpf_enabled) { \
- __ret = __cgroup_bpf_run_filter_sk(sk, \
- BPF_CGROUP_INET_SOCK_CREATE); \
+ __ret = __cgroup_bpf_run_filter_sk(sk, type); \
} \
__ret; \
})
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
+ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE)
+
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \
+ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND)
+
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \
+ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND)
+
#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \
({ \
int __ret = 0; \
@@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 52a571827b9f..23a97978b544 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -10,6 +10,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET4_BIND, cg_inet4_bind)
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET6_BIND, cg_inet6_bind)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET4_POST_BIND, cg_inet4_post_bind)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET6_POST_BIND, cg_inet6_post_bind)
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET4_CONNECT, cg_inet4_connect)
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_INET6_CONNECT, cg_inet6_connect)
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 441a674f385a..7dcc75a65a97 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -137,6 +137,8 @@ enum bpf_prog_type {
BPF_PROG_TYPE_CGROUP_INET6_BIND,
BPF_PROG_TYPE_CGROUP_INET4_CONNECT,
BPF_PROG_TYPE_CGROUP_INET6_CONNECT,
+ BPF_PROG_TYPE_CGROUP_INET4_POST_BIND,
+ BPF_PROG_TYPE_CGROUP_INET6_POST_BIND,
};
enum bpf_attach_type {
@@ -151,6 +153,8 @@ enum bpf_attach_type {
BPF_CGROUP_INET6_BIND,
BPF_CGROUP_INET4_CONNECT,
BPF_CGROUP_INET6_CONNECT,
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET6_POST_BIND,
__MAX_BPF_ATTACH_TYPE
};
@@ -903,6 +907,15 @@ struct bpf_sock {
__u32 protocol;
__u32 mark;
__u32 priority;
+ __u32 src_ip4; /* Allows 1,2,4-byte read.
+ * Stored in network byte order.
+ */
+ __u32 src_ip6[4]; /* Allows 1,2,4-byte read.
+ * Stored in network byte order.
+ */
+ __u32 src_port; /* Allows 4-byte read.
+ * Stored in network byte order
+ */
};
#define XDP_PACKET_HEADROOM 256
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 145de3332e32..2eb941dacbc5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1382,6 +1382,12 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_BIND:
ptype = BPF_PROG_TYPE_CGROUP_INET6_BIND;
break;
+ case BPF_CGROUP_INET4_POST_BIND:
+ ptype = BPF_PROG_TYPE_CGROUP_INET4_POST_BIND;
+ break;
+ case BPF_CGROUP_INET6_POST_BIND:
+ ptype = BPF_PROG_TYPE_CGROUP_INET6_POST_BIND;
+ break;
case BPF_CGROUP_INET4_CONNECT:
ptype = BPF_PROG_TYPE_CGROUP_INET4_CONNECT;
break;
@@ -1449,6 +1455,12 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_BIND:
ptype = BPF_PROG_TYPE_CGROUP_INET6_BIND;
break;
+ case BPF_CGROUP_INET4_POST_BIND:
+ ptype = BPF_PROG_TYPE_CGROUP_INET4_POST_BIND;
+ break;
+ case BPF_CGROUP_INET6_POST_BIND:
+ ptype = BPF_PROG_TYPE_CGROUP_INET6_POST_BIND;
+ break;
case BPF_CGROUP_INET4_CONNECT:
ptype = BPF_PROG_TYPE_CGROUP_INET4_CONNECT;
break;
@@ -1504,6 +1516,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET_SOCK_CREATE:
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_SOCK_OPS:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cda7830a2c1b..84faec85fe3e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3874,6 +3874,8 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_INET4_BIND:
case BPF_PROG_TYPE_CGROUP_INET6_BIND:
+ case BPF_PROG_TYPE_CGROUP_INET4_POST_BIND:
+ case BPF_PROG_TYPE_CGROUP_INET6_POST_BIND:
case BPF_PROG_TYPE_CGROUP_INET4_CONNECT:
case BPF_PROG_TYPE_CGROUP_INET6_CONNECT:
case BPF_PROG_TYPE_SOCK_OPS:
diff --git a/net/core/filter.c b/net/core/filter.c
index 916195b86a23..e27196248c10 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3840,6 +3840,62 @@ static bool sock_filter_is_valid_access(int off, int size,
return true;
}
+static bool __sock_is_valid_access(unsigned short ctx_family, int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+ unsigned short requested_family = 0;
+
+ if (off < 0 || off >= sizeof(struct bpf_sock))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_sock, src_ip4):
+ requested_family = AF_INET;
+ /* FALLTHROUGH */
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+ if (!requested_family)
+ requested_family = AF_INET6;
+ /* Disallow access to IPv6 fields from IPv4 contex and vise
+ * versa.
+ */
+ if (requested_family != ctx_family)
+ return false;
+ bpf_ctx_record_field_size(info, size_default);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+ return false;
+ break;
+ case bpf_ctx_range(struct bpf_sock, family):
+ case bpf_ctx_range(struct bpf_sock, type):
+ case bpf_ctx_range(struct bpf_sock, protocol):
+ case bpf_ctx_range(struct bpf_sock, src_port):
+ if (size != size_default)
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static bool sock4_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ return __sock_is_valid_access(AF_INET, off, size, type, info);
+}
+
+static bool sock6_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ return __sock_is_valid_access(AF_INET6, off, size, type, info);
+}
+
static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
const struct bpf_prog *prog, int drop_verdict)
{
@@ -4406,6 +4462,40 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+static u32 __sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_family));
+ break;
+
+ case offsetof(struct bpf_sock, type):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
+ SK_FL_PROTO_SHIFT);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -4447,26 +4537,56 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
offsetof(struct sock, sk_priority));
break;
- case offsetof(struct bpf_sock, family):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+ default:
+ return __sock_convert_ctx_access(type, si, insn_buf, prog,
+ target_size);
+ }
- *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sock, sk_family));
- break;
+ return insn - insn_buf;
+}
- case offsetof(struct bpf_sock, type):
- *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sock, __sk_flags_offset));
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
- break;
+static u32 sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
- case offsetof(struct bpf_sock, protocol):
- *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sock, __sk_flags_offset));
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
+ switch (si->off) {
+ case offsetof(struct bpf_sock, src_ip4):
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_rcv_saddr,
+ FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr),
+ target_size));
break;
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+ off = si->off;
+ off -= offsetof(struct bpf_sock, src_ip6[0]);
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(
+ struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0],
+ FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]),
+ target_size) + off);
+ break;
+ case offsetof(struct bpf_sock, src_port):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_num),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_num,
+ FIELD_SIZEOF(struct sock_common,
+ skc_num),
+ target_size));
+ break;
+ default:
+ return __sock_convert_ctx_access(type, si, insn_buf, prog,
+ target_size);
}
return insn - insn_buf;
@@ -5122,6 +5242,24 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = {
const struct bpf_prog_ops cg_sock_prog_ops = {
};
+const struct bpf_verifier_ops cg_inet4_post_bind_verifier_ops = {
+ .get_func_proto = sock_filter_func_proto,
+ .is_valid_access = sock4_is_valid_access,
+ .convert_ctx_access = sock_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_inet4_post_bind_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_inet6_post_bind_verifier_ops = {
+ .get_func_proto = sock_filter_func_proto,
+ .is_valid_access = sock6_is_valid_access,
+ .convert_ctx_access = sock_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_inet6_post_bind_prog_ops = {
+};
+
const struct bpf_verifier_ops cg_inet4_bind_verifier_ops = {
.get_func_proto = inet_bind_func_proto,
.is_valid_access = sock_addr4_is_valid_access,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 488fe26ac8e5..28e2e7fdd5b1 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -521,7 +521,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Make sure we are allowed to bind here. */
if ((snum || !(inet->bind_address_no_port ||
force_bind_address_no_port)) &&
- sk->sk_prot->get_port(sk, snum)) {
+ (sk->sk_prot->get_port(sk, snum) ||
+ BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk))) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 13110bee5c14..473cc55a3a7d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -414,7 +414,8 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Make sure we are allowed to bind here. */
if ((snum || !(inet->bind_address_no_port ||
force_bind_address_no_port)) &&
- sk->sk_prot->get_port(sk, snum)) {
+ (sk->sk_prot->get_port(sk, snum) ||
+ BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk))) {
sk->sk_ipv6only = saved_ipv6only;
inet_reset_saddr(sk);
err = -EADDRINUSE;
--
2.9.5
Powered by blists - more mailing lists