[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210107184305.444635-4-sdf@google.com>
Date: Thu, 7 Jan 2021 10:43:05 -0800
From: Stanislav Fomichev <sdf@...gle.com>
To: netdev@...r.kernel.org, bpf@...r.kernel.org
Cc: ast@...nel.org, daniel@...earbox.net,
Stanislav Fomichev <sdf@...gle.com>,
Martin KaFai Lau <kafai@...com>,
Song Liu <songliubraving@...com>,
Eric Dumazet <edumazet@...gle.com>
Subject: [PATCH bpf-next v4 3/3] bpf: remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
2% overhead for locking/unlocking the socket.
Also:
- Removed BUILD_BUG_ON (zerocopy doesn't depend on the buf size anymore)
- Separated on-stack buffer into bpf_sockopt_buf and downsized to 32 bytes
(let's keep it to help with the other options)
(I can probably split this patch into two: add new features and rework
bpf_sockopt_buf; can follow up if the approach in general sounds
good).
Without this patch:
1.87% 0.06% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt
With the patch applied:
0.52% 0.12% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt_kern
Signed-off-by: Stanislav Fomichev <sdf@...gle.com>
Cc: Martin KaFai Lau <kafai@...com>
Cc: Song Liu <songliubraving@...com>
Cc: Eric Dumazet <edumazet@...gle.com>
---
include/linux/bpf-cgroup.h | 25 ++++-
include/linux/filter.h | 6 +-
include/net/sock.h | 2 +
include/net/tcp.h | 1 +
kernel/bpf/cgroup.c | 93 +++++++++++++------
net/ipv4/tcp.c | 14 +++
net/ipv4/tcp_ipv4.c | 1 +
net/ipv6/tcp_ipv6.c | 1 +
.../selftests/bpf/prog_tests/sockopt_sk.c | 22 +++++
.../testing/selftests/bpf/progs/sockopt_sk.c | 15 +++
10 files changed, 147 insertions(+), 33 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index dd4b8e300746..cbba9c9ab073 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int __user *optlen, int max_optlen,
int retval);
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+ int optname, void *optval,
+ int *optlen, int retval);
+
static inline enum bpf_cgroup_storage_type cgroup_storage_type(
struct bpf_map *map)
{
@@ -366,10 +370,21 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
({ \
int __ret = retval; \
if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \
- __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \
- optname, optval, \
- optlen, max_optlen, \
- retval); \
+ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \
+ !(sock)->sk_prot->bpf_bypass_getsockopt(level, optname)) \
+ __ret = __cgroup_bpf_run_filter_getsockopt( \
+ sock, level, optname, optval, optlen, \
+ max_optlen, retval); \
+ __ret; \
+})
+
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
+ optlen, retval) \
+({ \
+ int __ret = retval; \
+ if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \
+ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \
+ sock, level, optname, optval, optlen, retval); \
__ret; \
})
@@ -454,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
optlen, max_optlen, retval) ({ retval; })
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
+ optlen, retval) ({ retval; })
#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
kernel_optval) ({ 0; })
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 54a4225f36d8..8739f1d4cac4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1281,7 +1281,10 @@ struct bpf_sysctl_kern {
u64 tmp_reg;
};
-#define BPF_SOCKOPT_KERN_BUF_SIZE 64
+#define BPF_SOCKOPT_KERN_BUF_SIZE 32
+struct bpf_sockopt_buf {
+ u8 data[BPF_SOCKOPT_KERN_BUF_SIZE];
+};
struct bpf_sockopt_kern {
struct sock *sk;
@@ -1291,7 +1294,6 @@ struct bpf_sockopt_kern {
s32 optname;
s32 optlen;
s32 retval;
- u8 buf[BPF_SOCKOPT_KERN_BUF_SIZE];
};
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
diff --git a/include/net/sock.h b/include/net/sock.h
index bdc4323ce53c..ebf44d724845 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1174,6 +1174,8 @@ struct proto {
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
+ bool (*bpf_bypass_getsockopt)(int level,
+ int optname);
void (*release_cb)(struct sock *sk);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 78d13c88720f..4bb42fb19711 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int tcp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
+bool tcp_bpf_bypass_getsockopt(int level, int optname);
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
void tcp_set_keepalive(struct sock *sk, int val);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index adbecdcaa370..e82df63aedc7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -16,7 +16,6 @@
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>
-#include <uapi/linux/tcp.h> /* sizeof(struct tcp_zerocopy_receive) */
#include "../cgroup/cgroup-internal.h"
@@ -1299,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
return empty;
}
-static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+ struct bpf_sockopt_buf *buf)
{
if (unlikely(max_optlen < 0))
return -EINVAL;
@@ -1311,18 +1311,11 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
max_optlen = PAGE_SIZE;
}
- if (max_optlen <= sizeof(ctx->buf)) {
+ if (max_optlen <= sizeof(buf->data)) {
/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
* bytes avoid the cost of kzalloc.
- *
- * In order to remove extra allocations from the TCP
- * fast zero-copy path ensure that buffer covers
- * the size of struct tcp_zerocopy_receive.
*/
- BUILD_BUG_ON(sizeof(struct tcp_zerocopy_receive) >
- BPF_SOCKOPT_KERN_BUF_SIZE);
-
- ctx->optval = ctx->buf;
+ ctx->optval = buf->data;
ctx->optval_end = ctx->optval + max_optlen;
return max_optlen;
}
@@ -1336,16 +1329,18 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
return max_optlen;
}
-static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
{
- if (ctx->optval == ctx->buf)
+ if (ctx->optval == buf->data)
return;
kfree(ctx->optval);
}
-static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx)
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
{
- return ctx->optval != ctx->buf;
+ return ctx->optval != buf->data;
}
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
@@ -1353,6 +1348,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
int *optlen, char **kernel_optval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_buf buf = {};
struct bpf_sockopt_kern ctx = {
.sk = sk,
.level = *level,
@@ -1373,7 +1369,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
max_optlen = max_t(int, 16, *optlen);
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1419,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
* No way to export on-stack buf, have to allocate a
* new buffer.
*/
- if (!sockopt_buf_allocated(&ctx)) {
+ if (!sockopt_buf_allocated(&ctx, &buf)) {
void *p = kzalloc(ctx.optlen, GFP_USER);
if (!p) {
@@ -1436,7 +1432,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
out:
if (ret)
- sockopt_free_buf(&ctx);
+ sockopt_free_buf(&ctx, &buf);
return ret;
}
@@ -1445,15 +1441,20 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int __user *optlen, int max_optlen,
int retval)
{
- struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_sockopt_kern ctx = {
- .sk = sk,
- .level = level,
- .optname = optname,
- .retval = retval,
- };
+ struct bpf_sockopt_kern ctx;
+ struct bpf_sockopt_buf buf;
+ struct cgroup *cgrp;
int ret;
+ memset(&buf, 0, sizeof(buf));
+ memset(&ctx, 0, sizeof(ctx));
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ ctx.sk = sk;
+ ctx.level = level;
+ ctx.optname = optname;
+ ctx.retval = retval;
+
/* Opportunistic check to see whether we have any BPF program
* attached to the hook so we don't waste time allocating
* memory and locking the socket.
@@ -1463,7 +1464,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
ctx.optlen = max_optlen;
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1521,9 +1522,47 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
ret = ctx.retval;
out:
- sockopt_free_buf(&ctx);
+ sockopt_free_buf(&ctx, &buf);
return ret;
}
+
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+ int optname, void *optval,
+ int *optlen, int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .retval = retval,
+ .optlen = *optlen,
+ .optval = optval,
+ .optval_end = optval + *optlen,
+ };
+ int ret;
+
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ if (!ret)
+ return -EPERM;
+
+ if (ctx.optlen > *optlen)
+ return -EFAULT;
+
+ /* BPF programs only allowed to set retval to 0, not some
+ * arbitrary value.
+ */
+ if (ctx.retval != 0 && ctx.retval != retval)
+ return -EFAULT;
+
+ /* BPF programs can shrink the buffer, export the modifications.
+ */
+ if (ctx.optlen != 0)
+ *optlen = ctx.optlen;
+
+ return ctx.retval;
+}
#endif
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ed42d2193c5c..ef3c895b66c1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4098,6 +4098,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
lock_sock(sk);
err = tcp_zerocopy_receive(sk, &zc);
+ err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+ &zc, &len, err);
release_sock(sk);
if (len >= offsetofend(struct tcp_zerocopy_receive, err))
goto zerocopy_rcv_sk_err;
@@ -4132,6 +4134,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return 0;
}
+bool tcp_bpf_bypass_getsockopt(int level, int optname)
+{
+ /* TCP do_tcp_getsockopt has optimized getsockopt implementation
+ * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
+ */
+ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int __user *optlen)
{
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 58207c7769d0..8b4906980fce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2792,6 +2792,7 @@ struct proto tcp_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e254569a3005..6624eccff85b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
index b25c9c45c148..6bb18b1d8578 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
@@ -11,6 +11,7 @@ static int getsetsockopt(void)
char u8[4];
__u32 u32;
char cc[16]; /* TCP_CA_NAME_MAX */
+ struct tcp_zerocopy_receive zc;
} buf = {};
socklen_t optlen;
char *big_buf = NULL;
@@ -154,6 +155,27 @@ static int getsetsockopt(void)
goto err;
}
+ /* TCP_ZEROCOPY_RECEIVE triggers */
+ memset(&buf, 0, sizeof(buf));
+ optlen = sizeof(buf.zc);
+ err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+ if (err) {
+ log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+ err, errno);
+ goto err;
+ }
+
+ memset(&buf, 0, sizeof(buf));
+ buf.zc.address = 12345; /* rejected by BPF */
+ optlen = sizeof(buf.zc);
+ errno = 0;
+ err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+ if (errno != EPERM) {
+ log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+ err, errno);
+ goto err;
+ }
+
free(big_buf);
close(fd);
return 0;
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
index 712df7b49cb1..c726f0763a13 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -57,6 +57,21 @@ int _getsockopt(struct bpf_sockopt *ctx)
return 1;
}
+ if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) {
+ /* Verify that TCP_ZEROCOPY_RECEIVE triggers.
+ * It has a custom implementation for performance
+ * reasons.
+ */
+
+ if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ if (((struct tcp_zerocopy_receive *)optval)->address != 0)
+ return 0; /* EPERM, unexpected data */
+
+ return 1;
+ }
+
if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
if (optval + 1 > optval_end)
return 0; /* EPERM, bounds check */
--
2.29.2.729.g45daf8777d-goog
Powered by blists - more mailing lists