Date:   Mon, 05 Mar 2018 11:51:27 -0800
From:   John Fastabend <john.fastabend@...il.com>
To:     ast@...nel.org, daniel@...earbox.net
Cc:     netdev@...r.kernel.org, davejwatson@...com
Subject: [bpf-next PATCH 06/16] bpf: sockmap,
 add bpf_msg_apply_bytes() helper

A single sendmsg or sendfile system call can contain multiple logical
messages that a BPF program may want to read and apply a verdict to.
But, without an apply_bytes helper, any verdict on the data applies to
all bytes in the sendmsg/sendfile. Alternatively, a BPF program may
only care to read the first N bytes of a msg. If the payload is large,
say MBs or even GBs, setting up and calling the BPF program repeatedly
for all bytes, even though the verdict is already known, creates
unnecessary overhead.

To allow BPF programs to control how many bytes a given verdict
applies to, we implement a bpf_msg_apply_bytes() helper. When called
from within a BPF program, this sets a counter, internal to the BPF
infrastructure, that applies the last verdict to the next N bytes. If
N is smaller than the current data being processed from a
sendmsg/sendfile call, the first N bytes will be sent and the BPF
program will be re-run with start_data pointing to byte N+1. If N is
larger than the current data being processed, the BPF verdict will be
applied to multiple sendmsg/sendfile calls until N bytes are consumed.
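
As an example of the intended usage, a verdict program can inspect
just the start of a msg and let its verdict cover the rest. A minimal
sketch (not part of this patch; the 16-byte inspection window, the
64k apply size and the 'D' check are invented for illustration, and
the helper stub follows the open-coded selftests style):

#include <linux/bpf.h>

#define SEC(NAME) __attribute__((section(NAME), used))

static int (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u64 bytes) =
	(void *) BPF_FUNC_msg_apply_bytes;

SEC("sk_msg")
int msg_verdict(struct sk_msg_md *msg)
{
	void *data_end = (void *)(long)msg->data_end;
	void *data = (void *)(long)msg->data;

	/* the verdict returned below covers the next 64k bytes, so a
	 * large sendmsg/sendfile does not re-run this program for
	 * every scatterlist element
	 */
	bpf_msg_apply_bytes(msg, 65536);

	/* only the first 16 bytes of the msg are ever inspected */
	if (data + 16 > data_end)
		return SK_PASS;

	return ((char *)data)[0] == 'D' ? SK_DROP : SK_PASS;
}

char _license[] SEC("license") = "GPL";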

Note, if a socket closes with the apply_bytes counter non-zero, this
is not a problem: data is not buffered for N bytes but is sent as it's
received.
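
For completeness, attaching such a program uses the sockmap attach
point added earlier in this series. A rough userspace sketch against
the low-level libbpf API (attach_msg_verdict, the map size of 2 and
prog_fd are illustrative; prog_fd is an already-loaded sk_msg
program):

#include <linux/bpf.h>
#include "bpf/bpf.h"	/* tools/lib/bpf */

static int attach_msg_verdict(int prog_fd)
{
	int map_fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP,
				    sizeof(int), sizeof(int), 2, 0);
	if (map_fd < 0)
		return map_fd;

	/* sockets later added with bpf_map_update_elem() get their
	 * sendmsg/sendfile data run through the verdict program
	 */
	return bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0);
}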

Signed-off-by: John Fastabend <john.fastabend@...il.com>
---
 include/linux/filter.h   |    1 +
 include/uapi/linux/bpf.h |    3 ++-
 kernel/bpf/sockmap.c     |  239 ++++++++++++++++++++++++++++++++++++----------
 net/core/filter.c        |   16 +++
 4 files changed, 205 insertions(+), 54 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 15c663e..805a566 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -510,6 +510,7 @@ struct xdp_buff {
 struct sk_msg_buff {
 	void *data;
 	void *data_end;
+	int apply_bytes;
 	int sg_start;
 	int sg_curr;
 	int sg_end;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b8275f0..e50c61f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -769,7 +769,8 @@ enum bpf_attach_type {
 	FN(getsockopt),			\
 	FN(override_return),		\
 	FN(sock_ops_cb_flags_set),	\
-	FN(msg_redirect_map),
+	FN(msg_redirect_map),		\
+	FN(msg_apply_bytes),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0fd5556..98c6a3b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -75,6 +75,12 @@ struct smap_psock {
 	int save_off;
 	struct sk_buff *save_skb;
 
+	/* datapath variables for tx_msg ULP */
+	struct sock *sk_redir;
+	int apply_bytes;
+	int sg_size;
+	int eval;
+
 	struct strparser strp;
 	struct bpf_prog *bpf_tx_msg;
 	struct bpf_prog *bpf_parse;
@@ -236,10 +242,11 @@ static int memcopy_from_iter(struct sock *sk,
 	return rc;
 }
 
-static int bpf_tcp_push(struct sock *sk,
-			struct smap_psock *psock, struct sk_msg_buff *md,
+static int bpf_tcp_push(struct sock *sk, int apply_bytes,
+			struct sk_msg_buff *md,
 			int flags, bool uncharge)
 {
+	bool apply = apply_bytes;
 	struct scatterlist *sg;
 	int offset, ret = 0;
 	struct page *p;
@@ -247,7 +254,8 @@ static int bpf_tcp_push(struct sock *sk,
 
 	while (1) {
 		sg = md->sg_data + md->sg_start;
-		size = sg->length;
+		size = (apply && apply_bytes < sg->length) ?
+			apply_bytes : sg->length;
 		offset = sg->offset;
 
 		tcp_rate_check_app_limited(sk);
@@ -256,6 +264,8 @@ static int bpf_tcp_push(struct sock *sk,
 		ret = do_tcp_sendpages(sk, p, offset, size, flags);
 		if (ret != size) {
 			if (ret > 0) {
+				if (apply)
+					apply_bytes -= ret;
 				size -= ret;
 				offset += ret;
 				if (uncharge)
@@ -268,7 +278,8 @@ static int bpf_tcp_push(struct sock *sk,
 			return ret;
 		}
 
-		put_page(p);
+		if (apply)
+			apply_bytes -= ret;
 		sg->offset += ret;
 		sg->length -= ret;
 		if (uncharge)
@@ -280,9 +291,12 @@ static int bpf_tcp_push(struct sock *sk,
 			if (md->sg_start == MAX_SKB_FRAGS)
 				md->sg_start = 0;
 			memset(sg, 0, sizeof(*sg));
+
+			if (md->sg_start == md->sg_end)
+				break;
 		}
 
-		if (md->sg_start == md->sg_end)
+		if (apply && !apply_bytes)
 			break;
 	}
 	return 0;
@@ -296,15 +310,18 @@ static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
 	md->data_end = md->data + sg->length;
 }
 
-static void return_mem_sg(struct sock *sk, struct sk_msg_buff *md)
+static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 {
 	struct scatterlist *sg = md->sg_data;
-	int i;
+	int i = md->sg_start;
 
-	i = md->sg_start;
 	do {
-		sk_mem_uncharge(sk, sg[i].length);
+		int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;
 
+		sk_mem_uncharge(sk, uncharge);
+		bytes -= uncharge;
+		if (!bytes)
+			break;
 		i++;
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
@@ -371,9 +388,26 @@ static unsigned int smap_do_tx_msg(struct sock *sk,
 
 	bpf_compute_data_pointers_sg(md);
 	rc = (*prog->bpf_func)(md, prog->insnsi);
+	psock->apply_bytes = md->apply_bytes;
 
 	/* Moving return codes from UAPI namespace into internal namespace */
 	_rc = bpf_map_msg_verdict(rc, md);
+
+	/* The psock has a refcount on the sock but not on the map and because
+	 * we need to drop rcu read lock here its possible the map could be
+	 * removed between here and when we need it to execute the sock
+	 * redirect. So do the map lookup here and cache it for future use.
+	 */
+	if (_rc == __SK_REDIRECT) {
+		if (psock->sk_redir)
+			sock_put(psock->sk_redir);
+		psock->sk_redir = do_msg_redirect_map(md);
+		if (!psock->sk_redir) {
+			_rc = __SK_DROP;
+			goto verdict;
+		}
+		sock_hold(psock->sk_redir);
+	}
 verdict:
 	rcu_read_unlock();
 	preempt_enable();
@@ -381,21 +415,17 @@ static unsigned int smap_do_tx_msg(struct sock *sk,
 	return _rc;
 }
 
-static int bpf_tcp_sendmsg_do_redirect(struct sk_msg_buff *md,
+static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
+				       struct sk_msg_buff *md,
 				       int flags)
 {
 	struct smap_psock *psock;
 	struct scatterlist *sg;
 	int i, err, free = 0;
-	struct sock *sk;
 
 	sg = md->sg_data;
 
 	rcu_read_lock();
-	sk = do_msg_redirect_map(md);
-	if (unlikely(!sk))
-		goto out_rcu;
-
 	psock = smap_psock_sk(sk);
 	if (unlikely(!psock))
 		goto out_rcu;
@@ -405,7 +435,7 @@ static int bpf_tcp_sendmsg_do_redirect(struct sk_msg_buff *md,
 
 	rcu_read_unlock();
 	lock_sock(sk);
-	err = bpf_tcp_push(sk, psock, md, flags, false);
+	err = bpf_tcp_push(sk, send, md, flags, false);
 	release_sock(sk);
 	smap_release_sock(psock, sk);
 	if (unlikely(err))
@@ -426,20 +456,27 @@ static int bpf_tcp_sendmsg_do_redirect(struct sk_msg_buff *md,
 	return free;
 }
 
-static inline void bpf_md_init(struct sk_msg_buff *md)
+static inline void bpf_md_init(struct smap_psock *psock)
 {
-	md->sg_size = 0;
+	if (!psock->apply_bytes) {
+		psock->eval = __SK_NONE;
+		if (psock->sk_redir) {
+			sock_put(psock->sk_redir);
+			psock->sk_redir = NULL;
+		}
+	}
 }
 
 static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
-	int err = 0, eval = __SK_NONE;
+	int send = 0, copied = 0, err = 0;
 	struct sk_msg_buff md = {0};
 	unsigned int sg_copy = 0;
 	struct smap_psock *psock;
-	size_t copy, copied = 0;
+	size_t copy;
 	struct scatterlist *sg;
+	struct sock *redir;
 	long timeo;
 
 	/* Its possible a sock event or user removed the psock _but_ the ops
@@ -472,8 +509,6 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	lock_sock(sk);
 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
-	md.sg_size = 0;
-
 	while (msg_data_left(msg)) {
 		if (sk->sk_err) {
 			err = sk->sk_err;
@@ -500,40 +535,79 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 		}
 
+		psock->sg_size += copy;
 		copied += copy;
 		sg_copy = 0;
+more_data:
 		/* If msg is larger than MAX_SKB_FRAGS we can send multiple
 		 * scatterlists per msg. However BPF decisions apply to the
 		 * entire msg.
 		 */
-		if (eval == __SK_NONE)
-			eval = smap_do_tx_msg(sk, psock, &md);
+		if (psock->eval == __SK_NONE)
+			psock->eval = smap_do_tx_msg(sk, psock, &md);
 
-		switch (eval) {
+		send = psock->sg_size;
+		if (psock->apply_bytes && psock->apply_bytes < send)
+			send = psock->apply_bytes;
+
+		switch (psock->eval) {
 		case __SK_PASS:
-			err = bpf_tcp_push(sk, psock, &md, flags, true);
+			err = bpf_tcp_push(sk, send, &md, flags, true);
 			if (unlikely(err)) {
 				copied -= free_start_sg(sk, &md);
 				goto out_err;
 			}
+
+			if (psock->apply_bytes) {
+				if (psock->apply_bytes < send)
+					psock->apply_bytes = 0;
+				else
+					psock->apply_bytes -= send;
+			}
+			psock->sg_size -= send;
 			break;
 		case __SK_REDIRECT:
-			return_mem_sg(sk, &md);
+			redir = psock->sk_redir;
+
+			if (psock->apply_bytes) {
+				if (psock->apply_bytes < send)
+					psock->apply_bytes = 0;
+				else
+					psock->apply_bytes -= send;
+			}
+
+			return_mem_sg(sk, send, &md);
 			release_sock(sk);
-			err = bpf_tcp_sendmsg_do_redirect(&md, flags);
+
+			err = bpf_tcp_sendmsg_do_redirect(redir, send,
+							  &md, flags);
+			lock_sock(sk);
+
 			if (unlikely(err)) {
 				copied -= err;
 				goto out_redir;
 			}
-			lock_sock(sk);
+
+			psock->sg_size -= send;
 			break;
 		case __SK_DROP:
 		default:
 			copied -= free_start_sg(sk, &md);
-			goto out_err;
+
+			if (psock->apply_bytes) {
+				if (psock->apply_bytes < send)
+					psock->apply_bytes = 0;
+				else
+					psock->apply_bytes -= send;
+			}
+			psock->sg_size -= copied;
+			err = -EACCES;
+			break;
 		}
 
-		bpf_md_init(&md);
+		bpf_md_init(psock);
+		if (sg[md.sg_start].page_link && sg[md.sg_start].length)
+			goto more_data;
 		continue;
 wait_for_sndbuf:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -543,28 +617,23 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 	}
 out_err:
-	bpf_md_init(&md);
 	if (err < 0)
 		err = sk_stream_error(sk, msg->msg_flags, err);
-	release_sock(sk);
 out_redir:
+	release_sock(sk);
 	smap_release_sock(psock, sk);
 	return copied ? copied : err;
 }
 
-static int bpf_tcp_sendpage_do_redirect(struct page *page, int offset,
+static int bpf_tcp_sendpage_do_redirect(struct sock *sk,
+					struct page *page, int offset,
 					size_t size, int flags,
 					struct sk_msg_buff *md)
 {
 	struct smap_psock *psock;
-	struct sock *sk;
 	int rc;
 
 	rcu_read_lock();
-	sk = do_msg_redirect_map(md);
-	if (unlikely(!sk))
-		goto out_rcu;
-
 	psock = smap_psock_sk(sk);
 	if (unlikely(!psock))
 		goto out_rcu;
@@ -590,8 +659,12 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
 {
 	struct sk_msg_buff md = {0};
 	struct smap_psock *psock;
-	int rc, _rc = __SK_PASS;
+	int send, total = 0, rc = __SK_NONE;
+	int orig_size = size;
 	struct bpf_prog *prog;
+	struct sock *redir;
+
+	send = size;
 
 	preempt_disable();
 	rcu_read_lock();
@@ -604,34 +677,90 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
 	if (unlikely(!prog))
 		goto verdict;
 
-	/* Calculate pkt data pointers and run BPF program */
-	md.data = page_address(page) + offset;
-	md.data_end = md.data + size;
-	_rc = (*prog->bpf_func)(&md, prog->insnsi);
-
+	if (!refcount_inc_not_zero(&psock->refcnt)) {
+		rcu_read_unlock();
+		return tcp_sendpage(sk, page, offset, size, flags);
+	}
 verdict:
 	rcu_read_unlock();
 	preempt_enable();
 
-	/* Moving return codes from UAPI namespace into internal namespace */
-	rc = bpf_map_msg_verdict(_rc, &md);
+	lock_sock(sk);
+more_sendpage_data:
+	if (psock->eval == __SK_NONE)
+		psock->eval = smap_do_tx_msg(sk, psock, &md);
+
+	if (psock->apply_bytes && psock->apply_bytes < send)
+		send = psock->apply_bytes;
 
-	switch (rc) {
+	switch (psock->eval) {
 	case __SK_PASS:
-		lock_sock(sk);
-		rc = tcp_sendpage_locked(sk, page, offset, size, flags);
-		release_sock(sk);
+		rc = tcp_sendpage_locked(sk, page, offset, send, flags);
+		if (rc < 0) {
+			total = total ? : rc;
+			goto out_err;
+		}
+
+		if (psock->apply_bytes) {
+			if (psock->apply_bytes > rc)
+				psock->apply_bytes -= rc;
+			else
+				psock->apply_bytes = 0;
+		}
+
+		total += rc;
+		psock->sg_size -= rc;
+		offset += rc;
+		size -= rc;
 		break;
 	case __SK_REDIRECT:
-		rc = bpf_tcp_sendpage_do_redirect(page, offset, size, flags,
-						  &md);
+		redir = psock->sk_redir;
+
+		if (psock->apply_bytes) {
+			if (psock->apply_bytes > send)
+				psock->apply_bytes -= send;
+			else
+				psock->apply_bytes = 0;
+		}
+
+		release_sock(sk);
+
+		/* sock lock dropped, must not dereference psock below */
+		rc = bpf_tcp_sendpage_do_redirect(redir,
+						  page, offset, send,
+						  flags, &md);
+		lock_sock(sk);
+		if (rc > 0) {
+			offset += rc;
+			psock->sg_size -= rc;
+			send -= rc;
+		}
+
+		if ((total && rc > 0) || (!total && rc < 0))
+			total += rc;
 		break;
 	case __SK_DROP:
 	default:
+		return_mem_sg(sk, send, &md);
+		if (psock->apply_bytes) {
+			if (psock->apply_bytes > send)
+				psock->apply_bytes -= send;
+			else
+				psock->apply_bytes = 0;
+		}
+		psock->sg_size -= send;
+		size -= send;
+		total += send;
 		rc = -EACCES;
 	}
 
-	return rc;
+	bpf_md_init(psock);
+	if (size)
+		goto more_sendpage_data;
+out_err:
+	release_sock(sk);
+	smap_release_sock(psock, sk);
+	return total <= orig_size ? total : orig_size;
 }
 
 static void bpf_tcp_msg_add(struct smap_psock *psock,
@@ -953,6 +1082,9 @@ static void smap_gc_work(struct work_struct *w)
 		kfree(e);
 	}
 
+	if (psock->sk_redir)
+		sock_put(psock->sk_redir);
+
 	sock_put(psock->sock);
 	kfree(psock);
 }
@@ -968,6 +1100,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock,
 	if (!psock)
 		return ERR_PTR(-ENOMEM);
 
+	psock->eval = __SK_NONE;
 	psock->sock = sock;
 	skb_queue_head_init(&psock->rxqueue);
 	INIT_WORK(&psock->tx_work, smap_tx_work);
diff --git a/net/core/filter.c b/net/core/filter.c
index 314c311..df2a8f4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1928,6 +1928,20 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 	.arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u64, bytes)
+{
+	msg->apply_bytes = bytes;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
+	.func           = bpf_msg_apply_bytes,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -3634,6 +3648,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 	switch (func_id) {
 	case BPF_FUNC_msg_redirect_map:
 		return &bpf_msg_redirect_map_proto;
+	case BPF_FUNC_msg_apply_bytes:
+		return &bpf_msg_apply_bytes_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
