Date:   Thu, 03 Aug 2017 16:37:56 -0700
From:   John Fastabend <john.fastabend@...il.com>
To:     davem@...emloft.net, ast@...com
Cc:     netdev@...r.kernel.org, daniel@...earbox.net
Subject: [RFC PATCH 4/6] net: sockmap with sk redirect support

Recently we added a new map type, devmap, used to forward XDP packets
between ports (commit 6093ec2dc313). This patch introduces a similar
notion for sockets.

A sockmap allows users to add participating sockets to a map. When
sockets are added to the map, enough context is stored with the map
entry to use that entry with a new helper:

  bpf_sk_redirect_map(map, key, flags)

This helper (analogous to bpf_redirect_map in XDP) is given the map
and a key identifying an entry in the map. When called from a sockmap
program, discussed below, the skb will be sent on the socket using
skb_send_sock().
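
Sockets themselves are installed into the map from BPF program context
using the other helper added by this series, bpf_map_ctx_update_elem().
Below is a minimal restricted-C sketch of a sock_ops program doing that
insert. It assumes the samples/bpf scaffolding (bpf_helpers.h, SEC(),
struct bpf_map_def); the section names, the fixed key, the established
callback used as the trigger, and the hand-declared helper stub are
illustrative assumptions, not part of this patch.

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

/* hand-declared stub for the new helper; a real build would add this
 * to bpf_helpers.h once the series lands
 */
static int (*bpf_map_ctx_update_elem)(void *ctx, void *map, void *key,
				      unsigned long long flags,
				      unsigned long long map_flags) =
	(void *) BPF_FUNC_map_ctx_update_elem;

struct bpf_map_def SEC("maps") sock_map = {
	.type		= BPF_MAP_TYPE_SOCKMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 2,
};

SEC("sockops")
int bpf_add_to_sockmap(struct bpf_sock_ops *skops)
{
	int key = 0;

	/* add the socket once the passive side is established; 0x01 is
	 * the SOCK_MAP_STRPARSER map flag defined in this patch
	 */
	if (skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
		bpf_map_ctx_update_elem(skops, &sock_map, &key,
					BPF_ANY, 0x01);

	return 0;
}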

With the above in place we need a BPF program to call the helper
from, one that implements the send logic. The initial call site
implemented in this series is the recv_sock hook. To support this we
added a map attach command that attaches programs to a map. For a
sockmap we attach two programs: a parse program and a verdict
program. The parse program uses strparser to build messages and
passes them to the verdict program, following normal strparser
semantics. The verdict program is of type SOCKET_FILTER.
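
As an illustration, the smallest useful parse program simply treats
every skb as one message, which is also the default behaviour when no
parse program is attached. A sketch, continuing the assumed
restricted-C object file from the sock_ops example above (the section
name is made up):

SEC("sockmap_parse")
int bpf_prog_parse(struct __sk_buff *skb)
{
	/* strparser callback semantics: return the byte length of the
	 * next complete message. A real parser would read a length
	 * field out of the application protocol header instead of
	 * using skb->len.
	 */
	return skb->len;
}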

The verdict program returns an action from the new enum sk_action
(SK_ABORTED, SK_DROP, or SK_REDIRECT). When SK_REDIRECT is returned,
the expected result when the program uses bpf_sk_redirect_map(), the
sockmap logic consults per-CPU variables set by the helper routine
and pulls the sock entry out of the sockmap. This pattern follows the
existing redirect logic in cls_bpf and XDP programs.

This gives the following flow:

 recv_sock -> str_parser (parse_prog) -> verdict_prog -> skb_send_sock

As an example use case, a message-based load balancer may use specific
logic in the verdict program to select the socket to send on.
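
A verdict program with that shape might look like the sketch below,
again continuing the assumed object file from the earlier examples. It
spreads messages across the two map slots by skb hash and redirects;
the helper stub is hand-declared here, and using skb->hash for the
pick is only an illustration:

/* assumed stub matching the helper signature introduced in this patch */
static int (*bpf_sk_redirect_map)(void *map, unsigned int key,
				  unsigned long long flags) =
	(void *) BPF_FUNC_sk_redirect_map;

SEC("sockmap_verdict")
int bpf_prog_verdict(struct __sk_buff *skb)
{
	unsigned int key = skb->hash & 1;

	/* returns SK_REDIRECT; the sockmap code then pulls the socket
	 * for 'key' out of sock_map and transmits the skb with
	 * skb_send_sock()
	 */
	return bpf_sk_redirect_map(&sock_map, key, 0);
}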

Example and sample programs are provided in future patches that
hopefully illustrate the user interfaces.
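
Until those land, here is a rough userspace sketch of one way the
control path could be driven against the uapi proposed in this patch
(BPF_MAP_TYPE_SOCKMAP, BPF_SOCKMAP_INGRESS and the new attach_bpf_fd2
field). It assumes parse_fd and verdict_fd were already loaded with
BPF_PROG_LOAD as BPF_PROG_TYPE_SOCKET_FILTER programs and keeps error
handling minimal:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sockmap_create_and_attach(int parse_fd, int verdict_fd)
{
	union bpf_attr attr;
	int map_fd;

	/* sockmap requires 4-byte keys and 4-byte values */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_SOCKMAP;
	attr.key_size = 4;
	attr.value_size = 4;
	attr.max_entries = 2;

	map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return map_fd;

	/* attach both programs to the map: attach_bpf_fd carries the
	 * parse program, attach_bpf_fd2 the verdict program
	 */
	memset(&attr, 0, sizeof(attr));
	attr.target_fd = map_fd;
	attr.attach_bpf_fd = parse_fd;
	attr.attach_bpf_fd2 = verdict_fd;
	attr.attach_type = BPF_SOCKMAP_INGRESS;

	if (syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr)))
		return -1;

	return map_fd;
}

Once this has run, any socket the sock_ops program adds to the map has
its received data run through the parse and verdict programs sketched
above.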

TBD: BPF program refcounting needs to be cleaned up, along with some
additional cleanup in a few error paths; performance numbers and self
tests still need to be published.

Signed-off-by: John Fastabend <john.fastabend@...il.com>
---
 include/linux/bpf.h       |   11 +
 include/linux/bpf_types.h |    1 
 include/uapi/linux/bpf.h  |   13 +
 kernel/bpf/Makefile       |    2 
 kernel/bpf/helpers.c      |   20 +
 kernel/bpf/sockmap.c      |  623 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c      |   41 +++
 net/core/filter.c         |   51 ++++
 8 files changed, 759 insertions(+), 3 deletions(-)
 create mode 100644 kernel/bpf/sockmap.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6353c74..9ce6aa0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -15,6 +15,8 @@
 #include <linux/err.h>
 #include <linux/rbtree_latch.h>
 
+#include <net/sock.h>
+
 struct perf_event;
 struct bpf_map;
 
@@ -29,6 +31,9 @@ struct bpf_map_ops {
 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
+	int (*map_ctx_update_elem)(struct bpf_sock_ops_kern *skops,
+				   struct bpf_map *map,
+				   void *key, u64 flags, u64 map_flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
 
 	/* funcs called by prog_array and perf_event_array map */
@@ -37,6 +42,7 @@ struct bpf_map_ops {
 	void (*map_fd_put_ptr)(void *ptr);
 	u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
+	int (*map_attach)(struct bpf_map *map, struct bpf_prog *p1, struct bpf_prog *p2);
 };
 
 struct bpf_map {
@@ -321,6 +327,7 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 
 /* Map specifics */
 struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 
@@ -378,9 +385,13 @@ static inline void __dev_map_flush(struct bpf_map *map)
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+inline struct sock *do_sk_redirect_map(void);
+inline u64 get_sk_redirect_flags(void);
+
 /* verifier prototypes for helper functions called from eBPF programs */
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
+extern const struct bpf_func_proto bpf_map_ctx_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b1e1035..930be52 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -37,4 +37,5 @@
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1106a8c..a89e831 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -105,6 +105,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_ARRAY_OF_MAPS,
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
+	BPF_MAP_TYPE_SOCKMAP,
 };
 
 enum bpf_prog_type {
@@ -129,6 +130,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET_EGRESS,
 	BPF_CGROUP_INET_SOCK_CREATE,
 	BPF_CGROUP_SOCK_OPS,
+	BPF_SOCKMAP_INGRESS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -205,6 +207,7 @@ enum bpf_attach_type {
 		__u32		attach_bpf_fd;	/* eBPF program to attach */
 		__u32		attach_type;
 		__u32		attach_flags;
+		__u32		attach_bpf_fd2;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
@@ -598,7 +601,9 @@ enum bpf_attach_type {
 	FN(set_hash),			\
 	FN(setsockopt),			\
 	FN(skb_adjust_room),		\
-	FN(redirect_map),
+	FN(redirect_map),		\
+	FN(sk_redirect_map),		\
+	FN(map_ctx_update_elem),	\
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -735,6 +740,12 @@ struct xdp_md {
 	__u32 data_end;
 };
 
+enum sk_action {
+	SK_ABORTED = 0,
+	SK_DROP,
+	SK_REDIRECT,
+};
+
 #define BPF_TAG_SIZE	8
 
 struct bpf_prog_info {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 48e9270..3089102 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,7 +3,7 @@ obj-y := core.o
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 ifeq ($(CONFIG_NET),y)
-obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += devmap.o sockmap.o
 endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d24e23..feb38e0 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -43,6 +43,26 @@
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 };
 
+BPF_CALL_5(bpf_ctx_map_update_elem, struct bpf_sock_ops_kern *, bpf_sock,
+	   struct bpf_map *, map, void *, key, u64, flags, u64, map_flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return map->ops->map_ctx_update_elem(bpf_sock, map, key,
+					     flags, map_flags);
+}
+
+const struct bpf_func_proto bpf_map_ctx_update_elem_proto = {
+	.func		= bpf_ctx_map_update_elem,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
 	   void *, value, u64, flags)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
new file mode 100644
index 0000000..9e88c32
--- /dev/null
+++ b/kernel/bpf/sockmap.c
@@ -0,0 +1,623 @@
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* A BPF sock_map is used to store sock objects. This is primarily used
+ * for doing socket redirect with BPF helper routines.
+ *
+ * A sock map may have two BPF programs attached to it, a program used
+ * to parse packets and a program to provide a verdict and redirect
+ * decision on the packet. If no BPF parse program is provided it is
+ * assumed that every skb is a "message" (skb->len). Otherwise the
+ * parse program is attached to strparser and used to build messages
+ * that may span multiple skbs. The verdict program will either select
+ * a socket to send/receive the skb on or provide the drop code indicating
+ * the skb should be dropped. More actions may be added later as needed.
+ * The default program will drop packets.
+ *
+ * For reference, this map is similar to the devmap used in the XDP
+ * context, so reviewing them together may be useful. For examples and
+ * test code using this map please review ./samples/bpf/sockmap/, where
+ * you can find common usages such as a socket level load balancer and
+ * cgroup integration.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <net/sock.h>
+#include <linux/rculist_nulls.h>
+#include "percpu_freelist.h"
+#include "bpf_lru_list.h"
+#include "map_in_map.h"
+
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <linux/bpf.h>
+#include <net/strparser.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+
+struct bpf_stab {
+	struct bpf_map map;
+	struct sock **sock_map;
+	struct bpf_prog *bpf_parse;
+	struct bpf_prog *bpf_mux;
+};
+
+struct smap_psock {
+	struct rcu_head	rcu;
+
+	/* datapath variables used under sock lock */
+	struct sk_buff_head rxqueue;
+
+	bool strp_enabled;
+
+	/* datapath error path cache across tx work invocations */
+	int save_rem;
+	int save_off;
+	struct sk_buff *save_skb;
+	u32 tx_stopped : 1;
+
+	struct strparser strp;
+	struct bpf_prog *bpf_parse;
+	struct bpf_prog *bpf_mux;
+	struct bpf_map *map;
+
+	/* Back reference to the file descriptor of the sock */
+	int key;
+	struct sock *sock;
+
+	struct work_struct tx_work;
+
+	void (*save_data_ready)(struct sock *sk);
+	void (*save_write_space)(struct sock *sk);
+	void (*save_state_change)(struct sock *sk);
+};
+
+static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
+{
+	return (struct smap_psock *)sk->sk_user_data;
+}
+
+static int smap_mux_func(struct smap_psock *psock, struct sk_buff *skb)
+{
+	struct bpf_prog *prog = psock->bpf_mux;
+	int rc;
+
+	if (unlikely(!prog))
+		return 0;
+
+	skb->sk = psock->sock;
+	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	skb->sk = NULL;
+
+	return rc;
+}
+
+static struct smap_psock *smap_peers_get(struct smap_psock *psock,
+					     struct sk_buff *skb)
+{
+	struct sock *sock;
+	int rc;
+
+	rc = smap_mux_func(psock, skb);
+	if (unlikely(rc < 0))
+		return NULL;
+
+	sock = do_sk_redirect_map();
+	if (unlikely(!sock))
+		return NULL;
+
+	return smap_psock_sk(sock);
+}
+
+static void smap_report_sk_error(struct smap_psock *psock, int err)
+{
+	struct sock *sk = psock->sock;
+
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key);
+
+static void smap_state_change(struct sock *sk)
+{
+	struct smap_psock *psock = smap_psock_sk(sk);
+
+	/* Allowing transitions into the established and syn_recv states allows
+	 * for early binding sockets to a smap object before the connection
+	 * is established. All other transitions indicate the connection is
+	 * being torn down so tear down the smap socket.
+	 */
+	switch (sk->sk_state) {
+	case TCP_SYN_RECV:
+	case TCP_ESTABLISHED:
+		break;
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+	case TCP_LAST_ACK:
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+	case TCP_LISTEN:
+		break;
+	case TCP_CLOSE:
+		sock_map_delete_elem(psock->map, &psock->key);
+		break;
+	default:
+		smap_report_sk_error(psock, EPIPE);
+		break;
+	}
+}
+
+static void smap_tx_work(struct work_struct *w);
+
+void schedule_writer(struct smap_psock *psock)
+{
+	schedule_work(&psock->tx_work);
+}
+
+static int smap_tx_writer(struct smap_psock *peer)
+{
+	schedule_writer(peer);
+	return 0;
+}
+
+static void smap_read_sock_strparser(struct strparser *strp,
+				     struct sk_buff *skb)
+{
+	struct smap_psock *psock = container_of(strp,
+						  struct smap_psock, strp);
+	struct smap_psock *peer;
+
+	/* TBD useful dbg, add trace here with output sock index or drop */
+	rcu_read_lock();
+	peer = smap_peers_get(psock, skb);
+	if (unlikely(!peer)) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	skb_queue_tail(&peer->rxqueue, skb);
+	smap_tx_writer(peer);
+out:
+	rcu_read_unlock();
+}
+
+/* Called with lock held on socket */
+static void smap_data_ready(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = smap_psock_sk(sk);
+	if (likely(psock))
+		strp_data_ready(&psock->strp);
+
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void smap_tx_work(struct work_struct *w)
+{
+	struct smap_psock *psock;
+	struct sk_buff *skb;
+	int rem, off, n;
+
+	psock = container_of(w, struct smap_psock, tx_work);
+	if (unlikely(psock->tx_stopped))
+		return;
+
+	if (psock->save_skb) {
+		skb = psock->save_skb;
+		rem = psock->save_rem;
+		off = psock->save_off;
+		psock->save_skb = NULL;
+		goto start;
+	}
+
+	while ((skb = skb_dequeue(&psock->rxqueue))) {
+		rem = skb->len;
+		off = 0;
+start:
+		do {
+			n = skb_send_sock(psock->sock, skb, off, rem);
+			if (n <= 0) {
+				if (n == -EAGAIN) {
+					/* Save state to try again when
+					 * there's write space on the
+					 * socket.
+					 */
+					psock->save_skb = skb;
+					psock->save_rem = rem;
+					psock->save_off = off;
+					break;
+				}
+
+				/* Got a hard error or socket had
+				 * been closed somehow. Report this
+				 * on the transport socket.
+				 */
+				smap_report_sk_error(psock, n ? -n : EPIPE);
+				psock->tx_stopped = 1;
+				break;
+			}
+			rem -= n;
+			off += n;
+		} while (rem);
+	}
+}
+
+static void smap_write_space(struct sock *sk)
+{
+	struct smap_psock *psock = smap_psock_sk(sk);
+
+	schedule_writer(psock);
+}
+
+static void smap_stop_sock(struct smap_psock *psock, bool destroy)
+{
+	struct sock *sk = psock->sock;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	if (psock->strp_enabled) {
+		sk->sk_data_ready = psock->save_data_ready;
+		sk->sk_write_space = psock->save_write_space;
+		sk->sk_state_change = psock->save_state_change;
+		strp_stop(&psock->strp);
+	}
+
+	if (destroy)
+		sk->sk_user_data = NULL;
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	if (psock->strp_enabled)
+		strp_done(&psock->strp);
+	psock->strp_enabled = false;
+}
+
+static void smap_destroy_psock(struct rcu_head *rcu)
+{
+	struct smap_psock *psock = container_of(rcu,
+						  struct smap_psock, rcu);
+
+	smap_stop_sock(psock, true);
+	cancel_work_sync(&psock->tx_work);
+	__skb_queue_purge(&psock->rxqueue);
+	sock_put(psock->sock);
+	kfree(psock);
+}
+
+static void smap_release_proxy(struct sock *sock)
+{
+	struct smap_psock *psock = smap_psock_sk(sock);
+
+	call_rcu(&psock->rcu, smap_destroy_psock);
+}
+
+static int smap_parse_func_strparser(struct strparser *strp,
+				       struct sk_buff *skb)
+{
+	struct smap_psock *psock = container_of(strp,
+						  struct smap_psock, strp);
+	struct bpf_prog *prog = psock->bpf_parse;
+
+	if (unlikely(!prog))
+		return skb->len;
+
+	return (*prog->bpf_func)(skb, prog->insnsi);
+}
+
+
+static int smap_read_sock_done(struct strparser *strp, int err)
+{
+	return err;
+}
+
+static int smap_init_sock(struct smap_psock *psock,
+			  struct sock *sock)
+{
+	struct strp_callbacks cb;
+	int err;
+
+	cb.rcv_msg = smap_read_sock_strparser;
+	cb.abort_parser = NULL;
+	cb.parse_msg = smap_parse_func_strparser;
+	cb.read_sock_done = smap_read_sock_done;
+
+	err = strp_init(&psock->strp, sock, &cb);
+	if (err)
+		return -EINVAL;
+	return 0;
+}
+
+static void smap_init_progs(struct smap_psock *psock, struct bpf_stab *stab)
+{
+	/* TBD need prog_put and gets here to avoid programs leaving
+	 * us or something in attach
+	 */
+	if (psock->bpf_mux != stab->bpf_mux)
+		psock->bpf_mux = stab->bpf_mux;
+
+	if (psock->bpf_parse != stab->bpf_parse)
+		psock->bpf_parse = stab->bpf_parse;
+}
+
+static int smap_start_sock(struct smap_psock *psock, struct sock *sk)
+{
+	int err = 0;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	/* only start socket if it is not already running */
+	if (psock->save_data_ready) {
+		err = -EINVAL;
+		goto out;
+	}
+	psock->save_data_ready = sk->sk_data_ready;
+	psock->save_write_space = sk->sk_write_space;
+	psock->save_state_change = sk->sk_state_change;
+	sk->sk_data_ready = smap_data_ready;
+	sk->sk_write_space = smap_write_space;
+	sk->sk_state_change = smap_state_change;
+out:
+	write_unlock_bh(&sk->sk_callback_lock);
+	return err;
+}
+
+static struct smap_psock *smap_init_psock(struct sock *sock,
+					  struct bpf_stab *stab)
+{
+	struct smap_psock *psock;
+
+	psock = kmalloc(sizeof(struct smap_psock), GFP_ATOMIC);
+	if (!psock)
+		return ERR_PTR(-ENOMEM);
+
+	memset(psock, 0, sizeof(struct smap_psock));
+	smap_init_progs(psock, stab);
+	psock->sock = sock;
+
+	skb_queue_head_init(&psock->rxqueue);
+	INIT_WORK(&psock->tx_work, smap_tx_work);
+
+	write_lock_bh(&sock->sk_callback_lock);
+	sock->sk_user_data = psock;
+	write_unlock_bh(&sock->sk_callback_lock);
+
+	sock_hold(sock);
+	return psock;
+}
+
+#define SOCK_MAP_STRPARSER 0x01
+/* BPF map logic */
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_stab *stab;
+	int err = -EINVAL;
+	u64 cost;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags)
+		return ERR_PTR(-EINVAL);
+
+	/* if value_size is bigger, the user space won't be able to
+	 * access the elements.
+	 */
+	if (attr->value_size > KMALLOC_MAX_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	stab = kzalloc(sizeof(*stab), GFP_USER);
+	if (!stab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	stab->map.map_type = attr->map_type;
+	stab->map.key_size = attr->key_size;
+	stab->map.value_size = attr->value_size;
+	stab->map.max_entries = attr->max_entries;
+	stab->map.map_flags = attr->map_flags;
+
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) stab->map.max_entries * sizeof(struct sock *) +
+			sizeof(struct socket *);
+	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* if map size is larger than memlock limit, reject it early */
+	err = bpf_map_precharge_memlock(stab->map.pages);
+	if (err)
+		goto free_stab;
+
+	/* default error for the sock_map allocation below */
+	err = -ENOMEM;
+	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
+					    sizeof(struct sock *));
+	if (!stab->sock_map)
+		goto free_stab;
+
+	return &stab->map;
+	/* TBD release progs on errors */
+free_stab:
+	kfree(stab);
+	return ERR_PTR(err);
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	int i;
+
+	synchronize_rcu();
+
+	for (i = 0; i < stab->map.max_entries; i++) {
+		struct sock *sock;
+
+		sock = stab->sock_map[i];
+		if (!sock)
+			continue;
+
+		smap_release_proxy(sock);
+	}
+
+	bpf_map_area_free(stab->sock_map);
+	if (stab->bpf_mux)
+		bpf_prog_put(stab->bpf_mux);
+	if (stab->bpf_parse)
+		bpf_prog_put(stab->bpf_parse);
+	kfree(stab);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	u32 i = key ? *(u32 *)key : U32_MAX;
+	u32 *next = (u32 *)next_key;
+
+	if (i >= stab->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (i == stab->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = i + 1;
+	return 0;
+}
+
+struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	return stab->sock_map[key];
+}
+
+static void *sock_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct sock *sock;
+	u32 i = *(u32 *)key;
+
+	if (i >= map->max_entries)
+		return NULL;
+
+	sock = stab->sock_map[i];
+	return NULL;
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct sock *sock;
+	int k = *(u32 *)key;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	sock = stab->sock_map[k];
+	if (!sock)
+		return -EINVAL;
+
+	smap_release_proxy(sock);
+	return 0;
+}
+
+static int sock_map_update_elem(struct bpf_sock_ops_kern *skops,
+				struct bpf_map *map,
+				void *key, u64 flags, u64 map_flags)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct sock *old_sock, *sock;
+	struct smap_psock *psock = NULL;
+	u32 i = *(u32 *)key;
+	bool update = false;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (unlikely(i >= stab->map.max_entries))
+		return -E2BIG;
+
+	if (unlikely(map_flags > SOCK_MAP_STRPARSER))
+		return -EINVAL;
+
+	if (flags == BPF_EXIST || flags == BPF_ANY) {
+		sock = rcu_dereference(stab->sock_map[i]);
+
+		if (!sock && flags == BPF_EXIST) {
+			return -ENOENT;
+		} else if (sock && sock != skops->sk) {
+			return -EINVAL;
+		} else if (sock) {
+			psock = smap_psock_sk(sock);
+			update = true;
+		}
+	}
+
+	if (!psock) {
+		sock = skops->sk;
+		psock = smap_init_psock(sock, stab);
+		if (IS_ERR(psock))
+			return PTR_ERR(psock);
+		psock->key = i;
+		psock->map = map;
+	}
+
+	if (map_flags & SOCK_MAP_STRPARSER) {
+		smap_start_sock(psock, sock);
+		smap_init_progs(psock, stab);
+		smap_init_sock(psock, sock);
+		psock->strp_enabled = true;
+	} else if (update) {
+		smap_stop_sock(psock, false);
+	}
+
+	if (!update) {
+		old_sock = xchg(&stab->sock_map[i], skops->sk);
+		if (old_sock)
+			smap_release_proxy(old_sock);
+	}
+
+	return 0;
+}
+
+static int sock_map_attach_prog(struct bpf_map *map,
+				struct bpf_prog *parse, struct bpf_prog *mux)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+	stab->bpf_parse = parse;
+	stab->bpf_mux = mux;
+	return 0;
+}
+
+const struct bpf_map_ops sock_map_ops = {
+	.map_alloc = sock_map_alloc,
+	.map_free = sock_map_free,
+	.map_get_next_key = sock_map_get_next_key,
+	.map_lookup_elem = sock_map_lookup_elem,
+	.map_ctx_update_elem = sock_map_update_elem,
+	.map_delete_elem = sock_map_delete_elem,
+	.map_attach = sock_map_attach_prog,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6c772ad..e4f48f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1045,7 +1045,40 @@ static int bpf_obj_get(const union bpf_attr *attr)
 
 #ifdef CONFIG_CGROUP_BPF
 
-#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2
+
+static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype)
+{
+	struct bpf_prog *prog1, *prog2;
+	struct bpf_map *map;
+	int err;
+
+	map = bpf_map_get_with_uref(attr->target_fd);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (!map->ops->map_attach)
+		return -EOPNOTSUPP;
+
+	prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog1))
+		return PTR_ERR(prog1);
+
+	prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype);
+	if (IS_ERR(prog2)) {
+		bpf_prog_put(prog1);
+		return PTR_ERR(prog2);
+	}
+
+	err = map->ops->map_attach(map, prog1, prog2);
+	if (err) {
+		bpf_prog_put(prog1);
+		bpf_prog_put(prog2);
+		return err;
+	}
+
+	return err;
+}
 
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
@@ -1074,10 +1107,16 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
+	case BPF_SOCKMAP_INGRESS:
+		ptype = BPF_PROG_TYPE_SOCKET_FILTER;
+		break;
 	default:
 		return -EINVAL;
 	}
 
+	if (attr->attach_type == BPF_SOCKMAP_INGRESS)
+		return sockmap_get_from_fd(attr, ptype);
+
 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
diff --git a/net/core/filter.c b/net/core/filter.c
index 7e97086..2644f2d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1845,6 +1845,51 @@ int skb_do_redirect(struct sk_buff *skb)
 	.arg3_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+	ri->ifindex = key;
+	ri->flags = flags;
+	ri->map = map;
+
+	return SK_REDIRECT;
+}
+
+inline struct sock *do_sk_redirect_map(void)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct sock *sk = NULL;
+
+	if (ri->map) {
+		sk = __sock_map_lookup_elem(ri->map, ri->ifindex);
+
+		ri->ifindex = 0;
+		ri->map = NULL;
+		/* we do not clear flags for future lookup */
+	}
+
+	return sk;
+}
+EXPORT_SYMBOL(do_sk_redirect_map);
+
+inline u64 get_sk_redirect_flags(void)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+	return ri->flags;
+}
+EXPORT_SYMBOL(get_sk_redirect_flags);
+
+static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+	.func           = bpf_sk_redirect_map,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_CONST_MAP_PTR,
+	.arg2_type      = ARG_ANYTHING,
+	.arg3_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -3090,6 +3135,10 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+	case BPF_FUNC_sk_redirect_map:
+		return &bpf_sk_redirect_map_proto;
+	case BPF_FUNC_map_ctx_update_elem:
+		return &bpf_map_ctx_update_elem_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -3214,6 +3263,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 	switch (func_id) {
 	case BPF_FUNC_setsockopt:
 		return &bpf_setsockopt_proto;
+	case BPF_FUNC_map_ctx_update_elem:
+		return &bpf_map_ctx_update_elem_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
