[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180327165919.17933-8-bjorn.topel@gmail.com>
Date: Tue, 27 Mar 2018 18:59:12 +0200
From: Björn Töpel <bjorn.topel@...il.com>
To: bjorn.topel@...il.com, magnus.karlsson@...el.com,
alexander.h.duyck@...el.com, alexander.duyck@...il.com,
john.fastabend@...il.com, ast@...com, brouer@...hat.com,
willemdebruijn.kernel@...il.com, daniel@...earbox.net,
netdev@...r.kernel.org
Cc: Björn Töpel <bjorn.topel@...el.com>,
michael.lundkvist@...csson.com, jesse.brandeburg@...el.com,
anjali.singhai@...el.com, qi.z.zhang@...el.com,
ravineet.singh@...csson.com
Subject: [RFC PATCH v2 07/14] bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP
From: Björn Töpel <bjorn.topel@...el.com>
The xskmap is yet another BPF map, very much inspired by
dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application
adds AF_XDP sockets into the map, and by using the bpf_redirect_map
helper, an XDP program can redirect XDP frames to an AF_XDP socket.
Note that a socket that is bound to certain ifindex/queue index will
*only* accept XDP frames from that netdev/queue index. If an XDP
program tries to redirect from a netdev/queue index other than what
the socket is bound to, the frame will not be received on the socket.
A socket can reside in multiple maps.
Signed-off-by: Björn Töpel <bjorn.topel@...el.com>
---
include/linux/bpf.h | 26 +++++
include/linux/bpf_types.h | 3 +
include/net/xdp_sock.h | 34 ++++++
include/uapi/linux/bpf.h | 1 +
kernel/bpf/Makefile | 3 +
kernel/bpf/verifier.c | 8 +-
kernel/bpf/xskmap.c | 281 ++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 354 insertions(+), 2 deletions(-)
create mode 100644 include/net/xdp_sock.h
create mode 100644 kernel/bpf/xskmap.c
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 819229c80eca..0fe9d080adc3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -657,6 +657,32 @@ static inline int sock_map_prog(struct bpf_map *map,
}
#endif
+#if defined(CONFIG_XDP_SOCKETS)
+struct xdp_sock;
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key);
+int __xsk_map_redirect(struct bpf_map *map, u32 index,
+ struct xdp_buff *xdp, struct xdp_sock *xs);
+void __xsk_map_flush(struct bpf_map *map);
+#else
+struct xdp_sock;
+static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
+ u32 key)
+{
+ return NULL;
+}
+
+static inline int __xsk_map_redirect(struct bpf_map *map, u32 index,
+ struct xdp_buff *xdp,
+ struct xdp_sock *xs)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void __xsk_map_flush(struct bpf_map *map)
+{
+}
+#endif
+
/* verifier prototypes for helper functions called from eBPF programs */
extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5e2e8a49fb21..b525862c98ab 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -47,4 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
+#if defined(CONFIG_XDP_SOCKETS)
+BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
+#endif
#endif
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
new file mode 100644
index 000000000000..80d119a685b2
--- /dev/null
+++ b/include/net/xdp_sock.h
@@ -0,0 +1,34 @@
+/*
+ * AF_XDP internal functions
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XDP_SOCK_H
+#define _LINUX_XDP_SOCK_H
+
+struct xdp_sock;
+struct xdp_buff;
+#ifdef CONFIG_XDP_SOCKETS
+int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
+void xsk_flush(struct xdp_sock *xs);
+#else
+static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+ return -ENOTSUPP;
+}
+
+static inline void xsk_flush(struct xdp_sock *xs)
+{
+}
+#endif /* CONFIG_XDP_SOCKETS */
+
+#endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 18b7c510c511..c8e1d2977712 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_CPUMAP,
+ BPF_MAP_TYPE_XSKMAP,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a713fd23ec88..3c59d9bcae14 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -7,6 +7,9 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+ifeq ($(CONFIG_XDP_SOCKETS),y)
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
+endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
ifeq ($(CONFIG_STREAM_PARSER),y)
ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e9f7c20691c1..46d525539a3b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2059,8 +2059,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
- /* Restrict bpf side of cpumap, open when use-cases appear */
+ /* Restrict bpf side of cpumap and xskmap, open when use-cases
+ * appear.
+ */
case BPF_MAP_TYPE_CPUMAP:
+ case BPF_MAP_TYPE_XSKMAP:
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
@@ -2107,7 +2110,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_FUNC_redirect_map:
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
- map->map_type != BPF_MAP_TYPE_CPUMAP)
+ map->map_type != BPF_MAP_TYPE_CPUMAP &&
+ map->map_type != BPF_MAP_TYPE_XSKMAP)
goto error;
break;
case BPF_FUNC_sk_redirect_map:
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
new file mode 100644
index 000000000000..77553f24daa2
--- /dev/null
+++ b/kernel/bpf/xskmap.c
@@ -0,0 +1,281 @@
+/*
+ * XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <net/sock.h>
+
+struct xsk_map_entry {
+ struct xdp_sock *xs;
+ struct rcu_head rcu;
+};
+
+struct xsk_map {
+ struct bpf_map map;
+ struct xsk_map_entry **xsk_map;
+ unsigned long __percpu *flush_needed;
+};
+
+static u64 xsk_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+ struct xsk_map *m;
+ int err = -EINVAL;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != 4 ||
+ attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+ return ERR_PTR(-EINVAL);
+
+ m = kzalloc(sizeof(*m), GFP_USER);
+ if (!m)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&m->map, attr);
+
+ cost = (u64)m->map.max_entries * sizeof(struct xsk_map_entry *);
+ cost += xsk_map_bitmap_size(attr) * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_m;
+
+ m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* Notice returns -EPERM on if map size is larger than memlock limit */
+ err = bpf_map_precharge_memlock(m->map.pages);
+ if (err)
+ goto free_m;
+
+ m->flush_needed = __alloc_percpu(xsk_map_bitmap_size(attr),
+ __alignof__(unsigned long));
+ if (!m->flush_needed)
+ goto free_m;
+
+ m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
+ sizeof(struct xsk_map_entry *),
+ m->map.numa_node);
+ if (!m->xsk_map)
+ goto free_percpu;
+ return &m->map;
+
+free_percpu:
+ free_percpu(m->flush_needed);
+free_m:
+ kfree(m);
+ return ERR_PTR(err);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ int i, cpu;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this
+ * map->refcnt == 0, so the programs (can be more than one
+ * that used this map) were disconnected from events. Wait for
+ * outstanding critical sections in these programs to
+ * complete. The rcu critical section only guarantees no
+ * further reads against xsk_map. It does __not__ ensure
+ * pending flush operations (if any) are complete.
+ */
+
+ synchronize_rcu();
+
+ /* To ensure all pending flush operations have completed wait
+ * for flush bitmap to indicate all flush_needed bits to be
+ * zero on _all_ cpus. Because the above synchronize_rcu()
+ * ensures the map is disconnected from the program we can
+ * assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(m->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, map->max_entries))
+ cond_resched();
+ }
+
+ for (i = 0; i < map->max_entries; i++) {
+ struct xsk_map_entry *entry;
+
+ entry = m->xsk_map[i];
+ if (!entry)
+ continue;
+
+ sock_put((struct sock *)entry->xs);
+ kfree(entry);
+ }
+
+ free_percpu(m->flush_needed);
+ bpf_map_area_free(m->xsk_map);
+ kfree(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= m->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == m->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xsk_map_entry *entry;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ entry = READ_ONCE(m->xsk_map[key]);
+ return entry ? entry->xs : NULL;
+}
+
+int __xsk_map_redirect(struct bpf_map *map, u32 index,
+ struct xdp_buff *xdp, struct xdp_sock *xs)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ unsigned long *bitmap = this_cpu_ptr(m->flush_needed);
+ int err;
+
+ err = xsk_rcv(xs, xdp);
+ if (err)
+ return err;
+
+ __set_bit(index, bitmap);
+ return 0;
+}
+
+void __xsk_map_flush(struct bpf_map *map)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ unsigned long *bitmap = this_cpu_ptr(m->flush_needed);
+ u32 bit;
+
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct xsk_map_entry *entry = READ_ONCE(m->xsk_map[bit]);
+
+ /* This is possible if the entry is removed by user
+ * space between xdp redirect and flush op.
+ */
+ if (unlikely(!entry))
+ continue;
+
+ __clear_bit(bit, bitmap);
+ xsk_flush(entry->xs);
+ }
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+static void __xsk_map_entry_free(struct rcu_head *rcu)
+{
+ struct xsk_map_entry *entry;
+
+ entry = container_of(rcu, struct xsk_map_entry, rcu);
+ xsk_flush(entry->xs);
+ sock_put((struct sock *)entry->xs);
+ kfree(entry);
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xsk_map_entry *entry, *old_entry;
+ u32 i = *(u32 *)key, fd = *(u32 *)value;
+ struct socket *sock;
+ int err;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= m->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+
+ sock = sockfd_lookup(fd, &err);
+ if (!sock)
+ return err;
+
+ if (sock->sk->sk_family != PF_XDP) {
+ sockfd_put(sock);
+ return -EOPNOTSUPP;
+ }
+
+ entry = kmalloc_node(sizeof(*entry), GFP_ATOMIC | __GFP_NOWARN,
+ map->numa_node);
+ if (!entry) {
+ sockfd_put(sock);
+ return -ENOMEM;
+ }
+
+ sock_hold(sock->sk);
+ entry->xs = (struct xdp_sock *)sock->sk;
+
+ old_entry = xchg(&m->xsk_map[i], entry);
+ if (old_entry)
+ call_rcu(&old_entry->rcu, __xsk_map_entry_free);
+
+ sockfd_put(sock);
+ return 0;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xsk_map_entry *old_entry;
+ int k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ old_entry = xchg(&m->xsk_map[k], NULL);
+ if (old_entry)
+ call_rcu(&old_entry->rcu, __xsk_map_entry_free);
+
+ return 0;
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+ .map_alloc = xsk_map_alloc,
+ .map_free = xsk_map_free,
+ .map_get_next_key = xsk_map_get_next_key,
+ .map_lookup_elem = xsk_map_lookup_elem,
+ .map_update_elem = xsk_map_update_elem,
+ .map_delete_elem = xsk_map_delete_elem,
+};
+
+
--
2.14.1
Powered by blists - more mailing lists