[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180131135356.19134-3-bjorn.topel@gmail.com>
Date: Wed, 31 Jan 2018 14:53:34 +0100
From: Björn Töpel <bjorn.topel@...il.com>
To: bjorn.topel@...il.com, magnus.karlsson@...el.com,
alexander.h.duyck@...el.com, alexander.duyck@...il.com,
john.fastabend@...il.com, ast@...com, brouer@...hat.com,
willemdebruijn.kernel@...il.com, daniel@...earbox.net,
netdev@...r.kernel.org
Cc: Björn Töpel <bjorn.topel@...el.com>,
michael.lundkvist@...csson.com, jesse.brandeburg@...el.com,
anjali.singhai@...el.com, jeffrey.b.shaw@...el.com,
ferruh.yigit@...el.com, qi.z.zhang@...el.com
Subject: [RFC PATCH 02/24] xsk: add user memory registration sockopt
From: Björn Töpel <bjorn.topel@...el.com>
The XDP_MEM_REG socket option allows a process to register a window of
user space memory with the kernel. This memory will later be used as
the frame data buffer.
Signed-off-by: Björn Töpel <bjorn.topel@...el.com>
---
include/uapi/linux/if_xdp.h | 7 ++
net/xdp/xsk.c | 294 +++++++++++++++++++++++++++++++++++++++++++-
net/xdp/xsk.h | 19 ++-
3 files changed, 316 insertions(+), 4 deletions(-)
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index cd09232e16c1..3f8c90c708b4 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -29,4 +29,11 @@ struct sockaddr_xdp {
#define XDP_RX_RING 2
#define XDP_TX_RING 3
+/* Memory registration request: the option value user space passes with
+ * the XDP_MEM_REG socket option. The kernel-side validation in this patch
+ * requires addr to be page aligned and frame_size to be a power of two.
+ */
+struct xdp_mr_req {
+ __u64 addr; /* Start of packet data area */
+ __u64 len; /* Length of packet data area */
+ __u32 frame_size; /* Frame size */
+ __u32 data_headroom; /* Frame head room */
+};
+
#endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2d7c08a50c60..333ce1450cc7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -19,18 +19,235 @@
#include <linux/if_xdp.h>
#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
#include <linux/socket.h>
#include <net/sock.h>
#include "xsk.h"
+/* Smallest frame_size accepted by xsk_umem_create() */
+#define XSK_UMEM_MIN_FRAME_SIZE 2048
+
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
+ /* Memory area registered through XDP_MEM_REG. xsk_setsockopt() treats
+ * a non-NULL value as "already registered"; xsk_release() destroys it.
+ */
+ struct xsk_umem *umem;
};
+/* Map a generic struct sock to the XDP socket that embeds it. The sock is
+ * the first member of struct xdp_sock, so the resulting pointer value is
+ * the same as the one passed in.
+ */
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+	return container_of(sk, struct xdp_sock, sk);
+}
+
+/* Drop the references held on the first umem->npgs entries of umem->pgs,
+ * marking each page dirty first, then free the array itself. Does nothing
+ * when no page array is attached to the umem.
+ */
+static void xsk_umem_unpin_pages(struct xsk_umem *umem)
+{
+	struct page **pages = umem->pgs;
+	unsigned int idx;
+
+	if (!pages)
+		return;
+
+	for (idx = 0; idx < umem->npgs; idx++) {
+		set_page_dirty_lock(pages[idx]);
+		put_page(pages[idx]);
+	}
+
+	kfree(pages);
+	umem->pgs = NULL;
+}
+
+/* Tear down a registered umem: unpin its pages, give the pinned page count
+ * back to the registering task's mm (when that task and its mm can still be
+ * looked up) and free the descriptor. Passing NULL is a no-op.
+ */
+static void xsk_umem_destroy(struct xsk_umem *umem)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+	unsigned long npgs;
+
+	if (!umem)
+		return;
+
+	/* Registration charged umem->npgs pages to mm->pinned_vm, so that
+	 * is the amount to return. umem->size >> PAGE_SHIFT is not
+	 * equivalent: it differs whenever size is not a whole number of
+	 * pages, which would leave pinned_vm permanently inflated.
+	 */
+	npgs = umem->npgs;
+
+	xsk_umem_unpin_pages(umem);
+
+	task = get_pid_task(umem->pid, PIDTYPE_PID);
+	put_pid(umem->pid);
+	if (!task)
+		goto out;
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	down_write(&mm->mmap_sem);
+	mm->pinned_vm -= npgs;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+out:
+	kfree(umem);
+}
+
+/* Validate a memory registration request and allocate the umem descriptor
+ * for it. No pages are pinned here.
+ *
+ * @addr:          start of the user memory area, must be page aligned
+ * @size:          length of the area, must hold at least one frame
+ * @frame_size:    power of two in [XSK_UMEM_MIN_FRAME_SIZE, PAGE_SIZE]
+ * @data_headroom: requested headroom, rounded up to a multiple of 64
+ *
+ * Returns the new umem (holding a reference on the caller's struct pid),
+ * or ERR_PTR(-EINVAL) / ERR_PTR(-ENOMEM).
+ */
+static struct xsk_umem *xsk_umem_create(u64 addr, u64 size, u32 frame_size,
+					u32 data_headroom)
+{
+	struct xsk_umem *umem;
+	u64 nframes;
+
+	if (frame_size < XSK_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+		/* Strictly speaking we could support this, if:
+		 * - huge pages, or
+		 * - using an IOMMU, or
+		 * - making sure the memory area is consecutive
+		 * but for now, we simply say "computer says no".
+		 */
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!is_power_of_2(frame_size))
+		return ERR_PTR(-EINVAL);
+
+	if (!PAGE_ALIGNED(addr)) {
+		/* Memory area has to be page size aligned. For
+		 * simplicity, this might change.
+		 */
+		return ERR_PTR(-EINVAL);
+	}
+
+	if ((addr + size) < addr)
+		return ERR_PTR(-EINVAL);
+
+	/* Keep the quotient in 64 bits until it is known to fit the u32
+	 * frame counter; otherwise a huge size is silently truncated.
+	 */
+	nframes = size / frame_size;
+	if (nframes == 0 || nframes > UINT_MAX)
+		return ERR_PTR(-EINVAL);
+
+	/* Bound the user supplied headroom before rounding it up, so that
+	 * neither ALIGN() nor the sum below can wrap around. Doing the
+	 * check as an unsigned subtraction would let values close to
+	 * U32_MAX slip through.
+	 */
+	if (data_headroom > frame_size)
+		return ERR_PTR(-EINVAL);
+
+	data_headroom = ALIGN(data_headroom, 64);
+
+	if (data_headroom + XSK_KERNEL_HEADROOM > frame_size)
+		return ERR_PTR(-EINVAL);
+
+	/* kzalloc() zeroes the descriptor, so pgs starts out NULL. */
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	umem->pid = get_task_pid(current, PIDTYPE_PID);
+	umem->size = (size_t)size;
+	umem->address = (unsigned long)addr;
+	umem->frame_size = frame_size;
+	umem->nframes = (u32)nframes;
+	umem->data_headroom = data_headroom;
+
+	return umem;
+}
+
+/* Pin the umem->npgs user pages starting at umem->address and record them
+ * in a newly allocated umem->pgs array.
+ *
+ * Returns 0 on success. On failure a negative errno is returned, any pages
+ * that did get pinned are released again and umem->pgs is left NULL. Note
+ * that a partial pin overwrites umem->npgs with the number of pages that
+ * were obtained.
+ */
+static int xsk_umem_pin_pages(struct xsk_umem *umem)
+{
+	unsigned int gup_flags = FOLL_WRITE;
+	long npgs;
+
+	/* XXX Fix so that we don't always pin.
+	 * "copy to user" from interrupt context, but how?
+	 */
+
+	/* This runs in process context with mmap_sem held by the caller,
+	 * so a sleeping allocation is fine; GFP_ATOMIC would only make a
+	 * large page array more likely to fail.
+	 */
+	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
+	if (!umem->pgs)
+		return -ENOMEM;
+
+	npgs = get_user_pages(umem->address, umem->npgs,
+			      gup_flags, &umem->pgs[0], NULL);
+	if (npgs != umem->npgs) {
+		if (npgs >= 0) {
+			/* Partial pin: release what we got. The helper
+			 * also frees and clears umem->pgs.
+			 */
+			umem->npgs = npgs;
+			xsk_umem_unpin_pages(umem);
+			return -ENOMEM;
+		}
+
+		/* Nothing was pinned, only the array needs to go. */
+		kfree(umem->pgs);
+		umem->pgs = NULL;
+		return npgs;
+	}
+
+	return 0;
+}
+
+/* Register a window of user memory: validate the request, check it against
+ * RLIMIT_MEMLOCK, pin the backing pages and charge them to the caller's
+ * mm->pinned_vm.
+ *
+ * Returns the new umem on success. On failure returns ERR_PTR() with
+ * -EPERM (caller may not lock memory), -EINVAL (bad parameters), -ENOMEM
+ * (lock limit exceeded or allocation/pinning failed) or the error reported
+ * while pinning; nothing is left allocated or accounted in that case.
+ */
+static struct xsk_umem *xsk_mem_reg(u64 addr, u64 size, u32 frame_size,
+				    u32 data_headroom)
+{
+	unsigned long lock_limit, locked;
+	struct xsk_umem *umem;
+	u64 npages;
+	int ret;
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	umem = xsk_umem_create(addr, size, frame_size, data_headroom);
+	if (IS_ERR(umem))
+		return umem;
+
+	/* nframes and frame_size are both u32: widen before multiplying,
+	 * otherwise the product can wrap and we would pin fewer pages than
+	 * the frame count implies.
+	 */
+	npages = PAGE_ALIGN((u64)umem->nframes * umem->frame_size) >> PAGE_SHIFT;
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+	umem->npgs = npages;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked = umem->npgs + current->mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	ret = xsk_umem_pin_pages(umem);
+	if (ret < 0)
+		goto out_unlock;
+
+	/* Only account the pages once they are actually pinned. */
+	current->mm->pinned_vm = locked;
+
+	up_write(&current->mm->mmap_sem);
+
+	return umem;
+
+out_unlock:
+	up_write(&current->mm->mmap_sem);
+out_free:
+	put_pid(umem->pid);
+	kfree(umem);
+
+	return ERR_PTR(ret);
+}
+
static int xsk_release(struct socket *sock)
{
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct net *net;
+
+ if (!sk)
+ return 0;
+
+ net = sock_net(sk);
+
+ local_bh_disable();
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ local_bh_enable();
+
+ xsk_umem_destroy(xs->umem);
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ sk_refcnt_debug_release(sk);
+ sock_put(sk);
+
return 0;
}
@@ -48,6 +265,43 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
static int xsk_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ if (level != SOL_XDP)
+ return -ENOPROTOOPT;
+
+ switch (optname) {
+ case XDP_MEM_REG:
+ {
+ struct xdp_mr_req req;
+ struct xsk_umem *umem;
+
+ if (optlen < sizeof(req))
+ return -EINVAL;
+ if (copy_from_user(&req, optval, sizeof(req)))
+ return -EFAULT;
+
+ umem = xsk_mem_reg(req.addr, req.len, req.frame_size,
+ req.data_headroom);
+ if (IS_ERR(umem))
+ return PTR_ERR(umem);
+
+ lock_sock(sk);
+ if (xs->umem) { /* XXX create and check afterwards... really? */
+ release_sock(sk);
+ xsk_umem_destroy(umem);
+ return -EBUSY;
+ }
+ xs->umem = umem;
+ release_sock(sk);
+
+ return 0;
+ }
+ default:
+ break;
+ }
+
return -ENOPROTOOPT;
}
@@ -97,10 +351,48 @@ static const struct proto_ops xsk_proto_ops = {
/* the rest vvv, OK to be missing implementation -- checked against NULL. */
};
+/* sk_destruct callback: drops the refcount-debug accounting, but only for
+ * a sock that has been flagged SOCK_DEAD.
+ */
+static void xsk_destruct(struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_DEAD))
+		sk_refcnt_debug_dec(sk);
+}
+
+/* Create handler for PF_XDP sockets.
+ *
+ * Requires CAP_NET_RAW in the user namespace of @net, and only accepts
+ * SOCK_RAW sockets with protocol 0. On success a new sock is allocated,
+ * bound to @sock with xsk_proto_ops and counted as in use.
+ *
+ * Returns 0 on success, or -EPERM, -ESOCKTNOSUPPORT, -EPROTONOSUPPORT or
+ * -ENOBUFS on failure.
+ */
static int xsk_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
- return -EOPNOTSUPP;
+ struct sock *sk;
+
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ return -EPERM;
+ if (sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ /* XXX Require ETH_P_IP? Something else? */
+ if (protocol)
+ return -EPROTONOSUPPORT;
+
+ sock->state = SS_UNCONNECTED;
+
+ sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+ if (!sk)
+ return -ENOBUFS;
+
+ sock->ops = &xsk_proto_ops;
+
+ sock_init_data(sock, sk);
+
+ sk->sk_family = PF_XDP;
+
+ /* Paired with sk_refcnt_debug_dec() in xsk_destruct() */
+ sk->sk_destruct = xsk_destruct;
+ sk_refcnt_debug_inc(sk);
+
+ /* Paired with the -1 adjustment done in xsk_release() */
+ local_bh_disable();
+ sock_prot_inuse_add(net, &xsk_proto, 1);
+ local_bh_enable();
+
+ return 0;
}
static const struct net_proto_family xsk_family_ops = {
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index 441f8d00a9d5..71559374645b 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -12,7 +12,20 @@
* more details.
*/
-#ifndef _LINUX_XDPSOCK_H
-#define _LINUX_XDPSOCK_H
+#ifndef _LINUX_XSK_H
+#define _LINUX_XSK_H
-#endif /* _LINUX_XDPSOCK_H */
+#define XSK_KERNEL_HEADROOM 256 /* Headroom for XDP */
+
+/* Kernel-side description of a user memory area registered with
+ * XDP_MEM_REG.
+ */
+struct xsk_umem {
+ struct pid *pid; /* pid of the registering task, used to find its mm on destroy */
+ struct page **pgs; /* pinned pages backing the area, NULL when nothing is pinned */
+ unsigned long address; /* user space start address of the area */
+ size_t size; /* length of the area as requested by user space */
+ u32 npgs; /* number of entries in pgs */
+ u32 frame_size; /* size of one frame */
+ u32 nframes; /* number of whole frames that fit in size */
+ u32 data_headroom; /* requested headroom, rounded up to a multiple of 64 */
+};
+
+#endif /* _LINUX_XSK_H */
--
2.14.1
Powered by blists - more mailing lists