[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <13c752b7-7fbe-9709-5d50-d780c3ac1f33@iogearbox.net>
Date: Fri, 4 May 2018 14:34:21 +0200
From: Daniel Borkmann <daniel@...earbox.net>
To: Björn Töpel <bjorn.topel@...il.com>,
magnus.karlsson@...el.com, alexander.h.duyck@...el.com,
alexander.duyck@...il.com, john.fastabend@...il.com, ast@...com,
brouer@...hat.com, willemdebruijn.kernel@...il.com, mst@...hat.com,
netdev@...r.kernel.org
Cc: Björn Töpel <bjorn.topel@...el.com>,
michael.lundkvist@...csson.com, jesse.brandeburg@...el.com,
anjali.singhai@...el.com, qi.z.zhang@...el.com
Subject: Re: [PATCH bpf-next v3 02/15] xsk: add user memory registration
support sockopt
On 05/02/2018 01:01 PM, Björn Töpel wrote:
[...]
(Few nits for follow-ups I haven't sent out yesterday night yet; they are on
top of Alexei's remarks.)
> ---
> include/net/xdp_sock.h | 31 ++++++
> include/uapi/linux/if_xdp.h | 34 ++++++
> net/Makefile | 1 +
> net/xdp/Makefile | 2 +
> net/xdp/xdp_umem.c | 245 ++++++++++++++++++++++++++++++++++++++++++++
> net/xdp/xdp_umem.h | 45 ++++++++
> net/xdp/xdp_umem_props.h | 23 +++++
> net/xdp/xsk.c | 215 ++++++++++++++++++++++++++++++++++++++
> 8 files changed, 596 insertions(+)
> create mode 100644 include/net/xdp_sock.h
> create mode 100644 include/uapi/linux/if_xdp.h
> create mode 100644 net/xdp/Makefile
> create mode 100644 net/xdp/xdp_umem.c
> create mode 100644 net/xdp/xdp_umem.h
> create mode 100644 net/xdp/xdp_umem_props.h
> create mode 100644 net/xdp/xsk.c
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> new file mode 100644
> index 000000000000..94785f5db13e
> --- /dev/null
> +++ b/include/net/xdp_sock.h
> @@ -0,0 +1,31 @@
> +/* SPDX-License-Identifier: GPL-2.0
I think this should just be a single line comment, at least that's the case
for majority of files in tree, so probably good to stick to it as well ...
$ git grep -n "SPDX-License-Identifier" | grep "\/\*" | grep -v "\*\/" | wc -l
20
$ git grep -n "SPDX-License-Identifier" | grep "\/\*" | grep "\*\/" | wc -l
7742
... and probably would also make sense to get rid of the below boiler plate
text then. (Applies to other added files as well in this series, only mention
it here once.)
> + * AF_XDP internal functions
> + * Copyright(c) 2018 Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _LINUX_XDP_SOCK_H
> +#define _LINUX_XDP_SOCK_H
> +
> +#include <linux/mutex.h>
> +#include <net/sock.h>
> +
> +struct xdp_umem;
> +
> +struct xdp_sock {
> + /* struct sock must be the first member of struct xdp_sock */
> + struct sock sk;
> + struct xdp_umem *umem;
> + /* Protects multiple processes in the control path */
> + struct mutex mutex;
> +};
> +
> +#endif /* _LINUX_XDP_SOCK_H */
[...]
> diff --git a/net/Makefile b/net/Makefile
> index a6147c61b174..77aaddedbd29 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -85,3 +85,4 @@ obj-y += l3mdev/
> endif
> obj-$(CONFIG_QRTR) += qrtr/
> obj-$(CONFIG_NET_NCSI) += ncsi/
> +obj-$(CONFIG_XDP_SOCKETS) += xdp/
> diff --git a/net/xdp/Makefile b/net/xdp/Makefile
> new file mode 100644
> index 000000000000..a5d736640a0f
> --- /dev/null
> +++ b/net/xdp/Makefile
> @@ -0,0 +1,2 @@
> +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o
> +
Nit: newline at end of file.
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> new file mode 100644
> index 000000000000..ec8b3552be44
> --- /dev/null
> +++ b/net/xdp/xdp_umem.c
[...]
> +
> +#include "xdp_umem.h"
> +
> +#define XDP_UMEM_MIN_FRAME_SIZE 2048
> +
> +int xdp_umem_create(struct xdp_umem **umem)
> +{
> + *umem = kzalloc(sizeof(**umem), GFP_KERNEL);
> +
> + if (!(*umem))
Nit: extra () not needed. You also have the extra brackets in couple of
other places/conditionals.
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +static void xdp_umem_unpin_pages(struct xdp_umem *umem)
> +{
> + unsigned int i;
> +
> + if (umem->pgs) {
All call-sites of this have umem->pgs never as NULL.
> + for (i = 0; i < umem->npgs; i++) {
> + struct page *page = umem->pgs[i];
> +
> + set_page_dirty_lock(page);
> + put_page(page);
> + }
> +
> + kfree(umem->pgs);
> + umem->pgs = NULL;
> + }
> +}
> +
> +static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
> +{
> + if (umem->user) {
> + atomic_long_sub(umem->npgs, &umem->user->locked_vm);
> + free_uid(umem->user);
> + }
> +}
> +
> +static void xdp_umem_release(struct xdp_umem *umem)
> +{
> + struct task_struct *task;
> + struct mm_struct *mm;
> +
> + if (umem->pgs) {
> + xdp_umem_unpin_pages(umem);
> +
> + task = get_pid_task(umem->pid, PIDTYPE_PID);
> + put_pid(umem->pid);
> + if (!task)
> + goto out;
> + mm = get_task_mm(task);
> + put_task_struct(task);
> + if (!mm)
> + goto out;
> +
> + mmput(mm);
> + umem->pgs = NULL;
> + }
> +
> + xdp_umem_unaccount_pages(umem);
> +out:
> + kfree(umem);
> +}
> +
> +static void xdp_umem_release_deferred(struct work_struct *work)
> +{
> + struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
> +
> + xdp_umem_release(umem);
> +}
> +
> +void xdp_get_umem(struct xdp_umem *umem)
> +{
> + atomic_inc(&umem->users);
> +}
> +
> +void xdp_put_umem(struct xdp_umem *umem)
> +{
> + if (!umem)
> + return;
> +
> + if (atomic_dec_and_test(&umem->users)) {
> + INIT_WORK(&umem->work, xdp_umem_release_deferred);
> + schedule_work(&umem->work);
> + }
> +}
> +
> +static int xdp_umem_pin_pages(struct xdp_umem *umem)
> +{
> + unsigned int gup_flags = FOLL_WRITE;
> + long npgs;
> + int err;
> +
> + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
> + if (!umem->pgs)
> + return -ENOMEM;
> +
> + down_write(¤t->mm->mmap_sem);
> + npgs = get_user_pages(umem->address, umem->npgs,
> + gup_flags, &umem->pgs[0], NULL);
> + up_write(¤t->mm->mmap_sem);
> +
> + if (npgs != umem->npgs) {
> + if (npgs >= 0) {
> + umem->npgs = npgs;
> + err = -ENOMEM;
> + goto out_pin;
> + }
> + err = npgs;
> + goto out_pgs;
> + }
> + return 0;
> +
> +out_pin:
> + xdp_umem_unpin_pages(umem);
> +out_pgs:
> + kfree(umem->pgs);
> + umem->pgs = NULL;
> + return err;
> +}
> +
> +static int xdp_umem_account_pages(struct xdp_umem *umem)
> +{
> + unsigned long lock_limit, new_npgs, old_npgs;
> +
> + if (capable(CAP_IPC_LOCK))
> + return 0;
> +
> + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> + umem->user = get_uid(current_user());
> +
> + do {
> + old_npgs = atomic_long_read(&umem->user->locked_vm);
> + new_npgs = old_npgs + umem->npgs;
> + if (new_npgs > lock_limit) {
> + free_uid(umem->user);
> + umem->user = NULL;
> + return -ENOBUFS;
> + }
> + } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
> + new_npgs) != old_npgs);
> + return 0;
> +}
> +
> +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> +{
> + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
> + u64 addr = mr->addr, size = mr->len;
> + unsigned int nframes, nfpp;
> + int size_chk, err;
> +
> + if (!umem)
> + return -EINVAL;
Wouldn't it be better to remove these sort of defensive checks (here and in
other places)? Eventually they might only end up hiding potential bugs rather
than having them noticed. Only call-site is in xsk_setsockopt() where you do:
[...]
mutex_lock(&xs->mutex);
err = xdp_umem_create(&umem);
err = xdp_umem_reg(umem, &mr);
if (err) {
kfree(umem);
mutex_unlock(&xs->mutex);
return err;
}
[...]
Seems more intuitive and easier to audit when you bail out initially when
xdp_umem_create() fails rather than in xdp_umem_reg(), and then the test
for !umem can be removed.
> + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
> + /* Strictly speaking we could support this, if:
> + * - huge pages, or*
> + * - using an IOMMU, or
> + * - making sure the memory area is consecutive
> + * but for now, we simply say "computer says no".
> + */
> + return -EINVAL;
> + }
> +
> + if (!is_power_of_2(frame_size))
> + return -EINVAL;
> +
> + if (!PAGE_ALIGNED(addr)) {
> + /* Memory area has to be page size aligned. For
> + * simplicity, this might change.
> + */
> + return -EINVAL;
> + }
> +
> + if ((addr + size) < addr)
> + return -EINVAL;
> +
> + nframes = size / frame_size;
> + if (nframes == 0 || nframes > UINT_MAX)
> + return -EINVAL;
> +
> + nfpp = PAGE_SIZE / frame_size;
> + if (nframes < nfpp || nframes % nfpp)
> + return -EINVAL;
> +
> + frame_headroom = ALIGN(frame_headroom, 64);
> +
> + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
> + if (size_chk < 0)
> + return -EINVAL;
> +
> + umem->pid = get_task_pid(current, PIDTYPE_PID);
> + umem->size = (size_t)size;
> + umem->address = (unsigned long)addr;
> + umem->props.frame_size = frame_size;
> + umem->props.nframes = nframes;
> + umem->frame_headroom = frame_headroom;
> + umem->npgs = size / PAGE_SIZE;
> + umem->pgs = NULL;
> + umem->user = NULL;
> +
> + umem->frame_size_log2 = ilog2(frame_size);
> + umem->nfpp_mask = nfpp - 1;
> + umem->nfpplog2 = ilog2(nfpp);
> + atomic_set(&umem->users, 1);
> +
> + err = xdp_umem_account_pages(umem);
> + if (err)
> + goto out;
> +
> + err = xdp_umem_pin_pages(umem);
> + if (err)
> + goto out_account;
> + return 0;
> +
> +out_account:
> + xdp_umem_unaccount_pages(umem);
> +out:
> + put_pid(umem->pid);
> + return err;
> +}
[...]
> +#ifndef XDP_UMEM_H_
> +#define XDP_UMEM_H_
> +
> +#include <linux/mm.h>
> +#include <linux/if_xdp.h>
> +#include <linux/workqueue.h>
> +
> +#include "xdp_umem_props.h"
> +
> +struct xdp_umem {
> + struct page **pgs;
> + struct xdp_umem_props props;
> + u32 npgs;
> + u32 frame_headroom;
> + u32 nfpp_mask;
> + u32 nfpplog2;
> + u32 frame_size_log2;
> + struct user_struct *user;
> + struct pid *pid;
> + unsigned long address;
> + size_t size;
> + atomic_t users;
Convert to refcnt_t?
> + struct work_struct work;
> +};
> +
> +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr);
> +void xdp_get_umem(struct xdp_umem *umem);
> +void xdp_put_umem(struct xdp_umem *umem);
> +int xdp_umem_create(struct xdp_umem **umem);
> +
> +#endif /* XDP_UMEM_H_ */
[...]
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> new file mode 100644
> index 000000000000..84e0e867febb
[...]
> +static struct xdp_sock *xdp_sk(struct sock *sk)
> +{
> + return (struct xdp_sock *)sk;
> +}
> +
> +static int xsk_release(struct socket *sock)
> +{
> + struct sock *sk = sock->sk;
> + struct net *net;
> +
> + if (!sk)
> + return 0;
> +
> + net = sock_net(sk);
> +
> + local_bh_disable();
> + sock_prot_inuse_add(net, sk->sk_prot, -1);
> + local_bh_enable();
> +
> + sock_orphan(sk);
> + sock->sk = NULL;
> +
> + sk_refcnt_debug_release(sk);
> + sock_put(sk);
> +
> + return 0;
> +}
> +
> +static int xsk_setsockopt(struct socket *sock, int level, int optname,
> + char __user *optval, unsigned int optlen)
> +{
> + struct sock *sk = sock->sk;
> + struct xdp_sock *xs = xdp_sk(sk);
> + int err;
> +
> + if (level != SOL_XDP)
> + return -ENOPROTOOPT;
> +
> + switch (optname) {
> + case XDP_UMEM_REG:
> + {
> + struct xdp_umem_reg mr;
> + struct xdp_umem *umem;
> +
> + if (xs->umem)
Does this need READ_ONCE() or needs to go under the lock?
> + return -EBUSY;
> +
> + if (copy_from_user(&mr, optval, sizeof(mr)))
> + return -EFAULT;
> +
> + mutex_lock(&xs->mutex);
> + err = xdp_umem_create(&umem);
> +
> + err = xdp_umem_reg(umem, &mr);
> + if (err) {
> + kfree(umem);
The kfree() here begs for having a proper destructor such that once you extend
xdp_umem_create() these spots are not missed when further cleanups have to be
performed on dismantle.
> + mutex_unlock(&xs->mutex);
> + return err;
> + }
> +
> + /* Make sure umem is ready before it can be seen by others */
> + smp_wmb();
> +
> + xs->umem = umem;
> + mutex_unlock(&xs->mutex);
> + return 0;
> + }
> + default:
> + break;
> + }
> +
> + return -ENOPROTOOPT;
> +}
> +
> +static struct proto xsk_proto = {
> + .name = "XDP",
> + .owner = THIS_MODULE,
> + .obj_size = sizeof(struct xdp_sock),
> +};
> +
> +static const struct proto_ops xsk_proto_ops = {
> + .family = PF_XDP,
> + .owner = THIS_MODULE,
> + .release = xsk_release,
> + .bind = sock_no_bind,
> + .connect = sock_no_connect,
> + .socketpair = sock_no_socketpair,
> + .accept = sock_no_accept,
> + .getname = sock_no_getname,
> + .poll = sock_no_poll,
> + .ioctl = sock_no_ioctl,
> + .listen = sock_no_listen,
> + .shutdown = sock_no_shutdown,
> + .setsockopt = xsk_setsockopt,
> + .getsockopt = sock_no_getsockopt,
> + .sendmsg = sock_no_sendmsg,
> + .recvmsg = sock_no_recvmsg,
> + .mmap = sock_no_mmap,
> + .sendpage = sock_no_sendpage,
Nit: would have been nice to properly align the '='
> +};
> +
> +static void xsk_destruct(struct sock *sk)
> +{
> + struct xdp_sock *xs = xdp_sk(sk);
> +
> + if (!sock_flag(sk, SOCK_DEAD))
> + return;
> +
> + xdp_put_umem(xs->umem);
> +
> + sk_refcnt_debug_dec(sk);
> +}
> +
> +static int xsk_create(struct net *net, struct socket *sock, int protocol,
> + int kern)
> +{
> + struct sock *sk;
> + struct xdp_sock *xs;
> +
> + if (!ns_capable(net->user_ns, CAP_NET_RAW))
> + return -EPERM;
> + if (sock->type != SOCK_RAW)
> + return -ESOCKTNOSUPPORT;
> +
> + if (protocol)
> + return -EPROTONOSUPPORT;
> +
> + sock->state = SS_UNCONNECTED;
[...]
Powered by blists - more mailing lists