lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20171031124145.9667-3-bjorn.topel@gmail.com>
Date:   Tue, 31 Oct 2017 13:41:33 +0100
From:   Björn Töpel <bjorn.topel@...il.com>
To:     bjorn.topel@...il.com, magnus.karlsson@...el.com,
        alexander.h.duyck@...el.com, alexander.duyck@...il.com,
        john.fastabend@...il.com, ast@...com, brouer@...hat.com,
        michael.lundkvist@...csson.com, ravineet.singh@...csson.com,
        daniel@...earbox.net, netdev@...r.kernel.org
Cc:     Björn Töpel <bjorn.topel@...el.com>,
        jesse.brandeburg@...el.com, anjali.singhai@...el.com,
        rami.rosen@...el.com, jeffrey.b.shaw@...el.com,
        ferruh.yigit@...el.com, qi.z.zhang@...el.com
Subject: [RFC PATCH 02/14] packet: implement PACKET_MEMREG setsockopt

From: Björn Töpel <bjorn.topel@...el.com>

Here, the PACKET_MEMREG setsockopt is implemented for the AF_PACKET
protocol family. PACKET_MEMREG allows the user to register memory
regions that can be used by AF_PACKET V4 as packet data buffers.

Signed-off-by: Björn Töpel <bjorn.topel@...el.com>
---
 include/linux/tpacket4.h | 101 +++++++++++++++++++++++++++++
 net/packet/af_packet.c   | 163 +++++++++++++++++++++++++++++++++++++++++++++++
 net/packet/internal.h    |   4 ++
 3 files changed, 268 insertions(+)
 create mode 100644 include/linux/tpacket4.h

diff --git a/include/linux/tpacket4.h b/include/linux/tpacket4.h
new file mode 100644
index 000000000000..fcf4c333c78d
--- /dev/null
+++ b/include/linux/tpacket4.h
@@ -0,0 +1,101 @@
+/*
+ *  tpacket v4
+ *  Copyright(c) 2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_TPACKET4_H
+#define _LINUX_TPACKET4_H
+
+#define TP4_UMEM_MIN_FRAME_SIZE 2048 /* Smallest accepted frame size */
+#define TP4_KERNEL_HEADROOM 256 /* Headroom for XDP */
+
+struct tp4_umem {
+	struct pid *pid;		/* pid of the creating task */
+	struct page **pgs;		/* pinned pages backing the region */
+	unsigned int npgs;		/* number of entries in pgs */
+	size_t size;			/* region size in bytes, as requested */
+	unsigned long address;		/* page-aligned start address */
+	unsigned int frame_size;	/* power of two, 2K..PAGE_SIZE */
+	unsigned int frame_size_log2;	/* ilog2(frame_size) */
+	unsigned int nframes;		/* size / frame_size */
+	unsigned int nfpplog2; /* num frames per page in log2 */
+	unsigned int data_headroom;	/* requested headroom, 64-byte aligned */
+};
+
+/*************** V4 QUEUE OPERATIONS *******************************/
+
+/**
+ * tp4q_umem_new - Creates a new umem (packet buffer)
+ *
+ * @addr: The page-aligned start address of the umem
+ * @size: The size of the umem in bytes, at least one frame
+ * @frame_size: The size of each frame, a power of two in [2K, PAGE_SIZE]
+ * @data_headroom: The desired data headroom, rounded up to 64 bytes
+ *
+ * Returns the new umem, or ERR_PTR(-EINVAL/-ENOMEM) on failure (never NULL)
+ **/
+static inline struct tp4_umem *tp4q_umem_new(unsigned long addr, size_t size,
+					     unsigned int frame_size,
+					     unsigned int data_headroom)
+{
+	struct tp4_umem *umem;
+	unsigned int nframes;
+
+	if (frame_size < TP4_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+		/* Strictly speaking we could support this, if:
+		 * - huge pages, or
+		 * - using an IOMMU, or
+		 * - making sure the memory area is consecutive
+		 * but for now, we simply say "computer says no".
+		 */
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!is_power_of_2(frame_size))
+		return ERR_PTR(-EINVAL);
+
+	/* The area must be page aligned; a simplification that may change. */
+	if (!PAGE_ALIGNED(addr))
+		return ERR_PTR(-EINVAL);
+
+	if ((addr + size) < addr)
+		return ERR_PTR(-EINVAL);
+
+	/* At least one frame, and the frame count must fit in nframes */
+	if (size < frame_size || size / frame_size > UINT_MAX)
+		return ERR_PTR(-EINVAL);
+	nframes = size / frame_size;
+
+	/* Unsigned arithmetic: compare rather than subtract, and check
+	 * before ALIGN() so a huge request cannot wrap to a small one.
+	 */
+	if (data_headroom > frame_size - TP4_KERNEL_HEADROOM)
+		return ERR_PTR(-EINVAL);
+	data_headroom = ALIGN(data_headroom, 64);
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	umem->pid = get_task_pid(current, PIDTYPE_PID);
+	umem->size = size;
+	umem->address = addr;
+	umem->frame_size = frame_size;
+	umem->frame_size_log2 = ilog2(frame_size);
+	umem->nframes = nframes;
+	umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size);
+	umem->data_headroom = data_headroom;
+
+	return umem;
+}
+
+#endif /* _LINUX_TPACKET4_H */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 9603f6ff17a4..b39be424ec0e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -89,11 +89,15 @@
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
 #include <linux/percpu.h>
+#include <linux/log2.h>
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
 #include <linux/bpf.h>
 #include <net/compat.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
 
 #include "internal.h"
 
@@ -2975,6 +2979,132 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 		return packet_snd(sock, msg, len);
 }
 
+/* Dirty and release each page pinned for @umem, then free the page array. */
+static void packet_umem_unpin_pages(struct tp4_umem *umem)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < umem->npgs; idx++) {
+		/* Pinned with FOLL_WRITE, so assume the page was written */
+		set_page_dirty_lock(umem->pgs[idx]);
+		put_page(umem->pgs[idx]);
+	}
+
+	kfree(umem->pgs);
+	umem->pgs = NULL;
+}
+
+/* Unpin @umem's pages, undo its pinned_vm accounting and free it.
+ * The accounting step is skipped if the registering task or its mm is gone.
+ */
+static void packet_umem_free(struct tp4_umem *umem)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+
+	packet_umem_unpin_pages(umem);
+
+	task = get_pid_task(umem->pid, PIDTYPE_PID);
+	put_pid(umem->pid);
+	if (!task)
+		goto out;
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	down_write(&mm->mmap_sem);
+	/* Uncharge what was charged: one unit per pinned page, not size */
+	mm->pinned_vm -= umem->npgs;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+out:
+	kfree(umem);
+}
+
+/* Validate and pin a user memory region; returns the umem or an ERR_PTR(). */
+static struct tp4_umem *
+packet_umem_new(unsigned long addr, size_t size, unsigned int frame_size,
+		unsigned int data_headroom)
+{
+	unsigned long lock_limit, locked, npages;
+	struct page **page_list;
+	struct tp4_umem *umem;
+	int j = 0, i, ret;
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	umem = tp4q_umem_new(addr, size, frame_size, data_headroom);
+	if (IS_ERR(umem))
+		return umem;
+
+	page_list = (struct page **)__get_free_page(GFP_KERNEL);
+	if (!page_list) {
+		put_pid(umem->pid);
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Widen first: the unsigned int product could wrap around */
+	npages = (unsigned long)umem->nframes * umem->frame_size;
+	npages = PAGE_ALIGN(npages) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+	locked = npages + current->mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	umem->pgs = kcalloc(npages, sizeof(*umem->pgs), GFP_KERNEL);
+	if (!umem->pgs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	while (npages) {
+		/* page_list is a single page, so pin in page-sized batches */
+		ret = get_user_pages(addr,
+				     min_t(unsigned long, npages,
+					   PAGE_SIZE / sizeof(struct page *)),
+				     FOLL_WRITE, page_list, NULL);
+		if (ret < 0)
+			goto out;
+
+		umem->npgs += ret;
+		addr += ret * PAGE_SIZE;
+		npages -= ret;
+
+		for (i = 0; i < ret; i++)
+			umem->pgs[j++] = page_list[i];
+	}
+
+	ret = 0;
+
+out:
+	if (ret < 0) {
+		/* Safe before any page is pinned: pgs is NULL and npgs is 0 */
+		packet_umem_unpin_pages(umem);
+		put_pid(umem->pid);
+		kfree(umem);
+	} else {
+		current->mm->pinned_vm = locked;
+	}
+
+	up_write(&current->mm->mmap_sem);
+	free_page((unsigned long)page_list);
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+}
+
 /*
  *	Close a PACKET socket. This is fairly simple. We immediately go
  *	to 'closed' state and remove our protocol entry in the device list.
@@ -3024,6 +3154,11 @@ static int packet_release(struct socket *sock)
 		packet_set_ring(sk, &req_u, 1, 1);
 	}
 
+	if (po->umem) {
+		packet_umem_free(po->umem);
+		po->umem = NULL;
+	}
+
 	f = fanout_release(sk);
 
 	synchronize_net();
@@ -3828,6 +3963,31 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
 		return 0;
 	}
+	case PACKET_MEMREG:
+	{
+		struct tpacket_memreg_req req;
+		struct tp4_umem *umem;
+
+		if (optlen < sizeof(req))
+			return -EINVAL;
+		if (copy_from_user(&req, optval, sizeof(req)))
+			return -EFAULT;
+
+		umem = packet_umem_new(req.addr, req.len, req.frame_size,
+				       req.data_headroom);
+		if (IS_ERR(umem))
+			return PTR_ERR(umem);
+
+		lock_sock(sk);
+		if (po->umem) {
+			release_sock(sk);
+			packet_umem_free(umem);
+			return -EBUSY;
+		}
+		po->umem = umem;
+		release_sock(sk);
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -4245,6 +4405,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		case TPACKET_V3:
 			po->tp_hdrlen = TPACKET3_HDRLEN;
 			break;
+		default:
+			err = -EINVAL;
+			goto out;
 		}
 
 		err = -EINVAL;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 94d1d405a116..9c07cfe1b8a3 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -2,6 +2,7 @@
 #define __PACKET_INTERNAL_H__
 
 #include <linux/refcount.h>
+#include <linux/tpacket4.h>
 
 struct packet_mclist {
 	struct packet_mclist	*next;
@@ -109,6 +110,9 @@ struct packet_sock {
 	union  tpacket_stats_u	stats;
 	struct packet_ring_buffer	rx_ring;
 	struct packet_ring_buffer	tx_ring;
+
+	struct tp4_umem			*umem;
+
 	int			copy_thresh;
 	spinlock_t		bind_lock;
 	struct mutex		pg_vec_lock;
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ