lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 21 Nov 2014 15:44:54 -0500
From:	Willem de Bruijn <willemb@...gle.com>
To:	netdev@...r.kernel.org
Cc:	davem@...emloft.net, eric.dumazet@...il.com, dborkman@...hat.com,
	mst@...hat.com, Willem de Bruijn <willemb@...gle.com>
Subject: [PATCH rfc] packet: zerocopy packet_snd

From: Willem de Bruijn <willemb@...gle.com>

Extend the copy avoidance ("zero copy") interface from virtio and tap
to packet sockets.

Interface (send):
  Pass MSG_ZEROCOPY flag in send to have the kernel link the original
  user pages into its skb_shinfo(skb)->frags.

Interface (recv notification):
  Process the socket error queue. Ancillary data of type
  SO_EE_ORIGIN_UPAGE will return the pointer to the buffer passed in
  send to notify the completion of that send request.

  If the process allows the error queue to overflow, notifications
  will be dropped.

Tested by injecting max sized TCP packets on a host with TSO enabled
and GSO disabled. This passes the full packet to the NIC. Perf shows
a large drop in cycles spent (perf record -C 4 -a sleep 4)

Sending 10 Kpps without MSG_ZEROCOPY:

    Event count (approx.): 1191500371
    30.63%  psock_reinject  [kernel.kallsyms]  [k] copy_user_enhanced_fast_string
     7.50%  psock_reinject  [kernel.kallsyms]  [k] __rmqueue
     6.26%         swapper  [kernel.kallsyms]  [k] intel_idle
     5.06%  psock_reinject  [kernel.kallsyms]  [k] get_page_from_freelist
     1.99%  psock_reinject  [kernel.kallsyms]  [k] mlx4_en_xmit
     1.60%  psock_reinject  [kernel.kallsyms]  [k] __alloc_pages_nodemask

and the same with MSG_ZEROCOPY:

    Event count (approx.): 390783801
    13.96%         swapper  [kernel.kallsyms]  [k] intel_idle
     3.11%  psock_reinject  [kernel.kallsyms]  [k] mlx4_en_xmit
     2.75%  psock_reinject  [kernel.kallsyms]  [k] _raw_spin_lock
     2.67%         swapper  [kernel.kallsyms]  [k] lapic_next_deadline
     2.33%  psock_reinject  [kernel.kallsyms]  [k] gup_huge_pmd
     1.90%         swapper  [kernel.kallsyms]  [k] apic_timer_interrupt

The idle is probably due to using usleep to pace the sender, so this
underreports the effect.

Another test sent packets over loopback to a UDP socket. The test
executes mostly synchronously in a single thread:
  sendmsg(fd_packet);
  recvmsg(fd_inet);
  recvmsg(fd_packet, .., MSG_ERRQUEUE);

Result from that at various packet sizes:

            no-zc    zc   !ubufs  recvmmsg !callback
1           573     368      359       439       517
10          569     368      361       441       507
100         572     299      340       408       478
1000        564     300      338       412       486
10000       475     247      331       405       479
30000       328     177      324       385       450
65000       218     117      307       359       417

Legend:
  zc:         pass MSG_ZEROCOPY
  !ubufs:     (gross hack) comment out skb_orphan_frags
                  in __netif_receive_skb_core
  recvmmsg:   switch from synchronous completion notification
                  handling to batching.
  !callback:  (gross hack) disable upcall mechanism completely,
                  to expose its cost.

This demonstrates one of the large blocking issues to the practical
use of this feature in many datapaths: the kernel copies data of
sk_buffs with field SKBTX_DEV_ZEROCOPY at skb_clone and other points
to ensure safe reference counting. Follow on work would be needed
to refine this mechanism and increase the number of datapaths where
user pages can safely be used. This is one reason why I demonstrate
with TSO, as opposed to GSO: GSO also has to copy.

Implementation note: returning the pointer is somewhat ugly. An
alternative is to keep a per-socket counter, increment that for
each MSG_ZEROCOPY sendmsg() call and return this counter as
notification.

Signed-off-by: Willem de Bruijn <willemb@...gle.com>
---
 include/linux/skbuff.h        |   1 +
 include/linux/socket.h        |   1 +
 include/uapi/linux/errqueue.h |   1 +
 net/packet/af_packet.c        | 110 ++++++++++++++++++++++++++++++++++++++----
 4 files changed, 103 insertions(+), 10 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 78c299f..8e661d2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -295,6 +295,7 @@ struct ubuf_info {
 	void (*callback)(struct ubuf_info *, bool zerocopy_success);
 	void *ctx;
 	unsigned long desc;
+	void *callback_priv;
 };
 
 /* This data is invariant across clones and lives at
diff --git a/include/linux/socket.h b/include/linux/socket.h
index de52228..0a2e0ea 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -265,6 +265,7 @@ struct ucred {
 #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
 #define MSG_EOF         MSG_FIN
 
+#define MSG_ZEROCOPY	0x4000000	/* Pin user pages */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
 					   descriptor received through
diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h
index 07bdce1..61bf318 100644
--- a/include/uapi/linux/errqueue.h
+++ b/include/uapi/linux/errqueue.h
@@ -19,6 +19,7 @@ struct sock_extended_err {
 #define SO_EE_ORIGIN_ICMP6	3
 #define SO_EE_ORIGIN_TXSTATUS	4
 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
+#define SO_EE_ORIGIN_UPAGE	5
 
 #define SO_EE_OFFENDER(ee)	((struct sockaddr*)((ee)+1))
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 58af5802..367c23a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2370,28 +2370,112 @@ out:
 
 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
 				        size_t reserve, size_t len,
-				        size_t linear, int noblock,
+					size_t linear, int flags,
 				        int *err)
 {
 	struct sk_buff *skb;
+	size_t data_len;
 
-	/* Under a page?  Don't bother with paged skb. */
-	if (prepad + len < PAGE_SIZE || !linear)
-		linear = len;
+	if (flags & MSG_ZEROCOPY) {
+		/* Minimize linear, but respect header lower bound */
+		linear = min(len, max_t(size_t, linear, MAX_HEADER));
+		data_len = 0;
+	} else {
+		/* Under a page? Don't bother with paged skb. */
+		if (prepad + len < PAGE_SIZE || !linear)
+			linear = len;
+		data_len = len - linear;
+	}
 
-	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   err, 0);
+	skb = sock_alloc_send_pskb(sk, prepad + linear, data_len,
+				   flags & MSG_DONTWAIT, err, 0);
 	if (!skb)
 		return NULL;
 
 	skb_reserve(skb, reserve);
 	skb_put(skb, linear);
-	skb->data_len = len - linear;
-	skb->len += len - linear;
+	skb->data_len = data_len;
+	skb->len += data_len;
 
 	return skb;
 }
 
+/* release zerocopy resources and avoid destructor callback on kfree */
+static void packet_snd_zerocopy_free(struct sk_buff *skb)
+{
+	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
+
+	if (uarg) {
+		kfree_skb(uarg->callback_priv);
+		sock_put((struct sock *) uarg->ctx);
+		kfree(uarg);
+
+		skb_shinfo(skb)->destructor_arg = NULL;
+		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+	}
+}
+
+static void packet_snd_zerocopy_callback(struct ubuf_info *uarg,
+					 bool zerocopy_success)
+{
+	struct sk_buff *err_skb;
+	struct sock *err_sk;
+	struct sock_exterr_skb *serr;
+
+	err_sk = uarg->ctx;
+	err_skb = uarg->callback_priv;
+
+	serr = SKB_EXT_ERR(err_skb);
+	memset(serr, 0, sizeof(*serr));
+	serr->ee.ee_errno = ENOMSG;
+	serr->ee.ee_origin = SO_EE_ORIGIN_UPAGE;
+	serr->ee.ee_data = uarg->desc & 0xFFFFFFFF;
+	serr->ee.ee_info = ((u64) uarg->desc) >> 32;
+	if (sock_queue_err_skb(err_sk, err_skb))
+		kfree_skb(err_skb);
+
+	kfree(uarg);
+	sock_put(err_sk);
+}
+
+static int packet_zerocopy_sg_from_iovec(struct sk_buff *skb,
+					 struct msghdr *msg)
+{
+	struct ubuf_info *uarg = NULL;
+	int ret;
+
+	if (iov_pages(msg->msg_iov, 0, msg->msg_iovlen) > MAX_SKB_FRAGS)
+		return -EMSGSIZE;
+
+	uarg = kzalloc(sizeof(*uarg), GFP_KERNEL);
+	if (!uarg)
+		return -ENOMEM;
+
+	uarg->callback_priv = alloc_skb(0, GFP_KERNEL);
+	if (!uarg->callback_priv) {
+		kfree(uarg);
+		return -ENOMEM;
+	}
+
+	ret = zerocopy_sg_from_iovec(skb, msg->msg_iov, 0, msg->msg_iovlen);
+	if (ret) {
+		kfree_skb(uarg->callback_priv);
+		kfree(uarg);
+		return -EIO;
+	}
+
+	sock_hold(skb->sk);
+	uarg->ctx = skb->sk;
+	uarg->callback = packet_snd_zerocopy_callback;
+	uarg->desc = (unsigned long) msg->msg_iov[0].iov_base;
+
+	skb_shinfo(skb)->destructor_arg = uarg;
+	skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+	skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+	return 0;
+}
+
 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 {
 	struct sock *sk = sock->sk;
@@ -2408,6 +2492,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	unsigned short gso_type = 0;
 	int hlen, tlen;
 	int extra_len = 0;
+	bool zerocopy = msg->msg_flags & MSG_ZEROCOPY;
 
 	/*
 	 *	Get and verify the address.
@@ -2501,7 +2586,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	hlen = LL_RESERVED_SPACE(dev);
 	tlen = dev->needed_tailroom;
 	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
-			       msg->msg_flags & MSG_DONTWAIT, &err);
+			       msg->msg_flags, &err);
 	if (skb == NULL)
 		goto out_unlock;
 
@@ -2518,7 +2603,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	}
 
 	/* Returns -EFAULT on error */
-	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
+	if (zerocopy)
+		err = packet_zerocopy_sg_from_iovec(skb, msg);
+	else
+		err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov,
+						   0, len);
 	if (err)
 		goto out_free;
 
@@ -2578,6 +2667,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	return len;
 
 out_free:
+	packet_snd_zerocopy_free(skb);
 	kfree_skb(skb);
 out_unlock:
 	if (dev)
-- 
2.1.0.rc2.206.gedb03e5

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ