[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20170621211816.53837-11-willemdebruijn.kernel@gmail.com>
Date: Wed, 21 Jun 2017 17:18:13 -0400
From: Willem de Bruijn <willemdebruijn.kernel@...il.com>
To: netdev@...r.kernel.org
Cc: davem@...emloft.net, linux-api@...r.kernel.org,
Willem de Bruijn <willemb@...gle.com>
Subject: [PATCH net-next v3 10/13] udp: enable MSG_ZEROCOPY
From: Willem de Bruijn <willemb@...gle.com>
Add MSG_ZEROCOPY support to INET(6). This includes UDP, but also
RAW sockets that do not take the raw_send_hdrinc() path.
Zerocopy is only effective when the payload is not touched at all, so
limit it to paths that support both checksum offload and scatter-gather.
When a caller passes MSG_ZEROCOPY to send and it returns a positive
result, the caller must always receive a completion notification.
Therefore, attach the ubuf_info structure even when zerocopy is not
possible. This also covers edge cases, such as corking with mixed
zerocopy and non-zerocopy calls.
Tested:
msg_zerocopy.sh 4 udp:
without zerocopy
tx=146127 (9118 MB) txc=0 zc=n
rx=146127 (9118 MB)
with zerocopy
tx=335789 (20954 MB) txc=335789 zc=y
rx=335789 (20954 MB)
msg_zerocopy.sh 4 raw:
without zerocopy
tx=106461 (6643 MB) txc=0 zc=n
rx=106461 (6643 MB)
with zerocopy
tx=296082 (18476 MB) txc=296082 zc=y
rx=296082 (18476 MB)
Signed-off-by: Willem de Bruijn <willemb@...gle.com>
---
net/core/skbuff.c | 4 ++++
net/ipv4/ip_output.c | 37 ++++++++++++++++++++++++++++++-------
net/ipv6/ip6_output.c | 40 +++++++++++++++++++++++++++++++++-------
3 files changed, 67 insertions(+), 14 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0beaf961f79c..7d4c12316df6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1145,6 +1145,10 @@ extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
int skb_zerocopy_iter(struct sock *sk, struct sk_buff *skb, struct msghdr *msg,
int len)
{
+ /* raw has extra indirection in raw_frag_vec */
+ if (sk->sk_type == SOCK_RAW && sk->sk_family != PF_PACKET)
+ msg = *(struct msghdr **)msg;
+
return __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7a3fd25e8913..3ff425f7ded6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -919,7 +919,7 @@ static int __ip_append_data(struct sock *sk,
{
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
-
+ struct ubuf_info *uarg = NULL;
struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
@@ -963,9 +963,21 @@ static int __ip_append_data(struct sock *sk,
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
+ if (flags & MSG_ZEROCOPY && length) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+
+ if (!(rt->dst.dev->features & NETIF_F_SG) ||
+ (sk->sk_type == SOCK_DGRAM && csummode == CHECKSUM_NONE)) {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg);
+ }
+ }
+
cork->length += length;
if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
- (sk->sk_protocol == IPPROTO_UDP) &&
+ (sk->sk_protocol == IPPROTO_UDP) && !uarg &&
(rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
(sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
@@ -997,6 +1009,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int zcopylen = 0;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
@@ -1017,8 +1030,12 @@ static int __ip_append_data(struct sock *sk,
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
+ else if (!uarg || !uarg->zerocopy)
alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ zcopylen = fraglen - alloclen;
+ }
alloclen += exthdrlen;
@@ -1059,11 +1076,12 @@ static int __ip_append_data(struct sock *sk,
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
+ skb_zcopy_set(skb, uarg);
/*
* Find where to start putting bytes.
*/
- data = skb_put(skb, fraglen + exthdrlen);
+ data = skb_put(skb, fraglen + exthdrlen - zcopylen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
@@ -1079,7 +1097,7 @@ static int __ip_append_data(struct sock *sk,
pskb_trim_unique(skb_prev, maxfraglen);
}
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - zcopylen;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
@@ -1087,7 +1105,7 @@ static int __ip_append_data(struct sock *sk,
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
@@ -1115,7 +1133,7 @@ static int __ip_append_data(struct sock *sk,
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1145,6 +1163,10 @@ static int __ip_append_data(struct sock *sk,
skb->data_len += copy;
skb->truesize += copy;
atomic_add(copy, &sk->sk_wmem_alloc);
+ } else {
+ err = skb_zerocopy_iter(sk, skb, from, copy);
+ if (err)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1155,6 +1177,7 @@ static int __ip_append_data(struct sock *sk,
error_efault:
err = -EFAULT;
error:
+ sock_zerocopy_put_abort(uarg);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
return err;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5baa6fab4b97..38d9722d4e3c 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1307,6 +1307,7 @@ static int __ip6_append_data(struct sock *sk,
struct ipv6_txoptions *opt = v6_cork->opt;
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
+ struct ubuf_info *uarg = NULL;
skb = skb_peek_tail(queue);
if (!skb) {
@@ -1368,6 +1369,18 @@ static int __ip6_append_data(struct sock *sk,
tskey = sk->sk_tskey++;
}
+ if (flags & MSG_ZEROCOPY && length) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+
+ if (!(rt->dst.dev->features & NETIF_F_SG) ||
+ (sk->sk_type == SOCK_DGRAM && csummode == CHECKSUM_NONE)) {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg);
+ }
+ }
+
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
@@ -1387,7 +1400,7 @@ static int __ip6_append_data(struct sock *sk,
cork->length += length;
if ((((length + fragheaderlen) > mtu) ||
(skb && skb_is_gso(skb))) &&
- (sk->sk_protocol == IPPROTO_UDP) &&
+ (sk->sk_protocol == IPPROTO_UDP) && !uarg &&
(rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
(sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
@@ -1413,6 +1426,7 @@ static int __ip6_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int zcopylen = 0;
alloc_new_skb:
/* There's no room in the current skb */
if (skb)
@@ -1435,11 +1449,17 @@ static int __ip6_append_data(struct sock *sk,
if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+ fraglen = datalen + fragheaderlen;
+
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
- alloclen = datalen + fragheaderlen;
+ else if (!uarg || !uarg->zerocopy)
+ alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ zcopylen = fraglen - alloclen;
+ }
alloclen += dst_exthdrlen;
@@ -1461,7 +1481,7 @@ static int __ip6_append_data(struct sock *sk,
*/
alloclen += sizeof(struct frag_hdr);
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - zcopylen;
if (copy < 0) {
err = -EINVAL;
goto error;
@@ -1497,11 +1517,12 @@ static int __ip6_append_data(struct sock *sk,
tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
+ skb_zcopy_set(skb, uarg);
/*
* Find where to start putting bytes
*/
- data = skb_put(skb, fraglen);
+ data = skb_put(skb, fraglen - zcopylen);
skb_set_network_header(skb, exthdrlen);
data += fragheaderlen;
skb->transport_header = (skb->network_header +
@@ -1524,7 +1545,7 @@ static int __ip6_append_data(struct sock *sk,
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
dst_exthdrlen = 0;
@@ -1552,7 +1573,7 @@ static int __ip6_append_data(struct sock *sk,
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1582,6 +1603,10 @@ static int __ip6_append_data(struct sock *sk,
skb->data_len += copy;
skb->truesize += copy;
atomic_add(copy, &sk->sk_wmem_alloc);
+ } else {
+ err = skb_zerocopy_iter(sk, skb, from, copy);
+ if (err)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1592,6 +1617,7 @@ static int __ip6_append_data(struct sock *sk,
error_efault:
err = -EFAULT;
error:
+ sock_zerocopy_put_abort(uarg);
cork->length -= length;
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
return err;
--
2.13.1.611.g7e3b11ae1-goog
Powered by blists - more mailing lists