lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4cd6006ed79a2690ee89f46c699553064f2500f4.camel@redhat.com>
Date:   Thu, 29 Nov 2018 09:27:36 +0100
From:   Paolo Abeni <pabeni@...hat.com>
To:     Willem de Bruijn <willemdebruijn.kernel@...il.com>
Cc:     Network Development <netdev@...r.kernel.org>,
        David Miller <davem@...emloft.net>,
        Willem de Bruijn <willemb@...gle.com>
Subject: Re: [PATCH net-next v2 1/2] udp: msg_zerocopy

Hi,

Thank you for the update!

On Wed, 2018-11-28 at 18:50 -0500, Willem de Bruijn wrote:
> I did revert to the basic implementation using an extra ref
> for the function call, similar to TCP, as you suggested.
> 
> On top of that as a separate optimization patch I have a
> variant that uses refcnt zero by replacing refcount_inc with
> refcount_set(.., refcount_read(..) + 1). Not very pretty.

If the skb/uarg is not shared (no other threads can touch the refcnt)
before ip*_append_data() completes, how about something like the
following (incremental diff on top of patch 1/2, untested, uncompiled,
just to give the idea):

---
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 04f52e719571..1e3d195ffdfb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -480,6 +480,13 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 	refcount_inc(&uarg->refcnt);
 }
 
+/* use only before uarg is actually shared */
+static inline void __sock_zerocopy_init(struct ubuf_info *uarg, int cnt)
+{
+	if (uarg)
+		refcount_set(&uarg->refcnt, cnt);
+}
+
 void sock_zerocopy_put(struct ubuf_info *uarg);
 void sock_zerocopy_put_abort(struct ubuf_info *uarg);
 
@@ -1326,13 +1333,20 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
-static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+static inline int __skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
 {
 	if (skb && uarg && !skb_zcopy(skb)) {
-		sock_zerocopy_get(uarg);
 		skb_shinfo(skb)->destructor_arg = uarg;
 		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+		return 1;
 	}
+	return 0;
+}
+
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+	if (__skb_zcopy_set(skb, uarg))
+		sock_zerocopy_get(uarg);
 }
 
 static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2179ef84bb44..435bac91d293 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -957,7 +957,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg->len = 1;
 	uarg->bytelen = size;
 	uarg->zerocopy = 1;
-	refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
+	refcount_set(&uarg->refcnt, 1);
 	sock_hold(sk);
 
 	return uarg;
@@ -1097,13 +1097,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 		atomic_dec(&sk->sk_zckey);
 		uarg->len--;
 
-		/* Stream socks hold a ref for the syscall, as skbs can be sent
-		 * and freed inside the loop, dropping refcnt to 0 inbetween.
-		 * Datagrams do not need this, but sock_zerocopy_put expects it.
-		 */
-		if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
-			refcount_set(&uarg->refcnt, 1);
-
 		sock_zerocopy_put(uarg);
 	}
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7504da2f33d6..d3285613d87a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,6 +882,7 @@ static int __ip_append_data(struct sock *sk,
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
 	u32 tskey = 0;
+	int uarg_refs = 0;
 	bool paged;
 
 	skb = skb_peek_tail(queue);
@@ -919,6 +920,7 @@ static int __ip_append_data(struct sock *sk,
 
 	if (flags & MSG_ZEROCOPY && length) {
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+		uarg_refs = 1;
 		if (!uarg)
 			return -ENOBUFS;
 		if (rt->dst.dev->features & NETIF_F_SG &&
@@ -926,7 +928,7 @@ static int __ip_append_data(struct sock *sk,
 			paged = true;
 		} else {
 			uarg->zerocopy = 0;
-			skb_zcopy_set(skb, uarg);
+			uarg_refs += __skb_zcopy_set(skb, uarg);
 		}
 	}
 
@@ -1019,7 +1021,7 @@ static int __ip_append_data(struct sock *sk,
 			cork->tx_flags = 0;
 			skb_shinfo(skb)->tskey = tskey;
 			tskey = 0;
-			skb_zcopy_set(skb, uarg);
+			uarg_refs += __skb_zcopy_set(skb, uarg);
 
 			/*
 			 *	Find where to start putting bytes.
@@ -1121,6 +1123,7 @@ static int __ip_append_data(struct sock *sk,
 		length -= copy;
 	}
 
+	__sock_zerocopy_init(uarg, uarg_refs);
 	if (wmem_alloc_delta)
 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 	return 0;
@@ -1128,6 +1131,7 @@ static int __ip_append_data(struct sock *sk,
 error_efault:
 	err = -EFAULT;
 error:
+	__sock_zerocopy_init(uarg, uarg_refs);
 	sock_zerocopy_put_abort(uarg);
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
---
The basic idea is to use the same scheme currently used for wmem
accounting: do the book-keeping inside the loop and set the atomic
reference counter only once, at the end of the loop.

WDYT?

Thanks,

Paolo

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ