[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4cd6006ed79a2690ee89f46c699553064f2500f4.camel@redhat.com>
Date: Thu, 29 Nov 2018 09:27:36 +0100
From: Paolo Abeni <pabeni@...hat.com>
To: Willem de Bruijn <willemdebruijn.kernel@...il.com>
Cc: Network Development <netdev@...r.kernel.org>,
David Miller <davem@...emloft.net>,
Willem de Bruijn <willemb@...gle.com>
Subject: Re: [PATCH net-next v2 1/2] udp: msg_zerocopy
Hi,
Thank you for the update!
On Wed, 2018-11-28 at 18:50 -0500, Willem de Bruijn wrote:
> I did revert to the basic implementation using an extra ref
> for the function call, similar to TCP, as you suggested.
>
> On top of that as a separate optimization patch I have a
> variant that uses refcnt zero by replacing refcount_inc with
> refcount_set(.., refcount_read(..) + 1). Not very pretty.
If the skb/uarg is not shared (no other threads can touch the refcnt)
before ip*_append_data() completes, how about something like the
following (incremental diff on top of patch 1/2, untested, uncompiled,
just to give the idea):
---
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 04f52e719571..1e3d195ffdfb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -480,6 +480,13 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
refcount_inc(&uarg->refcnt);
}
+/* use only before uarg is actually shared */
+static inline void __sock_zerocopy_init(struct ubuf_info *uarg, int cnt)
+{
+ if (uarg)
+ refcount_set(&uarg->refcnt, cnt);
+}
+
void sock_zerocopy_put(struct ubuf_info *uarg);
void sock_zerocopy_put_abort(struct ubuf_info *uarg);
@@ -1326,13 +1333,20 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
return is_zcopy ? skb_uarg(skb) : NULL;
}
-static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+static inline int __skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
{
if (skb && uarg && !skb_zcopy(skb)) {
- sock_zerocopy_get(uarg);
skb_shinfo(skb)->destructor_arg = uarg;
skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+ return 1;
}
+ return 0;
+}
+
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+ if (__skb_zcopy_set(skb, uarg))
+ sock_zerocopy_get(uarg);
}
static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2179ef84bb44..435bac91d293 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -957,7 +957,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
+ refcount_set(&uarg->refcnt, 1);
sock_hold(sk);
return uarg;
@@ -1097,13 +1097,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
atomic_dec(&sk->sk_zckey);
uarg->len--;
- /* Stream socks hold a ref for the syscall, as skbs can be sent
- * and freed inside the loop, dropping refcnt to 0 inbetween.
- * Datagrams do not need this, but sock_zerocopy_put expects it.
- */
- if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
- refcount_set(&uarg->refcnt, 1);
-
sock_zerocopy_put(uarg);
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7504da2f33d6..d3285613d87a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,6 +882,7 @@ static int __ip_append_data(struct sock *sk,
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
+ int uarg_refs = 0;
bool paged;
skb = skb_peek_tail(queue);
@@ -919,6 +920,7 @@ static int __ip_append_data(struct sock *sk,
if (flags & MSG_ZEROCOPY && length) {
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ uarg_refs = 1;
if (!uarg)
return -ENOBUFS;
if (rt->dst.dev->features & NETIF_F_SG &&
@@ -926,7 +928,7 @@ static int __ip_append_data(struct sock *sk,
paged = true;
} else {
uarg->zerocopy = 0;
- skb_zcopy_set(skb, uarg);
+ uarg_refs += __skb_zcopy_set(skb, uarg);
}
}
@@ -1019,7 +1021,7 @@ static int __ip_append_data(struct sock *sk,
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
- skb_zcopy_set(skb, uarg);
+ uarg_refs += __skb_zcopy_set(skb, uarg);
/*
* Find where to start putting bytes.
@@ -1121,6 +1123,7 @@ static int __ip_append_data(struct sock *sk,
length -= copy;
}
+ __sock_zerocopy_init(uarg, uarg_refs);
if (wmem_alloc_delta)
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
return 0;
@@ -1128,6 +1131,7 @@ static int __ip_append_data(struct sock *sk,
error_efault:
err = -EFAULT;
error:
+ __sock_zerocopy_init(uarg, uarg_refs);
sock_zerocopy_put_abort(uarg);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
---
The basic idea is to use the same scheme currently used for wmem
accounting: do the book-keeping inside the loop and set the atomic
reference counter only once at the end of the loop.
WDYT?
Thanks,
Paolo
Powered by blists - more mailing lists