[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1477677030.7065.250.camel@edumazet-glaptop3.roam.corp.google.com>
Date: Fri, 28 Oct 2016 10:50:30 -0700
From: Eric Dumazet <eric.dumazet@...il.com>
To: Paolo Abeni <pabeni@...hat.com>
Cc: netdev@...r.kernel.org, "David S. Miller" <davem@...emloft.net>,
James Morris <jmorris@...ei.org>,
Trond Myklebust <trond.myklebust@...marydata.com>,
Alexander Duyck <alexander.duyck@...il.com>,
Daniel Borkmann <daniel@...earbox.net>,
Eric Dumazet <edumazet@...gle.com>,
Tom Herbert <tom@...bertland.com>,
Hannes Frederic Sowa <hannes@...essinduktion.org>,
linux-nfs@...r.kernel.org
Subject: Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue
On Fri, 2016-10-28 at 10:16 -0700, Eric Dumazet wrote:
> Nice !
>
> I was working on this as well and my implementation was somewhat
> different.
This is my WIP
Note this can be split in two parts.
1) One adding struct sock *sk param to ip_cmsg_recv_offset()
This was because I left skb->sk NULL for skbs stored in receive
queue.
You chose instead to set skb->sk, which is unusual (check
skb_orphan() BUG_ON())
2) Udp changes.
Tell me what you think, thanks again !
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 601258f6e621..52e70431abc9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3033,9 +3033,13 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
const struct sk_buff *skb);
struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
int *peeked, int *off, int *err,
- struct sk_buff **last);
+ struct sk_buff **last,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb));
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
- int *peeked, int *off, int *err);
+ int *peeked, int *off, int *err,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb));
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
int *err);
unsigned int datagram_poll(struct file *file, struct socket *sock,
diff --git a/include/net/ip.h b/include/net/ip.h
index 79b102ffbe16..f91fc376f3fb 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -572,7 +572,8 @@ int ip_options_rcv_srr(struct sk_buff *skb);
*/
void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset);
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
struct ipcm_cookie *ipc, bool allow_ipv6);
int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
@@ -594,7 +595,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
{
- ip_cmsg_recv_offset(msg, skb, 0, 0);
+ ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0);
}
bool icmp_global_allow(void);
diff --git a/include/net/udp.h b/include/net/udp.h
index 18f1e6b91927..0178f4552c4d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -248,6 +248,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
/* net/ipv4/udp.c */
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
void udp_v4_early_demux(struct sk_buff *skb);
int udp_get_port(struct sock *sk, unsigned short snum,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index bfb973aebb5b..48228d15f33c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -198,7 +198,8 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
*/
struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
int *peeked, int *off, int *err,
- struct sk_buff **last)
+ struct sk_buff **last,
+ void (*destructor)(struct sock *sk, struct sk_buff *skb))
{
struct sk_buff_head *queue = &sk->sk_receive_queue;
struct sk_buff *skb;
@@ -241,9 +242,11 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
}
atomic_inc(&skb->users);
- } else
+ } else {
__skb_unlink(skb, queue);
-
+ if (destructor)
+ destructor(sk, skb);
+ }
spin_unlock_irqrestore(&queue->lock, cpu_flags);
*off = _off;
return skb;
@@ -262,7 +265,9 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
- int *peeked, int *off, int *err)
+ int *peeked, int *off, int *err,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb))
{
struct sk_buff *skb, *last;
long timeo;
@@ -271,7 +276,7 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
do {
skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
- &last);
+ &last, destructor);
if (skb)
return skb;
@@ -290,7 +295,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
int peeked, off = 0;
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, err);
+ &peeked, &off, err, NULL);
}
EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b8a2d63d1fb8..1de70870d0aa 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -153,11 +153,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
}
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
- int tlen, int offset)
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, int tlen, int offset)
{
- struct inet_sock *inet = inet_sk(skb->sk);
- unsigned int flags = inet->cmsg_flags;
+ unsigned int flags = inet_sk(sk)->cmsg_flags;
/* Ordered by supposed usage frequency */
if (flags & IP_CMSG_PKTINFO) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bfa9609ba0f4..8ff7ee74a992 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1178,20 +1178,20 @@ static void udp_rmem_release(struct sock *sk, int size, int partial)
atomic_sub(size, &sk->sk_rmem_alloc);
- spin_lock_bh(&sk->sk_receive_queue.lock);
sk->sk_forward_alloc += size;
amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
sk->sk_forward_alloc -= amt;
- spin_unlock_bh(&sk->sk_receive_queue.lock);
if (amt)
__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
}
-static void udp_rmem_free(struct sk_buff *skb)
+/* Note : called with sk_receive_queue.lock held */
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
{
- udp_rmem_release(skb->sk, skb->truesize, 1);
+ udp_rmem_release(sk, skb->truesize, 1);
}
+EXPORT_SYMBOL(udp_skb_destructor);
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
@@ -1220,17 +1220,14 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
err = -ENOBUFS;
spin_unlock(&list->lock);
- goto uncharge_drop;
+ goto drop;
}
sk->sk_forward_alloc += delta;
}
-
+ atomic_add(size, &sk->sk_rmem_alloc);
sk->sk_forward_alloc -= size;
- /* the skb owner in now the udp socket */
- skb->sk = sk;
- skb->destructor = udp_rmem_free;
skb->dev = NULL;
sock_skb_set_dropcount(sk, skb);
@@ -1253,9 +1250,14 @@ EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
static void udp_destruct_sock(struct sock *sk)
{
- /* reclaim completely the forward allocated memory */
- __skb_queue_purge(&sk->sk_receive_queue);
- udp_rmem_release(sk, 0, 0);
+ unsigned int total = 0;
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ total += skb->truesize;
+ kfree_skb(skb);
+ }
+ udp_rmem_release(sk, total, 0);
inet_sock_destruct(sk);
}
@@ -1287,12 +1289,11 @@ EXPORT_SYMBOL_GPL(skb_consume_udp);
*/
static int first_packet_length(struct sock *sk)
{
- struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+ struct sk_buff_head *rcvq = &sk->sk_receive_queue;
struct sk_buff *skb;
+ int total = 0;
int res;
- __skb_queue_head_init(&list_kill);
-
spin_lock_bh(&rcvq->lock);
while ((skb = skb_peek(rcvq)) != NULL &&
udp_lib_checksum_complete(skb)) {
@@ -1302,12 +1303,14 @@ static int first_packet_length(struct sock *sk)
IS_UDPLITE(sk));
atomic_inc(&sk->sk_drops);
__skb_unlink(skb, rcvq);
- __skb_queue_tail(&list_kill, skb);
+ total += skb->truesize;
+ kfree_skb(skb);
}
res = skb ? skb->len : -1;
+ if (total)
+ udp_rmem_release(sk, total, 1);
spin_unlock_bh(&rcvq->lock);
- __skb_queue_purge(&list_kill);
return res;
}
@@ -1363,7 +1366,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
try_again:
peeking = off = sk_peek_offset(sk, flags);
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, &err);
+ &peeked, &off, &err, udp_skb_destructor);
if (!skb)
return err;
@@ -1420,7 +1423,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
*addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
- ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off);
+ ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
err = copied;
if (flags & MSG_TRUNC)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index a7700bbf6788..7e602b89c64d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -344,7 +344,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
try_again:
peeking = off = sk_peek_offset(sk, flags);
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, &err);
+ &peeked, &off, &err, udp_skb_destructor);
if (!skb)
return err;
@@ -425,7 +425,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (is_udp4) {
if (inet->cmsg_flags)
- ip_cmsg_recv_offset(msg, skb,
+ ip_cmsg_recv_offset(msg, sk, skb,
sizeof(struct udphdr), off);
} else {
if (np->rxopt.all)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 145082e2ba36..8b995f88d000 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2114,7 +2114,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
skip = sk_peek_offset(sk, flags);
skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
- &last);
+ &last, NULL);
if (skb)
break;
Powered by blists - more mailing lists