Message-ID: <1477674975.7065.245.camel@edumazet-glaptop3.roam.corp.google.com>
Date:   Fri, 28 Oct 2016 10:16:15 -0700
From:   Eric Dumazet <eric.dumazet@...il.com>
To:     Paolo Abeni <pabeni@...hat.com>
Cc:     netdev@...r.kernel.org, "David S. Miller" <davem@...emloft.net>,
        James Morris <jmorris@...ei.org>,
        Trond Myklebust <trond.myklebust@...marydata.com>,
        Alexander Duyck <alexander.duyck@...il.com>,
        Daniel Borkmann <daniel@...earbox.net>,
        Eric Dumazet <edumazet@...gle.com>,
        Tom Herbert <tom@...bertland.com>,
        Hannes Frederic Sowa <hannes@...essinduktion.org>,
        linux-nfs@...r.kernel.org
Subject: Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue


Nice !

I was working on this as well, and my implementation was somewhat
different.

I then stopped when I found the UDP checksum bug and was waiting for net
to be merged into net-next
( https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/commit/?id=10df8e6152c6c400a563a673e9956320bfce1871 )

On Fri, 2016-10-28 at 15:20 +0200, Paolo Abeni wrote:
> A new argument is added to __skb_recv_datagram; it allows
> the caller to perform a protocol-specific action on dequeue
> under the receive queue spinlock.
> The UDP protocol uses this argument to perform fwd memory
> reclaiming on dequeue, while protocol memory and rmem updates
> are delayed until after the lock is released, to keep the time
> spent under the lock as low as possible.
> The UDP-specific skb destructor is not used anymore; instead,
> explicit memory reclaiming is performed at close() time and
> when skbs are removed from the receive queue.
> In-kernel UDP protocol users now need to use the
> skb_recv_udp() variant instead of skb_recv_datagram() to
> properly perform memory accounting on dequeue.
> 
> Overall, this allows acquiring the receive queue lock only
> once on dequeue.
> 
> Tested using pktgen with random src ports and 64-byte packets,
> with the sender at wire speed on a 10G link and udp_sink as the
> receiver, using an L4-tuple rxhash to stress the contention, and
> one or more udp_sink instances with reuseport.
> 
> nr sinks	vanilla		patched
> 1		440		560
> 3		2150		2300
> 6		3650		3800
> 9		4450		4600
> 12		6250		6450
> 
> Suggested-by: Eric Dumazet <edumazet@...gle.com>
> Acked-by: Hannes Frederic Sowa <hannes@...essinduktion.org>
> Signed-off-by: Paolo Abeni <pabeni@...hat.com>
> ---
>  include/linux/skbuff.h |  4 ++++
>  include/net/udp.h      | 10 ++++++++
>  net/core/datagram.c    | 11 ++++++---
>  net/ipv4/udp.c         | 65 ++++++++++++++++++++++++++++++++++++++++----------
>  net/ipv6/udp.c         |  3 +--
>  net/rxrpc/input.c      |  7 +++---
>  net/sunrpc/svcsock.c   |  2 +-
>  net/sunrpc/xprtsock.c  |  2 +-
>  net/unix/af_unix.c     |  4 ++--
>  9 files changed, 83 insertions(+), 25 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 601258f..dd171a9 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3028,13 +3028,17 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
>  #define skb_walk_frags(skb, iter)	\
>  	for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)
>  
> +typedef void (*skb_dequeue_cb_t)(struct sock *sk, struct sk_buff *skb,
> +				 int flags);
>  
>  int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
>  				const struct sk_buff *skb);
>  struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
> +					skb_dequeue_cb_t dequeue_cb,
>  					int *peeked, int *off, int *err,
>  					struct sk_buff **last);
>  struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
> +				    skb_dequeue_cb_t dequeue_cb,
>  				    int *peeked, int *off, int *err);
>  struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
>  				  int *err);
> diff --git a/include/net/udp.h b/include/net/udp.h
> index 18f1e6b..983c861 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -48,6 +48,7 @@ struct udp_skb_cb {
>  	} header;
>  	__u16		cscov;
>  	__u8		partial_cov;
> +	int		fwd_memory_released;

I was not adding this field.
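
One way to avoid it (just a sketch, not necessarily what either
implementation does, and note it keeps more work under the queue lock,
which the changelog explicitly tries to minimize) is to complete the
whole reclaim in the dequeue callback:

	static void udp_dequeue(struct sock *sk, struct sk_buff *skb,
				int flags)
	{
		int amt;

		if (flags & MSG_PEEK)
			return;

		/* full rmem/fwd reclaim under the receive queue lock,
		 * so nothing has to be stashed in udp_skb_cb
		 */
		atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
		sk->sk_forward_alloc += skb->truesize;
		amt = (sk->sk_forward_alloc - 1) & ~(SK_MEM_QUANTUM - 1);
		sk->sk_forward_alloc -= amt;
		if (amt)
			__sk_mem_reduce_allocated(sk,
						  amt >> SK_MEM_QUANTUM_SHIFT);
	}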

>  };
>  #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
>  
> @@ -248,6 +249,15 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
>  /* net/ipv4/udp.c */
>  void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
>  int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
> +struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock,
> +			       int *peeked, int *off, int *err);
> +static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
> +					   int noblock, int *err)
> +{
> +	int peeked, off = 0;
> +
> +	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
> +}
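
With this helper, the in-kernel users convert mechanically; e.g. a
hypothetical call site

	skb = skb_recv_datagram(sk, 0, 1, &err);

becomes

	skb = skb_recv_udp(sk, 0, 1, &err);

same signature, but with the fwd memory accounting done on dequeue.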
>  
>  void udp_v4_early_demux(struct sk_buff *skb);
>  int udp_get_port(struct sock *sk, unsigned short snum,
> diff --git a/net/core/datagram.c b/net/core/datagram.c
> index bfb973a..226548b 100644
> --- a/net/core/datagram.c
> +++ b/net/core/datagram.c
> @@ -165,6 +165,7 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
>   *	__skb_try_recv_datagram - Receive a datagram skbuff
>   *	@sk: socket
>   *	@flags: MSG_ flags
> + *	@dequeue_cb: invoked under the receive lock on successful dequeue
>   *	@peeked: returns non-zero if this packet has been seen before
>   *	@off: an offset in bytes to peek skb from. Returns an offset
>   *	      within an skb where data actually starts
> @@ -197,6 +198,7 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
>   *	the standard around please.
>   */
>  struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
> +					skb_dequeue_cb_t dequeue_cb,
>  					int *peeked, int *off, int *err,
>  					struct sk_buff **last)
>  {
> @@ -244,6 +246,8 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
>  			} else
>  				__skb_unlink(skb, queue);
>  
> +			if (dequeue_cb)
> +				dequeue_cb(sk, skb, flags);

I was doing this only if the packet was unlinked, a few lines above.
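Something like this (a sketch, not necessarily the exact code):

			} else {
				__skb_unlink(skb, queue);
				if (dequeue_cb)
					dequeue_cb(sk, skb, flags);
			}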

>  			spin_unlock_irqrestore(&queue->lock, cpu_flags);
>  			*off = _off;
>  			return skb;
> @@ -262,6 +266,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
>  EXPORT_SYMBOL(__skb_try_recv_datagram);
>  
>  struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
> +				    skb_dequeue_cb_t dequeue_cb,
>  				    int *peeked, int *off, int *err)
>  {
>  	struct sk_buff *skb, *last;
> @@ -270,8 +275,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
>  	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
>  
>  	do {
> -		skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
> -					      &last);
> +		skb = __skb_try_recv_datagram(sk, flags, dequeue_cb, peeked,
> +					      off, err, &last);
>  		if (skb)
>  			return skb;
>  
> @@ -290,7 +295,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
>  	int peeked, off = 0;
>  
>  	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
> -				   &peeked, &off, err);
> +				   NULL, &peeked, &off, err);
>  }
>  EXPORT_SYMBOL(skb_recv_datagram);
>  
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index c833271..2f1a727 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1172,26 +1172,61 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
>  	return ret;
>  }
>  
> +/* fully reclaim rmem/fwd memory allocated for skb */
>  static void udp_rmem_release(struct sock *sk, int size, int partial)
>  {
>  	int amt;
>  
>  	atomic_sub(size, &sk->sk_rmem_alloc);
> -
> -	spin_lock_bh(&sk->sk_receive_queue.lock);
>  	sk->sk_forward_alloc += size;
>  	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
>  	sk->sk_forward_alloc -= amt;
> -	spin_unlock_bh(&sk->sk_receive_queue.lock);
>  
>  	if (amt)
>  		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
>  }
>  
> -static void udp_rmem_free(struct sk_buff *skb)
> +/* if we are not peeking the skb, reclaim fwd allocated memory;
> + * rmem and protocol memory updating is delayed outside the lock
> + */
> +static void udp_dequeue(struct sock *sk, struct sk_buff *skb, int flags)
> +{
> +	int amt;
> +
> +	if (flags & MSG_PEEK)
> +		return;
> +
> +	sk->sk_forward_alloc += skb->truesize;
> +	amt = (sk->sk_forward_alloc - 1) & ~(SK_MEM_QUANTUM - 1);
> +	sk->sk_forward_alloc -= amt;
> +	UDP_SKB_CB(skb)->fwd_memory_released = amt >> SK_MEM_QUANTUM_SHIFT;
> +}
> +
> +/* complete the memory reclaiming started with udp_dequeue */
> +static void __udp_rmem_release(struct sock *sk, struct sk_buff *skb, int flags)
> +{
> +	int amt = UDP_SKB_CB(skb)->fwd_memory_released;
> +
> +	if (flags & MSG_PEEK)
> +		return;
> +
> +	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
> +	if (amt)
> +		__sk_mem_reduce_allocated(sk, amt);
> +}
> +
> +struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
> +			       int noblock, int *peeked, int *off, int *err)
>  {
> -	udp_rmem_release(skb->sk, skb->truesize, 1);
> +	struct sk_buff *skb;
> +
> +	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
> +				  udp_dequeue, peeked, off, err);
> +	if (skb)
> +		__udp_rmem_release(sk, skb, flags);
> +	return skb;
>  }
> +EXPORT_SYMBOL(__skb_recv_udp);
>  
>  int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
>  {
> @@ -1230,7 +1265,6 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
>  
>  	/* the skb owner is now the udp socket */
>  	skb->sk = sk;
> -	skb->destructor = udp_rmem_free;
>  	skb->dev = NULL;
>  	sock_skb_set_dropcount(sk, skb);
>  
> @@ -1254,8 +1288,13 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
>  static void udp_destruct_sock(struct sock *sk)
>  {
>  	/* reclaim completely the forward allocated memory */
> -	__skb_queue_purge(&sk->sk_receive_queue);
> -	udp_rmem_release(sk, 0, 0);
> +	struct sk_buff *skb;
> +
> +	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
> +		udp_rmem_release(sk, skb->truesize, 0);
> +		kfree_skb(skb);
> +	}
> +

I was using this instead:

+       unsigned int total = 0;
+       struct sk_buff *skb;
+
+       while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+               total += skb->truesize;
+               kfree_skb(skb);
+       }
+       udp_rmem_release(sk, total, 0);
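
i.e. a single udp_rmem_release() for the whole queue: one
forward_alloc adjustment and at most one __sk_mem_reduce_allocated()
call, instead of one per skb.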


>  	inet_sock_destruct(sk);
>  }
>  
> @@ -1303,11 +1342,11 @@ static int first_packet_length(struct sock *sk)
>  		atomic_inc(&sk->sk_drops);
>  		__skb_unlink(skb, rcvq);
>  		__skb_queue_tail(&list_kill, skb);
> +		udp_rmem_release(sk, skb->truesize, 1);
> +		kfree_skb(skb);
>  	}
>  	res = skb ? skb->len : -1;
>  	spin_unlock_bh(&rcvq->lock);
> -