netdev - Re: [PATCH] net: Fix struct sock bitfield annotation

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 09 Oct 2009 22:41:54 +0200
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Christoph Lameter <cl@...ux-foundation.org>
CC:	"David S. Miller" <davem@...emloft.net>,
	Vegard Nossum <vegard.nossum@...il.com>,
	Linux Netdev List <netdev@...r.kernel.org>,
	Ingo Molnar <mingo@...e.hu>
Subject: Re: [PATCH] net: Fix struct sock bitfield annotation

Christoph Lameter a écrit :
> On Fri, 9 Oct 2009, Eric Dumazet wrote:
> 
>> For networking guys, here is the actual mess with "struct sock" on x86_64,
>> related to UDP handling (critical latencies for some people). We basically touch
>> all cache lines, in every paths, bad effects on SMP...
> 
> Please keep me posted on this. I am very interested in this work.
> 
> Some simple shuffling around may do some good here.
> 

Sure, will do, but first I want to suppress the lock_sock()/release_sock() in
rx path, that was added for sk_forward_alloc thing. This really hurts,
because of the backlog handling.

I have preliminary patch that restore UDP latencies we had in the past ;)

Trick is for UDP, sk_forward_alloc is not updated by tx/rx, only rx.
So we can use the sk_receive_queue.lock to forbid concurrent updates.

As this lock is already hot and only used by rx, we wont have to
dirty the sk_lock, that will only be used by tx path.

Then we can carefuly reorder struct sock to lower number of cache lines
needed for each path.


Patch against linux-2.6 git tree

 net/core/sock.c |    9 ++++
 net/ipv4/udp.c  |   89 ++++++++++++++++++++++------------------------
 2 files changed, 51 insertions(+), 47 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 7626b6a..45212d4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -276,6 +276,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int err = 0;
 	int skb_len;
+	unsigned long flags;
 
 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 	   number of warnings when compiling with -W --ANK
@@ -290,8 +291,12 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (err)
 		goto out;
 
+	skb_orphan(skb);
+
+	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	if (!sk_rmem_schedule(sk, skb->truesize)) {
 		err = -ENOBUFS;
+		spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
 		goto out;
 	}
 
@@ -305,7 +310,9 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	skb_len = skb->len;
 
-	skb_queue_tail(&sk->sk_receive_queue, skb);
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
 
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk, skb_len);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6ec6a8a..e8a1be4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -841,6 +841,36 @@ out:
 	return ret;
 }
 
+
+/**
+ *	first_packet_length	- return length of first packet in receive queue
+ *	@sk: socket
+ *
+ *	Drops all bad checksum frames, until a valid one is found.
+ *	Returns the length of found skb, or 0 if none is found.
+ */
+static unsigned int first_packet_length(struct sock *sk)
+{
+	struct sk_buff_head *rcvq = &sk->sk_receive_queue;
+	struct sk_buff *skb;
+	unsigned int res;
+
+	spin_lock_bh(&rcvq->lock);
+
+	while ((skb = skb_peek(rcvq)) != NULL &&
+		udp_lib_checksum_complete(skb)) {
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+				 IS_UDPLITE(sk));
+		__skb_unlink(skb, rcvq);
+		skb_kill_datagram(sk, skb, 0);
+	}
+	res = skb ? skb->len : 0;
+
+	spin_unlock_bh(&rcvq->lock);
+
+	return res;
+}
+
 /*
  *	IOCTL requests applicable to the UDP protocol
  */
@@ -857,21 +887,16 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 
 	case SIOCINQ:
 	{
-		struct sk_buff *skb;
-		unsigned long amount;
+		unsigned int amount = first_packet_length(sk);
 
-		amount = 0;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		skb = skb_peek(&sk->sk_receive_queue);
-		if (skb != NULL) {
+		if (amount)
 			/*
 			 * We will only return the amount
 			 * of this packet since that is all
 			 * that will be read.
 			 */
-			amount = skb->len - sizeof(struct udphdr);
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
+			amount -= sizeof(struct udphdr);
+
 		return put_user(amount, (int __user *)arg);
 	}
 
@@ -968,17 +993,17 @@ try_again:
 		err = ulen;
 
 out_free:
-	lock_sock(sk);
+	spin_lock_bh(&sk->sk_receive_queue.lock);
 	skb_free_datagram(sk, skb);
-	release_sock(sk);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
 out:
 	return err;
 
 csum_copy_err:
-	lock_sock(sk);
+	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (!skb_kill_datagram(sk, skb, flags))
-		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-	release_sock(sk);
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
 
 	if (noblock)
 		return -EAGAIN;
@@ -1060,7 +1085,6 @@ drop:
 int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
-	int rc;
 	int is_udplite = IS_UDPLITE(sk);
 
 	/*
@@ -1140,16 +1164,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			goto drop;
 	}
 
-	rc = 0;
-
-	bh_lock_sock(sk);
-	if (!sock_owned_by_user(sk))
-		rc = __udp_queue_rcv_skb(sk, skb);
-	else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
-
-	return rc;
+	return __udp_queue_rcv_skb(sk, skb);
 
 drop:
 	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
@@ -1540,29 +1555,11 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
 	unsigned int mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
-	int 	is_lite = IS_UDPLITE(sk);
 
 	/* Check for false positives due to checksum errors */
-	if ((mask & POLLRDNORM) &&
-	    !(file->f_flags & O_NONBLOCK) &&
-	    !(sk->sk_shutdown & RCV_SHUTDOWN)) {
-		struct sk_buff_head *rcvq = &sk->sk_receive_queue;
-		struct sk_buff *skb;
-
-		spin_lock_bh(&rcvq->lock);
-		while ((skb = skb_peek(rcvq)) != NULL &&
-		       udp_lib_checksum_complete(skb)) {
-			UDP_INC_STATS_BH(sock_net(sk),
-					UDP_MIB_INERRORS, is_lite);
-			__skb_unlink(skb, rcvq);
-			kfree_skb(skb);
-		}
-		spin_unlock_bh(&rcvq->lock);
-
-		/* nothing to see, move along */
-		if (skb == NULL)
-			mask &= ~(POLLIN | POLLRDNORM);
-	}
+	if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+	    !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
+		mask &= ~(POLLIN | POLLRDNORM);
 
 	return mask;
 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html