Message-ID: <4ACFA012.6010409@gmail.com>
Date: Fri, 09 Oct 2009 22:41:54 +0200
From: Eric Dumazet <eric.dumazet@...il.com>
To: Christoph Lameter <cl@...ux-foundation.org>
CC: "David S. Miller" <davem@...emloft.net>,
Vegard Nossum <vegard.nossum@...il.com>,
Linux Netdev List <netdev@...r.kernel.org>,
Ingo Molnar <mingo@...e.hu>
Subject: Re: [PATCH] net: Fix struct sock bitfield annotation
Christoph Lameter wrote:
> On Fri, 9 Oct 2009, Eric Dumazet wrote:
>
>> For networking guys, here is the actual mess with "struct sock" on x86_64,
>> related to UDP handling (critical latencies for some people). We basically
>> touch all cache lines, on every path, with bad effects on SMP...
>
> Please keep me posted on this. I am very interested in this work.
>
> Some simple shuffling around may do some good here.
>
Sure, will do, but first I want to suppress the lock_sock()/release_sock() in
the rx path, which was added for the sk_forward_alloc accounting. This really
hurts, because of the backlog handling.

I have a preliminary patch that restores the UDP latencies we had in the past ;)

The trick is that, for UDP, sk_forward_alloc is not updated by both the tx and
rx paths, only by rx. So we can use sk_receive_queue.lock to forbid concurrent
updates. As this lock is already hot and only used by rx, we won't have to
dirty sk_lock, which will then only be used by the tx path.

Then we can carefully reorder struct sock to lower the number of cache lines
needed for each path.
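
To illustrate the idea (a minimal sketch only, not part of the patch below;
udp_rmem_schedule_locked() is a made-up name), the rx path can charge
sk_forward_alloc and queue the skb under the one receive-queue lock, leaving
sk_lock completely untouched:

	/*
	 * Illustrative only: every rx-side update of sk->sk_forward_alloc
	 * happens under sk_receive_queue.lock, so the tx path never has
	 * to contend on it.
	 */
	static int udp_rmem_schedule_locked(struct sock *sk, struct sk_buff *skb)
	{
		int ok;

		spin_lock(&sk->sk_receive_queue.lock);	/* already hot in rx */
		ok = sk_rmem_schedule(sk, skb->truesize); /* updates sk_forward_alloc */
		if (ok)
			__skb_queue_tail(&sk->sk_receive_queue, skb);
		spin_unlock(&sk->sk_receive_queue.lock);
		return ok;
	}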
Patch against the linux-2.6 git tree:
net/core/sock.c | 9 ++++
net/ipv4/udp.c | 89 ++++++++++++++++++++++------------------------
2 files changed, 51 insertions(+), 47 deletions(-)
diff --git a/net/core/sock.c b/net/core/sock.c
index 7626b6a..45212d4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -276,6 +276,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int err = 0;
 	int skb_len;
+	unsigned long flags;
 
 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 	   number of warnings when compiling with -W --ANK
@@ -290,8 +291,12 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (err)
 		goto out;
 
+	skb_orphan(skb);
+
+	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	if (!sk_rmem_schedule(sk, skb->truesize)) {
 		err = -ENOBUFS;
+		spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
 		goto out;
 	}
@@ -305,7 +310,9 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	skb_len = skb->len;
 
-	skb_queue_tail(&sk->sk_receive_queue, skb);
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
 
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk, skb_len);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6ec6a8a..e8a1be4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -841,6 +841,36 @@ out:
 	return ret;
 }
 
+/**
+ * first_packet_length - return length of first packet in receive queue
+ * @sk: socket
+ *
+ * Drops all bad checksum frames, until a valid one is found.
+ * Returns the length of found skb, or 0 if none is found.
+ */
+static unsigned int first_packet_length(struct sock *sk)
+{
+	struct sk_buff_head *rcvq = &sk->sk_receive_queue;
+	struct sk_buff *skb;
+	unsigned int res;
+
+	spin_lock_bh(&rcvq->lock);
+
+	while ((skb = skb_peek(rcvq)) != NULL &&
+	       udp_lib_checksum_complete(skb)) {
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+				 IS_UDPLITE(sk));
+		__skb_unlink(skb, rcvq);
+		skb_kill_datagram(sk, skb, 0);
+	}
+	res = skb ? skb->len : 0;
+
+	spin_unlock_bh(&rcvq->lock);
+
+	return res;
+}
+
 /*
  *	IOCTL requests applicable to the UDP protocol
  */
@@ -857,21 +887,16 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 
 	case SIOCINQ:
 	{
-		struct sk_buff *skb;
-		unsigned long amount;
+		unsigned int amount = first_packet_length(sk);
 
-		amount = 0;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		skb = skb_peek(&sk->sk_receive_queue);
-		if (skb != NULL) {
+		if (amount)
 			/*
 			 * We will only return the amount
 			 * of this packet since that is all
 			 * that will be read.
 			 */
-			amount = skb->len - sizeof(struct udphdr);
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
+			amount -= sizeof(struct udphdr);
+
 		return put_user(amount, (int __user *)arg);
 	}
@@ -968,17 +993,17 @@ try_again:
 		err = ulen;
 
 out_free:
-	lock_sock(sk);
+	spin_lock_bh(&sk->sk_receive_queue.lock);
 	skb_free_datagram(sk, skb);
-	release_sock(sk);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
 out:
 	return err;
 
csum_copy_err:
-	lock_sock(sk);
+	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (!skb_kill_datagram(sk, skb, flags))
-		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-	release_sock(sk);
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
 
 	if (noblock)
 		return -EAGAIN;
@@ -1060,7 +1085,6 @@ drop:
 int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
-	int rc;
 	int is_udplite = IS_UDPLITE(sk);
 
 	/*
@@ -1140,16 +1164,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 	}
 
-	rc = 0;
-
-	bh_lock_sock(sk);
-	if (!sock_owned_by_user(sk))
-		rc = __udp_queue_rcv_skb(sk, skb);
-	else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
-
-	return rc;
+	return __udp_queue_rcv_skb(sk, skb);
 
drop:
 	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
@@ -1540,29 +1555,11 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
 	unsigned int mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
-	int is_lite = IS_UDPLITE(sk);
 
 	/* Check for false positives due to checksum errors */
-	if ((mask & POLLRDNORM) &&
-	    !(file->f_flags & O_NONBLOCK) &&
-	    !(sk->sk_shutdown & RCV_SHUTDOWN)) {
-		struct sk_buff_head *rcvq = &sk->sk_receive_queue;
-		struct sk_buff *skb;
-
-		spin_lock_bh(&rcvq->lock);
-		while ((skb = skb_peek(rcvq)) != NULL &&
-		       udp_lib_checksum_complete(skb)) {
-			UDP_INC_STATS_BH(sock_net(sk),
-					UDP_MIB_INERRORS, is_lite);
-			__skb_unlink(skb, rcvq);
-			kfree_skb(skb);
-		}
-		spin_unlock_bh(&rcvq->lock);
-
-		/* nothing to see, move along */
-		if (skb == NULL)
-			mask &= ~(POLLIN | POLLRDNORM);
-	}
+	if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+	    !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
+		mask &= ~(POLLIN | POLLRDNORM);
 
 	return mask;
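
As a quick sanity check of the SIOCINQ semantics this keeps, a small
userspace program (a sketch only, assuming loopback and a free port 9999;
not part of the patch) should print the payload length of the *first*
queued datagram, which is what first_packet_length() computes:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/sockios.h>	/* SIOCINQ */

	int main(void)
	{
		struct sockaddr_in a = {
			.sin_family = AF_INET,
			.sin_port = htons(9999),
			.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
		};
		int fd = socket(AF_INET, SOCK_DGRAM, 0);
		int n = 0;

		bind(fd, (struct sockaddr *)&a, sizeof(a));
		/* queue two datagrams to ourselves over loopback */
		sendto(fd, "hello", 5, 0, (struct sockaddr *)&a, sizeof(a));
		sendto(fd, "hi", 2, 0, (struct sockaddr *)&a, sizeof(a));
		usleep(10000);	/* give softirq a moment to deliver */
		ioctl(fd, SIOCINQ, &n);
		printf("SIOCINQ = %d\n", n);	/* expect 5, not 7 */
		close(fd);
		return 0;
	}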