[<prev] [next>] [day] [month] [year] [list]
Message-ID: <51489.68.160.147.35.1163979693.squirrel@webmail.metacarta.com>
Date: Sun, 19 Nov 2006 18:41:33 -0500 (EST)
From: jmalicki@...acarta.com
To: linux-kernel@...r.kernel.org
Subject: [PATCH] AF_UNIX recv/shutdown race
On some of our systems (Linux 2.6, 4-way SMP), we have found a race where
occasionally recv() can detect a shutdown before the last bytes written to
the socket, and will exhibit the odd behavior where recv() will return 0
to indicate a shutdown socket, and a subsequent call will return the last
bit data, after which it will return 0 again.
This is demonstrated by the attached test cases (cli.c and srv.c). Start
one srv and several cli processes (perhaps 4-8) on a 2x dual core P4 (most
likely other systems as well, but I haven't reproduced it on other
configurations), and you will soon see this behavior.
What happens is that in unix_stream_recvmsg, the system checks for any
skbuffs on the queue ready to return. Meanwhile, immediately afterwards,
another process/processor writes an skbuff and shuts down the sock. Our
original reader process then checks shutdown, and returns 0. On the next
call, it finally sees the skbuff written.
This patch combines af_unix's unix_sk(sk).readlock with
unix_state_rlock/runlock(sk). readlock was only ever used by bind,
autobind, and unix_{stream,dgram}_recvmsg. I can't see any way of
ensuring correctness for unix_stream_recvmsg without forcing it to hold
the state lock across getting the skbuf and checking shutdown.
Signed-off-by: Joseph Malicki <joe.malicki@...acarta.com>
-------------------------------
diff -rup linux-2.6.18.orig/include/net/af_unix.h
linux-2.6.18/include/net/af_unix.h
--- linux-2.6.18.orig/include/net/af_unix.h 2006-09-19 23:42:06.000000000
-0400
+++ linux-2.6.18/include/net/af_unix.h 2006-10-02 19:42:07.000000000 -0400
@@ -78,7 +78,6 @@ struct unix_sock {
struct unix_address *addr;
struct dentry *dentry;
struct vfsmount *mnt;
- struct mutex readlock;
struct sock *peer;
struct sock *other;
struct sock *gc_tree;
diff -rup linux-2.6.18.orig/net/unix/af_unix.c
linux-2.6.18/net/unix/af_unix.c
--- linux-2.6.18.orig/net/unix/af_unix.c 2006-09-19 23:42:06.000000000
-0400 +++ linux-2.6.18/net/unix/af_unix.c 2006-10-06 15:28:44.000000000
-0400 @@ -50,6 +50,8 @@
* Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
* the core infrastructure is doing that
* for all net proto families now (2.5.69+)
+ * Joseph Malicki : Fix SMP race between write/shutdown
+ * and read.
*
*
* Known differences from reference BSD that was tested:
@@ -593,7 +595,6 @@ static struct sock * unix_create1(struct
u->mnt = NULL;
spin_lock_init(&u->lock);
atomic_set(&u->inflight, sock ? 0 : -1);
- mutex_init(&u->readlock); /* single task reading lock */
init_waitqueue_head(&u->peer_wait);
unix_insert_socket(unix_sockets_unbound, sk);
out:
@@ -650,7 +651,7 @@ static int unix_autobind(struct socket *
struct unix_address * addr;
int err;
- mutex_lock(&u->readlock);
+ unix_state_rlock(sk);
err = 0;
if (u->addr)
@@ -687,7 +688,7 @@ retry:
spin_unlock(&unix_table_lock);
err = 0;
-out: mutex_unlock(&u->readlock);
+out: unix_state_runlock(sk);
return err;
}
@@ -770,7 +771,7 @@ static int unix_bind(struct socket *sock
goto out;
addr_len = err;
- mutex_lock(&u->readlock);
+ unix_state_rlock(sk);
err = -EINVAL;
if (u->addr)
@@ -842,7 +843,7 @@ static int unix_bind(struct socket *sock
out_unlock:
spin_unlock(&unix_table_lock);
out_up:
- mutex_unlock(&u->readlock);
+ unix_state_runlock(sk);
out:
return err;
@@ -1572,7 +1573,7 @@ static int unix_dgram_recvmsg(struct kio
msg->msg_namelen = 0;
- mutex_lock(&u->readlock);
+ unix_state_rlock(sk);
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb)
@@ -1628,7 +1629,7 @@ static int unix_dgram_recvmsg(struct kio
out_free:
skb_free_datagram(sk,skb);
out_unlock:
- mutex_unlock(&u->readlock);
+ unix_state_runlock(sk);
out:
return err;
}
@@ -1674,7 +1675,6 @@ static int unix_stream_recvmsg(struct ki
struct sock_iocb *siocb = kiocb_to_siocb(iocb);
struct scm_cookie tmp_scm;
struct sock *sk = sock->sk;
- struct unix_sock *u = unix_sk(sk);
struct sockaddr_un *sunaddr=msg->msg_name;
int copied = 0;
int check_creds = 0;
@@ -1704,7 +1704,7 @@ static int unix_stream_recvmsg(struct ki
memset(&tmp_scm, 0, sizeof(tmp_scm));
}
- mutex_lock(&u->readlock);
+ unix_state_rlock(sk);
do
{
@@ -1728,7 +1728,7 @@ static int unix_stream_recvmsg(struct ki
err = -EAGAIN;
if (!timeo)
break;
- mutex_unlock(&u->readlock);
+ unix_state_runlock(sk);
timeo = unix_stream_data_wait(sk, timeo);
@@ -1736,7 +1736,7 @@ static int unix_stream_recvmsg(struct ki
err = sock_intr_errno(timeo);
goto out;
}
- mutex_lock(&u->readlock);
+ unix_state_rlock(sk);
continue;
}
@@ -1802,7 +1802,7 @@ static int unix_stream_recvmsg(struct ki
}
} while (size);
- mutex_unlock(&u->readlock);
+ unix_state_runlock(sk);
scm_recv(sock, msg, siocb->scm, flags);
out:
return copied ? : err;
View attachment "af_unix_readshutdownrace.diff" of type "text/plain" (3578 bytes)
View attachment "cli.c" of type "text/x-csrc" (1122 bytes)
View attachment "srv.c" of type "text/x-csrc" (1100 bytes)
Powered by blists - more mailing lists