lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <51489.68.160.147.35.1163979693.squirrel@webmail.metacarta.com>
Date:	Sun, 19 Nov 2006 18:41:33 -0500 (EST)
From:	jmalicki@...acarta.com
To:	linux-kernel@...r.kernel.org
Subject: [PATCH] AF_UNIX recv/shutdown race


On some of our systems (Linux 2.6, 4-way SMP), we have found a race where
occasionally recv() can detect a shutdown before the last bytes written to
the socket, and will exhibit the odd behavior where recv() will return 0
to indicate a shutdown socket, and a subsequent call will return the last
bit data, after which it will return 0 again.

This is demonstrated by the attached test cases (cli.c and srv.c).  Start
one srv and several cli processes (perhaps 4-8) on a 2x dual core P4 (most
likely other systems as well, but I haven't reproduced it on other
configurations), and you will soon see this behavior.

What happens is that in unix_stream_recvmsg, the system checks for any
skbuffs on the queue ready to return.  Meanwhile, immediately afterwards,
another process/processor writes an skbuff and shuts down the sock.  Our
original reader process then checks shutdown, and returns 0.  On the next
call, it finally sees the skbuff written.

This patch combines af_unix's unix_sk(sk).readlock with
unix_state_rlock/runlock(sk).  readlock was only ever used by bind,
autobind, and unix_{stream,dgram}_recvmsg.  I can't see any way of
ensuring correctness for unix_stream_recvmsg without forcing it to hold
the state lock across getting the skbuf and checking shutdown.


Signed-off-by: Joseph Malicki <joe.malicki@...acarta.com>
-------------------------------
diff -rup linux-2.6.18.orig/include/net/af_unix.h
linux-2.6.18/include/net/af_unix.h
--- linux-2.6.18.orig/include/net/af_unix.h	2006-09-19 23:42:06.000000000
-0400
+++ linux-2.6.18/include/net/af_unix.h	2006-10-02 19:42:07.000000000 -0400
@@ -78,7 +78,6 @@ struct unix_sock {
         struct unix_address     *addr;
         struct dentry		*dentry;
         struct vfsmount		*mnt;
-	struct mutex		readlock;
         struct sock		*peer;
         struct sock		*other;
         struct sock		*gc_tree;
diff -rup linux-2.6.18.orig/net/unix/af_unix.c
linux-2.6.18/net/unix/af_unix.c
--- linux-2.6.18.orig/net/unix/af_unix.c	2006-09-19 23:42:06.000000000
-0400 +++ linux-2.6.18/net/unix/af_unix.c	2006-10-06 15:28:44.000000000
-0400 @@ -50,6 +50,8 @@
  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
  *	     				the core infrastructure is doing that
  *	     				for all net proto families now (2.5.69+)
+ *	       Joseph Malicki	:	Fix SMP race between write/shutdown
+ *					and read.
  *
  *
  * Known differences from reference BSD that was tested:
@@ -593,7 +595,6 @@ static struct sock * unix_create1(struct
 	u->mnt	  = NULL;
 	spin_lock_init(&u->lock);
 	atomic_set(&u->inflight, sock ? 0 : -1);
-	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
@@ -650,7 +651,7 @@ static int unix_autobind(struct socket *
 	struct unix_address * addr;
 	int err;

-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);

 	err = 0;
 	if (u->addr)
@@ -687,7 +688,7 @@ retry:
 	spin_unlock(&unix_table_lock);
 	err = 0;

-out:	mutex_unlock(&u->readlock);
+out:	unix_state_runlock(sk);
 	return err;
 }

@@ -770,7 +771,7 @@ static int unix_bind(struct socket *sock
 		goto out;
 	addr_len = err;

-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);

 	err = -EINVAL;
 	if (u->addr)
@@ -842,7 +843,7 @@ static int unix_bind(struct socket *sock
 out_unlock:
 	spin_unlock(&unix_table_lock);
 out_up:
-	mutex_unlock(&u->readlock);
+	unix_state_runlock(sk);
 out:
 	return err;

@@ -1572,7 +1573,7 @@ static int unix_dgram_recvmsg(struct kio

 	msg->msg_namelen = 0;

-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);

 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	if (!skb)
@@ -1628,7 +1629,7 @@ static int unix_dgram_recvmsg(struct kio
 out_free:
 	skb_free_datagram(sk,skb);
 out_unlock:
-	mutex_unlock(&u->readlock);
+	unix_state_runlock(sk);
 out:
 	return err;
 }
@@ -1674,7 +1675,6 @@ static int unix_stream_recvmsg(struct ki
 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
 	struct scm_cookie tmp_scm;
 	struct sock *sk = sock->sk;
-	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr=msg->msg_name;
 	int copied = 0;
 	int check_creds = 0;
@@ -1704,7 +1704,7 @@ static int unix_stream_recvmsg(struct ki
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}

-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);

 	do
 	{
@@ -1728,7 +1728,7 @@ static int unix_stream_recvmsg(struct ki
 			err = -EAGAIN;
 			if (!timeo)
 				break;
-			mutex_unlock(&u->readlock);
+			unix_state_runlock(sk);

 			timeo = unix_stream_data_wait(sk, timeo);

@@ -1736,7 +1736,7 @@ static int unix_stream_recvmsg(struct ki
 				err = sock_intr_errno(timeo);
 				goto out;
 			}
-			mutex_lock(&u->readlock);
+			unix_state_rlock(sk);
 			continue;
 		}

@@ -1802,7 +1802,7 @@ static int unix_stream_recvmsg(struct ki
 		}
 	} while (size);

-	mutex_unlock(&u->readlock);
+	unix_state_runlock(sk);
 	scm_recv(sock, msg, siocb->scm, flags);
 out:
 	return copied ? : err;



View attachment "af_unix_readshutdownrace.diff" of type "text/plain" (3578 bytes)

View attachment "cli.c" of type "text/x-csrc" (1122 bytes)

View attachment "srv.c" of type "text/x-csrc" (1100 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ