lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180423175714.9794-3-pablo@netfilter.org>
Date:   Mon, 23 Apr 2018 19:57:04 +0200
From:   Pablo Neira Ayuso <pablo@...filter.org>
To:     netfilter-devel@...r.kernel.org
Cc:     davem@...emloft.net, netdev@...r.kernel.org
Subject: [PATCH 02/12] ipvs: fix rtnl_lock lockups caused by start_sync_thread

From: Julian Anastasov <ja@....bg>

syzkaller reports for wrong rtnl_lock usage in sync code [1] and [2]

We have 2 problems in start_sync_thread if error path is
taken, eg. on memory allocation error or failure to configure
sockets for mcast group or addr/port binding:

1. recursive locking: holding rtnl_lock while calling sock_release
which in turn calls again rtnl_lock in ip_mc_drop_socket to leave
the mcast group, as noticed by Florian Westphal. Additionally,
sock_release can not be called while holding sync_mutex (ABBA
deadlock).

2. task hung: holding rtnl_lock while calling kthread_stop to
stop the running kthreads. As the kthreads do the same to leave
the mcast group (sock_release -> ip_mc_drop_socket -> rtnl_lock)
they hang.

Fix the problems by calling rtnl_unlock early in the error path,
now sock_release is called after unlocking both mutexes.

Problem 3 (task hung reported by syzkaller [2]) is variant of
problem 2: use _trylock to prevent one user to call rtnl_lock and
then while waiting for sync_mutex to block kthreads that execute
sock_release when they are stopped by stop_sync_thread.

[1]
IPVS: stopping backup sync thread 4500 ...
WARNING: possible recursive locking detected
4.16.0-rc7+ #3 Not tainted
--------------------------------------------
syzkaller688027/4497 is trying to acquire lock:
  (rtnl_mutex){+.+.}, at: [<00000000bb14d7fb>] rtnl_lock+0x17/0x20
net/core/rtnetlink.c:74

but task is already holding lock:
IPVS: stopping backup sync thread 4495 ...
  (rtnl_mutex){+.+.}, at: [<00000000bb14d7fb>] rtnl_lock+0x17/0x20
net/core/rtnetlink.c:74

other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(rtnl_mutex);
   lock(rtnl_mutex);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

2 locks held by syzkaller688027/4497:
  #0:  (rtnl_mutex){+.+.}, at: [<00000000bb14d7fb>] rtnl_lock+0x17/0x20
net/core/rtnetlink.c:74
  #1:  (ipvs->sync_mutex){+.+.}, at: [<00000000703f78e3>]
do_ip_vs_set_ctl+0x10f8/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2388

stack backtrace:
CPU: 1 PID: 4497 Comm: syzkaller688027 Not tainted 4.16.0-rc7+ #3
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x194/0x24d lib/dump_stack.c:53
  print_deadlock_bug kernel/locking/lockdep.c:1761 [inline]
  check_deadlock kernel/locking/lockdep.c:1805 [inline]
  validate_chain kernel/locking/lockdep.c:2401 [inline]
  __lock_acquire+0xe8f/0x3e00 kernel/locking/lockdep.c:3431
  lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
  __mutex_lock_common kernel/locking/mutex.c:756 [inline]
  __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
  mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
  rtnl_lock+0x17/0x20 net/core/rtnetlink.c:74
  ip_mc_drop_socket+0x88/0x230 net/ipv4/igmp.c:2643
  inet_release+0x4e/0x1c0 net/ipv4/af_inet.c:413
  sock_release+0x8d/0x1e0 net/socket.c:595
  start_sync_thread+0x2213/0x2b70 net/netfilter/ipvs/ip_vs_sync.c:1924
  do_ip_vs_set_ctl+0x1139/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2389
  nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
  nf_setsockopt+0x67/0xc0 net/netfilter/nf_sockopt.c:115
  ip_setsockopt+0x97/0xa0 net/ipv4/ip_sockglue.c:1261
  udp_setsockopt+0x45/0x80 net/ipv4/udp.c:2406
  sock_common_setsockopt+0x95/0xd0 net/core/sock.c:2975
  SYSC_setsockopt net/socket.c:1849 [inline]
  SyS_setsockopt+0x189/0x360 net/socket.c:1828
  do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x446a69
RSP: 002b:00007fa1c3a64da8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000446a69
RDX: 000000000000048b RSI: 0000000000000000 RDI: 0000000000000003
RBP: 00000000006e29fc R08: 0000000000000018 R09: 0000000000000000
R10: 00000000200000c0 R11: 0000000000000246 R12: 00000000006e29f8
R13: 00676e697279656b R14: 00007fa1c3a659c0 R15: 00000000006e2b60

[2]
IPVS: sync thread started: state = BACKUP, mcast_ifn = syz_tun, syncid = 4,
id = 0
IPVS: stopping backup sync thread 25415 ...
INFO: task syz-executor7:25421 blocked for more than 120 seconds.
       Not tainted 4.16.0-rc6+ #284
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
syz-executor7   D23688 25421   4408 0x00000004
Call Trace:
  context_switch kernel/sched/core.c:2862 [inline]
  __schedule+0x8fb/0x1ec0 kernel/sched/core.c:3440
  schedule+0xf5/0x430 kernel/sched/core.c:3499
  schedule_timeout+0x1a3/0x230 kernel/time/timer.c:1777
  do_wait_for_common kernel/sched/completion.c:86 [inline]
  __wait_for_common kernel/sched/completion.c:107 [inline]
  wait_for_common kernel/sched/completion.c:118 [inline]
  wait_for_completion+0x415/0x770 kernel/sched/completion.c:139
  kthread_stop+0x14a/0x7a0 kernel/kthread.c:530
  stop_sync_thread+0x3d9/0x740 net/netfilter/ipvs/ip_vs_sync.c:1996
  do_ip_vs_set_ctl+0x2b1/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2394
  nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
  nf_setsockopt+0x67/0xc0 net/netfilter/nf_sockopt.c:115
  ip_setsockopt+0x97/0xa0 net/ipv4/ip_sockglue.c:1253
  sctp_setsockopt+0x2ca/0x63e0 net/sctp/socket.c:4154
  sock_common_setsockopt+0x95/0xd0 net/core/sock.c:3039
  SYSC_setsockopt net/socket.c:1850 [inline]
  SyS_setsockopt+0x189/0x360 net/socket.c:1829
  do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x454889
RSP: 002b:00007fc927626c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 00007fc9276276d4 RCX: 0000000000454889
RDX: 000000000000048c RSI: 0000000000000000 RDI: 0000000000000017
RBP: 000000000072bf58 R08: 0000000000000018 R09: 0000000000000000
R10: 0000000020000000 R11: 0000000000000246 R12: 00000000ffffffff
R13: 000000000000051c R14: 00000000006f9b40 R15: 0000000000000001

Showing all locks held in the system:
2 locks held by khungtaskd/868:
  #0:  (rcu_read_lock){....}, at: [<00000000a1a8f002>]
check_hung_uninterruptible_tasks kernel/hung_task.c:175 [inline]
  #0:  (rcu_read_lock){....}, at: [<00000000a1a8f002>] watchdog+0x1c5/0xd60
kernel/hung_task.c:249
  #1:  (tasklist_lock){.+.+}, at: [<0000000037c2f8f9>]
debug_show_all_locks+0xd3/0x3d0 kernel/locking/lockdep.c:4470
1 lock held by rsyslogd/4247:
  #0:  (&f->f_pos_lock){+.+.}, at: [<000000000d8d6983>]
__fdget_pos+0x12b/0x190 fs/file.c:765
2 locks held by getty/4338:
  #0:  (&tty->ldisc_sem){++++}, at: [<00000000bee98654>]
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1:  (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>]
n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131
2 locks held by getty/4339:
  #0:  (&tty->ldisc_sem){++++}, at: [<00000000bee98654>]
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1:  (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>]
n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131
2 locks held by getty/4340:
  #0:  (&tty->ldisc_sem){++++}, at: [<00000000bee98654>]
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1:  (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>]
n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131
2 locks held by getty/4341:
  #0:  (&tty->ldisc_sem){++++}, at: [<00000000bee98654>]
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1:  (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>]
n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131
2 locks held by getty/4342:
  #0:  (&tty->ldisc_sem){++++}, at: [<00000000bee98654>]
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1:  (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>]
n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131
2 locks held by getty/4343:
  #0:  (&tty->ldisc_sem){++++}, at: [<00000000bee98654>]
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1:  (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>]
n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131
2 locks held by getty/4344:
  #0:  (&tty->ldisc_sem){++++}, at: [<00000000bee98654>]
ldsem_down_read+0x37/0x40 drivers/tty/tty_ldsem.c:365
  #1:  (&ldata->atomic_read_lock){+.+.}, at: [<00000000c1d180aa>]
n_tty_read+0x2ef/0x1a40 drivers/tty/n_tty.c:2131
3 locks held by kworker/0:5/6494:
  #0:  ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at:
[<00000000a062b18e>] work_static include/linux/workqueue.h:198 [inline]
  #0:  ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at:
[<00000000a062b18e>] set_work_data kernel/workqueue.c:619 [inline]
  #0:  ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at:
[<00000000a062b18e>] set_work_pool_and_clear_pending kernel/workqueue.c:646
[inline]
  #0:  ((wq_completion)"%s"("ipv6_addrconf")){+.+.}, at:
[<00000000a062b18e>] process_one_work+0xb12/0x1bb0 kernel/workqueue.c:2084
  #1:  ((addr_chk_work).work){+.+.}, at: [<00000000278427d5>]
process_one_work+0xb89/0x1bb0 kernel/workqueue.c:2088
  #2:  (rtnl_mutex){+.+.}, at: [<00000000066e35ac>] rtnl_lock+0x17/0x20
net/core/rtnetlink.c:74
1 lock held by syz-executor7/25421:
  #0:  (ipvs->sync_mutex){+.+.}, at: [<00000000d414a689>]
do_ip_vs_set_ctl+0x277/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2393
2 locks held by syz-executor7/25427:
  #0:  (rtnl_mutex){+.+.}, at: [<00000000066e35ac>] rtnl_lock+0x17/0x20
net/core/rtnetlink.c:74
  #1:  (ipvs->sync_mutex){+.+.}, at: [<00000000e6d48489>]
do_ip_vs_set_ctl+0x10f8/0x1cc0 net/netfilter/ipvs/ip_vs_ctl.c:2388
1 lock held by syz-executor7/25435:
  #0:  (rtnl_mutex){+.+.}, at: [<00000000066e35ac>] rtnl_lock+0x17/0x20
net/core/rtnetlink.c:74
1 lock held by ipvs-b:2:0/25415:
  #0:  (rtnl_mutex){+.+.}, at: [<00000000066e35ac>] rtnl_lock+0x17/0x20
net/core/rtnetlink.c:74

Reported-and-tested-by: syzbot+a46d6abf9d56b1365a72@...kaller.appspotmail.com
Reported-and-tested-by: syzbot+5fe074c01b2032ce9618@...kaller.appspotmail.com
Fixes: e0b26cc997d5 ("ipvs: call rtnl_lock early")
Signed-off-by: Julian Anastasov <ja@....bg>
Signed-off-by: Simon Horman <horms@...ge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@...filter.org>
---
 net/netfilter/ipvs/ip_vs_ctl.c  |   8 ---
 net/netfilter/ipvs/ip_vs_sync.c | 155 +++++++++++++++++++++-------------------
 2 files changed, 80 insertions(+), 83 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5ebde4b15810..f36098887ad0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2384,11 +2384,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 			strlcpy(cfg.mcast_ifn, dm->mcast_ifn,
 				sizeof(cfg.mcast_ifn));
 			cfg.syncid = dm->syncid;
-			rtnl_lock();
-			mutex_lock(&ipvs->sync_mutex);
 			ret = start_sync_thread(ipvs, &cfg, dm->state);
-			mutex_unlock(&ipvs->sync_mutex);
-			rtnl_unlock();
 		} else {
 			mutex_lock(&ipvs->sync_mutex);
 			ret = stop_sync_thread(ipvs, dm->state);
@@ -3481,12 +3477,8 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
 	if (ipvs->mixed_address_family_dests > 0)
 		return -EINVAL;
 
-	rtnl_lock();
-	mutex_lock(&ipvs->sync_mutex);
 	ret = start_sync_thread(ipvs, &c,
 				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
-	mutex_unlock(&ipvs->sync_mutex);
-	rtnl_unlock();
 	return ret;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index fbaf3bd05b2e..001501e25625 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -49,6 +49,7 @@
 #include <linux/kthread.h>
 #include <linux/wait.h>
 #include <linux/kernel.h>
+#include <linux/sched/signal.h>
 
 #include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */
 
@@ -1360,15 +1361,9 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
 /*
  *      Specifiy default interface for outgoing multicasts
  */
-static int set_mcast_if(struct sock *sk, char *ifname)
+static int set_mcast_if(struct sock *sk, struct net_device *dev)
 {
-	struct net_device *dev;
 	struct inet_sock *inet = inet_sk(sk);
-	struct net *net = sock_net(sk);
-
-	dev = __dev_get_by_name(net, ifname);
-	if (!dev)
-		return -ENODEV;
 
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
 		return -EINVAL;
@@ -1396,19 +1391,14 @@ static int set_mcast_if(struct sock *sk, char *ifname)
  *      in the in_addr structure passed in as a parameter.
  */
 static int
-join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
 {
-	struct net *net = sock_net(sk);
 	struct ip_mreqn mreq;
-	struct net_device *dev;
 	int ret;
 
 	memset(&mreq, 0, sizeof(mreq));
 	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
 
-	dev = __dev_get_by_name(net, ifname);
-	if (!dev)
-		return -ENODEV;
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
 		return -EINVAL;
 
@@ -1423,15 +1413,10 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
 
 #ifdef CONFIG_IP_VS_IPV6
 static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
-			     char *ifname)
+			     struct net_device *dev)
 {
-	struct net *net = sock_net(sk);
-	struct net_device *dev;
 	int ret;
 
-	dev = __dev_get_by_name(net, ifname);
-	if (!dev)
-		return -ENODEV;
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
 		return -EINVAL;
 
@@ -1443,24 +1428,18 @@ static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
 }
 #endif
 
-static int bind_mcastif_addr(struct socket *sock, char *ifname)
+static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
 {
-	struct net *net = sock_net(sock->sk);
-	struct net_device *dev;
 	__be32 addr;
 	struct sockaddr_in sin;
 
-	dev = __dev_get_by_name(net, ifname);
-	if (!dev)
-		return -ENODEV;
-
 	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 	if (!addr)
 		pr_err("You probably need to specify IP address on "
 		       "multicast interface.\n");
 
 	IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
-		  ifname, &addr);
+		  dev->name, &addr);
 
 	/* Now bind the socket with the address of multicast interface */
 	sin.sin_family	     = AF_INET;
@@ -1493,7 +1472,8 @@ static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
 /*
  *      Set up sending multicast socket over UDP
  */
-static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
+static int make_send_sock(struct netns_ipvs *ipvs, int id,
+			  struct net_device *dev, struct socket **sock_ret)
 {
 	/* multicast addr */
 	union ipvs_sockaddr mcast_addr;
@@ -1505,9 +1485,10 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
 				  IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
-		return ERR_PTR(result);
+		goto error;
 	}
-	result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn);
+	*sock_ret = sock;
+	result = set_mcast_if(sock->sk, dev);
 	if (result < 0) {
 		pr_err("Error setting outbound mcast interface\n");
 		goto error;
@@ -1522,7 +1503,7 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
 		set_sock_size(sock->sk, 1, result);
 
 	if (AF_INET == ipvs->mcfg.mcast_af)
-		result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn);
+		result = bind_mcastif_addr(sock, dev);
 	else
 		result = 0;
 	if (result < 0) {
@@ -1538,19 +1519,18 @@ static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id)
 		goto error;
 	}
 
-	return sock;
+	return 0;
 
 error:
-	sock_release(sock);
-	return ERR_PTR(result);
+	return result;
 }
 
 
 /*
  *      Set up receiving multicast socket over UDP
  */
-static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
-					int ifindex)
+static int make_receive_sock(struct netns_ipvs *ipvs, int id,
+			     struct net_device *dev, struct socket **sock_ret)
 {
 	/* multicast addr */
 	union ipvs_sockaddr mcast_addr;
@@ -1562,8 +1542,9 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
 				  IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
-		return ERR_PTR(result);
+		goto error;
 	}
+	*sock_ret = sock;
 	/* it is equivalent to the REUSEADDR option in user-space */
 	sock->sk->sk_reuse = SK_CAN_REUSE;
 	result = sysctl_sync_sock_size(ipvs);
@@ -1571,7 +1552,7 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
 		set_sock_size(sock->sk, 0, result);
 
 	get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
-	sock->sk->sk_bound_dev_if = ifindex;
+	sock->sk->sk_bound_dev_if = dev->ifindex;
 	result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
 	if (result < 0) {
 		pr_err("Error binding to the multicast addr\n");
@@ -1582,21 +1563,20 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id,
 #ifdef CONFIG_IP_VS_IPV6
 	if (ipvs->bcfg.mcast_af == AF_INET6)
 		result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
-					   ipvs->bcfg.mcast_ifn);
+					   dev);
 	else
 #endif
 		result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
-					  ipvs->bcfg.mcast_ifn);
+					  dev);
 	if (result < 0) {
 		pr_err("Error joining to the multicast group\n");
 		goto error;
 	}
 
-	return sock;
+	return 0;
 
 error:
-	sock_release(sock);
-	return ERR_PTR(result);
+	return result;
 }
 
 
@@ -1778,13 +1758,12 @@ static int sync_thread_backup(void *data)
 int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 		      int state)
 {
-	struct ip_vs_sync_thread_data *tinfo;
+	struct ip_vs_sync_thread_data *tinfo = NULL;
 	struct task_struct **array = NULL, *task;
-	struct socket *sock;
 	struct net_device *dev;
 	char *name;
 	int (*threadfn)(void *data);
-	int id, count, hlen;
+	int id = 0, count, hlen;
 	int result = -ENOMEM;
 	u16 mtu, min_mtu;
 
@@ -1792,6 +1771,18 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
 		  sizeof(struct ip_vs_sync_conn_v0));
 
+	/* Do not hold one mutex and then to block on another */
+	for (;;) {
+		rtnl_lock();
+		if (mutex_trylock(&ipvs->sync_mutex))
+			break;
+		rtnl_unlock();
+		mutex_lock(&ipvs->sync_mutex);
+		if (rtnl_trylock())
+			break;
+		mutex_unlock(&ipvs->sync_mutex);
+	}
+
 	if (!ipvs->sync_state) {
 		count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
 		ipvs->threads_mask = count - 1;
@@ -1810,7 +1801,8 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 	dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
 	if (!dev) {
 		pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
-		return -ENODEV;
+		result = -ENODEV;
+		goto out_early;
 	}
 	hlen = (AF_INET6 == c->mcast_af) ?
 	       sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
@@ -1827,26 +1819,30 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 		c->sync_maxlen = mtu - hlen;
 
 	if (state == IP_VS_STATE_MASTER) {
+		result = -EEXIST;
 		if (ipvs->ms)
-			return -EEXIST;
+			goto out_early;
 
 		ipvs->mcfg = *c;
 		name = "ipvs-m:%d:%d";
 		threadfn = sync_thread_master;
 	} else if (state == IP_VS_STATE_BACKUP) {
+		result = -EEXIST;
 		if (ipvs->backup_threads)
-			return -EEXIST;
+			goto out_early;
 
 		ipvs->bcfg = *c;
 		name = "ipvs-b:%d:%d";
 		threadfn = sync_thread_backup;
 	} else {
-		return -EINVAL;
+		result = -EINVAL;
+		goto out_early;
 	}
 
 	if (state == IP_VS_STATE_MASTER) {
 		struct ipvs_master_sync_state *ms;
 
+		result = -ENOMEM;
 		ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL);
 		if (!ipvs->ms)
 			goto out;
@@ -1862,39 +1858,38 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 	} else {
 		array = kcalloc(count, sizeof(struct task_struct *),
 				GFP_KERNEL);
+		result = -ENOMEM;
 		if (!array)
 			goto out;
 	}
 
-	tinfo = NULL;
 	for (id = 0; id < count; id++) {
-		if (state == IP_VS_STATE_MASTER)
-			sock = make_send_sock(ipvs, id);
-		else
-			sock = make_receive_sock(ipvs, id, dev->ifindex);
-		if (IS_ERR(sock)) {
-			result = PTR_ERR(sock);
-			goto outtinfo;
-		}
+		result = -ENOMEM;
 		tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
 		if (!tinfo)
-			goto outsocket;
+			goto out;
 		tinfo->ipvs = ipvs;
-		tinfo->sock = sock;
+		tinfo->sock = NULL;
 		if (state == IP_VS_STATE_BACKUP) {
 			tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
 					     GFP_KERNEL);
 			if (!tinfo->buf)
-				goto outtinfo;
+				goto out;
 		} else {
 			tinfo->buf = NULL;
 		}
 		tinfo->id = id;
+		if (state == IP_VS_STATE_MASTER)
+			result = make_send_sock(ipvs, id, dev, &tinfo->sock);
+		else
+			result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
+		if (result < 0)
+			goto out;
 
 		task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
 		if (IS_ERR(task)) {
 			result = PTR_ERR(task);
-			goto outtinfo;
+			goto out;
 		}
 		tinfo = NULL;
 		if (state == IP_VS_STATE_MASTER)
@@ -1911,20 +1906,20 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 	ipvs->sync_state |= state;
 	spin_unlock_bh(&ipvs->sync_buff_lock);
 
+	mutex_unlock(&ipvs->sync_mutex);
+	rtnl_unlock();
+
 	/* increase the module use count */
 	ip_vs_use_count_inc();
 
 	return 0;
 
-outsocket:
-	sock_release(sock);
-
-outtinfo:
-	if (tinfo) {
-		sock_release(tinfo->sock);
-		kfree(tinfo->buf);
-		kfree(tinfo);
-	}
+out:
+	/* We do not need RTNL lock anymore, release it here so that
+	 * sock_release below and in the kthreads can use rtnl_lock
+	 * to leave the mcast group.
+	 */
+	rtnl_unlock();
 	count = id;
 	while (count-- > 0) {
 		if (state == IP_VS_STATE_MASTER)
@@ -1932,13 +1927,23 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
 		else
 			kthread_stop(array[count]);
 	}
-	kfree(array);
-
-out:
 	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
 		kfree(ipvs->ms);
 		ipvs->ms = NULL;
 	}
+	mutex_unlock(&ipvs->sync_mutex);
+	if (tinfo) {
+		if (tinfo->sock)
+			sock_release(tinfo->sock);
+		kfree(tinfo->buf);
+		kfree(tinfo);
+	}
+	kfree(array);
+	return result;
+
+out_early:
+	mutex_unlock(&ipvs->sync_mutex);
+	rtnl_unlock();
 	return result;
 }
 
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ