lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180206125624.GE15427@breakpoint.cc>
Date:   Tue, 6 Feb 2018 13:56:24 +0100
From:   Florian Westphal <fw@...len.de>
To:     Eyal Birger <eyal.birger@...il.com>
Cc:     steffen.klassert@...unet.com, netdev@...r.kernel.org,
        herbert@...dor.apana.org.au, davem@...emloft.net,
        shmulik@...anetworks.com
Subject: Re: xfrm, ip tunnel: non released device reference upon device
 unregistration

Eyal Birger <eyal.birger@...il.com> wrote:
> Hi,
> 
> We've encountered a non released device reference upon device
> unregistration which seems to stem from xfrm policy code.
> 
> The setup includes:
> - an underlay device (e.g. eth0) using IPv4
> - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> - an ipip6 tunnel over the xfrm IPv6 tunnel
> 
> When tearing down the underlay device, after traffic had passed via the ipip6
> tunnel, log messages of the following form are observed:
> 
> unregister_netdevice: waiting for eth0 to become free. Usage count = 2

This looks like it might be related to a bug reported by Hangbin Liu.

Setup is:

       Host A                --        Host B
tun0 -- br0 (ipsec) -- eth0  --  eth0 (ipsec) -- tun0

... where tun0 are ipip tunnels.

module removal then hangs forever as device refcount is held by
dst_cache.
Hangbin gave following reproducer script:

Host B:
ip link add tun0 type ipip local 192.168.7.1 remote 192.168.7.10
ip link set tun0 up
ip addr add 10.0.0.2/24 dev tun0
ip xfrm state add src 192.168.7.1 dst 192.168.7.10 spi 1000 proto ah
auth sha1 beef_fish_pork_salad mode transport
ip xfrm state add src 192.168.7.1 dst 192.168.7.1 spi 1000 proto ah auth
sha1 beef_fish_pork_salad mode transport
ip xfrm policy add src 192.168.7.1 dst 192.168.7.10 dir out tmpl src
192.168.7.1 dst 192.168.7.10 proto ah spi 1000 mode transport
ip xfrm policy add src 192.168.7.10 dst 192.168.7.1 dir in tmpl src
192.168.7.10 dst 192.168.7.1 proto ah spi 1000 mode transport level use

On Host A (tun0 10.0.0.1, br0 192.168.7.10):
ip addr flush dev eth0
brctl addbr br0
brctl addif br0 eth0
ip link set br0 up
ip addr add 192.168.7.10/24 dev br0

ip link add tun0 type ipip local 192.168.7.10 remote 192.168.7.1
ip link set tun0 up
ip addr add 10.0.0.1/24 dev tun0

ip xfrm state add src 192.168.7.10 dst 192.168.7.1 spi 1000
proto ah auth sha1 beef_fish_pork_salad mode transport
ip xfrm state add src 192.168.7.1 dst 192.168.7.10 spi 1000
proto ah auth sha1 beef_fish_pork_salad mode transport
ip xfrm policy add src 192.168.7.10 dst 192.168.7.1 dir out tmpl
src 192.168.7.10 dst 192.168.7.1 proto ah spi 1000 mode transport
ip xfrm policy add src 192.168.7.1 dst 192.168.7.10 dir in tmpl src 192.168.7.1 dst 192.168.7.10 proto ah spi 1000 mode transport level use

# should work, else something is broken:
ping 10.0.0.2 -c 2

# This hangs:
modprobe -r bridge

This hangs even when I remove the ipsec xfrm pcpu cache.

What does work in above setup is this:

diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
index 72fd5067c353..d5bcd6ea77f3 100644
--- a/include/net/dst_cache.h
+++ b/include/net/dst_cache.h
@@ -95,4 +95,13 @@ int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp);
  */
 void dst_cache_destroy(struct dst_cache *dst_cache);
 
+/**
+ *	dst_cache_clear_dev - put device from cached tunnel
+ *	@dst_cache: the cache
+ *	@dev: device being removed
+ *
+ *	puts the device and resets the cache in case the device is
+ *	used by the cached dst.
+ */
+void dst_cache_clear_dev(struct dst_cache *dstcache, const struct net_device *dev);
 #endif
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 1f16773cfd76..4e337b99fa96 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -289,6 +289,8 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p, __u32 fwmark);
 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
 
+void ip_tunnel_device_down(const struct net_device *dev, unsigned int net_id);
+
 struct ip_tunnel_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
 	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 554d36449231..56cafe08756d 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -166,3 +166,32 @@ void dst_cache_destroy(struct dst_cache *dst_cache)
 	free_percpu(dst_cache->cache);
 }
 EXPORT_SYMBOL_GPL(dst_cache_destroy);
+
+void dst_cache_clear_dev(struct dst_cache *dst_cache, const struct net_device *dev)
+{
+	struct dst_entry *dst;
+	bool reset = false;
+	int i;
+
+	if (!dst_cache->cache)
+		return;
+
+	rcu_read_lock();
+	for_each_possible_cpu(i) {
+		dst = per_cpu_ptr(dst_cache->cache, i)->dst;
+		if (!dst)
+			continue;
+
+		if (!dst_hold_safe(dst))
+			continue;
+
+		if (dst->dev == dev) {
+			dst_dev_put(dst);
+			reset = true;
+		}
+		dst_release(dst);
+	}
+	rcu_read_unlock();
+	if (reset)
+		dst_cache_reset(dst_cache);
+}
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d786a8441bce..cea578190258 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1224,4 +1224,25 @@ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
 
+void ip_tunnel_device_down(const struct net_device *dev, unsigned int id)
+{
+	struct net *net = dev_net(dev);
+	struct ip_tunnel_net *itn = net_generic(net,  id);
+	unsigned int i;
+
+	ASSERT_RTNL();
+
+	for (i = 0; i < IP_TNL_HASH_SIZE; i++) {
+		struct ip_tunnel *t;
+		struct hlist_node *n;
+		struct hlist_head *thead = &itn->tunnels[i];
+
+		hlist_for_each_entry_safe(t, n, thead, hash_node)
+			dst_cache_clear_dev(&t->dst_cache, dev);
+
+		cond_resched();
+	}
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_device_down);
+
 MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c891235b4966..da9640e082eb 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -357,6 +357,21 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return 0;
 }
 
+int ipip_tunnel_device_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_DOWN:
+		ip_tunnel_device_down(dev, ipip_net_id);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(ipip_tunnel_device_event);
+
 static const struct net_device_ops ipip_netdev_ops = {
 	.ndo_init       = ipip_tunnel_init,
 	.ndo_uninit     = ip_tunnel_uninit,
@@ -671,6 +686,10 @@ static struct pernet_operations ipip_net_ops = {
 	.size = sizeof(struct ip_tunnel_net),
 };
 
+static struct notifier_block ipip_device_notifier = {
+	.notifier_call	= ipip_tunnel_device_event,
+};
+
 static int __init ipip_init(void)
 {
 	int err;
@@ -696,6 +715,11 @@ static int __init ipip_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;
 
+	err = register_netdevice_notifier(&ipip_device_notifier);
+	if (err < 0) {
+		rtnl_link_unregister(&ipip_link_ops);
+		goto rtnl_link_failed;
+	}
 out:
 	return err;
 
@@ -713,6 +737,7 @@ static int __init ipip_init(void)
 
 static void __exit ipip_fini(void)
 {
+	unregister_netdevice_notifier(&ipip_device_notifier);
 	rtnl_link_unregister(&ipip_link_ops);
 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
 		pr_info("%s: can't deregister tunnel\n", __func__);

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ