lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180206085338.wvxybymmqn6r67j2@gauss3.secunet.de>
Date:   Tue, 6 Feb 2018 09:53:38 +0100
From:   Steffen Klassert <steffen.klassert@...unet.com>
To:     Eyal Birger <eyal.birger@...il.com>
CC:     <netdev@...r.kernel.org>, <herbert@...dor.apana.org.au>,
        <davem@...emloft.net>, <shmulik@...anetworks.com>,
        Wei Wang <weiwan@...gle.com>
Subject: Re: xfrm, ip tunnel: non released device reference upon device
 unregistration

Cc Wei Wang

On Sun, Feb 04, 2018 at 01:21:18PM +0200, Eyal Birger wrote:
> Hi,
> 
> We've encountered a non released device reference upon device
> unregistration which seems to stem from xfrm policy code.
> 
> The setup includes:
> - an underlay device (e.g. eth0) using IPv4
> - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> - an ipip6 tunnel over the xfrm IPv6 tunnel
> 
> When tearing down the underlay device, after traffic had passed via the ipip6
> tunnel, log messages of the following form are observed:
> 
> unregister_netdevice: waiting for eth0 to become free. Usage count = 2

Looks like this happened when the dst garbage collection code was
removed. I could not point to a commit that introduced it so I
did a bisection and this pointed to:

commit 9514528d92d4cbe086499322370155ed69f5d06c
ipv6: call dst_dev_put() properly

With this commit we leak the one refcount and some further commit
leaked the second one.

> 
> The below synthetic script reproduces this consistently on a fresh ubuntu vm
> running net-next v4.15-6066-ge9522a5:
> ---------------------------------------------------------
> #!/bin/bash
> 
> ipsec_underlay_dst=192.168.6.1
> ipsec_underlay_src=192.168.5.2
> ipv6_pfx=1234
> local_ipv6_addr="$ipv6_pfx::1"
> remote_ipv6_addr="$ipv6_pfx::2"
> 
> # create dummy ipsec underlay
> ip l add dev dummy1 type dummy
> ip l set dev dummy1 up
> ip r add "$ipsec_underlay_dst/32" dev dummy1
> ip -6 r add "$ipv6_pfx::/16" dev dummy1
> 
> ip a add dev dummy1 "$local_ipv6_addr/128"
> ip a add dev dummy1 "$ipsec_underlay_src/24"
> 
> # add xfrm policy and state
> ip x p add src "$local_ipv6_addr/128" dst "$ipv6_pfx::/16" dir out tmpl src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp reqid 1 mode tunnel
> ip x s add src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp spi 0xcd440ce6 reqid 1 mode tunnel auth-trunc 'hmac(sha1)' 0x34a546d309031628962b814ef073aff1a638ad21 96 enc 'cbc(aes)' 0xf31e14149c328297fe7925ad7448420e encap espinudp 4500 4500 0.0.0.0
> 
> # add 4o6 tunnel
> ip l add tnl46 type ip6tnl mode ipip6 local "$local_ipv6_addr" remote "$remote_ipv6_addr"
> ip l set dev tnl46 up
> ip r add 10.64.0.0/10 dev tnl46 
> 
> # pass traffic so route is cached
> ping -w 1 -c 1 10.64.0.1
> 
> # remove dummy underlay
> ip l del dummy1
> ---------------------------------------------------------
> 
> Analysis:
> 
> ip6_tunnel holds a dst_cache which caches its underlay dst objects.
> When devices are unregistered, non-xfrm dst objects are invlidated by their
> original creators (ipv4/ipv6/...) and thus are wiped from dst_cache.
> 
> xfrm created routes otoh are not tracked by xfrm, and are not invalidated upon
> device unregistration, thus hold the device upon unregistration.
> 
> The following rough sketch patch illustrates an approach overcoming this
> issue:
> ---------------------------------------------------------
> >From e188dc5295e3500bc59e8780049840afa2eb3e24 Mon Sep 17 00:00:00 2001
> From: Eyal Birger <eyal@...anetworks.com>
> Date: Sun, 4 Feb 2018 13:08:02 +0200
> Subject: [PATCH] net: xfrm_policy: invalidate xfrm_dsts on device
>  unregistration
> 
> ---
>  include/net/xfrm.h     | 10 ++-----
>  net/xfrm/xfrm_policy.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 78 insertions(+), 7 deletions(-)
> 
> diff --git a/include/net/xfrm.h b/include/net/xfrm.h
> index 7d20776..c1c8493 100644
> --- a/include/net/xfrm.h
> +++ b/include/net/xfrm.h
> @@ -994,6 +994,8 @@ struct xfrm_dst {
>  	u32 child_mtu_cached;
>  	u32 route_cookie;
>  	u32 path_cookie;
> +	struct list_head xfrm_dst_gc;
> +	struct xfrm_dst_list *xfrm_dst_gc_list;
>  };
>  
>  static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
> @@ -1025,13 +1027,7 @@ static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *c
>  	xdst->child = child;
>  }
>  
> -static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
> -{
> -	xfrm_pols_put(xdst->pols, xdst->num_pols);
> -	dst_release(xdst->route);
> -	if (likely(xdst->u.dst.xfrm))
> -		xfrm_state_put(xdst->u.dst.xfrm);
> -}
> +void xfrm_dst_destroy(struct xfrm_dst *xdst);
>  #endif
>  
>  void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
> diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
> index 7a23078..d446641 100644
> --- a/net/xfrm/xfrm_policy.c
> +++ b/net/xfrm/xfrm_policy.c
> @@ -94,6 +94,58 @@ __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
>  		(fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
>  }
>  
> +struct xfrm_dst_list {
> +	spinlock_t		lock;
> +	struct list_head	head;
> +};
> +
> +static DEFINE_PER_CPU_ALIGNED(struct xfrm_dst_list, xfrm_dst_gc_list);
> +
> +static void xfrm_add_dst_list(struct xfrm_dst *xdst)
> +{
> +	struct xfrm_dst_list *xl = raw_cpu_ptr(&xfrm_dst_gc_list);
> +
> +	xdst->xfrm_dst_gc_list = xl;
> +
> +	spin_lock_bh(&xl->lock);
> +	list_add_tail(&xdst->xfrm_dst_gc, &xl->head);
> +	spin_unlock_bh(&xl->lock);
> +}
> +
> +void xfrm_dst_destroy(struct xfrm_dst *xdst)
> +{
> +	xfrm_pols_put(xdst->pols, xdst->num_pols);
> +	dst_release(xdst->route);
> +	if (likely(xdst->u.dst.xfrm))
> +		xfrm_state_put(xdst->u.dst.xfrm);
> +	if (!list_empty(&xdst->xfrm_dst_gc)) {
> +		struct xfrm_dst_list *xl = xdst->xfrm_dst_gc_list;
> +
> +		spin_lock_bh(&xl->lock);
> +		list_del(&xdst->xfrm_dst_gc);
> +		spin_unlock_bh(&xl->lock);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(xfrm_dst_destroy);
> +
> +void xfrm_flush_dev(struct net_device *dev)
> +{
> +	struct xfrm_dst *xdst;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
> +
> +		spin_lock_bh(&xl->lock);
> +		list_for_each_entry(xdst, &xl->head, xfrm_dst_gc) {
> +			if (xdst->u.dst.dev != dev)
> +				continue;
> +			dst_dev_put(&xdst->u.dst);
> +		}
> +		spin_unlock_bh(&xl->lock);
> +	}
> +}
> +
>  bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
>  			 unsigned short family)
>  {
> @@ -1581,6 +1633,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
>  		}
>  
>  		bundle[i] = xdst;
> +		xfrm_add_dst_list(xdst);
> +
>  		if (!xdst_prev)
>  			xdst0 = xdst;
>  		else
> @@ -2984,8 +3038,22 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
>  	.exit = xfrm_net_exit,
>  };
>  
> +static int xfrm_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
> +{
> +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +
> +	if (event == NETDEV_UNREGISTER)
> +		xfrm_flush_dev(dev);
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block xfrm_netdev_notifier = {
> +	.notifier_call = xfrm_netdev_event,
> +};
> +
>  void __init xfrm_init(void)
>  {
> +	int cpu;
>  	int i;
>  
>  	xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
> @@ -2998,6 +3066,13 @@ void __init xfrm_init(void)
>  	register_pernet_subsys(&xfrm_net_ops);
>  	seqcount_init(&xfrm_policy_hash_generation);
>  	xfrm_input_init();
> +	for_each_possible_cpu(cpu) {
> +		struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
> +
> +		INIT_LIST_HEAD(&xl->head);
> +		spin_lock_init(&xl->lock);
> +	}
> +	register_netdevice_notifier(&xfrm_netdev_notifier);
>  }
>  
>  #ifdef CONFIG_AUDITSYSCALL
> -- 
> 2.7.4
> 
> ---------------------------------------------------------
> 
> This approach has the unfortunate side effects of adding a spin lock for the
> tracked list, as well as increasing struct xfrm_dst.

Reintroducing garbage collection is probably not a so good idea. I think
the patch below should fix it a bit less intrusive.


Subject: [PATCH RFC] xfrm: Fix netdev refcount leak when flushing the percpu dst cache.

The dst garbage collection code is removed, so we need to call
dst_dev_put() on cached dst entries before we release them.
Otherwise we leak the refcount to the netdev.

Signed-off-by: Steffen Klassert <steffen.klassert@...unet.com>
---
 net/xfrm/xfrm_policy.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a23078132cf..7836b7601b49 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1715,8 +1715,10 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
 static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
 {
 	this_cpu_write(xfrm_last_dst, xdst);
-	if (old)
+	if (old) {
+		dst_dev_put(&old->u.dst);
 		dst_release(&old->u.dst);
+	}
 }
 
 static void __xfrm_pcpu_work_fn(void)
@@ -1787,6 +1789,7 @@ void xfrm_policy_cache_flush(void)
 		old = per_cpu(xfrm_last_dst, cpu);
 		if (old && !xfrm_bundle_ok(old)) {
 			per_cpu(xfrm_last_dst, cpu) = NULL;
+			dst_dev_put(&old->u.dst);
 			dst_release(&old->u.dst);
 		}
 		rcu_read_unlock();
-- 
2.14.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ