[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180204132118.6fef9bf0@jimi>
Date: Sun, 4 Feb 2018 13:21:18 +0200
From: Eyal Birger <eyal.birger@...il.com>
To: steffen.klassert@...unet.com
Cc: netdev@...r.kernel.org, herbert@...dor.apana.org.au,
davem@...emloft.net, shmulik@...anetworks.com
Subject: xfrm, ip tunnel: non released device reference upon device
unregistration
Hi,
We've encountered a non released device reference upon device
unregistration which seems to stem from xfrm policy code.
The setup includes:
- an underlay device (e.g. eth0) using IPv4
- an xfrm IPv6 over IPv4 tunnel routed via the underlay device
- an ipip6 tunnel over the xfrm IPv6 tunnel
When tearing down the underlay device, after traffic had passed via the ipip6
tunnel, log messages of the following form are observed:
unregister_netdevice: waiting for eth0 to become free. Usage count = 2
The below synthetic script reproduces this consistently on a fresh ubuntu vm
running net-next v4.15-6066-ge9522a5:
---------------------------------------------------------
#!/bin/bash
ipsec_underlay_dst=192.168.6.1
ipsec_underlay_src=192.168.5.2
ipv6_pfx=1234
local_ipv6_addr="$ipv6_pfx::1"
remote_ipv6_addr="$ipv6_pfx::2"
# create dummy ipsec underlay
ip l add dev dummy1 type dummy
ip l set dev dummy1 up
ip r add "$ipsec_underlay_dst/32" dev dummy1
ip -6 r add "$ipv6_pfx::/16" dev dummy1
ip a add dev dummy1 "$local_ipv6_addr/128"
ip a add dev dummy1 "$ipsec_underlay_src/24"
# add xfrm policy and state
ip x p add src "$local_ipv6_addr/128" dst "$ipv6_pfx::/16" dir out tmpl src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp reqid 1 mode tunnel
ip x s add src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp spi 0xcd440ce6 reqid 1 mode tunnel auth-trunc 'hmac(sha1)' 0x34a546d309031628962b814ef073aff1a638ad21 96 enc 'cbc(aes)' 0xf31e14149c328297fe7925ad7448420e encap espinudp 4500 4500 0.0.0.0
# add 4o6 tunnel
ip l add tnl46 type ip6tnl mode ipip6 local "$local_ipv6_addr" remote "$remote_ipv6_addr"
ip l set dev tnl46 up
ip r add 10.64.0.0/10 dev tnl46
# pass traffic so route is cached
ping -w 1 -c 1 10.64.0.1
# remove dummy underlay
ip l del dummy1
---------------------------------------------------------
Analysis:
ip6_tunnel holds a dst_cache which caches its underlay dst objects.
When devices are unregistered, non-xfrm dst objects are invlidated by their
original creators (ipv4/ipv6/...) and thus are wiped from dst_cache.
xfrm created routes otoh are not tracked by xfrm, and are not invalidated upon
device unregistration, thus hold the device upon unregistration.
The following rough sketch patch illustrates an approach overcoming this
issue:
---------------------------------------------------------
From e188dc5295e3500bc59e8780049840afa2eb3e24 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal@...anetworks.com>
Date: Sun, 4 Feb 2018 13:08:02 +0200
Subject: [PATCH] net: xfrm_policy: invalidate xfrm_dsts on device
unregistration
---
include/net/xfrm.h | 10 ++-----
net/xfrm/xfrm_policy.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 78 insertions(+), 7 deletions(-)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 7d20776..c1c8493 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -994,6 +994,8 @@ struct xfrm_dst {
u32 child_mtu_cached;
u32 route_cookie;
u32 path_cookie;
+ struct list_head xfrm_dst_gc;
+ struct xfrm_dst_list *xfrm_dst_gc_list;
};
static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
@@ -1025,13 +1027,7 @@ static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *c
xdst->child = child;
}
-static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
-{
- xfrm_pols_put(xdst->pols, xdst->num_pols);
- dst_release(xdst->route);
- if (likely(xdst->u.dst.xfrm))
- xfrm_state_put(xdst->u.dst.xfrm);
-}
+void xfrm_dst_destroy(struct xfrm_dst *xdst);
#endif
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a23078..d446641 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -94,6 +94,58 @@ __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
(fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
}
+struct xfrm_dst_list {
+ spinlock_t lock;
+ struct list_head head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct xfrm_dst_list, xfrm_dst_gc_list);
+
+static void xfrm_add_dst_list(struct xfrm_dst *xdst)
+{
+ struct xfrm_dst_list *xl = raw_cpu_ptr(&xfrm_dst_gc_list);
+
+ xdst->xfrm_dst_gc_list = xl;
+
+ spin_lock_bh(&xl->lock);
+ list_add_tail(&xdst->xfrm_dst_gc, &xl->head);
+ spin_unlock_bh(&xl->lock);
+}
+
+void xfrm_dst_destroy(struct xfrm_dst *xdst)
+{
+ xfrm_pols_put(xdst->pols, xdst->num_pols);
+ dst_release(xdst->route);
+ if (likely(xdst->u.dst.xfrm))
+ xfrm_state_put(xdst->u.dst.xfrm);
+ if (!list_empty(&xdst->xfrm_dst_gc)) {
+ struct xfrm_dst_list *xl = xdst->xfrm_dst_gc_list;
+
+ spin_lock_bh(&xl->lock);
+ list_del(&xdst->xfrm_dst_gc);
+ spin_unlock_bh(&xl->lock);
+ }
+}
+EXPORT_SYMBOL_GPL(xfrm_dst_destroy);
+
+void xfrm_flush_dev(struct net_device *dev)
+{
+ struct xfrm_dst *xdst;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
+
+ spin_lock_bh(&xl->lock);
+ list_for_each_entry(xdst, &xl->head, xfrm_dst_gc) {
+ if (xdst->u.dst.dev != dev)
+ continue;
+ dst_dev_put(&xdst->u.dst);
+ }
+ spin_unlock_bh(&xl->lock);
+ }
+}
+
bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
unsigned short family)
{
@@ -1581,6 +1633,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
}
bundle[i] = xdst;
+ xfrm_add_dst_list(xdst);
+
if (!xdst_prev)
xdst0 = xdst;
else
@@ -2984,8 +3038,22 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
.exit = xfrm_net_exit,
};
+static int xfrm_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (event == NETDEV_UNREGISTER)
+ xfrm_flush_dev(dev);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xfrm_netdev_notifier = {
+ .notifier_call = xfrm_netdev_event,
+};
+
void __init xfrm_init(void)
{
+ int cpu;
int i;
xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
@@ -2998,6 +3066,13 @@ void __init xfrm_init(void)
register_pernet_subsys(&xfrm_net_ops);
seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
+ for_each_possible_cpu(cpu) {
+ struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
+
+ INIT_LIST_HEAD(&xl->head);
+ spin_lock_init(&xl->lock);
+ }
+ register_netdevice_notifier(&xfrm_netdev_notifier);
}
#ifdef CONFIG_AUDITSYSCALL
--
2.7.4
---------------------------------------------------------
This approach has the unfortunate side effects of adding a spin lock for the
tracked list, as well as increasing struct xfrm_dst.
Any thoughts on how to best approach this within xfrm would be most welcomed.
Thanks,
Eyal.
Powered by blists - more mailing lists