lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20181009212751.17695-1-dsahern@kernel.org>
Date:   Tue,  9 Oct 2018 14:27:51 -0700
From:   David Ahern <dsahern@...nel.org>
To:     netdev@...r.kernel.org, davem@...emloft.net
Cc:     roopa@...ulusnetworks.com, David Ahern <dsahern@...il.com>
Subject: [PATCH net-next] net/ipv6: Add knob to skip DELROUTE message on device down

From: David Ahern <dsahern@...il.com>

Another difference between IPv4 and IPv6 is the generation of RTM_DELROUTE
notifications when a device is taken down (admin down) or deleted. IPv4
does not generate a message for routes evicted by the down or delete;
IPv6 does. A NOS at scale really needs to avoid these messages and have
IPv4 and IPv6 behave similarly, relying on userspace to handle link
notifications and evict the routes.

At this point existing user behavior needs to be preserved. Since
notifications are a global action (not per app) the only way to preserve
existing behavior and allow the messages to be skipped is to add a new
sysctl (net/ipv6/route/skip_notify_on_dev_down) which can be set to
disable the notifications.

IPv6 route code already supports the option to skip the message (it is
used for multipath routes for example). Besides the new sysctl we need
to pass the skip_notify setting through the generic fib6_clean and
fib6_walk functions to fib6_clean_node and to set skip_notify on calls
to __ip6_del_rt for the addrconf_ifdown path.

Signed-off-by: David Ahern <dsahern@...il.com>
---
 Documentation/networking/ip-sysctl.txt |  8 +++++++
 include/net/addrconf.h                 |  3 ++-
 include/net/ip6_fib.h                  |  3 +++
 include/net/ip6_route.h                |  1 +
 include/net/netns/ipv6.h               |  1 +
 net/ipv6/addrconf.c                    | 44 ++++++++++++++++++++++------------
 net/ipv6/anycast.c                     | 10 +++++---
 net/ipv6/ip6_fib.c                     | 20 ++++++++++++----
 net/ipv6/route.c                       | 30 ++++++++++++++++++++++-
 9 files changed, 95 insertions(+), 25 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 960de8fe3f40..163b5ff1073c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1442,6 +1442,14 @@ max_hbh_length - INTEGER
 	header.
 	Default: INT_MAX (unlimited)
 
+skip_notify_on_dev_down - BOOLEAN
+	Controls whether an RTM_DELROUTE message is generated for routes
+	removed when a device is taken down or deleted. IPv4 does not
+	generate this message; IPv6 does by default. Setting this sysctl
+	to true skips the message, making IPv4 and IPv6 on par in relying
+	on userspace caches to track link events and evict routes.
+	Default: false (generate message)
+
 IPv6 Fragmentation:
 
 ip6frag_high_thresh - INTEGER
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 6def0351bcc3..ee6292f64c86 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -306,7 +306,8 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex,
 void ipv6_sock_ac_close(struct sock *sk);
 
 int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr);
-int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr);
+int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr,
+		      bool skip_notify);
 void ipv6_ac_destroy_dev(struct inet6_dev *idev);
 bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
 			 const struct in6_addr *addr);
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index f06e968f1992..caabfd84a098 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -407,6 +407,9 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 
 void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg),
 		    void *arg);
+void fib6_clean_all_skip_notify(struct net *net,
+				int (*func)(struct fib6_info *, void *arg),
+				void *arg);
 
 int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	     struct nl_info *info, struct netlink_ext_ack *extack);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index cef186dbd2ce..7c140cb2eeb0 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -104,6 +104,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
 		  struct netlink_ext_ack *extack);
 int ip6_ins_rt(struct net *net, struct fib6_info *f6i);
 int ip6_del_rt(struct net *net, struct fib6_info *f6i);
+int ip6_del_rt_skip_notify(struct net *net, struct fib6_info *f6i);
 
 void rt6_flush_exceptions(struct fib6_info *f6i);
 void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args,
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index f0e396ab9bec..ef1ed529f33c 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -45,6 +45,7 @@ struct netns_sysctl_ipv6 {
 	int max_dst_opts_len;
 	int max_hbh_opts_len;
 	int seg6_flowlabel;
+	bool skip_notify_on_dev_down;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2496b12bf721..cf591cf66884 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -164,7 +164,7 @@ static struct workqueue_struct *addrconf_wq;
 static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work);
 
 static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
-static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp, bool skip_notify);
 
 static void addrconf_type_change(struct net_device *dev,
 				 unsigned long event);
@@ -181,7 +181,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
 				   bool send_na);
 static void addrconf_dad_run(struct inet6_dev *idev);
 static void addrconf_rs_timer(struct timer_list *t);
-static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
+static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa,
+			      bool skip_notify);
 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
 
 static void inet6_prefix_notify(int event, struct inet6_dev *idev,
@@ -779,7 +780,7 @@ static void dev_forward_change(struct inet6_dev *idev)
 		if (idev->cnf.forwarding)
 			addrconf_join_anycast(ifa);
 		else
-			addrconf_leave_anycast(ifa);
+			addrconf_leave_anycast(ifa, false);
 	}
 	inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
 				     NETCONFA_FORWARDING,
@@ -2141,7 +2142,7 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
 }
 
 /* caller must hold RTNL */
-static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp, bool skip_notify)
 {
 	struct in6_addr addr;
 
@@ -2150,7 +2151,7 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
 	ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
 	if (ipv6_addr_any(&addr))
 		return;
-	__ipv6_dev_ac_dec(ifp->idev, &addr);
+	__ipv6_dev_ac_dec(ifp->idev, &addr, skip_notify);
 }
 
 static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev)
@@ -3655,6 +3656,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 {
 	unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN;
 	struct net *net = dev_net(dev);
+	bool skip_notify = net->ipv6.sysctl.skip_notify_on_dev_down;
 	struct inet6_dev *idev;
 	struct inet6_ifaddr *ifa, *tmp;
 	bool keep_addr = false;
@@ -3772,15 +3774,19 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 		spin_unlock_bh(&ifa->lock);
 
-		if (rt)
-			ip6_del_rt(net, rt);
+		if (rt) {
+			if (skip_notify)
+				ip6_del_rt_skip_notify(net, rt);
+			else
+				ip6_del_rt(net, rt);
+		}
 
 		if (state != INET6_IFADDR_STATE_DEAD) {
-			__ipv6_ifa_notify(RTM_DELADDR, ifa);
+			__ipv6_ifa_notify(RTM_DELADDR, ifa, skip_notify);
 			inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
 		} else {
 			if (idev->cnf.forwarding)
-				addrconf_leave_anycast(ifa);
+				addrconf_leave_anycast(ifa, skip_notify);
 			addrconf_leave_solict(ifa->idev, &ifa->addr);
 		}
 
@@ -5830,7 +5836,8 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
 		rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
 }
 
-static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
+static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp,
+			      bool skip_notify)
 {
 	struct net *net = dev_net(ifp->idev->dev);
 
@@ -5858,18 +5865,25 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 		break;
 	case RTM_DELADDR:
 		if (ifp->idev->cnf.forwarding)
-			addrconf_leave_anycast(ifp);
+			addrconf_leave_anycast(ifp, skip_notify);
 		addrconf_leave_solict(ifp->idev, &ifp->addr);
 		if (!ipv6_addr_any(&ifp->peer_addr)) {
 			struct fib6_info *rt;
 
 			rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
 						       ifp->idev->dev, 0, 0);
-			if (rt)
-				ip6_del_rt(net, rt);
+			if (rt) {
+				if (skip_notify)
+					ip6_del_rt_skip_notify(net, rt);
+				else
+					ip6_del_rt(net, rt);
+			}
 		}
 		if (ifp->rt) {
-			ip6_del_rt(net, ifp->rt);
+			if (skip_notify)
+				ip6_del_rt_skip_notify(net, ifp->rt);
+			else
+				ip6_del_rt(net, ifp->rt);
 			ifp->rt = NULL;
 		}
 		rt_genid_bump_ipv6(net);
@@ -5882,7 +5896,7 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 {
 	rcu_read_lock_bh();
 	if (likely(ifp->idev->dead == 0))
-		__ipv6_ifa_notify(event, ifp);
+		__ipv6_ifa_notify(event, ifp, false);
 	rcu_read_unlock_bh();
 }
 
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..762fb1de58a8 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -299,7 +299,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 /*
  *	device anycast group decrement
  */
-int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
+int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr,
+		      bool skip_notify)
 {
 	struct ifacaddr6 *aca, *prev_aca;
 
@@ -327,7 +328,10 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
 	write_unlock_bh(&idev->lock);
 	addrconf_leave_solict(idev, &aca->aca_addr);
 
-	ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
+	if (skip_notify)
+		ip6_del_rt_skip_notify(dev_net(idev->dev), aca->aca_rt);
+	else
+		ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
 
 	aca_put(aca);
 	return 0;
@@ -340,7 +344,7 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr)
 
 	if (!idev)
 		return -ENODEV;
-	return __ipv6_dev_ac_dec(idev, addr);
+	return __ipv6_dev_ac_dec(idev, addr, false);
 }
 
 void ipv6_ac_destroy_dev(struct inet6_dev *idev)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e14d244c551f..9ba72d94d60f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -47,6 +47,7 @@ struct fib6_cleaner {
 	int (*func)(struct fib6_info *, void *arg);
 	int sernum;
 	void *arg;
+	bool skip_notify;
 };
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1956,6 +1957,7 @@ static int fib6_clean_node(struct fib6_walker *w)
 	struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
 	struct nl_info info = {
 		.nl_net = c->net,
+		.skip_notify = c->skip_notify,
 	};
 
 	if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
@@ -2007,7 +2009,7 @@ static int fib6_clean_node(struct fib6_walker *w)
 
 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 			    int (*func)(struct fib6_info *, void *arg),
-			    int sernum, void *arg)
+			    int sernum, void *arg, bool skip_notify)
 {
 	struct fib6_cleaner c;
 
@@ -2019,13 +2021,14 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 	c.sernum = sernum;
 	c.arg = arg;
 	c.net = net;
+	c.skip_notify = skip_notify;
 
 	fib6_walk(net, &c.w);
 }
 
 static void __fib6_clean_all(struct net *net,
 			     int (*func)(struct fib6_info *, void *),
-			     int sernum, void *arg)
+			     int sernum, void *arg, bool skip_notify)
 {
 	struct fib6_table *table;
 	struct hlist_head *head;
@@ -2037,7 +2040,7 @@ static void __fib6_clean_all(struct net *net,
 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
 			spin_lock_bh(&table->tb6_lock);
 			fib6_clean_tree(net, &table->tb6_root,
-					func, sernum, arg);
+					func, sernum, arg, skip_notify);
 			spin_unlock_bh(&table->tb6_lock);
 		}
 	}
@@ -2047,14 +2050,21 @@ static void __fib6_clean_all(struct net *net,
 void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
 		    void *arg)
 {
-	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
+	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
+}
+
+void fib6_clean_all_skip_notify(struct net *net,
+				int (*func)(struct fib6_info *, void *),
+				void *arg)
+{
+	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
 }
 
 static void fib6_flush_trees(struct net *net)
 {
 	int new_sernum = fib6_new_sernum(net);
 
-	__fib6_clean_all(net, NULL, new_sernum, NULL);
+	__fib6_clean_all(net, NULL, new_sernum, NULL, false);
 }
 
 /*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7c38e0e058ae..de161808c540 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3143,6 +3143,16 @@ static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
 	return err;
 }
 
+int ip6_del_rt_skip_notify(struct net *net, struct fib6_info *rt)
+{
+	struct nl_info info = {
+		.nl_net = net,
+		.skip_notify = true,
+	};
+
+	return __ip6_del_rt(rt, &info);
+}
+
 int ip6_del_rt(struct net *net, struct fib6_info *rt)
 {
 	struct nl_info info = { .nl_net = net };
@@ -4026,8 +4036,12 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
 			.event = event,
 		},
 	};
+	struct net *net = dev_net(dev);
 
-	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
+	if (net->ipv6.sysctl.skip_notify_on_dev_down)
+		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
+	else
+		fib6_clean_all(net, fib6_ifdown, &arg);
 }
 
 void rt6_disable_ip(struct net_device *dev, unsigned long event)
@@ -5031,6 +5045,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 	return 0;
 }
 
+static int zero;
+static int one = 1;
+
 struct ctl_table ipv6_route_table_template[] = {
 	{
 		.procname	=	"flush",
@@ -5102,6 +5119,15 @@ struct ctl_table ipv6_route_table_template[] = {
 		.mode		=	0644,
 		.proc_handler	=	proc_dointvec_ms_jiffies,
 	},
+	{
+		.procname	=	"skip_notify_on_dev_down",
+		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec,
+		.extra1		=	&zero,
+		.extra2		=	&one,
+	},
 	{ }
 };
 
@@ -5125,6 +5151,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
@@ -5189,6 +5216,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
 
 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
 
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ