lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <1480439718-18019-1-git-send-email-david.lebrun@uclouvain.be>
Date:   Tue, 29 Nov 2016 18:15:18 +0100
From:   David Lebrun <david.lebrun@...ouvain.be>
To:     <netdev@...r.kernel.org>
CC:     David Lebrun <david.lebrun@...ouvain.be>
Subject: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing

When multiple nexthops are available for a given route, the routing engine
chooses a nexthop by computing the flow hash through get_hash_from_flowi6
and by taking that value modulo the number of nexthops. The resulting value
indexes the nexthop to select. This method causes issues when a new nexthop
is added or one is removed (e.g. link failure). In that case, the number
of nexthops changes and potentially all the flows get re-routed to another
nexthop.

This patch implements a consistent hash method to select the nexthop in
case of ECMP. The idea is to generate K slices (or intervals) for each
route with multiple nexthops. The nexthops are randomly assigned to those
slices, in a uniform manner. The number K is configurable through a sysctl
net.ipv6.route.ecmp_slices and is always an exponent of 2. To select the
nexthop, the algorithm takes the flow hash and computes an index which is
the flow hash modulo K. As K = 2^x, the modulo can be computed using a
simple binary AND operation (idx = hash & (K - 1)). The resulting index
references the selected nexthop. The lookup time complexity is thus O(1).

When a nexthop is added, it steals K/N slices from the other nexthops,
where N is the new number of nexthops. The slices are stolen randomly and
uniformly from the other nexthops. When a nexthop is removed, the orphan
slices are randomly reassigned to the other nexthops.

The number of slices for a route also fixes the maximum number of nexthops
possible for that route.

Signed-off-by: David Lebrun <david.lebrun@...ouvain.be>
---
 include/net/ip6_fib.h    |  25 +++++++
 include/net/netns/ipv6.h |   1 +
 net/ipv6/ip6_fib.c       | 174 ++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/route.c         |  58 ++++++++--------
 4 files changed, 229 insertions(+), 29 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a74e2aa..29594cc 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -93,6 +93,30 @@ struct rt6key {
 
 struct fib6_table;
 
+/* Multipath nexthop.
+ * @nh is the actual nexthop
+ * @nslices is the number of slices assigned to this nexthop
+ */
+struct rt6_multi_nh {
+	struct rt6_info *nh;
+	unsigned int	nslices;
+};
+
+/* Multipath routes map.
+ * @nhs is an array of available nexthops
+ * @size is the number of elements in @nhs
+ * @nslices is the number of slices and the number of elements in @index
+ *	    and is always in the form 2^x to prevent using a division for
+ *	    the computation of the modulo.
+ * @index is an array mapping a slice index to a nexthop index in @nhs
+ */
+struct rt6_multi_map {
+	struct rt6_multi_nh	*nhs;
+	unsigned int		size;
+	unsigned int		nslices;
+	__u8			*index;
+};
+
 struct rt6_info {
 	struct dst_entry		dst;
 
@@ -113,6 +137,7 @@ struct rt6_info {
 	 */
 	struct list_head		rt6i_siblings;
 	unsigned int			rt6i_nsiblings;
+	struct rt6_multi_map		*rt6i_nh_map;
 
 	atomic_t			rt6i_ref;
 
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index de7745e..4967846 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 {
 	int idgen_retries;
 	int idgen_delay;
 	int flowlabel_state_ranges;
+	int ip6_rt_ecmp_slices;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef54852..b6b1895 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -727,6 +727,166 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
 	}
 }
 
+static void fib6_mp_free(struct rt6_info *rt)
+{
+	struct rt6_multi_map *nh_map = rt->rt6i_nh_map;
+	struct rt6_info *sibling;
+
+	if (nh_map) {
+		list_for_each_entry(sibling, &rt->rt6i_siblings,
+				    rt6i_siblings) {
+			sibling->rt6i_nh_map = NULL;
+		}
+
+		rt->rt6i_nh_map = NULL;
+
+		kfree(nh_map->nhs);
+		kfree(nh_map->index);
+		kfree(nh_map);
+	}
+}
+
+static int fib6_mp_extend(struct net *net, struct rt6_info *sref,
+			  struct rt6_info *rt)
+{
+	struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
+	struct rt6_multi_nh *tmp_nhs, *old_nhs;
+	unsigned int kslices;
+	unsigned int i, j;
+
+	if (!nh_map) {
+		__u8 *index;
+
+		nh_map = kmalloc(sizeof(*nh_map), GFP_ATOMIC);
+		if (!nh_map)
+			return -ENOMEM;
+
+		nh_map->size = 1;
+		nh_map->nslices = (1 << net->ipv6.sysctl.ip6_rt_ecmp_slices);
+
+		tmp_nhs = kmalloc(sizeof(*tmp_nhs), GFP_ATOMIC);
+		if (!tmp_nhs) {
+			kfree(nh_map);
+			return -ENOMEM;
+		}
+
+		tmp_nhs[0].nh = sref;
+		tmp_nhs[0].nslices = nh_map->nslices;
+		nh_map->nhs = tmp_nhs;
+
+		index = kmalloc_array(nh_map->nslices, sizeof(*index),
+				      GFP_ATOMIC);
+		if (!index) {
+			kfree(nh_map->nhs);
+			kfree(nh_map);
+			return -ENOMEM;
+		}
+
+		for (i = 0; i < nh_map->nslices; i++)
+			index[i] = 0;
+
+		nh_map->index = index;
+		sref->rt6i_nh_map = nh_map;
+	}
+
+	if (nh_map->size == nh_map->nslices)
+		return -ENOBUFS;
+
+	nh_map->size++;
+
+	tmp_nhs = kmalloc_array(nh_map->size, sizeof(*tmp_nhs), GFP_ATOMIC);
+	if (!tmp_nhs)
+		return -ENOMEM;
+
+	for (i = 0; i < nh_map->size - 1; i++)
+		tmp_nhs[i] = nh_map->nhs[i];
+
+	tmp_nhs[nh_map->size - 1].nh = rt;
+	tmp_nhs[nh_map->size - 1].nslices = 0;
+
+	kslices = nh_map->nslices / nh_map->size;
+
+	/* Loop over nexthops and steal a random slice until we have kslices for
+	 * the new nexthop. This algorithm ensures that flows are taken as
+	 * equally as possible from current nexthops.
+	 *
+	 * At each iteration, @j is the index in tmp_nhs of the nexthop
+	 * to steal from and @idx is the index of the slice to steal
+	 * among @j's slices.
+	 */
+	for (i = 0, j = 0; i < kslices; i++) {
+		unsigned int idx, k = 0;
+
+		if (tmp_nhs[j].nslices == 1)
+			continue;
+
+		idx = (prandom_u32() % tmp_nhs[j].nslices) + 1;
+		do {
+			if (nh_map->index[k++] == j)
+				idx--;
+		} while (idx);
+
+		nh_map->index[k - 1] = nh_map->size - 1;
+		tmp_nhs[nh_map->size - 1].nslices++;
+		tmp_nhs[j].nslices--;
+
+		j = (j + 1) % (nh_map->size - 1);
+	}
+
+	WARN_ON(tmp_nhs[nh_map->size - 1].nslices != kslices);
+
+	old_nhs = nh_map->nhs;
+	nh_map->nhs = tmp_nhs;
+	kfree(old_nhs);
+
+	rt->rt6i_nh_map = nh_map;
+
+	return 0;
+}
+
+static int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt)
+{
+	struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
+	struct rt6_multi_nh *tmp_nhs, *old_nhs;
+	unsigned int i, j, idx = 0;
+
+	tmp_nhs = kmalloc_array(nh_map->size - 1, sizeof(*tmp_nhs), GFP_ATOMIC);
+	if (!tmp_nhs)
+		return -ENOMEM;
+
+	for (i = 0, j = 0; i < nh_map->size; i++) {
+		if (nh_map->nhs[i].nh != rt)
+			tmp_nhs[j++] = nh_map->nhs[i];
+		else
+			idx = i;
+	}
+
+	nh_map->size--;
+	old_nhs = nh_map->nhs;
+	nh_map->nhs = tmp_nhs;
+	kfree(old_nhs);
+
+	rt->rt6i_nh_map = NULL;
+
+	/* Update the slices index. For each slice mapping to the removed
+	 * nexthop, assign another random nexthop. For each slice mapping
+	 * to a nexthop that was after the removed nexthop, decrement the
+	 * nexthop index to reflect the shift. Nexthops that were before
+	 * the removed nexthop are unaffected.
+	 */
+	for (i = 0; i < nh_map->nslices; i++) {
+		if (nh_map->index[i] == idx) {
+			j = prandom_u32() % nh_map->size;
+			nh_map->index[i] = j;
+			nh_map->nhs[j].nslices++;
+		} else if (nh_map->index[i] > idx) {
+			nh_map->index[i]--;
+		}
+	}
+
+	return 0;
+}
+
 /*
  *	Insert routing information in a node.
  */
@@ -837,6 +997,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			}
 			sibling = sibling->dst.rt6_next;
 		}
+		err = fib6_mp_extend(info->nl_net, sibling, rt);
+		if (err < 0)
+			return err;
 		/* For each sibling in the list, increment the counter of
 		 * siblings. BUG() if counters does not match, list of siblings
 		 * is broken!
@@ -900,6 +1063,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			fn->fn_flags |= RTN_RTINFO;
 		}
 		nsiblings = iter->rt6i_nsiblings;
+		if (nsiblings)
+			fib6_mp_free(iter);
 		fib6_purge_rt(iter, fn, info->nl_net);
 		rt6_release(iter);
 
@@ -1407,13 +1572,20 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 
 	/* Remove this entry from other siblings */
 	if (rt->rt6i_nsiblings) {
-		struct rt6_info *sibling, *next_sibling;
+		struct rt6_info *sibling, *next_sibling, *sref;
+
+		sref = list_first_entry(&rt->rt6i_siblings, struct rt6_info,
+					rt6i_siblings);
 
 		list_for_each_entry_safe(sibling, next_sibling,
 					 &rt->rt6i_siblings, rt6i_siblings)
 			sibling->rt6i_nsiblings--;
 		rt->rt6i_nsiblings = 0;
 		list_del_init(&rt->rt6i_siblings);
+		if (!sref->rt6i_nsiblings)
+			fib6_mp_free(sref);
+		else
+			fib6_mp_shrink(sref, rt);
 	}
 
 	/* Adjust walkers */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b317bb1..e9f13e0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -427,39 +427,27 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-/* Multipath route selection:
- *   Hash based function using packet header and flowlabel.
- * Adapted from fib_info_hashfn()
- */
-static int rt6_info_hash_nhsfn(unsigned int candidate_count,
-			       const struct flowi6 *fl6)
-{
-	return get_hash_from_flowi6(fl6) % candidate_count;
-}
-
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 					     struct flowi6 *fl6, int oif,
 					     int strict)
 {
-	struct rt6_info *sibling, *next_sibling;
-	int route_choosen;
+	struct rt6_multi_map *nh_map = match->rt6i_nh_map;
+	__u32 hash = get_hash_from_flowi6(fl6);
+	unsigned int slice, idx;
+	struct rt6_info *res;
 
-	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
-	/* Don't change the route, if route_choosen == 0
-	 * (siblings does not include ourself)
-	 */
-	if (route_choosen)
-		list_for_each_entry_safe(sibling, next_sibling,
-				&match->rt6i_siblings, rt6i_siblings) {
-			route_choosen--;
-			if (route_choosen == 0) {
-				if (rt6_score_route(sibling, oif, strict) < 0)
-					break;
-				match = sibling;
-				break;
-			}
-		}
-	return match;
+	WARN_ON(!nh_map);
+	if (!nh_map)
+		return match;
+
+	slice = hash & (nh_map->nslices - 1);
+	idx = nh_map->index[slice];
+	res = nh_map->nhs[idx].nh;
+
+	if (rt6_score_route(res, oif, strict) < 0)
+		res = match;
+
+	return res;
 }
 
 /*
@@ -3550,6 +3538,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
 	return 0;
 }
 
+static int one = 1;
+static int thirtyone = 31;
+
 struct ctl_table ipv6_route_table_template[] = {
 	{
 		.procname	=	"flush",
@@ -3621,6 +3612,15 @@ struct ctl_table ipv6_route_table_template[] = {
 		.mode		=	0644,
 		.proc_handler	=	proc_dointvec_ms_jiffies,
 	},
+	{
+		.procname	=	"ecmp_slices",
+		.data		=	&init_net.ipv6.sysctl.ip6_rt_ecmp_slices,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec_minmax,
+		.extra1		=	&one,
+		.extra2		=	&thirtyone,
+	},
 	{ }
 };
 
@@ -3644,6 +3644,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+		table[10].data = &net->ipv6.sysctl.ip6_rt_ecmp_slices;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
@@ -3707,6 +3708,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+	net->ipv6.sysctl.ip6_rt_ecmp_slices = 5;
 
 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
 
-- 
2.7.3

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ