[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALx6S37NF6rqbAeu=9o8nFFT3Z88=OqcMPsm0rc9-vEPGTCYbw@mail.gmail.com>
Date: Tue, 29 Nov 2016 20:04:42 -0800
From: Tom Herbert <tom@...bertland.com>
To: David Lebrun <david.lebrun@...ouvain.be>
Cc: Linux Kernel Network Developers <netdev@...r.kernel.org>
Subject: Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for
equal-cost multipath routing
On Tue, Nov 29, 2016 at 9:15 AM, David Lebrun <david.lebrun@...ouvain.be> wrote:
> When multiple nexthops are available for a given route, the routing engine
> chooses a nexthop by computing the flow hash through get_hash_from_flowi6
> and by taking that value modulo the number of nexthops. The resulting value
> indexes the nexthop to select. This method causes issues when a new nexthop
> is added or one is removed (e.g. link failure). In that case, the number
> of nexthops changes and potentially all the flows get re-routed to another
> nexthop.
>
This is a lot of code to make ECMP work better. Can you be more
specific as to what the "issues" are? Assuming this is just the
transient packet reorder that happens in one link flap I am wondering
if this complexity is justified.
Tom
> This patch implements a consistent hash method to select the nexthop in
> case of ECMP. The idea is to generate K slices (or intervals) for each
> route with multiple nexthops. The nexthops are randomly assigned to those
> slices, in a uniform manner. The number K is configurable through a sysctl
> net.ipv6.route.ecmp_slices and is always an exponent of 2. To select the
> nexthop, the algorithm takes the flow hash and computes an index which is
> the flow hash modulo K. As K = 2^x, the modulo can be computed using a
> simple binary AND operation (idx = hash & (K - 1)). The resulting index
> references the selected nexthop. The lookup time complexity is thus O(1).
>
> When a nexthop is added, it steals K/N slices from the other nexthops,
> where N is the new number of nexthops. The slices are stolen randomly and
> uniformly from the other nexthops. When a nexthop is removed, the orphan
> slices are randomly reassigned to the other nexthops.
>
> The number of slices for a route also fixes the maximum number of nexthops
> possible for that route.
>
> Signed-off-by: David Lebrun <david.lebrun@...ouvain.be>
> ---
> include/net/ip6_fib.h | 25 +++++++
> include/net/netns/ipv6.h | 1 +
> net/ipv6/ip6_fib.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++-
> net/ipv6/route.c | 58 ++++++++--------
> 4 files changed, 229 insertions(+), 29 deletions(-)
>
> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> index a74e2aa..29594cc 100644
> --- a/include/net/ip6_fib.h
> +++ b/include/net/ip6_fib.h
> @@ -93,6 +93,30 @@ struct rt6key {
>
> struct fib6_table;
>
> +/* Multipath nexthop.
> + * @nh is the actual nexthop
> + * @nslices is the number of slices assigned to this nexthop
> + */
> +struct rt6_multi_nh {
> + struct rt6_info *nh;
> + unsigned int nslices;
> +};
> +
> +/* Multipath routes map.
> + * @nhs is an array of available nexthops
> + * @size is the number of elements in @nhs
> + * @nslices is the number of slices and the number of elements in @index
> + * and is always in the form 2^x to prevent using a division for
> + * the computation of the modulo.
> + * @index is an array mapping a slice index to a nexthop index in @nhs
> + */
> +struct rt6_multi_map {
> + struct rt6_multi_nh *nhs;
> + unsigned int size;
> + unsigned int nslices;
> + __u8 *index;
> +};
> +
> struct rt6_info {
> struct dst_entry dst;
>
> @@ -113,6 +137,7 @@ struct rt6_info {
> */
> struct list_head rt6i_siblings;
> unsigned int rt6i_nsiblings;
> + struct rt6_multi_map *rt6i_nh_map;
>
> atomic_t rt6i_ref;
>
> diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
> index de7745e..4967846 100644
> --- a/include/net/netns/ipv6.h
> +++ b/include/net/netns/ipv6.h
> @@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 {
> int idgen_retries;
> int idgen_delay;
> int flowlabel_state_ranges;
> + int ip6_rt_ecmp_slices;
> };
>
> struct netns_ipv6 {
> diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
> index ef54852..b6b1895 100644
> --- a/net/ipv6/ip6_fib.c
> +++ b/net/ipv6/ip6_fib.c
> @@ -727,6 +727,166 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
> }
> }
>
> +static void fib6_mp_free(struct rt6_info *rt)
> +{
> + struct rt6_multi_map *nh_map = rt->rt6i_nh_map;
> + struct rt6_info *sibling;
> +
> + if (nh_map) {
> + list_for_each_entry(sibling, &rt->rt6i_siblings,
> + rt6i_siblings) {
> + sibling->rt6i_nh_map = NULL;
> + }
> +
> + rt->rt6i_nh_map = NULL;
> +
> + kfree(nh_map->nhs);
> + kfree(nh_map->index);
> + kfree(nh_map);
> + }
> +}
> +
> +static int fib6_mp_extend(struct net *net, struct rt6_info *sref,
> + struct rt6_info *rt)
> +{
> + struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
> + struct rt6_multi_nh *tmp_nhs, *old_nhs;
> + unsigned int kslices;
> + unsigned int i, j;
> +
> + if (!nh_map) {
> + __u8 *index;
> +
> + nh_map = kmalloc(sizeof(*nh_map), GFP_ATOMIC);
> + if (!nh_map)
> + return -ENOMEM;
> +
> + nh_map->size = 1;
> + nh_map->nslices = (1 << net->ipv6.sysctl.ip6_rt_ecmp_slices);
> +
> + tmp_nhs = kmalloc(sizeof(*tmp_nhs), GFP_ATOMIC);
> + if (!tmp_nhs) {
> + kfree(nh_map);
> + return -ENOMEM;
> + }
> +
> + tmp_nhs[0].nh = sref;
> + tmp_nhs[0].nslices = nh_map->nslices;
> + nh_map->nhs = tmp_nhs;
> +
> + index = kmalloc_array(nh_map->nslices, sizeof(*index),
> + GFP_ATOMIC);
> + if (!index) {
> + kfree(nh_map->nhs);
> + kfree(nh_map);
> + return -ENOMEM;
> + }
> +
> + for (i = 0; i < nh_map->nslices; i++)
> + index[i] = 0;
> +
> + nh_map->index = index;
> + sref->rt6i_nh_map = nh_map;
> + }
> +
> + if (nh_map->size == nh_map->nslices)
> + return -ENOBUFS;
> +
> + nh_map->size++;
> +
> + tmp_nhs = kmalloc_array(nh_map->size, sizeof(*tmp_nhs), GFP_ATOMIC);
> + if (!tmp_nhs)
> + return -ENOMEM;
> +
> + for (i = 0; i < nh_map->size - 1; i++)
> + tmp_nhs[i] = nh_map->nhs[i];
> +
> + tmp_nhs[nh_map->size - 1].nh = rt;
> + tmp_nhs[nh_map->size - 1].nslices = 0;
> +
> + kslices = nh_map->nslices / nh_map->size;
> +
> + /* Loop over nexthops and steal a random slice until we have kslices for
> + * the new nexthop. This algorithm ensures that flows are taken as
> + * equally as possible from current nexthops.
> + *
> + * At each iteration, @j is the index in tmp_nhs of the nexthop
> + * to steal from and @idx is the index of the slice to steal
> + * among @j's slices.
> + */
> + for (i = 0, j = 0; i < kslices; i++) {
> + unsigned int idx, k = 0;
> +
> + if (tmp_nhs[j].nslices == 1)
> + continue;
> +
> + idx = (prandom_u32() % tmp_nhs[j].nslices) + 1;
> + do {
> + if (nh_map->index[k++] == j)
> + idx--;
> + } while (idx);
> +
> + nh_map->index[k - 1] = nh_map->size - 1;
> + tmp_nhs[nh_map->size - 1].nslices++;
> + tmp_nhs[j].nslices--;
> +
> + j = (j + 1) % (nh_map->size - 1);
> + }
> +
> + WARN_ON(tmp_nhs[nh_map->size - 1].nslices != kslices);
> +
> + old_nhs = nh_map->nhs;
> + nh_map->nhs = tmp_nhs;
> + kfree(old_nhs);
> +
> + rt->rt6i_nh_map = nh_map;
> +
> + return 0;
> +}
> +
> +static int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt)
> +{
> + struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
> + struct rt6_multi_nh *tmp_nhs, *old_nhs;
> + unsigned int i, j, idx = 0;
> +
> + tmp_nhs = kmalloc_array(nh_map->size - 1, sizeof(*tmp_nhs), GFP_ATOMIC);
> + if (!tmp_nhs)
> + return -ENOMEM;
> +
> + for (i = 0, j = 0; i < nh_map->size; i++) {
> + if (nh_map->nhs[i].nh != rt)
> + tmp_nhs[j++] = nh_map->nhs[i];
> + else
> + idx = i;
> + }
> +
> + nh_map->size--;
> + old_nhs = nh_map->nhs;
> + nh_map->nhs = tmp_nhs;
> + kfree(old_nhs);
> +
> + rt->rt6i_nh_map = NULL;
> +
> + /* Update the slices index. For each slice mapping to the removed
> + * nexthop, assign another random nexthop. For each slice mapping
> + * to a nexthop that was after the removed nexthop, decrement the
> + * nexthop index to reflect the shift. Nexthops that were before
> + * the removed nexthop are unaffected.
> + */
> + for (i = 0; i < nh_map->nslices; i++) {
> + if (nh_map->index[i] == idx) {
> + j = prandom_u32() % nh_map->size;
> + nh_map->index[i] = j;
> + nh_map->nhs[j].nslices++;
> + } else if (nh_map->index[i] > idx) {
> + nh_map->index[i]--;
> + }
> + }
> +
> + return 0;
> +}
> +
> /*
> * Insert routing information in a node.
> */
> @@ -837,6 +997,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
> }
> sibling = sibling->dst.rt6_next;
> }
> + err = fib6_mp_extend(info->nl_net, sibling, rt);
> + if (err < 0)
> + return err;
> /* For each sibling in the list, increment the counter of
> * siblings. BUG() if counters does not match, list of siblings
> * is broken!
> @@ -900,6 +1063,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
> fn->fn_flags |= RTN_RTINFO;
> }
> nsiblings = iter->rt6i_nsiblings;
> + if (nsiblings)
> + fib6_mp_free(iter);
> fib6_purge_rt(iter, fn, info->nl_net);
> rt6_release(iter);
>
> @@ -1407,13 +1572,20 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
>
> /* Remove this entry from other siblings */
> if (rt->rt6i_nsiblings) {
> - struct rt6_info *sibling, *next_sibling;
> + struct rt6_info *sibling, *next_sibling, *sref;
> +
> + sref = list_first_entry(&rt->rt6i_siblings, struct rt6_info,
> + rt6i_siblings);
>
> list_for_each_entry_safe(sibling, next_sibling,
> &rt->rt6i_siblings, rt6i_siblings)
> sibling->rt6i_nsiblings--;
> rt->rt6i_nsiblings = 0;
> list_del_init(&rt->rt6i_siblings);
> + if (!sref->rt6i_nsiblings)
> + fib6_mp_free(sref);
> + else
> + fib6_mp_shrink(sref, rt);
> }
>
> /* Adjust walkers */
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index b317bb1..e9f13e0 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -427,39 +427,27 @@ static bool rt6_check_expired(const struct rt6_info *rt)
> return false;
> }
>
> -/* Multipath route selection:
> - * Hash based function using packet header and flowlabel.
> - * Adapted from fib_info_hashfn()
> - */
> -static int rt6_info_hash_nhsfn(unsigned int candidate_count,
> - const struct flowi6 *fl6)
> -{
> - return get_hash_from_flowi6(fl6) % candidate_count;
> -}
> -
> static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
> struct flowi6 *fl6, int oif,
> int strict)
> {
> - struct rt6_info *sibling, *next_sibling;
> - int route_choosen;
> + struct rt6_multi_map *nh_map = match->rt6i_nh_map;
> + __u32 hash = get_hash_from_flowi6(fl6);
> + unsigned int slice, idx;
> + struct rt6_info *res;
>
> - route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
> - /* Don't change the route, if route_choosen == 0
> - * (siblings does not include ourself)
> - */
> - if (route_choosen)
> - list_for_each_entry_safe(sibling, next_sibling,
> - &match->rt6i_siblings, rt6i_siblings) {
> - route_choosen--;
> - if (route_choosen == 0) {
> - if (rt6_score_route(sibling, oif, strict) < 0)
> - break;
> - match = sibling;
> - break;
> - }
> - }
> - return match;
> + WARN_ON(!nh_map);
> + if (!nh_map)
> + return match;
> +
> + slice = hash & (nh_map->nslices - 1);
> + idx = nh_map->index[slice];
> + res = nh_map->nhs[idx].nh;
> +
> + if (rt6_score_route(res, oif, strict) < 0)
> + res = match;
> +
> + return res;
> }
>
> /*
> @@ -3550,6 +3538,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
> return 0;
> }
>
> +static int one = 1;
> +static int thirtyone = 31;
> +
> struct ctl_table ipv6_route_table_template[] = {
> {
> .procname = "flush",
> @@ -3621,6 +3612,15 @@ struct ctl_table ipv6_route_table_template[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec_ms_jiffies,
> },
> + {
> + .procname = "ecmp_slices",
> + .data = &init_net.ipv6.sysctl.ip6_rt_ecmp_slices,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec_minmax,
> + .extra1 = &one,
> + .extra2 = &thirtyone,
> + },
> { }
> };
>
> @@ -3644,6 +3644,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
> table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
> table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
> table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
> + table[10].data = &net->ipv6.sysctl.ip6_rt_ecmp_slices;
>
> /* Don't export sysctls to unprivileged users */
> if (net->user_ns != &init_user_ns)
> @@ -3707,6 +3708,7 @@ static int __net_init ip6_route_net_init(struct net *net)
> net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
> net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
> net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
> + net->ipv6.sysctl.ip6_rt_ecmp_slices = 5;
>
> net->ipv6.ip6_rt_gc_expire = 30*HZ;
>
> --
> 2.7.3
>
Powered by blists - more mailing lists