netdev - Re: [PATCH v4 net-next 1/2] ipv4: L3 hash-based multipath

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 24 Sep 2015 01:38:17 +0200
From:	Peter Nørlund <pch@...bogen.com>
To:	Tom Herbert <tom@...bertland.com>
Cc:	Linux Kernel Network Developers <netdev@...r.kernel.org>,
	"David S. Miller" <davem@...emloft.net>,
	Alexey Kuznetsov <kuznet@....inr.ac.ru>,
	James Morris <jmorris@...ei.org>,
	Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
	Patrick McHardy <kaber@...sh.net>
Subject: Re: [PATCH v4 net-next 1/2] ipv4: L3 hash-based multipath

On Wed, 23 Sep 2015 16:12:42 -0700
Tom Herbert <tom@...bertland.com> wrote:

> On Wed, Sep 23, 2015 at 2:43 PM, Peter Nørlund <pch@...bogen.com>
> wrote:
> > On Wed, 23 Sep 2015 14:00:43 -0700
> > Tom Herbert <tom@...bertland.com> wrote:
> >
> >> On Wed, Sep 23, 2015 at 12:49 PM, Peter Nørlund <pch@...bogen.com>
> >> wrote:
> >> > Replaces the per-packet multipath with a hash-based multipath
> >> > using source and destination address.
> >> >
> >> It's good that round robin is going away, but this still looks very
> >> different with how multipath routing is done done in IPv6
> >> (rt6_multipath_select and rt6_info_hash_nhsfn). For instance IPv4
> >> hashes addresses, but IPv6 includes ports. How can we rectify this?
> >>
> >> > Signed-off-by: Peter Nørlund <pch@...bogen.com>
> >> > ---
> >> >  include/net/ip_fib.h     |   14 ++++-
> >> >  net/ipv4/fib_semantics.c |  140
> >> > +++++++++++++++++++++++++---------------------
> >> > net/ipv4/route.c         |   16 ++++-- 3 files changed, 98
> >> > insertions(+), 72 deletions(-)
> >> >
> >> > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> >> > index a37d043..c454203 100644
> >> > --- a/include/net/ip_fib.h
> >> > +++ b/include/net/ip_fib.h
> >> > @@ -79,7 +79,7 @@ struct fib_nh {
> >> >         unsigned char           nh_scope;
> >> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> >> >         int                     nh_weight;
> >> > -       int                     nh_power;
> >> > +       atomic_t                nh_upper_bound;
> >> >  #endif
> >> >  #ifdef CONFIG_IP_ROUTE_CLASSID
> >> >         __u32                   nh_tclassid;
> >> > @@ -118,7 +118,7 @@ struct fib_info {
> >> >  #define fib_advmss fib_metrics[RTAX_ADVMSS-1]
> >> >         int                     fib_nhs;
> >> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> >> > -       int                     fib_power;
> >> > +       int                     fib_weight;
> >> >  #endif
> >> >         struct rcu_head         rcu;
> >> >         struct fib_nh           fib_nh[0];
> >> > @@ -312,7 +312,15 @@ int ip_fib_check_default(__be32 gw, struct
> >> > net_device *dev); int fib_sync_down_dev(struct net_device *dev,
> >> > unsigned long event); int fib_sync_down_addr(struct net *net,
> >> > __be32 local); int fib_sync_up(struct net_device *dev, unsigned
> >> > int nh_flags); -void fib_select_multipath(struct fib_result
> >> > *res); +
> >> > +extern u32 fib_multipath_secret __read_mostly;
> >> > +
> >> > +static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
> >> > +{
> >> > +       return jhash_2words(saddr, daddr, fib_multipath_secret)
> >> > >> 1;
> >>
> >> Why the >> 1?
> >
> > Each nexthop is assigned a range within the hash key space with the
> > upper bound specified in nh_upper_bound, but I needed a way to show
> > that a nexthop was dead within that same atomic variable. I chose -1
> > reducing the actual hash to 31 bits.  I shift the hash because I
> > expect it to be slightly faster than AND'ing with 0x7FFFFFFF on
> > most platforms.
> >
> >>
> >> > +}
> >> > +
> >> > +void fib_select_multipath(struct fib_result *res, int hash);
> >> >
> >> >  /* Exported by fib_trie.c */
> >> >  void fib_trie_init(void);
> >> > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> >> > index 064bd3c..0c49d2f 100644
> >> > --- a/net/ipv4/fib_semantics.c
> >> > +++ b/net/ipv4/fib_semantics.c
> >> > @@ -57,8 +57,7 @@ static unsigned int fib_info_cnt;
> >> >  static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
> >> >
> >> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> >> > -
> >> > -static DEFINE_SPINLOCK(fib_multipath_lock);
> >> > +u32 fib_multipath_secret __read_mostly;
> >> >
> >> >  #define for_nexthops(fi)
> >> > {                                             \ int nhsel; const
> >> > struct fib_nh *nh;                             \ @@ -532,7
> >> > +531,67 @@ errout: return ret;
> >> >  }
> >> >
> >> > -#endif
> >> > +static void fib_rebalance(struct fib_info *fi)
> >> > +{
> >> > +       int total;
> >> > +       int w;
> >> > +       struct in_device *in_dev;
> >> > +
> >> > +       if (fi->fib_nhs < 2)
> >> > +               return;
> >> > +
> >> > +       total = 0;
> >> > +       for_nexthops(fi) {
> >> > +               if (nh->nh_flags & RTNH_F_DEAD)
> >> > +                       continue;
> >> > +
> >> > +               in_dev = __in_dev_get_rcu(nh->nh_dev);
> >> > +
> >> > +               if (in_dev &&
> >> > +                   IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
> >> > +                   nh->nh_flags & RTNH_F_LINKDOWN)
> >> > +                       continue;
> >> > +
> >> > +               total += nh->nh_weight;
> >> > +       } endfor_nexthops(fi);
> >> > +
> >> > +       w = 0;
> >> > +       change_nexthops(fi) {
> >> > +               int upper_bound;
> >> > +
> >> > +               in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev);
> >> > +
> >> > +               if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
> >> > +                       upper_bound = -1;
> >> > +               } else if (in_dev &&
> >> > +
> >> > IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
> >> > +                          nexthop_nh->nh_flags &
> >> > RTNH_F_LINKDOWN) {
> >> > +                       upper_bound = -1;
> >> > +               } else {
> >> > +                       w += nexthop_nh->nh_weight;
> >> > +                       upper_bound =
> >> > DIV_ROUND_CLOSEST(2147483648LL * w,
> >> > +                                                       total) -
> >> > 1;
> >> > +               }
> >> > +
> >>
> >> Is this complexity (aside from recomputing the key) needed because
> >> you want consistent hashing for the anycast case?
> >
> > It is an attempt to reduce the disruption when a path is added or
> > removed in a way recommended by RFC 2992. It is not specific to
> > anycast at all, although disruptions are more severe for anycast
> > than for ordinary TCP which might just slow down due to out of
> > order packets.
> >
> RFC2992 looks more like a mathematical proof than a practical
> algorithm :-) Anyway, the requirement to avoid OOO with TCP is so
> often over blown. TCP does not keel over dead with OOO packets, it
> just may experience a blip of being suboptimal. As long as we're not
> thrashing paths things should be fine quickly, and if we are thrashing
> links rather each change affects 1/2 or 1/4 of connections probably
> doesn't matter. IMO no one should complain about just doing simple
> modulo on the hash.
> 
> However, if you're saying that adding a removing a path can cause
> complete loss of connectivity for 1/2 of connections using anycast
> that is a little more worrisome (this is essentially the same problem
> we have with SO_REUSEPORT and TCP). In that case seems like you'd want
> the 1/N solution.
> 
> Tom
> 

I guess it depends on how you use anycast. If you use TCP and each path
would hit different servers which doesn't share state at all, things
would break. If you're having a pair of load-balancers behind the
router which synchronizes with each other (like IPVS), then only the
very new connections would fail (the connections which haven't been
synchronized yet).

If you only have two paths, then you can't get any better than 1/2 no
matter what algorithm you choose (all flows on the bad path will have
to move, which may cause issues depending on the setup).

It is actually still better than IPVS in an active/backup scenario,
since a crashing load balancer might cause issues for ALL flows.

I guess we are getting closer and closer to L4 with modulo after all -
at least it is simpler.

Best regards,
 Peter Nørlund

> > Traditionally people used modulo to calculate the path from the
> > hash, but with the modulo approach 1/2 of all flows would change
> > paths when a path is added or removed. With the hash threshold
> > approach, somewhere between 1/4 and 1/2 of all flows are affected.
> > So an improvement for routes with more than two paths.
> >
> > Ideally you would have a hash-to-port mapping table or similar,
> > ensuring that only 1/N of all flows were affected (with N being the
> > number of paths).
> >
> >>
> >> > +               atomic_set(&nexthop_nh->nh_upper_bound,
> >> > upper_bound);
> >> > +       } endfor_nexthops(fi);
> >> > +
> >> > +       net_get_random_once(&fib_multipath_secret,
> >> > +                           sizeof(fib_multipath_secret));
> >> > +}
> >> > +
> >> > +static inline void fib_add_weight(struct fib_info *fi,
> >> > +                                 const struct fib_nh *nh)
> >> > +{
> >> > +       fi->fib_weight += nh->nh_weight;
> >> > +}
> >> > +
> >> > +#else /* CONFIG_IP_ROUTE_MULTIPATH */
> >> > +
> >> > +#define fib_rebalance(fi) do { } while (0)
> >> > +#define fib_add_weight(fi, nh) do { } while (0)
> >> > +
> >> > +#endif /* CONFIG_IP_ROUTE_MULTIPATH */
> >> >
> >> >  static int fib_encap_match(struct net *net, u16 encap_type,
> >> >                            struct nlattr *encap,
> >> > @@ -1094,8 +1153,11 @@ struct fib_info *fib_create_info(struct
> >> > fib_config *cfg)
> >> >
> >> >         change_nexthops(fi) {
> >> >                 fib_info_update_nh_saddr(net, nexthop_nh);
> >> > +               fib_add_weight(fi, nexthop_nh);
> >> >         } endfor_nexthops(fi)
> >> >
> >> > +       fib_rebalance(fi);
> >> > +
> >> >  link_it:
> >> >         ofi = fib_find_info(fi);
> >> >         if (ofi) {
> >> > @@ -1317,12 +1379,6 @@ int fib_sync_down_dev(struct net_device
> >> > *dev, unsigned long event) nexthop_nh->nh_flags |=
> >> > RTNH_F_LINKDOWN; break;
> >> >                                 }
> >> > -#ifdef CONFIG_IP_ROUTE_MULTIPATH
> >> > -
> >> > spin_lock_bh(&fib_multipath_lock);
> >> > -                               fi->fib_power -=
> >> > nexthop_nh->nh_power;
> >> > -                               nexthop_nh->nh_power = 0;
> >> > -
> >> > spin_unlock_bh(&fib_multipath_lock); -#endif
> >> >                                 dead++;
> >> >                         }
> >> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> >> > @@ -1345,6 +1401,8 @@ int fib_sync_down_dev(struct net_device
> >> > *dev, unsigned long event) }
> >> >                         ret++;
> >> >                 }
> >> > +
> >> > +               fib_rebalance(fi);
> >> >         }
> >> >
> >> >         return ret;
> >> > @@ -1467,20 +1525,15 @@ int fib_sync_up(struct net_device *dev,
> >> > unsigned int nh_flags) !__in_dev_get_rtnl(dev))
> >> >                                 continue;
> >> >                         alive++;
> >> > -#ifdef CONFIG_IP_ROUTE_MULTIPATH
> >> > -                       spin_lock_bh(&fib_multipath_lock);
> >> > -                       nexthop_nh->nh_power = 0;
> >> > -                       nexthop_nh->nh_flags &= ~nh_flags;
> >> > -                       spin_unlock_bh(&fib_multipath_lock);
> >> > -#else
> >> >                         nexthop_nh->nh_flags &= ~nh_flags;
> >> > -#endif
> >> >                 } endfor_nexthops(fi)
> >> >
> >> >                 if (alive > 0) {
> >> >                         fi->fib_flags &= ~nh_flags;
> >> >                         ret++;
> >> >                 }
> >> > +
> >> > +               fib_rebalance(fi);
> >> >         }
> >> >
> >> >         return ret;
> >> > @@ -1488,62 +1541,19 @@ int fib_sync_up(struct net_device *dev,
> >> > unsigned int nh_flags)
> >> >
> >> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> >> >
> >> > -/*
> >> > - * The algorithm is suboptimal, but it provides really
> >> > - * fair weighted route distribution.
> >> > - */
> >> > -void fib_select_multipath(struct fib_result *res)
> >> > +void fib_select_multipath(struct fib_result *res, int hash)
> >> >  {
> >> >         struct fib_info *fi = res->fi;
> >> > -       struct in_device *in_dev;
> >> > -       int w;
> >> > -
> >> > -       spin_lock_bh(&fib_multipath_lock);
> >> > -       if (fi->fib_power <= 0) {
> >> > -               int power = 0;
> >> > -               change_nexthops(fi) {
> >> > -                       in_dev =
> >> > __in_dev_get_rcu(nexthop_nh->nh_dev);
> >> > -                       if (nexthop_nh->nh_flags & RTNH_F_DEAD)
> >> > -                               continue;
> >> > -                       if (in_dev &&
> >> > -
> >> > IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
> >> > -                           nexthop_nh->nh_flags &
> >> > RTNH_F_LINKDOWN)
> >> > -                               continue;
> >> > -                       power += nexthop_nh->nh_weight;
> >> > -                       nexthop_nh->nh_power =
> >> > nexthop_nh->nh_weight;
> >> > -               } endfor_nexthops(fi);
> >> > -               fi->fib_power = power;
> >> > -               if (power <= 0) {
> >> > -                       spin_unlock_bh(&fib_multipath_lock);
> >> > -                       /* Race condition: route has just become
> >> > dead. */
> >> > -                       res->nh_sel = 0;
> >> > -                       return;
> >> > -               }
> >> > -       }
> >> > -
> >> >
> >> > -       /* w should be random number [0..fi->fib_power-1],
> >> > -        * it is pretty bad approximation.
> >> > -        */
> >> > -
> >> > -       w = jiffies % fi->fib_power;
> >> > +       for_nexthops(fi) {
> >> > +               if (hash > atomic_read(&nh->nh_upper_bound))
> >> > +                       continue;
> >> >
> >> > -       change_nexthops(fi) {
> >> > -               if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
> >> > -                   nexthop_nh->nh_power) {
> >> > -                       w -= nexthop_nh->nh_power;
> >> > -                       if (w <= 0) {
> >> > -                               nexthop_nh->nh_power--;
> >> > -                               fi->fib_power--;
> >> > -                               res->nh_sel = nhsel;
> >> > -
> >> > spin_unlock_bh(&fib_multipath_lock);
> >> > -                               return;
> >> > -                       }
> >> > -               }
> >> > +               res->nh_sel = nhsel;
> >> > +               return;
> >> >         } endfor_nexthops(fi);
> >> >
> >> >         /* Race condition: route has just become dead. */
> >> >         res->nh_sel = 0;
> >> > -       spin_unlock_bh(&fib_multipath_lock);
> >> >  }
> >> >  #endif
> >> > diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> >> > index 80f7c5b..3dc83b6 100644
> >> > --- a/net/ipv4/route.c
> >> > +++ b/net/ipv4/route.c
> >> > @@ -1659,8 +1659,12 @@ static int ip_mkroute_input(struct sk_buff
> >> > *skb, __be32 daddr, __be32 saddr, u32 tos)
> >> >  {
> >> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> >> > -       if (res->fi && res->fi->fib_nhs > 1)
> >> > -               fib_select_multipath(res);
> >> > +       if (res->fi && res->fi->fib_nhs > 1) {
> >> > +               int h;
> >> > +
> >> > +               h = fib_multipath_hash(saddr, daddr);
> >> > +               fib_select_multipath(res, h);
> >> > +       }
> >> >  #endif
> >> >
> >> >         /* create a routing cache entry */
> >> > @@ -2190,8 +2194,12 @@ struct rtable
> >> > *__ip_route_output_key(struct net *net, struct flowi4 *fl4) }
> >> >
> >> >  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> >> > -       if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
> >> > -               fib_select_multipath(&res);
> >> > +       if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
> >> > +               int h;
> >> > +
> >> > +               h = fib_multipath_hash(fl4->saddr, fl4->daddr);
> >> > +               fib_select_multipath(&res, h);
> >> > +       }
> >> >         else
> >> >  #endif
> >> >         if (!res.prefixlen &&
> >> > --
> >> > 1.7.10.4
> >> >
> >> > --
> >> > To unsubscribe from this list: send the line "unsubscribe
> >> > netdev" in the body of a message to majordomo@...r.kernel.org
> >> > More majordomo info at
> >> > http://vger.kernel.org/majordomo-info.html
> >

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html