lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Tue, 28 Jul 2015 21:20:02 +0000
From:	Richard Laing <Richard.Laing@...iedtelesis.co.nz>
To:	Tom Herbert <tom@...bertland.com>
CC:	"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
	"jmorris@...ei.org" <jmorris@...ei.org>
Subject: Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP



On 07/29/2015 09:02 AM, Tom Herbert wrote:
> On Mon, Jul 27, 2015 at 7:27 PM, Richard Laing
> <Richard.Laing@...iedtelesis.co.nz> wrote:
>> From: Richard Laing <richard.laing@...iedtelesis.co.nz>
>>
>> Enable flow-based ECMP.
>>
>> Currently, if equal-cost multipath is enabled, the kernel chooses between
>> equal-cost paths for each matching packet; essentially, packets are
>> round-robined between the routes. This means that packets from a single
>> flow can traverse different routes. If one of the routes experiences
>> congestion, this can result in delayed or out-of-order packets arriving
>> at the destination.
>>
> Richard, someone was complaining to me just last week about the
> weakness of the round robin algorithm. Thanks for looking into this!
>
>> This patch allows packets to be routed based on their
>> flow - packets in the same flow will always use the same route. This
>> prevents out-of-order packets. There are other issues with
>> round-robin-based ECMP routing related to variable path MTU handling
>> and debugging. See RFC 2991 for more details on the problems associated
>> with packet-based ECMP routing.
>>
>> This patch relies on the skb hash value to select between routes. The
>> selection uses a hash-threshold algorithm (see RFC2992).
>>
>> Signed-off-by: Richard Laing <richard.laing@...iedtelesis.co.nz>
>> ---
>>
>>    include/net/flow.h       |   14 ++++++++++++++
>>    include/net/ip_fib.h     |    9 +++++++++
>>    include/net/route.h      |    4 ++++
>>    net/ipv4/Kconfig         |    9 +++++++++
>>    net/ipv4/fib_semantics.c |   38 ++++++++++++++++++++++++++++++++++++++
>>    net/ipv4/route.c         |   14 +++++++++++++-
>>    6 files changed, 87 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/net/flow.h b/include/net/flow.h
>> index 8109a15..d1d933d 100644
>> --- a/include/net/flow.h
>> +++ b/include/net/flow.h
>> @@ -79,6 +79,10 @@ struct flowi4 {
>>    #define fl4_ipsec_spi        uli.spi
>>    #define fl4_mh_type        uli.mht.type
>>    #define fl4_gre_key        uli.gre_key
>> +
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> Why bother making this a CONFIG, round robin is a miserable algorithm
> anyway and nearly all the other packet steering mechanisms already use
> a hash.

Fair enough, I will look at making it a sysctl option. I guess the 
default can be the current behaviour.
>
>> +    __u32    flowi4_hash;
>> +#endif
>>    } __attribute__((__aligned__(BITS_PER_LONG/8)));
>>
>>    static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
>> @@ -99,6 +103,9 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
>>        fl4->saddr = saddr;
>>        fl4->fl4_dport = dport;
>>        fl4->fl4_sport = sport;
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    fl4->flowi4_hash = 0;
>> +#endif
>>    }
>>
>>    /* Reset some input parameters after previous lookup */
>> @@ -182,6 +189,13 @@ static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn)
>>        return container_of(fldn, struct flowi, u.dn);
>>    }
>>
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +static inline void flowi4_set_flow_hash(struct flowi4 *fl, __u32 hash)
>> +{
>> +    fl->flowi4_hash = hash;
>> +}
>> +#endif
>> +
>>    typedef unsigned long flow_compare_t;
>>
>>    static inline size_t flow_key_size(u16 family)
>> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
>> index 5fa643b..c841698 100644
>> --- a/include/net/ip_fib.h
>> +++ b/include/net/ip_fib.h
>> @@ -117,6 +117,10 @@ struct fib_info {
>>    #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>        int            fib_power;
>>    #endif
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    /* Cache the number of live nexthops for flow based ECMP calculation. */
>> +    int            live_nexthops;
>> +#endif
>>        struct rcu_head        rcu;
>>        struct fib_nh        fib_nh[0];
>>    #define fib_dev        fib_nh[0].nh_dev
>> @@ -311,6 +315,11 @@ int fib_sync_down_addr(struct net *net, __be32 local);
>>    int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>>    void fib_select_multipath(struct fib_result *res);
>>
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +void fib_select_multipath_for_flow(struct fib_result *res,
>> +                   const struct flowi4 *fl4);
>> +#endif
>> +
>>    /* Exported by fib_trie.c */
>>    void fib_trie_init(void);
>>    struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
>> diff --git a/include/net/route.h b/include/net/route.h
>> index fe22d03..fdbbe7f 100644
>> --- a/include/net/route.h
>> +++ b/include/net/route.h
>> @@ -252,6 +252,10 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
>>
>>        flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
>>                   protocol, flow_flags, dst, src, dport, sport);
>> +
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    flowi4_set_flow_hash(fl4, sk->sk_hash);
> Should be sk_txhash I think.

I will check, it looks likely!
>
>> +#endif
>>    }
>>
>>    static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
>> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
>> index 6fb3c90..76714d0 100644
>> --- a/net/ipv4/Kconfig
>> +++ b/net/ipv4/Kconfig
>> @@ -90,6 +90,15 @@ config IP_ROUTE_MULTIPATH
>>          equal "cost" and chooses one of them in a non-deterministic fashion
>>          if a matching packet arrives.
>>
>> +config IP_FLOW_BASED_MULTIPATH
>> +    bool "IP: flow based equal cost multipath"
>> +    depends on IP_ROUTE_MULTIPATH
>> +    help
>> +      Normally if equal cost multipath is enabled the router chooses between
>> +          equal "cost" paths for each matching packet, essentially packets are round-
>> +          robined between the routes. This option allows packets to be routed based on
>> +          their flow - packets in the same flow will always use the same route.
>> +
>>    config IP_ROUTE_VERBOSE
>>        bool "IP: verbose route monitoring"
>>        depends on IP_ADVANCED_ROUTER
>> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
>> index 3a06586..e4750e6 100644
>> --- a/net/ipv4/fib_semantics.c
>> +++ b/net/ipv4/fib_semantics.c
>> @@ -981,6 +981,9 @@ link_it:
>>            head = &fib_info_devhash[hash];
>>            hlist_add_head(&nexthop_nh->nh_hash, head);
>>        } endfor_nexthops(fi)
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +    fi->live_nexthops = fi->fib_nhs;
>> +#endif
>>        spin_unlock_bh(&fib_info_lock);
>>        return fi;
>>
>> @@ -1196,6 +1199,9 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
>>                }
>>                ret++;
>>            }
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +        fi->live_nexthops = fi->fib_nhs - dead;
>> +#endif
>>        }
>>
>>        return ret;
>> @@ -1331,6 +1337,9 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
>>            if (alive > 0) {
>>                fi->fib_flags &= ~nh_flags;
>>                ret++;
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +            fi->live_nexthops = alive;
>> +#endif
>>            }
>>        }
>>
>> @@ -1397,4 +1406,33 @@ void fib_select_multipath(struct fib_result *res)
>>        res->nh_sel = 0;
>>        spin_unlock_bh(&fib_multipath_lock);
>>    }
>> +
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +void fib_select_multipath_for_flow(struct fib_result *res,
>> +                   const struct flowi4 *fl4)
>> +{
>> +    struct fib_info *fi = res->fi;
>> +    int multipath_entry;
>> +    int region_size;
>> +
>> +    if (fl4->flowi4_hash) {
>> +        /* Hash-threshold algorithm, see RFC2992. */
>> +        region_size = U32_MAX / fi->live_nexthops;
>> +        multipath_entry = fl4->flowi4_hash / region_size;
>> +
>> +        spin_lock_bh(&fib_multipath_lock);
>> +        for_nexthops(fi) {
>> +            if (!(nh->nh_flags & RTNH_F_DEAD)) {
>> +                res->nh_sel = nhsel;
>> +                if (multipath_entry == 0)
>> +                    break;
>> +                multipath_entry--;
>> +            }
>> +        } endfor_nexthops(fi);
>> +        spin_unlock_bh(&fib_multipath_lock);
>> +    } else {
>> +        fib_select_multipath(res);
>> +    }
>> +}
>> +#endif
>>    #endif
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index e681b85..4629c04 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -1638,9 +1638,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
>>                    __be32 daddr, __be32 saddr, u32 tos)
>>    {
>>    #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -    if (res->fi && res->fi->fib_nhs > 1)
>> +    if (res->fi && res->fi->fib_nhs > 1) {
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +        if (skb)
>> +            flowi4_set_flow_hash((struct flowi4 *)fl4,
>> +                         skb_get_hash(skb));
>> +        fib_select_multipath_for_flow(res, fl4);
>> +#else
>>            fib_select_multipath(res);
>>    #endif
>> +    }
>> +#endif
>>
>>        /* create a routing cache entry */
>>        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
>> @@ -2170,7 +2178,11 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
>>
>>    #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
>> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
>> +        fib_select_multipath_for_flow(&res, fl4);
>> +#else
>>            fib_select_multipath(&res);
>> +#endif
>>        else
>>    #endif
>>        if (!res.prefixlen &&

-- 
Richard Laing
Software Team Leader
Allied Telesis Labs| 27 Nazareth Ave | Christchurch 8024 | New Zealand
Phone: +64 3 339 9248
Web: www.alliedtelesis.com

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ