[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALx6S361R1UeEECQn9vFDCS-wi_StoV6mekqgSK-0t-0NPvWEA@mail.gmail.com>
Date: Tue, 28 Jul 2015 14:02:05 -0700
From: Tom Herbert <tom@...bertland.com>
To: Richard Laing <Richard.Laing@...iedtelesis.co.nz>
Cc: "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"jmorris@...ei.org" <jmorris@...ei.org>
Subject: Re: [RFC PATCH 1/1] net/ipv4: Enable flow-based ECMP
On Mon, Jul 27, 2015 at 7:27 PM, Richard Laing
<Richard.Laing@...iedtelesis.co.nz> wrote:
> From: Richard Laing <richard.laing@...iedtelesis.co.nz>
>
> Enable flow-based ECMP.
>
> Currently if equal-cost multipath is enabled the kernel chooses between
> equal cost paths for each matching packet, essentially packets are
> round-robined between the routes. This means that packets from a single
> flow can traverse different routes. If one of the routes experiences
> congestion this can result in delayed or out of order packets arriving
> at the destination.
>
Richard, someone was complaining to me just last week about the
weakness of the round robin algorithm. Thanks for looking into this!
> This patch allows packets to be routed based on their
> flow - packets in the same flow will always use the same route. This
> prevents out of order packets. There are other issues with round-robin
> based ECMP routing related to variable path MTU handling and debugging.
> See RFC2991 for more details on the problems associated with packet
> based ECMP routing.
>
> This patch relies on the skb hash value to select between routes. The
> selection uses a hash-threshold algorithm (see RFC2992).
>
> Signed-off-by: Richard Laing <richard.laing@...iedtelesis.co.nz>
> ---
>
> include/net/flow.h | 14 ++++++++++++++
> include/net/ip_fib.h | 9 +++++++++
> include/net/route.h | 4 ++++
> net/ipv4/Kconfig | 9 +++++++++
> net/ipv4/fib_semantics.c | 38 ++++++++++++++++++++++++++++++++++++++
> net/ipv4/route.c | 14 +++++++++++++-
> 6 files changed, 87 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/flow.h b/include/net/flow.h
> index 8109a15..d1d933d 100644
> --- a/include/net/flow.h
> +++ b/include/net/flow.h
> @@ -79,6 +79,10 @@ struct flowi4 {
> #define fl4_ipsec_spi uli.spi
> #define fl4_mh_type uli.mht.type
> #define fl4_gre_key uli.gre_key
> +
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
Why bother making this a CONFIG, round robin is a miserable algorithm
anyway and nearly all the other packet steering mechanisms already use
a hash.
> + __u32 flowi4_hash;
> +#endif
> } __attribute__((__aligned__(BITS_PER_LONG/8)));
>
> static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
> @@ -99,6 +103,9 @@ static inline void flowi4_init_output(struct flowi4
> *fl4, int oif,
> fl4->saddr = saddr;
> fl4->fl4_dport = dport;
> fl4->fl4_sport = sport;
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> + fl4->flowi4_hash = 0;
> +#endif
> }
>
> /* Reset some input parameters after previous lookup */
> @@ -182,6 +189,13 @@ static inline struct flowi *flowidn_to_flowi(struct
> flowidn *fldn)
> return container_of(fldn, struct flowi, u.dn);
> }
>
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +static inline void flowi4_set_flow_hash(struct flowi4 *fl, __u32 hash)
> +{
> + fl->flowi4_hash = hash;
> +}
> +#endif
> +
> typedef unsigned long flow_compare_t;
>
> static inline size_t flow_key_size(u16 family)
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 5fa643b..c841698 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -117,6 +117,10 @@ struct fib_info {
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> int fib_power;
> #endif
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> + /* Cache the number of live nexthops for flow based ECMP
> calculation. */
> + int live_nexthops;
> +#endif
> struct rcu_head rcu;
> struct fib_nh fib_nh[0];
> #define fib_dev fib_nh[0].nh_dev
> @@ -311,6 +315,11 @@ int fib_sync_down_addr(struct net *net, __be32 local);
> int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
> void fib_select_multipath(struct fib_result *res);
>
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +void fib_select_multipath_for_flow(struct fib_result *res,
> + const struct flowi4 *fl4);
> +#endif
> +
> /* Exported by fib_trie.c */
> void fib_trie_init(void);
> struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
> diff --git a/include/net/route.h b/include/net/route.h
> index fe22d03..fdbbe7f 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -252,6 +252,10 @@ static inline void ip_route_connect_init(struct
> flowi4 *fl4, __be32 dst, __be32
>
> flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
> protocol, flow_flags, dst, src, dport, sport);
> +
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> + flowi4_set_flow_hash(fl4, sk->sk_hash);
Should be sk_txhash I think.
> +#endif
> }
>
> static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
> index 6fb3c90..76714d0 100644
> --- a/net/ipv4/Kconfig
> +++ b/net/ipv4/Kconfig
> @@ -90,6 +90,15 @@ config IP_ROUTE_MULTIPATH
> equal "cost" and chooses one of them in a non-deterministic fashion
> if a matching packet arrives.
>
> +config IP_FLOW_BASED_MULTIPATH
> + bool "IP: flow based equal cost multipath"
> + depends on IP_ROUTE_MULTIPATH
> + help
> + Normally if equal cost multipath is enabled the router chooses
> between
> + equal "cost" paths for each matching packet, essentially
> packets are round-
> + robined between the routes. This option allows packets to be
> routed based on
> + their flow - packets in the same flow will always use the
> same route.
> +
> config IP_ROUTE_VERBOSE
> bool "IP: verbose route monitoring"
> depends on IP_ADVANCED_ROUTER
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 3a06586..e4750e6 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -981,6 +981,9 @@ link_it:
> head = &fib_info_devhash[hash];
> hlist_add_head(&nexthop_nh->nh_hash, head);
> } endfor_nexthops(fi)
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> + fi->live_nexthops = fi->fib_nhs;
> +#endif
> spin_unlock_bh(&fib_info_lock);
> return fi;
>
> @@ -1196,6 +1199,9 @@ int fib_sync_down_dev(struct net_device *dev,
> unsigned long event)
> }
> ret++;
> }
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> + fi->live_nexthops = fi->fib_nhs - dead;
> +#endif
> }
>
> return ret;
> @@ -1331,6 +1337,9 @@ int fib_sync_up(struct net_device *dev, unsigned
> int nh_flags)
> if (alive > 0) {
> fi->fib_flags &= ~nh_flags;
> ret++;
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> + fi->live_nexthops = alive;
> +#endif
> }
> }
>
> @@ -1397,4 +1406,33 @@ void fib_select_multipath(struct fib_result *res)
> res->nh_sel = 0;
> spin_unlock_bh(&fib_multipath_lock);
> }
> +
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> +void fib_select_multipath_for_flow(struct fib_result *res,
> + const struct flowi4 *fl4)
> +{
> + struct fib_info *fi = res->fi;
> + int multipath_entry;
> + int region_size;
> +
> + if (fl4->flowi4_hash) {
> + /* Hash-threshold algorithm, see RFC2992. */
> + region_size = U32_MAX / fi->live_nexthops;
> + multipath_entry = fl4->flowi4_hash / region_size;
> +
> + spin_lock_bh(&fib_multipath_lock);
> + for_nexthops(fi) {
> + if (!(nh->nh_flags & RTNH_F_DEAD)) {
> + res->nh_sel = nhsel;
> + if (multipath_entry == 0)
> + break;
> + multipath_entry--;
> + }
> + } endfor_nexthops(fi);
> + spin_unlock_bh(&fib_multipath_lock);
> + } else {
> + fib_select_multipath(res);
> + }
> +}
> +#endif
> #endif
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index e681b85..4629c04 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1638,9 +1638,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
> __be32 daddr, __be32 saddr, u32 tos)
> {
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> - if (res->fi && res->fi->fib_nhs > 1)
> + if (res->fi && res->fi->fib_nhs > 1) {
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> + if (skb)
> + flowi4_set_flow_hash((struct flowi4 *)fl4,
> + skb_get_hash(skb));
> + fib_select_multipath_for_flow(res, fl4);
> +#else
> fib_select_multipath(res);
> #endif
> + }
> +#endif
>
> /* create a routing cache entry */
> return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
> @@ -2170,7 +2178,11 @@ struct rtable *__ip_route_output_key(struct net
> *net, struct flowi4 *fl4)
>
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
> +#ifdef CONFIG_IP_FLOW_BASED_MULTIPATH
> + fib_select_multipath_for_flow(&res, fl4);
> +#else
> fib_select_multipath(&res);
> +#endif
> else
> #endif
> if (!res.prefixlen &&
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists