[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <55834BA6.7050405@redhat.com>
Date: Thu, 18 Jun 2015 15:52:22 -0700
From: Alexander Duyck <alexander.h.duyck@...hat.com>
To: Peter Nørlund <pch@...bogen.com>,
netdev@...r.kernel.org
CC: "David S. Miller" <davem@...emloft.net>,
Alexey Kuznetsov <kuznet@....inr.ac.ru>,
James Morris <jmorris@...ei.org>,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
Patrick McHardy <kaber@...sh.net>, linux-api@...r.kernel.org
Subject: Re: [PATCH net-next 2/3] ipv4: L3 and L4 hash-based multipath routing
On 06/17/2015 01:08 PM, Peter Nørlund wrote:
> This patch adds L3 and L4 hash-based multipath routing, selectable on a
> per-route basis with the reintroduced RTA_MP_ALGO attribute. The default is
> now RT_MP_ALG_L3_HASH.
>
> Signed-off-by: Peter Nørlund <pch@...bogen.com>
> ---
> include/net/ip_fib.h | 4 ++-
> include/net/route.h | 5 ++--
> include/uapi/linux/rtnetlink.h | 14 ++++++++++-
> net/ipv4/fib_frontend.c | 4 +++
> net/ipv4/fib_semantics.c | 34 ++++++++++++++++++++++---
> net/ipv4/icmp.c | 4 +--
> net/ipv4/route.c | 56 +++++++++++++++++++++++++++++++++++-------
> net/ipv4/xfrm4_policy.c | 2 +-
> 8 files changed, 103 insertions(+), 20 deletions(-)
>
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 4be4f25..250d98e 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -37,6 +37,7 @@ struct fib_config {
> u32 fc_flags;
> u32 fc_priority;
> __be32 fc_prefsrc;
> + int fc_mp_alg;
> struct nlattr *fc_mx;
> struct rtnexthop *fc_mp;
> int fc_mx_len;
> @@ -116,6 +117,7 @@ struct fib_info {
> int fib_nhs;
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> int fib_mp_weight;
> + int fib_mp_alg;
> #endif
> struct rcu_head rcu;
> struct fib_nh fib_nh[0];
> @@ -308,7 +310,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev);
> int fib_sync_down_dev(struct net_device *dev, int force);
> int fib_sync_down_addr(struct net *net, __be32 local);
> int fib_sync_up(struct net_device *dev);
> -void fib_select_multipath(struct fib_result *res);
> +void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow);
>
> /* Exported by fib_trie.c */
> void fib_trie_init(void);
> diff --git a/include/net/route.h b/include/net/route.h
> index fe22d03..1fc7deb 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -110,7 +110,8 @@ struct in_device;
> int ip_rt_init(void);
> void rt_cache_flush(struct net *net);
> void rt_flush_dev(struct net_device *dev);
> -struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
> +struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp,
> + const struct flowi4 *mp_flow);
> struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
> struct sock *sk);
> struct dst_entry *ipv4_blackhole_route(struct net *net,
> @@ -267,7 +268,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
> sport, dport, sk);
>
> if (!dst || !src) {
> - rt = __ip_route_output_key(net, fl4);
> + rt = __ip_route_output_key(net, fl4, NULL);
> if (IS_ERR(rt))
> return rt;
> ip_rt_put(rt);
> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
> index 17fb02f..dff4a72 100644
> --- a/include/uapi/linux/rtnetlink.h
> +++ b/include/uapi/linux/rtnetlink.h
> @@ -271,6 +271,18 @@ enum rt_scope_t {
> #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */
> #define RTM_F_PREFIX 0x800 /* Prefix addresses */
>
> +/* Multipath algorithms */
> +
> +enum rt_mp_alg_t {
> + RT_MP_ALG_L3_HASH, /* Was IP_MP_ALG_NONE */
> + RT_MP_ALG_PER_PACKET, /* Was IP_MP_ALG_RR */
> + RT_MP_ALG_DRR, /* not used */
> + RT_MP_ALG_RANDOM, /* not used */
> + RT_MP_ALG_WRANDOM, /* not used */
> + RT_MP_ALG_L4_HASH,
> + __RT_MP_ALG_MAX
> +};
> +
> /* Reserved table identifiers */
>
> enum rt_class_t {
> @@ -301,7 +313,7 @@ enum rtattr_type_t {
> RTA_FLOW,
> RTA_CACHEINFO,
> RTA_SESSION, /* no longer used */
> - RTA_MP_ALGO, /* no longer used */
> + RTA_MP_ALGO,
> RTA_TABLE,
> RTA_MARK,
> RTA_MFC_STATS,
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 872494e..376e8c1 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -590,6 +590,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
> [RTA_PREFSRC] = { .type = NLA_U32 },
> [RTA_METRICS] = { .type = NLA_NESTED },
> [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
> + [RTA_MP_ALGO] = { .type = NLA_U32 },
> [RTA_FLOW] = { .type = NLA_U32 },
> };
>
> @@ -650,6 +651,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
> cfg->fc_mp = nla_data(attr);
> cfg->fc_mp_len = nla_len(attr);
> break;
> + case RTA_MP_ALGO:
> + cfg->fc_mp_alg = nla_get_u32(attr);
> + break;
> case RTA_FLOW:
> cfg->fc_flow = nla_get_u32(attr);
> break;
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 8c8df80..da06e88 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -257,6 +257,11 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
> {
> const struct fib_nh *onh = ofi->fib_nh;
>
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> + if (fi->fib_mp_alg != ofi->fib_mp_alg)
> + return -1;
> +#endif
> +
> for_nexthops(fi) {
> if (nh->nh_oif != onh->nh_oif ||
> nh->nh_gw != onh->nh_gw ||
> @@ -896,6 +901,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
>
> if (cfg->fc_mp) {
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> + fi->fib_mp_alg = cfg->fc_mp_alg;
> err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
> if (err != 0)
> goto failure;
> @@ -1085,6 +1091,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
> struct rtnexthop *rtnh;
> struct nlattr *mp;
>
> + if (fi->fib_mp_alg &&
> + nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg))
> + goto nla_put_failure;
> +
> mp = nla_nest_start(skb, RTA_MULTIPATH);
> if (!mp)
> goto nla_put_failure;
> @@ -1312,15 +1322,31 @@ int fib_sync_up(struct net_device *dev)
> }
>
> /*
> - * The algorithm is suboptimal, but it provides really
> - * fair weighted route distribution.
> + * Compute multipath hash based on 3- or 5-tuple
> */
> -void fib_select_multipath(struct fib_result *res)
> +static int fib_multipath_hash(const struct fib_result *res,
> + const struct flowi4 *flow)
> +{
> + u32 hash = flow->saddr ^ flow->daddr;
> +
> + if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH && flow->flowi4_proto != 0)
> + hash ^= flow->flowi4_proto ^ flow->fl4_sport ^ flow->fl4_dport;
> +
> + hash ^= hash >> 16;
> + hash ^= hash >> 8;
> + return hash & 0xFF;
> +}
> +
This hash is still far from optimal. Really I think you should look at
something such as jhash_3words or the like for mixing up the addresses.
Right now just XORing the values together like you are will end up
with a fairly high collision rate since for example in the case of two
endpoints on the same subnet you would lose the subnet as a result of
XORing the source and destination addresses. Also you would lose the
port data in the case of a protocol using something such as UDP where
the source and destination ports might be the same value.
> +void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow)
> {
> struct fib_info *fi = res->fi;
> u8 w;
>
> - w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter));
> + if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) {
> + w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter));
> + } else {
> + w = fib_multipath_hash(res, flow);
> + }
>
> for_nexthops(fi) {
> if (w >= atomic_read(&nh->nh_mp_upper_bound))
> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> index f5203fb..3abcfea 100644
> --- a/net/ipv4/icmp.c
> +++ b/net/ipv4/icmp.c
> @@ -459,7 +459,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
> fl4->fl4_icmp_type = type;
> fl4->fl4_icmp_code = code;
> security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
> - rt = __ip_route_output_key(net, fl4);
> + rt = __ip_route_output_key(net, fl4, NULL);
> if (IS_ERR(rt))
> return rt;
>
> @@ -481,7 +481,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
> goto relookup_failed;
>
> if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
> - rt2 = __ip_route_output_key(net, &fl4_dec);
> + rt2 = __ip_route_output_key(net, &fl4_dec, NULL);
> if (IS_ERR(rt2))
> err = PTR_ERR(rt2);
> } else {
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index f605598..a1ec62c 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1006,7 +1006,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
>
> __build_flow_key(&fl4, NULL, iph, oif,
> RT_TOS(iph->tos), protocol, mark, flow_flags);
> - rt = __ip_route_output_key(net, &fl4);
> + rt = __ip_route_output_key(net, &fl4, NULL);
> if (!IS_ERR(rt)) {
> __ip_rt_update_pmtu(rt, &fl4, mtu);
> ip_rt_put(rt);
> @@ -1025,7 +1025,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
> if (!fl4.flowi4_mark)
> fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
>
> - rt = __ip_route_output_key(sock_net(sk), &fl4);
> + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL);
> if (!IS_ERR(rt)) {
> __ip_rt_update_pmtu(rt, &fl4, mtu);
> ip_rt_put(rt);
> @@ -1094,7 +1094,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
>
> __build_flow_key(&fl4, NULL, iph, oif,
> RT_TOS(iph->tos), protocol, mark, flow_flags);
> - rt = __ip_route_output_key(net, &fl4);
> + rt = __ip_route_output_key(net, &fl4, NULL);
> if (!IS_ERR(rt)) {
> __ip_do_redirect(rt, skb, &fl4, false);
> ip_rt_put(rt);
> @@ -1109,7 +1109,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
> struct rtable *rt;
>
> __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
> - rt = __ip_route_output_key(sock_net(sk), &fl4);
> + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL);
> if (!IS_ERR(rt)) {
> __ip_do_redirect(rt, skb, &fl4, false);
> ip_rt_put(rt);
> @@ -1631,6 +1631,39 @@ out:
> return err;
> }
>
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +/* Fill flow key data based on packet for use in multipath routing. */
> +static void ip_multipath_flow(const struct sk_buff *skb, struct flowi4 *flow)
> +{
> + const struct iphdr *iph;
> +
> + iph = ip_hdr(skb);
> +
> + flow->saddr = iph->saddr;
> + flow->daddr = iph->daddr;
> + flow->flowi4_proto = iph->protocol;
> + flow->fl4_sport = 0;
> + flow->fl4_dport = 0;
> +
> + if (unlikely(ip_is_fragment(iph)))
> + return;
> +
I'm not sure if checking for fragmentation is enough. For example if
this system is routing and received a flow of UDP packets, some
fragmented some not then it might end up mixing them over 2 separate
next hops since some will include L4 header data and some won't.
As such you may want to have the option to exclude UDP from the
protocols listed below.
> + if (iph->protocol == IPPROTO_TCP ||
> + iph->protocol == IPPROTO_UDP ||
> + iph->protocol == IPPROTO_SCTP) {
> + __be16 _ports;
> + const __be16 *ports;
> +
> + ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
> + &_ports);
> + if (ports) {
> + flow->fl4_sport = ports[0];
> + flow->fl4_dport = ports[1];
> + }
> + }
> +}
> +#endif /* CONFIG_IP_ROUTE_MULTIPATH */
> +
> static int ip_mkroute_input(struct sk_buff *skb,
> struct fib_result *res,
> const struct flowi4 *fl4,
> @@ -1638,8 +1671,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
> __be32 daddr, __be32 saddr, u32 tos)
> {
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> - if (res->fi && res->fi->fib_nhs > 1)
> - fib_select_multipath(res);
> + if (res->fi && res->fi->fib_nhs > 1) {
> + struct flowi4 mp_flow;
> +
> + ip_multipath_flow(skb, &mp_flow);
> + fib_select_multipath(res, &mp_flow);
> + }
What is the point in populating the mp_flow if you don't know if it is
going to be used? You are populating it in ip_multipath_flow, and then
you might completely ignore it in fib_select_multipath.
Maybe instead of passing the mp_flow you could instead look at passing a
function pointer that would alter the flow for the multipath case instead.
> #endif
>
> /* create a routing cache entry */
> @@ -2012,7 +2049,8 @@ add:
> * Major route resolver routine.
> */
>
> -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4,
> + const struct flowi4 *mp_flow)
> {
> struct net_device *dev_out = NULL;
> __u8 tos = RT_FL_TOS(fl4);
> @@ -2170,7 +2208,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
>
> #ifdef CONFIG_IP_ROUTE_MULTIPATH
> if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
> - fib_select_multipath(&res);
> + fib_select_multipath(&res, (mp_flow ? mp_flow : fl4));
> else
> #endif
> if (!res.prefixlen &&
> @@ -2273,7 +2311,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
> struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
> struct sock *sk)
> {
> - struct rtable *rt = __ip_route_output_key(net, flp4);
> + struct rtable *rt = __ip_route_output_key(net, flp4, NULL);
>
> if (IS_ERR(rt))
> return rt;
> diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
> index bff6974..7eae158 100644
> --- a/net/ipv4/xfrm4_policy.c
> +++ b/net/ipv4/xfrm4_policy.c
> @@ -31,7 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
> if (saddr)
> fl4->saddr = saddr->a4;
>
> - rt = __ip_route_output_key(net, fl4);
> + rt = __ip_route_output_key(net, fl4, NULL);
> if (!IS_ERR(rt))
> return &rt->dst;
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists