[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180222100905.GA26190@splinter.mtl.com>
Date: Thu, 22 Feb 2018 12:09:05 +0200
From: Ido Schimmel <idosch@...sch.org>
To: David Ahern <dsahern@...il.com>
Cc: netdev@...r.kernel.org, tom@...bertland.com, davem@...emloft.net,
roopa@...ulusnetworks.com, nikolay@...ulusnetworks.com
Subject: Re: [PATCH net-next 5/7] net/ipv6: Add support for path selection
using hash of 5-tuple
Hi David,
On Wed, Feb 21, 2018 at 10:49:52AM -0800, David Ahern wrote:
> Some operators prefer IPv6 path selection to use a standard 5-tuple
> hash rather than just an L3 hash with the flow the label. To that end
> add support to IPv6 for multipath hash policy similar to bf4e0a3db97eb
> ("net: ipv4: add support for ECMP hash policy choice"). The default
> is still L3 which covers source and destination addresses along with
> flow label and IPv6 protocol.
>
> Signed-off-by: David Ahern <dsahern@...il.com>
> ---
> Documentation/networking/ip-sysctl.txt | 7 ++++
> include/net/ip6_route.h | 3 +-
> include/net/netevent.h | 1 +
> include/net/netns/ipv6.h | 1 +
> net/ipv6/icmp.c | 2 +-
> net/ipv6/route.c | 64 ++++++++++++++++++++++++++--------
> net/ipv6/sysctl_net_ipv6.c | 26 ++++++++++++++
> 7 files changed, 87 insertions(+), 17 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index a553d4e4a0fb..783675a730e5 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -1363,6 +1363,13 @@ flowlabel_reflect - BOOLEAN
> FALSE: disabled
> Default: FALSE
>
> +fib_multipath_hash_policy - INTEGER
> + Controls which hash policy to use for multipath routes.
> + Default: 0 (Layer 3)
> + Possible values:
> + 0 - Layer 3 (source and destination addresses plus flow label)
> + 1 - Layer 4 (standard 5-tuple)
> +
> anycast_src_echo_reply - BOOLEAN
> Controls the use of anycast addresses as source addresses for ICMPv6
> echo reply
> diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
> index 27d23a65f3cd..f13657a62e5c 100644
> --- a/include/net/ip6_route.h
> +++ b/include/net/ip6_route.h
> @@ -127,7 +127,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
>
> struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
> const struct in6_addr *saddr, int oif, int flags);
> -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb);
> +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
> + const struct sk_buff *skb);
>
> struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
>
> diff --git a/include/net/netevent.h b/include/net/netevent.h
> index baee605a94ab..d9918261701c 100644
> --- a/include/net/netevent.h
> +++ b/include/net/netevent.h
> @@ -27,6 +27,7 @@ enum netevent_notif_type {
> NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */
> NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */
> NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */
> + NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */
> };
>
> int register_netevent_notifier(struct notifier_block *nb);
> diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
> index 987cc4569cb8..6b0de3b71bbf 100644
> --- a/include/net/netns/ipv6.h
> +++ b/include/net/netns/ipv6.h
> @@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 {
> int ip6_rt_gc_elasticity;
> int ip6_rt_mtu_expires;
> int ip6_rt_min_advmss;
> + int multipath_hash_policy;
> int flowlabel_consistency;
> int auto_flowlabels;
> int icmpv6_time;
> diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
> index 4fa4f1b150a4..f53c14390d9f 100644
> --- a/net/ipv6/icmp.c
> +++ b/net/ipv6/icmp.c
> @@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
> fl6.fl6_icmp_type = type;
> fl6.fl6_icmp_code = code;
> fl6.flowi6_uid = sock_net_uid(net, NULL);
> - fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
> + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb);
> security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
>
> sk = icmpv6_xmit_lock(net);
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index ab0de69ec67d..5c4481e4f3e1 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -450,7 +450,8 @@ static bool rt6_check_expired(const struct rt6_info *rt)
> return false;
> }
>
> -static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
> +static struct rt6_info *rt6_multipath_select(const struct net *net,
> + struct rt6_info *match,
> struct flowi6 *fl6, int oif,
> int strict)
> {
> @@ -460,7 +461,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
> * case it will always be non-zero. Otherwise now is the time to do it.
> */
> if (!fl6->mp_hash)
> - fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
> + fl6->mp_hash = rt6_multipath_hash(net, fl6, NULL);
My test is failing and all the packets hit the same nexthop despite the
fact I'm using random UDP ports. This is because only the flow info is
passed here and it doesn't include the ports.
The equivalent IPv4 code passes the skb and I believe this is what we
need to do here as well.
>
> if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
> return match;
> @@ -929,7 +930,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
> rt = rt6_device_match(net, rt, &fl6->saddr,
> fl6->flowi6_oif, flags);
> if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
> - rt = rt6_multipath_select(rt, fl6,
> + rt = rt6_multipath_select(net, rt, fl6,
> fl6->flowi6_oif, flags);
> }
> if (rt == net->ipv6.ip6_null_entry) {
> @@ -1669,7 +1670,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
> redo_rt6_select:
> rt = rt6_select(net, fn, oif, strict);
> if (rt->rt6i_nsiblings)
> - rt = rt6_multipath_select(rt, fl6, oif, strict);
> + rt = rt6_multipath_select(net, rt, fl6, oif, strict);
> if (rt == net->ipv6.ip6_null_entry) {
> fn = fib6_backtrack(fn, &fl6->saddr);
> if (fn)
> @@ -1819,20 +1820,53 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
> }
>
> /* if skb is set it will be used and fl6 can be NULL */
> -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
> +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
> + const struct sk_buff *skb)
> {
> struct flow_keys hash_keys;
> u32 mhash;
>
> - memset(&hash_keys, 0, sizeof(hash_keys));
> - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
> - if (skb) {
> - ip6_multipath_l3_keys(skb, &hash_keys);
> - } else {
> - hash_keys.addrs.v6addrs.src = fl6->saddr;
> - hash_keys.addrs.v6addrs.dst = fl6->daddr;
> - hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
> - hash_keys.basic.ip_proto = fl6->flowi6_proto;
> + switch (net->ipv6.sysctl.multipath_hash_policy) {
> + case 0:
> + memset(&hash_keys, 0, sizeof(hash_keys));
> + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
> + if (skb) {
> + ip6_multipath_l3_keys(skb, &hash_keys);
> + } else {
> + hash_keys.addrs.v6addrs.src = fl6->saddr;
> + hash_keys.addrs.v6addrs.dst = fl6->daddr;
> + hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
> + hash_keys.basic.ip_proto = fl6->flowi6_proto;
> + }
> + break;
> + case 1:
> + if (skb) {
> + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
> + struct flow_keys keys;
> +
> + /* short-circuit if we already have L4 hash present */
> + if (skb->l4_hash)
> + return skb_get_hash_raw(skb) >> 1;
> +
> + memset(&hash_keys, 0, sizeof(hash_keys));
> +
> + skb_flow_dissect_flow_keys(skb, &keys, flag);
> +
> + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
> + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
> + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
> + hash_keys.ports.src = keys.ports.src;
> + hash_keys.ports.dst = keys.ports.dst;
> + hash_keys.basic.ip_proto = keys.basic.ip_proto;
> + } else {
> + memset(&hash_keys, 0, sizeof(hash_keys));
> + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
> + hash_keys.addrs.v6addrs.src = fl6->saddr;
> + hash_keys.addrs.v6addrs.dst = fl6->daddr;
> + hash_keys.ports.src = fl6->fl6_sport;
> + hash_keys.ports.dst = fl6->fl6_dport;
> + hash_keys.basic.ip_proto = fl6->flowi6_proto;
> + }
> }
> mhash = flow_hash_from_keys(&hash_keys);
>
> @@ -1858,7 +1892,7 @@ void ip6_route_input(struct sk_buff *skb)
> if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
> fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
> if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
> - fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
> + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb);
> skb_dst_drop(skb);
> skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
> }
> diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
> index 262f791f1b9b..6125accc993a 100644
> --- a/net/ipv6/sysctl_net_ipv6.c
> +++ b/net/ipv6/sysctl_net_ipv6.c
> @@ -16,14 +16,31 @@
> #include <net/ipv6.h>
> #include <net/addrconf.h>
> #include <net/inet_frag.h>
> +#include <net/netevent.h>
> #ifdef CONFIG_NETLABEL
> #include <net/calipso.h>
> #endif
>
> +static int zero;
> static int one = 1;
> static int auto_flowlabels_min;
> static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
>
> +static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write,
> + void __user *buffer, size_t *lenp,
> + loff_t *ppos)
> +{
> + struct net *net;
> + int ret;
> +
> + net = container_of(table->data, struct net,
> + ipv6.sysctl.multipath_hash_policy);
> + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> + if (write && ret == 0)
> + call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
> +
> + return ret;
> +}
>
> static struct ctl_table ipv6_table_template[] = {
> {
> @@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = {
> .mode = 0644,
> .proc_handler = proc_dointvec
> },
> + {
> + .procname = "fib_multipath_hash_policy",
> + .data = &init_net.ipv6.sysctl.multipath_hash_policy,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_rt6_multipath_hash_policy,
> + .extra1 = &zero,
> + .extra2 = &one,
> + },
> { }
> };
>
> --
> 2.11.0
>
Powered by blists - more mailing lists