lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b1bf5cfc-22c1-5640-6f71-be25647bebb3@cumulusnetworks.com>
Date:   Wed, 15 Mar 2017 14:10:08 +0200
From:   Nikolay Aleksandrov <nikolay@...ulusnetworks.com>
To:     Jakub Sitnicki <jkbs@...hat.com>
Cc:     netdev@...r.kernel.org, davem@...emloft.net,
        roopa@...ulusnetworks.com, dsa@...ulusnetworks.com,
        edumazet@...gle.com, pch@...bogen.com
Subject: Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy
 choice

On 15/03/17 13:32, Jakub Sitnicki wrote:
> On Tue, Mar 14, 2017 at 03:36 PM GMT, Nikolay Aleksandrov wrote:
>> This patch adds support for ECMP hash policy choice via a new sysctl
>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>> The current values for fib_multipath_hash_policy are:
>>  0 - layer 3 (default)
>>  1 - layer 4
>> If there's an skb hash already set and it matches the chosen policy then it
>> will be used instead of being calculated (currently only for L4).
>> In L3 mode we always calculate the hash due to the ICMP error special
>> case, the flow dissector's field consistentification should handle the
>> address order thus we can remove the address reversals.
>>
>> Signed-off-by: Nikolay Aleksandrov <nikolay@...ulusnetworks.com>
>> ---
>> v3:
>>  - keep the ICMP error special handling and always calc L3 hash
>>    Jakub, could you please run your tests with this version ?
>>
>> v2:
>>  - removed the output_key_hash as it's not needed anymore
>>  - reverted to my original/internal patch with L3 as default hash
>>
>>  Documentation/networking/ip-sysctl.txt |  8 +++
>>  include/net/ip_fib.h                   | 14 ++----
>>  include/net/netns/ipv4.h               |  1 +
>>  include/net/route.h                    |  9 +---
>>  net/ipv4/fib_semantics.c               | 11 ++--
>>  net/ipv4/icmp.c                        | 19 +------
>>  net/ipv4/route.c                       | 92 ++++++++++++++++++++++++++--------
>>  net/ipv4/sysctl_net_ipv4.c             |  9 ++++
>>  8 files changed, 98 insertions(+), 65 deletions(-)
>>
>> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
>> index fc73eeb7b3b8..558e19653106 100644
>> --- a/Documentation/networking/ip-sysctl.txt
>> +++ b/Documentation/networking/ip-sysctl.txt
>> @@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
>>  	0 - disabled
>>  	1 - enabled
>>  
>> +fib_multipath_hash_policy - INTEGER
>> +	Controls which hash policy to use for multipath routes. Only valid
>> +	for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
>> +	Default: 0 (Layer 3)
>> +	Possible values:
>> +	0 - Layer 3
>> +	1 - Layer 4
>> +
>>  route/max_size - INTEGER
>>  	Maximum number of routes allowed in the kernel.  Increase
>>  	this when using large numbers of interfaces and/or routes.
>> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
>> index d9cee9659978..8c608a17bd9a 100644
>> --- a/include/net/ip_fib.h
>> +++ b/include/net/ip_fib.h
>> @@ -383,17 +383,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
>>  int fib_sync_down_addr(struct net_device *dev, __be32 local);
>>  int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>>  
>> -extern u32 fib_multipath_secret __read_mostly;
>> -
>> -static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
>> -{
>> -	return jhash_2words((__force u32)saddr, (__force u32)daddr,
>> -			    fib_multipath_secret) >> 1;
>> -}
>> -
>> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
>> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
>> +		       const struct sk_buff *skb);
>> +#endif
>>  void fib_select_multipath(struct fib_result *res, int hash);
>>  void fib_select_path(struct net *net, struct fib_result *res,
>> -		     struct flowi4 *fl4, int mp_hash);
>> +		     struct flowi4 *fl4);
>>  
>>  /* Exported by fib_trie.c */
>>  void fib_trie_init(void);
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index 622d2da27135..70a1d4251790 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -152,6 +152,7 @@ struct netns_ipv4 {
>>  #endif
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>  	int sysctl_fib_multipath_use_neigh;
>> +	int sysctl_fib_multipath_hash_policy;
>>  #endif
>>  
>>  	unsigned int	fib_seq;	/* protected by rtnl_mutex */
>> diff --git a/include/net/route.h b/include/net/route.h
>> index c0874c87c173..32412c91c222 100644
>> --- a/include/net/route.h
>> +++ b/include/net/route.h
>> @@ -113,14 +113,7 @@ struct in_device;
>>  int ip_rt_init(void);
>>  void rt_cache_flush(struct net *net);
>>  void rt_flush_dev(struct net_device *dev);
>> -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
>> -					  int mp_hash);
>> -
>> -static inline struct rtable *__ip_route_output_key(struct net *net,
>> -						   struct flowi4 *flp)
>> -{
>> -	return __ip_route_output_key_hash(net, flp, -1);
>> -}
>> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp);
>>  
>>  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
>>  				    const struct sock *sk);
>> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
>> index 317026a39cfa..6601bd9744c9 100644
>> --- a/net/ipv4/fib_semantics.c
>> +++ b/net/ipv4/fib_semantics.c
>> @@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
>>  static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
>>  
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -u32 fib_multipath_secret __read_mostly;
>>  
>>  #define for_nexthops(fi) {						\
>>  	int nhsel; const struct fib_nh *nh;				\
>> @@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi)
>>  
>>  		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
>>  	} endfor_nexthops(fi);
>> -
>> -	net_get_random_once(&fib_multipath_secret,
>> -			    sizeof(fib_multipath_secret));
>>  }
>>  
>>  static inline void fib_add_weight(struct fib_info *fi,
>> @@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
>>  #endif
>>  
>>  void fib_select_path(struct net *net, struct fib_result *res,
>> -		     struct flowi4 *fl4, int mp_hash)
>> +		     struct flowi4 *fl4)
>>  {
>>  	bool oif_check;
>>  
>> @@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res,
>>  
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>  	if (res->fi->fib_nhs > 1 && oif_check) {
>> -		if (mp_hash < 0)
>> -			mp_hash = get_hash_from_flowi4(fl4) >> 1;
>> +		int h = fib_multipath_hash(res->fi, fl4, NULL);
>>  
>> -		fib_select_multipath(res, mp_hash);
>> +		fib_select_multipath(res, h);
>>  	}
>>  	else
>>  #endif
>> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
>> index fc310db2708b..46630bc30754 100644
>> --- a/net/ipv4/icmp.c
>> +++ b/net/ipv4/icmp.c
>> @@ -464,22 +464,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
>>  	local_bh_enable();
>>  }
>>  
>> -#ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -
>> -/* Source and destination is swapped. See ip_multipath_icmp_hash */
>> -static int icmp_multipath_hash_skb(const struct sk_buff *skb)
>> -{
>> -	const struct iphdr *iph = ip_hdr(skb);
>> -
>> -	return fib_multipath_hash(iph->daddr, iph->saddr);
>> -}
>> -
>> -#else
>> -
>> -#define icmp_multipath_hash_skb(skb) (-1)
>> -
>> -#endif
>> -
>>  static struct rtable *icmp_route_lookup(struct net *net,
>>  					struct flowi4 *fl4,
>>  					struct sk_buff *skb_in,
>> @@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
>>  	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
>>  
>>  	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
>> -	rt = __ip_route_output_key_hash(net, fl4,
>> -					icmp_multipath_hash_skb(skb_in));
>> +	rt = __ip_route_output_key(net, fl4);
>>  	if (IS_ERR(rt))
>>  		return rt;
>>  
> 
> I'm observing that the above hunk changes things because saddr passed to
> icmp_route_lookup doesn't always remain set to ip_hdr(skb_in)->daddr.
> 
> This happens when generating an ICMP error in response to a datagram
> which destination address is not a local one, that is from ip_forward.
> 
> -Jakub

Oh, of course. Good catch, I'll fix it in v4.

Thank you,
 Nik

> 
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index 8471dd116771..3d7f99747285 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -1734,11 +1734,11 @@ static int __mkroute_input(struct sk_buff *skb,
>>  }
>>  
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>> -
>>  /* To make ICMP packets follow the right flow, the multipath hash is
>> - * calculated from the inner IP addresses in reverse order.
>> + * calculated from the inner IP addresses.
>>   */
>> -static int ip_multipath_icmp_hash(struct sk_buff *skb)
>> +static void ip_multipath_icmp_hash(const struct sk_buff *skb,
>> +				   struct flowi4 *fl4)
>>  {
>>  	const struct iphdr *outer_iph = ip_hdr(skb);
>>  	struct icmphdr _icmph;
>> @@ -1746,33 +1746,85 @@ static int ip_multipath_icmp_hash(struct sk_buff *skb)
>>  	struct iphdr _inner_iph;
>>  	const struct iphdr *inner_iph;
>>  
>> +	fl4->saddr = outer_iph->saddr;
>> +	fl4->daddr = outer_iph->daddr;
>>  	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
>> -		goto standard_hash;
>> +		return;
>>  
>>  	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
>>  				   &_icmph);
>>  	if (!icmph)
>> -		goto standard_hash;
>> +		return;
>>  
>>  	if (icmph->type != ICMP_DEST_UNREACH &&
>>  	    icmph->type != ICMP_REDIRECT &&
>>  	    icmph->type != ICMP_TIME_EXCEEDED &&
>> -	    icmph->type != ICMP_PARAMETERPROB) {
>> -		goto standard_hash;
>> -	}
>> +	    icmph->type != ICMP_PARAMETERPROB)
>> +		return;
>>  
>>  	inner_iph = skb_header_pointer(skb,
>>  				       outer_iph->ihl * 4 + sizeof(_icmph),
>>  				       sizeof(_inner_iph), &_inner_iph);
>>  	if (!inner_iph)
>> -		goto standard_hash;
>> +		return;
>> +	fl4->saddr = inner_iph->saddr;
>> +	fl4->daddr = inner_iph->daddr;
>> +}
>>  
>> -	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
>> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
>> +		       const struct sk_buff *skb)
>> +{
>> +	struct net *net = fi->fib_net;
>> +	struct flow_keys hash_keys;
>> +	u32 mhash;
>>  
>> -standard_hash:
>> -	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
>> -}
>> +	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
>> +	case 0:
>> +		memset(&hash_keys, 0, sizeof(hash_keys));
>> +		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
>> +		if (skb && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
>> +			struct flowi4 _fl4;
>>  
>> +			ip_multipath_icmp_hash(skb, &_fl4);
>> +			hash_keys.addrs.v4addrs.src = _fl4.saddr;
>> +			hash_keys.addrs.v4addrs.dst = _fl4.daddr;
>> +		} else {
>> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
>> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
>> +		}
>> +		break;
>> +	case 1:
>> +		/* skb is currently provided only when forwarding */
>> +		if (skb) {
>> +			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
>> +			struct flow_keys keys;
>> +
>> +			/* short-circuit if we already have L4 hash present */
>> +			if (skb->l4_hash)
>> +				return skb_get_hash_raw(skb) >> 1;
>> +			memset(&hash_keys, 0, sizeof(hash_keys));
>> +			skb_flow_dissect_flow_keys(skb, &keys, flag);
>> +			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
>> +			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
>> +			hash_keys.ports.src = keys.ports.src;
>> +			hash_keys.ports.dst = keys.ports.dst;
>> +			hash_keys.basic.ip_proto = keys.basic.ip_proto;
>> +		} else {
>> +			memset(&hash_keys, 0, sizeof(hash_keys));
>> +			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
>> +			hash_keys.addrs.v4addrs.src = fl4->saddr;
>> +			hash_keys.addrs.v4addrs.dst = fl4->daddr;
>> +			hash_keys.ports.src = fl4->fl4_sport;
>> +			hash_keys.ports.dst = fl4->fl4_dport;
>> +			hash_keys.basic.ip_proto = fl4->flowi4_proto;
>> +		}
>> +		break;
>> +	}
>> +	mhash = flow_hash_from_keys(&hash_keys);
>> +
>> +	return mhash >> 1;
>> +}
>> +EXPORT_SYMBOL_GPL(fib_multipath_hash);
>>  #endif /* CONFIG_IP_ROUTE_MULTIPATH */
>>  
>>  static int ip_mkroute_input(struct sk_buff *skb,
>> @@ -1782,12 +1834,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
>>  {
>>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>>  	if (res->fi && res->fi->fib_nhs > 1) {
>> -		int h;
>> +		struct flowi4 fl4 = { .daddr = daddr, .saddr = saddr };
>> +		int h = fib_multipath_hash(res->fi, &fl4, skb);
>>  
>> -		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
>> -			h = ip_multipath_icmp_hash(skb);
>> -		else
>> -			h = fib_multipath_hash(saddr, daddr);
>>  		fib_select_multipath(res, h);
>>  	}
>>  #endif
>> @@ -2202,8 +2251,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
>>   * Major route resolver routine.
>>   */
>>  
>> -struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>> -					  int mp_hash)
>> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
>>  {
>>  	struct net_device *dev_out = NULL;
>>  	__u8 tos = RT_FL_TOS(fl4);
>> @@ -2365,7 +2413,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>>  		goto make_route;
>>  	}
>>  
>> -	fib_select_path(net, &res, fl4, mp_hash);
>> +	fib_select_path(net, &res, fl4);
>>  
>>  	dev_out = FIB_RES_DEV(res);
>>  	fl4->flowi4_oif = dev_out->ifindex;
>> @@ -2378,7 +2426,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>>  	rcu_read_unlock();
>>  	return rth;
>>  }
>> -EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
>> +EXPORT_SYMBOL_GPL(__ip_route_output_key);
>>  
>>  static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
>>  {
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index d6880a6149ee..62c4f94923e5 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -1004,6 +1004,15 @@ static struct ctl_table ipv4_net_table[] = {
>>  		.extra1		= &zero,
>>  		.extra2		= &one,
>>  	},
>> +	{
>> +		.procname	= "fib_multipath_hash_policy",
>> +		.data		= &init_net.ipv4.sysctl_fib_multipath_hash_policy,
>> +		.maxlen		= sizeof(int),
>> +		.mode		= 0644,
>> +		.proc_handler	= proc_dointvec_minmax,
>> +		.extra1		= &zero,
>> +		.extra2		= &one,
>> +	},
>>  #endif
>>  	{
>>  		.procname	= "ip_unprivileged_port_start",
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ