lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230815201048.1796-1-sriram.yagnaraman@est.tech>
Date: Tue, 15 Aug 2023 22:10:48 +0200
From: Sriram Yagnaraman <sriram.yagnaraman@....tech>
To:
Cc: netdev@...r.kernel.org, Sriram Yagnaraman <sriram.yagnaraman@....tech>
Subject: [Question]: TCP resets when using ECMP for load-balancing between multiple servers.

All packets in the same flow (L3/L4 depending on multipath hash policy)
should be directed to the same target, but after [0] we see stray
packets directed towards other targets. This, for instance, causes RST
to be sent on TCP connections. This happens on a static setup, with no
changes to the nexthops, so there is no hash space reassignment.

IIUC, route hints when the next hop is part of a multipath group causes
packets in the same receive batch to be sent to the same next hop
irrespective of which nexthop the multipath hash points to. I am no
expert in this area, so please let me know if there is a simple
explanation on how to fix this problem?

Below is a patch which has a selftest that describes the problem setup
and a hack to solve the problem in ipv4. For ipv6, I have just commented
out the part the returns the route hint, just for testing.

[0]: 02b24941619f ("ipv4: use dst hint for ipv4 list receive")

---
 include/uapi/linux/in_route.h                 |   1 +
 net/ipv4/ip_input.c                           |   9 +-
 net/ipv4/route.c                              |   7 +-
 net/ipv6/ip6_input.c                          |   4 +
 .../testing/selftests/net/forwarding/Makefile |   1 +
 .../net/forwarding/router_multipath_vip.sh    | 324 ++++++++++++++++++
 6 files changed, 341 insertions(+), 5 deletions(-)
 create mode 100755 tools/testing/selftests/net/forwarding/router_multipath_vip.sh

diff --git a/include/uapi/linux/in_route.h b/include/uapi/linux/in_route.h
index 0cc2c23b47f8..01ae06c7743b 100644
--- a/include/uapi/linux/in_route.h
+++ b/include/uapi/linux/in_route.h
@@ -15,6 +15,7 @@
 #define RTCF_REDIRECTED	0x00040000
 #define RTCF_TPROXY	0x00080000 /* unused */
 
+#define RTCF_MULTIPATH	0x00200000
 #define RTCF_FAST	0x00200000 /* unused */
 #define RTCF_MASQ	0x00400000 /* unused */
 #define RTCF_SNAT	0x00800000 /* unused */
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fe9ead9ee863..e06a1a6a4357 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -582,9 +582,11 @@ static void ip_sublist_rcv_finish(struct list_head *head)
 }
 
 static struct sk_buff *ip_extract_route_hint(const struct net *net,
-					     struct sk_buff *skb, int rt_type)
+					     struct sk_buff *skb, int rt_type,
+					     unsigned int rt_flags)
 {
-	if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST)
+	if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
+	    !!(rt_flags & RTCF_MULTIPATH))
 		return NULL;
 
 	return skb;
@@ -615,7 +617,8 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		dst = skb_dst(skb);
 		if (curr_dst != dst) {
 			hint = ip_extract_route_hint(net, skb,
-					       ((struct rtable *)dst)->rt_type);
+					       ((struct rtable *)dst)->rt_type,
+					       ((struct rtable *)dst)->rt_flags);
 
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 92fede388d52..232b507faf04 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1786,6 +1786,7 @@ static void ip_handle_martian_source(struct net_device *dev,
 
 /* called in rcu_read_lock() section */
 static int __mkroute_input(struct sk_buff *skb,
+			   unsigned int flags,
 			   const struct fib_result *res,
 			   struct in_device *in_dev,
 			   __be32 daddr, __be32 saddr, u32 tos)
@@ -1856,7 +1857,7 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
-	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
+	rth = rt_dst_alloc(out_dev->dev, flags, res->type,
 			   IN_DEV_ORCONF(out_dev, NOXFRM));
 	if (!rth) {
 		err = -ENOBUFS;
@@ -2139,16 +2140,18 @@ static int ip_mkroute_input(struct sk_buff *skb,
 			    __be32 daddr, __be32 saddr, u32 tos,
 			    struct flow_keys *hkeys)
 {
+	unsigned int flags = 0;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && fib_info_num_path(res->fi) > 1) {
 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
 
 		fib_select_multipath(res, h);
+		flags |= RTCF_MULTIPATH;
 	}
 #endif
 
 	/* create a routing cache entry */
-	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
+	return __mkroute_input(skb, flags, res, in_dev, daddr, saddr, tos);
 }
 
 /* Implements all the saddr-related checks as ip_route_input_slow(),
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index d94041bb4287..1b7527a4a4bd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -99,10 +99,14 @@ static bool ip6_can_use_hint(const struct sk_buff *skb,
 static struct sk_buff *ip6_extract_route_hint(const struct net *net,
 					      struct sk_buff *skb)
 {
+#if 0
 	if (fib6_routes_require_src(net) || fib6_has_custom_rules(net))
 		return NULL;
 
 	return skb;
+#else
+	return NULL;
+#endif
 }
 
 static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile
index 770efbe24f0d..bf4e5745fd5c 100644
--- a/tools/testing/selftests/net/forwarding/Makefile
+++ b/tools/testing/selftests/net/forwarding/Makefile
@@ -70,6 +70,7 @@ TEST_PROGS = bridge_igmp.sh \
 	router_mpath_nh.sh \
 	router_multicast.sh \
 	router_multipath.sh \
+	router_multipath_vip.sh \
 	router_nh.sh \
 	router.sh \
 	router_vid_1.sh \
diff --git a/tools/testing/selftests/net/forwarding/router_multipath_vip.sh b/tools/testing/selftests/net/forwarding/router_multipath_vip.sh
new file mode 100755
index 000000000000..0415cf974388
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_multipath_vip.sh
@@ -0,0 +1,324 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +--------------------+                     +----------------------+
+# | H1                 |                     |                   H2 |
+# |                    |                     |                      |
+# |              $h1 + |                     | + $h2                |
+# |     192.0.2.2/24 | |                     | | 198.51.100.2/24    |
+# | 2001:db8:1::2/64 | |                     | | 2001:db8:2::2/64   |
+# |                  | |                     | |                    |
+# +------------------|-+                     +-|--------------------+
+#                    |                         |
+# +------------------|-------------------------|--------------------+
+# | SW               |                         |                    |
+# |                  |                         |                    |
+# |             $rp1 +                         + $rp2               |
+# |     192.0.2.1/24                             198.51.100.1/24    |
+# | 2001:db8:1::1/64     + vip                   2001:db8:2::1/64   |
+# |                        198.18.0.0/24                            |
+# |                        2001:db8:18::/64    + $rp3               |
+# |                                            | 203.0.113.1/24     |
+# |                                            | 2001:db8:3::1/64   |
+# |                                            |                    |
+# |                                            |                    |
+# +--------------------------------------------|--------------------+
+#                                              |
+#                                            +-|--------------------+
+#                                            | |                 H3 |
+#                                            | |                    |
+#                                            | | 203.0.113.2/24     |
+#                                            | | 2001:db8:3::2/64   |
+#                                            | + $h3                |
+#                                            |                      |
+#                                            +----------------------+
+
+ALL_TESTS="ping_ipv4 ping_ipv6 multipath_test"
+NUM_NETIFS=6
+source lib.sh
+
+h1_create()
+{
+	vrf_create "vrf-h1"
+	ip link set dev $h1 master vrf-h1
+
+	ip link set dev vrf-h1 up
+	ip link set dev $h1 up
+
+	ip address add 192.0.2.2/24 dev $h1
+	ip address add 2001:db8:1::2/64 dev $h1
+
+	ip route add default vrf vrf-h1 via 192.0.2.1
+	ip route add default vrf vrf-h1 via 2001:db8:1::1
+}
+
+h1_destroy()
+{
+	ip route del default vrf vrf-h1 via 2001:db8:1::1
+	ip route del default vrf vrf-h1 via 192.0.2.1
+
+	ip address del 2001:db8:1::2/64 dev $h1
+	ip address del 192.0.2.2/24 dev $h1
+
+	ip link set dev $h1 down
+	vrf_destroy "vrf-h1"
+}
+
+h2_create()
+{
+	vrf_create "vrf-h2"
+	ip link set dev $h2 master vrf-h2
+
+	ip link set dev vrf-h2 up
+	ip link set dev $h2 up
+
+	ip address add 198.51.100.2/24 dev $h2
+	ip address add 2001:db8:2::2/64 dev $h2
+
+	ip address add 198.18.0.0/24 dev vrf-h2
+	ip address add 2001:db8:18::/64 dev vrf-h2
+
+	ip route add 192.0.2.0/24 vrf vrf-h2 via 198.51.100.1
+	ip route add 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1
+}
+
+h2_destroy()
+{
+	ip route del 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1
+	ip route del 192.0.2.0/24 vrf vrf-h2 via 198.51.100.1
+
+	ip address del 2001:db8:18::/64 dev vrf-h2
+	ip address del 198.18.0.0/24 dev vrf-h2
+
+	ip address del 2001:db8:2::2/64 dev $h2
+	ip address del 198.51.100.2/24 dev $h2
+
+	ip link set dev $h2 down
+	vrf_destroy "vrf-h2"
+}
+
+h3_create()
+{
+	vrf_create "vrf-h3"
+	ip link set dev $h3 master vrf-h3
+
+	ip link set dev vrf-h3 up
+	ip link set dev $h3 up
+
+	ip address add 203.0.113.2/24 dev $h3
+	ip address add 2001:db8:3::2/64 dev $h3
+
+	ip address add 198.18.0.0/24 dev vrf-h3
+	ip address add 2001:db8:18::/64 dev vrf-h3
+
+	ip route add 192.0.2.0/24 vrf vrf-h3 via 203.0.113.1
+	ip route add 2001:db8:1::/64 vrf vrf-h3 nexthop via 2001:db8:3::1
+}
+
+h3_destroy()
+{
+	ip route del 2001:db8:1::/64 vrf vrf-h3 nexthop via 2001:db8:3::1
+	ip route del 192.0.2.0/24 vrf vrf-h3 via 203.0.113.1
+
+	ip address del 198.18.0.0/24 dev vrf-h3
+	ip address del 2001:db8:18::/64 dev vrf-h3
+
+	ip address del 2001:db8:3::2/64 dev $h3
+	ip address del 203.0.113.2/24 dev $h3
+
+	ip link set dev $h3 down
+	vrf_destroy "vrf-h3"
+}
+
+router1_create()
+{
+	vrf_create "vrf-r1"
+	ip link set dev $rp1 master vrf-r1
+	ip link set dev $rp2 master vrf-r1
+	ip link set dev $rp3 master vrf-r1
+
+	ip link set dev vrf-r1 up
+	ip link set dev $rp1 up
+	ip link set dev $rp2 up
+	ip link set dev $rp3 up
+
+	ip address add 192.0.2.1/24 dev $rp1
+	ip address add 2001:db8:1::1/64 dev $rp1
+
+	ip address add 198.51.100.1/24 dev $rp2
+	ip address add 2001:db8:2::1/64 dev $rp2
+
+	ip address add 203.0.113.1/24 dev $rp3
+	ip address add 2001:db8:3::1/64 dev $rp3
+
+	ip route add 198.18.0.0/24 vrf vrf-r1 \
+		nexthop via 198.51.100.2 \
+		nexthop via 203.0.113.2
+	ip route add 2001:db8:18::/64 vrf vrf-r1 \
+		nexthop via 2001:db8:2::2 \
+		nexthop via 2001:db8:3::2
+}
+
+router1_destroy()
+{
+	ip route del 2001:db8:18::/64 vrf vrf-r1
+	ip route del 198.18.0.0/24 vrf vrf-r1
+
+	ip address del 2001:db8:3::1/64 dev $rp3
+	ip address del 203.0.113.1/24 dev $rp3
+
+	ip address del 2001:db8:2::1/64 dev $rp2
+	ip address del 198.51.100.1/24 dev $rp2
+
+	ip address del 2001:db8:1::1/64 dev $rp1
+	ip address del 192.0.2.1/24 dev $rp1
+
+	ip link set dev $rp3 down
+	ip link set dev $rp2 down
+	ip link set dev $rp1 down
+
+	vrf_destroy "vrf-r1"
+}
+
+multipath4_test()
+{
+	local desc="$1"
+	local weight_rp2=$2
+	local weight_rp3=$3
+	local t0_rp2 t0_rp3 t1_rp2 t1_rp3
+	local packets_rp2 packets_rp3
+
+	# Transmit multiple flows from h1 to h2 and make sure they are
+	# distributed between both multipath links (rp2 and rp3)
+	# according to the configured weights.
+	sysctl_set net.ipv4.fib_multipath_hash_policy 1
+	ip route replace 198.18.0.0/24 vrf vrf-r1 \
+		nexthop via 198.51.100.2 weight $weight_rp2 \
+		nexthop via 203.0.113.2 weight $weight_rp3
+
+	t0_rp2=$(link_stats_tx_packets_get $rp2)
+	t0_rp3=$(link_stats_tx_packets_get $rp3)
+
+	ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.18.0.0 \
+		-d 1msec -t tcp "sp=1024,dp=0-32768"
+
+	t1_rp2=$(link_stats_tx_packets_get $rp2)
+	t1_rp3=$(link_stats_tx_packets_get $rp3)
+
+	let "packets_rp2 = $t1_rp2 - $t0_rp2"
+	let "packets_rp3 = $t1_rp3 - $t0_rp3"
+	multipath_eval "$desc" $weight_rp2 $weight_rp3 $packets_rp2 $packets_rp3
+
+	ip route replace 198.18.0.0/24 vrf vrf-r1 \
+		nexthop via 198.51.100.2 \
+		nexthop via 203.0.113.2
+
+	sysctl_restore net.ipv4.fib_multipath_hash_policy
+}
+
+multipath6_l4_test()
+{
+	local desc="$1"
+	local weight_rp2=$2
+	local weight_rp3=$3
+	local t0_rp2 t0_rp3 t1_rp2 t1_rp3
+	local packets_rp2 packets_rp3
+
+	# Transmit multiple flows from h1 to h2 and make sure they are
+	# distributed between both multipath links (rp2 and rp3)
+	# according to the configured weights.
+	sysctl_set net.ipv6.fib_multipath_hash_policy 1
+	ip route replace 2001:db8:18::/64 vrf vrf-r1 \
+		nexthop via 2001:db8:2::2 weight $weight_rp2 \
+		nexthop via 2001:db8:3::2 weight $weight_rp3
+
+	t0_rp2=$(link_stats_tx_packets_get $rp2)
+	t0_rp3=$(link_stats_tx_packets_get $rp3)
+
+	ip vrf exec vrf-h1 $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:18::0 \
+		-d 1msec -t tcp "sp=1024,dp=0-32768"
+
+	t1_rp2=$(link_stats_tx_packets_get $rp2)
+	t1_rp3=$(link_stats_tx_packets_get $rp3)
+
+	let "packets_rp2 = $t1_rp2 - $t0_rp2"
+	let "packets_rp3 = $t1_rp3 - $t0_rp3"
+	multipath_eval "$desc" $weight_rp2 $weight_rp3 $packets_rp2 $packets_rp3
+
+	ip route replace 2001:db8:18::/64 vrf vrf-r1 \
+		nexthop via 2001:db8:2::2 \
+		nexthop via 2001:db8:3::2
+
+	sysctl_restore net.ipv6.fib_multipath_hash_policy
+}
+
+multipath_test()
+{
+	log_info "Running IPv4 multipath tests"
+	multipath4_test "ECMP" 1 1
+	multipath4_test "Weighted MP 2:1" 2 1
+	multipath4_test "Weighted MP 11:45" 11 45
+
+	log_info "Running IPv6 L4 hash multipath tests"
+	multipath6_l4_test "ECMP" 1 1
+	multipath6_l4_test "Weighted MP 2:1" 2 1
+	multipath6_l4_test "Weighted MP 11:45" 11 45
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	rp1=${NETIFS[p2]}
+
+	rp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	rp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+
+	h1_create
+	h2_create
+	h3_create
+
+	router1_create
+
+	forwarding_enable
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	forwarding_restore
+
+	router1_destroy
+
+	h3_destroy
+	h2_destroy
+	h1_destroy
+
+	vrf_cleanup
+}
+
+ping_ipv4()
+{
+	ping_test $h1 198.51.100.2
+	ping_test $h1 203.0.113.2
+}
+
+ping_ipv6()
+{
+	ping6_test $h1 2001:db8:2::2
+	ping6_test $h1 2001:db8:3::2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ