[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230815201048.1796-1-sriram.yagnaraman@est.tech>
Date: Tue, 15 Aug 2023 22:10:48 +0200
From: Sriram Yagnaraman <sriram.yagnaraman@....tech>
To:
Cc: netdev@...r.kernel.org, Sriram Yagnaraman <sriram.yagnaraman@....tech>
Subject: [Question]: TCP resets when using ECMP for load-balancing between multiple servers.
All packets in the same flow (L3/L4 depending on multipath hash policy)
should be directed to the same target, but after [0] we see stray
packets directed towards other targets. This, for instance, causes RST
to be sent on TCP connections. This happens on a static setup, with no
changes to the nexthops, so there is no hash space reassignment.
IIUC, route hints when the next hop is part of a multipath group causes
packets in the same receive batch to be sent to the same next hop
irrespective of which nexthop the multipath hash points to. I am no
expert in this area, so please let me know if there is a simple
explanation on how to fix this problem?
Below is a patch which has a selftest that describes the problem setup
and a hack to solve the problem in ipv4. For ipv6, I have just commented
out the part the returns the route hint, just for testing.
[0]: 02b24941619f ("ipv4: use dst hint for ipv4 list receive")
---
include/uapi/linux/in_route.h | 1 +
net/ipv4/ip_input.c | 9 +-
net/ipv4/route.c | 7 +-
net/ipv6/ip6_input.c | 4 +
.../testing/selftests/net/forwarding/Makefile | 1 +
.../net/forwarding/router_multipath_vip.sh | 324 ++++++++++++++++++
6 files changed, 341 insertions(+), 5 deletions(-)
create mode 100755 tools/testing/selftests/net/forwarding/router_multipath_vip.sh
diff --git a/include/uapi/linux/in_route.h b/include/uapi/linux/in_route.h
index 0cc2c23b47f8..01ae06c7743b 100644
--- a/include/uapi/linux/in_route.h
+++ b/include/uapi/linux/in_route.h
@@ -15,6 +15,7 @@
#define RTCF_REDIRECTED 0x00040000
#define RTCF_TPROXY 0x00080000 /* unused */
+#define RTCF_MULTIPATH 0x00200000
#define RTCF_FAST 0x00200000 /* unused */
#define RTCF_MASQ 0x00400000 /* unused */
#define RTCF_SNAT 0x00800000 /* unused */
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fe9ead9ee863..e06a1a6a4357 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -582,9 +582,11 @@ static void ip_sublist_rcv_finish(struct list_head *head)
}
static struct sk_buff *ip_extract_route_hint(const struct net *net,
- struct sk_buff *skb, int rt_type)
+ struct sk_buff *skb, int rt_type,
+ unsigned int rt_flags)
{
- if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST)
+ if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
+ !!(rt_flags & RTCF_MULTIPATH))
return NULL;
return skb;
@@ -615,7 +617,8 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
dst = skb_dst(skb);
if (curr_dst != dst) {
hint = ip_extract_route_hint(net, skb,
- ((struct rtable *)dst)->rt_type);
+ ((struct rtable *)dst)->rt_type,
+ ((struct rtable *)dst)->rt_flags);
/* dispatch old sublist */
if (!list_empty(&sublist))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 92fede388d52..232b507faf04 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1786,6 +1786,7 @@ static void ip_handle_martian_source(struct net_device *dev,
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
+ unsigned int flags,
const struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
@@ -1856,7 +1857,7 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
- rth = rt_dst_alloc(out_dev->dev, 0, res->type,
+ rth = rt_dst_alloc(out_dev->dev, flags, res->type,
IN_DEV_ORCONF(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
@@ -2139,16 +2140,18 @@ static int ip_mkroute_input(struct sk_buff *skb,
__be32 daddr, __be32 saddr, u32 tos,
struct flow_keys *hkeys)
{
+ unsigned int flags = 0;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
fib_select_multipath(res, h);
+ flags |= RTCF_MULTIPATH;
}
#endif
/* create a routing cache entry */
- return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
+ return __mkroute_input(skb, flags, res, in_dev, daddr, saddr, tos);
}
/* Implements all the saddr-related checks as ip_route_input_slow(),
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index d94041bb4287..1b7527a4a4bd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -99,10 +99,14 @@ static bool ip6_can_use_hint(const struct sk_buff *skb,
static struct sk_buff *ip6_extract_route_hint(const struct net *net,
struct sk_buff *skb)
{
+#if 0
if (fib6_routes_require_src(net) || fib6_has_custom_rules(net))
return NULL;
return skb;
+#else
+ return NULL;
+#endif
}
static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile
index 770efbe24f0d..bf4e5745fd5c 100644
--- a/tools/testing/selftests/net/forwarding/Makefile
+++ b/tools/testing/selftests/net/forwarding/Makefile
@@ -70,6 +70,7 @@ TEST_PROGS = bridge_igmp.sh \
router_mpath_nh.sh \
router_multicast.sh \
router_multipath.sh \
+ router_multipath_vip.sh \
router_nh.sh \
router.sh \
router_vid_1.sh \
diff --git a/tools/testing/selftests/net/forwarding/router_multipath_vip.sh b/tools/testing/selftests/net/forwarding/router_multipath_vip.sh
new file mode 100755
index 000000000000..0415cf974388
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_multipath_vip.sh
@@ -0,0 +1,324 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +--------------------+ +----------------------+
+# | H1 | | H2 |
+# | | | |
+# | $h1 + | | + $h2 |
+# | 192.0.2.2/24 | | | | 198.51.100.2/24 |
+# | 2001:db8:1::2/64 | | | | 2001:db8:2::2/64 |
+# | | | | | |
+# +------------------|-+ +-|--------------------+
+# | |
+# +------------------|-------------------------|--------------------+
+# | SW | | |
+# | | | |
+# | $rp1 + + $rp2 |
+# | 192.0.2.1/24 198.51.100.1/24 |
+# | 2001:db8:1::1/64 + vip 2001:db8:2::1/64 |
+# | 198.18.0.0/24 |
+# | 2001:db8:18::/64 + $rp3 |
+# | | 203.0.113.1/24 |
+# | | 2001:db8:3::1/64 |
+# | | |
+# | | |
+# +--------------------------------------------|--------------------+
+# |
+# +-|--------------------+
+# | | H3 |
+# | | |
+# | | 203.0.113.2/24 |
+# | | 2001:db8:3::2/64 |
+# | + $h3 |
+# | |
+# +----------------------+
+
+ALL_TESTS="ping_ipv4 ping_ipv6 multipath_test"
+NUM_NETIFS=6
+source lib.sh
+
+h1_create()
+{
+ vrf_create "vrf-h1"
+ ip link set dev $h1 master vrf-h1
+
+ ip link set dev vrf-h1 up
+ ip link set dev $h1 up
+
+ ip address add 192.0.2.2/24 dev $h1
+ ip address add 2001:db8:1::2/64 dev $h1
+
+ ip route add default vrf vrf-h1 via 192.0.2.1
+ ip route add default vrf vrf-h1 via 2001:db8:1::1
+}
+
+h1_destroy()
+{
+ ip route del default vrf vrf-h1 via 2001:db8:1::1
+ ip route del default vrf vrf-h1 via 192.0.2.1
+
+ ip address del 2001:db8:1::2/64 dev $h1
+ ip address del 192.0.2.2/24 dev $h1
+
+ ip link set dev $h1 down
+ vrf_destroy "vrf-h1"
+}
+
+h2_create()
+{
+ vrf_create "vrf-h2"
+ ip link set dev $h2 master vrf-h2
+
+ ip link set dev vrf-h2 up
+ ip link set dev $h2 up
+
+ ip address add 198.51.100.2/24 dev $h2
+ ip address add 2001:db8:2::2/64 dev $h2
+
+ ip address add 198.18.0.0/24 dev vrf-h2
+ ip address add 2001:db8:18::/64 dev vrf-h2
+
+ ip route add 192.0.2.0/24 vrf vrf-h2 via 198.51.100.1
+ ip route add 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1
+}
+
+h2_destroy()
+{
+ ip route del 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1
+ ip route del 192.0.2.0/24 vrf vrf-h2 via 198.51.100.1
+
+ ip address del 2001:db8:18::/64 dev vrf-h2
+ ip address del 198.18.0.0/24 dev vrf-h2
+
+ ip address del 2001:db8:2::2/64 dev $h2
+ ip address del 198.51.100.2/24 dev $h2
+
+ ip link set dev $h2 down
+ vrf_destroy "vrf-h2"
+}
+
+h3_create()
+{
+ vrf_create "vrf-h3"
+ ip link set dev $h3 master vrf-h3
+
+ ip link set dev vrf-h3 up
+ ip link set dev $h3 up
+
+ ip address add 203.0.113.2/24 dev $h3
+ ip address add 2001:db8:3::2/64 dev $h3
+
+ ip address add 198.18.0.0/24 dev vrf-h3
+ ip address add 2001:db8:18::/64 dev vrf-h3
+
+ ip route add 192.0.2.0/24 vrf vrf-h3 via 203.0.113.1
+ ip route add 2001:db8:1::/64 vrf vrf-h3 nexthop via 2001:db8:3::1
+}
+
+h3_destroy()
+{
+ ip route del 2001:db8:1::/64 vrf vrf-h3 nexthop via 2001:db8:3::1
+ ip route del 192.0.2.0/24 vrf vrf-h3 via 203.0.113.1
+
+ ip address del 198.18.0.0/24 dev vrf-h3
+ ip address del 2001:db8:18::/64 dev vrf-h3
+
+ ip address del 2001:db8:3::2/64 dev $h3
+ ip address del 203.0.113.2/24 dev $h3
+
+ ip link set dev $h3 down
+ vrf_destroy "vrf-h3"
+}
+
+router1_create()
+{
+ vrf_create "vrf-r1"
+ ip link set dev $rp1 master vrf-r1
+ ip link set dev $rp2 master vrf-r1
+ ip link set dev $rp3 master vrf-r1
+
+ ip link set dev vrf-r1 up
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+ ip link set dev $rp3 up
+
+ ip address add 192.0.2.1/24 dev $rp1
+ ip address add 2001:db8:1::1/64 dev $rp1
+
+ ip address add 198.51.100.1/24 dev $rp2
+ ip address add 2001:db8:2::1/64 dev $rp2
+
+ ip address add 203.0.113.1/24 dev $rp3
+ ip address add 2001:db8:3::1/64 dev $rp3
+
+ ip route add 198.18.0.0/24 vrf vrf-r1 \
+ nexthop via 198.51.100.2 \
+ nexthop via 203.0.113.2
+ ip route add 2001:db8:18::/64 vrf vrf-r1 \
+ nexthop via 2001:db8:2::2 \
+ nexthop via 2001:db8:3::2
+}
+
+router1_destroy()
+{
+ ip route del 2001:db8:18::/64 vrf vrf-r1
+ ip route del 198.18.0.0/24 vrf vrf-r1
+
+ ip address del 2001:db8:3::1/64 dev $rp3
+ ip address del 203.0.113.1/24 dev $rp3
+
+ ip address del 2001:db8:2::1/64 dev $rp2
+ ip address del 198.51.100.1/24 dev $rp2
+
+ ip address del 2001:db8:1::1/64 dev $rp1
+ ip address del 192.0.2.1/24 dev $rp1
+
+ ip link set dev $rp3 down
+ ip link set dev $rp2 down
+ ip link set dev $rp1 down
+
+ vrf_destroy "vrf-r1"
+}
+
+multipath4_test()
+{
+ local desc="$1"
+ local weight_rp2=$2
+ local weight_rp3=$3
+ local t0_rp2 t0_rp3 t1_rp2 t1_rp3
+ local packets_rp2 packets_rp3
+
+ # Transmit multiple flows from h1 to h2 and make sure they are
+ # distributed between both multipath links (rp2 and rp3)
+ # according to the configured weights.
+ sysctl_set net.ipv4.fib_multipath_hash_policy 1
+ ip route replace 198.18.0.0/24 vrf vrf-r1 \
+ nexthop via 198.51.100.2 weight $weight_rp2 \
+ nexthop via 203.0.113.2 weight $weight_rp3
+
+ t0_rp2=$(link_stats_tx_packets_get $rp2)
+ t0_rp3=$(link_stats_tx_packets_get $rp3)
+
+ ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.18.0.0 \
+ -d 1msec -t tcp "sp=1024,dp=0-32768"
+
+ t1_rp2=$(link_stats_tx_packets_get $rp2)
+ t1_rp3=$(link_stats_tx_packets_get $rp3)
+
+ let "packets_rp2 = $t1_rp2 - $t0_rp2"
+ let "packets_rp3 = $t1_rp3 - $t0_rp3"
+ multipath_eval "$desc" $weight_rp2 $weight_rp3 $packets_rp2 $packets_rp3
+
+ ip route replace 198.18.0.0/24 vrf vrf-r1 \
+ nexthop via 198.51.100.2 \
+ nexthop via 203.0.113.2
+
+ sysctl_restore net.ipv4.fib_multipath_hash_policy
+}
+
+multipath6_l4_test()
+{
+ local desc="$1"
+ local weight_rp2=$2
+ local weight_rp3=$3
+ local t0_rp2 t0_rp3 t1_rp2 t1_rp3
+ local packets_rp2 packets_rp3
+
+ # Transmit multiple flows from h1 to h2 and make sure they are
+ # distributed between both multipath links (rp2 and rp3)
+ # according to the configured weights.
+ sysctl_set net.ipv6.fib_multipath_hash_policy 1
+ ip route replace 2001:db8:18::/64 vrf vrf-r1 \
+ nexthop via 2001:db8:2::2 weight $weight_rp2 \
+ nexthop via 2001:db8:3::2 weight $weight_rp3
+
+ t0_rp2=$(link_stats_tx_packets_get $rp2)
+ t0_rp3=$(link_stats_tx_packets_get $rp3)
+
+ ip vrf exec vrf-h1 $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:18::0 \
+ -d 1msec -t tcp "sp=1024,dp=0-32768"
+
+ t1_rp2=$(link_stats_tx_packets_get $rp2)
+ t1_rp3=$(link_stats_tx_packets_get $rp3)
+
+ let "packets_rp2 = $t1_rp2 - $t0_rp2"
+ let "packets_rp3 = $t1_rp3 - $t0_rp3"
+ multipath_eval "$desc" $weight_rp2 $weight_rp3 $packets_rp2 $packets_rp3
+
+ ip route replace 2001:db8:18::/64 vrf vrf-r1 \
+ nexthop via 2001:db8:2::2 \
+ nexthop via 2001:db8:3::2
+
+ sysctl_restore net.ipv6.fib_multipath_hash_policy
+}
+
+multipath_test()
+{
+ log_info "Running IPv4 multipath tests"
+ multipath4_test "ECMP" 1 1
+ multipath4_test "Weighted MP 2:1" 2 1
+ multipath4_test "Weighted MP 11:45" 11 45
+
+ log_info "Running IPv6 L4 hash multipath tests"
+ multipath6_l4_test "ECMP" 1 1
+ multipath6_l4_test "Weighted MP 2:1" 2 1
+ multipath6_l4_test "Weighted MP 11:45" 11 45
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ rp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+
+ router1_create
+
+ forwarding_enable
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router1_destroy
+
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 198.51.100.2
+ ping_test $h1 203.0.113.2
+}
+
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:2::2
+ ping6_test $h1 2001:db8:3::2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
--
2.34.1
Powered by blists - more mailing lists