[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b044faad-1e3f-4c65-b2e6-fc418aebd22e@bytedance.com>
Date: Thu, 21 Nov 2024 17:22:36 +0800
From: Zelong Yue <yuezelong@...edance.com>
To: jgg@...pe.ca, leon@...nel.org
Cc: linux-rdma@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [RFC] RDMA/core: Fix IPv6 loopback dst MAC address lookup logic
Gently ping. Do I need to provide more detailed information on how to
reproduce the issue?
On 11/10/24 8:35 PM, yuezelong wrote:
> Imagine we have two RNICs on a single machine, named eth1 and eth2, with
>
> - IPv4 addresses: 192.168.1.2, 192.168.1.3
> - IPv6 addresses (scope global): fdbd::beef:2, fdbd::beef:3
> - MAC addresses: 11:11:11:11:11:02, 11:11:11:11:11:03,
>
> they all connnected to a gateway with MAC address 22:22:22:22:22:02.
>
> If we want to setup connections between these two RNICs, with RC QP, we
> would go through `rdma_resolve_ip` for looking up dst MAC addresses. The
> procedure it's the same as using command
>
> `ip route get dst_addr from src_addr oif src_dev`
>
> In IPv4 scenario, you would likely get
>
> ```
> $ ip route get 192.168.1.2 from 192.168.1.3 oif eth2
>
> 192.168.1.2 from 192.168.1.3 via 192.168.1.1 dev eth2 ...
> ```
>
> Looks reasonable as it would go through the gateway.
>
> But in IPv6 scenario, you would likely get
>
> ```
> $ ip route get fdbd::beef:2 from fdbd::beef:3 oif eth2
>
> local fdbd::beef:2 from fdbd::beed:3 dev lo table local proto kernel src fdbd::beef:2 metric 0 pref medium
> ```
>
> This would lead to the RDMA route lookup procedure filling the dst MAC
> address with src net device's MAC address (11:11:11:11:11:03), but
> filling the dst IP address with dst net device's IPv6 address
> (fdbd::beef:2), src net device would drop this packet, and we would fail
> to setup the connection.
>
> To make setting up loopback connections like this possible, we need to
> send packets to the gateway and let the gateway send it back (actually,
> the IPv4 lookup result would lead to this, so there is no problem in IPv4
> scenario), so we need to adjust current lookup procedure, if we find out
> the src device and dst device is on the same machine (same namespace),
> we need to send the packets to the gateway instead of the src device
> itself.
>
> Signed-off-by: yuezelong <yuezelong@...edance.com>
> ---
> drivers/infiniband/core/addr.c | 95 +++++++++++++++++++++++++++++++++-
> drivers/infiniband/core/cma.c | 7 ++-
> 2 files changed, 99 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
> index c4cf26f1d149..d194e6b7f2b9 100644
> --- a/drivers/infiniband/core/addr.c
> +++ b/drivers/infiniband/core/addr.c
> @@ -545,6 +545,29 @@ static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr)
> addr->bound_dev_if = 0;
> }
>
> +static struct dst_entry *get_default_ipv6_route(struct in6_addr *src_addr, struct net_device *dev)
> +{
> + struct flowi6 fl6;
> + struct dst_entry *dst = NULL;
> +
> + memset(&fl6, 0, sizeof(fl6));
> + fl6.flowi6_iif = dev_net(dev)->loopback_dev->ifindex;
> + fl6.flowi6_oif = dev->ifindex;
> + fl6.saddr = *src_addr;
> + fl6.daddr = in6addr_any;
> +
> + dst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(dev), NULL, &fl6, NULL);
> + if (IS_ERR(dst))
> + return NULL;
> +
> + if (dst && dst->error) {
> + dst_release(dst);
> + return NULL;
> + }
> +
> + return dst;
> +}
> +
> static int addr_resolve(struct sockaddr *src_in,
> const struct sockaddr *dst_in,
> struct rdma_dev_addr *addr,
> @@ -597,9 +620,77 @@ static int addr_resolve(struct sockaddr *src_in,
> * Resolve neighbor destination address if requested and
> * only if src addr translation didn't fail.
> */
> - if (!ret && resolve_neigh)
> - ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
> + if (!ret && resolve_neigh) {
> + if ((src_in->sa_family == AF_INET6) && (ndev_flags & IFF_LOOPBACK)) {
> + rcu_read_lock();
> + /*
> + * When src net device and dst net device is different device,
> + * traditional TCP/IP loopback won't work for RDMA. We need to find
> + * gateway for src net device and send packets to the gateway, then
> + * let the gateway send it back to the dst device. This is likely
> + * be problematic in IPv6 scenario, the route logic would likely fill
> + * the dst MAC address with src net device's MAC, but with dst IP
> + * belongs to the dst net device, leading to packet drop.
> + *
> + * Thus, we need to figure out gateway's MAC address in IPv6 loopback
> + * scenario.
> + */
> + struct net_device *ndev = READ_ONCE(dst->dev);
> + struct net_device *src_ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev),
> + src_in);
> + struct net_device *dst_ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev),
> + dst_in);
> +
> + if (IS_ERR(src_ndev) || IS_ERR(dst_ndev)) {
> + ret = -ENODEV;
> + rcu_read_unlock();
> + goto exit;
> + }
> +
> + if (src_ndev != dst_ndev) {
> + dst_release(dst);
> + dst = get_default_ipv6_route((struct in6_addr *)src_in->sa_data,
> + src_ndev);
> + ndev_flags = src_ndev->flags;
> + } else {
> + rcu_read_unlock();
> + /*
> + * For real loopback (src and dst is the same device), we can
> + * just use the original code path.
> + */
> + ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
> + goto exit;
> + }
> + rcu_read_unlock();
> +
> + if (dst == NULL) {
> + ret = -EINVAL;
> + goto done;
> + }
> +
> + /*
> + * Though we fill `in6addr_any` as dst addr here, `xfrm_neigh_lookup`
> + * would still find nexthop for us, which provides gateway MAC address.
> + */
> + struct sockaddr_in6 addr_in = {
> + .sin6_family = AF_INET6,
> + .sin6_addr = in6addr_any,
> + };
> + const void *daddr = (const void *)&addr_in.sin6_addr;
> +
> + might_sleep();
> +
> + /*
> + * Use `addr_resolve_neigh` here would go into `ib_nl_fetch_ha` branch,
> + * which would fail because of `rdma_nl_chk_listeners` returns error.
> + */
> + ret = dst_fetch_ha(dst, addr, daddr);
> + } else {
> + ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
> + }
> + }
>
> +exit:
> if (src_in->sa_family == AF_INET)
> ip_rt_put(rt);
> else
> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
> index 64ace0b968f0..744f396568cd 100644
> --- a/drivers/infiniband/core/cma.c
> +++ b/drivers/infiniband/core/cma.c
> @@ -1612,7 +1612,12 @@ static bool validate_ipv6_net_dev(struct net_device *net_dev,
> if (!rt)
> return false;
>
> - ret = rt->rt6i_idev->dev == net_dev;
> + if (rt->rt6i_flags & (RTF_LOCAL | RTF_NONEXTHOP)) {
> + // TODO: how to validate netdev when the device is loopback?
> + ret = true;
> + } else {
> + ret = rt->rt6i_idev->dev == net_dev;
> + }
> ip6_rt_put(rt);
>
> return ret;
Powered by blists - more mailing lists