lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b044faad-1e3f-4c65-b2e6-fc418aebd22e@bytedance.com>
Date: Thu, 21 Nov 2024 17:22:36 +0800
From: Zelong Yue <yuezelong@...edance.com>
To: jgg@...pe.ca, leon@...nel.org
Cc: linux-rdma@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [RFC] RDMA/core: Fix IPv6 loopback dst MAC address lookup logic

Gentle ping. Do I need to provide more detailed information on how to
reproduce the issue?

On 11/10/24 8:35 PM, yuezelong wrote:
> Imagine we have two RNICs on a single machine, named eth1 and eth2, with
>
> - IPv4 addresses: 192.168.1.2, 192.168.1.3
> - IPv6 addresses (scope global): fdbd::beef:2, fdbd::beef:3
> - MAC addresses: 11:11:11:11:11:02, 11:11:11:11:11:03,
>
> they are all connected to a gateway with MAC address 22:22:22:22:22:02.
>
> If we want to set up connections between these two RNICs, with RC QP, we
> would go through `rdma_resolve_ip` to look up the dst MAC addresses. The
> procedure is the same as using the command
>
> `ip route get dst_addr from src_addr oif src_dev`
>
> In IPv4 scenario, you would likely get
>
> ```
> $ ip route get 192.168.1.2 from 192.168.1.3 oif eth2
>
> 192.168.1.2 from 192.168.1.3 via 192.168.1.1 dev eth2 ...
> ```
>
> Looks reasonable as it would go through the gateway.
>
> But in IPv6 scenario, you would likely get
>
> ```
> $ ip route get fdbd::beef:2 from fdbd::beef:3 oif eth2
>
> local fdbd::beef:2 from fdbd::beef:3 dev lo table local proto kernel src fdbd::beef:2 metric 0 pref medium
> ```
>
> This would lead to the RDMA route lookup procedure filling the dst MAC
> address with the src net device's MAC address (11:11:11:11:11:03), but
> filling the dst IP address with the dst net device's IPv6 address
> (fdbd::beef:2). The src net device would drop this packet, and we would
> fail to set up the connection.
>
> To make setting up loopback connections like this possible, we need to
> send packets to the gateway and let the gateway send them back (actually,
> the IPv4 lookup result already leads to this, so there is no problem in
> the IPv4 scenario). So we need to adjust the current lookup procedure: if
> we find that the src device and dst device are on the same machine (same
> namespace), we need to send the packets to the gateway instead of the src
> device itself.
>
> Signed-off-by: yuezelong <yuezelong@...edance.com>
> ---
>   drivers/infiniband/core/addr.c | 95 +++++++++++++++++++++++++++++++++-
>   drivers/infiniband/core/cma.c  |  7 ++-
>   2 files changed, 99 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
> index c4cf26f1d149..d194e6b7f2b9 100644
> --- a/drivers/infiniband/core/addr.c
> +++ b/drivers/infiniband/core/addr.c
> @@ -545,6 +545,29 @@ static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr)
>   	addr->bound_dev_if = 0;
>   }
>   
> +static struct dst_entry *get_default_ipv6_route(struct in6_addr *src_addr, struct net_device *dev)
> +{
> +	struct flowi6 fl6;
> +	struct dst_entry *dst = NULL;
> +
> +	memset(&fl6, 0, sizeof(fl6));
> +	fl6.flowi6_iif = dev_net(dev)->loopback_dev->ifindex;
> +	fl6.flowi6_oif = dev->ifindex;
> +	fl6.saddr = *src_addr;
> +	fl6.daddr = in6addr_any;
> +
> +	dst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(dev), NULL, &fl6, NULL);
> +	if (IS_ERR(dst))
> +		return NULL;
> +
> +	if (dst && dst->error) {
> +		dst_release(dst);
> +		return NULL;
> +	}
> +
> +	return dst;
> +}
> +
>   static int addr_resolve(struct sockaddr *src_in,
>   			const struct sockaddr *dst_in,
>   			struct rdma_dev_addr *addr,
> @@ -597,9 +620,77 @@ static int addr_resolve(struct sockaddr *src_in,
>   	 * Resolve neighbor destination address if requested and
>   	 * only if src addr translation didn't fail.
>   	 */
> -	if (!ret && resolve_neigh)
> -		ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
> +	if (!ret && resolve_neigh) {
> +		if ((src_in->sa_family == AF_INET6) && (ndev_flags & IFF_LOOPBACK)) {
> +			rcu_read_lock();
> +			/*
> +			 * When src net device and dst net device is different device,
> +			 * traditional TCP/IP loopback won't work for RDMA. We need to find
> +			 * gateway for src net device and send packets to the gateway, then
> +			 * let the gateway send it back to the dst device. This is likely
> +			 * be problematic in IPv6 scenario, the route logic would likely fill
> +			 * the dst MAC address with src net device's MAC, but with dst IP
> +			 * belongs to the dst net device, leading to packet drop.
> +			 *
> +			 * Thus, we need to figure out gateway's MAC address in IPv6 loopback
> +			 * scenario.
> +			 */
> +			struct net_device *ndev = READ_ONCE(dst->dev);
> +			struct net_device *src_ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev),
> +										    src_in);
> +			struct net_device *dst_ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev),
> +										    dst_in);
> +
> +			if (IS_ERR(src_ndev) || IS_ERR(dst_ndev)) {
> +				ret = -ENODEV;
> +				rcu_read_unlock();
> +				goto exit;
> +			}
> +
> +			if (src_ndev != dst_ndev) {
> +				dst_release(dst);
> +				dst = get_default_ipv6_route((struct in6_addr *)src_in->sa_data,
> +							     src_ndev);
> +				ndev_flags = src_ndev->flags;
> +			} else {
> +				rcu_read_unlock();
> +				/*
> +				 * For real loopback (src and dst is the same device), we can
> +				 * just use the original code path.
> +				 */
> +				ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
> +				goto exit;
> +			}
> +			rcu_read_unlock();
> +
> +			if (dst == NULL) {
> +				ret = -EINVAL;
> +				goto done;
> +			}
> +
> +			/*
> +			 * Though we fill `in6addr_any` as dst addr here, `xfrm_neigh_lookup`
> +			 * would still find nexthop for us, which provides gateway MAC address.
> +			 */
> +			struct sockaddr_in6 addr_in = {
> +				.sin6_family = AF_INET6,
> +				.sin6_addr = in6addr_any,
> +			};
> +			const void *daddr = (const void *)&addr_in.sin6_addr;
> +
> +			might_sleep();
> +
> +			/*
> +			 * Use `addr_resolve_neigh` here would go into `ib_nl_fetch_ha` branch,
> +			 * which would fail because of `rdma_nl_chk_listeners` returns error.
> +			 */
> +			ret = dst_fetch_ha(dst, addr, daddr);
> +		} else {
> +			ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
> +		}
> +	}
>   
> +exit:
>   	if (src_in->sa_family == AF_INET)
>   		ip_rt_put(rt);
>   	else
> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
> index 64ace0b968f0..744f396568cd 100644
> --- a/drivers/infiniband/core/cma.c
> +++ b/drivers/infiniband/core/cma.c
> @@ -1612,7 +1612,12 @@ static bool validate_ipv6_net_dev(struct net_device *net_dev,
>   	if (!rt)
>   		return false;
>   
> -	ret = rt->rt6i_idev->dev == net_dev;
> +	if (rt->rt6i_flags & (RTF_LOCAL | RTF_NONEXTHOP)) {
> +		// TODO: how to validate netdev when the device is loopback?
> +		ret = true;
> +	} else {
> +		ret = rt->rt6i_idev->dev == net_dev;
> +	}
>   	ip6_rt_put(rt);
>   
>   	return ret;

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ