Date: Tue,  9 Jan 2024 10:13:49 +1100
From: Jon Maxwell <jmaxwell37@...il.com>
To: davem@...emloft.net
Cc: edumazet@...gle.com,
	kuba@...nel.org,
	pabeni@...hat.com,
	dsahern@...nel.org,
	netdev@...r.kernel.org,
	jmaxwell37@...il.com,
	Davide Caratti <dcaratti@...hat.com>
Subject: [net-next] tcp: Avoid sending an erroneous RST due to RCU race

There are two cases where a socket lookup under RCU races and finds a LISTEN
socket instead of the ESTABLISHED or TIME-WAIT socket. Because the ACK flag
is set, this generates an erroneous RST.

In the first scenario, two ACKs (one completing the 3-way handshake and
another with the same sequence number carrying data) are sent within a very
small interval. The two ACKs can race while being processed on different
CPUs, and the second one may find the LISTEN socket instead of the
ESTABLISHED socket. That leaves one end of the TCP connection out of sync
with the other and breaks communications. In the second scenario, a FIN-ACK
races with an ACK and may likewise find the LISTEN socket instead of the
TIME_WAIT socket; instead of being ignored, it generates an invalid RST.

As a result, instead of succeeding, the client's next connection attempt
fails with ECONNREFUSED because it finds a socket still in the FIN_WAIT_2
state, as discussed here:

https://lore.kernel.org/netdev/20230606064306.9192-1-duanmuquan@baidu.com/ 

Modeled on Eric's idea, introduce __inet_lookup_skb_locked() and
__inet6_lookup_skb_locked() to fix this by doing a locked lookup only in
these rare cases. The locked lookup waits until the state has transitioned
on the other CPU and finds the socket in the correct state rather than the
LISTEN socket, avoiding the invalid RST and its secondary problems so that
communications are unaffected, while keeping RCU performance for all other
lookups.
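
In caller terms the change is small. A condensed sketch of the tcp_v4_rcv()
hunk below (the IPv6 path is analogous; "hashinfo" here is shorthand for
net->ipv4.tcp_death_row.hashinfo):

	sk = __inet_lookup_skb(hashinfo, skb, __tcp_hdrlen(th),
			       th->source, th->dest, sdif, &refcounted);

	/* Rare race: the lockless RCU lookup returned a LISTEN socket for
	 * a segment with the ACK flag set. Redo the lookup under the ehash
	 * bucket lock so we wait for the other CPU to finish the state
	 * transition and find the ESTABLISHED/TIME_WAIT socket instead.
	 */
	if (unlikely(sk && sk->sk_state == TCP_LISTEN && th->ack))
		sk = __inet_lookup_skb_locked(hashinfo, skb, __tcp_hdrlen(th),
					      th->source, th->dest, sdif,
					      &refcounted);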

Both scenarios are confirmed fixed in testing.

Signed-off-by: Jon Maxwell <jmaxwell37@...il.com>
Reviewed-by: Davide Caratti <dcaratti@...hat.com>
---
 include/net/inet6_hashtables.h | 53 ++++++++++++++++++++++++++++++++++
 include/net/inet_hashtables.h  | 52 +++++++++++++++++++++++++++++++++
 net/ipv4/inet_hashtables.c     | 49 +++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c            |  9 ++++++
 net/ipv6/inet6_hashtables.c    | 45 +++++++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c            |  9 ++++++
 6 files changed, 217 insertions(+)

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 533a7337865a4308c073b30b69dae4dcf7e6b264..04868881bf0c86afdd932494d65cbd594fe5defc 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -48,6 +48,14 @@ struct sock *__inet6_lookup_established(struct net *net,
 					const u16 hnum, const int dif,
 					const int sdif);
 
+struct sock *__inet6_lookup_established_locked(struct net *net,
+					       struct inet_hashinfo *hashinfo,
+					       const struct in6_addr *saddr,
+					       const __be16 sport,
+					       const struct in6_addr *daddr,
+					       const u16 hnum, const int dif,
+					       const int sdif);
+
 typedef u32 (inet6_ehashfn_t)(const struct net *net,
 			       const struct in6_addr *laddr, const u16 lport,
 			       const struct in6_addr *faddr, const __be16 fport);
@@ -103,6 +111,27 @@ static inline struct sock *__inet6_lookup(struct net *net,
 				     daddr, hnum, dif, sdif);
 }
 
+static inline struct sock *__inet6_lookup_locked(struct net *net,
+						 struct inet_hashinfo *hashinfo,
+						 struct sk_buff *skb, int doff,
+						 const struct in6_addr *saddr,
+						 const __be16 sport,
+						 const struct in6_addr *daddr,
+						 const u16 hnum,
+						 const int dif, const int sdif,
+						 bool *refcounted)
+{
+	struct sock *sk = __inet6_lookup_established_locked(net, hashinfo, saddr,
+							    sport, daddr, hnum,
+							    dif, sdif);
+	*refcounted = true;
+	if (sk)
+		return sk;
+	*refcounted = false;
+	return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
+				     daddr, hnum, dif, sdif);
+}
+
 static inline
 struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff,
 			      const struct in6_addr *saddr, const __be16 sport,
@@ -167,6 +196,30 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
 			      iif, sdif, refcounted);
 }
 
+static inline struct sock *__inet6_lookup_skb_locked(struct inet_hashinfo *hashinfo,
+						     struct sk_buff *skb, int doff,
+						     const __be16 sport,
+						     const __be16 dport,
+						     int iif, int sdif,
+						     bool *refcounted)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct sock *sk;
+
+	sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport,
+			      refcounted, inet6_ehashfn);
+	if (IS_ERR(sk))
+		return NULL;
+	if (sk)
+		return sk;
+
+	return __inet6_lookup_locked(net, hashinfo, skb,
+				     doff, &ip6h->saddr, sport,
+				     &ip6h->daddr, ntohs(dport),
+				     iif, sdif, refcounted);
+}
+
 struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
 			  struct sk_buff *skb, int doff,
 			  const struct in6_addr *saddr, const __be16 sport,
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 7f1b384587437d06834bde554edd8df983fd64a4..6ea4a38c385629f989cbe1ace5ba0a055d48e511 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -374,6 +374,12 @@ struct sock *__inet_lookup_established(struct net *net,
 				       const __be32 daddr, const u16 hnum,
 				       const int dif, const int sdif);
 
+struct sock *__inet_lookup_established_locked(struct net *net,
+					      struct inet_hashinfo *hashinfo,
+					      const __be32 saddr, const __be16 sport,
+					      const __be32 daddr, const u16 hnum,
+					      const int dif, const int sdif);
+
 typedef u32 (inet_ehashfn_t)(const struct net *net,
 			      const __be32 laddr, const __u16 lport,
 			      const __be32 faddr, const __be16 fport);
@@ -426,6 +432,27 @@ static inline struct sock *__inet_lookup(struct net *net,
 				      sport, daddr, hnum, dif, sdif);
 }
 
+static inline struct sock *__inet_lookup_locked(struct net *net,
+						struct inet_hashinfo *hashinfo,
+						struct sk_buff *skb, int doff,
+						const __be32 saddr, const __be16 sport,
+						const __be32 daddr, const __be16 dport,
+						const int dif, const int sdif,
+						bool *refcounted)
+{
+	u16 hnum = ntohs(dport);
+	struct sock *sk;
+
+	sk = __inet_lookup_established_locked(net, hashinfo, saddr, sport,
+					      daddr, hnum, dif, sdif);
+	*refcounted = true;
+	if (sk)
+		return sk;
+	*refcounted = false;
+	return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
+				      sport, daddr, hnum, dif, sdif);
+}
+
 static inline struct sock *inet_lookup(struct net *net,
 				       struct inet_hashinfo *hashinfo,
 				       struct sk_buff *skb, int doff,
@@ -509,6 +536,31 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 			     refcounted);
 }
 
+static inline struct sock *__inet_lookup_skb_locked(struct inet_hashinfo *hashinfo,
+						    struct sk_buff *skb,
+						    int doff,
+						    const __be16 sport,
+						    const __be16 dport,
+						    const int sdif,
+						    bool *refcounted)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	const struct iphdr *iph = ip_hdr(skb);
+	struct sock *sk;
+
+	sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport,
+			     refcounted, inet_ehashfn);
+	if (IS_ERR(sk))
+		return NULL;
+	if (sk)
+		return sk;
+
+	return __inet_lookup_locked(net, hashinfo, skb,
+				    doff, iph->saddr, sport,
+				    iph->daddr, dport, inet_iif(skb), sdif,
+				    refcounted);
+}
+
 static inline void sk_daddr_set(struct sock *sk, __be32 addr)
 {
 	sk->sk_daddr = addr; /* alias of inet_daddr */
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 93e9193df54461b25c61089bd5db4dd33c32dab6..ef9c0b5462bd0a85ebf350f53b4f3e6f6d394282 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -535,6 +535,55 @@ struct sock *__inet_lookup_established(struct net *net,
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
 
+struct sock *__inet_lookup_established_locked(struct net *net,
+					      struct inet_hashinfo *hashinfo,
+					      const __be32 saddr, const __be16 sport,
+					      const __be32 daddr, const u16 hnum,
+					      const int dif, const int sdif)
+{
+	INET_ADDR_COOKIE(acookie, saddr, daddr);
+	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+	struct sock *sk;
+	const struct hlist_nulls_node *node;
+	/* Optimize here for direct hit, only listening connections can
+	 * have wildcards anyways.
+	 */
+	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+	unsigned int slot = hash & hashinfo->ehash_mask;
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+	spinlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+
+	spin_lock(lock);
+begin:
+	sk_nulls_for_each(sk, node, &head->chain) {
+		if (sk->sk_hash != hash)
+			continue;
+		if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
+			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+				goto out;
+			if (unlikely(!inet_match(net, sk, acookie,
+						 ports, dif, sdif))) {
+				sock_gen_put(sk);
+				goto begin;
+			}
+			goto found;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begin;
+out:
+	sk = NULL;
+found:
+	spin_unlock(lock);
+	return sk;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_established_locked);
+
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
 				    struct sock *sk, __u16 lport,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4bac6e319aca2e45d698889d2293b46632ae00f2..46c78464caf7ccae8a00e8029944d8438feed5dd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2209,6 +2209,15 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
 			       skb, __tcp_hdrlen(th), th->source,
 			       th->dest, sdif, &refcounted);
+
+	/* The first lookup is lockless (RCU) and prone to races:
+	 * under rare conditions it can find a LISTEN socket.
+	 * Avoid an erroneous RST by redoing the lookup under the lock.
+	 */
+	if (unlikely(sk && sk->sk_state == TCP_LISTEN && th->ack))
+		sk = __inet_lookup_skb_locked(net->ipv4.tcp_death_row.hashinfo,
+					      skb, __tcp_hdrlen(th), th->source,
+					      th->dest, sdif, &refcounted);
 	if (!sk)
 		goto no_tcp_socket;
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b0e8d278e8a9b794d0001efdc0f43716f9a34f8f..24138e4efbaf10f8f54bdd32ccd5767ec6962df2 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -90,6 +90,51 @@ struct sock *__inet6_lookup_established(struct net *net,
 }
 EXPORT_SYMBOL(__inet6_lookup_established);
 
+struct sock *__inet6_lookup_established_locked(struct net *net,
+					       struct inet_hashinfo *hashinfo,
+					       const struct in6_addr *saddr,
+					       const __be16 sport,
+					       const struct in6_addr *daddr,
+					       const u16 hnum,
+					       const int dif, const int sdif)
+{
+	struct sock *sk;
+	const struct hlist_nulls_node *node;
+	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+	/* Optimize here for direct hit, only listening connections can
+	 * have wildcards anyways.
+	 */
+	unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+	unsigned int slot = hash & hashinfo->ehash_mask;
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+	spinlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+
+	spin_lock(lock);
+begin:
+	sk_nulls_for_each(sk, node, &head->chain) {
+		if (sk->sk_hash != hash)
+			continue;
+		if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))
+			continue;
+		if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+			goto out;
+
+		if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) {
+			sock_gen_put(sk);
+			goto begin;
+		}
+		goto found;
+	}
+	if (get_nulls_value(node) != slot)
+		goto begin;
+out:
+	sk = NULL;
+found:
+	spin_unlock(lock);
+	return sk;
+}
+EXPORT_SYMBOL(__inet6_lookup_established_locked);
+
 static inline int compute_score(struct sock *sk, struct net *net,
 				const unsigned short hnum,
 				const struct in6_addr *daddr,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d1307d77a6f094e49ea14c525820ba7635d0aa7c..f5986ff21d2c08e54cca5ae9e6aa123598c53a11 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1790,6 +1790,15 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 	sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th),
 				th->source, th->dest, inet6_iif(skb), sdif,
 				&refcounted);
+
+	/* The first lookup is lockless (RCU) and prone to races:
+	 * under rare conditions it can find a LISTEN socket.
+	 * Avoid an erroneous RST by redoing the lookup under the lock.
+	 */
+	if (unlikely(sk && sk->sk_state == TCP_LISTEN && th->ack))
+		sk = __inet6_lookup_skb_locked(net->ipv4.tcp_death_row.hashinfo, skb,
+					       __tcp_hdrlen(th), th->source, th->dest,
+					       inet6_iif(skb), sdif, &refcounted);
 	if (!sk)
 		goto no_tcp_socket;
 
-- 
2.39.3

