Message-ID: <20250624170933.419907-1-kuni1840@gmail.com>
Date: Tue, 24 Jun 2025 10:08:41 -0700
From: Kuniyuki Iwashima <kuni1840@...il.com>
To: kuba@...nel.org
Cc: davem@...emloft.net,
edumazet@...gle.com,
horms@...nel.org,
jbaron@...mai.com,
kuni1840@...il.com,
kuniyu@...gle.com,
netdev@...r.kernel.org,
pabeni@...hat.com
Subject: Re: [PATCH net-next v2 3/3] netlink: Fix wraparound of sk->sk_rmem_alloc
Date: Tue, 24 Jun 2025 07:11:57 -0700
From: Jakub Kicinski <kuba@...nel.org>
> On Tue, 24 Jun 2025 09:55:15 +0200 Paolo Abeni wrote:
> > > To be clear -- are you saying we should fix this differently?
> > > Or perhaps that the problem doesn't exist? The change doesn't
> > > seem very intrusive..
> >
> > AFAICS the race is possible even with netlink as netlink_unicast() runs
> > without the socket lock, too.
> >
> > The point is that for UDP the scenario with multiple threads enqueuing a
> > packet into the same socket is a critical path; optimizing for
> > performance and allowing some memory-accounting inaccuracy makes sense.
> >
> > For netlink sockets, that scenario looks like a pathological one, and I
> > think we should prefer accuracy over optimization.
>
> Could you ELI5 what you mean? Are you suggesting a lock around every
> sk_rmem write for netlink sockets?
> If we think this is an attack vector the attacker can simply use a UDP
> socket instead. Or do you think it'd lead to simpler code?
I was wondering if atomic_add_return() is too expensive for netlink;
if not, we could use it as below. I'm also not sure we want to keep
the allow-at-least-one-skb rule for netlink, which IIRC historically
comes from the first condition in __sock_queue_rcv_skb() for UDP.
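For reference, here is a stand-alone user-space sketch of the two
accounting schemes (just an illustration: stdatomic stands in for the
kernel's atomic_t, and the helper names, RCVBUF value and skb sizes are
made up). The check-then-add variant is what netlink does today, so
concurrent writers that all pass the check can overshoot rcvbuf; the
add-then-undo variant charges first with atomic_add_return() and backs
out on failure, keeping the counter exact while the rmem == truesize
test still admits one skb on an empty queue.

/* User-space model only, not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RCVBUF 4096                    /* stand-in for sk->sk_rcvbuf */

static atomic_int rmem_alloc;          /* models sk->sk_rmem_alloc */

/* Check-then-add: two writers can both see a small counter, both pass
 * the check, and then both add, overshooting RCVBUF.
 */
static bool rcv_queue_racy(int truesize)
{
	if (atomic_load(&rmem_alloc) > RCVBUF)
		return false;
	atomic_fetch_add(&rmem_alloc, truesize);
	return true;
}

/* Add-then-check: the new total is obtained atomically and the charge
 * is backed out if the skb does not fit; "rmem == truesize" keeps the
 * allow-at-least-one-skb behaviour.
 */
static bool rcv_queue_exact(int truesize)
{
	int rmem = atomic_fetch_add(&rmem_alloc, truesize) + truesize;

	if (rmem == truesize || rmem <= RCVBUF)
		return true;

	atomic_fetch_sub(&rmem_alloc, truesize);
	return false;
}

int main(void)
{
	bool ok;

	atomic_store(&rmem_alloc, 0);
	ok = rcv_queue_racy(8192);      /* oversized skb, empty queue */
	printf("racy:  queued=%d rmem=%d\n", ok, atomic_load(&rmem_alloc));

	atomic_store(&rmem_alloc, 0);
	ok = rcv_queue_exact(8192);
	printf("exact: queued=%d rmem=%d\n", ok, atomic_load(&rmem_alloc));
	return 0;
}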
untested:
---8<---
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e8972a857e51..e1a9ae7ff521 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -387,7 +387,6 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
WARN_ON(skb->sk != NULL);
skb->sk = sk;
skb->destructor = netlink_skb_destructor;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
sk_mem_charge(sk, skb->truesize);
}
@@ -1212,41 +1211,45 @@ struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast)
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
long *timeo, struct sock *ssk)
{
+ unsigned long rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
+ DECLARE_WAITQUEUE(wait, current);
struct netlink_sock *nlk;
nlk = nlk_sk(sk);
- if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
- DECLARE_WAITQUEUE(wait, current);
- if (!*timeo) {
- if (!ssk || netlink_is_kernel(ssk))
- netlink_overrun(sk);
- sock_put(sk);
- kfree_skb(skb);
- return -EAGAIN;
- }
-
- __set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&nlk->wait, &wait);
+ if (rmem == skb->truesize ||
+ (rmem < sk->sk_rcvbuf && !test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
+ netlink_skb_set_owner_r(skb, sk);
+ return 0;
+ }
- if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
- !sock_flag(sk, SOCK_DEAD))
- *timeo = schedule_timeout(*timeo);
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nlk->wait, &wait);
+ if (!*timeo) {
+ if (!ssk || netlink_is_kernel(ssk))
+ netlink_overrun(sk);
sock_put(sk);
+ kfree_skb(skb);
+ return -EAGAIN;
+ }
- if (signal_pending(current)) {
- kfree_skb(skb);
- return sock_intr_errno(*timeo);
- }
- return 1;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&nlk->wait, &wait);
+
+ if ((atomic_read(&sk->sk_rmem_alloc) + skb->truesize > sk->sk_rcvbuf ||
+ test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
+ !sock_flag(sk, SOCK_DEAD))
+ *timeo = schedule_timeout(*timeo);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&nlk->wait, &wait);
+ sock_put(sk);
+
+ if (signal_pending(current)) {
+ kfree_skb(skb);
+ return sock_intr_errno(*timeo);
}
- netlink_skb_set_owner_r(skb, sk);
- return 0;
+ return 1;
}
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
@@ -1307,6 +1310,7 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
ret = -ECONNREFUSED;
if (nlk->netlink_rcv != NULL) {
ret = skb->len;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
netlink_skb_set_owner_r(skb, sk);
NETLINK_CB(skb).sk = ssk;
netlink_deliver_tap_kernel(sk, ssk, skb);
@@ -1382,14 +1386,18 @@ EXPORT_SYMBOL_GPL(netlink_strict_get_check);
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
+ unsigned long rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
+ unsigned int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
struct netlink_sock *nlk = nlk_sk(sk);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
- !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
+ if (rmem == skb->truesize ||
+ (rmem <= rcvbuf && !test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
netlink_skb_set_owner_r(skb, sk);
__netlink_sendskb(sk, skb);
- return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
+ return rmem > (rcvbuf >> 1);
}
+
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
return -1;
}
@@ -2249,6 +2257,7 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
struct module *module;
int err = -ENOBUFS;
int alloc_min_size;
+ unsigned int rmem;
int alloc_size;
if (!lock_taken)
@@ -2258,9 +2267,6 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
goto errout_skb;
}
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
- goto errout_skb;
-
/* NLMSG_GOODSIZE is small to avoid high order allocations being
* required, but it makes sense to _attempt_ a 32KiB allocation
* to reduce number of system calls on dump operations, if user
@@ -2283,6 +2289,12 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
if (!skb)
goto errout_skb;
+ rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
+ if (rmem != skb->truesize && rmem >= sk->sk_rcvbuf) {
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ goto errout_skb;
+ }
+
/* Trim skb to allocated size. User is expected to provide buffer as
* large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at
* netlink_recvmsg())). dump will pack as many smaller messages as
---8<---
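For completeness, the wraparound mentioned in the subject can be shown
with a tiny user-space model as well (again illustrative only: a 32-bit
atomic_t is assumed, and 212992 is just a common rmem default). Once
enough charges slip past the check, the signed counter wraps negative
and the "rmem_alloc > rcvbuf" test stops firing altogether:

#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
	/* Counter just below INT_MAX, as if many writers already slipped
	 * past the racy check; C11 atomic adds wrap silently on overflow.
	 */
	atomic_int rmem_alloc = INT_MAX - 1000;
	int rcvbuf = 212992;            /* assumed rcvbuf value */

	atomic_fetch_add(&rmem_alloc, 4096);    /* one more truesize charge */

	int rmem = atomic_load(&rmem_alloc);
	printf("rmem_alloc=%d, \"rmem > rcvbuf\" is %s\n",
	       rmem, rmem > rcvbuf ? "true" : "false");
	return 0;
}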