Message-Id: <20250609161244.3591029-1-jbaron@akamai.com>
Date: Mon, 9 Jun 2025 12:12:44 -0400
From: Jason Baron <jbaron@...mai.com>
To: netdev@...r.kernel.org
Cc: davem@...emloft.net, edumazet@...gle.com, kuba@...nel.org,
pabeni@...hat.com, horms@...nel.org, kuniyu@...zon.com
Subject: [PATCH net-next] netlink: Fix wraparounds of sk->sk_rmem_alloc

For netlink sockets, the allocated receive memory (sk->sk_rmem_alloc)
is compared against the rcvbuf limit using signed values. This means
that if rcvbuf is near INT_MAX, sk->sk_rmem_alloc may wrap around to a
negative value in the comparison with rcvbuf, which yields incorrect
results.
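
To illustrate with a userspace sketch (not the kernel code, just the
same arithmetic): once the charged memory wraps past INT_MAX, reading
it back as a signed int means the "over rcvbuf?" check never fires:

	#include <limits.h>
	#include <stdio.h>

	int main(void)
	{
		int rcvbuf = INT_MAX;		/* limit set via SO_RCVBUFFORCE */
		unsigned int charged = INT_MAX;	/* what atomic_add() has accumulated */

		/* charge one more skb->truesize worth; wraps past INT_MAX */
		charged += 65536;

		/* the old check reads the counter back as a signed value */
		if ((int)charged > rcvbuf)
			puts("over limit, skb would be dropped");
		else
			puts("appears under limit, skb is charged anyway");
		return 0;
	}
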
This can be reproduced with the program from SOCK_DIAG(7) after some
slight modifications: first set sk->sk_rcvbuf to INT_MAX using
SO_RCVBUFFORCE, then call "send_query()" in a loop while never calling
"receive_responses()". In this case, sk->sk_rmem_alloc continuously
wraps around, so more memory is allocated than the sk->sk_rcvbuf limit
allows. This eventually fills all of memory, leading to an
out-of-memory condition with skbs filling up the slab.
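
Roughly, the modification looks like this (a fragment meant for the
SOCK_DIAG(7) example program; "fd", "send_query()" and
"receive_responses()" are the netlink socket and helpers from that
example):

	int max = INT_MAX;

	/* needs CAP_NET_ADMIN; sk->sk_rcvbuf ends up near INT_MAX */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &max, sizeof(max)) == -1)
		perror("setsockopt");

	for (;;)
		send_query(fd);	/* receive_responses(fd) is intentionally never called */
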
Let's fix this in a similar manner to:
5a465a0da13e ("udp: Fix multiple wraparounds of sk->sk_rmem_alloc.")

As noted in that fix, if there are multiple threads writing to a
netlink socket it's possible to slightly exceed the rcvbuf value, but
doing it this way avoids an expensive atomic_add_return() in the
common case. I've confirmed that with the fix applied, the modified
program from SOCK_DIAG(7) can no longer fill memory and the
sk->sk_rcvbuf limit is enforced.
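
The resulting pattern at each enqueue site is (simplified from the
hunks below; "drop_or_wait" is just a placeholder for the existing
error/congestion handling at each call site):

	rmem = atomic_read(&sk->sk_rmem_alloc);
	rcvbuf = READ_ONCE(sk->sk_rcvbuf);
	if (rmem + skb->truesize > rcvbuf)
		goto drop_or_wait;		/* treat as over the limit */
	/* netlink_skb_set_owner_r() then charges skb->truesize with a
	 * plain atomic_add(), not atomic_add_return().
	 */
	netlink_skb_set_owner_r(skb, sk);
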
Signed-off-by: Jason Baron <jbaron@...mai.com>
---
 net/netlink/af_netlink.c | 47 ++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e8972a857e51..607e5d72de39 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1213,11 +1213,15 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 		      long *timeo, struct sock *ssk)
 {
 	struct netlink_sock *nlk;
+	unsigned int rmem, rcvbuf, size;
 
 	nlk = nlk_sk(sk);
+	rmem = atomic_read(&sk->sk_rmem_alloc);
+	rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+	size = skb->truesize;
 
-	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	     test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
+	if (((rmem + size) > rcvbuf) ||
+	    test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -1230,7 +1234,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 		__set_current_state(TASK_INTERRUPTIBLE);
 		add_wait_queue(&nlk->wait, &wait);
 
-		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+		rmem = atomic_read(&sk->sk_rmem_alloc);
+		rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+		if ((((rmem + size) > rcvbuf) ||
 		     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
 		    !sock_flag(sk, SOCK_DEAD))
 			*timeo = schedule_timeout(*timeo);
@@ -1383,12 +1389,18 @@ EXPORT_SYMBOL_GPL(netlink_strict_get_check);
 static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
+	unsigned int rmem, rcvbuf;
 
-	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+	rmem = atomic_read(&sk->sk_rmem_alloc);
+	rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+
+	if (((rmem + skb->truesize) <= rcvbuf) &&
 	    !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
 		netlink_skb_set_owner_r(skb, sk);
 		__netlink_sendskb(sk, skb);
-		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
+		rmem = atomic_read(&sk->sk_rmem_alloc);
+		rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+		return rmem > (rcvbuf >> 1);
 	}
 	return -1;
 }
@@ -1896,6 +1908,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	size_t copied, max_recvmsg_len;
 	struct sk_buff *skb, *data_skb;
 	int err, ret;
+	unsigned int rmem, rcvbuf;
 
 	if (flags & MSG_OOB)
 		return -EOPNOTSUPP;
@@ -1960,12 +1973,15 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	skb_free_datagram(sk, skb);
 
-	if (READ_ONCE(nlk->cb_running) &&
-	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
-		ret = netlink_dump(sk, false);
-		if (ret) {
-			WRITE_ONCE(sk->sk_err, -ret);
-			sk_error_report(sk);
+	if (READ_ONCE(nlk->cb_running)) {
+		rmem = atomic_read(&sk->sk_rmem_alloc);
+		rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+		if (rmem <= (rcvbuf >> 1)) {
+			ret = netlink_dump(sk, false);
+			if (ret) {
+				WRITE_ONCE(sk->sk_err, -ret);
+				sk_error_report(sk);
+			}
 		}
 	}
 
@@ -2250,6 +2266,7 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
 	int err = -ENOBUFS;
 	int alloc_min_size;
 	int alloc_size;
+	unsigned int rmem, rcvbuf;
 
 	if (!lock_taken)
 		mutex_lock(&nlk->nl_cb_mutex);
@@ -2258,9 +2275,6 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
 		goto errout_skb;
 	}
 
-	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
-		goto errout_skb;
-
 	/* NLMSG_GOODSIZE is small to avoid high order allocations being
 	 * required, but it makes sense to _attempt_ a 32KiB allocation
 	 * to reduce number of system calls on dump operations, if user
@@ -2283,6 +2297,11 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
 	if (!skb)
 		goto errout_skb;
 
+	rmem = atomic_read(&sk->sk_rmem_alloc);
+	rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+	if ((rmem + skb->truesize) > rcvbuf)
+		goto errout_skb;
+
 	/* Trim skb to allocated size. User is expected to provide buffer as
 	 * large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at
 	 * netlink_recvmsg())). dump will pack as many smaller messages as
--
2.25.1