Message-ID: <20250624170933.419907-1-kuni1840@gmail.com>
Date: Tue, 24 Jun 2025 10:08:41 -0700
From: Kuniyuki Iwashima <kuni1840@...il.com>
To: kuba@...nel.org
Cc: davem@...emloft.net,
edumazet@...gle.com,
horms@...nel.org,
jbaron@...mai.com,
kuni1840@...il.com,
kuniyu@...gle.com,
netdev@...r.kernel.org,
pabeni@...hat.com
Subject: Re: [PATCH net-next v2 3/3] netlink: Fix wraparound of sk->sk_rmem_alloc
Date: Tue, 24 Jun 2025 07:11:57 -0700
From: Jakub Kicinski <kuba@...nel.org>
> On Tue, 24 Jun 2025 09:55:15 +0200 Paolo Abeni wrote:
> > > To be clear -- are you saying we should fix this differently?
> > > Or perhaps that the problem doesn't exist? The change doesn't
> > > seem very intrusive..
> >
> > AFAICS the race is possible even with netlink as netlink_unicast() runs
> > without the socket lock, too.
> >
> > The point is that for UDP the scenario with multiple threads enqueuing a
> > packet into the same socket is a critical path; optimizing for
> > performance and allowing some memory-accounting inaccuracy makes sense.
> >
> > For netlink sockets, that scenario looks like a pathological one, and I
> > think we should prefer accuracy over optimization.
>
> Could you ELI5 what you mean? Are you suggesting a lock around every
> sk_rmem write for netlink sockets?
> If we think this is an attack vector the attacker can simply use a UDP
> socket instead. Or do you think it'd lead to simpler code?
I was wondering if atomic_add_return() is too expensive for netlink;
if not, we could use it as below. I'm also not sure we want to keep
the allow-at-least-one-skb rule for netlink, which IIRC historically
comes from the first condition in __sock_queue_rcv_skb() for UDP.
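For reference, here is a stand-alone user-space sketch of the two
accounting schemes (just an illustration: stdatomic stands in for the
kernel's atomic_t, and the helper names, RCVBUF value and skb sizes are
made up). The check-then-add variant is what netlink does today, so
concurrent writers that all pass the check can overshoot rcvbuf; the
add-then-undo variant charges first with atomic_add_return() and backs
out on failure, keeping the counter exact while the rmem == truesize
test still admits one skb on an empty queue.

/* User-space model only, not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RCVBUF 4096                    /* stand-in for sk->sk_rcvbuf */

static atomic_int rmem_alloc;          /* models sk->sk_rmem_alloc */

/* Check-then-add: two writers can both see a small counter, both pass
 * the check, and then both add, overshooting RCVBUF.
 */
static bool rcv_queue_racy(int truesize)
{
	if (atomic_load(&rmem_alloc) > RCVBUF)
		return false;
	atomic_fetch_add(&rmem_alloc, truesize);
	return true;
}

/* Add-then-check: the new total is obtained atomically and the charge
 * is backed out if the skb does not fit; "rmem == truesize" keeps the
 * allow-at-least-one-skb behaviour.
 */
static bool rcv_queue_exact(int truesize)
{
	int rmem = atomic_fetch_add(&rmem_alloc, truesize) + truesize;

	if (rmem == truesize || rmem <= RCVBUF)
		return true;

	atomic_fetch_sub(&rmem_alloc, truesize);
	return false;
}

int main(void)
{
	bool ok;

	atomic_store(&rmem_alloc, 0);
	ok = rcv_queue_racy(8192);      /* oversized skb, empty queue */
	printf("racy:  queued=%d rmem=%d\n", ok, atomic_load(&rmem_alloc));

	atomic_store(&rmem_alloc, 0);
	ok = rcv_queue_exact(8192);
	printf("exact: queued=%d rmem=%d\n", ok, atomic_load(&rmem_alloc));
	return 0;
}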
untested:
---8<---
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e8972a857e51..e1a9ae7ff521 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -387,7 +387,6 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
WARN_ON(skb->sk != NULL);
skb->sk = sk;
skb->destructor = netlink_skb_destructor;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
sk_mem_charge(sk, skb->truesize);
}
@@ -1212,41 +1211,45 @@ struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast)
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
long *timeo, struct sock *ssk)
{
+ unsigned long rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
+ DECLARE_WAITQUEUE(wait, current);
struct netlink_sock *nlk;
nlk = nlk_sk(sk);
- if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
- DECLARE_WAITQUEUE(wait, current);
- if (!*timeo) {
- if (!ssk || netlink_is_kernel(ssk))
- netlink_overrun(sk);
- sock_put(sk);
- kfree_skb(skb);
- return -EAGAIN;
- }
-
- __set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&nlk->wait, &wait);
+ if (rmem == skb->truesize ||
+ (rmem < sk->sk_rcvbuf && !test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
+ netlink_skb_set_owner_r(skb, sk);
+ return 0;
+ }
- if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
- !sock_flag(sk, SOCK_DEAD))
- *timeo = schedule_timeout(*timeo);
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nlk->wait, &wait);
+ if (!*timeo) {
+ if (!ssk || netlink_is_kernel(ssk))
+ netlink_overrun(sk);
sock_put(sk);
+ kfree_skb(skb);
+ return -EAGAIN;
+ }
- if (signal_pending(current)) {
- kfree_skb(skb);
- return sock_intr_errno(*timeo);
- }
- return 1;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&nlk->wait, &wait);
+
+ if ((atomic_read(&sk->sk_rmem_alloc) + skb->truesize > sk->sk_rcvbuf ||
+ test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
+ !sock_flag(sk, SOCK_DEAD))
+ *timeo = schedule_timeout(*timeo);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&nlk->wait, &wait);
+ sock_put(sk);
+
+ if (signal_pending(current)) {
+ kfree_skb(skb);
+ return sock_intr_errno(*timeo);
}
- netlink_skb_set_owner_r(skb, sk);
- return 0;
+ return 1;
}
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
@@ -1307,6 +1310,7 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
ret = -ECONNREFUSED;
if (nlk->netlink_rcv != NULL) {
ret = skb->len;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
netlink_skb_set_owner_r(skb, sk);
NETLINK_CB(skb).sk = ssk;
netlink_deliver_tap_kernel(sk, ssk, skb);
@@ -1382,14 +1386,18 @@ EXPORT_SYMBOL_GPL(netlink_strict_get_check);
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
+ unsigned long rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
+ unsigned int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
struct netlink_sock *nlk = nlk_sk(sk);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
- !test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
+ if (rmem == skb->truesize ||
+ (rmem <= rcvbuf && !test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
netlink_skb_set_owner_r(skb, sk);
__netlink_sendskb(sk, skb);
- return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
+ return rmem > (rcvbuf >> 1);
}
+
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
return -1;
}
@@ -2249,6 +2257,7 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
struct module *module;
int err = -ENOBUFS;
int alloc_min_size;
+ unsigned int rmem;
int alloc_size;
if (!lock_taken)
@@ -2258,9 +2267,6 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
goto errout_skb;
}
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
- goto errout_skb;
-
/* NLMSG_GOODSIZE is small to avoid high order allocations being
* required, but it makes sense to _attempt_ a 32KiB allocation
* to reduce number of system calls on dump operations, if user
@@ -2283,6 +2289,12 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
if (!skb)
goto errout_skb;
+ rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc);
+ if (rmem != skb->truesize && rmem >= sk->sk_rcvbuf) {
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ goto errout_skb;
+ }
+
/* Trim skb to allocated size. User is expected to provide buffer as
* large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at
* netlink_recvmsg())). dump will pack as many smaller messages as
---8<---
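For completeness, the wraparound mentioned in the subject can be shown
with a tiny user-space model as well (again illustrative only: a 32-bit
atomic_t is assumed, and 212992 is just a common rmem default). Once
enough charges slip past the check, the signed counter wraps negative
and the "rmem_alloc > rcvbuf" test stops firing altogether:

#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
	/* Counter just below INT_MAX, as if many writers already slipped
	 * past the racy check; C11 atomic adds wrap silently on overflow.
	 */
	atomic_int rmem_alloc = INT_MAX - 1000;
	int rcvbuf = 212992;            /* assumed rcvbuf value */

	atomic_fetch_add(&rmem_alloc, 4096);    /* one more truesize charge */

	int rmem = atomic_load(&rmem_alloc);
	printf("rmem_alloc=%d, \"rmem > rcvbuf\" is %s\n",
	       rmem, rmem > rcvbuf ? "true" : "false");
	return 0;
}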