Message-Id: <45f34557fff9207019b48f62fc92678c76255f213a5904f82f02bad9a4883ef2@mail.kernel.org>
Date: Tue, 25 Nov 2025 12:26:24 +0000 (UTC)
From: bot+bpf-ci@...nel.org
To: jiayuan.chen@...ux.dev,bpf@...r.kernel.org
Cc: jiayuan.chen@...ux.dev,john.fastabend@...il.com,jakub@...udflare.com,davem@...emloft.net,edumazet@...gle.com,kuba@...nel.org,pabeni@...hat.com,horms@...nel.org,ncardwell@...gle.com,kuniyu@...gle.com,dsahern@...nel.org,ast@...nel.org,daniel@...earbox.net,andrii@...nel.org,martin.lau@...ux.dev,eddyz87@...il.com,song@...nel.org,yonghong.song@...ux.dev,kpsingh@...nel.org,sdf@...ichev.me,haoluo@...gle.com,jolsa@...nel.org,shuah@...nel.org,mhal@...x.co,sgarzare@...hat.com,cong.wang@...edance.com,netdev@...r.kernel.org,linux-kernel@...r.kernel.org,linux-kselftest@...r.kernel.org,ast@...nel.org,andrii@...nel.org,daniel@...earbox.net,martin.lau@...nel.org,eddyz87@...il.com,yonghong.song@...ux.dev,clm@...a.com,ihor.solodrai@...ux.dev
Subject: Re: [PATCH bpf-next v4 2/3] bpf, sockmap: Fix FIONREAD for sockmap
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index 0323a2b6c..2fccae0df 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -97,6 +97,7 @@ struct sk_psock {
> struct sk_buff_head ingress_skb;
> struct list_head ingress_msg;
> spinlock_t ingress_lock;
> + ssize_t ingress_size;
> unsigned long state;
> struct list_head link;
> spinlock_t link_lock;
[ ... ]
> @@ -321,6 +322,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
> kfree_skb(skb);
> }
>
> +static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
> +{
> + return READ_ONCE(psock->ingress_size);
> +}
> +
> +static inline void sk_psock_inc_msg_size(struct sk_psock *psock, ssize_t diff)
> +{
> + WRITE_ONCE(psock->ingress_size, READ_ONCE(psock->ingress_size) + diff);
> +}
> +
Can sk_psock_inc_msg_size() race with itself? The helper implements a
read-modify-write that is not atomic: READ_ONCE() and WRITE_ONCE() only
keep the individual loads and stores tear-free, they do not make the
update indivisible. Looking at the call sites:
- sk_psock_queue_msg() holds ingress_lock when calling this
- sk_psock_dequeue_msg() holds ingress_lock when calling this
- __sk_psock_purge_ingress_msg() holds ingress_lock when calling this
- __sk_msg_recvmsg() does NOT hold ingress_lock when calling this
The __sk_msg_recvmsg() path runs under the socket lock via
tcp_bpf_recvmsg()->lock_sock(), but sk_psock_queue_msg() can be called
from softirq context (network RX -> BPF verdict ->
sk_psock_verdict_apply() -> sk_psock_skb_ingress_self() ->
sk_psock_queue_msg()) without the socket lock.
This means concurrent updates from different lock contexts can race:

  Thread A (softirq, ingress_lock):  reads size=1000
  Thread B (process, socket lock):   reads size=1000, subtracts 50, writes 950
  Thread A:                          adds 100 to stale 1000, writes 1100

Result: size=1100, but it should be 1050 (the -50 update from Thread B
is lost).
The ingress_size counter would drift from the actual queue size, breaking
the FIONREAD accuracy that this patch aims to provide.
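One way to close this race (a sketch only, assuming ingress_size were
re-declared as atomic_long_t in struct sk_psock; this is not what the
patch does) would be to make every update a single atomic RMW, so no
lock ordering between ingress_lock and the socket lock is needed:

static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
{
	/* Sketch: assumes psock->ingress_size is an atomic_long_t. */
	return atomic_long_read(&psock->ingress_size);
}

static inline void sk_psock_inc_msg_size(struct sk_psock *psock, ssize_t diff)
{
	/* atomic_long_add() makes the read-modify-write indivisible, so
	 * the softirq enqueue path and the process-context recvmsg path
	 * can no longer interleave and lose an update.
	 */
	atomic_long_add(diff, &psock->ingress_size);
}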
> static inline bool sk_psock_queue_msg(struct sk_psock *psock,
> struct sk_msg *msg)
> {
> @@ -329,6 +340,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
> spin_lock_bh(&psock->ingress_lock);
> if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
> list_add_tail(&msg->list, &psock->ingress_msg);
> + sk_psock_inc_msg_size(psock, msg->sg.size);
> ret = true;
> } else {
> sk_msg_free(psock->sk, msg);
[ ... ]
> @@ -345,18 +357,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
>
> spin_lock_bh(&psock->ingress_lock);
> msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
> - if (msg)
> + if (msg) {
> list_del(&msg->list);
> + sk_psock_inc_msg_size(psock, -msg->sg.size);
> + }
> spin_unlock_bh(&psock->ingress_lock);
> return msg;
> }
[ ... ]
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index d73e03f77..c959d52a6 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -455,6 +455,7 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg
> atomic_sub(copy, &sk->sk_rmem_alloc);
> }
> msg_rx->sg.size -= copy;
> + sk_psock_inc_msg_size(psock, -copy);
This call to sk_psock_inc_msg_size() happens without holding ingress_lock,
only the socket lock. Can it race with sk_psock_queue_msg(), which holds
ingress_lock but not the socket lock? Since the two paths hold different
locks, nothing serializes their read-modify-write sequences.
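If ingress_size stays a plain ssize_t, the alternative (again only a
sketch, not code from the patch) would be to take ingress_lock around
this update so it serializes with the softirq enqueue path:

	/* Sketch: ingress_lock makes this RMW atomic with respect to
	 * sk_psock_queue_msg(), which already holds the lock.
	 */
	spin_lock_bh(&psock->ingress_lock);
	sk_psock_inc_msg_size(psock, -copy);
	spin_unlock_bh(&psock->ingress_lock);

The cost is extra lock traffic on the recvmsg fast path, which is why
an atomic counter (as sketched above) may be the simpler option.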
>
> if (!sge->length) {
> sk_msg_iter_var_next(i);
> @@ -819,9 +820,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
> list_del(&msg->list);
> if (!msg->skb)
> atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
> + sk_psock_inc_msg_size(psock, -((ssize_t)msg->sg.size));
> sk_msg_free(psock->sk, msg);
> kfree(msg);
> }
> + WARN_ON_ONCE(psock->ingress_size);
> }
[ ... ]
---
AI reviewed your patch. Please fix the bug, or reply by email explaining why it is not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19669112811