[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <f8ffd9bb3fd24376adf963a6e7137f10b95db761b084a29ec3efa0eb7f305296@mail.kernel.org>
Date: Sun, 28 Dec 2025 00:43:50 +0000 (UTC)
From: bot+bpf-ci@...nel.org
To: xiyou.wangcong@...il.com,netdev@...r.kernel.org
Cc: hemanthmalla@...il.com,john.fastabend@...il.com,jakub@...udflare.com,bpf@...r.kernel.org,zijianzhang@...edance.com,amery.hung@...edance.com,cong.wang@...edance.com,ast@...nel.org,andrii@...nel.org,daniel@...earbox.net,martin.lau@...nel.org,eddyz87@...il.com,yonghong.song@...ux.dev,clm@...a.com,ihor.solodrai@...ux.dev
Subject: Re: [Patch bpf-next v5 4/4] tcp_bpf: improve ingress redirection performance with message corking
> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index 45ff311ccf49..2dd4ade35b11 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
[ ... ]
> +static void __sk_psock_purge_backlog_msg(struct sk_psock *psock)
> +{
> + struct sk_msg *msg, *tmp;
> +
> + spin_lock(&psock->backlog_msg_lock);
> + list_for_each_entry_safe(msg, tmp, &psock->backlog_msg, list) {
> + struct sock *sk_from = msg->sk;
> + bool slow;
> +
> + list_del(&msg->list);
> + slow = lock_sock_fast(sk_from);
^^^^^^^^^^^^^^^^^^^^^^
Can lock_sock_fast() be called while holding the backlog_msg_lock spinlock?
lock_sock_fast() contains might_sleep() and can call schedule() via
__lock_sock() when the socket is contended. This would result in
"scheduling while atomic", since we are holding the spinlock.
The call chain is sk_psock_destroy() -> __sk_psock_purge_backlog_msg(),
which runs in workqueue context. Sleeping would normally be fine there,
but not while holding a spinlock.
> + sk_wmem_queued_add(sk_from, -msg->sg.size);
> + sock_put(sk_from);
> + sk_msg_free(sk_from, msg);
> + unlock_sock_fast(sk_from, slow);
> + kfree_sk_msg(msg);
> + }
> + spin_unlock(&psock->backlog_msg_lock);
> +}
[ ... ]
> diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
> index ed049a912a23..d0e03e7df8e3 100644
> --- a/net/ipv4/tcp_bpf.c
> +++ b/net/ipv4/tcp_bpf.c
[ ... ]
> +static int tcp_bpf_ingress_backlog(struct sock *sk, struct sock *sk_redir,
> + struct sk_msg *msg, u32 apply_bytes)
> +{
> + bool ingress_msg_empty = false;
> + bool apply = apply_bytes;
> + struct sk_psock *psock;
> + struct sk_msg *tmp;
> + u32 tot_size = 0;
> + int ret = 0;
> + u8 nonagle;
> +
> + psock = sk_psock_get(sk_redir);
> + if (unlikely(!psock))
> + return -EPIPE;
> +
> + spin_lock(&psock->backlog_msg_lock);
> + /* If possible, coalesce the curr sk_msg to the last sk_msg from the
> + * psock->backlog_msg.
> + */
> + if (!list_empty(&psock->backlog_msg)) {
> + struct sk_msg *last;
> +
> + last = list_last_entry(&psock->backlog_msg, struct sk_msg, list);
> + if (last->sk == sk) {
> + int i = tcp_bpf_coalesce_msg(last, msg, &apply_bytes,
> + &tot_size);
> +
> + if (i == msg->sg.end || (apply && !apply_bytes))
> + goto out_unlock;
> + }
> + }
> +
> + /* Otherwise, allocate a new sk_msg and transfer the data from the
> + * passed in msg to it.
> + */
> + tmp = sk_msg_alloc(GFP_ATOMIC);
> + if (!tmp) {
> + ret = -ENOMEM;
> + spin_unlock(&psock->backlog_msg_lock);
> + goto error;
^^^^^^^^^^
When tcp_bpf_coalesce_msg() partially succeeds (adds data to last->sg and
increments tot_size) but sk_msg_alloc() then fails, the error path jumps
to the 'error' label, which skips the sk_wmem_queued_add(sk, tot_size) call
at out_unlock.
However, the coalesced data is now in the backlog (in 'last'), and when
backlog_xfer_to_local() later processes it, it will call
sk_wmem_queued_add(msg->sk, -msg->sg.size) for the full size, including
the coalesced portion. Since sk_wmem_queued was never incremented for
that coalesced portion, this will underflow sk_wmem_queued.
> + }
> +
> + tmp->sk = sk;
> + sock_hold(tmp->sk);
> + tmp->sg.start = msg->sg.start;
> + tcp_bpf_xfer_msg(tmp, msg, &apply_bytes, &tot_size);
> +
> + ingress_msg_empty = list_empty(&psock->ingress_msg);
> + list_add_tail(&tmp->list, &psock->backlog_msg);
> +
> +out_unlock:
> + spin_unlock(&psock->backlog_msg_lock);
> + sk_wmem_queued_add(sk, tot_size);
[ ... ]
> +error:
> + sk_psock_put(sk_redir, psock);
> + return ret;
> +}
---
AI reviewed your patch. Please fix the bug, or reply by email explaining why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/20546413613
Powered by blists - more mailing lists