Message-ID: <20250311205426.h3rvfakthoa6usgr@gmail.com>
Date: Tue, 11 Mar 2025 13:54:26 -0700
From: John Fastabend <john.fastabend@...il.com>
To: Cong Wang <xiyou.wangcong@...il.com>
Cc: netdev@...r.kernel.org, bpf@...r.kernel.org, jakub@...udflare.com,
	zhoufeng.zf@...edance.com, Zijian Zhang <zijianzhang@...edance.com>,
	Amery Hung <amery.hung@...edance.com>,
	Cong Wang <cong.wang@...edance.com>
Subject: Re: [Patch bpf-next v2 4/4] tcp_bpf: improve ingress redirection
 performance with message corking

On 2025-03-06 14:02:05, Cong Wang wrote:
> From: Zijian Zhang <zijianzhang@...edance.com>
> 
> The TCP_BPF ingress redirection path currently lacks the message corking
> mechanism found in standard TCP. This causes the sender to wake up the
> receiver for every message, even when messages are small, resulting in
> reduced throughput compared to regular TCP in certain scenarios.

Agreed this is annoying.

> 
> This change introduces a kernel worker-based intermediate layer to provide
> automatic message corking for TCP_BPF. While this adds a slight latency
> overhead, it significantly improves overall throughput by reducing
> unnecessary wake-ups and reducing the sock lock contention.

Great. Couple questions below.
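
To make sure I'm reading the design right, my mental model of the corking
layer is roughly the following. This is a simplified sketch with made-up
names (cork_queue, cork_flush, cork_enqueue), not the patch code:

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct cork_item {
	struct list_head	list;
	/* payload description would live here */
};

struct cork_queue {
	struct list_head	backlog;	/* pending messages */
	spinlock_t		lock;		/* protects backlog */
	struct delayed_work	work;		/* deferred flush */
};

/* worker: drain everything queued so far and wake the receiver once */
static void cork_flush(struct work_struct *w)
{
	struct cork_queue *q = container_of(to_delayed_work(w),
					    struct cork_queue, work);
	LIST_HEAD(batch);

	spin_lock_bh(&q->lock);
	list_splice_init(&q->backlog, &batch);
	spin_unlock_bh(&q->lock);

	/* move 'batch' onto the peer's ingress queue, then do a single
	 * sk_data_ready()-style wakeup for the whole batch
	 */
}

static void cork_init(struct cork_queue *q)
{
	INIT_LIST_HEAD(&q->backlog);
	spin_lock_init(&q->lock);
	INIT_DELAYED_WORK(&q->work, cork_flush);
}

/* sender side: append and either flush now or cork briefly */
static void cork_enqueue(struct cork_queue *q, struct cork_item *item,
			 bool notify_now)
{
	spin_lock_bh(&q->lock);
	list_add_tail(&item->list, &q->backlog);
	spin_unlock_bh(&q->lock);

	if (notify_now)
		mod_delayed_work(system_wq, &q->work, 0);
	else
		schedule_delayed_work(&q->work, 1);
}

So the sender normally only pays for a list append plus a deferred-work
kick, and the receiver gets one wakeup per batch instead of one per
message, unless one of the notify-now conditions below fires.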

> 
> Reviewed-by: Amery Hung <amery.hung@...edance.com>
> Co-developed-by: Cong Wang <cong.wang@...edance.com>
> Signed-off-by: Cong Wang <cong.wang@...edance.com>
> Signed-off-by: Zijian Zhang <zijianzhang@...edance.com>
> ---
>  include/linux/skmsg.h |  19 ++++
>  net/core/skmsg.c      | 139 ++++++++++++++++++++++++++++-
>  net/ipv4/tcp_bpf.c    | 197 ++++++++++++++++++++++++++++++++++++++++--
>  3 files changed, 347 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index 7620f170c4b1..2531428168ad 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -15,6 +15,8 @@
>  
>  #define MAX_MSG_FRAGS			MAX_SKB_FRAGS
>  #define NR_MSG_FRAG_IDS			(MAX_MSG_FRAGS + 1)
> +/* GSO size for TCP BPF backlog processing */
> +#define TCP_BPF_GSO_SIZE		65536
>  
>  enum __sk_action {
>  	__SK_DROP = 0,
> @@ -85,8 +87,10 @@ struct sk_psock {
>  	struct sock			*sk_redir;
>  	u32				apply_bytes;
>  	u32				cork_bytes;
> +	u32				backlog_since_notify;
>  	u8				eval;
>  	u8 				redir_ingress : 1; /* undefined if sk_redir is null */
> +	u8				backlog_work_delayed : 1;
>  	struct sk_msg			*cork;
>  	struct sk_psock_progs		progs;
>  #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
> @@ -97,6 +101,9 @@ struct sk_psock {
>  	struct sk_buff_head		ingress_skb;
>  	struct list_head		ingress_msg;
>  	spinlock_t			ingress_lock;
> +	struct list_head		backlog_msg;
> +	/* spin_lock for backlog_msg and backlog_since_notify */
> +	spinlock_t			backlog_msg_lock;
>  	unsigned long			state;
>  	struct list_head		link;
>  	spinlock_t			link_lock;
> @@ -117,11 +124,13 @@ struct sk_psock {
>  	struct mutex			work_mutex;
>  	struct sk_psock_work_state	work_state;
>  	struct delayed_work		work;
> +	struct delayed_work		backlog_work;
>  	struct sock			*sk_pair;
>  	struct rcu_work			rwork;
>  };

[...]

> +static int tcp_bpf_ingress_backlog(struct sock *sk, struct sock *sk_redir,
> +				   struct sk_msg *msg, u32 apply_bytes)
> +{
> +	bool ingress_msg_empty = false;
> +	bool apply = apply_bytes;
> +	struct sk_psock *psock;
> +	struct sk_msg *tmp;
> +	u32 tot_size = 0;
> +	int ret = 0;
> +	u8 nonagle;
> +
> +	psock = sk_psock_get(sk_redir);
> +	if (unlikely(!psock))
> +		return -EPIPE;
> +
> +	spin_lock(&psock->backlog_msg_lock);
> +	/* If possible, coalesce the curr sk_msg to the last sk_msg from the
> +	 * psock->backlog_msg.
> +	 */
> +	if (!list_empty(&psock->backlog_msg)) {
> +		struct sk_msg *last;
> +
> +		last = list_last_entry(&psock->backlog_msg, struct sk_msg, list);
> +		if (last->sk == sk) {
> +			int i = tcp_bpf_coalesce_msg(last, msg, &apply_bytes,
> +						     &tot_size);
> +
> +			if (i == msg->sg.end || (apply && !apply_bytes))
> +				goto out_unlock;
> +		}
> +	}
> +
> +	/* Otherwise, allocate a new sk_msg and transfer the data from the
> +	 * passed in msg to it.
> +	 */
> +	tmp = sk_msg_alloc(GFP_ATOMIC);
> +	if (!tmp) {
> +		ret = -ENOMEM;
> +		spin_unlock(&psock->backlog_msg_lock);
> +		goto error;
> +	}
> +
> +	tmp->sk = sk;
> +	sock_hold(tmp->sk);
> +	tmp->sg.start = msg->sg.start;
> +	tcp_bpf_xfer_msg(tmp, msg, &apply_bytes, &tot_size);
> +
> +	ingress_msg_empty = list_empty(&psock->ingress_msg);
> +	list_add_tail(&tmp->list, &psock->backlog_msg);
> +
> +out_unlock:
> +	spin_unlock(&psock->backlog_msg_lock);
> +	sk_wmem_queued_add(sk, tot_size);
> +
> +	/* At this point, the data has been handled well. If one of the
> +	 * following conditions is met, we can notify the peer socket in
> +	 * the context of this system call immediately.
> +	 * 1. If the write buffer has been used up;
> +	 * 2. Or, the message size is larger than TCP_BPF_GSO_SIZE;
> +	 * 3. Or, the ingress queue was empty;
> +	 * 4. Or, the tcp socket is set to no_delay.
> +	 * Otherwise, kick off the backlog work so that we can have some
> +	 * time to wait for any incoming messages before sending a
> +	 * notification to the peer socket.
> +	 */

I think this could also be used to get bpf_msg_cork_bytes() working
directly in the receive path. That would also let us avoid using
strparser on receive. The strparser case has noticeable enough
overhead for us that we don't use it.
Not that we need to do it all in one patch set.
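
Side note for anyone skimming: bpf_msg_cork_bytes() is the existing
SK_MSG-side helper, so the semantics we would want mirrored on the
receive path are roughly what this toy verdict program asks for
(illustrative only, not from this series):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_msg")
int cork_until_4k(struct sk_msg_md *msg)
{
	/* don't run the verdict until 4096 bytes have been queued */
	bpf_msg_cork_bytes(msg, 4096);
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";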

> +	nonagle = tcp_sk(sk)->nonagle;
> +	if (!sk_stream_memory_free(sk) ||
> +	    tot_size >= TCP_BPF_GSO_SIZE || ingress_msg_empty ||
> +	    (!(nonagle & TCP_NAGLE_CORK) && (nonagle & TCP_NAGLE_OFF))) {
> +		release_sock(sk);
> +		psock->backlog_work_delayed = false;
> +		sk_psock_backlog_msg(psock);
> +		lock_sock(sk);
> +	} else {
> +		sk_psock_run_backlog_work(psock, false);
> +	}
> +
> +error:
> +	sk_psock_put(sk_redir, psock);
> +	return ret;
> +}
> +
>  static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
>  				struct sk_msg *msg, int *copied, int flags)
>  {
> @@ -442,18 +619,24 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
>  			cork = true;
>  			psock->cork = NULL;
>  		}
> -		release_sock(sk);
>  
> -		origsize = msg->sg.size;
> -		ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress,
> -					    msg, tosend, flags);

The only sticky bit here blocking folding this entire tcp_bpf_sendmsg_redir
logic out is the TLS user, right?

> -		sent = origsize - msg->sg.size;
> +		if (redir_ingress) {
> +			ret = tcp_bpf_ingress_backlog(sk, sk_redir, msg, tosend);
> +		} else {
> +			release_sock(sk);
> +
> +			origsize = msg->sg.size;
> +			ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress,
> +						    msg, tosend, flags);

Now sendmsg redir is really only used for egress here, so we can skip
handling ingress in it. And can the entire existing sk_psock_backlog work
queue go as well, since it's handled by tcp_bpf_ingress_backlog?

> +			sent = origsize - msg->sg.size;
> +
> +			lock_sock(sk);
> +			sk_mem_uncharge(sk, sent);
> +		}

I like the direction, but are there any blockers to getting this out of TLS
as well? I'm happy to do it if needed; I would prefer not to try to
support both styles at the same time.
