[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7facb17b78f6cbbdb38f140872e02a345a0023f7.1749466540.git.jgh@exim.org>
Date: Mon, 9 Jun 2025 17:05:17 +0100
From: Jeremy Harris <jgh@...m.org>
To: netdev@...r.kernel.org
Cc: linux-api@...r.kernel.org,
edumazet@...gle.com,
ncardwell@...gle.com,
Jeremy Harris <jgh@...m.org>
Subject: [PATCH net-next v3 1/6] tcp: support writing to a socket in listening state
In the tcp sendmsg handler, permit a write in LISTEN state if
a MSG_PRELOAD flag is used. Copy from iovec to a linear sk_buff
for placement on the socket write queue.
Signed-off-by: Jeremy Harris <jgh@...m.org>
---
include/linux/socket.h | 1 +
net/ipv4/tcp.c | 112 ++++++++++++++++++
.../perf/trace/beauty/include/linux/socket.h | 1 +
tools/perf/trace/beauty/msg_flags.c | 3 +
4 files changed, 117 insertions(+)
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 3b262487ec06..b41f4cd4dc97 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -330,6 +330,7 @@ struct ucred {
#define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */
#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */
#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */
+#define MSG_PRELOAD 0x10000000 /* Preload tx data while listening */
#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file
descriptor received through
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f64f8276a73c..c0a787c1649d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1057,6 +1057,115 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
return err;
}
+/* Cut-down version of tcp_sendmsg_locked(), for writing on a listen socket
+ * via the MSG_PRELOAD sendmsg flag.  Copies data from the iovec into at
+ * most one linear skb queued on the socket write queue, and returns the
+ * number of bytes copied (possibly fewer than requested) or a negative
+ * errno on failure with nothing copied.
+ */
+static int tcp_sendmsg_preload(struct sock *sk, struct msghdr *msg)
+{
+ struct sk_buff *skb;
+ int flags, err, copied = 0;
+ int size_goal;
+ int process_backlog = 0;
+ long timeo;
+
+ /* Preloading is only defined for a listening socket. */
+ if (sk->sk_state != TCP_LISTEN)
+ return -EINVAL;
+
+ flags = msg->msg_flags;
+
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+ /* Ok commence sending. */
+restart:
+ /* Use an arbitrary "mss" value: there is no connection yet, so no
+ * negotiated MSS to size the skb by.
+ */
+ size_goal = 1000;
+
+ err = -EPIPE;
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto do_error;
+
+ while (msg_data_left(msg)) {
+ ssize_t copy = 0;
+
+ /* copy stays 0 when the write queue is empty, forcing the
+ * allocation branch below (skb may be NULL here).
+ */
+ skb = tcp_write_queue_tail(sk);
+ if (skb)
+ copy = size_goal - skb->len;
+
+ trace_tcp_sendmsg_locked(sk, msg, skb, size_goal);
+
+ if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
+ bool first_skb = !skb;
+
+ /* Limit to only one skb on the sk write queue */
+
+ if (!first_skb)
+ goto out_nopush;
+
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_space;
+
+ /* Periodically drain the socket backlog to bound
+ * latency; restart as queue state may have changed.
+ */
+ if (unlikely(process_backlog >= 16)) {
+ process_backlog = 0;
+ if (sk_flush_backlog(sk))
+ goto restart;
+ }
+
+ skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
+ first_skb);
+ if (!skb)
+ goto wait_for_space;
+
+ process_backlog++;
+
+#ifdef CONFIG_SKB_DECRYPTED
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
+ tcp_skb_entail(sk, skb);
+ copy = size_goal;
+ }
+
+ /* Try to append data to the end of skb. */
+ if (copy > msg_data_left(msg))
+ copy = msg_data_left(msg);
+
+ /* Linear skb only: further bound the copy by its tailroom. */
+ copy = min_t(int, copy, skb_tailroom(skb));
+ err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
+ if (err)
+ goto do_error;
+
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ copied += copy;
+ /* Single append per call: return the (possibly short) count. */
+ goto out_nopush;
+
+wait_for_space:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ tcp_remove_empty_skb(sk);
+
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
+ goto do_error;
+ }
+
+out_nopush:
+ return copied;
+
+do_error:
+ tcp_remove_empty_skb(sk);
+
+ /* Prefer reporting partial progress over an error. */
+ if (copied)
+ goto out_nopush;
+
+ err = sk_stream_error(sk, flags, err);
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
+ sk->sk_write_space(sk);
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ }
+
+ return err;
+}
+
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct net_devmem_dmabuf_binding *binding = NULL;
@@ -1129,6 +1238,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
goto out_err;
}
+ if (unlikely(flags & MSG_PRELOAD))
+ return tcp_sendmsg_preload(sk, msg);
+
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h
index c3322eb3d686..e9ea498169f3 100644
--- a/tools/perf/trace/beauty/include/linux/socket.h
+++ b/tools/perf/trace/beauty/include/linux/socket.h
@@ -330,6 +330,7 @@ struct ucred {
#define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */
#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */
#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */
+#define MSG_PRELOAD 0x10000000 /* Preload tx data while listening */
#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file
descriptor received through
diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c
index 2da581ff0c80..27e40da9b02d 100644
--- a/tools/perf/trace/beauty/msg_flags.c
+++ b/tools/perf/trace/beauty/msg_flags.c
@@ -20,6 +20,9 @@
#ifndef MSG_SPLICE_PAGES
#define MSG_SPLICE_PAGES 0x8000000
#endif
+#ifndef MSG_PRELOAD
+#define MSG_PRELOAD 0x10000000
+#endif
#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif
--
2.49.0
Powered by blists - more mailing lists