[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230607181920.2294972-8-dhowells@redhat.com>
Date: Wed, 7 Jun 2023 19:19:13 +0100
From: David Howells <dhowells@...hat.com>
To: netdev@...r.kernel.org,
Linus Torvalds <torvalds@...ux-foundation.org>
Cc: David Howells <dhowells@...hat.com>,
Chuck Lever <chuck.lever@...cle.com>,
Boris Pismenny <borisp@...dia.com>,
John Fastabend <john.fastabend@...il.com>,
Jakub Kicinski <kuba@...nel.org>,
"David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Paolo Abeni <pabeni@...hat.com>,
Willem de Bruijn <willemdebruijn.kernel@...il.com>,
David Ahern <dsahern@...nel.org>,
Matthew Wilcox <willy@...radead.org>,
Jens Axboe <axboe@...nel.dk>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, Kuniyuki Iwashima <kuniyu@...zon.com>
Subject: [PATCH net-next v6 07/14] ipv4, ipv6: Use splice_eof() to flush
Allow splice to undo the effects of MSG_MORE after prematurely ending a
splice/sendfile due to getting an EOF condition (->splice_read() returned
0) after splice had called sendmsg() with MSG_MORE set when the user didn't
set MSG_MORE.
For UDP, a pending packet will not be emitted if the socket is closed
before it is flushed; with this change, it will be flushed by ->splice_eof().
For TCP, it's not clear that MSG_MORE is actually effective.
Suggested-by: Linus Torvalds <torvalds@...ux-foundation.org>
Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/
Signed-off-by: David Howells <dhowells@...hat.com>
cc: Kuniyuki Iwashima <kuniyu@...zon.com>
cc: Eric Dumazet <edumazet@...gle.com>
cc: Willem de Bruijn <willemdebruijn.kernel@...il.com>
cc: David Ahern <dsahern@...nel.org>
cc: "David S. Miller" <davem@...emloft.net>
cc: Jakub Kicinski <kuba@...nel.org>
cc: Paolo Abeni <pabeni@...hat.com>
cc: Jens Axboe <axboe@...nel.dk>
cc: Matthew Wilcox <willy@...radead.org>
cc: netdev@...r.kernel.org
---
Notes:
ver #6)
- In inet_splice_eof(), use prot after deref of sk->sk_prot.
- In udpv6_splice_eof(), use udp_v6_push_pending_frames().
- In udpv6_splice_eof(), don't check for AF_INET.
include/net/inet_common.h | 1 +
include/net/tcp.h | 1 +
include/net/udp.h | 1 +
net/ipv4/af_inet.c | 18 ++++++++++++++++++
net/ipv4/tcp.c | 16 ++++++++++++++++
net/ipv4/tcp_ipv4.c | 1 +
net/ipv4/udp.c | 16 ++++++++++++++++
net/ipv6/af_inet6.c | 1 +
net/ipv6/tcp_ipv6.c | 1 +
net/ipv6/udp.c | 15 +++++++++++++++
10 files changed, 71 insertions(+)
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 77f4b0ef5b92..a75333342c4e 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -35,6 +35,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock,
struct sock *newsk);
int inet_send_prepare(struct sock *sk);
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
+void inet_splice_eof(struct socket *sock);
ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
size_t size, int flags);
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 68990a8f556a..49611af31bb7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -327,6 +327,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
size_t size, struct ubuf_info *uarg);
+void tcp_splice_eof(struct socket *sock);
int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
diff --git a/include/net/udp.h b/include/net/udp.h
index 5cad44318d71..4ed0b47c5582 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -278,6 +278,7 @@ int udp_get_port(struct sock *sk, unsigned short snum,
int udp_err(struct sk_buff *, u32);
int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+void udp_splice_eof(struct socket *sock);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b5735b3551cf..fd233c4195ac 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -831,6 +831,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
}
EXPORT_SYMBOL(inet_sendmsg);
+/*
+ * Called when a splice/sendfile into this socket ends early on EOF: relay
+ * the event to the transport protocol's ->splice_eof() handler, if the
+ * protocol provides one.  Nothing is done if inet_send_prepare() fails.
+ */
+void inet_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	const struct proto *prot;
+
+	if (likely(!inet_send_prepare(sk))) {
+		/* sk->sk_prot may be swapped by IPV6_ADDRFORM; read it once. */
+		prot = READ_ONCE(sk->sk_prot);
+		if (prot->splice_eof)
+			prot->splice_eof(sock);
+	}
+}
+EXPORT_SYMBOL_GPL(inet_splice_eof);
+
ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
size_t size, int flags)
{
@@ -1050,6 +1065,7 @@ const struct proto_ops inet_stream_ops = {
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
+ .splice_eof = inet_splice_eof,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
@@ -1084,6 +1100,7 @@ const struct proto_ops inet_dgram_ops = {
.read_skb = udp_read_skb,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
+ .splice_eof = inet_splice_eof,
.sendpage = inet_sendpage,
.set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
@@ -1115,6 +1132,7 @@ static const struct proto_ops inet_sockraw_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
+ .splice_eof = inet_splice_eof,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.compat_ioctl = inet_compat_ioctl,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53b7751b68e1..09f03221a6f1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1371,6 +1371,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
}
EXPORT_SYMBOL(tcp_sendmsg);
+/*
+ * Push out data left on the TCP write queue after a splice hit EOF.
+ * If tcp_write_queue_tail() finds nothing queued, return without taking
+ * the socket lock.
+ */
+void tcp_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	int size_goal, mss;
+
+	if (tcp_write_queue_tail(sk)) {
+		lock_sock(sk);
+		mss = tcp_send_mss(sk, &size_goal, 0);
+		/* flags == 0: no MSG_MORE, so the push is not held back by it */
+		tcp_push(sk, 0, mss, tcp_sk(sk)->nonagle, size_goal);
+		release_sock(sk);
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_splice_eof);
+
/*
* Handle reading urgent data. BSD has very simple semantics for
* this, no blocking and very strange errors 8)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 53e9ce2f05bb..84a5d557dc1a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3116,6 +3116,7 @@ struct proto tcp_prot = {
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
+ .splice_eof = tcp_splice_eof,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fd3dae081f3a..df5e407286d7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1324,6 +1324,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
EXPORT_SYMBOL(udp_sendmsg);
+/*
+ * Emit the pending datagram when a splice into the socket ends on EOF,
+ * unless up->corkflag is set.
+ */
+void udp_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct udp_sock *up = udp_sk(sk);
+
+	/*
+	 * Lockless fast path: this check runs before lock_sock(), so
+	 * up->pending can change underneath us.  Mark the racy load with
+	 * READ_ONCE(), as is already done for up->corkflag.
+	 */
+	if (!READ_ONCE(up->pending) || READ_ONCE(up->corkflag))
+		return;
+
+	lock_sock(sk);
+	/* Recheck under the socket lock before actually pushing. */
+	if (up->pending && !READ_ONCE(up->corkflag))
+		udp_push_pending_frames(sk);
+	release_sock(sk);
+}
+EXPORT_SYMBOL_GPL(udp_splice_eof);
+
int udp_sendpage(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
@@ -2918,6 +2933,7 @@ struct proto udp_prot = {
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
+ .splice_eof = udp_splice_eof,
.sendpage = udp_sendpage,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2bbf13216a3d..564942bee067 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -695,6 +695,7 @@ const struct proto_ops inet6_stream_ops = {
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
+ .splice_eof = inet_splice_eof,
.sendpage = inet_sendpage,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d657713d1c71..c17c8ff94b79 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2150,6 +2150,7 @@ struct proto tcpv6_prot = {
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
+ .splice_eof = tcp_splice_eof,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v6_do_rcv,
.release_cb = tcp_release_cb,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e5a337e6b970..317b01c9bc39 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1653,6 +1653,20 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
EXPORT_SYMBOL(udpv6_sendmsg);
+/*
+ * Emit the pending IPv6 datagram when a splice into the socket ends on
+ * EOF, unless up->corkflag is set.
+ */
+static void udpv6_splice_eof(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct udp_sock *up = udp_sk(sk);
+
+	/*
+	 * Lockless fast path: this check runs before lock_sock(), so
+	 * up->pending can change underneath us.  Mark the racy load with
+	 * READ_ONCE(), as is already done for up->corkflag.
+	 */
+	if (!READ_ONCE(up->pending) || READ_ONCE(up->corkflag))
+		return;
+
+	lock_sock(sk);
+	/* Recheck under the socket lock before actually pushing. */
+	if (up->pending && !READ_ONCE(up->corkflag))
+		udp_v6_push_pending_frames(sk);
+	release_sock(sk);
+}
+
void udpv6_destroy_sock(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
@@ -1764,6 +1778,7 @@ struct proto udpv6_prot = {
.getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg,
+ .splice_eof = udpv6_splice_eof,
.release_cb = ip6_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
Powered by blists - more mailing lists