[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210927202923.7360-1-jlundberg@llnw.com>
Date:   Mon, 27 Sep 2021 13:29:17 -0700
From:   Johannes Lundberg <jlundberg@...w.com>
To:     linux-kernel@...r.kernel.org
Cc:     Johannes Lundberg <jlundberg@...w.com>,
        "David S. Miller" <davem@...emloft.net>,
        Jakub Kicinski <kuba@...nel.org>,
        Eric Dumazet <edumazet@...gle.com>,
        Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
        David Ahern <dsahern@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>,
        Florian Westphal <fw@...len.de>,
        Alexander Aring <aahringo@...hat.com>,
        Tonghao Zhang <xiangxia.m.yue@...il.com>,
        Yangbo Lu <yangbo.lu@....com>,
        Thomas Gleixner <tglx@...utronix.de>, netdev@...r.kernel.org
Subject: [PATCH] fs: eventpoll: add empty event
The EPOLLEMPTY event will trigger when the TCP write buffer becomes
empty (sk_wmem_queued drops to zero), i.e., when all outgoing data has
been ACKed.
The need for this functionality comes from a business requirement
of measuring with higher precision how much time is spent
transmitting data to a client. For reference, similar functionality
was previously added to FreeBSD as the kqueue event EVFILT_EMPTY.
Signed-off-by: Johannes Lundberg <jlundberg@...w.com>
---
 include/net/sock.h             | 11 +++++++++++
 include/uapi/linux/eventpoll.h |  1 +
 net/core/sock.c                |  5 +++++
 net/core/stream.c              | 14 ++++++++++++++
 net/ipv4/tcp.c                 |  5 +++++
 5 files changed, 36 insertions(+)
diff --git a/include/net/sock.h b/include/net/sock.h
index c005c3c750e8..9047a9e225a9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -516,6 +516,7 @@ struct sock {
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk);
 	void			(*sk_write_space)(struct sock *sk);
+	void			(*sk_empty)(struct sock *sk);
 	void			(*sk_error_report)(struct sock *sk);
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
@@ -965,6 +966,7 @@ static inline void sk_wmem_queued_add(struct sock *sk, int val)
 	WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
 }
 
+void sk_stream_empty(struct sock *sk);
 void sk_stream_write_space(struct sock *sk);
 
 /* OOB backlog add */
@@ -1288,6 +1290,11 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 
 INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
 
+static inline bool sk_stream_is_empty(const struct sock *sk)
+{
+	return READ_ONCE(sk->sk_wmem_queued) == 0; /* pairs with WRITE_ONCE() */
+}
+
 static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
 {
 	if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1559,6 +1566,10 @@ DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sk_wmem_queued_add(sk, -skb->truesize);
+
+	if (sk_stream_is_empty(sk))
+		sk->sk_empty(sk);
+
 	sk_mem_uncharge(sk, skb->truesize);
 	if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
 	    !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..aab9f1f624d0 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -39,6 +39,7 @@
 #define EPOLLWRNORM	(__force __poll_t)0x00000100
 #define EPOLLWRBAND	(__force __poll_t)0x00000200
 #define EPOLLMSG	(__force __poll_t)0x00000400
+#define EPOLLEMPTY	(__force __poll_t)0x00000800
 #define EPOLLRDHUP	(__force __poll_t)0x00002000
 
 /* Set exclusive wakeup mode for the target file descriptor */
diff --git a/net/core/sock.c b/net/core/sock.c
index 512e629f9780..f917791d8149 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3062,6 +3062,10 @@ static void sock_def_write_space(struct sock *sk)
 	rcu_read_unlock();
 }
 
+static void sock_def_empty(struct sock *sk)
+{	/* intentionally empty: default sk_empty hook set by sock_init_data() */
+}
+
 static void sock_def_destruct(struct sock *sk)
 {
 }
@@ -3136,6 +3140,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_state_change	=	sock_def_wakeup;
 	sk->sk_data_ready	=	sock_def_readable;
 	sk->sk_write_space	=	sock_def_write_space;
+	sk->sk_empty		=	sock_def_empty;
 	sk->sk_error_report	=	sock_def_error_report;
 	sk->sk_destruct		=	sock_def_destruct;
 
diff --git a/net/core/stream.c b/net/core/stream.c
index 4f1d4aa5fb38..c7e4135542a2 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -21,6 +21,20 @@
 #include <linux/wait.h>
 #include <net/sock.h>
 
+void sk_stream_empty(struct sock *sk)
+{
+	struct socket_wq *waitq;
+
+	/* Notify pollers only when the queue is empty and a socket exists. */
+	if (!sk->sk_socket || !sk_stream_is_empty(sk))
+		return;
+	rcu_read_lock();
+	waitq = rcu_dereference(sk->sk_wq);
+	if (skwq_has_sleeper(waitq))
+		wake_up_interruptible_poll(&waitq->wait, EPOLLEMPTY);
+	rcu_read_unlock();
+}
+
 /**
  * sk_stream_write_space - stream socket write_space callback.
  * @sk: socket
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..550bae79af06 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -453,6 +453,8 @@ void tcp_init_sock(struct sock *sk)
 	tp->tsoffset = 0;
 	tp->rack.reo_wnd_steps = 1;
 
+	sk->sk_empty = sk_stream_empty;
+
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
@@ -561,6 +563,9 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		    tp->urg_data)
 			target++;
 
+		if (sk_stream_is_empty(sk))
+			mask |= EPOLLEMPTY;
+
 		if (tcp_stream_is_readable(sk, target))
 			mask |= EPOLLIN | EPOLLRDNORM;
 
-- 
2.17.1
Powered by blists - more mailing lists
 
