Message-ID: <20240828181011.1591242-1-namangulati@google.com>
Date: Wed, 28 Aug 2024 18:10:11 +0000
From: Naman Gulati <namangulati@...gle.com>
To: Alexander Viro <viro@...iv.linux.org.uk>, Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>, 
	"David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, netdev@...r.kernel.org
Cc: Stanislav Fomichev <sdf@...ichev.me>, linux-kernel@...r.kernel.org, skhawaja@...gle.com, 
	Joe Damato <jdamato@...tly.com>, Willem de Bruijn <willemdebruijn.kernel@...il.com>, 
	Naman Gulati <namangulati@...gle.com>
Subject: [PATCH] Add provision to busyloop for events in ep_poll.

NAPI busy polling in ep_busy_loop() loops on napi_poll and checks for
new epoll events after every NAPI poll. Checking just for epoll events
in a tight loop in kernel context delivers latency gains to
applications that are not interested in NAPI busy polling with epoll.

This patch adds an option to busy loop just for new events inside
ep_busy_loop(), configured through the same EPIOCSPARAMS ioctl that
controls epoll NAPI busy polling.
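
As an illustration, the sketch below enables the new mode from
userspace. It is a minimal sketch, not part of this patch, and assumes
a kernel carrying this series: the epoll_params mirror and the ioctl
define are only needed when building against unpatched uapi headers,
since the event_poll_only field does not exist in mainline.

#include <stdint.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>

/* Mirror of struct epoll_params as modified by this patch. */
struct epoll_params {
	uint32_t busy_poll_usecs;
	uint16_t busy_poll_budget;
	uint8_t  prefer_busy_poll;
	uint8_t  event_poll_only:1;	/* new in this patch */
	uint8_t  __pad:7;		/* must be zero */
};

#ifndef EPIOCSPARAMS
#define EPOLL_IOC_TYPE 0x8A
#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
#endif

int main(void)
{
	struct epoll_params params = {
		.busy_poll_usecs = 64,	/* busy loop up to 64 us per wait */
		.event_poll_only = 1,	/* loop for events, skip napi_busy_loop() */
	};
	int epfd = epoll_create1(0);

	if (epfd < 0 || ioctl(epfd, EPIOCSPARAMS, &params) < 0) {
		perror("epoll busy poll setup");
		return 1;
	}
	/* epoll_wait() on this fd now busy loops for events before sleeping. */
	return 0;
}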

A comparison with neper tcp_rr shows that busy looping for events in
epoll_wait boosts throughput by ~3-7% and reduces median latency by
~10%.

To demonstrate the latency and throughput improvements, neper tcp_rr
was run in three configurations:
    1. (baseline) no busy looping
    2. (epoll busylooping) epoll busy looping enabled on all epoll fds
    3. (userspace busylooping) looping on epoll_wait in userspace with
    timeout=0 (see the sketch after this list)
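
For reference, the userspace busylooping configuration amounts to a
loop of the following shape (a sketch, not the exact neper code):

#include <sys/epoll.h>

/* Spin on epoll_wait() with timeout=0 in userspace instead of
 * blocking (or busy looping) in the kernel. */
int wait_busy(int epfd, struct epoll_event *events, int maxevents)
{
	int n;

	do {
		n = epoll_wait(epfd, events, maxevents, 0); /* returns immediately */
	} while (n == 0);

	return n;	/* >0: number of ready events; <0: error, errno set */
}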

Stats for two machines with 100Gbps NICs running tcp_rr with 5 threads
and varying flows:

Type                Flows  Throughput  Latency (μs)
                           (B/s)       P50    P90    P99    P99.9  P99.99
baseline            15     272145      57.2   71.9   91.4   100.6  111.6
baseline            30     464952      66.8   78.8   98.1   113.4  122.4
baseline            60     695920      80.9   118.5  143.4  161.8  174.6
epoll busyloop      15     301751      44.7   70.6   84.3   95.4   106.5
epoll busyloop      30     508392      58.9   76.9   96.2   109.3  118.5
epoll busyloop      60     745731      77.4   106.2  127.5  143.1  155.9
userspace busyloop  15     279202      55.4   73.1   85.2   98.3   109.6
userspace busyloop  30     472440      63.7   78.2   96.5   112.2  120.1
userspace busyloop  60     720779      77.9   113.5  134.9  152.6  165.7

Per the above data, epoll busylooping outperforms both the baseline
and userspace busylooping in throughput and latency. As the density of
flows per thread increases, the median latencies of the three
configurations converge; however, epoll busylooping remains better at
containing tail latency at high flow counts.

Signed-off-by: Naman Gulati <namangulati@...gle.com>
---
 fs/eventpoll.c                 | 45 +++++++++++++++++++++++++++++++++++++--------
 include/uapi/linux/eventpoll.h |  3 ++-
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f53ca4f7fcedd..6cba79261817a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -232,7 +232,10 @@ struct eventpoll {
 	u32 busy_poll_usecs;
 	/* busy poll packet budget */
 	u16 busy_poll_budget;
-	bool prefer_busy_poll;
+	/* prefer to busypoll in napi poll */
+	bool napi_prefer_busy_poll;
+	/* avoid napi poll when busy looping and poll only for events */
+	bool event_poll_only;
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -430,6 +433,24 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
 	return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
 }
 
+/**
+ * ep_event_busy_loop - busy loop until events are available or the busy
+ * poll times out.
+ *
+ * @ep: Pointer to the eventpoll context.
+ *
+ * Return: true if events available, false otherwise.
+ */
+static bool ep_event_busy_loop(struct eventpoll *ep)
+{
+	unsigned long start_time = busy_loop_current_time();
+
+	while (!ep_busy_loop_end(ep, start_time))
+		cond_resched();
+
+	return ep_events_available(ep);
+}
+
 /*
  * Busy poll if globally on and supporting sockets found && no events,
  * busy loop will return if need_resched or ep_events_available.
@@ -440,23 +461,29 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 {
 	unsigned int napi_id = READ_ONCE(ep->napi_id);
 	u16 budget = READ_ONCE(ep->busy_poll_budget);
-	bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
+	bool event_poll_only = READ_ONCE(ep->event_poll_only);
 
 	if (!budget)
 		budget = BUSY_POLL_BUDGET;
 
-	if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
+	if (!ep_busy_loop_on(ep))
+		return false;
+
+	if (event_poll_only) {
+		return ep_event_busy_loop(ep);
+	} else if (napi_id >= MIN_NAPI_ID) {
+		bool napi_prefer_busy_poll = READ_ONCE(ep->napi_prefer_busy_poll);
+
 		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
-			       ep, prefer_busy_poll, budget);
+			       ep, napi_prefer_busy_poll, budget);
 		if (ep_events_available(ep))
 			return true;
 		/*
 		 * Busy poll timed out.  Drop NAPI ID for now, we can add
 		 * it back in when we have moved a socket with a valid NAPI
 		 * ID onto the ready list.
 		 */
 		ep->napi_id = 0;
-		return false;
 	}
 	return false;
 }
@@ -523,13 +550,15 @@ static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
 
 		WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
 		WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
-		WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
+		WRITE_ONCE(ep->napi_prefer_busy_poll, epoll_params.prefer_busy_poll);
+		WRITE_ONCE(ep->event_poll_only, epoll_params.event_poll_only);
 		return 0;
 	case EPIOCGPARAMS:
 		memset(&epoll_params, 0, sizeof(epoll_params));
 		epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
 		epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
-		epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
+		epoll_params.prefer_busy_poll = READ_ONCE(ep->napi_prefer_busy_poll);
+		epoll_params.event_poll_only = READ_ONCE(ep->event_poll_only);
 		if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
 			return -EFAULT;
 		return 0;
@@ -2203,7 +2232,7 @@ static int do_epoll_create(int flags)
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	ep->busy_poll_usecs = 0;
 	ep->busy_poll_budget = 0;
-	ep->prefer_busy_poll = false;
+	ep->napi_prefer_busy_poll = false;
 #endif
 	ep->file = file;
 	fd_install(fd, file);
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 4f4b948ef3811..3bc0f4eed976c 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -89,9 +89,10 @@ struct epoll_params {
 	__u32 busy_poll_usecs;
 	__u16 busy_poll_budget;
 	__u8 prefer_busy_poll;
+	__u8 event_poll_only:1;
 
 	/* pad the struct to a multiple of 64bits */
-	__u8 __pad;
+	__u8 __pad:7;
 };
 
 #define EPOLL_IOC_TYPE 0x8A
-- 
2.46.0.295.g3b9ea8a38a-goog

