lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <tencent_D113D23D99C9FC229F0FAADCA8CF823A2609@qq.com>
Date:   Mon, 30 Jan 2023 01:47:21 +0800
From:   wenyang.linux@...mail.com
To:     Alexander Viro <viro@...iv.linux.org.uk>
Cc:     Wen Yang <wenyang.linux@...mail.com>,
        Christoph Hellwig <hch@....de>, Dylan Yudaken <dylany@...com>,
        Jens Axboe <axboe@...nel.dk>,
        David Woodhouse <dwmw@...zon.co.uk>,
        Paolo Bonzini <pbonzini@...hat.com>,
        linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: [PATCH 2/2] eventfd: support delayed wakeup for non-semaphore eventfd to reduce cpu utilization

From: Wen Yang <wenyang.linux@...mail.com>

For the NON SEMAPHORE eventfd, if its counter has a nonzero value,
then a read(2) returns 8 bytes containing that value, and the counter's
value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
N event_writes vs ONE event_read is possible.

However, the current implementation wakes up the read thread immediately
in eventfd_write so that the cpu utilization increases unnecessarily.

By adding a configurable delay after eventfd_write, these unnecessary
wakeup operations are avoided, thereby reducing cpu utilization.

We used the following test code:
https://github.com/w-simon/tests/blob/master/src/test.c
./test_zmq  > /dev/null

The cpu usage is as follows:
12:14:22     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
12:14:24     all   55.46    0.00    4.78    0.00    0.00    0.96    0.00    0.00    0.00   38.80
12:14:26     all   56.29    0.00    4.70    0.00    0.00    1.24    0.00    0.00    0.00   37.76
12:14:28     all   54.97    0.00    5.25    0.00    0.00    0.97    0.00    0.00    0.00   38.81
12:14:30     all   56.02    0.00    5.26    0.00    0.00    1.24    0.00    0.00    0.00   37.48
12:14:32     all   55.31    0.00    5.03    0.00    0.00    1.40    0.00    0.00    0.00   38.27
12:14:34     all   55.46    0.00    5.26    0.00    0.00    1.24    0.00    0.00    0.00   38.04

Then adjust the new control parameter, as follows:
echo 5 > /proc/sys/fs/eventfd_write_wake_delay_ms

The cpu usage was observed to decrease by more than 30%, as follows:
12:14:36     all   28.17    0.00    0.93    0.00    0.00    0.00    0.00    0.00    0.00   70.90
12:14:38     all   24.00    0.00    0.80    0.00    0.00    0.13    0.00    0.00    0.00   75.07
12:14:40     all   23.57    0.00    0.53    0.00    0.00    0.13    0.00    0.00    0.00   75.77
12:14:42     all   23.59    0.00    0.40    0.00    0.00    0.00    0.00    0.00    0.00   76.01
12:14:44     all   23.69    0.00    0.27    0.00    0.00    0.00    0.00    0.00    0.00   76.04
12:14:46     all   23.20    0.00    0.67    0.00    0.00    0.13    0.00    0.00    0.00   76.00
12:14:48     all   24.87    0.00    0.66    0.00    0.00    0.00    0.00    0.00    0.00   74.47
12:14:50     all   24.27    0.00    0.66    0.00    0.00    0.00    0.00    0.00    0.00   75.07

Signed-off-by: Wen Yang <wenyang.linux@...mail.com>
Cc: Al Viro <viro@...iv.linux.org.uk>
Cc: Christoph Hellwig <hch@....de>
Cc: Dylan Yudaken <dylany@...com>
Cc: Jens Axboe <axboe@...nel.dk>
Cc: David Woodhouse <dwmw@...zon.co.uk>
Cc: Paolo Bonzini <pbonzini@...hat.com>
Cc: linux-fsdevel@...r.kernel.org
Cc: linux-kernel@...r.kernel.org
---
 fs/eventfd.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index c5bda3df4a28..e45436737f9d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -41,6 +41,9 @@ struct eventfd_ctx {
 	__u64 count;
 	unsigned int flags;
 	int id;
+#ifdef CONFIG_SYSCTL
+	struct delayed_work dwork;
+#endif
 };
 
 __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
@@ -95,6 +98,9 @@ static void eventfd_free_ctx(struct eventfd_ctx *ctx)
 {
 	if (ctx->id >= 0)
 		ida_simple_remove(&eventfd_ida, ctx->id);
+#ifdef CONFIG_SYSCTL
+	flush_delayed_work(&ctx->dwork);
+#endif
 	kfree(ctx);
 }
 
@@ -256,6 +262,28 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
 	return sizeof(ucnt);
 }
 
+#ifdef CONFIG_SYSCTL
+
+static unsigned long sysctl_eventfd_write_wake_delay_ms;
+
+static void eventfd_delayed_workfn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct eventfd_ctx *ctx = container_of(dwork, struct eventfd_ctx, dwork);
+
+	spin_lock_irq(&ctx->wqh.lock);
+	current->in_eventfd = 1;
+	if (ctx->count) {
+		/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+	}
+	current->in_eventfd = 0;
+	spin_unlock_irq(&ctx->wqh.lock);
+}
+
+#endif
+
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
 			     loff_t *ppos)
 {
@@ -282,8 +310,26 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	if (likely(res > 0)) {
 		ctx->count += ucnt;
 		current->in_eventfd = 1;
-		if (waitqueue_active(&ctx->wqh))
+
+		/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+		if (waitqueue_active(&ctx->wqh)) {
+#ifdef CONFIG_SYSCTL
+			if (ctx->flags & EFD_SEMAPHORE)
+				wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+			else {
+				unsigned long delay = sysctl_eventfd_write_wake_delay_ms;
+
+				if (delay) {
+					if (!delayed_work_pending(&ctx->dwork))
+						queue_delayed_work(system_unbound_wq,
+								&ctx->dwork, delay);
+				} else
+					wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+			}
+#else
 			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+#endif
+		}
 		current->in_eventfd = 0;
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
@@ -406,6 +452,9 @@ static int do_eventfd(unsigned int count, int flags)
 	ctx->count = count;
 	ctx->flags = flags;
 	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+#ifdef CONFIG_SYSCTL
+	INIT_DELAYED_WORK(&ctx->dwork, eventfd_delayed_workfn);
+#endif
 
 	flags &= EFD_SHARED_FCNTL_FLAGS;
 	flags |= O_RDWR;
@@ -438,3 +487,31 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
 	return do_eventfd(count, 0);
 }
 
+#ifdef CONFIG_SYSCTL
+
+static unsigned long min_wake_delay;
+
+static unsigned long max_wake_delay = HZ / 10;
+
+static struct ctl_table fs_eventfd_ctl[] = {
+	{
+		.procname      = "eventfd_write_wake_delay_ms",
+		.data          = &sysctl_eventfd_write_wake_delay_ms,
+		.maxlen        = sizeof(unsigned long),
+		.mode          = 0644,
+		.proc_handler  = proc_doulongvec_ms_jiffies_minmax,
+		.extra1        = (void *)&min_wake_delay,
+		.extra2        = (void *)&max_wake_delay,
+	},
+	{ }
+};
+
+static int __init init_fs_exec_sysctls(void)
+{
+	register_sysctl_init("fs", fs_eventfd_ctl);
+	return 0;
+}
+
+fs_initcall(init_fs_exec_sysctls);
+
+#endif /* CONFIG_SYSCTL */
-- 
2.37.2

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ