lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <20250310051832.5658-1-wen.yang@linux.dev>
Date: Mon, 10 Mar 2025 13:18:32 +0800
From: Wen Yang <wen.yang@...ux.dev>
To: Christian Brauner <brauner@...nel.org>,
	Jan Kara <jack@...e.cz>,
	Alexander Viro <viro@...iv.linux.org.uk>
Cc: Wen Yang <wen.yang@...ux.dev>,
	Jens Axboe <axboe@...nel.dk>,
	Dylan Yudaken <dylany@...com>,
	David Woodhouse <dwmw@...zon.co.uk>,
	Paolo Bonzini <pbonzini@...hat.com>,
	Dave Young <dyoung@...hat.com>,
	kernel test robot <lkp@...el.com>,
	linux-fsdevel@...r.kernel.org,
	linux-kernel@...r.kernel.org
Subject: [PATCH v4] eventfd: introduce configurable maximum value for eventfd

For a non-semaphore eventfd, a write(2) call adds the 8-byte integer
value supplied in its buffer to the counter, while a read(2) returns the
8-byte counter value and resets the counter to 0. Therefore, the
accumulated value of multiple writes can be retrieved by a single read.

Currently, the reading thread is woken up immediately after the writing
thread writes to the eventfd, and the maximum value of the counter is
ULLONG_MAX. Therefore, in a ping-pong scenario with frequent reads and
writes, the CPU will be saturated.

By introducing the configurable maximum counter, we could achieve flow
control and reduce unnecessary CPU overhead.

We may use the following test code:
	#define _GNU_SOURCE
	#include <assert.h>
	#include <errno.h>
	#include <getopt.h>
	#include <pthread.h>
	#include <poll.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <string.h>
	#include <sys/eventfd.h>
	#include <sys/prctl.h>
	#include <sys/ioctl.h>

	#define EFD_IOC_SET_MAXIMUM     _IOW('E', 0, __u64)
	#define EFD_IOC_GET_MAXIMUM     _IOR('E', 0, __u64)

	/* Arguments handed to publish() and subscribe(). */
	struct param {
		int fd;		/* eventfd descriptor to write to / read from */
		int cpu;	/* CPU id passed to CPU_SET() for sched_setaffinity() */
	};

	/*
	 * Writer side: name the task "publish", restrict it to the CPU given in
	 * the struct param pointed to by @data, then call eventfd_write() with
	 * the value 1 in an endless loop. Never returns.
	 */
	static void publish(void *data)
	{
		struct param *args = data;
		cpu_set_t mask;

		prctl(PR_SET_NAME, "publish");

		CPU_ZERO(&mask);
		CPU_SET(args->cpu, &mask);
		sched_setaffinity(0, sizeof(mask), &mask);

		for (;;)
			eventfd_write(args->fd, 1ULL);
	}

	/*
	 * Reader side: name the task "subscribe", restrict it to the CPU given in
	 * the struct param pointed to by @data, then loop forever waiting in
	 * poll() for POLLIN on the eventfd and reading 8 bytes from it whenever
	 * POLLIN is reported. Never returns.
	 */
	static void subscribe(void *data)
	{
		struct param *args = data;
		unsigned long long counter = 0;
		struct pollfd pfd;
		cpu_set_t mask;

		prctl(PR_SET_NAME, "subscribe");

		CPU_ZERO(&mask);
		CPU_SET(args->cpu, &mask);
		sched_setaffinity(0, sizeof(mask), &mask);

		pfd.fd = args->fd;
		pfd.events = POLLIN;

		for (;;) {
			poll(&pfd, 1, -1);
			if (!(pfd.revents & POLLIN))
				continue;
			read(args->fd, &counter, sizeof(counter));
		}
	}

	/* Print the command-line synopsis to stdout. */
	static void usage(void)
	{
		fputs("Usage: \n", stdout);
		fputs("\t", stdout);
		fputs("<-p cpuid> <-s cpuid> <-m maximum> \n", stdout);
	}

	/*
	 * Test driver: parse -p <cpuid>, -s <cpuid> and -m <maximum>, create an
	 * eventfd, set its maximum counter value with EFD_IOC_SET_MAXIMUM, then
	 * fork: the child runs subscribe() and the parent runs publish(). Both
	 * loop forever, so main() only returns on a usage or setup error.
	 */
	int main(int argc, char *argv[])
	{
		struct param sub_param = {0};
		struct param pub_param = {0};
		char *optstr = "p:s:m:";
		/* Without -m, request the largest representable maximum (all bits set). */
		__u64 maximum = ~(__u64)0;
		int opt, ret, fd;
		char *end;
		pid_t pid;

		if (argc < 2) {
			usage();
			return 1;
		}

		while ((opt = getopt(argc, argv, optstr)) != -1) {
			switch (opt) {
			case 'p':
				pub_param.cpu = atoi(optarg);
				break;
			case 's':
				sub_param.cpu = atoi(optarg);
				break;
			case 'm':
				/* atoi() only covers int; parse the full 64-bit range. */
				errno = 0;
				maximum = strtoull(optarg, &end, 10);
				if (errno || end == optarg || *end != '\0') {
					usage();
					return -1;
				}
				break;
			case '?':
			default:
				usage();
				return -1;
			}
		}

		/* eventfd() returns -1 on failure; 0 is a valid descriptor. */
		fd = eventfd(0, EFD_CLOEXEC);
		if (fd < 0) {
			printf("error=%s\n", strerror(errno));
			return -1;
		}

		ret = ioctl(fd, EFD_IOC_SET_MAXIMUM, &maximum);
		if (ret) {
			printf("error=%s\n", strerror(errno));
			return -1;
		}

		sub_param.fd = fd;
		pub_param.fd = fd;

		pid = fork();
		if (pid == 0)
			subscribe(&sub_param);
		else if (pid > 0)
			publish(&pub_param);
		else {
			printf("XXX: fork error!\n");
			return -1;
		}

		return 0;
	}

$ ./a.out  -p 2 -s 3 -m 6553500
-----cpu2-usage----------cpu3-usage----
usr sys idl wai stl:usr sys idl wai stl
 47  53   0   0   0: 46  54   0   0   0
 53  47   0   0   0: 45  54   1   0   0
 56  44   0   0   0: 48  52   0   0   0
 53  47   0   0   0: 45  55   0   0   0

$ ./a.out  -p 2 -s 3 -m 100
-----cpu2-usage----------cpu3-usage----
usr sys idl wai stl:usr sys idl wai stl
 41  59   0   0   0: 33  65   2   0   0
 46  54   0   0   0: 30  67   2   0   0
 38  62   0   0   0: 33  65   2   0   0
 37  63   0   0   0: 31  66   3   0   0

$ ./a.out  -p 2 -s 3 -m 10
-----cpu2-usage----------cpu3-usage----
usr sys idl wai stl:usr sys idl wai stl
 37  43  20   0   0: 21  42  37   0   0
 30  47  23   0   0: 20  42  38   0   0
 39  39  23   0   0: 24  37  39   0   0
 39  40  22   0   0: 23  41  36   0   0

Signed-off-by: Wen Yang <wen.yang@...ux.dev>
Cc: Al Viro <viro@...iv.linux.org.uk>
Cc: Jens Axboe <axboe@...nel.dk>
Cc: Christian Brauner <brauner@...nel.org>
Cc: Jan Kara <jack@...e.cz>
Cc: Dylan Yudaken <dylany@...com>
Cc: David Woodhouse <dwmw@...zon.co.uk>
Cc: Paolo Bonzini <pbonzini@...hat.com>
Cc: Dave Young <dyoung@...hat.com>
Cc: kernel test robot <lkp@...el.com>
Cc: linux-fsdevel@...r.kernel.org
Cc: linux-kernel@...r.kernel.org
---
v4: fix build errors reported by kernel test robot
v3: simply achieve flow control by configuring the maximum value of counter
v2: fix compilation errors 
https://lore.kernel.org/all/20240811085954.17162-1-wen.yang@linux.dev/
v1: https://lore.kernel.org/all/20240519144124.4429-1-wen.yang@linux.dev/

 fs/eventfd.c                 | 63 ++++++++++++++++++++++++++++++++----
 include/uapi/linux/eventfd.h |  3 ++
 2 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 76129bfcd663..043f637df5cb 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -39,6 +39,7 @@ struct eventfd_ctx {
 	 * also, adds to the "count" counter and issue a wakeup.
 	 */
 	__u64 count;
+	__u64 maximum;
 	unsigned int flags;
 	int id;
 };
@@ -49,7 +50,7 @@ struct eventfd_ctx {
  * @mask: [in] poll mask
  *
  * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * allow sleeping. In this function we allow the counter to reach the maximum
  * value, and we signal this as overflow condition by returning a EPOLLERR
  * to poll(2).
  */
@@ -70,7 +71,7 @@ void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	current->in_eventfd = 1;
-	if (ctx->count < ULLONG_MAX)
+	if (ctx->count < ctx->maximum)
 		ctx->count++;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
@@ -119,7 +120,7 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	__poll_t events = 0;
-	u64 count;
+	u64 count, max;
 
 	poll_wait(file, &ctx->wqh, wait);
 
@@ -162,12 +163,13 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 	 *     eventfd_poll returns 0
 	 */
 	count = READ_ONCE(ctx->count);
+	max = READ_ONCE(ctx->maximum);
 
 	if (count > 0)
 		events |= EPOLLIN;
-	if (count == ULLONG_MAX)
+	if (count == max)
 		events |= EPOLLERR;
-	if (ULLONG_MAX - 1 > count)
+	if (max - 1 > count)
 		events |= EPOLLOUT;
 
 	return events;
@@ -244,6 +246,11 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
 	return sizeof(ucnt);
 }
 
+static inline bool eventfd_is_writable(struct eventfd_ctx *ctx, __u64 cnt)
+{
+	return (ctx->maximum > ctx->count) && (ctx->maximum - ctx->count > cnt);
+}
+
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
 			     loff_t *ppos)
 {
@@ -259,11 +266,11 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
 	res = -EAGAIN;
-	if (ULLONG_MAX - ctx->count > ucnt)
+	if (eventfd_is_writable(ctx, ucnt))
 		res = sizeof(ucnt);
 	else if (!(file->f_flags & O_NONBLOCK)) {
 		res = wait_event_interruptible_locked_irq(ctx->wqh,
-				ULLONG_MAX - ctx->count > ucnt);
+				eventfd_is_writable(ctx, ucnt));
 		if (!res)
 			res = sizeof(ucnt);
 	}
@@ -299,6 +306,46 @@ static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
+/* ioctl handler: set or query the configurable maximum of the counter. */
+static long eventfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	void __user *argp = (void __user *)arg;
+	int ret = 0;
+	__u64 max;
+
+	if (!argp)
+		return -EINVAL;
+
+	switch (cmd) {
+	case EFD_IOC_SET_MAXIMUM:
+		if (copy_from_user(&max, argp, sizeof(max)))
+			return -EFAULT;
+
+		spin_lock_irq(&ctx->wqh.lock);
+		if (ctx->count >= max)
+			ret = -EINVAL;
+		else
+			ctx->maximum = max;
+		spin_unlock_irq(&ctx->wqh.lock);
+		break;
+	case EFD_IOC_GET_MAXIMUM:
+		spin_lock_irq(&ctx->wqh.lock);
+		max = ctx->maximum;
+		spin_unlock_irq(&ctx->wqh.lock);
+
+		if (copy_to_user(argp, &max, sizeof(max)))
+			ret = -EFAULT;
+		break;
+	default:
+		/* Unknown ioctl commands conventionally fail with ENOTTY. */
+		ret = -ENOTTY;
+		break;
+	}
+
+	return ret;
+}
+
 static const struct file_operations eventfd_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= eventfd_show_fdinfo,
@@ -308,6 +355,7 @@ static const struct file_operations eventfd_fops = {
 	.read_iter	= eventfd_read,
 	.write		= eventfd_write,
 	.llseek		= noop_llseek,
+	.unlocked_ioctl	= eventfd_ioctl,
 };
 
 /**
@@ -398,6 +446,7 @@ static int do_eventfd(unsigned int count, int flags)
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
+	ctx->maximum = ULLONG_MAX;
 	ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);
 
 	flags &= EFD_SHARED_FCNTL_FLAGS;
diff --git a/include/uapi/linux/eventfd.h b/include/uapi/linux/eventfd.h
index 2eb9ab6c32f3..96e6430a3d12 100644
--- a/include/uapi/linux/eventfd.h
+++ b/include/uapi/linux/eventfd.h
@@ -8,4 +8,7 @@
 #define EFD_CLOEXEC O_CLOEXEC
 #define EFD_NONBLOCK O_NONBLOCK
 
+#define EFD_IOC_SET_MAXIMUM	_IOW('E', 0, __u64)
+#define EFD_IOC_GET_MAXIMUM	_IOR('E', 0, __u64)
+
 #endif /* _UAPI_LINUX_EVENTFD_H */
-- 
2.25.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ