lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230422-uring-getdents-v2-6-2db1e37dc55e@codewreck.org>
Date:   Wed, 10 May 2023 19:52:54 +0900
From:   Dominique Martinet <asmadeus@...ewreck.org>
To:     Alexander Viro <viro@...iv.linux.org.uk>,
        Christian Brauner <brauner@...nel.org>,
        Jens Axboe <axboe@...nel.dk>,
        Pavel Begunkov <asml.silence@...il.com>,
        Stefan Roesch <shr@...com>
Cc:     Clay Harris <bugs@...ycon.org>, Dave Chinner <david@...morbit.com>,
        linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
        io-uring@...r.kernel.org,
        Dominique Martinet <asmadeus@...ewreck.org>
Subject: [PATCH v2 6/6] RFC: io_uring getdents: test returning an EOF flag
 in CQE

This turns out to be very slightly faster than an extra call to
getdents, but in practice it doesn't seem to be such an improvement as
the trailing getdents will return almost immediately be absorbed by the
scheduling noise in a find-like context (my ""server"" is too noisy to
get proper benchmarks out, but results look slightly better with this in
async mode, and almost identical in the NOWAIT path)

If the user is waiting the end of a single directory though it might be
worth it, so including the patch for comments.
(in particular I'm not really happy that the flag has become in-out for
vfs_getdents, especially when the getdents64 syscall does not use it,
but I don't see much other way around it)

If this approach is acceptable/wanted then this patch will be split down
further (at least dir_context/vfs_getdents, kernfs, libfs, uring in four
separate commits)

Signed-off-by: Dominique Martinet <asmadeus@...ewreck.org>
---
 fs/internal.h                 |  2 +-
 fs/kernfs/dir.c               |  1 +
 fs/libfs.c                    |  9 ++++++---
 fs/readdir.c                  | 10 ++++++----
 include/linux/fs.h            |  2 ++
 include/uapi/linux/io_uring.h |  2 ++
 io_uring/fs.c                 |  8 ++++++--
 7 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index 0264b001d99a..0b1552c7a870 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -267,4 +267,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap);
 struct linux_dirent64;
 
 int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
-		 unsigned int count, unsigned long flags);
+		 unsigned int count, unsigned long *flags);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 5a5b3e7881bf..53a6b4804c34 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1860,6 +1860,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
 	up_read(&root->kernfs_rwsem);
 	file->private_data = NULL;
 	ctx->pos = INT_MAX;
+	ctx->flags |= DIR_CONTEXT_F_EOD;
 	return 0;
 }
 
diff --git a/fs/libfs.c b/fs/libfs.c
index a3c7e42d90a7..b2a95dadffbd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -208,10 +208,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
 		p = &next->d_child;
 	}
 	spin_lock(&dentry->d_lock);
-	if (next)
+	if (next) {
 		list_move_tail(&cursor->d_child, &next->d_child);
-	else
+	} else {
 		list_del_init(&cursor->d_child);
+		ctx->flags |= DIR_CONTEXT_F_EOD;
+	}
 	spin_unlock(&dentry->d_lock);
 	dput(next);
 
@@ -1347,7 +1349,8 @@ static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
 
 static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	dir_emit_dots(file, ctx);
+	if (dir_emit_dots(file, ctx))
+		ctx->flags |= DIR_CONTEXT_F_EOD;
 	return 0;
 }
 
diff --git a/fs/readdir.c b/fs/readdir.c
index 1311b89d75e1..be75a2154b4f 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -358,14 +358,14 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
  * @file    : pointer to file struct of directory
  * @dirent  : pointer to user directory structure
  * @count   : size of buffer
- * @flags   : additional dir_context flags
+ * @flags   : pointer to additional dir_context flags
  */
 int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
-		 unsigned int count, unsigned long flags)
+		 unsigned int count, unsigned long *flags)
 {
 	struct getdents_callback64 buf = {
 		.ctx.actor = filldir64,
-		.ctx.flags = flags,
+		.ctx.flags = flags ? *flags : 0,
 		.count = count,
 		.current_dir = dirent
 	};
@@ -384,6 +384,8 @@ int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
 		else
 			error = count - buf.count;
 	}
+	if (flags)
+		*flags = buf.ctx.flags;
 	return error;
 }
 
@@ -397,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	if (!f.file)
 		return -EBADF;
 
-	error = vfs_getdents(f.file, dirent, count, 0);
+	error = vfs_getdents(f.file, dirent, count, NULL);
 
 	fdput_pos(f);
 	return error;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7de2b5ca38e..d1e31bccfb4f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1723,8 +1723,10 @@ struct dir_context {
  * flags for dir_context flags
  * DIR_CONTEXT_F_NOWAIT: Request non-blocking iterate
  *                       (requires file->f_mode & FMODE_NOWAIT)
+ * DIR_CONTEXT_F_EOD: Signal directory has been fully iterated, set by the fs
  */
 #define DIR_CONTEXT_F_NOWAIT	0x1
+#define DIR_CONTEXT_F_EOD	0x2
 
 /*
  * These flags let !MMU mmap() govern direct device mapping vs immediate
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 35d0de18d893..35877132027e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -381,11 +381,13 @@ struct io_uring_cqe {
  * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
  * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinct
  * 			them from sends.
+ * IORING_CQE_F_EOF     If set, file or directory has reached end of file.
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
+#define IORING_CQE_F_EOF		(1U << 4)
 
 enum {
 	IORING_CQE_BUFFER_SHIFT		= 16,
diff --git a/io_uring/fs.c b/io_uring/fs.c
index b15ec81c1ed2..f6222b0148ef 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -322,6 +322,7 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
 	unsigned long getdents_flags = 0;
+	u32 cqe_flags = 0;
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK) {
@@ -338,13 +339,16 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
 			goto out;
 	}
 
-	ret = vfs_getdents(req->file, gd->dirent, gd->count, getdents_flags);
+	ret = vfs_getdents(req->file, gd->dirent, gd->count, &getdents_flags);
 out:
 	if (ret == -EAGAIN &&
 	    (issue_flags & IO_URING_F_NONBLOCK))
 			return -EAGAIN;
 
-	io_req_set_res(req, ret, 0);
+	if (getdents_flags & DIR_CONTEXT_F_EOD)
+		cqe_flags |= IORING_CQE_F_EOF;
+
+	io_req_set_res(req, ret, cqe_flags);
 	return 0;
 }
 

-- 
2.39.2

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ