lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230523-abgleichen-rotieren-37fdb6fb9ef3@brauner>
Date:   Tue, 23 May 2023 16:30:14 +0200
From:   Christian Brauner <brauner@...nel.org>
To:     Dominique Martinet <asmadeus@...ewreck.org>
Cc:     Alexander Viro <viro@...iv.linux.org.uk>,
        Jens Axboe <axboe@...nel.dk>,
        Pavel Begunkov <asml.silence@...il.com>,
        Stefan Roesch <shr@...com>, Clay Harris <bugs@...ycon.org>,
        Dave Chinner <david@...morbit.com>,
        linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
        io-uring@...r.kernel.org
Subject: Re: [PATCH v2 6/6] RFC: io_uring getdents: test returning an EOF
 flag in CQE

On Wed, May 10, 2023 at 07:52:54PM +0900, Dominique Martinet wrote:
> This turns out to be very slightly faster than an extra call to
> getdents, but in practice it doesn't seem to be such an improvement as
> the trailing getdents will return almost immediately be absorbed by the
> scheduling noise in a find-like context (my ""server"" is too noisy to
> get proper benchmarks out, but results look slightly better with this in
> async mode, and almost identical in the NOWAIT path)
> 
> If the user is waiting the end of a single directory though it might be
> worth it, so including the patch for comments.
> (in particular I'm not really happy that the flag has become in-out for
> vfs_getdents, especially when the getdents64 syscall does not use it,
> but I don't see much other way around it)
> 
> If this approach is acceptable/wanted then this patch will be split down
> further (at least dir_context/vfs_getdents, kernfs, libfs, uring in four
> separate commits)
> 
> Signed-off-by: Dominique Martinet <asmadeus@...ewreck.org>
> ---
>  fs/internal.h                 |  2 +-
>  fs/kernfs/dir.c               |  1 +
>  fs/libfs.c                    |  9 ++++++---
>  fs/readdir.c                  | 10 ++++++----
>  include/linux/fs.h            |  2 ++
>  include/uapi/linux/io_uring.h |  2 ++
>  io_uring/fs.c                 |  8 ++++++--
>  7 files changed, 24 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/internal.h b/fs/internal.h
> index 0264b001d99a..0b1552c7a870 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -267,4 +267,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap);
>  struct linux_dirent64;
>  
>  int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
> -		 unsigned int count, unsigned long flags);
> +		 unsigned int count, unsigned long *flags);
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 5a5b3e7881bf..53a6b4804c34 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -1860,6 +1860,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
>  	up_read(&root->kernfs_rwsem);
>  	file->private_data = NULL;
>  	ctx->pos = INT_MAX;
> +	ctx->flags |= DIR_CONTEXT_F_EOD;
>  	return 0;
>  }
>  
> diff --git a/fs/libfs.c b/fs/libfs.c
> index a3c7e42d90a7..b2a95dadffbd 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -208,10 +208,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
>  		p = &next->d_child;
>  	}
>  	spin_lock(&dentry->d_lock);
> -	if (next)
> +	if (next) {
>  		list_move_tail(&cursor->d_child, &next->d_child);
> -	else
> +	} else {
>  		list_del_init(&cursor->d_child);
> +		ctx->flags |= DIR_CONTEXT_F_EOD;
> +	}
>  	spin_unlock(&dentry->d_lock);
>  	dput(next);
>  
> @@ -1347,7 +1349,8 @@ static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
>  
>  static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
>  {
> -	dir_emit_dots(file, ctx);
> +	if (dir_emit_dots(file, ctx))
> +		ctx->flags |= DIR_CONTEXT_F_EOD;
>  	return 0;
>  }
>  
> diff --git a/fs/readdir.c b/fs/readdir.c
> index 1311b89d75e1..be75a2154b4f 100644
> --- a/fs/readdir.c
> +++ b/fs/readdir.c
> @@ -358,14 +358,14 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
>   * @file    : pointer to file struct of directory
>   * @dirent  : pointer to user directory structure
>   * @count   : size of buffer
> - * @flags   : additional dir_context flags
> + * @flags   : pointer to additional dir_context flags
>   */
>  int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
> -		 unsigned int count, unsigned long flags)
> +		 unsigned int count, unsigned long *flags)
>  {
>  	struct getdents_callback64 buf = {
>  		.ctx.actor = filldir64,
> -		.ctx.flags = flags,
> +		.ctx.flags = flags ? *flags : 0,
>  		.count = count,
>  		.current_dir = dirent
>  	};
> @@ -384,6 +384,8 @@ int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
>  		else
>  			error = count - buf.count;
>  	}
> +	if (flags)
> +		*flags = buf.ctx.flags;
>  	return error;
>  }
>  
> @@ -397,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
>  	if (!f.file)
>  		return -EBADF;
>  
> -	error = vfs_getdents(f.file, dirent, count, 0);
> +	error = vfs_getdents(f.file, dirent, count, NULL);
>  
>  	fdput_pos(f);
>  	return error;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f7de2b5ca38e..d1e31bccfb4f 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1723,8 +1723,10 @@ struct dir_context {
>   * flags for dir_context flags
>   * DIR_CONTEXT_F_NOWAIT: Request non-blocking iterate
>   *                       (requires file->f_mode & FMODE_NOWAIT)
> + * DIR_CONTEXT_F_EOD: Signal directory has been fully iterated, set by the fs
>   */
>  #define DIR_CONTEXT_F_NOWAIT	0x1
> +#define DIR_CONTEXT_F_EOD	0x2
>  
>  /*
>   * These flags let !MMU mmap() govern direct device mapping vs immediate
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 35d0de18d893..35877132027e 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -381,11 +381,13 @@ struct io_uring_cqe {
>   * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
>   * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinct
>   * 			them from sends.
> + * IORING_CQE_F_EOF     If set, file or directory has reached end of file.
>   */
>  #define IORING_CQE_F_BUFFER		(1U << 0)
>  #define IORING_CQE_F_MORE		(1U << 1)
>  #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
>  #define IORING_CQE_F_NOTIF		(1U << 3)
> +#define IORING_CQE_F_EOF		(1U << 4)
>  
>  enum {
>  	IORING_CQE_BUFFER_SHIFT		= 16,
> diff --git a/io_uring/fs.c b/io_uring/fs.c
> index b15ec81c1ed2..f6222b0148ef 100644
> --- a/io_uring/fs.c
> +++ b/io_uring/fs.c
> @@ -322,6 +322,7 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>  {
>  	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>  	unsigned long getdents_flags = 0;
> +	u32 cqe_flags = 0;
>  	int ret;
>  
>  	if (issue_flags & IO_URING_F_NONBLOCK) {
> @@ -338,13 +339,16 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>  			goto out;
>  	}
>  
> -	ret = vfs_getdents(req->file, gd->dirent, gd->count, getdents_flags);
> +	ret = vfs_getdents(req->file, gd->dirent, gd->count, &getdents_flags);

I don't understand how synchronization and updating of f_pos works here.
For example, what happens if a concurrent seek happens on the fd while
io_uring is using vfs_getdents which calls into iterate_dir() and
updates f_pos?

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ