[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1585ab7d7c7c987450f733b5773bc1ca1f673fce.1638282789.git.asml.silence@gmail.com>
Date: Tue, 30 Nov 2021 15:18:54 +0000
From: Pavel Begunkov <asml.silence@...il.com>
To: io-uring@...r.kernel.org, netdev@...r.kernel.org,
linux-kernel@...r.kernel.org
Cc: Jakub Kicinski <kuba@...nel.org>,
Jonathan Lemon <jonathan.lemon@...il.com>,
"David S . Miller" <davem@...emloft.net>,
Willem de Bruijn <willemb@...gle.com>,
Eric Dumazet <edumazet@...gle.com>,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
David Ahern <dsahern@...nel.org>, Jens Axboe <axboe@...nel.dk>,
Pavel Begunkov <asml.silence@...il.com>
Subject: [RFC 06/12] io_uring: add send notifiers registration
Add IORING_REGISTER_TX_CTX and IORING_UNREGISTER_TX_CTX. A transmission
(i.e. send) context will be used to notify the userspace when the
fixed buffers used for zerocopy sends are released by the kernel.
Notifications of a single tx context live in generations, where each
generation posts one CQE with ->user_data equal to the specified tag and
->res set to a generation number starting from 0. All requests issued
against a ctx will get attached to the current generation of
notifications. Then, the userspace will be able to request to flush the
notification allowing it to post a CQE when all buffers of all requests
attached to it are released by the kernel. It'll also switch the
generation to a new one with a sequence number incremented by one.
Signed-off-by: Pavel Begunkov <asml.silence@...il.com>
---
fs/io_uring.c | 72 +++++++++++++++++++++++++++++++++++
include/uapi/linux/io_uring.h | 7 ++++
2 files changed, 79 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 59380e3454ad..a01f91e70fa5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -94,6 +94,8 @@
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
+#define IORING_MAX_TX_NOTIFIERS (1U << 10)
+
/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 15)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
@@ -326,6 +328,15 @@ struct io_submit_state {
struct blk_plug plug;
};
+struct io_tx_notifier {	/* per-generation notifier; fields added by later patches */
+};
+
+struct io_tx_ctx {	/* one registered tx (send) notification context */
+	struct io_tx_notifier *notifier;	/* current generation's notifier */
+	u64 tag;	/* userspace tag, posted as cqe->user_data on flush */
+	u32 seq;	/* generation number, starts at 0, bumped on flush */
+};
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -373,6 +384,8 @@ struct io_ring_ctx {
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
+ struct io_tx_ctx *tx_ctxs;
+ unsigned nr_tx_ctxs;
struct io_submit_state submit_state;
struct list_head timeout_list;
@@ -9199,6 +9212,55 @@ static int io_buffer_validate(struct iovec *iov)
return 0;
}
+static int io_sqe_tx_ctx_unregister(struct io_ring_ctx *ctx)	/* free all registered tx contexts */
+{
+	if (!ctx->nr_tx_ctxs)
+		return -ENXIO;	/* nothing registered */
+
+	kvfree(ctx->tx_ctxs);	/* allocated in io_sqe_tx_ctx_register() */
+	ctx->tx_ctxs = NULL;
+	ctx->nr_tx_ctxs = 0;	/* allow a fresh registration afterwards */
+	return 0;
+}
+
+static int io_sqe_tx_ctx_register(struct io_ring_ctx *ctx,	/* register nr_args tx contexts */
+				  void __user *arg, unsigned int nr_args)
+{
+	struct io_uring_tx_ctx_register __user *tx_args = arg;
+	struct io_uring_tx_ctx_register tx_arg;
+	unsigned i;
+	int ret;
+
+	if (ctx->nr_tx_ctxs)	/* already registered; must unregister first */
+		return -EBUSY;
+	if (!nr_args)
+		return -EINVAL;
+	if (nr_args > IORING_MAX_TX_NOTIFIERS)
+		return -EMFILE;	/* table would exceed the per-ring limit */
+
+	ctx->tx_ctxs = kvcalloc(nr_args, sizeof(ctx->tx_ctxs[0]),
+				GFP_KERNEL_ACCOUNT);
+	if (!ctx->tx_ctxs)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_args; i++, ctx->nr_tx_ctxs++) {
+		struct io_tx_ctx *tx_ctx = &ctx->tx_ctxs[i];
+
+		if (copy_from_user(&tx_arg, &tx_args[i], sizeof(tx_arg))) {
+			ret = -EFAULT;
+			goto out_free;
+		}
+		tx_ctx->tag = tx_arg.tag;	/* echoed as cqe->user_data on flush */
+	}
+	return 0;
+
+out_free:	/* no files involved here, so don't call the label "out_fput" */
+	kvfree(ctx->tx_ctxs);
+	ctx->tx_ctxs = NULL;
+	ctx->nr_tx_ctxs = 0;
+	return ret;
+}
+
static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags)
{
@@ -9429,6 +9491,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
+ io_sqe_tx_ctx_unregister(ctx);
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
@@ -11104,6 +11167,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
+ case IORING_REGISTER_TX_CTX:
+ ret = io_sqe_tx_ctx_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_TX_CTX:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_sqe_tx_ctx_unregister(ctx);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..f2e8d18e40e0 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -325,6 +325,9 @@ enum {
/* set/get max number of io-wq workers */
IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
+ IORING_REGISTER_TX_CTX = 20,
+ IORING_UNREGISTER_TX_CTX = 21,
+
/* this goes last */
IORING_REGISTER_LAST
};
@@ -365,6 +368,10 @@ struct io_uring_rsrc_update2 {
__u32 resv2;
};
+struct io_uring_tx_ctx_register {	/* argument array element for IORING_REGISTER_TX_CTX */
+	__u64 tag;	/* posted as cqe->user_data of flush notifications */
+};
+
/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)
--
2.34.0
Powered by blists - more mailing lists