Message-ID: <51cddd97b31d80ec8842a88b9f3c9881419e8a7b.1763725387.git.asml.silence@gmail.com>
Date: Sun, 23 Nov 2025 22:51:25 +0000
From: Pavel Begunkov <asml.silence@...il.com>
To: linux-block@...r.kernel.org,
io-uring@...r.kernel.org
Cc: Vishal Verma <vishal1.verma@...el.com>,
tushar.gohad@...el.com,
Keith Busch <kbusch@...nel.org>,
Jens Axboe <axboe@...nel.dk>,
Christoph Hellwig <hch@....de>,
Sagi Grimberg <sagi@...mberg.me>,
Alexander Viro <viro@...iv.linux.org.uk>,
Christian Brauner <brauner@...nel.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Sumit Semwal <sumit.semwal@...aro.org>,
Christian König <christian.koenig@....com>,
Pavel Begunkov <asml.silence@...il.com>,
linux-kernel@...r.kernel.org,
linux-nvme@...ts.infradead.org,
linux-fsdevel@...r.kernel.org,
linux-media@...r.kernel.org,
dri-devel@...ts.freedesktop.org,
linaro-mm-sig@...ts.linaro.org
Subject: [RFC v2 05/11] block: add infra to handle dmabuf tokens

Add blk-mq infrastructure to handle dmabuf tokens. There are two main
objects. The first is struct blk_mq_dma_token, which extends struct
dma_token and is passed around in an iterator. The second is struct
blk_mq_dma_map, which holds the actual mapping and, unlike the token,
can be ejected (e.g. by move_notify) and recreated.

The token keeps an RCU-protected pointer to the mapping, so resolving
a token into a mapping to attach to a request is an RCU lookup that
takes a percpu reference to the mapping.

If no mapping is currently attached to a token, one has to be created
by calling into the driver (e.g. nvme) via a new callback. That
requires waiting, and therefore can't be done for nowait requests and
can't happen deeper in the stack, e.g. during nvme request submission.

The structure split is needed because move_notify can ask to
invalidate the dma mapping at any moment, and we need a way to remove
it while concurrently waiting for in-flight requests that use the
previous mapping to complete.

Signed-off-by: Pavel Begunkov <asml.silence@...il.com>
---
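
As mentioned above, creating a mapping calls into the driver through
the new blk_mq_ops callbacks. For reference, a rough sketch of how a
driver might wire them up on top of the dynamic dma-buf importer API;
all mydrv_* names are hypothetical, error handling is trimmed, and the
real nvme wiring is expected later in the series:

#include <linux/blk-mq-dma-token.h>
#include <linux/dma-buf.h>

static void mydrv_move_notify(struct dma_buf_attachment *attach)
{
        /* eject the mapping; inflight requests drain via the percpu ref */
        blk_mq_dma_map_move_notify(attach->importer_priv);
}

static const struct dma_buf_attach_ops mydrv_attach_ops = {
        .move_notify    = mydrv_move_notify,
};

static int mydrv_init_dma_token(struct request_queue *q,
                                struct blk_mq_dma_token *token)
{
        struct dma_buf_attachment *attach;

        /* mydrv_queue_to_dev() stands in for the driver's DMA device */
        attach = dma_buf_dynamic_attach(token->dmabuf, mydrv_queue_to_dev(q),
                                        &mydrv_attach_ops, token);
        if (IS_ERR(attach))
                return PTR_ERR(attach);
        token->private = attach;
        return 0;
}

static void mydrv_clean_dma_token(struct request_queue *q,
                                  struct blk_mq_dma_token *token)
{
        dma_buf_detach(token->dmabuf, token->private);
}

/* called by blk_mq_create_dma_map() with the dmabuf's resv lock held */
static int mydrv_dma_map(struct request_queue *q, struct blk_mq_dma_map *map)
{
        struct sg_table *sgt;

        sgt = dma_buf_map_attachment(map->token->private, map->token->dir);
        if (IS_ERR(sgt))
                return PTR_ERR(sgt);
        map->sgt = sgt;
        return 0;
}

/* may run from the percpu-ref release work, without the resv lock */
static void mydrv_dma_unmap(struct request_queue *q,
                            struct blk_mq_dma_map *map)
{
        dma_buf_unmap_attachment_unlocked(map->token->private, map->sgt,
                                          map->token->dir);
}
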
block/Makefile | 1 +
block/bdev.c | 14 ++
block/blk-mq-dma-token.c | 238 +++++++++++++++++++++++++++++++
block/blk-mq.c | 20 +++
block/fops.c | 1 +
include/linux/blk-mq-dma-token.h | 60 ++++++++
include/linux/blk-mq.h | 21 +++
include/linux/blkdev.h | 3 +
8 files changed, 358 insertions(+)
create mode 100644 block/blk-mq-dma-token.c
create mode 100644 include/linux/blk-mq-dma-token.h
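
For context, a consumer (io_uring in later patches of the series) is
expected to reach blk_mq_dma_map() through the new ->dma_map file
operation hooked up in block/fops.c below. A minimal sketch, where
register_dmabuf() is a made-up caller and only the dmabuf/dir fields
of struct dma_token_params (from the earlier patches) are assumed:

static struct dma_token *register_dmabuf(struct file *file,
                                         struct dma_buf *dmabuf)
{
        struct dma_token_params params = {
                .dmabuf = dmabuf,               /* an imported dma-buf */
                .dir    = DMA_BIDIRECTIONAL,
        };

        if (!file->f_op->dma_map)
                return ERR_PTR(-EOPNOTSUPP);
        /* resolves to blkdev_dma_map() for O_DIRECT bdev files */
        return file->f_op->dma_map(file, &params);
}
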
diff --git a/block/Makefile b/block/Makefile
index c65f4da93702..0190e5aa9f00 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
blk-crypto-sysfs.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
+obj-$(CONFIG_DMA_SHARED_BUFFER) += blk-mq-dma-token.o
diff --git a/block/bdev.c b/block/bdev.c
index 810707cca970..da89d20f33f3 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -28,6 +28,7 @@
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
+#include <linux/blk-mq-dma-token.h>
#include "../fs/internal.h"
#include "blk.h"
@@ -61,6 +62,19 @@ struct block_device *file_bdev(struct file *bdev_file)
}
EXPORT_SYMBOL(file_bdev);
+struct dma_token *blkdev_dma_map(struct file *file,
+ struct dma_token_params *params)
+{
+ struct request_queue *q = bdev_get_queue(file_bdev(file));
+
+ if (!(file->f_flags & O_DIRECT))
+ return ERR_PTR(-EINVAL);
+ if (!q->mq_ops)
+ return ERR_PTR(-EINVAL);
+
+ return blk_mq_dma_map(q, params);
+}
+
static void bdev_write_inode(struct block_device *bdev)
{
struct inode *inode = BD_INODE(bdev);
diff --git a/block/blk-mq-dma-token.c b/block/blk-mq-dma-token.c
new file mode 100644
index 000000000000..cd62c4d09422
--- /dev/null
+++ b/block/blk-mq-dma-token.c
@@ -0,0 +1,238 @@
+#include <linux/blk-mq-dma-token.h>
+#include <linux/dma-resv.h>
+
+struct blk_mq_dma_fence {
+ struct dma_fence base;
+ spinlock_t lock;
+};
+
+static const char *blk_mq_fence_drv_name(struct dma_fence *fence)
+{
+ return "blk-mq";
+}
+
+static const struct dma_fence_ops blk_mq_dma_fence_ops = {
+ .get_driver_name = blk_mq_fence_drv_name,
+ .get_timeline_name = blk_mq_fence_drv_name,
+};
+
+static void blk_mq_dma_token_free(struct blk_mq_dma_token *token)
+{
+ token->q->mq_ops->clean_dma_token(token->q, token);
+ dma_buf_put(token->dmabuf);
+ kfree(token);
+}
+
+static inline void blk_mq_dma_token_put(struct blk_mq_dma_token *token)
+{
+ if (refcount_dec_and_test(&token->refs))
+ blk_mq_dma_token_free(token);
+}
+
+static void blk_mq_dma_mapping_free(struct blk_mq_dma_map *map)
+{
+ struct blk_mq_dma_token *token = map->token;
+
+ if (map->sgt)
+ token->q->mq_ops->dma_unmap(token->q, map);
+
+ dma_fence_put(&map->fence->base);
+ percpu_ref_exit(&map->refs);
+ kfree(map);
+ blk_mq_dma_token_put(token);
+}
+
+static void blk_mq_dma_map_work_free(struct work_struct *work)
+{
+ struct blk_mq_dma_map *map = container_of(work, struct blk_mq_dma_map,
+ free_work);
+
+ dma_fence_signal(&map->fence->base);
+ blk_mq_dma_mapping_free(map);
+}
+
+static void blk_mq_dma_map_refs_free(struct percpu_ref *ref)
+{
+ struct blk_mq_dma_map *map = container_of(ref, struct blk_mq_dma_map, refs);
+
+ INIT_WORK(&map->free_work, blk_mq_dma_map_work_free);
+ queue_work(system_wq, &map->free_work);
+}
+
+static struct blk_mq_dma_map *blk_mq_alloc_dma_mapping(struct blk_mq_dma_token *token)
+{
+ struct blk_mq_dma_fence *fence = NULL;
+ struct blk_mq_dma_map *map;
+ int ret = -ENOMEM;
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+ if (!fence)
+ goto err;
+
+ ret = percpu_ref_init(&map->refs, blk_mq_dma_map_refs_free, 0,
+ GFP_KERNEL);
+ if (ret)
+ goto err;
+
+ spin_lock_init(&fence->lock);
+ dma_fence_init(&fence->base, &blk_mq_dma_fence_ops, &fence->lock,
+ token->fence_ctx, atomic_inc_return(&token->fence_seq));
+ map->fence = fence;
+ map->token = token;
+ refcount_inc(&token->refs);
+ return map;
+err:
+ kfree(map);
+ kfree(fence);
+ return ERR_PTR(ret);
+}
+
+static inline
+struct blk_mq_dma_map *blk_mq_get_token_map(struct blk_mq_dma_token *token)
+{
+ struct blk_mq_dma_map *map;
+
+ guard(rcu)();
+
+ map = rcu_dereference(token->map);
+ if (unlikely(!map || !percpu_ref_tryget_live_rcu(&map->refs)))
+ return NULL;
+ return map;
+}
+
+static struct blk_mq_dma_map *
+blk_mq_create_dma_map(struct blk_mq_dma_token *token)
+{
+ struct dma_buf *dmabuf = token->dmabuf;
+ struct blk_mq_dma_map *map;
+ long ret;
+
+ guard(mutex)(&token->mapping_lock);
+
+ map = blk_mq_get_token_map(token);
+ if (map)
+ return map;
+
+ map = blk_mq_alloc_dma_mapping(token);
+ if (IS_ERR(map))
+ return map;
+
+ dma_resv_lock(dmabuf->resv, NULL);
+ ret = dma_resv_wait_timeout(dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP,
+ true, MAX_SCHEDULE_TIMEOUT);
+ ret = ret ? ret : -ETIME;
+ if (ret > 0)
+ ret = token->q->mq_ops->dma_map(token->q, map);
+ dma_resv_unlock(dmabuf->resv);
+
+ if (ret) {
+ blk_mq_dma_mapping_free(map);
+ return ERR_PTR(ret);
+ }
+
+ percpu_ref_get(&map->refs);
+ rcu_assign_pointer(token->map, map);
+ return map;
+}
+
+static void blk_mq_dma_map_remove(struct blk_mq_dma_token *token)
+{
+ struct dma_buf *dmabuf = token->dmabuf;
+ struct blk_mq_dma_map *map;
+ int ret;
+
+ dma_resv_assert_held(dmabuf->resv);
+
+ ret = dma_resv_reserve_fences(dmabuf->resv, 1);
+ if (WARN_ON_ONCE(ret))
+ return;
+
+ map = rcu_dereference_protected(token->map,
+ dma_resv_held(dmabuf->resv));
+ if (!map)
+ return;
+ rcu_assign_pointer(token->map, NULL);
+
+ dma_resv_add_fence(dmabuf->resv, &map->fence->base,
+ DMA_RESV_USAGE_KERNEL);
+ percpu_ref_kill(&map->refs);
+}
+
+blk_status_t blk_rq_assign_dma_map(struct request *rq,
+ struct blk_mq_dma_token *token)
+{
+ struct blk_mq_dma_map *map;
+
+ map = blk_mq_get_token_map(token);
+ if (map)
+ goto complete;
+
+ if (rq->cmd_flags & REQ_NOWAIT)
+ return BLK_STS_AGAIN;
+
+ map = blk_mq_create_dma_map(token);
+ if (IS_ERR(map))
+ return BLK_STS_RESOURCE;
+complete:
+ rq->dma_map = map;
+ return BLK_STS_OK;
+}
+
+void blk_mq_dma_map_move_notify(struct blk_mq_dma_token *token)
+{
+ blk_mq_dma_map_remove(token);
+}
+
+static void blk_mq_release_dma_mapping(struct dma_token *base_token)
+{
+ struct blk_mq_dma_token *token = dma_token_to_blk_mq(base_token);
+ struct dma_buf *dmabuf = token->dmabuf;
+
+ dma_resv_lock(dmabuf->resv, NULL);
+ blk_mq_dma_map_remove(token);
+ dma_resv_unlock(dmabuf->resv);
+
+ blk_mq_dma_token_put(token);
+}
+
+struct dma_token *blk_mq_dma_map(struct request_queue *q,
+ struct dma_token_params *params)
+{
+ struct dma_buf *dmabuf = params->dmabuf;
+ struct blk_mq_dma_token *token;
+ int ret;
+
+ if (!q->mq_ops->dma_map || !q->mq_ops->dma_unmap ||
+ !q->mq_ops->init_dma_token || !q->mq_ops->clean_dma_token)
+ return ERR_PTR(-EINVAL);
+
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
+ if (!token)
+ return ERR_PTR(-ENOMEM);
+
+ if (!blk_get_queue(q)) {
+ kfree(token);
+ return ERR_PTR(-EFAULT);
+ }
+
+ get_dma_buf(dmabuf);
+ token->fence_ctx = dma_fence_context_alloc(1);
+ token->dmabuf = dmabuf;
+ token->dir = params->dir;
+ token->base.release = blk_mq_release_dma_mapping;
+ token->q = q;
+ refcount_set(&token->refs, 1);
+ mutex_init(&token->mapping_lock);
+ ret = token->q->mq_ops->init_dma_token(token->q, token);
+ if (ret) {
+ dma_buf_put(dmabuf);
+ blk_put_queue(q);
+ kfree(token);
+ return ERR_PTR(ret);
+ }
+ return &token->base;
+}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f2650c97a75e..1ff3a7e3191b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -29,6 +29,7 @@
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
#include <linux/sched/isolation.h>
+#include <linux/blk-mq-dma-token.h>
#include <trace/events/block.h>
@@ -439,6 +440,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->nr_integrity_segments = 0;
rq->end_io = NULL;
rq->end_io_data = NULL;
+ rq->dma_map = NULL;
blk_crypto_rq_set_defaults(rq);
INIT_LIST_HEAD(&rq->queuelist);
@@ -794,6 +796,7 @@ static void __blk_mq_free_request(struct request *rq)
blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
+ blk_rq_drop_dma_map(rq);
if (rq->tag != BLK_MQ_NO_TAG) {
blk_mq_dec_active_requests(hctx);
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
@@ -3214,6 +3217,23 @@ void blk_mq_submit_bio(struct bio *bio)
blk_mq_bio_to_request(rq, bio, nr_segs);
+ if (bio_flagged(bio, BIO_DMA_TOKEN)) {
+ struct blk_mq_dma_token *token;
+ blk_status_t ret;
+
+ token = dma_token_to_blk_mq(bio->dma_token);
+ ret = blk_rq_assign_dma_map(rq, token);
+ if (ret) {
+ if (ret == BLK_STS_AGAIN) {
+ bio_wouldblock_error(bio);
+ } else {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ }
+ goto queue_exit;
+ }
+ }
+
ret = blk_crypto_rq_get_keyslot(rq);
if (ret != BLK_STS_OK) {
bio->bi_status = ret;
diff --git a/block/fops.c b/block/fops.c
index 41f8795874a9..ac52fe1a4b8d 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -973,6 +973,7 @@ const struct file_operations def_blk_fops = {
.fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC,
+ .dma_map = blkdev_dma_map,
};
static __init int blkdev_init(void)
diff --git a/include/linux/blk-mq-dma-token.h b/include/linux/blk-mq-dma-token.h
new file mode 100644
index 000000000000..4a8d84addc06
--- /dev/null
+++ b/include/linux/blk-mq-dma-token.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BLK_MQ_DMA_TOKEN_H
+#define BLK_MQ_DMA_TOKEN_H
+
+#include <linux/blk-mq.h>
+#include <linux/dma_token.h>
+#include <linux/percpu-refcount.h>
+
+struct blk_mq_dma_token;
+struct blk_mq_dma_fence;
+
+struct blk_mq_dma_map {
+ void *private;
+
+ struct percpu_ref refs;
+ struct sg_table *sgt;
+ struct blk_mq_dma_token *token;
+ struct blk_mq_dma_fence *fence;
+ struct work_struct free_work;
+};
+
+struct blk_mq_dma_token {
+ struct dma_token base;
+ enum dma_data_direction dir;
+
+ void *private;
+
+ struct dma_buf *dmabuf;
+ struct blk_mq_dma_map __rcu *map;
+ struct request_queue *q;
+
+ struct mutex mapping_lock;
+ refcount_t refs;
+
+ atomic_t fence_seq;
+ u64 fence_ctx;
+};
+
+static inline
+struct blk_mq_dma_token *dma_token_to_blk_mq(struct dma_token *token)
+{
+ return container_of(token, struct blk_mq_dma_token, base);
+}
+
+blk_status_t blk_rq_assign_dma_map(struct request *rq,
+ struct blk_mq_dma_token *token);
+
+static inline void blk_rq_drop_dma_map(struct request *rq)
+{
+ if (rq->dma_map) {
+ percpu_ref_put(&rq->dma_map->refs);
+ rq->dma_map = NULL;
+ }
+}
+
+void blk_mq_dma_map_move_notify(struct blk_mq_dma_token *token);
+struct dma_token *blk_mq_dma_map(struct request_queue *q,
+ struct dma_token_params *params);
+
+#endif /* BLK_MQ_DMA_TOKEN_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b54506b3b76d..4745d1e183f2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -94,6 +94,9 @@ enum mq_rq_state {
MQ_RQ_COMPLETE = 2,
};
+struct blk_mq_dma_map;
+struct blk_mq_dma_token;
+
/*
* Try to put the fields that are referenced together in the same cacheline.
*
@@ -170,6 +173,8 @@ struct request {
unsigned long deadline;
+ struct blk_mq_dma_map *dma_map;
+
/*
* The hash is used inside the scheduler, and killed once the
* request reaches the dispatch list. The ipi_list is only used
@@ -675,6 +680,21 @@ struct blk_mq_ops {
*/
void (*map_queues)(struct blk_mq_tag_set *set);
+ /**
+ * @dma_map: Allows drivers to pre-map a dmabuf. The resulting
+ * driver-specific mapping is wrapped into a dma_token and passed to
+ * the read/write path in an iterator.
+ */
+ int (*dma_map)(struct request_queue *q, struct blk_mq_dma_map *map);
+ void (*dma_unmap)(struct request_queue *q, struct blk_mq_dma_map *map);
+ int (*init_dma_token)(struct request_queue *q,
+ struct blk_mq_dma_token *token);
+ void (*clean_dma_token)(struct request_queue *q,
+ struct blk_mq_dma_token *token);
+
+ struct dma_buf_attachment *(*dma_attach)(struct request_queue *q,
+ struct dma_token_params *params);
+
#ifdef CONFIG_BLK_DEBUG_FS
/**
* @show_rq: Used by the debugfs implementation to show driver-specific
@@ -946,6 +966,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue_nomemsave(struct request_queue *q);
void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q);
+
static inline unsigned int __must_check
blk_mq_freeze_queue(struct request_queue *q)
{
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cb4ba09959ee..dec75348f8dc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1777,6 +1777,9 @@ struct block_device *file_bdev(struct file *bdev_file);
bool disk_live(struct gendisk *disk);
unsigned int block_size(struct block_device *bdev);
+struct dma_token *blkdev_dma_map(struct file *file,
+ struct dma_token_params *params);
+
#ifdef CONFIG_BLOCK
void invalidate_bdev(struct block_device *bdev);
int sync_blockdev(struct block_device *bdev);
--
2.52.0