[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <dd58b093-1e50-343b-255a-6af46b96e055@linux.alibaba.com>
Date: Tue, 10 May 2022 12:22:47 +0800
From: Ziyang Zhang <ZiyangZhang@...ux.alibaba.com>
To: Gabriel Krisman Bertazi <krisman@...labora.com>,
Ming Lei <ming.lei@...hat.com>
Cc: Jens Axboe <axboe@...nel.dk>, linux-block@...r.kernel.org,
linux-kernel@...r.kernel.org, io-uring@...r.kernel.org,
Xiaoguang Wang <xiaoguang.wang@...ux.alibaba.com>
Subject: Re: [RFC PATCH] ubd: add io_uring based userspace block driver
On 2022/5/9 23:10, Gabriel Krisman Bertazi wrote:
> Ming Lei <ming.lei@...hat.com> writes:
>
>> This is the driver part of userspace block driver(ubd driver), the other
>> part is userspace daemon part(ubdsrv)[1].
>>
>> The two parts communicate by io_uring's IORING_OP_URING_CMD with one
>> shared cmd buffer for storing io command, and the buffer is read only for
>> ubdsrv, each io command is indexed by io request tag directly, and
>> is written by ubd driver.
>>
>> For example, when one READ io request is submitted to ubd block driver, ubd
>> driver stores the io command into cmd buffer first, then completes one
>> IORING_OP_URING_CMD for notifying ubdsrv, and the URING_CMD is issued to
>> ubd driver beforehand by ubdsrv for getting notification of any new io request,
>> and each URING_CMD is associated with one io request by tag.
>>
>> After ubdsrv gets the io command, it translates and handles the ubd io
>> request, such as, for the ubd-loop target, ubdsrv translates the request
>> into same request on another file or disk, like the kernel loop block
>> driver. In ubdsrv's implementation, the io is still handled by io_uring,
>> and share same ring with IORING_OP_URING_CMD command. When the target io
>> request is done, the same IORING_OP_URING_CMD is issued to ubd driver for
>> both committing io request result and getting future notification of new
>> io request.
>>
>> Another thing done by ubd driver is to copy data between kernel io
>> request and ubdsrv's io buffer:
>>
>> 1) before ubsrv handles WRITE request, copy the request's data into
>> ubdsrv's userspace io buffer, so that ubdsrv can handle the write
>> request
>>
>> 2) after ubsrv handles READ request, copy ubdsrv's userspace io buffer
>> into this READ request, then ubd driver can complete the READ request
>>
>> Zero copy may be switched if mm is ready to support it.
>>
>> ubd driver doesn't handle any logic of the specific user space driver,
>> so it should be small/simple enough.
>>
>> [1] ubdsrv
>> https://github.com/ming1/ubdsrv/commits/devel
>>
>> Signed-off-by: Ming Lei <ming.lei@...hat.com>
>> ---
>> drivers/block/Kconfig | 7 +
>> drivers/block/Makefile | 2 +
>> drivers/block/ubd_drv.c | 1193 ++++++++++++++++++++++++++++++++++
>> include/uapi/linux/ubd_cmd.h | 167 +++++
>> 4 files changed, 1369 insertions(+)
>> create mode 100644 drivers/block/ubd_drv.c
>> create mode 100644 include/uapi/linux/ubd_cmd.h
>>
>> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
>> index fdb81f2794cd..3893ccd82e8a 100644
>> --- a/drivers/block/Kconfig
>> +++ b/drivers/block/Kconfig
>> @@ -408,6 +408,13 @@ config BLK_DEV_RBD
>>
>> If unsure, say N.
>>
>> +config BLK_DEV_USER_BLK_DRV
>> + bool "Userspace block driver"
>> + select IO_URING
>> + default y
>> + help
>> + io uring based userspace block driver.
>> +
>> source "drivers/block/rnbd/Kconfig"
>>
>> endif # BLK_DEV
>> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
>> index 934a9c7c3a7c..effff34babd9 100644
>> --- a/drivers/block/Makefile
>> +++ b/drivers/block/Makefile
>> @@ -39,4 +39,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
>>
>> obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
>>
>> +obj-$(CONFIG_BLK_DEV_USER_BLK_DRV) += ubd_drv.o
>> +
>> swim_mod-y := swim.o swim_asm.o
>> diff --git a/drivers/block/ubd_drv.c b/drivers/block/ubd_drv.c
>> new file mode 100644
>> index 000000000000..9eba7ee17aff
>> --- /dev/null
>> +++ b/drivers/block/ubd_drv.c
>
> Hi Ming,
>
> Thank you very much for sending it.
>
> Given the interest in working on it from a bunch of people, I'd
> appreciate if we consider putting the code into drivers/staging as soon
> as possible, and let us work on the same code base to consolidate the
> protocol there, instead of taking too long to push to drivers/block.
Approveļ¼
>
> One initial concern is that UBD is already the name for storage device
> exposed by User Mode Linux. A rename would avoid future confusion.
>
>> @@ -0,0 +1,1193 @@
>> +// SPDX-License-Identifier: GPL-2.0-or-later
>> +/*
>> + * Userspace block device - block device which IO is handled from userspace
>> + *
>> + * Take full use of io_uring passthrough command for communicating with
>> + * ubd userspace daemon(ubdsrvd) for handling basic IO request.
>> + *
>> + * Copyright 2022 Ming Lei <ming.lei@...hat.com>
>> + *
>> + * (part of code stolen from loop.c)
>> + */
>> +#include <linux/module.h>
>> +#include <linux/moduleparam.h>
>> +#include <linux/sched.h>
>> +#include <linux/fs.h>
>> +#include <linux/pagemap.h>
>> +#include <linux/file.h>
>> +#include <linux/stat.h>
>> +#include <linux/errno.h>
>> +#include <linux/major.h>
>> +#include <linux/wait.h>
>> +#include <linux/blkdev.h>
>> +#include <linux/init.h>
>> +#include <linux/swap.h>
>> +#include <linux/slab.h>
>> +#include <linux/compat.h>
>> +#include <linux/mutex.h>
>> +#include <linux/writeback.h>
>> +#include <linux/completion.h>
>> +#include <linux/highmem.h>
>> +#include <linux/sysfs.h>
>> +#include <linux/miscdevice.h>
>> +#include <linux/falloc.h>
>> +#include <linux/uio.h>
>> +#include <linux/ioprio.h>
>> +#include <linux/sched/mm.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/cdev.h>
>> +#include <linux/io_uring.h>
>> +#include <linux/blk-mq.h>
>> +#include <linux/delay.h>
>> +#include <linux/mm.h>
>> +#include <asm/page.h>
>> +#include <linux/task_work.h>
>> +#include <uapi/linux/ubd_cmd.h>
>> +
>> +#define UBD_MINORS (1U << MINORBITS)
>> +
>> +struct ubd_abort_work {
>> + struct ubd_device *ub;
>> + unsigned int q_id;
>> + struct work_struct work;
>> +};
>> +
>> +struct ubd_rq_data {
>> + struct callback_head work;
>> +};
>> +
>> +/* io cmd is active: sqe cmd is received, and its cqe isn't done */
>> +#define UBD_IO_FLAG_ACTIVE 0x01
>> +
>> +/*
>> + * FETCH io cmd is completed via cqe, and the io cmd is being handled by
>> + * ubdsrv, and not committed yet
>> + */
>> +#define UBD_IO_FLAG_OWNED_BY_SRV 0x02
>> +
>> +struct ubd_io {
>> + /* userspace buffer address from io cmd */
>> + __u64 addr;
>> + unsigned int flags;
>> + unsigned int res;
>> +
>> + struct io_uring_cmd *cmd;
>> +};
>> +
>> +struct ubd_queue {
>> + int q_id;
>> + int q_depth;
>> +
>> + struct task_struct *ubq_daemon;
>> + char *io_cmd_buf;
>> +
>> + unsigned long io_addr; /* mapped vm address */
>> + unsigned int max_io_sz;
>> + bool aborted;
>> + struct ubd_io ios[0];
>> +};
>> +
>> +struct ubd_device {
>> + struct gendisk *ub_disk;
>> + struct request_queue *ub_queue;
>> +
>> + char *__queues;
>> +
>> + unsigned short queue_size;
>> + unsigned short bs_shift;
>> + unsigned int nr_aborted_queues;
>> + struct ubdsrv_ctrl_dev_info dev_info;
>> +
>> + struct blk_mq_tag_set tag_set;
>> +
>> + struct cdev cdev;
>> + struct device cdev_dev;
>> +
>> + atomic_t ch_open_cnt;
>> + int ub_number;
>> +
>> + struct mutex mutex;
>> +};
>> +
>> +static dev_t ubd_chr_devt;
>> +static struct class *ubd_chr_class;
>> +
>> +static DEFINE_IDR(ubd_index_idr);
>> +static DEFINE_MUTEX(ubd_ctl_mutex);
>> +
>> +static struct miscdevice ubd_misc;
>> +
>> +static inline struct ubd_queue *ubd_get_queue(struct ubd_device *dev, int qid)
>> +{
>> + return (struct ubd_queue *)&(dev->__queues[qid * dev->queue_size]);
>> +}
>> +
>> +static inline bool ubd_rq_need_copy(struct request *rq)
>> +{
>> + return rq->bio && bio_has_data(rq->bio);
>> +}
>> +
>> +static inline struct ubdsrv_io_desc *ubd_get_iod(struct ubd_queue *ubq, int tag)
>> +{
>> + return (struct ubdsrv_io_desc *)
>> + &(ubq->io_cmd_buf[tag * sizeof(struct ubdsrv_io_desc)]);
>> +}
>> +
>> +static inline char *ubd_queue_cmd_buf(struct ubd_device *ub, int q_id)
>> +{
>> + return ubd_get_queue(ub, q_id)->io_cmd_buf;
>> +}
>> +
>> +static inline int ubd_queue_cmd_buf_size(struct ubd_device *ub, int q_id)
>> +{
>> + struct ubd_queue *ubq = ubd_get_queue(ub, q_id);
>> +
>> + return round_up(ubq->q_depth * sizeof(struct ubdsrv_io_desc), PAGE_SIZE);
>> +}
>> +
>> +static int ubd_open(struct block_device *bdev, fmode_t mode)
>> +{
>> + return 0;
>> +}
>> +
>> +static void ubd_release(struct gendisk *disk, fmode_t mode)
>> +{
>> +}
>> +
>> +static const struct block_device_operations ub_fops = {
>> + .owner = THIS_MODULE,
>> + .open = ubd_open,
>> + .release = ubd_release,
>> +};
>> +
>> +#define UBD_MAX_PIN_PAGES 32
>> +
>> +static void ubd_release_pages(struct ubd_device *ub, struct page **pages,
>> + int nr_pages)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < nr_pages; i++)
>> + put_page(pages[i]);
>> +}
>> +
>> +static int ubd_pin_user_pages(struct ubd_device *ub, u64 start_vm,
>> + struct page **pages, unsigned int nr_pages, bool to_rq)
>> +{
>> + unsigned int gup_flags = to_rq ? 0 : FOLL_WRITE;
>> +
>> + return get_user_pages_fast(start_vm, nr_pages, gup_flags, pages);
>> +}
>> +
>> +static inline unsigned ubd_copy_bv(struct bio_vec *bv, void **bv_addr,
>> + void *pg_addr, unsigned int *pg_off,
>> + unsigned int *pg_len, bool to_bv)
>> +{
>> + unsigned len = min_t(unsigned, bv->bv_len, *pg_len);
>> +
>> + if (*bv_addr == NULL)
>> + *bv_addr = kmap_local_page(bv->bv_page);
>> +
>> + if (to_bv)
>> + memcpy(*bv_addr + bv->bv_offset, pg_addr + *pg_off, len);
>> + else
>> + memcpy(pg_addr + *pg_off, *bv_addr + bv->bv_offset, len);
>> +
>> + bv->bv_offset += len;
>> + bv->bv_len -= len;
>> + *pg_off += len;
>> + *pg_len -= len;
>> +
>> + if (!bv->bv_len) {
>> + kunmap_local(*bv_addr);
>> + *bv_addr = NULL;
>> + }
>> +
>> + return len;
>> +}
>> +
>> +/* copy rq pages to ubdsrv vm address pointed by io->addr */
>> +static int ubd_copy_pages(struct ubd_device *ub, struct request *rq)
>> +{
>> + struct ubd_queue *ubq = rq->mq_hctx->driver_data;
>> + struct ubd_io *io = &ubq->ios[rq->tag];
>> + struct page *pgs[UBD_MAX_PIN_PAGES];
>> + const bool to_rq = !op_is_write(rq->cmd_flags);
>> + struct req_iterator req_iter;
>> + struct bio_vec bv;
>> + unsigned long start = io->addr, left = rq->__data_len;
>> + unsigned int idx = 0, pg_len = 0, pg_off = 0;
>> + int nr_pin = 0;
>> + void *pg_addr = NULL;
>> + struct page *curr = NULL;
>> +
>> + rq_for_each_segment(bv, rq, req_iter) {
>> + unsigned len, bv_off = bv.bv_offset, bv_len = bv.bv_len;
>> + void *bv_addr = NULL;
>> +
>> +refill:
>> + if (pg_len == 0) {
>> + unsigned int off = 0;
>> +
>> + if (pg_addr) {
>> + kunmap_local(pg_addr);
>> + if (!to_rq)
>> + set_page_dirty_lock(curr);
>> + pg_addr = NULL;
>> + }
>> +
>> + /* refill pages */
>> + if (idx >= nr_pin) {
>> + unsigned int max_pages;
>> +
>> + ubd_release_pages(ub, pgs, nr_pin);
>> +
>> + off = start & (PAGE_SIZE - 1);
>> + max_pages = round_up(off + left, PAGE_SIZE);
>> + nr_pin = min_t(unsigned, UBD_MAX_PIN_PAGES, max_pages);
>> + nr_pin = ubd_pin_user_pages(ub, start, pgs,
>> + nr_pin, to_rq);
>> + if (nr_pin <= 0)
>> + return -EINVAL;
>> + idx = 0;
>> + }
>> + pg_off = off;
>> + pg_len = min(PAGE_SIZE - off, left);
>> + off = 0;
>> + curr = pgs[idx++];
>> + pg_addr = kmap_local_page(curr);
>> + }
>> +
>> + len = ubd_copy_bv(&bv, &bv_addr, pg_addr, &pg_off, &pg_len,
>> + to_rq);
>> + /* either one of the two has been consumed */
>> + WARN_ON_ONCE(bv.bv_len && pg_len);
>> + start += len;
>> + left -= len;
>> +
>> + /* overflow */
>> + WARN_ON_ONCE(left > rq->__data_len);
>> + WARN_ON_ONCE(bv.bv_len > bv_len);
>> + if (bv.bv_len)
>> + goto refill;
>> +
>> + bv.bv_len = bv_len;
>> + bv.bv_offset = bv_off;
>> + }
>> + if (pg_addr) {
>> + kunmap_local(pg_addr);
>> + if (!to_rq)
>> + set_page_dirty_lock(curr);
>> + }
>> + ubd_release_pages(ub, pgs, nr_pin);
>> +
>> + WARN_ON_ONCE(left != 0);
>> +
>> + return 0;
>> +}
>> +
>> +#define UBD_REMAP_BATCH 32
>> +
>> +static int ubd_map_io(struct ubd_device *ub, struct request *req)
>> +{
>> + /*
>> + * no zero copy, we delay copy WRITE request data into ubdsrv
>> + * context via task_work_add and the big benefit is that pinning
>> + * pages in current context is pretty fast, see ubd_pin_user_pages
>> + */
>> + if (!op_is_write(req->cmd_flags) || !ubd_rq_need_copy(req))
>> + return 0;
>> +
>> + /* convert to data copy in current context */
>> + return ubd_copy_pages(ub, req);
>> +}
>> +
>> +static int ubd_unmap_io(struct request *req)
>> +{
>> + struct ubd_device *ub = req->q->queuedata;
>> +
>> + if (!op_is_write(req->cmd_flags) && ubd_rq_need_copy(req))
>> + return ubd_copy_pages(ub, req);
>> + return 0;
>> +}
>> +
>> +static inline unsigned int ubd_req_build_flags(struct request *req)
>> +{
>> + unsigned flags = 0;
>> +
>> + if (req->cmd_flags & REQ_FAILFAST_DEV)
>> + flags |= UBD_IO_F_FAILFAST_DEV;
>> +
>> + if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
>> + flags |= UBD_IO_F_FAILFAST_TRANSPORT;
>> +
>> + if (req->cmd_flags & REQ_FAILFAST_DRIVER)
>> + flags |= UBD_IO_F_FAILFAST_DRIVER;
>> +
>> + if (req->cmd_flags & REQ_META)
>> + flags |= UBD_IO_F_META;
>> +
>> + if (req->cmd_flags & REQ_INTEGRITY)
>> + flags |= UBD_IO_F_INTEGRITY;
>> +
>> + if (req->cmd_flags & REQ_FUA)
>> + flags |= UBD_IO_F_FUA;
>> +
>> + if (req->cmd_flags & REQ_PREFLUSH)
>> + flags |= UBD_IO_F_PREFLUSH;
>> +
>> + if (req->cmd_flags & REQ_NOUNMAP)
>> + flags |= UBD_IO_F_NOUNMAP;
>> +
>> + if (req->cmd_flags & REQ_SWAP)
>> + flags |= UBD_IO_F_SWAP;
>> +
>> + return flags;
>> +}
>> +
>> +static int ubd_setup_iod(struct ubd_queue *ubq, struct request *req)
>> +{
>> + struct ubdsrv_io_desc *iod = ubd_get_iod(ubq, req->tag);
>> + struct ubd_io *io = &ubq->ios[req->tag];
>> + u32 ubd_op;
>> +
>> + switch (req_op(req)) {
>> + case REQ_OP_READ:
>> + ubd_op = UBD_IO_OP_READ;
>> + break;
>> + case REQ_OP_WRITE:
>> + ubd_op = UBD_IO_OP_WRITE;
>> + break;
>> + case REQ_OP_FLUSH:
>> + ubd_op = UBD_IO_OP_FLUSH;
>> + break;
>> + case REQ_OP_DISCARD:
>> + ubd_op = UBD_IO_OP_DISCARD;
>> + break;
>> + case REQ_OP_WRITE_ZEROES:
>> + ubd_op = UBD_IO_OP_WRITE_ZEROES;
>> + break;
>> + default:
>> + return BLK_STS_IOERR;
>> + }
>> +
>> + /* need to translate since kernel may change */
>> + iod->op_flags = ubd_op | ubd_req_build_flags(req);
>> + iod->tag_blocks = req->tag | (blk_rq_sectors(req) << 12);
>> + iod->start_block = blk_rq_pos(req);
>> + iod->addr = io->addr;
>> +
>> + return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
>> + const struct blk_mq_queue_data *bd)
>> +{
>> + struct ubd_queue *ubq = hctx->driver_data;
>> + struct request *rq = bd->rq;
>> + struct ubd_io *io = &ubq->ios[rq->tag];
>> + struct ubd_rq_data *data = blk_mq_rq_to_pdu(rq);
>> + blk_status_t res;
>> +
>> + if (ubq->aborted)
>> + return BLK_STS_IOERR;
>> +
>> + /* this io cmd slot isn't active, so have to fail this io */
>> + if (WARN_ON_ONCE(!(io->flags & UBD_IO_FLAG_ACTIVE)))
>> + return BLK_STS_IOERR;
>> +
>> + /* fill iod to slot in io cmd buffer */
>> + res = ubd_setup_iod(ubq, rq);
>> + if (res != BLK_STS_OK)
>> + return BLK_STS_IOERR;
>> +
>> + blk_mq_start_request(bd->rq);
>> +
>> + /* mark this cmd owned by ubdsrv */
>> + io->flags |= UBD_IO_FLAG_OWNED_BY_SRV;
>> +
>> + /*
>> + * clear ACTIVE since we are done with this sqe/cmd slot
>> + *
>> + * We can only accept io cmd in case of being not active.
>> + */
>> + io->flags &= ~UBD_IO_FLAG_ACTIVE;
>> +
>> + /*
>> + * run data copy in task work context for WRITE, and complete io_uring
>> + * cmd there too.
>> + *
>> + * This way should improve batching, meantime pinning pages in current
>> + * context is pretty fast.
>> + */
>> + task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL);
>> +
>> + return BLK_STS_OK;
>> +}
>> +
>> +static void ubd_complete_rq(struct request *req)
>> +{
>> + struct ubd_queue *ubq = req->mq_hctx->driver_data;
>> + struct ubd_io *io = &ubq->ios[req->tag];
>> +
>> + /* for READ request, writing data in iod->addr to rq buffers */
>> + ubd_unmap_io(req);
>> +
>> + blk_mq_end_request(req, io->res);
>> +}
>> +
>> +static void ubd_rq_task_work_fn(struct callback_head *work)
>> +{
>> + struct ubd_rq_data *data = container_of(work,
>> + struct ubd_rq_data, work);
>> + struct request *req = blk_mq_rq_from_pdu(data);
>> + struct ubd_queue *ubq = req->mq_hctx->driver_data;
>> + struct ubd_io *io = &ubq->ios[req->tag];
>> + struct ubd_device *ub = req->q->queuedata;
>> + int ret = UBD_IO_RES_OK;
>> +
>> + if (ubd_map_io(ub, req))
>> + ret = UBD_IO_RES_DATA_BAD;
>> +
>> + pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x, addr %llx\n",
>> + __func__, io->cmd->cmd_op, req->tag, ret, io->flags,
>> + ubd_get_iod(ubq, req->tag)->addr);
>> + /* tell ubdsrv one io request is coming */
>> + io_uring_cmd_done(io->cmd, ret, 0);
>> +}
>> +
>> +static int ubd_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
>> + unsigned int hctx_idx)
>> +{
>> + struct ubd_device *ub = hctx->queue->queuedata;
>> + struct ubd_queue *ubq = ubd_get_queue(ub, hctx->queue_num);
>> +
>> + hctx->driver_data = ubq;
>> + return 0;
>> +}
>> +
>> +static int ubd_init_rq(struct blk_mq_tag_set *set, struct request *req,
>> + unsigned int hctx_idx, unsigned int numa_node)
>> +{
>> + struct ubd_rq_data *data = blk_mq_rq_to_pdu(req);
>> +
>> + init_task_work(&data->work, ubd_rq_task_work_fn);
>> +
>> + return 0;
>> +}
>> +
>> +static const struct blk_mq_ops ubd_mq_ops = {
>> + .queue_rq = ubd_queue_rq,
>> + .init_hctx = ubd_init_hctx,
>> + .init_request = ubd_init_rq,
>> +};
>> +
>> +static int ubd_ch_open(struct inode *inode, struct file *filp)
>> +{
>> + struct ubd_device *ub = container_of(inode->i_cdev,
>> + struct ubd_device, cdev);
>> +
>> + if (atomic_cmpxchg(&ub->ch_open_cnt, 0, 1) == 0) {
>> + filp->private_data = ub;
>> + return 0;
>> + }
>> + return -EBUSY;
>> +}
>> +
>> +static int ubd_ch_release(struct inode *inode, struct file *filp)
>> +{
>> + struct ubd_device *ub = filp->private_data;
>> +
>> + while (atomic_cmpxchg(&ub->ch_open_cnt, 1, 0) != 1)
>> + cpu_relax();
>> +
>> + filp->private_data = NULL;
>> + return 0;
>> +}
>> +
>> +/* map pre-allocated per-queue cmd buffer to ubdsrv daemon */
>> +static int ubd_ch_mmap(struct file *filp, struct vm_area_struct *vma)
>> +{
>> + struct ubd_device *ub = filp->private_data;
>> + size_t sz = vma->vm_end - vma->vm_start;
>> + unsigned max_sz = UBD_MAX_QUEUE_DEPTH * sizeof(struct ubdsrv_io_desc);
>> + unsigned long pfn, end;
>> + int q_id;
>> +
>> + end = UBDSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
>> + if (vma->vm_pgoff < UBDSRV_CMD_BUF_OFFSET || vma->vm_pgoff >= end)
>> + return -EINVAL;
>> +
>> + q_id = (vma->vm_pgoff - UBDSRV_CMD_BUF_OFFSET) / max_sz;
>> +
>> + if (sz != ubd_queue_cmd_buf_size(ub, q_id))
>> + return -EINVAL;
>> +
>> + pfn = virt_to_phys(ubd_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
>> + return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
>> +}
>> +
>> +static void ubd_commit_completion(struct ubd_device *ub,
>> + struct ubdsrv_io_cmd *ub_cmd)
>> +{
>> + u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
>> + struct ubd_queue *ubq = ubd_get_queue(ub, qid);
>> + struct ubd_io *io = &ubq->ios[tag];
>> + struct request *req;
>> +
>> + /* now this cmd slot is owned by nbd driver */
>> + io->flags &= ~UBD_IO_FLAG_OWNED_BY_SRV;
>> + io->res = ub_cmd->result;
>> +
>> + /* find the io request and complete */
>> + req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], ub_cmd->tag);
>> +
>> + if (req && likely(!blk_should_fake_timeout(req->q)))
>> + ubd_complete_rq(req);
>> +}
>> +
>> +/* has to be called disk is dead or frozen */
>> +static int ubd_abort_queue(struct ubd_device *ub, int qid)
>> +{
>> + int ret = UBD_IO_RES_ABORT;
>> + struct ubd_queue *q = ubd_get_queue(ub, qid);
>> + int i;
>> +
>> + for (i = 0; i < q->q_depth; i++) {
>> + struct ubd_io *io = &q->ios[i];
>> +
>> + if (io->flags & UBD_IO_FLAG_ACTIVE) {
>> + io->flags &= ~UBD_IO_FLAG_ACTIVE;
>> + io_uring_cmd_done(io->cmd, ret, 0);
>> + }
>> + }
>> + q->ubq_daemon = NULL;
>> + return 0;
>> +}
>> +
>> +static void ubd_abort_work_fn(struct work_struct *work)
>> +{
>> + struct ubd_abort_work *abort_work =
>> + container_of(work, struct ubd_abort_work, work);
>> + struct ubd_device *ub = abort_work->ub;
>> + struct ubd_queue *ubq = ubd_get_queue(ub, abort_work->q_id);
>> +
>> + blk_mq_freeze_queue(ub->ub_queue);
>> + mutex_lock(&ub->mutex);
>> + if (!ubq->aborted) {
>> + ubq->aborted = true;
>> + ubd_abort_queue(ub, abort_work->q_id);
>> + ub->nr_aborted_queues++;
>> + }
>> + mutex_unlock(&ub->mutex);
>> + blk_mq_unfreeze_queue(ub->ub_queue);
>> +
>> + mutex_lock(&ub->mutex);
>> + if (ub->nr_aborted_queues == ub->dev_info.nr_hw_queues) {
>> + if (disk_live(ub->ub_disk))
>> + del_gendisk(ub->ub_disk);
>> + }
>> + mutex_unlock(&ub->mutex);
>> +
>> + kfree(abort_work);
>> +}
>> +
>> +static int ubd_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
>> +{
>> + struct ubdsrv_io_cmd *ub_cmd = (struct ubdsrv_io_cmd *)cmd->cmd;
>> + struct ubd_device *ub = cmd->file->private_data;
>> + struct ubd_queue *ubq;
>> + struct ubd_io *io;
>> + u32 cmd_op = cmd->cmd_op;
>> + unsigned tag = ub_cmd->tag;
>> + int ret = UBD_IO_RES_INVALID_SQE;
>> +
>> + pr_devel("%s: receieved: cmd op %d, tag %d, queue %d\n",
>> + __func__, cmd->cmd_op, tag, ub_cmd->q_id);
>> +
>> + if (!(issue_flags & IO_URING_F_SQE128))
>> + goto out;
>> +
>> + if (cmd_op == UBD_IO_ABORT_QUEUE) {
>> + struct ubd_abort_work *work = kzalloc(sizeof(*work),
>> + GFP_KERNEL);
>> + if (!work)
>> + goto out;
>> +
>> + INIT_WORK(&work->work, ubd_abort_work_fn);
>> + work->ub = ub;
>> + work->q_id = ub_cmd->q_id;
>> +
>> + schedule_work(&work->work);
>> + ret = UBD_IO_RES_OK;
>> + goto out_done;
>> + }
>> +
>> + ubq = ubd_get_queue(ub, ub_cmd->q_id);
>> + if (WARN_ON_ONCE(tag >= ubq->q_depth))
>> + goto out;
>> +
>> + io = &ubq->ios[tag];
>> +
>> + /* there is pending io cmd, something must be wrong */
>> + if (io->flags & UBD_IO_FLAG_ACTIVE) {
>> + ret = UBD_IO_RES_BUSY;
>> + goto out;
>> + }
>> +
>> + switch (cmd_op) {
>> + case UBD_IO_FETCH_REQ:
>> + /* FETCH_REQ is only issued when starting device */
>> + mutex_lock(&ub->mutex);
>> + if (!ubq->ubq_daemon)
>> + ubq->ubq_daemon = current;
>
> I find it slightly weird to set ubq_daemon per-request. It would cause
> weird corruptions if a buggy thread sends a FETCH_REQ to the wrong q_id
> after another request in-flight has already set ubq_daemon. I was also
> working on MQ and I was thinking of dropping the q_id field from the
> io_uring command, implying it from the fd that was opened. Then, I let
> each thread reopen the char device to configure a new queue:
>
> fd = open("/dev/ubdc0", O_RDWR); // Connects to a new queue
>
> fd is closed with fork/exec.
>
>> +static int ubd_ctrl_start_dev(struct ubd_device *ub, struct io_uring_cmd *cmd)
>> +{
>> + struct ubdsrv_ctrl_dev_info *info = (struct ubdsrv_ctrl_dev_info *)cmd->cmd;
>> + int ret = -EINVAL;
>> + unsigned long end = jiffies + 3 * HZ;
>> +
>> + if (info->ubdsrv_pid <= 0)
>> + return -1;
>> +
>> + mutex_lock(&ub->mutex);
>> +
>> + ub->dev_info.ubdsrv_pid = info->ubdsrv_pid;
>> + if (disk_live(ub->ub_disk))
>> + goto unlock;
>> + while (time_before(jiffies, end)) {
>> + if (ubd_io_ready(ub)) {
>> + ret = 0;
>> + break;
>> + }
>> + msleep(100);
>> + }
>
> This timer is quite weird, in my opinion. Shouldn't we fail start_dev
> and let userspace retry the command at a later time?
>
>> + unlock:
>> + mutex_unlock(&ub->mutex);
>> + pr_devel("%s: device io ready %d\n", __func__, !ret);
>> +
>> + if (ret == 0)
>> + ret = add_disk(ub->ub_disk);
>> +
>> + return ret;
>> +}
>> +
>> +static inline void ubd_dump(struct io_uring_cmd *cmd)
>> +{
>> +#ifdef DEBUG
>> + struct ubdsrv_ctrl_dev_info *info =
>> + (struct ubdsrv_ctrl_dev_info *)cmd->cmd;
>> +
>> + printk("%s: cmd_op %x, dev id %d flags %llx\n", __func__,
>> + cmd->cmd_op, info->dev_id, info->flags);
>> +
>> + printk("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
>> + info->nr_hw_queues, info->queue_depth,
>> + info->block_size, info->dev_blocks);
>> +#endif
>> +}
>
> Maybe pr_debug or tracepoint?
>
>> +MODULE_AUTHOR("Ming Lei <ming.lei@...hat.com>");
>> +MODULE_LICENSE("GPL");
>> diff --git a/include/uapi/linux/ubd_cmd.h b/include/uapi/linux/ubd_cmd.h
>> new file mode 100644
>> index 000000000000..8ecea6aa9cfe
>> --- /dev/null
>> +++ b/include/uapi/linux/ubd_cmd.h
>> @@ -0,0 +1,167 @@
>> +#ifndef USER_BLK_DRV_CMD_INC_H
>> +#define USER_BLK_DRV_CMD_INC_H
>> +
>> +/* ubd server command definition */
>> +
>> +/* CMD result code */
>> +#define UBD_CTRL_CMD_RES_OK 0
>> +#define UBD_CTRL_CMD_RES_FAILED -1
>> +
>> +/*
>> + * Admin commands, issued by ubd server, and handled by ubd driver.
>> + */
>> +#define UBD_CMD_SET_DEV_INFO 0x01
>> +#define UBD_CMD_GET_DEV_INFO 0x02
>> +#define UBD_CMD_ADD_DEV 0x04
>> +#define UBD_CMD_DEL_DEV 0x05
>> +#define UBD_CMD_START_DEV 0x06
>> +#define UBD_CMD_STOP_DEV 0x07
>> +
>> +/*
>> + * IO commands, issued by ubd server, and handled by ubd driver.
>> + *
>> + * FETCH_REQ: issued via sqe(URING_CMD) beforehand for fetching IO request
>> + * from ubd driver, should be issued only when starting device. After
>> + * the associated cqe is returned, request's tag can be retrieved via
>> + * cqe->userdata.
>> + *
>> + * COMMIT_AND_FETCH_REQ: issued via sqe(URING_CMD) after ubdserver handled
>> + * this IO request, request's handling result is committed to ubd
>> + * driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be
>> + * handled before completing io request.
>> + *
>> + * COMMIT_REQ: issued via sqe(URING_CMD) after ubdserver handled this IO
>> + * request, request's handling result is committed to ubd driver.
>> + *
>> + * ABORT_QUEUE: issued via sqe(URING_CMD) and abort all active commands,
>> + * meantime ubdserver can't issue any FETCH_REQ commands
>> + */
>> +#define UBD_IO_FETCH_REQ 0x20
>> +#define UBD_IO_COMMIT_AND_FETCH_REQ 0x21
>> +#define UBD_IO_COMMIT_REQ 0x22
>> +#define UBD_IO_ABORT_QUEUE 0x23
>> +
>> +#define UBD_IO_RES_OK 0x01
>> +#define UBD_IO_RES_INVALID_SQE 0x5f
>> +#define UBD_IO_RES_INVALID_TAG 0x5e
>> +#define UBD_IO_RES_INVALID_QUEUE 0x5d
>> +#define UBD_IO_RES_BUSY 0x5c
>> +#define UBD_IO_RES_DUP_FETCH 0x5b
>> +#define UBD_IO_RES_UNEXPECTED_CMD 0x5a
>> +#define UBD_IO_RES_DATA_BAD 0x59
>> +
>> +/* only ABORT means that no re-fetch */
>> +#define UBD_IO_RES_ABORT 0x59
>> +
>> +#define UBDSRV_CMD_BUF_OFFSET 0
>> +#define UBDSRV_IO_BUF_OFFSET 0x80000000
>> +
>> +/* tag bit is 12bit, so at most 4096 IOs for each queue */
>> +#define UBD_MAX_QUEUE_DEPTH 4096
>> +
>> +/*
>> + * zero copy requires 4k block size, and can remap ubd driver's io
>> + * request into ubdsrv's vm space
>> + */
>> +#define UBD_F_SUPPORT_ZERO_COPY 0
>> +
>> +struct ubdsrv_ctrl_dev_info {
>> + __u16 nr_hw_queues;
>> + __u16 queue_depth;
>> + __u16 block_size;
>> + __u16 state;
>> +
>> + __u32 rq_max_blocks;
>> + __u32 dev_id;
>> +
>> + __u64 dev_blocks;
>> + __u64 flags;
>> +
>> + /*
>> + * Only valid for READ kind of ctrl command, and driver can
>> + * get the userspace buffer address here, then write data
>> + * into this buffer.
>> + *
>> + * And the buffer has to be inside one single page.
>> + */
>> + __u64 addr;
>> + __u32 len;
>> + __s32 ubdsrv_pid;
>> + __u64 reserved0[2];
>> +};
>
> I was about to send you a patch to simplify the interface for
> different commands. My plan was to avoid passing parts of the structure for
> commands that don't use it, considering the limit of io_uring cmd length.
>
> struct ubdsrv_ctrl_dev_info {
> __u32 dev_id;
> union {
>
> char raw[28]; /* Maximum size for iouring_command */
>
> struct {
> __u32 rq_max_blocks;
> __s32 ubdsrv_pid;
>
> __u16 nr_hw_queues;
> __u16 queue_depth;
> __u16 block_size;
> __u16 state;
>
> __u64 dev_blocks;
> __u64 flags;
> }, /* UBD_CMD_ADD_DEV */
>
> struct {
> __u64 addr;
> __u32 len;
> } /* CMD_GET_DEV_INFO */
> }
> };
>
> What do you think?
>
> in addition, I think UBD_CMD_ADD_DEV should embed an extra protocol
> version field, for future extension.
>
Powered by blists - more mailing lists