Message-Id: <20171126131053.32300-1-richard@nod.at>
Date:   Sun, 26 Nov 2017 14:10:53 +0100
From:   Richard Weinberger <richard@....at>
To:     user-mode-linux-devel@...ts.sourceforge.net
Cc:     linux-kernel@...r.kernel.org, axboe@...com,
        anton.ivanov@...bridgegreys.com, hch@....de,
        linux-block@...r.kernel.org, Richard Weinberger <richard@....at>
Subject: [PATCH] [RFC] um: Convert ubd driver to blk-mq

This is a first attempt to convert the User Mode Linux block driver
(UBD) to blk-mq.
While the conversion itself is rather trivial, a few questions came up
along the way. Maybe you can help me with them.

MAX_SG is 64, used for blk_queue_max_segments(). This comes from
a0044bdf60c2 ("uml: batch I/O requests"). Is this still a good/sane
value for blk-mq?
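
For reference, a minimal sketch of how the limit is applied today; the
call already exists in the driver and the value is unchanged from the
legacy path, shown here only for context:

	/* cap each request at MAX_SG (64) scatter/gather segments */
	blk_queue_max_segments(ubd_dev->queue, MAX_SG);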

The driver does IO batching: for each block layer request it issues
many struct io_thread_req requests to the IO thread on the host side,
one io_thread_req per SG page.
Before the conversion the driver used blk_end_request() to indicate that
a part of the request is done.
blk_mq_end_request() does not take a length parameter, therefore we can
only mark the whole request as done. See the new is_last property in the
driver.
Maybe there is a way to partially end requests in blk-mq too?
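
One direction I considered (just a sketch, assuming blk_update_request()
is meant to be usable by drivers for this; please correct me if it is
not) would be to account the batched completions piecewise in
ubd_handler():

	/*
	 * Sketch: credit io_req->length bytes against the request and
	 * only end the blk-mq request once nothing is left to complete.
	 */
	if (!blk_update_request(io_req->req,
				io_req->error ? BLK_STS_IOERR : BLK_STS_OK,
				io_req->length))
		__blk_mq_end_request(io_req->req, BLK_STS_OK);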

Another obstacle with IO batching is that UML IO thread requests can
fail, not only due to OOM but also because the pipe between the UML
kernel process and the host IO thread can return EAGAIN.
In this case the old driver put the request on a list and retried it
later, when the pipe became writable again.
I'm not sure whether this restart logic still makes sense with blk-mq;
maybe there is a way in blk-mq to put back a (partial) request?
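
What I have in mind for a follow-up (again only a sketch, not part of
this patch) is to stop the hardware queue and let blk-mq requeue the
request when the pipe returns EAGAIN, and restart the queue from the
place that notices the pipe turned writable:

	/* in ubd_queue_rq(), when submit_request() fails with -EAGAIN */
	blk_mq_stop_hw_queue(hctx);
	return BLK_STS_RESOURCE;

	/* later, once the pipe is writable again */
	blk_mq_start_stopped_hw_queues(dev->queue, true);

The open question is what to do with io_thread_reqs that were already
submitted for the same request before the EAGAIN hit.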

Signed-off-by: Richard Weinberger <richard@....at>
---
 arch/um/drivers/ubd_kern.c | 188 ++++++++++++++++++++++++++-------------------
 1 file changed, 107 insertions(+), 81 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 90034acace2a..abbfe0c97418 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/ata.h>
 #include <linux/hdreg.h>
 #include <linux/cdrom.h>
@@ -57,6 +58,7 @@ struct io_thread_req {
 	unsigned long long cow_offset;
 	unsigned long bitmap_words[2];
 	int error;
+	bool is_last;
 };
 
 
@@ -142,7 +144,6 @@ struct cow {
 #define MAX_SG 64
 
 struct ubd {
-	struct list_head restart;
 	/* name (and fd, below) of the file opened for writing, either the
 	 * backing or the cow file. */
 	char *file;
@@ -156,9 +157,12 @@ struct ubd {
 	struct cow cow;
 	struct platform_device pdev;
 	struct request_queue *queue;
+	struct blk_mq_tag_set tag_set;
 	spinlock_t lock;
+};
+
+struct ubd_pdu {
 	struct scatterlist sg[MAX_SG];
-	struct request *request;
 	int start_sg, end_sg;
 	sector_t rq_pos;
 };
@@ -182,10 +186,6 @@ struct ubd {
 	.shared =		0, \
 	.cow =			DEFAULT_COW, \
 	.lock =			__SPIN_LOCK_UNLOCKED(ubd_devs.lock), \
-	.request =		NULL, \
-	.start_sg =		0, \
-	.end_sg =		0, \
-	.rq_pos =		0, \
 }
 
 /* Protected by ubd_lock */
@@ -196,6 +196,12 @@ static int fake_ide = 0;
 static struct proc_dir_entry *proc_ide_root = NULL;
 static struct proc_dir_entry *proc_ide = NULL;
 
+static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
+				 const struct blk_mq_queue_data *bd);
+static int ubd_init_request(struct blk_mq_tag_set *set,
+			    struct request *req, unsigned int hctx_idx,
+			    unsigned int numa_node);
+
 static void make_proc_ide(void)
 {
 	proc_ide_root = proc_mkdir("ide", NULL);
@@ -448,11 +454,8 @@ __uml_help(udb_setup,
 "    in the boot output.\n\n"
 );
 
-static void do_ubd_request(struct request_queue * q);
-
 /* Only changed by ubd_init, which is an initcall. */
 static int thread_fd = -1;
-static LIST_HEAD(restart);
 
 /* Function to read several request pointers at a time
 * handling fractional reads if (and as) needed
@@ -510,9 +513,6 @@ static int bulk_req_safe_read(
 /* Called without dev->lock held, and only in interrupt context. */
 static void ubd_handler(void)
 {
-	struct ubd *ubd;
-	struct list_head *list, *next_ele;
-	unsigned long flags;
 	int n;
 	int count;
 
@@ -532,23 +532,22 @@ static void ubd_handler(void)
 			return;
 		}
 		for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
-			blk_end_request(
-				(*irq_req_buffer)[count]->req,
-				BLK_STS_OK,
-				(*irq_req_buffer)[count]->length
-			);
-			kfree((*irq_req_buffer)[count]);
+			struct io_thread_req *io_req = (*irq_req_buffer)[count];
+
+			/*
+			 * UBD is batching IO, only end the blk mq request
+			 * if this is the last one.
+			 */
+			if (io_req->is_last)
+				blk_mq_end_request(io_req->req,
+						   io_req->error ?
+						   BLK_STS_IOERR : BLK_STS_OK);
+
+			kfree(io_req);
 		}
 	}
-	reactivate_fd(thread_fd, UBD_IRQ);
 
-	list_for_each_safe(list, next_ele, &restart){
-		ubd = container_of(list, struct ubd, restart);
-		list_del_init(&ubd->restart);
-		spin_lock_irqsave(&ubd->lock, flags);
-		do_ubd_request(ubd->queue);
-		spin_unlock_irqrestore(&ubd->lock, flags);
-	}
+	reactivate_fd(thread_fd, UBD_IRQ);
 }
 
 static irqreturn_t ubd_intr(int irq, void *dev)
@@ -858,6 +857,7 @@ static void ubd_device_release(struct device *dev)
 	struct ubd *ubd_dev = dev_get_drvdata(dev);
 
 	blk_cleanup_queue(ubd_dev->queue);
+	blk_mq_free_tag_set(&ubd_dev->tag_set);
 	*ubd_dev = ((struct ubd) DEFAULT_UBD);
 }
 
@@ -900,6 +900,11 @@ static int ubd_disk_register(int major, u64 size, int unit,
 
 #define ROUND_BLOCK(n) ((n + ((1 << 9) - 1)) & (-1 << 9))
 
+static const struct blk_mq_ops ubd_mq_ops = {
+	.queue_rq = ubd_queue_rq,
+	.init_request = ubd_init_request,
+};
+
 static int ubd_add(int n, char **error_out)
 {
 	struct ubd *ubd_dev = &ubd_devs[n];
@@ -916,15 +921,24 @@ static int ubd_add(int n, char **error_out)
 
 	ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
 
-	INIT_LIST_HEAD(&ubd_dev->restart);
-	sg_init_table(ubd_dev->sg, MAX_SG);
+	ubd_dev->tag_set.ops = &ubd_mq_ops;
+	ubd_dev->tag_set.queue_depth = 64;
+	ubd_dev->tag_set.numa_node = NUMA_NO_NODE;
+	ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	ubd_dev->tag_set.cmd_size = sizeof(struct ubd_pdu);
+	ubd_dev->tag_set.driver_data = ubd_dev;
+	ubd_dev->tag_set.nr_hw_queues = 1;
 
-	err = -ENOMEM;
-	ubd_dev->queue = blk_init_queue(do_ubd_request, &ubd_dev->lock);
-	if (ubd_dev->queue == NULL) {
-		*error_out = "Failed to initialize device queue";
+	err = blk_mq_alloc_tag_set(&ubd_dev->tag_set);
+	if (err)
 		goto out;
+
+	ubd_dev->queue = blk_mq_init_queue(&ubd_dev->tag_set);
+	if (IS_ERR(ubd_dev->queue)) {
+		err = PTR_ERR(ubd_dev->queue);
+		goto out_cleanup;
 	}
+
 	ubd_dev->queue->queuedata = ubd_dev;
 	blk_queue_write_cache(ubd_dev->queue, true, false);
 
@@ -932,7 +946,7 @@ static int ubd_add(int n, char **error_out)
 	err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
 	if(err){
 		*error_out = "Failed to register device";
-		goto out_cleanup;
+		goto out_cleanup_tags;
 	}
 
 	if (fake_major != UBD_MAJOR)
@@ -950,6 +964,8 @@ static int ubd_add(int n, char **error_out)
 out:
 	return err;
 
+out_cleanup_tags:
+	blk_mq_free_tag_set(&ubd_dev->tag_set);
 out_cleanup:
 	blk_cleanup_queue(ubd_dev->queue);
 	goto out;
@@ -1342,9 +1358,9 @@ static bool submit_request(struct io_thread_req *io_req, struct ubd *dev)
 	if (n != sizeof(io_req)) {
 		if (n != -EAGAIN)
 			printk("write to io thread failed, "
-			       "errno = %d\n", -n);
-		else if (list_empty(&dev->restart))
-			list_add(&dev->restart, &restart);
+		       "errno = %d\n", -n);
+
+		WARN_ONCE(1, "Failed to submit IO request\n");
 
 		kfree(io_req);
 		return false;
@@ -1352,63 +1368,73 @@ static bool submit_request(struct io_thread_req *io_req, struct ubd *dev)
 	return true;
 }
 
-/* Called with dev->lock held */
-static void do_ubd_request(struct request_queue *q)
+static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
+				 const struct blk_mq_queue_data *bd)
 {
+	struct request *req = bd->rq;
+	struct ubd *dev = hctx->queue->queuedata;
+	struct ubd_pdu *pdu = blk_mq_rq_to_pdu(req);
 	struct io_thread_req *io_req;
-	struct request *req;
 
-	while(1){
-		struct ubd *dev = q->queuedata;
-		if(dev->request == NULL){
-			struct request *req = blk_fetch_request(q);
-			if(req == NULL)
-				return;
+	blk_mq_start_request(req);
+
+	pdu->rq_pos = blk_rq_pos(req);
+	pdu->start_sg = 0;
+	pdu->end_sg = blk_rq_map_sg(req->q, req, pdu->sg);
 
-			dev->request = req;
-			dev->rq_pos = blk_rq_pos(req);
-			dev->start_sg = 0;
-			dev->end_sg = blk_rq_map_sg(q, req, dev->sg);
+	if (req_op(req) == REQ_OP_FLUSH) {
+		io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
+		if (io_req == NULL) {
+			WARN_ONCE(1, "Out of memory in io_thread_req allocation\n");
+			return BLK_STS_IOERR;
 		}
+		prepare_flush_request(req, io_req);
+		if (submit_request(io_req, dev) == false)
+			return BLK_STS_IOERR;
+
+		if (!pdu->end_sg)
+			io_req->is_last = true;
+		else
+			io_req->is_last = false;
+	}
 
-		req = dev->request;
+	while (pdu->start_sg < pdu->end_sg) {
+		struct scatterlist *sg = &pdu->sg[pdu->start_sg];
 
-		if (req_op(req) == REQ_OP_FLUSH) {
-			io_req = kmalloc(sizeof(struct io_thread_req),
-					 GFP_ATOMIC);
-			if (io_req == NULL) {
-				if (list_empty(&dev->restart))
-					list_add(&dev->restart, &restart);
-				return;
-			}
-			prepare_flush_request(req, io_req);
-			if (submit_request(io_req, dev) == false)
-				return;
+		io_req = kmalloc(sizeof(struct io_thread_req),
+				 GFP_ATOMIC);
+		if (io_req == NULL) {
+			WARN_ONCE(1, "Out of memory in io_thread_req allocation\n");
+			return BLK_STS_IOERR;
 		}
+		prepare_request(req, io_req,
+				(unsigned long long)pdu->rq_pos << 9,
+				sg->offset, sg->length, sg_page(sg));
 
-		while(dev->start_sg < dev->end_sg){
-			struct scatterlist *sg = &dev->sg[dev->start_sg];
+		if (pdu->start_sg + 1 == pdu->end_sg)
+			io_req->is_last = true;
+		else
+			io_req->is_last = false;
 
-			io_req = kmalloc(sizeof(struct io_thread_req),
-					 GFP_ATOMIC);
-			if(io_req == NULL){
-				if(list_empty(&dev->restart))
-					list_add(&dev->restart, &restart);
-				return;
-			}
-			prepare_request(req, io_req,
-					(unsigned long long)dev->rq_pos << 9,
-					sg->offset, sg->length, sg_page(sg));
+		if (submit_request(io_req, dev) == false)
+			return BLK_STS_IOERR;
 
-			if (submit_request(io_req, dev) == false)
-				return;
-
-			dev->rq_pos += sg->length >> 9;
-			dev->start_sg++;
-		}
-		dev->end_sg = 0;
-		dev->request = NULL;
+		pdu->rq_pos += sg->length >> 9;
+		pdu->start_sg++;
 	}
+
+	return BLK_STS_OK;
+}
+
+static int ubd_init_request(struct blk_mq_tag_set *set,
+			    struct request *req, unsigned int hctx_idx,
+			    unsigned int numa_node)
+{
+	struct ubd_pdu *pdu = blk_mq_rq_to_pdu(req);
+
+	sg_init_table(pdu->sg, MAX_SG);
+
+	return 0;
 }
 
 static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-- 
2.13.6
