linux-kernel - [PATCH v9 6/6] block: loop: support DIO & AIO

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Thu,  6 Aug 2015 04:42:18 -0400
From:	Ming Lei <ming.lei@...onical.com>
To:	Jens Axboe <axboe@...nel.dk>, linux-kernel@...r.kernel.org,
	Dave Kleikamp <dave.kleikamp@...cle.com>
Cc:	Zach Brown <zab@...bo.net>, Christoph Hellwig <hch@...radead.org>,
	Maxim Patlasov <mpatlasov@...allels.com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Alexander Viro <viro@...iv.linux.org.uk>,
	Tejun Heo <tj@...nel.org>, Dave Chinner <david@...morbit.com>,
	Ming Lei <ming.lei@...onical.com>
Subject: [PATCH v9 6/6] block: loop: support DIO & AIO

There are about 3 advantages to use direct I/O and AIO on
read/write loop's backing file:

1) double cache can be avoided, then memory usage gets
decreased a lot

2) not like user space direct I/O, there isn't cost of
pinning pages

3) avoid context switch for obtaining good throughput
- in buffered file read, random I/O top throughput is often obtained
only if they are submitted concurrently from lots of tasks; but for
sequential I/O, most of times they can be hit from page cache, so
concurrent submissions often introduce unnecessary context switch
and can't improve throughput much. There was such discussion[1]
to use non-blocking I/O to improve the problem for application.
- with direct I/O and AIO, concurrent submissions can be
avoided and random read throughput can't be affected meantime

Follows my fio test result:

1. 16 jobs fio test inside ext4 file system over loop block
1) How to run
	- linux kernel: 4.1.0-rc2-next-20150506 with the patchset
	- the loop block is over one image on HDD.
	- linux psync, 16 jobs, size 400M, ext4 over loop block
	- test result: IOPS from fio output

2) Throughput result:
        -------------------------------------------------------------
        test cases          |randread   |read   |randwrite  |write  |
        -------------------------------------------------------------
        base                |240        |8705   |3763       |20914
        -------------------------------------------------------------
        base+loop aio       |242        |9258   |4577       |21451
        -------------------------------------------------------------

3) context switch
        - context switch decreased by ~16% with loop aio for randread,
	and decreased by ~33% for read

4) memory usage
	- After these four tests with loop aio: ~10% memory becomes used
	- After these four tests without loop aio: more than 55% memory
	becomes used

2. single job fio test inside ext4 file system over loop block(for Maxim Patlasov)
1) How to run
	- linux kernel: 4.1.0-rc2-next-20150506 with the patchset
	- the loop block is over one image on HDD.
	- linux psync, 1 job, size 4000M, ext4 over loop block
	- test result: IOPS from fio output

2) Throughput result:
        -------------------------------------------------------------
        test cases          |randread   |read   |randwrite  |write  |
        -------------------------------------------------------------
        base                |109        |21180  |4192       |22782
        -------------------------------------------------------------
        base+loop aio       |114        |21018  |5404       |22670
        -------------------------------------------------------------

3) context switch
        - context switch decreased by ~10% with loop aio for randread,
	and decreased by ~50% for read

4) memory usage
	- After these four tests with loop aio: ~10% memory becomes used
	- After these four tests without loop aio: more than 55% memory
	becomes used

Both 'context switch' and 'memory usage' data are got from sar.

[1] https://lwn.net/Articles/612483/
[2] sar graph when running fio over loop without the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-nonaio.pdf

[3] sar graph when running fio over loop with the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-aio.pdf

[4] sar graph when running fio over loop without the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-nonaio-1job.pdf

[5] sar graph when running fio over loop with the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-aio-1job.pdf

Signed-off-by: Ming Lei <ming.lei@...onical.com>
---
 drivers/block/loop.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/block/loop.h |   5 ++
 2 files changed, 149 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 06a737b..edaa18a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -200,6 +200,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
 	blk_mq_freeze_queue(lo->lo_queue);
 	lo->use_dio = use_dio;
 	lo->dio_align = dio_align;
+	lo->last_use_dio = use_dio;
+	atomic_set(&lo->pending_dio, 0);
 	if (use_dio)
 		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
 	else
@@ -436,6 +438,128 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
 	return ret;
 }
 
+static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+{
+	if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE))
+		return;
+
+	if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+		struct bio *bio = cmd->rq->bio;
+
+		bio_advance(bio, bytes);
+		zero_fill_bio(bio);
+	}
+}
+
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+	struct request *rq = cmd->rq;
+	struct loop_device *lo = cmd->rq->q->queuedata;
+
+	handle_partial_read(cmd, ret);
+
+	if (ret > 0)
+		ret = 0;
+	else if (ret < 0)
+		ret = -EIO;
+
+	rq->errors = ret;
+	blk_mq_complete_request(rq);
+
+	if (atomic_dec_and_test(&lo->pending_dio)) {
+		if (lo->waiter)
+			wake_up_process(lo->waiter);
+	}
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+		     loff_t pos, bool rw)
+{
+	struct iov_iter iter;
+	struct bio_vec *bvec;
+	struct bio *bio = cmd->rq->bio;
+	struct file *file = lo->lo_backing_file;
+	int ret;
+
+	/* nomerge for loop request queue */
+	WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+
+	bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
+		      bio_segments(bio), blk_rq_bytes(cmd->rq));
+
+	cmd->iocb.ki_pos = pos;
+	cmd->iocb.ki_filp = file;
+	cmd->iocb.ki_complete = lo_rw_aio_complete;
+	cmd->iocb.ki_flags = IOCB_DIRECT;
+
+	atomic_inc(&lo->pending_dio);
+
+	if (rw == WRITE)
+		ret = file->f_op->write_iter(&cmd->iocb, &iter);
+	else
+		ret = file->f_op->read_iter(&cmd->iocb, &iter);
+
+	if (ret != -EIOCBQUEUED)
+		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+	return 0;
+}
+
+static inline void lo_drain_pending_dio(struct loop_device *lo)
+{
+	lo->waiter = current;
+
+	/* order between writing waiter and reading pending_dio */
+	smp_mb__before_atomic();
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!atomic_read(&lo->pending_dio))
+			break;
+		schedule();
+	}
+	set_current_state(TASK_RUNNING);
+	lo->waiter = NULL;
+}
+
+static inline int lo_rw_simple(struct loop_device *lo,
+		struct request *rq, loff_t pos, bool rw)
+{
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	/*
+	 * When working at direct I/O, under very unusual cases,
+	 * such as unaligned direct I/O from application and
+	 * access to loop block device with 'unaligned' offset & size,
+	 * wait for completion of the pending dio/aio before swiching
+	 * to buffered I/O for avoiding probable race between dio and
+	 * buffered I/O.
+	 *
+	 * Page cache flush is handled inside VFS direct I/O path if
+	 * it is needed.
+	 */
+	if (unlikely(lo->use_dio && lo->last_use_dio && !cmd->use_aio))
+		lo_drain_pending_dio(lo);
+
+	if (cmd->use_aio)
+		return lo_rw_aio(lo, cmd, pos, rw);
+
+	/*
+	 * lo_write_simple and lo_read_simple should have been covered
+	 * by io submit style function like lo_rw_aio(), one blocker
+	 * is that lo_read_simple() need to call flush_dcache_page after
+	 * the page is written from kernel, and it isn't easy to handle
+	 * this in io submit style function which submits all segments
+	 * of the req at one time. And direct read IO doesn't need to
+	 * run flush_dcache_page().
+	 */
+	if (rw == WRITE)
+		return lo_write_simple(lo, rq, pos);
+	else
+		return lo_read_simple(lo, rq, pos);
+}
+
 static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 {
 	loff_t pos;
@@ -451,13 +575,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 		else if (lo->transfer)
 			ret = lo_write_transfer(lo, rq, pos);
 		else
-			ret = lo_write_simple(lo, rq, pos);
+			ret = lo_rw_simple(lo, rq, pos, WRITE);
 
 	} else {
 		if (lo->transfer)
 			ret = lo_read_transfer(lo, rq, pos);
 		else
-			ret = lo_read_simple(lo, rq, pos);
+			ret = lo_rw_simple(lo, rq, pos, READ);
 	}
 
 	return ret;
@@ -1545,6 +1669,13 @@ int loop_unregister_transfer(int number)
 EXPORT_SYMBOL(loop_register_transfer);
 EXPORT_SYMBOL(loop_unregister_transfer);
 
+static inline bool req_dio_aligned(struct loop_device *lo,
+		const struct request *rq)
+{
+	return !((blk_rq_pos(rq) << 9) & lo->dio_align) &&
+		!(blk_rq_bytes(rq) & lo->dio_align);
+}
+
 static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
@@ -1556,6 +1687,13 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (lo->lo_state != Lo_bound)
 		return -EIO;
 
+	if (lo->use_dio && !lo->transfer &&
+			req_dio_aligned(lo, bd->rq) &&
+			!(cmd->rq->cmd_flags & (REQ_FLUSH | REQ_DISCARD)))
+		cmd->use_aio = true;
+	else
+		cmd->use_aio = false;
+
 	queue_kthread_work(&lo->worker, &cmd->work);
 
 	return BLK_MQ_RQ_QUEUE_OK;
@@ -1571,11 +1709,14 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 		goto failed;
 
 	ret = do_req_filebacked(lo, cmd->rq);
+	lo->last_use_dio = cmd->use_aio;
 
  failed:
 	if (ret)
 		cmd->rq->errors = -EIO;
-	blk_mq_complete_request(cmd->rq);
+	/* complete non-aio request */
+	if (!cmd->use_aio || ret)
+		blk_mq_complete_request(cmd->rq);
 }
 
 static void loop_queue_work(struct kthread_work *work)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 63f8e14..439054b 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -60,6 +60,9 @@ struct loop_device {
 	struct task_struct	*worker_task;
 	unsigned		dio_align;
 	bool			use_dio;
+	bool			last_use_dio;
+	atomic_t		pending_dio;
+	struct task_struct	*waiter;
 
 	struct request_queue	*lo_queue;
 	struct blk_mq_tag_set	tag_set;
@@ -70,6 +73,8 @@ struct loop_cmd {
 	struct kthread_work work;
 	struct request *rq;
 	struct list_head list;
+	bool use_aio;           /* use AIO interface to handle I/O */
+	struct kiocb iocb;
 };
 
 /* Support for loadable transfer modules */
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/