linux-kernel - [RFC PATCH v2] blk-mq: Introduce per sw queue time-slice

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Mon, 1 Feb 2016 23:43:40 +0100
From:	Andreas Herrmann <aherrmann@...e.com>
To:	Christoph Hellwig <hch@....de>, Jens Axboe <axboe@...nel.dk>
Cc:	linux-kernel@...r.kernel.org,
	Johannes Thumshirn <jthumshirn@...e.de>,
	Jan Kara <jack@...e.cz>
Subject: [RFC PATCH v2] blk-mq: Introduce per sw queue time-slice

This introduces a new blk_mq hw attribute time_slice_us which allows
to specify a time slice in usecs.

Default value is 0 and implies no modification to blk-mq behaviour.

A positive value changes blk-mq to service only one software queue
within this time slice until it expires or the software queue is
empty. Then the next software queue with pending requests is selected.

Signed-off-by: Andreas Herrmann <aherrmann@...e.com>
---
 block/blk-mq-sysfs.c   |  27 +++++++
 block/blk-mq.c         | 208 +++++++++++++++++++++++++++++++++++++++++--------
 include/linux/blk-mq.h |   9 +++
 3 files changed, 211 insertions(+), 33 deletions(-)

Hi,

This update is long overdue (sorry for the delay).

Change to v1:
- time slice is now specified in usecs instead of msecs.
- time slice is extended (up to 3 times the initial value) when there
  was actually a request to be serviced for the software queue

Fio test results are sent in a separate mail to this.

Results for fio improved to some extent with this patch. But in
reality the picture is quite mixed. Performance is highly dependend on
task scheduling. There is no guarantee that the requests originated
from one CPU belong to the same process.

I think for rotary devices CFQ is by far the best choice. A simple
illustration is:

  Copying two files (750MB in this case) in parallel on a rotary
  device. The elapsed wall clock time (seconds) for this is
                               mean    stdev
   cfq, slice_idle=8           16.18   4.95
   cfq, slice_idle=0           23.74   2.82
   blk-mq, time_slice_usec=0   24.37   2.05
   blk-mq, time_slice_usec=250 25.58   3.16


Regards,

Andreas

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 1cf1878..77c875c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -247,6 +247,26 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 	return ret;
 }
 
+static ssize_t blk_mq_hw_sysfs_tslice_show(struct blk_mq_hw_ctx *hctx,
+					  char *page)
+{
+	return sprintf(page, "%u\n", hctx->tslice_us);
+}
+
+static ssize_t blk_mq_hw_sysfs_tslice_store(struct blk_mq_hw_ctx *hctx,
+					const char *page, size_t length)
+{
+	unsigned long long store;
+	int err;
+
+	err = kstrtoull(page, 10, &store);
+	if (err)
+		return -EINVAL;
+
+	hctx->tslice_us = (unsigned)store;
+	return length;
+}
+
 static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
 	.attr = {.name = "dispatched", .mode = S_IRUGO },
 	.show = blk_mq_sysfs_dispatched_show,
@@ -305,6 +325,12 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
 	.show = blk_mq_hw_sysfs_poll_show,
 };
 
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tslice = {
+	.attr = {.name = "time_slice_us", .mode = S_IRUGO | S_IWUSR },
+	.show = blk_mq_hw_sysfs_tslice_show,
+	.store = blk_mq_hw_sysfs_tslice_store,
+};
+
 static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_queued.attr,
 	&blk_mq_hw_sysfs_run.attr,
@@ -314,6 +340,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_cpus.attr,
 	&blk_mq_hw_sysfs_active.attr,
 	&blk_mq_hw_sysfs_poll.attr,
+	&blk_mq_hw_sysfs_tslice.attr,
 	NULL,
 };
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c0622f..97d32f2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -68,6 +68,7 @@ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
 
 	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
 		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+	cpumask_set_cpu(ctx->cpu, hctx->cpu_pending_mask);
 }
 
 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
@@ -76,6 +77,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
 
 	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+	cpumask_clear_cpu(ctx->cpu, hctx->cpu_pending_mask);
 }
 
 void blk_mq_freeze_queue_start(struct request_queue *q)
@@ -682,15 +684,41 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 	return false;
 }
 
+static int tslice_flush_one_ctx(struct blk_mq_hw_ctx *hctx,
+				struct list_head *list,	int cpu)
+{
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_ctx *ctx;
+
+	if (cpu == -1 || !hctx->tslice_us)
+		return 0;
+
+	ctx = __blk_mq_get_ctx(q, cpu);
+	spin_lock(&ctx->lock);
+	if (!list_empty(&ctx->rq_list)) {
+		list_splice_tail_init(&ctx->rq_list, list);
+		spin_lock(&hctx->lock);
+		hctx->tslice_inc = 1;
+		blk_mq_hctx_clear_pending(hctx, ctx);
+		spin_unlock(&hctx->lock);
+	}
+	spin_unlock(&ctx->lock);
+	return 1;
+}
+
 /*
  * Process software queues that have been marked busy, splicing them
  * to the for-dispatch
  */
-static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx,
+			struct list_head *list,	int cpu)
 {
 	struct blk_mq_ctx *ctx;
 	int i;
 
+	if (tslice_flush_one_ctx(hctx, list, cpu))
+		return;
+
 	for (i = 0; i < hctx->ctx_map.size; i++) {
 		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
 		unsigned int off, bit;
@@ -706,9 +734,11 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 				break;
 
 			ctx = hctx->ctxs[bit + off];
-			clear_bit(bit, &bm->word);
 			spin_lock(&ctx->lock);
 			list_splice_tail_init(&ctx->rq_list, list);
+			spin_lock(&hctx->lock);
+			blk_mq_hctx_clear_pending(hctx, ctx);
+			spin_unlock(&hctx->lock);
 			spin_unlock(&ctx->lock);
 
 			bit++;
@@ -717,6 +747,114 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 }
 
 /*
+ * It'd be great if the workqueue API had a way to pass
+ * in a mask and had some smarts for more clever placement.
+ * For now we just round-robin here, switching for every
+ * BLK_MQ_CPU_WORK_BATCH queued items.
+ */
+static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->queue->nr_hw_queues == 1)
+		return WORK_CPU_UNBOUND;
+
+	if (--hctx->next_cpu_batch <= 0) {
+		int cpu = hctx->next_cpu, next_cpu;
+
+		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+		if (next_cpu >= nr_cpu_ids)
+			next_cpu = cpumask_first(hctx->cpumask);
+
+		hctx->next_cpu = next_cpu;
+		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+		return cpu;
+	}
+
+	return hctx->next_cpu;
+}
+
+static int blk_mq_tslice_expired(struct blk_mq_hw_ctx *hctx)
+{
+	if (time_after_eq(jiffies, hctx->tslice_expiration))
+		return 1;
+
+	return 0;
+}
+
+static int blk_mq_tslice_next_cpu(struct blk_mq_hw_ctx *hctx)
+{
+	int c = hctx->tslice_cpu;
+
+	/* allow extension of time slice */
+	if (hctx->tslice_inc && hctx->tslice_inc_count < 4) {
+		hctx->tslice_inc = 0;
+		hctx->tslice_inc_count++;
+		goto out;
+	}
+	hctx->tslice_inc = 0;
+	hctx->tslice_inc_count = 0;
+
+	/* clear CPU for which tslice has expired */
+	if (c != -1)
+		cpumask_clear_cpu(c, hctx->cpu_service_mask);
+
+	/* calculate mask of remaining CPUs with pending work */
+	if (cpumask_and(hctx->cpu_next_mask, hctx->cpu_pending_mask,
+				hctx->cpu_service_mask)) {
+		c = cpumask_first(hctx->cpu_next_mask);
+	} else {
+		/*
+		 * no remaining CPUs with pending work, reset epoch,
+		 * start with first CPU that has requests pending
+		 */
+		hctx->tslice_expiration = 0;
+		cpumask_setall(hctx->cpu_service_mask);
+		c = cpumask_first(hctx->cpu_pending_mask);
+	}
+
+	/* no CPU with pending requests */
+	if (c >= nr_cpu_ids)
+		return -1;
+
+out:
+	hctx->tslice_expiration = jiffies + usecs_to_jiffies(hctx->tslice_us);
+	return c;
+}
+
+static int tslice_get_busy_ctx(struct blk_mq_hw_ctx *hctx)
+{
+	int cpu = -1;
+
+	if (hctx->tslice_us) {
+		spin_lock(&hctx->lock);
+		if (blk_mq_tslice_expired(hctx))
+			hctx->tslice_cpu = blk_mq_tslice_next_cpu(hctx);
+		cpu = hctx->tslice_cpu;
+		spin_unlock(&hctx->lock);
+	}
+
+	return cpu;
+}
+
+/*
+ * Ensure that hardware queue is run if there are pending
+ * requests in any software queue.
+ */
+static void tslice_schedule_pending_work(struct blk_mq_hw_ctx *hctx)
+{
+	long t;
+
+	if (!hctx->tslice_us || cpumask_empty(hctx->cpu_pending_mask))
+		return;
+
+	t = hctx->tslice_expiration - jiffies;
+	if (t < 0)
+		t = 0;
+	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+					&hctx->run_work, (unsigned int)t);
+}
+
+/*
  * Run this hardware queue, pulling any software queues mapped to it in.
  * Note that this function currently has various problems around ordering
  * of IO. In particular, we'd like FIFO behaviour on handling existing
@@ -729,7 +867,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	LIST_HEAD(rq_list);
 	LIST_HEAD(driver_list);
 	struct list_head *dptr;
-	int queued;
+	int queued, cpu;
 
 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
 
@@ -739,9 +877,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	hctx->run++;
 
 	/*
-	 * Touch any software queue that has pending entries.
+	 * Touch dedicated software queue if time slice is set or any
+	 * software queue that has pending entries (cpu == -1).
 	 */
-	flush_busy_ctxs(hctx, &rq_list);
+	cpu = tslice_get_busy_ctx(hctx);
+	flush_busy_ctxs(hctx, &rq_list, cpu);
 
 	/*
 	 * If we have previous entries on our dispatch list, grab them
@@ -826,34 +966,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		 * blk_mq_run_hw_queue() already checks the STOPPED bit
 		 **/
 		blk_mq_run_hw_queue(hctx, true);
-	}
-}
-
-/*
- * It'd be great if the workqueue API had a way to pass
- * in a mask and had some smarts for more clever placement.
- * For now we just round-robin here, switching for every
- * BLK_MQ_CPU_WORK_BATCH queued items.
- */
-static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
-{
-	if (hctx->queue->nr_hw_queues == 1)
-		return WORK_CPU_UNBOUND;
-
-	if (--hctx->next_cpu_batch <= 0) {
-		int cpu = hctx->next_cpu, next_cpu;
-
-		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
-		if (next_cpu >= nr_cpu_ids)
-			next_cpu = cpumask_first(hctx->cpumask);
-
-		hctx->next_cpu = next_cpu;
-		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
-
-		return cpu;
-	}
-
-	return hctx->next_cpu;
+	} else
+		tslice_schedule_pending_work(hctx);
 }
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
@@ -992,7 +1106,10 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
 	__blk_mq_insert_req_list(hctx, ctx, rq, at_head);
+	spin_lock(&hctx->lock);
 	blk_mq_hctx_mark_pending(hctx, ctx);
+	spin_unlock(&hctx->lock);
+
 }
 
 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
@@ -1583,7 +1700,9 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
 	spin_lock(&ctx->lock);
 	if (!list_empty(&ctx->rq_list)) {
 		list_splice_init(&ctx->rq_list, &tmp);
+		spin_lock(&hctx->lock);
 		blk_mq_hctx_clear_pending(hctx, ctx);
+		spin_unlock(&hctx->lock);
 	}
 	spin_unlock(&ctx->lock);
 
@@ -1602,7 +1721,9 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
 	}
 
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+	spin_lock(&hctx->lock);
 	blk_mq_hctx_mark_pending(hctx, ctx);
+	spin_unlock(&hctx->lock);
 
 	spin_unlock(&ctx->lock);
 
@@ -1691,6 +1812,12 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	hctx->queue_num = hctx_idx;
 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
 
+	hctx->tslice_us = 0;
+	hctx->tslice_expiration = 0;
+	hctx->tslice_cpu = -1;
+	hctx->tslice_inc = 0;
+	hctx->tslice_inc_count = 0;
+
 	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
 					blk_mq_hctx_notify, hctx);
 	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
@@ -2006,6 +2133,18 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 						node))
 			goto err_hctxs;
 
+		if (!zalloc_cpumask_var_node(&hctxs[i]->cpu_pending_mask,
+						GFP_KERNEL, node))
+			goto err_hctxs;
+
+		if (!zalloc_cpumask_var_node(&hctxs[i]->cpu_service_mask,
+						GFP_KERNEL, node))
+			goto err_hctxs;
+
+		if (!zalloc_cpumask_var_node(&hctxs[i]->cpu_next_mask,
+						GFP_KERNEL, node))
+			goto err_hctxs;
+
 		atomic_set(&hctxs[i]->nr_active, 0);
 		hctxs[i]->numa_node = node;
 		hctxs[i]->queue_num = i;
@@ -2069,6 +2208,9 @@ err_hctxs:
 		if (!hctxs[i])
 			break;
 		free_cpumask_var(hctxs[i]->cpumask);
+		free_cpumask_var(hctxs[i]->cpu_pending_mask);
+		free_cpumask_var(hctxs[i]->cpu_service_mask);
+		free_cpumask_var(hctxs[i]->cpu_next_mask);
 		kfree(hctxs[i]);
 	}
 err_map:
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7fc9296..a8ca685 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -57,6 +57,15 @@ struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
+	cpumask_var_t		cpu_service_mask; /* CPUs not yet serviced */
+	cpumask_var_t		cpu_pending_mask; /* CPUs with pending work */
+	cpumask_var_t		cpu_next_mask;	/* CPUs to be serviced */
+	int			tslice_us;	/* time slice in µs */
+	int			tslice_inc;	/* extend time slice */
+	int			tslice_inc_count;
+	unsigned long		tslice_expiration; /* in jiffies */
+	int			tslice_cpu;	/* cpu of time slice */
+
 	struct blk_mq_cpu_notifier	cpu_notifier;
 	struct kobject		kobj;
 
-- 
1.9.1