From c49ec4e8b0e4135a87c9894597901539f3e3ca08 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 21 Dec 2016 12:39:33 +0100
Subject: [PATCH 3/3] blk-mq: Split driver and scheduler tags

Add 'sched_tags' next to 'tags' in struct blk_mq_hw_ctx and also
in struct blk_mq_tag_set. Add 'sched_tag' next to 'tag' in struct
request. Make __blk_mq_free_request() free both tags. Make
blk_mq_alloc_tag_set() allocate both tag sets and make
blk_mq_free_tag_set() free both tag sets. Make
blk_mq_dispatch_rq_list() allocate the driver tag. Modify
blk_mq_update_nr_requests() such that it accepts a number of
requests that exceeds the driver queue depth.
---
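Notes (not part of the commit message): the userspace sketch below
illustrates the two-level tag scheme this patch introduces. The scheduler
tag is taken when a request is allocated and the driver tag only when the
request is dispatched, so request allocation can outrun the hardware queue
depth while dispatch cannot. This is a simplified stand-alone illustration
of the pattern used by __blk_mq_alloc_request(), blk_mq_assign_drv_tag()
and blk_mq_dispatch_rq_list() in this patch; it is not kernel code, and the
pool sizes and helper names (get_tag(), assign_drv_tag(), etc.) are made up
for the example.

/*
 * Userspace sketch of the split between scheduler tags and driver tags.
 * NR_SCHED_TAGS plays the role of q->nr_requests and NR_DRV_TAGS the
 * role of set->queue_depth.  All identifiers are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_SCHED_TAGS	8	/* like q->nr_requests   */
#define NR_DRV_TAGS	2	/* like set->queue_depth */

struct request { int tag; int sched_tag; };

static bool sched_tag_used[NR_SCHED_TAGS];
static bool drv_tag_used[NR_DRV_TAGS];

/* Take the first free tag from a pool, or return -1 if it is exhausted. */
static int get_tag(bool *used, int nr)
{
	for (int i = 0; i < nr; i++) {
		if (!used[i]) {
			used[i] = true;
			return i;
		}
	}
	return -1;
}

/* Mirrors __blk_mq_alloc_request(): only the scheduler tag is assigned. */
static bool alloc_request(struct request *rq)
{
	rq->sched_tag = get_tag(sched_tag_used, NR_SCHED_TAGS);
	rq->tag = -1;		/* driver tag is assigned at dispatch time */
	return rq->sched_tag >= 0;
}

/* Mirrors blk_mq_assign_drv_tag(): take a driver tag just before dispatch. */
static int assign_drv_tag(struct request *rq)
{
	rq->tag = get_tag(drv_tag_used, NR_DRV_TAGS);
	return rq->tag;
}

int main(void)
{
	struct request rqs[4];

	/* Allocation can exceed the driver queue depth ... */
	for (int i = 0; i < 4; i++)
		alloc_request(&rqs[i]);

	/*
	 * ... but dispatch stops once the driver tags run out, just as
	 * blk_mq_dispatch_rq_list() breaks out of its loop when
	 * blk_mq_assign_drv_tag() fails.
	 */
	for (int i = 0; i < 4; i++) {
		if (assign_drv_tag(&rqs[i]) < 0) {
			printf("rq %d: no driver tag, stays on dispatch list\n", i);
			break;
		}
		printf("rq %d: sched_tag %d, driver tag %d -> queue_rq()\n",
		       i, rqs[i].sched_tag, rqs[i].tag);
	}
	return 0;
}

With the sizes above, the first two requests get driver tags 0 and 1 and the
third stops the dispatch loop, which is the behaviour the patch relies on to
let q->nr_requests grow past set->queue_depth.
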
 block/blk-flush.c      |   9 ++-
 block/blk-mq.c         | 160 +++++++++++++++++++++++++++++++++++--------------
 block/blk-mq.h         |   5 +-
 block/blk-tag.c        |   1 +
 include/linux/blk-mq.h |   2 +
 include/linux/blkdev.h |   1 +
 6 files changed, 129 insertions(+), 49 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 6a7c29d2eb3c..46d12bbfde85 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -170,6 +170,8 @@ static bool blk_flush_complete_seq(struct request *rq,
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	bool queued = false, kicked;
 
+	BUG_ON(rq->tag < 0);
+
 	BUG_ON(rq->flush.seq & seq);
 	rq->flush.seq |= seq;
 
@@ -319,6 +321,8 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	if (q->mq_ops) {
 		struct blk_mq_hw_ctx *hctx;
 
+		BUG_ON(first_rq->tag < 0);
+
 		flush_rq->mq_ctx = first_rq->mq_ctx;
 		flush_rq->tag = first_rq->tag;
 		fq->orig_rq = first_rq;
@@ -452,8 +456,9 @@ void blk_insert_flush(struct request *rq)
 	 * processed directly without going through flush machinery.  Queue
 	 * for normal execution.
 	 */
-	if ((policy & REQ_FSEQ_DATA) &&
-	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+	if (((policy & REQ_FSEQ_DATA) &&
+	     !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) ||
+	    (q->mq_ops && blk_mq_assign_drv_tag(rq) < 0)) {
 		if (q->mq_ops)
 			blk_mq_sched_insert_request(rq, false, true, false);
 		else
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b68b7fc43e46..48d7968d4ed9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -220,20 +220,21 @@ EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
 struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 				       unsigned int op)
 {
-	struct blk_mq_tags *tags = data->hctx->tags;
+	struct blk_mq_tags *tags = data->hctx->sched_tags;
 	struct request *rq;
-	unsigned int tag;
+	unsigned int sched_tag;
 
-	tag = blk_mq_get_tag(data, tags);
-	if (tag != BLK_MQ_TAG_FAIL) {
-		rq = tags->rqs[tag];
+	sched_tag = blk_mq_get_tag(data, tags);
+	if (sched_tag != BLK_MQ_TAG_FAIL) {
+		rq = tags->rqs[sched_tag];
+		rq->tag = -1;
 
 		if (blk_mq_tag_busy(data->hctx)) {
 			rq->rq_flags = RQF_MQ_INFLIGHT;
 			atomic_inc(&data->hctx->nr_active);
 		}
 
-		rq->tag = tag;
+		rq->sched_tag = sched_tag;
 		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
 		return rq;
 	}
@@ -328,6 +329,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 			     struct request *rq)
 {
 	const int tag = rq->tag;
+	const int sched_tag = rq->sched_tag;
 	struct request_queue *q = rq->q;
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
@@ -340,7 +342,13 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
-	blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+	if (tag >= 0) {
+		WARN_ON_ONCE(hctx->tags->rqs[tag] != rq);
+		hctx->tags->rqs[tag] = NULL;
+		blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+	}
+	if (sched_tag >= 0)
+		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
 	blk_queue_exit(q);
 }
 
@@ -844,6 +852,26 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+int blk_mq_assign_drv_tag(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+	struct blk_mq_alloc_data data = {
+		.q = rq->q,
+		.ctx = rq->mq_ctx,
+		.hctx = hctx,
+	};
+
+	rq->tag = blk_mq_get_tag(&data, hctx->tags);
+	if (rq->tag < 0)
+		goto out;
+	WARN_ON_ONCE(hctx->tags->rqs[rq->tag]);
+	hctx->tags->rqs[rq->tag] = rq;
+
+out:
+	return rq->tag;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
@@ -866,6 +894,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 		struct blk_mq_queue_data bd;
 
 		rq = list_first_entry(list, struct request, queuelist);
+		if (rq->tag < 0 && blk_mq_assign_drv_tag(rq) < 0)
+			break;
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
@@ -1296,7 +1326,8 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 		goto insert;
 
 	hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
-	if (blk_mq_hctx_stopped(hctx))
+	if (blk_mq_hctx_stopped(hctx) ||
+	    (rq->tag < 0 && blk_mq_assign_drv_tag(rq) < 0))
 		goto insert;
 
 	new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
@@ -1592,17 +1623,19 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags)
 }
 
 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
-					unsigned int hctx_idx)
+					unsigned int hctx_idx,
+					unsigned int nr_tags,
+					unsigned int reserved_tags)
 {
 	struct blk_mq_tags *tags;
 
-	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+	tags = blk_mq_init_tags(nr_tags, reserved_tags,
 				set->numa_node,
 				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
 	if (!tags)
 		return NULL;
 
-	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+	tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 set->numa_node);
 	if (!tags->rqs) {
@@ -1800,6 +1833,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
 	hctx->tags = set->tags[hctx_idx];
+	hctx->sched_tags = set->sched_tags[hctx_idx];
 
 	/*
 	 * Allocate space for all possible cpus to avoid allocation at
@@ -1881,6 +1915,38 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 	}
 }
 
+static void __blk_mq_free_rq_map_i(struct blk_mq_tag_set *set, int hctx_idx)
+{
+	if (set->sched_tags[hctx_idx]) {
+		blk_mq_free_rqs(set, set->sched_tags[hctx_idx], hctx_idx);
+		blk_mq_free_rq_map(set->sched_tags[hctx_idx]);
+		set->sched_tags[hctx_idx] = NULL;
+	}
+	if (set->tags[hctx_idx]) {
+		blk_mq_free_rq_map(set->tags[hctx_idx]);
+		set->tags[hctx_idx] = NULL;
+	}
+}
+
+static bool __blk_mq_alloc_rq_map_i(struct blk_mq_tag_set *set, int hctx_idx,
+				    unsigned int nr_requests)
+{
+	int ret = 0;
+
+	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+					set->queue_depth, set->reserved_tags);
+	set->sched_tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+							nr_requests, 0);
+	if (set->sched_tags[hctx_idx])
+		ret = blk_mq_alloc_rqs(set, set->sched_tags[hctx_idx],
+				       hctx_idx);
+	if (!set->tags[hctx_idx] || !set->sched_tags[hctx_idx] || ret < 0) {
+		__blk_mq_free_rq_map_i(set, hctx_idx);
+		return false;
+	}
+	return true;
+}
+
 static void blk_mq_map_swqueue(struct request_queue *q,
 			       const struct cpumask *online_mask)
 {
@@ -1909,23 +1975,15 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 
 		hctx_idx = q->mq_map[i];
 		/* unmapped hw queue can be remapped after CPU topo changed */
-		if (!set->tags[hctx_idx]) {
-			set->tags[hctx_idx] = blk_mq_alloc_rq_map(set,
-								  hctx_idx);
-			if (blk_mq_alloc_rqs(set, set->tags[hctx_idx],
-					     hctx_idx) < 0) {
-				blk_mq_free_rq_map(set->tags[hctx_idx]);
-				set->tags[hctx_idx] = NULL;
-			}
-
+		if (!set->tags[hctx_idx] &&
+		    !__blk_mq_alloc_rq_map_i(set, hctx_idx, q->nr_requests)) {
 			/*
 			 * If tags initialization fail for some hctx,
 			 * that hctx won't be brought online.  In this
 			 * case, remap the current ctx to hctx[0] which
 			 * is guaranteed to always have tags allocated
 			 */
-			if (!set->tags[hctx_idx])
-				q->mq_map[i] = 0;
+			q->mq_map[i] = 0;
 		}
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -2318,26 +2376,20 @@ static int blk_mq_queue_reinit_prepare(unsigned int cpu)
 	return 0;
 }
 
-static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set,
+				  unsigned int nr_requests)
 {
 	int i;
 
-	for (i = 0; i < set->nr_hw_queues; i++) {
-		set->tags[i] = blk_mq_alloc_rq_map(set, i);
-		if (!set->tags[i])
+	for (i = 0; i < set->nr_hw_queues; i++)
+		if (!__blk_mq_alloc_rq_map_i(set, i, nr_requests))
 			goto out_unwind;
-		if (blk_mq_alloc_rqs(set, set->tags[i], i) < 0)
-			goto free_rq_map;
-	}
 
 	return 0;
 
 out_unwind:
-	while (--i >= 0) {
-		blk_mq_free_rqs(set, set->tags[i], i);
-free_rq_map:
-		blk_mq_free_rq_map(set->tags[i]);
-	}
+	while (--i >= 0)
+		__blk_mq_free_rq_map_i(set, i);
 
 	return -ENOMEM;
 }
@@ -2347,14 +2399,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
  * may reduce the depth asked for, if memory is tight. set->queue_depth
  * will be updated to reflect the allocated depth.
  */
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set,
+				unsigned int nr_requests)
 {
 	unsigned int depth;
 	int err;
 
 	depth = set->queue_depth;
 	do {
-		err = __blk_mq_alloc_rq_maps(set);
+		err = __blk_mq_alloc_rq_maps(set, nr_requests);
 		if (!err)
 			break;
 
@@ -2385,7 +2438,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
-	int ret;
+	int ret = -ENOMEM;
 
 	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
 
@@ -2425,32 +2478,39 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (!set->tags)
 		return -ENOMEM;
 
-	ret = -ENOMEM;
+	set->sched_tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
+				       GFP_KERNEL, set->numa_node);
+	if (!set->sched_tags)
+		goto free_drv_tags;
+
 	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
 			GFP_KERNEL, set->numa_node);
 	if (!set->mq_map)
-		goto out_free_tags;
+		goto free_sched_tags;
 
 	if (set->ops->map_queues)
 		ret = set->ops->map_queues(set);
 	else
 		ret = blk_mq_map_queues(set);
 	if (ret)
-		goto out_free_mq_map;
+		goto free_mq_map;
 
-	ret = blk_mq_alloc_rq_maps(set);
+	ret = blk_mq_alloc_rq_maps(set, set->queue_depth/*q->nr_requests*/);
 	if (ret)
-		goto out_free_mq_map;
+		goto free_mq_map;
 
 	mutex_init(&set->tag_list_lock);
 	INIT_LIST_HEAD(&set->tag_list);
 
 	return 0;
 
-out_free_mq_map:
+free_mq_map:
 	kfree(set->mq_map);
 	set->mq_map = NULL;
-out_free_tags:
+free_sched_tags:
+	kfree(set->sched_tags);
+	set->sched_tags = NULL;
+free_drv_tags:
 	kfree(set->tags);
 	set->tags = NULL;
 	return ret;
@@ -2465,12 +2525,16 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 		if (set->tags[i]) {
 			blk_mq_free_rqs(set, set->tags[i], i);
 			blk_mq_free_rq_map(set->tags[i]);
+			blk_mq_free_rq_map(set->sched_tags[i]);
 		}
 	}
 
 	kfree(set->mq_map);
 	set->mq_map = NULL;
 
+	kfree(set->sched_tags);
+	set->sched_tags = NULL;
+
 	kfree(set->tags);
 	set->tags = NULL;
 }
@@ -2482,14 +2546,18 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	struct blk_mq_hw_ctx *hctx;
 	int i, ret;
 
-	if (!set || nr > set->queue_depth)
+	if (!set)
 		return -EINVAL;
 
 	ret = 0;
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->tags)
 			continue;
-		ret = blk_mq_tag_update_depth(hctx->tags, nr);
+		ret = blk_mq_tag_update_depth(hctx->tags,
+					      min(nr, set->queue_depth));
+		if (ret)
+			break;
+		ret = blk_mq_tag_update_depth(hctx->sched_tags, nr);
 		if (ret)
 			break;
 	}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 2e98dd8ccee2..0368c513c2ab 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -31,6 +31,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
+int blk_mq_assign_drv_tag(struct request *rq);
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 
@@ -41,7 +42,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx);
 void blk_mq_free_rq_map(struct blk_mq_tags *tags);
 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
-					unsigned int hctx_idx);
+					unsigned int hctx_idx,
+					unsigned int nr_tags,
+					unsigned int reserved_tags);
 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx);
 
diff --git a/block/blk-tag.c b/block/blk-tag.c
index bae1decb6ec3..319a3a3eb1d7 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
 	list_del_init(&rq->queuelist);
 	rq->rq_flags &= ~RQF_QUEUED;
 	rq->tag = -1;
+	rq->sched_tag = -1;
 
 	if (unlikely(bqt->tag_index[tag] == NULL))
 		printk(KERN_ERR "%s: tag %d is missing\n",
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9255ccb043f2..377594bcda8d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,6 +36,7 @@ struct blk_mq_hw_ctx {
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
+	struct blk_mq_tags	*sched_tags;
 
 	struct srcu_struct	queue_rq_srcu;
 
@@ -72,6 +73,7 @@ struct blk_mq_tag_set {
 	void			*driver_data;
 
 	struct blk_mq_tags	**tags;
+	struct blk_mq_tags	**sched_tags;
 
 	struct mutex		tag_list_lock;
 	struct list_head	tag_list;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7c40fb838b44..112b57bce9e9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -223,6 +223,7 @@ struct request {
 	void *special;		/* opaque pointer available for LLD use */
 
 	int tag;
+	int sched_tag;
 	int errors;
 
 	/*
-- 
2.11.0