Message-ID: <x491vxjqmg0.fsf@segfault.boston.devel.redhat.com>
Date:	Mon, 10 Nov 2008 16:51:27 -0500
From:	Jeff Moyer <jmoyer@...hat.com>
To:	"Vitaly V. Bursov" <vitalyb@...enet.dn.ua>
Cc:	Jens Axboe <jens.axboe@...cle.com>, linux-kernel@...r.kernel.org
Subject: Re: Slow file transfer speeds with CFQ IO scheduler in some cases

"Vitaly V. Bursov" <vitalyb@...enet.dn.ua> writes:

> Jens Axboe wrote:
>> On Mon, Nov 10 2008, Vitaly V. Bursov wrote:
>>> Jens Axboe wrote:
>>>> On Mon, Nov 10 2008, Jeff Moyer wrote:
>>>>> Jens Axboe <jens.axboe@...cle.com> writes:
>>>>>
>>>>>> http://bugzilla.kernel.org/attachment.cgi?id=18473&action=view
>>>>> Funny, I was going to ask the same question.  ;)  The reason Jens wants
>>>>> you to try this patch is that nfsd may be farming off the I/O requests
>>>>> to different threads which are then performing interleaved I/O.  The
>>>>> above patch tries to detect this and allow cooperating processes to get
>>>>> disk time instead of waiting for the idle timeout.
>>>> Precisely :-)
>>>>
>>>> The only reason I haven't merged it yet is worry about the extra
>>>> cost, but I'll throw some SSD love at it and see how it turns out.
>>>>
>>> Sorry, but I get an "oops" the moment an NFS read transfer starts.
>>> I can still get a directory listing via NFS and read files locally
>>> (not carefully tested, though).
>>>
>>> The dumps were captured via netconsole, so they may not be completely
>>> accurate, but hopefully they will give a hint.
>> 
>> Interesting, strange how that hasn't triggered here. Or perhaps the
>> version that Jeff posted isn't the one I tried. Anyway, search for:
>> 
>>         RB_CLEAR_NODE(&cfqq->rb_node);
>> 
>> and add a
>> 
>>         RB_CLEAR_NODE(&cfqq->prio_node);
>> 
>> just below that. It's in cfq_find_alloc_queue(). I think that should fix
>> it.
>> 
>
> Same problem.
>
> I ran make clean; make -j3; sync (2 times) on the patched kernel and it
> went OK, but it won't boot anymore with cfq, failing with the same error...
>
> Switching to the cfq I/O scheduler at runtime (after booting with "as")
> appears to work with two parallel local dd reads.

Strange, I can't reproduce a failure.  I'll keep trying.  For now, these
are the results I see:

[root@...den ~]# mount megadeth:/export/cciss /mnt/megadeth/
[root@...den ~]# dd if=/mnt/megadeth/file1 of=/dev/null bs=1M
1024+0 records in
1024+0 records out
1073741824 bytes (1.1 GB) copied, 26.8128 s, 40.0 MB/s
[root@...den ~]# umount /mnt/megadeth/
[root@...den ~]# mount megadeth:/export/cciss /mnt/megadeth/
[root@...den ~]# dd if=/mnt/megadeth/file1 of=/dev/null bs=1M
1024+0 records in
1024+0 records out
1073741824 bytes (1.1 GB) copied, 23.7025 s, 45.3 MB/s
[root@...den ~]# umount /mnt/megadeth/
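
To be concrete about the I/O pattern in question: each nfsd thread on
its own submits what looks like a seeky stream, even though the threads
together form a single sequential read.  A contrived userspace sketch of
that shape of I/O (path, chunk size, and process count are made up here,
not taken from Vitaly's setup):

/*
 * Two processes read alternating chunks of one file: each process in
 * isolation appears to seek (it skips every other chunk), while the
 * two streams combined are perfectly sequential.  This mimics what
 * nfsd's thread pool hands to CFQ.
 */
#define _XOPEN_SOURCE 500
#include <fcntl.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define CHUNK	(64 * 1024)
#define NPROC	2

static void reader(const char *path, int idx)
{
	char *buf = malloc(CHUNK);
	int fd = open(path, O_RDONLY);
	off_t off = (off_t)idx * CHUNK;

	if (fd < 0 || !buf)
		_exit(1);
	/* process idx reads chunks idx, idx + NPROC, idx + 2*NPROC, ... */
	while (pread(fd, buf, CHUNK, off) > 0)
		off += (off_t)NPROC * CHUNK;
	_exit(0);
}

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";
	int i;

	for (i = 0; i < NPROC; i++)
		if (fork() == 0)
			reader(path, i);
	while (wait(NULL) > 0)
		;
	return 0;
}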

Here is the patch, with the suggestion from Jens to switch the cfqq to
the right priority tree when the priority is changed.
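
The detection itself is a closest-match search over a tree sorted by the
sector of each queue's next_rq: look up cfqd->last_position, and if there
is no exact hit, the parent of the NULL leaf and its neighbor in the
other direction are the two nearest queues; cfq_rq_close() then checks
the winner against the seek mean.  A self-contained userspace sketch of
the same search, with a sorted array standing in for the rbtree (all
names illustrative):

/*
 * closest_idx() mirrors what cfqq_close() does: binary-search down to
 * the "NULL leaf", then compare the two neighboring entries.  A real
 * caller would still apply the cfq_rq_close() distance check to the
 * returned candidate.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

static int closest_idx(const sector_t *sec, int n, sector_t pos)
{
	int lo = 0, hi = n - 1;

	/* find the first entry >= pos (or the last one, if none is) */
	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (sec[mid] < pos)
			lo = mid + 1;
		else
			hi = mid;
	}
	/*
	 * sec[lo] and its left neighbor are the only two candidates,
	 * just like the rb_prev()/rb_next() step in cfqq_close().
	 */
	if (lo > 0 && sec[lo] > pos && pos - sec[lo - 1] < sec[lo] - pos)
		return lo - 1;
	return lo;
}

int main(void)
{
	sector_t sec[] = { 100, 200, 300, 500 };

	/* prints 2: sector 300 is nearest to position 340 */
	printf("%d\n", closest_idx(sec, 4, 340));
	return 0;
}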

Cheers,

Jeff

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6a062ee..dd26cba 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -83,6 +83,12 @@ struct cfq_data {
 	 * rr list of queues with requests and the count of them
 	 */
 	struct cfq_rb_root service_tree;
+	/*
+	 * Each priority tree is sorted by next_request position.  These
+	 * trees are used when determining if two or more queues are
+	 * interleaving requests (see cfq_close_cooperator).
+	 */
+	struct rb_root prio_trees[CFQ_PRIO_LISTS];
 	unsigned int busy_queues;
 
 	int rq_in_driver;
@@ -142,6 +148,8 @@ struct cfq_queue {
 	struct rb_node rb_node;
 	/* service_tree key */
 	unsigned long rb_key;
+	/* prio tree member */
+	struct rb_node prio_node;
 	/* sorted list of pending requests */
 	struct rb_root sort_list;
 	/* if fifo isn't expired, next request to serve */
@@ -415,13 +423,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
 	return NULL;
 }
 
+static void rb_erase_init(struct rb_node *n, struct rb_root *root)
+{
+	rb_erase(n, root);
+	RB_CLEAR_NODE(n);
+}
+
 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
 {
 	if (root->left == n)
 		root->left = NULL;
-
-	rb_erase(n, &root->rb);
-	RB_CLEAR_NODE(n);
+	rb_erase_init(n, &root->rb);
 }
 
 /*
@@ -540,6 +552,65 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
 	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
 }
 
+static struct cfq_queue *cfq_prio_tree_lookup(struct cfq_data *cfqd,
+	int ioprio, sector_t sector, struct rb_node **ret_parent,
+	struct rb_node ***rb_link)
+{
+	struct rb_root *root = &cfqd->prio_trees[ioprio];
+	struct rb_node **p, *parent;
+	struct cfq_queue *cfqq = NULL;
+
+	parent = NULL;
+	p = &root->rb_node;
+	while (*p) {
+		struct rb_node **n;
+
+		parent = *p;
+		cfqq = rb_entry(parent, struct cfq_queue, prio_node);
+
+		/*
+		 * Sort strictly based on sector.  Smallest to the left,
+		 * largest to the right.
+		 */
+		if (sector > cfqq->next_rq->sector)
+			n = &(*p)->rb_right;
+		else if (sector < cfqq->next_rq->sector)
+			n = &(*p)->rb_left;
+		else
+			break;
+		p = n;
+		cfqq = NULL;
+	}
+
+	*ret_parent = parent;
+	if (rb_link)
+		*rb_link = p;
+	return cfqq;
+}
+
+static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	struct rb_root *root = &cfqd->prio_trees[cfqq->ioprio];
+	struct rb_node **p, *parent;
+	struct cfq_queue *__cfqq;
+
+	if (!RB_EMPTY_NODE(&cfqq->prio_node))
+		rb_erase_init(&cfqq->prio_node, root);
+
+	if (cfq_class_idle(cfqq))
+		return;
+	if (!cfqq->next_rq)
+		return;
+
+	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->ioprio,
+				      cfqq->next_rq->sector, &parent, &p);
+	if (__cfqq) /* sector alias: another queue's next_rq is here */
+		return;
+
+	rb_link_node(&cfqq->prio_node, parent, p);
+	rb_insert_color(&cfqq->prio_node, root);
+}
+
 /*
  * Update cfqq's position in the service tree.
  */
@@ -548,8 +616,10 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	/*
 	 * Resorting requires the cfqq to be on the RR list already.
 	 */
-	if (cfq_cfqq_on_rr(cfqq))
+	if (cfq_cfqq_on_rr(cfqq)) {
 		cfq_service_tree_add(cfqd, cfqq, 0);
+		cfq_prio_tree_add(cfqd, cfqq);
+	}
 }
 
 /*
@@ -578,6 +648,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 	if (!RB_EMPTY_NODE(&cfqq->rb_node))
 		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+	if (!RB_EMPTY_NODE(&cfqq->prio_node))
+		rb_erase_init(&cfqq->prio_node, &cfqd->prio_trees[cfqq->ioprio]);
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
@@ -623,6 +695,9 @@ static void cfq_add_rq_rb(struct request *rq)
 	 * check if this request is a better next-serve candidate
 	 */
 	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
+	/* this changes the position in the priority tree */
+	cfq_prio_tree_add(cfqd, cfqq);
+
 	BUG_ON(!cfqq->next_rq);
 }
 
@@ -831,11 +906,11 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 /*
  * Get and set a new active queue for service.
  */
-static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
+					      struct cfq_queue *cfqq)
 {
-	struct cfq_queue *cfqq;
-
-	cfqq = cfq_get_next_queue(cfqd);
+	if (!cfqq)
+		cfqq = cfq_get_next_queue(cfqd);
 	__cfq_set_active_queue(cfqd, cfqq);
 	return cfqq;
 }
@@ -859,15 +934,77 @@ static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
 	return cfq_dist_from_last(cfqd, rq) <= cic->seek_mean;
 }
 
-static int cfq_close_cooperator(struct cfq_data *cfq_data,
-				struct cfq_queue *cfqq)
+static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
+				    struct cfq_queue *cur_cfqq)
 {
+	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->ioprio];
+	struct rb_node *parent, *node;
+	struct cfq_queue *__cfqq;
+	sector_t sector = cfqd->last_position;
+
+	if (RB_EMPTY_ROOT(root))
+		return NULL;
+
+	/*
+	 * First, if we find a request starting at the end of the last
+	 * request, choose it.
+	 */
+	__cfqq = cfq_prio_tree_lookup(cfqd, cur_cfqq->ioprio,
+				      sector, &parent, NULL);
+	if (__cfqq)
+		return __cfqq;
+
+	/*
+	 * If the exact sector wasn't found, the parent of the NULL leaf
+	 * will contain the closest sector.
+	 */
+	__cfqq = rb_entry(parent, struct cfq_queue, prio_node);
+	if (cfq_rq_close(cfqd, __cfqq->next_rq))
+		return __cfqq;
+	if (__cfqq->next_rq->sector < sector)
+		node = rb_next(&__cfqq->prio_node);
+	else
+		node = rb_prev(&__cfqq->prio_node);
+	if (!node)
+		return NULL;
+	__cfqq = rb_entry(node, struct cfq_queue, prio_node);
+	if (cfq_rq_close(cfqd, __cfqq->next_rq))
+		return __cfqq;
+
+	return NULL;
+}
+
+/*
+ * cfqd - obvious
+ * cur_cfqq - passed in so that we don't decide that the current queue is
+ * 	      closely cooperating with itself.
+ *
+ * So, basically we're assuming that cur_cfqq has dispatched at least
+ * one request, and that cfqd->last_position reflects a position on the disk
+ * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
+ * assumption.
+ */
+static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
+					      struct cfq_queue *cur_cfqq)
+{
+	struct cfq_queue *cfqq;
+
+	/*
+	 * A valid cfq_io_context is necessary to compare requests against
+	 * the seek_mean of the current cfqq.
+	 */
+	if (!cfqd->active_cic)
+		return NULL;
+
 	/*
 	 * We should notice if some of the queues are cooperating, eg
 	 * working closely on the same area of the disk. In that case,
 	 * we can group them together and don't waste time idling.
 	 */
-	return 0;
+	if ((cfqq = cfqq_close(cfqd, cur_cfqq)))
+		return cfqq;
+
+	return NULL;
 }
 
 #define CIC_SEEKY(cic) ((cic)->seek_mean > (8 * 1024))
@@ -908,13 +1045,6 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
 		return;
 
-	/*
-	 * See if this prio level has a good candidate
-	 */
-	if (cfq_close_cooperator(cfqd, cfqq) &&
-	    (sample_valid(cic->ttime_samples) && cic->ttime_mean > 2))
-		return;
-
 	cfq_mark_cfqq_must_dispatch(cfqq);
 	cfq_mark_cfqq_wait_request(cfqq);
 
@@ -992,7 +1122,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  */
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 {
-	struct cfq_queue *cfqq;
+	struct cfq_queue *cfqq, *new_cfqq = NULL;
 
 	cfqq = cfqd->active_queue;
 	if (!cfqq)
@@ -1012,6 +1142,15 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 		goto keep_queue;
 
 	/*
+	 * If another queue has a request waiting within our mean seek
+	 * distance, let it run.  The expire code will check for close
+	 * cooperators and put the close queue at the front of the service
+	 * tree.
+	 */
+	if ((new_cfqq = cfq_close_cooperator(cfqd, cfqq)))
+		goto expire;
+
+	/*
 	 * No requests pending. If the active queue still has requests in
 	 * flight or is idling for a new request, allow either of these
 	 * conditions to happen (or time out) before selecting a new queue.
@@ -1025,7 +1164,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 expire:
 	cfq_slice_expired(cfqd, 0);
 new_queue:
-	cfqq = cfq_set_active_queue(cfqd);
+	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
 keep_queue:
 	return cfqq;
 }
@@ -1466,6 +1605,7 @@ retry:
 		}
 
 		RB_CLEAR_NODE(&cfqq->rb_node);
+		RB_CLEAR_NODE(&cfqq->prio_node);
 		INIT_LIST_HEAD(&cfqq->fifo);
 
 		atomic_set(&cfqq->ref, 0);
@@ -1949,7 +2089,18 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 			cfq_set_prio_slice(cfqd, cfqq);
 			cfq_clear_cfqq_slice_new(cfqq);
 		}
-		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
+		/*
+		 * If there are no requests waiting in this queue, and
+		 * there are other queues ready to issue requests, AND
+		 * those other queues are issuing requests within our
+		 * mean seek distance, give them a chance to run instead
+		 * of idling.
+		 */
+		if (RB_EMPTY_ROOT(&cfqq->sort_list) &&
+		    !RB_EMPTY_ROOT(&cfqd->service_tree.rb)) {
+			if (cfq_close_cooperator(cfqd, cfqq))
+				cfq_slice_expired(cfqd, 1);
+		} else if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
 			cfq_slice_expired(cfqd, 1);
 		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
 			cfq_arm_slice_timer(cfqd);
@@ -1983,6 +2134,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 		if (cfqq->ioprio != cfqq->org_ioprio)
 			cfqq->ioprio = cfqq->org_ioprio;
 	}
+	cfq_prio_tree_add(cfqq->cfqd, cfqq);
 }
 
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)