lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20090117162657.GA31965@Krystal>
Date:	Sat, 17 Jan 2009 11:26:57 -0500
From:	Mathieu Desnoyers <mathieu.desnoyers@...ymtl.ca>
To:	Jens Axboe <axboe@...nel.dk>, Andrea Arcangeli <andrea@...e.de>,
	akpm@...ux-foundation.org, Ingo Molnar <mingo@...e.hu>,
	Linus Torvalds <torvalds@...ux-foundation.org>
Cc:	linux-kernel@...r.kernel.org, ltt-dev@...ts.casi.polymtl.ca
Subject: [RFC PATCH] block: Fix bio merge induced high I/O latency

A long standing I/O regression (since 2.6.18, still there today) has hit
Slashdot recently :
http://bugzilla.kernel.org/show_bug.cgi?id=12309
http://it.slashdot.org/article.pl?sid=09/01/15/049201

I've taken a trace reproducing the wrong behavior on my machine and I
think it's getting us somewhere.

LTTng 0.83, kernel 2.6.28
Machine : Intel Xeon E5405 dual quad-core, 16GB ram
(just created a new block-trace.c LTTng probe which is not released yet.
It basically replaces blktrace)


echo 3 > /proc/sys/vm/drop_caches

lttctl -C -w /tmp/trace -o channel.mm.bufnum=8 -o channel.block.bufnum=64 trace

dd if=/dev/zero of=/tmp/newfile bs=1M count=1M
cp -ax music /tmp   (copying 1.1GB of mp3)

ls  (takes 15 seconds to get the directory listing !)

lttctl -D trace

I looked at the trace (especially at the ls surroundings), and bash is
waiting for a few seconds for I/O in the exec system call (to exec ls).

While this happens, we have dd doing lots and lots of bio_queue. There
is a bio_backmerge after each bio_queue event. This is reasonable,
because dd is writing to a contiguous file.

However, I wonder if this is not the actual problem. We have dd which
has the head request in the elevator request queue. It is progressing
steadily by plugging/unplugging the device periodically and gets its
work done. However, because requests are being dequeued at the same
rate others are being merged, I suspect it stays at the top of the queue
and does not let the other unrelated requests run.

There is a test in the blk-merge.c which makes sure that merged requests
do not get bigger than a certain size. However, if the request is
steadily dequeued, I think this test is not doing anything.


This patch implements a basic test to make sure we never merge more than 128
requests into the same request if it is the "last_merge" request. I have not
been able to trigger the problem again with the fix applied. It might not be in
a perfect state : there may be better solutions to the problem, but I think it
helps pointing out where the culprit lays.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@...ymtl.ca>
CC: Jens Axboe <axboe@...nel.dk>
CC: Andrea Arcangeli <andrea@...e.de>
CC: akpm@...ux-foundation.org
CC: Ingo Molnar <mingo@...e.hu>
CC: Linus Torvalds <torvalds@...ux-foundation.org>
---
 block/blk-merge.c      |   12 +++++++++---
 block/elevator.c       |   31 ++++++++++++++++++++++++++++---
 include/linux/blkdev.h |    1 +
 3 files changed, 38 insertions(+), 6 deletions(-)

Index: linux-2.6-lttng/include/linux/blkdev.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blkdev.h	2009-01-17 09:49:54.000000000 -0500
+++ linux-2.6-lttng/include/linux/blkdev.h	2009-01-17 09:50:29.000000000 -0500
@@ -313,6 +313,7 @@ struct request_queue
 	 */
 	struct list_head	queue_head;
 	struct request		*last_merge;
+	int			nr_cached_merge;
 	elevator_t		*elevator;
 
 	/*
Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c	2009-01-17 09:49:54.000000000 -0500
+++ linux-2.6-lttng/block/elevator.c	2009-01-17 11:07:12.000000000 -0500
@@ -255,6 +255,7 @@ int elevator_init(struct request_queue *
 
 	INIT_LIST_HEAD(&q->queue_head);
 	q->last_merge = NULL;
+	q->nr_cached_merge = 0;
 	q->end_sector = 0;
 	q->boundary_rq = NULL;
 
@@ -438,8 +439,10 @@ void elv_dispatch_sort(struct request_qu
 	struct list_head *entry;
 	int stop_flags;
 
-	if (q->last_merge == rq)
+	if (q->last_merge == rq) {
 		q->last_merge = NULL;
+		q->nr_cached_merge = 0;
+	}
 
 	elv_rqhash_del(q, rq);
 
@@ -478,8 +481,10 @@ EXPORT_SYMBOL(elv_dispatch_sort);
  */
 void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
 {
-	if (q->last_merge == rq)
+	if (q->last_merge == rq) {
 		q->last_merge = NULL;
+		q->nr_cached_merge = 0;
+	}
 
 	elv_rqhash_del(q, rq);
 
@@ -498,6 +503,16 @@ int elv_merge(struct request_queue *q, s
 	int ret;
 
 	/*
+	 * Make sure we don't starve other requests by merging too many cached
+	 * requests together.
+	 */
+	if (q->nr_cached_merge >= BLKDEV_MAX_RQ) {
+		q->last_merge = NULL;
+		q->nr_cached_merge = 0;
+		return ELEVATOR_NO_MERGE;
+	}
+
+	/*
 	 * First try one-hit cache.
 	 */
 	if (q->last_merge) {
@@ -536,6 +551,10 @@ void elv_merged_request(struct request_q
 	if (type == ELEVATOR_BACK_MERGE)
 		elv_rqhash_reposition(q, rq);
 
+	if (q->last_merge != rq)
+		q->nr_cached_merge = 0;
+	else
+		q->nr_cached_merge++;
 	q->last_merge = rq;
 }
 
@@ -551,6 +570,10 @@ void elv_merge_requests(struct request_q
 	elv_rqhash_del(q, next);
 
 	q->nr_sorted--;
+	if (q->last_merge != rq)
+		q->nr_cached_merge = 0;
+	else
+		q->nr_cached_merge++;
 	q->last_merge = rq;
 }
 
@@ -626,8 +649,10 @@ void elv_insert(struct request_queue *q,
 		q->nr_sorted++;
 		if (rq_mergeable(rq)) {
 			elv_rqhash_add(q, rq);
-			if (!q->last_merge)
+			if (!q->last_merge) {
+				q->nr_cached_merge = 1;
 				q->last_merge = rq;
+			}
 		}
 
 		/*
Index: linux-2.6-lttng/block/blk-merge.c
===================================================================
--- linux-2.6-lttng.orig/block/blk-merge.c	2009-01-17 09:49:54.000000000 -0500
+++ linux-2.6-lttng/block/blk-merge.c	2009-01-17 09:50:29.000000000 -0500
@@ -231,8 +231,10 @@ static inline int ll_new_hw_segment(stru
 	if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
 	    || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
 		req->cmd_flags |= REQ_NOMERGE;
-		if (req == q->last_merge)
+		if (req == q->last_merge) {
 			q->last_merge = NULL;
+			q->nr_cached_merge = 0;
+		}
 		return 0;
 	}
 
@@ -256,8 +258,10 @@ int ll_back_merge_fn(struct request_queu
 
 	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
-		if (req == q->last_merge)
+		if (req == q->last_merge) {
 			q->last_merge = NULL;
+			q->nr_cached_merge = 0;
+		}
 		return 0;
 	}
 	if (!bio_flagged(req->biotail, BIO_SEG_VALID))
@@ -281,8 +285,10 @@ int ll_front_merge_fn(struct request_que
 
 	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
-		if (req == q->last_merge)
+		if (req == q->last_merge) {
 			q->last_merge = NULL;
+			q->nr_cached_merge = 0;
+		}
 		return 0;
 	}
 	if (!bio_flagged(bio, BIO_SEG_VALID))

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ