lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4CBC3B3A.9040300@jp.fujitsu.com>
Date:	Mon, 18 Oct 2010 21:19:06 +0900
From:	Yasuaki Ishimatsu <isimatu.yasuaki@...fujitsu.com>
To:	Jens Axboe <axboe@...nel.dk>
CC:	kosaki.motohiro@...fujitsu.com, linux-kernel@...r.kernel.org
Subject: [PATCH v5] blk: fix a wrong accounting of hd_struct->in_flight

Hi Jens,

Jens Axboe wrote:
> On 2010-10-18 10:28, Yasuaki Ishimatsu wrote:
>> Hi Jens,
>>
>>> This looks good! To quiesce the queue, something like the below.
>>> Completely untested.
>> Thank you for your advice.
>> I applied your idea to the patch.
> 
> But you changed it, though:
> 
>>  	if (old_ptbl) {
>>  		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
>> +		spin_lock_irq(q->queue_lock);
>> +		elv_quiesce_start(q);
>>  		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
>> +		elv_quiesce_end(q);
>> +		spin_unlock_irq(q->queue_lock);
>>  	}
>>  }
> 
> That is not going to work. The point is to start the drain period
> before, then end it when the callback has gone through. By placing it
> just after the call_rcu() call, there's no guarantee that the RCU grace
> period has elapsed. That is why I placed it inside the rcu callback. Why
> did you move it?

Ah...
I misunderstood the purpose of the call_rcu().
I moved elv_quiesce_end() to the rcu callback.

Regards,
Yasuaki Ishimatsu.
===

From: Yasuaki Ishimatsu <isimatu.yasuaki@...fujitsu.com>

/proc/diskstats would display a strange output as follows.

$ cat /proc/diskstats |grep sda
   8       0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
   8       1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                                ~~~~~~~~~~
   8       2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
   8       3 sda3 54 487 2188 92 0 0 0 0 0 88 92
   8       4 sda4 4 0 8 0 0 0 0 0 0 0 0
   8       5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

The cause is incorrect accounting of hd_struct->in_flight when a bio is merged
into a request that belongs to a different partition by ELEVATOR_FRONT_MERGE.

The detailed root cause is as follows.

Assuming that there are two partitions, sda1 and sda2.

1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
   is 0 and sda2's one is 1.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

2. A bio belonging to sda1 is issued and is merged into the request mentioned in
   step 1 by ELEVATOR_FRONT_MERGE. The first sector of the request changes
   from the sda2 region to the sda1 region. However, the two partitions'
   hd_struct->in_flight values are not changed.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

3. The request is finished and blk_account_io_done() is called. Because the
   partition is looked up from the request's (now changed) first sector,
   sda1's hd_struct->in_flight, not sda2's, is decremented.

        | hd_struct->in_flight
   ---------------------------
   sda1 |         -1
   sda2 |          1
   ---------------------------

The patch fixes the problem.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@...fujitsu.com>
---
 block/blk-core.c         |   24 ++++++++++++++++--------
 block/blk-merge.c        |    2 +-
 block/blk.h              |    4 ----
 block/genhd.c            |   14 ++++++++++++++
 fs/partitions/check.c    |   12 ++++++++++++
 include/linux/blkdev.h   |    1 +
 include/linux/elevator.h |    2 ++
 include/linux/genhd.h    |    1 +
 8 files changed, 47 insertions(+), 13 deletions(-)

Index: linux-2.6.36-rc7/block/blk-core.c
===================================================================
--- linux-2.6.36-rc7.orig/block/blk-core.c	2010-10-15 09:21:37.000000000 +0900
+++ linux-2.6.36-rc7/block/blk-core.c	2010-10-18 14:45:19.000000000 +0900
@@ -64,13 +64,15 @@ static void drive_stat_acct(struct reque
 		return;

 	cpu = part_stat_lock();
-	part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));

-	if (!new_io)
+	if (!new_io) {
+		part = rq->part;
 		part_stat_inc(cpu, part, merges[rw]);
-	else {
+	} else {
+		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 		part_round_stats(cpu, part);
 		part_inc_in_flight(part, rw);
+		rq->part = part;
 	}

 	part_stat_unlock();
@@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q
 	rq->ref_count = 1;
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
+	rq->part = NULL;
 }
 EXPORT_SYMBOL(blk_rq_init);

@@ -796,11 +799,16 @@ static struct request *get_request(struc
 	rl->starved[is_sync] = 0;

 	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	if (priv)
+	if (priv) {
 		rl->elvpriv++;

-	if (blk_queue_io_stat(q))
-		rw_flags |= REQ_IO_STAT;
+		/*
+		 * Don't do stats for non-priv requests
+		 */
+		if (blk_queue_io_stat(q))
+			rw_flags |= REQ_IO_STAT;
+	}
+
 	spin_unlock_irq(q->queue_lock);

 	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1759,7 +1767,7 @@ static void blk_account_io_completion(st
 		int cpu;

 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
 		part_stat_unlock();
 	}
@@ -1779,7 +1787,7 @@ static void blk_account_io_done(struct r
 		int cpu;

 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;

 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
Index: linux-2.6.36-rc7/block/blk-merge.c
===================================================================
--- linux-2.6.36-rc7.orig/block/blk-merge.c	2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/block/blk-merge.c	2010-10-18 14:41:03.000000000 +0900
@@ -343,7 +343,7 @@ static void blk_account_io_merge(struct
 		int cpu;

 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+		part = req->part;

 		part_round_stats(cpu, part);
 		part_dec_in_flight(part, rq_data_dir(req));
Index: linux-2.6.36-rc7/include/linux/blkdev.h
===================================================================
--- linux-2.6.36-rc7.orig/include/linux/blkdev.h	2010-10-15 09:21:37.000000000 +0900
+++ linux-2.6.36-rc7/include/linux/blkdev.h	2010-10-15 09:26:22.000000000 +0900
@@ -115,6 +115,7 @@ struct request {
 	void *elevator_private3;

 	struct gendisk *rq_disk;
+	struct hd_struct *part;
 	unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
 	unsigned long long start_time_ns;
Index: linux-2.6.36-rc7/block/genhd.c
===================================================================
--- linux-2.6.36-rc7.orig/block/genhd.c	2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/block/genhd.c	2010-10-18 20:50:18.000000000 +0900
@@ -925,8 +925,15 @@ static void disk_free_ptbl_rcu_cb(struct
 {
 	struct disk_part_tbl *ptbl =
 		container_of(head, struct disk_part_tbl, rcu_head);
+	struct gendisk *disk = ptbl->disk;
+	struct request_queue *q = disk->queue;
+	unsigned long flags;

 	kfree(ptbl);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	elv_quiesce_end(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }

 /**
@@ -944,11 +951,17 @@ static void disk_replace_part_tbl(struct
 				  struct disk_part_tbl *new_ptbl)
 {
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
+	struct request_queue *q = disk->queue;

 	rcu_assign_pointer(disk->part_tbl, new_ptbl);

 	if (old_ptbl) {
 		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
+
+		spin_lock_irq(q->queue_lock);
+		elv_quiesce_start(q);
+		spin_unlock_irq(q->queue_lock);
+
 		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
 	}
 }
@@ -989,6 +1002,7 @@ int disk_expand_part_tbl(struct gendisk
 		return -ENOMEM;

 	new_ptbl->len = target;
+	new_ptbl->disk = disk;

 	for (i = 0; i < len; i++)
 		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
Index: linux-2.6.36-rc7/fs/partitions/check.c
===================================================================
--- linux-2.6.36-rc7.orig/fs/partitions/check.c	2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/fs/partitions/check.c	2010-10-18 20:20:43.000000000 +0900
@@ -364,17 +364,25 @@ struct device_type part_type = {
 static void delete_partition_rcu_cb(struct rcu_head *head)
 {
 	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
+	struct gendisk *disk = part_to_disk(part);
+	struct request_queue *q = disk->queue;
+	unsigned long flags;

 	part->start_sect = 0;
 	part->nr_sects = 0;
 	part_stat_set_all(part, 0);
 	put_device(part_to_dev(part));
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	elv_quiesce_end(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }

 void delete_partition(struct gendisk *disk, int partno)
 {
 	struct disk_part_tbl *ptbl = disk->part_tbl;
 	struct hd_struct *part;
+	struct request_queue *q = disk->queue;

 	if (partno >= ptbl->len)
 		return;
@@ -389,6 +397,10 @@ void delete_partition(struct gendisk *di
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));

+	spin_lock_irq(q->queue_lock);
+	elv_quiesce_start(q);
+	spin_unlock_irq(q->queue_lock);
+
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }

Index: linux-2.6.36-rc7/block/blk.h
===================================================================
--- linux-2.6.36-rc7.orig/block/blk.h	2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/block/blk.h	2010-10-18 16:22:47.000000000 +0900
@@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(stru

 int blk_dev_init(void);

-void elv_quiesce_start(struct request_queue *q);
-void elv_quiesce_end(struct request_queue *q);
-
-
 /*
  * Return the threshold (number of used requests) at which the queue is
  * considered to be congested.  It include a little hysteresis to keep the
Index: linux-2.6.36-rc7/include/linux/elevator.h
===================================================================
--- linux-2.6.36-rc7.orig/include/linux/elevator.h	2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/include/linux/elevator.h	2010-10-18 17:09:58.000000000 +0900
@@ -121,6 +121,8 @@ extern void elv_completed_request(struct
 extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
 extern void elv_put_request(struct request_queue *, struct request *);
 extern void elv_drain_elevator(struct request_queue *);
+extern void elv_quiesce_start(struct request_queue *);
+extern void elv_quiesce_end(struct request_queue *);

 /*
  * io scheduler registration
Index: linux-2.6.36-rc7/include/linux/genhd.h
===================================================================
--- linux-2.6.36-rc7.orig/include/linux/genhd.h	2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/include/linux/genhd.h	2010-10-18 19:57:36.000000000 +0900
@@ -130,6 +130,7 @@ struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
 	struct hd_struct *last_lookup;
+	struct gendisk *disk;
 	struct hd_struct *part[];
 };


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ