lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:   Fri, 15 Apr 2022 16:06:05 +0800
From:   Yu Kuai <yukuai3@...wei.com>
To:     <bvanassche@....org>, <hch@...radead.org>, <axboe@...nel.dk>
CC:     <linux-block@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
        <yukuai3@...wei.com>, <yi.zhang@...wei.com>
Subject: [PATCH -next v2] block: update nsecs[] in part_stat_show() and diskstats_show()

commit 86d7331299fd("block: update io_ticks when io hang") fixed that
%util will be zero for iostat when io is hanged, however, avgqu-sz is
still zero while it represents the number of io that are hunged. On the
other hand, for some slow device, if an io is started before and done
after diskstats is read, the avgqu-sz will be miscalculated.

To fix the problem, update 'nsecs[]' when part_stat_show() or
diskstats_show() is called. In order to do that, add 'stat_time' in
struct block_device and 'rq_stat_time' in struct request to record the
time. And during iteration, update 'nsecs[]' for each inflight request.

Signed-off-by: Yu Kuai <yukuai3@...wei.com>
---
Changes in v2:
 - remove other cleanup patches.

 block/bdev.c              |  2 ++
 block/blk-mq.c            | 62 ++++++++++++++++++++++++++++++++++++++-
 block/blk-mq.h            |  2 ++
 block/genhd.c             | 62 +++++++++++++++++++++++++--------------
 include/linux/blk-mq.h    |  2 ++
 include/linux/blk_types.h |  5 ++++
 6 files changed, 112 insertions(+), 23 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index 7bf88e591aaf..da0471f7492d 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -487,9 +487,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	bdev = I_BDEV(inode);
 	mutex_init(&bdev->bd_fsfreeze_mutex);
 	spin_lock_init(&bdev->bd_size_lock);
+	spin_lock_init(&bdev->bd_stat_lock);
 	bdev->bd_partno = partno;
 	bdev->bd_inode = inode;
 	bdev->bd_queue = disk->queue;
+	bdev->stat_time = 0;
 	bdev->bd_stats = alloc_percpu(struct disk_stats);
 	if (!bdev->bd_stats) {
 		iput(inode);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3fe3226aad1b..ebb4db535794 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -128,6 +128,48 @@ struct mq_inflight {
 	unsigned int inflight[2];
 };
 
+static bool blk_mq_check_inflight_with_stat(struct request *rq, void *priv,
+					    bool reserved)
+{
+	struct mq_inflight *mi = priv;
+
+	if ((!mi->part->bd_partno || rq->part == mi->part) &&
+	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
+		u64 stat_time;
+
+		mi->inflight[rq_data_dir(rq)]++;
+		if (!rq->part)
+			return true;
+
+		stat_time = READ_ONCE(rq->stat_time_ns);
+		/*
+		 * This might fail if 'req->stat_time_ns' is updated in
+		 * blk_account_io_done().
+		 */
+		if (likely(cmpxchg(&rq->stat_time_ns, stat_time,
+			    rq->part->stat_time) == stat_time)) {
+			int sgrp = op_stat_group(req_op(rq));
+			u64 duation = stat_time ?
+				rq->part->stat_time - stat_time :
+				rq->part->stat_time - rq->start_time_ns;
+
+			part_stat_add(rq->part, nsecs[sgrp], duation);
+		}
+	}
+
+	return true;
+}
+
+unsigned int blk_mq_in_flight_with_stat(struct request_queue *q,
+					struct block_device *part)
+{
+	struct mq_inflight mi = { .part = part };
+
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_with_stat, &mi);
+
+	return mi.inflight[0] + mi.inflight[1];
+}
+
 static bool blk_mq_check_inflight(struct request *rq, void *priv,
 				  bool reserved)
 {
@@ -369,6 +411,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		rq->start_time_ns = ktime_get_ns();
 	else
 		rq->start_time_ns = 0;
+	rq->stat_time_ns = 0;
 	rq->part = NULL;
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
 	rq->alloc_time_ns = alloc_time_ns;
@@ -870,7 +913,24 @@ static void __blk_account_io_done(struct request *req, u64 now)
 	part_stat_lock();
 	update_io_ticks(req->part, jiffies, true);
 	part_stat_inc(req->part, ios[sgrp]);
-	part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+
+	if (queue_is_mq(req->q)) {
+		u64 stat_time = READ_ONCE(req->stat_time_ns);
+
+		/*
+		 * This might fail if 'req->stat_time_ns' is updated during
+		 * blk_mq_check_inflight_with_stat().
+		 */
+		if (likely(cmpxchg(&req->stat_time_ns, stat_time, now) ==
+				   stat_time)) {
+			u64 duation = stat_time ? now - stat_time :
+						  now - req->start_time_ns;
+
+			part_stat_add(req->part, nsecs[sgrp], duation);
+		}
+	} else {
+		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+	}
 	part_stat_unlock();
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9bad3057c1f3..e6c2c523c8de 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -193,6 +193,8 @@ unsigned int blk_mq_in_flight(struct request_queue *q,
 		struct block_device *part);
 void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
 		unsigned int inflight[2]);
+unsigned int blk_mq_in_flight_with_stat(struct request_queue *q,
+		struct block_device *part);
 
 static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
 					      int budget_token)
diff --git a/block/genhd.c b/block/genhd.c
index b8b6759d670f..36144fe8872b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -932,25 +932,52 @@ ssize_t part_size_show(struct device *dev,
 	return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
 }
 
-ssize_t part_stat_show(struct device *dev,
-		       struct device_attribute *attr, char *buf)
+static inline void part_set_stat_time(struct block_device *part)
+{
+	u64 now = ktime_get_ns();
+
+again:
+	part->stat_time = now;
+
+	if (part->bd_partno) {
+		part = bdev_whole(part);
+		goto again;
+	}
+}
+
+static inline void part_get_stat_info(struct block_device *bdev,
+				      struct disk_stats *stat,
+				      unsigned int *inflight)
 {
-	struct block_device *bdev = dev_to_bdev(dev);
 	struct request_queue *q = bdev_get_queue(bdev);
-	struct disk_stats stat;
-	unsigned int inflight;
 
-	if (queue_is_mq(q))
-		inflight = blk_mq_in_flight(q, bdev);
-	else
-		inflight = part_in_flight(bdev);
+	if (queue_is_mq(q)) {
+		part_stat_lock();
+		spin_lock(&bdev->bd_stat_lock);
+		part_set_stat_time(bdev);
+		*inflight = blk_mq_in_flight_with_stat(q, bdev);
+		spin_unlock(&bdev->bd_stat_lock);
+		part_stat_unlock();
+	} else {
+		*inflight = part_in_flight(bdev);
+	}
 
-	if (inflight) {
+	if (*inflight) {
 		part_stat_lock();
 		update_io_ticks(bdev, jiffies, true);
 		part_stat_unlock();
 	}
-	part_stat_read_all(bdev, &stat);
+	part_stat_read_all(bdev, stat);
+}
+
+ssize_t part_stat_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
+{
+	struct block_device *bdev = dev_to_bdev(dev);
+	struct disk_stats stat;
+	unsigned int inflight;
+
+	part_get_stat_info(bdev, &stat, &inflight);
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
@@ -1239,17 +1266,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	xa_for_each(&gp->part_tbl, idx, hd) {
 		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
 			continue;
-		if (queue_is_mq(gp->queue))
-			inflight = blk_mq_in_flight(gp->queue, hd);
-		else
-			inflight = part_in_flight(hd);
-
-		if (inflight) {
-			part_stat_lock();
-			update_io_ticks(hd, jiffies, true);
-			part_stat_unlock();
-		}
-		part_stat_read_all(hd, &stat);
+
+		part_get_stat_info(hd, &stat, &inflight);
 		seq_printf(seqf, "%4d %7d %pg "
 			   "%lu %lu %lu %u "
 			   "%lu %lu %lu %u "
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7aa5c54901a9..387e475fedd4 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -108,6 +108,8 @@ struct request {
 	u64 start_time_ns;
 	/* Time that I/O was submitted to the device. */
 	u64 io_start_time_ns;
+	/* Time that I/O was counted in part_get_stat_info(). */
+	u64 stat_time_ns;
 
 #ifdef CONFIG_BLK_WBT
 	unsigned short wbt_flags;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 046a34c81ec4..4014b0c0ef16 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -65,6 +65,11 @@ struct block_device {
 	struct super_block	*bd_fsfreeze_sb;
 
 	struct partition_meta_info *bd_meta_info;
+
+	/* Prevent part_get_stat_info() to be called concurrently */
+	spinlock_t		bd_stat_lock;
+	/* Will be set when part_get_stat_info() is called */
+	u64			stat_time;
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
-- 
2.31.1

Powered by blists - more mailing lists