[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20260109083126.15052-2-lidiangang@bytedance.com>
Date: Fri, 9 Jan 2026 16:31:26 +0800
From: Diangang Li <diangangli@...il.com>
To: axboe@...nel.dk
Cc: linux-block@...r.kernel.org,
linux-kernel@...r.kernel.org,
changfengnan@...edance.com,
Diangang Li <lidiangang@...edance.com>
Subject: [RFC 1/1] block: export windowed IO P99 latency
Track per-IO completion latency in a power-of-two histogram
(NR_STAT_BUCKETS buckets, DISK_LAT_BASE_USEC .. DISK_LAT_MAX_USEC).
Maintain a per-cpu ring of one-second histogram slices and compute the P99
by aggregating the most recent slices at read time for /proc/diskstats and
/sys/block/<dev>/stat.
Report the P99 in microseconds using the bucket midpoint, clamp overflows
to DISK_LAT_MAX_USEC, and append one P99 value each for
read/write/discard/flush.
Suggested-by: Fengnan Chang <changfengnan@...edance.com>
Signed-off-by: Diangang Li <lidiangang@...edance.com>
---
block/blk-core.c | 5 ++-
block/blk-flush.c | 6 ++-
block/blk-mq.c | 5 ++-
block/genhd.c | 50 ++++++++++++++++++++++++-
include/linux/part_stat.h | 79 +++++++++++++++++++++++++++++++++++++++
5 files changed, 139 insertions(+), 6 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 8387fe50ea156..832ba4fc1b75a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1062,12 +1062,15 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies);
unsigned long duration = now - start_time;
+ u64 latency_ns = jiffies_to_nsecs(duration);
+ unsigned int bucket = diskstat_latency_bucket(latency_ns);
part_stat_lock();
update_io_ticks(bdev, now, true);
part_stat_inc(bdev, ios[sgrp]);
part_stat_add(bdev, sectors[sgrp], sectors);
- part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
+ part_stat_add(bdev, nsecs[sgrp], latency_ns);
+ part_stat_latency_record(bdev, sgrp, now, bucket);
part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
part_stat_unlock();
}
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43d6152897a42..b3ff78025968f 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -124,11 +124,13 @@ static void blk_flush_restore_request(struct request *rq)
static void blk_account_io_flush(struct request *rq)
{
struct block_device *part = rq->q->disk->part0;
+ /*
+ * Snapshot the flush latency once so the same value feeds both the
+ * cumulative nsecs counter and the histogram bucket computation.
+ */
+ u64 latency_ns = blk_time_get_ns() - rq->start_time_ns;
+ unsigned int bucket = diskstat_latency_bucket(latency_ns);
part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
- part_stat_add(part, nsecs[STAT_FLUSH],
- blk_time_get_ns() - rq->start_time_ns);
+ part_stat_add(part, nsecs[STAT_FLUSH], latency_ns);
+ /* Fold this flush into the current one-second histogram slice */
+ part_stat_latency_record(part, STAT_FLUSH, jiffies, bucket);
part_stat_unlock();
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index eff4f72ce83be..6a7fd6681902e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1068,11 +1068,14 @@ static inline void blk_account_io_done(struct request *req, u64 now)
*/
if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) {
const int sgrp = op_stat_group(req_op(req));
+ u64 latency_ns = now - req->start_time_ns;
+ unsigned int bucket = diskstat_latency_bucket(latency_ns);
part_stat_lock();
update_io_ticks(req->part, jiffies, true);
part_stat_inc(req->part, ios[sgrp]);
- part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_add(req->part, nsecs[sgrp], latency_ns);
+ part_stat_latency_record(req->part, sgrp, jiffies, bucket);
part_stat_local_dec(req->part,
in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
diff --git a/block/genhd.c b/block/genhd.c
index 69c75117ba2c0..56151c7880651 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -108,23 +108,60 @@ static void part_stat_read_all(struct block_device *part,
struct disk_stats *stat)
{
int cpu;
+ /* Current 1 s epoch; u32 wraparound is handled by the signed age test */
+ u32 now_epoch = (u32)(jiffies / HZ);
memset(stat, 0, sizeof(struct disk_stats));
for_each_possible_cpu(cpu) {
struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
int group;
+ int slice;
+ int bucket;
for (group = 0; group < NR_STAT_GROUPS; group++) {
stat->nsecs[group] += ptr->nsecs[group];
stat->sectors[group] += ptr->sectors[group];
stat->ios[group] += ptr->ios[group];
stat->merges[group] += ptr->merges[group];
+
+ /*
+ * Sum only slices whose epoch stamp falls within the last
+ * NR_STAT_SLICES seconds; older slices belong to a stale
+ * window and are skipped.  Pairs with the writer's epoch
+ * update in __part_stat_latency_prepare().
+ */
+ for (slice = 0; slice < NR_STAT_SLICES; slice++) {
+ u32 slice_epoch = READ_ONCE(ptr->latency_epoch[slice]);
+ s32 age = (s32)(now_epoch - slice_epoch);
+
+ if (age < 0 || age >= NR_STAT_SLICES)
+ continue;
+
+ /* Slice 0 of the output doubles as the cross-slice aggregate */
+ for (bucket = 0; bucket < NR_STAT_BUCKETS; bucket++)
+ stat->latency[group][0][bucket] +=
+ ptr->latency[group][slice][bucket];
+ }
}
stat->io_ticks += ptr->io_ticks;
}
}
+/*
+ * Compute an approximate P99 latency (in usecs) from a histogram.
+ *
+ * Walk the buckets until 99% of the samples are covered and report the
+ * representative (midpoint) latency of the bucket reached.  Returns 0
+ * when the histogram is empty.
+ */
+static u32 diskstat_p99_us(u32 buckets[NR_STAT_BUCKETS])
+{
+	/* u64 sums: per-bucket u32 counts can overflow u32 when aggregated */
+	u64 total = 0;
+	u64 accum = 0;
+	u64 target;
+	int bucket;
+
+	for (bucket = 0; bucket < NR_STAT_BUCKETS; bucket++)
+		total += buckets[bucket];
+	if (!total)
+		return 0;
+
+	/* Rank of the P99 sample: total - total/100 == ceil(total * 99 / 100) */
+	target = total - div_u64(total, 100);
+	for (bucket = 0; bucket < NR_STAT_BUCKETS; bucket++) {
+		accum += buckets[bucket];
+		if (accum >= target)
+			return diskstat_latency_bucket_us(bucket);
+	}
+
+	/* Unreachable: after the last bucket accum == total >= target */
+	return diskstat_latency_bucket_us(NR_STAT_BUCKETS - 1);
+}
+
static void bdev_count_inflight_rw(struct block_device *part,
unsigned int inflight[2], bool mq_driver)
{
@@ -1078,7 +1115,8 @@ ssize_t part_stat_show(struct device *dev,
"%8lu %8lu %8llu %8u "
"%8u %8u %8u "
"%8lu %8lu %8llu %8u "
- "%8lu %8u"
+ "%8lu %8u "
+ "%8u %8u %8u %8u"
"\n",
stat.ios[STAT_READ],
stat.merges[STAT_READ],
@@ -1100,7 +1138,11 @@ ssize_t part_stat_show(struct device *dev,
(unsigned long long)stat.sectors[STAT_DISCARD],
(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
stat.ios[STAT_FLUSH],
- (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
+ (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC),
+ diskstat_p99_us(stat.latency[STAT_READ][0]),
+ diskstat_p99_us(stat.latency[STAT_WRITE][0]),
+ diskstat_p99_us(stat.latency[STAT_DISCARD][0]),
+ diskstat_p99_us(stat.latency[STAT_FLUSH][0]));
}
/*
@@ -1406,6 +1448,10 @@ static int diskstats_show(struct seq_file *seqf, void *v)
seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]);
seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
NSEC_PER_MSEC));
+ seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_READ][0]));
+ seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_WRITE][0]));
+ seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_DISCARD][0]));
+ seq_put_decimal_ull(seqf, " ", diskstat_p99_us(stat.latency[STAT_FLUSH][0]));
seq_putc(seqf, '\n');
}
rcu_read_unlock();
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 729415e91215d..cbcb24abac21e 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -5,6 +5,19 @@
#include <linux/blkdev.h>
#include <asm/local.h>
+/*
+ * Diskstats latency histogram:
+ * - Bucket upper bounds are power-of-two in usecs, starting at DISK_LAT_BASE_USEC.
+ * - The last bucket is a saturation bucket for latencies >= DISK_LAT_MAX_USEC.
+ *
+ * Latency is tracked in NR_STAT_SLICES 1-second slices and
+ * summed to compute a NR_STAT_SLICES-second P99 latency.
+ */
+#define NR_STAT_BUCKETS 21
+#define NR_STAT_SLICES 5
+#define DISK_LAT_BASE_USEC 8U
+#define DISK_LAT_MAX_USEC (DISK_LAT_BASE_USEC << (NR_STAT_BUCKETS - 1))
+
struct disk_stats {
u64 nsecs[NR_STAT_GROUPS];
unsigned long sectors[NR_STAT_GROUPS];
@@ -12,6 +25,8 @@ struct disk_stats {
unsigned long merges[NR_STAT_GROUPS];
unsigned long io_ticks;
local_t in_flight[2];
+ u32 latency_epoch[NR_STAT_SLICES];
+ u32 latency[NR_STAT_GROUPS][NR_STAT_SLICES][NR_STAT_BUCKETS];
};
/*
@@ -81,4 +96,68 @@ static inline void part_stat_set_all(struct block_device *part, int value)
unsigned int bdev_count_inflight(struct block_device *part);
+/*
+ * Map a completion latency to a histogram bucket index.
+ *
+ * Bucket 0 covers (0, DISK_LAT_BASE_USEC]; each following bucket doubles
+ * the upper bound; the last bucket saturates for latencies at or above
+ * DISK_LAT_MAX_USEC.
+ */
+static inline unsigned int diskstat_latency_bucket(u64 latency_ns)
+{
+	/* div_u64(): a bare 64-bit '/' does not link on 32-bit arches */
+	u64 latency_us = div_u64(latency_ns, 1000);
+	u64 scaled;
+
+	if (latency_us <= DISK_LAT_BASE_USEC)
+		return 0;
+
+	if (latency_us >= DISK_LAT_MAX_USEC)
+		return NR_STAT_BUCKETS - 1;
+
+	/* fls64() of the base-scaled value yields the log2 bucket position */
+	scaled = div_u64(latency_us - 1, DISK_LAT_BASE_USEC);
+	return min_t(unsigned int, (unsigned int)fls64(scaled),
+		     NR_STAT_BUCKETS - 1);
+}
+
+/*
+ * Upper latency bound (usecs) of a histogram bucket; the final
+ * saturation bucket reports DISK_LAT_MAX_USEC.
+ */
+static inline u32 diskstat_latency_bucket_upper_us(unsigned int bucket)
+{
+ if (bucket >= NR_STAT_BUCKETS - 1)
+ return DISK_LAT_MAX_USEC;
+ return DISK_LAT_BASE_USEC << bucket;
+}
+
+/*
+ * Representative latency (usecs) reported for a bucket: the midpoint of
+ * the bucket's (high/2, high] range, i.e. low + low/2 where low is half
+ * the upper bound.  The saturation bucket reports DISK_LAT_MAX_USEC.
+ */
+static inline u32 diskstat_latency_bucket_us(unsigned int bucket)
+{
+ u32 high;
+ u32 low;
+
+ if (bucket >= NR_STAT_BUCKETS - 1)
+ return DISK_LAT_MAX_USEC;
+
+ high = diskstat_latency_bucket_upper_us(bucket);
+ low = high >> 1;
+ return low + (low >> 1);
+}
+
+/*
+ * Lazily reset a histogram slice when it is reused for a new 1 s epoch.
+ *
+ * Must be called with part_stat_lock() held (preemption disabled), since
+ * it operates on this CPU's per-cpu stats.
+ */
+static inline void __part_stat_latency_prepare(struct block_device *part,
+		u32 epoch, unsigned int slice)
+{
+	/* this_cpu_ptr() is the idiomatic per_cpu_ptr(smp_processor_id()) */
+	struct disk_stats *stats = this_cpu_ptr(part->bd_stats);
+	int group;
+
+	if (likely(stats->latency_epoch[slice] == epoch))
+		return;
+
+	for (group = 0; group < NR_STAT_GROUPS; group++)
+		memset(stats->latency[group][slice], 0,
+		       sizeof(stats->latency[group][slice]));
+	/* Publish after clearing; pairs with READ_ONCE() in the reader */
+	WRITE_ONCE(stats->latency_epoch[slice], epoch);
+}
+
+/*
+ * Record one completed IO in the windowed latency histogram.
+ *
+ * part_stat_inc() below updates both the partition and the whole device,
+ * so both per-cpu slices must be reset for the current epoch first.
+ * Caller must hold part_stat_lock().
+ */
+static inline void part_stat_latency_record(struct block_device *part,
+ int sgrp, unsigned long now, unsigned int bucket)
+{
+ /* The jiffies-based 1 s epoch selects one of NR_STAT_SLICES ring slots */
+ u32 epoch = now / HZ;
+ unsigned int slice = epoch % NR_STAT_SLICES;
+
+ __part_stat_latency_prepare(part, epoch, slice);
+ if (bdev_is_partition(part))
+ __part_stat_latency_prepare(bdev_whole(part), epoch, slice);
+
+ part_stat_inc(part, latency[sgrp][slice][bucket]);
+}
+
#endif /* _LINUX_PART_STAT_H */
--
2.39.5
Powered by blists - more mailing lists