lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 14 Apr 2010 22:45:10 -0700
From:	Divyesh Shah <dpshah@...gle.com>
To:	jens.axboe@...cle.com
Cc:	linux-kernel@...r.kernel.org, nauman@...gle.com, rickyb@...gle.com
Subject: [PATCH 2/4] block: Add disk performance histograms which can be read

from sysfs and cleared upon writing.

Signed-off-by: Divyesh Shah <dpshah@...gle.com>
From: Edward Falk <efalk@...gle.com>
---

 block/Kconfig          |   26 +++++
 block/blk-core.c       |    1 
 block/genhd.c          |  270 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/partitions/check.c  |   16 +++
 include/linux/blkdev.h |    4 -
 include/linux/genhd.h  |   48 +++++++++
 include/linux/time.h   |    5 +
 7 files changed, 368 insertions(+), 2 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4..b62fe49 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -100,6 +100,32 @@ config DEBUG_BLK_CGROUP
 	in the blk group which can be used by cfq for tracing various
 	group related activity.
 
+config BLOCK_HISTOGRAM
+	bool "Performance histogram data"
+	default n
+	---help---
+	  This option causes block devices to collect statistics on transfer
+	  sizes and times. Useful for performance-tuning a system. Creates
+	  entries in /sys/block/.
+
+	  If you are unsure, say N here.
+
+config HISTO_SIZE_BUCKETS
+	int "Number of size buckets in histogram"
+	depends on BLOCK_HISTOGRAM
+	default "10"
+	---help---
+	  This option controls how many buckets are used to collect
+	  transfer size statistics.
+
+config HISTO_TIME_BUCKETS
+	int "Number of time buckets in histogram"
+	depends on BLOCK_HISTOGRAM
+	default "11"
+	---help---
+	  This option controls how many buckets are used to collect
+	  transfer time statistics.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/blk-core.c b/block/blk-core.c
index f18e7b7..6432b14 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1744,6 +1744,7 @@ static void blk_account_io_done(struct request *req)
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
 		part_round_stats(cpu, part);
+		block_histogram_completion(cpu, part, req);
 		part_dec_in_flight(part, rw);
 
 		part_stat_unlock();
diff --git a/block/genhd.c b/block/genhd.c
index d13ba76..3666cf2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -881,6 +881,16 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+#ifdef	CONFIG_BLOCK_HISTOGRAM
+static DEVICE_ATTR(read_request_histo, S_IRUGO | S_IWUSR,
+		part_read_request_histo_show, part_read_histo_clear);
+static DEVICE_ATTR(read_dma_histo, S_IRUGO | S_IWUSR, part_read_dma_histo_show,
+		part_read_histo_clear);
+static DEVICE_ATTR(write_request_histo, S_IRUGO | S_IWUSR,
+		part_write_request_histo_show, part_write_histo_clear);
+static DEVICE_ATTR(write_dma_histo, S_IRUGO | S_IWUSR,
+		part_write_dma_histo_show, part_write_histo_clear);
+#endif
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -902,6 +912,12 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
+#ifdef CONFIG_BLOCK_HISTOGRAM
+	&dev_attr_read_request_histo.attr,
+	&dev_attr_read_dma_histo.attr,
+	&dev_attr_write_request_histo.attr,
+	&dev_attr_write_dma_histo.attr,
+#endif
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -1286,3 +1302,257 @@ int invalidate_partition(struct gendisk *disk, int partno)
 }
 
 EXPORT_SYMBOL(invalidate_partition);
+
+#ifdef	CONFIG_BLOCK_HISTOGRAM
+/*
+ * Zero the read (direction == READ) or write histogram of a single per-cpu
+ * disk_stats. Callers bracket this with part_stat_lock()/part_stat_unlock().
+ */
+static inline void __block_part_histogram_reset(struct disk_stats *stats,
+						int direction)
+{
+	if (direction != READ)
+		memset(&stats->wr_histo, 0, sizeof(stats->wr_histo));
+	else
+		memset(&stats->rd_histo, 0, sizeof(stats->rd_histo));
+}
+
+/*
+ * Clear the I/O histogram for a given partition on every possible CPU.
+ */
+static void block_part_histogram_reset(struct hd_struct *part, int direction)
+{
+#ifdef	CONFIG_SMP
+	int i;
+
+	part_stat_lock();
+	/* for_each_possible_cpu() only visits possible CPUs; no extra check. */
+	for_each_possible_cpu(i)
+		__block_part_histogram_reset(per_cpu_ptr(part->dkstats, i),
+						direction);
+#else
+	part_stat_lock();
+	/* part is a pointer, so the stats member needs '->', not '.'. */
+	__block_part_histogram_reset(&part->dkstats, direction);
+#endif
+	part_stat_unlock();
+}
+
+/*
+ * Iterate through all partitions of the disk that contains part and clear
+ * the specified (read/write) histogram. Returns 0, or -ENODEV without a disk.
+ */
+static int block_disk_histogram_reset(struct hd_struct *part, int direction)
+{
+	struct gendisk *disk = part_to_disk(part);
+	struct disk_part_iter iter;
+	struct hd_struct *cur;
+
+	if (disk == NULL)
+		return -ENODEV;
+
+	disk_part_iter_init(&iter, disk, DISK_PITER_INCL_EMPTY_PART0);
+	while ((cur = disk_part_iter_next(&iter)) != NULL)
+		block_part_histogram_reset(cur, direction);
+	disk_part_iter_exit(&iter);
+	return 0;
+}
+
+/*
+ * Map transfer size to histogram bucket. Bucket limits double: 4,8,16,...
+ * sectors. Larger transfers share the last bucket; zero maps to the first.
+ */
+static inline int stats_size_bucket(sector_t sectors)
+{
+	int i;
+	/* Make bucket x capture all IOs <= x; must not wrap when sectors == 0. */
+	if (sectors)
+		--sectors;
+	/* sector_div() copes with either sector_t width; do_div() needs u64. */
+	sector_div(sectors, BASE_HISTO_SIZE);
+	/* Count the significant bits, saturating at the last bucket index. */
+	for (i = 0; sectors > 0 && i < CONFIG_HISTO_SIZE_BUCKETS - 1; ++i)
+		sectors >>= 1;
+	return i;
+}
+
+/*
+ * Map transfer time to histogram bucket. This also uses an exponential
+ * increment, but we like the 1,2,5,10,20,50 progression.
+ * @ms: elapsed time; the callers already pass milliseconds, so no conversion.
+ */
+static inline int stats_time_bucket(int ms)
+{
+	int i;
+	int t = BASE_HISTO_TIME;
+
+	for (i = 0;; t *= 10) {
+		if (++i >= CONFIG_HISTO_TIME_BUCKETS || ms <= t)
+			return i - 1;
+		if (++i >= CONFIG_HISTO_TIME_BUCKETS || ms <= t*2)
+			return i - 1;
+		if (++i >= CONFIG_HISTO_TIME_BUCKETS || ms <= t*5)
+			return i - 1;
+	}
+}
+
+/*
+ * Log I/O completion, update histogram.
+ *
+ * @cpu:	cpu index handed on to part_stat_inc()
+ * @part:	disk device partition
+ * @req:	pointer to request
+ * @req_ms:	time transfer required
+ * @dma_ms:	time dma required
+ */
+static inline void __block_histogram_completion(int cpu, struct hd_struct *part,
+		struct request *req, unsigned int req_ms, unsigned int dma_ms)
+{
+	sector_t sectors = blk_rq_size(req);
+	int size_idx = stats_size_bucket(sectors);
+	int req_time_idx = stats_time_bucket(req_ms);
+	int dma_time_idx = stats_time_bucket(dma_ms);
+
+	/* Reads and writes are counted in separate tables. */
+	if (rq_data_dir(req)) {
+		part_stat_inc(cpu, part,
+			wr_histo[HISTO_REQUEST][size_idx][req_time_idx]);
+		part_stat_inc(cpu, part,
+			wr_histo[HISTO_DMA][size_idx][dma_time_idx]);
+	} else {
+		part_stat_inc(cpu, part,
+			rd_histo[HISTO_REQUEST][size_idx][req_time_idx]);
+		part_stat_inc(cpu, part,
+			rd_histo[HISTO_DMA][size_idx][dma_time_idx]);
+	}
+}
+
+/*
+ * Called after a dma interrupt to log the finished request. Must be called
+ * between part_stat_lock() and part_stat_unlock().
+ * Note that for block devices with queue_depth > 1, the io_elapsed will not be
+ * accurate as it may include time spent in the disk queue due to re-ordering of
+ * requests by the disk.
+ */
+void block_histogram_completion(int cpu, struct hd_struct *part,
+		struct request *req)
+{
+	unsigned long long now = sched_clock();
+	uint64_t rq_elapsed, io_elapsed;
+
+	rq_elapsed = time_after64(now, rq_start_time_ns(req)) ?
+			now - rq_start_time_ns(req) : 0;
+	io_elapsed = time_after64(now, rq_io_start_time_ns(req)) ?
+			now - rq_io_start_time_ns(req) : 0;
+	__block_histogram_completion(cpu, part, req, nsecs_to_msecs(rq_elapsed),
+					nsecs_to_msecs(io_elapsed));
+}
+
+static uint64_t histo_stat_read(struct hd_struct *part, int direction,
+				int i, int j, int k)
+{
+	return (direction != READ) ? part_stat_read(part, wr_histo[i][j][k]) :
+			part_stat_read(part, rd_histo[i][j][k]);
+}
+
+/*
+ * Dumps the specified 'type' of histogram for part into page as a
+ * tab-separated table.
+ *
+ * @part:	partition whose counters are read
+ * @direction:	READ selects rd_histo, anything else wr_histo
+ * @type:	first histogram index, HISTO_REQUEST or HISTO_DMA
+ * @page:	output buffer; treated as PAGE_SIZE bytes long
+ *
+ * Returns the number of bytes stored, not counting the trailing NUL.
+ *
+ * scnprintf() returns what was really stored, so if the table does not
+ * fit in the page (the bucket counts are Kconfig tunables) the output is
+ * truncated. snprintf() would return the untruncated length and push the
+ * cursor past the end of the buffer.
+ */
+static int dump_histo(struct hd_struct *part, int direction, int type,
+			char *page)
+{
+	int i, j, ms, len = 0, size = BASE_HISTO_SIZE * 512;
+	static const int mults[3] = {1, 2, 5};
+
+	/*
+	 * Documentation/filesystems/sysfs.txt strongly discourages the use of
+	 * any kind of fancy formatting here. We *are* emitting an array, so
+	 * there needs to be some amount of formatting.
+	 */
+
+	/* Row header: the limit of each time bucket, in ms */
+	len += scnprintf(page + len, PAGE_SIZE - len, "       ");
+	for (i = 0, ms = BASE_HISTO_TIME; i < CONFIG_HISTO_TIME_BUCKETS;
+		ms *= 10) {
+		for (j = 0; j < 3 && i < CONFIG_HISTO_TIME_BUCKETS; ++j, ++i)
+			len += scnprintf(page + len, PAGE_SIZE - len, "\t%d",
+					ms * mults[j]);
+	}
+	len += scnprintf(page + len, PAGE_SIZE - len, "\n");
+
+	/* Payload: one row per size bucket, labelled with its size in bytes */
+	for (i = 0; i < CONFIG_HISTO_SIZE_BUCKETS; i++, size *= 2) {
+		len += scnprintf(page + len, PAGE_SIZE - len, "%7d", size);
+		for (j = 0; j < CONFIG_HISTO_TIME_BUCKETS; j++) {
+			/* uint64_t may not be unsigned long long; cast for %llu */
+			len += scnprintf(page + len, PAGE_SIZE - len, "\t%llu",
+				(unsigned long long)histo_stat_read(part,
+						direction, type, i, j));
+		}
+		len += scnprintf(page + len, PAGE_SIZE - len, "\n");
+	}
+	return len;
+}
+
+/*
+ * sysfs show() methods, one per channel: {read,write} x {request,dma} histo.
+ */
+ssize_t part_read_request_histo_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return dump_histo(dev_to_part(dev), READ, HISTO_REQUEST, page);
+}
+
+ssize_t part_read_dma_histo_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return dump_histo(dev_to_part(dev), READ, HISTO_DMA, page);
+}
+
+ssize_t part_write_request_histo_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return dump_histo(dev_to_part(dev), WRITE, HISTO_REQUEST, page);
+}
+
+ssize_t part_write_dma_histo_show(struct device *dev,
+			struct device_attribute *attr, char *page)
+{
+	return dump_histo(dev_to_part(dev), WRITE, HISTO_DMA, page);
+}
+
+/*
+ * sysfs store(): any write clears the read histograms of the whole disk.
+ */
+ssize_t part_read_histo_clear(struct device *dev,
+		struct device_attribute *attr, const char *page, size_t count)
+{
+	/* The written bytes are ignored; only the act of writing matters. */
+	int err = block_disk_histogram_reset(dev_to_part(dev), READ);
+	return err ? err : count;
+}
+
+/*
+ * sysfs store(): any write clears the write histograms of the whole disk.
+ */
+ssize_t part_write_histo_clear(struct device *dev,
+		struct device_attribute *attr, const char *page, size_t count)
+{
+	int err = block_disk_histogram_reset(dev_to_part(dev), WRITE);
+	return err ? err : count;
+}
+
+#endif
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e238ab2..e0044d4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -300,6 +300,16 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
 		   NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+#ifdef CONFIG_BLOCK_HISTOGRAM
+static DEVICE_ATTR(read_request_histo, S_IRUGO | S_IWUSR,
+		part_read_request_histo_show, part_read_histo_clear);
+static DEVICE_ATTR(read_dma_histo, S_IRUGO | S_IWUSR, part_read_dma_histo_show,
+		part_read_histo_clear);
+static DEVICE_ATTR(write_request_histo, S_IRUGO | S_IWUSR,
+		part_write_request_histo_show, part_write_histo_clear);
+static DEVICE_ATTR(write_dma_histo, S_IRUGO | S_IWUSR,
+		part_write_dma_histo_show, part_write_histo_clear);
+#endif
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -313,6 +323,12 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
+#ifdef CONFIG_BLOCK_HISTOGRAM
+	&dev_attr_read_request_histo.attr,
+	&dev_attr_read_dma_histo.attr,
+	&dev_attr_write_request_histo.attr,
+	&dev_attr_write_dma_histo.attr,
+#endif
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4cc2cdf..2e5e083 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -195,7 +195,7 @@ struct request {
 
 	struct gendisk *rq_disk;
 	unsigned long start_time;
-#ifdef CONFIG_BLK_CGROUP
+#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLOCK_HISTOGRAM)
 	unsigned long long start_time_ns;
 	unsigned long long io_start_time_ns;    /* when passed to hardware */
 #endif
@@ -1206,7 +1206,7 @@ static inline void put_dev_sector(Sector p)
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 
-#ifdef CONFIG_BLK_CGROUP
+#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLOCK_HISTOGRAM)
 static inline void set_start_time_ns(struct request *req)
 {
 	req->start_time_ns = sched_clock();
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5f2f4c4..7406533 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -65,6 +65,8 @@ enum {
 #include <linux/fs.h>
 #include <linux/workqueue.h>
 
+struct request;
+
 struct partition {
 	unsigned char boot_ind;		/* 0x80 - active */
 	unsigned char head;		/* starting head */
@@ -78,6 +80,13 @@ struct partition {
 	__le32 nr_sects;		/* nr of sectors in partition */
 } __attribute__((packed));
 
+#define	BASE_HISTO_SIZE		4	/* smallest transfer size, sectors */
+#define	BASE_HISTO_TIME		10	/* shortest transfer time, ms */
+
+/* Index into the histo arrays */
+#define HISTO_REQUEST   0
+#define HISTO_DMA       1
+
 struct disk_stats {
 	unsigned long sectors[2];	/* READs and WRITEs */
 	unsigned long ios[2];
@@ -85,6 +94,23 @@ struct disk_stats {
 	unsigned long ticks[2];
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
+#ifdef CONFIG_BLOCK_HISTOGRAM
+	/*
+	 * Implement 2-variable histograms, with transfers tracked by transfer
+	 * size and completion time. The sysfs files are
+	 * /sys/block/DEV/PART/read_request_histo,
+	 * /sys/block/DEV/PART/write_request_histo,
+	 * /sys/block/DEV/PART/read_dma_histo,
+	 * /sys/block/DEV/PART/write_dma_histo and the
+	 * /sys/block/DEV counterparts.
+	 *
+	 * The *request_histo files measure time from when the request is first
+	 * submitted into the request queue.  The *dma_histo files measure time
+	 * from when the request is dispatched from the queue to the device.
+	 */
+	uint64_t rd_histo[2][CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+	uint64_t wr_histo[2][CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+#endif
 };
 	
 struct hd_struct {
@@ -360,6 +386,28 @@ static inline int get_disk_ro(struct gendisk *disk)
 	return disk->part0.policy;
 }
 
+#ifdef	CONFIG_BLOCK_HISTOGRAM
+/* Accounting hook for completed requests; an empty stub when disabled. */
+extern void block_histogram_completion(int cpu, struct hd_struct *part,
+					struct request *req);
+/* sysfs show()/store() handlers used by the disk and partition attributes. */
+extern ssize_t part_read_request_histo_show(struct device *dev,
+			struct device_attribute *attr, char *page);
+extern ssize_t part_read_dma_histo_show(struct device *dev,
+			struct device_attribute *attr, char *page);
+extern ssize_t part_write_request_histo_show(struct device *dev,
+			struct device_attribute *attr, char *page);
+extern ssize_t part_write_dma_histo_show(struct device *dev,
+			struct device_attribute *attr, char *page);
+extern ssize_t part_read_histo_clear(struct device *dev,
+		struct device_attribute *attr, const char *page, size_t count);
+extern ssize_t part_write_histo_clear(struct device *dev,
+		struct device_attribute *attr, const char *page, size_t count);
+#else
+static inline void block_histogram_completion(int cpu, struct hd_struct *part,
+						struct request *req) {}
+#endif
+
 /* drivers/char/random.c */
 extern void add_disk_randomness(struct gendisk *disk);
 extern void rand_initialize_disk(struct gendisk *disk);
diff --git a/include/linux/time.h b/include/linux/time.h
index 6e026e4..fa1b9de 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -42,6 +42,11 @@ extern struct timezone sys_tz;
 
 #define TIME_T_MAX	(time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
 
+static inline unsigned int nsecs_to_msecs(uint64_t ns)
+{
+	return div_u64(ns, NSEC_PER_MSEC);	/* 32-bit safe 64-bit divide */
+}
+
 static inline int timespec_equal(const struct timespec *a,
                                  const struct timespec *b)
 {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ