[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <fcda2351-9ba7-4121-a993-184a4c02f9a6@kernel.org>
Date: Fri, 10 May 2024 16:11:10 +0900
From: Damien Le Moal <dlemoal@...nel.org>
To: John Meneghini <jmeneghi@...hat.com>, tj@...nel.org,
josef@...icpanda.com, axboe@...nel.dk, kbusch@...nel.org, hch@....de,
sagi@...mberg.me, emilne@...hat.com, hare@...nel.org
Cc: linux-block@...r.kernel.org, cgroups@...r.kernel.org,
linux-nvme@...ts.infradead.org, linux-kernel@...r.kernel.org,
jrani@...estorage.com, randyj@...estorage.com, aviv.coro@....com
Subject: Re: [PATCH v3 1/3] block: track per-node I/O latency
On 5/10/24 05:43, John Meneghini wrote:
> From: Hannes Reinecke <hare@...nel.org>
>
> Add a new option 'BLK_NODE_LATENCY' to track per-node I/O latency.
> This can be used by I/O schedulers to determine the 'best' queue
> to send I/O to.
>
> Signed-off-by: Hannes Reinecke <hare@...nel.org>
>
> Cleaned up checkpatch warnings and updated MAINTAINERS.
This note should be placed before Hannes' SoB. E.g.:
[John] Fixed checkpatch warnings and updated MAINTAINERS.
>
> Signed-off-by: John Meneghini <jmeneghi@...hat.com>
> ---
> MAINTAINERS | 1 +
> block/Kconfig | 9 +
> block/Makefile | 1 +
> block/blk-mq-debugfs.c | 2 +
> block/blk-nlatency.c | 389 +++++++++++++++++++++++++++++++++++++++++
> block/blk-rq-qos.h | 6 +
> include/linux/blk-mq.h | 11 ++
> 7 files changed, 419 insertions(+)
> create mode 100644 block/blk-nlatency.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 7c121493f43d..a4634365c82f 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -5405,6 +5405,7 @@ F: block/bfq-cgroup.c
> F: block/blk-cgroup.c
> F: block/blk-iocost.c
> F: block/blk-iolatency.c
> +F: block/blk-nlatency.c
> F: block/blk-throttle.c
> F: include/linux/blk-cgroup.h
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 1de4682d48cc..641ed39d609c 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -186,6 +186,15 @@ config BLK_CGROUP_IOPRIO
> scheduler and block devices process requests. Only some I/O schedulers
> and some block devices support I/O priorities.
>
> +config BLK_NODE_LATENCY
> + bool "Track per-node I/O latency"
> + help
> + Enable per-node I/O latency tracking for multipathing. This uses the
> + blk-nodelat latency tracker to provide latencies for each node, and schedules
> + I/O on the path with the least latency for the submitting node. This can be
> + used by I/O schedulers to determine the node with the least latency. Currently
> + only supports nvme over fabrics devices.
> +
> config BLK_DEBUG_FS
> bool "Block layer debugging information in debugfs"
> default y
> diff --git a/block/Makefile b/block/Makefile
> index 46ada9dc8bbf..9d2e71a3e36f 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
> obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
> obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
> obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
> +obj-$(CONFIG_BLK_NODE_LATENCY) += blk-nlatency.o
Let's keep the alignment please.
> obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
> obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
> bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 94668e72ab09..cb38228b95d8 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -762,6 +762,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
> return "latency";
> case RQ_QOS_COST:
> return "cost";
> + case RQ_QOS_NLAT:
> + return "node-latency";
> }
> return "unknown";
> }
> diff --git a/block/blk-nlatency.c b/block/blk-nlatency.c
> new file mode 100644
> index 000000000000..219c3f636d76
> --- /dev/null
> +++ b/block/blk-nlatency.c
> @@ -0,0 +1,389 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Per-node request latency tracking.
> + *
> + * Copyright (C) 2023 Hannes Reinecke
> + *
> + * A simple per-node latency tracker for use by I/O scheduler.
> + * Latencies are measures over 'win_usec' microseconds and stored per node.
> + * If the number of measurements falls below 'lowat' the measurement is
> + * assumed to be unreliable and will become 'stale'.
> + * These 'stale' latencies can be 'decayed', where during each measurement
> + * interval the 'stale' latency value is decreased by 'decay' percent.
> + * Once the 'stale' latency reaches zero it will be updated by the
> + * measured latency.
> + */
> +#include <linux/kernel.h>
> +#include <linux/blk_types.h>
> +#include <linux/slab.h>
> +
> +#include "blk-stat.h"
> +#include "blk-rq-qos.h"
> +#include "blk.h"
> +
> +#define NLAT_DEFAULT_LOWAT 2
> +#define NLAT_DEFAULT_DECAY 50
> +
> +struct rq_nlat {
> + struct rq_qos rqos;
> +
> + u64 win_usec; /* latency measurement window in microseconds */
Using microseconds forces you to do costly multiplications and divisions by
1000. Why not keep things in nanoseconds?
> + unsigned int lowat; /* Low Watermark latency measurement */
> + unsigned int decay; /* Percentage for 'decaying' latencies */
> + bool enabled;
> +
> + struct blk_stat_callback *cb;
> +
> + unsigned int num;
> + u64 *latency;
> + unsigned int *samples;
> +};
> +
> +static inline struct rq_nlat *RQNLAT(struct rq_qos *rqos)
> +{
> + return container_of(rqos, struct rq_nlat, rqos);
> +}
> +
> +static u64 nlat_default_latency_usec(struct request_queue *q)
> +{
> + /*
> + * We default to 2msec for non-rotational storage, and 75msec
> + * for rotational storage.
> + */
> + if (blk_queue_nonrot(q))
> + return 2000ULL;
> + else
No need for this else.
> + return 75000ULL;
> +}
> +
> +static void nlat_timer_fn(struct blk_stat_callback *cb)
> +{
> + struct rq_nlat *nlat = cb->data;
> + int n;
> +
> + for (n = 0; n < cb->buckets; n++) {
> + if (cb->stat[n].nr_samples < nlat->lowat) {
> + /*
> + * 'decay' the latency by the specified
> + * percentage to ensure the queues are
> + * being tested to balance out temporary
> + * latency spikes.
> + */
> + nlat->latency[n] =
> + div64_u64(nlat->latency[n] * nlat->decay, 100);
> + } else
> + nlat->latency[n] = cb->stat[n].mean;
Missing the curly brackets around the else block.
Nit: n is a rather unusual name for a loop index. Why not the usual "i"? Does
not matter much though.
> + nlat->samples[n] = cb->stat[n].nr_samples;
> + }
> + if (nlat->enabled)
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +}
> +
> +static int nlat_bucket_node(const struct request *rq)
> +{
> + if (!rq->mq_ctx)
> + return -1;
> + return cpu_to_node(blk_mq_rq_cpu((struct request *)rq));
> +}
> +
> +static void nlat_exit(struct rq_qos *rqos)
> +{
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + blk_stat_remove_callback(nlat->rqos.disk->queue, nlat->cb);
> + blk_stat_free_callback(nlat->cb);
> + kfree(nlat->samples);
> + kfree(nlat->latency);
> + kfree(nlat);
> +}
> +
> +#ifdef CONFIG_BLK_DEBUG_FS
> +static int nlat_win_usec_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + seq_printf(m, "%llu\n", nlat->win_usec);
> + return 0;
> +}
> +
> +static ssize_t nlat_win_usec_write(void *data, const char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + char val[16] = { };
> + u64 usec;
> + int err;
> +
> + if (blk_queue_dying(nlat->rqos.disk->queue))
> + return -ENOENT;
> +
> + if (count >= sizeof(val))
> + return -EINVAL;
> +
> + if (copy_from_user(val, buf, count))
> + return -EFAULT;
> +
> + err = kstrtoull(val, 10, &usec);
> + if (err)
> + return err;
> + blk_stat_deactivate(nlat->cb);
> + nlat->win_usec = usec;
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +
> + return count;
> +}
> +
> +static int nlat_lowat_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + seq_printf(m, "%u\n", nlat->lowat);
> + return 0;
> +}
> +
> +static ssize_t nlat_lowat_write(void *data, const char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + char val[16] = { };
> + unsigned int lowat;
> + int err;
> +
> + if (blk_queue_dying(nlat->rqos.disk->queue))
> + return -ENOENT;
> +
> + if (count >= sizeof(val))
> + return -EINVAL;
> +
> + if (copy_from_user(val, buf, count))
> + return -EFAULT;
> +
> + err = kstrtouint(val, 10, &lowat);
> + if (err)
> + return err;
> + blk_stat_deactivate(nlat->cb);
> + nlat->lowat = lowat;
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +
> + return count;
> +}
> +
> +static int nlat_decay_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + seq_printf(m, "%u\n", nlat->decay);
> + return 0;
> +}
> +
> +static ssize_t nlat_decay_write(void *data, const char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + char val[16] = { };
> + unsigned int decay;
> + int err;
> +
> + if (blk_queue_dying(nlat->rqos.disk->queue))
> + return -ENOENT;
> +
> + if (count >= sizeof(val))
> + return -EINVAL;
> +
> + if (copy_from_user(val, buf, count))
> + return -EFAULT;
> +
> + err = kstrtouint(val, 10, &decay);
> + if (err)
> + return err;
> + if (decay > 100)
> + return -EINVAL;
> + blk_stat_deactivate(nlat->cb);
> + nlat->decay = decay;
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +
> + return count;
> +}
> +
> +static int nlat_enabled_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> +
> + seq_printf(m, "%d\n", nlat->enabled);
> + return 0;
> +}
> +
> +static int nlat_id_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> +
> + seq_printf(m, "%u\n", rqos->id);
> + return 0;
> +}
> +
> +static int nlat_latency_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + int n;
> +
> + if (!nlat->enabled)
> + return 0;
> +
> + for (n = 0; n < nlat->num; n++) {
> + if (n > 0)
> + seq_puts(m, " ");
> + seq_printf(m, "%llu", nlat->latency[n]);
> + }
> + seq_puts(m, "\n");
> + return 0;
> +}
> +
> +static int nlat_samples_show(void *data, struct seq_file *m)
> +{
> + struct rq_qos *rqos = data;
> + struct rq_nlat *nlat = RQNLAT(rqos);
> + int n;
> +
> + if (!nlat->enabled)
> + return 0;
> +
> + for (n = 0; n < nlat->num; n++) {
> + if (n > 0)
> + seq_puts(m, " ");
> + seq_printf(m, "%u", nlat->samples[n]);
> + }
> + seq_puts(m, "\n");
> + return 0;
> +}
> +
> +static const struct blk_mq_debugfs_attr nlat_debugfs_attrs[] = {
> + {"win_usec", 0600, nlat_win_usec_show, nlat_win_usec_write},
> + {"lowat", 0600, nlat_lowat_show, nlat_lowat_write},
> + {"decay", 0600, nlat_decay_show, nlat_decay_write},
> + {"enabled", 0400, nlat_enabled_show},
> + {"id", 0400, nlat_id_show},
> + {"latency", 0400, nlat_latency_show},
> + {"samples", 0400, nlat_samples_show},
> + {},
> +};
> +#endif
> +
> +static const struct rq_qos_ops nlat_rqos_ops = {
> + .exit = nlat_exit,
> +#ifdef CONFIG_BLK_DEBUG_FS
> + .debugfs_attrs = nlat_debugfs_attrs,
> +#endif
> +};
> +
> +u64 blk_nlat_latency(struct gendisk *disk, int node)
> +{
> + struct rq_qos *rqos;
> + struct rq_nlat *nlat;
> +
> + rqos = nlat_rq_qos(disk->queue);
> + if (!rqos)
> + return 0;
> + nlat = RQNLAT(rqos);
> + if (node > nlat->num)
> + return 0;
> +
> + return div64_u64(nlat->latency[node], 1000);
See the comment at the top. Why not keep everything in nanoseconds to avoid this
costly division?
> +}
> +EXPORT_SYMBOL_GPL(blk_nlat_latency);
> +
> +int blk_nlat_enable(struct gendisk *disk)
> +{
> + struct rq_qos *rqos;
> + struct rq_nlat *nlat;
> +
> + /* Latency tracking not enabled? */
> + rqos = nlat_rq_qos(disk->queue);
> + if (!rqos)
> + return -EINVAL;
> + nlat = RQNLAT(rqos);
> + if (nlat->enabled)
> + return 0;
> +
> + /* Queue not registered? Maybe shutting down... */
> + if (!blk_queue_registered(disk->queue))
> + return -EAGAIN;
> +
> + nlat->enabled = true;
> + memset(nlat->latency, 0, sizeof(u64) * nlat->num);
> + memset(nlat->samples, 0, sizeof(unsigned int) * nlat->num);
> + blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(blk_nlat_enable);
> +
> +void blk_nlat_disable(struct gendisk *disk)
> +{
> + struct rq_qos *rqos = nlat_rq_qos(disk->queue);
> + struct rq_nlat *nlat;
> +
> + if (!rqos)
> + return;
> + nlat = RQNLAT(rqos);
> + if (nlat->enabled) {
> + blk_stat_deactivate(nlat->cb);
> + nlat->enabled = false;
> + }
> +}
> +EXPORT_SYMBOL_GPL(blk_nlat_disable);
> +
> +int blk_nlat_init(struct gendisk *disk)
> +{
> + struct rq_nlat *nlat;
> + int ret = -ENOMEM;
> +
> + nlat = kzalloc(sizeof(*nlat), GFP_KERNEL);
> + if (!nlat)
> + return -ENOMEM;
> +
> + nlat->num = num_possible_nodes();
> + nlat->lowat = NLAT_DEFAULT_LOWAT;
> + nlat->decay = NLAT_DEFAULT_DECAY;
> + nlat->win_usec = nlat_default_latency_usec(disk->queue);
> +
> + nlat->latency = kcalloc(nlat->num, sizeof(u64), GFP_KERNEL);
> + if (!nlat->latency)
> + goto err_free;
> + nlat->samples = kcalloc(nlat->num, sizeof(unsigned int), GFP_KERNEL);
> + if (!nlat->samples)
> + goto err_free;
> + nlat->cb = blk_stat_alloc_callback(nlat_timer_fn, nlat_bucket_node,
> + nlat->num, nlat);
> + if (!nlat->cb)
> + goto err_free;
> +
> + /*
> + * Assign rwb and add the stats callback.
> + */
This can be a single line comment.
> + mutex_lock(&disk->queue->rq_qos_mutex);
> + ret = rq_qos_add(&nlat->rqos, disk, RQ_QOS_NLAT, &nlat_rqos_ops);
> + mutex_unlock(&disk->queue->rq_qos_mutex);
> + if (ret)
> + goto err_free_cb;
> +
> + blk_stat_add_callback(disk->queue, nlat->cb);
> +
> + return 0;
> +
> +err_free_cb:
> + blk_stat_free_callback(nlat->cb);
> +err_free:
> + kfree(nlat->samples);
> + kfree(nlat->latency);
> + kfree(nlat);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(blk_nlat_init);
> diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
> index 37245c97ee61..2fc11ced0c00 100644
> --- a/block/blk-rq-qos.h
> +++ b/block/blk-rq-qos.h
> @@ -17,6 +17,7 @@ enum rq_qos_id {
> RQ_QOS_WBT,
> RQ_QOS_LATENCY,
> RQ_QOS_COST,
> + RQ_QOS_NLAT,
> };
>
> struct rq_wait {
> @@ -79,6 +80,11 @@ static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
> return rq_qos_id(q, RQ_QOS_LATENCY);
> }
>
> +static inline struct rq_qos *nlat_rq_qos(struct request_queue *q)
> +{
> + return rq_qos_id(q, RQ_QOS_NLAT);
> +}
> +
> static inline void rq_wait_init(struct rq_wait *rq_wait)
> {
> atomic_set(&rq_wait->inflight, 0);
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index d3d8fd8e229b..1f3829627f1b 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -1231,4 +1231,15 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
> }
> #endif /* CONFIG_BLK_DEV_ZONED */
>
> +#ifdef CONFIG_BLK_NODE_LATENCY
> +int blk_nlat_enable(struct gendisk *disk);
> +void blk_nlat_disable(struct gendisk *disk);
> +u64 blk_nlat_latency(struct gendisk *disk, int node);
> +int blk_nlat_init(struct gendisk *disk);
> +#else
> +static inline int blk_nlat_enable(struct gendisk *disk) { return 0; }
> +static inline void blk_nlat_disable(struct gendisk *disk) {}
> +static inline u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
> +static inline int blk_nlat_init(struct gendisk *disk) { return -EOPNOTSUPP; }
> +#endif
> #endif /* BLK_MQ_H */
--
Damien Le Moal
Western Digital Research
Powered by blists - more mailing lists