[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALTww29xMyNq0SpPGvVqp6YPmCVu+N+d_neeJD_mogiviiZpYw@mail.gmail.com>
Date: Wed, 16 Apr 2025 13:32:10 +0800
From: Xiao Ni <xni@...hat.com>
To: Yu Kuai <yukuai1@...weicloud.com>
Cc: axboe@...nel.dk, song@...nel.org, yukuai3@...wei.com,
linux-block@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-raid@...r.kernel.org, yi.zhang@...wei.com, yangerkun@...wei.com
Subject: Re: [PATCH 2/4] md: add a new api sync_io_depth
On Sat, Apr 12, 2025 at 3:39 PM Yu Kuai <yukuai1@...weicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@...wei.com>
>
> Currently if sync speed is above speed_min and below speed_max,
> md_do_sync() will wait for all sync IOs to be done before issuing new
> sync IO, means sync IO depth is limited to just 1.
>
> This limit is too low, in order to prevent sync speed drop conspicuously
> after fixing is_mddev_idle() in the next patch, add a new api for
> limiting sync IO depth, the default value is 32.
>
> Signed-off-by: Yu Kuai <yukuai3@...wei.com>
> ---
> drivers/md/md.c | 103 +++++++++++++++++++++++++++++++++++++++---------
> drivers/md/md.h | 1 +
> 2 files changed, 85 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 438e71e45c16..8966c4afc62a 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -111,32 +111,42 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
> /* Default safemode delay: 200 msec */
> #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
> /*
> - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
> - * is 1000 KB/sec, so the extra system load does not show up that much.
> - * Increase it if you want to have more _guaranteed_ speed. Note that
> - * the RAID driver will use the maximum available bandwidth if the IO
> - * subsystem is idle. There is also an 'absolute maximum' reconstruction
> - * speed limit - in case reconstruction slows down your system despite
> - * idle IO detection.
These comments are useful: they describe the meaning of these
control values. Would it be better to keep them?
> + * Background sync IO speed control:
> *
> - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
> - * or /sys/block/mdX/md/sync_speed_{min,max}
> + * - below speed min:
> + * no limit;
> + * - above speed min and below speed max:
> + * a) if mddev is idle, then no limit;
> + * b) if mddev is busy handling normal IO, then limit inflight sync IO
> + * to sync_io_depth;
> + * - above speed max:
> + * sync IO can't be issued;
> + *
> + * Following configurations can be changed via /proc/sys/dev/raid/ for system
> + * or /sys/block/mdX/md/ for one array.
> */
> -
> static int sysctl_speed_limit_min = 1000;
> static int sysctl_speed_limit_max = 200000;
> -static inline int speed_min(struct mddev *mddev)
> +static int sysctl_sync_io_depth = 32;
> +
> +static int speed_min(struct mddev *mddev)
> {
> return mddev->sync_speed_min ?
> mddev->sync_speed_min : sysctl_speed_limit_min;
> }
>
> -static inline int speed_max(struct mddev *mddev)
> +static int speed_max(struct mddev *mddev)
> {
> return mddev->sync_speed_max ?
> mddev->sync_speed_max : sysctl_speed_limit_max;
> }
>
> +static int sync_io_depth(struct mddev *mddev)
> +{
> + return mddev->sync_io_depth ?
> + mddev->sync_io_depth : sysctl_sync_io_depth;
> +}
> +
> static void rdev_uninit_serial(struct md_rdev *rdev)
> {
> if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
> @@ -293,14 +303,21 @@ static const struct ctl_table raid_table[] = {
> .procname = "speed_limit_min",
> .data = &sysctl_speed_limit_min,
> .maxlen = sizeof(int),
> - .mode = S_IRUGO|S_IWUSR,
> + .mode = 0644,
Would it be better to use the macro rather than the raw number here?
> .proc_handler = proc_dointvec,
> },
> {
> .procname = "speed_limit_max",
> .data = &sysctl_speed_limit_max,
> .maxlen = sizeof(int),
> - .mode = S_IRUGO|S_IWUSR,
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> + {
> + .procname = "sync_io_depth",
> + .data = &sysctl_sync_io_depth,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> .proc_handler = proc_dointvec,
> },
> };
> @@ -5091,7 +5108,7 @@ static ssize_t
> sync_min_show(struct mddev *mddev, char *page)
> {
> return sprintf(page, "%d (%s)\n", speed_min(mddev),
> - mddev->sync_speed_min ? "local": "system");
> + mddev->sync_speed_min ? "local" : "system");
> }
>
> static ssize_t
> @@ -5100,7 +5117,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
> unsigned int min;
> int rv;
>
> - if (strncmp(buf, "system", 6)==0) {
> + if (strncmp(buf, "system", 6) == 0) {
> min = 0;
> } else {
> rv = kstrtouint(buf, 10, &min);
> @@ -5120,7 +5137,7 @@ static ssize_t
> sync_max_show(struct mddev *mddev, char *page)
> {
> return sprintf(page, "%d (%s)\n", speed_max(mddev),
> - mddev->sync_speed_max ? "local": "system");
> + mddev->sync_speed_max ? "local" : "system");
> }
>
> static ssize_t
> @@ -5129,7 +5146,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
> unsigned int max;
> int rv;
>
> - if (strncmp(buf, "system", 6)==0) {
> + if (strncmp(buf, "system", 6) == 0) {
> max = 0;
> } else {
> rv = kstrtouint(buf, 10, &max);
> @@ -5145,6 +5162,35 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
> static struct md_sysfs_entry md_sync_max =
> __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
>
> +static ssize_t
> +sync_io_depth_show(struct mddev *mddev, char *page)
> +{
> + return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
> + mddev->sync_io_depth ? "local" : "system");
> +}
> +
> +static ssize_t
> +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
> +{
> + unsigned int max;
> + int rv;
> +
> + if (strncmp(buf, "system", 6) == 0) {
> + max = 0;
> + } else {
> + rv = kstrtouint(buf, 10, &max);
> + if (rv < 0)
> + return rv;
> + if (max == 0)
> + return -EINVAL;
> + }
> + mddev->sync_io_depth = max;
> + return len;
> +}
> +
> +static struct md_sysfs_entry md_sync_io_depth =
> +__ATTR_RW(sync_io_depth);
> +
> static ssize_t
> degraded_show(struct mddev *mddev, char *page)
> {
> @@ -5671,6 +5717,7 @@ static struct attribute *md_redundancy_attrs[] = {
> &md_mismatches.attr,
> &md_sync_min.attr,
> &md_sync_max.attr,
> + &md_sync_io_depth.attr,
> &md_sync_speed.attr,
> &md_sync_force_parallel.attr,
> &md_sync_completed.attr,
> @@ -8927,6 +8974,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
> }
> }
>
> +static bool sync_io_within_limit(struct mddev *mddev)
> +{
> + int io_sectors;
> +
> + /*
> + * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
> + * RESYNC_PAGES(64k) per IO.
> + */
> + if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
> + io_sectors = 8;
> + else
> + io_sectors = 128;
> +
> + return atomic_read(&mddev->recovery_active) <
> + io_sectors * sync_io_depth(mddev);
> +}
> +
> #define SYNC_MARKS 10
> #define SYNC_MARK_STEP (3*HZ)
> #define UPDATE_FREQUENCY (5*60*HZ)
> @@ -9195,7 +9259,8 @@ void md_do_sync(struct md_thread *thread)
> msleep(500);
> goto repeat;
> }
> - if (!is_mddev_idle(mddev, 0)) {
> + if (!sync_io_within_limit(mddev) &&
> + !is_mddev_idle(mddev, 0)) {
> /*
> * Give other IO more of a chance.
> * The faster the devices, the less we wait.
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 1cf00a04bcdd..63be622467c6 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -483,6 +483,7 @@ struct mddev {
> /* if zero, use the system-wide default */
> int sync_speed_min;
> int sync_speed_max;
> + int sync_io_depth;
>
> /* resync even though the same disks are shared among md-devices */
> int parallel_resync;
> --
> 2.39.2
>
This part looks good to me.
Acked-by: Xiao Ni <xni@...hat.com>
Powered by blists - more mailing lists