lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALTww2_JMiwp=QMuRTFDXQkuqMivR=k7yyw1CAMDxSrJX_WUvg@mail.gmail.com>
Date: Tue, 22 Apr 2025 14:15:57 +0800
From: Xiao Ni <xni@...hat.com>
To: Yu Kuai <yukuai1@...weicloud.com>
Cc: axboe@...nel.dk, agk@...hat.com, snitzer@...nel.org, mpatocka@...hat.com, 
	song@...nel.org, yukuai3@...wei.com, viro@...iv.linux.org.uk, 
	akpm@...ux-foundation.org, nadav.amit@...il.com, ubizjak@...il.com, 
	cl@...ux.com, linux-block@...r.kernel.org, linux-kernel@...r.kernel.org, 
	dm-devel@...ts.linux.dev, linux-raid@...r.kernel.org, yi.zhang@...wei.com, 
	yangerkun@...wei.com, johnny.chenyi@...wei.com
Subject: Re: [PATCH v2 3/5] md: add a new api sync_io_depth

On Fri, Apr 18, 2025 at 9:17 AM Yu Kuai <yukuai1@...weicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@...wei.com>
>
> Currently if sync speed is above speed_min and below speed_max,
> md_do_sync() will wait for all sync IOs to be done before issuing new
> sync IO, means sync IO depth is limited to just 1.
>
> This limit is too low, in order to prevent sync speed drop conspicuously
> after fixing is_mddev_idle() in the next patch, add a new api for
> limiting sync IO depth, the default value is 32.
>
> Signed-off-by: Yu Kuai <yukuai3@...wei.com>
> ---
>  drivers/md/md.c | 109 +++++++++++++++++++++++++++++++++++++++---------
>  drivers/md/md.h |   1 +
>  2 files changed, 91 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 438e71e45c16..52cadfce7e8d 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
>  /* Default safemode delay: 200 msec */
>  #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
>  /*
> - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
> - * is 1000 KB/sec, so the extra system load does not show up that much.
> - * Increase it if you want to have more _guaranteed_ speed. Note that
> - * the RAID driver will use the maximum available bandwidth if the IO
> - * subsystem is idle. There is also an 'absolute maximum' reconstruction
> - * speed limit - in case reconstruction slows down your system despite
> - * idle IO detection.
> + * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
> + * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
> + * does not show up that much. Increase it if you want to have more guaranteed
> + * speed. Note that the RAID driver will use the maximum bandwidth
> + * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
>   *
> - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
> - * or /sys/block/mdX/md/sync_speed_{min,max}
> + * Background sync IO speed control:
> + *
> + * - below speed min:
> + *   no limit;
> + * - above speed min and below speed max:
> + *   a) if mddev is idle, then no limit;
> + *   b) if mddev is busy handling normal IO, then limit inflight sync IO
> + *   to sync_io_depth;
> + * - above speed max:
> + *   sync IO can't be issued;
> + *
> + * Following configurations can be changed via /proc/sys/dev/raid/ for system
> + * or /sys/block/mdX/md/ for one array.
>   */
> -
>  static int sysctl_speed_limit_min = 1000;
>  static int sysctl_speed_limit_max = 200000;
> -static inline int speed_min(struct mddev *mddev)
> +static int sysctl_sync_io_depth = 32;
> +
> +static int speed_min(struct mddev *mddev)
>  {
>         return mddev->sync_speed_min ?
>                 mddev->sync_speed_min : sysctl_speed_limit_min;
>  }
>
> -static inline int speed_max(struct mddev *mddev)
> +static int speed_max(struct mddev *mddev)
>  {
>         return mddev->sync_speed_max ?
>                 mddev->sync_speed_max : sysctl_speed_limit_max;
>  }
>
> +static int sync_io_depth(struct mddev *mddev)
> +{
> +       return mddev->sync_io_depth ?
> +               mddev->sync_io_depth : sysctl_sync_io_depth;
> +}
> +
>  static void rdev_uninit_serial(struct md_rdev *rdev)
>  {
>         if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
> @@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = {
>                 .procname       = "speed_limit_min",
>                 .data           = &sysctl_speed_limit_min,
>                 .maxlen         = sizeof(int),
> -               .mode           = S_IRUGO|S_IWUSR,
> +               .mode           = 0644,
>                 .proc_handler   = proc_dointvec,
>         },
>         {
>                 .procname       = "speed_limit_max",
>                 .data           = &sysctl_speed_limit_max,
>                 .maxlen         = sizeof(int),
> -               .mode           = S_IRUGO|S_IWUSR,
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec,
> +       },
> +       {
> +               .procname       = "sync_io_depth",
> +               .data           = &sysctl_sync_io_depth,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
>                 .proc_handler   = proc_dointvec,
>         },
>  };
> @@ -5091,7 +5114,7 @@ static ssize_t
>  sync_min_show(struct mddev *mddev, char *page)
>  {
>         return sprintf(page, "%d (%s)\n", speed_min(mddev),
> -                      mddev->sync_speed_min ? "local": "system");
> +                      mddev->sync_speed_min ? "local" : "system");
>  }
>
>  static ssize_t
> @@ -5100,7 +5123,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
>         unsigned int min;
>         int rv;
>
> -       if (strncmp(buf, "system", 6)==0) {
> +       if (strncmp(buf, "system", 6) == 0) {
>                 min = 0;
>         } else {
>                 rv = kstrtouint(buf, 10, &min);
> @@ -5120,7 +5143,7 @@ static ssize_t
>  sync_max_show(struct mddev *mddev, char *page)
>  {
>         return sprintf(page, "%d (%s)\n", speed_max(mddev),
> -                      mddev->sync_speed_max ? "local": "system");
> +                      mddev->sync_speed_max ? "local" : "system");
>  }
>
>  static ssize_t
> @@ -5129,7 +5152,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
>         unsigned int max;
>         int rv;
>
> -       if (strncmp(buf, "system", 6)==0) {
> +       if (strncmp(buf, "system", 6) == 0) {
>                 max = 0;
>         } else {
>                 rv = kstrtouint(buf, 10, &max);
> @@ -5145,6 +5168,35 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
>  static struct md_sysfs_entry md_sync_max =
>  __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
>
> +static ssize_t
> +sync_io_depth_show(struct mddev *mddev, char *page)
> +{
> +       return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
> +                      mddev->sync_io_depth ? "local" : "system");
> +}
> +
> +static ssize_t
> +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
> +{
> +       unsigned int max;
> +       int rv;
> +
> +       if (strncmp(buf, "system", 6) == 0) {
> +               max = 0;
> +       } else {
> +               rv = kstrtouint(buf, 10, &max);
> +               if (rv < 0)
> +                       return rv;
> +               if (max == 0)
> +                       return -EINVAL;
> +       }
> +       mddev->sync_io_depth = max;
> +       return len;
> +}
> +
> +static struct md_sysfs_entry md_sync_io_depth =
> +__ATTR_RW(sync_io_depth);
> +
>  static ssize_t
>  degraded_show(struct mddev *mddev, char *page)
>  {
> @@ -5671,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = {
>         &md_mismatches.attr,
>         &md_sync_min.attr,
>         &md_sync_max.attr,
> +       &md_sync_io_depth.attr,
>         &md_sync_speed.attr,
>         &md_sync_force_parallel.attr,
>         &md_sync_completed.attr,
> @@ -8927,6 +8980,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
>         }
>  }
>
> +static bool sync_io_within_limit(struct mddev *mddev)
> +{
> +       int io_sectors;
> +
> +       /*
> +        * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
> +        * RESYNC_PAGES(64k) per IO.
> +        */
> +       if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
> +               io_sectors = 8;
> +       else
> +               io_sectors = 128;
> +
> +       return atomic_read(&mddev->recovery_active) <
> +               io_sectors * sync_io_depth(mddev);
> +}
> +
>  #define SYNC_MARKS     10
>  #define        SYNC_MARK_STEP  (3*HZ)
>  #define UPDATE_FREQUENCY (5*60*HZ)
> @@ -9195,7 +9265,8 @@ void md_do_sync(struct md_thread *thread)
>                                 msleep(500);
>                                 goto repeat;
>                         }
> -                       if (!is_mddev_idle(mddev, 0)) {
> +                       if (!sync_io_within_limit(mddev) &&
> +                           !is_mddev_idle(mddev, 0)) {
>                                 /*
>                                  * Give other IO more of a chance.
>                                  * The faster the devices, the less we wait.
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 9d55b4630077..b57842188f18 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -484,6 +484,7 @@ struct mddev {
>         /* if zero, use the system-wide default */
>         int                             sync_speed_min;
>         int                             sync_speed_max;
> +       int                             sync_io_depth;
>
>         /* resync even though the same disks are shared among md-devices */
>         int                             parallel_resync;
> --
> 2.39.2
>

Looks good to me, reviewed-by: Xiao Ni <xni@...hat.com>


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ