linux-kernel - Re: [PATCH v4 1/9] md/raid1,raid10: Set the LastDev flag when the configuration changes

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALTww2_XBBBP3NHRjxrxsJ3eqjJ_bB8SmCeHFocun1hQiUedkA@mail.gmail.com>
Date: Sun, 21 Sep 2025 15:54:16 +0800
From: Xiao Ni <xni@...hat.com>
To: Kenta Akagi <k@...l.me>
Cc: Song Liu <song@...nel.org>, Yu Kuai <yukuai3@...wei.com>, 
	Mariusz Tkaczyk <mtkaczyk@...nel.org>, Shaohua Li <shli@...com>, Guoqing Jiang <jgq516@...il.com>, 
	linux-raid@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v4 1/9] md/raid1,raid10: Set the LastDev flag when the
 configuration changes

Hi Kenta

On Mon, Sep 15, 2025 at 11:44 AM Kenta Akagi <k@...l.me> wrote:
>
> Currently, the LastDev flag is set on an rdev that failed a failfast
> metadata write and called md_error, but did not become Faulty. It is
> cleared when the metadata write retry succeeds. This has problems for
> the following reasons:
>
> * Despite its name, the flag is only set during a metadata write window.
> * Unlike when LastDev and Failfast were introduced, md_error on the last
>   rdev of a RAID1/10 array now sets MD_BROKEN. Thus when LastDev is set,
>   the array is already unwritable.
>
> A following commit will prevent failfast bios from breaking the array,
> which requires knowing from outside the personality whether an rdev is
> the last one. For that purpose, LastDev should be set on rdevs that must
> not be lost.
>
> This commit ensures that LastDev is set on the indispensable rdev in a
> degraded RAID1/10 array.
>
> Signed-off-by: Kenta Akagi <k@...l.me>
> ---
>  drivers/md/md.c     |  4 +---
>  drivers/md/md.h     |  6 +++---
>  drivers/md/raid1.c  | 34 +++++++++++++++++++++++++++++++++-
>  drivers/md/raid10.c | 34 +++++++++++++++++++++++++++++++++-
>  4 files changed, 70 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 4e033c26fdd4..268410b66b83 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -1007,10 +1007,8 @@ static void super_written(struct bio *bio)
>                 if (!test_bit(Faulty, &rdev->flags)
>                     && (bio->bi_opf & MD_FAILFAST)) {
>                         set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
> -                       set_bit(LastDev, &rdev->flags);
>                 }
> -       } else
> -               clear_bit(LastDev, &rdev->flags);
> +       }
>
>         bio_put(bio);
>
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 51af29a03079..ec598f9a8381 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -281,9 +281,9 @@ enum flag_bits {
>                                  * It is expects that no bad block log
>                                  * is present.
>                                  */
> -       LastDev,                /* Seems to be the last working dev as
> -                                * it didn't fail, so don't use FailFast
> -                                * any more for metadata
> +       LastDev,                /* This is the last working rdev.
> +                                * so don't use FailFast any more for
> +                                * metadata.
>                                  */
>         CollisionCheck,         /*
>                                  * check if there is collision between raid1
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index bf44878ec640..32ad6b102ff7 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1733,6 +1733,33 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
>         seq_printf(seq, "]");
>  }
>
> +/**
> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
> + * @conf: pointer to r1conf
> + *
> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
> + * Otherwise, clear it.
> + *
> + * Caller must hold ->device_lock.
> + */
> +static void update_lastdev(struct r1conf *conf)
> +{
> +       int i;
> +       int alive_disks = conf->raid_disks - conf->mddev->degraded;
> +
> +       for (i = 0; i < conf->raid_disks; i++) {
> +               struct md_rdev *rdev = conf->mirrors[i].rdev;
> +
> +               if (rdev) {
> +                       if (test_bit(In_sync, &rdev->flags) &&
> +                           alive_disks == 1)
> +                               set_bit(LastDev, &rdev->flags);
> +                       else
> +                               clear_bit(LastDev, &rdev->flags);
> +               }
> +       }
> +}
> +
>  /**
>   * raid1_error() - RAID1 error handler.
>   * @mddev: affected md device.
> @@ -1767,8 +1794,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
>                 }
>         }
>         set_bit(Blocked, &rdev->flags);
> -       if (test_and_clear_bit(In_sync, &rdev->flags))
> +       if (test_and_clear_bit(In_sync, &rdev->flags)) {
>                 mddev->degraded++;
> +               update_lastdev(conf);
> +       }
>         set_bit(Faulty, &rdev->flags);
>         spin_unlock_irqrestore(&conf->device_lock, flags);
>         /*
> @@ -1864,6 +1893,7 @@ static int raid1_spare_active(struct mddev *mddev)
>                 }
>         }
>         mddev->degraded -= count;
> +       update_lastdev(conf);

update_lastdev is called in raid1_spare_active, raid1_run and
raid1_reshape. Could you explain why this function needs to be called
in these places? Is it because you want to clear the LastDev flag? If
so, are these the right places to do it? As your commit message says,
the flag is currently cleared after a metadata write retry succeeds.
In raid1, wouldn't the right place to clear it be where a read/write
error is fixed successfully?

Best Regards
Xiao

>         spin_unlock_irqrestore(&conf->device_lock, flags);
>
>         print_conf(conf);
> @@ -3290,6 +3320,7 @@ static int raid1_run(struct mddev *mddev)
>         rcu_assign_pointer(conf->thread, NULL);
>         mddev->private = conf;
>         set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
> +       update_lastdev(conf);
>
>         md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
>
> @@ -3427,6 +3458,7 @@ static int raid1_reshape(struct mddev *mddev)
>
>         spin_lock_irqsave(&conf->device_lock, flags);
>         mddev->degraded += (raid_disks - conf->raid_disks);
> +       update_lastdev(conf);
>         spin_unlock_irqrestore(&conf->device_lock, flags);
>         conf->raid_disks = mddev->raid_disks = raid_disks;
>         mddev->delta_disks = 0;
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index b60c30bfb6c7..dc4edd4689f8 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1983,6 +1983,33 @@ static int enough(struct r10conf *conf, int ignore)
>                 _enough(conf, 1, ignore);
>  }
>
> +/**
> + * update_lastdev - Set or clear LastDev flag for all rdevs in array
> + * @conf: pointer to r10conf
> + *
> + * Sets LastDev if the device is In_sync and cannot be lost for the array.
> + * Otherwise, clear it.
> + *
> + * Caller must hold ->reconfig_mutex or ->device_lock.
> + */
> +static void update_lastdev(struct r10conf *conf)
> +{
> +       int i;
> +       int raid_disks = max(conf->geo.raid_disks, conf->prev.raid_disks);
> +
> +       for (i = 0; i < raid_disks; i++) {
> +               struct md_rdev *rdev = conf->mirrors[i].rdev;
> +
> +               if (rdev) {
> +                       if (test_bit(In_sync, &rdev->flags) &&
> +                           !enough(conf, i))
> +                               set_bit(LastDev, &rdev->flags);
> +                       else
> +                               clear_bit(LastDev, &rdev->flags);
> +               }
> +       }
> +}
> +
>  /**
>   * raid10_error() - RAID10 error handler.
>   * @mddev: affected md device.
> @@ -2013,8 +2040,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
>                         return;
>                 }
>         }
> -       if (test_and_clear_bit(In_sync, &rdev->flags))
> +       if (test_and_clear_bit(In_sync, &rdev->flags)) {
>                 mddev->degraded++;
> +               update_lastdev(conf);
> +       }
>
>         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
>         set_bit(Blocked, &rdev->flags);
> @@ -2102,6 +2131,7 @@ static int raid10_spare_active(struct mddev *mddev)
>         }
>         spin_lock_irqsave(&conf->device_lock, flags);
>         mddev->degraded -= count;
> +       update_lastdev(conf);
>         spin_unlock_irqrestore(&conf->device_lock, flags);
>
>         print_conf(conf);
> @@ -4159,6 +4189,7 @@ static int raid10_run(struct mddev *mddev)
>         md_set_array_sectors(mddev, size);
>         mddev->resync_max_sectors = size;
>         set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
> +       update_lastdev(conf);
>
>         if (md_integrity_register(mddev))
>                 goto out_free_conf;
> @@ -4567,6 +4598,7 @@ static int raid10_start_reshape(struct mddev *mddev)
>          */
>         spin_lock_irq(&conf->device_lock);
>         mddev->degraded = calc_degraded(conf);
> +       update_lastdev(conf);
>         spin_unlock_irq(&conf->device_lock);
>         mddev->raid_disks = conf->geo.raid_disks;
>         mddev->reshape_position = conf->reshape_progress;
> --
> 2.50.1
>
>