linux-kernel - Re: [PATCH v3 3/3] md/raid5: check for overlapping bad blocks before starting reshape

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <a9119707-4fa1-58cb-377c-63df67f88957@huaweicloud.com>
Date: Wed, 5 Mar 2025 14:36:58 +0800
From: Yu Kuai <yukuai1@...weicloud.com>
To: Doug V Johnson <dougvj@...gvj.net>
Cc: Doug Johnson <dougvj@...il.com>, Song Liu <song@...nel.org>,
 "open list:SOFTWARE RAID (Multiple Disks) SUPPORT"
 <linux-raid@...r.kernel.org>, open list <linux-kernel@...r.kernel.org>,
 "yukuai (C)" <yukuai3@...wei.com>
Subject: Re: [PATCH v3 3/3] md/raid5: check for overlapping bad blocks before
 starting reshape

Hi,

在 2025/02/24 17:02, Doug V Johnson 写道:
> In addition to halting a reshape in progress when we encounter bad
> blocks, we want to make sure that we do not even attempt a reshape if we
> know beforehand that there are too many overlapping bad blocks and we
> would have to stall the reshape.
> 
> To do this, we add a new internal function array_has_badblock() which
> first checks to see if there are enough drives with bad blocks for the
> condition to occur and, if there are, proceeds to do a simple O(n^2) check
> for overlapping bad blocks. If more overlaps are found than can be
> corrected for, we return 1 for the presence of bad blocks, otherwise 0.
> 
> This function is invoked in raid5_start_reshape() and if there are bad
> blocks present, returns -EIO which is reported to userspace.
> 
> It's possible for bad blocks to be discovered or put in the metadata
> after a reshape has started, so we want to leave in place the
> functionality to detect and halt a reshape.
> 
> Signed-off-by: Doug V Johnson <dougvj@...gvj.net>
> ---
>   drivers/md/raid5.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 94 insertions(+)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 8b23109d6f37..4b907a674dd1 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -8451,6 +8451,94 @@ static int check_reshape(struct mddev *mddev)
>   				     + mddev->delta_disks));
>   }
>   
> +static int array_has_badblock(struct r5conf *conf)
> +{
> +	/* Searches for overlapping bad blocks on devices that would result
> +	 * in an unreadable condition
> +	 */
> +	int i, j;
> +	/* First see if we even have bad blocks on enough drives to have a
> +	 * bad read condition
> +	 */
> +	int num_badblock_devs = 0;
> +
> +	for (i = 0; i < conf->raid_disks; i++) {
> +		if (rdev_has_badblock(conf->disks[i].rdev,
> +				      0, conf->disks[i].rdev->sectors))
		if (rdev->badblocks.count)

> +			num_badblock_devs++;
> +	}
> +	if (num_badblock_devs <= conf->max_degraded) {
> +		/* There are not enough devices with bad blocks to pose any
> +		 * read problem
> +		 */
> +		return 0;
> +	}
> +	pr_debug("%s: running overlapping bad block check",
> +		 mdname(conf->mddev));
> +	/* Do a more sophisticated check for overlapping regions */
> +	for (i = 0; i < conf->raid_disks; i++) {
> +		sector_t first_bad;
> +		int bad_sectors;
> +		sector_t next_check_s = 0;
> +		int next_check_sectors = conf->disks[i].rdev->sectors;
> +
> +		pr_debug("%s: badblock check: %i (s: %lu, sec: %i)",
> +			 mdname(conf->mddev), i,
> +			 (unsigned long)next_check_s, next_check_sectors);
> +		while (is_badblock(conf->disks[i].rdev,
> +				   next_check_s, next_check_sectors,
> +				   &first_bad,
> +				   &bad_sectors) != 0) {
> +			/* Align bad blocks to the size of our stripe */
> +			sector_t aligned_first_bad = first_bad &
> +				~((sector_t)RAID5_STRIPE_SECTORS(conf) - 1);
> +			int aligned_bad_sectors =
> +				max_t(int, RAID5_STRIPE_SECTORS(conf),
> +				      bad_sectors);
> +			int this_num_bad = 1;
For example, if first_bad is 0, bad_sectors is 512 in rdev0

> +
> +			pr_debug("%s: found blocks %i %lu -> %i",
> +				 mdname(conf->mddev), i,
> +				 (unsigned long)aligned_first_bad,
> +				 aligned_bad_sectors);
> +			for (j = 0; j < conf->raid_disks; j++) {
> +				sector_t this_first_bad;
> +				int this_bad_sectors;
> +
> +				if (j == i)
> +					continue;
> +				if (is_badblock(conf->disks[j].rdev,
> +						aligned_first_bad,
> +						aligned_bad_sectors,
> +						&this_first_bad,
> +						&this_bad_sectors)) {
And rdev1 has badblocks 0+256, rdev2 has badblocks 256+256.

If this array is a raid6 with max_degraded=2, then it's fine.

Perhaps a pseudocode loop like following?

  sector_t offset = 0;
  while (offset < dev_sectors) {
          len = dev_sectors - offset;
          bad_disks = 0;
          for (i = 0; i < conf->raid_disks; ++i) {
                  if (is_badblock(rdev, offset, len, &first_bad,
                                  &bad_sectors)) {
                          if (first_bad <= offset) {
                                  len = min(len, first_bad + bad_sectors -
                                            offset);
                                  bad_disks++;
                          } else {
                                  len = min(len, first_bad - offset);
                          }
                  }
          }

          if (bad_disks > max_degraded)
                  return false;

          offset += len;
  }

  return true;

Thanks,
Kuai

> +					this_num_bad++;
> +					pr_debug("md/raid:%s: bad block overlap dev %i: %lu %i",
> +						 mdname(conf->mddev), j,
> +						 (unsigned long)this_first_bad,
> +						 this_bad_sectors);
> +				}
> +			}
> +			if (this_num_bad > conf->max_degraded) {
> +				pr_debug("md/raid:%s: %i drives with unreadable sector(s) around %lu %i due to bad block list",
> +					 mdname(conf->mddev),
> +					 this_num_bad,
> +					 (unsigned long)first_bad,
> +					 bad_sectors);
> +				return 1;
> +			}
> +			next_check_s = first_bad + bad_sectors;
> +			next_check_sectors =
> +				next_check_sectors - (first_bad + bad_sectors);
> +			pr_debug("%s: badblock check: %i (s: %lu, sec: %i)",
> +				 mdname(conf->mddev), i,
> +				 (unsigned long)next_check_s,
> +				 next_check_sectors);
> +		}
> +	}
> +	return 0;
> +}
> +
>   static int raid5_start_reshape(struct mddev *mddev)
>   {
>   	struct r5conf *conf = mddev->private;
> @@ -8498,6 +8586,12 @@ static int raid5_start_reshape(struct mddev *mddev)
>   		return -EINVAL;
>   	}
>   
> +	if (array_has_badblock(conf)) {
> +		pr_warn("md/raid:%s: reshape not possible due to bad block list",
> +			mdname(mddev));
> +		return -EIO;
> +	}
> +
>   	atomic_set(&conf->reshape_stripes, 0);
>   	spin_lock_irq(&conf->device_lock);
>   	write_seqcount_begin(&conf->gen_lock);
>