[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <686d8ef7-cbae-113c-fa48-e0c5dfa2f3d5@huaweicloud.com>
Date: Thu, 27 Apr 2023 18:51:51 +0800
From: Yu Kuai <yukuai1@...weicloud.com>
To: linan666@...weicloud.com, song@...nel.org, neilb@...e.de,
Rob.Becker@...erbed.com
Cc: linux-raid@...r.kernel.org, linux-kernel@...r.kernel.org,
linan122@...wei.com, yi.zhang@...wei.com, houtao1@...wei.com,
yangerkun@...wei.com, "yukuai (C)" <yukuai3@...wei.com>
Subject: Re: [PATCH 3/3] md/raid10: fix wrong setting of max_corr_read_errors
Hi,
在 2023/04/27 16:56, linan666@...weicloud.com 写道:
> From: Li Nan <linan122@...wei.com>
>
> max_corr_read_errors should not be a negative number. Change it to
> unsigned int where it is used.
>
> Fixes: 1e50915fe0bb ("raid: improve MD/raid10 handling of correctable read errors.")
> Signed-off-by: Li Nan <linan122@...wei.com>
> ---
> drivers/md/md.c | 2 +-
> drivers/md/raid10.c | 4 ++--
> 2 files changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index faffbd042925..a365ed122960 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -4484,7 +4484,7 @@ __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_stor
>
> static ssize_t
> max_corrected_read_errors_show(struct mddev *mddev, char *page) {
> - return sprintf(page, "%d\n",
> + return sprintf(page, "%u\n",
> atomic_read(&mddev->max_corr_read_errors));
> }
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 4fcfcb350d2b..28cdb2ae0e91 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -2727,7 +2727,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
> int sect = 0; /* Offset from r10_bio->sector */
> int sectors = r10_bio->sectors;
> struct md_rdev *rdev;
> - int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
> + unsigned int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
This line exceeds 80 columns.
> int d = r10_bio->devs[r10_bio->read_slot].devnum;
>
> /* still own a reference to this rdev, so it cannot
> @@ -2743,7 +2743,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
> check_decay_read_errors(mddev, rdev);
> atomic_inc(&rdev->read_errors);
> if (atomic_read(&rdev->read_errors) > max_read_errors) {
> - pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
> + pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %u:max %u]\n",
> mdname(mddev), rdev->bdev,
> atomic_read(&rdev->read_errors), max_read_errors);
> pr_notice("md/raid10:%s: %pg: Failing raid device\n",
>
This is not critical, but I think it's better to do some cleanup: fold the
above code into check_decay_read_errors(), and rename it to
check_read_errors():
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7135cfaf75db..633aabfea452 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2636,18 +2636,17 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* since the last recorded read error.
*
*/
-static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
+static bool check_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
- long cur_time_mon;
+ time64_t cur_time_mon = ktime_get_seconds();
unsigned long hours_since_last;
- unsigned int read_errors = atomic_read(&rdev->read_errors);
-
- cur_time_mon = ktime_get_seconds();
+ unsigned int read_errors;
+ unsigned int max_read_errors;
if (rdev->last_read_error == 0) {
/* first time we've seen a read error */
rdev->last_read_error = cur_time_mon;
- return;
+ goto increase;
}
hours_since_last = (long)(cur_time_mon -
@@ -2660,10 +2659,26 @@ static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
* just set read errors to 0. We do this to avoid
* overflowing the shift of read_errors by hours_since_last.
*/
+ read_errors = atomic_read(&rdev->read_errors);
if (hours_since_last >= 8 * sizeof(read_errors))
atomic_set(&rdev->read_errors, 0);
else
atomic_set(&rdev->read_errors, read_errors >>
hours_since_last);
+
+increase:
+ max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+ read_errors = atomic_inc_return(&rdev->read_errors);
+ if (read_errors > max_read_errors) {
+ pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %u:max %u]\n",
+ mdname(mddev), rdev->bdev,
+ read_errors, max_read_errors);
+ pr_notice("md/raid10:%s: %pg: Failing raid device\n",
+ mdname(mddev), rdev->bdev);
+ md_error(mddev, rdev);
+ return true;
+ }
+
+ return false;
}
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2703,7 +2718,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
int sect = 0; /* Offset from r10_bio->sector */
int sectors = r10_bio->sectors;
struct md_rdev *rdev;
- int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
/* still own a reference to this rdev, so it cannot
@@ -2716,15 +2730,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
more fix_read_error() attempts */
return;
- check_decay_read_errors(mddev, rdev);
- atomic_inc(&rdev->read_errors);
- if (atomic_read(&rdev->read_errors) > max_read_errors) {
- pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
- mdname(mddev), rdev->bdev,
- atomic_read(&rdev->read_errors), max_read_errors);
- pr_notice("md/raid10:%s: %pg: Failing raid device\n",
- mdname(mddev), rdev->bdev);
- md_error(mddev, rdev);
+ if (check_read_errors(mddev, rdev)) {
r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
return;
}
Thanks,
Kuai
Powered by blists - more mailing lists