[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250915034210.8533-5-k@mgml.me>
Date: Mon, 15 Sep 2025 12:42:05 +0900
From: Kenta Akagi <k@...l.me>
To: Song Liu <song@...nel.org>, Yu Kuai <yukuai3@...wei.com>,
Mariusz Tkaczyk <mtkaczyk@...nel.org>, Shaohua Li <shli@...com>,
Guoqing Jiang <jgq516@...il.com>
Cc: linux-raid@...r.kernel.org, linux-kernel@...r.kernel.org,
Kenta Akagi <k@...l.me>
Subject: [PATCH v4 4/9] md/raid1,raid10: Don't set MD_BROKEN on failfast bio failure
Failfast is a feature implemented only for RAID1 and RAID10. It instructs
the block device providing the rdev to immediately return a bio error
without retrying if any issue occurs. This allows quickly detaching a
problematic rdev and minimizes IO latency.
Due to its nature, failfast bios can fail easily, and md must not mark
an essential rdev as Faulty or set MD_BROKEN on the array just because
a failfast bio failed.
When failfast was introduced, RAID1 and RAID10 were designed to continue
operating normally even if md_error was called for the last rdev. However,
with the introduction of MD_BROKEN in RAID1/RAID10
in commit 9631abdbf406 ("md: Set MD_BROKEN for RAID1 and RAID10"), calling
md_error for the last rdev now prevents further writes to the array.
Despite this, the current failfast error handler still assumes that
calling md_error will not break the array.
Normally, this is not an issue because MD_FAILFAST is not set when a bio
is issued to the last rdev. However, if the array is not degraded and a
bio with MD_FAILFAST has been issued, simultaneous failures could
potentially break the array. This is unusual but can happen; for example,
this can occur when using NVMe over TCP if all rdevs depend on
a single Ethernet link.
In other words, this becomes a problem under the following conditions:
Preconditions:
* Failfast is enabled on all rdevs.
* All rdevs are In_sync - This is a requirement for a bio to be submitted
with MD_FAILFAST.
* At least one bio has been submitted but has not yet completed.
Trigger condition:
* All underlying devices of the rdevs return an error for their failfast
bios.
Whether the bio is a read or a write makes little difference to the
outcome.
In the write case, md_error is invoked on each rdev through its bi_end_io
handler.
In the read case, losing the first rdev triggers a metadata
update. Next, md_super_write, unlike raid1_write_request, issues the bio
with MD_FAILFAST if the rdev supports it, causing the bio to fail
immediately - Before this patchset, LastDev was set only by the failure
path in super_written. Consequently, super_written calls md_error on the
remaining rdev.
Prior to this commit, the following changes were introduced:
* The helper function md_bio_failure_error() that skips the error handler
if a failfast bio targets the last rdev.
* Serialization of md_error() and md_bio_failure_error().
* Setting the LastDev flag for rdevs that must not be lost.
This commit uses md_bio_failure_error() instead of md_error() for failfast
bio failures, ensuring that failfast bios do not stop array operations.
Fixes: 9631abdbf406 ("md: Set MD_BROKEN for RAID1 and RAID10")
Signed-off-by: Kenta Akagi <k@...l.me>
---
drivers/md/md.c | 5 +----
drivers/md/raid1.c | 37 ++++++++++++++++++-------------------
drivers/md/raid10.c | 9 +++++----
3 files changed, 24 insertions(+), 27 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 65fdd9bae8f4..65814bbe9bad 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1004,11 +1004,8 @@ static void super_written(struct bio *bio)
if (bio->bi_status) {
pr_err("md: %s gets error=%d\n", __func__,
blk_status_to_errno(bio->bi_status));
- md_error(mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags)
- && (bio->bi_opf & MD_FAILFAST)) {
+ if (!md_bio_failure_error(mddev, rdev, bio))
set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
- }
}
bio_put(bio);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 32ad6b102ff7..8fff9dacc6e0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -470,7 +470,7 @@ static void raid1_end_write_request(struct bio *bio)
(bio->bi_opf & MD_FAILFAST) &&
/* We never try FailFast to WriteMostly devices */
!test_bit(WriteMostly, &rdev->flags)) {
- md_error(r1_bio->mddev, rdev);
+ md_bio_failure_error(r1_bio->mddev, rdev, bio);
}
/*
@@ -2178,8 +2178,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
if (test_bit(FailFast, &rdev->flags)) {
/* Don't try recovering from here - just fail it
* ... unless it is the last working device of course */
- md_error(mddev, rdev);
- if (test_bit(Faulty, &rdev->flags))
+ if (md_bio_failure_error(mddev, rdev, bio))
/* Don't try to read from here, but make sure
* put_buf does it's thing
*/
@@ -2657,9 +2656,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
{
struct mddev *mddev = conf->mddev;
- struct bio *bio;
+ struct bio *bio, *updated_bio;
struct md_rdev *rdev;
- sector_t sector;
clear_bit(R1BIO_ReadError, &r1_bio->state);
/* we got a read error. Maybe the drive is bad. Maybe just
@@ -2672,29 +2670,30 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
*/
bio = r1_bio->bios[r1_bio->read_disk];
- bio_put(bio);
- r1_bio->bios[r1_bio->read_disk] = NULL;
+ updated_bio = NULL;
rdev = conf->mirrors[r1_bio->read_disk].rdev;
- if (mddev->ro == 0
- && !test_bit(FailFast, &rdev->flags)) {
- freeze_array(conf, 1);
- fix_read_error(conf, r1_bio);
- unfreeze_array(conf);
- } else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
- md_error(mddev, rdev);
+ if (mddev->ro == 0) {
+ if (!test_bit(FailFast, &rdev->flags)) {
+ freeze_array(conf, 1);
+ fix_read_error(conf, r1_bio);
+ unfreeze_array(conf);
+ } else {
+ md_bio_failure_error(mddev, rdev, bio);
+ }
} else {
- r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
+ updated_bio = IO_BLOCKED;
}
+ bio_put(bio);
+ r1_bio->bios[r1_bio->read_disk] = updated_bio;
+
rdev_dec_pending(rdev, conf->mddev);
- sector = r1_bio->sector;
- bio = r1_bio->master_bio;
/* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
r1_bio->state = 0;
- raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
- allow_barrier(conf, sector);
+ raid1_read_request(mddev, r1_bio->master_bio, r1_bio->sectors, r1_bio);
+ allow_barrier(conf, r1_bio->sector);
}
static void raid1d(struct md_thread *thread)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index dc4edd4689f8..b73af94a88b0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -488,7 +488,7 @@ static void raid10_end_write_request(struct bio *bio)
dec_rdev = 0;
if (test_bit(FailFast, &rdev->flags) &&
(bio->bi_opf & MD_FAILFAST)) {
- md_error(rdev->mddev, rdev);
+ md_bio_failure_error(rdev->mddev, rdev, bio);
}
/*
@@ -2443,7 +2443,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
} else if (test_bit(FailFast, &rdev->flags)) {
/* Just give up on this device */
- md_error(rdev->mddev, rdev);
+ md_bio_failure_error(rdev->mddev, rdev, tbio);
continue;
}
/* Ok, we need to write this bio, either to correct an
@@ -2895,8 +2895,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
freeze_array(conf, 1);
fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf);
- } else
- md_error(mddev, rdev);
+ } else {
+ md_bio_failure_error(mddev, rdev, bio);
+ }
rdev_dec_pending(rdev, mddev);
r10_bio->state = 0;
--
2.50.1
Powered by blists - more mailing lists