[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20241118114157.355749-6-yukuai1@huaweicloud.com>
Date: Mon, 18 Nov 2024 19:41:57 +0800
From: Yu Kuai <yukuai1@...weicloud.com>
To: song@...nel.org,
xni@...hat.com
Cc: linux-raid@...r.kernel.org,
linux-kernel@...r.kernel.org,
yukuai3@...wei.com,
yukuai1@...weicloud.com,
yi.zhang@...wei.com,
yangerkun@...wei.com
Subject: [PATCH md-6.13 5/5] md/md-bitmap: move bitmap_{start, end}write to md upper layer
From: Yu Kuai <yukuai3@...wei.com>
There are two BUG reports that raid5 will hang at
bitmap_startwrite([1],[2]), root cause is that bitmap start write and end
write is unbalanced, and while reviewing raid5 code, it's found that
bitmap operations can be optimized. For example, for a 4 disks raid5, with
chunksize=8k, if user issue a IO (0 + 48k) to the array:
┌────────────────────────────────────────────────────────────┐
│chunk 0 │
│ ┌────────────┬─────────────┬─────────────┬────────────┼
│ sh0 │A0: 0 + 4k │A1: 8k + 4k │A2: 16k + 4k │A3: P │
│ ┼────────────┼─────────────┼─────────────┼────────────┼
│ sh1 │B0: 4k + 4k │B1: 12k + 4k │B2: 20k + 4k │B3: P │
┼──────┴────────────┴─────────────┴─────────────┴────────────┼
│chunk 1 │
│ ┌────────────┬─────────────┬─────────────┬────────────┤
│ sh2 │C0: 24k + 4k│C1: 32k + 4k │C2: P │C3: 40k + 4k│
│ ┼────────────┼─────────────┼─────────────┼────────────┼
│ sh3 │D0: 28k + 4k│D1: 36k + 4k │D2: P │D3: 44k + 4k│
└──────┴────────────┴─────────────┴─────────────┴────────────┘
Before this patch, 4 stripe head will be used, and each sh will attach
bio for 3 disks, and each attached bio will trigger
bitmap_startwrite() once, which means total 12 times.
- 3 times (0 + 4k), for (A0, A1 and A2)
- 3 times (4 + 4k), for (B0, B1 and B2)
- 3 times (8 + 4k), for (C0, C1 and C3)
- 3 times (12 + 4k), for (D0, D1 and D3)
After this patch, md upper layer will calculate that IO range (0 + 48k)
is corresponding to the bitmap (0 + 16k), and call bitmap_startwrite()
just once.
Noted that this patch will align bitmap ranges to the chunks, for example,
if user issue a IO (0 + 4k) to array:
- Before this patch, 1 time (0 + 4k), for A0;
- After this patch, 1 time (0 + 8k) for chunk 0;
Usually, one bitmap bit will represent more than one disk chunk, and this
doesn't have any difference. And even if user really created a array
that one chunk contain multiple bits, the overhead is that more data
will be recovered after power failure.
[1] https://lore.kernel.org/all/CAJpMwyjmHQLvm6zg1cmQErttNNQPDAAXPKM3xgTjMhbfts986Q@mail.gmail.com/
[2] https://lore.kernel.org/all/ADF7D720-5764-4AF3-B68E-1845988737AA@flyingcircus.io/
Signed-off-by: Yu Kuai <yukuai3@...wei.com>
---
drivers/md/md.c | 29 +++++++++++++++++++++++++++++
drivers/md/md.h | 2 ++
drivers/md/raid1.c | 4 ----
drivers/md/raid10.c | 3 ---
drivers/md/raid5-cache.c | 2 --
drivers/md/raid5.c | 24 +-----------------------
6 files changed, 32 insertions(+), 32 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index bbe002ebd584..ab37c9939ac6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8723,12 +8723,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
+static void md_bitmap_start(struct mddev *mddev,
+ struct md_io_clone *md_io_clone)
+{
+ if (mddev->pers->bitmap_sector)
+ mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
+ &md_io_clone->sectors);
+
+ mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
+}
+
+static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
+{
+ mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
+}
+
static void md_end_clone_io(struct bio *bio)
{
struct md_io_clone *md_io_clone = bio->bi_private;
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
@@ -8753,6 +8773,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);
+ if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
+ md_io_clone->offset = (*bio)->bi_iter.bi_sector;
+ md_io_clone->sectors = bio_sectors(*bio);
+ md_bitmap_start(mddev, md_io_clone);
+ }
+
clone->bi_end_io = md_end_clone_io;
clone->bi_private = md_io_clone;
*bio = clone;
@@ -8771,6 +8797,9 @@ void md_free_cloned_bio(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index de6dadb9a40b..def808064ad8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -831,6 +831,8 @@ struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
+ sector_t offset;
+ unsigned long sectors;
struct bio bio_clone;
};
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b9819f9c8ed2..e2adfeff5ae6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -422,8 +422,6 @@ static void close_write(struct r1bio *r1_bio)
if (test_bit(R1BIO_BehindIO, &r1_bio->state))
mddev->bitmap_ops->end_behind_write(mddev);
- /* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
md_write_end(mddev);
}
@@ -1616,8 +1614,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (test_bit(R1BIO_BehindIO, &r1_bio->state))
mddev->bitmap_ops->start_behind_write(mddev);
- mddev->bitmap_ops->startwrite(mddev, r1_bio->sector,
- r1_bio->sectors);
first_clone = 0;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index efbc0bd92479..79a209236c9f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -428,8 +428,6 @@ static void close_write(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
- /* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
md_write_end(mddev);
}
@@ -1487,7 +1485,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
- mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index ba4f9577c737..011246e16a99 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -313,8 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf));
}
}
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 95caed41654c..976788138a6e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3578,12 +3578,6 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
* is added to a batch, STRIPE_BIT_DELAY cannot be changed
* any more.
*/
- set_bit(STRIPE_BITMAP_PENDING, &sh->state);
- spin_unlock_irq(&sh->stripe_lock);
- conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf));
- spin_lock_irq(&sh->stripe_lock);
- clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
@@ -3638,7 +3632,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
- int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
struct md_rdev *rdev = conf->disks[i].rdev;
@@ -3663,8 +3656,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].towrite = NULL;
sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
- if (bi)
- bitmap_end = 1;
log_stripe_write_finished(sh);
@@ -3679,10 +3670,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bio_io_error(bi);
bi = nextbi;
}
- if (bitmap_end)
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf));
- bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
@@ -3691,7 +3678,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].page = sh->dev[i].orig_page;
}
- if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
@@ -3725,9 +3711,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
}
- if (bitmap_end)
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf));
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -4076,8 +4059,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
bio_endio(wbi);
wbi = wbi2;
}
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf));
+
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -5797,10 +5779,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
- for (d = 0; d < conf->raid_disks - conf->max_degraded;
- d++)
- mddev->bitmap_ops->startwrite(mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf));
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
--
2.39.2
Powered by blists - more mailing lists