lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <e7ba770d-fb9f-a4d8-c1dc-24765dfe55bd@huaweicloud.com>
Date: Thu, 27 Feb 2025 10:48:59 +0800
From: Yu Kuai <yukuai1@...weicloud.com>
To: Paul Menzel <pmenzel@...gen.mpg.de>, Yu Kuai <yukuai1@...weicloud.com>
Cc: song@...nel.org, linux-raid@...r.kernel.org,
 linux-kernel@...r.kernel.org, yi.zhang@...wei.com, yangerkun@...wei.com,
 "yukuai (C)" <yukuai3@...wei.com>
Subject: Re: [PATCH] md/raid1,raid10: don't ignore IO flags

Hi,

在 2025/02/26 16:29, Paul Menzel 写道:
> Dear Kuai,
> 
> 
> Thank you for your patch.
> 
> Am 26.02.25 um 07:30 schrieb Yu Kuai:
>> From: Yu Kuai <yukuai3@...wei.com>
>>
>> If blk-wbt is enabled by default, it's found that raid write performance
>> is quite bad because all IO are throttled by wbt of underlying disks,
>> due to flag REQ_IDLE is ignored. And turns out this behaviour exist since
>> blk-wbt is introduced.
>>
>> Other than REQ_IDLE, other flags should not be ignored as well, for
>> example REQ_META can be set for filesystems, clear it can cause priority
> 
> clear*ing*
> 
>> reverse problems; And REQ_NOWAIT should not be cleared as well, because
> 
> … problems. REQ…NOWAIT …
> 
>> io will wait instead of fail directly in underlying disks.
> 
> fail*ing*
> 
>> Fix those problems by keeping IO flags from master bio.
> 
> Add a Fixes: tag?

I didn't find an appropriate tag yet; raid was implemented like this before
v2.x, and wbt and the other IO flags were introduced later.
> 
> Do you have a test case, how to reproduce the issue?

The test case is simple: just enable wbt for the underlying disks, then
issue direct IO and check whether the IO is throttled by wbt.

Thanks,
Kuai

> 
> 
> Kind regards,
> 
> Paul
> 
> 
>> Signed-off-by: Yu Kuai <yukuai3@...wei.com>
>> ---
>>   drivers/md/raid1.c  | 5 -----
>>   drivers/md/raid10.c | 8 --------
>>   2 files changed, 13 deletions(-)
>>
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index a87eb9a3b016..347de0e36d59 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1316,8 +1316,6 @@ static void raid1_read_request(struct mddev 
>> *mddev, struct bio *bio,
>>       struct r1conf *conf = mddev->private;
>>       struct raid1_info *mirror;
>>       struct bio *read_bio;
>> -    const enum req_op op = bio_op(bio);
>> -    const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
>>       int max_sectors;
>>       int rdisk, error;
>>       bool r1bio_existed = !!r1_bio;
>> @@ -1405,7 +1403,6 @@ static void raid1_read_request(struct mddev 
>> *mddev, struct bio *bio,
>>       read_bio->bi_iter.bi_sector = r1_bio->sector +
>>           mirror->rdev->data_offset;
>>       read_bio->bi_end_io = raid1_end_read_request;
>> -    read_bio->bi_opf = op | do_sync;
>>       if (test_bit(FailFast, &mirror->rdev->flags) &&
>>           test_bit(R1BIO_FailFast, &r1_bio->state))
>>               read_bio->bi_opf |= MD_FAILFAST;
>> @@ -1654,8 +1651,6 @@ static void raid1_write_request(struct mddev 
>> *mddev, struct bio *bio,
>>           mbio->bi_iter.bi_sector    = (r1_bio->sector + 
>> rdev->data_offset);
>>           mbio->bi_end_io    = raid1_end_write_request;
>> -        mbio->bi_opf = bio_op(bio) |
>> -            (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC));
>>           if (test_bit(FailFast, &rdev->flags) &&
>>               !test_bit(WriteMostly, &rdev->flags) &&
>>               conf->raid_disks - mddev->degraded > 1)
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index efe93b979167..e294ba00ea0e 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -1146,8 +1146,6 @@ static void raid10_read_request(struct mddev 
>> *mddev, struct bio *bio,
>>   {
>>       struct r10conf *conf = mddev->private;
>>       struct bio *read_bio;
>> -    const enum req_op op = bio_op(bio);
>> -    const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
>>       int max_sectors;
>>       struct md_rdev *rdev;
>>       char b[BDEVNAME_SIZE];
>> @@ -1228,7 +1226,6 @@ static void raid10_read_request(struct mddev 
>> *mddev, struct bio *bio,
>>       read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
>>           choose_data_offset(r10_bio, rdev);
>>       read_bio->bi_end_io = raid10_end_read_request;
>> -    read_bio->bi_opf = op | do_sync;
>>       if (test_bit(FailFast, &rdev->flags) &&
>>           test_bit(R10BIO_FailFast, &r10_bio->state))
>>               read_bio->bi_opf |= MD_FAILFAST;
>> @@ -1247,10 +1244,6 @@ static void raid10_write_one_disk(struct mddev 
>> *mddev, struct r10bio *r10_bio,
>>                     struct bio *bio, bool replacement,
>>                     int n_copy)
>>   {
>> -    const enum req_op op = bio_op(bio);
>> -    const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
>> -    const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
>> -    const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC;
>>       unsigned long flags;
>>       struct r10conf *conf = mddev->private;
>>       struct md_rdev *rdev;
>> @@ -1269,7 +1262,6 @@ static void raid10_write_one_disk(struct mddev 
>> *mddev, struct r10bio *r10_bio,
>>       mbio->bi_iter.bi_sector    = (r10_bio->devs[n_copy].addr +
>>                      choose_data_offset(r10_bio, rdev));
>>       mbio->bi_end_io    = raid10_end_write_request;
>> -    mbio->bi_opf = op | do_sync | do_fua | do_atomic;
>>       if (!replacement && test_bit(FailFast,
>>                        &conf->mirrors[devnum].rdev->flags)
>>                && enough(conf, devnum))
> 
> .
> 


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ