Message-ID: <9eaf862f-0c00-4d58-994a-bd1b3c6f1518@molgen.mpg.de>
Date: Mon, 26 Aug 2024 10:19:17 +0200
From: Paul Menzel <pmenzel@...gen.mpg.de>
To: Yu Kuai <yukuai1@...weicloud.com>
Cc: song@...nel.org, linux-raid@...r.kernel.org,
 linux-kernel@...r.kernel.org, yukuai3@...wei.com, yi.zhang@...wei.com,
 yangerkun@...wei.com, Li Nan <linan122@...wei.com>
Subject: Re: [PATCH md-6.12] md: remove flush handling

Dear Kuai,


Thank you for your patch.


On 26.08.24 at 09:48, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@...wei.com>
> 
> For flush requests, md has special flush handling to merge concurrent
> flush requests into a single one. However, the whole mechanism is
> based on a disk-level spin lock, 'mddev->lock', and fsync can be
> called quite often in some use cases; as a consequence, taking a spin
> lock in the IO fast path can cause performance degradation.
> 
> Fortunately, the block layer already have flush handling to merge

s/have/has/

> concurrent flush request, and it only acquire hctx level spin lock(see

1.  acquire*s*
2.  Please add a space before the (.

> details in blk-flush.c).
> 
> This patch remove the flush handling in md, and convert to use general
> block layer flush handling in underlying disks.

remove*s*, convert*s*

> Flush test for RAID10 over 4 NVMe disks:
> start 128 threads to do fsync 100000 times on arm64, and see how long
> it takes.

Could you please share the script, so it is easier to reproduce?
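
In case it helps, below is a minimal sketch of what I imagine the
harness looks like. The thread count, the iteration count, the
per-thread write+fsync loop and the file path are only my assumptions;
the actual script may well differ.

/*
 * fsync-bench.c - sketch of a flush benchmark: 128 threads each
 * dirty and fsync() a file on the md device 100000 times; prints
 * the total wall-clock time in microseconds.
 *
 * Build: gcc -O2 -pthread fsync-bench.c -o fsync-bench
 * Run:   ./fsync-bench /mnt/md0/testfile
 */
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define NR_THREADS 128
#define NR_FSYNCS  100000

static const char *path;

static void *worker(void *arg)
{
	int fd = open(path, O_WRONLY | O_CREAT, 0644);
	int i;

	if (fd < 0) {
		perror("open");
		return NULL;
	}
	for (i = 0; i < NR_FSYNCS; i++) {
		/* dirty the file so fsync() actually issues a flush */
		pwrite(fd, "x", 1, 0);
		fsync(fd);
	}
	close(fd);
	return NULL;
}

int main(int argc, char **argv)
{
	pthread_t threads[NR_THREADS];
	struct timespec t0, t1;
	long long us;
	int i;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file on md device>\n", argv[0]);
		return 1;
	}
	path = argv[1];

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&threads[i], NULL, worker, NULL);
	for (i = 0; i < NR_THREADS; i++)
		pthread_join(threads[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	us = (t1.tv_sec - t0.tv_sec) * 1000000LL +
	     (t1.tv_nsec - t0.tv_nsec) / 1000;
	printf("%lld microseconds\n", us);
	return 0;
}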

> Test result: about 10 times faster for high concurrency.
> Before this patch: 50943374 microseconds
> After this patch:  5096347  microseconds
> 
> BTW, this patch can fix the same problem as commit 611d5cbc0b35 ("md: fix
> deadlock between mddev_suspend and flush bio").

So, should that be reverted? (Cc: +Li Nan)

> Signed-off-by: Yu Kuai <yukuai3@...wei.com>
> ---
>   drivers/md/md.c | 137 ++++--------------------------------------------
>   drivers/md/md.h |  10 ----
>   2 files changed, 11 insertions(+), 136 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index a38981de8901..4d675f7cc2a7 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -546,137 +546,23 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n
>   	return 0;
>   }
>   
> -/*
> - * Generic flush handling for md
> - */
> -
> -static void md_end_flush(struct bio *bio)
> -{
> -	struct md_rdev *rdev = bio->bi_private;
> -	struct mddev *mddev = rdev->mddev;
> -
> -	bio_put(bio);
> -
> -	rdev_dec_pending(rdev, mddev);
> -
> -	if (atomic_dec_and_test(&mddev->flush_pending))
> -		/* The pre-request flush has finished */
> -		queue_work(md_wq, &mddev->flush_work);
> -}
> -
> -static void md_submit_flush_data(struct work_struct *ws);
> -
> -static void submit_flushes(struct work_struct *ws)
> +bool md_flush_request(struct mddev *mddev, struct bio *bio)
>   {
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
>   	struct md_rdev *rdev;
> +	struct bio *new;
>   
> -	mddev->start_flush = ktime_get_boottime();
> -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> -	atomic_set(&mddev->flush_pending, 1);
> -	rcu_read_lock();
> -	rdev_for_each_rcu(rdev, mddev)
> -		if (rdev->raid_disk >= 0 &&
> -		    !test_bit(Faulty, &rdev->flags)) {
> -			struct bio *bi;
> -
> -			atomic_inc(&rdev->nr_pending);
> -			rcu_read_unlock();
> -			bi = bio_alloc_bioset(rdev->bdev, 0,
> -					      REQ_OP_WRITE | REQ_PREFLUSH,
> -					      GFP_NOIO, &mddev->bio_set);
> -			bi->bi_end_io = md_end_flush;
> -			bi->bi_private = rdev;
> -			atomic_inc(&mddev->flush_pending);
> -			submit_bio(bi);
> -			rcu_read_lock();
> -		}
> -	rcu_read_unlock();
> -	if (atomic_dec_and_test(&mddev->flush_pending))
> -		queue_work(md_wq, &mddev->flush_work);
> -}
> -
> -static void md_submit_flush_data(struct work_struct *ws)
> -{
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> -	struct bio *bio = mddev->flush_bio;
> -
> -	/*
> -	 * must reset flush_bio before calling into md_handle_request to avoid a
> -	 * deadlock, because other bios passed md_handle_request suspend check
> -	 * could wait for this and below md_handle_request could wait for those
> -	 * bios because of suspend check
> -	 */
> -	spin_lock_irq(&mddev->lock);
> -	mddev->prev_flush_start = mddev->start_flush;
> -	mddev->flush_bio = NULL;
> -	spin_unlock_irq(&mddev->lock);
> -	wake_up(&mddev->sb_wait);
> -
> -	if (bio->bi_iter.bi_size == 0) {
> -		/* an empty barrier - all done */
> -		bio_endio(bio);
> -	} else {
> -		bio->bi_opf &= ~REQ_PREFLUSH;
> -
> -		/*
> -		 * make_requst() will never return error here, it only
> -		 * returns error in raid5_make_request() by dm-raid.
> -		 * Since dm always splits data and flush operation into
> -		 * two separate io, io size of flush submitted by dm
> -		 * always is 0, make_request() will not be called here.
> -		 */
> -		if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio)))
> -			bio_io_error(bio);
> -	}
> -
> -	/* The pair is percpu_ref_get() from md_flush_request() */
> -	percpu_ref_put(&mddev->active_io);
> -}
> +	rdev_for_each(rdev, mddev) {
> +		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
> +			continue;
>   
> -/*
> - * Manages consolidation of flushes and submitting any flushes needed for
> - * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
> - * being finished in another context.  Returns false if the flushing is
> - * complete but still needs the I/O portion of the bio to be processed.
> - */
> -bool md_flush_request(struct mddev *mddev, struct bio *bio)
> -{
> -	ktime_t req_start = ktime_get_boottime();
> -	spin_lock_irq(&mddev->lock);
> -	/* flush requests wait until ongoing flush completes,
> -	 * hence coalescing all the pending requests.
> -	 */
> -	wait_event_lock_irq(mddev->sb_wait,
> -			    !mddev->flush_bio ||
> -			    ktime_before(req_start, mddev->prev_flush_start),
> -			    mddev->lock);
> -	/* new request after previous flush is completed */
> -	if (ktime_after(req_start, mddev->prev_flush_start)) {
> -		WARN_ON(mddev->flush_bio);
> -		/*
> -		 * Grab a reference to make sure mddev_suspend() will wait for
> -		 * this flush to be done.
> -		 *
> -		 * md_flush_reqeust() is called under md_handle_request() and
> -		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
> -		 * won't pass, percpu_ref_tryget_live() can't be used because
> -		 * percpu_ref_kill() can be called by mddev_suspend()
> -		 * concurrently.
> -		 */
> -		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
> -		percpu_ref_get(&mddev->active_io);
> -		mddev->flush_bio = bio;
> -		spin_unlock_irq(&mddev->lock);
> -		INIT_WORK(&mddev->flush_work, submit_flushes);
> -		queue_work(md_wq, &mddev->flush_work);
> -		return true;
> +		new = bio_alloc_bioset(rdev->bdev, 0,
> +				       REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
> +				       &mddev->bio_set);
> +		bio_chain(new, bio);
> +		submit_bio(new);
>   	}
>   
> -	/* flush was performed for some other bio while we waited. */
> -	spin_unlock_irq(&mddev->lock);
> -	if (bio->bi_iter.bi_size == 0) {
> -		/* pure flush without data - all done */
> +	if (bio_sectors(bio) == 0) {
>   		bio_endio(bio);
>   		return true;
>   	}
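
If I am reading bio_chain() correctly, the bio_endio() here is safe
even while the chained preflush bios are still in flight:
bio_chain(new, bio) above does bio_inc_remaining() on the parent, so
the parent's completion only runs once every chained child has
finished. Maybe worth a comment in the code, something like this
sketch (wording is just my suggestion):

	if (bio_sectors(bio) == 0) {
		/*
		 * Empty flush: all work was fanned out via bio_chain()
		 * above, so this bio_endio() only completes the bio
		 * once every chained preflush bio has finished.
		 */
		bio_endio(bio);
		return true;
	}
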
> @@ -763,7 +649,6 @@ int mddev_init(struct mddev *mddev)
>   	atomic_set(&mddev->openers, 0);
>   	atomic_set(&mddev->sync_seq, 0);
>   	spin_lock_init(&mddev->lock);
> -	atomic_set(&mddev->flush_pending, 0);
>   	init_waitqueue_head(&mddev->sb_wait);
>   	init_waitqueue_head(&mddev->recovery_wait);
>   	mddev->reshape_position = MaxSector;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 1c6a5f41adca..5d2e6bd58e4d 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -572,16 +572,6 @@ struct mddev {
>   						   */
>   	struct bio_set			io_clone_set;
>   
> -	/* Generic flush handling.
> -	 * The last to finish preflush schedules a worker to submit
> -	 * the rest of the request (without the REQ_PREFLUSH flag).
> -	 */
> -	struct bio *flush_bio;
> -	atomic_t flush_pending;
> -	ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed
> -						* flush was started.
> -						*/
> -	struct work_struct flush_work;
>   	struct work_struct event_work;	/* used by dm to report failure event */
>   	mempool_t *serial_info_pool;
>   	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);

Code removal is always nice to see.


Kind regards,

Paul
