[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <a1698611-f386-4974-9cbd-a270fb417af2@vivo.com>
Date: Thu, 25 Apr 2024 10:54:56 +0800
From: YangYang <yang.yang@...o.com>
To: Alasdair Kergon <agk@...hat.com>, Mike Snitzer <snitzer@...nel.org>,
Mikulas Patocka <mpatocka@...hat.com>
Cc: linux-kernel@...r.kernel.org, dm-devel@...ts.linux.dev
Subject: Re: [RFC PATCH] dm: Avoid sending redundant empty flush bios to the
same block device
On 2024/4/22 18:05, Yang Yang wrote:
> __send_empty_flush() sends empty flush bios to every target in the
> dm_table. However, if the num_targets exceeds the number of block
> devices in the dm_table's device list, it could lead to multiple
> invocations of __send_duplicate_bios() for the same block device.
> Typically, a single thread sending numerous empty flush bios to one
> block device is redundant, as these bios are likely to be merged by the
> flush state machine. In scenarios where num_targets significantly
> outweighs the number of block devices, such behavior may result in a
> noteworthy decrease in performance.
>
> This issue can be reproduced using this command line:
> for i in {0..1023}; do
> echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
> done | dmsetup create example
>
> With this fix, a random write with fsync workload executed with the
> following fio command:
>
> fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
> --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
> --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
>
> results in an increase from 857 KB/s to 30.8 MB/s of the write
> throughput (3580% increase).
>
> Signed-off-by: Yang Yang <yang.yang@...o.com>
> ---
> drivers/md/dm-core.h | 1 +
> drivers/md/dm-table.c | 7 +++++
> drivers/md/dm.c | 59 +++++++++++++++++++++++++++++++++++
> include/linux/device-mapper.h | 6 ++++
> 4 files changed, 73 insertions(+)
>
> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
> index e6757a30dcca..7e3f2168289f 100644
> --- a/drivers/md/dm-core.h
> +++ b/drivers/md/dm-core.h
> @@ -217,6 +217,7 @@ struct dm_table {
> /* a list of devices used by this table */
> struct list_head devices;
> struct rw_semaphore devices_lock;
> + unsigned short num_devices;
>
> /* events get handed up using this callback */
> void (*event_fn)(void *data);
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index 41f1d731ae5a..ddc60e498afb 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>
> int dm_table_resume_targets(struct dm_table *t)
> {
> + struct list_head *devices = dm_table_get_devices(t);
> + struct dm_dev_internal *dd;
> unsigned int i;
> int r = 0;
>
> @@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
> ti->type->resume(ti);
> }
>
> + t->num_devices = 0;
> +
> + list_for_each_entry(dd, devices, list)
> + dd->dm_dev->index = ++(t->num_devices);
> +
> return 0;
> }
>
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 56aa2a8b9d71..7297235291f6 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -48,6 +48,8 @@
> */
> #define REQ_DM_POLL_LIST REQ_DRV
>
> +#define DM_MAX_TABLE_DEVICES 1024
> +
> static const char *_name = DM_NAME;
>
> static unsigned int major;
> @@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
> return ret;
> }
>
> +static inline bool has_redundant_flush(struct dm_table *t,
> + unsigned long **bitmap)
> +{
> + if (t->num_devices < t->num_targets) {
> + /* Add a limit here to prevent excessive memory usage for bitmaps */
> + if (t->num_devices >= DM_MAX_TABLE_DEVICES)
> + return false;
> +
> + /* dm_dev's index starts from 1, so need plus 1 here */
> + *bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
> + if (*bitmap)
> + return true;
> + }
> +
> + return false;
> +}
> +
> +static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
> + sector_t start, sector_t len, void *data)
> +{
> + unsigned short *index = data;
> + *index = dev->index;
> + return 0;
> +}
> +
> static void __send_empty_flush(struct clone_info *ci)
> {
> struct dm_table *t = ci->map;
> struct bio flush_bio;
> + unsigned long *handled_map;
> + unsigned int nr_handled = 0;
> + bool check = has_redundant_flush(t, &handled_map);
>
> /*
> * Use an on-stack bio for this, it's safe since we don't
> @@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
>
> for (unsigned int i = 0; i < t->num_targets; i++) {
> unsigned int bios;
> + unsigned short index = 0;
> struct dm_target *ti = dm_table_get_target(t, i);
>
> if (unlikely(ti->num_flush_bios == 0))
> continue;
>
> + /*
> + * If the num_targets is greater than the number of block devices
> + * in the dm_table's devices list, __send_empty_flush() might
> + * invoke __send_duplicate_bios() multiple times for the same
> + * block device. This could lead to a substantial decrease in
> + * performance when num_targets significantly exceeds the number
> + * of block devices.
> + * Ensure that __send_duplicate_bios() is only called once for
> + * each block device.
> + */
> + if (check) {
> + if (nr_handled == t->num_devices)
> + break;
> +
> + if (ti->type->iterate_devices)
> + ti->type->iterate_devices(ti, dm_get_dev_index, &index);
> +
> + if (index > 0) {
> + if (__test_and_set_bit(index, handled_map))
> + continue;
> + else
> + nr_handled++;
> + }
> + }
> +
> atomic_add(ti->num_flush_bios, &ci->io->io_count);
> bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
> NULL, GFP_NOWAIT);
> atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
> }
>
> + if (check)
> + bitmap_free(handled_map);
> +
> /*
> * alloc_io() takes one extra reference for submission, so the
> * reference won't reach 0 without the following subtraction
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 82b2195efaca..4a54b4f0a609 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -169,6 +169,12 @@ struct dm_dev {
> struct dax_device *dax_dev;
> blk_mode_t mode;
> char name[16];
> +
> + /*
> + * sequential number for each dm_dev in dm_table's devices list,
> + * start from 1
> + */
> + unsigned short index;
> };
>
> /*
Hello experts,
Any comments are welcome!
Thanks
Powered by blists - more mailing lists