[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <c1b45434-6341-4e26-a376-4f434df2caf0@vivo.com>
Date: Sat, 11 May 2024 19:08:39 +0800
From: YangYang <yang.yang@...o.com>
To: Mikulas Patocka <mpatocka@...hat.com>
Cc: Alasdair Kergon <agk@...hat.com>, Mike Snitzer <snitzer@...nel.org>,
dm-devel@...ts.linux.dev, linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH] dm: Avoid sending redundant empty flush bios to the
same block device
On 2024/5/10 22:08, Mikulas Patocka wrote:
> Hi
>
> Regarding *bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL); - you
> can't allocate memory in the I/O processing path, because it may deadlock.
> Think of a case when the system is out of memory and it needs to swap out
> some pages and the swap out operation attempts to allocate more memory.
Got it!
>
> Anyway, the patch is too complicated. I suggest to try this:
>
> * introduce a per-target bit "flush_pass_around" that is set for dm-linear
> and dm-stripe and that means that the target supports flush optimization
>
> * set a per-table "flush_pass_around" bit if all the targets in the table
> have "flush_pass_around" set
>
> * in __send_empty_flush, test the table's "flush_pass_around" bit and if
> it is set, bypass this loop "for (unsigned int i = 0; i <
> t->num_targets; i++) {" and iterate over dm_table->devices and send the
> flush to each of them.
OK, I will do this in the next version.
Thanks for your comments.
>
> Hopefully, these changes will make the patch smaller and more acceptable.
>
> Mikulas
>
>
>
> On Mon, 22 Apr 2024, Yang Yang wrote:
>
>> __send_empty_flush() sends empty flush bios to every target in the
>> dm_table. However, if the num_targets exceeds the number of block
>> devices in the dm_table's device list, it could lead to multiple
>> invocations of __send_duplicate_bios() for the same block device.
>> Typically, a single thread sending numerous empty flush bios to one
>> block device is redundant, as these bios are likely to be merged by the
>> flush state machine. In scenarios where num_targets significantly
>> outweighs the number of block devices, such behavior may result in a
>> noteworthy decrease in performance.
>>
>> This issue can be reproduced using this command line:
>> for i in {0..1023}; do
>> echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
>> done | dmsetup create example
>>
>> With this fix, a random write with fsync workload executed with the
>> following fio command:
>>
>> fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
>> --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
>> --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
>>
>> results in an increase from 857 KB/s to 30.8 MB/s of the write
>> throughput (3580% increase).
>>
>> Signed-off-by: Yang Yang <yang.yang@...o.com>
>> ---
>> drivers/md/dm-core.h | 1 +
>> drivers/md/dm-table.c | 7 +++++
>> drivers/md/dm.c | 59 +++++++++++++++++++++++++++++++++++
>> include/linux/device-mapper.h | 6 ++++
>> 4 files changed, 73 insertions(+)
>>
>> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
>> index e6757a30dcca..7e3f2168289f 100644
>> --- a/drivers/md/dm-core.h
>> +++ b/drivers/md/dm-core.h
>> @@ -217,6 +217,7 @@ struct dm_table {
>> /* a list of devices used by this table */
>> struct list_head devices;
>> struct rw_semaphore devices_lock;
>> + unsigned short num_devices;
>>
>> /* events get handed up using this callback */
>> void (*event_fn)(void *data);
>> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
>> index 41f1d731ae5a..ddc60e498afb 100644
>> --- a/drivers/md/dm-table.c
>> +++ b/drivers/md/dm-table.c
>> @@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>>
>> int dm_table_resume_targets(struct dm_table *t)
>> {
>> + struct list_head *devices = dm_table_get_devices(t);
>> + struct dm_dev_internal *dd;
>> unsigned int i;
>> int r = 0;
>>
>> @@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
>> ti->type->resume(ti);
>> }
>>
>> + t->num_devices = 0;
>> +
>> + list_for_each_entry(dd, devices, list)
>> + dd->dm_dev->index = ++(t->num_devices);
>> +
>> return 0;
>> }
>>
>> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
>> index 56aa2a8b9d71..7297235291f6 100644
>> --- a/drivers/md/dm.c
>> +++ b/drivers/md/dm.c
>> @@ -48,6 +48,8 @@
>> */
>> #define REQ_DM_POLL_LIST REQ_DRV
>>
>> +#define DM_MAX_TABLE_DEVICES 1024
>> +
>> static const char *_name = DM_NAME;
>>
>> static unsigned int major;
>> @@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
>> return ret;
>> }
>>
>> +static inline bool has_redundant_flush(struct dm_table *t,
>> + unsigned long **bitmap)
>> +{
>> + if (t->num_devices < t->num_targets) {
>> + /* Add a limit here to prevent excessive memory usage for bitmaps */
>> + if (t->num_devices >= DM_MAX_TABLE_DEVICES)
>> + return false;
>> +
>> + /* dm_dev's index starts from 1, so need plus 1 here */
>> + *bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
>> + if (*bitmap)
>> + return true;
>> + }
>> +
>> + return false;
>> +}
>> +
>> +static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
>> + sector_t start, sector_t len, void *data)
>> +{
>> + unsigned short *index = data;
>> + *index = dev->index;
>> + return 0;
>> +}
>> +
>> static void __send_empty_flush(struct clone_info *ci)
>> {
>> struct dm_table *t = ci->map;
>> struct bio flush_bio;
>> + unsigned long *handled_map;
>> + unsigned int nr_handled = 0;
>> + bool check = has_redundant_flush(t, &handled_map);
>>
>> /*
>> * Use an on-stack bio for this, it's safe since we don't
>> @@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
>>
>> for (unsigned int i = 0; i < t->num_targets; i++) {
>> unsigned int bios;
>> + unsigned short index = 0;
>> struct dm_target *ti = dm_table_get_target(t, i);
>>
>> if (unlikely(ti->num_flush_bios == 0))
>> continue;
>>
>> + /*
>> + * If the num_targets is greater than the number of block devices
>> + * in the dm_table's devices list, __send_empty_flush() might
>> + * invoke __send_duplicate_bios() multiple times for the same
>> + * block device. This could lead to a substantial decrease in
>> + * performance when num_targets significantly exceeds the number
>> + * of block devices.
>> + * Ensure that __send_duplicate_bios() is only called once for
>> + * each block device.
>> + */
>> + if (check) {
>> + if (nr_handled == t->num_devices)
>> + break;
>> +
>> + if (ti->type->iterate_devices)
>> + ti->type->iterate_devices(ti, dm_get_dev_index, &index);
>> +
>> + if (index > 0) {
>> + if (__test_and_set_bit(index, handled_map))
>> + continue;
>> + else
>> + nr_handled++;
>> + }
>> + }
>> +
>> atomic_add(ti->num_flush_bios, &ci->io->io_count);
>> bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
>> NULL, GFP_NOWAIT);
>> atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
>> }
>>
>> + if (check)
>> + bitmap_free(handled_map);
>> +
>> /*
>> * alloc_io() takes one extra reference for submission, so the
>> * reference won't reach 0 without the following subtraction
>> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
>> index 82b2195efaca..4a54b4f0a609 100644
>> --- a/include/linux/device-mapper.h
>> +++ b/include/linux/device-mapper.h
>> @@ -169,6 +169,12 @@ struct dm_dev {
>> struct dax_device *dax_dev;
>> blk_mode_t mode;
>> char name[16];
>> +
>> + /*
>> + * sequential number for each dm_dev in dm_table's devices list,
>> + * start from 1
>> + */
>> + unsigned short index;
>> };
>>
>> /*
>> --
>> 2.34.1
>>
>
>
Powered by blists - more mailing lists