[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20210630134635.fcdlsase45iotavs@work>
Date: Wed, 30 Jun 2021 15:46:35 +0200
From: Lukas Czerner <lczerner@...hat.com>
To: Jan Kara <jack@...e.cz>
Cc: Ted Tso <tytso@....edu>, linux-ext4@...r.kernel.org
Subject: Re: [PATCH 4/4] ext4: Improve scalability of ext4 orphan file
handling
On Wed, Jun 16, 2021 at 12:56:55PM +0200, Jan Kara wrote:
> Even though the length of the critical section when adding / removing
> orphaned inodes was significantly reduced by using orphan file, the
> contention of lock protecting orphan file still appears high in profiles
> for truncate / unlink intensive workloads with high number of threads.
>
> This patch makes handling of orphan file completely lockless. Also to
> reduce conflicts between CPUs different CPUs start searching for empty
> slot in orphan file in different blocks.
>
> Performance comparison of locked orphan file handling, lockless orphan
> file handling, and completely disabled orphan inode handling
> from 80 CPU Xeon Server with 526 GB of RAM, filesystem located on
> SAS SSD disk, average of 5 runs:
>
> stress-orphan (microbenchmark truncating files byte-by-byte from N
> processes in parallel)
>
> Threads Time Time Time
> Orphan locked Orphan lockless No orphan
> 1 0.945600 0.939400 0.891200
> 2 1.331800 1.246600 1.174400
> 4 1.995000 1.780600 1.713200
> 8 6.424200 4.900000 4.106000
> 16 14.937600 8.516400 8.138000
> 32 33.038200 24.565600 24.002200
> 64 60.823600 39.844600 38.440200
> 128 122.941400 70.950400 69.315000
>
> So we can see that with lockless orphan file handling, addition /
> deletion of orphaned inodes got almost completely out of picture even
> for a microbenchmark stressing it.
>
> For reaim creat_clo workload on ramdisk there are also noticeable gains
> (average of 5 runs):
>
> Clients Vanilla (ops/s) Patched (ops/s)
> creat_clo-1 14705.88 ( 0.00%) 14354.07 * -2.39%*
> creat_clo-3 27108.43 ( 0.00%) 28301.89 ( 4.40%)
> creat_clo-5 37406.48 ( 0.00%) 45180.73 * 20.78%*
> creat_clo-7 41338.58 ( 0.00%) 54687.50 * 32.29%*
> creat_clo-9 45226.13 ( 0.00%) 62937.07 * 39.16%*
> creat_clo-11 44000.00 ( 0.00%) 65088.76 * 47.93%*
> creat_clo-13 36516.85 ( 0.00%) 68661.97 * 88.03%*
> creat_clo-15 30864.20 ( 0.00%) 69551.78 * 125.35%*
> creat_clo-17 27478.45 ( 0.00%) 67729.08 * 146.48%*
> creat_clo-19 25000.00 ( 0.00%) 61621.62 * 146.49%*
> creat_clo-21 18772.35 ( 0.00%) 63829.79 * 240.02%*
> creat_clo-23 16698.94 ( 0.00%) 61938.96 * 270.92%*
> creat_clo-25 14973.05 ( 0.00%) 56947.61 * 280.33%*
> creat_clo-27 16436.69 ( 0.00%) 65008.03 * 295.51%*
> creat_clo-29 13949.01 ( 0.00%) 69047.62 * 395.00%*
> creat_clo-31 14283.52 ( 0.00%) 67982.45 * 375.95%*
>
> Signed-off-by: Jan Kara <jack@...e.cz>
> ---
> fs/ext4/ext4.h | 3 +--
> fs/ext4/orphan.c | 55 +++++++++++++++++++++++++++---------------------
> 2 files changed, 32 insertions(+), 26 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 83298c0b6dae..d08927e19b76 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1480,7 +1480,7 @@ static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
> }
>
> struct ext4_orphan_block {
> - int ob_free_entries; /* Number of free orphan entries in block */
> + atomic_t ob_free_entries; /* Number of free orphan entries in block */
> struct buffer_head *ob_bh; /* Buffer for orphan block */
> };
>
> @@ -1488,7 +1488,6 @@ struct ext4_orphan_block {
> * Info about orphan file.
> */
> struct ext4_orphan_info {
> - spinlock_t of_lock;
> int of_blocks; /* Number of orphan blocks in a file */
> __u32 of_csum_seed; /* Checksum seed for orphan file */
> struct ext4_orphan_block *of_binfo; /* Array with info about orphan
> diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
> index ac22667b7fd5..010222cde4f7 100644
> --- a/fs/ext4/orphan.c
> +++ b/fs/ext4/orphan.c
> @@ -10,16 +10,30 @@
>
> static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
> {
> - int i, j;
> + int i, j, start;
> struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
> int ret = 0;
> + bool found = false;
> __le32 *bdata;
> int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
>
> - spin_lock(&oi->of_lock);
> - for (i = 0; i < oi->of_blocks && !oi->of_binfo[i].ob_free_entries; i++);
> - if (i == oi->of_blocks) {
> - spin_unlock(&oi->of_lock);
> + /*
> + * Find block with free orphan entry. Use CPU number for a naive hash
> + * for a search start in the orphan file
> + */
> + start = raw_smp_processor_id()*13 % oi->of_blocks;
> + i = start;
> + do {
> + if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries)
> + >= 0) {
> + found = true;
> + break;
> + }
> + if (++i >= oi->of_blocks)
> + i = 0;
> + } while (i != start);
> +
> + if (!found) {
> /*
> * For now we don't grow or shrink orphan file. We just use
> * whatever was allocated at mke2fs time. The additional
> @@ -28,28 +42,24 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
> */
> return -ENOSPC;
> }
> - oi->of_binfo[i].ob_free_entries--;
> - spin_unlock(&oi->of_lock);
>
> - /*
> - * Get access to orphan block. We have dropped of_lock but since we
> - * have decremented number of free entries we are guaranteed free entry
> - * in our block.
> - */
> ret = ext4_journal_get_write_access(handle, inode->i_sb,
> oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
> if (ret)
> return ret;
>
> bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
> - spin_lock(&oi->of_lock);
> /* Find empty slot in a block */
> - for (j = 0; j < inodes_per_ob && bdata[j]; j++);
> - BUG_ON(j == inodes_per_ob);
> - bdata[j] = cpu_to_le32(inode->i_ino);
> + j = 0;
> + do {
> + while (bdata[j]) {
> + if (++j >= inodes_per_ob)
> + j = 0;
> + }
> + } while (cmpxchg(&bdata[j], 0, cpu_to_le32(inode->i_ino)) != 0);
In case there is any sort of corruption on disk or in memory, we could
potentially get stuck in this loop forever, right? Not sure whether that
matters all that much.
Other than that it looks good, and it addresses some of my comments on the
previous patch, so sorry about that ;)
You can add
Reviewed-by: Lukas Czerner <lczerner@...hat.com>
> +
> EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
> ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
> - spin_unlock(&oi->of_lock);
>
> return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
> }
> @@ -178,10 +188,8 @@ static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
> goto out;
>
> bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data);
> - spin_lock(&oi->of_lock);
> bdata[off] = 0;
> - oi->of_binfo[blk].ob_free_entries++;
> - spin_unlock(&oi->of_lock);
> + atomic_inc(&oi->of_binfo[blk].ob_free_entries);
> ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh);
> out:
> ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
> @@ -534,8 +542,6 @@ int ext4_init_orphan_info(struct super_block *sb)
> struct ext4_orphan_block_tail *ot;
> ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);
>
> - spin_lock_init(&oi->of_lock);
> -
> if (!ext4_has_feature_orphan_file(sb))
> return 0;
>
> @@ -579,7 +585,7 @@ int ext4_init_orphan_info(struct super_block *sb)
> for (j = 0; j < inodes_per_ob; j++)
> if (bdata[j] == 0)
> free++;
> - oi->of_binfo[i].ob_free_entries = free;
> + atomic_set(&oi->of_binfo[i].ob_free_entries, free);
> }
> iput(inode);
> return 0;
> @@ -601,7 +607,8 @@ int ext4_orphan_file_empty(struct super_block *sb)
> if (!ext4_has_feature_orphan_file(sb))
> return 1;
> for (i = 0; i < oi->of_blocks; i++)
> - if (oi->of_binfo[i].ob_free_entries != inodes_per_ob)
> + if (atomic_read(&oi->of_binfo[i].ob_free_entries) !=
> + inodes_per_ob)
> return 0;
> return 1;
> }
> --
> 2.26.2
>
Powered by blists - more mailing lists