Message-ID: <20250819023648.1031767-1-wangzijie1@honor.com>
Date: Tue, 19 Aug 2025 10:36:48 +0800
From: wangzijie <wangzijie1@...or.com>
To: <niuzhiguo84@...il.com>
CC: <chao@...nel.org>, <feng.han@...or.com>, <jaegeuk@...nel.org>,
	<linux-f2fs-devel@...ts.sourceforge.net>, <linux-kernel@...r.kernel.org>,
	<wangzijie1@...or.com>
Subject: Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries

>>
>> Sometimes I suffer from nat_tree_lock contention between f2fs_write_checkpoint
>> and f2fs_get_node_info. Commit a9419b6 ("f2fs: do not bother checkpoint by
>> f2fs_get_node_info") also mentions this situation.
>>
>> My idea is that, when flushing nat entries, we can use some structure to record the
>> nat pages we may read, and readahead them before holding nat_tree_lock. Before
>> implementing the code, I did some survey and found a prior submission in the community.
>>
>> Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
>> Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
>> That patch aims to improve nat entry set sorting by using buckets.
>> I steal that structure and readahead the nat pages containing nat entry sets which cannot
>> be moved to the journal, according to the dirty nat entry set buckets.
>>
>> By doing this, I think there are two benefits for reducing nat_tree_lock hold time
>> when flushing nat entries:
>> 1. avoid nat set tree lookup and sort
>> 2. readahead nat pages before holding nat_tree_lock
>>
>> Signed-off-by: wangzijie <wangzijie1@...or.com>
>> ---
>>  fs/f2fs/f2fs.h |  1 +
>>  fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
>>  fs/f2fs/node.h |  2 +-
>>  3 files changed, 35 insertions(+), 38 deletions(-)
>>
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index 46be75605..b27cc059f 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -975,6 +975,7 @@ struct f2fs_nm_info {
>>         struct radix_tree_root nat_set_root;/* root of the nat set cache */
>>         struct f2fs_rwsem nat_tree_lock;        /* protect nat entry tree */
>>         struct list_head nat_entries;   /* cached nat entry list (clean) */
>> +       struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1];        /* store dirty nat set */
>>         spinlock_t nat_list_lock;       /* protect clean nat entry list */
>>         unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
>>         unsigned int nat_blocks;        /* # of nat blocks */
>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
>> index 27743b93e..87c975ee8 100644
>> --- a/fs/f2fs/node.c
>> +++ b/fs/f2fs/node.c
>> @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
>>         __free_nat_entry(e);
>>  }
>>
>> +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
>> +                                                       struct nat_entry_set *set)
>> +{
>> +       list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
>> +}
>Hi zijie,
>Is there any lock protecting these list-related operations in the
>current process?

Hi, Zhiguo
I think the lock protection needed is the same as for __set_nat_cache_dirty().
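
For reference, here is a simplified sketch of one existing caller (condensed from
fs/f2fs/node.c; exact code varies by kernel version). __set_nat_cache_dirty() already
runs with nat_tree_lock held for write, so the list_move_tail() in
__relocate_nat_entry_set() is covered by the same lock:

static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
			block_t new_blkaddr, bool fsync_done)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct nat_entry *e;

	f2fs_down_write(&nm_i->nat_tree_lock);
	/* ... lookup/update of the nat entry elided ... */

	/* dirtying the entry (and, with this patch, relocating its set
	 * between buckets) happens under the write lock */
	__set_nat_cache_dirty(nm_i, e);

	f2fs_up_write(&nm_i->nat_tree_lock);
}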

>> +
>>  static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>>                                                         struct nat_entry *ne)
>>  {
>> @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>>                 head->set = set;
>>                 head->entry_cnt = 0;
>>                 f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
>> +               __relocate_nat_entry_set(nm_i, head);
>>         }
>>         return head;
>>  }
>> @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>>          * 2. update old block address to new one;
>>          */
>>         if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
>> -                               !get_nat_flag(ne, IS_DIRTY)))
>> +                               !get_nat_flag(ne, IS_DIRTY))) {
>>                 head->entry_cnt++;
>> +               __relocate_nat_entry_set(nm_i, head);
>> +       }
>>
>>         set_nat_flag(ne, IS_PREALLOC, new_ne);
>>
>> @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>>
>>         set_nat_flag(ne, IS_DIRTY, false);
>>         set->entry_cnt--;
>> +       __relocate_nat_entry_set(nm_i, set);
>>         nm_i->nat_cnt[DIRTY_NAT]--;
>>         nm_i->nat_cnt[RECLAIMABLE_NAT]++;
>>  }
>> @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>>         up_write(&curseg->journal_rwsem);
>>  }
>>
>> -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
>> -                                               struct list_head *head, int max)
>> -{
>> -       struct nat_entry_set *cur;
>> -
>> -       if (nes->entry_cnt >= max)
>> -               goto add_out;
>> -
>> -       list_for_each_entry(cur, head, set_list) {
>> -               if (cur->entry_cnt >= nes->entry_cnt) {
>> -                       list_add(&nes->set_list, cur->set_list.prev);
>> -                       return;
>> -               }
>> -       }
>> -add_out:
>> -       list_add_tail(&nes->set_list, head);
>> -}
>> -
>>  static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
>>                 const struct f2fs_nat_block *nat_blk)
>>  {
>> @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>>
>>         /* Allow dirty nats by node block allocation in write_begin */
>>         if (!set->entry_cnt) {
>> +               list_del(&set->set_list);
>>                 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
>>                 kmem_cache_free(nat_entry_set_slab, set);
>>         }
>> @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>         struct f2fs_nm_info *nm_i = NM_I(sbi);
>>         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>>         struct f2fs_journal *journal = curseg->journal;
>> -       struct nat_entry_set *setvec[NAT_VEC_SIZE];
>>         struct nat_entry_set *set, *tmp;
>> -       unsigned int found;
>> -       nid_t set_idx = 0;
>> -       LIST_HEAD(sets);
>> +       int i;
>>         int err = 0;
>>
>>         /*
>> @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>         if (!nm_i->nat_cnt[DIRTY_NAT])
>>                 return 0;
>>
>> +       /* readahead sets which cannot be moved to journal */
>> +       if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
>> +               for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
>I am a little confused: why is there "i <= NAT_ENTRY_PER_BLOCK;"?
>Do we need to calculate according to the current nat block/entry number?
>Thanks!

Yes, you are right; we can calculate according to the DIRTY_NAT count to end this loop early.
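
Purely as an illustrative sketch (the remaining-count bookkeeping below is my
assumption, not tested code): since every set in bucket i holds exactly i dirty
entries, a running counter capped at nm_i->nat_cnt[DIRTY_NAT] lets the readahead
loop stop once all dirty entries are accounted for:

	/* illustrative sketch: stop scanning buckets once the remaining
	 * dirty-entry budget is used up; entries that will go to the
	 * journal also count toward DIRTY_NAT, so this only gives a
	 * conservative upper bound on the buckets we must visit */
	unsigned int remaining = nm_i->nat_cnt[DIRTY_NAT];

	for (i = MAX_NAT_JENTRIES(journal);
	     i <= NAT_ENTRY_PER_BLOCK && remaining; i++) {
		list_for_each_entry(set, &nm_i->nat_dirty_set[i], set_list) {
			f2fs_ra_meta_pages(sbi, set->set, 1, META_NAT, true);
			remaining -= min(remaining, set->entry_cnt);
		}
	}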


>> +                       list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>> +                               f2fs_ra_meta_pages(sbi, set->set, 1,
>> +                                                               META_NAT, true);
>> +                       }
>> +               }
>> +       }
>> +
>>         f2fs_down_write(&nm_i->nat_tree_lock);
>>
>>         /*
>> @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>                         nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
>>                 remove_nats_in_journal(sbi);
>>
>> -       while ((found = __gang_lookup_nat_set(nm_i,
>> -                                       set_idx, NAT_VEC_SIZE, setvec))) {
>> -               unsigned idx;
>> -
>> -               set_idx = setvec[found - 1]->set + 1;
>> -               for (idx = 0; idx < found; idx++)
>> -                       __adjust_nat_entry_set(setvec[idx], &sets,
>> -                                               MAX_NAT_JENTRIES(journal));
>> -       }
>> -
>>         /* flush dirty nats in nat entry set */
>> -       list_for_each_entry_safe(set, tmp, &sets, set_list) {
>> -               err = __flush_nat_entry_set(sbi, set, cpc);
>> -               if (err)
>> -                       break;
>> +       for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
>> +               list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>> +                       err = __flush_nat_entry_set(sbi, set, cpc);
>> +                       if (err)
>> +                               break;
>> +               }
>>         }
>>
>>         f2fs_up_write(&nm_i->nat_tree_lock);
>> @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>>         struct f2fs_nm_info *nm_i = NM_I(sbi);
>>         unsigned char *version_bitmap;
>>         unsigned int nat_segs;
>> +       int i;
>>         int err;
>>
>>         nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
>> @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>>         INIT_LIST_HEAD(&nm_i->nat_entries);
>>         spin_lock_init(&nm_i->nat_list_lock);
>>
>> +       for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
>> +               INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
>> +
>>         mutex_init(&nm_i->build_lock);
>>         spin_lock_init(&nm_i->nid_list_lock);
>>         init_f2fs_rwsem(&nm_i->nat_tree_lock);
>> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
>> index 030390543..d805d4ce7 100644
>> --- a/fs/f2fs/node.h
>> +++ b/fs/f2fs/node.h
>> @@ -158,7 +158,7 @@ enum mem_type {
>>  };
>>
>>  struct nat_entry_set {
>> -       struct list_head set_list;      /* link with other nat sets */
>> +       struct list_head set_list;      /* link with nat sets which have same entry_cnt */
>>         struct list_head entry_list;    /* link with dirty nat entries */
>>         nid_t set;                      /* set number*/
>>         unsigned int entry_cnt;         /* the # of nat entries in set */
>> --
>> 2.25.1

By the way, Chao and Zhiguo:
I noticed that in f2fs_flush_nat_entries(), the check for nat_cnt[DIRTY_NAT] is outside
nat_tree_lock, and in f2fs_write_checkpoint(), there is the following check without nat_tree_lock:

if (cpc->reason & CP_DISCARD) {
	if (!f2fs_exist_trim_candidates(sbi, cpc)) {
		unblock_operations(sbi);
		goto out;
	}

	if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
			SIT_I(sbi)->dirty_sentries == 0 &&
			prefree_segments(sbi) == 0) {
		f2fs_flush_sit_entries(sbi, cpc);
		f2fs_clear_prefree_segments(sbi, cpc);
		unblock_operations(sbi);
		goto out;
	}
}

Is there a missing lock here, or do we simply not need nat_tree_lock in these situations?

And do you think the approach in this patch is appropriate for reducing nat_tree_lock
hold time?
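
To make the first question concrete, the interleaving I have in mind looks like this
(illustrative only; whether it can actually happen presumably depends on
block_operations() having already frozen node updates by this point in the checkpoint):

	/*
	 *   checkpoint path                    node update path
	 *   ---------------                    ----------------
	 *   nat_cnt[DIRTY_NAT] == 0 ?          f2fs_down_write(&nat_tree_lock)
	 *   (no nat_tree_lock held)            __set_nat_cache_dirty()
	 *   -> skip NAT flush                  nm_i->nat_cnt[DIRTY_NAT]++
	 *                                      f2fs_up_write(&nat_tree_lock)
	 */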
