Message-ID: <20250826014401.2020583-1-wangzijie1@honor.com>
Date: Tue, 26 Aug 2025 09:44:01 +0800
From: wangzijie <wangzijie1@...or.com>
To: <linux-f2fs-devel@...ts.sourceforge.net>
CC: <chao@...nel.org>, <feng.han@...or.com>, <jaegeuk@...nel.org>,
	<linux-kernel@...r.kernel.org>, <wangzijie1@...or.com>
Subject: Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries

> On 8/13/25 12:04, wangzijie wrote:
> > I sometimes run into nat_tree_lock contention between f2fs_write_checkpoint
> > and f2fs_get_node_info. Commit a9419b6 ("f2fs: do not bother checkpoint by
> > f2fs_get_node_info") also mentions this situation.
> > 
> > My idea is that, when flushing nat entries, we can use some structure to record
> > the nat pages we may read, and readahead them before holding nat_tree_lock. Before
> > implementing the code, I did some survey and found an earlier submission in the community.
> > 
> > Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
> > Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
> > That patch aims to improve nat entry set sorting by using buckets.
> > I borrow that structure and readahead the nat pages containing nat entry sets which cannot
> > be moved to the journal, according to the dirty nat entry set buckets.
> > 
> > By doing this, I think there are two benefits that reduce nat_tree_lock hold time
> > when flushing nat entries.
> 
> Zijie,
> 
> Can you please provide some numbers for this patch? Something like
> checkpoint latency, or the average or worst-case time to grab nat_tree_lock?
> 
> > 1. avoid nat set tree lookup and sort
> > 2. readahead nat pages before holding nat_tree_lock
> 
> It may cause performance regression if it races w/ drop_caches?
> 
> Thanks,

Hi Chao,
Do you mean that it will race with f2fs_try_to_free_nats()?
In this patch, nat_tree_lock is not held while readaheading the nat pages, but
in fact we need it there. Am I right?
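
If so, something like the following is what I had in mind (untested, just to
illustrate the idea), assuming that taking nat_tree_lock in read mode around
the walk of nat_dirty_set[] is enough to close that window:

	/* readahead sets which cannot be moved to journal */
	if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
		/*
		 * Hold nat_tree_lock for reading so the dirty set lists
		 * cannot change while we walk them for readahead.
		 */
		f2fs_down_read(&nm_i->nat_tree_lock);
		for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
			list_for_each_entry(set, &nm_i->nat_dirty_set[i], set_list)
				f2fs_ra_meta_pages(sbi, set->set, 1,
								META_NAT, true);
		}
		f2fs_up_read(&nm_i->nat_tree_lock);
	}

	f2fs_down_write(&nm_i->nat_tree_lock);

The pages read ahead here could still be reclaimed again before we take the
write lock, but that should only cost us the readahead benefit, not correctness.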

> > 
> > Signed-off-by: wangzijie <wangzijie1@...or.com>
> > ---
> >  fs/f2fs/f2fs.h |  1 +
> >  fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
> >  fs/f2fs/node.h |  2 +-
> >  3 files changed, 35 insertions(+), 38 deletions(-)
> > 
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 46be75605..b27cc059f 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -975,6 +975,7 @@ struct f2fs_nm_info {
> >  	struct radix_tree_root nat_set_root;/* root of the nat set cache */
> >  	struct f2fs_rwsem nat_tree_lock;	/* protect nat entry tree */
> >  	struct list_head nat_entries;	/* cached nat entry list (clean) */
> > +	struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1];	/* store dirty nat set */
> >  	spinlock_t nat_list_lock;	/* protect clean nat entry list */
> >  	unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
> >  	unsigned int nat_blocks;	/* # of nat blocks */
> > diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> > index 27743b93e..87c975ee8 100644
> > --- a/fs/f2fs/node.c
> > +++ b/fs/f2fs/node.c
> > @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
> >  	__free_nat_entry(e);
> >  }
> >  
> > +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
> > +							struct nat_entry_set *set)
> > +{
> > +	list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
> > +}
> > +
> >  static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> >  							struct nat_entry *ne)
> >  {
> > @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> >  		head->set = set;
> >  		head->entry_cnt = 0;
> >  		f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
> > +		__relocate_nat_entry_set(nm_i, head);
> >  	}
> >  	return head;
> >  }
> > @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> >  	 * 2. update old block address to new one;
> >  	 */
> >  	if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
> > -				!get_nat_flag(ne, IS_DIRTY)))
> > +				!get_nat_flag(ne, IS_DIRTY))) {
> >  		head->entry_cnt++;
> > +		__relocate_nat_entry_set(nm_i, head);
> > +	}
> >  
> >  	set_nat_flag(ne, IS_PREALLOC, new_ne);
> >  
> > @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> >  
> >  	set_nat_flag(ne, IS_DIRTY, false);
> >  	set->entry_cnt--;
> > +	__relocate_nat_entry_set(nm_i, set);
> >  	nm_i->nat_cnt[DIRTY_NAT]--;
> >  	nm_i->nat_cnt[RECLAIMABLE_NAT]++;
> >  }
> > @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
> >  	up_write(&curseg->journal_rwsem);
> >  }
> >  
> > -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> > -						struct list_head *head, int max)
> > -{
> > -	struct nat_entry_set *cur;
> > -
> > -	if (nes->entry_cnt >= max)
> > -		goto add_out;
> > -
> > -	list_for_each_entry(cur, head, set_list) {
> > -		if (cur->entry_cnt >= nes->entry_cnt) {
> > -			list_add(&nes->set_list, cur->set_list.prev);
> > -			return;
> > -		}
> > -	}
> > -add_out:
> > -	list_add_tail(&nes->set_list, head);
> > -}
> > -
> >  static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
> >  		const struct f2fs_nat_block *nat_blk)
> >  {
> > @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> >  
> >  	/* Allow dirty nats by node block allocation in write_begin */
> >  	if (!set->entry_cnt) {
> > +		list_del(&set->set_list);
> >  		radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
> >  		kmem_cache_free(nat_entry_set_slab, set);
> >  	}
> > @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  	struct f2fs_nm_info *nm_i = NM_I(sbi);
> >  	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> >  	struct f2fs_journal *journal = curseg->journal;
> > -	struct nat_entry_set *setvec[NAT_VEC_SIZE];
> >  	struct nat_entry_set *set, *tmp;
> > -	unsigned int found;
> > -	nid_t set_idx = 0;
> > -	LIST_HEAD(sets);
> > +	int i;
> >  	int err = 0;
> >  
> >  	/*
> > @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  	if (!nm_i->nat_cnt[DIRTY_NAT])
> >  		return 0;
> >  
> > +	/* readahead sets which cannot be moved to journal */
> > +	if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
> > +		for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
> > +			list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> > +				f2fs_ra_meta_pages(sbi, set->set, 1,
> > +								META_NAT, true);
> > +			}
> > +		}
> > +	}
> > +
> >  	f2fs_down_write(&nm_i->nat_tree_lock);
> >  
> >  	/*
> > @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  			nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
> >  		remove_nats_in_journal(sbi);
> >  
> > -	while ((found = __gang_lookup_nat_set(nm_i,
> > -					set_idx, NAT_VEC_SIZE, setvec))) {
> > -		unsigned idx;
> > -
> > -		set_idx = setvec[found - 1]->set + 1;
> > -		for (idx = 0; idx < found; idx++)
> > -			__adjust_nat_entry_set(setvec[idx], &sets,
> > -						MAX_NAT_JENTRIES(journal));
> > -	}
> > -
> >  	/* flush dirty nats in nat entry set */
> > -	list_for_each_entry_safe(set, tmp, &sets, set_list) {
> > -		err = __flush_nat_entry_set(sbi, set, cpc);
> > -		if (err)
> > -			break;
> > +	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
> > +		list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> > +			err = __flush_nat_entry_set(sbi, set, cpc);
> > +			if (err)
> > +				break;
> > +		}
> >  	}
> >  
> >  	f2fs_up_write(&nm_i->nat_tree_lock);
> > @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> >  	struct f2fs_nm_info *nm_i = NM_I(sbi);
> >  	unsigned char *version_bitmap;
> >  	unsigned int nat_segs;
> > +	int i;
> >  	int err;
> >  
> >  	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
> > @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> >  	INIT_LIST_HEAD(&nm_i->nat_entries);
> >  	spin_lock_init(&nm_i->nat_list_lock);
> >  
> > +	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
> > +		INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
> > +
> >  	mutex_init(&nm_i->build_lock);
> >  	spin_lock_init(&nm_i->nid_list_lock);
> >  	init_f2fs_rwsem(&nm_i->nat_tree_lock);
> > diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> > index 030390543..d805d4ce7 100644
> > --- a/fs/f2fs/node.h
> > +++ b/fs/f2fs/node.h
> > @@ -158,7 +158,7 @@ enum mem_type {
> >  };
> >  
> >  struct nat_entry_set {
> > -	struct list_head set_list;	/* link with other nat sets */
> > +	struct list_head set_list;	/* link with nat sets which have same entry_cnt */
> >  	struct list_head entry_list;	/* link with dirty nat entries */
> >  	nid_t set;			/* set number*/
> >  	unsigned int entry_cnt;		/* the # of nat entries in set */

