lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20141001150651.GA24084@jaegeuk-mac02.hsd1.ca.comcast.net>
Date:	Wed, 1 Oct 2014 08:06:51 -0700
From:	Jaegeuk Kim <jaegeuk@...nel.org>
To:	Chao Yu <chao2.yu@...sung.com>
Cc:	linux-kernel@...r.kernel.org,
	linux-f2fs-devel@...ts.sourceforge.net
Subject: Re: [f2fs-dev] [PATCH 3/3] f2fs: refactor flush_nat_entries to
 remove costly reorganizing ops

On Tue, Sep 30, 2014 at 02:04:20PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@...nel.org]
> > Sent: Tuesday, September 23, 2014 12:53 PM
> > To: linux-kernel@...r.kernel.org; linux-fsdevel@...r.kernel.org;
> > linux-f2fs-devel@...ts.sourceforge.net
> > Cc: Jaegeuk Kim
> > Subject: [f2fs-dev] [PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing
> > ops
> > 
> > Previously, f2fs tries to reorganize the dirty nat entries into multiple sets
> > according to its nid ranges. This can improve the flushing nat pages, however,
> > if there are a lot of cached nat entries, it becomes a bottleneck.
> > 
> > This patch introduces a new set management flow by removing dirty nat list and
> > adding a series of set operations when the nat entry becomes dirty.
> > 
> > Signed-off-by: Jaegeuk Kim <jaegeuk@...nel.org>
> > ---
> >  fs/f2fs/f2fs.h |  13 +--
> >  fs/f2fs/node.c | 299 +++++++++++++++++++++++++++++----------------------------
> >  fs/f2fs/node.h |   9 +-
> >  3 files changed, 162 insertions(+), 159 deletions(-)
> > 
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 7b1e1d2..94cfdc4 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -164,6 +164,9 @@ struct fsync_inode_entry {
> >  #define sit_in_journal(sum, i)		(sum->sit_j.entries[i].se)
> >  #define segno_in_journal(sum, i)	(sum->sit_j.entries[i].segno)
> > 
> > +#define MAX_NAT_JENTRIES(sum)	(NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
> > +#define MAX_SIT_JENTRIES(sum)	(SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
> > +
> >  static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
> >  {
> >  	int before = nats_in_cursum(rs);
> > @@ -182,9 +185,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int
> > size,
> >  								int type)
> >  {
> >  	if (type == NAT_JOURNAL)
> > -		return nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES;
> > -
> > -	return sits_in_cursum(sum) + size <= SIT_JOURNAL_ENTRIES;
> > +		return size <= MAX_NAT_JENTRIES(sum);
> > +	return size <= MAX_SIT_JENTRIES(sum);
> >  }
> > 
> >  /*
> > @@ -292,11 +294,10 @@ struct f2fs_nm_info {
> > 
> >  	/* NAT cache management */
> >  	struct radix_tree_root nat_root;/* root of the nat entry cache */
> > +	struct radix_tree_root nat_set_root;/* root of the nat set cache */
> >  	rwlock_t nat_tree_lock;		/* protect nat_tree_lock */
> > -	unsigned int nat_cnt;		/* the # of cached nat entries */
> >  	struct list_head nat_entries;	/* cached nat entry list (clean) */
> > -	struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
> > -	struct list_head nat_entry_set;	/* nat entry set list */
> > +	unsigned int nat_cnt;		/* the # of cached nat entries */
> >  	unsigned int dirty_nat_cnt;	/* total num of nat entries in set */
> > 
> >  	/* free node ids management */
> > diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> > index 21ed91b..f5a21f4 100644
> > --- a/fs/f2fs/node.c
> > +++ b/fs/f2fs/node.c
> > @@ -123,6 +123,57 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct
> > nat_entry *e)
> >  	kmem_cache_free(nat_entry_slab, e);
> >  }
> > 
> > +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> > +						struct nat_entry *ne)
> > +{
> > +	nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
> 
> nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
> 
> > +	struct nat_entry_set *head;
> > +
> > +	if (get_nat_flag(ne, IS_DIRTY))
> > +		return;
> > +retry:
> > +	head = radix_tree_lookup(&nm_i->nat_set_root, set);
> > +	if (!head) {
> > +		head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
> > +
> > +		INIT_LIST_HEAD(&head->entry_list);
> > +		INIT_LIST_HEAD(&head->set_list);
> > +		head->set = set;
> > +		head->entry_cnt = 0;
> > +
> > +		if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
> > +			cond_resched();
> > +			goto retry;
> > +		}
> > +	}
> > +	list_move_tail(&ne->list, &head->entry_list);
> > +	nm_i->dirty_nat_cnt++;
> > +	head->entry_cnt++;
> > +	set_nat_flag(ne, IS_DIRTY, true);
> > +}
> > +
> > +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> > +						struct nat_entry *ne)
> > +{
> > +	nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
> > +	struct nat_entry_set *head;
> > +
> > +	head = radix_tree_lookup(&nm_i->nat_set_root, set);
> > +	if (head) {
> > +		list_move_tail(&ne->list, &nm_i->nat_entries);
> > +		set_nat_flag(ne, IS_DIRTY, false);
> > +		head->entry_cnt--;
> > +		nm_i->dirty_nat_cnt--;
> > +	}
> > +}
> > +
> > +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
> > +		nid_t start, unsigned int nr, struct nat_entry_set **ep)
> > +{
> > +	return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
> > +							start, nr);
> > +}
> > +
> >  bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
> >  {
> >  	struct f2fs_nm_info *nm_i = NM_I(sbi);
> > @@ -1739,79 +1790,6 @@ skip:
> >  	return err;
> >  }
> > 
> > -static struct nat_entry_set *grab_nat_entry_set(void)
> > -{
> > -	struct nat_entry_set *nes =
> > -			f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
> > -
> > -	nes->entry_cnt = 0;
> > -	INIT_LIST_HEAD(&nes->set_list);
> > -	INIT_LIST_HEAD(&nes->entry_list);
> > -	return nes;
> > -}
> > -
> > -static void release_nat_entry_set(struct nat_entry_set *nes,
> > -						struct f2fs_nm_info *nm_i)
> > -{
> > -	nm_i->dirty_nat_cnt -= nes->entry_cnt;
> > -	list_del(&nes->set_list);
> > -	kmem_cache_free(nat_entry_set_slab, nes);
> > -}
> > -
> > -static void adjust_nat_entry_set(struct nat_entry_set *nes,
> > -						struct list_head *head)
> > -{
> > -	struct nat_entry_set *next = nes;
> > -
> > -	if (list_is_last(&nes->set_list, head))
> > -		return;
> > -
> > -	list_for_each_entry_continue(next, head, set_list)
> > -		if (nes->entry_cnt <= next->entry_cnt)
> > -			break;
> > -
> > -	list_move_tail(&nes->set_list, &next->set_list);
> > -}
> > -
> > -static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
> > -{
> > -	struct nat_entry_set *nes;
> > -	nid_t start_nid = START_NID(ne->ni.nid);
> > -
> > -	list_for_each_entry(nes, head, set_list) {
> > -		if (nes->start_nid == start_nid) {
> > -			list_move_tail(&ne->list, &nes->entry_list);
> > -			nes->entry_cnt++;
> > -			adjust_nat_entry_set(nes, head);
> > -			return;
> > -		}
> > -	}
> > -
> > -	nes = grab_nat_entry_set();
> > -
> > -	nes->start_nid = start_nid;
> > -	list_move_tail(&ne->list, &nes->entry_list);
> > -	nes->entry_cnt++;
> > -	list_add(&nes->set_list, head);
> > -}
> > -
> > -static void merge_nats_in_set(struct f2fs_sb_info *sbi)
> > -{
> > -	struct f2fs_nm_info *nm_i = NM_I(sbi);
> > -	struct list_head *dirty_list = &nm_i->dirty_nat_entries;
> > -	struct list_head *set_list = &nm_i->nat_entry_set;
> > -	struct nat_entry *ne, *tmp;
> > -
> > -	write_lock(&nm_i->nat_tree_lock);
> > -	list_for_each_entry_safe(ne, tmp, dirty_list, list) {
> > -		if (nat_get_blkaddr(ne) == NEW_ADDR)
> > -			continue;
> 
> Shouldn't we move this condition judgment into __flush_nat_entry_set?
> 
> > -		add_nat_entry(ne, set_list);
> > -		nm_i->dirty_nat_cnt++;
> > -	}
> > -	write_unlock(&nm_i->nat_tree_lock);
> > -}
> > -
> >  static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
> >  {
> >  	struct f2fs_nm_info *nm_i = NM_I(sbi);
> > @@ -1846,101 +1824,129 @@ found:
> >  	mutex_unlock(&curseg->curseg_mutex);
> >  }
> > 
> > -/*
> > - * This function is called during the checkpointing process.
> > - */
> > -void flush_nat_entries(struct f2fs_sb_info *sbi)
> > +static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> > +						struct list_head *head, int max)
> >  {
> > -	struct f2fs_nm_info *nm_i = NM_I(sbi);
> > -	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> > -	struct f2fs_summary_block *sum = curseg->sum_blk;
> > -	struct nat_entry_set *nes, *tmp;
> > -	struct list_head *head = &nm_i->nat_entry_set;
> > -	bool to_journal = true;
> > +	struct nat_entry_set *cur;
> > +	nid_t dirty_cnt = 0;
> > 
> > -	/* merge nat entries of dirty list to nat entry set temporarily */
> > -	merge_nats_in_set(sbi);
> > +	if (nes->entry_cnt >= max)
> > +		goto add_out;
> > 
> > -	/*
> > -	 * if there are no enough space in journal to store dirty nat
> > -	 * entries, remove all entries from journal and merge them
> > -	 * into nat entry set.
> > -	 */
> > -	if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) {
> > -		remove_nats_in_journal(sbi);
> > -
> > -		/*
> > -		 * merge nat entries of dirty list to nat entry set temporarily
> > -		 */
> > -		merge_nats_in_set(sbi);
> > +	list_for_each_entry(cur, head, set_list) {
> > +		dirty_cnt += cur->entry_cnt;
> > +		if (dirty_cnt > max)
> > +			break;
> 
> Seems a problem here, For example:
> set no:		1	2	3	4	5	6	7
> nat number:	4	3	2	1	1	1	1
> max = 6
> set list will be: 3--4--2--1--1--1--1
> Then we will prior flush nat set with more entries in the set list, result in
> a degradation.
> We'd better not break the adjustment until we finish to traverse the whole
> sequential sorted set list to avoid above condition, so how about removing the
> codes relate to dirty_cnt?

Hi Chao,

Agreed.
I fixed that.
Thanks,

> 
> Thanks,
> Yu
> 
> > +		if (cur->entry_cnt >= nes->entry_cnt) {
> > +			list_add(&nes->set_list, cur->set_list.prev);
> > +			return;
> > +		}
> >  	}
> > +add_out:
> > +	list_add_tail(&nes->set_list, head);
> > +}
> > 
> > -	if (!nm_i->dirty_nat_cnt)
> > -		return;
> > +static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> > +					struct nat_entry_set *set)
> > +{
> > +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> > +	struct f2fs_summary_block *sum = curseg->sum_blk;
> > +	nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
> > +	bool to_journal = true;
> > +	struct f2fs_nat_block *nat_blk;
> > +	struct nat_entry *ne, *cur;
> > +	struct page *page = NULL;
> > 
> >  	/*
> >  	 * there are two steps to flush nat entries:
> >  	 * #1, flush nat entries to journal in current hot data summary block.
> >  	 * #2, flush nat entries to nat page.
> >  	 */
> > -	list_for_each_entry_safe(nes, tmp, head, set_list) {
> > -		struct f2fs_nat_block *nat_blk;
> > -		struct nat_entry *ne, *cur;
> > -		struct page *page;
> > -		nid_t start_nid = nes->start_nid;
> > +	if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
> > +		to_journal = false;
> > 
> > -		if (to_journal &&
> > -			!__has_cursum_space(sum, nes->entry_cnt, NAT_JOURNAL))
> > -			to_journal = false;
> > +	if (to_journal) {
> > +		mutex_lock(&curseg->curseg_mutex);
> > +	} else {
> > +		page = get_next_nat_page(sbi, start_nid);
> > +		nat_blk = page_address(page);
> > +		f2fs_bug_on(sbi, !nat_blk);
> > +	}
> > +
> > +	/* flush dirty nats in nat entry set */
> > +	list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
> > +		struct f2fs_nat_entry *raw_ne;
> > +		nid_t nid = nat_get_nid(ne);
> > +		int offset;
> > 
> >  		if (to_journal) {
> > -			mutex_lock(&curseg->curseg_mutex);
> > +			offset = lookup_journal_in_cursum(sum,
> > +							NAT_JOURNAL, nid, 1);
> > +			f2fs_bug_on(sbi, offset < 0);
> > +			raw_ne = &nat_in_journal(sum, offset);
> > +			nid_in_journal(sum, offset) = cpu_to_le32(nid);
> >  		} else {
> > -			page = get_next_nat_page(sbi, start_nid);
> > -			nat_blk = page_address(page);
> > -			f2fs_bug_on(sbi, !nat_blk);
> > +			raw_ne = &nat_blk->entries[nid - start_nid];
> >  		}
> > +		raw_nat_from_node_info(raw_ne, &ne->ni);
> > 
> > -		/* flush dirty nats in nat entry set */
> > -		list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
> > -			struct f2fs_nat_entry *raw_ne;
> > -			nid_t nid = nat_get_nid(ne);
> > -			int offset;
> > +		write_lock(&NM_I(sbi)->nat_tree_lock);
> > +		nat_reset_flag(ne);
> > +		__clear_nat_cache_dirty(NM_I(sbi), ne);
> > +		write_unlock(&NM_I(sbi)->nat_tree_lock);
> > 
> > -			if (to_journal) {
> > -				offset = lookup_journal_in_cursum(sum,
> > -							NAT_JOURNAL, nid, 1);
> > -				f2fs_bug_on(sbi, offset < 0);
> > -				raw_ne = &nat_in_journal(sum, offset);
> > -				nid_in_journal(sum, offset) = cpu_to_le32(nid);
> > -			} else {
> > -				raw_ne = &nat_blk->entries[nid - start_nid];
> > -			}
> > -			raw_nat_from_node_info(raw_ne, &ne->ni);
> > +		if (nat_get_blkaddr(ne) == NULL_ADDR)
> > +			add_free_nid(sbi, nid, false);
> > +	}
> > 
> > -			if (nat_get_blkaddr(ne) == NULL_ADDR &&
> > -				add_free_nid(sbi, nid, false) <= 0) {
> > -				write_lock(&nm_i->nat_tree_lock);
> > -				__del_from_nat_cache(nm_i, ne);
> > -				write_unlock(&nm_i->nat_tree_lock);
> > -			} else {
> > -				write_lock(&nm_i->nat_tree_lock);
> > -				nat_reset_flag(ne);
> > -				__clear_nat_cache_dirty(nm_i, ne);
> > -				write_unlock(&nm_i->nat_tree_lock);
> > -			}
> > -		}
> > +	if (to_journal)
> > +		mutex_unlock(&curseg->curseg_mutex);
> > +	else
> > +		f2fs_put_page(page, 1);
> > 
> > -		if (to_journal)
> > -			mutex_unlock(&curseg->curseg_mutex);
> > -		else
> > -			f2fs_put_page(page, 1);
> > +	f2fs_bug_on(sbi, set->entry_cnt);
> > +	radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
> > +	kmem_cache_free(nat_entry_set_slab, set);
> > +}
> > 
> > -		f2fs_bug_on(sbi, !list_empty(&nes->entry_list));
> > -		release_nat_entry_set(nes, nm_i);
> > +/*
> > + * This function is called during the checkpointing process.
> > + */
> > +void flush_nat_entries(struct f2fs_sb_info *sbi)
> > +{
> > +	struct f2fs_nm_info *nm_i = NM_I(sbi);
> > +	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> > +	struct f2fs_summary_block *sum = curseg->sum_blk;
> > +	struct nat_entry_set *setvec[NATVEC_SIZE];
> > +	struct nat_entry_set *set, *tmp;
> > +	unsigned int found;
> > +	nid_t set_idx = 0;
> > +	LIST_HEAD(sets);
> > +
> > +	/*
> > +	 * if there are no enough space in journal to store dirty nat
> > +	 * entries, remove all entries from journal and merge them
> > +	 * into nat entry set.
> > +	 */
> > +	if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
> > +		remove_nats_in_journal(sbi);
> > +
> > +	if (!nm_i->dirty_nat_cnt)
> > +		return;
> > +
> > +	while ((found = __gang_lookup_nat_set(nm_i,
> > +					set_idx, NATVEC_SIZE, setvec))) {
> > +		unsigned idx;
> > +		set_idx = setvec[found - 1]->set + 1;
> > +		for (idx = 0; idx < found; idx++)
> > +			__adjust_nat_entry_set(setvec[idx], &sets,
> > +							MAX_NAT_JENTRIES(sum));
> >  	}
> > 
> > -	f2fs_bug_on(sbi, !list_empty(head));
> > +	/* flush dirty nats in nat entry set */
> > +	list_for_each_entry_safe(set, tmp, &sets, set_list)
> > +		__flush_nat_entry_set(sbi, set);
> > +
> >  	f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
> >  }
> > 
> > @@ -1968,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> >  	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
> >  	INIT_LIST_HEAD(&nm_i->free_nid_list);
> >  	INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
> > +	INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
> >  	INIT_LIST_HEAD(&nm_i->nat_entries);
> > -	INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
> > -	INIT_LIST_HEAD(&nm_i->nat_entry_set);
> > 
> >  	mutex_init(&nm_i->build_lock);
> >  	spin_lock_init(&nm_i->free_nid_list_lock);
> > diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> > index b8ba63c..bd826d9 100644
> > --- a/fs/f2fs/node.h
> > +++ b/fs/f2fs/node.h
> > @@ -43,6 +43,7 @@ enum {
> >  	IS_CHECKPOINTED,	/* is it checkpointed before? */
> >  	HAS_FSYNCED_INODE,	/* is the inode fsynced before? */
> >  	HAS_LAST_FSYNC,		/* has the latest node fsync mark? */
> > +	IS_DIRTY,		/* this nat entry is dirty? */
> >  };
> > 
> >  struct nat_entry {
> > @@ -60,10 +61,6 @@ struct nat_entry {
> >  #define nat_get_version(nat)		(nat->ni.version)
> >  #define nat_set_version(nat, v)		(nat->ni.version = v)
> > 
> > -#define __set_nat_cache_dirty(nm_i, ne)					\
> > -		list_move_tail(&ne->list, &nm_i->dirty_nat_entries);
> > -#define __clear_nat_cache_dirty(nm_i, ne)				\
> > -		list_move_tail(&ne->list, &nm_i->nat_entries);
> >  #define inc_node_version(version)	(++version)
> > 
> >  static inline void set_nat_flag(struct nat_entry *ne,
> > @@ -113,9 +110,9 @@ enum mem_type {
> >  };
> > 
> >  struct nat_entry_set {
> > -	struct list_head set_list;	/* link with all nat sets */
> > +	struct list_head set_list;	/* link with other nat sets */
> >  	struct list_head entry_list;	/* link with dirty nat entries */
> > -	nid_t start_nid;		/* start nid of nats in set */
> > +	nid_t set;			/* set number*/
> >  	unsigned int entry_cnt;		/* the # of nat entries in set */
> >  };
> > 
> > --
> > 1.8.5.2 (Apple Git-48)
> > 
> > 
> > ------------------------------------------------------------------------------
> > Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
> > Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
> > Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
> > Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
> > http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
> > _______________________________________________
> > Linux-f2fs-devel mailing list
> > Linux-f2fs-devel@...ts.sourceforge.net
> > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ