lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9c15f4dc-37ab-d94d-c62e-0aded8ea848f@huawei.com>
Date:   Sat, 25 Feb 2017 11:26:10 +0800
From:   Chao Yu <yuchao0@...wei.com>
To:     Jaegeuk Kim <jaegeuk@...nel.org>
CC:     <linux-kernel@...r.kernel.org>, <linux-fsdevel@...r.kernel.org>,
        <linux-f2fs-devel@...ts.sourceforge.net>
Subject: Re: [f2fs-dev] [PATCH 2/3] f2fs: add bitmaps for empty or full NAT
 blocks

On 2017/2/24 6:54, Jaegeuk Kim wrote:
> On 02/23, Chao Yu wrote:
>> On 2017/2/14 10:06, Jaegeuk Kim wrote:
>>> This patch adds bitmaps to represent empty or full NAT blocks containing
>>> free nid entries.
>>>
>>> If we can find valid crc|cp_ver in the last block of checkpoint pack, we'll
>>> use these bitmaps when building free nids. In order to avoid checkpointing
>>> burden, up-to-date bitmaps will be flushed only during umount time. So,
>>> normally we can get this gain, but when power-cut happens, we rely on fsck.f2fs
>>> which recovers this bitmap again.
>>>
>>> After this patch, we build free nids from nid #0 at mount time to make more
>>> full NAT blocks, but in runtime, we check empty NAT blocks to load free nids
>>> without loading any NAT pages from disk.
>>>
>>> Signed-off-by: Jaegeuk Kim <jaegeuk@...nel.org>
>>> ---
>>>  fs/f2fs/checkpoint.c    |  29 +++++++-
>>>  fs/f2fs/debug.c         |   1 +
>>>  fs/f2fs/f2fs.h          |  23 +++++-
>>>  fs/f2fs/node.c          | 188 +++++++++++++++++++++++++++++++++++++++++++-----
>>>  fs/f2fs/segment.c       |   2 +-
>>>  include/linux/f2fs_fs.h |   1 +
>>>  6 files changed, 224 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
>>> index 042f8d9afe44..783c5c3f16a4 100644
>>> --- a/fs/f2fs/checkpoint.c
>>> +++ b/fs/f2fs/checkpoint.c
>>> @@ -1024,6 +1024,10 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  
>>>  	spin_lock(&sbi->cp_lock);
>>>  
>>> +	if (ckpt->cp_pack_total_block_count >
>>> +			sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
>>> +		disable_nat_bits(sbi, false);
>>
>> I think we need to drop the nat full/empty bitmaps only if there is not enough
>> space in the CP area while doing umount; otherwise we can keep them in memory.
> 
> Yup.
> 
>>
>>> +
>>>  	if (cpc->reason == CP_UMOUNT)
>>>  		__set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
>>>  	else
>>> @@ -1136,6 +1140,29 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  
>>>  	start_blk = __start_cp_next_addr(sbi);
>>>  
>>> +	/* write nat bits */
> 
> ...
> 
>>> +static int scan_nat_bits(struct f2fs_sb_info *sbi)
>>> +{
>>> +	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>> +	struct page *page;
>>> +	unsigned int i = 0;
>>> +	nid_t target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK;
>>> +	nid_t nid;
>>> +
>>> +	if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
>>> +		return -EAGAIN;
>>> +
>>> +	down_read(&nm_i->nat_tree_lock);
>>> +check_empty:
>>> +	i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
>>> +	if (i >= nm_i->nat_blocks) {
>>> +		i = 0;
>>> +		goto check_partial;
>>> +	}
>>> +
>>> +	for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK;
>>> +									nid++) {
>>> +		if (unlikely(nid >= nm_i->max_nid))
>>> +			break;
>>> +		add_free_nid(sbi, nid, true);
>>> +	}
>>> +
>>> +	if (nm_i->nid_cnt[FREE_NID_LIST] >= target)
>>> +		goto out;
>>> +	i++;
>>> +	goto check_empty;
>>> +
>>> +check_partial:
>>> +	i = find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
>>> +	if (i >= nm_i->nat_blocks) {
>>> +		disable_nat_bits(sbi, true);
>>
>> Can this happen in the real world? Shouldn't it be a bug somewhere?
> 
> It happens, since current design handles full_nat_bits optionally in order
> to avoid scanning a whole NAT page to set it back as 1 from 0.

A 0 value in full_nat_bits means the NAT block has at least one free nid,
right? So if we traverse all such NAT blocks and still haven't collected the
*target* number of free nids, we don't have to disable the nat bits cache; we
can just break out here.

Thanks,

> 
>>
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	nid = i * NAT_ENTRY_PER_BLOCK;
>>> +	page = get_current_nat_page(sbi, nid);
>>> +	scan_nat_page(sbi, page, nid);
>>> +	f2fs_put_page(page, 1);
>>> +
>>> +	if (nm_i->nid_cnt[FREE_NID_LIST] < target) {
>>> +		i++;
>>> +		goto check_partial;
>>> +	}
>>> +out:
>>> +	up_read(&nm_i->nat_tree_lock);
>>> +	return 0;
>>> +}
>>> +
>>> +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
>>>  {
>>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>>  	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>>> @@ -1854,6 +1911,20 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
>>>  	if (!sync && !available_free_memory(sbi, FREE_NIDS))
>>>  		return;
>>>  
>>> +	/* try to find free nids with nat_bits */
>>> +	if (!mount && !scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST])
>>> +		return;
>>> +
>>> +	/* find next valid candidate */
>>
>> This is just for mount case?
> 
> Yup, it reuses free nids in dirty NAT blocks, so that we can make them as full
> NAT pages.
> 
> Thanks,
> 
>>
>>> +	if (is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
>>> +		int idx = find_next_zero_bit_le(nm_i->full_nat_bits,
>>> +					nm_i->nat_blocks, 0);
>>> +		if (idx >= nm_i->nat_blocks)
>>> +			set_sbi_flag(sbi, SBI_NEED_FSCK);
>>> +		else
>>> +			nid = idx * NAT_ENTRY_PER_BLOCK;
>>> +	}
>>> +
>>>  	/* readahead nat pages to be scanned */
>>>  	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
>>>  							META_NAT, true);
>>> @@ -1896,10 +1967,10 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
>>>  					nm_i->ra_nid_pages, META_NAT, false);
>>>  }
>>>  
>>> -void build_free_nids(struct f2fs_sb_info *sbi, bool sync)
>>> +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
>>>  {
>>>  	mutex_lock(&NM_I(sbi)->build_lock);
>>> -	__build_free_nids(sbi, sync);
>>> +	__build_free_nids(sbi, sync, mount);
>>>  	mutex_unlock(&NM_I(sbi)->build_lock);
>>>  }
>>>  
>>> @@ -1941,7 +2012,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
>>>  	spin_unlock(&nm_i->nid_list_lock);
>>>  
>>>  	/* Let's scan nat pages and its caches to get free nids */
>>> -	build_free_nids(sbi, true);
>>> +	build_free_nids(sbi, true, false);
>>>  	goto retry;
>>>  }
>>>  
>>> @@ -2233,8 +2304,39 @@ static void __adjust_nat_entry_set(struct nat_entry_set *nes,
>>>  	list_add_tail(&nes->set_list, head);
>>>  }
>>>  
>>> +void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
>>> +						struct page *page)
>>> +{
>>> +	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>> +	unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
>>> +	struct f2fs_nat_block *nat_blk = page_address(page);
>>> +	int valid = 0;
>>> +	int i;
>>> +
>>> +	if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
>>> +		return;
>>> +
>>> +	for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) {
>>> +		if (start_nid == 0 && i == 0)
>>> +			valid++;
>>> +		if (nat_blk->entries[i].block_addr)
>>> +			valid++;
>>> +	}
>>> +	if (valid == 0) {
>>> +		test_and_set_bit_le(nat_index, nm_i->empty_nat_bits);
>>> +		test_and_clear_bit_le(nat_index, nm_i->full_nat_bits);
>>
>> set_bit_le/clear_bit_le
>>
>>> +		return;
>>> +	}
>>> +
>>> +	test_and_clear_bit_le(nat_index, nm_i->empty_nat_bits);
>>
>> ditto
>>
>>> +	if (valid == NAT_ENTRY_PER_BLOCK)
>>> +		test_and_set_bit_le(nat_index, nm_i->full_nat_bits);
>>> +	else
>>> +		test_and_clear_bit_le(nat_index, nm_i->full_nat_bits);
>>
>> ditto
>>
>>> +}
>>> +
>>>  static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>>> -					struct nat_entry_set *set)
>>> +		struct nat_entry_set *set, struct cp_control *cpc)
>>>  {
>>>  	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>>>  	struct f2fs_journal *journal = curseg->journal;
>>> @@ -2249,7 +2351,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>>>  	 * #1, flush nat entries to journal in current hot data summary block.
>>>  	 * #2, flush nat entries to nat page.
>>>  	 */
>>> -	if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
>>> +	if (cpc->reason == CP_UMOUNT ||
>>
>> if ((cpc->reason == CP_UMOUNT && is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) ||
>>
>>> +		!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
>>>  		to_journal = false;
>>>  
>>>  	if (to_journal) {
>>> @@ -2289,10 +2392,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>>>  		}
>>>  	}
>>>  
>>> -	if (to_journal)
>>> +	if (to_journal) {
>>>  		up_write(&curseg->journal_rwsem);
>>> -	else
>>> +	} else {
>>> +		__update_nat_bits(sbi, start_nid, page);
>>>  		f2fs_put_page(page, 1);
>>> +	}
>>>  
>>>  	f2fs_bug_on(sbi, set->entry_cnt);
>>>  
>>> @@ -2303,7 +2408,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>>>  /*
>>>   * This function is called during the checkpointing process.
>>>   */
>>> -void flush_nat_entries(struct f2fs_sb_info *sbi)
>>> +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  {
>>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>>  	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>>> @@ -2324,7 +2429,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
>>>  	 * entries, remove all entries from journal and merge them
>>>  	 * into nat entry set.
>>>  	 */
>>> -	if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
>>> +	if (cpc->reason == CP_UMOUNT ||
>>
>> if ((cpc->reason == CP_UMOUNT && is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) ||
>>
>>> +		!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
>>>  		remove_nats_in_journal(sbi);
>>>  
>>>  	while ((found = __gang_lookup_nat_set(nm_i,
>>> @@ -2338,27 +2444,72 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
>>>  
>>>  	/* flush dirty nats in nat entry set */
>>>  	list_for_each_entry_safe(set, tmp, &sets, set_list)
>>> -		__flush_nat_entry_set(sbi, set);
>>> +		__flush_nat_entry_set(sbi, set, cpc);
>>>  
>>>  	up_write(&nm_i->nat_tree_lock);
>>>  
>>>  	f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
>>>  }
>>>  
>>> +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
>>> +{
>>> +	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
>>> +	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>> +	unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
>>> +	unsigned int i;
>>> +	__u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver);
>>
>> __u64 cp_ver = cur_cp_version(ckpt);
>>
>> Thanks,
>>
>>> +	size_t crc_offset = le32_to_cpu(ckpt->checksum_offset);
>>> +	__u64 crc = le32_to_cpu(*((__le32 *)
>>> +				((unsigned char *)ckpt + crc_offset)));
>>> +	block_t nat_bits_addr;
>>> +
>>> +	if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
>>> +		return 0;
>>> +
>>> +	nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 +
>>> +						F2FS_BLKSIZE - 1);
>>> +	nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS,
>>> +						GFP_KERNEL);
>>> +	if (!nm_i->nat_bits)
>>> +		return -ENOMEM;
>>> +
>>> +	nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
>>> +						nm_i->nat_bits_blocks;
>>> +	for (i = 0; i < nm_i->nat_bits_blocks; i++) {
>>> +		struct page *page = get_meta_page(sbi, nat_bits_addr++);
>>> +
>>> +		memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
>>> +					page_address(page), F2FS_BLKSIZE);
>>> +		f2fs_put_page(page, 1);
>>> +	}
>>> +
>>> +	cp_ver |= (crc << 32);
>>> +	if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
>>> +		disable_nat_bits(sbi, true);
>>> +		return 0;
>>> +	}
>>> +
>>> +	nm_i->full_nat_bits = nm_i->nat_bits + 8;
>>> +	nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
>>> +
>>> +	f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint");
>>> +	return 0;
>>> +}
>>> +
>>>  static int init_node_manager(struct f2fs_sb_info *sbi)
>>>  {
>>>  	struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
>>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>>  	unsigned char *version_bitmap;
>>> -	unsigned int nat_segs, nat_blocks;
>>> +	unsigned int nat_segs;
>>> +	int err;
>>>  
>>>  	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
>>>  
>>>  	/* segment_count_nat includes pair segment so divide to 2. */
>>>  	nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
>>> -	nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
>>> -
>>> -	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
>>> +	nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
>>> +	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
>>>  
>>>  	/* not used nids: 0, node, meta, (and root counted as valid node) */
>>>  	nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
>>> @@ -2392,6 +2543,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>>>  	if (!nm_i->nat_bitmap)
>>>  		return -ENOMEM;
>>>  
>>> +	err = __get_nat_bitmaps(sbi);
>>> +	if (err)
>>> +		return err;
>>> +
>>>  #ifdef CONFIG_F2FS_CHECK_FS
>>>  	nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
>>>  					GFP_KERNEL);
>>> @@ -2414,7 +2569,7 @@ int build_node_manager(struct f2fs_sb_info *sbi)
>>>  	if (err)
>>>  		return err;
>>>  
>>> -	build_free_nids(sbi, true);
>>> +	build_free_nids(sbi, true, true);
>>>  	return 0;
>>>  }
>>>  
>>> @@ -2473,6 +2628,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
>>>  	up_write(&nm_i->nat_tree_lock);
>>>  
>>>  	kfree(nm_i->nat_bitmap);
>>> +	kfree(nm_i->nat_bits);
>>>  #ifdef CONFIG_F2FS_CHECK_FS
>>>  	kfree(nm_i->nat_bitmap_mir);
>>>  #endif
>>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>>> index df2ff5cfe8f4..8e1ec248c653 100644
>>> --- a/fs/f2fs/segment.c
>>> +++ b/fs/f2fs/segment.c
>>> @@ -386,7 +386,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
>>>  	if (!available_free_memory(sbi, FREE_NIDS))
>>>  		try_to_free_nids(sbi, MAX_FREE_NIDS);
>>>  	else
>>> -		build_free_nids(sbi, false);
>>> +		build_free_nids(sbi, false, false);
>>>  
>>>  	if (!is_idle(sbi))
>>>  		return;
>>> diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
>>> index f0748524ca8c..1c92ace2e8f8 100644
>>> --- a/include/linux/f2fs_fs.h
>>> +++ b/include/linux/f2fs_fs.h
>>> @@ -114,6 +114,7 @@ struct f2fs_super_block {
>>>  /*
>>>   * For checkpoint
>>>   */
>>> +#define CP_NAT_BITS_FLAG	0x00000080
>>>  #define CP_CRC_RECOVERY_FLAG	0x00000040
>>>  #define CP_FASTBOOT_FLAG	0x00000020
>>>  #define CP_FSCK_FLAG		0x00000010
>>>
>>
>>
>> ------------------------------------------------------------------------------
>> Check out the vibrant tech community on one of the world's most
>> engaging tech sites, SlashDot.org! http://sdm.link/slashdot
>> _______________________________________________
>> Linux-f2fs-devel mailing list
>> Linux-f2fs-devel@...ts.sourceforge.net
>> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> 
> .
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ