lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <150174651782.104003.14493275027950415677.stgit@hn>
Date:   Thu, 03 Aug 2017 00:48:37 -0700
From:   Steven Swanson <swanson@....ucsd.edu>
To:     linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
        linux-nvdimm@...ts.01.org
Cc:     Steven Swanson <steven.swanson@...il.com>, dan.j.williams@...el.com
Subject: [RFC 04/16] NOVA: Inode operations and structures

Nova maintains per-CPU inode tables, and inode numbers are striped across the
tables (i.e., inos 0, n, 2n,... on cpu 0; inos 1, n + 1, 2n + 1, ... on cpu 1).

The inodes themselves live in a set of linked lists (one per CPU) of 2MB
blocks.  The last 8 bytes of each block point to the next block.  Pointers to
the heads of these lists live in PMEM block INODE_TABLE0_START and are
replicated in PMEM block INODE_TABLE1_START.  Additional space for inodes is
allocated on demand.

To allocate inodes, Nova maintains a per-CPU inuse_list in DRAM, which holds
an RB tree of ranges of in-use (allocated) inode numbers.

Signed-off-by: Steven Swanson <swanson@...ucsd.edu>
---
 fs/nova/inode.c | 1467 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nova/inode.h |  389 +++++++++++++++
 2 files changed, 1856 insertions(+)
 create mode 100644 fs/nova/inode.c
 create mode 100644 fs/nova/inode.h

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
new file mode 100644
index 000000000000..db001b7b5d4f
--- /dev/null
+++ b/fs/nova/inode.c
@@ -0,0 +1,1467 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode methods (allocate/free/read/write).
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix024@...ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli@...il.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/highuid.h>
+#include <linux/module.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/types.h>
+#include <linux/ratelimit.h>
+#include "nova.h"
+#include "inode.h"
+
+unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
+uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x200000, 0x40000000};
+
+int nova_init_inode_inuse_list(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_range_node *range_node;
+	struct inode_map *inode_map;
+	unsigned long range_high;
+	int i;
+	int ret;
+
+	sbi->s_inodes_used_count = NOVA_NORMAL_INODE_START;
+
+	range_high = NOVA_NORMAL_INODE_START / sbi->cpus;
+	if (NOVA_NORMAL_INODE_START % sbi->cpus)
+		range_high++;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		inode_map = &sbi->inode_maps[i];
+		range_node = nova_alloc_inode_node(sb);
+		if (range_node == NULL)
+			/* FIXME: free allocated memories */
+			return -ENOMEM;
+
+		range_node->range_low = 0;
+		range_node->range_high = range_high;
+		nova_update_range_node_checksum(range_node);
+		ret = nova_insert_inodetree(sbi, range_node, i);
+		if (ret) {
+			nova_err(sb, "%s failed\n", __func__);
+			nova_free_inode_node(sb, range_node);
+			return ret;
+		}
+		inode_map->num_range_node_inode = 1;
+		inode_map->first_inode_range = range_node;
+	}
+
+	return 0;
+}
+
+/* Allocate one block for every per-CPU inode table of @version and store its
+ * offset in the table's log_head.  Returns 0, -EINVAL or -ENOSPC.
+ */
+static int nova_alloc_inode_table(struct super_block *sb,
+	struct nova_inode_info_header *sih, int version)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_table *inode_table;
+	unsigned long blocknr;
+	u64 block;
+	int allocated, i;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		inode_table = nova_get_inode_table(sb, version, i);
+		if (!inode_table)
+			return -EINVAL;
+
+		/* Allocate replicate inodes from tail */
+		allocated = nova_new_log_blocks(sb, sih, &blocknr, 1,
+				ALLOC_INIT_ZERO, i,
+				version ? ALLOC_FROM_TAIL : ALLOC_FROM_HEAD);
+		/* blocknr is only meaningful once the allocation succeeded */
+		if (allocated != 1 || blocknr == 0)
+			return -ENOSPC;
+		nova_dbgv("%s: allocate log @ 0x%lx\n", __func__, blocknr);
+		block = nova_get_block_off(sb, blocknr, NOVA_BLOCK_TYPE_2M);
+		nova_memunlock_range(sb, inode_table, CACHELINE_SIZE);
+		inode_table->log_head = block;
+		nova_memlock_range(sb, inode_table, CACHELINE_SIZE);
+		nova_flush_buffer(inode_table, CACHELINE_SIZE, 0);
+	}
+
+	return 0;
+}
+
+int nova_init_inode_table(struct super_block *sb)
+{
+	struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_INODETABLE_INO);
+	struct nova_inode_info_header sih;
+	int num_tables;
+	int ret = 0;
+	int i;
+
+	nova_memunlock_inode(sb, pi);
+	pi->i_mode = 0;
+	pi->i_uid = 0;
+	pi->i_gid = 0;
+	pi->i_links_count = cpu_to_le16(1);
+	pi->i_flags = 0;
+	pi->nova_ino = NOVA_INODETABLE_INO;
+
+	pi->i_blk_type = NOVA_BLOCK_TYPE_2M;
+	nova_memlock_inode(sb, pi);
+
+	sih.ino = NOVA_INODETABLE_INO;
+	sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+
+	num_tables = 1;
+	if (metadata_csum)
+		num_tables = 2;
+
+	for (i = 0; i < num_tables; i++) {
+		ret = nova_alloc_inode_table(sb, &sih, i);
+		if (ret)
+			return ret;
+	}
+
+	PERSISTENT_BARRIER();
+	return ret;
+}
+
+inline int nova_insert_inodetree(struct nova_sb_info *sbi,
+	struct nova_range_node *new_node, int cpu)
+{
+	struct rb_root *tree;
+	int ret;
+
+	tree = &sbi->inode_maps[cpu].inode_inuse_tree;
+	ret = nova_insert_range_node(tree, new_node);
+	if (ret)
+		nova_dbg("ERROR: %s failed %d\n", __func__, ret);
+
+	return ret;
+}
+
+inline int nova_search_inodetree(struct nova_sb_info *sbi,
+	unsigned long ino, struct nova_range_node **ret_node)
+{
+	struct rb_root *tree;
+	unsigned long internal_ino;
+	int cpu;
+
+	cpu = ino % sbi->cpus;
+	tree = &sbi->inode_maps[cpu].inode_inuse_tree;
+	internal_ino = ino / sbi->cpus;
+	return nova_find_range_node(sbi, tree, internal_ino, ret_node);
+}
+
+/* Get the address in PMEM of an inode by inode number.
+ *
+ * Reserved inodes are resolved directly.  Otherwise the per-CPU table
+ * (ino % cpus) of @version is walked block by block through the next-block
+ * pointer kept in the last 8 bytes of each block.  A missing block is
+ * allocated and linked in when @extendable is set; if that happened and
+ * @extend_alternate and metadata_csum are set, the lookup is repeated for
+ * @version + 1 and its result ignored.
+ *
+ * Returns 0 with the inode offset in *pi_addr, or a negative errno.
+ */
+int nova_get_inode_address(struct super_block *sb, u64 ino, int version,
+	u64 *pi_addr, int extendable, int extend_alternate)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_inode_info_header sih;
+	struct inode_table *inode_table;
+	unsigned int data_bits, num_inodes_bits;
+	unsigned int superpage_count, index, i;
+	u64 curr, internal_ino;
+	u64 alternate_pi_addr = 0;
+	unsigned long blocknr, curr_addr;
+	int cpuid, allocated, extended = 0;
+
+	if (ino < NOVA_NORMAL_INODE_START) {
+		*pi_addr = nova_get_reserved_inode_addr(sb, ino);
+		return 0;
+	}
+
+	sih.ino = NOVA_INODETABLE_INO;
+	sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+	data_bits = blk_type_to_shift[sih.i_blk_type];
+	num_inodes_bits = data_bits - NOVA_INODE_BITS;
+
+	cpuid = ino % sbi->cpus;
+	internal_ino = ino / sbi->cpus;
+
+	inode_table = nova_get_inode_table(sb, version, cpuid);
+	if (!inode_table)
+		return -EINVAL;
+	superpage_count = internal_ino >> num_inodes_bits;
+	index = internal_ino & ((1 << num_inodes_bits) - 1);
+
+	curr = inode_table->log_head;
+	if (curr == 0)
+		return -EINVAL;
+
+	for (i = 0; i < superpage_count; i++) {
+		if (curr == 0)
+			return -EINVAL;
+
+		curr_addr = (unsigned long)nova_get_block(sb, curr);
+		/* Next page pointer in the last 8 bytes of the superpage */
+		curr_addr += nova_inode_blk_size(&sih) - 8;
+		curr = *(u64 *)(curr_addr);
+
+		if (curr == 0) {
+			if (extendable == 0)
+				return -EINVAL;
+			extended = 1;
+			allocated = nova_new_log_blocks(sb, &sih, &blocknr,
+				1, ALLOC_INIT_ZERO, cpuid,
+				version ? ALLOC_FROM_TAIL : ALLOC_FROM_HEAD);
+			/* Must not return 0 here: *pi_addr is still unset */
+			if (allocated != 1 || blocknr == 0)
+				return allocated < 0 ? allocated : -ENOSPC;
+
+			curr = nova_get_block_off(sb, blocknr,
+						NOVA_BLOCK_TYPE_2M);
+			nova_memunlock_range(sb, (void *)curr_addr,
+						CACHELINE_SIZE);
+			*(u64 *)(curr_addr) = curr;
+			nova_memlock_range(sb, (void *)curr_addr,
+						CACHELINE_SIZE);
+			nova_flush_buffer((void *)curr_addr,
+						NOVA_INODE_SIZE, 1);
+		}
+	}
+
+	/* Extend alternate inode table */
+	if (extended && extend_alternate && metadata_csum)
+		nova_get_inode_address(sb, ino, version + 1,
+					&alternate_pi_addr, extendable, 0);
+
+	*pi_addr = curr + index * NOVA_INODE_SIZE;
+	return 0;
+}
+
+int nova_get_alter_inode_address(struct super_block *sb, u64 ino,
+	u64 *alter_pi_addr)
+{
+	int ret;
+
+	if (metadata_csum == 0) {
+		nova_err(sb, "Access alter inode when replica inode disabled\n");
+		return 0;
+	}
+
+	if (ino < NOVA_NORMAL_INODE_START) {
+		*alter_pi_addr = nova_get_alter_reserved_inode_addr(sb, ino);
+	} else {
+		ret = nova_get_inode_address(sb, ino, 1, alter_pi_addr, 0, 0);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/* Remove radix-tree mappings from start_blocknr onward; last_blocknr is only
+ * logged and does not bound the walk.  With delete_nvmm the write entries are
+ * freed too, and the number of blocks so released is returned (else 0).
+ */
+int nova_delete_file_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, unsigned long start_blocknr,
+	unsigned long last_blocknr, bool delete_nvmm, bool delete_dead,
+	u64 epoch_id)
+{
+	struct nova_file_write_entry *entry, *old_entry = NULL;
+	struct nova_file_write_entry entry_copy, *entryc = &entry_copy;
+	unsigned long pgoff = start_blocknr, old_pgoff = 0;
+	unsigned int num_free = 0;
+	int freed = 0;
+	void *ret;
+	timing_t delete_time;
+
+	NOVA_START_TIMING(delete_file_tree_t, delete_time);
+
+	/* Handle EOF blocks */
+	do {
+		entry = radix_tree_lookup(&sih->tree, pgoff);
+		if (entry) {
+			ret = radix_tree_delete(&sih->tree, pgoff);
+			BUG_ON(!ret || ret != entry);
+			if (entry != old_entry) {
+				if (old_entry && delete_nvmm) {
+					nova_free_old_entry(sb, sih,
+							old_entry, old_pgoff,
+							num_free, delete_dead,
+							epoch_id);
+					freed += num_free;
+				}
+
+				old_entry = entry;
+				old_pgoff = pgoff;
+				num_free = 1;
+			} else {
+				num_free++;
+			}
+			pgoff++;
+		} else {
+			/* We are finding a hole. Jump to the next entry. */
+			entry = nova_find_next_entry(sb, sih, pgoff);
+			if (!entry)
+				break;
+
+			if (metadata_csum == 0)
+				entryc = entry;
+			else if (!nova_verify_entry_csum(sb, entry, entryc))
+				break;
+
+			pgoff++;
+			pgoff = pgoff > entryc->pgoff ? pgoff : entryc->pgoff;
+		}
+	} while (1);
+
+	if (old_entry && delete_nvmm) {
+		nova_free_old_entry(sb, sih, old_entry, old_pgoff,
+					num_free, delete_dead, epoch_id);
+		freed += num_free;
+	}
+
+	nova_dbgv("Inode %lu: delete file tree from pgoff %lu to %lu, %d blocks freed\n",
+			sih->ino, start_blocknr, last_blocknr, freed);
+
+	NOVA_END_TIMING(delete_file_tree_t, delete_time);
+	return freed;
+}
+
+static int nova_free_dram_resource(struct super_block *sb,
+	struct nova_inode_info_header *sih)
+{
+	unsigned long last_blocknr;
+	int freed = 0;
+
+	if (!(S_ISREG(sih->i_mode)) && !(S_ISDIR(sih->i_mode)))
+		return 0;
+
+	if (S_ISREG(sih->i_mode)) {
+		last_blocknr = nova_get_last_blocknr(sb, sih);
+		freed = nova_delete_file_tree(sb, sih, 0,
+					last_blocknr, false, false, 0);
+	} else {
+		nova_delete_dir_tree(sb, sih);
+		freed = 1;
+	}
+
+	return freed;
+}
+
+static inline void check_eof_blocks(struct super_block *sb,
+	struct nova_inode *pi, struct inode *inode,
+	struct nova_inode_info_header *sih)
+{
+	if ((pi->i_flags & cpu_to_le32(NOVA_EOFBLOCKS_FL)) &&
+		(inode->i_size + sb->s_blocksize) > (sih->i_blocks
+			<< sb->s_blocksize_bits)) {
+		nova_memunlock_inode(sb, pi);
+		pi->i_flags &= cpu_to_le32(~NOVA_EOFBLOCKS_FL);
+		nova_update_inode_checksum(pi);
+		nova_update_alter_inode(sb, inode, pi);
+		nova_memlock_inode(sb, pi);
+	}
+}
+
+/*
+ * Free data blocks from inode in the range start <=> end
+ */
+static void nova_truncate_file_blocks(struct inode *inode, loff_t start,
+				    loff_t end, u64 epoch_id)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+	unsigned long first_blocknr, last_blocknr;
+	int freed = 0;
+
+	inode->i_mtime = inode->i_ctime = current_time(inode);
+
+	nova_dbg_verbose("truncate: pi %p iblocks %lx %llx %llx %llx\n", pi,
+			 sih->i_blocks, start, end, pi->i_size);
+
+	first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+	if (end == 0)
+		return;
+	last_blocknr = (end - 1) >> data_bits;
+
+	if (first_blocknr > last_blocknr)
+		return;
+
+	freed = nova_delete_file_tree(sb, sih, first_blocknr,
+				last_blocknr, true, false, epoch_id);
+
+	inode->i_blocks -= (freed * (1 << (data_bits -
+				sb->s_blocksize_bits)));
+
+	sih->i_blocks = inode->i_blocks;
+	/* Check for the flag EOFBLOCKS is still valid after the set size */
+	check_eof_blocks(sb, pi, inode, sih);
+
+}
+
+/* search the radix tree to find hole or data
+ * in the specified range
+ * Input:
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last_blocknr in the specified range
+ * @data_found: indicates whether data blocks were found
+ * @hole_found: indicates whether a hole was found
+ * hole: whether we are looking for a hole or data
+ * Returns the number of blocks scanned from first_blocknr before the first
+ * hole (hole != 0) or the first data block (hole == 0) was reached.
+ */
+static int nova_lookup_hole_in_range(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	unsigned long first_blocknr, unsigned long last_blocknr,
+	int *data_found, int *hole_found, int hole)
+{
+	struct nova_file_write_entry *entry;
+	struct nova_file_write_entry entry_copy, *entryc = &entry_copy;
+	unsigned long blocks = 0;
+	unsigned long pgoff, old_pgoff;
+
+	pgoff = first_blocknr;
+	while (pgoff <= last_blocknr) {
+		old_pgoff = pgoff;
+		entry = radix_tree_lookup(&sih->tree, pgoff);
+		if (entry) {
+			*data_found = 1;
+			if (!hole)
+				goto done;
+			pgoff++;
+		} else {
+			*hole_found = 1;
+			entry = nova_find_next_entry(sb, sih, pgoff);
+			pgoff++;
+			if (entry) {
+				if (metadata_csum == 0)
+					entryc = entry;
+				else if (!nova_verify_entry_csum(sb, entry,
+								entryc))
+					goto done;
+
+				pgoff = pgoff > entryc->pgoff ?
+					pgoff : entryc->pgoff;
+				if (pgoff > last_blocknr)
+					pgoff = last_blocknr + 1;
+			}
+		}
+
+		if (!*hole_found || !hole)
+			blocks += pgoff - old_pgoff;
+	}
+done:
+	return blocks;
+}
+
+/* Copy persistent state to struct inode.  Owner, generation, flags, times
+ * and link count are read from the nova_inode referenced by pi_addr; mode,
+ * block count and size come from the inode info header (sih).  On failure
+ * the inode is marked bad and a negative errno is returned.
+ */
+static int nova_read_inode(struct super_block *sb, struct inode *inode,
+	u64 pi_addr)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode *pi, fake_pi;
+	struct nova_inode_info_header *sih = &si->header;
+	int ret;
+
+	ret = nova_get_reference(sb, pi_addr, &fake_pi,
+			(void **)&pi, sizeof(struct nova_inode));
+	if (ret) {
+		nova_dbg("%s: read pi @ 0x%llx failed\n",
+				__func__, pi_addr);
+		goto bad_inode;
+	}
+
+	inode->i_mode = sih->i_mode;
+	i_uid_write(inode, le32_to_cpu(pi->i_uid));
+	i_gid_write(inode, le32_to_cpu(pi->i_gid));
+	inode->i_generation = le32_to_cpu(pi->i_generation);
+	nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
+
+	/* check if the inode is active. */
+	if (inode->i_mode == 0 || pi->deleted == 1) {
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
+	inode->i_blocks = sih->i_blocks;
+	inode->i_mapping->a_ops = &nova_aops_dax;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &nova_file_inode_operations;
+		if (inplace_data_updates && wprotect == 0)
+			inode->i_fop = &nova_dax_file_operations;
+		else
+			inode->i_fop = &nova_wrap_file_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op = &nova_dir_inode_operations;
+		inode->i_fop = &nova_dir_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &nova_symlink_inode_operations;
+		break;
+	default:
+		inode->i_op = &nova_special_inode_operations;
+		init_special_inode(inode, inode->i_mode,
+				   le32_to_cpu(pi->dev.rdev));
+		break;
+	}
+
+	/* sih->i_size is a CPU-endian in-memory value: copy it unconverted */
+	inode->i_size = sih->i_size;
+	inode->i_atime.tv_sec = (__s32)le32_to_cpu(pi->i_atime);
+	inode->i_ctime.tv_sec = (__s32)le32_to_cpu(pi->i_ctime);
+	inode->i_mtime.tv_sec = (__s32)le32_to_cpu(pi->i_mtime);
+	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec =
+					 inode->i_ctime.tv_nsec = 0;
+	set_nlink(inode, le16_to_cpu(pi->i_links_count));
+	return 0;
+
+bad_inode:
+	make_bad_inode(inode);
+	return ret;
+}
+
+static void nova_get_inode_flags(struct inode *inode, struct nova_inode *pi)
+{
+	unsigned int flags = inode->i_flags;
+	unsigned int nova_flags = le32_to_cpu(pi->i_flags);
+
+	nova_flags &= ~(FS_SYNC_FL | FS_APPEND_FL | FS_IMMUTABLE_FL |
+			 FS_NOATIME_FL | FS_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		nova_flags |= FS_SYNC_FL;
+	if (flags & S_APPEND)
+		nova_flags |= FS_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		nova_flags |= FS_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		nova_flags |= FS_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		nova_flags |= FS_DIRSYNC_FL;
+
+	pi->i_flags = cpu_to_le32(nova_flags);
+}
+
+static void nova_init_inode(struct inode *inode, struct nova_inode *pi)
+{
+	pi->i_mode = cpu_to_le16(inode->i_mode);
+	pi->i_uid = cpu_to_le32(i_uid_read(inode));
+	pi->i_gid = cpu_to_le32(i_gid_read(inode));
+	pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	pi->i_size = cpu_to_le64(inode->i_size);
+	pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_generation = cpu_to_le32(inode->i_generation);
+	pi->log_head = 0;
+	pi->log_tail = 0;
+	pi->alter_log_head = 0;
+	pi->alter_log_tail = 0;
+	pi->deleted = 0;
+	pi->delete_epoch_id = 0;
+	nova_get_inode_flags(inode, pi);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		pi->dev.rdev = cpu_to_le32(inode->i_rdev);
+}
+
+static int nova_alloc_unused_inode(struct super_block *sb, int cpuid,
+	unsigned long *ino)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_map *inode_map;
+	struct nova_range_node *i, *next_i;
+	struct rb_node *temp, *next;
+	unsigned long next_range_low;
+	unsigned long new_ino;
+	unsigned long MAX_INODE = 1UL << 31;
+
+	inode_map = &sbi->inode_maps[cpuid];
+	i = inode_map->first_inode_range;
+	NOVA_ASSERT(i);
+	if (!nova_range_node_checksum_ok(i)) {
+		nova_dbg("%s: first node failed\n", __func__);
+		return -EIO;
+	}
+
+	temp = &i->node;
+	next = rb_next(temp);
+
+	if (!next) {
+		next_i = NULL;
+		next_range_low = MAX_INODE;
+	} else {
+		next_i = container_of(next, struct nova_range_node, node);
+		if (!nova_range_node_checksum_ok(next_i)) {
+			nova_dbg("%s: second node failed\n", __func__);
+			return -EIO;
+		}
+		next_range_low = next_i->range_low;
+	}
+
+	new_ino = i->range_high + 1;
+
+	if (next_i && new_ino == (next_range_low - 1)) {
+		/* Fill the gap completely */
+		i->range_high = next_i->range_high;
+		nova_update_range_node_checksum(i);
+		rb_erase(&next_i->node, &inode_map->inode_inuse_tree);
+		nova_free_inode_node(sb, next_i);
+		inode_map->num_range_node_inode--;
+	} else if (new_ino < (next_range_low - 1)) {
+		/* Aligns to left */
+		i->range_high = new_ino;
+		nova_update_range_node_checksum(i);
+	} else {
+		nova_dbg("%s: ERROR: new ino %lu, next low %lu\n", __func__,
+			new_ino, next_range_low);
+		return -ENOSPC;
+	}
+
+	*ino = new_ino * sbi->cpus + cpuid;
+	sbi->s_inodes_used_count++;
+	inode_map->allocated++;
+
+	nova_dbg_verbose("Alloc ino %lu\n", *ino);
+	return 0;
+}
+
+/* Remove @ino from the in-use range tree of its per-CPU inode map by
+ * deleting, trimming or splitting the range node that contains it.
+ * Returns 0 on success, or a negative errno with the tree left unchanged.
+ */
+static int nova_free_inuse_inode(struct super_block *sb, unsigned long ino)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	int cpuid = ino % sbi->cpus;
+	unsigned long internal_ino = ino / sbi->cpus;
+	struct inode_map *inode_map = &sbi->inode_maps[cpuid];
+	struct nova_range_node *i = NULL, *curr_node;
+	int ret = -EINVAL;	/* reported when no case below matches */
+
+	nova_dbg_verbose("Free inuse ino: %lu\n", ino);
+
+	mutex_lock(&inode_map->inode_table_mutex);
+	if (!nova_search_inodetree(sbi, ino, &i)) {
+		nova_dbg("%s ERROR: ino %lu not found\n", __func__, ino);
+		mutex_unlock(&inode_map->inode_table_mutex);
+		return -EINVAL;
+	}
+
+	if ((internal_ino == i->range_low) && (internal_ino == i->range_high)) {
+		/* fits entire node */
+		rb_erase(&i->node, &inode_map->inode_inuse_tree);
+		nova_free_inode_node(sb, i);
+		inode_map->num_range_node_inode--;
+		goto block_found;
+	}
+	if ((internal_ino == i->range_low) && (internal_ino < i->range_high)) {
+		/* Aligns left */
+		i->range_low = internal_ino + 1;
+		nova_update_range_node_checksum(i);
+		goto block_found;
+	}
+	if ((internal_ino > i->range_low) && (internal_ino == i->range_high)) {
+		/* Aligns right */
+		i->range_high = internal_ino - 1;
+		nova_update_range_node_checksum(i);
+		goto block_found;
+	}
+	if ((internal_ino > i->range_low) && (internal_ino < i->range_high)) {
+		/* Aligns somewhere in the middle */
+		curr_node = nova_alloc_inode_node(sb);
+		NOVA_ASSERT(curr_node);
+		if (curr_node == NULL) {
+			ret = -ENOMEM;	/* nothing was freed: report it */
+			goto err;
+		}
+		curr_node->range_low = internal_ino + 1;
+		curr_node->range_high = i->range_high;
+		nova_update_range_node_checksum(curr_node);
+		i->range_high = internal_ino - 1;
+		nova_update_range_node_checksum(i);
+		ret = nova_insert_inodetree(sbi, curr_node, cpuid);
+		if (ret) {
+			i->range_high = curr_node->range_high;	/* undo trim */
+			nova_update_range_node_checksum(i);
+			nova_free_inode_node(sb, curr_node);
+			goto err;
+		}
+		inode_map->num_range_node_inode++;
+		goto block_found;
+	}
+
+err:
+	nova_error_mng(sb, "Unable to free inode %lu\n", ino);
+	nova_error_mng(sb, "Found inuse block %lu - %lu\n",
+				 i->range_low, i->range_high);
+	mutex_unlock(&inode_map->inode_table_mutex);
+	return ret;
+
+block_found:
+	sbi->s_inodes_used_count--;
+	inode_map->freed++;
+	mutex_unlock(&inode_map->inode_table_mutex);
+	return 0;
+}
+
+static int nova_free_inode(struct super_block *sb, struct nova_inode *pi,
+	struct nova_inode_info_header *sih)
+{
+	int err = 0;
+	timing_t free_time;
+
+	NOVA_START_TIMING(free_inode_t, free_time);
+
+	nova_free_inode_log(sb, pi, sih);
+
+	sih->log_pages = 0;
+	sih->i_mode = 0;
+	sih->pi_addr = 0;
+	sih->alter_pi_addr = 0;
+	sih->i_size = 0;
+	sih->i_blocks = 0;
+
+	err = nova_free_inuse_inode(sb, pi->nova_ino);
+
+	NOVA_END_TIMING(free_inode_t, free_time);
+	return err;
+}
+
+struct inode *nova_iget(struct super_block *sb, unsigned long ino)
+{
+	struct nova_inode_info *si;
+	struct inode *inode;
+	u64 pi_addr;
+	int err;
+
+	inode = iget_locked(sb, ino);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	si = NOVA_I(inode);
+
+	nova_dbgv("%s: inode %lu\n", __func__, ino);
+
+	err = nova_get_inode_address(sb, ino, 0, &pi_addr, 0, 0);
+	if (err) {
+		nova_dbg("%s: get inode %lu address failed %d\n",
+			 __func__, ino, err);
+		goto fail;
+	}
+
+	if (pi_addr == 0) {
+		nova_dbg("%s: failed to get pi_addr for inode %lu\n",
+			 __func__, ino);
+		err = -EACCES;
+		goto fail;
+	}
+
+	err = nova_rebuild_inode(sb, si, ino, pi_addr, 1);
+	if (err) {
+		nova_dbg("%s: failed to rebuild inode %lu\n", __func__, ino);
+		goto fail;
+	}
+
+	err = nova_read_inode(sb, inode, pi_addr);
+	if (unlikely(err)) {
+		nova_dbg("%s: failed to read inode %lu\n", __func__, ino);
+		goto fail;
+
+	}
+
+	inode->i_ino = ino;
+
+	unlock_new_inode(inode);
+	return inode;
+fail:
+	iget_failed(inode);
+	return ERR_PTR(err);
+}
+
+unsigned long nova_get_last_blocknr(struct super_block *sb,
+	struct nova_inode_info_header *sih)
+{
+	struct nova_inode *pi, fake_pi;
+	unsigned long last_blocknr;
+	unsigned int btype;
+	unsigned int data_bits;
+	int ret;
+
+	ret = nova_get_reference(sb, sih->pi_addr, &fake_pi,
+			(void **)&pi, sizeof(struct nova_inode));
+	if (ret) {
+		nova_dbg("%s: read pi @ 0x%lx failed\n",
+				__func__, sih->pi_addr);
+		btype = 0;
+	} else {
+		btype = sih->i_blk_type;
+	}
+
+	data_bits = blk_type_to_shift[btype];
+
+	if (sih->i_size == 0)
+		last_blocknr = 0;
+	else
+		last_blocknr = (sih->i_size - 1) >> data_bits;
+
+	return last_blocknr;
+}
+
+static int nova_free_inode_resource(struct super_block *sb,
+	struct nova_inode *pi, struct nova_inode_info_header *sih)
+{
+	unsigned long last_blocknr;
+	int ret = 0;
+	int freed = 0;
+	struct nova_inode *alter_pi;
+
+	nova_memunlock_inode(sb, pi);
+	pi->deleted = 1;
+
+	if (pi->valid) {
+		nova_dbg("%s: inode %lu still valid\n",
+				__func__, sih->ino);
+		pi->valid = 0;
+	}
+	nova_update_inode_checksum(pi);
+	if (metadata_csum && sih->alter_pi_addr) {
+		alter_pi = (struct nova_inode *)nova_get_block(sb,
+						sih->alter_pi_addr);
+		memcpy_to_pmem_nocache(alter_pi, pi, sizeof(struct nova_inode));
+	}
+	nova_memlock_inode(sb, pi);
+
+	/* We need the log to free the blocks from the b-tree */
+	switch (sih->i_mode & S_IFMT) {
+	case S_IFREG:
+		last_blocknr = nova_get_last_blocknr(sb, sih);
+		nova_dbgv("%s: file ino %lu\n", __func__, sih->ino);
+		freed = nova_delete_file_tree(sb, sih, 0,
+					last_blocknr, true, true, 0);
+		break;
+	case S_IFDIR:
+		nova_dbgv("%s: dir ino %lu\n", __func__, sih->ino);
+		nova_delete_dir_tree(sb, sih);
+		break;
+	case S_IFLNK:
+		/* Log will be freed later */
+		nova_dbgv("%s: symlink ino %lu\n",
+				__func__, sih->ino);
+		freed = nova_delete_file_tree(sb, sih, 0, 0,
+						true, true, 0);
+		break;
+	default:
+		nova_dbgv("%s: special ino %lu\n",
+				__func__, sih->ino);
+		break;
+	}
+
+	nova_dbg_verbose("%s: Freed %d\n", __func__, freed);
+	/* Then we can free the inode */
+	ret = nova_free_inode(sb, pi, sih);
+	if (ret)
+		nova_err(sb, "%s: free inode %lu failed\n",
+				__func__, sih->ino);
+
+	return ret;
+}
+
+void nova_evict_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	timing_t evict_time;
+	int destroy = 0;
+	int ret;
+
+	NOVA_START_TIMING(evict_inode_t, evict_time);
+	if (!sih) {
+		nova_err(sb, "%s: ino %lu sih is NULL!\n",
+				__func__, inode->i_ino);
+		NOVA_ASSERT(0);
+		goto out;
+	}
+
+	// pi can be NULL if the file has already been deleted, but a handle
+	// remains.
+	if (pi && pi->nova_ino != inode->i_ino) {
+		nova_err(sb, "%s: inode %lu ino does not match: %llu\n",
+				__func__, inode->i_ino, pi->nova_ino);
+		nova_dbg("inode size %llu, pi addr 0x%lx, pi head 0x%llx, tail 0x%llx, mode %u\n",
+				inode->i_size, sih->pi_addr, sih->log_head,
+				sih->log_tail, pi->i_mode);
+		nova_dbg("sih: ino %lu, inode size %lu, mode %u, inode mode %u\n",
+				sih->ino, sih->i_size,
+				sih->i_mode, inode->i_mode);
+		nova_print_inode_log(sb, inode);
+	}
+
+	/* Check if this inode exists in at least one snapshot. */
+	if (pi && pi->valid == 0) {
+		ret = nova_append_inode_to_snapshot(sb, pi);
+		if (ret == 0)
+			goto out;
+	}
+
+	nova_dbg_verbose("%s: %lu\n", __func__, inode->i_ino);
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+			goto out;
+
+		if (pi) {
+			ret = nova_free_inode_resource(sb, pi, sih);
+			if (ret)
+				goto out;
+		}
+
+		destroy = 1;
+		pi = NULL; /* we no longer own the nova_inode */
+
+		inode->i_mtime = inode->i_ctime = current_time(inode);
+		inode->i_size = 0;
+	}
+out:
+	if (destroy == 0) {
+		nova_dbgv("%s: destroying %lu\n", __func__, inode->i_ino);
+		nova_free_dram_resource(sb, sih);
+	}
+	/* TODO: Since we don't use page-cache, do we really need the following
+	 * call?
+	 */
+	truncate_inode_pages(&inode->i_data, 0);
+
+	clear_inode(inode);
+	NOVA_END_TIMING(evict_inode_t, evict_time);
+}
+
+/* First rebuild the inode tree, then free the blocks */
+int nova_delete_dead_inode(struct super_block *sb, u64 ino)
+{
+	struct nova_inode_info si;
+	struct nova_inode_info_header *sih;
+	struct nova_inode *pi;
+	u64 pi_addr = 0;
+	int err;
+
+	if (ino < NOVA_NORMAL_INODE_START) {
+		nova_dbg("%s: invalid inode %llu\n", __func__, ino);
+		return -EINVAL;
+	}
+
+	err = nova_get_inode_address(sb, ino, 0, &pi_addr, 0, 0);
+	if (err) {
+		nova_dbg("%s: get inode %llu address failed %d\n",
+					__func__, ino, err);
+		return -EINVAL;
+	}
+
+	if (pi_addr == 0)
+		return -EACCES;
+
+	memset(&si, 0, sizeof(struct nova_inode_info));
+	err = nova_rebuild_inode(sb, &si, ino, pi_addr, 0);
+	if (err)
+		return err;
+
+	pi = (struct nova_inode *)nova_get_block(sb, pi_addr);
+	sih = &si.header;
+
+	nova_dbgv("Delete dead inode %lu, log head 0x%llx, tail 0x%llx\n",
+			sih->ino, sih->log_head, sih->log_tail);
+
+	return nova_free_inode_resource(sb, pi, sih);
+}
+
+/* Reserve an unused inode number in the next inode map (round-robin) and
+ * look up its address with extension enabled.  Returns 0 on failure.
+ */
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_map *inode_map;
+	unsigned long free_ino = 0;
+	int map_id, ret;
+	timing_t new_inode_time;
+
+	NOVA_START_TIMING(new_nova_inode_t, new_inode_time);
+	map_id = sbi->map_id;
+	sbi->map_id = (sbi->map_id + 1) % sbi->cpus;
+
+	inode_map = &sbi->inode_maps[map_id];
+
+	mutex_lock(&inode_map->inode_table_mutex);
+	ret = nova_alloc_unused_inode(sb, map_id, &free_ino);
+	if (ret) {
+		nova_dbg("%s: alloc inode number failed %d\n", __func__, ret);
+		mutex_unlock(&inode_map->inode_table_mutex);
+		return 0;
+	}
+
+	ret = nova_get_inode_address(sb, free_ino, 0, pi_addr, 1, 1);
+	if (ret) {
+		nova_dbg("%s: get inode address failed %d\n", __func__, ret);
+		mutex_unlock(&inode_map->inode_table_mutex);
+		/* Give the number back; the helper locks the mutex itself */
+		nova_free_inuse_inode(sb, free_ino);
+		return 0;
+	}
+
+	mutex_unlock(&inode_map->inode_table_mutex);
+
+	NOVA_END_TIMING(new_nova_inode_t, new_inode_time);
+	return free_ino;
+}
+
+/*
+ * Build the VFS inode for a freshly allocated NOVA inode and initialize the
+ * persistent inode that lives at @pi_addr.
+ *
+ * @type:     kind of object being created; selects the inode/file/address
+ *            space operations installed below.
+ * @dir:      parent directory; supplies the super block and ownership rules.
+ * @pi_addr:  address of the persistent inode, resolved via nova_get_block().
+ * @ino:      inode number to assign.
+ * @mode:     file mode passed to inode_init_owner().
+ * @size:     initial value of inode->i_size.
+ * @rdev:     device number, used only for TYPE_MKNOD.
+ * @qstr:     not referenced by this function.
+ * @epoch_id: stored in the persistent inode as create_epoch_id.
+ *
+ * Returns the new, locked-and-hashed inode on success.  On failure returns
+ * ERR_PTR(-ENOMEM) if new_inode() fails, ERR_PTR(-EACCES) if the parent's
+ * persistent inode cannot be read, the error from
+ * nova_get_alter_inode_address() when metadata_csum is set, or
+ * ERR_PTR(-EINVAL) if insert_inode_locked() fails.
+ */
+struct inode *nova_new_vfs_inode(enum nova_new_inode_type type,
+	struct inode *dir, u64 pi_addr, u64 ino, umode_t mode,
+	size_t size, dev_t rdev, const struct qstr *qstr, u64 epoch_id)
+{
+	struct super_block *sb;
+	struct nova_sb_info *sbi;
+	struct inode *inode;
+	struct nova_inode *diri = NULL;
+	struct nova_inode_info *si;
+	struct nova_inode_info_header *sih = NULL;
+	struct nova_inode *pi;
+	struct nova_inode *alter_pi;
+	int errval;
+	u64 alter_pi_addr = 0;
+	timing_t new_inode_time;
+
+	NOVA_START_TIMING(new_vfs_inode_t, new_inode_time);
+	sb = dir->i_sb;
+	sbi = (struct nova_sb_info *)sb->s_fs_info;
+	inode = new_inode(sb);
+	if (!inode) {
+		errval = -ENOMEM;
+		goto fail2;
+	}
+
+	inode_init_owner(inode, dir, mode);
+	inode->i_blocks = inode->i_size = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+
+	inode->i_generation = atomic_add_return(1, &sbi->next_generation);
+	/* Overrides the zero written to i_size a few lines above. */
+	inode->i_size = size;
+
+	/* Parent's persistent inode; its i_flags seed the child's flags. */
+	diri = nova_get_inode(sb, dir);
+	if (!diri) {
+		errval = -EACCES;
+		goto fail1;
+	}
+
+	if (metadata_csum) {
+		/* Get alternate inode address */
+		errval = nova_get_alter_inode_address(sb, ino, &alter_pi_addr);
+		if (errval)
+			goto fail1;
+	}
+
+	pi = (struct nova_inode *)nova_get_block(sb, pi_addr);
+	nova_dbg_verbose("%s: allocating inode %llu @ 0x%llx\n",
+					__func__, ino, pi_addr);
+
+	/* chosen inode is in ino */
+	inode->i_ino = ino;
+
+	/* Install the operation tables that match the object type. */
+	switch (type) {
+	case TYPE_CREATE:
+		inode->i_op = &nova_file_inode_operations;
+		inode->i_mapping->a_ops = &nova_aops_dax;
+		if (inplace_data_updates && wprotect == 0)
+			inode->i_fop = &nova_dax_file_operations;
+		else
+			inode->i_fop = &nova_wrap_file_operations;
+		break;
+	case TYPE_MKNOD:
+		init_special_inode(inode, mode, rdev);
+		inode->i_op = &nova_special_inode_operations;
+		break;
+	case TYPE_SYMLINK:
+		inode->i_op = &nova_symlink_inode_operations;
+		inode->i_mapping->a_ops = &nova_aops_dax;
+		break;
+	case TYPE_MKDIR:
+		inode->i_op = &nova_dir_inode_operations;
+		inode->i_fop = &nova_dir_operations;
+		inode->i_mapping->a_ops = &nova_aops_dax;
+		set_nlink(inode, 2);
+		break;
+	default:
+		/*
+		 * NOTE(review): an unknown type is only logged; creation
+		 * continues without this function installing any operations.
+		 */
+		nova_dbg("Unknown new inode type %d\n", type);
+		break;
+	}
+
+	/*
+	 * Pi is part of the dir log so no transaction is needed,
+	 * but we need to flush to NVMM.
+	 */
+	nova_memunlock_inode(sb, pi);
+	pi->i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+	pi->i_flags = nova_mask_flags(mode, diri->i_flags);
+	pi->nova_ino = ino;
+	pi->i_create_time = current_time(inode).tv_sec;
+	pi->create_epoch_id = epoch_id;
+	nova_init_inode(inode, pi);
+
+	if (metadata_csum) {
+		/* Mirror the freshly initialized inode into its replica. */
+		alter_pi = (struct nova_inode *)nova_get_block(sb,
+								alter_pi_addr);
+		memcpy_to_pmem_nocache(alter_pi, pi, sizeof(struct nova_inode));
+	}
+
+	nova_memlock_inode(sb, pi);
+
+	/* Set up the DRAM-side header that shadows the persistent inode. */
+	si = NOVA_I(inode);
+	sih = &si->header;
+	nova_init_header(sb, sih, inode->i_mode);
+	sih->pi_addr = pi_addr;
+	sih->alter_pi_addr = alter_pi_addr;
+	sih->ino = ino;
+	sih->i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+
+	nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
+
+	if (insert_inode_locked(inode) < 0) {
+		nova_err(sb, "nova_new_inode failed ino %lx\n", inode->i_ino);
+		errval = -EINVAL;
+		goto fail1;
+	}
+
+	nova_flush_buffer(pi, NOVA_INODE_SIZE, 0);
+	NOVA_END_TIMING(new_vfs_inode_t, new_inode_time);
+	return inode;
+fail1:
+	/* Mark the half-built inode bad before dropping the only reference. */
+	make_bad_inode(inode);
+	iput(inode);
+fail2:
+	NOVA_END_TIMING(new_vfs_inode_t, new_inode_time);
+	return ERR_PTR(errval);
+}
+
+/*
+ * Always returns 0 without touching @inode or @wbc.
+ *
+ * NOTE(review): the BUG() that was meant to flag an unexpected call is
+ * commented out below, so a call is silently accepted rather than reported.
+ */
+int nova_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	/* write_inode should never be called because we always keep our inodes
+	 * clean. So let us know if write_inode ever gets called.
+	 */
+//	BUG();
+	return 0;
+}
+
+/*
+ * Persist an access-time change made to the VFS inode.
+ *
+ * dirty_inode() is reached from mark_inode_dirty_sync().  NOVA keeps its
+ * inodes clean, so this normally is not called; the one exception is
+ * touch_atime(), which dirties the inode to record a new i_atime.
+ *
+ * Nothing is written on a snapshot mount, or when the persistent inode fails
+ * nova_check_inode_integrity().
+ */
+void nova_dirty_inode(struct inode *inode, int flags)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	struct nova_inode scratch;
+	struct nova_inode *pi;
+
+	if (sbi->mount_snapshot)
+		return;
+
+	pi = nova_get_block(sb, sih->pi_addr);
+
+	/* Refuse to touch an inode whose on-media copy does not verify. */
+	if (nova_check_inode_integrity(sb, sih->ino, sih->pi_addr,
+					sih->alter_pi_addr, &scratch, 0) < 0)
+		return;
+
+	/*
+	 * At most i_atime differs from what is already stored, so the field
+	 * is updated in place, followed by the checksum and the replica.
+	 */
+	nova_memunlock_inode(sb, pi);
+	pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	nova_update_inode_checksum(pi);
+	nova_update_alter_inode(sb, inode, pi);
+	nova_memlock_inode(sb, pi);
+
+	/* Relax atime persistency */
+	nova_flush_buffer(&pi->i_atime, sizeof(pi->i_atime), 0);
+}
+
+/*
+ * Apply a size change to a regular file after the corresponding log entry
+ * has been committed by the caller.
+ *
+ * @inode:    file being resized; anything but a regular file is rejected
+ *            with an error message and no change.
+ * @oldsize:  size before the change.
+ * @newsize:  requested size.
+ * @epoch_id: epoch passed through to nova_truncate_file_blocks().
+ *
+ * When the size actually changes, the tail of the new last page is cleared
+ * and both the VFS and the DRAM-header sizes are updated.  The page cache is
+ * then truncated and nova_truncate_file_blocks() is called unconditionally.
+ */
+static void nova_setsize(struct inode *inode, loff_t oldsize, loff_t newsize,
+	u64 epoch_id)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	timing_t setsize_time;
+
+	/* We only support truncate regular file */
+	if (!(S_ISREG(inode->i_mode))) {
+		/*
+		 * The format has two conversions (%s and %x); __func__ must
+		 * be supplied for %s or the mode is read as a string pointer.
+		 */
+		nova_err(sb, "%s:wrong file mode %x\n",
+			 __func__, inode->i_mode);
+		return;
+	}
+
+	NOVA_START_TIMING(setsize_t, setsize_time);
+
+	inode_dio_wait(inode);
+
+	nova_dbgv("%s: inode %lu, old size %llu, new size %llu\n",
+		__func__, inode->i_ino, oldsize, newsize);
+
+	if (newsize != oldsize) {
+		nova_clear_last_page_tail(sb, inode, newsize);
+		i_size_write(inode, newsize);
+		sih->i_size = newsize;
+	}
+
+	/* FIXME: we should make sure that there is nobody reading the inode
+	 * before truncating it. Also we need to munmap the truncated range
+	 * from application address space, if mmapped.
+	 */
+	/* synchronize_rcu(); */
+
+	/* FIXME: Do we need to clear truncated DAX pages? */
+//	dax_truncate_page(inode, newsize, nova_dax_get_block);
+
+	truncate_pagecache(inode, newsize);
+	nova_truncate_file_blocks(inode, newsize, oldsize, epoch_id);
+	NOVA_END_TIMING(setsize_t, setsize_time);
+}
+
+/*
+ * Fill @stat for the inode behind @path.
+ *
+ * generic_fillattr() supplies the common fields; stat->blocks is then
+ * recomputed from i_blocks scaled by the super block's block size and
+ * expressed in 512-byte units.  @request_mask and @flags are not used.
+ * Always returns 0.
+ */
+int nova_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
+{
+	struct inode *inode = path->dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+
+	generic_fillattr(inode, stat);
+
+	/* stat->blocks should be the number of 512B blocks */
+	stat->blocks = (inode->i_blocks << sb->s_blocksize_bits) >> 9;
+
+	return 0;
+}
+
+/*
+ * Apply the attribute changes described by @attr to the inode behind
+ * @dentry.
+ *
+ * The request is validated with setattr_prepare(), copied into the VFS inode
+ * with setattr_copy(), and then recorded through
+ * nova_handle_setattr_operation() if it touches any of mode, uid, gid, size
+ * or the three timestamps.  A size change (or a pending NOVA_EOFBLOCKS_FL)
+ * is carried out by nova_setsize() only after that record succeeded.
+ *
+ * Returns 0 on success, -EACCES when the persistent inode cannot be read,
+ * or the error from setattr_prepare() / nova_handle_setattr_operation().
+ */
+int nova_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+	unsigned int ia_valid = attr->ia_valid;
+	unsigned int logged_attrs;
+	loff_t oldsize = inode->i_size;
+	timing_t setattr_time;
+	u64 epoch_id;
+	int ret;
+
+	NOVA_START_TIMING(setattr_t, setattr_time);
+	if (!pi) {
+		ret = -EACCES;
+		goto out;
+	}
+
+	ret = setattr_prepare(dentry, attr);
+	if (ret)
+		goto out;
+
+	/* Update inode with attr except for size */
+	setattr_copy(inode, attr);
+
+	epoch_id = nova_get_epoch_id(sb);
+
+	/* Only these attributes are recorded; drop everything else. */
+	logged_attrs = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE |
+			ATTR_ATIME | ATTR_MTIME | ATTR_CTIME;
+	ia_valid &= logged_attrs;
+	if (!ia_valid)
+		goto out;
+
+	ret = nova_handle_setattr_operation(sb, inode, pi, ia_valid,
+					attr, epoch_id);
+	if (ret)
+		goto out;
+
+	/* Only after log entry is committed, we can truncate size */
+	if (ia_valid & ATTR_SIZE) {
+		if (attr->ia_size != oldsize ||
+		    (pi->i_flags & cpu_to_le32(NOVA_EOFBLOCKS_FL))) {
+			/* now we can freely truncate the inode */
+			nova_setsize(inode, oldsize, attr->ia_size, epoch_id);
+		}
+	}
+
+	sih->trans_id++;
+out:
+	NOVA_END_TIMING(setattr_t, setattr_time);
+	return ret;
+}
+
+/*
+ * Translate FS_*_FL bits in @flags into the matching S_* bits of
+ * inode->i_flags.
+ *
+ * The five translated S_* bits are cleared first, so bits absent from @flags
+ * end up unset.  If the persistent inode @pi has no xattr block,
+ * inode_has_no_xattr() is called.  S_DAX is always set.
+ */
+void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+	unsigned int flags)
+{
+	unsigned int wanted = 0;
+
+	if (flags & FS_SYNC_FL)
+		wanted |= S_SYNC;
+	if (flags & FS_APPEND_FL)
+		wanted |= S_APPEND;
+	if (flags & FS_IMMUTABLE_FL)
+		wanted |= S_IMMUTABLE;
+	if (flags & FS_NOATIME_FL)
+		wanted |= S_NOATIME;
+	if (flags & FS_DIRSYNC_FL)
+		wanted |= S_DIRSYNC;
+
+	inode->i_flags &=
+		~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC);
+	inode->i_flags |= wanted;
+
+	if (!pi->i_xattr)
+		inode_has_no_xattr(inode);
+
+	inode->i_flags |= S_DAX;
+}
+
+/*
+ * get_block-style callback used by nova_direct_IO().
+ *
+ * Asks nova_dax_get_blocks() for up to bh->b_size worth of blocks starting
+ * at @iblock.  A zero or negative result is returned unchanged.  Otherwise
+ * @bh is mapped to the returned block number, bh->b_size is set to the
+ * number of bytes obtained, and 0 is returned.
+ */
+static int nova_legacy_get_blocks(struct inode *inode, sector_t iblock,
+	struct buffer_head *bh, int create)
+{
+	unsigned long nr_wanted = bh->b_size >> inode->i_blkbits;
+	bool allocated = false;
+	bool at_boundary = false;
+	u32 blocknr;
+	int mapped;
+
+	mapped = nova_dax_get_blocks(inode, iblock, nr_wanted, &blocknr,
+				&allocated, &at_boundary, create, false);
+	if (mapped > 0) {
+		map_bh(bh, inode->i_sb, blocknr);
+		bh->b_size = mapped << inode->i_blkbits;
+		mapped = 0;
+	}
+
+	return mapped;
+}
+
+/*
+ * direct_IO address-space operation.
+ *
+ * A DAX inode is not expected here: it triggers a one-time warning and
+ * -EIO.  Otherwise the request is handed to blockdev_direct_IO() with
+ * nova_legacy_get_blocks() as the block mapper, and its result is returned.
+ */
+static ssize_t nova_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	timing_t dio_time;
+	ssize_t done;
+
+	if (WARN_ON_ONCE(IS_DAX(inode)))
+		return -EIO;
+
+	NOVA_START_TIMING(direct_IO_t, dio_time);
+	done = blockdev_direct_IO(iocb, inode, iter, nova_legacy_get_blocks);
+	NOVA_END_TIMING(direct_IO_t, dio_time);
+
+	return done;
+}
+
+/*
+ * Find the file offset for SEEK_DATA/SEEK_HOLE.
+ *
+ * @inode:  file to search.
+ * @offset: in: starting offset; out: offset of the region that was found
+ *          (left untouched when the start already lies in such a region).
+ * @hole:   non-zero to look for a hole, zero to look for data.
+ *
+ * Returns 0 on success.  -ENXIO is returned (converted to unsigned long by
+ * the return type) when the start is at or past i_size, or when data is
+ * requested but none exists from the start to the end of the file.
+ *
+ * NOTE(review): for a file with no blocks and @hole set, inode->i_size is
+ * returned as the function result instead of 0 - confirm callers expect
+ * that.
+ */
+unsigned long nova_find_region(struct inode *inode, loff_t *offset, int hole)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+	/* Kept as unsigned long so mask and remainder use the same width. */
+	unsigned long block_size = 1UL << data_bits;
+	unsigned long first_blocknr, last_blocknr;
+	unsigned long blocks = 0, offset_in_block;
+	int data_found = 0, hole_found = 0;
+
+	if (*offset >= inode->i_size)
+		return -ENXIO;
+
+	if (!inode->i_blocks || !sih->i_size) {
+		if (hole)
+			return inode->i_size;
+		else
+			return -ENXIO;
+	}
+
+	offset_in_block = *offset & (block_size - 1);
+
+	first_blocknr = *offset >> data_bits;
+	last_blocknr = inode->i_size >> data_bits;
+
+	nova_dbg_verbose("find_region offset %llx, first_blocknr %lx, last_blocknr %lx hole %d\n",
+		  *offset, first_blocknr, last_blocknr, hole);
+
+	blocks = nova_lookup_hole_in_range(inode->i_sb, sih,
+		first_blocknr, last_blocknr, &data_found, &hole_found, hole);
+
+	/* Searching data but only hole found till the end */
+	if (!hole && !data_found && hole_found)
+		return -ENXIO;
+
+	if (data_found && !hole_found) {
+		/*
+		 * Only data lies between the start and the end of the file.
+		 * Searching data: we are already inside it, keep *offset.
+		 * Searching hole: none was found, go to the end.
+		 */
+		if (hole)
+			*offset = inode->i_size;
+		return 0;
+	}
+
+	/* Searching for hole, hole found and starting inside an hole */
+	if (hole && hole_found && !blocks) {
+		/*
+		 * If data follows, the start is already the answer and
+		 * *offset is kept.  If no data follows, this is the last
+		 * hole and *offset is moved to the end of the file.
+		 */
+		if (!data_found)
+			*offset = inode->i_size;
+		return 0;
+	}
+
+	/*
+	 * Advance by the number of blocks reported by the lookup.  When the
+	 * start is not block aligned, the first block only contributes the
+	 * bytes from the start to its end.
+	 * NOTE(review): this assumes blocks >= 1 whenever offset_in_block is
+	 * non-zero on this path - confirm against
+	 * nova_lookup_hole_in_range().
+	 */
+	if (offset_in_block) {
+		blocks--;
+		*offset += (blocks << data_bits) +
+			   (block_size - offset_in_block);
+	} else {
+		*offset += blocks << data_bits;
+	}
+
+	return 0;
+}
+
+/*
+ * writepages address-space operation: forwards to
+ * dax_writeback_mapping_range() using the block device of the mapping's
+ * super block, and returns its result.
+ */
+static int nova_writepages(struct address_space *mapping,
+	struct writeback_control *wbc)
+{
+	timing_t wp_time;
+	int err;
+
+	NOVA_START_TIMING(write_pages_t, wp_time);
+	err = dax_writeback_mapping_range(mapping,
+			mapping->host->i_sb->s_bdev, wbc);
+	NOVA_END_TIMING(write_pages_t, wp_time);
+
+	return err;
+}
+
+/*
+ * Address-space operations installed by nova_new_vfs_inode() for regular
+ * files, symlinks and directories.  Only writepages and direct_IO are
+ * provided; the dax_mem_protect hook is disabled.
+ */
+const struct address_space_operations nova_aops_dax = {
+	.writepages		= nova_writepages,
+	.direct_IO		= nova_direct_IO,
+	/*.dax_mem_protect	= nova_dax_mem_protect,*/
+};
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
new file mode 100644
index 000000000000..5ad69335799c
--- /dev/null
+++ b/fs/nova/inode.h
@@ -0,0 +1,389 @@
+#ifndef __INODE_H
+#define __INODE_H
+
+struct nova_inode_info_header;
+struct nova_inode;
+
+#include "super.h"
+#include "log.h"
+
+/*
+ * Kind of object being created; nova_new_vfs_inode() switches on this value
+ * to choose which operation tables to install on the new inode.
+ */
+enum nova_new_inode_type {
+	TYPE_CREATE = 0,
+	TYPE_MKNOD,
+	TYPE_SYMLINK,
+	TYPE_MKDIR
+};
+
+
+/*
+ * Structure of an inode in PMEM
+ * Keep the inode size to within 120 bytes: We use the last eight bytes
+ * as inode table tail pointer.
+ *
+ * The structure is packed and the fields below add up to exactly 120 bytes
+ * (three groups of 40).  csum is the last field; the checksum helpers in
+ * this header compute the CRC over everything that precedes it.
+ */
+struct nova_inode {
+
+	/* first 40 bytes (offsets 0-39) */
+	u8	i_rsvd;		 /* reserved. used to be checksum */
+	u8	valid;		 /* Is this inode valid? */
+	u8	deleted;	 /* Is this inode deleted? */
+	u8	i_blk_type;	 /* data block size this inode uses */
+	__le32	i_flags;	 /* Inode flags */
+	__le64	i_size;		 /* Size of data in bytes */
+	__le32	i_ctime;	 /* Inode modification time */
+	__le32	i_mtime;	 /* Inode b-tree Modification time */
+	__le32	i_atime;	 /* Access time */
+	__le16	i_mode;		 /* File mode */
+	__le16	i_links_count;	 /* Links count */
+
+	__le64	i_xattr;	 /* Extended attribute block */
+
+	/* second 40 bytes (offsets 40-79) */
+	__le32	i_uid;		 /* Owner Uid */
+	__le32	i_gid;		 /* Group Id */
+	__le32	i_generation;	 /* File version (for NFS) */
+	__le32	i_create_time;	 /* Create time */
+	__le64	nova_ino;	 /* nova inode number */
+
+	__le64	log_head;	 /* Log head pointer */
+	__le64	log_tail;	 /* Log tail pointer */
+
+	/* last 40 bytes (offsets 80-119) */
+	__le64	alter_log_head;	 /* Alternate log head pointer */
+	__le64	alter_log_tail;	 /* Alternate log tail pointer */
+
+	__le64	create_epoch_id; /* Transaction ID when create */
+	__le64	delete_epoch_id; /* Transaction ID when deleted */
+
+	struct {
+		__le32 rdev;	 /* major/minor # */
+	} dev;			 /* device inode */
+
+	__le32	csum;            /* CRC32 checksum */
+
+	/* Leave 8 bytes for inode table tail pointer */
+} __attribute((__packed__));
+
+/*
+ * Inode table.  It's a linked list of pages.
+ *
+ * log_head holds the little-endian pointer to the first page of the list.
+ * nova_get_inode_table() locates one of these per CPU, spaced
+ * CACHELINE_SIZE bytes apart.
+ */
+struct inode_table {
+	__le64 log_head;
+};
+
+/*
+ * NOVA-specific inode state kept in DRAM
+ *
+ * Embedded in struct nova_inode_info next to the VFS inode.  pi_addr and
+ * alter_pi_addr are the addresses handed to nova_get_block() to reach the
+ * persistent inode and its replica.
+ */
+struct nova_inode_info_header {
+	/* For files, tree holds a map from file offsets to
+	 * write log entries.
+	 *
+	 * For directories, tree holds a map from a hash of the file name to
+	 * dentry log entry.
+	 */
+	struct radix_tree_root tree;
+	struct rb_root vma_tree;	/* Write vmas */
+	struct list_head list;		/* SB list of mmap sih */
+	int num_vmas;
+	unsigned short i_mode;		/* Dir or file? */
+	unsigned long log_pages;	/* Num of log pages */
+	unsigned long i_size;
+	unsigned long i_blocks;
+	unsigned long ino;
+	unsigned long pi_addr;		/* Address of the persistent inode */
+	unsigned long alter_pi_addr;	/* Address of the replica inode */
+	unsigned long valid_entries;	/* For thorough GC */
+	unsigned long num_entries;	/* For thorough GC */
+	u64 last_setattr;		/* Last setattr entry */
+	u64 last_link_change;		/* Last link change entry */
+	u64 last_dentry;		/* Last updated dentry */
+	u64 trans_id;			/* Transaction ID */
+	u64 log_head;			/* Log head pointer */
+	u64 log_tail;			/* Log tail pointer */
+	u64 alter_log_head;		/* Alternate log head pointer */
+	u64 alter_log_tail;		/* Alternate log tail pointer */
+	u8  i_blk_type;			/* Index into blk_type_to_shift/size */
+};
+
+/*
+ * For rebuild purposes, temporarily store persistent inode information.
+ * Fields are in CPU byte order (plain u16/u32/u64), unlike the little-endian
+ * fields of struct nova_inode.
+ */
+struct nova_inode_rebuild {
+	u64	i_size;
+	u32	i_flags;	/* Inode flags */
+	u32	i_ctime;	/* Inode modification time */
+	u32	i_mtime;	/* Inode b-tree Modification time */
+	u32	i_atime;	/* Access time */
+	u32	i_uid;		/* Owner Uid */
+	u32	i_gid;		/* Group Id */
+	u32	i_generation;	/* File version (for NFS) */
+	u16	i_links_count;	/* Links count */
+	u16	i_mode;		/* File mode */
+	u64	trans_id;
+};
+
+/*
+ * DRAM state for inodes
+ *
+ * Wraps the VFS inode together with the NOVA header; NOVA_I() recovers this
+ * structure from a pointer to vfs_inode.
+ */
+struct nova_inode_info {
+	struct nova_inode_info_header header;
+	struct inode vfs_inode;
+};
+
+
+/*
+ * Return the nova_inode_info that embeds @inode as its vfs_inode member.
+ * This is pure pointer arithmetic (container_of); @inode is not dereferenced.
+ */
+static inline struct nova_inode_info *NOVA_I(struct inode *inode)
+{
+	return container_of(inode, struct nova_inode_info, vfs_inode);
+}
+
+/*
+ * Return a pointer to the replica of @inode's persistent inode.
+ *
+ * Returns NULL when metadata_csum is off, or when a trial memcpy_mcsafe()
+ * of the replica into a stack buffer reports an error.  The copied data is
+ * discarded; only the success of the read matters.
+ */
+static inline struct nova_inode *nova_get_alter_inode(struct super_block *sb,
+	struct inode *inode)
+{
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	struct nova_inode probe;
+	void *replica;
+
+	if (!metadata_csum)
+		return NULL;
+
+	replica = nova_get_block(sb, sih->alter_pi_addr);
+	if (memcpy_mcsafe(&probe, replica, sizeof(struct nova_inode)))
+		return NULL;
+
+	return (struct nova_inode *)replica;
+}
+
+/*
+ * Copy the persistent inode @pi over its replica.
+ *
+ * Returns 0 when metadata_csum is off (nothing to do) or after the copy,
+ * and -EINVAL when the replica cannot be obtained.
+ */
+static inline int nova_update_alter_inode(struct super_block *sb,
+	struct inode *inode, struct nova_inode *pi)
+{
+	struct nova_inode *replica;
+
+	if (!metadata_csum)
+		return 0;
+
+	replica = nova_get_alter_inode(sb, inode);
+	if (replica == NULL)
+		return -EINVAL;
+
+	memcpy_to_pmem_nocache(replica, pi, sizeof(struct nova_inode));
+	return 0;
+}
+
+
+/*
+ * Recompute and store the checksum of the persistent inode @pi, then flush
+ * the whole inode.
+ *
+ * The CRC covers every byte of struct nova_inode except the trailing csum
+ * field.  Does nothing when metadata_csum is off.  Always returns 0.
+ */
+static inline int nova_update_inode_checksum(struct nova_inode *pi)
+{
+	u32 crc = 0;
+
+	if (metadata_csum == 0)
+		return 0;
+
+	crc = nova_crc32c(~0, (__u8 *)pi,
+			(sizeof(struct nova_inode) - sizeof(__le32)));
+
+	/*
+	 * csum is a little-endian field and nova_check_inode_checksum()
+	 * compares it against cpu_to_le32(crc), so store it the same way.
+	 */
+	pi->csum = cpu_to_le32(crc);
+	nova_flush_buffer(pi, sizeof(struct nova_inode), 1);
+	return 0;
+}
+
+/*
+ * Verify the stored checksum of the persistent inode @pi.
+ *
+ * Returns 0 when metadata_csum is off or the checksum matches, and 1 on a
+ * mismatch.  The CRC covers every byte of struct nova_inode except the
+ * trailing csum field.
+ */
+static inline int nova_check_inode_checksum(struct nova_inode *pi)
+{
+	u32 expected;
+
+	if (!metadata_csum)
+		return 0;
+
+	expected = nova_crc32c(~0, (__u8 *)pi,
+			(sizeof(struct nova_inode) - sizeof(__le32)));
+
+	return pi->csum != cpu_to_le32(expected);
+}
+
+
+
+/*
+ * Store @new_tail as the log tail of the persistent inode @pi.
+ *
+ * A PERSISTENT_BARRIER() is issued before the store, and CACHELINE_SIZE
+ * bytes starting at the log_tail field are flushed afterwards.
+ * NOTE(review): presumably the barrier makes earlier log writes durable
+ * before the tail moves - confirm against the PERSISTENT_BARRIER()
+ * definition.
+ */
+static inline void nova_update_tail(struct nova_inode *pi, u64 new_tail)
+{
+	timing_t update_time;
+
+	NOVA_START_TIMING(update_tail_t, update_time);
+
+	PERSISTENT_BARRIER();
+	pi->log_tail = new_tail;
+	nova_flush_buffer(&pi->log_tail, CACHELINE_SIZE, 1);
+
+	NOVA_END_TIMING(update_tail_t, update_time);
+}
+
+/*
+ * Store @new_tail as the alternate log tail of the persistent inode @pi.
+ *
+ * Does nothing when metadata_csum is off.  Otherwise mirrors
+ * nova_update_tail(): barrier, store, then flush CACHELINE_SIZE bytes
+ * starting at the alter_log_tail field.  Timing is accounted under the same
+ * update_tail_t counter as the primary tail.
+ */
+static inline void nova_update_alter_tail(struct nova_inode *pi, u64 new_tail)
+{
+	timing_t update_time;
+
+	if (metadata_csum == 0)
+		return;
+
+	NOVA_START_TIMING(update_tail_t, update_time);
+
+	PERSISTENT_BARRIER();
+	pi->alter_log_tail = new_tail;
+	nova_flush_buffer(&pi->alter_log_tail, CACHELINE_SIZE, 1);
+
+	NOVA_END_TIMING(update_tail_t, update_time);
+}
+
+
+
+/*
+ * Update inode tails and checksums.
+ *
+ * @sb:           super block, used when refreshing the replica inode.
+ * @inode:        VFS inode whose DRAM header mirrors the tails; may be NULL,
+ *                in which case only the persistent inode is updated.
+ * @pi:           persistent inode to update.
+ * @update:       supplies the new primary and alternate log tails.
+ * @update_alter: non-zero to also copy @pi over its replica.
+ */
+static inline void nova_update_inode(struct super_block *sb,
+	struct inode *inode, struct nova_inode *pi,
+	struct nova_inode_update *update, int update_alter)
+{
+	/*
+	 * The replica update below already tolerates a NULL inode, so the
+	 * DRAM header must not be written through a pointer derived from it.
+	 */
+	if (inode) {
+		struct nova_inode_info *si = NOVA_I(inode);
+		struct nova_inode_info_header *sih = &si->header;
+
+		sih->log_tail = update->tail;
+		sih->alter_log_tail = update->alter_tail;
+	}
+
+	nova_update_tail(pi, update->tail);
+	if (metadata_csum)
+		nova_update_alter_tail(pi, update->alter_tail);
+
+	nova_update_inode_checksum(pi);
+	if (inode && update_alter)
+		nova_update_alter_inode(sb, inode, pi);
+}
+
+
+/*
+ * Locate the inode table head for @cpu.
+ *
+ * The low bit of @version picks the copy: even selects the block at
+ * INODE_TABLE0_START, odd the one at INODE_TABLE1_START.  Within that block
+ * the per-CPU entries are CACHELINE_SIZE bytes apart.  Returns NULL when
+ * @cpu is not below sbi->cpus.
+ */
+static inline
+struct inode_table *nova_get_inode_table(struct super_block *sb,
+	int version, int cpu)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	int table_start;
+	char *table_base;
+
+	if (cpu >= sbi->cpus)
+		return NULL;
+
+	table_start = (version & 0x1) ? INODE_TABLE1_START : INODE_TABLE0_START;
+	table_base = (char *)nova_get_block(sb,
+				NOVA_DEF_BLOCK_SIZE_4K * table_start);
+
+	return (struct inode_table *)(table_base + cpu * CACHELINE_SIZE);
+}
+
+/*
+ * Return the blk_type_to_shift[] entry for the inode's block type.
+ * sih->i_blk_type is used as the index without a range check.
+ */
+static inline unsigned int
+nova_inode_blk_shift(struct nova_inode_info_header *sih)
+{
+	return blk_type_to_shift[sih->i_blk_type];
+}
+
+/*
+ * Return the blk_type_to_size[] entry for the inode's block type.
+ * sih->i_blk_type is used as the index without a range check.
+ */
+static inline uint32_t nova_inode_blk_size(struct nova_inode_info_header *sih)
+{
+	return blk_type_to_size[sih->i_blk_type];
+}
+
+/*
+ * Offset of reserved inode @inode_number: the reserved inodes are laid out
+ * back to back, NOVA_INODE_SIZE bytes each, starting at 4K block
+ * RESERVE_INODE_START.  @sb is not used.
+ */
+static inline u64 nova_get_reserved_inode_addr(struct super_block *sb,
+	u64 inode_number)
+{
+	u64 slot_offset = inode_number * NOVA_INODE_SIZE;
+
+	return (NOVA_DEF_BLOCK_SIZE_4K * RESERVE_INODE_START) + slot_offset;
+}
+
+/*
+ * Offset of the replica of reserved inode @inode_number: the replica area
+ * starts at the offset of sbi->replica_reserved_inodes_addr and holds one
+ * NOVA_INODE_SIZE slot per inode.
+ */
+static inline u64 nova_get_alter_reserved_inode_addr(struct super_block *sb,
+	u64 inode_number)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	u64 slot_offset = inode_number * NOVA_INODE_SIZE;
+
+	return nova_get_addr_off(sbi, sbi->replica_reserved_inodes_addr) +
+			slot_offset;
+}
+
+/*
+ * Pointer to reserved inode @inode_number: sbi->virt_addr plus the offset
+ * computed by nova_get_reserved_inode_addr().
+ */
+static inline struct nova_inode *nova_get_reserved_inode(struct super_block *sb,
+	u64 inode_number)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	u64 offset = nova_get_reserved_inode_addr(sb, inode_number);
+
+	return (struct nova_inode *)(sbi->virt_addr + offset);
+}
+
+/*
+ * Pointer to the replica of reserved inode @inode_number: sbi->virt_addr
+ * plus the offset computed by nova_get_alter_reserved_inode_addr().
+ */
+static inline struct nova_inode *
+nova_get_alter_reserved_inode(struct super_block *sb,
+	u64 inode_number)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	u64 offset = nova_get_alter_reserved_inode_addr(sb, inode_number);
+
+	return (struct nova_inode *)(sbi->virt_addr + offset);
+}
+
+/*
+ * Look up a reserved inode by number.  Returns NULL for inode number 0 and
+ * for any number at or above NOVA_NORMAL_INODE_START.
+ *
+ * If this is part of a read-modify-write of the inode metadata,
+ * nova_memunlock_inode() before calling!
+ */
+static inline struct nova_inode *nova_get_inode_by_ino(struct super_block *sb,
+						  u64 ino)
+{
+	if (ino != 0 && ino < NOVA_NORMAL_INODE_START)
+		return nova_get_reserved_inode(sb, ino);
+
+	return NULL;
+}
+
+/*
+ * Return a pointer to @inode's persistent inode.
+ *
+ * The inode is first read into a stack buffer with memcpy_mcsafe(); NULL is
+ * returned if that read reports an error.  The copied data is discarded;
+ * only the success of the read matters.
+ */
+static inline struct nova_inode *nova_get_inode(struct super_block *sb,
+	struct inode *inode)
+{
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	struct nova_inode probe;
+	void *primary;
+
+	primary = nova_get_block(sb, sih->pi_addr);
+	if (memcpy_mcsafe(&probe, primary, sizeof(struct nova_inode)))
+		return NULL;
+
+	return (struct nova_inode *)primary;
+}
+
+
+
+/* Objects and functions defined in inode.c. */
+extern const struct address_space_operations nova_aops_dax;
+int nova_init_inode_inuse_list(struct super_block *sb);
+extern int nova_init_inode_table(struct super_block *sb);
+int nova_get_alter_inode_address(struct super_block *sb, u64 ino,
+	u64 *alter_pi_addr);
+unsigned long nova_get_last_blocknr(struct super_block *sb,
+	struct nova_inode_info_header *sih);
+int nova_get_inode_address(struct super_block *sb, u64 ino, int version,
+	u64 *pi_addr, int extendable, int extend_alternate);
+int nova_set_blocksize_hint(struct super_block *sb, struct inode *inode,
+	struct nova_inode *pi, loff_t new_size);
+extern struct inode *nova_iget(struct super_block *sb, unsigned long ino);
+extern void nova_evict_inode(struct inode *inode);
+extern int nova_write_inode(struct inode *inode, struct writeback_control *wbc);
+extern void nova_dirty_inode(struct inode *inode, int flags);
+extern int nova_notify_change(struct dentry *dentry, struct iattr *attr);
+extern int nova_getattr(const struct path *path, struct kstat *stat,
+			u32 request_mask, unsigned int flags);
+extern void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+	unsigned int flags);
+/* Returns 0 or a negated errno converted to unsigned long. */
+extern unsigned long nova_find_region(struct inode *inode, loff_t *offset,
+		int hole);
+int nova_delete_file_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, unsigned long start_blocknr,
+	unsigned long last_blocknr, bool delete_nvmm,
+	bool delete_dead, u64 trans_id);
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr);
+/* Returns the new inode or an ERR_PTR() value. */
+extern struct inode *nova_new_vfs_inode(enum nova_new_inode_type type,
+	struct inode *dir, u64 pi_addr, u64 ino, umode_t mode,
+	size_t size, dev_t rdev, const struct qstr *qstr, u64 epoch_id);
+
+#endif

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ