The inode count: i_count, does not conform to normal reference counting semantics, its a usage count. That is, objects can and do live without any references (i_count == 0). This is because the reference from the inode hash table is not accounted. This makes that find_inode_fast() can result in 0->1 transitions and similarly, iput() can do 1->0->1 transitions. This patch changes things to include the inode hash table's reference and reworks iput() to avoid spurious drops to 0. It does mean that we now call super_operations::drop_inode() with non-zero i_count, but I could not find an implementation where this mattered. Only once we really decide to fully drop the inode and remove it from the hash will we do the final dec_and_test. This basically boils down to pushing part of iput() into atomic_dec_and_lock(). Also, this would allow an RCU based find_inode*(). Signed-off-by: Peter Zijlstra (Intel) --- fs/btrfs/inode.c | 2 +- fs/inode.c | 24 +++++++++++++++++++----- include/linux/fs.h | 11 ++++++++++- 3 files changed, 30 insertions(+), 7 deletions(-) --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3172,7 +3172,7 @@ void btrfs_add_delayed_iput(struct inode struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_inode *binode = BTRFS_I(inode); - if (atomic_add_unless(&inode->i_count, -1, 1)) + if (atomic_add_unless(&inode->i_count, -1, 2)) return; spin_lock(&fs_info->delayed_iput_lock); --- a/fs/inode.c +++ b/fs/inode.c @@ -135,7 +135,7 @@ int inode_init_always(struct super_block inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; - atomic_set(&inode->i_count, 1); + atomic_set(&inode->i_count, 2); /* hashed and ref */ inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; inode->__i_nlink = 1; @@ -387,7 +387,7 @@ static void init_once(void *foo) void __iget(struct inode *inode) { lockdep_assert_held(&inode->i_lock); - atomic_inc(&inode->i_count); + WARN_ON(!atomic_inc_not_zero(&inode->i_count)); } /* @@ -395,7 +395,7 @@ void __iget(struct inode *inode) */ void ihold(struct inode *inode) { - WARN_ON(atomic_inc_return(&inode->i_count) < 2); + WARN_ON(atomic_inc_return(&inode->i_count) < 3); } EXPORT_SYMBOL(ihold); @@ -814,6 +814,8 @@ static struct inode *find_inode_fast(str { struct inode *inode = NULL; + lockdep_assert_held(&inode_hash_lock); + repeat: hlist_for_each_entry(inode, head, i_hash) { if (inode->i_ino != ino) @@ -1488,11 +1490,12 @@ void iput(struct inode *inode) BUG_ON(inode->i_state & I_CLEAR); retry: - if (!atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) + if (atomic_add_unless(&inode->i_count, -1, 2)) return; + spin_lock(&inode->i_lock); + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { - atomic_inc(&inode->i_count); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); trace_writeback_lazytime_iput(inode); @@ -1500,6 +1503,7 @@ void iput(struct inode *inode) goto retry; } + atomic_dec(&inode->i_count); /* 2 -> 1 */ WARN_ON(inode->i_state & I_NEW); /* @@ -1520,6 +1524,16 @@ void iput(struct inode *inode) spin_unlock(&inode->i_lock); return; } + + /* + * If, at this point, only the hashtable has a reference left + * continue to take the inode out, otherwise someone got a ref + * while we weren't looking. + */ + if (atomic_cmpxchg(&inode->i_count, 1, 0) != 1) { + spin_unlock(&inode->i_lock); + return; + } if (!drop) { inode->i_state |= I_WILL_FREE; --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2713,7 +2713,16 @@ extern unsigned int get_next_ino(void); static inline int i_count(struct inode *inode) { - return atomic_read(&inode->i_count); + int i_count = atomic_read(&inode->i_count); + + /* + * In order to preserve the 'old' usage-count semantics, remove the + * reference that the hash-table has. + */ + if (i_count) + i_count--; + + return i_count; } extern void __iget(struct inode * inode);