Implement lazy inode lru similarly to dcache. This will reduce lock acquisition and will help to improve lock ordering subsequently. Signed-off-by: Nick Piggin --- fs/fs-writeback.c | 21 +++++++-------- fs/inode.c | 61 +++++++++++++++++++--------------------------- include/linux/fs.h | 7 ++++- include/linux/writeback.h | 1 4 files changed, 42 insertions(+), 48 deletions(-) Index: linux-2.6/fs/inode.c =================================================================== --- linux-2.6.orig/fs/inode.c +++ linux-2.6/fs/inode.c @@ -96,7 +96,6 @@ static unsigned int i_hash_shift __read_ * allowing for low-overhead inode sync() operations. */ -LIST_HEAD(inode_in_use); LIST_HEAD(inode_unused); struct inode_hash_bucket { @@ -298,6 +297,7 @@ void inode_init_once(struct inode *inode INIT_HLIST_BL_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_list); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); @@ -323,25 +323,6 @@ static void init_once(void *foo) inode_init_once(inode); } -/* - * inode_lock must be held - */ -void __iget(struct inode *inode) -{ - assert_spin_locked(&inode->i_lock); - - inode->i_count++; - if (inode->i_count > 1) - return; - - if (!(inode->i_state & (I_DIRTY|I_SYNC))) { - spin_lock(&wb_inode_list_lock); - list_move(&inode->i_list, &inode_in_use); - spin_unlock(&wb_inode_list_lock); - } - atomic_dec(&inodes_stat.nr_unused); -} - /** * clear_inode - clear an inode * @inode: inode to clear @@ -384,7 +365,7 @@ static void dispose_list(struct list_hea struct inode *inode; inode = list_first_entry(head, struct inode, i_list); - list_del(&inode->i_list); + list_del_init(&inode->i_list); if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); @@ -437,11 +418,12 @@ static int invalidate_list(struct list_h invalidate_inode_buffers(inode); if (!inode->i_count) { spin_lock(&wb_inode_list_lock); - 
list_move(&inode->i_list, dispose); + list_del(&inode->i_list); spin_unlock(&wb_inode_list_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; spin_unlock(&inode->i_lock); + list_add(&inode->i_list, dispose); count++; continue; } @@ -480,19 +462,6 @@ int invalidate_inodes(struct super_block } EXPORT_SYMBOL(invalidate_inodes); -static int can_unuse(struct inode *inode) -{ - if (inode->i_state) - return 0; - if (inode_has_buffers(inode)) - return 0; - if (inode->i_count) - return 0; - if (inode->i_data.nrpages) - return 0; - return 1; -} - /* * Scan `goal' inodes on the unused list for freeable ones. They are moved to * a temporary list and then are freed outside inode_lock by dispose_list(). @@ -510,13 +479,12 @@ static void prune_icache(int nr_to_scan) { LIST_HEAD(freeable); int nr_pruned = 0; - int nr_scanned; unsigned long reap = 0; down_read(&iprune_sem); again: spin_lock(&wb_inode_list_lock); - for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + for (; nr_to_scan; nr_to_scan--) { struct inode *inode; if (list_empty(&inode_unused)) @@ -528,33 +496,30 @@ again: spin_unlock(&wb_inode_list_lock); goto again; } - if (inode->i_state || inode->i_count) { + if (inode->i_count || (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_list); + spin_unlock(&inode->i_lock); + atomic_dec(&inodes_stat.nr_unused); + continue; + } + if (inode->i_state) { list_move(&inode->i_list, &inode_unused); + inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { + list_move(&inode->i_list, &inode_unused); spin_unlock(&wb_inode_list_lock); __iget(inode); spin_unlock(&inode->i_lock); + if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, 0, -1); iput(inode); -again2: spin_lock(&wb_inode_list_lock); - - if (inode != list_entry(inode_unused.next, - struct inode, i_list)) - continue; /* wrong inode or list_empty */ - if (!spin_trylock(&inode->i_lock)) { 
- spin_unlock(&wb_inode_list_lock); - goto again2; - } - if (!can_unuse(inode)) { - spin_unlock(&inode->i_lock); - continue; - } + continue; } list_move(&inode->i_list, &freeable); WARN_ON(inode->i_state & I_NEW); @@ -694,9 +659,6 @@ __inode_add_to_lists(struct super_block atomic_inc(&inodes_stat.nr_inodes); list_add(&inode->i_sb_list, &sb->s_inodes); spin_unlock(&sb_inode_list_lock); - spin_lock(&wb_inode_list_lock); - list_add(&inode->i_list, &inode_in_use); - spin_unlock(&wb_inode_list_lock); if (b) { spin_lock_bucket(b); hlist_bl_add_head(&inode->i_hash, &b->head); @@ -1343,9 +1305,13 @@ void generic_delete_inode(struct inode * { const struct super_operations *op = inode->i_sb->s_op; - spin_lock(&wb_inode_list_lock); - list_del_init(&inode->i_list); - spin_unlock(&wb_inode_list_lock); + if (!list_empty(&inode->i_list)) { + spin_lock(&wb_inode_list_lock); + list_del_init(&inode->i_list); + spin_unlock(&wb_inode_list_lock); + if (!inode->i_state) + atomic_dec(&inodes_stat.nr_unused); + } list_del_init(&inode->i_sb_list); spin_unlock(&sb_inode_list_lock); WARN_ON(inode->i_state & I_NEW); @@ -1393,13 +1359,15 @@ int generic_detach_inode(struct inode *i struct super_block *sb = inode->i_sb; if (!hlist_bl_unhashed(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_SYNC))) { - spin_lock(&wb_inode_list_lock); - list_move(&inode->i_list, &inode_unused); - spin_unlock(&wb_inode_list_lock); - } - atomic_inc(&inodes_stat.nr_unused); if (sb->s_flags & MS_ACTIVE) { + inode->i_state |= I_REFERENCED; + if (!(inode->i_state & (I_DIRTY|I_SYNC)) && + list_empty(&inode->i_list)) { + spin_lock(&wb_inode_list_lock); + list_add(&inode->i_list, &inode_unused); + spin_unlock(&wb_inode_list_lock); + atomic_inc(&inodes_stat.nr_unused); + } spin_unlock(&inode->i_lock); spin_unlock(&sb_inode_list_lock); return 0; @@ -1414,11 +1382,14 @@ int generic_detach_inode(struct inode *i WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; __remove_inode_hash(inode); - 
atomic_dec(&inodes_stat.nr_unused); } - spin_lock(&wb_inode_list_lock); - list_del_init(&inode->i_list); - spin_unlock(&wb_inode_list_lock); + if (!list_empty(&inode->i_list)) { + spin_lock(&wb_inode_list_lock); + list_del_init(&inode->i_list); + spin_unlock(&wb_inode_list_lock); + if (!inode->i_state) + atomic_dec(&inodes_stat.nr_unused); + } list_del_init(&inode->i_sb_list); spin_unlock(&sb_inode_list_lock); WARN_ON(inode->i_state & I_NEW); Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h +++ linux-2.6/include/linux/fs.h @@ -1634,16 +1634,17 @@ struct super_operations { * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ -#define I_DIRTY_SYNC 1 -#define I_DIRTY_DATASYNC 2 -#define I_DIRTY_PAGES 4 +#define I_DIRTY_SYNC 0x01 +#define I_DIRTY_DATASYNC 0x02 +#define I_DIRTY_PAGES 0x04 #define __I_NEW 3 #define I_NEW (1 << __I_NEW) -#define I_WILL_FREE 16 -#define I_FREEING 32 -#define I_CLEAR 64 +#define I_WILL_FREE 0x10 +#define I_FREEING 0x20 +#define I_CLEAR 0x40 #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) +#define I_REFERENCED 0x100 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -2171,7 +2172,6 @@ extern int insert_inode_locked4(struct i extern int insert_inode_locked(struct inode *); extern void unlock_new_inode(struct inode *); -extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void destroy_inode(struct inode *); @@ -2422,6 +2422,12 @@ extern int generic_show_options(struct s extern void save_mount_options(struct super_block *sb, char *options); extern void replace_mount_options(struct super_block *sb, char *options); +static inline void __iget(struct inode *inode) +{ + assert_spin_locked(&inode->i_lock); + inode->i_count++; +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; Index: linux-2.6/fs/fs-writeback.c 
=================================================================== --- linux-2.6.orig/fs/fs-writeback.c +++ linux-2.6/fs/fs-writeback.c @@ -552,15 +552,8 @@ select_queue: inode->i_state |= I_DIRTY_PAGES; redirty_tail(inode); } - } else if (inode->i_count) { - /* - * The inode is clean, inuse - */ - list_move(&inode->i_list, &inode_in_use); } else { - /* - * The inode is clean, unused - */ + /* The inode is clean */ list_move(&inode->i_list, &inode_unused); } } @@ -1206,8 +1199,6 @@ static void wait_sb_inodes(struct super_ */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&sb_inode_list_lock); - /* * Data integrity sync. Must wait for all pages under writeback, * because there may have been pages dirtied before our sync @@ -1215,7 +1206,8 @@ static void wait_sb_inodes(struct super_ * In which case, the inode may not be on the dirty list, but * we still have to wait for that writeout. */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + rcu_read_lock(); + list_for_each_entry_rcu(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; mapping = inode->i_mapping; @@ -1229,13 +1221,13 @@ static void wait_sb_inodes(struct super_ } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb_inode_list_lock); + rcu_read_unlock(); /* * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * sb_inode_list_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it under - * spinlock. So we keep the reference and iput it later. + * removed from s_inodes list while we dropped the i_lock. We + * cannot iput the inode now as we can be holding the last + * reference and we cannot iput it under spinlock. So we keep + * the reference and iput it later. 
*/ iput(old_inode); old_inode = inode; @@ -1244,9 +1236,9 @@ static void wait_sb_inodes(struct super_ cond_resched(); - spin_lock(&sb_inode_list_lock); + rcu_read_lock(); } - spin_unlock(&sb_inode_list_lock); + rcu_read_unlock(); iput(old_inode); } Index: linux-2.6/include/linux/writeback.h =================================================================== --- linux-2.6.orig/include/linux/writeback.h +++ linux-2.6/include/linux/writeback.h @@ -11,7 +11,6 @@ struct backing_dev_info; extern spinlock_t sb_inode_list_lock; extern spinlock_t wb_inode_list_lock; -extern struct list_head inode_in_use; extern struct list_head inode_unused; /* -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/