lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1426016724-23912-9-git-send-email-jbacik@fb.com>
Date:	Tue, 10 Mar 2015 15:45:23 -0400
From:	Josef Bacik <jbacik@...com>
To:	<linux-fsdevel@...r.kernel.org>, <david@...morbit.com>,
	<viro@...iv.linux.org.uk>, <jack@...e.cz>,
	<linux-kernel@...r.kernel.org>
CC:	Dave Chinner <dchinner@...hat.com>
Subject: [PATCH 8/9] inode: convert per-sb inode list to a list_lru

From: Dave Chinner <dchinner@...hat.com>

The per-superblock inode list and lock are a bottleneck for systems
that cycle inodes in and out of cache concurrently. The global lock
is a limiting factor.

Most of the additions to the sb inode list occur on the CPU that
allocated the inode, and most of the removals occur during evict()
calls as a result of memory reclaim. Both of these events are local
to the node that the inode belongs to, so it maps to the per-node
lists that the list_lru uses.

There are several places where the inode list is walked. These can
be converted easily to use list_lru_walk() to do their work on each
inode on the list.

Signed-off-by: Dave Chinner <dchinner@...hat.com>
---
 fs/block_dev.c         |  76 ++++++++++++--------
 fs/drop_caches.c       |  58 ++++++++++-----
 fs/inode.c             | 136 ++++++++++++++++++-----------------
 fs/notify/inode_mark.c | 121 +++++++++++++-------------------
 fs/quota/dquot.c       | 187 ++++++++++++++++++++++++++++++++-----------------
 fs/super.c             |   8 ++-
 include/linux/fs.h     |   9 ++-
 7 files changed, 340 insertions(+), 255 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eb2436..d23ce6f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1749,38 +1749,56 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
 }
 EXPORT_SYMBOL(__invalidate_device);
 
-void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
-{
-	struct inode *inode, *old_inode = NULL;
+struct bdev_iter {
+	void (*func)(struct block_device *, void *);
+	void *arg;
+	struct inode *toput_inode;
+};
 
-	spin_lock(&blockdev_superblock->s_inode_list_lock);
-	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
-		struct address_space *mapping = inode->i_mapping;
+static enum lru_status
+bdev_iter_cb(struct list_head *item, struct list_lru_one *lru,
+	     spinlock_t *lock, void *cb_arg)
+{
+	struct bdev_iter *iter = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
 
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
-		    mapping->nrpages == 0) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		__iget(inode);
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+	    inode->i_mapping->nrpages == 0) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&blockdev_superblock->s_inode_list_lock);
-		/*
-		 * We hold a reference to 'inode' so it couldn't have been
-		 * removed from s_inodes list while we dropped the
-		 * s_inode_list_lock  We cannot iput the inode now as we can
-		 * be holding the last reference and we cannot iput it under
-		 * s_inode_list_lock. So we keep the reference and iput it
-		 * later.
-		 */
-		iput(old_inode);
-		old_inode = inode;
+		return LRU_SKIP;
+	}
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(lock);
 
-		func(I_BDEV(inode), arg);
+	iput(iter->toput_inode);
+	iter->toput_inode = inode;
 
-		spin_lock(&blockdev_superblock->s_inode_list_lock);
-	}
-	spin_unlock(&blockdev_superblock->s_inode_list_lock);
-	iput(old_inode);
+	iter->func(I_BDEV(inode), iter->arg);
+
+	/*
+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
+	 * we have a reference to the current inode and so its next pointer is
+	 * guaranteed to be valid even though we dropped the list lock.
+	 */
+	spin_lock(lock);
+	return LRU_SKIP;
+}
+
+/*
+ * iterate_bdevs - run a callback across all block devices
+ */
+void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+{
+	struct bdev_iter iter = {
+		.func = func,
+		.arg = arg,
+	};
+
+	list_lru_walk(&blockdev_superblock->s_inode_list, bdev_iter_cb, &iter,
+		      ULONG_MAX);
+
+	/* the list walk doesn't release the last inode it sees! */
+	iput(iter.toput_inode);
 }
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d72d52b..ee381e1 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -13,29 +13,51 @@
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
 
-static void drop_pagecache_sb(struct super_block *sb, void *unused)
+static enum lru_status
+drop_pagecache_inode(struct list_head *item, struct list_lru_one *lru,
+		     spinlock_t *lock, void *cb_arg)
 {
-	struct inode *inode, *toput_inode = NULL;
+	struct inode **toput_inode = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		spin_lock(&inode->i_lock);
-		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
-		    (inode->i_mapping->nrpages == 0)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		__iget(inode);
+	spin_lock(&inode->i_lock);
+	if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+	    (inode->i_mapping->nrpages == 0)) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&sb->s_inode_list_lock);
+		return LRU_SKIP;
+	}
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(lock);
 
-		invalidate_mapping_pages(inode->i_mapping, 0, -1);
-		iput(toput_inode);
-		toput_inode = inode;
+	iput(*toput_inode);
+	*toput_inode = inode;
 
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	invalidate_mapping_pages(inode->i_mapping, 0, -1);
+
+	/*
+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
+	 * we have a reference to the current inode and so its next pointer is
+	 * guaranteed to be valid even though we dropped the list lock.
+	 */
+	spin_lock(lock);
+	return LRU_SKIP;
+}
+
+
+/*
+ * This is a best effort scan, so we don't need to be absolutely sure we hit
+ * all inodes on the superblock. Hence a single pass over the list is
+ * sufficient.
+ */
+static void drop_pagecache_sb(struct super_block *sb, void *unused)
+{
+	struct inode *toput_inode = NULL;
+
+	list_lru_walk(&sb->s_inode_list, drop_pagecache_inode, &toput_inode,
+		      ULONG_MAX);
+
+	/* the list walk doesn't release the last inode it sees! */
 	iput(toput_inode);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index b961e5a..17da8801 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -28,8 +28,8 @@
  *   inode->i_state, inode->i_hash, __iget()
  * Inode LRU list locks protect:
  *   inode->i_sb->s_inode_lru, inode->i_lru
- * inode->i_sb->s_inode_list_lock protects:
- *   inode->i_sb->s_inodes, inode->i_sb_list
+ * Inode list locks protect:
+ *   inode->i_sb->s_inode_list, inode->i_sb_list
  * bdi->wb.list_lock protects:
  *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
  * inode_hash_lock protects:
@@ -37,7 +37,7 @@
  *
  * Lock ordering:
  *
- * inode->i_sb->s_inode_list_lock
+ * Inode list lock
  *   inode->i_lock
  *     Inode LRU list locks
  *
@@ -45,7 +45,7 @@
  *   inode->i_lock
  *
  * inode_hash_lock
- *   inode->i_sb->s_inode_list_lock
+ *   Inode list lock
  *   inode->i_lock
  *
  * iunique_lock
@@ -357,6 +357,7 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_LIST_HEAD(&inode->i_io_list);
 	INIT_LIST_HEAD(&inode->i_wb_list);
+	INIT_LIST_HEAD(&inode->i_sb_list);
 	INIT_LIST_HEAD(&inode->i_lru);
 	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
@@ -423,19 +424,13 @@ static void inode_lru_list_del(struct inode *inode)
  */
 void inode_sb_list_add(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_list_lock);
-	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
-	spin_unlock(&inode->i_sb->s_inode_list_lock);
+	list_lru_add(&inode->i_sb->s_inode_list, &inode->i_sb_list);
 }
 EXPORT_SYMBOL_GPL(inode_sb_list_add);
 
 static inline void inode_sb_list_del(struct inode *inode)
 {
-	if (!list_empty(&inode->i_sb_list)) {
-		spin_lock(&inode->i_sb->s_inode_list_lock);
-		list_del_init(&inode->i_sb_list);
-		spin_unlock(&inode->i_sb->s_inode_list_lock);
-	}
+	list_lru_del(&inode->i_sb->s_inode_list, &inode->i_sb_list);
 }
 
 static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -577,6 +572,50 @@ static void dispose_list(struct list_head *head)
 	}
 }
 
+static enum lru_status
+__evict_inodes_isolate(struct list_head *item, struct list_lru_one *lru,
+		       spinlock_t *lock, void *cb_arg, bool kill_dirty)
+{
+	struct list_head *dispose = cb_arg;
+	struct inode	*inode = container_of(item, struct inode, i_sb_list);
+
+	if (atomic_read(&inode->i_count))
+		return LRU_SKIP;
+
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+		spin_unlock(&inode->i_lock);
+		return LRU_SKIP;
+	}
+
+	if ((inode->i_state & I_DIRTY) && !kill_dirty) {
+		spin_unlock(&inode->i_lock);
+		return LRU_SKIP;
+	}
+
+	inode->i_state |= I_FREEING;
+	inode_lru_list_del(inode);
+	list_add(&inode->i_lru, dispose);
+
+	list_lru_isolate(lru, item);
+	spin_unlock(&inode->i_lock);
+	return LRU_REMOVED;
+}
+
+static enum lru_status
+evict_inodes_isolate(struct list_head *item, struct list_lru_one *lru,
+		     spinlock_t *lock, void *cb_arg)
+{
+	return __evict_inodes_isolate(item, lru, lock, cb_arg, true);
+}
+
+static enum lru_status
+invalidate_inodes_isolate(struct list_head *item, struct list_lru_one *lru,
+			  spinlock_t *lock, void *cb_arg)
+{
+	return __evict_inodes_isolate(item, lru, lock, cb_arg, false);
+}
+
 /**
  * evict_inodes	- evict all evictable inodes for a superblock
  * @sb:		superblock to operate on
@@ -588,28 +627,15 @@ static void dispose_list(struct list_head *head)
  */
 void evict_inodes(struct super_block *sb)
 {
-	struct inode *inode, *next;
-	LIST_HEAD(dispose);
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		if (atomic_read(&inode->i_count))
-			continue;
-
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
+	long freed;
 
-		inode->i_state |= I_FREEING;
-		inode_lru_list_del(inode);
-		spin_unlock(&inode->i_lock);
-		list_add(&inode->i_lru, &dispose);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	do {
+		LIST_HEAD(dispose);
 
-	dispose_list(&dispose);
+		freed = list_lru_walk(&sb->s_inode_list, evict_inodes_isolate,
+				      &dispose, ULONG_MAX);
+		dispose_list(&dispose);
+	} while (freed > 0);
 }
 
 /**
@@ -624,38 +650,24 @@ void evict_inodes(struct super_block *sb)
  */
 int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 {
-	int busy = 0;
-	struct inode *inode, *next;
-	LIST_HEAD(dispose);
+	list_lru_walk_cb isolate;
+	long freed;
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
-			spin_unlock(&inode->i_lock);
-			busy = 1;
-			continue;
-		}
-		if (atomic_read(&inode->i_count)) {
-			spin_unlock(&inode->i_lock);
-			busy = 1;
-			continue;
-		}
+	isolate = kill_dirty ? evict_inodes_isolate :invalidate_inodes_isolate;
 
-		inode->i_state |= I_FREEING;
-		inode_lru_list_del(inode);
-		spin_unlock(&inode->i_lock);
-		list_add(&inode->i_lru, &dispose);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	do {
+		LIST_HEAD(dispose);
 
-	dispose_list(&dispose);
+		freed = list_lru_walk(&sb->s_inode_list, isolate,
+				      &dispose, ULONG_MAX);
+		dispose_list(&dispose);
+	} while (freed > 0);
 
-	return busy;
+	/*
+	 * if we skipped any inodes because we couldn't isolate them, tell the
+	 * caller there are still active inodes.
+	 */
+	return !!list_lru_count(&sb->s_inode_list);
 }
 
 /*
@@ -849,7 +861,7 @@ EXPORT_SYMBOL(get_next_ino);
  *	@sb: superblock
  *
  *	Allocates a new inode for given superblock.
- *	Inode wont be chained in superblock s_inodes list
+ *	Inode won't be chained in the superblock's s_inode_list
  *	This means :
  *	- fs can't be unmount
  *	- quotas, fsnotify, writeback can't work
@@ -883,8 +895,6 @@ struct inode *new_inode(struct super_block *sb)
 {
 	struct inode *inode;
 
-	spin_lock_prefetch(&sb->s_inode_list_lock);
-
 	inode = new_inode_pseudo(sb);
 	if (inode)
 		inode_sb_list_add(inode);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index a4e1a8f..a0cdc66 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -161,87 +161,60 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 	return ret;
 }
 
-/**
- * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
- * @sb: superblock being unmounted.
- *
- * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
- */
-void fsnotify_unmount_inodes(struct super_block *sb)
-{
-	struct inode *inode, *next_i, *need_iput = NULL;
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
-		struct inode *need_iput_tmp;
-
-		/*
-		 * We cannot __iget() an inode in state I_FREEING,
-		 * I_WILL_FREE, or I_NEW which is fine because by that point
-		 * the inode cannot have any associated watches.
-		 */
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
+static enum lru_status
+fsnotify_unmount_inode(struct list_head *item, struct list_lru_one *lru,
+		       spinlock_t *lock, void *cb_arg)
+ {
+	struct inode **toput_inode = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
+
+	/* New or being freed inodes cannot have any associated watches. */
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+		spin_unlock(&inode->i_lock);
+		return LRU_SKIP;
+	}
 
-		/*
-		 * If i_count is zero, the inode cannot have any watches and
-		 * doing an __iget/iput with MS_ACTIVE clear would actually
-		 * evict all inodes with zero i_count from icache which is
-		 * unnecessarily violent and may in fact be illegal to do.
-		 */
-		if (!atomic_read(&inode->i_count)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-
-		need_iput_tmp = need_iput;
-		need_iput = NULL;
-
-		/* In case fsnotify_inode_delete() drops a reference. */
-		if (inode != need_iput_tmp)
-			__iget(inode);
-		else
-			need_iput_tmp = NULL;
+	/* If i_count is zero, the inode cannot have any watches */
+	if (!atomic_read(&inode->i_count)) {
 		spin_unlock(&inode->i_lock);
+		return LRU_SKIP;
+	}
 
-		/* In case the dropping of a reference would nuke next_i. */
-		while (&next_i->i_sb_list != &sb->s_inodes) {
-			spin_lock(&next_i->i_lock);
-			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
-						atomic_read(&next_i->i_count)) {
-				__iget(next_i);
-				need_iput = next_i;
-				spin_unlock(&next_i->i_lock);
-				break;
-			}
-			spin_unlock(&next_i->i_lock);
-			next_i = list_entry(next_i->i_sb_list.next,
-						struct inode, i_sb_list);
-		}
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(lock);
 
-		/*
-		 * We can safely drop s_inode_list_lock here because either
-		 * we actually hold references on both inode and next_i or
-		 * end of list.  Also no new inodes will be added since the
-		 * umount has begun.
-		 */
-		spin_unlock(&sb->s_inode_list_lock);
+	iput(*toput_inode);
+	*toput_inode = inode;
 
-		if (need_iput_tmp)
-			iput(need_iput_tmp);
+	/* for each watch, send FS_UNMOUNT and then remove it */
+	fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	fsnotify_inode_delete(inode);
 
-		/* for each watch, send FS_UNMOUNT and then remove it */
-		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	/*
+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
+	 * we have a reference to the current inode and so its next pointer is
+	 * guaranteed to be valid even though we dropped the list lock.
+	 */
+	spin_lock(lock);
+	return LRU_SKIP;
+}
 
-		fsnotify_inode_delete(inode);
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @sb: superblock being unmounted.
+ *
+ * Called during unmount with sb->s_umount held exclusively, so the inode list
+ * will not grow and a single pass will catch all inodes.
+ */
+void fsnotify_unmount_inodes(struct super_block *sb)
+{
+	struct inode *toput_inode = NULL;
 
-		iput(inode);
+	list_lru_walk(&sb->s_inode_list, fsnotify_unmount_inode, &toput_inode,
+		      ULONG_MAX);
 
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	/* the list walk doesn't release the last inode it sees! */
+	iput(toput_inode);
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 68c7ae3..20c97c7 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -912,55 +912,81 @@ static int dqinit_needed(struct inode *inode, int type)
 	return 0;
 }
 
-/* This routine is guarded by dqonoff_mutex mutex */
-static void add_dquot_ref(struct super_block *sb, int type)
+static enum lru_status
+add_dquot_ref_type(struct list_head *item, struct list_lru_one *lru,
+		   spinlock_t *lock, void *cb_arg, int type)
 {
-	struct inode *inode, *old_inode = NULL;
-#ifdef CONFIG_QUOTA_DEBUG
-	int reserved = 0;
-#endif
+	struct inode **toput_inode = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		spin_lock(&inode->i_lock);
-		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
-		    !atomic_read(&inode->i_writecount) ||
-		    !dqinit_needed(inode, type)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		__iget(inode);
+	spin_lock(&inode->i_lock);
+	if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+	    !atomic_read(&inode->i_writecount) ||
+	    !dqinit_needed(inode, type)) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&sb->s_inode_list_lock);
+		return LRU_SKIP;
+	}
+
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(lock);
 
 #ifdef CONFIG_QUOTA_DEBUG
-		if (unlikely(inode_get_rsv_space(inode) > 0))
-			reserved = 1;
+	if (unlikely(inode_get_rsv_space(inode) > 0))
+		quota_error(inode->i_sb, "Writes happened before quota was "
+			    "turned on thus quota information is probably "
+			    "inconsistent. Please run quotacheck(8)");
 #endif
-		iput(old_inode);
-		__dquot_initialize(inode, type);
 
-		/*
-		 * We hold a reference to 'inode' so it couldn't have been
-		 * removed from s_inodes list while we dropped the
-		 * s_inode_list_lock. We cannot iput the inode now as we can be
-		 * holding the last reference and we cannot iput it under
-		 * s_inode_list_lock. So we keep the reference and iput it
-		 * later.
-		 */
-		old_inode = inode;
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-	iput(old_inode);
+	iput(*toput_inode);
+	*toput_inode = inode;
 
-#ifdef CONFIG_QUOTA_DEBUG
-	if (reserved) {
-		quota_error(sb, "Writes happened before quota was turned on "
-			"thus quota information is probably inconsistent. "
-			"Please run quotacheck(8)");
+	__dquot_initialize(inode, type);
+
+	/*
+	 * Even though we have dropped the lock here, we can return LRU_SKIP as
+	 * we have a reference to the current inode and so its next pointer is
+	 * guaranteed to be valid even though we dropped the list lock.
+	 */
+	spin_lock(lock);
+	return LRU_SKIP;
+}
+
+static enum lru_status
+add_dquot_ref_usr(struct list_head *item, struct list_lru_one *lru,
+		  spinlock_t *lock, void *cb_arg)
+{
+	return add_dquot_ref_type(item, lru, lock, cb_arg, USRQUOTA);
+}
+
+static enum lru_status
+add_dquot_ref_grp(struct list_head *item, struct list_lru_one *lru,
+		  spinlock_t *lock, void *cb_arg)
+{
+	return add_dquot_ref_type(item, lru, lock, cb_arg, GRPQUOTA);
+}
+
+/* add_dquot_ref is protected by the dqonoff_mutex mutex */
+void add_dquot_ref(struct super_block *sb, int type)
+{
+	struct inode *toput_inode = NULL;
+	list_lru_walk_cb isolate;
+
+	switch (type) {
+	case USRQUOTA:
+		isolate = add_dquot_ref_usr;
+		break;
+	case GRPQUOTA:
+		isolate = add_dquot_ref_grp;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return;
 	}
-#endif
+	list_lru_walk(&sb->s_inode_list, isolate, &toput_inode, ULONG_MAX);
+
+	/* the list walk doesn't release the last inode it sees! */
+	iput(toput_inode);
 }
 
 /*
@@ -1013,36 +1039,67 @@ static void put_dquot_list(struct list_head *tofree_head)
 	}
 }
 
-static void remove_dquot_ref(struct super_block *sb, int type,
-		struct list_head *tofree_head)
+static enum lru_status
+remove_dquot_ref_type(struct list_head *item, struct list_lru_one *lru,
+		      spinlock_t *lock, void *cb_arg, int type)
 {
-	struct inode *inode;
-	int reserved = 0;
+	struct list_head *tofree_head = cb_arg;
+	struct inode *inode = container_of(item, struct inode, i_sb_list);
 
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		/*
-		 *  We have to scan also I_NEW inodes because they can already
-		 *  have quota pointer initialized. Luckily, we need to touch
-		 *  only quota pointers and these have separate locking
-		 *  (dq_data_lock).
-		 */
-		spin_lock(&dq_data_lock);
-		if (!IS_NOQUOTA(inode)) {
-			if (unlikely(inode_get_rsv_space(inode) > 0))
-				reserved = 1;
-			remove_inode_dquot_ref(inode, type, tofree_head);
-		}
-		spin_unlock(&dq_data_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
+	/*
+	 *  We have to scan also I_NEW inodes because they can already
+	 *  have quota pointer initialized. Luckily, we need to touch
+	 *  only quota pointers and these have separate locking
+	 *  (dq_data_lock).
+	 */
+	spin_lock(&dq_data_lock);
+	if (!IS_NOQUOTA(inode)) {
 #ifdef CONFIG_QUOTA_DEBUG
-	if (reserved) {
-		printk(KERN_WARNING "VFS (%s): Writes happened after quota"
+		if (unlikely(inode_get_rsv_space(inode) > 0)) {
+			printk_ratelimited(KERN_WARNING
+			"VFS (%s): Writes happened after quota"
 			" was disabled thus quota information is probably "
-			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
-	}
+			"inconsistent. Please run quotacheck(8).\n",
+			inode->i_sb->s_id);
+		}
 #endif
+		remove_inode_dquot_ref(inode, type, tofree_head);
+	}
+	spin_unlock(&dq_data_lock);
+	return LRU_SKIP;
+}
+
+static enum lru_status
+remove_dquot_ref_usr(struct list_head *item, struct list_lru_one *lru,
+		     spinlock_t *lock, void *cb_arg)
+{
+	return remove_dquot_ref_type(item, lru, lock, cb_arg, USRQUOTA);
+}
+
+static enum lru_status
+remove_dquot_ref_grp(struct list_head *item, struct list_lru_one *lru,
+		     spinlock_t *lock, void *cb_arg)
+{
+	return remove_dquot_ref_type(item, lru, lock, cb_arg, GRPQUOTA);
+}
+
+static void remove_dquot_ref(struct super_block *sb, int type,
+		struct list_head *tofree_head)
+{
+	list_lru_walk_cb isolate;
+
+	switch (type) {
+	case USRQUOTA:
+		isolate = remove_dquot_ref_usr;
+		break;
+	case GRPQUOTA:
+		isolate = remove_dquot_ref_grp;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return;
+	}
+	list_lru_walk(&sb->s_inode_list, isolate, tofree_head, ULONG_MAX);
 }
 
 /* Gather all references from inodes and drop them */
diff --git a/fs/super.c b/fs/super.c
index 6a05d94..721584a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -146,6 +146,7 @@ static void destroy_super(struct super_block *s)
 	int i;
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
+	list_lru_destroy(&s->s_inode_list);
 	for (i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_counter_destroy(&s->s_writers.counter[i]);
 	security_sb_free(s);
@@ -191,13 +192,13 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	mutex_init(&s->s_sync_lock);
-	INIT_LIST_HEAD(&s->s_inodes);
-	spin_lock_init(&s->s_inode_list_lock);
 
 	if (list_lru_init_memcg(&s->s_dentry_lru))
 		goto fail;
 	if (list_lru_init_memcg(&s->s_inode_lru))
 		goto fail;
+	if (list_lru_init(&s->s_inode_list))
+		goto fail;
 
 	init_rwsem(&s->s_umount);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -294,6 +295,7 @@ void deactivate_locked_super(struct super_block *s)
 		 */
 		list_lru_destroy(&s->s_dentry_lru);
 		list_lru_destroy(&s->s_inode_lru);
+		list_lru_destroy(&s->s_inode_list);
 
 		put_filesystem(fs);
 		put_super(s);
@@ -413,7 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
 		if (sop->put_super)
 			sop->put_super(sb);
 
-		if (!list_empty(&sb->s_inodes)) {
+		if (list_lru_count(&sb->s_inode_list)) {
 			printk("VFS: Busy inodes after unmount of %s. "
 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
 			   sb->s_id);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 66284fe..964aba3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1319,9 +1319,12 @@ struct super_block {
 	 */
 	int s_stack_depth;
 
-	/* s_inode_list_lock protects s_inodes */
-	spinlock_t		s_inode_list_lock ____cacheline_aligned_in_smp;
-	struct list_head	s_inodes;	/* all inodes */
+	/*
+	 * the inode list is not strictly used as an LRU, but uses the list_lru
+	 * construct to provide a scalable list implementation for adding,
+	 * removing and walking the inodes cached in memory.
+	 */
+	struct list_lru		s_inode_list ____cacheline_aligned_in_smp;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ