Message-Id: <1279283870-18549-5-git-send-email-ngupta@vflare.org>
Date: Fri, 16 Jul 2010 18:07:46 +0530
From: Nitin Gupta <ngupta@...are.org>
To: Pekka Enberg <penberg@...helsinki.fi>,
Hugh Dickins <hugh.dickins@...cali.co.uk>,
Andrew Morton <akpm@...ux-foundation.org>,
Greg KH <greg@...ah.com>,
Dan Magenheimer <dan.magenheimer@...cle.com>,
Rik van Riel <riel@...hat.com>, Avi Kivity <avi@...hat.com>,
Christoph Hellwig <hch@...radead.org>,
Minchan Kim <minchan.kim@...il.com>,
Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
Cc: linux-mm <linux-mm@...ck.org>,
linux-kernel <linux-kernel@...r.kernel.org>
Subject: [PATCH 4/8] Shrink zcache based on memlimit

Users can change the (per-pool) memlimit using the sysfs node:
/sys/kernel/mm/zcache/pool<id>/memlimit

When memlimit is set to a value smaller than the current
number of pages allocated for that pool, the excess pages
are now freed immediately instead of waiting for a get/flush
on those pages.
Currently, victim page selection is essentially random.
Automatic cache resizing and better page replacement
policies will be implemented later.
Signed-off-by: Nitin Gupta <ngupta@...are.org>
---
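Usage sketch (not part of the patch itself): a small user-space
program could lower the limit by writing to the sysfs node shown
above. The pool id (0 below) and the plain byte-count value format
are assumptions made for illustration; the byte interpretation is
only suggested by the memlimit >> PAGE_SHIFT conversion in the code.

  /*
   * Hedged sketch: set pool 0's memlimit to 64 MiB. Pool id and
   * value format are illustrative assumptions, not guaranteed by
   * this patch alone.
   */
  #include <stdio.h>
  #include <stdlib.h>

  int main(void)
  {
          const char *node = "/sys/kernel/mm/zcache/pool0/memlimit";
          FILE *f = fopen(node, "w");

          if (!f) {
                  perror("fopen memlimit");
                  return EXIT_FAILURE;
          }
          /* Excess pages in the pool are freed synchronously on this write. */
          fprintf(f, "%llu\n", 64ULL << 20);
          fclose(f);
          return EXIT_SUCCESS;
  }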
drivers/staging/zram/zcache_drv.c | 115 ++++++++++++++++++++++++++++++++++---
1 files changed, 106 insertions(+), 9 deletions(-)
diff --git a/drivers/staging/zram/zcache_drv.c b/drivers/staging/zram/zcache_drv.c
index f680f19..c5de65d 100644
--- a/drivers/staging/zram/zcache_drv.c
+++ b/drivers/staging/zram/zcache_drv.c
@@ -41,6 +41,7 @@
#include <linux/kernel.h>
#include <linux/cleancache.h>
#include <linux/highmem.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/u64_stats_sync.h>
@@ -416,7 +417,8 @@ out:
* Called under zcache_inode_rb->tree_lock
*/
#define FREE_BATCH 16
-static void zcache_free_inode_pages(struct zcache_inode_rb *znode)
+static void zcache_free_inode_pages(struct zcache_inode_rb *znode,
+ u32 pages_to_free)
{
int count;
unsigned long index = 0;
@@ -428,6 +430,8 @@ static void zcache_free_inode_pages(struct zcache_inode_rb *znode)
count = radix_tree_gang_lookup(&znode->page_tree,
(void **)pages, index, FREE_BATCH);
+ if (count > pages_to_free)
+ count = pages_to_free;
for (i = 0; i < count; i++) {
index = pages[i]->index;
@@ -437,7 +441,98 @@ static void zcache_free_inode_pages(struct zcache_inode_rb *znode)
}
index++;
- } while (count == FREE_BATCH);
+ pages_to_free -= count;
+ } while (pages_to_free && (count == FREE_BATCH));
+}
+
+/*
+ * Returns the number of pages stored in excess of the
+ * currently set memlimit for the given pool.
+ */
+static u32 zcache_count_excess_pages(struct zcache_pool *zpool)
+{
+ u32 excess_pages, memlimit_pages, pages_stored;
+
+ memlimit_pages = zcache_get_memlimit(zpool) >> PAGE_SHIFT;
+ pages_stored = zcache_get_stat(zpool, ZPOOL_STAT_PAGES_STORED);
+ excess_pages = pages_stored > memlimit_pages ?
+ pages_stored - memlimit_pages : 0;
+
+ return excess_pages;
+}
+
+/*
+ * Free pages from this pool until it comes within its memlimit.
+ *
+ * Currently, it is called only when the user sets memlimit lower than
+ * the number of pages currently stored in that pool. We select nodes in
+ * order of increasing inode number. This, in general, has no correlation
+ * with the order in which they were added, so node selection is
+ * essentially random. Pages within a victim node are freed in order
+ * of increasing index number.
+ *
+ * Automatic cache resizing and better page replacement policies will
+ * be implemented later.
+ */
+static void zcache_shrink_pool(struct zcache_pool *zpool)
+{
+ struct rb_node *node;
+ struct zcache_inode_rb *znode;
+
+ read_lock(&zpool->tree_lock);
+ node = rb_first(&zpool->inode_tree);
+ if (unlikely(!node)) {
+ read_unlock(&zpool->tree_lock);
+ return;
+ }
+ znode = rb_entry(node, struct zcache_inode_rb, rb_node);
+ kref_get(&znode->refcount);
+ read_unlock(&zpool->tree_lock);
+
+ do {
+ u32 pages_to_free;
+ struct rb_node *next_node;
+ struct zcache_inode_rb *next_znode;
+
+ pages_to_free = zcache_count_excess_pages(zpool);
+ if (!pages_to_free) {
+ spin_lock(&znode->tree_lock);
+ if (zcache_inode_is_empty(znode))
+ zcache_inode_isolate(znode);
+ spin_unlock(&znode->tree_lock);
+
+ kref_put(&znode->refcount, zcache_inode_release);
+ break;
+ }
+
+ /*
+ * Get the next victim node before we (possibly) isolate
+ * the current node.
+ */
+ read_lock(&zpool->tree_lock);
+ next_node = rb_next(node);
+ next_znode = NULL;
+ if (next_node) {
+ next_znode = rb_entry(next_node,
+ struct zcache_inode_rb, rb_node);
+ kref_get(&next_znode->refcount);
+ }
+ read_unlock(&zpool->tree_lock);
+
+ spin_lock(&znode->tree_lock);
+ zcache_free_inode_pages(znode, pages_to_free);
+ if (zcache_inode_is_empty(znode))
+ zcache_inode_isolate(znode);
+ spin_unlock(&znode->tree_lock);
+
+ kref_put(&znode->refcount, zcache_inode_release);
+
+ /* Avoid busy-looping */
+ cond_resched();
+
+ node = next_node;
+ znode = next_znode;
+ } while (znode);
}
#ifdef CONFIG_SYSFS
@@ -476,10 +571,13 @@ static void memlimit_sysfs_common(struct kobject *kobj, u64 *value, int store)
{
struct zcache_pool *zpool = zcache_kobj_to_pool(kobj);
- if (store)
+ if (store) {
zcache_set_memlimit(zpool, *value);
- else
+ if (zcache_count_excess_pages(zpool))
+ zcache_shrink_pool(zpool);
+ } else {
*value = zcache_get_memlimit(zpool);
+ }
}
static ssize_t memlimit_store(struct kobject *kobj,
@@ -687,9 +785,8 @@ static void zcache_put_page(int pool_id, ino_t inode_no,
/*
* memlimit can be changed any time by user using sysfs. If
* it is set to a value smaller than current number of pages
- * stored, then excess pages are not freed immediately but
- * further puts are blocked till sufficient number of pages
- * are flushed/freed.
+ * stored, then excess pages are freed synchronously when this
+ * sysfs event occurs.
*/
if (zcache_get_stat(zpool, ZPOOL_STAT_PAGES_STORED) >
zcache_get_memlimit(zpool) >> PAGE_SHIFT) {
@@ -781,7 +878,7 @@ static void zcache_flush_inode(int pool_id, ino_t inode_no)
return;
spin_lock_irqsave(&znode->tree_lock, flags);
- zcache_free_inode_pages(znode);
+ zcache_free_inode_pages(znode, UINT_MAX);
if (zcache_inode_is_empty(znode))
zcache_inode_isolate(znode);
spin_unlock_irqrestore(&znode->tree_lock, flags);
@@ -815,7 +912,7 @@ static void zcache_flush_fs(int pool_id)
while (node) {
znode = rb_entry(node, struct zcache_inode_rb, rb_node);
node = rb_next(node);
- zcache_free_inode_pages(znode);
+ zcache_free_inode_pages(znode, UINT_MAX);
rb_erase(&znode->rb_node, &zpool->inode_tree);
kfree(znode);
}
--
1.7.1.1