[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1359495627-30285-7-git-send-email-sjenning@linux.vnet.ibm.com>
Date: Tue, 29 Jan 2013 15:40:26 -0600
From: Seth Jennings <sjenning@...ux.vnet.ibm.com>
To: Andrew Morton <akpm@...ux-foundation.org>
Cc: Seth Jennings <sjenning@...ux.vnet.ibm.com>,
Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
Nitin Gupta <ngupta@...are.org>,
Minchan Kim <minchan@...nel.org>,
Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>,
Dan Magenheimer <dan.magenheimer@...cle.com>,
Robert Jennings <rcj@...ux.vnet.ibm.com>,
Jenifer Hopper <jhopper@...ibm.com>,
Mel Gorman <mgorman@...e.de>,
Johannes Weiner <jweiner@...hat.com>,
Rik van Riel <riel@...hat.com>,
Larry Woodman <lwoodman@...hat.com>,
Benjamin Herrenschmidt <benh@...nel.crashing.org>,
Dave Hansen <dave@...ux.vnet.ibm.com>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, devel@...verdev.osuosl.org
Subject: [PATCHv4 6/7] zswap: add flushing support
This patch adds support for flushing pages out of the compressed
pool to the swap device.
Signed-off-by: Seth Jennings <sjenning@...ux.vnet.ibm.com>
---
mm/zswap.c | 451 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 434 insertions(+), 17 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index a6c2928..b8e5673 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -34,6 +34,12 @@
#include <linux/mempool.h>
#include <linux/zsmalloc.h>
+#include <linux/mm_types.h>
+#include <linux/page-flags.h>
+#include <linux/swapops.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+
/*********************************
* statistics
**********************************/
@@ -41,6 +47,8 @@
static atomic_t zswap_pool_pages = ATOMIC_INIT(0);
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+/* The number of outstanding pages awaiting writeback */
+static atomic_t zswap_outstanding_flushes = ATOMIC_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -49,9 +57,14 @@ static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
* certain event is occurring.
*/
static u64 zswap_pool_limit_hit;
+static u64 zswap_flushed_pages;
static u64 zswap_reject_compress_poor;
+static u64 zswap_flush_attempted;
+static u64 zswap_reject_tmppage_fail;
+static u64 zswap_reject_flush_fail;
static u64 zswap_reject_zsmalloc_fail;
static u64 zswap_reject_kmemcache_fail;
+static u64 zswap_saved_by_flush;
static u64 zswap_duplicate_entry;
/*********************************
@@ -80,6 +93,14 @@ static unsigned int zswap_max_compression_ratio = 80;
module_param_named(max_compression_ratio,
zswap_max_compression_ratio, uint, 0644);
+/*
+ * Maximum number of outstanding flushes allowed at any given time.
+ * This is to prevent decompressing an unbounded number of compressed
+ * pages into the swap cache all at once, and to help with writeback
+ * congestion.
+*/
+#define ZSWAP_MAX_OUTSTANDING_FLUSHES 64
+
/*********************************
* compression functions
**********************************/
@@ -145,14 +166,23 @@ static void zswap_comp_exit(void)
**********************************/
struct zswap_entry {
struct rb_node rbnode;
+ struct list_head lru; /* link in the owning zswap_tree's lru list */
+ int refcount; /* set to 1 at allocation; changed under the tree lock */
unsigned type;
pgoff_t offset;
unsigned long handle;
unsigned int length;
};
+/*
+ * The tree lock in the zswap_tree struct protects a few things:
+ * - the rbtree
+ * - the lru list
+ * - the refcount field of each entry in the tree
+ */
struct zswap_tree {
struct rb_root rbroot;
+ struct list_head lru; /* flush candidates; flushing takes from the head */
spinlock_t lock;
struct zs_pool *pool;
};
@@ -184,6 +214,8 @@ static inline struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
entry = kmem_cache_alloc(zswap_entry_cache, gfp);
if (!entry)
return NULL;
+ INIT_LIST_HEAD(&entry->lru);
+ entry->refcount = 1;
return entry;
}
@@ -192,6 +224,17 @@ static inline void zswap_entry_cache_free(struct zswap_entry *entry)
kmem_cache_free(zswap_entry_cache, entry);
}
+/* Take one more reference on @entry; caller must hold the tree lock. */
+static inline void zswap_entry_get(struct zswap_entry *entry)
+{
+	entry->refcount += 1;
+}
+
+/*
+ * Drop one reference on @entry and return the number of references left.
+ * Caller must hold the tree lock and is responsible for freeing the entry
+ * when the returned count is zero.
+ */
+static inline int zswap_entry_put(struct zswap_entry *entry)
+{
+	return --entry->refcount;
+}
+
/*********************************
* rbtree functions
**********************************/
@@ -367,6 +410,278 @@ static struct zs_ops zswap_zs_ops = {
};
/*********************************
+* flush code
+**********************************/
+/*
+ * Write completion callback handed to __swap_writepage() by
+ * zswap_flush_entry(). Runs the regular swap write completion, then
+ * retires one outstanding flush and bumps the flushed-pages statistic.
+ */
+static void zswap_end_swap_write(struct bio *bio, int err)
+{
+ end_swap_bio_write(bio, err);
+ atomic_dec(&zswap_outstanding_flushes);
+ zswap_flushed_pages++;
+}
+
+/*
+ * zswap_get_swap_cache_page
+ *
+ * This is an adaptation of read_swap_cache_async()
+ *
+ * If success, page is returned in retpage
+ * Returns 0 if page was already in the swap cache, page is not locked
+ * Returns 1 if the new page needs to be populated, page is locked
+ * Returns -ENOMEM on every failure (page allocation, radix tree preload,
+ * a swapcache_prepare() error other than -EEXIST, or -ENOMEM from
+ * __add_to_swap_cache()); *retpage is left NULL in that case
+ */
+static int zswap_get_swap_cache_page(swp_entry_t entry,
+ struct page **retpage)
+{
+ struct page *found_page, *new_page = NULL;
+ int err;
+
+ *retpage = NULL;
+ do {
+ /*
+ * First check the swap cache. Since this is normally
+ * called after lookup_swap_cache() failed, re-calling
+ * that would confuse statistics.
+ */
+ found_page = find_get_page(&swapper_space, entry.val);
+ if (found_page)
+ break;
+
+ /*
+ * Get a new page to read into from swap.
+ * The allocation is kept across loop iterations and reused.
+ */
+ if (!new_page) {
+ new_page = alloc_page(GFP_KERNEL);
+ if (!new_page)
+ break; /* Out of memory */
+ }
+
+ /*
+ * call radix_tree_preload() while we can wait.
+ */
+ err = radix_tree_preload(GFP_KERNEL);
+ if (err)
+ break;
+
+ /*
+ * Swap entry may have been freed since our caller observed it.
+ */
+ err = swapcache_prepare(entry);
+ if (err == -EEXIST) { /* seems racy */
+ radix_tree_preload_end();
+ continue;
+ }
+ if (err) { /* swp entry is obsolete ? */
+ radix_tree_preload_end();
+ break;
+ }
+
+ /* May fail (-ENOMEM) if radix-tree node allocation failed. */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+ err = __add_to_swap_cache(new_page, entry);
+ if (likely(!err)) {
+ radix_tree_preload_end();
+ lru_cache_add_anon(new_page);
+ *retpage = new_page;
+ return 1;
+ }
+ /* insertion failed: undo the page state set up above */
+ radix_tree_preload_end();
+ ClearPageSwapBacked(new_page);
+ __clear_page_locked(new_page);
+ /*
+ * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+ * clear SWAP_HAS_CACHE flag.
+ */
+ swapcache_free(entry, NULL);
+ /* retry unless the insertion failed for lack of memory */
+ } while (err != -ENOMEM);
+
+ /* the page we allocated, if any, was not used: give it back */
+ if (new_page)
+ page_cache_release(new_page);
+ if (!found_page)
+ return -ENOMEM;
+ /* found_page comes straight from find_get_page(); caller now owns it */
+ *retpage = found_page;
+ return 0;
+}
+
+/*
+ * zswap_flush_entry - write one compressed entry back to the swap device
+ * @entry: entry to flush; zswap_flush_entries() has already taken it off
+ *	the lru and holds a reference obtained with zswap_entry_get()
+ *
+ * Gets (or allocates) the swap cache page for the entry's swap slot. If a
+ * new page was added to the swap cache, the entry is decompressed into it,
+ * writeback is started and the entry is unlinked and freed unless another
+ * reference holder remains. If the page was already in the swap cache,
+ * nothing is written; only the page and entry references are dropped.
+ *
+ * Returns 0 in both of those cases. Returns the negative value from
+ * zswap_get_swap_cache_page() on failure, in which case the reference on
+ * @entry has NOT been dropped.
+ */
+static int zswap_flush_entry(struct zswap_entry *entry)
+{
+	unsigned long type = entry->type;
+	struct zswap_tree *tree = zswap_trees[type];
+	struct page *page;
+	swp_entry_t swpentry;
+	u8 *src, *dst;
+	unsigned int dlen;
+	int ret, refcount;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+	};
+
+	/* get/allocate page in the swap cache */
+	swpentry = swp_entry(type, entry->offset);
+	ret = zswap_get_swap_cache_page(swpentry, &page);
+	if (ret < 0)
+		return ret;
+	else if (ret) {
+		/* decompress */
+		dlen = PAGE_SIZE;
+		src = zs_map_object(tree->pool, entry->handle, ZS_MM_RO);
+		dst = kmap_atomic(page);
+		ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
+				dst, &dlen);
+		kunmap_atomic(dst);
+		zs_unmap_object(tree->pool, entry->handle);
+		BUG_ON(ret);
+		BUG_ON(dlen != PAGE_SIZE);
+		SetPageUptodate(page);
+	} else {
+		/*
+		 * Page is already in the swap cache, ignore for now. Drop
+		 * the page reference taken by find_get_page() in
+		 * zswap_get_swap_cache_page(); nothing else releases it.
+		 */
+		page_cache_release(page);
+
+		spin_lock(&tree->lock);
+		refcount = zswap_entry_put(entry);
+		spin_unlock(&tree->lock);
+
+		if (likely(refcount))
+			return 0;
+
+		/* if the refcount is zero, invalidate must have come in */
+		/* free */
+		zs_free(tree->pool, entry->handle);
+		zswap_entry_cache_free(entry);
+		atomic_dec(&zswap_stored_pages);
+
+		return 0;
+	}
+
+	/* start writeback */
+	SetPageReclaim(page);
+	/*
+	 * Account for the flush before submitting the write:
+	 * zswap_end_swap_write() decrements the counter on completion, so
+	 * the increment must not be left until after submission.
+	 */
+	atomic_inc(&zswap_outstanding_flushes);
+	/*
+	 * Return value is ignored here because it doesn't change anything
+	 * for us. Page is returned unlocked.
+	 */
+	(void)__swap_writepage(page, &wbc, zswap_end_swap_write);
+	page_cache_release(page);
+
+	/* remove */
+	spin_lock(&tree->lock);
+	refcount = zswap_entry_put(entry);
+	if (refcount > 1) {
+		/* load in progress, load will free */
+		spin_unlock(&tree->lock);
+		return 0;
+	}
+	if (refcount == 1)
+		/* no invalidate yet, remove from rbtree */
+		rb_erase(&entry->rbnode, &tree->rbroot);
+	spin_unlock(&tree->lock);
+
+	/* free */
+	zs_free(tree->pool, entry->handle);
+	zswap_entry_cache_free(entry);
+	atomic_dec(&zswap_stored_pages);
+
+	return 0;
+}
+
+/*
+ * zswap_flush_entries - try to flush up to @nr entries from the lru head
+ * @type: swap type whose tree is flushed
+ * @nr: maximum number of entries to attempt
+ *
+ * Does nothing when more than ZSWAP_MAX_OUTSTANDING_FLUSHES writebacks are
+ * already outstanding. Stops early when the lru runs empty or when that
+ * limit is exceeded after a successful flush.
+ */
+static void zswap_flush_entries(unsigned type, int nr)
+{
+	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_entry *entry;
+	int i, ret, refcount;
+
+	/*
+	 * This limit is arbitrary for now until a better
+	 * policy can be implemented. This is so we don't
+	 * eat all of RAM decompressing pages for writeback.
+	 */
+	if (atomic_read(&zswap_outstanding_flushes) >
+		ZSWAP_MAX_OUTSTANDING_FLUSHES)
+		return;
+
+	for (i = 0; i < nr; i++) {
+		/* dequeue from lru */
+		spin_lock(&tree->lock);
+		if (list_empty(&tree->lru)) {
+			spin_unlock(&tree->lock);
+			break;
+		}
+		entry = list_first_entry(&tree->lru,
+				struct zswap_entry, lru);
+		list_del(&entry->lru);
+		zswap_entry_get(entry);
+		spin_unlock(&tree->lock);
+
+		ret = zswap_flush_entry(entry);
+		if (ret) {
+			/*
+			 * zswap_flush_entry() fails before dropping the
+			 * reference taken above, so drop it here. Otherwise
+			 * every failed attempt leaves the count one higher
+			 * and the entry can never be freed.
+			 */
+			spin_lock(&tree->lock);
+			refcount = zswap_entry_put(entry);
+			if (refcount)
+				/* put back on the lru */
+				list_add(&entry->lru, &tree->lru);
+			spin_unlock(&tree->lock);
+
+			if (!refcount) {
+				/*
+				 * If the refcount is zero, invalidate must
+				 * have come in while we held the entry; free.
+				 */
+				zs_free(tree->pool, entry->handle);
+				zswap_entry_cache_free(entry);
+				atomic_dec(&zswap_stored_pages);
+			}
+		} else {
+			if (atomic_read(&zswap_outstanding_flushes) >
+				ZSWAP_MAX_OUTSTANDING_FLUSHES)
+				break;
+		}
+	}
+}
+
+/*******************************************
+* page pool for temporary compression result
+********************************************/
+/* number of allocations (each made with order 1) kept in the pool */
+#define ZSWAP_TMPPAGE_POOL_PAGES 16
+/* free pool entries, linked through page->lru */
+static LIST_HEAD(zswap_tmppage_list);
+/* protects zswap_tmppage_list */
+static DEFINE_SPINLOCK(zswap_tmppage_lock);
+
+/* Unlink and free every page still sitting in the temporary page pool. */
+static void zswap_tmppage_pool_destroy(void)
+{
+	struct page *victim;
+
+	spin_lock(&zswap_tmppage_lock);
+	while (!list_empty(&zswap_tmppage_list)) {
+		victim = list_first_entry(&zswap_tmppage_list,
+				struct page, lru);
+		list_del(&victim->lru);
+		/* order 1 matches the allocation in the pool create path */
+		__free_pages(victim, 1);
+	}
+	spin_unlock(&zswap_tmppage_lock);
+}
+
+/*
+ * Fill the temporary page pool with ZSWAP_TMPPAGE_POOL_PAGES order-1
+ * allocations. Returns 0 on success. On allocation failure, whatever was
+ * already added is freed again and -ENOMEM is returned.
+ */
+static int zswap_tmppage_pool_create(void)
+{
+	struct page *fresh;
+	int filled = 0;
+
+	while (filled < ZSWAP_TMPPAGE_POOL_PAGES) {
+		fresh = alloc_pages(GFP_KERNEL, 1);
+		if (!fresh)
+			goto nomem;
+
+		spin_lock(&zswap_tmppage_lock);
+		list_add(&fresh->lru, &zswap_tmppage_list);
+		spin_unlock(&zswap_tmppage_lock);
+		filled++;
+	}
+	return 0;
+
+nomem:
+	zswap_tmppage_pool_destroy();
+	return -ENOMEM;
+}
+
+/*
+ * Take one page out of the temporary page pool.
+ * Returns NULL when the pool is currently empty; never allocates.
+ */
+static inline struct page *zswap_tmppage_alloc(void)
+{
+	struct page *taken = NULL;
+
+	spin_lock(&zswap_tmppage_lock);
+	if (!list_empty(&zswap_tmppage_list)) {
+		taken = list_first_entry(&zswap_tmppage_list,
+				struct page, lru);
+		list_del(&taken->lru);
+	}
+	spin_unlock(&zswap_tmppage_lock);
+
+	return taken;
+}
+
+/*
+ * Return @page to the temporary page pool. The page is not freed; it is
+ * put back at the head of the pool list for reuse by zswap_tmppage_alloc().
+ */
+static inline void zswap_tmppage_free(struct page *page)
+{
+ spin_lock(&zswap_tmppage_lock);
+ list_add(&page->lru, &zswap_tmppage_list);
+ spin_unlock(&zswap_tmppage_lock);
+}
+
+/*********************************
* frontswap hooks
**********************************/
/* attempts to compress and store an single page */
@@ -378,7 +693,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct page *pag
unsigned int dlen = PAGE_SIZE;
unsigned long handle;
char *buf;
- u8 *src, *dst;
+ u8 *src, *dst, *tmpdst;
+ struct page *tmppage;
+ bool flush_attempted = 0;
if (!tree) {
ret = -ENODEV;
@@ -392,12 +709,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct page *pag
kunmap_atomic(src);
if (ret) {
ret = -EINVAL;
- goto putcpu;
+ goto freepage;
}
if ((dlen * 100 / PAGE_SIZE) > zswap_max_compression_ratio) {
zswap_reject_compress_poor++;
ret = -E2BIG;
- goto putcpu;
+ goto freepage;
}
/* store */
@@ -405,15 +722,46 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct page *pag
__GFP_NORETRY | __GFP_HIGHMEM | __GFP_NOMEMALLOC |
__GFP_NOWARN);
if (!handle) {
- zswap_reject_zsmalloc_fail++;
- ret = -ENOMEM;
- goto putcpu;
+ zswap_flush_attempted++;
+ /*
+ * Copy compressed buffer out of per-cpu storage so
+ * we can re-enable preemption.
+ */
+ tmppage = zswap_tmppage_alloc();
+ if (!tmppage) {
+ zswap_reject_tmppage_fail++;
+ ret = -ENOMEM;
+ goto freepage;
+ }
+ flush_attempted = 1;
+ tmpdst = page_address(tmppage);
+ memcpy(tmpdst, dst, dlen);
+ dst = tmpdst;
+ put_cpu_var(zswap_dstmem);
+
+ /* try to free up some space */
+ /* TODO: replace with more targeted policy */
+ zswap_flush_entries(type, 16);
+ /* try again, allowing wait */
+ handle = zs_malloc(tree->pool, dlen,
+ __GFP_NORETRY | __GFP_HIGHMEM | __GFP_NOMEMALLOC |
+ __GFP_NOWARN);
+ if (!handle) {
+ /* still no space, fail */
+ zswap_reject_zsmalloc_fail++;
+ ret = -ENOMEM;
+ goto freepage;
+ }
+ zswap_saved_by_flush++;
}
buf = zs_map_object(tree->pool, handle, ZS_MM_WO);
memcpy(buf, dst, dlen);
zs_unmap_object(tree->pool, handle);
- put_cpu_var(zswap_dstmem);
+ if (flush_attempted)
+ zswap_tmppage_free(tmppage);
+ else
+ put_cpu_var(zswap_dstmem);
/* allocate entry */
entry = zswap_entry_cache_alloc(GFP_KERNEL);
@@ -436,16 +784,19 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct page *pag
ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
if (ret == -EEXIST) {
zswap_duplicate_entry++;
-
- /* remove from rbtree */
+ /* remove from rbtree and lru */
rb_erase(&dupentry->rbnode, &tree->rbroot);
-
- /* free */
- zs_free(tree->pool, dupentry->handle);
- zswap_entry_cache_free(dupentry);
- atomic_dec(&zswap_stored_pages);
+ if (dupentry->lru.next != LIST_POISON1)
+ list_del(&dupentry->lru);
+ if (!zswap_entry_put(dupentry)) {
+ /* free */
+ zs_free(tree->pool, dupentry->handle);
+ zswap_entry_cache_free(dupentry);
+ atomic_dec(&zswap_stored_pages);
+ }
}
} while (ret == -EEXIST);
+ list_add_tail(&entry->lru, &tree->lru);
spin_unlock(&tree->lock);
/* update stats */
@@ -453,8 +804,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct page *pag
return 0;
-putcpu:
- put_cpu_var(zswap_dstmem);
+freepage:
+ if (flush_attempted)
+ zswap_tmppage_free(tmppage);
+ else
+ put_cpu_var(zswap_dstmem);
reject:
return ret;
}
@@ -469,10 +823,21 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct page *page
struct zswap_entry *entry;
u8 *src, *dst;
unsigned int dlen;
+ int refcount;
/* find */
spin_lock(&tree->lock);
entry = zswap_rb_search(&tree->rbroot, offset);
+ if (!entry) {
+ /* entry was flushed */
+ spin_unlock(&tree->lock);
+ return -1;
+ }
+ zswap_entry_get(entry);
+
+ /* remove from lru */
+ if (entry->lru.next != LIST_POISON1)
+ list_del(&entry->lru);
spin_unlock(&tree->lock);
/* decompress */
@@ -484,6 +849,25 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct page *page
kunmap_atomic(dst);
zs_unmap_object(tree->pool, entry->handle);
+ spin_lock(&tree->lock);
+ refcount = zswap_entry_put(entry);
+ if (likely(refcount)) {
+ list_add_tail(&entry->lru, &tree->lru);
+ spin_unlock(&tree->lock);
+ return 0;
+ }
+ spin_unlock(&tree->lock);
+
+ /*
+ * We don't have to unlink from the rbtree because zswap_flush_entry()
+ * or zswap_frontswap_invalidate_page() has already done this for us if
+ * we are the last reference.
+ */
+ /* free */
+ zs_free(tree->pool, entry->handle);
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+
return 0;
}
@@ -492,14 +876,27 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
+ int refcount;
/* find */
spin_lock(&tree->lock);
entry = zswap_rb_search(&tree->rbroot, offset);
+ if (!entry) {
+ /* entry was flushed */
+ spin_unlock(&tree->lock);
+ return;
+ }
- /* remove from rbtree */
+ /* remove from rbtree and lru */
rb_erase(&entry->rbnode, &tree->rbroot);
+ if (entry->lru.next != LIST_POISON1)
+ list_del(&entry->lru);
+ refcount = zswap_entry_put(entry);
spin_unlock(&tree->lock);
+ if (refcount) {
+ /* must be flushing */
+ return;
+ }
/* free */
zs_free(tree->pool, entry->handle);
@@ -528,6 +925,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
node = next;
}
tree->rbroot = RB_ROOT;
+ INIT_LIST_HEAD(&tree->lru);
spin_unlock(&tree->lock);
}
@@ -543,6 +941,7 @@ static void zswap_frontswap_init(unsigned type)
if (!tree->pool)
goto freetree;
tree->rbroot = RB_ROOT;
+ INIT_LIST_HEAD(&tree->lru);
spin_lock_init(&tree->lock);
zswap_trees[type] = tree;
return;
@@ -578,20 +977,32 @@ static int __init zswap_debugfs_init(void)
if (!zswap_debugfs_root)
return -ENOMEM;
+ debugfs_create_u64("saved_by_flush", S_IRUGO,
+ zswap_debugfs_root, &zswap_saved_by_flush);
debugfs_create_u64("pool_limit_hit", S_IRUGO,
zswap_debugfs_root, &zswap_pool_limit_hit);
+ debugfs_create_u64("reject_flush_attempted", S_IRUGO,
+ zswap_debugfs_root, &zswap_flush_attempted);
+ debugfs_create_u64("reject_tmppage_fail", S_IRUGO,
+ zswap_debugfs_root, &zswap_reject_tmppage_fail);
+ debugfs_create_u64("reject_flush_fail", S_IRUGO,
+ zswap_debugfs_root, &zswap_reject_flush_fail);
debugfs_create_u64("reject_zsmalloc_fail", S_IRUGO,
zswap_debugfs_root, &zswap_reject_zsmalloc_fail);
debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
zswap_debugfs_root, &zswap_reject_kmemcache_fail);
debugfs_create_u64("reject_compress_poor", S_IRUGO,
zswap_debugfs_root, &zswap_reject_compress_poor);
+ debugfs_create_u64("flushed_pages", S_IRUGO,
+ zswap_debugfs_root, &zswap_flushed_pages);
debugfs_create_u64("duplicate_entry", S_IRUGO,
zswap_debugfs_root, &zswap_duplicate_entry);
debugfs_create_atomic_t("pool_pages", S_IRUGO,
zswap_debugfs_root, &zswap_pool_pages);
debugfs_create_atomic_t("stored_pages", S_IRUGO,
zswap_debugfs_root, &zswap_stored_pages);
+ debugfs_create_atomic_t("outstanding_flushes", S_IRUGO,
+ zswap_debugfs_root, &zswap_outstanding_flushes);
return 0;
}
@@ -627,6 +1038,10 @@ static int __init init_zswap(void)
pr_err("zswap: page pool initialization failed\n");
goto pagepoolfail;
}
+ if (zswap_tmppage_pool_create()) {
+ pr_err("zswap: workmem pool initialization failed\n");
+ goto tmppoolfail;
+ }
if (zswap_comp_init()) {
pr_err("zswap: compressor initialization failed\n");
goto compfail;
@@ -642,6 +1057,8 @@ static int __init init_zswap(void)
pcpufail:
zswap_comp_exit();
compfail:
+ zswap_tmppage_pool_destroy();
+tmppoolfail:
zswap_page_pool_destroy();
pagepoolfail:
zswap_entry_cache_destory();
--
1.8.1.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists