linux-kernel - [PATCH 3/4] virtual block device driver (ramzswap)

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1253227412-24342-4-git-send-email-ngupta@vflare.org>
Date:	Fri, 18 Sep 2009 04:13:31 +0530
From:	Nitin Gupta <ngupta@...are.org>
To:	Greg KH <greg@...ah.com>
Cc:	Andrew Morton <akpm@...ux-foundation.org>,
	Pekka Enberg <penberg@...helsinki.fi>,
	Ed Tomlinson <edt@....ca>,
	linux-kernel <linux-kernel@...r.kernel.org>,
	linux-mm <linux-mm@...ck.org>,
	linux-mm-cc <linux-mm-cc@...top.org>
Subject: [PATCH 3/4] virtual block device driver (ramzswap)

Creates RAM based block devices (/dev/ramzswapX) which can be
used (only) as swap disks. Pages swapped to these are compressed
and stored in memory itself.

The module is called ramzswap.ko. It depends on:
 - xvmalloc memory allocator (compiled with this driver)
 - lzo_compress.ko
 - lzo_decompress.ko

See ramzswap.txt for usage details.

Signed-off-by: Nitin Gupta <ngupta@...are.org>

---
 drivers/staging/Kconfig                   |    2 +
 drivers/staging/Makefile                  |    1 +
 drivers/staging/ramzswap/Kconfig          |   21 +
 drivers/staging/ramzswap/Makefile         |    3 +
 drivers/staging/ramzswap/ramzswap_drv.c   | 1462 +++++++++++++++++++++++++++++
 drivers/staging/ramzswap/ramzswap_drv.h   |  173 ++++
 drivers/staging/ramzswap/ramzswap_ioctl.h |   50 +
 7 files changed, 1712 insertions(+), 0 deletions(-)

diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 10d3fcf..5cc55c4 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -131,5 +131,7 @@ source "drivers/staging/iio/Kconfig"

 source "drivers/staging/cowloop/Kconfig"

+source "drivers/staging/ramzswap/Kconfig"
+
 endif # !STAGING_EXCLUDE_BUILD
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index c30093b..dca5027 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -47,3 +47,4 @@ obj-$(CONFIG_RAR_REGISTER)	+= rar/
 obj-$(CONFIG_DX_SEP)		+= sep/
 obj-$(CONFIG_IIO)		+= iio/
 obj-$(CONFIG_COWLOOP)		+= cowloop/
+obj-$(CONFIG_RAMZSWAP)		+= ramzswap/
diff --git a/drivers/staging/ramzswap/Kconfig b/drivers/staging/ramzswap/Kconfig
new file mode 100644
index 0000000..24e2569
--- /dev/null
+++ b/drivers/staging/ramzswap/Kconfig
@@ -0,0 +1,21 @@
+config RAMZSWAP
+	tristate "Compressed in-memory swap device (ramzswap)"
+	depends on SWAP
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	default n
+	help
+	  Creates virtual block devices which can be used (only) as a swap
+	  disks. Pages swapped to these disks are compressed and stored in
+	  memory itself.
+
+	  See ramzswap.txt for more information.
+	  Project home: http://compcache.googlecode.com/
+
+config RAMZSWAP_STATS
+	bool "Enable ramzswap stats"
+	depends on RAMZSWAP
+	default y
+	help
+	  Enable statistics collection for ramzswap. This adds only a minimal
+	  overhead. In unsure, say Y.
diff --git a/drivers/staging/ramzswap/Makefile b/drivers/staging/ramzswap/Makefile
new file mode 100644
index 0000000..507d7dc
--- /dev/null
+++ b/drivers/staging/ramzswap/Makefile
@@ -0,0 +1,3 @@
+ramzswap-objs	:=	ramzswap_drv.o xvmalloc.o
+
+obj-$(CONFIG_RAMZSWAP)	+=	ramzswap.o
diff --git a/drivers/staging/ramzswap/ramzswap_drv.c b/drivers/staging/ramzswap/ramzswap_drv.c
new file mode 100644
index 0000000..46c387a
--- /dev/null
+++ b/drivers/staging/ramzswap/ramzswap_drv.c
@@ -0,0 +1,1462 @@
+/*
+ * Compressed RAM based swap device
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ * Project home: http://compcache.googlecode.com
+ */
+
+#define KMSG_COMPONENT "ramzswap"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/lzo.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+
+#include "ramzswap_drv.h"
+
+/* Globals */
+static int ramzswap_major;
+static struct ramzswap *devices;
+
+/*
+ * Pages that compress to larger than this size are
+ * forwarded to backing swap, if present or stored
+ * uncompressed in memory otherwise.
+ */
+static unsigned int max_zpage_size;
+
+/* Module params (documentation at end) */
+static unsigned int num_devices;
+
+static int rzs_test_flag(struct ramzswap *rzs, u32 index,
+			enum rzs_pageflags flag)
+{
+	return rzs->table[index].flags & BIT(flag);
+}
+
+static void rzs_set_flag(struct ramzswap *rzs, u32 index,
+			enum rzs_pageflags flag)
+{
+	rzs->table[index].flags |= BIT(flag);
+}
+
+static void rzs_clear_flag(struct ramzswap *rzs, u32 index,
+			enum rzs_pageflags flag)
+{
+	rzs->table[index].flags &= ~BIT(flag);
+}
+
+static int page_zero_filled(void *ptr)
+{
+	u32 pos;
+	u64 *page;
+
+	page = (u64 *)ptr;
+
+	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
+		if (page[pos])
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * memlimit cannot be greater than backing disk size.
+ */
+static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
+{
+	int memlimit_valid = 1;
+
+	if (!rzs->memlimit) {
+		pr_info("Memory limit not set.\n");
+		memlimit_valid = 0;
+	}
+
+	if (rzs->memlimit > rzs->disksize) {
+		pr_info("Memory limit cannot be greater than "
+			"disksize: limit=%zu, disksize=%zu\n",
+			rzs->memlimit, rzs->disksize);
+		memlimit_valid = 0;
+	}
+
+	if (!memlimit_valid) {
+		size_t mempart, disksize;
+		pr_info("Using default: smaller of (%u%% of RAM) and "
+			"(backing disk size).\n",
+			default_memlimit_perc_ram);
+		mempart = default_memlimit_perc_ram * (totalram_bytes / 100);
+		disksize = rzs->disksize;
+		rzs->memlimit = mempart > disksize ? disksize : mempart;
+	}
+
+	if (rzs->memlimit > totalram_bytes / 2) {
+		pr_info(
+		"Its not advisable setting limit more than half of "
+		"size of memory since we expect a 2:1 compression ratio. "
+		"Limit represents amount of *compressed* data we can keep "
+		"in memory!\n"
+		"\tMemory Size: %zu kB\n"
+		"\tLimit you selected: %zu kB\n"
+		"Continuing anyway ...\n",
+		totalram_bytes >> 10, rzs->memlimit >> 10
+		);
+	}
+
+	rzs->memlimit &= PAGE_MASK;
+	BUG_ON(!rzs->memlimit);
+}
+
+static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
+{
+	if (!rzs->disksize) {
+		pr_info(
+		"disk size not provided. You can use disksize_kb module "
+		"param to specify size.\nUsing default: (%u%% of RAM).\n",
+		default_disksize_perc_ram
+		);
+		rzs->disksize = default_disksize_perc_ram *
+					(totalram_bytes / 100);
+	}
+
+	if (rzs->disksize > 2 * (totalram_bytes)) {
+		pr_info(
+		"There is little point creating a ramzswap of greater than "
+		"twice the size of memory since we expect a 2:1 compression "
+		"ratio. Note that ramzswap uses about 0.1%% of the size of "
+		"the swap device when not in use so a huge ramzswap is "
+		"wasteful.\n"
+		"\tMemory Size: %zu kB\n"
+		"\tSize you selected: %zu kB\n"
+		"Continuing anyway ...\n",
+		totalram_bytes >> 10, rzs->disksize
+		);
+	}
+
+	rzs->disksize &= PAGE_MASK;
+}
+
+/*
+ * Swap header (1st page of swap device) contains information
+ * to indentify it as a swap partition. Prepare such a header
+ * for ramzswap device (ramzswap0) so that swapon can identify
+ * it as swap partition. In case backing swap device is provided,
+ * copy its swap header.
+ */
+static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
+{
+	int ret = 0;
+	struct page *page;
+	struct address_space *mapping;
+	union swap_header *backing_swap_header;
+
+	/*
+	 * There is no backing swap device. Create a swap header
+	 * that is acceptable by swapon.
+	 */
+	if (!rzs->backing_swap) {
+		s->info.version = 1;
+		s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
+		s->info.nr_badpages = 0;
+		memcpy(s->magic.magic, "SWAPSPACE2", 10);
+		return 0;
+	}
+
+	/*
+	 * We have a backing swap device. Copy its swap header
+	 * to ramzswap device header. If this header contains
+	 * invalid information (backing device not a swap
+	 * partition, etc.), swapon will fail for ramzswap
+	 * which is correct behavior - we don't want to swap
+	 * over filesystem partition!
+	 */
+
+	/* Read the backing swap header (code from sys_swapon) */
+	mapping = rzs->swap_file->f_mapping;
+	if (!mapping->a_ops->readpage) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	page = read_mapping_page(mapping, 0, rzs->swap_file);
+	if (IS_ERR(page)) {
+		ret = PTR_ERR(page);
+		goto out;
+	}
+
+	backing_swap_header = kmap(page);
+	memcpy(s, backing_swap_header, sizeof(*s));
+	if (s->info.nr_badpages) {
+		pr_info("Cannot use backing swap with bad pages (%u)\n",
+			s->info.nr_badpages);
+		ret = -EINVAL;
+	}
+	/*
+	 * ramzswap disksize equals number of usable pages in backing
+	 * swap. Set last_page in swap header to match this disksize
+	 * ('last_page' means 0-based index of last usable swap page).
+	 */
+	s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
+	kunmap(page);
+
+out:
+	return ret;
+}
+
+static void ramzswap_flush_dcache_page(struct page *page)
+{
+#ifdef CONFIG_ARM
+	int flag = 0;
+	/*
+	 * Ugly hack to get flush_dcache_page() work on ARM.
+	 * page_mapping(page) == NULL after clearing this swap cache flag.
+	 * Without clearing this flag, flush_dcache_page() will simply set
+	 * "PG_dcache_dirty" bit and return.
+	 */
+	if (PageSwapCache(page)) {
+		flag = 1;
+		ClearPageSwapCache(page);
+	}
+#endif
+	flush_dcache_page(page);
+#ifdef CONFIG_ARM
+	if (flag)
+		SetPageSwapCache(page);
+#endif
+}
+
+void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
+			struct ramzswap_ioctl_stats *s)
+{
+	strncpy(s->backing_swap_name, rzs->backing_swap_name,
+		MAX_SWAP_NAME_LEN - 1);
+	s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
+
+	s->disksize = rzs->disksize;
+	s->memlimit = rzs->memlimit;
+
+#if defined(CONFIG_RAMZSWAP_STATS)
+	{
+	struct ramzswap_stats *rs = &rzs->stats;
+	size_t succ_writes, mem_used;
+	unsigned int good_compress_perc = 0, no_compress_perc = 0;
+
+	mem_used = xv_get_total_size_bytes(rzs->mem_pool)
+			+ (rs->pages_expand << PAGE_SHIFT);
+	succ_writes = rs->num_writes - rs->failed_writes;
+
+	if (succ_writes && rs->pages_stored) {
+		good_compress_perc = rs->good_compress * 100
+					/ rs->pages_stored;
+		no_compress_perc = rs->pages_expand * 100
+					/ rs->pages_stored;
+	}
+
+	s->num_reads = rs->num_reads;
+	s->num_writes = rs->num_writes;
+	s->failed_reads = rs->failed_reads;
+	s->failed_writes = rs->failed_writes;
+	s->invalid_io = rs->invalid_io;
+	s->notify_free = rs->notify_free;
+	s->pages_zero = rs->pages_zero;
+
+	s->good_compress_pct = good_compress_perc;
+	s->pages_expand_pct = no_compress_perc;
+
+	s->pages_stored = rs->pages_stored;
+	s->pages_used = mem_used >> PAGE_SHIFT;
+	s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
+	s->compr_data_size = rs->compr_size;
+	s->mem_used_total = mem_used;
+
+	s->bdev_num_reads = rs->bdev_num_reads;
+	s->bdev_num_writes = rs->bdev_num_writes;
+	}
+#endif /* CONFIG_RAMZSWAP_STATS */
+}
+
+static int add_backing_swap_extent(struct ramzswap *rzs,
+				pgoff_t phy_pagenum,
+				pgoff_t num_pages)
+{
+	unsigned int idx;
+	struct list_head *head;
+	struct page *curr_page, *new_page;
+	unsigned int extents_per_page = PAGE_SIZE /
+				sizeof(struct ramzswap_backing_extent);
+
+	idx = rzs->num_extents % extents_per_page;
+	if (!idx) {
+		new_page = alloc_page(__GFP_ZERO);
+		if (!new_page)
+			return -ENOMEM;
+
+		if (rzs->num_extents) {
+			curr_page = virt_to_page(rzs->curr_extent);
+			head = &curr_page->lru;
+		} else {
+			head = &rzs->backing_swap_extent_list;
+		}
+
+		list_add(&new_page->lru, head);
+		rzs->curr_extent = page_address(new_page);
+	}
+
+	rzs->curr_extent->phy_pagenum = phy_pagenum;
+	rzs->curr_extent->num_pages = num_pages;
+
+	pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, "
+		"pg_last=%lu, curr_ext=%p\n", idx, phy_pagenum, num_pages,
+		phy_pagenum + num_pages - 1, rzs->curr_extent);
+
+	if (idx != extents_per_page - 1)
+		rzs->curr_extent++;
+
+	return 0;
+}
+
+static int setup_backing_swap_extents(struct ramzswap *rzs,
+				struct inode *inode, unsigned long *num_pages)
+{
+	int ret = 0;
+	unsigned blkbits;
+	unsigned blocks_per_page;
+	pgoff_t contig_pages = 0, total_pages = 0;
+	pgoff_t pagenum = 0, prev_pagenum = 0;
+	sector_t probe_block = 0;
+	sector_t last_block;
+
+	blkbits = inode->i_blkbits;
+	blocks_per_page = PAGE_SIZE >> blkbits;
+
+	last_block = i_size_read(inode) >> blkbits;
+	while (probe_block + blocks_per_page <= last_block) {
+		unsigned block_in_page;
+		sector_t first_block;
+
+		first_block = bmap(inode, probe_block);
+		if (first_block == 0)
+			goto bad_bmap;
+
+		/* It must be PAGE_SIZE aligned on-disk */
+		if (first_block & (blocks_per_page - 1)) {
+			probe_block++;
+			goto probe_next;
+		}
+
+		/* All blocks within this page must be contiguous on disk */
+		for (block_in_page = 1; block_in_page < blocks_per_page;
+					block_in_page++) {
+			sector_t block;
+
+			block = bmap(inode, probe_block + block_in_page);
+			if (block == 0)
+				goto bad_bmap;
+			if (block != first_block + block_in_page) {
+				/* Discontiguity */
+				probe_block++;
+				goto probe_next;
+			}
+		}
+
+		/*
+		 * We found a PAGE_SIZE length, PAGE_SIZE aligned
+		 * run of blocks.
+		 */
+		pagenum = first_block >> (PAGE_SHIFT - blkbits);
+
+		if (total_pages && (pagenum != prev_pagenum + 1)) {
+			ret = add_backing_swap_extent(rzs, prev_pagenum -
+					(contig_pages - 1), contig_pages);
+			if (ret < 0)
+				goto out;
+			rzs->num_extents++;
+			contig_pages = 0;
+		}
+		total_pages++;
+		contig_pages++;
+		prev_pagenum = pagenum;
+		probe_block += blocks_per_page;
+
+probe_next:
+		continue;
+	}
+
+	if (contig_pages) {
+		pr_debug("adding last extent: pagenum=%lu, "
+			"contig_pages=%lu\n", pagenum, contig_pages);
+		ret = add_backing_swap_extent(rzs,
+			prev_pagenum - (contig_pages - 1), contig_pages);
+		if (ret < 0)
+			goto out;
+		rzs->num_extents++;
+	}
+	if (!rzs->num_extents) {
+		pr_err("No swap extents found!\n");
+		ret = -EINVAL;
+	}
+
+	if (!ret) {
+		*num_pages = total_pages;
+		pr_info("Found %lu extents containing %luk\n",
+			rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
+	}
+	goto out;
+
+bad_bmap:
+	pr_err("Backing swapfile has holes\n");
+	ret = -EINVAL;
+out:
+	while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
+		struct page *page;
+		struct list_head *entry = rzs->backing_swap_extent_list.next;
+		page = list_entry(entry, struct page, lru);
+		list_del(entry);
+		__free_page(page);
+	}
+	return ret;
+}
+
+static void map_backing_swap_extents(struct ramzswap *rzs)
+{
+	struct ramzswap_backing_extent *se;
+	struct page *table_page, *se_page;
+	unsigned long num_pages, num_table_pages, entry;
+	unsigned long se_idx, span;
+	unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
+	unsigned extents_per_page = PAGE_SIZE / sizeof(*se);
+
+	/* True for block device */
+	if (!rzs->num_extents)
+		return;
+
+	se_page = list_entry(rzs->backing_swap_extent_list.next,
+					struct page, lru);
+	se = page_address(se_page);
+	span = se->num_pages;
+	num_pages = rzs->disksize >> PAGE_SHIFT;
+	num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
+							PAGE_SIZE);
+
+	entry = 0;
+	se_idx = 0;
+	while (num_table_pages--) {
+		table_page = vmalloc_to_page(&rzs->table[entry]);
+		while (span <= entry) {
+			se_idx++;
+			if (se_idx == rzs->num_extents)
+				BUG();
+
+			if (!(se_idx % extents_per_page)) {
+				se_page = list_entry(se_page->lru.next,
+						struct page, lru);
+				se = page_address(se_page);
+			} else
+				se++;
+
+			span += se->num_pages;
+		}
+		table_page->mapping = (struct address_space *)se;
+		table_page->private = se->num_pages - (span - entry);
+		pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
+			entry, span, table_page->mapping, table_page->private);
+		entry += entries_per_page;
+	}
+}
+
+/*
+ * Check if value of backing_swap module param is sane.
+ * Claim this device and set ramzswap size equal to
+ * size of this block device.
+ */
+static int setup_backing_swap(struct ramzswap *rzs)
+{
+	int ret = 0;
+	size_t disksize;
+	unsigned long num_pages = 0;
+	struct inode *inode;
+	struct file *swap_file;
+	struct address_space *mapping;
+	struct block_device *bdev = NULL;
+
+	if (!rzs->backing_swap_name[0]) {
+		pr_debug("backing_swap param not given\n");
+		goto out;
+	}
+
+	pr_info("Using backing swap device: %s\n", rzs->backing_swap_name);
+
+	swap_file = filp_open(rzs->backing_swap_name,
+				O_RDWR | O_LARGEFILE, 0);
+	if (IS_ERR(swap_file)) {
+		pr_err("Error opening backing device: %s\n",
+			rzs->backing_swap_name);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mapping = swap_file->f_mapping;
+	inode = mapping->host;
+
+	if (S_ISBLK(inode->i_mode)) {
+		bdev = I_BDEV(inode);
+		ret = bd_claim(bdev, setup_backing_swap);
+		if (ret < 0) {
+			bdev = NULL;
+			goto bad_param;
+		}
+		disksize = i_size_read(inode);
+	} else if (S_ISREG(inode->i_mode)) {
+		bdev = inode->i_sb->s_bdev;
+		if (IS_SWAPFILE(inode)) {
+			ret = -EBUSY;
+			goto bad_param;
+		}
+		ret = setup_backing_swap_extents(rzs, inode, &num_pages);
+		if (ret < 0)
+			goto bad_param;
+		disksize = num_pages << PAGE_SHIFT;
+	} else {
+		goto bad_param;
+	}
+
+	rzs->swap_file = swap_file;
+	rzs->backing_swap = bdev;
+	rzs->disksize = disksize;
+	BUG_ON(!rzs->disksize);
+
+	return 0;
+
+bad_param:
+	if (bdev)
+		bd_release(bdev);
+	filp_close(swap_file, NULL);
+
+out:
+	rzs->backing_swap = NULL;
+	return ret;
+}
+
+/*
+ * Map logical page number 'pagenum' to physical page number
+ * on backing swap device. For block device, this is a nop.
+ */
+u32 map_backing_swap_page(struct ramzswap *rzs, u32 pagenum)
+{
+	u32 skip_pages, entries_per_page;
+	size_t delta, se_offset, skipped;
+	struct page *table_page, *se_page;
+	struct ramzswap_backing_extent *se;
+
+	if (!rzs->num_extents)
+		return pagenum;
+
+	entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
+
+	table_page = vmalloc_to_page(&rzs->table[pagenum]);
+	se = (struct ramzswap_backing_extent *)table_page->mapping;
+	se_page = virt_to_page(se);
+
+	skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
+	se_offset = table_page->private + skip_pages;
+
+	if (se_offset < se->num_pages)
+		return se->phy_pagenum + se_offset;
+
+	skipped = se->num_pages - table_page->private;
+	do {
+		struct ramzswap_backing_extent *se_base;
+		u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);
+
+		/* Get next swap extent */
+		se_base = (struct ramzswap_backing_extent *)
+						page_address(se_page);
+		if (se - se_base == se_entries_per_page - 1) {
+			se_page = list_entry(se_page->lru.next,
+						struct page, lru);
+			se = page_address(se_page);
+		} else {
+			se++;
+		}
+
+		skipped += se->num_pages;
+	} while (skipped < skip_pages);
+
+	delta = skipped - skip_pages;
+	se_offset = se->num_pages - delta;
+
+	return se->phy_pagenum + se_offset;
+}
+
+static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
+{
+	u32 clen;
+	void *obj;
+
+	struct page *page = rzs->table[index].page;
+	u32 offset = rzs->table[index].offset;
+
+	if (unlikely(!page)) {
+		if (rzs_test_flag(rzs, index, RZS_ZERO)) {
+			rzs_clear_flag(rzs, index, RZS_ZERO);
+			stat_dec(rzs->stats.pages_zero);
+		}
+		return;
+	}
+
+	if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
+		clen = PAGE_SIZE;
+		__free_page(page);
+		rzs_clear_flag(rzs, index, RZS_UNCOMPRESSED);
+		stat_dec(rzs->stats.pages_expand);
+		goto out;
+	}
+
+	obj = kmap_atomic(page, KM_USER0) + offset;
+	clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
+	kunmap_atomic(obj, KM_USER0);
+
+	xv_free(rzs->mem_pool, page, offset);
+	if (clen <= PAGE_SIZE / 2)
+		stat_dec(rzs->stats.good_compress);
+
+out:
+	rzs->stats.compr_size -= clen;
+	stat_dec(rzs->stats.pages_stored);
+
+	rzs->table[index].page = NULL;
+	rzs->table[index].offset = 0;
+}
+
+/*
+ * callback function called when swap_map[offset] == 0
+ * i.e page at this swap offset is no longer used
+ */
+static void ramzswap_free_notify(struct block_device *bdev,
+				unsigned long index)
+{
+	struct ramzswap *rzs = bdev->bd_disk->private_data;
+
+	ramzswap_free_page(rzs, index);
+	stat_inc(rzs->stats.notify_free);
+}
+
+static int handle_zero_page(struct bio *bio)
+{
+	void *user_mem;
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	user_mem = kmap_atomic(page, KM_USER0);
+	memset(user_mem, 0, PAGE_SIZE);
+	kunmap_atomic(user_mem, KM_USER0);
+
+	ramzswap_flush_dcache_page(page);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
+{
+	u32 index;
+	struct page *page;
+	unsigned char *user_mem, *cmem;
+
+	page = bio->bi_io_vec[0].bv_page;
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+
+	user_mem = kmap_atomic(page, KM_USER0);
+	cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
+			rzs->table[index].offset;
+
+	memcpy(user_mem, cmem, PAGE_SIZE);
+	kunmap_atomic(user_mem, KM_USER0);
+	kunmap_atomic(cmem, KM_USER1);
+
+	ramzswap_flush_dcache_page(page);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+
+/*
+ * Called when request page is not present in ramzswap.
+ * Its either in backing swap device (if present) or
+ * this is an attempt to read before any previous write
+ * to this location - this happens due to readahead when
+ * swap device is read from user-space (e.g. during swapon)
+ */
+static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
+{
+	/*
+	 * Always forward such requests to backing swap
+	 * device (if present)
+	 */
+	if (rzs->backing_swap) {
+		u32 pagenum;
+		stat_dec(rzs->stats.num_reads);
+		stat_inc(rzs->stats.bdev_num_reads);
+		bio->bi_bdev = rzs->backing_swap;
+
+		/*
+		 * In case backing swap is a file, find the right offset within
+		 * the file corresponding to logical position 'index'. For block
+		 * device, this is a nop.
+		 */
+		pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+		bio->bi_sector = map_backing_swap_page(rzs, pagenum)
+					<< SECTORS_PER_PAGE_SHIFT;
+		return 1;
+	}
+
+	/*
+	 * Its unlikely event in case backing dev is
+	 * not present
+	 */
+	pr_debug("Read before write on swap device: "
+		"sector=%lu, size=%u, offset=%u\n",
+		(ulong)(bio->bi_sector), bio->bi_size,
+		bio->bi_io_vec[0].bv_offset);
+
+	/* Do nothing. Just return success */
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
+{
+	int ret;
+	u32 index;
+	size_t clen;
+	struct page *page;
+	struct zobj_header *zheader;
+	unsigned char *user_mem, *cmem;
+
+	stat_inc(rzs->stats.num_reads);
+
+	page = bio->bi_io_vec[0].bv_page;
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+
+	if (unlikely(!rzs->init_notify_callback) && PageSwapCache(page)) {
+		set_swap_free_notify(bio->bi_bdev, ramzswap_free_notify);
+		rzs->init_notify_callback = 1;
+	}
+
+	if (rzs_test_flag(rzs, index, RZS_ZERO))
+		return handle_zero_page(bio);
+
+	/* Requested page is not present in compressed area */
+	if (!rzs->table[index].page)
+		return handle_ramzswap_fault(rzs, bio);
+
+	/* Page is stored uncompressed since its incompressible */
+	if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
+		return handle_uncompressed_page(rzs, bio);
+
+	user_mem = kmap_atomic(page, KM_USER0);
+	clen = PAGE_SIZE;
+
+	cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
+			rzs->table[index].offset;
+
+	ret = lzo1x_decompress_safe(
+		cmem + sizeof(*zheader),
+		xv_get_object_size(cmem) - sizeof(*zheader),
+		user_mem, &clen);
+
+	kunmap_atomic(user_mem, KM_USER0);
+	kunmap_atomic(cmem, KM_USER1);
+
+	/* should NEVER happen */
+	if (unlikely(ret != LZO_E_OK)) {
+		pr_err("Decompression failed! err=%d, page=%u\n",
+			ret, index);
+		stat_inc(rzs->stats.failed_reads);
+		goto out;
+	}
+
+	ramzswap_flush_dcache_page(page);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+
+out:
+	bio_io_error(bio);
+	return 0;
+}
+
+static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
+{
+	int ret, fwd_write_request = 0;
+	u32 offset, index;
+	size_t clen;
+	struct zobj_header *zheader;
+	struct page *page, *page_store;
+	unsigned char *user_mem, *cmem, *src;
+
+	stat_inc(rzs->stats.num_writes);
+
+	page = bio->bi_io_vec[0].bv_page;
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+
+	src = rzs->compress_buffer;
+
+	/*
+	 * System swaps to same sector again when the stored page
+	 * is no longer referenced by any process. So, its now safe
+	 * to free the memory that was allocated for this page.
+	 */
+	if (rzs->table[index].page)
+		ramzswap_free_page(rzs, index);
+
+	/*
+	 * No memory ia allocated for zero filled pages.
+	 * Simply clear zero page flag.
+	 */
+	if (rzs_test_flag(rzs, index, RZS_ZERO)) {
+		stat_dec(rzs->stats.pages_zero);
+		rzs_clear_flag(rzs, index, RZS_ZERO);
+	}
+
+	mutex_lock(&rzs->lock);
+
+	user_mem = kmap_atomic(page, KM_USER0);
+	if (page_zero_filled(user_mem)) {
+		kunmap_atomic(user_mem, KM_USER0);
+		mutex_unlock(&rzs->lock);
+		stat_inc(rzs->stats.pages_zero);
+		rzs_set_flag(rzs, index, RZS_ZERO);
+
+		set_bit(BIO_UPTODATE, &bio->bi_flags);
+		bio_endio(bio, 0);
+		return 0;
+	}
+
+	if (rzs->backing_swap &&
+		(rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
+		kunmap_atomic(user_mem, KM_USER0);
+		mutex_unlock(&rzs->lock);
+		fwd_write_request = 1;
+		goto out;
+	}
+
+	ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
+				rzs->compress_workmem);
+
+	kunmap_atomic(user_mem, KM_USER0);
+
+	if (unlikely(ret != LZO_E_OK)) {
+		mutex_unlock(&rzs->lock);
+		pr_err("Compression failed! err=%d\n", ret);
+		stat_inc(rzs->stats.failed_writes);
+		goto out;
+	}
+
+	/*
+	 * Page is incompressible. Forward it to backing swap
+	 * if present. Otherwise, store it as-is (uncompressed)
+	 * since we do not want to return too many swap write
+	 * errors which has side effect of hanging the system.
+	 */
+	if (unlikely(clen > max_zpage_size)) {
+		if (rzs->backing_swap) {
+			mutex_unlock(&rzs->lock);
+			fwd_write_request = 1;
+			goto out;
+		}
+
+		clen = PAGE_SIZE;
+		page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+		if (unlikely(!page_store)) {
+			mutex_unlock(&rzs->lock);
+			pr_info("Error allocating memory for incompressible "
+				"page: %u\n", index);
+			stat_inc(rzs->stats.failed_writes);
+			goto out;
+		}
+
+		offset = 0;
+		rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
+		stat_inc(rzs->stats.pages_expand);
+		rzs->table[index].page = page_store;
+		src = kmap_atomic(page, KM_USER0);
+		goto memstore;
+	}
+
+	if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
+			&rzs->table[index].page, &offset,
+			GFP_NOIO | __GFP_HIGHMEM)) {
+		mutex_unlock(&rzs->lock);
+		pr_info("Error allocating memory for compressed "
+			"page: %u, size=%zu\n", index, clen);
+		stat_inc(rzs->stats.failed_writes);
+		if (rzs->backing_swap)
+			fwd_write_request = 1;
+		goto out;
+	}
+
+memstore:
+	rzs->table[index].offset = offset;
+
+	cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
+			rzs->table[index].offset;
+
+#if 0
+	/* Back-reference needed for memory defragmentation */
+	if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
+		zheader = (struct zobj_header *)cmem;
+		zheader->table_idx = index;
+		cmem += sizeof(*zheader);
+	}
+#endif
+
+	memcpy(cmem, src, clen);
+
+	kunmap_atomic(cmem, KM_USER1);
+	if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
+		kunmap_atomic(src, KM_USER0);
+
+	/* Update stats */
+	rzs->stats.compr_size += clen;
+	stat_inc(rzs->stats.pages_stored);
+	if (clen <= PAGE_SIZE / 2)
+		stat_inc(rzs->stats.good_compress);
+
+	mutex_unlock(&rzs->lock);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+
+out:
+	if (fwd_write_request) {
+		stat_inc(rzs->stats.bdev_num_writes);
+		bio->bi_bdev = rzs->backing_swap;
+#if 0
+		/*
+		 * TODO: We currently have linear mapping of ramzswap and
+		 * backing swap sectors. This is not desired since we want
+		 * to optimize writes to backing swap to minimize disk seeks
+		 * or have effective wear leveling (for SSDs). Also, a
+		 * non-linear mapping is required to implement compressed
+		 * on-disk swapping.
+		 */
+		 bio->bi_sector = get_backing_swap_page()
+					<< SECTORS_PER_PAGE_SHIFT;
+#endif
+		/*
+		 * In case backing swap is a file, find the right offset within
+		 * the file corresponding to logical position 'index'. For block
+		 * device, this is a nop.
+		 */
+		bio->bi_sector = map_backing_swap_page(rzs, index)
+					<< SECTORS_PER_PAGE_SHIFT;
+		return 1;
+	}
+
+	bio_io_error(bio);
+	return 0;
+}
+
+
+/*
+ * Check if request is within bounds and page aligned.
+ */
+static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
+{
+	if (unlikely(
+		(bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
+		(bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
+		(bio->bi_vcnt != 1) ||
+		(bio->bi_size != PAGE_SIZE) ||
+		(bio->bi_io_vec[0].bv_offset != 0))) {
+
+		return 0;
+	}
+
+	/* swap request is valid */
+	return 1;
+}
+
+/*
+ * Handler function for all ramzswap I/O requests.
+ */
+static int ramzswap_make_request(struct request_queue *queue, struct bio *bio)
+{
+	int ret = 0;
+	struct ramzswap *rzs = queue->queuedata;
+
+	if (unlikely(!rzs->init_done)) {
+		bio_io_error(bio);
+		return 0;
+	}
+
+	if (!valid_swap_request(rzs, bio)) {
+		stat_inc(rzs->stats.invalid_io);
+		bio_io_error(bio);
+		return 0;
+	}
+
+	switch (bio_data_dir(bio)) {
+	case READ:
+		ret = ramzswap_read(rzs, bio);
+		break;
+
+	case WRITE:
+		ret = ramzswap_write(rzs, bio);
+		break;
+	}
+
+	return ret;
+}
+
+static void reset_device(struct ramzswap *rzs)
+{
+	int is_backing_blkdev = 0;
+	size_t index, num_pages;
+	unsigned entries_per_page;
+	unsigned long num_table_pages, entry = 0;
+
+	if (rzs->backing_swap && !rzs->num_extents)
+		is_backing_blkdev = 1;
+
+	num_pages = rzs->disksize >> PAGE_SHIFT;
+
+	/* Free various per-device buffers */
+	kfree(rzs->compress_workmem);
+	free_pages((unsigned long)rzs->compress_buffer, 1);
+
+	rzs->compress_workmem = NULL;
+	rzs->compress_buffer = NULL;
+
+	/* Free all pages that are still in this ramzswap device */
+	for (index = 0; index < num_pages; index++) {
+		struct page *page;
+		u16 offset;
+
+		page = rzs->table[index].page;
+		offset = rzs->table[index].offset;
+
+		if (!page)
+			continue;
+
+		if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
+			__free_page(page);
+		else
+			xv_free(rzs->mem_pool, page, offset);
+	}
+
+	entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
+	num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
+					PAGE_SIZE);
+	/*
+	 * Set page->mapping to NULL for every table page.
+	 * Otherwise, we will hit bad_page() during free.
+	 */
+	while (rzs->num_extents && num_table_pages--) {
+		struct page *page;
+		page = vmalloc_to_page(&rzs->table[entry]);
+		page->mapping = NULL;
+		entry += entries_per_page;
+	}
+	vfree(rzs->table);
+	rzs->table = NULL;
+
+	xv_destroy_pool(rzs->mem_pool);
+	rzs->mem_pool = NULL;
+
+	/* Free all swap extent pages */
+	while (!list_empty(&rzs->backing_swap_extent_list)) {
+		struct page *page;
+		struct list_head *entry;
+		entry = rzs->backing_swap_extent_list.next;
+		page = list_entry(entry, struct page, lru);
+		list_del(entry);
+		__free_page(page);
+	}
+	INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
+	rzs->num_extents = 0;
+
+	/* Close backing swap device, if present */
+	if (rzs->backing_swap) {
+		if (is_backing_blkdev)
+			bd_release(rzs->backing_swap);
+		filp_close(rzs->swap_file, NULL);
+		rzs->backing_swap = NULL;
+	}
+
+	/* Reset stats */
+	memset(&rzs->stats, 0, sizeof(rzs->stats));
+
+	rzs->disksize = 0;
+	rzs->memlimit = 0;
+
+	/* Back to uninitialized state */
+	rzs->init_done = 0;
+}
+
+static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
+{
+	int ret;
+	size_t num_pages;
+	struct page *page;
+	union swap_header *swap_header;
+
+	if (rzs->init_done) {
+		pr_info("Device already initialized!\n");
+		return -EBUSY;
+	}
+
+	ret = setup_backing_swap(rzs);
+	if (ret)
+		goto fail;
+
+	if (rzs->backing_swap)
+		ramzswap_set_memlimit(rzs, totalram_pages << PAGE_SHIFT);
+	else
+		ramzswap_set_disksize(rzs, totalram_pages << PAGE_SHIFT);
+
+	rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+	if (!rzs->compress_workmem) {
+		pr_err("Error allocating compressor working memory!\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	rzs->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1);
+	if (!rzs->compress_buffer) {
+		pr_err("Error allocating compressor buffer space\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	num_pages = rzs->disksize >> PAGE_SHIFT;
+	rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
+	if (!rzs->table) {
+		pr_err("Error allocating ramzswap address table\n");
+		/* To prevent accessing table entries during cleanup */
+		rzs->disksize = 0;
+		ret = -ENOMEM;
+		goto fail;
+	}
+	memset(rzs->table, 0, num_pages * sizeof(*rzs->table));
+
+	map_backing_swap_extents(rzs);
+
+	page = alloc_page(__GFP_ZERO);
+	if (!page) {
+		pr_err("Error allocating swap header page\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	rzs->table[0].page = page;
+	rzs_set_flag(rzs, 0, RZS_UNCOMPRESSED);
+
+	swap_header = kmap(page);
+	ret = setup_swap_header(rzs, swap_header);
+	kunmap(page);
+	if (ret) {
+		pr_err("Error setting swap header\n");
+		goto fail;
+	}
+
+	set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);
+
+	/*
+	 * We have ident mapping of sectors for ramzswap and
+	 * and the backing swap device. So, this queue flag
+	 * should be according to backing dev.
+	 */
+	if (!rzs->backing_swap ||
+			blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
+		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);
+
+	rzs->mem_pool = xv_create_pool();
+	if (!rzs->mem_pool) {
+		pr_err("Error creating memory pool\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	/*
+	 * Pages that compress to size greater than this are forwarded
+	 * to physical swap disk (if backing dev is provided)
+	 * TODO: make this configurable
+	 */
+	if (rzs->backing_swap)
+		max_zpage_size = max_zpage_size_bdev;
+	else
+		max_zpage_size = max_zpage_size_nobdev;
+	pr_debug("Max compressed page size: %u bytes\n", max_zpage_size);
+
+	rzs->init_done = 1;
+
+	pr_debug("Initialization done!\n");
+	return 0;
+
+fail:
+	reset_device(rzs);
+
+	pr_err("Initialization failed: err=%d\n", ret);
+	return ret;
+}
+
+static int ramzswap_ioctl_reset_device(struct ramzswap *rzs)
+{
+	if (rzs->init_done)
+		reset_device(rzs);
+
+	return 0;
+}
+
+static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode,
+			unsigned int cmd, unsigned long arg)
+{
+	int ret = 0;
+	size_t disksize_kb, memlimit_kb;
+
+	struct ramzswap *rzs = bdev->bd_disk->private_data;
+
+	switch (cmd) {
+	case RZSIO_SET_DISKSIZE_KB:
+		if (rzs->init_done) {
+			ret = -EBUSY;
+			goto out;
+		}
+		if (copy_from_user(&disksize_kb, (void *)arg,
+						_IOC_SIZE(cmd))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		rzs->disksize = disksize_kb << 10;
+		pr_info("Disk size set to %zu kB\n", disksize_kb);
+		break;
+
+	case RZSIO_SET_MEMLIMIT_KB:
+		if (rzs->init_done) {
+			/* TODO: allow changing memlimit */
+			ret = -EBUSY;
+			goto out;
+		}
+		if (copy_from_user(&memlimit_kb, (void *)arg,
+						_IOC_SIZE(cmd))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		rzs->memlimit = memlimit_kb << 10;
+		pr_info("Memory limit set to %zu kB\n", memlimit_kb);
+		break;
+
+	case RZSIO_SET_BACKING_SWAP:
+		if (rzs->init_done) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		if (copy_from_user(&rzs->backing_swap_name, (void *)arg,
+						_IOC_SIZE(cmd))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
+		pr_info("Backing swap set to %s\n", rzs->backing_swap_name);
+		break;
+
+	case RZSIO_GET_STATS:
+	{
+		struct ramzswap_ioctl_stats *stats;
+		if (!rzs->init_done) {
+			ret = -ENOTTY;
+			goto out;
+		}
+		stats = kzalloc(sizeof(*stats), GFP_KERNEL);
+		if (!stats) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ramzswap_ioctl_get_stats(rzs, stats);
+		if (copy_to_user((void *)arg, stats, sizeof(*stats))) {
+			kfree(stats);
+			ret = -EFAULT;
+			goto out;
+		}
+		kfree(stats);
+		break;
+	}
+	case RZSIO_INIT:
+		ret = ramzswap_ioctl_init_device(rzs);
+		break;
+
+	case RZSIO_RESET:
+		/* Do not reset an active device! */
+		if (bdev->bd_holders) {
+			ret = -EBUSY;
+			goto out;
+		}
+		ret = ramzswap_ioctl_reset_device(rzs);
+		/*
+		 * Racy! Device has already been swapoff'ed.  Bad things
+		 * can happen if another swapon is done before this reset.
+		 * TODO: A callback from swapoff() will solve this problem.
+		 */
+		set_swap_free_notify(bdev, NULL);
+		rzs->init_notify_callback = 0;
+		break;
+
+	default:
+		pr_info("Invalid ioctl %u\n", cmd);
+		ret = -ENOTTY;
+	}
+
+out:
+	return ret;
+}
+
+static struct block_device_operations ramzswap_devops = {
+	.ioctl = ramzswap_ioctl,
+	.owner = THIS_MODULE,
+};
+
+static void create_device(struct ramzswap *rzs, int device_id)
+{
+	mutex_init(&rzs->lock);
+	INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
+
+	rzs->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!rzs->queue) {
+		pr_err("Error allocating disk queue for device %d\n",
+			device_id);
+		return;
+	}
+
+	blk_queue_make_request(rzs->queue, ramzswap_make_request);
+	rzs->queue->queuedata = rzs;
+
+	 /* gendisk structure */
+	rzs->disk = alloc_disk(1);
+	if (!rzs->disk) {
+		blk_cleanup_queue(rzs->queue);
+		pr_warning("Error allocating disk structure for device %d\n",
+			device_id);
+		return;
+	}
+
+	rzs->disk->major = ramzswap_major;
+	rzs->disk->first_minor = device_id;
+	rzs->disk->fops = &ramzswap_devops;
+	rzs->disk->queue = rzs->queue;
+	rzs->disk->private_data = rzs;
+	snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);
+
+	/*
+	 * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
+	 * or set equal to backing swap device (if provided)
+	 */
+	set_capacity(rzs->disk, 0);
+	add_disk(rzs->disk);
+
+	rzs->init_done = 0;
+
+	return;
+}
+
+static void destroy_device(struct ramzswap *rzs)
+{
+	if (rzs->disk) {
+		del_gendisk(rzs->disk);
+		put_disk(rzs->disk);
+	}
+
+	if (rzs->queue)
+		blk_cleanup_queue(rzs->queue);
+}
+
+static int __init ramzswap_init(void)
+{
+	int i;
+
+	if (num_devices > max_num_devices) {
+		pr_warning("Invalid value for num_devices: %u\n",
+							num_devices);
+		return -EINVAL;
+	}
+
+	ramzswap_major = register_blkdev(0, "ramzswap");
+	if (ramzswap_major <= 0) {
+		pr_warning("Unable to get major number\n");
+		return -EBUSY;
+	}
+
+	if (!num_devices) {
+		pr_info("num_devices not specified. Using default: 1\n");
+		num_devices = 1;
+	}
+
+	/* Allocate the device array and initialize each one */
+	pr_info("Creating %u devices ...\n", num_devices);
+	devices = kzalloc(num_devices * sizeof(struct ramzswap), GFP_KERNEL);
+	if (!devices)
+		goto out;
+
+	for (i = 0; i < num_devices; i++)
+		create_device(&devices[i], i);
+
+	return 0;
+
+out:
+	unregister_blkdev(ramzswap_major, "ramzswap");
+	return -ENOMEM;
+}
+
+static void __exit ramzswap_exit(void)
+{
+	int i;
+	struct ramzswap *rzs;
+
+	for (i = 0; i < num_devices; i++) {
+		rzs = &devices[i];
+
+		destroy_device(rzs);
+		if (rzs->init_done)
+			reset_device(rzs);
+	}
+
+	unregister_blkdev(ramzswap_major, "ramzswap");
+
+	kfree(devices);
+	pr_debug("Cleanup done!\n");
+}
+
+module_param(num_devices, uint, 0);
+MODULE_PARM_DESC(num_devices, "Number of ramzswap devices");
+
+module_init(ramzswap_init);
+module_exit(ramzswap_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@...are.org>");
+MODULE_DESCRIPTION("Compressed RAM Based Swap Device");
diff --git a/drivers/staging/ramzswap/ramzswap_drv.h b/drivers/staging/ramzswap/ramzswap_drv.h
new file mode 100644
index 0000000..f7f273f
--- /dev/null
+++ b/drivers/staging/ramzswap/ramzswap_drv.h
@@ -0,0 +1,173 @@
+/*
+ * Compressed RAM based swap device
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ * Project home: http://compcache.googlecode.com
+ */
+
+#ifndef _RAMZSWAP_DRV_H_
+#define _RAMZSWAP_DRV_H_
+
+#include "ramzswap_ioctl.h"
+#include "xvmalloc.h"
+
+/*
+ * Some arbitrary value. This is just to catch
+ * invalid value for num_devices module parameter.
+ */
+static const unsigned max_num_devices = 32;
+
+/*
+ * Stored at beginning of each compressed object.
+ *
+ * It stores back-reference to table entry which points to this
+ * object. This is required to support memory defragmentation or
+ * migrating compressed pages to backing swap disk.
+ */
+struct zobj_header {
+#if 0
+	u32 table_idx;
+#endif
+};
+
+/*-- Configurable parameters */
+
+/* Default ramzswap disk size: 25% of total RAM */
+static const unsigned default_disksize_perc_ram = 25;
+static const unsigned default_memlimit_perc_ram = 15;
+
+/*
+ * Max compressed page size when backing device is provided.
+ * Pages that compress to size greater than this are sent to
+ * physical swap disk.
+ */
+static const unsigned max_zpage_size_bdev = PAGE_SIZE / 2;
+
+/*
+ * Max compressed page size when there is no backing dev.
+ * Pages that compress to size greater than this are stored
+ * uncompressed in memory.
+ */
+static const unsigned max_zpage_size_nobdev = PAGE_SIZE / 4 * 3;
+
+/*
+ * NOTE: max_zpage_size_{bdev,nobdev} sizes must be
+ * less than or equal to:
+ *   XV_MAX_ALLOC_SIZE - sizeof(struct zobj_header)
+ * since otherwise xv_malloc would always return failure.
+ */
+
+/*-- End of configurable params */
+
+#define SECTOR_SHIFT		9
+#define SECTOR_SIZE		(1 << SECTOR_SHIFT)
+#define SECTORS_PER_PAGE_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define SECTORS_PER_PAGE	(1 << SECTORS_PER_PAGE_SHIFT)
+
+/* Debugging and Stats */
+#if defined(CONFIG_RAMZSWAP_STATS)
+#define stat_inc(stat)	((stat)++)
+#define stat_dec(stat)	((stat)--)
+#else
+#define stat_inc(x)
+#define stat_dec(x)
+#endif
+
+/* Flags for ramzswap pages (table[page_no].flags) */
+enum rzs_pageflags {
+	/* Page is stored uncompressed */
+	RZS_UNCOMPRESSED,
+
+	/* Page consists entirely of zeros */
+	RZS_ZERO,
+
+	__NR_RZS_PAGEFLAGS,
+};
+
+/*-- Data structures */
+
+/*
+ * Allocated for each swap slot, indexed by page no.
+ * These table entries must fit exactly in a page.
+ */
+struct table {
+	struct page *page;
+	u16 offset;
+	u8 count;	/* object ref count (not yet used) */
+	u8 flags;
+} __attribute__((aligned(4)));;
+
+/*
+ * Swap extent information in case backing swap is a regular
+ * file. These extent entries must fit exactly in a page.
+ */
+struct ramzswap_backing_extent {
+	pgoff_t phy_pagenum;
+	pgoff_t num_pages;
+} __attribute__((aligned(4)));
+
+struct ramzswap_stats {
+	/* basic stats */
+	size_t compr_size;	/* compressed size of pages stored -
+				 * needed to enforce memlimit */
+	/* more stats */
+#if defined(CONFIG_RAMZSWAP_STATS)
+	u64 num_reads;		/* failed + successful */
+	u64 num_writes;		/* --do-- */
+	u64 failed_reads;	/* can happen when memory is too low */
+	u64 failed_writes;	/* should NEVER! happen */
+	u64 invalid_io;		/* non-swap I/O requests */
+	u64 notify_free;	/* no. of pages freed by swap free callback */
+	u32 pages_zero;		/* no. of zero filled pages */
+	u32 pages_stored;	/* no. of pages currently stored */
+	u32 good_compress;	/* % of pages with compression ratio<=50% */
+	u32 pages_expand;	/* % of incompressible pages */
+	u64 bdev_num_reads;	/* no. of reads on backing dev */
+	u64 bdev_num_writes;	/* no. of writes on backing dev */
+#endif
+};
+
+struct ramzswap {
+	struct xv_pool *mem_pool;
+	void *compress_workmem;
+	void *compress_buffer;
+	struct table *table;
+	struct mutex lock;
+	struct request_queue *queue;
+	struct gendisk *disk;
+	int init_done;
+	int init_notify_callback;
+	/*
+	 * This is limit on compressed data size (stats.compr_size)
+	 * Its applicable only when backing swap device is present.
+	 */
+	size_t memlimit;	/* bytes */
+	/*
+	 * This is limit on amount of *uncompressed* worth of data
+	 * we can hold. When backing swap device is provided, it is
+	 * set equal to device size.
+	 */
+	size_t disksize;	/* bytes */
+
+	struct ramzswap_stats stats;
+
+	/* backing swap device info */
+	struct ramzswap_backing_extent *curr_extent;
+	struct list_head backing_swap_extent_list;
+	unsigned long num_extents;
+	char backing_swap_name[MAX_SWAP_NAME_LEN];
+	struct block_device *backing_swap;
+	struct file *swap_file;
+};
+
+/*-- */
+
+#endif
+
diff --git a/drivers/staging/ramzswap/ramzswap_ioctl.h b/drivers/staging/ramzswap/ramzswap_ioctl.h
new file mode 100644
index 0000000..a7ce039
--- /dev/null
+++ b/drivers/staging/ramzswap/ramzswap_ioctl.h
@@ -0,0 +1,50 @@
+/*
+ * Compressed RAM based swap device
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ * Project home: http://compcache.googlecode.com
+ */
+
+#ifndef _RAMZSWAP_IOCTL_H_
+#define _RAMZSWAP_IOCTL_H_
+
+#define MAX_SWAP_NAME_LEN 128
+
+struct ramzswap_ioctl_stats {
+	char backing_swap_name[MAX_SWAP_NAME_LEN];
+	u64 memlimit;		/* only applicable if backing swap present */
+	u64 disksize;		/* user specified or equal to backing swap
+				 * size (if present) */
+	u64 num_reads;		/* failed + successful */
+	u64 num_writes;		/* --do-- */
+	u64 failed_reads;	/* can happen when memory is too low */
+	u64 failed_writes;	/* should NEVER! happen */
+	u64 invalid_io;		/* non-swap I/O requests */
+	u64 notify_free;	/* no. of pages freed by swap free callback */
+	u32 pages_zero;		/* no. of zero filled pages */
+	u32 good_compress_pct;	/* no. of pages with compression ratio<=50% */
+	u32 pages_expand_pct;	/* no. of incompressible pages */
+	u32 pages_stored;
+	u32 pages_used;
+	u64 orig_data_size;
+	u64 compr_data_size;
+	u64 mem_used_total;
+	u64 bdev_num_reads;	/* no. of reads on backing dev */
+	u64 bdev_num_writes;	/* no. of writes on backing dev */
+} __attribute__ ((packed, aligned(4)));
+
+#define RZSIO_SET_DISKSIZE_KB	_IOW('z', 0, size_t)
+#define RZSIO_SET_MEMLIMIT_KB	_IOW('z', 1, size_t)
+#define RZSIO_SET_BACKING_SWAP	_IOW('z', 2, unsigned char[MAX_SWAP_NAME_LEN])
+#define RZSIO_GET_STATS		_IOR('z', 3, struct ramzswap_ioctl_stats)
+#define RZSIO_INIT		_IO('z', 4)
+#define RZSIO_RESET		_IO('z', 5)
+
+#endif
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/