linux-kernel - [PATCH 1/6] Add IO cgroup tracking for buffered writes.

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1299619256-12661-2-git-send-email-teravest@google.com>
Date:	Tue,  8 Mar 2011 13:20:51 -0800
From:	Justin TerAvest <teravest@...gle.com>
To:	m-ikeda@...jp.nec.com, jaxboe@...ionio.com, vgoyal@...hat.com
Cc:	linux-kernel@...r.kernel.org, ryov@...inux.co.jp,
	taka@...inux.co.jp, kamezawa.hiroyu@...fujitsu.com,
	righi.andrea@...il.com, guijianfeng@...fujitsu.com,
	balbir@...ux.vnet.ibm.com, ctalbott@...gle.com, nauman@...gle.com,
	mrubin@...gle.com, Justin TerAvest <teravest@...gle.com>
Subject: [PATCH 1/6] Add IO cgroup tracking for buffered writes.

This patch adds IO tracking code in the mm/ tree so that the block layer
can provide isolation for buffered writes. I've left modifying the
page_cgroup structure as simple as possible; I'm happy to change this to
using bits as part of the "flags" structure to reduce overhead.

Signed-off-by: Justin TerAvest <teravest@...gle.com>
---
 block/blk-cgroup.c          |  184 +++++++++++++++++++++++++++++++++++++++++++
 fs/buffer.c                 |    2 +
 fs/direct-io.c              |    2 +
 include/linux/blkio-track.h |   89 +++++++++++++++++++++
 include/linux/iocontext.h   |    1 +
 include/linux/memcontrol.h  |    6 ++
 include/linux/mmzone.h      |    4 +-
 include/linux/page_cgroup.h |   12 +++-
 init/Kconfig                |   16 ++++
 mm/Makefile                 |    3 +-
 mm/bounce.c                 |    2 +
 mm/filemap.c                |    2 +
 mm/memcontrol.c             |    6 ++
 mm/memory.c                 |    6 ++
 mm/page-writeback.c         |   14 +++-
 mm/page_cgroup.c            |   29 +++++---
 mm/swap_state.c             |    2 +
 17 files changed, 363 insertions(+), 17 deletions(-)
 create mode 100644 include/linux/blkio-track.h

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a..80d88ec 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -19,6 +19,8 @@
 #include <linux/slab.h>
 #include "blk-cgroup.h"
 #include <linux/genhd.h>
+#include <linux/blkio-track.h>
+#include <linux/mm_inline.h>
 
 #define MAX_KEY_LEN 100
 
@@ -175,6 +177,12 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg,
 	}
 }
 
+static inline struct blkio_cgroup *blkio_cgroup_from_task(struct task_struct *p)
+{
+	return container_of(task_subsys_state(p, blkio_subsys_id),
+					struct blkio_cgroup, css);
+}
+
 /*
  * Add to the appropriate stat variable depending on the request type.
  * This should be called with the blkg->stats_lock held.
@@ -1233,8 +1241,20 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 	return 0;
 }
 
+/* Read the ID of the specified blkio cgroup. */
+static u64 blkio_id_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	return (u64)css_id(&blkcg->css);
+}
+
 struct cftype blkio_files[] = {
 	{
+		.name = "id",
+		.read_u64 = blkio_id_read,
+	},
+	{
 		.name = "weight_device",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
 				BLKIO_PROP_weight_device),
@@ -1385,6 +1405,170 @@ struct cftype blkio_files[] = {
 #endif
 };
 
+/* Block IO tracking related functions */
+
+#ifdef CONFIG_CGROUP_BLKIOTRACK
+
+/*
+ * The block I/O tracking mechanism is implemented on the cgroup memory
+ * controller framework. It helps to find the the owner of an I/O request
+ * because every I/O request has a target page and the owner of the page
+ * can be easily determined on the framework.
+ */
+
+/**
+ * blkio_cgroup_set_owner() - set the owner ID of a page.
+ * @page:	the page we want to tag
+ * @mm:		the mm_struct of a page owner
+ *
+ * Make a given page have the blkio-cgroup ID of the owner of this page.
+ */
+void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm)
+{
+	struct blkio_cgroup *blkcg;
+	struct page_cgroup *pc;
+
+	if (blkio_cgroup_disabled())
+		return;
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc))
+		return;
+
+	pc->blkio_cgroup_id = 0;	/* 0: default blkio_cgroup id */
+	if (!mm)
+		return;
+	/*
+	 * Locking "pc" isn't necessary here since the current process is
+	 * the only one that can access the members related to blkio_cgroup.
+	 */
+	rcu_read_lock();
+	blkcg = blkio_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!blkcg))
+		goto out;
+	/*
+	 * css_get(&bio->css) isn't called to increment the reference
+	 * count of this blkio_cgroup "blkcg" so pc->blkio_cgroup_id
+	 * might turn invalid even if this page is still active.
+	 * This approach is chosen to minimize the overhead.
+	 */
+	pc->blkio_cgroup_id = css_id(&blkcg->css);
+out:
+	rcu_read_unlock();
+}
+
+/**
+ * blkio_cgroup_reset_owner() - reset the owner ID of a page
+ * @page:	the page we want to tag
+ * @mm:		the mm_struct of a page owner
+ *
+ * Change the owner of a given page if necessary.
+ */
+void blkio_cgroup_reset_owner(struct page *page, struct mm_struct *mm)
+{
+	/*
+	 * A little trick:
+	 * Just call blkio_cgroup_set_owner() for pages which are already
+	 * active since the blkio_cgroup_id member of page_cgroup can be
+	 * updated without any locks. This is because an integer type of
+	 * variable can be set a new value at once on modern cpus.
+	 */
+	blkio_cgroup_set_owner(page, mm);
+}
+
+/**
+ * blkio_cgroup_reset_owner_pagedirty() - reset the owner ID of a pagecache page
+ * @page:	the page we want to tag
+ * @mm:		the mm_struct of a page owner
+ *
+ * Change the owner of a given page if the page is in the pagecache.
+ */
+void blkio_cgroup_reset_owner_pagedirty(struct page *page, struct mm_struct *mm)
+{
+	if (!page_is_file_cache(page))
+		return;
+	if (current->flags & PF_MEMALLOC)
+		return;
+
+	blkio_cgroup_reset_owner(page, mm);
+}
+
+/**
+ * blkio_cgroup_copy_owner() - copy the owner ID of a page into another page
+ * @npage:	the page where we want to copy the owner
+ * @opage:	the page from which we want to copy the ID
+ *
+ * Copy the owner ID of @opage into @npage.
+ */
+void blkio_cgroup_copy_owner(struct page *npage, struct page *opage)
+{
+	struct page_cgroup *npc, *opc;
+
+	if (blkio_cgroup_disabled())
+		return;
+	npc = lookup_page_cgroup(npage);
+	if (unlikely(!npc))
+		return;
+	opc = lookup_page_cgroup(opage);
+	if (unlikely(!opc))
+		return;
+
+	/*
+	 * Do this without any locks. The reason is the same as
+	 * blkio_cgroup_reset_owner().
+	 */
+	npc->blkio_cgroup_id = opc->blkio_cgroup_id;
+}
+
+/**
+ * get_blkio_cgroup_id() - determine the blkio-cgroup ID
+ * @bio:	the &struct bio which describes the I/O
+ *
+ * Returns the blkio-cgroup ID of a given bio. A return value zero
+ * means that the page associated with the bio belongs to default_blkio_cgroup.
+ */
+unsigned long get_blkio_cgroup_id(struct bio *bio)
+{
+	struct page_cgroup *pc;
+	struct page *page = bio_iovec_idx(bio, 0)->bv_page;
+	unsigned long id = 0;
+
+	pc = lookup_page_cgroup(page);
+	if (pc)
+		id = pc->blkio_cgroup_id;
+	return id;
+}
+
+/**
+ * get_cgroup_from_page() - determine the cgroup from a page.
+ * @page:	the page to be tracked
+ *
+ * Returns the cgroup of a given page. A return value zero means that
+ * the page associated with the page belongs to default_blkio_cgroup.
+ *
+ * Note:
+ * This function must be called under rcu_read_lock().
+ */
+struct cgroup *get_cgroup_from_page(struct page *page)
+{
+	struct page_cgroup *pc;
+	struct cgroup_subsys_state *css;
+
+	pc = lookup_page_cgroup(page);
+	if (!pc)
+		return NULL;
+
+	css = css_lookup(&blkio_subsys, pc->blkio_cgroup_id);
+	if (!css)
+		return NULL;
+
+	return css->cgroup;
+}
+
+EXPORT_SYMBOL(get_blkio_cgroup_id);
+EXPORT_SYMBOL(get_cgroup_from_page);
+
+#endif /* CONFIG_CGROUP_BLKIOTRACK */
+
 static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 {
 	return cgroup_add_files(cgroup, subsys, blkio_files,
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76..1e911dd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -36,6 +36,7 @@
 #include <linux/buffer_head.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/bio.h>
+#include <linux/blkio-track.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
@@ -667,6 +668,7 @@ static void __set_page_dirty(struct page *page,
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 		account_page_dirtied(page, mapping);
+		blkio_cgroup_reset_owner_pagedirty(page, current->mm);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b044705..2e8d5aa 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -33,6 +33,7 @@
 #include <linux/err.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
+#include <linux/blkio-track.h>
 #include <linux/rwsem.h>
 #include <linux/uio.h>
 #include <asm/atomic.h>
@@ -852,6 +853,7 @@ static int do_direct_IO(struct dio *dio)
 			ret = PTR_ERR(page);
 			goto out;
 		}
+		blkio_cgroup_reset_owner(page, current->mm);
 
 		while (block_in_page < blocks_per_page) {
 			unsigned offset_in_page = block_in_page << blkbits;
diff --git a/include/linux/blkio-track.h b/include/linux/blkio-track.h
new file mode 100644
index 0000000..aedf780
--- /dev/null
+++ b/include/linux/blkio-track.h
@@ -0,0 +1,89 @@
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/page_cgroup.h>
+
+#ifndef _LINUX_BIOTRACK_H
+#define _LINUX_BIOTRACK_H
+
+#ifdef CONFIG_CGROUP_BLKIOTRACK
+
+struct block_device;
+
+/**
+ * __init_blkio_page_cgroup() - initialize a blkio_page_cgroup
+ * @pc:		page_cgroup of the page
+ *
+ * Reset the owner ID of a page.
+ */
+static inline void __init_blkio_page_cgroup(struct page_cgroup *pc)
+{
+	pc->blkio_cgroup_id = 0;
+}
+
+/**
+ * blkio_cgroup_disabled() - check whether blkio_cgroup is disabled
+ *
+ * Returns true if disabled, false if not.
+ */
+static inline bool blkio_cgroup_disabled(void)
+{
+	if (blkio_subsys.disabled)
+		return true;
+	return false;
+}
+
+extern void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm);
+extern void blkio_cgroup_reset_owner(struct page *page, struct mm_struct *mm);
+extern void blkio_cgroup_reset_owner_pagedirty(struct page *page,
+						 struct mm_struct *mm);
+extern void blkio_cgroup_copy_owner(struct page *page, struct page *opage);
+
+extern unsigned long get_blkio_cgroup_id(struct bio *bio);
+extern struct cgroup *get_cgroup_from_page(struct page *page);
+
+#else /* !CONFIG_CGROUP_BLKIOTRACK */
+
+struct blkiotrack_cgroup;
+
+static inline void __init_blkio_page_cgroup(struct page_cgroup *pc)
+{
+}
+
+static inline bool blkio_cgroup_disabled(void)
+{
+	return true;
+}
+
+static inline void blkio_cgroup_set_owner(struct page *page,
+						struct mm_struct *mm)
+{
+}
+
+static inline void blkio_cgroup_reset_owner(struct page *page,
+						struct mm_struct *mm)
+{
+}
+
+static inline void blkio_cgroup_reset_owner_pagedirty(struct page *page,
+						struct mm_struct *mm)
+{
+}
+
+static inline void blkio_cgroup_copy_owner(struct page *page,
+						struct page *opage)
+{
+}
+
+static inline unsigned long get_blkio_cgroup_id(struct bio *bio)
+{
+	return 0;
+}
+
+static inline struct cgroup *get_cgroup_from_page(struct page *page)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_CGROUP_BLKIOTRACK */
+
+#endif /* _LINUX_BIOTRACK_H */
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index b2eee89..3e70b21 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -76,6 +76,7 @@ int put_io_context(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_io_context(gfp_t gfp_flags, int node);
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 #else
 static inline void exit_io_context(struct task_struct *task)
 {
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f512e18..a8a7cf0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -49,6 +49,8 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
  */
 
+extern void __init_mem_page_cgroup(struct page_cgroup *pc);
+
 extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 /* for swap handling */
@@ -153,6 +155,10 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;
 
+static inline void __init_mem_page_cgroup(struct page_cgroup *pc)
+{
+}
+
 static inline int mem_cgroup_newpage_charge(struct page *page,
 					struct mm_struct *mm, gfp_t gfp_mask)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 02ecb01..a04c37a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -615,7 +615,7 @@ typedef struct pglist_data {
 	int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
 	struct page *node_mem_map;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
 	struct page_cgroup *node_page_cgroup;
 #endif
 #endif
@@ -975,7 +975,7 @@ struct mem_section {
 
 	/* See declaration of similar field in struct zone */
 	unsigned long *pageblock_flags;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
 	/*
 	 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
 	 * section. (see memcontrol.h/page_cgroup.h about this.)
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 6d6cb7a..c3e66fd 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -1,7 +1,7 @@
 #ifndef __LINUX_PAGE_CGROUP_H
 #define __LINUX_PAGE_CGROUP_H
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
 #include <linux/bit_spinlock.h>
 /*
  * Page Cgroup can be considered as an extended mem_map.
@@ -11,10 +11,15 @@
  * then the page cgroup for pfn always exists.
  */
 struct page_cgroup {
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 	unsigned long flags;
 	struct mem_cgroup *mem_cgroup;
 	struct page *page;
 	struct list_head lru;		/* per cgroup LRU list */
+#endif
+#ifdef CONFIG_CGROUP_BLKIOTRACK
+	unsigned long blkio_cgroup_id;
+#endif
 };
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
@@ -33,6 +38,8 @@ static inline void __init page_cgroup_init(void)
 
 struct page_cgroup *lookup_page_cgroup(struct page *page);
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+
 enum {
 	/* flags for mem_cgroup */
 	PCG_LOCK,  /* Lock for pc->mem_cgroup and following bits. */
@@ -131,8 +138,9 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
 	bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags);
 	local_irq_restore(*flags);
 }
+#endif
 
-#else /* CONFIG_CGROUP_MEM_RES_CTLR */
+#else /* CONFIG_CGROUP_PAGE */
 struct page_cgroup;
 
 static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
diff --git a/init/Kconfig b/init/Kconfig
index be788c0..256041f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -742,6 +742,22 @@ config DEBUG_BLK_CGROUP
 	Enable some debugging help. Currently it exports additional stat
 	files in a cgroup which can be useful for debugging.
 
+config CGROUP_BLKIOTRACK
+	bool
+	depends on CGROUPS && BLOCK
+	select MM_OWNER
+	default n
+	---help---
+	  Provides a Resource Controller which enables to track the onwner
+	  of every Block I/O requests.
+	  The information this subsystem provides can be used from any
+	  kind of module such as dm-ioband device mapper modules or
+	  the cfq-scheduler.
+
+config CGROUP_PAGE
+	def_bool y
+	depends on CGROUP_MEM_RES_CTLR || CGROUP_BLKIOTRACK
+
 endif # CGROUPS
 
 menuconfig NAMESPACES
diff --git a/mm/Makefile b/mm/Makefile
index 2b1b575..7da3bc8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -38,7 +38,8 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
diff --git a/mm/bounce.c b/mm/bounce.c
index 1481de6..64980fb 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/blkio-track.h>
 #include <asm/tlbflush.h>
 
 #include <trace/events/block.h>
@@ -211,6 +212,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 		to->bv_len = from->bv_len;
 		to->bv_offset = from->bv_offset;
 		inc_zone_page_state(to->bv_page, NR_BOUNCE);
+		blkio_cgroup_copy_owner(to->bv_page, page);
 
 		if (rw == WRITE) {
 			char *vto, *vfrom;
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d3..ab9b53a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
+#include <linux/blkio-track.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include "internal.h"
 
@@ -407,6 +408,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 					gfp_mask & GFP_RECLAIM_MASK);
 	if (error)
 		goto out;
+	blkio_cgroup_set_owner(page, current->mm);
 
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index da53a25..e11c2cd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -359,6 +359,12 @@ static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 static void drain_all_stock_async(void);
 
+void __meminit __init_mem_page_cgroup(struct page_cgroup *pc)
+{
+	pc->mem_cgroup = NULL;
+	INIT_LIST_HEAD(&pc->lru);
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 31250fa..4735c3c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,7 @@
 #include <linux/init.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/blkio-track.h>
 #include <linux/mmu_notifier.h>
 #include <linux/kallsyms.h>
 #include <linux/swapops.h>
@@ -2403,6 +2404,7 @@ gotten:
 		 */
 		ptep_clear_flush(vma, address, page_table);
 		page_add_new_anon_rmap(new_page, vma, address);
+		blkio_cgroup_set_owner(new_page, mm);
 		/*
 		 * We call the notify macro here because, when using secondary
 		 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2828,6 +2830,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
 	do_page_add_anon_rmap(page, vma, address, exclusive);
+	blkio_cgroup_reset_owner(page, mm);
 	/* It's better to call commit-charge after rmap is established */
 	mem_cgroup_commit_charge_swapin(page, ptr);
 
@@ -2959,6 +2962,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, address);
+	/* Not setting the owner for special pages */
+	blkio_cgroup_set_owner(page, mm);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
 
@@ -3114,6 +3119,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (anon) {
 			inc_mm_counter_fast(mm, MM_ANONPAGES);
 			page_add_new_anon_rmap(page, vma, address);
+			blkio_cgroup_set_owner(page, mm);
 		} else {
 			inc_mm_counter_fast(mm, MM_FILEPAGES);
 			page_add_file_rmap(page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2cb01f6..b2a8f81 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/backing-dev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/blkio-track.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/rmap.h>
@@ -1153,7 +1154,8 @@ EXPORT_SYMBOL(account_page_writeback);
  * We take care to handle the case where the page was truncated from the
  * mapping by re-checking page_mapping() inside tree_lock.
  */
-int __set_page_dirty_nobuffers(struct page *page)
+int __set_page_dirty_nobuffers_track_owner(struct page *page,
+					   int update_owner)
 {
 	if (!TestSetPageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
@@ -1168,6 +1170,9 @@ int __set_page_dirty_nobuffers(struct page *page)
 			BUG_ON(mapping2 != mapping);
 			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
 			account_page_dirtied(page, mapping);
+			if (update_owner)
+				blkio_cgroup_reset_owner_pagedirty(page,
+								current->mm);
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 		}
@@ -1180,6 +1185,11 @@ int __set_page_dirty_nobuffers(struct page *page)
 	}
 	return 0;
 }
+
+int __set_page_dirty_nobuffers(struct page *page)
+{
+	return __set_page_dirty_nobuffers_track_owner(page, 1);
+}
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 
 /*
@@ -1190,7 +1200,7 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 {
 	wbc->pages_skipped++;
-	return __set_page_dirty_nobuffers(page);
+	return __set_page_dirty_nobuffers_track_owner(page, 0);
 }
 EXPORT_SYMBOL(redirty_page_for_writepage);
 
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5bffada..78f5425 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -10,14 +10,17 @@
 #include <linux/cgroup.h>
 #include <linux/swapops.h>
 #include <linux/kmemleak.h>
+#include <linux/blkio-track.h>
 
 static void __meminit
 __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
 {
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 	pc->flags = 0;
-	pc->mem_cgroup = NULL;
 	pc->page = pfn_to_page(pfn);
-	INIT_LIST_HEAD(&pc->lru);
+#endif
+	__init_mem_page_cgroup(pc);
+	__init_blkio_page_cgroup(pc);
 }
 static unsigned long total_usage;
 
@@ -75,7 +78,7 @@ void __init page_cgroup_init_flatmem(void)
 
 	int nid, fail;
 
-	if (mem_cgroup_disabled())
+	if (mem_cgroup_disabled() && blkio_cgroup_disabled())
 		return;
 
 	for_each_online_node(nid)  {
@@ -84,12 +87,13 @@ void __init page_cgroup_init_flatmem(void)
 			goto fail;
 	}
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
-	" don't want memory cgroups\n");
+	printk(KERN_INFO "please try 'cgroup_disable=memory,blkio' option"
+	" if you don't want memory and blkio cgroups\n");
 	return;
 fail:
 	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
-	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
+	printk(KERN_CRIT
+		"please try 'cgroup_disable=memory,blkio' boot option\n");
 	panic("Out of memory");
 }
 
@@ -134,6 +138,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
 		 */
 		kmemleak_not_leak(base);
 	} else {
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 		/*
  		 * We don't have to allocate page_cgroup again, but
 		 * address of memmap may be changed. So, we have to initialize
@@ -144,6 +149,9 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
 		/* check address of memmap is changed or not. */
 		if (base->page == pfn_to_page(pfn))
 			return 0;
+#else
+		return 0;
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
 	}
 
 	if (!base) {
@@ -258,7 +266,7 @@ void __init page_cgroup_init(void)
 	unsigned long pfn;
 	int fail = 0;
 
-	if (mem_cgroup_disabled())
+	if (mem_cgroup_disabled() && blkio_cgroup_disabled())
 		return;
 
 	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
@@ -267,14 +275,15 @@ void __init page_cgroup_init(void)
 		fail = init_section_page_cgroup(pfn);
 	}
 	if (fail) {
-		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+		printk(KERN_CRIT
+			"try 'cgroup_disable=memory,blkio' boot option\n");
 		panic("Out of memory");
 	} else {
 		hotplug_memory_notifier(page_cgroup_callback, 0);
 	}
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
-	" want memory cgroups\n");
+	printk(KERN_INFO "please try 'cgroup_disable=memory,blkio' option"
+	" if you don't want memory and blkio cgroups\n");
 }
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5c8cfab..bd4c4e7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -19,6 +19,7 @@
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
 #include <linux/page_cgroup.h>
+#include <linux/blkio-track.h>
 
 #include <asm/pgtable.h>
 
@@ -330,6 +331,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
 		__set_page_locked(new_page);
 		SetPageSwapBacked(new_page);
+		blkio_cgroup_set_owner(new_page, current->mm);
 		err = __add_to_swap_cache(new_page, entry);
 		if (likely(!err)) {
 			radix_tree_preload_end();
-- 
1.7.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/