Message-Id: <1313597705-6093-12-git-send-email-gthelen@google.com>
Date: Wed, 17 Aug 2011 09:15:03 -0700
From: Greg Thelen <gthelen@...gle.com>
To: Andrew Morton <akpm@...ux-foundation.org>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org,
containers@...ts.osdl.org, linux-fsdevel@...r.kernel.org,
KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>,
Balbir Singh <bsingharora@...il.com>,
Daisuke Nishimura <nishimura@....nes.nec.co.jp>,
Minchan Kim <minchan.kim@...il.com>,
Johannes Weiner <hannes@...xchg.org>,
Wu Fengguang <fengguang.wu@...el.com>,
Dave Chinner <david@...morbit.com>,
Vivek Goyal <vgoyal@...hat.com>,
Andrea Righi <andrea@...terlinux.com>,
Ciju Rajan K <ciju@...ux.vnet.ibm.com>,
David Rientjes <rientjes@...gle.com>,
Greg Thelen <gthelen@...gle.com>
Subject: [PATCH v9 11/13] writeback: make background writeback cgroup aware
When the system is under its background dirty memory threshold but some
cgroups are over their background dirty memory thresholds, only write
back inodes associated with the over-limit cgroups.
In addition to checking if the system dirty memory usage is over the
system background threshold, over_bground_thresh() now checks if any
cgroups are over their respective background dirty memory thresholds.
If over-limit cgroups are found, then the new
wb_writeback_work.for_cgroup field is set to distinguish between system
and memcg overages. The new wb_writeback_work.shared_inodes field is
also set. Inodes written by multiple cgroups are marked as owned by
I_MEMCG_SHARED rather than a particular cgroup. Such shared inodes
cannot easily be attributed to a cgroup, so per-cgroup writeback
(future versions of wakeup_flusher_threads and balance_dirty_pages)
performs suboptimally in the presence of shared inodes. Therefore,
write shared inodes when performing cgroup background writeback.
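For illustration, a memcg-driven background work item ends up carrying
roughly the following flags (a sketch based on the wb_writeback_work
fields added in this patch; these are the values over_bground_thresh()
selects when only memcg limits are exceeded, not a verbatim excerpt):

    struct wb_writeback_work work = {
            .nr_pages       = LONG_MAX,
            .sync_mode      = WB_SYNC_NONE,
            .for_background = 1,
            .range_cyclic   = 1,
            .for_cgroup     = 1,    /* memcg, not system, overage */
            .shared_inodes  = 1,    /* also write I_MEMCG_SHARED inodes */
    };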
If performing cgroup writeback, move_expired_inodes() skips inodes that
do not contribute dirty pages to the cgroup being written back.
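In other words, the queueing loop gains a per-inode filter along these
lines (sketch; the exact code is in the fs/fs-writeback.c hunk below):

    if (work->for_cgroup &&
        !should_writeback_mem_cgroup_inode(inode, work->memcg_id,
                                           work->shared_inodes))
            continue;       /* no dirty pages for the memcg being written */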
After writing some pages, wb_writeback() will call
mem_cgroup_writeback_done() to update the set of over-bg-limit memcgs.
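The call is made only when forward progress was made, roughly (sketch of
the ordering in wb_writeback(); the surrounding loop is elided):

    wb_update_bandwidth(wb, wb_start);

    if (progress)
            mem_cgroup_writeback_done();    /* refresh the over-limit set */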
This change also makes wakeup_flusher_threads() memcg-aware so that
per-cgroup try_to_free_pages() can operate more efficiently without
having to write pages of foreign containers. It adds a
mem_cgroup parameter to wakeup_flusher_threads() to allow callers,
especially try_to_free_pages() and foreground writeback from
balance_dirty_pages(), to specify a particular cgroup to write inodes
from.
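With the new signature a caller can either keep the old behavior or
target a single memcg, for example (illustrative calls; the real call
sites are in the diff below):

    /* write back dirty pages from any cgroup, as before */
    wakeup_flusher_threads(nr_pages, NULL);

    /* write back only dirty pages charged to 'memcg' */
    wakeup_flusher_threads(nr_pages, memcg);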
Signed-off-by: Greg Thelen <gthelen@...gle.com>
---
Changelog since v8:
- Added optional memcg parameter to __bdi_start_writeback(),
bdi_start_writeback(), wakeup_flusher_threads(), writeback_inodes_wb().
- move_expired_inodes() now uses the passed-in struct wb_writeback_work instead
  of struct writeback_control.
- Added comments to over_bground_thresh().
fs/buffer.c | 2 +-
fs/fs-writeback.c | 96 +++++++++++++++++++++++++++++++++-----------
fs/sync.c | 2 +-
include/linux/writeback.h | 6 ++-
mm/backing-dev.c | 3 +-
mm/page-writeback.c | 3 +-
mm/vmscan.c | 3 +-
7 files changed, 84 insertions(+), 31 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index dd0220b..da1fb23 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -293,7 +293,7 @@ static void free_more_memory(void)
struct zone *zone;
int nid;
- wakeup_flusher_threads(1024);
+ wakeup_flusher_threads(1024, NULL);
yield();
for_each_online_node(nid) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e91fb82..ba55336 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -38,10 +38,14 @@ struct wb_writeback_work {
struct super_block *sb;
unsigned long *older_than_this;
enum writeback_sync_modes sync_mode;
+ unsigned short memcg_id; /* If non-zero, then writeback specified
+ * cgroup. */
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
+ unsigned int for_cgroup:1; /* cgroup writeback */
+ unsigned int shared_inodes:1; /* write inodes spanning cgroups */
struct list_head list; /* pending work list */
struct completion *done; /* set if the caller waits */
@@ -114,9 +118,12 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
spin_unlock_bh(&bdi->wb_lock);
}
+/*
+ * @memcg is optional. If set, then limit writeback to the specified cgroup.
+ */
static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- bool range_cyclic)
+ bool range_cyclic, struct mem_cgroup *memcg)
{
struct wb_writeback_work *work;
@@ -136,6 +143,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work->sync_mode = WB_SYNC_NONE;
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
+ work->memcg_id = memcg ? css_id(mem_cgroup_css(memcg)) : 0;
+ work->for_cgroup = memcg != NULL;
bdi_queue_work(bdi, work);
}
@@ -153,7 +162,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
*/
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
- __bdi_start_writeback(bdi, nr_pages, true);
+ __bdi_start_writeback(bdi, nr_pages, true, NULL);
}
/**
@@ -257,15 +266,20 @@ static int move_expired_inodes(struct list_head *delaying_queue,
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
- struct inode *inode;
+ struct inode *inode, *tmp_inode;
int do_sb_sort = 0;
int moved = 0;
- while (!list_empty(delaying_queue)) {
- inode = wb_inode(delaying_queue->prev);
+ list_for_each_entry_safe_reverse(inode, tmp_inode, delaying_queue,
+ i_wb_list) {
if (work->older_than_this &&
inode_dirtied_after(inode, *work->older_than_this))
break;
+ if (work->for_cgroup &&
+ !should_writeback_mem_cgroup_inode(inode,
+ work->memcg_id,
+ work->shared_inodes))
+ continue;
if (sb && sb != inode->i_sb)
do_sb_sort = 1;
sb = inode->i_sb;
@@ -643,31 +657,63 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
return wrote;
}
-long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
+/*
+ * @memcg is optional. If set, then limit writeback to the specified cgroup.
+ * If @shared_inodes is set then writeback inodes shared by several memcg.
+ */
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+ struct mem_cgroup *memcg, bool shared_inodes)
{
struct wb_writeback_work work = {
.nr_pages = nr_pages,
.sync_mode = WB_SYNC_NONE,
+ .memcg_id = memcg ? css_id(mem_cgroup_css(memcg)) : 0,
+ .for_cgroup = (memcg != NULL) || shared_inodes,
+ .shared_inodes = shared_inodes,
.range_cyclic = 1,
};
spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io))
- queue_io(wb, NULL);
+ queue_io(wb, &work);
__writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock);
return nr_pages - work.nr_pages;
}
-static inline bool over_bground_thresh(void)
+static inline bool over_bground_thresh(struct wb_writeback_work *work)
{
unsigned long background_thresh, dirty_thresh;
global_dirty_limits(&background_thresh, &dirty_thresh);
- return (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) > background_thresh);
+ if (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) > background_thresh) {
+ work->for_cgroup = 0;
+ return true;
+ }
+
+ /*
+ * System dirty memory is below system background limit. Check if any
+ * memcg are over memcg background limit.
+ */
+ if (mem_cgroups_over_bground_dirty_thresh()) {
+ work->for_cgroup = 1;
+
+ /*
+ * Set shared_inodes so that background flusher writes shared
+ * inodes in addition to inodes in over-limit memcg. Such
+ * shared inodes should be rarer than inodes written by a single
+ * memcg. Shared inodes limit the ability to map from memcg to
+ * inode in wakeup_flusher_threads() and writeback_inodes_wb().
+ * So the quicker such shared inodes are written, the better.
+ */
+ work->shared_inodes = 1;
+ return true;
+ }
+
+ return false;
}
/*
@@ -729,7 +775,7 @@ static long wb_writeback(struct bdi_writeback *wb,
* For background writeout, stop when we are below the
* background dirty threshold
*/
- if (work->for_background && !over_bground_thresh())
+ if (work->for_background && !over_bground_thresh(work))
break;
if (work->for_kupdate) {
@@ -749,6 +795,9 @@ static long wb_writeback(struct bdi_writeback *wb,
wb_update_bandwidth(wb, wb_start);
+ if (progress)
+ mem_cgroup_writeback_done();
+
/*
* Did we write something? Try for more
*
@@ -813,17 +862,15 @@ static unsigned long get_nr_dirty_pages(void)
static long wb_check_background_flush(struct bdi_writeback *wb)
{
- if (over_bground_thresh()) {
-
- struct wb_writeback_work work = {
- .nr_pages = LONG_MAX,
- .sync_mode = WB_SYNC_NONE,
- .for_background = 1,
- .range_cyclic = 1,
- };
+ struct wb_writeback_work work = {
+ .nr_pages = LONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .for_background = 1,
+ .range_cyclic = 1,
+ };
+ if (over_bground_thresh(&work))
return wb_writeback(wb, &work);
- }
return 0;
}
@@ -968,10 +1015,11 @@ int bdi_writeback_thread(void *data)
/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world.
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back the
+ * whole world. If 'memcg' is non-NULL, then limit attempt to only write pages
+ * from the specified cgroup.
*/
-void wakeup_flusher_threads(long nr_pages)
+void wakeup_flusher_threads(long nr_pages, struct mem_cgroup *memcg)
{
struct backing_dev_info *bdi;
@@ -984,7 +1032,7 @@ void wakeup_flusher_threads(long nr_pages)
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
if (!bdi_has_dirty_io(bdi))
continue;
- __bdi_start_writeback(bdi, nr_pages, false);
+ __bdi_start_writeback(bdi, nr_pages, false, memcg);
}
rcu_read_unlock();
}
diff --git a/fs/sync.c b/fs/sync.c
index c98a747..7c1ba55 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait)
*/
SYSCALL_DEFINE0(sync)
{
- wakeup_flusher_threads(0);
+ wakeup_flusher_threads(0, NULL);
sync_filesystems(0);
sync_filesystems(1);
if (unlikely(laptop_mode))
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d12d070..e6790e8 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -40,6 +40,7 @@
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
struct backing_dev_info;
+struct mem_cgroup;
/*
* fs/fs-writeback.c
@@ -85,9 +86,10 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
int writeback_inodes_sb_if_idle(struct super_block *);
int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
void sync_inodes_sb(struct super_block *);
-long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages);
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+ struct mem_cgroup *memcg, bool shared_inodes);
long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
-void wakeup_flusher_threads(long nr_pages);
+void wakeup_flusher_threads(long nr_pages, struct mem_cgroup *memcg);
/* writeback.h requires fs.h; it, too, is not included from here. */
static inline void wait_on_inode(struct inode *inode)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d6edf8d..60d101d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -456,7 +456,8 @@ static int bdi_forker_thread(void *ptr)
* the bdi from the thread. Hopefully 1024 is
* large enough for efficient IO.
*/
- writeback_inodes_wb(&bdi->wb, 1024);
+ writeback_inodes_wb(&bdi->wb, 1024, NULL,
+ false);
} else {
/*
* The spinlock makes sure we do not lose
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 12b3900..64de98c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -736,7 +736,8 @@ static void balance_dirty_pages(struct address_space *mapping,
trace_balance_dirty_start(bdi);
if (bdi_nr_reclaimable > task_bdi_thresh) {
pages_written += writeback_inodes_wb(&bdi->wb,
- write_chunk);
+ write_chunk,
+ NULL, false);
trace_balance_dirty_written(bdi, pages_written);
if (pages_written >= write_chunk)
break; /* We've done our duty */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3153729..fb0ae99 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2223,7 +2223,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {
- wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+ wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+ sc->mem_cgroup);
sc->may_writepage = 1;
}
--
1.7.3.1