lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 15 Jan 2015 21:49:14 +0300
From:	Konstantin Khebnikov <khlebnikov@...dex-team.ru>
To:	linux-mm@...ck.org, cgroups@...r.kernel.org
Cc:	Roman Gushchin <klamm@...dex-team.ru>, Jan Kara <jack@...e.cz>,
	Dave Chinner <david@...morbit.com>,
	linux-kernel@...r.kernel.org, Tejun Heo <tj@...nel.org>,
	linux-fsdevel@...r.kernel.org, koct9i@...il.com
Subject: [PATCH 3/6] memcg: track shared inodes with dirty pages

From: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>

Inode is owned only by one memory cgroup, but if it's shared it might
contain pages from multiple cgroups. This patch detects this situation
in memory reclaimer and marks dirty inode with flag I_DIRTY_SHARED
which is cleared only when data is completely written. Memcg writeback
always writes such inodes.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
---
 fs/fs-writeback.c          |    4 ++--
 include/linux/fs.h         |    3 +++
 include/linux/memcontrol.h |    4 ++++
 mm/memcontrol.c            |   20 ++++++++++++++++++++
 mm/vmscan.c                |    4 ++++
 5 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9034768..fda6a64 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -484,7 +484,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 */
 	spin_lock(&inode->i_lock);
 
-	dirty = inode->i_state & I_DIRTY;
+	dirty = inode->i_state & (I_DIRTY | I_DIRTY_SHARED);
 	inode->i_state &= ~I_DIRTY;
 
 	/*
@@ -501,7 +501,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	smp_mb();
 
 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		inode->i_state |= I_DIRTY_PAGES;
+		inode->i_state |= I_DIRTY_PAGES | (dirty & I_DIRTY_SHARED);
 
 	spin_unlock(&inode->i_lock);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ee2e3c0..303f0ad 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1741,6 +1741,8 @@ struct super_operations {
  *
  * I_DIO_WAKEUP		Never set.  Only used as a key for wait_on_bit().
  *
+ * I_DIRTY_SHARED	Dirty pages belong to multiple memory cgroups.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -1757,6 +1759,7 @@ struct super_operations {
 #define __I_DIO_WAKEUP		9
 #define I_DIO_WAKEUP		(1 << I_DIO_WAKEUP)
 #define I_LINKABLE		(1 << 10)
+#define I_DIRTY_SHARED		(1 << 11)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ae05563..3f89e9b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -181,6 +181,8 @@ void mem_cgroup_forget_mapping(struct address_space *mapping);
 bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long *dirty,
 			     unsigned long *thresh, unsigned long *bg_thresh);
 bool mem_cgroup_dirty_exceeded(struct inode *inode);
+void mem_cgroup_poke_writeback(struct address_space *mapping,
+			       struct mem_cgroup *memcg);
 
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
@@ -358,6 +360,8 @@ static inline void mem_cgroup_forget_mapping(struct address_space *mapping) {}
 static inline bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long *dirty,
 			     unsigned long *thresh, unsigned long *bg_thresh) { return false; }
 static inline bool mem_cgroup_dirty_exceeded(struct inode *inode) { return false; }
+static inline void mem_cgroup_poke_writeback(struct address_space *mapping,
+					     struct mem_cgroup *memcg) { }
 
 #endif /* CONFIG_MEMCG */
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 17d966a3b..d9d345c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6064,6 +6064,9 @@ bool mem_cgroup_dirty_exceeded(struct inode *inode)
 	if (mapping->backing_dev_info->dirty_exceeded)
 		return true;
 
+	if (inode->i_state & I_DIRTY_SHARED)
+		return true;
+
 	rcu_read_lock();
 	memcg = rcu_dereference(mapping->i_memcg);
 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
@@ -6084,6 +6087,23 @@ bool mem_cgroup_dirty_exceeded(struct inode *inode)
 	return memcg != NULL;
 }
 
+/* Tag a dirty inode I_DIRTY_SHARED when over-limit @memcg is not its owner. */
+void mem_cgroup_poke_writeback(struct address_space *mapping,
+			       struct mem_cgroup *memcg)
+{
+	struct inode *inode = mapping->host;
+
+	if (rcu_access_pointer(mapping->i_memcg) == memcg ||
+	    !memcg->dirty_exceeded)
+		return;
+	if ((inode->i_state & (I_DIRTY_PAGES|I_DIRTY_SHARED)) != I_DIRTY_PAGES)
+		return;		/* lockless peek: no dirty pages, or already shared */
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & I_DIRTY_PAGES)	/* recheck under i_lock */
+		inode->i_state |= I_DIRTY_SHARED;
+	spin_unlock(&inode->i_lock);
+}
+
 /*
  * subsys_initcall() for memory controller.
  *
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ab2505c..75165fc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1013,6 +1013,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
 				SetPageReclaim(page);
 
+				if (!global_reclaim(sc))
+					mem_cgroup_poke_writeback(mapping,
+							sc->target_mem_cgroup);
+
 				goto keep_locked;
 			}
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ