lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 26 May 2011 14:15:29 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
Cc:	"linux-mm@...ck.org" <linux-mm@...ck.org>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	"akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
	Ying Han <yinghan@...gle.com>,
	"nishimura@....nes.nec.co.jp" <nishimura@....nes.nec.co.jp>,
	"balbir@...ux.vnet.ibm.com" <balbir@...ux.vnet.ibm.com>
Subject: [RFC][PATCH v3 1/10] check reclaimable in hierarchy walk


I may post this patch as stand alone, later.
==
Check memcg has reclaimable pages at select_victim().

Now, with help of bitmap as memcg->scan_node, we can check whether memcg has
reclaimable pages with easy test of node_empty(&mem->scan_nodes).

mem->scan_nodes is a bitmap to show whether memcg contains reclaimable
memory or not, which is updated periodically.

This patch makes use of scan_nodes and modify hierarchy walk at memory
shrinking in following way.

  - check scan_nodes in mem_cgroup_select_victim()
  - mem_cgroup_select_victim() returns NULL if no memcg is reclaimable.
  - force update of scan_nodes.
  - rename mem_cgroup_select_victim() to be mem_cgroup_select_get_victim()
    to show refcnt is +1.

This will make hierarchy walk better.

And this allows to remove mem_cgroup_local_pages() check which was used for
the same purpose. But this function was wrong because it cannot handle
information of unevictable pages and tmpfs v.s. swapless information.

Changelog:
 - added since v3.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
---
 mm/memcontrol.c |  165 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 110 insertions(+), 55 deletions(-)

Index: memcg_async/mm/memcontrol.c
===================================================================
--- memcg_async.orig/mm/memcontrol.c
+++ memcg_async/mm/memcontrol.c
@@ -584,15 +584,6 @@ static long mem_cgroup_read_stat(struct 
 	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
@@ -1555,43 +1546,6 @@ u64 mem_cgroup_get_limit(struct mem_cgro
 	return min(limit, memsw);
 }
 
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
-static struct mem_cgroup *
-mem_cgroup_select_victim(struct mem_cgroup *root_mem)
-{
-	struct mem_cgroup *ret = NULL;
-	struct cgroup_subsys_state *css;
-	int nextid, found;
-
-	if (!root_mem->use_hierarchy) {
-		css_get(&root_mem->css);
-		ret = root_mem;
-	}
-
-	while (!ret) {
-		rcu_read_lock();
-		nextid = root_mem->last_scanned_child + 1;
-		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
-				   &found);
-		if (css && css_tryget(css))
-			ret = container_of(css, struct mem_cgroup, css);
-
-		rcu_read_unlock();
-		/* Updates scanning parameter */
-		if (!css) {
-			/* this means start scan from ID:1 */
-			root_mem->last_scanned_child = 0;
-		} else
-			root_mem->last_scanned_child = found;
-	}
-
-	return ret;
-}
-
 #if MAX_NUMNODES > 1
 
 /*
@@ -1600,11 +1554,11 @@ mem_cgroup_select_victim(struct mem_cgro
  * nodes based on the zonelist. So update the list loosely once per 10 secs.
  *
  */
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
 {
 	int nid;
 
-	if (time_after(mem->next_scan_node_update, jiffies))
+	if (!force && time_after(mem->next_scan_node_update, jiffies))
 		return;
 
 	mem->next_scan_node_update = jiffies + 10*HZ;
@@ -1641,7 +1595,7 @@ int mem_cgroup_select_victim_node(struct
 {
 	int node;
 
-	mem_cgroup_may_update_nodemask(mem);
+	mem_cgroup_may_update_nodemask(mem, false);
 	node = mem->last_scanned_node;
 
 	node = next_node(node, mem->scan_nodes);
@@ -1660,13 +1614,117 @@ int mem_cgroup_select_victim_node(struct
 	return node;
 }
 
+/**
+ * mem_cgroup_has_reclaimable
+ * @mem_cgroup : the mem_cgroup
+ *
+ * The caller can test whether the memcg has reclaimable pages.
+ *
+ * This function checks memcg has reclaimable pages or not with bitmap of
+ * memcg->scan_nodes. This bitmap is updated periodically and indicates
+ * which node has reclaimable memcg memory or not.
+ * Although this is a rough test and result is not very precise but we don't
+ * have to scan all nodes and don't have to use locks.
+ *
+ * For non-NUMA, this cheks reclaimable pages on zones because we don't
+ * update scan_nodes.(see below)
+ */
+static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg)
+{
+	return !nodes_empty(memcg->scan_nodes);
+}
+
 #else
+
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
+{
+}
+
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
 	return 0;
 }
+
+static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg)
+{
+	unsigned long nr;
+	int zid;
+
+	for (zid = NODE_DATA(0)->nr_zones - 1; zid >= 0; zid--)
+		if (mem_cgroup_zone_reclaimable_pages(memcg, 0, zid))
+			break;
+	if (zid < 0)
+		return false;
+	return true;
+}
 #endif
 
+/**
+ * mem_cgroup_select_get_victim
+ * @root_mem: the root memcg of hierarchy which should be shrinked.
+ *
+ * Visit children of root_mem ony by one. If the routine finds a memcg
+ * which contains reclaimable pages, returns it with refcnt +1. The
+ * scan is done in round-robin and 'the next start point' is saved into
+ * mem->last_scanned_child. If no reclaimable memcg are found, returns NULL.
+ */
+static struct mem_cgroup *
+mem_cgroup_select_get_victim(struct mem_cgroup *root_mem)
+{
+	struct mem_cgroup *ret = NULL;
+	struct cgroup_subsys_state *css;
+	int nextid, found;
+	bool second_visit = false;
+
+	if (!root_mem->use_hierarchy)
+		goto return_root;
+
+	while (!ret) {
+		rcu_read_lock();
+		nextid = root_mem->last_scanned_child + 1;
+		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
+				   &found);
+		if (css && css_tryget(css))
+			ret = container_of(css, struct mem_cgroup, css);
+
+		rcu_read_unlock();
+		/* Updates scanning parameter */
+		if (!css) { /* Indicates we scanned the last node of tree */
+			/*
+			 * If all memcg has no reclaimable pages, we may enter
+			 * an infinite loop. Exit here if we reached the end
+			 * of hierarchy tree twice.
+			 */
+			if (second_visit)
+				return NULL;
+			/* this means start scan from ID:1 */
+			root_mem->last_scanned_child = 0;
+			second_visit = true;
+		} else
+			root_mem->last_scanned_child = found;
+		if (css && ret) {
+			/*
+ 			 * check memcg has reclaimable memory or not. Update
+ 			 * information carefully if we might fail with cached
+ 			 * bitmask information.
+ 			 */
+			if (second_visit)
+				mem_cgroup_may_update_nodemask(ret, true);
+
+			if (!mem_cgroup_has_reclaimable(ret)) {
+				css_put(css);
+				ret = NULL;
+			}
+		}
+	}
+
+	return ret;
+return_root:
+	css_get(&root_mem->css);
+	return root_mem;
+}
+
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1705,7 +1763,9 @@ static int mem_cgroup_hierarchical_recla
 		is_kswapd = true;
 
 	while (1) {
-		victim = mem_cgroup_select_victim(root_mem);
+		victim = mem_cgroup_select_get_victim(root_mem);
+		if (!victim)
+			return total;
 		if (victim == root_mem) {
 			loop++;
 			if (loop >= 1)
@@ -1733,11 +1793,6 @@ static int mem_cgroup_hierarchical_recla
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(victim)) {
-			/* this cgroup's local usage == 0 */
-			css_put(&victim->css);
-			continue;
-		}
 		/* we use swappiness of local cgroup */
 		if (check_soft) {
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ