Message-Id: <20090515190027.e7d48d7a.kamezawa.hiroyu@jp.fujitsu.com>
Date:	Fri, 15 May 2009 19:00:27 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	"linux-mm@...ck.org" <linux-mm@...ck.org>
Cc:	"nishimura@....nes.nec.co.jp" <nishimura@....nes.nec.co.jp>,
	"balbir@...ux.vnet.ibm.com" <balbir@...ux.vnet.ibm.com>,
	"hugh@...itas.com" <hugh@...itas.com>, hannes@...xchg.org,
	"mingo@...e.hu" <mingo@...e.hu>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	"akpm@...ux-foundation.org" <akpm@...ux-foundation.org>
Subject: [PATCH] memcg: handle accounting race in swapin-readahead and
 zap_pte

Similar to the previous series, but this version is a bit clearer, I think.
==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>

When a process exits, zap_pte() is called, and free_swap_and_cache() is
called to free the swp_entry. But free_swap_and_cache() only uses trylock(),
so entries may not actually be freed. (Later, the global LRU will handle them.)


           processA                   |           processB
  -------------------------------------+-------------------------------------
    (free_swap_and_cache())            |  (read_swap_cache_async())
                                       |    swap_duplicate()
                                       |    __set_page_locked()
                                       |    add_to_swap_cache()
      swap_entry_free() == 0           |
      find_get_page() -> found         |
      try_lock_page() -> fail & return |
                                       |    lru_cache_add_anon()
                                       |      doesn't link this page to memcg's
                                       |      LRU, because of !PageCgroupUsed.

When using memcg, the above path is terrible because the swapcache that was
not freed here will never be freed until the global LRU runs. This can leak
swap entries and cause OOM (as Nishimura reported).

To fix this, one easy way is to disallow swapin-readahead. But that causes an
unpleasant performance penalty in cases where swapin-readahead would have hit.

This patch tries to fix the above race by adding a private LRU, the
swapin-buffer. It works as follows:
 1. add the swap-cache page to the swapin-buffer at readahead()
 2. re-check the SwapCache pages in the swapin-buffer in delayed work.
 3. finally, pages in the swapin-buffer are moved to the INACTIVE_ANON list.

This patch uses delayed_work and moves pages from the buffer to the anon LRU
in numbers proportional to the number of pages in the swapin-buffer.
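
For concreteness, a rough userspace illustration (assuming HZ=1000, so HZ/10
is about 100ms) of how the nr = total/8 + 1 rule used below empties buffers
of various sizes, under the assumption that no new pages arrive and every
page passes the check:

#include <stdio.h>

int main(void)
{
	int sizes[] = { 8, 128, 1024 };	/* example swapin-buffer sizes */

	for (int i = 0; i < 3; i++) {
		int nr = sizes[i], passes = 0;

		while (nr > 0) {
			nr -= nr / 8 + 1;	/* one delayed-work pass */
			passes++;
		}
		/* each pass is rescheduled after HZ/10 (~100ms at HZ=1000) */
		printf("%4d buffered pages drained in %2d passes (~%d ms)\n",
		       sizes[i], passes, passes * 100);
	}
	return 0;
}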


Changelog:
 - redesigned again.
 - The main difference from previous trials is that PG_lru is not set until
   we confirm the entry; this avoids races and contention on the zone's LRU.
 - the number of calls to schedule_work() is reduced.
 - access to zone->lru is batched.
 - races with writeback are not handled here (handled by another patch)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
---
 include/linux/swap.h |    8 +++
 mm/memcontrol.c      |  120 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/swap_state.c      |   10 +++-
 3 files changed, 136 insertions(+), 2 deletions(-)

Index: mmotm-2.6.30-May13/mm/memcontrol.c
===================================================================
--- mmotm-2.6.30-May13.orig/mm/memcontrol.c	2009-05-15 17:44:14.000000000 +0900
+++ mmotm-2.6.30-May13/mm/memcontrol.c	2009-05-15 18:46:35.000000000 +0900
@@ -834,6 +834,123 @@
 	return num;
 }
 
+#ifdef CONFIG_SWAP
+
+struct swapin_buffer {
+	spinlock_t		lock;
+	struct list_head	list;
+	int			nr;
+	struct delayed_work	work;
+} memcg_swapin_buffer;
+
+/* Used at swapoff */
+#define ENOUGH_LARGE_SWAPIN_BUFFER	(1024)
+
+/* Hide swapin page from LRU for a while */
+
+static int __check_swapin_buffer(struct page *page)
+{
+	/* Fast path (PG_writeback is never set here.) */
+	if (!PageSwapCache(page) || page_mapped(page))
+		return 1;
+
+	if (PageUptodate(page) && trylock_page(page)) {
+		try_to_free_swap(page);
+		unlock_page(page);
+		return 1;
+	}
+	return 0;
+}
+
+static void mem_cgroup_drain_swapin_buffer(struct work_struct *work)
+{
+	struct page *page, *tmp;
+	LIST_HEAD(scan);
+	int nr, fail;
+
+	if (!memcg_swapin_buffer.nr)
+		return;
+
+	/*
+	 * When the swapin_buffer grows rapidly, swapped-in pages tend to be
+	 * in use: the page-faulting thread must keep running to generate
+	 * that much swapin, so swapin-readahead should be _hitting_ when nr
+	 * is large and __check_swapin_buffer() takes its fast path.
+	 * So, make _nr_ proportional to the total size.
+	 */
+	nr = memcg_swapin_buffer.nr/8 + 1;
+
+	spin_lock(&memcg_swapin_buffer.lock);
+	while (nr-- && !list_empty(&memcg_swapin_buffer.list)) {
+		list_move(memcg_swapin_buffer.list.next, &scan);
+		memcg_swapin_buffer.nr--;
+	}
+	spin_unlock(&memcg_swapin_buffer.lock);
+
+	fail = 0;
+	list_for_each_entry_safe(page, tmp, &scan, lru) {
+		if (__check_swapin_buffer(page)) {
+			list_del(&page->lru);
+			lru_cache_add_anon(page);
+			put_page(page);
+		} else
+			fail++;
+	}
+	if (!list_empty(&scan)) {
+		spin_lock(&memcg_swapin_buffer.lock);
+		list_splice_tail(&scan, &memcg_swapin_buffer.list);
+		memcg_swapin_buffer.nr += fail;
+		spin_unlock(&memcg_swapin_buffer.lock);
+	}
+
+	if (memcg_swapin_buffer.nr)
+		schedule_delayed_work(&memcg_swapin_buffer.work, HZ/10);
+}
+
+static void mem_cgroup_force_drain_swapin_buffer(void)
+{
+	int swapin_buffer_thresh;
+
+	swapin_buffer_thresh = (num_online_cpus() + 1) * (1 << page_cluster);
+	if (memcg_swapin_buffer.nr > swapin_buffer_thresh)
+		mem_cgroup_drain_swapin_buffer(NULL);
+}
+
+void mem_cgroup_lazy_drain_swapin_buffer(void)
+{
+	schedule_delayed_work(&memcg_swapin_buffer.work, HZ/10);
+}
+
+void mem_cgroup_add_swapin_buffer(struct page *page)
+{
+	get_page(page);
+	spin_lock(&memcg_swapin_buffer.lock);
+	list_add_tail(&page->lru, &memcg_swapin_buffer.list);
+	memcg_swapin_buffer.nr++;
+	spin_unlock(&memcg_swapin_buffer.lock);
+	/*
+	 * Usually, this will not trigger. At swapoff, we have to
+	 * drain the entries manually.
+	 */
+	if (memcg_swapin_buffer.nr > ENOUGH_LARGE_SWAPIN_BUFFER)
+		mem_cgroup_drain_swapin_buffer(NULL);
+}
+
+static __init int init_swapin_buffer(void)
+{
+	spin_lock_init(&memcg_swapin_buffer.lock);
+	INIT_LIST_HEAD(&memcg_swapin_buffer.list);
+	INIT_DELAYED_WORK(&memcg_swapin_buffer.work,
+			mem_cgroup_drain_swapin_buffer);
+	return 0;
+}
+late_initcall(init_swapin_buffer);
+#else
+static void mem_cgroup_force_drain_swapin_buffer(void)
+{
+}
+#endif /* CONFIG_SWAP */
+
 /*
  * Visit the first child (need not be the first child as per the ordering
  * of the cgroup list, since we track last_scanned_child) of @mem and use
@@ -892,6 +1009,8 @@
 	int ret, total = 0;
 	int loop = 0;
 
+	mem_cgroup_force_drain_swapin_buffer();
+
 	while (loop < 2) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem)
@@ -1560,6 +1679,7 @@
 }
 
 #ifdef CONFIG_SWAP
+
 /*
  * called after __delete_from_swap_cache() and drop "page" account.
  * memcg information is recorded to swap_cgroup of "ent"
Index: mmotm-2.6.30-May13/include/linux/swap.h
===================================================================
--- mmotm-2.6.30-May13.orig/include/linux/swap.h	2009-05-15 17:44:14.000000000 +0900
+++ mmotm-2.6.30-May13/include/linux/swap.h	2009-05-15 18:01:43.000000000 +0900
@@ -336,11 +336,19 @@
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent);
+extern void mem_cgroup_add_swapin_buffer(struct page *page);
+extern void mem_cgroup_lazy_drain_swapin_buffer(void);
 #else
 static inline void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
 {
 }
+static inline void mem_cgroup_add_swapin_buffer(struct page *page)
+{
+}
+static inline void  mem_cgroup_lazy_drain_swapin_buffer(void)
+{
+}
 #endif
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 extern void mem_cgroup_uncharge_swap(swp_entry_t ent);
Index: mmotm-2.6.30-May13/mm/swap_state.c
===================================================================
--- mmotm-2.6.30-May13.orig/mm/swap_state.c	2009-05-15 17:44:14.000000000 +0900
+++ mmotm-2.6.30-May13/mm/swap_state.c	2009-05-15 18:01:43.000000000 +0900
@@ -311,7 +311,10 @@
 			/*
 			 * Initiate read into locked page and return.
 			 */
-			lru_cache_add_anon(new_page);
+			if (mem_cgroup_disabled())
+				lru_cache_add_anon(new_page);
+			else
+				mem_cgroup_add_swapin_buffer(new_page);
 			swap_readpage(NULL, new_page);
 			return new_page;
 		}
@@ -368,6 +371,9 @@
 			break;
 		page_cache_release(page);
 	}
-	lru_add_drain();	/* Push any new pages onto the LRU now */
+	if (mem_cgroup_disabled())
+		lru_add_drain();/* Push any new pages onto the LRU now */
+	else
+		mem_cgroup_lazy_drain_swapin_buffer();
 	return read_swap_cache_async(entry, gfp_mask, vma, addr);
 }
