linux-kernel - Re: [patch 01/10] mm: memcg: fix compaction/migration failing due to memcg limits

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20120712091043.GB3181@kernel>
Date:	Thu, 12 Jul 2012 17:10:43 +0800
From:	Wanpeng Li <liwp.linux@...il.com>
To:	Johannes Weiner <hannes@...xchg.org>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>,
	Michal Hocko <mhocko@...e.cz>
Cc:	Andrew Morton <akpm@...ux-foundation.org>,
	Hugh Dickins <hughd@...gle.com>,
	David Rientjes <rientjes@...gle.com>,
	Wanpeng Li <liwp.linux@...il.com>, linux-mm@...ck.org,
	cgroups@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [patch 01/10] mm: memcg: fix compaction/migration failing due to
 memcg limits

On Thu, Jul 12, 2012 at 04:54:07PM +0800, Wanpeng Li wrote:
>On Wed, Jul 11, 2012 at 07:02:13PM +0200, Johannes Weiner wrote:
>>Compaction (and page migration in general) can currently be hindered
>>through pages being owned by memory cgroups that are at their limits
>>and unreclaimable.
>>
>>The reason is that the replacement page is being charged against the
>>limit while the page being replaced is also still charged.  But this
>>seems unnecessary, given that only one of the two pages will still be
>>in use after migration finishes.
>>
>>This patch changes the memcg migration sequence so that the
>>replacement page is not charged.  Whatever page is still in use after
>>successful or failed migration gets to keep the charge of the page
>>that was going to be replaced.
>>
>>The replacement page will still show up temporarily in the rss/cache
>>statistics, this can be fixed in a later patch as it's less urgent.
>>
>
>So I want to know whether mem_cgroup_wait_acct_move still makes sense
>after this patch is merged. If the answer is no, I will send a patch to remove it.

And if it still makes sense, I want to change the check in
mem_cgroup_do_charge:

if (mem_cgroup_wait_acct_move(mem_over_limit))
	return CHARGE_RETRY;

=>

if (mem_cgroup_wait_acct_move(mem_over_limit) && 
                       mem_cgroup_margin(mem_over_limit) >= nr_pages)
	return CHARGE_RETRY;

The reason is that mem_cgroup_reclaim can reclaim some pages, but the
mem_cgroup_reclaim function has some exit conditions:

total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
if(total && (flag & MEM_CGROUP_RECLAIM_SHRINK))
	break;

and 

if (mem_cgroup_margin(memcg))
	break;

So mem_cgroup_reclaim may not reclaim enough pages (>= nr_pages); in that
case we should go to mem_cgroup_handle_oom instead of returning
CHARGE_RETRY.

Hopefully you can verify whether my idea makes sense.


>>Reported-by: David Rientjes <rientjes@...gle.com>
>>Signed-off-by: Johannes Weiner <hannes@...xchg.org>
>>Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
>>Acked-by: Michal Hocko <mhocko@...e.cz>
>>---
>> include/linux/memcontrol.h |   11 +++----
>> mm/memcontrol.c            |   67 +++++++++++++++++++++++--------------------
>> mm/migrate.c               |   27 ++++--------------
>> 3 files changed, 47 insertions(+), 58 deletions(-)
>>
>>diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>index 5a3ee64..8d9489f 100644
>>--- a/include/linux/memcontrol.h
>>+++ b/include/linux/memcontrol.h
>>@@ -98,9 +98,9 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
>> 
>> extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
>> 
>>-extern int
>>-mem_cgroup_prepare_migration(struct page *page,
>>-	struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask);
>>+extern void
>>+mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
>>+			     struct mem_cgroup **memcgp);
>> extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
>> 	struct page *oldpage, struct page *newpage, bool migration_ok);
>> 
>>@@ -276,11 +276,10 @@ static inline struct cgroup_subsys_state
>> 	return NULL;
>> }
>> 
>>-static inline int
>>+static inline void
>> mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
>>-	struct mem_cgroup **memcgp, gfp_t gfp_mask)
>>+			     struct mem_cgroup **memcgp)
>> {
>>-	return 0;
>> }
>> 
>> static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
>>diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>index e8ddc00..12ee2de 100644
>>--- a/mm/memcontrol.c
>>+++ b/mm/memcontrol.c
>>@@ -2977,7 +2977,8 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
>>  * uncharge if !page_mapped(page)
>>  */
>> static struct mem_cgroup *
>>-__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>>+__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
>>+			     bool end_migration)
>> {
>> 	struct mem_cgroup *memcg = NULL;
>> 	unsigned int nr_pages = 1;
>>@@ -3021,7 +3022,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>> 		/* fallthrough */
>> 	case MEM_CGROUP_CHARGE_TYPE_DROP:
>> 		/* See mem_cgroup_prepare_migration() */
>>-		if (page_mapped(page) || PageCgroupMigration(pc))
>>+		if (page_mapped(page))
>>+			goto unlock_out;
>>+		/*
>>+		 * Pages under migration may not be uncharged.  But
>>+		 * end_migration() /must/ be the one uncharging the
>>+		 * unused post-migration page and so it has to call
>>+		 * here with the migration bit still set.  See the
>>+		 * res_counter handling below.
>>+		 */
>>+		if (!end_migration && PageCgroupMigration(pc))
>> 			goto unlock_out;
>> 		break;
>> 	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
>>@@ -3055,7 +3065,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>> 		mem_cgroup_swap_statistics(memcg, true);
>> 		mem_cgroup_get(memcg);
>> 	}
>>-	if (!mem_cgroup_is_root(memcg))
>>+	/*
>>+	 * Migration does not charge the res_counter for the
>>+	 * replacement page, so leave it alone when phasing out the
>>+	 * page that is unused after the migration.
>>+	 */
>>+	if (!end_migration && !mem_cgroup_is_root(memcg))
>> 		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
>> 
>> 	return memcg;
>>@@ -3071,14 +3086,14 @@ void mem_cgroup_uncharge_page(struct page *page)
>> 	if (page_mapped(page))
>> 		return;
>> 	VM_BUG_ON(page->mapping && !PageAnon(page));
>>-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON);
>>+	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
>> }
>> 
>> void mem_cgroup_uncharge_cache_page(struct page *page)
>> {
>> 	VM_BUG_ON(page_mapped(page));
>> 	VM_BUG_ON(page->mapping);
>>-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
>>+	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
>> }
>> 
>> /*
>>@@ -3142,7 +3157,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
>> 	if (!swapout) /* this was a swap cache but the swap is unused ! */
>> 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
>> 
>>-	memcg = __mem_cgroup_uncharge_common(page, ctype);
>>+	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
>> 
>> 	/*
>> 	 * record memcg information,  if swapout && memcg != NULL,
>>@@ -3232,19 +3247,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
>>  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
>>  * page belongs to.
>>  */
>>-int mem_cgroup_prepare_migration(struct page *page,
>>-	struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
>>+void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
>>+				  struct mem_cgroup **memcgp)
>> {
>> 	struct mem_cgroup *memcg = NULL;
>> 	struct page_cgroup *pc;
>> 	enum charge_type ctype;
>>-	int ret = 0;
>> 
>> 	*memcgp = NULL;
>> 
>> 	VM_BUG_ON(PageTransHuge(page));
>> 	if (mem_cgroup_disabled())
>>-		return 0;
>>+		return;
>> 
>> 	pc = lookup_page_cgroup(page);
>> 	lock_page_cgroup(pc);
>>@@ -3289,24 +3303,9 @@ int mem_cgroup_prepare_migration(struct page *page,
>> 	 * we return here.
>> 	 */
>> 	if (!memcg)
>>-		return 0;
>>+		return;
>> 
>> 	*memcgp = memcg;
>>-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
>>-	css_put(&memcg->css);/* drop extra refcnt */
>>-	if (ret) {
>>-		if (PageAnon(page)) {
>>-			lock_page_cgroup(pc);
>>-			ClearPageCgroupMigration(pc);
>>-			unlock_page_cgroup(pc);
>>-			/*
>>-			 * The old page may be fully unmapped while we kept it.
>>-			 */
>>-			mem_cgroup_uncharge_page(page);
>>-		}
>>-		/* we'll need to revisit this error code (we have -EINTR) */
>>-		return -ENOMEM;
>>-	}
>> 	/*
>> 	 * We charge new page before it's used/mapped. So, even if unlock_page()
>> 	 * is called before end_migration, we can catch all events on this new
>>@@ -3319,8 +3318,12 @@ int mem_cgroup_prepare_migration(struct page *page,
>> 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
>> 	else
>> 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
>>+	/*
>>+	 * The page is committed to the memcg, but it's not actually
>>+	 * charged to the res_counter since we plan on replacing the
>>+	 * old one and only one page is going to be left afterwards.
>>+	 */
>> 	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
>>-	return ret;
>> }
>> 
>> /* remove redundant charge if migration failed*/
>>@@ -3342,6 +3345,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
>> 		used = newpage;
>> 		unused = oldpage;
>> 	}
>>+	anon = PageAnon(used);
>>+	__mem_cgroup_uncharge_common(unused,
>>+		anon ? MEM_CGROUP_CHARGE_TYPE_ANON
>>+		     : MEM_CGROUP_CHARGE_TYPE_CACHE,
>>+		true);
>>+	css_put(&memcg->css);
>> 	/*
>> 	 * We disallowed uncharge of pages under migration because mapcount
>> 	 * of the page goes down to zero, temporarly.
>>@@ -3351,10 +3360,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
>> 	lock_page_cgroup(pc);
>> 	ClearPageCgroupMigration(pc);
>> 	unlock_page_cgroup(pc);
>>-	anon = PageAnon(used);
>>-	__mem_cgroup_uncharge_common(unused,
>>-		anon ? MEM_CGROUP_CHARGE_TYPE_ANON
>>-		     : MEM_CGROUP_CHARGE_TYPE_CACHE);
>> 
>> 	/*
>> 	 * If a page is a file cache, radix-tree replacement is very atomic
>>diff --git a/mm/migrate.c b/mm/migrate.c
>>index 8137aea..aa06bf4 100644
>>--- a/mm/migrate.c
>>+++ b/mm/migrate.c
>>@@ -687,7 +687,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> {
>> 	int rc = -EAGAIN;
>> 	int remap_swapcache = 1;
>>-	int charge = 0;
>> 	struct mem_cgroup *mem;
>> 	struct anon_vma *anon_vma = NULL;
>> 
>>@@ -729,12 +728,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> 	}
>> 
>> 	/* charge against new page */
>>-	charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
>>-	if (charge == -ENOMEM) {
>>-		rc = -ENOMEM;
>>-		goto unlock;
>>-	}
>>-	BUG_ON(charge);
>>+	mem_cgroup_prepare_migration(page, newpage, &mem);
>> 
>> 	if (PageWriteback(page)) {
>> 		/*
>>@@ -824,8 +818,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>> 		put_anon_vma(anon_vma);
>> 
>> uncharge:
>>-	if (!charge)
>>-		mem_cgroup_end_migration(mem, page, newpage, rc == 0);
>>+	mem_cgroup_end_migration(mem, page, newpage, rc == 0);
>> unlock:
>> 	unlock_page(page);
>> out:
>>@@ -1519,10 +1512,9 @@ migrate_misplaced_page(struct page *page, struct mm_struct *mm, int node)
>> {
>> 	struct page *oldpage = page, *newpage;
>> 	struct address_space *mapping = page_mapping(page);
>>-	struct mem_cgroup *mcg;
>>+	struct mem_cgroup *memcg;
>> 	unsigned int gfp;
>> 	int rc = 0;
>>-	int charge = -ENOMEM;
>> 
>> 	VM_BUG_ON(!PageLocked(page));
>> 	VM_BUG_ON(page_mapcount(page));
>>@@ -1556,12 +1548,7 @@ migrate_misplaced_page(struct page *page, struct mm_struct *mm, int node)
>> 	if (!trylock_page(newpage))
>> 		BUG();		/* new page should be unlocked!!! */
>> 
>>-	// XXX hnaz, is this right?
>>-	charge = mem_cgroup_prepare_migration(page, newpage, &mcg, gfp);
>>-	if (charge == -ENOMEM) {
>>-		rc = charge;
>>-		goto out;
>>-	}
>>+	mem_cgroup_prepare_migration(page, newpage, &memcg);
>> 
>> 	newpage->index = page->index;
>> 	newpage->mapping = page->mapping;
>>@@ -1581,11 +1568,9 @@ migrate_misplaced_page(struct page *page, struct mm_struct *mm, int node)
>> 		page = newpage;
>> 	}
>> 
>>+	mem_cgroup_end_migration(memcg, oldpage, newpage, !rc);
>> out:
>>-	if (!charge)
>>-		mem_cgroup_end_migration(mcg, oldpage, newpage, !rc);
>>-
>>-       if (oldpage != page)
>>+	if (oldpage != page)
>>                put_page(oldpage);
>> 
>> 	if (rc) {
>>-- 
>>1.7.7.6
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/