linux-kernel - Re: [PATCH V2 3/3] mm: Enable Buddy allocation isolation for CDM nodes

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <628673ca-15ee-c1be-ad53-3809f83722d7@linux.vnet.ibm.com>
Date:   Wed, 15 Feb 2017 09:57:19 +0530
From:   Anshuman Khandual <khandual@...ux.vnet.ibm.com>
To:     Anshuman Khandual <khandual@...ux.vnet.ibm.com>,
        Vlastimil Babka <vbabka@...e.cz>, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org
Cc:     mhocko@...e.com, mgorman@...e.de, minchan@...nel.org,
        aneesh.kumar@...ux.vnet.ibm.com, bsingharora@...il.com,
        srikar@...ux.vnet.ibm.com, haren@...ux.vnet.ibm.com,
        jglisse@...hat.com, dave.hansen@...el.com, dan.j.williams@...el.com
Subject: Re: [PATCH V2 3/3] mm: Enable Buddy allocation isolation for CDM
 nodes

On 02/14/2017 03:44 PM, Anshuman Khandual wrote:
> On 02/14/2017 01:58 PM, Vlastimil Babka wrote:
>> On 02/10/2017 11:06 AM, Anshuman Khandual wrote:
>>> This implements allocation isolation for CDM nodes in buddy allocator by
>>> discarding CDM memory zones all the time except in the cases where the gfp
>>> flag has got __GFP_THISNODE or the nodemask contains CDM nodes in cases
>>> where it is non NULL (explicit allocation request in the kernel or user
>>> process MPOL_BIND policy based requests).
>>>
>>> Signed-off-by: Anshuman Khandual <khandual@...ux.vnet.ibm.com>
>>> ---
>>>  mm/page_alloc.c | 16 ++++++++++++++++
>>>  1 file changed, 16 insertions(+)
>>>
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 84d61bb..392c24a 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -64,6 +64,7 @@
>>>  #include <linux/page_owner.h>
>>>  #include <linux/kthread.h>
>>>  #include <linux/memcontrol.h>
>>> +#include <linux/node.h>
>>>  
>>>  #include <asm/sections.h>
>>>  #include <asm/tlbflush.h>
>>> @@ -2908,6 +2909,21 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>>>  		struct page *page;
>>>  		unsigned long mark;
>>>  
>>> +		/*
>>> +		 * CDM nodes get skipped if the requested gfp flag
>>> +		 * does not have __GFP_THISNODE set or the nodemask
>>> +		 * does not have any CDM nodes in case the nodemask
>>> +		 * is non NULL (explicit allocation requests from
>>> +		 * kernel or user process MPOL_BIND policy which has
>>> +		 * CDM nodes).
>>> +		 */
>>> +		if (is_cdm_node(zone->zone_pgdat->node_id)) {
>>> +			if (!(gfp_mask & __GFP_THISNODE)) {
>>> +				if (!ac->nodemask)
>>> +					continue;
>>> +			}
>>> +		}
>>
>> With the current cpuset implementation, this will have a subtle corner
>> case when allocating from a cpuset that allows the cdm node, and there
>> is no (task or vma) mempolicy applied for the allocation. In the fast
>> path (__alloc_pages_nodemask()) we'll set ac->nodemask to
>> current->mems_allowed, so your code will wrongly assume that this
>> ac->nodemask is a policy that allows the CDM node. Probably not what you
>> want?
> 
> You are right, it's a problem and not what we want. We can make the
> function get_page_from_freelist() take another parameter "orig_nodemask"
> which gets passed into __alloc_pages_nodemask() in the first place. So
> inside zonelist iterator we can compare orig_nodemask with current
> ac.nodemask to figure out if cpuset swapping of nodemask happened and
> skip CDM node if necessary. That's a viable solution IMHO.

Hello Vlastimil,

As I mentioned yesterday, this solution works. I have tested it and verified
that no allocation leaks into CDM even after cpuset_enabled() is turned ON by
changing the /sys/fs/cgroup/cpuset/ setup. The major part of the change is just
adding an additional parameter to the function get_page_from_freelist() and
updating all its call sites.

- Anshuman

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 392c24a..9f41e0f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2894,11 +2894,15 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  */
 static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
-						const struct alloc_context *ac)
+			const struct alloc_context *ac, nodemask_t *orig_mask)
 {
 	struct zoneref *z = ac->preferred_zoneref;
 	struct zone *zone;
 	struct pglist_data *last_pgdat_dirty_limit = NULL;
+	bool cpuset_fallback;
+
+	if (ac->nodemask != orig_mask)
+		cpuset_fallback = true;
 
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
@@ -2919,6 +2923,9 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 		 */
 		if (is_cdm_node(zone->zone_pgdat->node_id)) {
 			if (!(gfp_mask & __GFP_THISNODE)) {
+				if (cpuset_fallback)
+					continue;
+
 				if (!ac->nodemask)
 					continue;
 			}
@@ -3066,7 +3073,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 
 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
-	const struct alloc_context *ac, unsigned long *did_some_progress)
+	const struct alloc_context *ac, unsigned long *did_some_progress, nodemask_t *orig_mask)
 {
 	struct oom_control oc = {
 		.zonelist = ac->zonelist,
@@ -3095,7 +3102,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	 * we're still under heavy pressure.
 	 */
 	page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
-					ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+					ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac, orig_mask);
 	if (page)
 		goto out;
 
@@ -3131,14 +3138,14 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 
 		if (gfp_mask & __GFP_NOFAIL) {
 			page = get_page_from_freelist(gfp_mask, order,
-					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
+					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac, orig_mask);
 			/*
 			 * fallback to ignore cpuset restriction if our nodes
 			 * are depleted
 			 */
 			if (!page)
 				page = get_page_from_freelist(gfp_mask, order,
-					ALLOC_NO_WATERMARKS, ac);
+					ALLOC_NO_WATERMARKS, ac, orig_mask);
 		}
 	}
 out:
@@ -3157,7 +3164,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		enum compact_priority prio, enum compact_result *compact_result)
+		enum compact_priority prio, enum compact_result *compact_result, nodemask_t *orig_mask)
 {
 	struct page *page;
 
@@ -3178,7 +3185,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	 */
 	count_vm_event(COMPACTSTALL);
 
-	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
 
 	if (page) {
 		struct zone *zone = page_zone(page);
@@ -3263,7 +3270,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		enum compact_priority prio, enum compact_result *compact_result)
+		enum compact_priority prio, enum compact_result *compact_result, nodemask_t *orig_mask)
 {
 	*compact_result = COMPACT_SKIPPED;
 	return NULL;
@@ -3330,7 +3337,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		unsigned long *did_some_progress)
+		unsigned long *did_some_progress, nodemask_t *orig_mask)
 {
 	struct page *page = NULL;
 	bool drained = false;
@@ -3340,7 +3347,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 		return NULL;
 
 retry:
-	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
 
 	/*
 	 * If an allocation failed after direct reclaim, it could be because
@@ -3533,7 +3540,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
-						struct alloc_context *ac)
+				struct alloc_context *ac, nodemask_t *orig_mask)
 {
 	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
 	struct page *page = NULL;
@@ -3597,7 +3604,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	 * The adjusted alloc_flags might result in immediate success, so try
 	 * that first
 	 */
-	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
 	if (page)
 		goto got_pg;
 
@@ -3612,7 +3619,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 		page = __alloc_pages_direct_compact(gfp_mask, order,
 						alloc_flags, ac,
 						INIT_COMPACT_PRIORITY,
-						&compact_result);
+						&compact_result, orig_mask);
 		if (page)
 			goto got_pg;
 
@@ -3661,7 +3668,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	}
 
 	/* Attempt with potentially adjusted zonelist and alloc_flags */
-	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
 	if (page)
 		goto got_pg;
 
@@ -3697,13 +3704,13 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
-							&did_some_progress);
+							&did_some_progress, orig_mask);
 	if (page)
 		goto got_pg;
 
 	/* Try direct compaction and then allocating */
 	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
-					compact_priority, &compact_result);
+					compact_priority, &compact_result, orig_mask);
 	if (page)
 		goto got_pg;
 
@@ -3750,7 +3757,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 		goto retry_cpuset;
 
 	/* Reclaim has failed us, start killing things */
-	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress, orig_mask);
 	if (page)
 		goto got_pg;
 
@@ -3842,7 +3849,7 @@ struct page *
 	}
 
 	/* First allocation attempt */
-	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac, nodemask);
 	if (likely(page))
 		goto out;
 
@@ -3861,7 +3868,7 @@ struct page *
 	if (unlikely(ac.nodemask != nodemask))
 		ac.nodemask = nodemask;
 
-	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+	page = __alloc_pages_slowpath(alloc_mask, order, &ac, nodemask);
 
 out:
 	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&