lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1431354940-30740-2-git-send-email-vbabka@suse.cz>
Date:	Mon, 11 May 2015 16:35:37 +0200
From:	Vlastimil Babka <vbabka@...e.cz>
To:	linux-mm@...ck.org
Cc:	linux-kernel@...r.kernel.org,
	Andrew Morton <akpm@...ux-foundation.org>,
	Hugh Dickins <hughd@...gle.com>,
	Andrea Arcangeli <aarcange@...hat.com>,
	"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
	Rik van Riel <riel@...hat.com>, Mel Gorman <mgorman@...e.de>,
	Michal Hocko <mhocko@...e.cz>,
	Alex Thorlton <athorlton@....com>,
	David Rientjes <rientjes@...gle.com>,
	Vlastimil Babka <vbabka@...e.cz>
Subject: [RFC 1/4] mm, thp: stop preallocating hugepages in khugepaged

Khugepaged tries to preallocate a hugepage before scanning for THP collapse
candidates. If the preallocation fails, scanning is not attempted. This makes
sense, but it is restricted to !NUMA configurations, where there is no
need to predict which node to preallocate on.

Besides the !NUMA restriction, the preallocated page may also end up being
unused and put back when no collapse candidate is found. I have observed the
thp_collapse_alloc vmstat counter to have 3+ times the value of the counter
of actually collapsed pages in /sys/.../khugepaged/pages_collapsed. On the
other hand, the periodic hugepage allocation attempts involving sync
compaction can be beneficial for the antifragmentation mechanism, but that
is harder to evaluate.

The following patch will introduce per-node THP availability tracking, which
has more benefits than current preallocation and is applicable to CONFIG_NUMA.
We can therefore remove the preallocation, which also allows a cleanup of the
functions involved in khugepaged allocations. Another small benefit of the
patch is that NUMA configs can now reuse an allocated hugepage for another
collapse attempt, if the previous one was for the same node and failed.

Signed-off-by: Vlastimil Babka <vbabka@...e.cz>
---
 mm/huge_memory.c | 150 ++++++++++++++++++++-----------------------------------
 1 file changed, 53 insertions(+), 97 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 078832c..565864b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -765,9 +765,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	return 0;
 }
 
-static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
 {
-	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT));
 }
 
 /* Caller must hold page table lock. */
@@ -825,7 +825,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		return 0;
 	}
-	gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+	gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma));
 	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
@@ -1116,7 +1116,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma));
 		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
@@ -2318,39 +2318,41 @@ static int khugepaged_find_target_node(void)
 	return target_node;
 }
 
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+static inline struct page *alloc_hugepage_node(gfp_t gfp, int node)
 {
-	if (IS_ERR(*hpage)) {
-		if (!*wait)
-			return false;
-
-		*wait = false;
-		*hpage = NULL;
-		khugepaged_alloc_sleep();
-	} else if (*hpage) {
-		put_page(*hpage);
-		*hpage = NULL;
-	}
-
-	return true;
+	gfp |= __GFP_THISNODE | __GFP_OTHER_NODE;
+	return alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
+}
+#else
+static int khugepaged_find_target_node(void)
+{
+	return 0;
 }
 
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
-		       struct vm_area_struct *vma, unsigned long address,
-		       int node)
+static inline struct page *alloc_hugepage_node(gfp_t gfp, int node)
 {
-	VM_BUG_ON_PAGE(*hpage, *hpage);
+	return alloc_pages(gfp, HPAGE_PMD_ORDER);
+}
+#endif
 
+static struct page
+*khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+{
 	/*
-	 * Before allocating the hugepage, release the mmap_sem read lock.
-	 * The allocation can take potentially a long time if it involves
-	 * sync compaction, and we do not need to hold the mmap_sem during
-	 * that. We will recheck the vma after taking it again in write mode.
+	 * If we allocated a hugepage previously and failed to collapse, reuse
+	 * the page, unless it's on different NUMA node.
 	 */
-	up_read(&mm->mmap_sem);
+	if (!IS_ERR_OR_NULL(*hpage)) {
+		if (IS_ENABLED(CONFIG_NUMA) && page_to_nid(*hpage) != node) {
+			put_page(*hpage);
+			*hpage = NULL;
+		} else {
+			return *hpage;
+		}
+	}
+
+	*hpage = alloc_hugepage_node(gfp, node);
 
-	*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -2360,60 +2362,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 	count_vm_event(THP_COLLAPSE_ALLOC);
 	return *hpage;
 }
-#else
-static int khugepaged_find_target_node(void)
-{
-	return 0;
-}
-
-static inline struct page *alloc_hugepage(int defrag)
-{
-	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-			   HPAGE_PMD_ORDER);
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
-	struct page *hpage;
-
-	do {
-		hpage = alloc_hugepage(khugepaged_defrag());
-		if (!hpage) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-			if (!*wait)
-				return NULL;
-
-			*wait = false;
-			khugepaged_alloc_sleep();
-		} else
-			count_vm_event(THP_COLLAPSE_ALLOC);
-	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
-
-	return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
-	if (!*hpage)
-		*hpage = khugepaged_alloc_hugepage(wait);
-
-	if (unlikely(!*hpage))
-		return false;
-
-	return true;
-}
-
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
-		       struct vm_area_struct *vma, unsigned long address,
-		       int node)
-{
-	up_read(&mm->mmap_sem);
-	VM_BUG_ON(!*hpage);
-
-	return  *hpage;
-}
-#endif
 
 static bool hugepage_vma_check(struct vm_area_struct *vma)
 {
@@ -2449,17 +2397,25 @@ static void collapse_huge_page(struct mm_struct *mm,
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
-	/* Only allocate from the target node */
-	gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
-		__GFP_THISNODE;
+	/* 
+	 * Determine the flags relevant for both hugepage allocation and memcg
+	 * charge. Hugepage allocation may still add __GFP_THISNODE and
+	 * __GFP_OTHER_NODE, which memcg ignores.
+	 */
+	gfp = alloc_hugepage_gfpmask(khugepaged_defrag());
 
-	/* release the mmap_sem read lock. */
-	new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
+	/*
+	 * Before allocating the hugepage, release the mmap_sem read lock.
+	 * The allocation can take potentially a long time if it involves
+	 * sync compaction, and we do not need to hold the mmap_sem during
+	 * that. We will recheck the vma after taking it again in write mode.
+	 */
+	up_read(&mm->mmap_sem);
+	new_page = khugepaged_alloc_page(hpage, gfp, node);
 	if (!new_page)
 		return;
 
-	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   gfp, &memcg)))
+	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg)))
 		return;
 
 	/*
@@ -2788,15 +2744,9 @@ static void khugepaged_do_scan(void)
 {
 	struct page *hpage = NULL;
 	unsigned int progress = 0, pass_through_head = 0;
-	unsigned int pages = khugepaged_pages_to_scan;
-	bool wait = true;
-
-	barrier(); /* write khugepaged_pages_to_scan to local stack */
+	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
 
 	while (progress < pages) {
-		if (!khugepaged_prealloc_page(&hpage, &wait))
-			break;
-
 		cond_resched();
 
 		if (unlikely(kthread_should_stop() || freezing(current)))
@@ -2812,6 +2762,12 @@ static void khugepaged_do_scan(void)
 		else
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);
+
+		/* THP allocation has failed during collapse */
+		if (IS_ERR(hpage)) {
+			khugepaged_alloc_sleep();
+			break;
+		}
 	}
 
 	if (!IS_ERR_OR_NULL(hpage))
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ