linux-kernel - [RFC 1/6] mm, thp: stop preallocating hugepages in khugepaged

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1424696322-21952-2-git-send-email-vbabka@suse.cz>
Date:	Mon, 23 Feb 2015 13:58:37 +0100
From:	Vlastimil Babka <vbabka@...e.cz>
To:	linux-mm@...ck.org
Cc:	linux-kernel@...r.kernel.org,
	Andrew Morton <akpm@...ux-foundation.org>,
	Hugh Dickins <hughd@...gle.com>,
	Andrea Arcangeli <aarcange@...hat.com>,
	"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
	Rik van Riel <riel@...hat.com>, Mel Gorman <mgorman@...e.de>,
	Michal Hocko <mhocko@...e.cz>,
	Ebru Akagunduz <ebru.akagunduz@...il.com>,
	Alex Thorlton <athorlton@....com>,
	David Rientjes <rientjes@...gle.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...nel.org>,
	Vlastimil Babka <vbabka@...e.cz>
Subject: [RFC 1/6] mm, thp: stop preallocating hugepages in khugepaged

Khugepaged tries to preallocate a hugepage before scanning for THP collapse
candidates. If the preallocation fails, scanning is not attempted. This makes
sense, but it is only restricted to !NUMA configurations, where it does not
need to predict on which node to preallocate.

Besides the !NUMA restriction, the preallocated page may also end up being
unused and put back when no collapse candidate is found. I have observed the
thp_collapse_alloc vmstat counter to have 3+ times the value of the counter
of actually collapsed pages in /sys/.../khugepaged/pages_collapsed. On the
other hand, the periodic hugepage allocation attempts involving sync
compaction can be beneficial for the antifragmentation mechanism, but that's
however harder to evaluate.

The following patch will introduce per-node THP availability tracking, which
has more benefits than current preallocation and is applicable to CONFIG_NUMA.
We can therefore remove the preallocation, which also allows a cleanup of the
functions involved in khugepaged allocations. Another small benefit of the
patch is that NUMA configs can now reuse an allocated hugepage for another
collapse attempt, if the previous one was for the same node and failed.

Signed-off-by: Vlastimil Babka <vbabka@...e.cz>
---
 mm/huge_memory.c | 136 +++++++++++++++++++------------------------------------
 1 file changed, 46 insertions(+), 90 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fc00c8c..44fecfc4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -756,9 +756,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	return 0;
 }
 
-static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
 {
-	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT));
 }
 
 /* Caller must hold page table lock. */
@@ -816,7 +816,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		return 0;
 	}
-	gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+	gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma));
 	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
@@ -1108,7 +1108,7 @@ alloc:
 	    !transparent_hugepage_debug_cow()) {
 		gfp_t gfp;
 
-		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma));
 		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
@@ -2289,40 +2289,44 @@ static int khugepaged_find_target_node(void)
 	return target_node;
 }
 
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+static inline struct page *alloc_hugepage_node(gfp_t gfp, int node)
 {
-	if (IS_ERR(*hpage)) {
-		if (!*wait)
-			return false;
-
-		*wait = false;
-		*hpage = NULL;
-		khugepaged_alloc_sleep();
-	} else if (*hpage) {
-		put_page(*hpage);
-		*hpage = NULL;
-	}
+	return alloc_pages_exact_node(node, gfp | __GFP_OTHER_NODE,
+							HPAGE_PMD_ORDER);
+}
+#else
+static int khugepaged_find_target_node(void)
+{
+	return 0;
+}
 
-	return true;
+static inline struct page *alloc_hugepage_node(gfp_t gfp, int node)
+{
+	return alloc_pages(gfp, HPAGE_PMD_ORDER);
 }
+#endif
 
 static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
-		       struct vm_area_struct *vma, unsigned long address,
-		       int node)
+*khugepaged_alloc_page(struct page **hpage, int node)
 {
-	VM_BUG_ON_PAGE(*hpage, *hpage);
+	gfp_t gfp;
 
 	/*
-	 * Before allocating the hugepage, release the mmap_sem read lock.
-	 * The allocation can take potentially a long time if it involves
-	 * sync compaction, and we do not need to hold the mmap_sem during
-	 * that. We will recheck the vma after taking it again in write mode.
+	 * If we allocated a hugepage previously and failed to collapse, reuse
+	 * the page, unless it's on different NUMA node.
 	 */
-	up_read(&mm->mmap_sem);
+	if (!IS_ERR_OR_NULL(*hpage)) {
+		if (IS_ENABLED(CONFIG_NUMA) && page_to_nid(*hpage) != node) {
+			put_page(*hpage);
+			*hpage = NULL;
+		} else {
+			return *hpage;
+		}
+	}
+
+	gfp = alloc_hugepage_gfpmask(khugepaged_defrag());
+	*hpage = alloc_hugepage_node(gfp, node);
 
-	*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
-		khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -2332,59 +2336,6 @@ static struct page
 	count_vm_event(THP_COLLAPSE_ALLOC);
 	return *hpage;
 }
-#else
-static int khugepaged_find_target_node(void)
-{
-	return 0;
-}
-
-static inline struct page *alloc_hugepage(int defrag)
-{
-	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-			   HPAGE_PMD_ORDER);
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
-	struct page *hpage;
-
-	do {
-		hpage = alloc_hugepage(khugepaged_defrag());
-		if (!hpage) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-			if (!*wait)
-				return NULL;
-
-			*wait = false;
-			khugepaged_alloc_sleep();
-		} else
-			count_vm_event(THP_COLLAPSE_ALLOC);
-	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
-
-	return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
-	if (!*hpage)
-		*hpage = khugepaged_alloc_hugepage(wait);
-
-	if (unlikely(!*hpage))
-		return false;
-
-	return true;
-}
-
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
-		       struct vm_area_struct *vma, unsigned long address,
-		       int node)
-{
-	up_read(&mm->mmap_sem);
-	VM_BUG_ON(!*hpage);
-	return  *hpage;
-}
-#endif
 
 static bool hugepage_vma_check(struct vm_area_struct *vma)
 {
@@ -2419,8 +2370,14 @@ static void collapse_huge_page(struct mm_struct *mm,
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
-	/* release the mmap_sem read lock. */
-	new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+	/*
+	 * Before allocating the hugepage, release the mmap_sem read lock.
+	 * The allocation can take potentially a long time if it involves
+	 * sync compaction, and we do not need to hold the mmap_sem during
+	 * that. We will recheck the vma after taking it again in write mode.
+	 */
+	up_read(&mm->mmap_sem);
+	new_page = khugepaged_alloc_page(hpage, node);
 	if (!new_page)
 		return;
 
@@ -2754,15 +2711,9 @@ static void khugepaged_do_scan(void)
 {
 	struct page *hpage = NULL;
 	unsigned int progress = 0, pass_through_head = 0;
-	unsigned int pages = khugepaged_pages_to_scan;
-	bool wait = true;
-
-	barrier(); /* write khugepaged_pages_to_scan to local stack */
+	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
 
 	while (progress < pages) {
-		if (!khugepaged_prealloc_page(&hpage, &wait))
-			break;
-
 		cond_resched();
 
 		if (unlikely(kthread_should_stop() || freezing(current)))
@@ -2778,6 +2729,11 @@ static void khugepaged_do_scan(void)
 		else
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);
+
+		if (IS_ERR(hpage)) {
+			khugepaged_alloc_sleep();
+			break;
+		}
 	}
 
 	if (!IS_ERR_OR_NULL(hpage))
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/