linux-kernel - [RFC PATCH] madvise: make madvise_cold_or_pageout_pte

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230713150558.200545-1-fengwei.yin@intel.com>
Date:   Thu, 13 Jul 2023 23:05:58 +0800
From:   Yin Fengwei <fengwei.yin@...el.com>
To:     linux-mm@...ck.org, linux-kernel@...r.kernel.org,
        akpm@...ux-foundation.org, yuzhao@...gle.com, willy@...radead.org,
        david@...hat.com, ryan.roberts@....com, shy828301@...il.com
Cc:     fengwei.yin@...el.com
Subject: [RFC PATCH] madvise: make madvise_cold_or_pageout_pte_range() support large folio

The current madvise_cold_or_pageout_pte_range() has two problems with
large folio support:
  - Using folio_mapcount() on a large folio prevents the large folio
    from being picked up.
  - If a large folio is within the requested range, it shouldn't be
    split in madvise_cold_or_pageout_pte_range().

Fix them by:
  - Using folio_estimated_sharers() for large folios.
  - Not splitting a large folio that is within the requested range;
    that is left to the page reclaim phase.

For a large folio that crosses the boundaries of the requested range,
skip it if it is a page cache folio. Try to split it if it is an
anonymous folio; if splitting fails, skip it.

The main reason to call folio_referenced() is to clear the young bit of
the corresponding PTEs, so that in the page reclaim phase there is a
good chance the folio can be reclaimed.

Signed-off-by: Yin Fengwei <fengwei.yin@...el.com>
---
This patch is based on the mlock large folio support rfc2 series, as it
depends on the folio_in_range() helper added by that patchset.

Also, folio_op_size() can be unified with get_folio_mlock_step().

Testing done:
  - kselftest: No new regression introduced.

 mm/madvise.c | 133 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 84 insertions(+), 49 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 38382a5d1e393..5748cf098235d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -31,6 +31,7 @@
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
+#include <linux/kernel.h>
 
 #include <asm/tlb.h>
 
@@ -339,6 +340,35 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
+static inline bool skip_current_entry(struct folio *folio, bool pageout_anon)
+{
+	if (!folio)
+		return true;
+	/* A NULL folio (checked above) and a zone-device folio are skipped. */
+	if (folio_is_zone_device(folio))
+		return true;
+	/* Leave non-LRU folios alone. */
+	if (!folio_test_lru(folio))
+		return true;
+	/* With pageout_anon set, only anonymous folios are processed. */
+	if (pageout_anon && !folio_test_anon(folio))
+		return true;
+	/* Unevictable folios are skipped as well. */
+	if (folio_test_unevictable(folio))
+		return true;
+	/* No skip condition matched: the caller goes on with this folio. */
+	return false;
+}
+
+static inline unsigned int folio_op_size(struct folio *folio, pte_t pte,
+		unsigned long addr, unsigned long end)
+{
+	unsigned int nr;	/* pages from pte's pfn to the folio's end */
+
+	nr = folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte);
+	return min_t(unsigned int, nr, (end - addr) >> PAGE_SHIFT);
+}
+
 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
@@ -353,6 +383,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	struct folio *folio = NULL;
 	LIST_HEAD(folio_list);
 	bool pageout_anon_only_filter;
+	unsigned long start = addr;
 
 	if (fatal_signal_pending(current))
 		return -EINTR;
@@ -383,7 +414,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		folio = pfn_folio(pmd_pfn(orig_pmd));
 
 		/* Do not interfere with other mappings of this folio */
-		if (folio_mapcount(folio) != 1)
+		if (folio_estimated_sharers(folio) != 1)
 			goto huge_unlock;
 
 		if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -442,78 +473,60 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	for (; addr < end; pte++, addr += PAGE_SIZE) {
 		ptent = ptep_get(pte);
 
-		if (pte_none(ptent))
-			continue;
-
-		if (!pte_present(ptent))
+		if (pte_none(ptent) || !pte_present(ptent))
 			continue;
 
 		folio = vm_normal_folio(vma, addr, ptent);
-		if (!folio || folio_is_zone_device(folio))
+		if (skip_current_entry(folio, pageout_anon_only_filter))
 			continue;
 
 		/*
-		 * Creating a THP page is expensive so split it only if we
-		 * are sure it's worth. Split it if we are only owner.
+		 * Split large folio if it's anonymous and cross the
+		 * boundaries of request range.
 		 */
 		if (folio_test_large(folio)) {
-			int err;
+			int err, step;
+
+			if (folio_estimated_sharers(folio) != 1)
+				continue;
+
+			if (folio_in_range(folio, vma, start, end))
+				goto pageout_cold_folio;
 
-			if (folio_mapcount(folio) != 1)
-				break;
-			if (pageout_anon_only_filter && !folio_test_anon(folio))
-				break;
-			if (!folio_trylock(folio))
-				break;
 			folio_get(folio);
+			step = folio_op_size(folio, ptent, addr, end);
+			if (!folio_test_anon(folio) || !folio_trylock(folio)) {
+				folio_put(folio);
+				goto next_folio;
+			}
+
 			arch_leave_lazy_mmu_mode();
 			pte_unmap_unlock(start_pte, ptl);
 			start_pte = NULL;
 			err = split_folio(folio);
 			folio_unlock(folio);
 			folio_put(folio);
-			if (err)
-				break;
+
 			start_pte = pte =
 				pte_offset_map_lock(mm, pmd, addr, &ptl);
 			if (!start_pte)
 				break;
 			arch_enter_lazy_mmu_mode();
-			pte--;
-			addr -= PAGE_SIZE;
-			continue;
-		}
 
-		/*
-		 * Do not interfere with other mappings of this folio and
-		 * non-LRU folio.
-		 */
-		if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+			/* Skip the folio if split fails */
+			if (!err)
+				step = 0;
+next_folio:
+			pte += step - 1;
+			addr += (step - 1) << PAGE_SHIFT;
 			continue;
+		}
 
-		if (pageout_anon_only_filter && !folio_test_anon(folio))
+		/* Do not interfere with other mappings of this folio */
+		if (folio_mapcount(folio) != 1)
 			continue;
 
-		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
-		if (pte_young(ptent)) {
-			ptent = ptep_get_and_clear_full(mm, addr, pte,
-							tlb->fullmm);
-			ptent = pte_mkold(ptent);
-			set_pte_at(mm, addr, pte, ptent);
-			tlb_remove_tlb_entry(tlb, pte, addr);
-		}
-
-		/*
-		 * We are deactivating a folio for accelerating reclaiming.
-		 * VM couldn't reclaim the folio unless we clear PG_young.
-		 * As a side effect, it makes confuse idle-page tracking
-		 * because they will miss recent referenced history.
-		 */
-		folio_clear_referenced(folio);
-		folio_test_clear_young(folio);
-		if (folio_test_active(folio))
-			folio_set_workingset(folio);
+pageout_cold_folio:
 		if (pageout) {
 			if (folio_isolate_lru(folio)) {
 				if (folio_test_unevictable(folio))
@@ -529,8 +542,30 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		arch_leave_lazy_mmu_mode();
 		pte_unmap_unlock(start_pte, ptl);
 	}
-	if (pageout)
-		reclaim_pages(&folio_list);
+
+	if (pageout) {
+		LIST_HEAD(reclaim_list);
+
+		while (!list_empty(&folio_list)) {
+			int refs;
+			unsigned long flags;
+			struct mem_cgroup *memcg = folio_memcg(folio);
+
+			folio = lru_to_folio(&folio_list);
+			list_del(&folio->lru);
+
+			refs = folio_referenced(folio, 0, memcg, &flags);
+
+			if ((flags & VM_LOCKED) || (refs == -1)) {
+				folio_putback_lru(folio);
+				continue;
+			}
+
+			folio_test_clear_referenced(folio);
+			list_add(&folio->lru, &reclaim_list);
+		}
+		reclaim_pages(&reclaim_list);
+	}
 	cond_resched();
 
 	return 0;
-- 
2.39.2