linux-kernel - [PATCH v2] mm: madvise: pageout: ignore references rather than clearing young

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240226005739.24350-1-21cnbao@gmail.com>
Date: Mon, 26 Feb 2024 13:57:39 +1300
From: Barry Song <21cnbao@...il.com>
To: akpm@...ux-foundation.org,
	damon@...ts.linux.dev,
	sj@...nel.org,
	linux-mm@...ck.org
Cc: linux-kernel@...r.kernel.org,
	Barry Song <v-songbaohua@...o.com>,
	Minchan Kim <minchan@...nel.org>,
	Michal Hocko <mhocko@...e.com>,
	Johannes Weiner <hannes@...xchg.org>
Subject: [PATCH v2] mm: madvise: pageout: ignore references rather than clearing young

From: Barry Song <v-songbaohua@...o.com>

While doing MADV_PAGEOUT, the current code will clear PTE young
so that vmscan won't read young flags to allow the reclamation
of madvised folios to go ahead.
It seems we can do it by directly ignoring references, thus we
can remove tlb flush in madvise and rmap overhead in vmscan.

Regarding the side effect, in the original code, if a parallel
thread runs side by side to access the madvised memory with the
thread doing madvise, folios will get a chance to be re-activated
by vmscan (though the time gap is actually quite small since
checking PTEs is done immediately after clearing PTEs young). But
with this patch, they will still be reclaimed. But this behaviour
doing PAGEOUT and doing access at the same time is quite silly
like DoS. So probably, we don't need to care. Or ignoring the
new access during the quite small time gap is even better.

For DAMON's DAMOS_PAGEOUT based on physical address region, we
still keep its behaviour as is since a physical address might
be mapped by multiple processes. MADV_PAGEOUT based on virtual
address is actually much more aggressive on reclamation. To
untouch paddr's DAMOS_PAGEOUT, we simply pass ignore_references
as false in reclaim_pages().

A microbench as below has shown 6% decrement on the latency of
MADV_PAGEOUT,

 #define PGSIZE 4096
 main()
 {
 	int i;
 #define SIZE 512*1024*1024
 	volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

 	for (i = 0; i < SIZE/sizeof(long); i += PGSIZE / sizeof(long))
 		p[i] =  0x11;

 	madvise(p, SIZE, MADV_PAGEOUT);
 }

w/o patch                    w/ patch
root@10:~# time ./a.out      root@10:~# time ./a.out
real	0m49.634s            real   0m46.334s
user	0m0.637s             user   0m0.648s
sys	0m47.434s            sys    0m44.265s

Cc: SeongJae Park <sj@...nel.org>
Cc: Minchan Kim <minchan@...nel.org>
Cc: Michal Hocko <mhocko@...e.com>
Cc: Johannes Weiner <hannes@...xchg.org>
Signed-off-by: Barry Song <v-songbaohua@...o.com>
---
 -v2:
 Add description about paddr's pageout of DAMON in commit message (Thanks
 for SeongJae's help on clarifying lots of things about DAMON)

 mm/damon/paddr.c |  2 +-
 mm/internal.h    |  2 +-
 mm/madvise.c     |  8 ++++----
 mm/vmscan.c      | 12 +++++++-----
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 081e2a325778..5e6dc312072c 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -249,7 +249,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
 put_folio:
 		folio_put(folio);
 	}
-	applied = reclaim_pages(&folio_list);
+	applied = reclaim_pages(&folio_list, false);
 	cond_resched();
 	return applied * PAGE_SIZE;
 }
diff --git a/mm/internal.h b/mm/internal.h
index 93e229112045..36c11ea41f47 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -868,7 +868,7 @@ extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
-unsigned long reclaim_pages(struct list_head *folio_list);
+unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references);
 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 					    struct list_head *folio_list);
 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
diff --git a/mm/madvise.c b/mm/madvise.c
index abde3edb04f0..44a498c94158 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -386,7 +386,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			return 0;
 		}
 
-		if (pmd_young(orig_pmd)) {
+		if (!pageout && pmd_young(orig_pmd)) {
 			pmdp_invalidate(vma, addr, pmd);
 			orig_pmd = pmd_mkold(orig_pmd);
 
@@ -410,7 +410,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 huge_unlock:
 		spin_unlock(ptl);
 		if (pageout)
-			reclaim_pages(&folio_list);
+			reclaim_pages(&folio_list, true);
 		return 0;
 	}
 
@@ -490,7 +490,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 
 		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
 
-		if (pte_young(ptent)) {
+		if (!pageout && pte_young(ptent)) {
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			ptent = pte_mkold(ptent);
@@ -524,7 +524,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		pte_unmap_unlock(start_pte, ptl);
 	}
 	if (pageout)
-		reclaim_pages(&folio_list);
+		reclaim_pages(&folio_list, true);
 	cond_resched();
 
 	return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 402c290fbf5a..ba2f37f46a73 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2102,7 +2102,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 }
 
 static unsigned int reclaim_folio_list(struct list_head *folio_list,
-				      struct pglist_data *pgdat)
+				      struct pglist_data *pgdat,
+				      bool ignore_references)
 {
 	struct reclaim_stat dummy_stat;
 	unsigned int nr_reclaimed;
@@ -2115,7 +2116,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.no_demotion = 1,
 	};
 
-	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
+	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
 	while (!list_empty(folio_list)) {
 		folio = lru_to_folio(folio_list);
 		list_del(&folio->lru);
@@ -2125,7 +2126,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
 	return nr_reclaimed;
 }
 
-unsigned long reclaim_pages(struct list_head *folio_list)
+unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references)
 {
 	int nid;
 	unsigned int nr_reclaimed = 0;
@@ -2147,11 +2148,12 @@ unsigned long reclaim_pages(struct list_head *folio_list)
 			continue;
 		}
 
-		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
+		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid),
+						   ignore_references);
 		nid = folio_nid(lru_to_folio(folio_list));
 	} while (!list_empty(folio_list));
 
-	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
+	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references);
 
 	memalloc_noreclaim_restore(noreclaim_flag);
 
-- 
2.34.1