linux-kernel - [PATCH v2 1/2] mm: clear pte for folios that are zero filled

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240604105950.1134192-2-usamaarif642@gmail.com>
Date: Tue,  4 Jun 2024 11:58:24 +0100
From: Usama Arif <usamaarif642@...il.com>
To: akpm@...ux-foundation.org
Cc: hannes@...xchg.org,
	willy@...radead.org,
	yosryahmed@...gle.com,
	nphamcs@...il.com,
	chengming.zhou@...ux.dev,
	linux-mm@...ck.org,
	linux-kernel@...r.kernel.org,
	kernel-team@...a.com,
	Usama Arif <usamaarif642@...il.com>
Subject: [PATCH v2 1/2] mm: clear pte for folios that are zero filled

Approximately 10-20% of pages to be swapped out are zero pages [1].
Rather than reading/writing these pages to flash, which results
in increased I/O and flash wear, the pte can be cleared for those
addresses at unmap time while shrinking the folio list. When a later
access to such an address causes a page fault, do_pte_missing() will
take care of this page. With this patch, NVMe writes in the Meta
server fleet decreased by almost 10% with a conventional swap setup
(zswap disabled).

[1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/

Signed-off-by: Usama Arif <usamaarif642@...il.com>
---
 include/linux/rmap.h |   1 +
 mm/rmap.c            | 163 ++++++++++++++++++++++---------------------
 mm/vmscan.c          |  89 ++++++++++++++++-------
 3 files changed, 150 insertions(+), 103 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bb53e5920b88..b36db1e886e4 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -100,6 +100,7 @@ enum ttu_flags {
 					 * do a final flush if necessary */
 	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
 					 * caller holds it */
+	TTU_ZERO_FOLIO		= 0x100,/* zero folio */
 };
 
 #ifdef CONFIG_MMU
diff --git a/mm/rmap.c b/mm/rmap.c
index 52357d79917c..d98f70876327 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1819,96 +1819,101 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 */
 			dec_mm_counter(mm, mm_counter(folio));
 		} else if (folio_test_anon(folio)) {
-			swp_entry_t entry = page_swap_entry(subpage);
-			pte_t swp_pte;
-			/*
-			 * Store the swap location in the pte.
-			 * See handle_pte_fault() ...
-			 */
-			if (unlikely(folio_test_swapbacked(folio) !=
-					folio_test_swapcache(folio))) {
+			if (flags & TTU_ZERO_FOLIO) {
+				pte_clear(mm, address, pvmw.pte);
+				dec_mm_counter(mm, MM_ANONPAGES);
+			} else {
+				swp_entry_t entry = page_swap_entry(subpage);
+				pte_t swp_pte;
 				/*
-				 * unmap_huge_pmd_locked() will unmark a
-				 * PMD-mapped folio as lazyfree if the folio or
-				 * its PMD was redirtied.
+				 * Store the swap location in the pte.
+				 * See handle_pte_fault() ...
 				 */
-				if (!pmd_mapped)
-					WARN_ON_ONCE(1);
-				goto walk_done_err;
-			}
+				if (unlikely(folio_test_swapbacked(folio) !=
+						folio_test_swapcache(folio))) {
+					/*
+					 * unmap_huge_pmd_locked() will unmark a
+					 * PMD-mapped folio as lazyfree if the folio or
+					 * its PMD was redirtied.
+					 */
+					if (!pmd_mapped)
+						WARN_ON_ONCE(1);
+					goto walk_done_err;
+				}
 
-			/* MADV_FREE page check */
-			if (!folio_test_swapbacked(folio)) {
-				int ref_count, map_count;
+				/* MADV_FREE page check */
+				if (!folio_test_swapbacked(folio)) {
+					int ref_count, map_count;
 
-				/*
-				 * Synchronize with gup_pte_range():
-				 * - clear PTE; barrier; read refcount
-				 * - inc refcount; barrier; read PTE
-				 */
-				smp_mb();
+					/*
+					 * Synchronize with gup_pte_range():
+					 * - clear PTE; barrier; read refcount
+					 * - inc refcount; barrier; read PTE
+					 */
+					smp_mb();
 
-				ref_count = folio_ref_count(folio);
-				map_count = folio_mapcount(folio);
+					ref_count = folio_ref_count(folio);
+					map_count = folio_mapcount(folio);
 
-				/*
-				 * Order reads for page refcount and dirty flag
-				 * (see comments in __remove_mapping()).
-				 */
-				smp_rmb();
+					/*
+					 * Order reads for page refcount and dirty flag
+					 * (see comments in __remove_mapping()).
+					 */
+					smp_rmb();
 
-				/*
-				 * The only page refs must be one from isolation
-				 * plus the rmap(s) (dropped by discard:).
-				 */
-				if (ref_count == 1 + map_count &&
-				    !folio_test_dirty(folio)) {
-					dec_mm_counter(mm, MM_ANONPAGES);
-					goto discard;
-				}
+					/*
+					 * The only page refs must be one from isolation
+					 * plus the rmap(s) (dropped by discard:).
+					 */
+					if (ref_count == 1 + map_count &&
+					    !folio_test_dirty(folio)) {
+						dec_mm_counter(mm, MM_ANONPAGES);
+						goto discard;
+					}
 
-				/*
-				 * If the folio was redirtied, it cannot be
-				 * discarded. Remap the page to page table.
-				 */
-				set_pte_at(mm, address, pvmw.pte, pteval);
-				folio_set_swapbacked(folio);
-				goto walk_done_err;
-			}
+					/*
+					 * If the folio was redirtied, it cannot be
+					 * discarded. Remap the page to page table.
+					 */
+					set_pte_at(mm, address, pvmw.pte, pteval);
+					folio_set_swapbacked(folio);
+					goto walk_done_err;
+				}
 
-			if (swap_duplicate(entry) < 0) {
-				set_pte_at(mm, address, pvmw.pte, pteval);
-				goto walk_done_err;
-			}
-			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
-				swap_free(entry);
-				set_pte_at(mm, address, pvmw.pte, pteval);
-				goto walk_done_err;
-			}
+				if (swap_duplicate(entry) < 0) {
+					set_pte_at(mm, address, pvmw.pte, pteval);
+					goto walk_done_err;
+				}
+				if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+					swap_free(entry);
+					set_pte_at(mm, address, pvmw.pte, pteval);
+					goto walk_done_err;
+				}
 
-			/* See folio_try_share_anon_rmap(): clear PTE first. */
-			if (anon_exclusive &&
-			    folio_try_share_anon_rmap_pte(folio, subpage)) {
-				swap_free(entry);
-				set_pte_at(mm, address, pvmw.pte, pteval);
-				goto walk_done_err;
-			}
-			if (list_empty(&mm->mmlist)) {
-				spin_lock(&mmlist_lock);
-				if (list_empty(&mm->mmlist))
-					list_add(&mm->mmlist, &init_mm.mmlist);
-				spin_unlock(&mmlist_lock);
+				/* See folio_try_share_anon_rmap(): clear PTE first. */
+				if (anon_exclusive &&
+				    folio_try_share_anon_rmap_pte(folio, subpage)) {
+					swap_free(entry);
+					set_pte_at(mm, address, pvmw.pte, pteval);
+					goto walk_done_err;
+				}
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					if (list_empty(&mm->mmlist))
+						list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				dec_mm_counter(mm, MM_ANONPAGES);
+				inc_mm_counter(mm, MM_SWAPENTS);
+				swp_pte = swp_entry_to_pte(entry);
+				if (anon_exclusive)
+					swp_pte = pte_swp_mkexclusive(swp_pte);
+				if (pte_soft_dirty(pteval))
+					swp_pte = pte_swp_mksoft_dirty(swp_pte);
+				if (pte_uffd_wp(pteval))
+					swp_pte = pte_swp_mkuffd_wp(swp_pte);
+				set_pte_at(mm, address, pvmw.pte, swp_pte);
 			}
-			dec_mm_counter(mm, MM_ANONPAGES);
-			inc_mm_counter(mm, MM_SWAPENTS);
-			swp_pte = swp_entry_to_pte(entry);
-			if (anon_exclusive)
-				swp_pte = pte_swp_mkexclusive(swp_pte);
-			if (pte_soft_dirty(pteval))
-				swp_pte = pte_swp_mksoft_dirty(swp_pte);
-			if (pte_uffd_wp(pteval))
-				swp_pte = pte_swp_mkuffd_wp(swp_pte);
-			set_pte_at(mm, address, pvmw.pte, swp_pte);
 		} else {
 			/*
 			 * This is a locked file-backed folio,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b9170f767353..d54f44b556f0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1026,6 +1026,38 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
 	return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
 }
 
+static bool is_folio_page_zero_filled(struct folio *folio, int i)
+{
+	unsigned long *data;
+	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1;
+	bool ret = false;
+
+	data = kmap_local_folio(folio, i * PAGE_SIZE);
+
+	if (data[last_pos])
+		goto out;
+
+	for (pos = 0; pos < last_pos; pos++) {
+		if (data[pos])
+			goto out;
+	}
+	ret = true;
+out:
+	kunmap_local(data);
+	return ret;
+}
+
+static bool is_folio_zero_filled(struct folio *folio)
+{
+	unsigned int i;
+
+	for (i = 0; i < folio_nr_pages(folio); i++) {
+		if (!is_folio_page_zero_filled(folio, i))
+			return false;
+	}
+	return true;
+}
+
 /*
  * shrink_folio_list() returns the number of reclaimed pages
  */
@@ -1053,6 +1085,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 		enum folio_references references = FOLIOREF_RECLAIM;
 		bool dirty, writeback;
 		unsigned int nr_pages;
+		bool folio_zero_filled = false;
 
 		cond_resched();
 
@@ -1270,6 +1303,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 			nr_pages = 1;
 		}
 
+		folio_zero_filled = is_folio_zero_filled(folio);
 		/*
 		 * The folio is mapped into the page tables of one or more
 		 * processes. Try to unmap it here.
@@ -1295,6 +1329,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 			if (folio_test_large(folio) && list_empty(&folio->_deferred_list))
 				flags |= TTU_SYNC;
 
+			if (folio_zero_filled)
+				flags |= TTU_ZERO_FOLIO;
+
 			try_to_unmap(folio, flags);
 			if (folio_mapped(folio)) {
 				stat->nr_unmap_fail += nr_pages;
@@ -1358,32 +1395,36 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 			 * starts and then write it out here.
 			 */
 			try_to_unmap_flush_dirty();
-			switch (pageout(folio, mapping, &plug)) {
-			case PAGE_KEEP:
-				goto keep_locked;
-			case PAGE_ACTIVATE:
-				goto activate_locked;
-			case PAGE_SUCCESS:
-				stat->nr_pageout += nr_pages;
+			if (folio_zero_filled) {
+				folio_clear_dirty(folio);
+			} else {
+				switch (pageout(folio, mapping, &plug)) {
+				case PAGE_KEEP:
+					goto keep_locked;
+				case PAGE_ACTIVATE:
+					goto activate_locked;
+				case PAGE_SUCCESS:
+					stat->nr_pageout += nr_pages;
 
-				if (folio_test_writeback(folio))
-					goto keep;
-				if (folio_test_dirty(folio))
-					goto keep;
+					if (folio_test_writeback(folio))
+						goto keep;
+					if (folio_test_dirty(folio))
+						goto keep;
 
-				/*
-				 * A synchronous write - probably a ramdisk.  Go
-				 * ahead and try to reclaim the folio.
-				 */
-				if (!folio_trylock(folio))
-					goto keep;
-				if (folio_test_dirty(folio) ||
-				    folio_test_writeback(folio))
-					goto keep_locked;
-				mapping = folio_mapping(folio);
-				fallthrough;
-			case PAGE_CLEAN:
-				; /* try to free the folio below */
+					/*
+					 * A synchronous write - probably a ramdisk.  Go
+					 * ahead and try to reclaim the folio.
+					 */
+					if (!folio_trylock(folio))
+						goto keep;
+					if (folio_test_dirty(folio) ||
+					    folio_test_writeback(folio))
+						goto keep_locked;
+					mapping = folio_mapping(folio);
+					fallthrough;
+				case PAGE_CLEAN:
+					; /* try to free the folio below */
+				}
 			}
 		}
 
-- 
2.43.0