lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20231119194740.94101-19-ryncsn@gmail.com>
Date:   Mon, 20 Nov 2023 03:47:34 +0800
From:   Kairui Song <ryncsn@...il.com>
To:     linux-mm@...ck.org
Cc:     Andrew Morton <akpm@...ux-foundation.org>,
        "Huang, Ying" <ying.huang@...el.com>,
        David Hildenbrand <david@...hat.com>,
        Hugh Dickins <hughd@...gle.com>,
        Johannes Weiner <hannes@...xchg.org>,
        Matthew Wilcox <willy@...radead.org>,
        Michal Hocko <mhocko@...e.com>, linux-kernel@...r.kernel.org,
        Kairui Song <kasong@...cent.com>
Subject: [PATCH 18/24] mm/swap: introduce a helper for non-fault swapin

From: Kairui Song <kasong@...cent.com>

There are two places where swapin is not directly caused by a page fault:
shmem swapin is invoked through the shmem mapping, and swapoff causes
swapin by walking the page table. Both used to construct a pseudo vmfault
struct for the swapin function.

Shmem has dropped the pseudo vmfault recently in commit ddc1a5cbc05d
("mempolicy: alloc_pages_mpol() for NUMA policy without vma"). The swapoff
path is still using a pseudo vmfault.

Introduce a helper for both of them. This helps save stack usage in the
swapoff path, and helps apply a unified swapin cache and readahead policy
check.

Also prepare for follow-up commits.

Signed-off-by: Kairui Song <kasong@...cent.com>
---
 mm/shmem.c      | 51 ++++++++++++++++---------------------------------
 mm/swap.h       | 11 +++++++++++
 mm/swap_state.c | 38 ++++++++++++++++++++++++++++++++++++
 mm/swapfile.c   | 23 +++++++++++-----------
 4 files changed, 76 insertions(+), 47 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index f9ce4067c742..81d129aa66d1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1565,22 +1565,6 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
 			pgoff_t index, unsigned int order, pgoff_t *ilx);
 
-static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
-			struct shmem_inode_info *info, pgoff_t index)
-{
-	struct mempolicy *mpol;
-	pgoff_t ilx;
-	struct page *page;
-
-	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
-	page = swap_cluster_readahead(swap, gfp, mpol, ilx);
-	mpol_cond_put(mpol);
-
-	if (!page)
-		return NULL;
-	return page_folio(page);
-}
-
 /*
  * Make sure huge_gfp is always more limited than limit_gfp.
  * Some of the flags set permissions, while others set limitations.
@@ -1854,9 +1838,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct swap_info_struct *si;
+	enum swap_cache_result result;
 	struct folio *folio = NULL;
+	struct mempolicy *mpol;
+	struct page *page;
 	swp_entry_t swap;
+	pgoff_t ilx;
 	int error;
 
 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
@@ -1866,34 +1853,30 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (is_poisoned_swp_entry(swap))
 		return -EIO;
 
-	si = get_swap_device(swap);
-	if (!si) {
+	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+	page = swapin_page_non_fault(swap, gfp, mpol, ilx, fault_mm, &result);
+	mpol_cond_put(mpol);
+
+	if (PTR_ERR(page) == -EBUSY) {
 		if (!shmem_confirm_swap(mapping, index, swap))
 			return -EEXIST;
 		else
 			return -EINVAL;
-	}
-
-	/* Look it up and read it in.. */
-	folio = swap_cache_get_folio(swap, NULL, NULL);
-	if (!folio) {
-		/* Or update major stats only when swapin succeeds?? */
-		if (fault_type) {
+	} else if (!page) {
+		error = -ENOMEM;
+		goto failed;
+	} else {
+		folio = page_folio(page);
+		if (fault_type && result != SWAP_CACHE_HIT) {
 			*fault_type |= VM_FAULT_MAJOR;
 			count_vm_event(PGMAJFAULT);
 			count_memcg_event_mm(fault_mm, PGMAJFAULT);
 		}
-		/* Here we actually start the io */
-		folio = shmem_swapin_cluster(swap, gfp, info, index);
-		if (!folio) {
-			error = -ENOMEM;
-			goto failed;
-		}
 	}
 
 	/* We have to do this with folio locked to prevent races */
 	folio_lock(folio);
-	if (!folio_test_swapcache(folio) ||
+	if ((result != SWAP_CACHE_BYPASS && !folio_test_swapcache(folio)) ||
 	    folio->swap.val != swap.val ||
 	    !shmem_confirm_swap(mapping, index, swap)) {
 		error = -EEXIST;
@@ -1930,7 +1913,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	delete_from_swap_cache(folio);
 	folio_mark_dirty(folio);
 	swap_free(swap);
-	put_swap_device(si);
 
 	*foliop = folio;
 	return 0;
@@ -1944,7 +1926,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		folio_unlock(folio);
 		folio_put(folio);
 	}
-	put_swap_device(si);
 
 	return error;
 }
diff --git a/mm/swap.h b/mm/swap.h
index da9deb5ba37d..b073c29c9790 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -62,6 +62,10 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
 			      struct vm_fault *vmf, enum swap_cache_result *result);
+struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
+				   struct mempolicy *mpol, pgoff_t ilx,
+				   struct mm_struct *mm,
+				   enum swap_cache_result *result);
 
 static inline unsigned int folio_swap_flags(struct folio *folio)
 {
@@ -103,6 +107,13 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 	return NULL;
 }
 
+static inline struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
+		struct mempolicy *mpol, pgoff_t ilx, struct mm_struct *mm,
+		enum swap_cache_result *result)
+{
+	return NULL;
+}
+
 static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
 {
 	return 0;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ff8a166603d0..eef66757c615 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -956,6 +956,44 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	return page;
 }
 
+struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
+				   struct mempolicy *mpol, pgoff_t ilx,
+				   struct mm_struct *mm, enum swap_cache_result *result)
+{
+	enum swap_cache_result cache_result;
+	struct swap_info_struct *si;
+	void *shadow = NULL;
+	struct folio *folio;
+	struct page *page;
+
+	/* Prevent swapoff from happening to us */
+	si = get_swap_device(entry);
+	if (unlikely(!si))
+		return ERR_PTR(-EBUSY);
+
+	folio = swap_cache_get_folio(entry, NULL, &shadow);
+	if (folio) {
+		page = folio_file_page(folio, swp_offset(entry));
+		cache_result = SWAP_CACHE_HIT;
+		goto done;
+	}
+
+	if (swap_use_no_readahead(si, swp_offset(entry))) {
+		page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, mm);
+		if (shadow)
+			workingset_refault(page_folio(page), shadow);
+		cache_result = SWAP_CACHE_BYPASS;
+	} else {
+		page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+		cache_result = SWAP_CACHE_MISS;
+	}
+done:
+	put_swap_device(si);
+	if (result)
+		*result = cache_result;
+	return page;
+}
+
 #ifdef CONFIG_SYSFS
 static ssize_t vma_ra_enabled_show(struct kobject *kobj,
 				     struct kobj_attribute *attr, char *buf)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 925ad92486a4..f8c5096fe0f0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1822,20 +1822,15 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 	si = swap_info[type];
 	do {
+		int ret;
+		pte_t ptent;
+		pgoff_t ilx;
+		swp_entry_t entry;
 		struct page *page;
 		unsigned long offset;
+		struct mempolicy *mpol;
 		unsigned char swp_count;
 		struct folio *folio = NULL;
-		swp_entry_t entry;
-		int ret;
-		pte_t ptent;
-
-		struct vm_fault vmf = {
-			.vma = vma,
-			.address = addr,
-			.real_address = addr,
-			.pmd = pmd,
-		};
 
 		if (!pte++) {
 			pte = pte_offset_map(pmd, addr);
@@ -1855,8 +1850,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		offset = swp_offset(entry);
 		pte_unmap(pte);
 		pte = NULL;
-		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
-					&vmf, NULL);
+
+		mpol = get_vma_policy(vma, addr, 0, &ilx);
+		page = swapin_page_non_fault(entry, GFP_HIGHUSER_MOVABLE,
+					     mpol, ilx, vma->vm_mm, NULL);
+		mpol_cond_put(mpol);
+
 		if (IS_ERR(page))
 			return PTR_ERR(page);
 		else if (page)
-- 
2.42.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ