[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250214175709.76029-8-ryncsn@gmail.com>
Date: Sat, 15 Feb 2025 01:57:09 +0800
From: Kairui Song <ryncsn@...il.com>
To: linux-mm@...ck.org
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Chris Li <chrisl@...nel.org>,
Barry Song <v-songbaohua@...o.com>,
Hugh Dickins <hughd@...gle.com>,
Yosry Ahmed <yosryahmed@...gle.com>,
"Huang, Ying" <ying.huang@...ux.alibaba.com>,
Baoquan He <bhe@...hat.com>,
Nhat Pham <nphamcs@...il.com>,
Johannes Weiner <hannes@...xchg.org>,
Kalesh Singh <kaleshsingh@...gle.com>,
linux-kernel@...r.kernel.org,
Kairui Song <kasong@...cent.com>
Subject: [PATCH 7/7] mm, swap: simplify folio swap allocation
From: Kairui Song <kasong@...cent.com>
With slot cache gone, clean up the allocation helpers even more.
folio_alloc_swap will be the only entry for allocation and adding
the folio to swap cache (except suspend), making it opposite of
folio_free_swap.
Signed-off-by: Kairui Song <kasong@...cent.com>
---
include/linux/swap.h | 8 ++--
mm/shmem.c | 21 +++------
mm/swap.h | 6 ---
mm/swap_state.c | 57 ----------------------
mm/swapfile.c | 110 ++++++++++++++++++++++++++++---------------
mm/vmscan.c | 16 ++++++-
6 files changed, 94 insertions(+), 124 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 456833705ea0..e799e965dac8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -478,7 +478,7 @@ static inline long get_nr_swap_pages(void)
}
extern void si_swapinfo(struct sysinfo *);
-swp_entry_t folio_alloc_swap(struct folio *folio);
+bool folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
@@ -587,11 +587,9 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-static inline swp_entry_t folio_alloc_swap(struct folio *folio)
+static bool folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
{
- swp_entry_t entry;
- entry.val = 0;
- return entry;
+ return false;
}
static inline bool folio_free_swap(struct folio *folio)
diff --git a/mm/shmem.c b/mm/shmem.c
index b35ba250c53d..2aa206b52ff2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1546,7 +1546,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- swp_entry_t swap;
pgoff_t index;
int nr_pages;
bool split = false;
@@ -1628,14 +1627,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
folio_mark_uptodate(folio);
}
- swap = folio_alloc_swap(folio);
- if (!swap.val) {
- if (nr_pages > 1)
- goto try_split;
-
- goto redirty;
- }
-
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the folio is
@@ -1648,20 +1639,20 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(folio, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
- NULL) == 0) {
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
shmem_recalc_inode(inode, 0, nr_pages);
- swap_shmem_alloc(swap, nr_pages);
- shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
+ swap_shmem_alloc(folio->swap, nr_pages);
+ shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
return swap_writepage(&folio->page, wbc);
}
+ list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
- put_swap_folio(folio, swap);
+ if (nr_pages > 1)
+ goto try_split;
redirty:
folio_mark_dirty(folio);
if (wbc->for_reclaim)
diff --git a/mm/swap.h b/mm/swap.h
index ad2f121de970..0abb68091b4f 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -50,7 +50,6 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry)
}
void show_swap_cache_info(void);
-bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp);
@@ -163,11 +162,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
return filemap_get_folio(mapping, index);
}
-static inline bool add_to_swap(struct folio *folio)
-{
- return false;
-}
-
static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
{
return NULL;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2b5744e211cd..68fd981b514f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -166,63 +166,6 @@ void __delete_from_swap_cache(struct folio *folio,
__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}
-/**
- * add_to_swap - allocate swap space for a folio
- * @folio: folio we want to move to swap
- *
- * Allocate swap space for the folio and add the folio to the
- * swap cache.
- *
- * Context: Caller needs to hold the folio lock.
- * Return: Whether the folio was added to the swap cache.
- */
-bool add_to_swap(struct folio *folio)
-{
- swp_entry_t entry;
- int err;
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
-
- entry = folio_alloc_swap(folio);
- if (!entry.val)
- return false;
-
- /*
- * XArray node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- /*
- * Add it to the swap cache.
- */
- err = add_to_swap_cache(folio, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
- if (err)
- goto fail;
- /*
- * Normally the folio will be dirtied in unmap because its
- * pte should be dirty. A special case is MADV_FREE page. The
- * page's pte could have dirty bit cleared but the folio's
- * SwapBacked flag is still set because clearing the dirty bit
- * and SwapBacked flag has no lock protected. For such folio,
- * unmap will not set dirty bit for it, so folio reclaim will
- * not write the folio out. This can cause data corruption when
- * the folio is swapped in later. Always setting the dirty flag
- * for the folio solves the problem.
- */
- folio_mark_dirty(folio);
-
- return true;
-
-fail:
- put_swap_folio(folio, entry);
- return false;
-}
-
/*
* This must be called only on folios that have
* been verified to be in the swap cache and locked.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 66c8869ef346..8449bd703bd8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1181,9 +1181,9 @@ static bool get_swap_device_info(struct swap_info_struct *si)
* Fast path try to get swap entries with specified order from current
* CPU's swap entry pool (a cluster).
*/
-static int swap_alloc_fast(swp_entry_t *entry,
- unsigned char usage,
- int order)
+static bool swap_alloc_fast(swp_entry_t *entry,
+ unsigned char usage,
+ int order)
{
struct swap_cluster_info *ci;
struct swap_info_struct *si;
@@ -1203,47 +1203,31 @@ static int swap_alloc_fast(swp_entry_t *entry,
return !!found;
}
-swp_entry_t folio_alloc_swap(struct folio *folio)
+/* Rotate the device and switch to a new cluster */
+static bool swap_alloc_rotate(swp_entry_t *entry,
+ unsigned char usage,
+ int order)
{
- unsigned int order = folio_order(folio);
- unsigned int size = 1 << order;
- struct swap_info_struct *si, *next;
- swp_entry_t entry = {};
- unsigned long offset;
int node;
+ unsigned long offset;
+ struct swap_info_struct *si, *next;
- if (order) {
- /*
- * Should not even be attempting large allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER) {
- VM_WARN_ON_ONCE(1);
- return entry;
- }
- }
-
- /* Fast path using percpu cluster */
- local_lock(&percpu_swap_cluster.lock);
- if (swap_alloc_fast(&entry, SWAP_HAS_CACHE, order))
- goto out_alloced;
-
- /* Rotate the device and switch to a new cluster */
+ node = numa_node_id();
spin_lock(&swap_avail_lock);
start_over:
- node = numa_node_id();
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+ /* Rotate the device and switch to a new cluster */
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
put_swap_device(si);
if (offset) {
- entry = swp_entry(si->type, offset);
- goto out_alloced;
+ *entry = swp_entry(si->type, offset);
+ return true;
}
if (order)
- goto out_failed;
+ return false;
}
spin_lock(&swap_avail_lock);
@@ -1262,20 +1246,68 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
goto start_over;
}
spin_unlock(&swap_avail_lock);
-out_failed:
+ return false;
+}
+
+/**
+ * folio_alloc_swap - allocate swap space for a folio
+ * @folio: folio we want to move to swap
+ * @gfp: gfp mask for shadow nodes
+ *
+ * Allocate swap space for the folio and add the folio to the
+ * swap cache.
+ *
+ * Context: Caller needs to hold the folio lock.
+ * Return: Whether the folio was added to the swap cache.
+ */
+bool folio_alloc_swap(struct folio *folio, gfp_t gfp)
+{
+ unsigned int order = folio_order(folio);
+ unsigned int size = 1 << order;
+ swp_entry_t entry = {};
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
+
+ /*
+ * Should not even be attempting large allocations when huge
+ * page swap is disabled. Warn and fail the allocation.
+ */
+ if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
+ VM_WARN_ON_ONCE(1);
+ return false;
+ }
+
+ local_lock(&percpu_swap_cluster.lock);
+ if (swap_alloc_fast(&entry, SWAP_HAS_CACHE, order))
+ goto out_alloced;
+ if (swap_alloc_rotate(&entry, SWAP_HAS_CACHE, order))
+ goto out_alloced;
local_unlock(&percpu_swap_cluster.lock);
- return entry;
+ return false;
out_alloced:
local_unlock(&percpu_swap_cluster.lock);
- if (mem_cgroup_try_charge_swap(folio, entry)) {
- put_swap_folio(folio, entry);
- entry.val = 0;
- } else {
- atomic_long_sub(size, &nr_swap_pages);
- }
+ if (mem_cgroup_try_charge_swap(folio, entry))
+ goto out_free;
- return entry;
+ /*
+ * XArray node allocations from PF_MEMALLOC contexts could
+ * completely exhaust the page allocator. __GFP_NOMEMALLOC
+ * stops emergency reserves from being allocated.
+ *
+ * TODO: this could cause a theoretical memory reclaim
+ * deadlock in the swap out path.
+ */
+ if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
+ goto out_free;
+
+ atomic_long_sub(size, &nr_swap_pages);
+ return true;
+
+out_free:
+ put_swap_folio(folio, entry);
+ return false;
}
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fcca38bc640f..71a6b597e469 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1289,7 +1289,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
- if (!add_to_swap(folio)) {
+ if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio))
@@ -1305,9 +1305,21 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
}
#endif
count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
- if (!add_to_swap(folio))
+ if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
goto activate_locked_split;
}
+ /*
+ * Normally the folio will be dirtied in unmap because its
+ * pte should be dirty. A special case is MADV_FREE page. The
+ * page's pte could have dirty bit cleared but the folio's
+ * SwapBacked flag is still set because clearing the dirty bit
+ * and SwapBacked flag has no lock protected. For such folio,
+ * unmap will not set dirty bit for it, so folio reclaim will
+ * not write the folio out. This can cause data corruption when
+ * the folio is swapped in later. Always setting the dirty flag
+ * for the folio solves the problem.
+ */
+ folio_mark_dirty(folio);
}
}
--
2.48.1
Powered by blists - more mailing lists