Message-ID: <20250514201729.48420-22-ryncsn@gmail.com>
Date: Thu, 15 May 2025 04:17:21 +0800
From: Kairui Song <ryncsn@...il.com>
To: linux-mm@...ck.org
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Matthew Wilcox <willy@...radead.org>,
Hugh Dickins <hughd@...gle.com>,
Chris Li <chrisl@...nel.org>,
David Hildenbrand <david@...hat.com>,
Yosry Ahmed <yosryahmed@...gle.com>,
"Huang, Ying" <ying.huang@...ux.alibaba.com>,
Nhat Pham <nphamcs@...il.com>,
Johannes Weiner <hannes@...xchg.org>,
Baolin Wang <baolin.wang@...ux.alibaba.com>,
Baoquan He <bhe@...hat.com>,
Barry Song <baohua@...nel.org>,
Kalesh Singh <kaleshsingh@...gle.com>,
Kemeng Shi <shikemeng@...weicloud.com>,
Tim Chen <tim.c.chen@...ux.intel.com>,
Ryan Roberts <ryan.roberts@....com>,
linux-kernel@...r.kernel.org,
Kairui Song <kasong@...cent.com>
Subject: [PATCH 21/28] mm, swap: add folio to swap cache directly on allocation
From: Kairui Song <kasong@...cent.com>
All swap allocations are folio based now (except for hibernation), and
the swap cache is protected by the cluster lock too. So insert the folio
directly into the swap cache upon allocation, while the cluster lock is
still held, to avoid the problems caused by dropping and re-acquiring
the lock.
Signed-off-by: Kairui Song <kasong@...cent.com>
---
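Not for the commit log: below is a minimal userspace sketch of the ordering this
patch establishes, assuming nothing beyond POSIX threads. It is a toy model, not
kernel code; the pthread mutex stands in for ci->lock, and the hypothetical
toy_cluster_alloc_range() / toy_cache_add_folio() only mirror the shape of
cluster_alloc_range() and __swap_cache_add_folio(). The point it illustrates is
that the folio is installed into the (toy) swap cache in the same critical
section that marks the swap map slots allocated, so there is no longer a window
where another task can observe allocated-but-uncached entries, which the old
allocate / unlock / re-lock / add sequence allowed. Build with "gcc -pthread"
if you want to run it.

/*
 * Toy userspace model, NOT kernel code: the cluster lock is a pthread
 * mutex, the swap map is a byte array and the swap cache is a flat
 * pointer array.  It only demonstrates the ordering: the folio is put
 * into the cache under the same lock hold that marks the slots used.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define CLUSTER_SLOTS	512
#define SLOT_HAS_CACHE	0x40	/* plays the role of SWAP_HAS_CACHE */

struct toy_folio {
	unsigned long swap_off;
	unsigned int nr_pages;
};

struct toy_cluster {
	pthread_mutex_t lock;			/* stands in for ci->lock */
	unsigned char map[CLUSTER_SLOTS];	/* stands in for si->swap_map */
	struct toy_folio *cache[CLUSTER_SLOTS];	/* stands in for the swap table */
	unsigned int count;
};

/* Mirrors __swap_cache_add_folio(): caller must hold the cluster lock. */
static void toy_cache_add_folio(struct toy_cluster *ci, unsigned int off,
				struct toy_folio *folio)
{
	for (unsigned int i = 0; i < folio->nr_pages; i++)
		ci->cache[off + i] = folio;
	folio->swap_off = off;
}

/* Mirrors cluster_alloc_range(): allocate and add to cache in one lock hold. */
static bool toy_cluster_alloc_range(struct toy_cluster *ci,
				    struct toy_folio *folio, unsigned int off)
{
	pthread_mutex_lock(&ci->lock);
	for (unsigned int i = 0; i < folio->nr_pages; i++) {
		if (ci->map[off + i]) {		/* slot already in use */
			pthread_mutex_unlock(&ci->lock);
			return false;
		}
	}
	memset(&ci->map[off], SLOT_HAS_CACHE, folio->nr_pages);
	toy_cache_add_folio(ci, off, folio);	/* still under the lock */
	ci->count += folio->nr_pages;
	pthread_mutex_unlock(&ci->lock);
	return true;
}

int main(void)
{
	static struct toy_cluster ci;
	struct toy_folio folio = { .nr_pages = 4 };

	pthread_mutex_init(&ci.lock, NULL);
	if (toy_cluster_alloc_range(&ci, &folio, 8))
		printf("offsets 8..11 allocated and folio cached in one lock hold\n");
	return 0;
}
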
mm/swap.h | 8 ++--
mm/swap_state.c | 48 +++++++++++++++----
mm/swapfile.c | 122 ++++++++++++++++++++----------------------------
3 files changed, 93 insertions(+), 85 deletions(-)
diff --git a/mm/swap.h b/mm/swap.h
index 2ae4624a0e48..b042609e6eb2 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -185,7 +185,10 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
/* Below helpers requires the caller to pin the swap device. */
extern struct folio *swap_cache_get_folio(swp_entry_t entry);
extern struct folio *swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
- void **shadow, bool swapin);
+ void **shadow);
+extern void __swap_cache_add_folio(swp_entry_t entry,
+ struct swap_cluster_info *ci,
+ struct folio *folio);
extern bool swap_cache_check_folio(swp_entry_t entry);
extern void *swap_cache_get_shadow(swp_entry_t entry);
/* Below helpers requires the caller to lock the swap cluster. */
@@ -368,8 +371,7 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry)
return NULL;
}
-static inline int swap_cache_add_folio(swp_entry_t end, struct folio *folio,
- void **shadow, bool swapin)
+static inline int swap_cache_add_folio(swp_entry_t end, struct folio *folio, void **shadow)
{
return -EINVAL;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ea6a1741db5c..9e7d40215958 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -110,12 +110,39 @@ int __swap_cache_replace_folio(struct swap_cluster_info *ci, swp_entry_t entry,
return 0;
}
-/*
- * Return the folio being added on success, or return the existing folio
- * with conflicting index on failure.
- */
+/* For the swap allocator's initial allocation of entries to a folio. */
+void __swap_cache_add_folio(swp_entry_t entry, struct swap_cluster_info *ci,
+ struct folio *folio)
+{
+ pgoff_t offset = swp_offset(entry), end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ /*
+ * Allocator should always allocate aligned entries, so folio-based
+ * operations never cross more than one cluster.
+ */
+ VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(offset, nr_pages), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_uptodate(folio), folio);
+
+ end = offset + nr_pages;
+ do {
+ WARN_ON_ONCE(!swp_te_is_null(__swap_table_get(ci, offset)));
+ __swap_table_set_folio(ci, offset, folio);
+ } while (++offset < end);
+
+ folio_ref_add(folio, nr_pages);
+ folio_set_swapcache(folio);
+ folio->swap = entry;
+
+ node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
+}
+
+/* For swapping in, or for performing IO on an already allocated swap entry. */
struct folio *swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
- void **shadow, bool swapin)
+ void **shadow)
{
swp_te_t exist;
pgoff_t end, start, offset;
@@ -127,9 +154,10 @@ struct folio *swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
start = swp_offset(entry);
end = start + nr_pages;
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(start, nr_pages), folio);
again:
offset = start;
existing = NULL;
@@ -141,7 +169,7 @@ struct folio *swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
existing = swp_te_folio(exist);
goto out_failed;
}
- if (swapin && __swap_cache_set_entry(si, ci, offset))
+ if (__swap_cache_set_entry(si, ci, offset))
goto out_failed;
if (shadow && swp_te_is_shadow(exist))
*shadow = swp_te_shadow(exist);
@@ -381,7 +409,7 @@ static struct folio *__swapin_cache_add_prepare(swp_entry_t entry,
__folio_set_locked(folio);
__folio_set_swapbacked(folio);
- swapcache = swap_cache_add_folio(entry, folio, &shadow, true);
+ swapcache = swap_cache_add_folio(entry, folio, &shadow);
if (swapcache != folio) {
folio_unlock(folio);
return swapcache;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0f2a499ff2c9..91025ba98653 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -709,18 +709,17 @@ static bool cluster_scan_range(struct swap_info_struct *si,
return true;
}
-static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
- unsigned int start, unsigned char usage,
- unsigned int order)
+static bool cluster_alloc_range(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ struct folio *folio,
+ unsigned int offset)
{
- unsigned int nr_pages = 1 << order;
- unsigned long offset, end = start + nr_pages;
-
- lockdep_assert_held(&ci->lock);
+ unsigned int order = folio ? folio_order(folio) : 0;
+ swp_entry_t entry = swp_entry(si->type, offset);
+ unsigned long nr_pages = 1 << order;
if (!(si->flags & SWP_WRITEOK))
return false;
-
/*
* The first allocation in a cluster makes the
* cluster exclusive to this order
@@ -728,28 +727,33 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
if (cluster_is_empty(ci))
ci->order = order;
- for (offset = start; offset < end; offset++) {
- VM_WARN_ON_ONCE(swap_count(si->swap_map[offset]));
- VM_WARN_ON_ONCE(!swp_te_is_null(__swap_table_get(ci, offset)));
- si->swap_map[offset] = usage;
- }
swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
+ if (folio) {
+ /* from folio_alloc_swap */
+ __swap_cache_add_folio(entry, ci, folio);
+ memset(&si->swap_map[offset], SWAP_HAS_CACHE, nr_pages);
+ } else {
+ /* from get_swap_page_of_type */
+ VM_WARN_ON_ONCE(si->swap_map[offset] || swap_cache_check_folio(entry));
+ si->swap_map[offset] = 1;
+ }
+
return true;
}
/* Try use a new cluster for current CPU and allocate from it. */
static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
struct swap_cluster_info *ci,
- unsigned long offset,
- unsigned int order,
- unsigned char usage)
+ struct folio *folio,
+ unsigned long offset)
{
unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
- unsigned int nr_pages = 1 << order;
+ unsigned int order = folio ? folio_order(folio) : 0;
+ unsigned long nr_pages = 1 << order;
bool need_reclaim, ret;
lockdep_assert_held(&ci->lock);
@@ -777,7 +781,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
if (!ret)
continue;
}
- if (!cluster_alloc_range(si, ci, offset, usage, order))
+ if (!cluster_alloc_range(si, ci, folio, offset))
break;
found = offset;
offset += nr_pages;
@@ -851,10 +855,11 @@ static void swap_reclaim_work(struct work_struct *work)
* Try to allocate swap entries with specified order and try set a new
* cluster for current CPU too.
*/
-static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
- unsigned char usage)
+static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
+ struct folio *folio)
{
struct swap_cluster_info *ci;
+ unsigned int order = folio ? folio_order(folio) : 0;
unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
/*
@@ -874,8 +879,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
- found = alloc_swap_scan_cluster(si, ci, offset,
- order, usage);
+ found = alloc_swap_scan_cluster(si, ci, folio, offset);
} else {
swap_unlock_cluster(ci);
}
@@ -886,8 +890,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
new_cluster:
ci = isolate_lock_cluster(si, &si->free_clusters);
if (ci) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ found = alloc_swap_scan_cluster(si, ci, folio, cluster_offset(si, ci));
if (found)
goto done;
}
@@ -898,8 +901,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
if (order < PMD_ORDER) {
while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ found = alloc_swap_scan_cluster(si, ci, folio, cluster_offset(si, ci));
if (found)
goto done;
}
@@ -912,8 +914,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
*/
ci = isolate_lock_cluster(si, &si->frag_clusters[order]);
if (ci) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ found = alloc_swap_scan_cluster(si, ci, folio, cluster_offset(si, ci));
if (found)
goto done;
}
@@ -937,15 +938,13 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
* allocation, but reclaim may drop si->lock and race with another user.
*/
while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
+ found = alloc_swap_scan_cluster(si, ci, folio, cluster_offset(si, ci));
if (found)
goto done;
}
while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
+ found = alloc_swap_scan_cluster(si, ci, folio, cluster_offset(si, ci));
if (found)
goto done;
}
@@ -1138,12 +1137,12 @@ static bool get_swap_device_info(struct swap_info_struct *si)
* Fast path try to get swap entries with specified order from current
* CPU's swap entry pool (a cluster).
*/
-static bool swap_alloc_fast(swp_entry_t *entry,
- int order)
+static bool swap_alloc_fast(struct folio *folio)
{
+ unsigned int order = folio_order(folio);
struct swap_cluster_info *ci;
struct swap_info_struct *si;
- unsigned int offset, found = SWAP_ENTRY_INVALID;
+ unsigned int offset;
/*
* Once allocated, swap_info_struct will never be completely freed,
@@ -1158,24 +1157,21 @@ static bool swap_alloc_fast(swp_entry_t *entry,
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
- found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
- if (found)
- *entry = swp_entry(si->type, found);
+ alloc_swap_scan_cluster(si, ci, folio, offset);
} else {
swap_unlock_cluster(ci);
}
-
put_swap_device(si);
- return !!found;
+ return folio->swap.val != SWAP_ENTRY_INVALID;
}
/* Rotate the device and switch to a new cluster */
-static bool swap_alloc_slow(swp_entry_t *entry,
- int order)
+static void swap_alloc_slow(struct folio *folio)
{
int node;
unsigned long offset;
struct swap_info_struct *si, *next;
+ unsigned int order = folio_order(folio);
node = numa_node_id();
spin_lock(&swap_avail_lock);
@@ -1185,14 +1181,12 @@ static bool swap_alloc_slow(swp_entry_t *entry,
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
- offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
+ offset = cluster_alloc_swap_entry(si, folio);
put_swap_device(si);
- if (offset) {
- *entry = swp_entry(si->type, offset);
- return true;
- }
+ if (offset)
+ return;
if (order)
- return false;
+ return;
}
spin_lock(&swap_avail_lock);
@@ -1211,7 +1205,6 @@ static bool swap_alloc_slow(swp_entry_t *entry,
goto start_over;
}
spin_unlock(&swap_avail_lock);
- return false;
}
/*
@@ -1278,10 +1271,6 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
{
unsigned int order = folio_order(folio);
unsigned int size = 1 << order;
- swp_entry_t entry = {};
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
if (order) {
/*
@@ -1302,32 +1291,21 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
}
local_lock(&percpu_swap_cluster.lock);
- if (!swap_alloc_fast(&entry, order))
- swap_alloc_slow(&entry, order);
+ if (!swap_alloc_fast(folio))
+ swap_alloc_slow(folio);
local_unlock(&percpu_swap_cluster.lock);
/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
- if (mem_cgroup_try_charge_swap(folio, entry))
- goto out_free;
-
- if (!entry.val)
+ if (mem_cgroup_try_charge_swap(folio, folio->swap)) {
+ folio_free_swap_cache(folio);
return -ENOMEM;
+ }
- if (WARN_ON(swap_cache_add_folio(entry, folio, NULL, false) != folio))
- goto out_free;
-
- /*
- * Allocator should always allocate aligned entries so folio based
- * operations never crossed more than one cluster.
- */
- VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio->swap.val, size), folio);
+ if (!folio->swap.val)
+ return -ENOMEM;
atomic_long_sub(size, &nr_swap_pages);
return 0;
-
-out_free:
- put_swap_folio(folio, entry);
- return -ENOMEM;
}
/*
@@ -1858,7 +1836,7 @@ swp_entry_t get_swap_page_of_type(int type)
/* This is called for allocating swap entry, not cache */
if (get_swap_device_info(si)) {
if (si->flags & SWP_WRITEOK) {
- offset = cluster_alloc_swap_entry(si, 0, 1);
+ offset = cluster_alloc_swap_entry(si, NULL);
if (offset) {
entry = swp_entry(si->type, offset);
atomic_long_dec(&nr_swap_pages);
--
2.49.0