[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20241018064805.336490-6-kanchana.p.sridhar@intel.com>
Date: Thu, 17 Oct 2024 23:48:03 -0700
From: Kanchana P Sridhar <kanchana.p.sridhar@...el.com>
To: linux-kernel@...r.kernel.org,
linux-mm@...ck.org,
hannes@...xchg.org,
yosryahmed@...gle.com,
nphamcs@...il.com,
chengming.zhou@...ux.dev,
usamaarif642@...il.com,
ryan.roberts@....com,
ying.huang@...el.com,
21cnbao@...il.com,
akpm@...ux-foundation.org,
hughd@...gle.com,
willy@...radead.org,
bfoster@...hat.com,
dchinner@...hat.com,
chrisl@...nel.org,
david@...hat.com
Cc: wajdi.k.feghali@...el.com,
vinodh.gopal@...el.com,
kanchana.p.sridhar@...el.com
Subject: [RFC PATCH v1 5/7] mm: swap, zswap: zswap folio_batch processing with IAA decompression batching.
This patch adds the code that processes a "zswap_batch": the batch in
which swap_read_folio() previously stored the swap entries it found in
zswap, so that they can be loaded together.
The newly added zswap_finish_load_batch() API implements the main zswap
load batching functionality. This makes use of the sub-batches of
zswap_entry/xarray/page/source-length readily available from
zswap_add_load_batch(). These sub-batch arrays are processed one at a time,
until the entire zswap folio_batch has been loaded. The existing
zswap_load() behavior of deleting the zswap_entry of a folio found in
the swapcache is preserved.
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@...el.com>
---
include/linux/zswap.h | 22 ++++++
mm/page_io.c | 35 +++++++++
mm/swap.h | 17 +++++
mm/zswap.c | 171 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 245 insertions(+)
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 1d6de281f243..a0792c2b300a 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -110,6 +110,15 @@ struct zswap_store_pipeline_state {
u8 nr_comp_pages;
};
+/*
+ * Working view of one sub-batch of a zswap_decomp_batch while it is being
+ * loaded. Every member is filled in from the batch's per-sub-batch arrays
+ * by zswap_load_sub_batch_init().
+ *
+ * Note: nr_decomp, and the u8 loop counters that iterate up to it, cannot
+ * represent a value above 255. If SWAP_CRYPTO_SUB_BATCH_SIZE exceeds 255,
+ * change the u8 to u16.
+ */
+struct zswap_load_sub_batch_state {
+ struct xarray **trees; /* xarray pointers of the sub-batch; not dereferenced by the load path in this patch */
+ struct zswap_entry **entries; /* zswap entries to decompress */
+ struct page **pages; /* pages handed to the batch decompressor */
+ unsigned int *slens; /* source lengths handed to the batch decompressor */
+ u8 nr_decomp; /* number of slots in use in the arrays above */
+};
+
bool zswap_store_batching_enabled(void);
void __zswap_store_batch(struct swap_in_memory_cache_cb *simc);
void __zswap_store_batch_single(struct swap_in_memory_cache_cb *simc);
@@ -136,6 +145,14 @@ static inline bool zswap_add_load_batch(
return false;
}
+/* Out-of-line worker, defined in mm/zswap.c. */
+void __zswap_finish_load_batch(struct zswap_decomp_batch *zd_batch);
+
+/*
+ * Load every folio queued in @zd_batch. Does nothing unless
+ * zswap_load_batching_enabled() returns true.
+ */
+static inline void zswap_finish_load_batch(struct zswap_decomp_batch *zd_batch)
+{
+	if (!zswap_load_batching_enabled())
+		return;
+
+	__zswap_finish_load_batch(zd_batch);
+}
+
unsigned long zswap_total_pages(void);
bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio);
@@ -188,6 +205,11 @@ static inline bool zswap_add_load_batch(
return false;
}
+/*
+ * Stub variant: finishing a load batch is a no-op.
+ * NOTE(review): presumably the !CONFIG_ZSWAP half of the header; the
+ * enclosing #ifdef is outside this hunk - confirm.
+ */
+static inline void zswap_finish_load_batch(struct zswap_decomp_batch *zd_batch)
+{
+}
+
static inline bool zswap_store(struct folio *folio)
{
return false;
diff --git a/mm/page_io.c b/mm/page_io.c
index 9750302d193b..aa83221318ef 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -816,6 +816,41 @@ bool swap_read_folio(struct folio *folio, struct swap_iocb **plug,
return true;
}
+/*
+ * Unlock every folio held in @zswap_batch's folio_batch, in batch order.
+ */
+static void __swap_post_process_zswap_load_batch(
+	struct zswap_decomp_batch *zswap_batch)
+{
+	struct folio_batch *fbatch = &zswap_batch->fbatch;
+	u8 idx = 0;
+
+	while (idx < folio_batch_count(fbatch))
+		folio_unlock(fbatch->folios[idx++]);
+}
+
+/*
+ * Load and then unlock all folios queued in @zswap_batch. An empty batch
+ * returns immediately without touching any accounting.
+ *
+ * The swapin_readahead batching interface makes sure that the input
+ * zswap_batch consists of folios belonging to the same swap device type.
+ *
+ * The batch load is bracketed by PSI memstall and swapin delay accounting.
+ * @splug is accepted but not used by this function.
+ *
+ * NOTE(review): the memstall section is entered for every non-empty batch,
+ * whether or not any folio in it is a workingset folio - confirm this is
+ * the intended accounting policy.
+ */
+void __swap_read_zswap_batch_unplug(struct zswap_decomp_batch *zswap_batch,
+	struct swap_iocb **splug)
+{
+	struct folio_batch *queued = &zswap_batch->fbatch;
+	unsigned long pflags;
+
+	if (folio_batch_count(queued) == 0)
+		return;
+
+	psi_memstall_enter(&pflags);
+	delayacct_swapin_start();
+
+	/* Decompress everything that was queued, then drop the folio locks. */
+	zswap_finish_load_batch(zswap_batch);
+	__swap_post_process_zswap_load_batch(zswap_batch);
+
+	psi_memstall_leave(&pflags);
+	delayacct_swapin_end();
+}
+
void __swap_read_unplug(struct swap_iocb *sio)
{
struct iov_iter from;
diff --git a/mm/swap.h b/mm/swap.h
index 310f99007fe6..2b82c8ed765c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -125,6 +125,16 @@ struct swap_iocb;
bool swap_read_folio(struct folio *folio, struct swap_iocb **plug,
struct zswap_decomp_batch *zswap_batch,
struct folio_batch *non_zswap_batch);
+/* Out-of-line worker, defined in mm/page_io.c. */
+void __swap_read_zswap_batch_unplug(struct zswap_decomp_batch *zswap_batch,
+				    struct swap_iocb **splug);
+
+/*
+ * Flush a pending zswap load batch, if the caller has one. A NULL
+ * @zswap_batch is tolerated and ignored.
+ */
+static inline void swap_read_zswap_batch_unplug(
+	struct zswap_decomp_batch *zswap_batch,
+	struct swap_iocb **splug)
+{
+	if (unlikely(!zswap_batch))
+		return;
+
+	__swap_read_zswap_batch_unplug(zswap_batch, splug);
+}
void __swap_read_unplug(struct swap_iocb *plug);
static inline void swap_read_unplug(struct swap_iocb *plug)
{
@@ -268,6 +278,13 @@ static inline bool swap_read_folio(struct folio *folio, struct swap_iocb **plug,
{
return false;
}
+
+/*
+ * Stub variant: there is never a zswap load batch to flush.
+ * NOTE(review): presumably the !CONFIG_SWAP half of the header; the
+ * enclosing #ifdef is outside this hunk - confirm.
+ */
+static inline void swap_read_zswap_batch_unplug(
+	struct zswap_decomp_batch *zswap_batch, struct swap_iocb **splug)
+{
+}
+
static inline void swap_write_unplug(struct swap_iocb *sio)
{
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 1d293f95d525..39bf7d8810e9 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -35,6 +35,7 @@
#include <linux/pagemap.h>
#include <linux/workqueue.h>
#include <linux/list_lru.h>
+#include <linux/delayacct.h>
#include "swap.h"
#include "internal.h"
@@ -2401,6 +2402,176 @@ bool __zswap_add_load_batch(struct zswap_decomp_batch *zd_batch,
return true;
}
+/*
+ * Point @zls at the slices of @zd_batch that describe sub-batch @sb:
+ * its trees, entries, pages and source lengths, plus the number of
+ * entries to decompress.
+ */
+static __always_inline void zswap_load_sub_batch_init(
+	struct zswap_decomp_batch *zd_batch,
+	unsigned int sb,
+	struct zswap_load_sub_batch_state *zls)
+{
+	*zls = (struct zswap_load_sub_batch_state) {
+		.trees		= zd_batch->trees[sb],
+		.entries	= zd_batch->entries[sb],
+		.pages		= zd_batch->pages[sb],
+		.slens		= zd_batch->slens[sb],
+		.nr_decomp	= zd_batch->nr_decomp[sb],
+	};
+}
+
+/*
+ * Copy the compressed data of every entry in the sub-batch out of its
+ * zpool and into the matching slot of @srcs. Each handle stays mapped
+ * (ZPOOL_MM_RO) only for the duration of its own copy.
+ *
+ * Each @srcs slot must be able to hold entry->length bytes; no bound is
+ * checked here.
+ */
+static void zswap_load_map_sources(
+	struct zswap_load_sub_batch_state *zls,
+	u8 *srcs[])
+{
+	u8 slot;
+
+	for (slot = 0; slot < zls->nr_decomp; ++slot) {
+		struct zswap_entry *entry = zls->entries[slot];
+		struct zpool *zpool = entry->pool->zpool;
+		u8 *mapped;
+
+		mapped = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+		memcpy(srcs[slot], mapped, entry->length);
+		zpool_unmap_handle(zpool, entry->handle);
+	}
+}
+
+/*
+ * Decompress the sub-batch described by @zls in one batched call.
+ *
+ * @srcs:          compressed source buffers, one per entry.
+ * @decomp_errors: per-entry status array filled in by the decompressor
+ *                 (presumably 0 on success - the caller BUG_ON()s on any
+ *                 non-zero slot).
+ * @acomp_ctx:     the compression context to use. It must be the context
+ *                 whose mutex the caller currently holds. It is passed in
+ *                 rather than looked up again with raw_cpu_ptr(), because
+ *                 the task may have migrated to another CPU while
+ *                 sleeping in mutex_lock(), and a second lookup could
+ *                 then return a different, unlocked context.
+ */
+static void zswap_decompress_batch(
+	struct zswap_load_sub_batch_state *zls,
+	u8 *srcs[],
+	int decomp_errors[],
+	struct crypto_acomp_ctx *acomp_ctx)
+{
+	swap_crypto_acomp_decompress_batch(
+		srcs,
+		zls->pages,
+		zls->slens,
+		decomp_errors,
+		zls->nr_decomp,
+		acomp_ctx);
+}
+
+/*
+ * Post-decompression bookkeeping for sub-batch @sb: count the ZSWPIN
+ * events, free the zswap entry of every folio flagged in
+ * zd_batch->swapcache[] (marking that folio dirty), and mark every folio
+ * uptodate.
+ *
+ * Any non-zero slot in @decomp_errors is fatal (BUG_ON).
+ *
+ * NOTE(review): zswap_entry_free() is called without removing the entry
+ * from an xarray here; this assumes the entry was already erased when it
+ * was added to the batch - confirm against __zswap_add_load_batch().
+ */
+static void zswap_load_batch_updates(
+	struct zswap_decomp_batch *zd_batch,
+	unsigned int sb,
+	struct zswap_load_sub_batch_state *zls,
+	int decomp_errors[])
+{
+	u8 i;
+
+	for (i = 0; i < zls->nr_decomp; ++i) {
+		/* Position of this entry's folio within the whole batch. */
+		unsigned int j = (sb * SWAP_CRYPTO_SUB_BATCH_SIZE) + i;
+		struct folio *folio = zd_batch->fbatch.folios[j];
+		struct zswap_entry *entry = zls->entries[i];
+
+		BUG_ON(decomp_errors[i]);
+		count_vm_event(ZSWPIN);
+		if (entry->objcg)
+			count_objcg_events(entry->objcg, ZSWPIN, 1);
+
+		if (zd_batch->swapcache[j]) {
+			zswap_entry_free(entry);
+			folio_mark_dirty(folio);
+		}
+
+		folio_mark_uptodate(folio);
+	}
+}
+
+/*
+ * Decompress sub-batch @sb of @zd_batch and apply the resulting updates.
+ *
+ * The acomp context is looked up once, from the pool of the first entry,
+ * and its mutex is held while the compressed data is copied into the
+ * context's buffers and decompressed. The same context pointer is handed
+ * to zswap_decompress_batch() so that the buffers, the mutex and the
+ * decompression all belong to one context.
+ *
+ * The caller must ensure zls->nr_decomp is non-zero, since entries[0] is
+ * dereferenced unconditionally.
+ *
+ * NOTE(review): this assumes every entry of the sub-batch shares the pool
+ * of entries[0] - confirm.
+ */
+static void zswap_load_decomp_batch(
+	struct zswap_decomp_batch *zd_batch,
+	unsigned int sb,
+	struct zswap_load_sub_batch_state *zls)
+{
+	int decomp_errors[SWAP_CRYPTO_SUB_BATCH_SIZE];
+	struct crypto_acomp_ctx *acomp_ctx;
+
+	acomp_ctx = raw_cpu_ptr(zls->entries[0]->pool->acomp_ctx);
+	mutex_lock(&acomp_ctx->mutex);
+
+	zswap_load_map_sources(zls, acomp_ctx->buffer);
+
+	zswap_decompress_batch(zls, acomp_ctx->buffer, decomp_errors,
+			       acomp_ctx);
+
+	mutex_unlock(&acomp_ctx->mutex);
+
+	zswap_load_batch_updates(zd_batch, sb, zls, decomp_errors);
+}
+
+/*
+ * Begin thrashing delay accounting for sub-batch @sb.
+ *
+ * For each entry i of the sub-batch, workingset[i] records whether the
+ * matching folio has the workingset flag set; only for those folios is
+ * delayacct_thrashing_start() called, with in_thrashing[i] as its state.
+ * Slots of non-workingset folios are set to false so that no element of
+ * in_thrashing[] below nr_decomp is left indeterminate.
+ *
+ * Both arrays must have room for at least zls->nr_decomp elements.
+ */
+static void zswap_load_start_accounting(
+	struct zswap_decomp_batch *zd_batch,
+	unsigned int sb,
+	struct zswap_load_sub_batch_state *zls,
+	bool workingset[],
+	bool in_thrashing[])
+{
+	u8 i;
+
+	for (i = 0; i < zls->nr_decomp; ++i) {
+		/* Position of this entry's folio within the whole batch. */
+		unsigned int j = (sb * SWAP_CRYPTO_SUB_BATCH_SIZE) + i;
+		struct folio *folio = zd_batch->fbatch.folios[j];
+
+		workingset[i] = folio_test_workingset(folio);
+		in_thrashing[i] = false;
+		if (workingset[i])
+			delayacct_thrashing_start(&in_thrashing[i]);
+	}
+}
+
+/*
+ * End thrashing delay accounting for the sub-batch described by @zls:
+ * delayacct_thrashing_end() is called for every slot whose workingset[]
+ * flag is set. @zd_batch is not used.
+ */
+static void zswap_load_end_accounting(
+	struct zswap_decomp_batch *zd_batch,
+	struct zswap_load_sub_batch_state *zls,
+	bool workingset[],
+	bool in_thrashing[])
+{
+	u8 slot;
+
+	for (slot = 0; slot < zls->nr_decomp; ++slot) {
+		if (!workingset[slot])
+			continue;
+
+		delayacct_thrashing_end(&in_thrashing[slot]);
+	}
+}
+
+/*
+ * Load every folio queued in @zd_batch, working through it in sub-batches
+ * of SWAP_CRYPTO_SUB_BATCH_SIZE entries until the whole folio_batch has
+ * been processed.
+ *
+ * All entries in a zd_batch belong to the same swap device.
+ */
+void __zswap_finish_load_batch(struct zswap_decomp_batch *zd_batch)
+{
+	bool workingset[SWAP_CRYPTO_SUB_BATCH_SIZE];
+	bool in_thrashing[SWAP_CRYPTO_SUB_BATCH_SIZE];
+	unsigned int nr_sb, sb;
+
+	nr_sb = DIV_ROUND_UP(folio_batch_count(&zd_batch->fbatch),
+			     SWAP_CRYPTO_SUB_BATCH_SIZE);
+
+	for (sb = 0; sb < nr_sb; ++sb) {
+		struct zswap_load_sub_batch_state zls;
+
+		zswap_load_sub_batch_init(zd_batch, sb, &zls);
+
+		zswap_load_start_accounting(zd_batch, sb, &zls,
+					    workingset, in_thrashing);
+
+		/* Decompress the sub-batch, unless it holds no entries. */
+		if (zls.nr_decomp)
+			zswap_load_decomp_batch(zd_batch, sb, &zls);
+
+		/*
+		 * Open question: should zswap_entries be freed here, as in
+		 * zswap_load()?
+		 * With the new swapin_readahead batching interface, all
+		 * prefetch entries are read into the swapcache. Freeing
+		 * the zswap entries here causes segfaults, most probably
+		 * because a page-fault occurred while the buffer was being
+		 * decompressed.
+		 * Allowing the regular folio_free_swap() sequence in
+		 * do_swap_page() appears to keep things stable without
+		 * duplicated zswap-swapcache memory, as far as I can tell
+		 * from my testing.
+		 */
+
+		zswap_load_end_accounting(zd_batch, &zls,
+					  workingset, in_thrashing);
+	}
+}
+
void zswap_invalidate(swp_entry_t swp)
{
pgoff_t offset = swp_offset(swp);
--
2.27.0
Powered by blists - more mailing lists