Message-ID: <CAKEwX=PD8LNvacFP_JF=KO7R3mfjzTp+HyoEhGe06FQCOT0PaA@mail.gmail.com>
Date: Thu, 20 Feb 2025 15:28:42 -0800
From: Nhat Pham <nphamcs@...il.com>
To: Kanchana P Sridhar <kanchana.p.sridhar@...el.com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org, hannes@...xchg.org,
yosry.ahmed@...ux.dev, chengming.zhou@...ux.dev, usamaarif642@...il.com,
ryan.roberts@....com, 21cnbao@...il.com, akpm@...ux-foundation.org,
linux-crypto@...r.kernel.org, herbert@...dor.apana.org.au,
davem@...emloft.net, clabbe@...libre.com, ardb@...nel.org,
ebiggers@...gle.com, surenb@...gle.com, kristen.c.accardi@...el.com,
wajdi.k.feghali@...el.com, vinodh.gopal@...el.com
Subject: Re: [PATCH v6 16/16] mm: zswap: Fix for zstd performance regression
with 2M folios.
On Wed, Feb 5, 2025 at 11:21 PM Kanchana P Sridhar
<kanchana.p.sridhar@...el.com> wrote:
>
> With the previous patch, which enables support for batch compression in
> zswap_compress_folio(), a 6.2% throughput regression was seen with zstd and
> 2M folios using vm-scalability/usemem.
>
> For compressors that don't support batching, this was root-caused to the
> following zswap_store_folio() structure:
>
> Batched stores:
> ---------------
> - Allocate all entries,
> - Compress all entries,
> - Store all entries in xarray/LRU.
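>
> Schematically, for a folio with N pages this amounts to (a simplified
> sketch of the flow, not the exact code):
>
>	for (i = 0; i < N; i++)
>		entries[i] = zswap_entry_cache_alloc(...);
>	zswap_compress_folio(folio, entries, pool);	/* all N pages */
>	for (i = 0; i < N; i++)
>		xa_store(...), zswap_lru_add(...);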
Can you clarify why the above structure leads to a performance regression?
>
> Hence, the above structure is maintained only for batched stores, and the
> following structure is implemented for sequential stores of large folio pages.
> This fixes the zstd regression while preserving common code paths for batched
> and sequential stores of a folio:
>
> Sequential stores:
> ------------------
> For each page in folio:
> - allocate an entry,
> - compress the page,
> - store the entry in xarray/LRU.
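>
> i.e., roughly (again a simplified sketch):
>
>	for (i = 0; i < N; i++) {
>		entries[i] = zswap_entry_cache_alloc(...);
>		zswap_compress(folio_page(folio, i), entries[i], ...);
>		xa_store(...), zswap_lru_add(...);
>	}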
>
> This is submitted as a separate patch only for code review purposes. I will
> squash this with the previous commit in subsequent versions of this
> patch series.
>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@...el.com>
> ---
> mm/zswap.c | 193 ++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 111 insertions(+), 82 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index f1cba77eda62..7bfc720a6201 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1592,40 +1592,32 @@ static bool zswap_batch_compress(struct folio *folio,
> return ret;
> }
>
> -static bool zswap_compress_folio(struct folio *folio,
> - struct zswap_entry *entries[],
> - struct zswap_pool *pool)
> +static __always_inline bool zswap_compress_folio(struct folio *folio,
> + struct zswap_entry *entries[],
> + long from_index,
> + struct zswap_pool *pool,
> + unsigned int batch_size,
> + struct crypto_acomp_ctx *acomp_ctx)
> {
> - long index, nr_pages = folio_nr_pages(folio);
> - struct crypto_acomp_ctx *acomp_ctx;
> - unsigned int batch_size;
> + long index = from_index, nr_pages = folio_nr_pages(folio);
> bool ret = true;
>
> - acomp_ctx = acomp_ctx_get_cpu_lock(pool);
> - batch_size = acomp_ctx->nr_reqs;
> -
> if ((batch_size > 1) && (nr_pages > 1)) {
> - for (index = 0; index < nr_pages; index += batch_size) {
> + for (; index < nr_pages; index += batch_size) {
>
> if (!zswap_batch_compress(folio, index, batch_size,
> &entries[index], pool, acomp_ctx)) {
> ret = false;
> - goto unlock_acomp_ctx;
> + break;
> }
> }
> } else {
> - for (index = 0; index < nr_pages; ++index) {
> - struct page *page = folio_page(folio, index);
> + struct page *page = folio_page(folio, index);
>
> - if (!zswap_compress(page, entries[index], pool, acomp_ctx)) {
> - ret = false;
> - goto unlock_acomp_ctx;
> - }
> - }
> + if (!zswap_compress(page, entries[index], pool, acomp_ctx))
> + ret = false;
> }
>
> -unlock_acomp_ctx:
> - acomp_ctx_put_unlock(acomp_ctx);
> return ret;
> }
>
> @@ -1637,92 +1629,128 @@ static bool zswap_compress_folio(struct folio *folio,
> * handles to ERR_PTR(-EINVAL) at allocation time, and the fact that the
> * entry's handle is subsequently modified only upon a successful zpool_malloc()
> * after the page is compressed.
> + *
> + * For compressors that don't support batching, the following structure
> + * showed a performance regression with zstd/2M folios:
> + *
> + * Batched stores:
> + * ---------------
> + * - Allocate all entries,
> + * - Compress all entries,
> + * - Store all entries in xarray/LRU.
> + *
> + * Hence, the above structure is maintained only for batched stores, and the
> + * following structure is implemented for sequential stores of large folio pages.
> + * This fixes the regression while preserving common code paths for batched
> + * and sequential stores of a folio:
> + *
> + * Sequential stores:
> + * ------------------
> + * For each page in folio:
> + * - allocate an entry,
> + * - compress the page,
> + * - store the entry in xarray/LRU.
> */
> static bool zswap_store_folio(struct folio *folio,
> struct obj_cgroup *objcg,
> struct zswap_pool *pool)
> {
> - long index, from_index = 0, nr_pages = folio_nr_pages(folio);
> + long index = 0, from_index = 0, nr_pages, nr_folio_pages = folio_nr_pages(folio);
> struct zswap_entry **entries = NULL;
> + struct crypto_acomp_ctx *acomp_ctx;
> int node_id = folio_nid(folio);
> + unsigned int batch_size;
>
> - entries = kmalloc(nr_pages * sizeof(*entries), GFP_KERNEL);
> + entries = kmalloc(nr_folio_pages * sizeof(*entries), GFP_KERNEL);
> if (!entries)
> return false;
>
> - for (index = 0; index < nr_pages; ++index) {
> - entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
> + acomp_ctx = acomp_ctx_get_cpu_lock(pool);
> + batch_size = acomp_ctx->nr_reqs;
>
> - if (!entries[index]) {
> - zswap_reject_kmemcache_fail++;
> - nr_pages = index;
> - goto store_folio_failed;
> + nr_pages = (batch_size > 1) ? nr_folio_pages : 1;
> +
> + while (1) {
> + for (index = from_index; index < nr_pages; ++index) {
> + entries[index] = zswap_entry_cache_alloc(GFP_KERNEL, node_id);
> +
> + if (!entries[index]) {
> + zswap_reject_kmemcache_fail++;
> + nr_pages = index;
> + goto store_folio_failed;
> + }
> +
> + entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
> }
>
> - entries[index]->handle = (unsigned long)ERR_PTR(-EINVAL);
> - }
> + if (!zswap_compress_folio(folio, entries, from_index, pool, batch_size, acomp_ctx))
> + goto store_folio_failed;
>
> - if (!zswap_compress_folio(folio, entries, pool))
> - goto store_folio_failed;
> + for (index = from_index; index < nr_pages; ++index) {
> + swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
> + struct zswap_entry *old, *entry = entries[index];
>
> - for (index = 0; index < nr_pages; ++index) {
> - swp_entry_t page_swpentry = page_swap_entry(folio_page(folio, index));
> - struct zswap_entry *old, *entry = entries[index];
> + old = xa_store(swap_zswap_tree(page_swpentry),
> + swp_offset(page_swpentry),
> + entry, GFP_KERNEL);
> + if (xa_is_err(old)) {
> + int err = xa_err(old);
>
> - old = xa_store(swap_zswap_tree(page_swpentry),
> - swp_offset(page_swpentry),
> - entry, GFP_KERNEL);
> - if (xa_is_err(old)) {
> - int err = xa_err(old);
> + WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> + zswap_reject_alloc_fail++;
> + from_index = index;
> + goto store_folio_failed;
> + }
>
> - WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> - zswap_reject_alloc_fail++;
> - from_index = index;
> - goto store_folio_failed;
> - }
> + /*
> + * We may have had an existing entry that became stale when
> + * the folio was redirtied and now the new version is being
> + * swapped out. Get rid of the old.
> + */
> + if (old)
> + zswap_entry_free(old);
>
> - /*
> - * We may have had an existing entry that became stale when
> - * the folio was redirtied and now the new version is being
> - * swapped out. Get rid of the old.
> - */
> - if (old)
> - zswap_entry_free(old);
> + /*
> + * The entry is successfully compressed and stored in the tree, there is
> + * no further possibility of failure. Grab refs to the pool and objcg,
> + * charge zswap memory, and increment zswap_stored_pages.
> + * The opposite actions will be performed by zswap_entry_free()
> + * when the entry is removed from the tree.
> + */
> + zswap_pool_get(pool);
> + if (objcg) {
> + obj_cgroup_get(objcg);
> + obj_cgroup_charge_zswap(objcg, entry->length);
> + }
> + atomic_long_inc(&zswap_stored_pages);
>
> - /*
> - * The entry is successfully compressed and stored in the tree, there is
> - * no further possibility of failure. Grab refs to the pool and objcg,
> - * charge zswap memory, and increment zswap_stored_pages.
> - * The opposite actions will be performed by zswap_entry_free()
> - * when the entry is removed from the tree.
> - */
> - zswap_pool_get(pool);
> - if (objcg) {
> - obj_cgroup_get(objcg);
> - obj_cgroup_charge_zswap(objcg, entry->length);
> + /*
> + * We finish initializing the entry while it's already in xarray.
> + * This is safe because:
> + *
> + * 1. Concurrent stores and invalidations are excluded by folio lock.
> + *
> + * 2. Writeback is excluded by the entry not being on the LRU yet.
> + * The publishing order matters to prevent writeback from seeing
> + * an incoherent entry.
> + */
> + entry->pool = pool;
> + entry->swpentry = page_swpentry;
> + entry->objcg = objcg;
> + entry->referenced = true;
> + if (entry->length) {
> + INIT_LIST_HEAD(&entry->lru);
> + zswap_lru_add(&zswap_list_lru, entry);
> + }
> }
> - atomic_long_inc(&zswap_stored_pages);
>
> - /*
> - * We finish initializing the entry while it's already in xarray.
> - * This is safe because:
> - *
> - * 1. Concurrent stores and invalidations are excluded by folio lock.
> - *
> - * 2. Writeback is excluded by the entry not being on the LRU yet.
> - * The publishing order matters to prevent writeback from seeing
> - * an incoherent entry.
> - */
> - entry->pool = pool;
> - entry->swpentry = page_swpentry;
> - entry->objcg = objcg;
> - entry->referenced = true;
> - if (entry->length) {
> - INIT_LIST_HEAD(&entry->lru);
> - zswap_lru_add(&zswap_list_lru, entry);
> - }
> + from_index = nr_pages++;
> +
> + if (nr_pages > nr_folio_pages)
> + break;
> }
>
> + acomp_ctx_put_unlock(acomp_ctx);
> kfree(entries);
> return true;
>
> @@ -1734,6 +1762,7 @@ static bool zswap_store_folio(struct folio *folio,
> zswap_entry_cache_free(entries[index]);
> }
>
> + acomp_ctx_put_unlock(acomp_ctx);
> kfree(entries);
> return false;
> }
> --
> 2.27.0
>