Message-ID: <CAKEwX=OYs6WAJvPoiLArvO+HGf-ZF6USoJ+QUuaVXzhoXcexHw@mail.gmail.com>
Date: Tue, 12 Mar 2024 16:12:32 +0700
From: Nhat Pham <nphamcs@...il.com>
To: Johannes Weiner <hannes@...xchg.org>
Cc: Andrew Morton <akpm@...ux-foundation.org>, Yosry Ahmed <yosryahmed@...gle.com>,
Chengming Zhou <zhouchengming@...edance.com>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking
On Mon, Mar 11, 2024 at 11:12 PM Johannes Weiner <hannes@...xchg.org> wrote:
>
> Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> total cycles currently going into updating the zswap_pool_total_size.
>
> There are three consumers of this counter:
> - store, to enforce the globally configured pool limit
> - meminfo & debugfs, to report the size to the user
> - shrink, to determine the batch size for each cycle
>
> Instead of aggregating every time an entry enters or exits the zswap
> pool, aggregate the value from the zpools on-demand:
>
> - Stores aggregate the counter anyway upon success. Aggregating to
> check the limit instead is the same amount of work.
>
> - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> counter, but aren't exactly hotpaths.
>
> - Shrinking can aggregate once for every cycle instead of doing it for
> every freed entry. As the shrinker might work on tens or hundreds of
> objects per scan cycle, this is a large reduction in aggregations.
Nice!
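To double-check I'm reading the shape of the change right, here is a
minimal before/after sketch (illustration only -- nr_pools and
pool_bytes() are stand-ins I made up, not names from the patch):

    #include <linux/types.h>

    static int nr_pools;
    static u64 pool_bytes(int i);

    /* Before: every entry add/free paid for a full refresh. */
    static u64 cached_total_bytes;

    static void update_total(void)
    {
            u64 total = 0;
            int i;

            /* walks every backing pool on each store/free */
            for (i = 0; i < nr_pools; i++)
                    total += pool_bytes(i);
            cached_total_bytes = total;
    }

    /* After: only the (comparatively rare) readers pay for the walk. */
    static u64 read_total(void)
    {
            u64 total = 0;
            int i;

            for (i = 0; i < nr_pools; i++)
                    total += pool_bytes(i);
            return total;
    }

That is, the cost moves from the hot writers (store, free) to the rare
readers (limit check, meminfo, shrinker), which is exactly what
zswap_total_pages() below does across the zswap_pools list.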
>
> The paths that benefit dramatically are swapin, swapoff, and
> unmaps. There could be millions of pages being processed until
> somebody asks for the pool size again. This eliminates the pool size
> updates from those paths entirely.
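For scale: a munmap() of a 4G zswapped region means on the order of a
million entry frees (4G / 4K pages), and the old scheme re-walked every
zpool on each of those frees to refresh zswap_pool_total_size. After
this patch those paths do no aggregation at all. (My arithmetic, 4K
pages assumed, just to put a number on "millions".)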
>
> Signed-off-by: Johannes Weiner <hannes@...xchg.org>
With your fixlet applied:
Reviewed-by: Nhat Pham <nphamcs@...il.com>
> ---
> fs/proc/meminfo.c | 3 +-
> include/linux/zswap.h | 2 +-
> mm/zswap.c | 98 +++++++++++++++++++++----------------------
> 3 files changed, 49 insertions(+), 54 deletions(-)
>
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 45af9a989d40..245171d9164b 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "SwapTotal: ", i.totalswap);
> show_val_kb(m, "SwapFree: ", i.freeswap);
> #ifdef CONFIG_ZSWAP
> - seq_printf(m, "Zswap: %8lu kB\n",
> - (unsigned long)(zswap_pool_total_size >> 10));
> + show_val_kb(m, "Zswap: ", zswap_total_pages());
> seq_printf(m, "Zswapped: %8lu kB\n",
> (unsigned long)atomic_read(&zswap_stored_pages) <<
> (PAGE_SHIFT - 10));
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index 341aea490070..2a85b941db97 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -7,7 +7,6 @@
>
> struct lruvec;
>
> -extern u64 zswap_pool_total_size;
> extern atomic_t zswap_stored_pages;
>
> #ifdef CONFIG_ZSWAP
> @@ -27,6 +26,7 @@ struct zswap_lruvec_state {
> atomic_long_t nr_zswap_protected;
> };
>
> +unsigned long zswap_total_pages(void);
> bool zswap_store(struct folio *folio);
> bool zswap_load(struct folio *folio);
> void zswap_invalidate(swp_entry_t swp);
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 9a3237752082..7c39327a7cc2 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -43,8 +43,6 @@
> /*********************************
> * statistics
> **********************************/
> -/* Total bytes used by the compressed storage */
> -u64 zswap_pool_total_size;
> /* The number of compressed pages currently stored in zswap */
> atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> /* The number of same-value filled pages currently stored in zswap */
> @@ -264,45 +262,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
> pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
> zpool_get_type((p)->zpools[0]))
>
> -static bool zswap_is_full(void)
> -{
> - return totalram_pages() * zswap_max_pool_percent / 100 <
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static bool zswap_can_accept(void)
> -{
> - return totalram_pages() * zswap_accept_thr_percent / 100 *
> - zswap_max_pool_percent / 100 >
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static u64 get_zswap_pool_size(struct zswap_pool *pool)
> -{
> - u64 pool_size = 0;
> - int i;
> -
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> - pool_size += zpool_get_total_size(pool->zpools[i]);
> -
> - return pool_size;
> -}
> -
> -static void zswap_update_total_size(void)
> -{
> - struct zswap_pool *pool;
> - u64 total = 0;
> -
> - rcu_read_lock();
> -
> - list_for_each_entry_rcu(pool, &zswap_pools, list)
> - total += get_zswap_pool_size(pool);
> -
> - rcu_read_unlock();
> -
> - zswap_pool_total_size = total;
> -}
> -
> /*********************************
> * pool functions
> **********************************/
> @@ -540,6 +499,28 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> return NULL;
> }
>
> +static unsigned long zswap_max_pages(void)
> +{
> + return totalram_pages() * zswap_max_pool_percent / 100;
> +}
> +
> +unsigned long zswap_total_pages(void)
> +{
> + struct zswap_pool *pool;
> + u64 total = 0;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(pool, &zswap_pools, list) {
> + int i;
> +
> + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + total += zpool_get_total_size(pool->zpools[i]);
> + }
> + rcu_read_unlock();
> +
> + return total >> PAGE_SHIFT;
> +}
> +
> /*********************************
> * param callbacks
> **********************************/
> @@ -912,7 +893,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
> }
> zswap_entry_cache_free(entry);
> atomic_dec(&zswap_stored_pages);
> - zswap_update_total_size();
> }
>
> /*
> @@ -1317,7 +1297,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
> nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
> #else
> /* use pool stats instead of memcg stats */
> - nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
> + nr_backing = zswap_total_pages();
> nr_stored = atomic_read(&zswap_nr_stored);
> #endif
>
> @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> {
> struct mem_cgroup *memcg;
> int ret, failures = 0;
> + unsigned long thr;
> +
> + /* Reclaim down to the accept threshold */
> + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
>
> /* global reclaim will select cgroup in a round-robin fashion. */
> do {
> @@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
> break;
> if (ret && ++failures == MAX_RECLAIM_RETRIES)
> break;
> -
> resched:
> cond_resched();
> - } while (!zswap_can_accept());
> + } while (zswap_total_pages() > thr);
> }
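Back-of-envelope on the shrinker savings (my numbers, assuming a single
pool and ZSWAP_NR_ZPOOLS == 32 as in current mm/zswap.c): a shrink pass
that writes back 100 entries used to trigger 100 * 32 = 3200
zpool_get_total_size() calls through zswap_update_total_size(); now it
is 32 calls per do/while iteration, in the single zswap_total_pages()
check. Computing thr once up front instead of re-deriving it through
zswap_can_accept() on every pass is a nice touch, too.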
>
> static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
> @@ -1476,6 +1459,7 @@ bool zswap_store(struct folio *folio)
> struct zswap_entry *entry, *dupentry;
> struct obj_cgroup *objcg = NULL;
> struct mem_cgroup *memcg = NULL;
> + unsigned long max_pages, cur_pages;
>
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
> VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1487,6 +1471,7 @@ bool zswap_store(struct folio *folio)
> if (!zswap_enabled)
> goto check_old;
>
> + /* Check cgroup limits */
> objcg = get_obj_cgroup_from_folio(folio);
> if (objcg && !obj_cgroup_may_zswap(objcg)) {
> memcg = get_mem_cgroup_from_objcg(objcg);
> @@ -1497,15 +1482,20 @@ bool zswap_store(struct folio *folio)
> mem_cgroup_put(memcg);
> }
>
> - /* reclaim space if needed */
> - if (zswap_is_full()) {
> + /* Check global limits */
> + cur_pages = zswap_total_pages();
> + max_pages = zswap_max_pages();
> +
> + if (cur_pages >= max_pages) {
> zswap_pool_limit_hit++;
> zswap_pool_reached_full = true;
> goto shrink;
> }
>
> if (zswap_pool_reached_full) {
> - if (!zswap_can_accept())
> + unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
> +
> + if (cur_pages > thr)
> goto shrink;
> else
> zswap_pool_reached_full = false;
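The hysteresis is much easier to follow spelled out like this. With
what I believe are the default knobs (zswap_max_pool_percent == 20,
zswap_accept_thr_percent == 90) on a 16G machine with 4K pages:
max_pages = 4194304 * 20 / 100 = 838860, and once the pool has hit
that ceiling, stores keep getting rejected until it drains below
thr = 838860 * 90 / 100 = 754974 pages (~2.9G).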
> @@ -1581,7 +1571,6 @@ bool zswap_store(struct folio *folio)
>
> /* update stats */
> atomic_inc(&zswap_stored_pages);
> - zswap_update_total_size();
> count_vm_event(ZSWPOUT);
>
> return true;
> @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
>
> static struct dentry *zswap_debugfs_root;
>
> +static int debugfs_get_total_size(void *data, u64 *val)
> +{
> + *val = zswap_total_pages() * PAGE_SIZE;
> + return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
> +
> static int zswap_debugfs_init(void)
> {
> if (!debugfs_initialized())
> @@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
> zswap_debugfs_root, &zswap_reject_compress_poor);
> debugfs_create_u64("written_back_pages", 0444,
> zswap_debugfs_root, &zswap_written_back_pages);
> - debugfs_create_u64("pool_total_size", 0444,
> - zswap_debugfs_root, &zswap_pool_total_size);
> + debugfs_create_file("pool_total_size", 0444,
> + zswap_debugfs_root, NULL, &total_size_fops);
> debugfs_create_atomic_t("stored_pages", 0444,
> zswap_debugfs_root, &zswap_stored_pages);
> debugfs_create_atomic_t("same_filled_pages", 0444,
> --
> 2.44.0
>