lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CACePvbU+NviPmpXQAJUrY4rTqmY_rvYy2JvDBAfT290GmQmfZg@mail.gmail.com>
Date: Wed, 12 Nov 2025 06:44:23 -0800
From: Chris Li <chrisl@...nel.org>
To: Youngjun Park <youngjun.park@....com>
Cc: akpm@...ux-foundation.org, linux-mm@...ck.org, cgroups@...r.kernel.org, 
	linux-kernel@...r.kernel.org, kasong@...cent.com, hannes@...xchg.org, 
	mhocko@...nel.org, roman.gushchin@...ux.dev, shakeel.butt@...ux.dev, 
	muchun.song@...ux.dev, shikemeng@...weicloud.com, nphamcs@...il.com, 
	bhe@...hat.com, baohua@...nel.org, gunho.lee@....com, taejoon.song@....com
Subject: Re: [PATCH 3/3] mm/swap: integrate swap tier infrastructure into swap subsystem

It seems you should introduce the tiers first. Allow users to define tiers.
Then the follow up patches use tiers defined here.

The patch order seems reversed to me.

See some feedback below, to be continued.

Chhris


On Sun, Nov 9, 2025 at 4:50 AM Youngjun Park <youngjun.park@....com> wrote:
>
> Integrate the swap tier infrastructure into the existing swap subsystem
> to enable selective swap device usage based on tier configuration.
>
> Signed-off-by: Youngjun Park <youngjun.park@....com>
> ---
>  mm/memcontrol.c | 69 ++++++++++++++++++++++++++++++++++++
>  mm/page_io.c    | 21 ++++++++++-
>  mm/swap_state.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
>  mm/swapfile.c   | 15 ++++++--
>  4 files changed, 194 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index bfc986da3289..33c7cc069754 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -68,6 +68,7 @@
>  #include <net/ip.h>
>  #include "slab.h"
>  #include "memcontrol-v1.h"
> +#include "swap_tier.h"
>
>  #include <linux/uaccess.h>
>
> @@ -3730,6 +3731,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
>  {
>         lru_gen_exit_memcg(memcg);
>         memcg_wb_domain_exit(memcg);
> +       swap_tiers_put_mask(memcg);
>         __mem_cgroup_free(memcg);
>  }
>
> @@ -3842,6 +3844,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>                 page_counter_init(&memcg->kmem, &parent->kmem, false);
>                 page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
>  #endif
> +#ifdef CONFIG_SWAP_TIER
> +               memcg->tiers_mask = 0;
> +               memcg->tiers_onoff = 0;
> +#endif
> +
>         } else {
>                 init_memcg_stats();
>                 init_memcg_events();
> @@ -3850,6 +3857,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>  #ifdef CONFIG_MEMCG_V1
>                 page_counter_init(&memcg->kmem, NULL, false);
>                 page_counter_init(&memcg->tcpmem, NULL, false);
> +#endif
> +#ifdef CONFIG_SWAP_TIER
Again, don't need this config.

> +               memcg->tiers_mask = DEFAULT_FULL_MASK;

Is this  memcg->tiers_mask a cached value after evaluating the
swap.tiers onoff list by looking up the parent?

I was thinking of starting with always evaluating the tiers_mask. Then
we don't need to store it here. How do you indicate the tiers_mask is
out of date?

> +               memcg->tiers_onoff = DEFAULT_ON_MASK;
>  #endif
>                 root_mem_cgroup = memcg;
>                 return &memcg->css;
> @@ -5390,6 +5401,56 @@ static int swap_events_show(struct seq_file *m, void *v)
>         return 0;
>  }
>
> +#ifdef CONFIG_SWAP_TIER
> +static int swap_tier_show(struct seq_file *m, void *v)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
> +
> +       swap_tiers_show_memcg(m, memcg);
> +       return 0;
> +}
> +
> +static ssize_t swap_tier_write(struct kernfs_open_file *of,
> +                               char *buf, size_t nbytes, loff_t off)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +       struct tiers_desc desc[MAX_SWAPTIER] = {};
> +       char *pos = buf, *token;
> +       int nr = 0;
> +       int ret;
> +
> +       while ((token = strsep(&pos, " \t\n")) != NULL) {

Not allowing plain space " "? Compare pointer != NULL is redundant.

> +               if (!*token)
> +                       continue;
> +
> +               if (nr >= MAX_SWAPTIER)
> +                       return -E2BIG;
> +
> +               if (token[0] != '+' && token[0] != '-')
> +                       return -EINVAL;
> +
> +               desc[nr].ops = (token[0] == '+') ? TIER_ON_MASK : TIER_OFF_MASK;
> +
> +               if (strlen(token) <= 1) {
> +                       strscpy(desc[nr].name, DEFAULT_TIER_NAME);
> +                       nr++;
> +                       continue;
> +               }
> +
> +               if (strscpy(desc[nr].name, token + 1, MAX_TIERNAME) < 0)
> +                       return -EINVAL;
> +
> +               nr++;
I don't think you need this nr, you will reach to the end of the
string any way. What if the user specifies the same tier more than
once? It is not optimal but the kernel should take it.

OK, I see what is going on now, this whole desc thing can be greatly
simplified. You shouldn't need to maintain the desc[nr], that desc
array is the onoff mask in my mind. You just need to keep the tier
bits in order.

Notice in the memory.swap.tiers. Except for the default tier pattern,
which always the first one if exists. The rest of the tier + - order
does not matter. You look up the tier name into the tier mask bit.
Just set the onoff bits for that tier.

> +       }
> +
> +       ret = swap_tiers_get_mask(desc, nr, memcg);
> +       if (ret)
> +               return ret;
> +
> +       return nbytes;
> +}
> +#endif
> +
>  static struct cftype swap_files[] = {
>         {
>                 .name = "swap.current",
> @@ -5422,6 +5483,14 @@ static struct cftype swap_files[] = {
>                 .file_offset = offsetof(struct mem_cgroup, swap_events_file),
>                 .seq_show = swap_events_show,
>         },
> +#ifdef CONFIG_SWAP_TIER
> +       {
> +               .name = "swap.tiers",
> +               .flags = CFTYPE_NOT_ON_ROOT,
> +               .seq_show = swap_tier_show,
> +               .write = swap_tier_write,
> +       },
> +#endif
>         { }     /* terminate */
>  };
>
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 3c342db77ce3..2b3b1154a169 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -26,6 +26,7 @@
>  #include <linux/delayacct.h>
>  #include <linux/zswap.h>
>  #include "swap.h"
> +#include "swap_tier.h"
>
>  static void __end_swap_bio_write(struct bio *bio)
>  {
> @@ -233,6 +234,24 @@ static void swap_zeromap_folio_clear(struct folio *folio)
>         }
>  }
>
> +#if defined(CONFIG_SWAP_TIER) && defined(CONFIG_ZSWAP)
> +static bool folio_swap_tier_zswap_test_off(struct folio *folio)
> +{
> +       struct mem_cgroup *memcg;
> +
> +       memcg = folio_memcg(folio);
> +       if (memcg)
> +               return swap_tier_test_off(memcg->tiers_mask,
> +                       TIER_MASK(SWAP_TIER_ZSWAP, TIER_ON_MASK));
> +
> +       return false;
> +}
> +#else
> +static bool folio_swap_tier_zswap_test_off(struct folio *folio)
> +{
> +       return false;
> +}
> +#endif
>  /*
>   * We may have stale swap cache pages in memory: notice
>   * them here and get rid of the unnecessary final write.
> @@ -272,7 +291,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
>          */
>         swap_zeromap_folio_clear(folio);
>
> -       if (zswap_store(folio)) {
> +       if (folio_swap_tier_zswap_test_off(folio) || zswap_store(folio)) {
>                 count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
>                 goto out_unlock;
>         }
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 3f85a1c4cfd9..2e5f65ff2479 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -25,6 +25,7 @@
>  #include "internal.h"
>  #include "swap_table.h"
>  #include "swap.h"
> +#include "swap_tier.h"
>
>  /*
>   * swapper_space is a fiction, retained to simplify the path through
> @@ -836,8 +837,100 @@ static ssize_t vma_ra_enabled_store(struct kobject *kobj,
>  }
>  static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
>
> +#ifdef CONFIG_SWAP_TIER
> +static ssize_t tiers_show(struct kobject *kobj,
> +                                    struct kobj_attribute *attr, char *buf)
> +{
> +       return swap_tiers_show_sysfs(buf);
> +}
> +
> +static ssize_t tiers_store(struct kobject *kobj,
> +                               struct kobj_attribute *attr,
> +                               const char *buf, size_t count)
> +{
> +       struct tiers_desc desc[MAX_SWAPTIER] = {};
> +       int nr = 0;
> +       char *data, *p, *token;
> +       int ret = 0;
> +       bool is_add = true;
> +
> +       if (!count)
> +               return -EINVAL;
> +
> +       data = kmemdup_nul(buf, count, GFP_KERNEL);
> +       if (!data)
> +               return -ENOMEM;
> +
> +       p = data;
> +
> +       if (*p == '+')
> +               p++;
> +       else if (*p == '-') {
> +               is_add = false;
> +               p++;
> +       } else
> +               return -EINVAL;
> +
> +       while ((token = strsep(&p, ", \t\n")) != NULL) {
> +               if (!*token)
> +                       continue;
> +
> +               if (nr >= MAX_SWAPTIER) {
> +                       ret = -E2BIG;
> +                       goto out;
> +               }
> +
> +               if (is_add) {
> +                       char *name, *prio_str;
> +                       int prio;
> +
> +                       name = strsep(&token, ":");
> +                       prio_str = token;
> +
> +                       if (!name || !prio_str || !*name || !*prio_str) {
> +                               ret = -EINVAL;
> +                               goto out;
> +                       }
> +
> +                       if (strscpy(desc[nr].name, name, MAX_TIERNAME) < 0) {
> +                               ret = -EINVAL;
> +                               goto out;
> +                       }
> +
> +                       if (kstrtoint(prio_str, 10, &prio)) {
> +                               ret = -EINVAL;
> +                               goto out;
> +                       }
> +
> +                       desc[nr].prio_st = prio;
> +               } else {
> +                       if (strscpy(desc[nr].name, token, MAX_TIERNAME) < 0) {
> +                               ret = -EINVAL;
> +                               goto out;
> +                       }
> +                       desc[nr].prio_st = 0;
> +               }
> +               nr++;
> +       }
> +
> +       if (is_add)
> +               ret = swap_tiers_add(desc, nr);
> +       else
> +               ret = swap_tiers_remove(desc, nr);
> +
> +out:
> +       kfree(data);
> +       return ret ? ret : count;
> +}
> +
> +static struct kobj_attribute tier_attr = __ATTR_RW(tiers);
> +#endif
> +
>  static struct attribute *swap_attrs[] = {
>         &vma_ra_enabled_attr.attr,
> +#ifdef CONFIG_SWAP_TIER
> +       &tier_attr.attr,
> +#endif
>         NULL,
>  };
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index a5c90e419ff3..8715a2d94140 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -49,6 +49,7 @@
>  #include "swap_table.h"
>  #include "internal.h"
>  #include "swap.h"
> +#include "swap_tier.h"
>
>  static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
>                                  unsigned char);
> @@ -1296,7 +1297,8 @@ static bool get_swap_device_info(struct swap_info_struct *si)
>
>  /* Rotate the device and switch to a new cluster */
>  static void swap_alloc_entry(swp_entry_t *entry,
> -                           int order)
> +                           int order,
> +                           int mask)
>  {
>         unsigned long offset;
>         struct swap_info_struct *si, *next;
> @@ -1304,6 +1306,8 @@ static void swap_alloc_entry(swp_entry_t *entry,
>         spin_lock(&swap_avail_lock);
>  start_over:
>         plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
> +               if (swap_tiers_test_off(si->tier_idx, mask))
> +                       continue;
>                 /* Rotate the device and switch to a new cluster */
>                 plist_requeue(&si->avail_list, &swap_avail_head);
>                 spin_unlock(&swap_avail_lock);
> @@ -1376,6 +1380,7 @@ int folio_alloc_swap(struct folio *folio)
>  {
>         unsigned int order = folio_order(folio);
>         unsigned int size = 1 << order;
> +       int mask;
>         swp_entry_t entry = {};
>
>         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
> @@ -1400,8 +1405,8 @@ int folio_alloc_swap(struct folio *folio)
>         }
>
>  again:
> -       swap_alloc_entry(&entry, order);
> -
> +       mask = swap_tiers_collect_compare_mask(folio_memcg(folio));
> +       swap_alloc_entry(&entry, order, mask);
>         if (unlikely(!order && !entry.val)) {
>                 if (swap_sync_discard())
>                         goto again;
> @@ -2673,6 +2678,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
>
>         /* Add back to available list */
>         add_to_avail_list(si, true);
> +
> +       swap_tiers_assign(si);
>  }
>
>  static void enable_swap_info(struct swap_info_struct *si, int prio,
> @@ -2840,6 +2847,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>         spin_lock(&swap_lock);
>         spin_lock(&p->lock);
>         drain_mmlist();
> +       swap_tiers_release(p);
>
>         swap_file = p->swap_file;
>         p->swap_file = NULL;
> @@ -4004,6 +4012,7 @@ static int __init swapfile_init(void)
>                 swap_migration_ad_supported = true;
>  #endif /* CONFIG_MIGRATION */
>
> +       swap_tiers_init();
>         return 0;
>  }
>  subsys_initcall(swapfile_init);
> --
> 2.34.1
>
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ