Message-ID: <Z3ZVayoOMkJ08dZk@MiWiFi-R3L-srv>
Date: Thu, 2 Jan 2025 16:59:23 +0800
From: Baoquan He <bhe@...hat.com>
To: Kairui Song <kasong@...cent.com>
Cc: linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>,
Chris Li <chrisl@...nel.org>, Barry Song <v-songbaohua@...o.com>,
Ryan Roberts <ryan.roberts@....com>,
Hugh Dickins <hughd@...gle.com>,
Yosry Ahmed <yosryahmed@...gle.com>,
"Huang, Ying" <ying.huang@...ux.alibaba.com>,
Nhat Pham <nphamcs@...il.com>, Johannes Weiner <hannes@...xchg.org>,
Kalesh Singh <kaleshsingh@...gle.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3 06/13] mm, swap: clean up plist removal and adding

Hi Kairui,

On 12/31/24 at 01:46am, Kairui Song wrote:
......snip...
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 7963a0c646a4..e6e58cfb5178 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -128,6 +128,26 @@ static inline unsigned char swap_count(unsigned char ent)
> return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
> }
I am reading the swap code, and while at it I am going through this
patchset too. I have some nitpicks, please see the inline comments below.
>
> +/*
> + * Use the second highest bit of inuse_pages counter as the indicator
> + * of if one swap device is on the available plist, so the atomic can
~~ redundant?
> + * still be updated arithmetic while having special data embedded.
~~~~~~~~~~ typo, arithmetically?
> + *
> + * inuse_pages counter is the only thing indicating if a device should
> + * be on avail_lists or not (except swapon / swapoff). By embedding the
> + * on-list bit in the atomic counter, updates no longer need any lock
~~~ off-list?
> + * to check the list status.
> + *
> + * This bit will be set if the device is not on the plist and not
> + * usable, will be cleared if the device is on the plist.
> + */
> +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
> +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
> +static long swap_usage_in_pages(struct swap_info_struct *si)
> +{
> + return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
> +}
> +
> /* Reclaim the swap entry anyway if possible */
> #define TTRS_ANYWAY 0x1
> /*
> @@ -717,7 +737,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
> int nr_reclaim;
>
> if (force)
> - to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
> + to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
>
> while (!list_empty(&si->full_clusters)) {
> ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
> @@ -872,42 +892,128 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> return found;
> }
>
> -static void __del_from_avail_list(struct swap_info_struct *si)
> +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
This seems to say the opposite: the off-list bit is set in this
function, not cleared.
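
As an aside, just to double check my own reading of the "set" side,
here is a minimal userspace stdatomic sketch (made-up names, not the
patch's code):

#include <stdatomic.h>
#include <stdbool.h>

#define OFFLIST_BIT     (1L << 30)

struct dev {
        long pages;             /* capacity, like si->pages */
        atomic_long inuse;      /* usage counter with OFFLIST_BIT embedded */
};

/*
 * Normal path: mark the device off-list only if it is exactly full and
 * the bit is not set yet.  Returns true when we won the race and the
 * caller should do the plist_del().
 */
static bool mark_offlist_if_full(struct dev *d)
{
        long expected = d->pages;

        return atomic_compare_exchange_strong(&d->inuse, &expected,
                                              expected | OFFLIST_BIT);
}

/* swapoff-like path: force the bit on unconditionally. */
static void mark_offlist_forced(struct dev *d)
{
        atomic_fetch_or(&d->inuse, OFFLIST_BIT);
}

So this helper only ever sets the bit, never clears it, which is why
the comment reads inverted to me.
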
> +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> {
> int nid;
> + unsigned long pages;
> +
> + spin_lock(&swap_avail_lock);
> +
> + if (swapoff) {
> + /*
> + * Forcefully remove it. Clear the SWP_WRITEOK flags for
> + * swapoff here so it's synchronized by both si->lock and
> + * swap_avail_lock, to ensure the result can be seen by
> + * add_to_avail_list.
> + */
> + lockdep_assert_held(&si->lock);
> + si->flags &= ~SWP_WRITEOK;
> + atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> + } else {
> + /*
> + * If not called by swapoff, take it off-list only if it's
> + * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
> + * si->inuse_pages == pages), any concurrent slot freeing,
> + * or device already removed from plist by someone else
> + * will make this return false.
> + */
> + pages = si->pages;
> + if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> + pages | SWAP_USAGE_OFFLIST_BIT))
> + goto skip;
> + }
>
> - assert_spin_locked(&si->lock);
> for_each_node(nid)
> plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
> +
> +skip:
> + spin_unlock(&swap_avail_lock);
> }
>
> -static void del_from_avail_list(struct swap_info_struct *si)
> +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
Ditto.
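
Continuing the userspace sketch from above, my reading of the clearing
flow below is roughly (again only illustrative, not the patch's code):

/*
 * Clear the bit first; if the value before the clear was already
 * exactly "pages" (completely full, bit already gone), try to put the
 * bit back and skip, otherwise the caller goes on to plist_add().
 */
static bool clear_offlist_and_add(struct dev *d)
{
        long pages = d->pages;
        long old;

        if (!(atomic_load(&d->inuse) & OFFLIST_BIT))
                return false;           /* already on the plist */

        old = atomic_fetch_and(&d->inuse, ~OFFLIST_BIT);

        if (old == pages &&
            atomic_compare_exchange_strong(&d->inuse, &pages,
                                           pages | OFFLIST_BIT))
                return false;           /* still full, stay off-list */

        return true;
}

So the clearing really happens in this helper, which is why I would
swap the two header comments.
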
> +static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> {
> + int nid;
> + long val;
> + unsigned long pages;
> +
> spin_lock(&swap_avail_lock);
> - __del_from_avail_list(si);
> +
> + /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
> + if (swapon) {
> + lockdep_assert_held(&si->lock);
> + si->flags |= SWP_WRITEOK;
> + } else {
> + if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
> + goto skip;
> + }
> +
> + if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
> + goto skip;
> +
> + val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> +
> + /*
> + * When device is full and device is on the plist, only one updater will
> + * see (inuse_pages == si->pages) and will call del_from_avail_list. If
> + * that updater happen to be here, just skip adding.
> + */
> + pages = si->pages;
> + if (val == pages) {
> + /* Just like the cmpxchg in del_from_avail_list */
> + if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> + pages | SWAP_USAGE_OFFLIST_BIT))
> + goto skip;
> + }
> +
> + for_each_node(nid)
> + plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> +
> +skip:
> spin_unlock(&swap_avail_lock);
> }
>
> -static void swap_range_alloc(struct swap_info_struct *si,
> - unsigned int nr_entries)
> +/*
> + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
Not sure if swap_inuse_add()/swap_inuse_sub() or
swap_inuse_cnt_add()/swap_inuse_cnt_sub() would be better names, since
"usage" could be confused with the usage count kept in
si->swap_map[offset]. Anyway, not a strong opinion.
> + * within each cluster, so the total contribution to the global counter should
> + * always be positive and cannot exceed the total number of usable slots.
> + */
> +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
> {
> - WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
> - if (si->inuse_pages == si->pages) {
> - del_from_avail_list(si);
> + long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
>
> - if (si->cluster_info && vm_swap_full())
> - schedule_work(&si->reclaim_work);
> + /*
> + * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
> + * remove it from the plist.
> + */
> + if (unlikely(val == si->pages)) {
> + del_from_avail_list(si, false);
> + return true;
> }
> +
> + return false;
> }
>
> -static void add_to_avail_list(struct swap_info_struct *si)
> +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
> {
> - int nid;
> + long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
>
> - spin_lock(&swap_avail_lock);
> - for_each_node(nid)
> - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> - spin_unlock(&swap_avail_lock);
> + /*
> + * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
> + * remove it from the plist.
> + */
> + if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
> + add_to_avail_list(si, false);
> +}
> +
> +static void swap_range_alloc(struct swap_info_struct *si,
> + unsigned int nr_entries)
> +{
> + if (swap_usage_add(si, nr_entries)) {
> + if (si->cluster_info && vm_swap_full())
We may not need to check si->cluster_info here since it always exists
now; see the simplified sketch below this hunk.
> + schedule_work(&si->reclaim_work);
> + }
> }
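
If so, something like the following (untested, just to illustrate the
suggestion) should be enough:

static void swap_range_alloc(struct swap_info_struct *si,
                             unsigned int nr_entries)
{
        /* Device just became full: kick reclaim if swap is getting tight. */
        if (swap_usage_add(si, nr_entries)) {
                if (vm_swap_full())
                        schedule_work(&si->reclaim_work);
        }
}
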
>
> static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> @@ -925,8 +1031,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> for (i = 0; i < nr_entries; i++)
> clear_bit(offset + i, si->zeromap);
>
> - if (si->inuse_pages == si->pages)
> - add_to_avail_list(si);
> if (si->flags & SWP_BLKDEV)
> swap_slot_free_notify =
> si->bdev->bd_disk->fops->swap_slot_free_notify;
> @@ -946,7 +1050,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> */
> smp_wmb();
> atomic_long_add(nr_entries, &nr_swap_pages);
> - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
> + swap_usage_sub(si, nr_entries);
> }
>
> static int cluster_alloc_swap(struct swap_info_struct *si,
> @@ -1036,19 +1140,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
> spin_unlock(&swap_avail_lock);
> spin_lock(&si->lock);
> - if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) {
> - spin_lock(&swap_avail_lock);
> - if (plist_node_empty(&si->avail_lists[node])) {
> - spin_unlock(&si->lock);
> - goto nextsi;
> - }
> - WARN(!(si->flags & SWP_WRITEOK),
> - "swap_info %d in list but !SWP_WRITEOK\n",
> - si->type);
> - __del_from_avail_list(si);
> - spin_unlock(&si->lock);
> - goto nextsi;
> - }
> n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
> n_goal, swp_entries, order);
> spin_unlock(&si->lock);
> @@ -1057,7 +1148,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> cond_resched();
>
> spin_lock(&swap_avail_lock);
> -nextsi:
> /*
> * if we got here, it's likely that si was almost full before,
> * and since scan_swap_map_slots() can drop the si->lock,
> @@ -1789,7 +1879,7 @@ unsigned int count_swap_pages(int type, int free)
> if (sis->flags & SWP_WRITEOK) {
> n = sis->pages;
> if (free)
> - n -= sis->inuse_pages;
> + n -= swap_usage_in_pages(sis);
> }
> spin_unlock(&sis->lock);
> }
> @@ -2124,7 +2214,7 @@ static int try_to_unuse(unsigned int type)
> swp_entry_t entry;
> unsigned int i;
>
> - if (!READ_ONCE(si->inuse_pages))
> + if (!swap_usage_in_pages(si))
> goto success;
>
> retry:
> @@ -2137,7 +2227,7 @@ static int try_to_unuse(unsigned int type)
>
> spin_lock(&mmlist_lock);
> p = &init_mm.mmlist;
> - while (READ_ONCE(si->inuse_pages) &&
> + while (swap_usage_in_pages(si) &&
> !signal_pending(current) &&
> (p = p->next) != &init_mm.mmlist) {
>
> @@ -2165,7 +2255,7 @@ static int try_to_unuse(unsigned int type)
> mmput(prev_mm);
>
> i = 0;
> - while (READ_ONCE(si->inuse_pages) &&
> + while (swap_usage_in_pages(si) &&
> !signal_pending(current) &&
> (i = find_next_to_unuse(si, i)) != 0) {
>
> @@ -2200,7 +2290,7 @@ static int try_to_unuse(unsigned int type)
> * folio_alloc_swap(), temporarily hiding that swap. It's easy
> * and robust (though cpu-intensive) just to keep retrying.
> */
> - if (READ_ONCE(si->inuse_pages)) {
> + if (swap_usage_in_pages(si)) {
> if (!signal_pending(current))
> goto retry;
> return -EINTR;
> @@ -2209,7 +2299,7 @@ static int try_to_unuse(unsigned int type)
> success:
> /*
> * Make sure that further cleanups after try_to_unuse() returns happen
> - * after swap_range_free() reduces si->inuse_pages to 0.
> + * after swap_range_free() reduces inuse_pages to 0.
Here, I personally think keeping the original "si->inuse_pages" in the
comment may be better.
> */
> smp_mb();
> return 0;
> @@ -2227,7 +2317,7 @@ static void drain_mmlist(void)
> unsigned int type;
>
> for (type = 0; type < nr_swapfiles; type++)
> - if (swap_info[type]->inuse_pages)
> + if (swap_usage_in_pages(swap_info[type]))
> return;
> spin_lock(&mmlist_lock);
> list_for_each_safe(p, next, &init_mm.mmlist)
> @@ -2406,7 +2496,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
>
> static void _enable_swap_info(struct swap_info_struct *si)
> {
> - si->flags |= SWP_WRITEOK;
> atomic_long_add(si->pages, &nr_swap_pages);
> total_swap_pages += si->pages;
>
> @@ -2423,9 +2512,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
> */
> plist_add(&si->list, &swap_active_head);
>
> - /* add to available list if swap device is not full */
> - if (si->inuse_pages < si->pages)
> - add_to_avail_list(si);
> + /* Add back to available list */
> + add_to_avail_list(si, true);
> }
>
> static void enable_swap_info(struct swap_info_struct *si, int prio,
> @@ -2523,7 +2611,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> goto out_dput;
> }
> spin_lock(&p->lock);
> - del_from_avail_list(p);
> + del_from_avail_list(p, true);
> if (p->prio < 0) {
> struct swap_info_struct *si = p;
> int nid;
> @@ -2541,7 +2629,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> plist_del(&p->list, &swap_active_head);
> atomic_long_sub(p->pages, &nr_swap_pages);
> total_swap_pages -= p->pages;
> - p->flags &= ~SWP_WRITEOK;
> spin_unlock(&p->lock);
> spin_unlock(&swap_lock);
>
> @@ -2721,7 +2808,7 @@ static int swap_show(struct seq_file *swap, void *v)
> }
>
> bytes = K(si->pages);
> - inuse = K(READ_ONCE(si->inuse_pages));
> + inuse = K(swap_usage_in_pages(si));
>
> file = si->swap_file;
> len = seq_file_path(swap, file, " \t\n\\");
> @@ -2838,6 +2925,7 @@ static struct swap_info_struct *alloc_swap_info(void)
> }
> spin_lock_init(&p->lock);
> spin_lock_init(&p->cont_lock);
> + atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
> init_completion(&p->comp);
>
> return p;
> @@ -3335,7 +3423,7 @@ void si_swapinfo(struct sysinfo *val)
> struct swap_info_struct *si = swap_info[type];
>
> if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
> - nr_to_be_unused += READ_ONCE(si->inuse_pages);
> + nr_to_be_unused += swap_usage_in_pages(si);
> }
> val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
> val->totalswap = total_swap_pages + nr_to_be_unused;
> --
> 2.47.1
>
>