Message-ID: <CAMgjq7CXR95es5hamNeTPg5zrh2njJAV9PHP9y_dOij8NiJCVA@mail.gmail.com>
Date: Fri, 3 Jan 2025 16:07:07 +0800
From: Kairui Song <ryncsn@...il.com>
To: Baoquan He <bhe@...hat.com>
Cc: linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>,
Chris Li <chrisl@...nel.org>, Barry Song <v-songbaohua@...o.com>,
Ryan Roberts <ryan.roberts@....com>, Hugh Dickins <hughd@...gle.com>,
Yosry Ahmed <yosryahmed@...gle.com>, "Huang, Ying" <ying.huang@...ux.alibaba.com>,
Nhat Pham <nphamcs@...il.com>, Johannes Weiner <hannes@...xchg.org>,
Kalesh Singh <kaleshsingh@...gle.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3 06/13] mm, swap: clean up plist removal and adding
On Thu, Jan 2, 2025 at 4:59 PM Baoquan He <bhe@...hat.com> wrote:
>
> Hi Kairui,
>
> On 12/31/24 at 01:46am, Kairui Song wrote:
> ......snip...
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 7963a0c646a4..e6e58cfb5178 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -128,6 +128,26 @@ static inline unsigned char swap_count(unsigned char ent)
> > return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
> > }
>
> I am reading the swap code, and while at it I am going through this
> patchset too. I have some nitpicks, please see the inline comments below.
Thanks!
> >
> > +/*
> > + * Use the second highest bit of inuse_pages counter as the indicator
> > + * of if one swap device is on the available plist, so the atomic can
> ~~ redundant?
> > + * still be updated arithmetic while having special data embedded.
> ~~~~~~~~~~ typo, arithmetically?
> > + *
> > + * inuse_pages counter is the only thing indicating if a device should
> > + * be on avail_lists or not (except swapon / swapoff). By embedding the
> > + * on-list bit in the atomic counter, updates no longer need any lock
> ~~~ off-list?
Ah, right, some typos, will fix these.
> > + * to check the list status.
> > + *
> > + * This bit will be set if the device is not on the plist and not
> > + * usable, will be cleared if the device is on the plist.
> > + */
> > +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
> > +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
> > +static long swap_usage_in_pages(struct swap_info_struct *si)
> > +{
> > + return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
> > +}
> > +
> > /* Reclaim the swap entry anyway if possible */
> > #define TTRS_ANYWAY 0x1
> > /*
> > @@ -717,7 +737,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
> > int nr_reclaim;
> >
> > if (force)
> > - to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
> > + to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
> >
> > while (!list_empty(&si->full_clusters)) {
> > ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
> > @@ -872,42 +892,128 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> > return found;
> > }
> >
> > -static void __del_from_avail_list(struct swap_info_struct *si)
> > +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
> Seems it just says the opposite. The off-list bit is set in
> this function.
Right, the two comments are swapped... will fix them.
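To be clear, only the two comments need to be swapped, the code itself
is correct. The fix should look roughly like this (draft, untested):

/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)

/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)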
> > +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> > {
> > int nid;
> > + unsigned long pages;
> > +
> > + spin_lock(&swap_avail_lock);
> > +
> > + if (swapoff) {
> > + /*
> > + * Forcefully remove it. Clear the SWP_WRITEOK flags for
> > + * swapoff here so it's synchronized by both si->lock and
> > + * swap_avail_lock, to ensure the result can be seen by
> > + * add_to_avail_list.
> > + */
> > + lockdep_assert_held(&si->lock);
> > + si->flags &= ~SWP_WRITEOK;
> > + atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> > + } else {
> > + /*
> > + * If not called by swapoff, take it off-list only if it's
> > + * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
> > + * si->inuse_pages == pages), any concurrent slot freeing,
> > + * or device already removed from plist by someone else
> > + * will make this return false.
> > + */
> > + pages = si->pages;
> > + if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> > + pages | SWAP_USAGE_OFFLIST_BIT))
> > + goto skip;
> > + }
> >
> > - assert_spin_locked(&si->lock);
> > for_each_node(nid)
> > plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
> > +
> > +skip:
> > + spin_unlock(&swap_avail_lock);
> > }
> >
> > -static void del_from_avail_list(struct swap_info_struct *si)
> > +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
>
> Ditto.
>
> > +static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> > {
> > + int nid;
> > + long val;
> > + unsigned long pages;
> > +
> > spin_lock(&swap_avail_lock);
> > - __del_from_avail_list(si);
> > +
> > + /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
> > + if (swapon) {
> > + lockdep_assert_held(&si->lock);
> > + si->flags |= SWP_WRITEOK;
> > + } else {
> > + if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
> > + goto skip;
> > + }
> > +
> > + if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
> > + goto skip;
> > +
> > + val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
> > +
> > + /*
> > + * When device is full and device is on the plist, only one updater will
> > + * see (inuse_pages == si->pages) and will call del_from_avail_list. If
> > + * that updater happen to be here, just skip adding.
> > + */
> > + pages = si->pages;
> > + if (val == pages) {
> > + /* Just like the cmpxchg in del_from_avail_list */
> > + if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
> > + pages | SWAP_USAGE_OFFLIST_BIT))
> > + goto skip;
> > + }
> > +
> > + for_each_node(nid)
> > + plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> > +
> > +skip:
> > spin_unlock(&swap_avail_lock);
> > }
> >
> > -static void swap_range_alloc(struct swap_info_struct *si,
> > - unsigned int nr_entries)
> > +/*
> > + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
>
> Not sure whether swap_inuse_add()/swap_inuse_sub() or swap_inuse_cnt_add/sub()
> would be better names, since "usage" can be mixed up with the usage counts
> in si->swap_map[offset]. Anyway, no strong opinion.
>
> > + * within each cluster, so the total contribution to the global counter should
> > + * always be positive and cannot exceed the total number of usable slots.
> > + */
> > +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
> > {
> > - WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
> > - if (si->inuse_pages == si->pages) {
> > - del_from_avail_list(si);
> > + long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
> >
> > - if (si->cluster_info && vm_swap_full())
> > - schedule_work(&si->reclaim_work);
> > + /*
> > + * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
> > + * remove it from the plist.
> > + */
> > + if (unlikely(val == si->pages)) {
> > + del_from_avail_list(si, false);
> > + return true;
> > }
> > +
> > + return false;
> > }
> >
> > -static void add_to_avail_list(struct swap_info_struct *si)
> > +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
> > {
> > - int nid;
> > + long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
> >
> > - spin_lock(&swap_avail_lock);
> > - for_each_node(nid)
> > - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> > - spin_unlock(&swap_avail_lock);
> > + /*
> > + * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
> > + * remove it from the plist.
> > + */
> > + if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
> > + add_to_avail_list(si, false);
> > +}
> > +
> > +static void swap_range_alloc(struct swap_info_struct *si,
> > + unsigned int nr_entries)
> > +{
> > + if (swap_usage_add(si, nr_entries)) {
> > + if (si->cluster_info && vm_swap_full())
>
> We may not need to check si->cluster_info here since it always exists now.
Good catch, it can indeed be dropped as an optimization. An earlier
patch in this series was supposed to drop all of these checks; I think
I missed this one.
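With that check gone, swap_range_alloc() here could be simplified to
something like this (untested sketch on top of this patch):

static void swap_range_alloc(struct swap_info_struct *si,
			     unsigned int nr_entries)
{
	/* Take the device off the plist if swap_usage_add() says it is full */
	if (swap_usage_add(si, nr_entries)) {
		if (vm_swap_full())
			schedule_work(&si->reclaim_work);
	}
}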
>
> > + schedule_work(&si->reclaim_work);
> > + }
> > }
> >
> > static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> > @@ -925,8 +1031,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> > for (i = 0; i < nr_entries; i++)
> > clear_bit(offset + i, si->zeromap);
> >
> > - if (si->inuse_pages == si->pages)
> > - add_to_avail_list(si);
> > if (si->flags & SWP_BLKDEV)
> > swap_slot_free_notify =
> > si->bdev->bd_disk->fops->swap_slot_free_notify;
> > @@ -946,7 +1050,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
> > */
> > smp_wmb();
> > atomic_long_add(nr_entries, &nr_swap_pages);
> > - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
> > + swap_usage_sub(si, nr_entries);
> > }
> >
> > static int cluster_alloc_swap(struct swap_info_struct *si,
> > @@ -1036,19 +1140,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> > plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
> > spin_unlock(&swap_avail_lock);
> > spin_lock(&si->lock);
> > - if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) {
> > - spin_lock(&swap_avail_lock);
> > - if (plist_node_empty(&si->avail_lists[node])) {
> > - spin_unlock(&si->lock);
> > - goto nextsi;
> > - }
> > - WARN(!(si->flags & SWP_WRITEOK),
> > - "swap_info %d in list but !SWP_WRITEOK\n",
> > - si->type);
> > - __del_from_avail_list(si);
> > - spin_unlock(&si->lock);
> > - goto nextsi;
> > - }
> > n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
> > n_goal, swp_entries, order);
> > spin_unlock(&si->lock);
> > @@ -1057,7 +1148,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
> > cond_resched();
> >
> > spin_lock(&swap_avail_lock);
> > -nextsi:
> > /*
> > * if we got here, it's likely that si was almost full before,
> > * and since scan_swap_map_slots() can drop the si->lock,
> > @@ -1789,7 +1879,7 @@ unsigned int count_swap_pages(int type, int free)
> > if (sis->flags & SWP_WRITEOK) {
> > n = sis->pages;
> > if (free)
> > - n -= sis->inuse_pages;
> > + n -= swap_usage_in_pages(sis);
> > }
> > spin_unlock(&sis->lock);
> > }
> > @@ -2124,7 +2214,7 @@ static int try_to_unuse(unsigned int type)
> > swp_entry_t entry;
> > unsigned int i;
> >
> > - if (!READ_ONCE(si->inuse_pages))
> > + if (!swap_usage_in_pages(si))
> > goto success;
> >
> > retry:
> > @@ -2137,7 +2227,7 @@ static int try_to_unuse(unsigned int type)
> >
> > spin_lock(&mmlist_lock);
> > p = &init_mm.mmlist;
> > - while (READ_ONCE(si->inuse_pages) &&
> > + while (swap_usage_in_pages(si) &&
> > !signal_pending(current) &&
> > (p = p->next) != &init_mm.mmlist) {
> >
> > @@ -2165,7 +2255,7 @@ static int try_to_unuse(unsigned int type)
> > mmput(prev_mm);
> >
> > i = 0;
> > - while (READ_ONCE(si->inuse_pages) &&
> > + while (swap_usage_in_pages(si) &&
> > !signal_pending(current) &&
> > (i = find_next_to_unuse(si, i)) != 0) {
> >
> > @@ -2200,7 +2290,7 @@ static int try_to_unuse(unsigned int type)
> > * folio_alloc_swap(), temporarily hiding that swap. It's easy
> > * and robust (though cpu-intensive) just to keep retrying.
> > */
> > - if (READ_ONCE(si->inuse_pages)) {
> > + if (swap_usage_in_pages(si)) {
> > if (!signal_pending(current))
> > goto retry;
> > return -EINTR;
> > @@ -2209,7 +2299,7 @@ static int try_to_unuse(unsigned int type)
> > success:
> > /*
> > * Make sure that further cleanups after try_to_unuse() returns happen
> > - * after swap_range_free() reduces si->inuse_pages to 0.
> > + * after swap_range_free() reduces inuse_pages to 0.
>
> Here, I personally think the original si->inuse_pages may be better.
I updated this comment to keep people from mis-using si->inuse_pages
directly, but it's a trivial comment change anyway, so I can keep it
unchanged.