lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAGsJ_4yWLpd262vOMb11qeQXhXeNFLOKpmEUat3kvJv4wBxi5w@mail.gmail.com>
Date: Tue, 10 Dec 2024 16:24:43 +0800
From: Barry Song <21cnbao@...il.com>
To: chenridong <chenridong@...wei.com>
Cc: Chen Ridong <chenridong@...weicloud.com>, akpm@...ux-foundation.org, mhocko@...e.com, 
	hannes@...xchg.org, yosryahmed@...gle.com, yuzhao@...gle.com, 
	david@...hat.com, willy@...radead.org, ryan.roberts@....com, 
	wangkefeng.wang@...wei.com, linux-mm@...ck.org, linux-kernel@...r.kernel.org, 
	wangweiyang2@...wei.com, xieym_ict@...mail.com
Subject: Re: [PATCH v4 1/1] mm: vmascan: retry folios written back while
 isolated for traditional LRU

On Tue, Dec 10, 2024 at 2:41 PM chenridong <chenridong@...wei.com> wrote:
>
>
>
> On 2024/12/10 12:54, Barry Song wrote:
> > On Mon, Dec 9, 2024 at 4:46 PM Chen Ridong <chenridong@...weicloud.com> wrote:
> >>
> >> From: Chen Ridong <chenridong@...wei.com>
> >>
> >> The commit 359a5e1416ca ("mm: multi-gen LRU: retry folios written back
> >> while isolated") only fixed the issue for mglru. However, this issue
> >> also exists in the traditional active/inactive LRU. This issue will be
> >> worse if THP is split, which makes the list longer and needs longer time
> >> to finish a batch of folios reclaim.
> >>
> >> This issue should be fixed in the same way for the traditional LRU.
> >> Therefore, the common logic was extracted to the 'find_folios_written_back'
> >> function firstly, which is then reused in the 'shrink_inactive_list'
> >> function. Finally, retry reclaiming those folios that may have missed the
> >> rotation for traditional LRU.
> >
> > let's drop the cover-letter and refine the changelog.
> >
> Will update.
>
> >>
> >> Signed-off-by: Chen Ridong <chenridong@...wei.com>
> >> ---
> >>  include/linux/mmzone.h |   3 +-
> >>  mm/vmscan.c            | 108 +++++++++++++++++++++++++++++------------
> >>  2 files changed, 77 insertions(+), 34 deletions(-)
> >>
> >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >> index b36124145a16..47c6e8c43dcd 100644
> >> --- a/include/linux/mmzone.h
> >> +++ b/include/linux/mmzone.h
> >> @@ -391,6 +391,7 @@ struct page_vma_mapped_walk;
> >>
> >>  #define LRU_GEN_MASK           ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
> >>  #define LRU_REFS_MASK          ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
> >> +#define LRU_REFS_FLAGS         (BIT(PG_referenced) | BIT(PG_workingset))
> >>
> >>  #ifdef CONFIG_LRU_GEN
> >>
> >> @@ -406,8 +407,6 @@ enum {
> >>         NR_LRU_GEN_CAPS
> >>  };
> >>
> >> -#define LRU_REFS_FLAGS         (BIT(PG_referenced) | BIT(PG_workingset))
> >> -
> >>  #define MIN_LRU_BATCH          BITS_PER_LONG
> >>  #define MAX_LRU_BATCH          (MIN_LRU_BATCH * 64)
> >>
> >> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> index 76378bc257e3..1f0d194f8b2f 100644
> >> --- a/mm/vmscan.c
> >> +++ b/mm/vmscan.c
> >> @@ -283,6 +283,48 @@ static void set_task_reclaim_state(struct task_struct *task,
> >>         task->reclaim_state = rs;
> >>  }
> >>
> >> +/**
> >> + * find_folios_written_back - Find and move the written back folios to a new list.
> >> + * @list: filios list
> >> + * @clean: the written back folios list
> >> + * @skip: whether skip to move the written back folios to clean list.
> >> + */
> >> +static inline void find_folios_written_back(struct list_head *list,
> >> +               struct list_head *clean, bool skip)
> >> +{
> >> +       struct folio *folio;
> >> +       struct folio *next;
> >> +
> >> +       list_for_each_entry_safe_reverse(folio, next, list, lru) {
> >> +               if (!folio_evictable(folio)) {
> >> +                       list_del(&folio->lru);
> >> +                       folio_putback_lru(folio);
> >> +                       continue;
> >> +               }
> >> +
> >> +               if (folio_test_reclaim(folio) &&
> >> +                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
> >> +                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> >> +                       if (lru_gen_enabled() && folio_test_workingset(folio))
> >> +                               folio_set_referenced(folio);
> >> +                       continue;
> >> +               }
> >> +
> >> +               if (skip || folio_test_active(folio) || folio_test_referenced(folio) ||
> >> +                   folio_mapped(folio) || folio_test_locked(folio) ||
> >> +                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
> >> +                       /* don't add rejected folios to the oldest generation */
> >> +                       if (lru_gen_enabled())
> >> +                               set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
> >> +                                             BIT(PG_active));
> >> +                       continue;
> >> +               }
> >> +
> >> +               /* retry folios that may have missed folio_rotate_reclaimable() */
> >> +               list_move(&folio->lru, clean);
> >> +       }
> >> +}
> >> +
> >>  /*
> >>   * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> >>   * scan_control->nr_reclaimed.
> >> @@ -1907,6 +1949,25 @@ static int current_may_throttle(void)
> >>         return !(current->flags & PF_LOCAL_THROTTLE);
> >>  }
> >>
> >> +static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
> >> +               struct reclaim_stat *curr)
> >> +{
> >> +       int i;
> >> +
> >> +       stat->nr_dirty += curr->nr_dirty;
> >> +       stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
> >> +       stat->nr_congested += curr->nr_congested;
> >> +       stat->nr_writeback += curr->nr_writeback;
> >> +       stat->nr_immediate += curr->nr_immediate;
> >> +       stat->nr_pageout += curr->nr_pageout;
> >> +       stat->nr_ref_keep += curr->nr_ref_keep;
> >> +       stat->nr_unmap_fail += curr->nr_unmap_fail;
> >> +       stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
> >> +       stat->nr_demoted += curr->nr_demoted;
> >> +       for (i = 0; i < ANON_AND_FILE; i++)
> >> +               stat->nr_activate[i] = curr->nr_activate[i];
> >> +}
> >
> > you had no this before, what's the purpose of this?
> >
>
> We may call shrink_folio_list twice, and the 'stat curr' will reset in
> the shrink_folio_list function. We should accumulate the stats as a
> whole, which will then be used to calculate the cost and return it to
> the caller.

Does mglru have the same issue? If so, we may need to send a patch to
fix mglru's stat accounting as well. By the way, the code is rather
messy—could it be implemented as shown below instead?

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1f0d194f8b2f..40d2ddde21f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,7 +1094,6 @@ static unsigned int shrink_folio_list(struct
list_head *folio_list,
  struct swap_iocb *plug = NULL;

  folio_batch_init(&free_folios);
- memset(stat, 0, sizeof(*stat));
  cond_resched();
  do_demote_pass = can_demote(pgdat->node_id, sc);

@@ -1949,25 +1948,6 @@ static int current_may_throttle(void)
  return !(current->flags & PF_LOCAL_THROTTLE);
 }

-static inline void acc_reclaimed_stat(struct reclaim_stat *stat,
- struct reclaim_stat *curr)
-{
- int i;
-
- stat->nr_dirty += curr->nr_dirty;
- stat->nr_unqueued_dirty += curr->nr_unqueued_dirty;
- stat->nr_congested += curr->nr_congested;
- stat->nr_writeback += curr->nr_writeback;
- stat->nr_immediate += curr->nr_immediate;
- stat->nr_pageout += curr->nr_pageout;
- stat->nr_ref_keep += curr->nr_ref_keep;
- stat->nr_unmap_fail += curr->nr_unmap_fail;
- stat->nr_lazyfree_fail += curr->nr_lazyfree_fail;
- stat->nr_demoted += curr->nr_demoted;
- for (i = 0; i < ANON_AND_FILE; i++)
- stat->nr_activate[i] = curr->nr_activate[i];
-}
-
 /*
  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
@@ -1981,7 +1961,7 @@ static unsigned long
shrink_inactive_list(unsigned long nr_to_scan,
  unsigned long nr_scanned;
  unsigned int nr_reclaimed = 0;
  unsigned long nr_taken;
- struct reclaim_stat stat, curr;
+ struct reclaim_stat stat;
  bool file = is_file_lru(lru);
  enum vm_event_item item;
  struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2022,9 +2002,8 @@ static unsigned long
shrink_inactive_list(unsigned long nr_to_scan,

  memset(&stat, 0, sizeof(stat));
 retry:
- nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
+ nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
  find_folios_written_back(&folio_list, &clean_list, skip_retry);
- acc_reclaimed_stat(&stat, &curr);

  spin_lock_irq(&lruvec->lru_lock);
  move_folios_to_lru(lruvec, &folio_list);

>
> Thanks,
> Ridong
>
> >> +
> >>  /*
> >>   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
> >>   * of reclaimed pages
> >> @@ -1916,14 +1977,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >>                 enum lru_list lru)
> >>  {
> >>         LIST_HEAD(folio_list);
> >> +       LIST_HEAD(clean_list);
> >>         unsigned long nr_scanned;
> >>         unsigned int nr_reclaimed = 0;
> >>         unsigned long nr_taken;
> >> -       struct reclaim_stat stat;
> >> +       struct reclaim_stat stat, curr;
> >>         bool file = is_file_lru(lru);
> >>         enum vm_event_item item;
> >>         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> >>         bool stalled = false;
> >> +       bool skip_retry = false;
> >>
> >>         while (unlikely(too_many_isolated(pgdat, file, sc))) {
> >>                 if (stalled)
> >> @@ -1957,10 +2020,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >>         if (nr_taken == 0)
> >>                 return 0;
> >>
> >> -       nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
> >> +       memset(&stat, 0, sizeof(stat));
> >> +retry:
> >> +       nr_reclaimed += shrink_folio_list(&folio_list, pgdat, sc, &curr, false);
> >> +       find_folios_written_back(&folio_list, &clean_list, skip_retry);
> >> +       acc_reclaimed_stat(&stat, &curr);
> >>
> >>         spin_lock_irq(&lruvec->lru_lock);
> >>         move_folios_to_lru(lruvec, &folio_list);
> >> +       if (!list_empty(&clean_list)) {
> >> +               list_splice_init(&clean_list, &folio_list);
> >> +               skip_retry = true;
> >> +               spin_unlock_irq(&lruvec->lru_lock);
> >> +               goto retry;

This is rather confusing. We're still jumping to retry even though
skip_retry=true is set. Can we find a clearer approach for this?

It was somewhat acceptable before we introduced the extracted
function find_folios_written_back(). However, it has become
harder to follow now that skip_retry is passed across functions.

I find renaming skip_retry to is_retry more intuitive. The logic
is that since we are already retrying, find_folios_written_back()
shouldn’t move folios to the clean list again. The intended semantics
are: we have retris, don’t retry again.


> >> +       }
> >>
> >>         __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
> >>                                         stat.nr_demoted);
> >> @@ -4567,8 +4640,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >>         int reclaimed;
> >>         LIST_HEAD(list);
> >>         LIST_HEAD(clean);
> >> -       struct folio *folio;
> >> -       struct folio *next;
> >>         enum vm_event_item item;
> >>         struct reclaim_stat stat;
> >>         struct lru_gen_mm_walk *walk;
> >> @@ -4597,34 +4668,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
> >>                         scanned, reclaimed, &stat, sc->priority,
> >>                         type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
> >>
> >> -       list_for_each_entry_safe_reverse(folio, next, &list, lru) {
> >> -               if (!folio_evictable(folio)) {
> >> -                       list_del(&folio->lru);
> >> -                       folio_putback_lru(folio);
> >> -                       continue;
> >> -               }
> >> -
> >> -               if (folio_test_reclaim(folio) &&
> >> -                   (folio_test_dirty(folio) || folio_test_writeback(folio))) {
> >> -                       /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
> >> -                       if (folio_test_workingset(folio))
> >> -                               folio_set_referenced(folio);
> >> -                       continue;
> >> -               }
> >> -
> >> -               if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
> >> -                   folio_mapped(folio) || folio_test_locked(folio) ||
> >> -                   folio_test_dirty(folio) || folio_test_writeback(folio)) {
> >> -                       /* don't add rejected folios to the oldest generation */
> >> -                       set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
> >> -                                     BIT(PG_active));
> >> -                       continue;
> >> -               }
> >> -
> >> -               /* retry folios that may have missed folio_rotate_reclaimable() */
> >> -               list_move(&folio->lru, &clean);
> >> -       }
> >> -
> >> +       find_folios_written_back(&list, &clean, skip_retry);
> >>         spin_lock_irq(&lruvec->lru_lock);
> >>
> >>         move_folios_to_lru(lruvec, &list);
> >> --
> >> 2.34.1
> >>
> >

Thanks
 Barry

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ