Message-ID: <21100102-51b6-79d5-03db-1bb7f97fa94c@google.com>
Date: Sun, 13 Jul 2025 12:57:18 -0700 (PDT)
From: Hugh Dickins <hughd@...gle.com>
To: Roman Gushchin <roman.gushchin@...ux.dev>
cc: Andrew Morton <akpm@...ux-foundation.org>,
Johannes Weiner <hannes@...xchg.org>,
Shakeel Butt <shakeel.butt@...ux.dev>,
Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
Michal Hocko <mhocko@...nel.org>, David Hildenbrand <david@...hat.com>,
linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] mm: skip lru_note_cost() when scanning only file or anon

On Fri, 11 Jul 2025, Roman Gushchin wrote:
> Sorry, sent a wrong version with a trivial bug. Below is the correct
> one. The only difference is s/&sc/sc when calling scan_balance_biased().
>
> --
>
> From c06530edfb8a11139f2d7878ce3956b9238cc702 Mon Sep 17 00:00:00 2001
> From: Roman Gushchin <roman.gushchin@...ux.dev>
> Subject: [PATCH] mm: skip lru_note_cost() when scanning only file or anon
>
> lru_note_cost() records the relative cost of the IO incurred and the
> CPU spent on LRU rotations, which is used to balance the pressure
> between file and anon memory. The applied pressure is inversely
> proportional to the recorded cost of reclaiming, but only within 2/3
> of the range (swappiness aside).
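>
> To make that bound concrete, here is a rough sketch (not the exact
> kernel arithmetic) of how the recorded costs skew the anon/file split
> once swappiness is factored out:
>
>     total     = anon_cost + file_cost
>     anon_wt   = total + anon_cost        (= 2*anon_cost + file_cost)
>     file_wt   = total + file_cost        (= anon_cost + 2*file_cost)
>     anon_frac = file_wt / (anon_wt + file_wt)
>
> However lopsided the recorded costs become, anon_frac stays between
> 1/3 and 2/3: each list always receives at least a third of the
> pressure.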
>
> This is useful when both anon and file memory are reclaimable, but in
> many cases that is not so: e.g. there might be no swap, proactive
> reclaim can target anon memory specifically, the memory pressure can
> come from cgroup v1's memsw limit, etc. In all these cases recording
> the cost only biases all subsequent reclaim, potentially even outside
> the scope of the original memcg.
>
> So it's better not to record the cost when it comes from reclaim that
> was biased to begin with.
>
> lru_note_cost() is a relatively expensive function: it traverses the
> memcg tree up to the root and takes the lruvec lock at each level.
> Overall it is responsible for about 50% of the cycles spent on the
> lruvec lock, which can be significant under heavy memory pressure. So
> optimizing out a large number of lru_note_cost() calls is also
> beneficial from a performance perspective.
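>
> For reference, that walk currently looks roughly like this (a sketch
> of the existing loop, compressed for illustration), locking and
> unlocking every ancestor lruvec in turn:
>
>     do {
>         spin_lock_irq(&lruvec->lru_lock);
>         /* update lruvec->file_cost or lruvec->anon_cost */
>         spin_unlock_irq(&lruvec->lru_lock);
>     } while ((lruvec = parent_lruvec(lruvec)));
>
> so every call is a full lock/unlock walk up the memcg ancestry.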
>
> Signed-off-by: Roman Gushchin <roman.gushchin@...ux.dev>
> ---
> mm/vmscan.c | 34 +++++++++++++++++++++++++---------
> 1 file changed, 25 insertions(+), 9 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c86a2495138a..7d08606b08ea 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -71,6 +71,13 @@
> #define CREATE_TRACE_POINTS
> #include <trace/events/vmscan.h>
>
> +enum scan_balance {
> + SCAN_EQUAL,
> + SCAN_FRACT,
> + SCAN_ANON,
> + SCAN_FILE,
> +};
> +
> struct scan_control {
> /* How many pages shrink_list() should reclaim */
> unsigned long nr_to_reclaim;
> @@ -90,6 +97,7 @@ struct scan_control {
> /*
> * Scan pressure balancing between anon and file LRUs
> */
> + enum scan_balance scan_balance;
> unsigned long anon_cost;
> unsigned long file_cost;
>
> @@ -1988,6 +1996,17 @@ static int current_may_throttle(void)
> return !(current->flags & PF_LOCAL_THROTTLE);
> }
>
> +static bool scan_balance_biased(struct scan_control *sc)
> +{
> + switch (sc->scan_balance) {
> + case SCAN_EQUAL:
> + case SCAN_FRACT:
> + return false;
> + default:
> + return true;
> + }
> +}
> +
> /*
> * shrink_inactive_list() is a helper for shrink_node(). It returns the number
> * of reclaimed pages
> @@ -2054,7 +2073,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
> spin_unlock_irq(&lruvec->lru_lock);
>
> - lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
> + if (!scan_balance_biased(sc))
> + lru_note_cost(lruvec, file, stat.nr_pageout,
> + nr_scanned - nr_reclaimed);
>
> /*
> * If dirty folios are scanned that are not queued for IO, it
> @@ -2202,7 +2223,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
> __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
> spin_unlock_irq(&lruvec->lru_lock);
>
> - if (nr_rotated)
> + if (nr_rotated && !scan_balance_biased(sc))
> lru_note_cost(lruvec, file, 0, nr_rotated);
> trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
> nr_deactivate, nr_rotated, sc->priority, file);
> @@ -2327,13 +2348,6 @@ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
> return inactive * inactive_ratio < active;
> }
>
> -enum scan_balance {
> - SCAN_EQUAL,
> - SCAN_FRACT,
> - SCAN_ANON,
> - SCAN_FILE,
> -};
> -
> static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
> {
> unsigned long file;
> @@ -2613,6 +2627,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
> calculate_pressure_balance(sc, swappiness, fraction, &denominator);
>
> out:
> + sc->scan_balance = scan_balance;
> +
> for_each_evictable_lru(lru) {
> bool file = is_file_lru(lru);
> unsigned long lruvec_size;
> --
> 2.50.0

Roman, I'm expressing no opinion on your patch above, but please may I
throw the patch below (against 6.16-rc) over the wall to you, to add as a
1/2 or 2/2 to yours (as it stands, it does conflict slightly with yours).

My attention needs to be on other things; but five years ago in
https://lore.kernel.org/linux-mm/alpine.LSU.2.11.2009211440570.5214@eggly.anvils/
I noted how lru_note_cost() became more costly with per-memcg lru_lock,
but did nothing about it at the time. Apparently now is the time.

Thanks,
Hugh

[PATCH] mm: lru_note_cost_unlock_irq()

Dropping a lock, just to demand it again for an afterthought, cannot be
good if contended: convert lru_note_cost() to lru_note_cost_unlock_irq().
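
For callers, the conversion amounts to folding the unlock into the call
(a sketch based on the vmscan hunks below, using the parameter names
from the declaration):

	/* before: drop the lock, then lru_note_cost() re-takes it */
	spin_unlock_irq(&lruvec->lru_lock);
	lru_note_cost(lruvec, file, nr_io, nr_rotated);

	/* after: lru_note_cost_unlock_irq() drops the lock itself */
	lru_note_cost_unlock_irq(lruvec, file, nr_io, nr_rotated);
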
Signed-off-by: Hugh Dickins <hughd@...gle.com>
---
include/linux/swap.h | 5 +++--
mm/swap.c | 26 +++++++++++++++++++-------
mm/vmscan.c | 8 +++-----
3 files changed, 25 insertions(+), 14 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bc0e1c275fc0..a64a87cda960 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -376,8 +376,9 @@ extern unsigned long totalreserve_pages;
/* linux/mm/swap.c */
-void lru_note_cost(struct lruvec *lruvec, bool file,
- unsigned int nr_io, unsigned int nr_rotated);
+void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
+ __releases(lruvec->lru_lock);
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
diff --git a/mm/swap.c b/mm/swap.c
index 4fc322f7111a..37053f222a6e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -237,8 +237,9 @@ void folio_rotate_reclaimable(struct folio *folio)
folio_batch_add_and_move(folio, lru_move_tail, true);
}
-void lru_note_cost(struct lruvec *lruvec, bool file,
- unsigned int nr_io, unsigned int nr_rotated)
+void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
+ __releases(lruvec->lru_lock)
{
unsigned long cost;
@@ -250,8 +251,12 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
* different between them, adjust scan balance for CPU work.
*/
cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+ if (!cost) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ return;
+ }
- do {
+ for (;;) {
unsigned long lrusize;
/*
@@ -261,7 +266,6 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
* rcu lock, so would be safe even if the page was on the LRU
* and could move simultaneously to a new lruvec).
*/
- spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += cost;
@@ -285,14 +289,22 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
+
spin_unlock_irq(&lruvec->lru_lock);
- } while ((lruvec = parent_lruvec(lruvec)));
+ lruvec = parent_lruvec(lruvec);
+ if (!lruvec)
+ break;
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}
void lru_note_cost_refault(struct folio *folio)
{
- lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
- folio_nr_pages(folio), 0);
+ struct lruvec *lruvec;
+
+ lruvec = folio_lruvec_lock_irq(folio);
+ lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio),
+ folio_nr_pages(folio), 0);
}
static void lru_activate(struct lruvec *lruvec, struct folio *folio)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8dfd2864bbf..5ba49f884bc0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2059,9 +2059,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
__count_vm_events(item, nr_reclaimed);
count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+ lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
+ nr_scanned - nr_reclaimed);
/*
* If dirty folios are scanned that are not queued for IO, it
@@ -2207,10 +2207,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&lruvec->lru_lock);
- if (nr_rotated)
- lru_note_cost(lruvec, file, 0, nr_rotated);
+ lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
--
2.43.0