Message-ID: <21100102-51b6-79d5-03db-1bb7f97fa94c@google.com>
Date: Sun, 13 Jul 2025 12:57:18 -0700 (PDT)
From: Hugh Dickins <hughd@...gle.com>
To: Roman Gushchin <roman.gushchin@...ux.dev>
cc: Andrew Morton <akpm@...ux-foundation.org>, 
    Johannes Weiner <hannes@...xchg.org>, 
    Shakeel Butt <shakeel.butt@...ux.dev>, 
    Lorenzo Stoakes <lorenzo.stoakes@...cle.com>, 
    Michal Hocko <mhocko@...nel.org>, David Hildenbrand <david@...hat.com>, 
    linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] mm: skip lru_note_cost() when scanning only file or
 anon

On Fri, 11 Jul 2025, Roman Gushchin wrote:

> Sorry, sent a wrong version with a trivial bug. Below is the correct
> one. The only difference is s/&sc/sc when calling scan_balance_biased().
> 
> --
> 
> From c06530edfb8a11139f2d7878ce3956b9238cc702 Mon Sep 17 00:00:00 2001
> From: Roman Gushchin <roman.gushchin@...ux.dev>
> Subject: [PATCH] mm: skip lru_note_cost() when scanning only file or anon
> 
> lru_note_cost() records the relative cost of incurring I/O and of the
> CPU spent on LRU rotations, which is used to balance the reclaim
> pressure between file and anon memory. The applied pressure is
> inversely proportional to the recorded cost of reclaiming, but only
> within 2/3 of the range (swappiness aside).
> 
> This is useful when both anon and file memory are reclaimable, but in
> many cases that is not so: e.g. there might be no swap, proactive
> reclaim can target anon memory specifically, the memory pressure can
> come from cgroup v1's memsw limit, etc. In all these cases recording
> the cost will only bias all subsequent reclaim, potentially even
> outside the scope of the original memcg.
> 
> So it's better not to record the cost if it comes from reclaim that
> was biased to begin with.
> 
> lru_note_cost() is a relatively expensive function: it traverses the
> memcg tree up to the root and takes the lruvec lock on each level.
> Overall it's responsible for about 50% of the cycles spent on the
> lruvec lock, which can be a non-trivial amount under heavy memory
> pressure. So optimizing out a large number of lru_note_cost() calls
> is also beneficial from a performance perspective.
> 
> Signed-off-by: Roman Gushchin <roman.gushchin@...ux.dev>
> ---
>  mm/vmscan.c | 34 +++++++++++++++++++++++++---------
>  1 file changed, 25 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c86a2495138a..7d08606b08ea 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -71,6 +71,13 @@
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/vmscan.h>
>  
> +enum scan_balance {
> +	SCAN_EQUAL,
> +	SCAN_FRACT,
> +	SCAN_ANON,
> +	SCAN_FILE,
> +};
> +
>  struct scan_control {
>  	/* How many pages shrink_list() should reclaim */
>  	unsigned long nr_to_reclaim;
> @@ -90,6 +97,7 @@ struct scan_control {
>  	/*
>  	 * Scan pressure balancing between anon and file LRUs
>  	 */
> +	enum scan_balance scan_balance;
>  	unsigned long	anon_cost;
>  	unsigned long	file_cost;
>  
> @@ -1988,6 +1996,17 @@ static int current_may_throttle(void)
>  	return !(current->flags & PF_LOCAL_THROTTLE);
>  }
>  
> +static bool scan_balance_biased(struct scan_control *sc)
> +{
> +	switch (sc->scan_balance) {
> +	case SCAN_EQUAL:
> +	case SCAN_FRACT:
> +		return false;
> +	default:
> +		return true;
> +	}
> +}
> +
>  /*
>   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
>   * of reclaimed pages
> @@ -2054,7 +2073,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>  	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
>  	spin_unlock_irq(&lruvec->lru_lock);
>  
> -	lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
> +	if (!scan_balance_biased(sc))
> +		lru_note_cost(lruvec, file, stat.nr_pageout,
> +			      nr_scanned - nr_reclaimed);
>  
>  	/*
>  	 * If dirty folios are scanned that are not queued for IO, it
> @@ -2202,7 +2223,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
>  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
>  	spin_unlock_irq(&lruvec->lru_lock);
>  
> -	if (nr_rotated)
> +	if (nr_rotated && !scan_balance_biased(sc))
>  		lru_note_cost(lruvec, file, 0, nr_rotated);
>  	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
>  			nr_deactivate, nr_rotated, sc->priority, file);
> @@ -2327,13 +2348,6 @@ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
>  	return inactive * inactive_ratio < active;
>  }
>  
> -enum scan_balance {
> -	SCAN_EQUAL,
> -	SCAN_FRACT,
> -	SCAN_ANON,
> -	SCAN_FILE,
> -};
> -
>  static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
>  {
>  	unsigned long file;
> @@ -2613,6 +2627,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>  	calculate_pressure_balance(sc, swappiness, fraction, &denominator);
>  
>  out:
> +	sc->scan_balance = scan_balance;
> +
>  	for_each_evictable_lru(lru) {
>  		bool file = is_file_lru(lru);
>  		unsigned long lruvec_size;
> -- 
> 2.50.0
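
For readers less familiar with this machinery, below is a rough,
standalone userspace sketch of the balance the commit message above
describes: scan pressure inversely proportional to each side's recorded
cost, with a guaranteed baseline share so the cost only sways the split
within part of the range. It only illustrates the shape of the math,
not the kernel's calculate_pressure_balance(); the MAX_SWAPPINESS value
and the exact arithmetic here are assumptions.

#include <stdio.h>

#define MAX_SWAPPINESS 200	/* assumed ceiling, for illustration */

static void pressure_split(unsigned long anon_cost, unsigned long file_cost,
			   unsigned int swappiness,
			   unsigned long *ap, unsigned long *fp)
{
	/*
	 * Folding the total into each side's cost keeps either share from
	 * collapsing to zero, however lopsided the recorded costs are.
	 */
	unsigned long total = anon_cost + file_cost;
	unsigned long a = total + anon_cost;
	unsigned long f = total + file_cost;

	total = a + f;
	/* Higher recorded cost => smaller share of the scan pressure. */
	*ap = swappiness * (total + 1) / (a + 1);
	*fp = (MAX_SWAPPINESS - swappiness) * (total + 1) / (f + 1);
}

int main(void)
{
	unsigned long ap, fp;

	/* History says anon reclaim has been expensive, file cheap. */
	pressure_split(900, 100, 60, &ap, &fp);
	printf("anon:file scan weight = %lu:%lu\n", ap, fp);
	return 0;
}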

Roman, I'm expressing no opinion on your patch above, but please may I
throw the patch below (against 6.16-rc) over the wall to you, to add as a
1/2 or 2/2 to yours (as it stands, it does conflict slightly with yours).

My attention needs to be on other things; but five years ago in
https://lore.kernel.org/linux-mm/alpine.LSU.2.11.2009211440570.5214@eggly.anvils/
I noted how lru_note_cost() became more costly with per-memcg lru_lock,
but did nothing about it at the time.  Apparently now is the time.

Thanks,
Hugh

[PATCH] mm: lru_note_cost_unlock_irq()

Dropping a lock, just to demand it again for an afterthought, cannot be
good if contended: convert lru_note_cost() to lru_note_cost_unlock_irq().

Signed-off-by: Hugh Dickins <hughd@...gle.com>
---
 include/linux/swap.h |  5 +++--
 mm/swap.c            | 26 +++++++++++++++++++-------
 mm/vmscan.c          |  8 +++-----
 3 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index bc0e1c275fc0..a64a87cda960 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -376,8 +376,9 @@ extern unsigned long totalreserve_pages;
 
 
 /* linux/mm/swap.c */
-void lru_note_cost(struct lruvec *lruvec, bool file,
-		   unsigned int nr_io, unsigned int nr_rotated);
+void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
+		unsigned int nr_io, unsigned int nr_rotated)
+		__releases(lruvec->lru_lock);
 void lru_note_cost_refault(struct folio *);
 void folio_add_lru(struct folio *);
 void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
diff --git a/mm/swap.c b/mm/swap.c
index 4fc322f7111a..37053f222a6e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -237,8 +237,9 @@ void folio_rotate_reclaimable(struct folio *folio)
 	folio_batch_add_and_move(folio, lru_move_tail, true);
 }
 
-void lru_note_cost(struct lruvec *lruvec, bool file,
-		   unsigned int nr_io, unsigned int nr_rotated)
+void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
+		unsigned int nr_io, unsigned int nr_rotated)
+		__releases(lruvec->lru_lock)
 {
 	unsigned long cost;
 
@@ -250,8 +251,12 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
 	 * different between them, adjust scan balance for CPU work.
 	 */
 	cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+	if (!cost) {
+		spin_unlock_irq(&lruvec->lru_lock);
+		return;
+	}
 
-	do {
+	for (;;) {
 		unsigned long lrusize;
 
 		/*
@@ -261,7 +266,6 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
 		 *    rcu lock, so would be safe even if the page was on the LRU
 		 *    and could move simultaneously to a new lruvec).
 		 */
-		spin_lock_irq(&lruvec->lru_lock);
 		/* Record cost event */
 		if (file)
 			lruvec->file_cost += cost;
@@ -285,14 +289,22 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
 			lruvec->file_cost /= 2;
 			lruvec->anon_cost /= 2;
 		}
+
 		spin_unlock_irq(&lruvec->lru_lock);
-	} while ((lruvec = parent_lruvec(lruvec)));
+		lruvec = parent_lruvec(lruvec);
+		if (!lruvec)
+			break;
+		spin_lock_irq(&lruvec->lru_lock);
+	}
 }
 
 void lru_note_cost_refault(struct folio *folio)
 {
-	lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
-		      folio_nr_pages(folio), 0);
+	struct lruvec *lruvec;
+
+	lruvec = folio_lruvec_lock_irq(folio);
+	lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio),
+				folio_nr_pages(folio), 0);
 }
 
 static void lru_activate(struct lruvec *lruvec, struct folio *folio)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8dfd2864bbf..5ba49f884bc0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2059,9 +2059,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 		__count_vm_events(item, nr_reclaimed);
 	count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
 	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
-	spin_unlock_irq(&lruvec->lru_lock);
 
-	lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+	lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
+					nr_scanned - nr_reclaimed);
 
 	/*
 	 * If dirty folios are scanned that are not queued for IO, it
@@ -2207,10 +2207,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-	spin_unlock_irq(&lruvec->lru_lock);
 
-	if (nr_rotated)
-		lru_note_cost(lruvec, file, 0, nr_rotated);
+	lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
 	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
-- 
2.43.0
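
The shape of the conversion: the caller enters lru_note_cost_unlock_irq()
with lruvec->lru_lock already held, and the function consumes it, walking
up the memcg hierarchy while holding only one level's lock at a time; the
__releases() annotation tells sparse about the imbalance. Below is a
minimal standalone sketch of that pattern, with a pthread mutex standing
in for the lruvec lock (illustrative only, not kernel code):

#include <pthread.h>
#include <stdio.h>

struct level {
	pthread_mutex_t lock;
	unsigned long cost;
	struct level *parent;
};

static struct level root  = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };
static struct level child = { PTHREAD_MUTEX_INITIALIZER, 0, &root };

/* Caller holds lv->lock; it (and each ancestor's) is dropped in here. */
static void note_cost_unlock(struct level *lv, unsigned long delta)
{
	for (;;) {
		lv->cost += delta;	/* update under this level's lock */
		pthread_mutex_unlock(&lv->lock);
		lv = lv->parent;
		if (!lv)
			break;
		pthread_mutex_lock(&lv->lock);
	}
}

int main(void)
{
	pthread_mutex_lock(&child.lock);
	/* ...stats updated under the lock, as shrink_inactive_list() does... */
	note_cost_unlock(&child, 5);	/* no drop-and-retake around this */
	printf("child=%lu root=%lu\n", child.cost, root.cost);
	return 0;
}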
