Message-ID: <20251127233635.4170047-4-krisman@suse.de>
Date: Thu, 27 Nov 2025 18:36:30 -0500
From: Gabriel Krisman Bertazi <krisman@...e.de>
To: linux-mm@...ck.org
Cc: Gabriel Krisman Bertazi <krisman@...e.de>,
	linux-kernel@...r.kernel.org,
	jack@...e.cz,
	Mateusz Guzik <mjguzik@...il.com>,
	Shakeel Butt <shakeel.butt@...ux.dev>,
	Michal Hocko <mhocko@...nel.org>,
	Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
	Dennis Zhou <dennis@...nel.org>,
	Tejun Heo <tj@...nel.org>,
	Christoph Lameter <cl@...two.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	David Hildenbrand <david@...hat.com>,
	Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
	"Liam R. Howlett" <Liam.Howlett@...cle.com>,
	Vlastimil Babka <vbabka@...e.cz>,
	Mike Rapoport <rppt@...nel.org>,
	Suren Baghdasaryan <surenb@...gle.com>
Subject: [RFC PATCH 3/4] mm: Avoid percpu MM counters on single-threaded tasks

The cost of the pcpu memory allocation when forking a new task is
non-negligible, as reported on a few occasions, such as [1].

But it can also be avoided entirely for single-threaded applications,
where we know the vast majority of counter updates happen from the
local task context.
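
Conceptually, each counter starts out as a plain atomic and is only
upgraded to a real percpu counter once the mm becomes shared.  A rough
sketch of the idea (the actual lazy_percpu_counter implementation is
introduced earlier in this series; the types and names below are only
illustrative):

    #include <linux/atomic.h>
    #include <linux/percpu_counter.h>

    struct lazy_percpu_counter_sketch {
            atomic_long_t count;            /* used while single-threaded */
            struct percpu_counter pcpu;     /* only initialized on upgrade */
            bool upgraded;                  /* set once a second user appears */
    };

    static inline void sketch_counter_add(struct lazy_percpu_counter_sketch *c,
                                          long v)
    {
            if (!READ_ONCE(c->upgraded))
                    atomic_long_add(v, &c->count); /* no percpu allocation */
            else
                    percpu_counter_add(&c->pcpu, v);
    }

On top of this, the series distinguishes a fast path for updates coming
from the owning task (see the READ_ONCE(current->mm) == mm check in the
add_mm_counter hunk below) from a fallback for remote updaters.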

For a trivial benchmark (bound to CPU 0 to reduce the cost of
migrations) like the one below:

     for (( i = 0; i < 20000; i++ )); do /bin/true; done

on an 80c machine, this patchset yielded a 6% improvement in system
time.  On a 256c machine, system time was reduced by 11%.  Profiling
shows mm_init went from 13.5% of samples down to 3.33% on the same
256c machine:

Before:
-   13.50%     3.93%  benchmark.sh     [kernel.kallsyms] [k] mm_init
   - 9.57% mm_init
      + 4.80% pcpu_alloc_noprof
      + 3.87% __percpu_counter_init_many

After:
-    3.33%     0.80%  benchmark.sh  [kernel.kallsyms]  [k] mm_init
   - 2.53% mm_init
      + 2.05% pcpu_alloc_noprof

For kernbench on the 256c machine, the patchset yields a 1.4%
improvement in system time.  For gitsource, the improvement in system
time I'm measuring is around 3.12%.

The upgrade adds some overhead to the second fork, in particular an
atomic operation, on top of the expensive allocation that moved from
the first fork to the second.  So a fair question is the impact of this
patchset on multi-threaded applications.  I wrote a microbenchmark
similar to the /bin/true case above, except that it spawns a second
pthread and waits for it to finish; the second thread returns
immediately (a sketch of its source follows the loop below).  This is
executed in a loop, bound to a single NUMA node, with:

       for (( i = 0; i < 20000; i++ )); do /bin/parallel-true; done
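
The parallel-true source isn't part of this posting; a minimal sketch
of what it might look like (hypothetical, for illustration only):

    /* parallel-true.c: spawn one extra thread that exits immediately,
     * which is enough to trigger the CLONE_VM upgrade path in copy_mm(). */
    #include <pthread.h>
    #include <stddef.h>

    static void *worker(void *arg)
    {
            (void)arg;
            return NULL;            /* second thread returns immediately */
    }

    int main(void)
    {
            pthread_t t;

            if (pthread_create(&t, NULL, worker, NULL))
                    return 1;
            pthread_join(t, NULL);
            return 0;
    }

Built with something like "gcc -O2 -pthread -o parallel-true
parallel-true.c".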

Profiling shows the performance impact of the lazy upgrade is
minimal:

-    0.68%     0.04%  parallel-true  [kernel.kallsyms]  [k] __lazy_percpu_counter_upgrade_many
   - 0.64% __lazy_percpu_counter_upgrade_many
        0.62% pcpu_alloc_noprof

This is confirmed by the measured system time: with 20k runs, I'm
still seeing a slight improvement over baseline in the 2-thread case
(2-4%).

[1] https://lore.kernel.org/all/20230608111408.s2minsenlcjow7q3@quack3

Suggested-by: Jan Kara <jack@...e.cz>
Signed-off-by: Gabriel Krisman Bertazi <krisman@...e.de>
---
 include/linux/mm.h          | 24 ++++++++----------------
 include/linux/mm_types.h    |  4 ++--
 include/trace/events/kmem.h |  4 ++--
 kernel/fork.c               | 14 ++++++--------
 4 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d16b33bacc32..29de4c60ac6c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2679,36 +2679,28 @@ static inline bool get_user_page_fast_only(unsigned long addr,
  */
 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
-	return percpu_counter_read_positive(&mm->rss_stat[member]);
+	return lazy_percpu_counter_read_positive(&mm->rss_stat[member]);
 }
 
 static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
 {
-	return percpu_counter_sum_positive(&mm->rss_stat[member]);
+	return lazy_percpu_counter_sum_positive(&mm->rss_stat[member]);
 }
 
 void mm_trace_rss_stat(struct mm_struct *mm, int member);
 
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-	percpu_counter_add(&mm->rss_stat[member], value);
-
-	mm_trace_rss_stat(mm, member);
-}
-
-static inline void inc_mm_counter(struct mm_struct *mm, int member)
-{
-	percpu_counter_inc(&mm->rss_stat[member]);
+	if (READ_ONCE(current->mm) == mm)
+		lazy_percpu_counter_add_fast(&mm->rss_stat[member], value);
+	else
+		lazy_percpu_counter_add_atomic(&mm->rss_stat[member], value);
 
 	mm_trace_rss_stat(mm, member);
 }
 
-static inline void dec_mm_counter(struct mm_struct *mm, int member)
-{
-	percpu_counter_dec(&mm->rss_stat[member]);
-
-	mm_trace_rss_stat(mm, member);
-}
+#define inc_mm_counter(mm, member) add_mm_counter(mm, member, 1)
+#define dec_mm_counter(mm, member) add_mm_counter(mm, member, -1)
 
 /* Optimized variant when folio is already known not to be anon */
 static inline int mm_counter_file(struct folio *folio)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 90e5790c318f..5a8d677efa85 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -18,7 +18,7 @@
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
-#include <linux/percpu_counter.h>
+#include <linux/lazy_percpu_counter.h>
 #include <linux/types.h>
 #include <linux/bitmap.h>
 
@@ -1119,7 +1119,7 @@ struct mm_struct {
 		unsigned long saved_e_flags;
 #endif
 
-		struct percpu_counter rss_stat[NR_MM_COUNTERS];
+		struct lazy_percpu_counter rss_stat[NR_MM_COUNTERS];
 
 		struct linux_binfmt *binfmt;
 
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 7f93e754da5c..e21572f4d8a6 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -442,8 +442,8 @@ TRACE_EVENT(rss_stat,
 		__entry->mm_id = mm_ptr_to_hash(mm);
 		__entry->curr = !!(current->mm == mm);
 		__entry->member = member;
-		__entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
-							    << PAGE_SHIFT);
+		__entry->size = (lazy_percpu_counter_sum_positive(&mm->rss_stat[member])
+				 << PAGE_SHIFT);
 	),
 
 	TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..92698c60922e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -583,7 +583,7 @@ static void check_mm(struct mm_struct *mm)
 			 "Please make sure 'struct resident_page_types[]' is updated as well");
 
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		long x = percpu_counter_sum(&mm->rss_stat[i]);
+		long x = lazy_percpu_counter_sum_local(&mm->rss_stat[i]);
 
 		if (unlikely(x)) {
 			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
@@ -688,7 +688,7 @@ void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
-	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+	lazy_percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
 	free_mm(mm);
 }
@@ -1083,16 +1083,11 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm, p))
 		goto fail_cid;
 
-	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
-				     NR_MM_COUNTERS))
-		goto fail_pcpu;
-
+	lazy_percpu_counter_init_many(mm->rss_stat, 0, NR_MM_COUNTERS);
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;
 
-fail_pcpu:
-	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
 fail_nocontext:
@@ -1535,6 +1530,9 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
 		return 0;
 
 	if (clone_flags & CLONE_VM) {
+		if (lazy_percpu_counter_upgrade_many(oldmm->rss_stat,
+						     NR_MM_COUNTERS, GFP_KERNEL_ACCOUNT))
+			return -ENOMEM;
 		mmget(oldmm);
 		mm = oldmm;
 	} else {
-- 
2.51.0

