Message-Id: <20260203112401.3889029-10-zhouchuyi@bytedance.com>
Date: Tue,  3 Feb 2026 19:23:59 +0800
From: "Chuyi Zhou" <zhouchuyi@...edance.com>
To: <tglx@...utronix.de>, <mingo@...hat.com>, <luto@...nel.org>, 
	<peterz@...radead.org>, <paulmck@...nel.org>, <muchun.song@...ux.dev>, 
	<bp@...en8.de>, <dave.hansen@...ux.intel.com>
Cc: <linux-kernel@...r.kernel.org>, "Chuyi Zhou" <zhouchuyi@...edance.com>
Subject: [PATCH 09/11] x86/mm: Move flush_tlb_info back to the stack

Commit 3db6d5a5ecaf ("x86/mm/tlb: Remove 'struct flush_tlb_info' from the
stack") changed flush_tlb_info from stack variables to per-CPU variables.
This brought a performance improvement of around 3% in extreme tests.
However, it also requires that all flush_tlb* operations keep preemption
disabled for their entire duration, to prevent concurrent modification of
the per-CPU flush_tlb_info. flush_tlb* needs to send IPIs to remote CPUs
and synchronously wait for all of them to complete their local TLB
flushes; this can take tens of milliseconds when interrupts are disabled
or when many remote CPUs are involved.
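
To make the constraint concrete, below is a minimal userspace sketch (an
analogy only, not kernel code; the struct, function and thread names are
made up for illustration). Each "flusher" fills an on-stack descriptor and
then performs a long synchronous wait, standing in for "send IPIs and wait
for remote CPUs". Because each caller owns its descriptor, being
descheduled during the wait is harmless; a single shared per-CPU
descriptor would instead require the whole fill-and-wait sequence to stay
non-preemptible.

/*
 * flush_sketch.c: userspace analogy (NOT kernel code).
 * Build: gcc -Wall -pthread flush_sketch.c
 * All names (flush_desc, slow_flush, flusher) are invented for this sketch.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

struct flush_desc {
	unsigned long start;
	unsigned long end;
};

/* Stand-in for flush_tlb_multi(): a long, synchronous wait. */
static void slow_flush(const struct flush_desc *d, const char *who)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000 };

	nanosleep(&ts, NULL);
	printf("%s flushed [%#lx, %#lx)\n", who, d->start, d->end);
}

static void *flusher(void *arg)
{
	unsigned long base = (unsigned long)arg;
	/* On-stack descriptor: private to this caller for the whole wait. */
	struct flush_desc d = { .start = base, .end = base + 0x1000 };

	slow_flush(&d, base ? "flusher B" : "flusher A");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, flusher, (void *)0x0UL);
	pthread_create(&b, NULL, flusher, (void *)0x100000UL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

In the kernel, the equivalent of "keeping the other flusher away" is
currently provided by the get_cpu()/put_cpu() pair around the whole
sequence; that is the restriction the next patch aims to relax.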

From the perspective of improving kernel real-time performance, this
patch moves flush_tlb_info back to stack variables. This is preparation
for enabling preemption during TLB flushes in the next patch.

Signed-off-by: Chuyi Zhou <zhouchuyi@...edance.com>
---
 arch/x86/mm/tlb.c | 124 ++++++++++++++++++----------------------------
 1 file changed, 49 insertions(+), 75 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index f5b93e01e347..2d68297ed35b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1394,71 +1394,30 @@ void flush_tlb_multi(const struct cpumask *cpumask,
  */
 unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
-
-#ifdef CONFIG_DEBUG_VM
-static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
-#endif
-
-static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
-			unsigned long start, unsigned long end,
-			unsigned int stride_shift, bool freed_tables,
-			u64 new_tlb_gen)
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables)
 {
-	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
+	int cpu = get_cpu();
 
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Ensure that the following code is non-reentrant and flush_tlb_info
-	 * is not overwritten. This means no TLB flushing is initiated by
-	 * interrupt handlers and machine-check exception handlers.
-	 */
-	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
-#endif
+	struct flush_tlb_info info = {
+		.mm = mm,
+		.stride_shift = stride_shift,
+		.freed_tables = freed_tables,
+		.trim_cpumask = 0,
+		.initiating_cpu = cpu
+	};
 
-	/*
-	 * If the number of flushes is so large that a full flush
-	 * would be faster, do a full flush.
-	 */
 	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
 		start = 0;
 		end = TLB_FLUSH_ALL;
 	}
 
-	info->start		= start;
-	info->end		= end;
-	info->mm		= mm;
-	info->stride_shift	= stride_shift;
-	info->freed_tables	= freed_tables;
-	info->new_tlb_gen	= new_tlb_gen;
-	info->initiating_cpu	= smp_processor_id();
-	info->trim_cpumask	= 0;
-
-	return info;
-}
-
-static void put_flush_tlb_info(void)
-{
-#ifdef CONFIG_DEBUG_VM
-	/* Complete reentrancy prevention checks */
-	barrier();
-	this_cpu_dec(flush_tlb_info_idx);
-#endif
-}
-
-void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned int stride_shift,
-				bool freed_tables)
-{
-	struct flush_tlb_info *info;
-	int cpu = get_cpu();
-	u64 new_tlb_gen;
-
 	/* This is also a barrier that synchronizes with switch_mm(). */
-	new_tlb_gen = inc_mm_tlb_gen(mm);
+	info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
-	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
-				  new_tlb_gen);
+	info.start = start;
+	info.end = end;
 
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
@@ -1466,19 +1425,18 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	 * flush_tlb_func_local() directly in this case.
 	 */
 	if (mm_global_asid(mm)) {
-		broadcast_tlb_flush(info);
+		broadcast_tlb_flush(&info);
 	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
-		info->trim_cpumask = should_trim_cpumask(mm);
-		flush_tlb_multi(mm_cpumask(mm), info);
+		info.trim_cpumask = should_trim_cpumask(mm);
+		flush_tlb_multi(mm_cpumask(mm), &info);
 		consider_global_asid(mm);
 	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func(info);
+		flush_tlb_func(&info);
 		local_irq_enable();
 	}
 
-	put_flush_tlb_info();
 	put_cpu();
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
@@ -1548,19 +1506,29 @@ static void kernel_tlb_flush_range(struct flush_tlb_info *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	struct flush_tlb_info *info;
+	struct flush_tlb_info info = {
+		.mm = NULL,
+		.stride_shift = PAGE_SHIFT,
+		.freed_tables = false,
+		.trim_cpumask = 0,
+		.new_tlb_gen = TLB_GENERATION_INVALID
+	};
 
 	guard(preempt)();
 
-	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
-				  TLB_GENERATION_INVALID);
+	if ((end - start) >> PAGE_SHIFT > tlb_single_page_flush_ceiling) {
+		start = 0;
+		end = TLB_FLUSH_ALL;
+	}
 
-	if (info->end == TLB_FLUSH_ALL)
-		kernel_tlb_flush_all(info);
-	else
-		kernel_tlb_flush_range(info);
+	info.initiating_cpu = smp_processor_id();
+	info.start = start;
+	info.end = end;
 
-	put_flush_tlb_info();
+	if (info.end == TLB_FLUSH_ALL)
+		kernel_tlb_flush_all(&info);
+	else
+		kernel_tlb_flush_range(&info);
 }
 
 /*
@@ -1728,12 +1696,19 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
 
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	struct flush_tlb_info *info;
-
 	int cpu = get_cpu();
 
-	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
-				  TLB_GENERATION_INVALID);
+	struct flush_tlb_info info = {
+		.start = 0,
+		.end = TLB_FLUSH_ALL,
+		.mm = NULL,
+		.stride_shift = 0,
+		.freed_tables = false,
+		.new_tlb_gen = TLB_GENERATION_INVALID,
+		.initiating_cpu = cpu,
+		.trim_cpumask = 0,
+	};
+
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
 	 * a local TLB flush is needed. Optimize this use-case by calling
@@ -1743,17 +1718,16 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 		invlpgb_flush_all_nonglobals();
 		batch->unmapped_pages = false;
 	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
-		flush_tlb_multi(&batch->cpumask, info);
+		flush_tlb_multi(&batch->cpumask, &info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func(info);
+		flush_tlb_func(&info);
 		local_irq_enable();
 	}
 
 	cpumask_clear(&batch->cpumask);
 
-	put_flush_tlb_info();
 	put_cpu();
 }
 
-- 
2.20.1
