Message-Id: <20260203112401.3889029-11-zhouchuyi@bytedance.com>
Date: Tue,  3 Feb 2026 19:24:00 +0800
From: "Chuyi Zhou" <zhouchuyi@...edance.com>
To: <tglx@...utronix.de>, <mingo@...hat.com>, <luto@...nel.org>, 
	<peterz@...radead.org>, <paulmck@...nel.org>, <muchun.song@...ux.dev>, 
	<bp@...en8.de>, <dave.hansen@...ux.intel.com>
Cc: <linux-kernel@...r.kernel.org>, "Chuyi Zhou" <zhouchuyi@...edance.com>
Subject: [PATCH 10/11] x86/mm: Enable preemption during native_flush_tlb_multi

flush_tlb_mm_range()/arch_tlbbatch_flush() -> native_flush_tlb_multi() is a
common triggering path in real production environments. When pages are
reclaimed or a process exits, native_flush_tlb_multi() sends IPIs to remote
CPUs and waits for all of them to complete their local TLB flushes. The
overall latency may reach tens of milliseconds when many remote CPUs are
involved or other factors come into play (such as interrupts being
disabled). Since flush_tlb_mm_range()/arch_tlbbatch_flush() always disable
preemption across this wait, scheduling latency for other threads on the
current CPU may increase.
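
For illustration, a simplified sketch of the pre-patch path (not the exact
arch/x86/mm/tlb.c code; the flush_tlb_info setup and the IPI wait are
condensed into comments):

	void flush_tlb_mm_range(struct mm_struct *mm, ...)
	{
		int cpu = get_cpu();	/* preempt_disable() */

		/* ... set up struct flush_tlb_info info ... */

		/*
		 * Sends IPIs and waits until every remote CPU has run its
		 * local flush; with many remote CPUs this can take tens of
		 * milliseconds, all with preemption off on this CPU.
		 */
		flush_tlb_multi(mm_cpumask(mm), &info);

		put_cpu();		/* preempt_enable() */
	}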

The previous patch converted flush_tlb_info from a per-CPU variable to an
on-stack variable. Additionally, it is no longer necessary to explicitly
disable preemption before calling smp_call*(), since those helpers handle
preemption internally. It is now safe to enable preemption during
native_flush_tlb_multi().
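
For reference, the scoped guards used below come from the cleanup.h helpers
(a sketch assuming the DEFINE_LOCK_GUARD_0() definitions for "preempt" and
"migrate" in include/linux/preempt.h; do_flush() is just a placeholder):

	{
		guard(preempt)();	/* preempt_disable() here ... */
		do_flush();
	}				/* ... preempt_enable() at scope exit */

	{
		/*
		 * migrate_disable(): the task may be preempted but cannot
		 * move to another CPU, so smp_processor_id() stays stable.
		 */
		guard(migrate)();
		do_flush();
	}				/* migrate_enable() at scope exit */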

Signed-off-by: Chuyi Zhou <zhouchuyi@...edance.com>
---
 arch/x86/hyperv/mmu.c |  2 ++
 arch/x86/kernel/kvm.c |  4 +++-
 arch/x86/mm/tlb.c     | 23 +++++++++++++----------
 arch/x86/xen/mmu_pv.c |  1 +
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index cfcb60468b01..394f849af10a 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -65,6 +65,8 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
 	unsigned long flags;
 	bool do_lazy = !info->freed_tables;
 
+	guard(preempt)();
+
 	trace_hyperv_mmu_flush_tlb_multi(cpus, info);
 
 	if (!hv_hypercall_pg)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index df78ddee0abb..6b56dab28e66 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -654,8 +654,10 @@ static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
 	u8 state;
 	int cpu;
 	struct kvm_steal_time *src;
-	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+	struct cpumask *flushmask;
 
+	guard(preempt)();
+	flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
 	cpumask_copy(flushmask, cpumask);
 	/*
 	 * We have to call flush only on online vCPUs. And
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 2d68297ed35b..4162d7ff024f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1398,21 +1398,23 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned int stride_shift,
 				bool freed_tables)
 {
-	int cpu = get_cpu();
-
 	struct flush_tlb_info info = {
 		.mm = mm,
 		.stride_shift = stride_shift,
 		.freed_tables = freed_tables,
-		.trim_cpumask = 0,
-		.initiating_cpu = cpu
+		.trim_cpumask = 0
 	};
+	int cpu;
 
 	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
 		start = 0;
 		end = TLB_FLUSH_ALL;
 	}
 
+	migrate_disable();
+
+	cpu = info.initiating_cpu = smp_processor_id();
+
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
@@ -1425,6 +1427,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	 * flush_tlb_func_local() directly in this case.
 	 */
 	if (mm_global_asid(mm)) {
+		guard(preempt)();
 		broadcast_tlb_flush(&info);
 	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
 		info.trim_cpumask = should_trim_cpumask(mm);
@@ -1437,7 +1440,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		local_irq_enable();
 	}
 
-	put_cpu();
+	migrate_enable();
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
 
@@ -1696,8 +1699,6 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
 
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	int cpu = get_cpu();
-
 	struct flush_tlb_info info = {
 		.start = 0,
 		.end = TLB_FLUSH_ALL,
@@ -1705,9 +1706,13 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 		.stride_shift = 0,
 		.freed_tables = false,
 		.new_tlb_gen = TLB_GENERATION_INVALID,
-		.initiating_cpu = cpu,
 		.trim_cpumask = 0,
 	};
+	int cpu;
+
+	guard(migrate)();
+
+	info.initiating_cpu = cpu = smp_processor_id();
 
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
@@ -1727,8 +1732,6 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 	}
 
 	cpumask_clear(&batch->cpumask);
-
-	put_cpu();
 }
 
 /*
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2a4a8deaf612..b801721050f7 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1330,6 +1330,7 @@ static void xen_flush_tlb_multi(const struct cpumask *cpus,
 	const size_t mc_entry_size = sizeof(args->op) +
 		sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
 
+	guard(preempt)();
 	trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);
 
 	if (cpumask_empty(cpus))
-- 
2.20.1
