Message-Id: <20260203112401.3889029-11-zhouchuyi@bytedance.com>
Date: Tue, 3 Feb 2026 19:24:00 +0800
From: "Chuyi Zhou" <zhouchuyi@...edance.com>
To: <tglx@...utronix.de>, <mingo@...hat.com>, <luto@...nel.org>,
<peterz@...radead.org>, <paulmck@...nel.org>, <muchun.song@...ux.dev>,
<bp@...en8.de>, <dave.hansen@...ux.intel.com>
Cc: <linux-kernel@...r.kernel.org>, "Chuyi Zhou" <zhouchuyi@...edance.com>
Subject: [PATCH 10/11] x86/mm: Enable preemption during native_flush_tlb_multi
flush_tlb_mm_range()/arch_tlbbatch_flush() -> native_flush_tlb_multi() is a
common triggering path in real production environments. When pages are
reclaimed or a process exits, native_flush_tlb_multi() sends IPIs to remote
CPUs and waits for all of them to complete their local TLB flushes. The
overall latency may reach tens of milliseconds when many remote CPUs are
involved or due to other factors (such as interrupts being disabled on a
remote CPU). Since flush_tlb_mm_range()/arch_tlbbatch_flush() always disable
preemption for the whole operation, this may cause increased scheduling
latency for other threads on the current CPU.
The previous patch converted flush_tlb_info from a per-CPU variable to an
on-stack variable. Additionally, it is no longer necessary to explicitly
disable preemption before calling smp_call*(), since those helpers handle
preemption internally. It is therefore safe to enable preemption during
native_flush_tlb_multi().
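
For illustration only (not part of the diff below; do_flush() and
pv_flush_tlb_multi() are hypothetical stand-ins for the real callers and
paravirt callbacks), the shape of the change is roughly:

	/* Caller side: keep initiating_cpu stable without disabling preemption. */
	static void do_flush(struct mm_struct *mm)
	{
		struct flush_tlb_info info = { .mm = mm };	/* on-stack, not per-CPU */

		/* Pin the task to this CPU; preemption stays enabled. */
		migrate_disable();
		info.initiating_cpu = smp_processor_id();

		/* The smp_call*() helpers disable preemption internally as needed. */
		flush_tlb_multi(mm_cpumask(mm), &info);

		migrate_enable();
	}

	/* PV callbacks that touch per-CPU state take a narrow preempt guard. */
	static void pv_flush_tlb_multi(const struct cpumask *cpus,
				       const struct flush_tlb_info *info)
	{
		guard(preempt)();	/* scoped preempt_disable()/preempt_enable() */
		/* ... per-CPU cpumask / hypercall argument setup ... */
	}

The point of the split is that only the paravirt callbacks which access
per-CPU data need preemption disabled, and only for that short window; the
generic caller merely needs migration disabled so initiating_cpu stays valid.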
Signed-off-by: Chuyi Zhou <zhouchuyi@...edance.com>
---
arch/x86/hyperv/mmu.c | 2 ++
arch/x86/kernel/kvm.c | 4 +++-
arch/x86/mm/tlb.c | 23 +++++++++++++----------
arch/x86/xen/mmu_pv.c | 1 +
4 files changed, 19 insertions(+), 11 deletions(-)
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index cfcb60468b01..394f849af10a 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -65,6 +65,8 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
unsigned long flags;
bool do_lazy = !info->freed_tables;
+ guard(preempt)();
+
trace_hyperv_mmu_flush_tlb_multi(cpus, info);
if (!hv_hypercall_pg)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index df78ddee0abb..6b56dab28e66 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -654,8 +654,10 @@ static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
u8 state;
int cpu;
struct kvm_steal_time *src;
- struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+ struct cpumask *flushmask;
+ guard(preempt)();
+ flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
cpumask_copy(flushmask, cpumask);
/*
* We have to call flush only on online vCPUs. And
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 2d68297ed35b..4162d7ff024f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1398,21 +1398,23 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
- int cpu = get_cpu();
-
struct flush_tlb_info info = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
- .trim_cpumask = 0,
- .initiating_cpu = cpu
+ .trim_cpumask = 0
};
+ int cpu;
if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
start = 0;
end = TLB_FLUSH_ALL;
}
+ migrate_disable();
+
+ cpu = info.initiating_cpu = smp_processor_id();
+
/* This is also a barrier that synchronizes with switch_mm(). */
info.new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1425,6 +1427,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
* flush_tlb_func_local() directly in this case.
*/
if (mm_global_asid(mm)) {
+ guard(preempt)();
broadcast_tlb_flush(&info);
} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
info.trim_cpumask = should_trim_cpumask(mm);
@@ -1437,7 +1440,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
local_irq_enable();
}
- put_cpu();
+ migrate_enable();
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
@@ -1696,8 +1699,6 @@ EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
- int cpu = get_cpu();
-
struct flush_tlb_info info = {
.start = 0,
.end = TLB_FLUSH_ALL,
@@ -1705,9 +1706,13 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
.stride_shift = 0,
.freed_tables = false,
.new_tlb_gen = TLB_GENERATION_INVALID,
- .initiating_cpu = cpu,
.trim_cpumask = 0,
};
+ int cpu;
+
+ guard(migrate)();
+
+ info.initiating_cpu = cpu = smp_processor_id();
/*
* flush_tlb_multi() is not optimized for the common case in which only
@@ -1727,8 +1732,6 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
}
cpumask_clear(&batch->cpumask);
-
- put_cpu();
}
/*
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2a4a8deaf612..b801721050f7 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1330,6 +1330,7 @@ static void xen_flush_tlb_multi(const struct cpumask *cpus,
const size_t mc_entry_size = sizeof(args->op) +
sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
+ guard(preempt)();
trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);
if (cpumask_empty(cpus))
--
2.20.1