Message-ID: <20250519224849.4a98b33d@fangorn>
Date: Mon, 19 May 2025 22:48:49 -0400
From: Rik van Riel <riel@...riel.com>
To: linux-kernel@...r.kernel.org
Cc: linux-mm@...ck.org, x86@...nel.org, kernel-team@...a.com,
dave.hansen@...ux.intel.com, luto@...nel.org, peterz@...radead.org,
tglx@...utronix.de, mingo@...hat.com, bp@...en8.de, hpa@...or.com,
nadav.amit@...il.com, Rik van Riel <riel@...com>
Subject: Re: [RFC v2.1 9/9] x86/mm: userspace & pageout flushing using Intel
RAR
On Mon, 19 May 2025 21:02:34 -0400
Rik van Riel <riel@...riel.com> wrote:
> From: Rik van Riel <riel@...com>
>
> Use Intel RAR to flush userspace mappings.
The version below no longer segfaults.
However, I am still hitting the WARN_ON() in leave_mm(),
when called from the idle task through cpuidle_enter_state().
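(For reference, the only WARN_ON() in mainline leave_mm() is the
laziness check; trimmed excerpt from a current mainline tree, which
may differ slightly in this series:)

	void leave_mm(void)
	{
		struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

		/* Lazy TLB mode with init_mm loaded: nothing to flush. */
		if (loaded_mm == &init_mm)
			return;

		/* Warn if we're not lazy. */
		WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));

		switch_mm(NULL, &init_mm, NULL);
	}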
---8<---
From e80e10cdb6f15d29a65ab438cb07ba4b99f64b6e Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@...com>
Date: Thu, 24 Apr 2025 07:15:44 -0700
Subject: [PATCH 10/11] x86/mm: userspace & pageout flushing using Intel RAR
Use Intel RAR to flush userspace mappings.
Because RAR flushes are targeted using a CPU bitmap, the rules are
slightly different from those for true broadcast TLB invalidation.
For true broadcast TLB invalidation, as done with AMD INVLPGB,
a global ASID always has up-to-date TLB entries on every CPU.
The context switch code never has to flush the TLB when switching
to a global ASID on any CPU with INVLPGB.
For RAR, the TLB mappings for a global ASID are kept up to date
only on CPUs within the mm_cpumask, which lazily follows the
threads around the system. The context switch code does not
need to flush the TLB if the CPU is in the mm_cpumask, and
the PCID used stays the same.
However, a CPU that falls outside of the mm_cpumask can have
out of date TLB mappings for this task. When switching to
that task on a CPU not in the mm_cpumask, the TLB does need
to be flushed.
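In pseudo-code, the resulting rule at context switch time is roughly
the following (a sketch of the logic only; the actual changes live in
choose_new_asid() and switch_mm_irqs_off() below):

	/* new_cpu: this CPU just (re)joined the mm_cpumask. */
	if (mm_global_asid(next)) {
		/*
		 * INVLPGB: TLB entries are always up to date, no flush.
		 * RAR: flush if this CPU may have missed RAR
		 * invalidations while outside the mm_cpumask.
		 */
		need_flush = cpu_feature_enabled(X86_FEATURE_RAR) && new_cpu;
	}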
Signed-off-by: Rik van Riel <riel@...riel.com>
---
arch/x86/include/asm/tlbflush.h | 9 ++-
arch/x86/mm/tlb.c | 133 +++++++++++++++++++++++++-------
2 files changed, 111 insertions(+), 31 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cc9935bbbd45..bdde3ce6c9b1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -276,7 +276,8 @@ static inline u16 mm_global_asid(struct mm_struct *mm)
{
u16 asid;
- if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB) &&
+ !cpu_feature_enabled(X86_FEATURE_RAR))
return 0;
asid = smp_load_acquire(&mm->context.global_asid);
@@ -289,7 +290,8 @@ static inline u16 mm_global_asid(struct mm_struct *mm)
static inline void mm_init_global_asid(struct mm_struct *mm)
{
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) ||
+ cpu_feature_enabled(X86_FEATURE_RAR)) {
mm->context.global_asid = 0;
mm->context.asid_transition = false;
}
@@ -313,7 +315,8 @@ static inline void mm_clear_asid_transition(struct mm_struct *mm)
static inline bool mm_in_asid_transition(struct mm_struct *mm)
{
- if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB) &&
+ !cpu_feature_enabled(X86_FEATURE_RAR))
return false;
return mm && READ_ONCE(mm->context.asid_transition);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 35489df811dc..457191c2b5de 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -203,7 +203,8 @@ struct new_asid {
unsigned int need_flush : 1;
};
-static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen)
+static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+ bool new_cpu)
{
struct new_asid ns;
u16 asid;
@@ -216,14 +217,22 @@ static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen)
/*
* TLB consistency for global ASIDs is maintained with hardware assisted
- * remote TLB flushing. Global ASIDs are always up to date.
+ * remote TLB flushing. Global ASIDs are always up to date with INVLPGB,
+ * and up to date for CPUs in the mm_cpumask with RAR.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) ||
+ cpu_feature_enabled(X86_FEATURE_RAR)) {
u16 global_asid = mm_global_asid(next);
if (global_asid) {
ns.asid = global_asid;
ns.need_flush = 0;
+ /*
+ * If the CPU fell out of the cpumask, it can be
+ * out of date with RAR, and should be flushed.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_RAR))
+ ns.need_flush = new_cpu;
return ns;
}
}
@@ -281,7 +290,14 @@ static void reset_global_asid_space(void)
{
lockdep_assert_held(&global_asid_lock);
- invlpgb_flush_all_nonglobals();
+ /*
+ * The global flush ensures that a freshly allocated global ASID
+ * has no entries in any TLB, and can be used immediately.
+ * With Intel RAR, the TLB may still need to be flushed at context
+ * switch time when dealing with a CPU that was not in the mm_cpumask
+ * for the process, and may have missed flushes along the way.
+ */
+ flush_tlb_all();
/*
* The TLB flush above makes it safe to re-use the previously
@@ -358,7 +374,7 @@ static void use_global_asid(struct mm_struct *mm)
{
u16 asid;
- guard(raw_spinlock_irqsave)(&global_asid_lock);
+ guard(raw_spinlock)(&global_asid_lock);
/* This process is already using broadcast TLB invalidation. */
if (mm_global_asid(mm))
@@ -384,13 +400,14 @@ static void use_global_asid(struct mm_struct *mm)
void mm_free_global_asid(struct mm_struct *mm)
{
- if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB) &&
+ !cpu_feature_enabled(X86_FEATURE_RAR))
return;
if (!mm_global_asid(mm))
return;
- guard(raw_spinlock_irqsave)(&global_asid_lock);
+ guard(raw_spinlock)(&global_asid_lock);
/* The global ASID can be re-used only after flush at wrap-around. */
#ifdef CONFIG_BROADCAST_TLB_FLUSH
@@ -408,7 +425,8 @@ static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
{
u16 global_asid = mm_global_asid(mm);
- if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB) &&
+ !cpu_feature_enabled(X86_FEATURE_RAR))
return false;
/* Process is transitioning to a global ASID */
@@ -426,13 +444,17 @@ static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
*/
static void consider_global_asid(struct mm_struct *mm)
{
- if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB) &&
+ !cpu_feature_enabled(X86_FEATURE_RAR))
return;
/* Check every once in a while. */
if ((current->pid & 0x1f) != (jiffies & 0x1f))
return;
+ if (mm == &init_mm)
+ return;
+
/*
* Assign a global ASID if the process is active on
* 4 or more CPUs simultaneously.
@@ -480,7 +502,7 @@ static void finish_asid_transition(struct flush_tlb_info *info)
mm_clear_asid_transition(mm);
}
-static void broadcast_tlb_flush(struct flush_tlb_info *info)
+static void invlpgb_tlb_flush(struct flush_tlb_info *info)
{
bool pmd = info->stride_shift == PMD_SHIFT;
unsigned long asid = mm_global_asid(info->mm);
@@ -511,8 +533,6 @@ static void broadcast_tlb_flush(struct flush_tlb_info *info)
addr += nr << info->stride_shift;
} while (addr < info->end);
- finish_asid_transition(info);
-
/* Wait for the INVLPGBs kicked off above to finish. */
__tlbsync();
}
@@ -840,7 +860,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
/* Check if the current mm is transitioning to a global ASID */
if (mm_needs_global_asid(next, prev_asid)) {
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
- ns = choose_new_asid(next, next_tlb_gen);
+ ns = choose_new_asid(next, next_tlb_gen, true);
goto reload_tlb;
}
@@ -878,6 +898,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
ns.asid = prev_asid;
ns.need_flush = true;
} else {
+ bool new_cpu = false;
/*
* Apply process to process speculation vulnerability
* mitigations if applicable.
@@ -892,20 +913,25 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
- /* Start receiving IPIs and then read tlb_gen (and LAM below) */
- if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
+ /* Start receiving IPIs and RAR invalidations */
+ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) {
cpumask_set_cpu(cpu, mm_cpumask(next));
+ if (cpu_feature_enabled(X86_FEATURE_RAR))
+ new_cpu = true;
+ }
+
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
- ns = choose_new_asid(next, next_tlb_gen);
+ ns = choose_new_asid(next, next_tlb_gen, new_cpu);
}
reload_tlb:
new_lam = mm_lam_cr3_mask(next);
if (ns.need_flush) {
- VM_WARN_ON_ONCE(is_global_asid(ns.asid));
- this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen);
+ if (is_dyn_asid(ns.asid)) {
+ this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen);
+ }
load_new_mm_cr3(next->pgd, ns.asid, new_lam, true);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
@@ -1096,7 +1122,7 @@ static void flush_tlb_func(void *info)
u64 local_tlb_gen;
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
- u64 mm_tlb_gen;
+ u64 mm_tlb_gen = 0;
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
@@ -1122,12 +1148,17 @@ static void flush_tlb_func(void *info)
loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
}
- /* Broadcast ASIDs are always kept up to date with INVLPGB. */
- if (is_global_asid(loaded_mm_asid))
+ /*
+ * Broadcast ASIDs are always kept up to date with INVLPGB; with
+ * Intel RAR, IPI based flushes are used periodically to trim the
+ * mm_cpumask, and flushes that get here should be processed.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) &&
+ is_global_asid(loaded_mm_asid))
return;
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
- loaded_mm->context.ctx_id);
+ VM_WARN_ON(is_dyn_asid(loaded_mm_asid) && loaded_mm->context.ctx_id !=
+ this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id));
if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
/*
@@ -1143,7 +1174,8 @@ static void flush_tlb_func(void *info)
return;
}
- local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ if (is_dyn_asid(loaded_mm_asid))
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) {
@@ -1242,7 +1274,8 @@ static void flush_tlb_func(void *info)
}
/* Both paths above update our state to mm_tlb_gen. */
- this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
+ if (is_dyn_asid(loaded_mm_asid))
+ this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
/* Tracing is done in a unified manner to reduce the code size */
done:
@@ -1358,6 +1391,35 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif
+static void rar_tlb_flush(struct flush_tlb_info *info)
+{
+ unsigned long asid = mm_global_asid(info->mm);
+ u16 pcid = kern_pcid(asid);
+
+ /* Flush the remote CPUs. */
+ smp_call_rar_many(mm_cpumask(info->mm), pcid, info->start, info->end);
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ smp_call_rar_many(mm_cpumask(info->mm), user_pcid(asid), info->start, info->end);
+
+ /* Flush the local TLB, if needed. */
+ if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(info->mm))) {
+ lockdep_assert_irqs_enabled();
+ local_irq_disable();
+ flush_tlb_func(info);
+ local_irq_enable();
+ }
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_tlb_flush(info);
+ else /* Intel RAR */
+ rar_tlb_flush(info);
+
+ finish_asid_transition(info);
+}
+
static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned int stride_shift, bool freed_tables,
@@ -1418,15 +1480,22 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
new_tlb_gen);
+ /*
+ * IPIs and RAR can be targeted to a cpumask. Periodically trim that
+ * mm_cpumask by sending TLB flush IPIs, even when most TLB flushes
+ * are done with RAR.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB) || !mm_global_asid(mm))
+ info->trim_cpumask = should_trim_cpumask(mm);
+
/*
* flush_tlb_multi() is not optimized for the common case in which only
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (mm_global_asid(mm)) {
+ if (mm_global_asid(mm) && !info->trim_cpumask) {
broadcast_tlb_flush(info);
} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
- info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
consider_global_asid(mm);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
@@ -1737,6 +1806,14 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
invlpgb_flush_all_nonglobals();
batch->unmapped_pages = false;
+ } else if (cpu_feature_enabled(X86_FEATURE_RAR) && cpumask_any(&batch->cpumask) < nr_cpu_ids) {
+ rar_full_flush(&batch->cpumask);
+ if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ lockdep_assert_irqs_enabled();
+ local_irq_disable();
+ invpcid_flush_all_nonglobals();
+ local_irq_enable();
+ }
} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
--
2.47.1