Message-ID: <alpine.DEB.2.20.1706221037320.1885@nanos>
Date: Thu, 22 Jun 2017 14:21:49 +0200 (CEST)
From: Thomas Gleixner <tglx@...utronix.de>
To: Andy Lutomirski <luto@...nel.org>
cc: X86 ML <x86@...nel.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
Borislav Petkov <bp@...en8.de>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Mel Gorman <mgorman@...e.de>,
"linux-mm@...ck.org" <linux-mm@...ck.org>,
Nadav Amit <nadav.amit@...il.com>,
Rik van Riel <riel@...hat.com>,
Dave Hansen <dave.hansen@...el.com>,
Arjan van de Ven <arjan@...ux.intel.com>,
Peter Zijlstra <peterz@...radead.org>
Subject: Re: [PATCH v3 11/11] x86/mm: Try to preserve old TLB entries using
PCID
On Wed, 21 Jun 2017, Andy Lutomirski wrote:
> On Wed, Jun 21, 2017 at 6:38 AM, Thomas Gleixner <tglx@...utronix.de> wrote:
> > That requires a conditional branch
> >
> > if (asid >= NR_DYNAMIC_ASIDS) {
> > asid = 0;
> > ....
> > }
> >
> > The question is whether 4 IDs would be sufficient which trades the branch
> > for a mask operation. Or you go for 8 and spend another cache line.
>
> Interesting. I'm inclined to either leave it at 6 or reduce it to 4
> for now and to optimize later.
:)
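For reference, the trade looks roughly like this (a sketch only, mirroring
the next_asid/curr_asid handling that the delta patch at the end of this
mail ends up using):

	/* Branch variant: works for any NR_DYNAMIC_ASIDS */
	asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
	if (asid >= NR_DYNAMIC_ASIDS) {
		asid = 0;
		this_cpu_write(cpu_tlbstate.next_asid, 1);
	}

	/* Mask variant: requires NR_DYNAMIC_ASIDS to be a power of 2 */
	asid = this_cpu_inc_return(cpu_tlbstate.curr_asid) & DYNAMIC_ASIDS_MASK;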
> > Hmm. So this loop needs to be taken unconditionally even if the task stays
> > on the same CPU. And of course the number of dynamic IDs has to be kept
> > short in order to make this loop not suck performance wise.
> >
> > Something like the completely dysfunctional below might be worthwhile to
> > explore. At least arch/x86/mm/ compiles :)
> >
> > It gets rid of the loop search and lifts the limit of dynamic ids by
> > trading it for a percpu variable in mm_context_t.
>
> That would work, but it would take a lot more memory on large systems
> with lots of processes, and I'd also be concerned that we might run
> out of dynamic percpu space.
Yeah, did not think about the dynamic percpu space.
> How about a different idea: make the percpu data structure look like a
> 4-way set associative cache. The ctxs array could be, say, 1024
> entries long without using crazy amounts of memory. We'd divide it
> into 256 buckets, so you'd index it like ctxs[4*bucket + slot]. For
> each mm, we choose a random bucket (from 0 through 255), and then we'd
> just loop over the four slots in the bucket in choose_asid(). This
> would require very slightly more arithmetic (I'd guess only one or two
> cycles, though) but, critically, wouldn't touch any more cachelines.
>
> The downside of both of these approaches over the one in this patch is
> that the chance that the percpu cacheline we need is not in the cache
> is quite a bit higher since it's potentially a different cacheline for
> each mm. It would probably still be a win because avoiding the flush
> is really quite valuable.
>
> What do you think? The added code would be tiny.
That might be worth a try.
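A rough sketch of that 4-way set-associative lookup, for illustration only;
the per-mm tlb_bucket field and the per-cpu bucket_rr counter are invented
names here, not anything from the patches:

	#define TLB_NR_SLOTS	4
	#define TLB_NR_BUCKETS	256		/* 1024 ctxs entries total */

	static u16 choose_asid_assoc(struct mm_struct *next, u64 next_tlb_gen,
				     bool *need_flush)
	{
		/* Bucket picked randomly when the mm is created */
		u16 base = next->context.tlb_bucket * TLB_NR_SLOTS;
		u16 slot;

		for (slot = 0; slot < TLB_NR_SLOTS; slot++) {
			if (this_cpu_read(cpu_tlbstate.ctxs[base + slot].ctx_id) !=
			    next->context.ctx_id)
				continue;
			/* Hit: flush only if our cached generation is stale */
			*need_flush = this_cpu_read(cpu_tlbstate.ctxs[base + slot].tlb_gen) <
				      next_tlb_gen;
			return base + slot;
		}

		/* Miss: evict a slot in this bucket, e.g. round robin */
		*need_flush = true;
		slot = this_cpu_inc_return(cpu_tlbstate.bucket_rr) % TLB_NR_SLOTS;
		return base + slot;
	}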
Now one other optimization, which should be trivial to add, is to keep the 4
asid context entries in cpu_tlbstate and cache the last asid in thread_info.
If that's still valid then use it, otherwise unconditionally get a new
one. That avoids the whole loop machinery, and thread_info is cache hot in
the context switch anyway. Delta patch on top of your version below.
> (P.S. Why doesn't random_p32() try arch_random_int()?)
Could you please ask questions which do not require crystal balls for
answering?
Thanks,
tglx
8<-------------------
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -159,8 +159,16 @@ static inline void destroy_context(struc
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);
-extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk);
+extern void __switch_mm_irqs_off(struct mm_struct *prev,
+ struct mm_struct *next, u32 *last_asid);
+
+static inline void switch_mm_irqs_off(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ __switch_mm_irqs_off(prev, next, &tsk->thread_info.asid);
+}
+
#define switch_mm_irqs_off switch_mm_irqs_off
#define activate_mm(prev, next) \
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -54,6 +54,7 @@ struct task_struct;
struct thread_info {
unsigned long flags; /* low level flags */
+ u32 asid;
};
#define INIT_THREAD_INFO(tsk) \
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -83,10 +83,13 @@ static inline u64 bump_mm_tlb_gen(struct
#endif
/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
+ * NR_DYNAMIC_ASIDS must be a power of 2. 4 makes tlb_state fit into two
+ * cache lines.
*/
-#define NR_DYNAMIC_ASIDS 6
+#define NR_DYNAMIC_ASIDS_BITS 2
+#define NR_DYNAMIC_ASIDS (1U << NR_DYNAMIC_ASIDS_BITS)
+#define DYNAMIC_ASIDS_MASK (NR_DYNAMIC_ASIDS - 1)
+#define ASID_NEEDS_FLUSH (1U << 16)
struct tlb_context {
u64 ctx_id;
@@ -102,7 +105,8 @@ struct tlb_state {
*/
struct mm_struct *loaded_mm;
u16 loaded_mm_asid;
- u16 next_asid;
+ u16 curr_asid;
+ u32 notask_asid;
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -812,7 +812,7 @@ void __init zone_sizes_init(void)
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
- .next_asid = 1,
+ .curr_asid = 0,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,43 +30,32 @@
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
-static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
- u16 *new_asid, bool *need_flush)
+static u32 choose_new_asid(mm_context_t *nctx, u32 *last_asid, u64 next_tlb_gen)
{
- u16 asid;
+ struct tlb_context *tctx;
+ u32 asid;
- if (!static_cpu_has(X86_FEATURE_PCID)) {
- *new_asid = 0;
- *need_flush = true;
- return;
- }
+ if (!static_cpu_has(X86_FEATURE_PCID))
+ return ASID_NEEDS_FLUSH;
- for (asid = 0; asid < NR_DYNAMIC_ASIDS; asid++) {
- if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
- next->context.ctx_id)
- continue;
-
- *new_asid = asid;
- *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
- next_tlb_gen);
- return;
+ asid = *last_asid;
+ tctx = this_cpu_ptr(cpu_tlbstate.ctxs + asid);
+ if (likely(tctx->ctx_id == nctx->ctx_id)) {
+ if (tctx->tlb_gen != next_tlb_gen)
+ asid |= ASID_NEEDS_FLUSH;
+ return asid;
}
- /*
- * We don't currently own an ASID slot on this CPU.
- * Allocate a slot.
- */
- *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
- if (*new_asid >= NR_DYNAMIC_ASIDS) {
- *new_asid = 0;
- this_cpu_write(cpu_tlbstate.next_asid, 1);
- }
- *need_flush = true;
+ asid = this_cpu_inc_return(cpu_tlbstate.curr_asid);
+ asid &= DYNAMIC_ASIDS_MASK;
+ *last_asid = asid;
+ return asid | ASID_NEEDS_FLUSH;
}
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ unsigned long flags;
/*
* It's plausible that we're in lazy TLB mode while our mm is init_mm.
@@ -82,21 +71,27 @@ void leave_mm(int cpu)
/* Warn if we're not lazy. */
WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
- switch_mm(NULL, &init_mm, NULL);
+ local_irq_save(flags);
+ switch_mm_irqs_off(NULL, &init_mm, NULL);
+ local_irq_restore(flags);
}
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned long flags;
+ u32 *last_asid;
+
+ last_asid = tsk ? &tsk->thread_info.asid :
+ this_cpu_ptr(&cpu_tlbstate.notask_asid);
local_irq_save(flags);
- switch_mm_irqs_off(prev, next, tsk);
+ __switch_mm_irqs_off(prev, next, last_asid);
local_irq_restore(flags);
}
-void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
+void __switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ u32 *last_asid)
{
unsigned cpu = smp_processor_id();
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -157,8 +152,7 @@ void switch_mm_irqs_off(struct mm_struct
* are not reflected in tlb_gen.)
*/
} else {
- u16 new_asid;
- bool need_flush;
+ u32 new_asid;
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -187,9 +181,11 @@ void switch_mm_irqs_off(struct mm_struct
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
- choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ new_asid = choose_new_asid(&next->context, last_asid,
+ next_tlb_gen);
- if (need_flush) {
+ if (new_asid & ASID_NEEDS_FLUSH) {
+ new_asid &= DYNAMIC_ASIDS_MASK;
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id,
next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen,