lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Thu, 21 Jul 2022 15:13:22 +0200
From:   Alexander Potapenko <glider@...gle.com>
To:     "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
Cc:     Andi Kleen <ak@...ux.intel.com>,
        Andrey Konovalov <andreyknvl@...il.com>,
        Dave Hansen <dave.hansen@...ux.intel.com>,
        Dmitriy Vyukov <dvyukov@...gle.com>,
        "H.J. Lu" <hjl.tools@...il.com>,
        Kostya Serebryany <kcc@...gle.com>,
        LKML <linux-kernel@...r.kernel.org>,
        Linux Memory Management List <linux-mm@...ck.org>,
        Andy Lutomirski <luto@...nel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Rick Edgecombe <rick.p.edgecombe@...el.com>,
        Andrey Ryabinin <ryabinin.a.a@...il.com>,
        Taras Madan <tarasmadan@...gle.com>,
        "the arch/x86 maintainers" <x86@...nel.org>
Subject: Re: [PATCHv5.1 04/13] x86/mm: Handle LAM on context switch

On Wed, Jul 13, 2022 at 5:02 PM Kirill A. Shutemov
<kirill.shutemov@...ux.intel.com> wrote:
>
> Linear Address Masking mode for userspace pointers is encoded in CR3 bits.
> The mode is selected per-thread. Add new thread features to indicate that
> the thread has Linear Address Masking enabled.
>
> switch_mm_irqs_off() now respects these flags and constructs CR3
> accordingly.
>
> The active LAM mode gets recorded in the tlb_state.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
Tested-by: Alexander Potapenko <glider@...gle.com>

> ---
>  v5.1:
>   - Fix build issue with CONFIG_MODULE=y
> ---
>  arch/x86/include/asm/mmu.h         |  3 +++
>  arch/x86/include/asm/mmu_context.h | 24 +++++++++++++++++
>  arch/x86/include/asm/tlbflush.h    | 35 +++++++++++++++++++++++++
>  arch/x86/mm/tlb.c                  | 42 +++++++++++++++++++-----------
>  4 files changed, 89 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 5d7494631ea9..002889ca8978 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -40,6 +40,9 @@ typedef struct {
>
>  #ifdef CONFIG_X86_64
>         unsigned short flags;
> +
> +       /* Active LAM mode:  X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
> +       unsigned long lam_cr3_mask;
>  #endif
>
>         struct mutex lock;
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index b8d40ddeab00..69c943b2ae90 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -91,6 +91,29 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
>  }
>  #endif
>
> +#ifdef CONFIG_X86_64
> +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
> +{
> +       return mm->context.lam_cr3_mask;
> +}
> +
> +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> +       mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
> +}
> +
> +#else
> +
> +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
> +{
> +       return 0;
> +}
> +
> +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> +}
> +#endif
> +
>  #define enter_lazy_tlb enter_lazy_tlb
>  extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
>
> @@ -168,6 +191,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
>  {
>         arch_dup_pkeys(oldmm, mm);
>         paravirt_arch_dup_mmap(oldmm, mm);
> +       dup_lam(oldmm, mm);
>         return ldt_dup_context(oldmm, mm);
>  }
>
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 4af5579c7ef7..efe83d33327f 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -100,6 +100,16 @@ struct tlb_state {
>          */
>         bool invalidate_other;
>
> +#ifdef CONFIG_X86_64
> +       /*
> +        * Active LAM mode.
> +        *
> +        * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM
> +        * disabled.
> +        */
> +       u8 lam;
> +#endif
> +
>         /*
>          * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
>          * the corresponding user PCID needs a flush next time we
> @@ -356,6 +366,30 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
>  }
>  #define huge_pmd_needs_flush huge_pmd_needs_flush
>
> +#ifdef CONFIG_X86_64
> +static inline unsigned long tlbstate_lam_cr3_mask(void)
> +{
> +       unsigned long lam = this_cpu_read(cpu_tlbstate.lam);
> +
> +       return lam << X86_CR3_LAM_U57_BIT;
> +}
> +
> +static inline void set_tlbstate_cr3_lam_mask(unsigned long mask)
> +{
> +       this_cpu_write(cpu_tlbstate.lam, mask >> X86_CR3_LAM_U57_BIT);
> +}
> +
> +#else
> +
> +static inline unsigned long tlbstate_lam_cr3_mask(void)
> +{
> +       return 0;
> +}
> +
> +static inline void set_tlbstate_cr3_lam_mask(u64 mask)
> +{
> +}
> +#endif
>  #endif /* !MODULE */
>
>  static inline void __native_tlb_flush_global(unsigned long cr4)
> @@ -363,4 +397,5 @@ static inline void __native_tlb_flush_global(unsigned long cr4)
>         native_write_cr4(cr4 ^ X86_CR4_PGE);
>         native_write_cr4(cr4);
>  }
> +
>  #endif /* _ASM_X86_TLBFLUSH_H */
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index d400b6d9d246..4c93f87a8928 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -154,17 +154,18 @@ static inline u16 user_pcid(u16 asid)
>         return ret;
>  }
>
> -static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
> +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
>  {
>         if (static_cpu_has(X86_FEATURE_PCID)) {
> -               return __sme_pa(pgd) | kern_pcid(asid);
> +               return __sme_pa(pgd) | kern_pcid(asid) | lam;
>         } else {
>                 VM_WARN_ON_ONCE(asid != 0);
> -               return __sme_pa(pgd);
> +               return __sme_pa(pgd) | lam;
>         }
>  }
>
> -static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
> +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
> +                                             unsigned long lam)
>  {
>         VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
>         /*
> @@ -173,7 +174,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
>          * boot because all CPU's the have same capabilities:
>          */
>         VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
> -       return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
> +       return __sme_pa(pgd) | kern_pcid(asid) | lam | CR3_NOFLUSH;
>  }
>
>  /*
> @@ -274,15 +275,16 @@ static inline void invalidate_user_asid(u16 asid)
>                   (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
>  }
>
> -static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
> +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
> +                           bool need_flush)
>  {
>         unsigned long new_mm_cr3;
>
>         if (need_flush) {
>                 invalidate_user_asid(new_asid);
> -               new_mm_cr3 = build_cr3(pgdir, new_asid);
> +               new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
>         } else {
> -               new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
> +               new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
>         }
>
>         /*
> @@ -491,6 +493,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>  {
>         struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
>         u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
> +       unsigned long prev_lam = tlbstate_lam_cr3_mask();
> +       unsigned long new_lam = mm_lam_cr3_mask(next);
>         bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
>         unsigned cpu = smp_processor_id();
>         u64 next_tlb_gen;
> @@ -520,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>          * isn't free.
>          */
>  #ifdef CONFIG_DEBUG_VM
> -       if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
> +       if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, prev_lam))) {
>                 /*
>                  * If we were to BUG here, we'd be very likely to kill
>                  * the system so hard that we don't see the call trace.
> @@ -622,15 +626,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>                 barrier();
>         }
>
> +       set_tlbstate_cr3_lam_mask(new_lam);
>         if (need_flush) {
>                 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
>                 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
> -               load_new_mm_cr3(next->pgd, new_asid, true);
> +               load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
>
>                 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
>         } else {
>                 /* The new ASID is already up to date. */
> -               load_new_mm_cr3(next->pgd, new_asid, false);
> +               load_new_mm_cr3(next->pgd, new_asid, new_lam, false);
>
>                 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
>         }
> @@ -691,6 +696,10 @@ void initialize_tlbstate_and_flush(void)
>         /* Assert that CR3 already references the right mm. */
>         WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
>
> +       /* LAM expected to be disabled in CR3 and init_mm */
> +       WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
> +       WARN_ON(mm_lam_cr3_mask(&init_mm));
> +
>         /*
>          * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
>          * doesn't work like other CR4 bits because it can only be set from
> @@ -699,8 +708,8 @@ void initialize_tlbstate_and_flush(void)
>         WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
>                 !(cr4_read_shadow() & X86_CR4_PCIDE));
>
> -       /* Force ASID 0 and force a TLB flush. */
> -       write_cr3(build_cr3(mm->pgd, 0));
> +       /* Disable LAM, force ASID 0 and force a TLB flush. */
> +       write_cr3(build_cr3(mm->pgd, 0, 0));
>
>         /* Reinitialize tlbstate. */
>         this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
> @@ -708,6 +717,7 @@ void initialize_tlbstate_and_flush(void)
>         this_cpu_write(cpu_tlbstate.next_asid, 1);
>         this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
>         this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
> +       set_tlbstate_cr3_lam_mask(0);
>
>         for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
>                 this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
> @@ -1047,8 +1057,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>   */
>  unsigned long __get_current_cr3_fast(void)
>  {
> -       unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
> -               this_cpu_read(cpu_tlbstate.loaded_mm_asid));
> +       unsigned long cr3 =
> +               build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
> +               this_cpu_read(cpu_tlbstate.loaded_mm_asid),
> +               tlbstate_lam_cr3_mask());
>
>         /* For now, be very restrictive about when this can be called. */
>         VM_WARN_ON(in_nmi() || preemptible());
> --
> 2.35.1
>


-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Liana Sebastian
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ