Message-ID: <dc40ceb1e6ff29f90b2579deba5ad107fe1fe905.camel@amazon.com>
Date:   Sat, 25 Apr 2020 01:49:20 +0000
From:   "Singh, Balbir" <sblbir@...zon.com>
To:     "thomas.lendacky@....com" <thomas.lendacky@....com>,
        "tglx@...utronix.de" <tglx@...utronix.de>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
CC:     "keescook@...omium.org" <keescook@...omium.org>,
        "tony.luck@...el.com" <tony.luck@...el.com>,
        "benh@...nel.crashing.org" <benh@...nel.crashing.org>,
        "jpoimboe@...hat.com" <jpoimboe@...hat.com>,
        "x86@...nel.org" <x86@...nel.org>,
        "dave.hansen@...el.com" <dave.hansen@...el.com>
Subject: Re: [PATCH v4 1/6] arch/x86/kvm: Refactor l1d flush lifecycle management

On Fri, 2020-04-24 at 13:59 -0500, Tom Lendacky wrote:
> 
> On 4/23/20 9:01 AM, Balbir Singh wrote:
> > Split out the allocation and free routines to be used in a follow-up
> > set of patches (to reuse for L1D flushing).
> > 
> > Signed-off-by: Balbir Singh <sblbir@...zon.com>
> > Reviewed-by: Kees Cook <keescook@...omium.org>
> > ---
> >   arch/x86/include/asm/cacheflush.h |  3 +++
> >   arch/x86/kernel/Makefile          |  1 +
> >   arch/x86/kernel/l1d_flush.c       | 36 +++++++++++++++++++++++++++++++
> >   arch/x86/kvm/vmx/vmx.c            | 25 +++------------------
> >   4 files changed, 43 insertions(+), 22 deletions(-)
> >   create mode 100644 arch/x86/kernel/l1d_flush.c
> > 
> > diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
> > index 63feaf2a5f93..bac56fcd9790 100644
> > --- a/arch/x86/include/asm/cacheflush.h
> > +++ b/arch/x86/include/asm/cacheflush.h
> > @@ -6,6 +6,9 @@
> >   #include <asm-generic/cacheflush.h>
> >   #include <asm/special_insns.h>
> > 
> > +#define L1D_CACHE_ORDER 4
> 
> Since this is becoming a generic function now, shouldn't this value be
> based on the actual L1D cache size? Is this value based on a 32KB data
> cache and the idea is to write twice the size of the cache to be sure that
> every entry has been replaced - with the second 32KB to catch the odd line
> that might not have been pulled in?
> 
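
(For reference: with 4KB pages, order 4 allocates 2^4 = 16 pages = 64KB,
i.e. twice a 32KB L1D, which matches the write-twice-the-cache-size scheme
described above.)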

Currently the only users are the VMX L1TF mitigation and the optional
prctl(). It should really be based on the actual L1D cache size; I checked,
and the largest L1D cache across current x86 parts is 64K. So there are
three options here:

1. Refactor the code: save the L1D cache size and use cpu_dev callbacks
for the L1D flush (a rough sketch follows below).
2. Make the current code depend on the L1D_FLUSH MSR and enable it only
when that feature is available, with no software fallback; then follow it
up with #1.
3. Keep the code as is, on the assumption that L1D <= 64K across current
platforms, and do #1 in a follow-up (since the prctl is optional and the
only other user is the VMX code).
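
For #1, a rough sketch of the direction (illustration only, not part of
this patch): derive the order from the enumerated cache size instead of
hard-coding it. This assumes Intel-style CPUID leaf 4 enumeration; the
helper name l1d_flush_order() is made up, and a real version would have
to verify that the reported cache really is a level-1 data cache (the
type/level bits in EAX) and handle CPUs without leaf 4:

#include <linux/mm.h>		/* get_order() */
#include <asm/processor.h>	/* cpuid_count() */

static unsigned int l1d_flush_order(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int ways, partitions, line_size, sets, l1d_size;

	/* CPUID leaf 4, index 0: deterministic cache parameters. */
	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
	ways       = ((ebx >> 22) & 0x3ff) + 1;
	partitions = ((ebx >> 12) & 0x3ff) + 1;
	line_size  = (ebx & 0xfff) + 1;
	sets       = ecx + 1;
	l1d_size   = ways * partitions * line_size * sets;

	/* Write twice the cache size so every line gets displaced. */
	return get_order(2 * l1d_size);
}

#2 would then reduce to gating the feature on
boot_cpu_has(X86_FEATURE_FLUSH_L1D) and issuing
wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH), with no fallback pages at all.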

Thanks for the review,
Balbir Singh.



> Thanks,
> Tom
> 
> >   void clflush_cache_range(void *addr, unsigned int size);
> > +void *l1d_flush_alloc_pages(void);
> > +void l1d_flush_cleanup_pages(void *l1d_flush_pages);
> > 
> >   #endif /* _ASM_X86_CACHEFLUSH_H */
> > diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> > index 92e1261ec4ec..42c11ca85f1c 100644
> > --- a/arch/x86/kernel/Makefile
> > +++ b/arch/x86/kernel/Makefile
> > @@ -158,3 +158,4 @@ ifeq ($(CONFIG_X86_64),y)
> >   endif
> > 
> >   obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT)        += ima_arch.o
> > +obj-y                                                += l1d_flush.o
> > diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c
> > new file mode 100644
> > index 000000000000..d605878c8f28
> > --- /dev/null
> > +++ b/arch/x86/kernel/l1d_flush.c
> > @@ -0,0 +1,36 @@
> > +#include <linux/mm.h>
> > +#include <asm/cacheflush.h>
> > +
> > +void *l1d_flush_alloc_pages(void)
> > +{
> > +     struct page *page;
> > +     void *l1d_flush_pages = NULL;
> > +     int i;
> > +
> > +     /*
> > +      * This allocation for l1d_flush_pages is not tied to a VM/task's
> > +      * lifetime and so should not be charged to a memcg.
> > +      */
> > +     page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
> > +     if (!page)
> > +             return NULL;
> > +     l1d_flush_pages = page_address(page);
> > +
> > +     /*
> > +      * Initialize each page with a different pattern in
> > +      * order to protect against KSM in the nested
> > +      * virtualization case.
> > +      */
> > +     for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
> > +             memset(l1d_flush_pages + i * PAGE_SIZE, i + 1,
> > +                             PAGE_SIZE);
> > +     }
> > +     return l1d_flush_pages;
> > +}
> > +EXPORT_SYMBOL_GPL(l1d_flush_alloc_pages);
> > +
> > +void l1d_flush_cleanup_pages(void *l1d_flush_pages)
> > +{
> > +     free_pages((unsigned long)l1d_flush_pages, L1D_CACHE_ORDER);
> > +}
> > +EXPORT_SYMBOL_GPL(l1d_flush_cleanup_pages);
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 83050977490c..225aa8219bac 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -203,14 +203,10 @@ static const struct {
> >       [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
> >   };
> > 
> > -#define L1D_CACHE_ORDER 4
> >   static void *vmx_l1d_flush_pages;
> > 
> >   static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
> >   {
> > -     struct page *page;
> > -     unsigned int i;
> > -
> >       if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
> >               l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
> >               return 0;
> > @@ -253,24 +249,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
> > 
> >       if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
> >           !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
> > -             /*
> > -              * This allocation for vmx_l1d_flush_pages is not tied to a VM
> > -              * lifetime and so should not be charged to a memcg.
> > -              */
> > -             page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
> > -             if (!page)
> > +             vmx_l1d_flush_pages = l1d_flush_alloc_pages();
> > +             if (!vmx_l1d_flush_pages)
> >                       return -ENOMEM;
> > -             vmx_l1d_flush_pages = page_address(page);
> > -
> > -             /*
> > -              * Initialize each page with a different pattern in
> > -              * order to protect against KSM in the nested
> > -              * virtualization case.
> > -              */
> > -             for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
> > -                     memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
> > -                            PAGE_SIZE);
> > -             }
> >       }
> > 
> >       l1tf_vmx_mitigation = l1tf;
> > @@ -8026,7 +8007,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {
> >   static void vmx_cleanup_l1d_flush(void)
> >   {
> >       if (vmx_l1d_flush_pages) {
> > -             free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
> > +             l1d_flush_cleanup_pages(vmx_l1d_flush_pages);
> >               vmx_l1d_flush_pages = NULL;
> >       }
> >       /* Restore state so sysfs ignores VMX */
> > 
