Message-ID: <20221215195911.GE3632095@ls.amr.corp.intel.com>
Date: Thu, 15 Dec 2022 11:59:11 -0800
From: Isaku Yamahata <isaku.yamahata@...il.com>
To: Zhi Wang <zhi.wang.linux@...il.com>
Cc: isaku.yamahata@...el.com, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org, isaku.yamahata@...il.com,
Paolo Bonzini <pbonzini@...hat.com>, erdemaktas@...gle.com,
Sean Christopherson <seanjc@...gle.com>,
Sagi Shahar <sagis@...gle.com>,
David Matlack <dmatlack@...gle.com>,
Sean Christopherson <sean.j.christopherson@...el.com>,
Kai Huang <kai.huang@...el.com>,
Zhi Wang <zhi.a.wang@...el.com>
Subject: Re: [PATCH v10 016/108] KVM: TDX: create/destroy VM structure
On Wed, Nov 23, 2022 at 12:36:59PM +0200,
Zhi Wang <zhi.wang.linux@...il.com> wrote:
> On Sat, 29 Oct 2022 23:22:17 -0700
> isaku.yamahata@...el.com wrote:
>
> > From: Sean Christopherson <sean.j.christopherson@...el.com>
> >
> > As the first step toward creating a TDX guest, create/destroy the VM
> > structure. Assign a TDX private Host Key ID (HKID) to the TDX guest
> > for memory encryption and allocate extra pages for the TDX guest. On
> > destruction, free the allocated pages and the HKID.
> >
> > Before tearing down private page tables, TDX requires some resources
> > of the guest TD to be destroyed (e.g. the keyID must have been
> > reclaimed). Add a flush_shadow_all_private callback that runs before
> > tearing down private page tables for this purpose.
> >
> > Add a second kvm_x86_ops hook in kvm_arch_destroy_vm() to support
> > TDX's destruction path, which needs to first put the VM into a
> > teardown state, then free per-vCPU resources, and finally free per-VM
> > resources.
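
To make the ordering concrete, the two hooks slot into the x86.c paths
roughly like this (a simplified sketch with unrelated cleanup elided as
"...", not the literal hunk from this patch):

	void kvm_arch_flush_shadow_all(struct kvm *kvm)
	{
		/*
		 * For TDX, reclaim TD-scoped resources (e.g. the HKID)
		 * before the private page tables are torn down.
		 */
		static_call_cond(kvm_x86_ops.flush_shadow_all_private)(kvm);
		kvm_mmu_zap_all(kvm);
	}

	void kvm_arch_destroy_vm(struct kvm *kvm)
	{
		...
		static_call_cond(kvm_x86_ops.vm_destroy)(kvm);	/* teardown state */
		...
		kvm_destroy_vcpus(kvm);				/* per-vCPU resources */
		...
		static_call_cond(kvm_x86_ops.vm_free)(kvm);	/* final per-VM free */
	}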
> >
> > Co-developed-by: Kai Huang <kai.huang@...el.com>
> > Signed-off-by: Kai Huang <kai.huang@...el.com>
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@...el.com>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@...el.com>
> > ---
> >  arch/x86/include/asm/kvm-x86-ops.h |   2 +
> >  arch/x86/include/asm/kvm_host.h    |   2 +
> >  arch/x86/kvm/vmx/main.c            |  34 ++-
> >  arch/x86/kvm/vmx/tdx.c             | 411 +++++++++++++++++++++++++++++
> >  arch/x86/kvm/vmx/tdx.h             |   2 +
> >  arch/x86/kvm/vmx/x86_ops.h         |  11 +
> >  arch/x86/kvm/x86.c                 |   8 +
> >  7 files changed, 467 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > index 8a5c5ae70bc5..3a29a6b31ee8 100644
> > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > @@ -21,7 +21,9 @@ KVM_X86_OP(has_emulated_msr)
> > KVM_X86_OP(vcpu_after_set_cpuid)
> > KVM_X86_OP(is_vm_type_supported)
> > KVM_X86_OP(vm_init)
> > +KVM_X86_OP_OPTIONAL(flush_shadow_all_private)
> > KVM_X86_OP_OPTIONAL(vm_destroy)
> > +KVM_X86_OP_OPTIONAL(vm_free)
> > KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
> > KVM_X86_OP(vcpu_create)
> > KVM_X86_OP(vcpu_free)
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 2a41a93a80f3..2870155ce6fb 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1472,7 +1472,9 @@ struct kvm_x86_ops {
> > bool (*is_vm_type_supported)(unsigned long vm_type);
> > unsigned int vm_size;
> > int (*vm_init)(struct kvm *kvm);
> > + void (*flush_shadow_all_private)(struct kvm *kvm);
> > void (*vm_destroy)(struct kvm *kvm);
> > + void (*vm_free)(struct kvm *kvm);
> >
> > /* Create, but do not attach this VCPU */
> > int (*vcpu_precreate)(struct kvm *kvm);
> > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> > index 0900ff2f2390..d01a946a18cf 100644
> > --- a/arch/x86/kvm/vmx/main.c
> > +++ b/arch/x86/kvm/vmx/main.c
> > @@ -29,18 +29,44 @@ static __init int vt_hardware_setup(void)
> > return 0;
> > }
> >
> > +static void vt_hardware_unsetup(void)
> > +{
> > + tdx_hardware_unsetup();
> > + vmx_hardware_unsetup();
> > +}
> > +
> > static int vt_vm_init(struct kvm *kvm)
> > {
> > if (is_td(kvm))
> > - return -EOPNOTSUPP; /* Not ready to create guest TD yet. */
> > + return tdx_vm_init(kvm);
> >
> > return vmx_vm_init(kvm);
> > }
> >
> > +static void vt_flush_shadow_all_private(struct kvm *kvm)
> > +{
> > + if (is_td(kvm))
> > + return tdx_mmu_release_hkid(kvm);
> > +}
> > +
> > +static void vt_vm_destroy(struct kvm *kvm)
> > +{
> > + if (is_td(kvm))
> > + return;
> > +
> > + vmx_vm_destroy(kvm);
> > +}
> > +
> > +static void vt_vm_free(struct kvm *kvm)
> > +{
> > + if (is_td(kvm))
> > + return tdx_vm_free(kvm);
> > +}
> > +
> > struct kvm_x86_ops vt_x86_ops __initdata = {
> > .name = "kvm_intel",
> >
> > - .hardware_unsetup = vmx_hardware_unsetup,
> > + .hardware_unsetup = vt_hardware_unsetup,
> > .check_processor_compatibility = vmx_check_processor_compatibility,
> > .hardware_enable = vmx_hardware_enable,
> > @@ -50,7 +76,9 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
> > .is_vm_type_supported = vt_is_vm_type_supported,
> > .vm_size = sizeof(struct kvm_vmx),
> > .vm_init = vt_vm_init,
> > - .vm_destroy = vmx_vm_destroy,
> > + .flush_shadow_all_private = vt_flush_shadow_all_private,
> > + .vm_destroy = vt_vm_destroy,
> > + .vm_free = vt_vm_free,
> >
> > .vcpu_precreate = vmx_vcpu_precreate,
> > .vcpu_create = vmx_vcpu_create,
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index 530e72f85762..ec88dde0d300 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -32,6 +32,401 @@ struct tdx_capabilities {
> > /* Capabilities of KVM + the TDX module. */
> > static struct tdx_capabilities tdx_caps;
> >
> > +/*
> > + * Some TDX SEAMCALLs (TDH.MNG.CREATE, TDH.PHYMEM.CACHE.WB,
> > + * TDH.MNG.KEY.RECLAIMID, TDH.MNG.KEY.FREEID, etc.) try to acquire a
> > + * global lock internally in the TDX module. If that fails,
> > + * TDX_OPERAND_BUSY is returned without spinning or waiting, due to a
> > + * constraint on execution time. It's the caller's responsibility to
> > + * avoid the race (or to retry on TDX_OPERAND_BUSY). Use this mutex to
> > + * avoid races in the TDX module, because the kernel knows better
> > + * about scheduling.
> > + */
> > +static DEFINE_MUTEX(tdx_lock);
> > +static struct mutex *tdx_mng_key_config_lock;
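
The convention this comment sets up looks roughly like the following at a
call site (an illustrative sketch; tdx_vm_init() later in this patch uses
this pattern for TDH.MNG.CREATE):

	u64 err;

	/*
	 * Serialize in the kernel rather than retrying on
	 * TDX_OPERAND_BUSY: the SEAMCALL's global lock cannot be
	 * contended if every caller takes tdx_lock first.
	 */
	mutex_lock(&tdx_lock);
	err = tdh_mng_create(kvm_tdx->tdr.pa, kvm_tdx->hkid);
	mutex_unlock(&tdx_lock);
	if (WARN_ON_ONCE(err)) {
		pr_tdx_error(TDH_MNG_CREATE, err, NULL);
		return -EIO;
	}
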
> > +
> > +static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
> > +{
> > + return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
> > +}
> > +
> > +static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
> > +{
> > + return kvm_tdx->tdr.added;
> > +}
> > +
> > +static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
> > +{
> > + tdx_keyid_free(kvm_tdx->hkid);
> > + kvm_tdx->hkid = -1;
> > +}
> > +
> > +static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
> > +{
> > + return kvm_tdx->hkid > 0;
> > +}
> > +
> > +static void tdx_clear_page(unsigned long page)
> > +{
> > + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> > + unsigned long i;
> > +
> > + /*
> > + * Zeroing the page is only necessary on systems with MKTME-i: when
> > + * a page is reassigned from an old keyid to a new keyid, MOVDIR64B
> > + * is required to clear/write the page with the new keyid to prevent
> > + * an integrity error when the page is read with the new keyid.
> > + *
> > + * The cache line could also be poisoned (even without MKTME-i), so
> > + * clear the poison bit.
> > + */
> > + for (i = 0; i < PAGE_SIZE; i += 64)
> > + movdir64b((void *)(page + i), zero_page);
>
> In patch 11, clflush is used on the movdir64b target. Should we also
> use clflush here?
> 
> I suppose the clflush there is to prevent an unexpected cacheline
> writeback after the movdir64b.
No, clflush doesn't work here. The cache lines tagged with the guest's HKID
need to be flushed, and (potential) poison needs to be cleared. In the host
OS/VMM, clflush operates on the cache lines with HKID=0 (or the HKID of
MKTME), and clflush doesn't clear poison.
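
To put that in code-comment form (an annotated illustration of the loop
above, not the patch itself; the HKID-tagged lines are flushed separately,
e.g. via TDH.PHYMEM.CACHE.WB, before the HKID is reclaimed):

	/*
	 * clflush from the host flushes/invalidates the line for the
	 * HKID=0 (or MKTME) alias of this PA; lines tagged with the TD's
	 * private HKID are untouched, and a poisoned line stays poisoned.
	 *
	 * movdir64b is a 64-byte direct store: it overwrites the full
	 * line with the current keyid, re-initializing the MKTME-i
	 * integrity metadata and clearing any poison.
	 */
	for (i = 0; i < PAGE_SIZE; i += 64)
		movdir64b((void *)(page + i), zero_page);
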
--
Isaku Yamahata <isaku.yamahata@...il.com>