Date:   Wed, 17 Feb 2021 09:29:42 -0800
From:   Sean Christopherson <seanjc@...gle.com>
To:     Maxim Levitsky <mlevitsk@...hat.com>
Cc:     kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
        Wanpeng Li <wanpengli@...cent.com>,
        Borislav Petkov <bp@...en8.de>,
        Paolo Bonzini <pbonzini@...hat.com>,
        Joerg Roedel <joro@...tes.org>,
        Jim Mattson <jmattson@...gle.com>,
        "H. Peter Anvin" <hpa@...or.com>,
        "maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)" <x86@...nel.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Vitaly Kuznetsov <vkuznets@...hat.com>,
        Ingo Molnar <mingo@...hat.com>
Subject: Re: [PATCH 4/7] KVM: nVMX: move inject_page_fault tweak to
 .complete_mmu_init

On Wed, Feb 17, 2021, Maxim Levitsky wrote:
> This fixes a (mostly theoretical) bug which can happen if ept=0 on the
> host and we run a nested guest which triggers an MMU context reset while
> running nested.  In this case the .inject_page_fault callback will be
> lost.
>
> Signed-off-by: Maxim Levitsky <mlevitsk@...hat.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 8 +-------
>  arch/x86/kvm/vmx/nested.h | 1 +
>  arch/x86/kvm/vmx/vmx.c    | 5 ++++-
>  3 files changed, 6 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 0b6dab6915a3..f9de729dbea6 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -419,7 +419,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
>  }
>  
>  
> -static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
> +void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
>  		struct x86_exception *fault)
>  {
>  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> @@ -2620,9 +2620,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>  		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
>  	}
>  
> -	if (!enable_ept)
> -		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
> -
>  	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
>  	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
>  				     vmcs12->guest_ia32_perf_global_ctrl)))
> @@ -4224,9 +4221,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>  	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
>  		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
>  
> -	if (!enable_ept)
> -		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;

Oof, please explicitly call out these types of side effects in the changelog;
it took me a while to piece together that this can be dropped because an MMU
reset is guaranteed, and that reset is also guaranteed to restore
inject_page_fault.
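
For anyone else tracing this: with ept=0, kvm_init_mmu() lands in
init_kvm_softmmu(), which unconditionally rewrites the hooks, roughly:

	context->get_guest_pgd     = get_cr3;
	context->get_pdptr         = kvm_pdptr_read;
	context->inject_page_fault = kvm_inject_page_fault;

i.e. the restore being deleted above is redundant with the guaranteed reset.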

I would even go so far as to say this particular line of code should be removed
in a separate commit.  Unless I'm overlooking something, this code is
effectively a nop, which means it doesn't need to be removed to make the bug fix
functionally correct.

All that being said, I'm pretty sure we can eliminate setting inject_page_fault
dynamically, and I think that would yield more maintainable code; following
these flows is a nightmare.  The change itself will be scarier, but I'm pretty
sure the end result will be a lot cleaner.

And I believe there's also a second bug that would be fixed by such an approach.
Doesn't vmx_inject_page_fault_nested() need to be used for the nested_mmu when
ept=1?  E.g. if the emulator injects a #PF to L2, L1 should still be able to
intercept the #PF even if L1 is using EPT.  This likely hasn't been noticed
because hypervisors typically don't intercept #PF when EPT is enabled.

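For reference, the emulated #PF injection path that would hit this second bug
is roughly the following (abbreviated from arch/x86/kvm/x86.c, so double-check
the details):

bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
				    struct x86_exception *fault)
{
	struct kvm_mmu *fault_mmu;

	/*
	 * With L2 active, an emulated #PF on a GVA is routed through
	 * walk_mmu, i.e. nested_mmu.  When ept=1, nothing ever changes
	 * nested_mmu's inject_page_fault from kvm_inject_page_fault, so an
	 * L1 #PF intercept is bypassed on this path.
	 */
	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
					       vcpu->arch.walk_mmu;
	...
	fault_mmu->inject_page_fault(vcpu, fault);
	...
}
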
Something like this (very incomplete):

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 30e9b0cb9abd..f957514a4d65 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4497,7 +4497,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->direct_map = true;
        context->get_guest_pgd = get_cr3;
        context->get_pdptr = kvm_pdptr_read;
-       context->inject_page_fault = kvm_inject_page_fault;

        if (!is_paging(vcpu)) {
                context->nx = false;
@@ -4687,7 +4686,6 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)

        context->get_guest_pgd     = get_cr3;
        context->get_pdptr         = kvm_pdptr_read;
-       context->inject_page_fault = kvm_inject_page_fault;
 }

 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -4701,7 +4699,6 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
        g_context->mmu_role.as_u64 = new_role.as_u64;
        g_context->get_guest_pgd     = get_cr3;
        g_context->get_pdptr         = kvm_pdptr_read;
-       g_context->inject_page_fault = kvm_inject_page_fault;

        /*
         * L2 page tables are never shadowed, so there is no need to sync
@@ -5272,6 +5269,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
        if (ret)
                goto fail_allocate_root;

+       static_call(kvm_x86_mmu_create)(vcpu);
+
        return ret;
  fail_allocate_root:
        free_mmu_pages(&vcpu->arch.guest_mmu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a63da447ede9..aa6c48295117 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -425,15 +425,14 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
 }


-static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
+static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
                struct x86_exception *fault)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

-       WARN_ON(!is_guest_mode(vcpu));
-
-       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
-               !to_vmx(vcpu)->nested.nested_run_pending) {
+       if (is_guest_mode(vcpu) &&
+           nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
+           !to_vmx(vcpu)->nested.nested_run_pending) {
                vmcs12->vm_exit_intr_error_code = fault->error_code;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
                                  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
@@ -2594,9 +2593,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
        }

-       if (!enable_ept)
-               vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
            WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
                                     vmcs12->guest_ia32_perf_global_ctrl)))
@@ -4198,9 +4194,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);

-       if (!enable_ept)
-               vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
        nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);

        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1204e5f0fe67..0e5ee22eea77 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3081,6 +3081,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vmx->emulation_required = emulation_required(vcpu);
 }

+static void vmx_mmu_create(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.root_mmu.inject_page_fault = vmx_inject_page_fault;
+       vcpu->arch.guest_mmu.inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.nested_mmu.inject_page_fault = vmx_inject_page_fault;
+}
+
 static int vmx_get_max_tdp_level(void)
 {
        if (cpu_has_vmx_ept_5levels())
@@ -7721,6 +7728,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {

        .write_l1_tsc_offset = vmx_write_l1_tsc_offset,

+       .mmu_create = vmx_mmu_create,
        .load_mmu_pgd = vmx_load_mmu_pgd,

        .check_intercept = vmx_check_intercept,

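Note that the sketch also implies a new kvm_x86_ops hook plus the usual
static_call plumbing, which I've omitted; the declaration would be something
like:

	/* in struct kvm_x86_ops, next to .load_mmu_pgd (hypothetical) */
	void (*mmu_create)(struct kvm_vcpu *vcpu);
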
> -
>  	nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
>  
>  	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
> diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
> index 197148d76b8f..2ab279744d38 100644
> --- a/arch/x86/kvm/vmx/nested.h
> +++ b/arch/x86/kvm/vmx/nested.h
> @@ -36,6 +36,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
>  void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
>  bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
>  				 int size);
> +void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault);
>  
>  static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index bf6ef674d688..c43324df4877 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -3254,7 +3254,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
>  
>  static void vmx_complete_mmu_init(struct kvm_vcpu *vcpu)
>  {
> -
> +	if (!enable_ept && is_guest_mode(vcpu)) {
> +		WARN_ON(mmu_is_nested(vcpu));
> +		vcpu->arch.mmu->inject_page_fault = vmx_inject_page_fault_nested;
> +	}
>  }
>  
>  static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> -- 
> 2.26.2
> 
