Message-ID: <20240814114230.hgzrh3cal6k6x2dn@yy-desk-7060>
Date: Wed, 14 Aug 2024 19:42:30 +0800
From: Yuan Yao <yuan.yao@...ux.intel.com>
To: Sean Christopherson <seanjc@...gle.com>
Cc: Paolo Bonzini <pbonzini@...hat.com>, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org, Peter Gonda <pgonda@...gle.com>,
Michael Roth <michael.roth@....com>,
Vishal Annapurve <vannapurve@...gle.com>,
Ackerly Tng <ackerleytng@...gle.com>
Subject: Re: [PATCH 03/22] KVM: x86/mmu: Trigger unprotect logic only on
write-protection page faults
On Fri, Aug 09, 2024 at 12:03:00PM -0700, Sean Christopherson wrote:
> Trigger KVM's various "unprotect gfn" paths if and only if the page fault
> was a write to a write-protected gfn. To do so, add a new page fault
> return code, RET_PF_WRITE_PROTECTED, to explicitly and precisely track
> such page faults.
>
> If a page fault requires emulation for any MMIO (or any reason besides
> write-protection), trying to unprotect the gfn is pointless and risks
> putting the vCPU into an infinite loop. E.g. KVM will put the vCPU into
> an infinite loop if the vCPU manages to trigger MMIO on a page table walk.
>
> Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes")
> Cc: stable@...r.kernel.org
> Signed-off-by: Sean Christopherson <seanjc@...gle.com>
> ---
> arch/x86/kvm/mmu/mmu.c | 78 +++++++++++++++++++--------------
> arch/x86/kvm/mmu/mmu_internal.h | 3 ++
> arch/x86/kvm/mmu/mmutrace.h | 1 +
> arch/x86/kvm/mmu/paging_tmpl.h | 2 +-
> arch/x86/kvm/mmu/tdp_mmu.c | 6 +--
> 5 files changed, 53 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 901be9e420a4..e3aa04c498ea 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2914,10 +2914,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> trace_kvm_mmu_set_spte(level, gfn, sptep);
> }
>
> - if (wrprot) {
> - if (write_fault)
> - ret = RET_PF_EMULATE;
> - }
> + if (wrprot && write_fault)
> + ret = RET_PF_WRITE_PROTECTED;
>
> if (flush)
> kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
> @@ -4549,7 +4547,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> return RET_PF_RETRY;
>
> if (page_fault_handle_page_track(vcpu, fault))
> - return RET_PF_EMULATE;
> + return RET_PF_WRITE_PROTECTED;
>
> r = fast_page_fault(vcpu, fault);
> if (r != RET_PF_INVALID)
> @@ -4642,7 +4640,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
> int r;
>
> if (page_fault_handle_page_track(vcpu, fault))
> - return RET_PF_EMULATE;
> + return RET_PF_WRITE_PROTECTED;
>
> r = fast_page_fault(vcpu, fault);
> if (r != RET_PF_INVALID)
> @@ -4726,6 +4724,9 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
> case RET_PF_EMULATE:
> return -ENOENT;
>
> + case RET_PF_WRITE_PROTECTED:
> + return -EPERM;
> +
> case RET_PF_RETRY:
> case RET_PF_CONTINUE:
> case RET_PF_INVALID:
> @@ -5960,6 +5961,41 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
> write_unlock(&vcpu->kvm->mmu_lock);
> }
>
> +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> + u64 error_code, int *emulation_type)
> +{
> + bool direct = vcpu->arch.mmu->root_role.direct;
> +
> + /*
> + * Before emulating the instruction, check if the error code
> + * was due to a RO violation while translating the guest page.
> + * This can occur when using nested virtualization with nested
> + * paging in both guests. If true, we simply unprotect the page
> + * and resume the guest.
> + */
> + if (direct &&
> + (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
> + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
> + return RET_PF_FIXED;
> + }
> +
> + /*
> + * The gfn is write-protected, but if emulation fails we can still
> + * optimistically try to just unprotect the page and let the processor
> + * re-execute the instruction that caused the page fault. Do not allow
> + * retrying MMIO emulation, as it's not only pointless but could also
> + * cause us to enter an infinite loop because the processor will keep
> + * faulting on the non-existent MMIO address. Retrying an instruction
> + * from a nested guest is also pointless and dangerous as we are only
> + * explicitly shadowing L1's page tables, i.e. unprotecting something
> + * for L1 isn't going to magically fix whatever issue cause L2 to fail.
> + */
> + if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
Looks like the mmio_info_in_cache() check can be removed: with
RET_PF_WRITE_PROTECTED introduced, MMIO emulation should no longer
reach this path, so perhaps WARN_ON_ONCE() it instead.
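Something along these lines, as an untested sketch of the idea (assuming the
logic stays in kvm_mmu_write_protect_fault() and keeping the check as a
safety net in case MMIO somehow still gets here):

	/* MMIO should never be write-protected; warn and fall back to plain emulation. */
	if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
		return RET_PF_EMULATE;

	/* Retrying from a nested guest is pointless, see the comment above. */
	if (!is_guest_mode(vcpu))
		*emulation_type |= EMULTYPE_ALLOW_RETRY_PF;

	return RET_PF_EMULATE;
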
> + *emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
> +
> + return RET_PF_EMULATE;
> +}
> +
> int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
> void *insn, int insn_len)
> {
> @@ -6005,6 +6041,10 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
> if (r < 0)
> return r;
>
> + if (r == RET_PF_WRITE_PROTECTED)
> + r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
> + &emulation_type);
> +
> if (r == RET_PF_FIXED)
> vcpu->stat.pf_fixed++;
> else if (r == RET_PF_EMULATE)
> @@ -6015,32 +6055,6 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
> if (r != RET_PF_EMULATE)
> return 1;
>
> - /*
> - * Before emulating the instruction, check if the error code
> - * was due to a RO violation while translating the guest page.
> - * This can occur when using nested virtualization with nested
> - * paging in both guests. If true, we simply unprotect the page
> - * and resume the guest.
> - */
> - if (vcpu->arch.mmu->root_role.direct &&
> - (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
> - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
> - return 1;
> - }
> -
> - /*
> - * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
> - * optimistically try to just unprotect the page and let the processor
> - * re-execute the instruction that caused the page fault. Do not allow
> - * retrying MMIO emulation, as it's not only pointless but could also
> - * cause us to enter an infinite loop because the processor will keep
> - * faulting on the non-existent MMIO address. Retrying an instruction
> - * from a nested guest is also pointless and dangerous as we are only
> - * explicitly shadowing L1's page tables, i.e. unprotecting something
> - * for L1 isn't going to magically fix whatever issue cause L2 to fail.
> - */
> - if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
> - emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
> emulate:
> return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
> insn_len);
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1721d97743e9..50d2624111f8 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -258,6 +258,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
> * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
> * RET_PF_RETRY: let CPU fault again on the address.
> * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
> + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the
> + * gfn and retry, or emulate the instruction directly.
> * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
> * RET_PF_FIXED: The faulting entry has been fixed.
> * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
> @@ -274,6 +276,7 @@ enum {
> RET_PF_CONTINUE = 0,
> RET_PF_RETRY,
> RET_PF_EMULATE,
> + RET_PF_WRITE_PROTECTED,
> RET_PF_INVALID,
> RET_PF_FIXED,
> RET_PF_SPURIOUS,
> diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
> index 195d98bc8de8..f35a830ce469 100644
> --- a/arch/x86/kvm/mmu/mmutrace.h
> +++ b/arch/x86/kvm/mmu/mmutrace.h
> @@ -57,6 +57,7 @@
> TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
> TRACE_DEFINE_ENUM(RET_PF_RETRY);
> TRACE_DEFINE_ENUM(RET_PF_EMULATE);
> +TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
> TRACE_DEFINE_ENUM(RET_PF_INVALID);
> TRACE_DEFINE_ENUM(RET_PF_FIXED);
> TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 69941cebb3a8..a722a3c96af9 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -805,7 +805,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>
> if (page_fault_handle_page_track(vcpu, fault)) {
> shadow_page_table_clear_flood(vcpu, fault->addr);
> - return RET_PF_EMULATE;
> + return RET_PF_WRITE_PROTECTED;
> }
>
> r = mmu_topup_memory_caches(vcpu, true);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index c7dc49ee7388..8bf44ac9372f 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1046,10 +1046,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
> * protected, emulation is needed. If the emulation was skipped,
> * the vCPU would have the same fault again.
> */
> - if (wrprot) {
> - if (fault->write)
> - ret = RET_PF_EMULATE;
> - }
> + if (wrprot && fault->write)
> + ret = RET_PF_WRITE_PROTECTED;
>
> /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
> if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
> --
> 2.46.0.76.ge559c4bf1a-goog
>
>