linux-kernel - Re: [PATCH v2 8/8] KVM: x86: hyper-v: Deactivate APICv only when AutoEOI feature is in use

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <64ed28249c1895a59c9f2e2aa2e4c09a381f69e5.camel@redhat.com>
Date:   Thu, 22 Jul 2021 20:35:54 +0300
From:   Maxim Levitsky <mlevitsk@...hat.com>
To:     Sean Christopherson <seanjc@...gle.com>
Cc:     kvm@...r.kernel.org,
        "open list:X86 ARCHITECTURE (32-BIT AND 64-BIT)" 
        <linux-kernel@...r.kernel.org>, Jim Mattson <jmattson@...gle.com>,
        Joerg Roedel <joro@...tes.org>, Borislav Petkov <bp@...en8.de>,
        Vitaly Kuznetsov <vkuznets@...hat.com>,
        Wanpeng Li <wanpengli@...cent.com>,
        Paolo Bonzini <pbonzini@...hat.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        "H. Peter Anvin" <hpa@...or.com>, Ingo Molnar <mingo@...hat.com>,
        "maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)" <x86@...nel.org>
Subject: Re: [PATCH v2 8/8] KVM: x86: hyper-v: Deactivate APICv only when
 AutoEOI feature is in use

On Mon, 2021-07-19 at 18:49 +0000, Sean Christopherson wrote:
> On Sun, Jul 18, 2021, Maxim Levitsky wrote:
> > I am more inclined to fix this by just tracking if we hold the srcu
> > lock on each VCPU manually, just as we track the srcu index anyway,
> > and then kvm_request_apicv_update can use this to drop the srcu
> > lock when needed.
> 
> The entire approach of dynamically adding/removing the memslot seems doomed to
> failure, and is likely responsible for the performance issues with AVIC, e.g. a
> single vCPU temporarily inhibiting AVIC will zap all SPTEs _twice_; on disable
> and again on re-enable.
> 
> Rather than pile on more gunk, what about special casing the APIC access page
> memslot in try_async_pf()?  E.g. zap the GFN in avic_update_access_page() when
> disabling (and bounce through kvm_{inc,dec}_notifier_count()), and have the page
> fault path skip directly to MMIO emulation without caching the MMIO info.  It'd
> also give us a good excuse to rename try_async_pf() :-)
> 
> If lack of MMIO caching is a performance problem, an alternative solution would
> be to allow caching but add a helper to zap the MMIO SPTE and request all vCPUs to
> clear their cache.
> 
> It's all a bit gross, especially hijacking the mmu_notifier path, but IMO it'd be
> less awful than the current memslot+SRCU mess.

Hi!

I am testing your approach and it actually works very well! I can't seem to break it.

Could you explain why do I need to do something with kvm_{inc,dec}_notifier_count()) ?

I just use your patch as is, plus the changes that are needed in kvm_request_apicv_update.

On AVIC unlike APICv, the page in this memslot is pinned in the physical memory still
(AVIC misses the code that APICv has in this regard).
It should be done in the future though.

Best regards,
	Maxim Levitsky

> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index f4d35289f59e..ea434d76cfb0 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3767,9 +3767,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
>  }
> 
> -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
> -                        bool write, bool *writable)
> +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> +                           gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
> +                           bool write, bool *writable, int *r)
>  {
>         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>         bool async;
> @@ -3780,13 +3780,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>          * be zapped before KVM inserts a new MMIO SPTE for the gfn.
>          */
>         if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
> -               return true;
> +               goto out_retry;
> 
> -       /* Don't expose private memslots to L2. */
> -       if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
> -               *pfn = KVM_PFN_NOSLOT;
> -               *writable = false;
> -               return false;
> +       if (!kvm_is_visible_memslot(slot)) {
> +               /* Don't expose private memslots to L2. */
> +               if (is_guest_mode(vcpu)) {
> +                       *pfn = KVM_PFN_NOSLOT;
> +                       *writable = false;
> +                       return false;
> +               }
> +               /*
> +                * If the APIC access page exists but is disabled, go directly
> +                * to emulation without caching the MMIO access or creating a
> +                * MMIO SPTE.  That way the cache doesn't need to be purged
> +                * when the AVIC is re-enabled.
> +                */
> +               if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
> +                   !vcpu->kvm->arch.apic_access_memslot_enabled) {
> +                       *r = RET_PF_EMULATE;
> +                       return true;
> +               }
>         }
> 
>         async = false;
> @@ -3800,14 +3813,19 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
>                         trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
>                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
> -                       return true;
> -               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
> -                       return true;
> +                       goto out_retry;
> +               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) {
> +                       goto out_retry;
> +               }
>         }
> 
>         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
>                                     write, writable, hva);
>         return false;
> +
> +out_retry:
> +       *r = RET_PF_RETRY;
> +       return true;
>  }
> 
>  static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
> @@ -3839,9 +3857,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
>         mmu_seq = vcpu->kvm->mmu_notifier_seq;
>         smp_rmb();
> 
> -       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
> -                        write, &map_writable))
> -               return RET_PF_RETRY;
> +       if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, write,
> +                           &map_writable, &r))
> +               return r;
> 
>         if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
>                 return r;
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 490a028ddabe..9747124b877d 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
>         mmu_seq = vcpu->kvm->mmu_notifier_seq;
>         smp_rmb();
>  
> -       if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
> -                        write_fault, &map_writable))
> -               return RET_PF_RETRY;
> +       if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
> +                           write_fault, &map_writable, &r))
> +               return r;
>  
>         if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
>                 return r;
>