linux-kernel - Re: [PATCH] KVM/nVMX: Stop mapping the "APIC-access address" page into the kernel

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <1543856375.17106.25.camel@amazon.de>
Date:   Mon, 3 Dec 2018 16:59:36 +0000
From:   "Raslan, KarimAllah" <karahmed@...zon.de>
To:     "jmattson@...gle.com" <jmattson@...gle.com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "rkrcmar@...hat.com" <rkrcmar@...hat.com>,
        "kvm@...r.kernel.org" <kvm@...r.kernel.org>,
        "pbonzini@...hat.com" <pbonzini@...hat.com>
Subject: Re: [PATCH] KVM/nVMX: Stop mapping the "APIC-access address" page
 into the kernel

On Mon, 2018-12-03 at 14:59 +0100, KarimAllah Ahmed wrote:
> The "APIC-access address" is simply a token that the hypervisor puts into
> the PFN of a 4K EPTE (or PTE if using shadow paging) that triggers APIC
> virtualization whenever a page walk terminates with that PFN. This address
> has to be a legal address (i.e.  within the physical address supported by
> the CPU), but it need not have WB memory behind it. In fact, it need not
> have anything at all behind it. When bit 31 ("activate secondary controls")
> of the primary processor-based VM-execution controls is set and bit 0
> ("virtualize APIC accesses") of the secondary processor-based VM-execution
> controls is set, the PFN recorded in the VMCS "APIC-access address" field
> will never be touched. (Instead, the access triggers APIC virtualization,
> which may access the PFN recorded in the "Virtual-APIC address" field of
> the VMCS.)
> 
> So stop mapping the "APIC-access address" page into the kernel and even
> drop the requirements to have a valid page backing it. Instead, just use
> some token that:
> 
> 1) Is not one of the valid guest pages.
> 2) Is within the physical address range supported by the CPU.
> 
> Suggested-by: Jim Mattson <jmattson@...gle.com>
> Signed-off-by: KarimAllah Ahmed <karahmed@...zon.de>
> ---
> 
> Thanks Jim for the commit message :)
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu.c              | 10 ++++++
>  arch/x86/kvm/vmx.c              | 71 ++++++++++++++++++-----------------------
>  3 files changed, 42 insertions(+), 40 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fbda5a9..7e50196 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1077,6 +1077,7 @@ struct kvm_x86_ops {
>  	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
>  	void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
>  	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
> +	bool (*nested_apic_access_addr)(struct kvm_vcpu *vcpu, gpa_t gpa, hpa_t *hpa);
>  	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
>  	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 7c03c0f..ae46a8d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3962,9 +3962,19 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>  			 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
>  {
> +	hpa_t hpa;
>  	struct kvm_memory_slot *slot;
>  	bool async;
>  
> +	if (is_guest_mode(vcpu) &&
> +	    kvm_x86_ops->nested_apic_access_addr &&
> +	    kvm_x86_ops->nested_apic_access_addr(vcpu, gfn_to_gpa(gfn), &hpa)) {
> +		*pfn = hpa >> PAGE_SHIFT;
> +		if (writable)
> +			*writable = true;
> +		return false;
> +	}

Thinking about this further, I still need to validate that the L12 EPT entry
for this gfn actually contains the APIC-access address, so that I only fix up
the fault when the L1 hypervisor has set up both the L12 VMCS APIC-access
address and the L12 EPT to contain the same address.

Will fix and send v2.

> +
>  	/*
>  	 * Don't expose private memslots to L2.
>  	 */
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 83a614f..340cf56 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -864,7 +864,6 @@ struct nested_vmx {
>  	 * Guest pages referred to in the vmcs02 with host-physical
>  	 * pointers, so we must keep them pinned while L2 runs.
>  	 */
> -	struct page *apic_access_page;
>  	struct kvm_host_map virtual_apic_map;
>  	struct kvm_host_map pi_desc_map;
>  	struct kvm_host_map msr_bitmap_map;
> @@ -8512,10 +8511,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
>  	kfree(vmx->nested.cached_vmcs12);
>  	kfree(vmx->nested.cached_shadow_vmcs12);
>  	/* Unpin physical memory we referred to in the vmcs02 */
> -	if (vmx->nested.apic_access_page) {
> -		kvm_release_page_dirty(vmx->nested.apic_access_page);
> -		vmx->nested.apic_access_page = NULL;
> -	}
>  	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
>  	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
>  	vmx->nested.pi_desc = NULL;
> @@ -11901,41 +11896,27 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
>  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>  						 struct vmcs12 *vmcs12);
>  
> +static hpa_t vmx_apic_access_addr(void)
> +{
> +	/*
> +	 * The physical address chosen here has to:
> +	 * 1) Never be an address that could be assigned to a guest.
> +	 * 2) Be within the maximum physical limits of the CPU.
> +	 *
> +	 * So our choice below is completely random, but at least it follows
> +	 * these two rules.
> +	 */
> +	return __pa_symbol(_text) & PAGE_MASK;
> +}
> +
>  static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
>  {
>  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	struct kvm_host_map *map;
> -	struct page *page;
> -	u64 hpa;
>  
> -	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
> -		/*
> -		 * Translate L1 physical address to host physical
> -		 * address for vmcs02. Keep the page pinned, so this
> -		 * physical address remains valid. We keep a reference
> -		 * to it so we can release it later.
> -		 */
> -		if (vmx->nested.apic_access_page) { /* shouldn't happen */
> -			kvm_release_page_dirty(vmx->nested.apic_access_page);
> -			vmx->nested.apic_access_page = NULL;
> -		}
> -		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
> -		/*
> -		 * If translation failed, no matter: This feature asks
> -		 * to exit when accessing the given address, and if it
> -		 * can never be accessed, this feature won't do
> -		 * anything anyway.
> -		 */
> -		if (!is_error_page(page)) {
> -			vmx->nested.apic_access_page = page;
> -			hpa = page_to_phys(vmx->nested.apic_access_page);
> -			vmcs_write64(APIC_ACCESS_ADDR, hpa);
> -		} else {
> -			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
> -					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
> -		}
> -	}
> +	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
> +		vmcs_write64(APIC_ACCESS_ADDR, vmx_apic_access_addr());
>  
>  	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
>  		map = &vmx->nested.virtual_apic_map;
> @@ -14196,12 +14177,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
>  	/* This is needed for same reason as it was needed in prepare_vmcs02 */
>  	vmx->host_rsp = 0;
>  
> -	/* Unpin physical memory we referred to in vmcs02 */
> -	if (vmx->nested.apic_access_page) {
> -		kvm_release_page_dirty(vmx->nested.apic_access_page);
> -		vmx->nested.apic_access_page = NULL;
> -	}
> -
>  	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
>  	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
>  	vmx->nested.pi_desc = NULL;
> @@ -14949,6 +14924,21 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>  
> +static bool vmx_nested_apic_access_addr(struct kvm_vcpu *vcpu,
> +					gpa_t gpa, hpa_t *hpa)
> +{
> +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
> +	    page_address_valid(vcpu, vmcs12->apic_access_addr) &&
> +	    (gpa_to_gfn(gpa) == gpa_to_gfn(vmcs12->apic_access_addr))) {
> +		*hpa = vmx_apic_access_addr();
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.cpu_has_kvm_support = cpu_has_kvm_support,
>  	.disabled_by_bios = vmx_disabled_by_bios,
> @@ -15022,6 +15012,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.update_cr8_intercept = update_cr8_intercept,
>  	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
>  	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
> +	.nested_apic_access_addr = vmx_nested_apic_access_addr,
>  	.get_enable_apicv = vmx_get_enable_apicv,
>  	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
>  	.load_eoi_exitmap = vmx_load_eoi_exitmap,



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrer: Christian Schlaeger, Ralf Herbrich
Ust-ID: DE 289 237 879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B