linux-kernel - Re: [PATCH -v2] KVM, Fix QEMU-KVM is killed by guest SRAO MCE

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20100513214307.GA25003@amt.cnet>
Date:	Thu, 13 May 2010 18:43:07 -0300
From:	Marcelo Tosatti <mtosatti@...hat.com>
To:	Huang Ying <ying.huang@...el.com>
Cc:	Avi Kivity <avi@...hat.com>, Andi Kleen <andi@...stfloor.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	masbock@...ux.vnet.ibm.com,
	"Wu, Fengguang" <fengguang.wu@...el.com>,
	linux-kernel@...r.kernel.org, kvm@...r.kernel.org
Subject: Re: [PATCH -v2] KVM, Fix QEMU-KVM is killed by guest SRAO MCE

On Wed, May 12, 2010 at 02:44:03PM +0800, Huang Ying wrote:
> In the common case, a guest SRAO MCE causes the corresponding poisoned
> page to be un-mapped and SIGBUS to be sent to QEMU-KVM; QEMU-KVM then
> relays the MCE to the guest OS.
> 
> But it is reported that if the poisoned page is accessed in guest
> after un-mapped and before MCE is relayed to guest OS, QEMU-KVM will
> be killed.
> 
> The reason is as follows. Because the poisoned page has been un-mapped,
> guest access will cause guest exit and kvm_mmu_page_fault will be
> called. kvm_mmu_page_fault can not get the poisoned page for fault
> address, so kernel and user space MMIO processing is tried in turn. In
> user MMIO processing, poisoned page is accessed again, then QEMU-KVM
> is killed by force_sig_info.
> 
> To fix the bug, kvm_mmu_page_fault sends a HWPOISON signal to QEMU-KVM
> and does not try kernel and user space MMIO processing for a poisoned
> page.
> 
> 
> Changelog:
> 
> v2:
> 
> - Use a page table walker to determine whether the virtual address is
>   poisoned, to avoid changing the user space interface (via changing
>   get_user_pages).
> 
> - Wrap bad page processing into kvm_handle_bad_page to avoid code
>   duplication.
> 
> Reported-by: Max Asbock <masbock@...ux.vnet.ibm.com>
> Signed-off-by: Huang Ying <ying.huang@...el.com>
> ---
>  arch/x86/kvm/mmu.c         |   34 ++++++++++++++++++++++++++--------
>  arch/x86/kvm/paging_tmpl.h |    7 ++-----
>  include/linux/kvm_host.h   |    1 +
>  include/linux/mm.h         |    8 ++++++++
>  mm/memory-failure.c        |   28 ++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c        |   30 ++++++++++++++++++++++++++++--
>  6 files changed, 93 insertions(+), 15 deletions(-)
> 
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -32,6 +32,7 @@
>  #include <linux/compiler.h>
>  #include <linux/srcu.h>
>  #include <linux/slab.h>
> +#include <linux/uaccess.h>
>  
>  #include <asm/page.h>
>  #include <asm/cmpxchg.h>
> @@ -1975,6 +1976,27 @@ static int __direct_map(struct kvm_vcpu
>  	return pt_write;
>  }
>  
> +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
> +{
> +	char buf[1];
> +	void __user *hva;
> +	int r;
> +
> +	/* Touch the page, so send SIGBUS */
> +	hva = (void __user *)gfn_to_hva(kvm, gfn);
> +	r = copy_from_user(buf, hva, 1);
> +}

A SIGBUS signal has already been raised by memory poisoning, so I don't
see why this is needed.

To avoid the MMIO processing in userspace before the MCE is sent to the
guest you can just return -EAGAIN from the page fault handlers back to
kvm_mmu_page_fault.

> +int is_hwpoison_pfn(pfn_t pfn)
> +{
> +	return pfn == hwpoison_pfn;
> +}
> +EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
> +
>  static inline unsigned long bad_hva(void)
>  {
>  	return PAGE_OFFSET;
> @@ -939,6 +948,11 @@ static pfn_t hva_to_pfn(struct kvm *kvm,
>  	if (unlikely(npages != 1)) {
>  		struct vm_area_struct *vma;
>  
> +		if (is_hwpoison_address(addr)) {
> +			get_page(hwpoison_page);
> +			return page_to_pfn(hwpoison_page);
> +		}
> +
>  		down_read(&current->mm->mmap_sem);
>  		vma = find_vma(current->mm, addr);
>  
> @@ -2198,6 +2212,15 @@ int kvm_init(void *opaque, unsigned int
>  
>  	bad_pfn = page_to_pfn(bad_page);
>  
> +	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +
> +	if (hwpoison_page == NULL) {
> +		r = -ENOMEM;
> +		goto out_free_0;
> +	}
> +
> +	hwpoison_pfn = page_to_pfn(hwpoison_page);
> +
>  	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
>  		r = -ENOMEM;
>  		goto out_free_0;
> @@ -2269,6 +2292,8 @@ out_free_1:
>  out_free_0a:
>  	free_cpumask_var(cpus_hardware_enabled);
>  out_free_0:
> +	if (hwpoison_page)
> +		__free_page(hwpoison_page);
>  	__free_page(bad_page);
>  out:
>  	kvm_arch_exit();
> @@ -2291,6 +2316,7 @@ void kvm_exit(void)
>  	kvm_arch_hardware_unsetup();
>  	kvm_arch_exit();
>  	free_cpumask_var(cpus_hardware_enabled);
> +	__free_page(hwpoison_page);
>  	__free_page(bad_page);
>  }
>  EXPORT_SYMBOL_GPL(kvm_exit);
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -45,6 +45,7 @@
>  #include <linux/page-isolation.h>
>  #include <linux/suspend.h>
>  #include <linux/slab.h>
> +#include <linux/swapops.h>
>  #include "internal.h"
>  
>  int sysctl_memory_failure_early_kill __read_mostly = 0;
> @@ -1296,3 +1297,30 @@ done:
>  	/* keep elevated page count for bad page */
>  	return ret;
>  }
> +
> +int is_hwpoison_address(unsigned long addr)
> +{
> +	pgd_t *pgdp;
> +	pud_t *pudp;
> +	pmd_t *pmdp;
> +	pte_t pte, *ptep;
> +	swp_entry_t entry;
> +
> +	pgdp = pgd_offset(current->mm, addr);
> +	if (!pgd_present(*pgdp))
> +		return 0;
> +	pudp = pud_offset(pgdp, addr);
> +	if (!pud_present(*pudp))
> +		return 0;
> +	pmdp = pmd_offset(pudp, addr);
> +	if (!pmd_present(*pmdp))
> +		return 0;

Need to bail out if pmd is huge.

> +	ptep = pte_offset_map(pmdp, addr);
> +	pte = *ptep;
> +	pte_unmap(ptep);
> +	if (!is_swap_pte(pte))
> +		return 0;
> +	entry = pte_to_swp_entry(pte);
> +	return is_hwpoison_entry(entry);
> +}
> +EXPORT_SYMBOL_GPL(is_hwpoison_address);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/