linux-kernel - Re: [PATCH] Add MCE support to KVM

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <49DE195D.1020303@redhat.com>
Date:	Thu, 09 Apr 2009 18:50:53 +0300
From:	Avi Kivity <avi@...hat.com>
To:	Huang Ying <ying.huang@...el.com>
CC:	kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
	Andi Kleen <andi@...stfloor.org>
Subject: Re: [PATCH] Add MCE support to KVM

Huang Ying wrote:
> Add MCE support to KVM. The related MSRs are emulated. A new vcpu
> ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation such as
> the mcg_cap. MCE is injected via vcpu ioctl command
> KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are
> not implemented.
>
> Signed-off-by: Huang Ying <ying.huang@...el.com>
>
> ---
>  arch/x86/include/asm/kvm_host.h |    5 
>  arch/x86/include/asm/mce.h      |    1 
>  arch/x86/kvm/x86.c              |  202 +++++++++++++++++++++++++++++++++++-----
>  include/linux/kvm.h             |   15 ++
>  4 files changed, 199 insertions(+), 24 deletions(-)
>
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -42,6 +42,7 @@
>  #include <asm/msr.h>
>  #include <asm/desc.h>
>  #include <asm/mtrr.h>
> +#include <asm/mce.h>
>  
>  #define MAX_IO_MSRS 256
>  #define CR0_RESERVED_BITS						\
> @@ -734,23 +735,43 @@ static int set_msr_mtrr(struct kvm_vcpu 
>  	return 0;
>  }
>  
> -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> +static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>  {
> +	u64 mcg_cap = vcpu->arch.mcg_cap;
> +	unsigned bank_num = mcg_cap & 0xff;
> +
>  	switch (msr) {
> -	case MSR_EFER:
> -		set_efer(vcpu, data);
> -		break;
> -	case MSR_IA32_MC0_STATUS:
> -		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
> -		       __func__, data);
> -		break;
>  	case MSR_IA32_MCG_STATUS:
> -		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
> -			__func__, data);
> +		vcpu->arch.mcg_status = data;
>  		break;
>  	case MSR_IA32_MCG_CTL:
> -		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
> -			__func__, data);
> +		if (!(mcg_cap & MCG_CTL_P))
> +			return 1;
> +		if (data != 0 && data != ~(u64)0)
> +			return -1;
> +		vcpu->arch.mcg_ctl = data;
> +		break;
> +	default:
> +		if (msr >= MSR_IA32_MC0_CTL &&
> +		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
> +			u32 offset = msr - MSR_IA32_MC0_CTL;
> +			/* only 0 or all 1s can be written to IA32_MCi_CTL */
> +			if ((offset & 0x3) == 0 &&
> +			    data != 0 && data != ~(u64)0)
> +				return -1;
> +			vcpu->arch.mce_banks[offset] = data;
> +			break;
> +		}
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> +{
> +	switch (msr) {
> +	case MSR_EFER:
> +		set_efer(vcpu, data);
>  		break;
>  	case MSR_IA32_DEBUGCTLMSR:
>  		if (!data) {
> @@ -807,6 +828,8 @@ int kvm_set_msr_common(struct kvm_vcpu *
>  		break;
>  	}
>  	default:
> +		if (!set_msr_mce(vcpu, msr, data))
> +		    break;
>  		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
>  		return 1;
>  	}
>   

Is there any reason you split kvm_set_msr_common() into two functions?

>  
> -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>  {
>  	u64 data;
> +	u64 mcg_cap = vcpu->arch.mcg_cap;
> +	unsigned bank_num = mcg_cap & 0xff;
>  
>  	switch (msr) {
> -	case 0xc0010010: /* SYSCFG */
> -	case 0xc0010015: /* HWCR */
> -	case MSR_IA32_PLATFORM_ID:
>  	case MSR_IA32_P5_MC_ADDR:
>  	case MSR_IA32_P5_MC_TYPE:
> -	case MSR_IA32_MC0_CTL:
> -	case MSR_IA32_MCG_STATUS:
> +		data = 0;
> +		break;
>  	case MSR_IA32_MCG_CAP:
> +		data = vcpu->arch.mcg_cap;
> +		break;
>  	case MSR_IA32_MCG_CTL:
> -	case MSR_IA32_MC0_MISC:
> -	case MSR_IA32_MC0_MISC+4:
> -	case MSR_IA32_MC0_MISC+8:
> -	case MSR_IA32_MC0_MISC+12:
> -	case MSR_IA32_MC0_MISC+16:
> -	case MSR_IA32_MC0_MISC+20:
> +		if (!(mcg_cap & MCG_CTL_P))
> +			return 1;
> +		data = vcpu->arch.mcg_ctl;
> +		break;
> +	case MSR_IA32_MCG_STATUS:
> +		data = vcpu->arch.mcg_status;
> +		break;
> +	default:
> +		if (msr >= MSR_IA32_MC0_CTL &&
> +		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
> +			u32 offset = msr - MSR_IA32_MC0_CTL;
> +			data = vcpu->arch.mce_banks[offset];
> +			break;
> +		}
> +		return 1;
> +	}
> +	*pdata = data;
> +	return 0;
> +}
> +
> +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> +{
> +	u64 data;
> +
> +	switch (msr) {
> +	case 0xc0010010: /* SYSCFG */
> +	case 0xc0010015: /* HWCR */
>   

Please use MSR_ constants (add them if they don't exist yet).

> +	case MSR_IA32_PLATFORM_ID:
>  	case MSR_IA32_UCODE_REV:
>  	case MSR_IA32_EBL_CR_POWERON:
>  	case MSR_IA32_DEBUGCTLMSR:
> @@ -921,6 +967,8 @@ int kvm_get_msr_common(struct kvm_vcpu *
>  		data = vcpu->arch.time;
>  		break;
>  	default:
> +		if (!get_msr_mce(vcpu, msr, &data))
> +			break;
>  		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
>  		return 1;
>  	}
> @@ -1443,6 +1491,87 @@ static int vcpu_ioctl_tpr_access_reporti
>  	return 0;
>  }
>   

Again, why split?

>  
> +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
> +					u64 mcg_cap)
> +{
> +	int r;
> +	unsigned bank_num = mcg_cap & 0xff, bank;
> +	u64 *banks;
> +
> +	r = -EINVAL;
> +	if (!bank_num)
> +		goto out;
> +	r = -ENOMEM;
> +	banks = kzalloc(bank_num * sizeof(u64) * 4, GFP_KERNEL);
>   

If banks is already allocated, you'll leak it.

Why not always allocate it on vcpu setup?

> +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
> +				      struct kvm_x86_mce *mce)
> +{
> +	u64 mcg_cap = vcpu->arch.mcg_cap;
> +	unsigned bank_num = mcg_cap & 0xff;
> +	u64 *banks = vcpu->arch.mce_banks;
> +
> +	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
> +		return -EINVAL;
> +	/*
> +	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
> +	 * reporting is disabled
> +	 */
> +	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
> +	    vcpu->arch.mcg_ctl != ~(u64)0)
> +		return 0;
> +	banks += 4 * mce->bank;
> +	/*
> +	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
> +	 * reporting is disabled for the bank
> +	 */
> +	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
> +		return 0;
> +	if (mce->status & MCI_STATUS_UC) {
> +		u64 status = mce->status;
> +		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
> +		    !(vcpu->arch.cr4 & X86_CR4_MCE)) {
> +			printk(KERN_DEBUG "kvm: set_mce: "
> +			       "injects mce exception while "
> +			       "previous one is in progress!\n");
> +			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
> +			return 0;
> +		}
> +		if (banks[1] & MCI_STATUS_VAL)
> +			status |= MCI_STATUS_OVER;
> +		banks[1] = mce->status;
> +		banks[2] = mce->addr;
> +		banks[3] = mce->misc;
> +		vcpu->arch.mcg_status = mce->mcg_status;
> +		kvm_queue_exception(vcpu, MC_VECTOR);
> +	} else if (!(banks[1] & MCI_STATUS_VAL) ||
> +		   (!(banks[1] & MCI_STATUS_UC) &&
> +		    !((mcg_cap & MCG_TES_P) && ((banks[1]>>53) & 0x3) < 2))) {
> +		u64 status = mce->status;
> +		if (banks[1] & MCI_STATUS_VAL)
> +			status |= MCI_STATUS_OVER;
> +		banks[1] = mce->status;
> +		banks[2] = mce->addr;
> +		banks[3] = mce->misc;
> +	} else
> +		banks[1] |= MCI_STATUS_OVER;
> +	return 0;
> +}
>   

Can userspace just use KVM_SET_MSR for this?

> +	case KVM_X86_SETUP_MCE: {
> +		u64 mcg_cap;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
> +			goto out;
> +		/*
> +		 * extended machine-check state registers and CMCI are
> +		 * not supported.
> +		 */
> +		mcg_cap &= ~(MCG_EXT_P|MCG_CMCI_P);
>   

Instead of silently dropping, should return an error.

> +		if (copy_to_user(argp, &mcg_cap, sizeof mcg_cap))
> +			goto out;
>   

And not copy.



>  #define KVM_TRC_SHIFT           16
>  /*
>   * kvm trace categories
> @@ -528,6 +540,9 @@ struct kvm_irq_routing {
>  #define KVM_NMI                   _IO(KVMIO,  0x9a)
>  /* Available with KVM_CAP_SET_GUEST_DEBUG */
>  #define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
> +/* MCE for x86 */
> +#define KVM_X86_SETUP_MCE           _IOWR(KVMIO,  0x9a, __u64)
> +#define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9b, struct kvm_x86_mce)
>   

Please add a KVM_CAP_ to advertise this capability.


-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/