linux-kernel - Re: [RFC PATCH] KVM: x86: Allow Qemu/KVM to use PVH entry point

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f1518b66-e936-2311-4cfb-be05da5f4009@oracle.com>
Date:   Wed, 29 Nov 2017 09:24:10 -0800
From:   Maran Wilson <maran.wilson@...cle.com>
To:     Juergen Gross <jgross@...e.com>, boris.ostrovsky@...cle.com,
        tglx@...utronix.de, mingo@...hat.com, hpa@...or.com,
        x86@...nel.org, xen-devel@...ts.xenproject.org,
        linux-kernel@...r.kernel.org, roger.pau@...rix.com,
        rkrcmar@...hat.com, JBeulich@...e.com, andrew.cooper3@...rix.com,
        pbonzini@...hat.com, kvm@...r.kernel.org
Subject: Re: [RFC PATCH] KVM: x86: Allow Qemu/KVM to use PVH entry point

On 11/29/2017 12:21 AM, Juergen Gross wrote:
> On 28/11/17 20:34, Maran Wilson wrote:
>> For certain applications it is desirable to rapidly boot a KVM virtual
>> machine. In cases where legacy hardware and software support within the
>> guest is not needed, Qemu should be able to boot directly into the
>> uncompressed Linux kernel binary without the need to run firmware.
>>
>> There already exists an ABI to allow this for Xen PVH guests and the ABI is
>> supported by Linux and FreeBSD:
>>
>>     https://xenbits.xen.org/docs/unstable/misc/hvmlite.html
>>
>> This PoC patch enables Qemu to use that same entry point for booting KVM
>> guests.
>>
>> Even though the code is still PoC quality, I'm sending this as an RFC now
>> since there are a number of different ways the specific implementation
>> details can be handled. I chose a shared code path for Xen and KVM guests
>> but could just as easily create a separate code path that is advertised by
>> a different ELF note for KVM. There also seems to be some flexibility in
>> how the e820 table data is passed and how (or if) it should be identified
>> as e820 data. As a starting point, I've chosen the options that seem to
>> result in the smallest patch with minimal to no changes required of the
>> x86/HVM direct boot ABI.
> I like the idea.
>
> I'd rather split up the different hypervisor types early and use a
> common set of service functions instead of special casing xen_guest
> everywhere. This would make it much easier to support the KVM PVH
> boot without the need to configure the kernel with CONFIG_XEN.

Thanks for the feedback. I'll try doing something like that as this 
patch moves from proof of concept to a real proposal.

> Another option would be to use the same boot path as with grub: set
> the boot params in zeropage and start at startup_32.

I think others have already responded about that. The main thing I was 
trying to avoid was adding any Linux-specific initialization (like the 
zeropage) to QEMU, especially since this PVH entry point already exists 
in Linux.

Thanks,
-Maran

>
> Juergen
>
>> ---
>>   arch/x86/xen/enlighten_pvh.c | 74 ++++++++++++++++++++++++++++++++------------
>>   1 file changed, 55 insertions(+), 19 deletions(-)
>>
>> diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
>> index 98ab176..d93f711 100644
>> --- a/arch/x86/xen/enlighten_pvh.c
>> +++ b/arch/x86/xen/enlighten_pvh.c
>> @@ -31,21 +31,46 @@ static void xen_pvh_arch_setup(void)
>>   		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
>>   }
>>   
>> -static void __init init_pvh_bootparams(void)
>> +static void __init init_pvh_bootparams(bool xen_guest)
>>   {
>>   	struct xen_memory_map memmap;
>>   	int rc;
>>   
>>   	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
>>   
>> -	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
>> -	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
>> -	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
>> -	if (rc) {
>> -		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
>> -		BUG();
>> +	if (xen_guest) {
>> +		memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
>> +		set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
>> +		rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
>> +		if (rc) {
>> +			xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
>> +			BUG();
>> +		}
>> +		pvh_bootparams.e820_entries = memmap.nr_entries;
>> +	} else if (pvh_start_info.nr_modules > 1) {
>> +		/* The second module should be the e820 data for KVM guests */
>> +		struct hvm_modlist_entry *modaddr;
>> +		char e820_sig[] = "e820 data";
>> +		struct boot_e820_entry *ep;
>> +		struct e820_table *tp;
>> +		char *cmdline_str;
>> +		int idx;
>> +
>> +		modaddr = __va(pvh_start_info.modlist_paddr +
>> +			       sizeof(struct hvm_modlist_entry));
>> +		cmdline_str = __va(modaddr->cmdline_paddr);
>> +
>> +		if ((modaddr->cmdline_paddr) &&
>> +		    (!strncmp(e820_sig, cmdline_str, sizeof(e820_sig)))) {
>> +			tp = __va(modaddr->paddr);
>> +			ep = (struct boot_e820_entry *)tp->entries;
>> +
>> +			pvh_bootparams.e820_entries = tp->nr_entries;
>> +
>> +			for (idx = 0; idx < tp->nr_entries ; idx++, ep++)
>> +				pvh_bootparams.e820_table[idx] = *ep;
>> +		}
>>   	}
>> -	pvh_bootparams.e820_entries = memmap.nr_entries;
>>   
>>   	if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
>>   		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr =
>> @@ -55,8 +80,9 @@ static void __init init_pvh_bootparams(void)
>>   		pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type =
>>   			E820_TYPE_RESERVED;
>>   		pvh_bootparams.e820_entries++;
>> -	} else
>> +	} else if (xen_guest) {
>>   		xen_raw_printk("Warning: Can fit ISA range into e820\n");
>> +	}
>>   
>>   	pvh_bootparams.hdr.cmd_line_ptr =
>>   		pvh_start_info.cmdline_paddr;
>> @@ -76,7 +102,7 @@ static void __init init_pvh_bootparams(void)
>>   	 * environment (i.e. hardware_subarch 0).
>>   	 */
>>   	pvh_bootparams.hdr.version = 0x212;
>> -	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
>> +	pvh_bootparams.hdr.type_of_loader = ((xen_guest ? 0x9 : 0xb) << 4) | 0;
>>   }
>>   
>>   /*
>> @@ -85,22 +111,32 @@ static void __init init_pvh_bootparams(void)
>>    */
>>   void __init xen_prepare_pvh(void)
>>   {
>> -	u32 msr;
>> +
>> +	u32 msr = xen_cpuid_base();
>>   	u64 pfn;
>> +	bool xen_guest = msr ? true : false;
>>   
>>   	if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
>> -		xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
>> -				pvh_start_info.magic);
>> +		if (xen_guest)
>> +			xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
>> +					pvh_start_info.magic);
>>   		BUG();
>>   	}
>>   
>> -	xen_pvh = 1;
>> +	if (xen_guest) {
>> +		xen_pvh = 1;
>> +
>> +		msr = cpuid_ebx(msr + 2);
>> +		pfn = __pa(hypercall_page);
>> +		wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
>> +
>> +	} else if (!hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0)) {
>> +		BUG();
>> +	}
>>   
>> -	msr = cpuid_ebx(xen_cpuid_base() + 2);
>> -	pfn = __pa(hypercall_page);
>> -	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
>> +	init_pvh_bootparams(xen_guest);
>>   
>> -	init_pvh_bootparams();
>> +	if (xen_guest)
>> +		x86_init.oem.arch_setup = xen_pvh_arch_setup;
>>   
>> -	x86_init.oem.arch_setup = xen_pvh_arch_setup;
>>   }
>>