[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <SA1PR21MB1335046F4BA2B407C9026130BF6F9@SA1PR21MB1335.namprd21.prod.outlook.com>
Date:   Tue, 2 May 2023 01:34:18 +0000
From:   Dexuan Cui <decui@...rosoft.com>
To:     "Michael Kelley (LINUX)" <mikelley@...rosoft.com>,
        "ak@...ux.intel.com" <ak@...ux.intel.com>,
        "arnd@...db.de" <arnd@...db.de>, "bp@...en8.de" <bp@...en8.de>,
        "brijesh.singh@....com" <brijesh.singh@....com>,
        "dan.j.williams@...el.com" <dan.j.williams@...el.com>,
        "dave.hansen@...ux.intel.com" <dave.hansen@...ux.intel.com>,
        Haiyang Zhang <haiyangz@...rosoft.com>,
        "hpa@...or.com" <hpa@...or.com>,
        "jane.chu@...cle.com" <jane.chu@...cle.com>,
        "kirill.shutemov@...ux.intel.com" <kirill.shutemov@...ux.intel.com>,
        KY Srinivasan <kys@...rosoft.com>,
        "linux-arch@...r.kernel.org" <linux-arch@...r.kernel.org>,
        "linux-hyperv@...r.kernel.org" <linux-hyperv@...r.kernel.org>,
        "luto@...nel.org" <luto@...nel.org>,
        "mingo@...hat.com" <mingo@...hat.com>,
        "peterz@...radead.org" <peterz@...radead.org>,
        "rostedt@...dmis.org" <rostedt@...dmis.org>,
        "sathyanarayanan.kuppuswamy@...ux.intel.com" 
        <sathyanarayanan.kuppuswamy@...ux.intel.com>,
        "seanjc@...gle.com" <seanjc@...gle.com>,
        "tglx@...utronix.de" <tglx@...utronix.de>,
        "tony.luck@...el.com" <tony.luck@...el.com>,
        "wei.liu@...nel.org" <wei.liu@...nel.org>,
        "x86@...nel.org" <x86@...nel.org>
CC:     "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Tianyu Lan <Tianyu.Lan@...rosoft.com>
Subject: RE: [PATCH v5 5/6] Drivers: hv: vmbus: Support TDX guests
> From: Michael Kelley (LINUX) <mikelley@...rosoft.com>
> Sent: Monday, May 1, 2023 10:33 AM
> ...
> From: Dexuan Cui
> >
> > Add Hyper-V specific code so that a TDX guest can run on Hyper-V:
> >   No need to use hv_vp_assist_page.
> >   Don't use the unsafe Hyper-V TSC page.
> >   Don't try to use HV_REGISTER_CRASH_CTL.
> >   Don't trust Hyper-V's TLB-flushing hypercalls.
> >   Don't use lazy EOI.
> >   Share SynIC Event/Message pages and VMBus Monitor pages with the
> >  host.
> 
> This patch no longer does anything with the VMBus monitor pages.
Sorry, I forgot to update the commit log. Will drop this from the log.
> >   Use pgprot_decrypted(PAGE_KERNEL)in hv_ringbuffer_init().
> 
> The above line in the commit message is stale and can be dropped.
Will drop this from the commit log.
> > @@ -116,6 +117,7 @@ int hv_synic_alloc(void)
> >  {
> >  	int cpu;
> >  	struct hv_per_cpu_context *hv_cpu;
> > +	int ret = -ENOMEM;
> >
> >  	/*
> >  	 * First, zero all per-cpu memory areas so hv_synic_free() can
> > @@ -159,6 +161,28 @@ int hv_synic_alloc(void)
> >  				goto err;
> >  			}
> >  		}
> > +
> > +		/* It's better to leak the page if the decryption fails. */
> > +		if (hv_isolation_type_tdx()) {
> > +			ret = set_memory_decrypted(
> > +				(unsigned long)hv_cpu->synic_message_page, 1);
> > +			if (ret) {
> > +				pr_err("Failed to decrypt SYNIC msg page\n");
> > +				hv_cpu->synic_message_page = NULL;
> > +				goto err;
> > +			}
> > +
> > +			ret = set_memory_decrypted(
> > +				(unsigned long)hv_cpu->synic_event_page, 1);
> > +			if (ret) {
> > +				pr_err("Failed to decrypt SYNIC event page\n");
> > +				hv_cpu->synic_event_page = NULL;
> > +				goto err;
> > +			}
> 
> The error handling still doesn't work quite correctly.   In the TDX case, upon
> exiting this function, the synic_message_page and the synic_event_page
> must
> each either be mapped decrypted or be NULL.  This requirement is so
> that hv_synic_free() will do the right thing in changing the mapping back to
> encrypted.  hv_synic_free() can't handle a non-NULL page being encrypted.
> 
> In the above code, if we fail to decrypt the synic_message_page, then setting
> it to NULL will leak the page (which we'll live with) and ensures that
> hv_synic_free()
> will handle it correctly.  But at that point we'll exit with synic_event_page
> non-NULL and in the encrypted state, which hv_synic_free() can't handle.
> 
> Michael
Thanks for spotting the issue! 
I think the below extra changes should do the job:
@@ -121,91 +121,102 @@ int hv_synic_alloc(void)
        /*
         * First, zero all per-cpu memory areas so hv_synic_free() can
         * detect what memory has been allocated and cleanup properly
         * after any failures.
         */
        for_each_present_cpu(cpu) {
                hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
                memset(hv_cpu, 0, sizeof(*hv_cpu));
        }
        hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
                                         GFP_KERNEL);
        if (hv_context.hv_numa_map == NULL) {
                pr_err("Unable to allocate NUMA map\n");
                goto err;
        }
        for_each_present_cpu(cpu) {
                hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
                tasklet_init(&hv_cpu->msg_dpc,
                             vmbus_on_msg_dpc, (unsigned long) hv_cpu);
                /*
                 * Synic message and event pages are allocated by paravisor.
                 * Skip these pages allocation here.
                 */
                if (!hv_isolation_type_snp() && !hv_root_partition) {
                        hv_cpu->synic_message_page =
                                (void *)get_zeroed_page(GFP_ATOMIC);
                        if (hv_cpu->synic_message_page == NULL) {
                                pr_err("Unable to allocate SYNIC message page\n");
                                goto err;
                        }
                        hv_cpu->synic_event_page =
                                (void *)get_zeroed_page(GFP_ATOMIC);
                        if (hv_cpu->synic_event_page == NULL) {
                                pr_err("Unable to allocate SYNIC event page\n");
+
+                               free_page((unsigned long)hv_cpu->synic_message_page);
+                               hv_cpu->synic_message_page = NULL;
+
                                goto err;
                        }
                }
                /* It's better to leak the page if the decryption fails. */
                if (hv_isolation_type_tdx()) {
                        ret = set_memory_decrypted(
                                (unsigned long)hv_cpu->synic_message_page, 1);
                        if (ret) {
                                pr_err("Failed to decrypt SYNIC msg page\n");
                                hv_cpu->synic_message_page = NULL;
+
+                               /*
+                                * Free the event page so that a TDX VM won't
+                                * try to encrypt the page in hv_synic_free().
+                                */
+                               free_page((unsigned long)hv_cpu->synic_event_page);
+                               hv_cpu->synic_event_page = NULL;
                                goto err;
                        }
                        ret = set_memory_decrypted(
                                (unsigned long)hv_cpu->synic_event_page, 1);
                        if (ret) {
                                pr_err("Failed to decrypt SYNIC event page\n");
                                hv_cpu->synic_event_page = NULL;
                                goto err;
                        }
                        memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
                        memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
                }
        }
        return 0;
 err:
        /*
         * Any memory allocations that succeeded will be freed when
         * the caller cleans up by calling hv_synic_free()
         */
        return ret;
 }
I'm going to use the below (i.e. v5 + the above extra changes) in v6.
Please let me know if there is still any bug.
@@ -116,6 +117,7 @@ int hv_synic_alloc(void)
 {
        int cpu;
        struct hv_per_cpu_context *hv_cpu;
+       int ret = -ENOMEM;
        /*
         * First, zero all per-cpu memory areas so hv_synic_free() can
@@ -156,9 +158,42 @@ int hv_synic_alloc(void)
                                (void *)get_zeroed_page(GFP_ATOMIC);
                        if (hv_cpu->synic_event_page == NULL) {
                                pr_err("Unable to allocate SYNIC event page\n");
+
+                               free_page((unsigned long)hv_cpu->synic_message_page);
+                               hv_cpu->synic_message_page = NULL;
+
                                goto err;
                        }
                }
+
+               /* It's better to leak the page if the decryption fails. */
+               if (hv_isolation_type_tdx()) {
+                       ret = set_memory_decrypted(
+                               (unsigned long)hv_cpu->synic_message_page, 1);
+                       if (ret) {
+                               pr_err("Failed to decrypt SYNIC msg page\n");
+                               hv_cpu->synic_message_page = NULL;
+
+                               /*
+                                * Free the event page so that a TDX VM won't
+                                * try to encrypt the page in hv_synic_free().
+                                */
+                               free_page((unsigned long)hv_cpu->synic_event_page);
+                               hv_cpu->synic_event_page = NULL;
+                               goto err;
+                       }
+
+                       ret = set_memory_decrypted(
+                               (unsigned long)hv_cpu->synic_event_page, 1);
+                       if (ret) {
+                               pr_err("Failed to decrypt SYNIC event page\n");
+                               hv_cpu->synic_event_page = NULL;
+                               goto err;
+                       }
+
+                       memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
+                       memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
+               }
        }
        return 0;
@@ -167,18 +202,40 @@ int hv_synic_alloc(void)
         * Any memory allocations that succeeded will be freed when
         * the caller cleans up by calling hv_synic_free()
         */
-       return -ENOMEM;
+       return ret;
 }
 void hv_synic_free(void)
 {
        int cpu;
+       int ret;
        for_each_present_cpu(cpu) {
                struct hv_per_cpu_context *hv_cpu
                        = per_cpu_ptr(hv_context.cpu_context, cpu);
+               /* It's better to leak the page if the encryption fails. */
+               if (hv_isolation_type_tdx()) {
+                       if (hv_cpu->synic_message_page) {
+                               ret = set_memory_encrypted((unsigned long)
+                                       hv_cpu->synic_message_page, 1);
+                               if (ret) {
+                                       pr_err("Failed to encrypt SYNIC msg page\n");
+                                       hv_cpu->synic_message_page = NULL;
+                               }
+                       }
+
+                       if (hv_cpu->synic_event_page) {
+                               ret = set_memory_encrypted((unsigned long)
+                                       hv_cpu->synic_event_page, 1);
+                               if (ret) {
+                                       pr_err("Failed to encrypt SYNIC event page\n");
+                                       hv_cpu->synic_event_page = NULL;
+                               }
+                       }
+               }
+
                free_page((unsigned long)hv_cpu->synic_event_page);
                free_page((unsigned long)hv_cpu->synic_message_page);
        }
I'll post a separate patch (currently if hv_synic_alloc() --> get_zeroed_page() fails,
hv_context.hv_numa_map is leaked):
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1515,27 +1515,27 @@ static int vmbus_bus_init(void)
        }
        ret = hv_synic_alloc();
        if (ret)
                goto err_alloc;
        /*
         * Initialize the per-cpu interrupt state and stimer state.
         * Then connect to the host.
         */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
                                hv_synic_init, hv_synic_cleanup);
        if (ret < 0)
-               goto err_cpuhp;
+               goto err_alloc;
        hyperv_cpuhp_online = ret;
        ret = vmbus_connect();
        if (ret)
                goto err_connect;
        if (hv_is_isolation_supported())
                sysctl_record_panic_msg = 0;
        /*
         * Only register if the crash MSRs are available
         */
        if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
@@ -1567,29 +1567,28 @@ static int vmbus_bus_init(void)
        /*
         * Always register the vmbus unload panic notifier because we
         * need to shut the VMbus channel connection on panic.
         */
        atomic_notifier_chain_register(&panic_notifier_list,
                               &hyperv_panic_vmbus_unload_block);
        vmbus_request_offers();
        return 0;
 err_connect:
        cpuhp_remove_state(hyperv_cpuhp_online);
-err_cpuhp:
-       hv_synic_free();
 err_alloc:
+       hv_synic_free();
        if (vmbus_irq == -1) {
                hv_remove_vmbus_handler();
        } else {
                free_percpu_irq(vmbus_irq, vmbus_evt);
                free_percpu(vmbus_evt);
        }
 err_setup:
        bus_unregister(&hv_bus);
        unregister_sysctl_table(hv_ctl_table_hdr);
        hv_ctl_table_hdr = NULL;
        return ret;
 }
Powered by blists - more mailing lists
 
