Message-ID: <52e916fc-8fe3-f9bc-009e-ca84ab7dd650@amazon.de>
Date: Mon, 6 Jul 2020 12:12:31 +0200
From: Alexander Graf <graf@...zon.de>
To: Andra Paraschiv <andraprs@...zon.com>,
<linux-kernel@...r.kernel.org>
CC: Anthony Liguori <aliguori@...zon.com>,
Benjamin Herrenschmidt <benh@...nel.crashing.org>,
Colm MacCarthaigh <colmmacc@...zon.com>,
"Bjoern Doebel" <doebel@...zon.de>,
David Woodhouse <dwmw@...zon.co.uk>,
"Frank van der Linden" <fllinden@...zon.com>,
Greg KH <gregkh@...uxfoundation.org>,
Martin Pohlack <mpohlack@...zon.de>,
Matt Wilson <msw@...zon.com>,
"Paolo Bonzini" <pbonzini@...hat.com>,
Balbir Singh <sblbir@...zon.com>,
"Stefano Garzarella" <sgarzare@...hat.com>,
Stefan Hajnoczi <stefanha@...hat.com>,
Stewart Smith <trawets@...zon.com>,
Uwe Dannowski <uwed@...zon.de>, <kvm@...r.kernel.org>,
<ne-devel-upstream@...zon.com>
Subject: Re: [PATCH v4 09/18] nitro_enclaves: Add logic for enclave vcpu
creation
On 22.06.20 22:03, Andra Paraschiv wrote:
> An enclave, before being started, has its resources set. One of its
> resources is CPU.
>
> The NE CPU pool is set for choosing CPUs for enclaves from it. Offline
> the CPUs from the NE CPU pool during the pool setup and online them back
> during the NE CPU pool teardown.
>
> The enclave CPUs need to be full cores and from the same NUMA node. CPU
> 0 and its siblings have to remain available to the primary / parent VM.
>
> Add ioctl command logic for enclave vCPU creation. Return as result a
> file descriptor that is associated with the enclave vCPU.
>
> Signed-off-by: Alexandru Vasile <lexnv@...zon.com>
> Signed-off-by: Andra Paraschiv <andraprs@...zon.com>
> ---
> Changelog
>
> v3 -> v4
>
> * Setup the NE CPU pool at runtime via a sysfs file for the kernel
> parameter.
> * Check enclave CPUs to be from the same NUMA node.
> * Use dev_err instead of custom NE log pattern.
> * Update the NE ioctl call to match the decoupling from the KVM API.
>
> v2 -> v3
>
> * Remove the WARN_ON calls.
> * Update static calls sanity checks.
> * Update kzfree() calls to kfree().
> * Remove file ops that do nothing for now - open, ioctl and release.
>
> v1 -> v2
>
> * Add log pattern for NE.
> * Update goto labels to match their purpose.
> * Remove the BUG_ON calls.
> * Check if enclave state is init when setting enclave vcpu.
> ---
> drivers/virt/nitro_enclaves/ne_misc_dev.c | 491 ++++++++++++++++++++++
> 1 file changed, 491 insertions(+)
>
> diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev.c b/drivers/virt/nitro_enclaves/ne_misc_dev.c
> index f70496813033..d6777008f685 100644
> --- a/drivers/virt/nitro_enclaves/ne_misc_dev.c
> +++ b/drivers/virt/nitro_enclaves/ne_misc_dev.c
> @@ -39,7 +39,11 @@
> * TODO: Update logic to create new sysfs entries instead of using
> * a kernel parameter e.g. if multiple sysfs files needed.
> */
> +static int ne_set_kernel_param(const char *val, const struct kernel_param *kp);
> +
> static const struct kernel_param_ops ne_cpu_pool_ops = {
> + .get = param_get_string,
> + .set = ne_set_kernel_param,
> };
>
> static char ne_cpus[PAGE_SIZE];
> @@ -60,6 +64,485 @@ struct ne_cpu_pool {
>
> static struct ne_cpu_pool ne_cpu_pool;
>
> +static const struct file_operations ne_enclave_vcpu_fops = {
> + .owner = THIS_MODULE,
> + .llseek = noop_llseek,
> +};
Do we really need an fd for an object without operations? I think the
general flow to add CPUs from the pool to the VM is very sensible. But I
don't think we really need an fd as return value from that operation.
> +
> +/**
> + * ne_check_enclaves_created - Verify if at least one enclave has been created.
> + *
> + * @pdev: PCI device used for enclave lifetime management.
> + *
> + * @returns: true if at least one enclave is created, false otherwise.
> + */
> +static bool ne_check_enclaves_created(struct pci_dev *pdev)
> +{
> + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev);
> +
> + if (!ne_pci_dev)
> + return false;
Please pass the ne_pci_dev into this function directly.
> +
> + mutex_lock(&ne_pci_dev->enclaves_list_mutex);
> +
> + if (list_empty(&ne_pci_dev->enclaves_list)) {
> + mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
> +
> + return false;
If you make this a return variable, you save on the unlock duplication.
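Something along these lines is what I have in mind (untested sketch, also folding in passing ne_pci_dev directly as suggested above):

static bool ne_check_enclaves_created(struct ne_pci_dev *ne_pci_dev)
{
	bool created;

	mutex_lock(&ne_pci_dev->enclaves_list_mutex);
	created = !list_empty(&ne_pci_dev->enclaves_list);
	mutex_unlock(&ne_pci_dev->enclaves_list_mutex);

	return created;
}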
> + }
> +
> + mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
> +
> + return true;
> +}
> +
> +/**
> + * ne_setup_cpu_pool - Set the NE CPU pool after handling sanity checks such as
> + * not sharing CPU cores with the primary / parent VM or not using CPU 0, which
> + * should remain available for the primary / parent VM. Offline the CPUs from
> + * the pool after the checks passed.
> + *
> + * @pdev: PCI device used for enclave lifetime management.
> + * @ne_cpu_list: the CPU list used for setting NE CPU pool.
> + *
> + * @returns: 0 on success, negative return value on failure.
> + */
> +static int ne_setup_cpu_pool(struct pci_dev *pdev, const char *ne_cpu_list)
> +{
> + unsigned int cpu = 0;
> + unsigned int cpu_sibling = 0;
> + int numa_node = -1;
> + int rc = -EINVAL;
> +
> + if (!capable(CAP_SYS_ADMIN)) {
> + dev_err(&pdev->dev, "No admin capability for CPU pool setup\n");
No need to print anything here. It only gives non-admin users a chance
to spam the kernel log. That is, if non-admin users can write at all -
can they? Also, isn't this at the wrong abstraction level? I would
expect such a check to happen in the file write function, not here.
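To be more concrete, I would expect the set handler itself to just bail
out silently for unprivileged writers, without touching the kernel log
(untested sketch, otherwise keeping the flow from this patch):

static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
{
	struct pci_dev *pdev;
	int rc;

	/* Reject unprivileged writes up front, no log output. */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	pdev = pci_get_device(PCI_VENDOR_ID_AMAZON, PCI_DEVICE_ID_NE, NULL);
	if (!pdev)
		return -ENODEV;

	ne_teardown_cpu_pool(pdev);

	rc = ne_setup_cpu_pool(pdev, val);
	if (rc < 0)
		return rc;

	return param_set_copystring(val, kp);
}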
> +
> + return -EPERM;
> + }
> +
> + if (!ne_cpu_list)
> + return 0;
> +
> + if (ne_check_enclaves_created(pdev)) {
> + dev_err(&pdev->dev, "The CPU pool is used, enclaves created\n");
> +
> + return -EINVAL;
> + }
> +
> + mutex_lock(&ne_cpu_pool.mutex);
> +
> + rc = cpulist_parse(ne_cpu_list, ne_cpu_pool.avail);
> + if (rc < 0) {
> + dev_err(&pdev->dev,
> + "Error in cpulist parse [rc=%d]\n", rc);
> +
> + goto unlock_mutex;
> + }
> +
> + /*
> + * Check if CPU 0 and its siblings are included in the provided CPU pool
> + * They should remain available for the primary / parent VM.
> + */
> + if (cpumask_test_cpu(0, ne_cpu_pool.avail)) {
> +
> + dev_err(&pdev->dev,
> + "CPU 0 has to remain available for the primary VM\n");
Shouldn't this also change the read value of the sysfs file?
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> +
> + for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) {
> + if (cpumask_test_cpu(cpu_sibling, ne_cpu_pool.avail)) {
> + dev_err(&pdev->dev,
> + "CPU sibling %d of CPU 0 is in the CPU pool\n",
> + cpu_sibling);
Same here. I would expect the sysfs file to reflect either the previous
state or <empty> because failures mean no CPUs are donated anymore.
Can we somehow implement the get function of the param as something that
gets generated dynamically?
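Something like the following is what I have in mind for the get side
(untested sketch; "%*pbl" renders the cpumask as a CPU list string):

static int ne_get_kernel_param(char *buffer, const struct kernel_param *kp)
{
	int len;

	mutex_lock(&ne_cpu_pool.mutex);
	/* Report the CPUs actually donated, not the last written string. */
	len = scnprintf(buffer, PAGE_SIZE, "%*pbl\n",
			cpumask_pr_args(ne_cpu_pool.avail));
	mutex_unlock(&ne_cpu_pool.mutex);

	return len;
}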
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> + }
> +
> + /*
> + * Check if CPU siblings are included in the provided CPU pool. The
> + * expectation is that CPU cores are made available in the CPU pool for
> + * enclaves.
> + */
> + for_each_cpu(cpu, ne_cpu_pool.avail) {
> + for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
> + if (!cpumask_test_cpu(cpu_sibling, ne_cpu_pool.avail)) {
> + dev_err(&pdev->dev,
> + "CPU %d is not in the CPU pool\n",
> + cpu_sibling);
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> + }
> + }
> +
> + /*
> + * Check if the CPUs from the NE CPU pool are from the same NUMA node.
> + */
> + for_each_cpu(cpu, ne_cpu_pool.avail) {
> + if (numa_node < 0) {
> + numa_node = cpu_to_node(cpu);
> +
> + if (numa_node < 0) {
> + dev_err(&pdev->dev,
> + "Invalid NUMA node %d\n", numa_node);
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> + } else {
> + if (numa_node != cpu_to_node(cpu)) {
> + dev_err(&pdev->dev,
> + "CPUs are from different NUMA nodes\n");
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> + }
> + }
> +
There should be a comment here that describes the why:
/*
* CPUs that are donated to enclaves should not be considered online
* by Linux anymore, as the hypervisor will degrade them to floating.
*
* We offline them here, to not degrade performance and expose correct
* topology to Linux and user space.
*/
> + for_each_cpu(cpu, ne_cpu_pool.avail) {
> + rc = remove_cpu(cpu);
> + if (rc != 0) {
> + dev_err(&pdev->dev,
> + "CPU %d is not offlined [rc=%d]\n", cpu, rc);
> +
> + goto online_cpus;
> + }
> + }
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return 0;
> +
> +online_cpus:
> + for_each_cpu(cpu, ne_cpu_pool.avail)
> + add_cpu(cpu);
> +unlock_mutex:
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return rc;
> +}
> +
> +/**
> + * ne_teardown_cpu_pool - Online the CPUs from the NE CPU pool and cleanup the
> + * CPU pool.
> + *
> + * @pdev: PCI device used for enclave lifetime management.
> + */
> +static void ne_teardown_cpu_pool(struct pci_dev *pdev)
> +{
> + unsigned int cpu = 0;
> + int rc = -EINVAL;
> +
> + if (!capable(CAP_SYS_ADMIN)) {
> + dev_err(&pdev->dev, "No admin capability for CPU pool setup\n");
> +
> + return;
> + }
> +
> + if (!ne_cpu_pool.avail)
> + return;
> +
> + if (ne_check_enclaves_created(pdev)) {
> + dev_err(&pdev->dev, "The CPU pool is used, enclaves created\n");
> +
> + return;
> + }
> +
> + mutex_lock(&ne_cpu_pool.mutex);
> +
> + for_each_cpu(cpu, ne_cpu_pool.avail) {
> + rc = add_cpu(cpu);
> + if (rc != 0)
> + dev_err(&pdev->dev,
> + "CPU %d is not onlined [rc=%d]\n", cpu, rc);
> + }
> +
> + cpumask_clear(ne_cpu_pool.avail);
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +}
> +
> +static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
> +{
> + const char *ne_cpu_list = val;
> + struct pci_dev *pdev = pci_get_device(PCI_VENDOR_ID_AMAZON,
> + PCI_DEVICE_ID_NE, NULL);
Isn't there a better way?
> + int rc = -EINVAL;
> +
> + if (!pdev)
> + return -ENODEV;
> +
> + ne_teardown_cpu_pool(pdev);
> +
> + rc = ne_setup_cpu_pool(pdev, ne_cpu_list);
> + if (rc < 0) {
> + dev_err(&pdev->dev, "Error in setup CPU pool [rc=%d]\n", rc);
> +
> + return rc;
> + }
> +
> + return param_set_copystring(val, kp);
> +}
> +
> +/**
> + * ne_get_cpu_from_cpu_pool - Get a CPU from the CPU pool. If the vCPU id is 0,
> + * the CPU is autogenerated and chosen from the NE CPU pool.
> + *
> + * This function gets called with the ne_enclave mutex held.
> + *
> + * @ne_enclave: private data associated with the current enclave.
> + * @vcpu_id: id of the CPU to be associated with the given slot, apic id on x86.
> + *
> + * @returns: 0 on success, negative return value on failure.
> + */
> +static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id)
That's a very awkward API. Can you instead just pass by-value and return
the resulting CPU ID?
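The call site would then read something like this (sketch only, with 0
still meaning "pick any CPU from the pool"):

	int cpu;

	cpu = ne_get_cpu_from_cpu_pool(ne_enclave, vcpu_id);
	if (cpu < 0)
		return cpu;

	vcpu_id = cpu;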
> +{
> + unsigned int cpu = 0;
> + unsigned int cpu_sibling = 0;
> +
> + if (*vcpu_id != 0) {
> + if (cpumask_test_cpu(*vcpu_id, ne_enclave->cpu_siblings)) {
> + cpumask_clear_cpu(*vcpu_id, ne_enclave->cpu_siblings);
> +
> + return 0;
> + }
> +
> + mutex_lock(&ne_cpu_pool.mutex);
> +
> + if (!cpumask_test_cpu(*vcpu_id, ne_cpu_pool.avail)) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "CPU %d is not in NE CPU pool\n",
> + *vcpu_id);
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return -EINVAL;
I think you're better off making the return value explicit for the
error, so that user space can print the error message rather than us.
> + }
> +
> + cpumask_clear_cpu(*vcpu_id, ne_cpu_pool.avail);
> +
> + /*
> + * Make sure the CPU siblings are not marked as available
> + * anymore.
> + */
> + for_each_cpu(cpu_sibling, topology_sibling_cpumask(*vcpu_id)) {
> + if (cpu_sibling != *vcpu_id) {
> + cpumask_clear_cpu(cpu_sibling,
> + ne_cpu_pool.avail);
> +
> + cpumask_set_cpu(cpu_sibling,
> + ne_enclave->cpu_siblings);
> + }
> + }
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return 0;
> + }
> +
> + /* There are CPU siblings available to choose from. */
> + cpu = cpumask_any(ne_enclave->cpu_siblings);
> + if (cpu < nr_cpu_ids) {
> + cpumask_clear_cpu(cpu, ne_enclave->cpu_siblings);
> +
> + *vcpu_id = cpu;
> +
> + return 0;
> + }
> +
> + mutex_lock(&ne_cpu_pool.mutex);
> +
> + /* Choose any CPU from the available CPU pool. */
> + cpu = cpumask_any(ne_cpu_pool.avail);
> + if (cpu >= nr_cpu_ids) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "No CPUs available in CPU pool\n");
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return -EINVAL;
I think you're better off making the return value explicit for the
error, so that user space can print the error message rather than us.
> + }
> +
> + cpumask_clear_cpu(cpu, ne_cpu_pool.avail);
> +
> + /* Make sure the CPU siblings are not marked as available anymore. */
> + for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
> + if (cpu_sibling != cpu) {
> + cpumask_clear_cpu(cpu_sibling, ne_cpu_pool.avail);
> +
> + cpumask_set_cpu(cpu_sibling, ne_enclave->cpu_siblings);
> + }
> + }
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
I find the function slightly confusingly structured. Why can't we do
something like
    if (!vcpu_id) {
            vcpu_id = find_next_free_vcpu_id();
            if (vcpu_id < 0)
                    return -ENOSPC;
    }

    [logic to handle an explicit vcpu id]
I think that would be much more readable.
> +
> + *vcpu_id = cpu;
> +
> + return 0;
> +}
> +
> +/**
> + * ne_create_vcpu_ioctl - Add vCPU to the slot associated with the current
> + * enclave. Create vCPU file descriptor to be further used for CPU handling.
> + *
> + * This function gets called with the ne_enclave mutex held.
> + *
> + * @ne_enclave: private data associated with the current enclave.
> + * @vcpu_id: id of the CPU to be associated with the given slot, apic id on x86.
> + *
> + * @returns: vCPU fd on success, negative return value on failure.
> + */
> +static int ne_create_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id)
> +{
> + struct ne_pci_dev_cmd_reply cmd_reply = {};
> + int fd = 0;
> + struct file *file = NULL;
> + struct ne_vcpu_id *ne_vcpu_id = NULL;
> + int rc = -EINVAL;
> + struct slot_add_vcpu_req slot_add_vcpu_req = {};
> +
> + if (ne_enclave->mm != current->mm)
> + return -EIO;
> +
> + ne_vcpu_id = kzalloc(sizeof(*ne_vcpu_id), GFP_KERNEL);
> + if (!ne_vcpu_id)
> + return -ENOMEM;
> +
> + fd = get_unused_fd_flags(O_CLOEXEC);
> + if (fd < 0) {
> + rc = fd;
> +
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in getting unused fd [rc=%d]\n", rc);
> +
> + goto free_ne_vcpu_id;
> + }
> +
> + /* TODO: Include (vcpu) id in the ne-vm-vcpu naming. */
> + file = anon_inode_getfile("ne-vm-vcpu", &ne_enclave_vcpu_fops,
> + ne_enclave, O_RDWR);
> + if (IS_ERR(file)) {
> + rc = PTR_ERR(file);
> +
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in anon inode get file [rc=%d]\n",
> + rc);
> +
> + goto put_fd;
> + }
> +
> + slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid;
> + slot_add_vcpu_req.vcpu_id = vcpu_id;
> +
> + rc = ne_do_request(ne_enclave->pdev, SLOT_ADD_VCPU, &slot_add_vcpu_req,
> + sizeof(slot_add_vcpu_req), &cmd_reply,
> + sizeof(cmd_reply));
> + if (rc < 0) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in slot add vcpu [rc=%d]\n", rc);
> +
> + goto put_file;
> + }
> +
> + ne_vcpu_id->vcpu_id = vcpu_id;
> +
> + list_add(&ne_vcpu_id->vcpu_id_list_entry, &ne_enclave->vcpu_ids_list);
> +
> + ne_enclave->nr_vcpus++;
> +
> + fd_install(fd, file);
> +
> + return fd;
> +
> +put_file:
> + fput(file);
> +put_fd:
> + put_unused_fd(fd);
> +free_ne_vcpu_id:
> + kfree(ne_vcpu_id);
> +
> + return rc;
> +}
> +
> +static long ne_enclave_ioctl(struct file *file, unsigned int cmd,
> + unsigned long arg)
> +{
> + struct ne_enclave *ne_enclave = file->private_data;
> +
> + if (!ne_enclave || !ne_enclave->pdev)
> + return -EINVAL;
> +
> + switch (cmd) {
> + case NE_CREATE_VCPU: {
Can this be an ADD_VCPU rather than CREATE? We don't really need a vcpu
fd after all ...
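Sketching it out, the ioctl could hand the chosen CPU id back through
the argument and skip the file creation entirely (ne_add_vcpu_ioctl
below is just a placeholder name for the slot add path, untested):

	case NE_ADD_VCPU: {
		u32 vcpu_id;
		int rc;

		if (copy_from_user(&vcpu_id, (void __user *)arg, sizeof(vcpu_id)))
			return -EFAULT;

		mutex_lock(&ne_enclave->enclave_info_mutex);
		rc = ne_add_vcpu_ioctl(ne_enclave, &vcpu_id);
		mutex_unlock(&ne_enclave->enclave_info_mutex);
		if (rc < 0)
			return rc;

		/* Hand back the CPU that was actually picked, no extra fd. */
		if (copy_to_user((void __user *)arg, &vcpu_id, sizeof(vcpu_id)))
			return -EFAULT;

		return 0;
	}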
Alex
> + int rc = -EINVAL;
> + u32 vcpu_id = 0;
> +
> + if (copy_from_user(&vcpu_id, (void *)arg, sizeof(vcpu_id))) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in copy from user\n");
> +
> + return -EFAULT;
> + }
> +
> + mutex_lock(&ne_enclave->enclave_info_mutex);
> +
> + if (ne_enclave->state != NE_STATE_INIT) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Enclave isn't in init state\n");
> +
> + mutex_unlock(&ne_enclave->enclave_info_mutex);
> +
> + return -EINVAL;
> + }
> +
> + /* Use the CPU pool for choosing a CPU for the enclave. */
> + rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id);
> + if (rc < 0) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in get CPU from pool\n");
> +
> + mutex_unlock(&ne_enclave->enclave_info_mutex);
> +
> + return -EINVAL;
> + }
> +
> + rc = ne_create_vcpu_ioctl(ne_enclave, vcpu_id);
> +
> + /* Put back the CPU in enclave cpu pool, if add vcpu error. */
> + if (rc < 0)
> + cpumask_set_cpu(vcpu_id, ne_enclave->cpu_siblings);
> +
> + mutex_unlock(&ne_enclave->enclave_info_mutex);
> +
> + if (copy_to_user((void *)arg, &vcpu_id, sizeof(vcpu_id))) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in copy to user\n");
> +
> + return -EFAULT;
> + }
> +
> + return rc;
> + }
> +
> + default:
> + return -ENOTTY;
> + }
> +
> + return 0;
> +}
> +
> static __poll_t ne_enclave_poll(struct file *file, poll_table *wait)
> {
> __poll_t mask = 0;
> @@ -79,6 +562,7 @@ static const struct file_operations ne_enclave_fops = {
> .owner = THIS_MODULE,
> .llseek = noop_llseek,
> .poll = ne_enclave_poll,
> + .unlocked_ioctl = ne_enclave_ioctl,
> };
>
> /**
> @@ -286,8 +770,15 @@ static int __init ne_init(void)
>
> static void __exit ne_exit(void)
> {
> + struct pci_dev *pdev = pci_get_device(PCI_VENDOR_ID_AMAZON,
> + PCI_DEVICE_ID_NE, NULL);
> + if (!pdev)
> + return;
> +
> pci_unregister_driver(&ne_pci_driver);
>
> + ne_teardown_cpu_pool(pdev);
> +
> free_cpumask_var(ne_cpu_pool.avail);
> }
>
>
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879