Message-ID: <52e916fc-8fe3-f9bc-009e-ca84ab7dd650@amazon.de>
Date: Mon, 6 Jul 2020 12:12:31 +0200
From: Alexander Graf <graf@...zon.de>
To: Andra Paraschiv <andraprs@...zon.com>,
<linux-kernel@...r.kernel.org>
CC: Anthony Liguori <aliguori@...zon.com>,
Benjamin Herrenschmidt <benh@...nel.crashing.org>,
Colm MacCarthaigh <colmmacc@...zon.com>,
"Bjoern Doebel" <doebel@...zon.de>,
David Woodhouse <dwmw@...zon.co.uk>,
"Frank van der Linden" <fllinden@...zon.com>,
Greg KH <gregkh@...uxfoundation.org>,
Martin Pohlack <mpohlack@...zon.de>,
Matt Wilson <msw@...zon.com>,
"Paolo Bonzini" <pbonzini@...hat.com>,
Balbir Singh <sblbir@...zon.com>,
"Stefano Garzarella" <sgarzare@...hat.com>,
Stefan Hajnoczi <stefanha@...hat.com>,
Stewart Smith <trawets@...zon.com>,
Uwe Dannowski <uwed@...zon.de>, <kvm@...r.kernel.org>,
<ne-devel-upstream@...zon.com>
Subject: Re: [PATCH v4 09/18] nitro_enclaves: Add logic for enclave vcpu
creation
On 22.06.20 22:03, Andra Paraschiv wrote:
> An enclave, before being started, has its resources set. One of its
> resources is CPU.
>
> The NE CPU pool is set for choosing CPUs for enclaves from it. Offline
> the CPUs from the NE CPU pool during the pool setup and online them back
> during the NE CPU pool teardown.
>
> The enclave CPUs need to be full cores and from the same NUMA node. CPU
> 0 and its siblings have to remain available to the primary / parent VM.
>
> Add ioctl command logic for enclave vCPU creation. Return as result a
> file descriptor that is associated with the enclave vCPU.
>
> Signed-off-by: Alexandru Vasile <lexnv@...zon.com>
> Signed-off-by: Andra Paraschiv <andraprs@...zon.com>
> ---
> Changelog
>
> v3 -> v4
>
> * Setup the NE CPU pool at runtime via a sysfs file for the kernel
> parameter.
> * Check enclave CPUs to be from the same NUMA node.
> * Use dev_err instead of custom NE log pattern.
> * Update the NE ioctl call to match the decoupling from the KVM API.
>
> v2 -> v3
>
> * Remove the WARN_ON calls.
> * Update static calls sanity checks.
> * Update kzfree() calls to kfree().
> * Remove file ops that do nothing for now - open, ioctl and release.
>
> v1 -> v2
>
> * Add log pattern for NE.
> * Update goto labels to match their purpose.
> * Remove the BUG_ON calls.
> * Check if enclave state is init when setting enclave vcpu.
> ---
> drivers/virt/nitro_enclaves/ne_misc_dev.c | 491 ++++++++++++++++++++++
> 1 file changed, 491 insertions(+)
>
> diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev.c b/drivers/virt/nitro_enclaves/ne_misc_dev.c
> index f70496813033..d6777008f685 100644
> --- a/drivers/virt/nitro_enclaves/ne_misc_dev.c
> +++ b/drivers/virt/nitro_enclaves/ne_misc_dev.c
> @@ -39,7 +39,11 @@
> * TODO: Update logic to create new sysfs entries instead of using
> * a kernel parameter e.g. if multiple sysfs files needed.
> */
> +static int ne_set_kernel_param(const char *val, const struct kernel_param *kp);
> +
> static const struct kernel_param_ops ne_cpu_pool_ops = {
> + .get = param_get_string,
> + .set = ne_set_kernel_param,
> };
>
> static char ne_cpus[PAGE_SIZE];
> @@ -60,6 +64,485 @@ struct ne_cpu_pool {
>
> static struct ne_cpu_pool ne_cpu_pool;
>
> +static const struct file_operations ne_enclave_vcpu_fops = {
> + .owner = THIS_MODULE,
> + .llseek = noop_llseek,
> +};
Do we really need an fd for an object without operations? I think the
general flow to add CPUs from the pool to the VM is very sensible. But I
don't think we really need an fd as return value from that operation.
> +
> +/**
> + * ne_check_enclaves_created - Verify if at least one enclave has been created.
> + *
> + * @pdev: PCI device used for enclave lifetime management.
> + *
> + * @returns: true if at least one enclave is created, false otherwise.
> + */
> +static bool ne_check_enclaves_created(struct pci_dev *pdev)
> +{
> + struct ne_pci_dev *ne_pci_dev = pci_get_drvdata(pdev);
> +
> + if (!ne_pci_dev)
> + return false;
Please pass the ne_pci_dev into this function directly.
> +
> + mutex_lock(&ne_pci_dev->enclaves_list_mutex);
> +
> + if (list_empty(&ne_pci_dev->enclaves_list)) {
> + mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
> +
> + return false;
If you make this a return variable, you save on the unlock duplication.
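Something along these lines is what I have in mind (untested sketch, also folding in passing ne_pci_dev directly as suggested above):

static bool ne_check_enclaves_created(struct ne_pci_dev *ne_pci_dev)
{
	bool created;

	mutex_lock(&ne_pci_dev->enclaves_list_mutex);
	created = !list_empty(&ne_pci_dev->enclaves_list);
	mutex_unlock(&ne_pci_dev->enclaves_list_mutex);

	return created;
}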
> + }
> +
> + mutex_unlock(&ne_pci_dev->enclaves_list_mutex);
> +
> + return true;
> +}
> +
> +/**
> + * ne_setup_cpu_pool - Set the NE CPU pool after handling sanity checks such as
> + * not sharing CPU cores with the primary / parent VM or not using CPU 0, which
> + * should remain available for the primary / parent VM. Offline the CPUs from
> + * the pool after the checks passed.
> + *
> + * @pdev: PCI device used for enclave lifetime management.
> + * @ne_cpu_list: the CPU list used for setting NE CPU pool.
> + *
> + * @returns: 0 on success, negative return value on failure.
> + */
> +static int ne_setup_cpu_pool(struct pci_dev *pdev, const char *ne_cpu_list)
> +{
> + unsigned int cpu = 0;
> + unsigned int cpu_sibling = 0;
> + int numa_node = -1;
> + int rc = -EINVAL;
> +
> + if (!capable(CAP_SYS_ADMIN)) {
> + dev_err(&pdev->dev, "No admin capability for CPU pool setup\n");
No need to print anything here. It only gives non-admin users a chance
to spam the kernel log. That is, if non-admin users can write at all -
can they? Also, isn't this at the wrong abstraction level? I would
expect such a check to happen in the file write function, not here.
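To be more concrete, I would expect the set handler itself to just bail
out silently for unprivileged writers, without touching the kernel log
(untested sketch, otherwise keeping the flow from this patch):

static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
{
	struct pci_dev *pdev;
	int rc;

	/* Reject unprivileged writes up front, no log output. */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	pdev = pci_get_device(PCI_VENDOR_ID_AMAZON, PCI_DEVICE_ID_NE, NULL);
	if (!pdev)
		return -ENODEV;

	ne_teardown_cpu_pool(pdev);

	rc = ne_setup_cpu_pool(pdev, val);
	if (rc < 0)
		return rc;

	return param_set_copystring(val, kp);
}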
> +
> + return -EPERM;
> + }
> +
> + if (!ne_cpu_list)
> + return 0;
> +
> + if (ne_check_enclaves_created(pdev)) {
> + dev_err(&pdev->dev, "The CPU pool is used, enclaves created\n");
> +
> + return -EINVAL;
> + }
> +
> + mutex_lock(&ne_cpu_pool.mutex);
> +
> + rc = cpulist_parse(ne_cpu_list, ne_cpu_pool.avail);
> + if (rc < 0) {
> + dev_err(&pdev->dev,
> + "Error in cpulist parse [rc=%d]\n", rc);
> +
> + goto unlock_mutex;
> + }
> +
> + /*
> + * Check if CPU 0 and its siblings are included in the provided CPU pool
> + * They should remain available for the primary / parent VM.
> + */
> + if (cpumask_test_cpu(0, ne_cpu_pool.avail)) {
> +
> + dev_err(&pdev->dev,
> + "CPU 0 has to remain available for the primary VM\n");
Shouldn't this also change the read value of the sysfs file?
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> +
> + for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) {
> + if (cpumask_test_cpu(cpu_sibling, ne_cpu_pool.avail)) {
> + dev_err(&pdev->dev,
> + "CPU sibling %d of CPU 0 is in the CPU pool\n",
> + cpu_sibling);
Same here. I would expect the sysfs file to reflect either the previous
state or <empty> because failures mean no CPUs are donated anymore.
Can we somehow implement the get function of the param as something that
gets generated dynamically?
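Something like the following is what I have in mind for the get side
(untested sketch; "%*pbl" renders the cpumask as a CPU list string):

static int ne_get_kernel_param(char *buffer, const struct kernel_param *kp)
{
	int len;

	mutex_lock(&ne_cpu_pool.mutex);
	/* Report the CPUs actually donated, not the last written string. */
	len = scnprintf(buffer, PAGE_SIZE, "%*pbl\n",
			cpumask_pr_args(ne_cpu_pool.avail));
	mutex_unlock(&ne_cpu_pool.mutex);

	return len;
}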
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> + }
> +
> + /*
> + * Check if CPU siblings are included in the provided CPU pool. The
> + * expectation is that CPU cores are made available in the CPU pool for
> + * enclaves.
> + */
> + for_each_cpu(cpu, ne_cpu_pool.avail) {
> + for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
> + if (!cpumask_test_cpu(cpu_sibling, ne_cpu_pool.avail)) {
> + dev_err(&pdev->dev,
> + "CPU %d is not in the CPU pool\n",
> + cpu_sibling);
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> + }
> + }
> +
> + /*
> + * Check if the CPUs from the NE CPU pool are from the same NUMA node.
> + */
> + for_each_cpu(cpu, ne_cpu_pool.avail) {
> + if (numa_node < 0) {
> + numa_node = cpu_to_node(cpu);
> +
> + if (numa_node < 0) {
> + dev_err(&pdev->dev,
> + "Invalid NUMA node %d\n", numa_node);
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> + } else {
> + if (numa_node != cpu_to_node(cpu)) {
> + dev_err(&pdev->dev,
> + "CPUs are from different NUMA nodes\n");
> +
> + rc = -EINVAL;
> +
> + goto unlock_mutex;
> + }
> + }
> + }
> +
There should be a comment here that describes the why:
/*
* CPUs that are donated to enclaves should not be considered online
* by Linux anymore, as the hypervisor will degrade them to floating.
*
* We offline them here, to not degrade performance and expose correct
* topology to Linux and user space.
*/
> + for_each_cpu(cpu, ne_cpu_pool.avail) {
> + rc = remove_cpu(cpu);
> + if (rc != 0) {
> + dev_err(&pdev->dev,
> + "CPU %d is not offlined [rc=%d]\n", cpu, rc);
> +
> + goto online_cpus;
> + }
> + }
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return 0;
> +
> +online_cpus:
> + for_each_cpu(cpu, ne_cpu_pool.avail)
> + add_cpu(cpu);
> +unlock_mutex:
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return rc;
> +}
> +
> +/**
> + * ne_teardown_cpu_pool - Online the CPUs from the NE CPU pool and cleanup the
> + * CPU pool.
> + *
> + * @pdev: PCI device used for enclave lifetime management.
> + */
> +static void ne_teardown_cpu_pool(struct pci_dev *pdev)
> +{
> + unsigned int cpu = 0;
> + int rc = -EINVAL;
> +
> + if (!capable(CAP_SYS_ADMIN)) {
> + dev_err(&pdev->dev, "No admin capability for CPU pool setup\n");
> +
> + return;
> + }
> +
> + if (!ne_cpu_pool.avail)
> + return;
> +
> + if (ne_check_enclaves_created(pdev)) {
> + dev_err(&pdev->dev, "The CPU pool is used, enclaves created\n");
> +
> + return;
> + }
> +
> + mutex_lock(&ne_cpu_pool.mutex);
> +
> + for_each_cpu(cpu, ne_cpu_pool.avail) {
> + rc = add_cpu(cpu);
> + if (rc != 0)
> + dev_err(&pdev->dev,
> + "CPU %d is not onlined [rc=%d]\n", cpu, rc);
> + }
> +
> + cpumask_clear(ne_cpu_pool.avail);
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +}
> +
> +static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
> +{
> + const char *ne_cpu_list = val;
> + struct pci_dev *pdev = pci_get_device(PCI_VENDOR_ID_AMAZON,
> + PCI_DEVICE_ID_NE, NULL);
Isn't there a better way?
> + int rc = -EINVAL;
> +
> + if (!pdev)
> + return -ENODEV;
> +
> + ne_teardown_cpu_pool(pdev);
> +
> + rc = ne_setup_cpu_pool(pdev, ne_cpu_list);
> + if (rc < 0) {
> + dev_err(&pdev->dev, "Error in setup CPU pool [rc=%d]\n", rc);
> +
> + return rc;
> + }
> +
> + return param_set_copystring(val, kp);
> +}
> +
> +/**
> + * ne_get_cpu_from_cpu_pool - Get a CPU from the CPU pool. If the vCPU id is 0,
> + * the CPU is autogenerated and chosen from the NE CPU pool.
> + *
> + * This function gets called with the ne_enclave mutex held.
> + *
> + * @ne_enclave: private data associated with the current enclave.
> + * @vcpu_id: id of the CPU to be associated with the given slot, apic id on x86.
> + *
> + * @returns: 0 on success, negative return value on failure.
> + */
> +static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id)
That's a very awkward API. Can you instead just pass by-value and return
the resulting CPU ID?
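The call site would then read something like this (sketch only, with 0
still meaning "pick any CPU from the pool"):

	int cpu;

	cpu = ne_get_cpu_from_cpu_pool(ne_enclave, vcpu_id);
	if (cpu < 0)
		return cpu;

	vcpu_id = cpu;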
> +{
> + unsigned int cpu = 0;
> + unsigned int cpu_sibling = 0;
> +
> + if (*vcpu_id != 0) {
> + if (cpumask_test_cpu(*vcpu_id, ne_enclave->cpu_siblings)) {
> + cpumask_clear_cpu(*vcpu_id, ne_enclave->cpu_siblings);
> +
> + return 0;
> + }
> +
> + mutex_lock(&ne_cpu_pool.mutex);
> +
> + if (!cpumask_test_cpu(*vcpu_id, ne_cpu_pool.avail)) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "CPU %d is not in NE CPU pool\n",
> + *vcpu_id);
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return -EINVAL;
I think you're better off making the return value explicit for the
error, so that user space can print the error message rather than us.
> + }
> +
> + cpumask_clear_cpu(*vcpu_id, ne_cpu_pool.avail);
> +
> + /*
> + * Make sure the CPU siblings are not marked as available
> + * anymore.
> + */
> + for_each_cpu(cpu_sibling, topology_sibling_cpumask(*vcpu_id)) {
> + if (cpu_sibling != *vcpu_id) {
> + cpumask_clear_cpu(cpu_sibling,
> + ne_cpu_pool.avail);
> +
> + cpumask_set_cpu(cpu_sibling,
> + ne_enclave->cpu_siblings);
> + }
> + }
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return 0;
> + }
> +
> + /* There are CPU siblings available to choose from. */
> + cpu = cpumask_any(ne_enclave->cpu_siblings);
> + if (cpu < nr_cpu_ids) {
> + cpumask_clear_cpu(cpu, ne_enclave->cpu_siblings);
> +
> + *vcpu_id = cpu;
> +
> + return 0;
> + }
> +
> + mutex_lock(&ne_cpu_pool.mutex);
> +
> + /* Choose any CPU from the available CPU pool. */
> + cpu = cpumask_any(ne_cpu_pool.avail);
> + if (cpu >= nr_cpu_ids) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "No CPUs available in CPU pool\n");
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
> +
> + return -EINVAL;
I think you're better off making the return value explicit for the
error, so that user space can print the error message rather than us.
> + }
> +
> + cpumask_clear_cpu(cpu, ne_cpu_pool.avail);
> +
> + /* Make sure the CPU siblings are not marked as available anymore. */
> + for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
> + if (cpu_sibling != cpu) {
> + cpumask_clear_cpu(cpu_sibling, ne_cpu_pool.avail);
> +
> + cpumask_set_cpu(cpu_sibling, ne_enclave->cpu_siblings);
> + }
> + }
> +
> + mutex_unlock(&ne_cpu_pool.mutex);
I find the function slightly confusingly structured. Why can't we do
something like
    if (!vcpu_id) {
            vcpu_id = find_next_free_vcpu_id();
            if (vcpu_id < 0)
                    return -ENOSPC;
    }

    [logic to handle an explicit vcpu id]
I think that would be much more readable.
> +
> + *vcpu_id = cpu;
> +
> + return 0;
> +}
> +
> +/**
> + * ne_create_vcpu_ioctl - Add vCPU to the slot associated with the current
> + * enclave. Create vCPU file descriptor to be further used for CPU handling.
> + *
> + * This function gets called with the ne_enclave mutex held.
> + *
> + * @ne_enclave: private data associated with the current enclave.
> + * @vcpu_id: id of the CPU to be associated with the given slot, apic id on x86.
> + *
> + * @returns: vCPU fd on success, negative return value on failure.
> + */
> +static int ne_create_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id)
> +{
> + struct ne_pci_dev_cmd_reply cmd_reply = {};
> + int fd = 0;
> + struct file *file = NULL;
> + struct ne_vcpu_id *ne_vcpu_id = NULL;
> + int rc = -EINVAL;
> + struct slot_add_vcpu_req slot_add_vcpu_req = {};
> +
> + if (ne_enclave->mm != current->mm)
> + return -EIO;
> +
> + ne_vcpu_id = kzalloc(sizeof(*ne_vcpu_id), GFP_KERNEL);
> + if (!ne_vcpu_id)
> + return -ENOMEM;
> +
> + fd = get_unused_fd_flags(O_CLOEXEC);
> + if (fd < 0) {
> + rc = fd;
> +
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in getting unused fd [rc=%d]\n", rc);
> +
> + goto free_ne_vcpu_id;
> + }
> +
> + /* TODO: Include (vcpu) id in the ne-vm-vcpu naming. */
> + file = anon_inode_getfile("ne-vm-vcpu", &ne_enclave_vcpu_fops,
> + ne_enclave, O_RDWR);
> + if (IS_ERR(file)) {
> + rc = PTR_ERR(file);
> +
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in anon inode get file [rc=%d]\n",
> + rc);
> +
> + goto put_fd;
> + }
> +
> + slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid;
> + slot_add_vcpu_req.vcpu_id = vcpu_id;
> +
> + rc = ne_do_request(ne_enclave->pdev, SLOT_ADD_VCPU, &slot_add_vcpu_req,
> + sizeof(slot_add_vcpu_req), &cmd_reply,
> + sizeof(cmd_reply));
> + if (rc < 0) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in slot add vcpu [rc=%d]\n", rc);
> +
> + goto put_file;
> + }
> +
> + ne_vcpu_id->vcpu_id = vcpu_id;
> +
> + list_add(&ne_vcpu_id->vcpu_id_list_entry, &ne_enclave->vcpu_ids_list);
> +
> + ne_enclave->nr_vcpus++;
> +
> + fd_install(fd, file);
> +
> + return fd;
> +
> +put_file:
> + fput(file);
> +put_fd:
> + put_unused_fd(fd);
> +free_ne_vcpu_id:
> + kfree(ne_vcpu_id);
> +
> + return rc;
> +}
> +
> +static long ne_enclave_ioctl(struct file *file, unsigned int cmd,
> + unsigned long arg)
> +{
> + struct ne_enclave *ne_enclave = file->private_data;
> +
> + if (!ne_enclave || !ne_enclave->pdev)
> + return -EINVAL;
> +
> + switch (cmd) {
> + case NE_CREATE_VCPU: {
Can this be an ADD_VCPU rather than CREATE? We don't really need a vcpu
fd after all ...
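Sketching it out, the ioctl could hand the chosen CPU id back through
the argument and skip the file creation entirely (ne_add_vcpu_ioctl
below is just a placeholder name for the slot add path, untested):

	case NE_ADD_VCPU: {
		u32 vcpu_id;
		int rc;

		if (copy_from_user(&vcpu_id, (void __user *)arg, sizeof(vcpu_id)))
			return -EFAULT;

		mutex_lock(&ne_enclave->enclave_info_mutex);
		rc = ne_add_vcpu_ioctl(ne_enclave, &vcpu_id);
		mutex_unlock(&ne_enclave->enclave_info_mutex);
		if (rc < 0)
			return rc;

		/* Hand back the CPU that was actually picked, no extra fd. */
		if (copy_to_user((void __user *)arg, &vcpu_id, sizeof(vcpu_id)))
			return -EFAULT;

		return 0;
	}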
Alex
> + int rc = -EINVAL;
> + u32 vcpu_id = 0;
> +
> + if (copy_from_user(&vcpu_id, (void *)arg, sizeof(vcpu_id))) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in copy from user\n");
> +
> + return -EFAULT;
> + }
> +
> + mutex_lock(&ne_enclave->enclave_info_mutex);
> +
> + if (ne_enclave->state != NE_STATE_INIT) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Enclave isn't in init state\n");
> +
> + mutex_unlock(&ne_enclave->enclave_info_mutex);
> +
> + return -EINVAL;
> + }
> +
> + /* Use the CPU pool for choosing a CPU for the enclave. */
> + rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id);
> + if (rc < 0) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in get CPU from pool\n");
> +
> + mutex_unlock(&ne_enclave->enclave_info_mutex);
> +
> + return -EINVAL;
> + }
> +
> + rc = ne_create_vcpu_ioctl(ne_enclave, vcpu_id);
> +
> + /* Put back the CPU in enclave cpu pool, if add vcpu error. */
> + if (rc < 0)
> + cpumask_set_cpu(vcpu_id, ne_enclave->cpu_siblings);
> +
> + mutex_unlock(&ne_enclave->enclave_info_mutex);
> +
> + if (copy_to_user((void *)arg, &vcpu_id, sizeof(vcpu_id))) {
> + dev_err_ratelimited(ne_misc_dev.this_device,
> + "Error in copy to user\n");
> +
> + return -EFAULT;
> + }
> +
> + return rc;
> + }
> +
> + default:
> + return -ENOTTY;
> + }
> +
> + return 0;
> +}
> +
> static __poll_t ne_enclave_poll(struct file *file, poll_table *wait)
> {
> __poll_t mask = 0;
> @@ -79,6 +562,7 @@ static const struct file_operations ne_enclave_fops = {
> .owner = THIS_MODULE,
> .llseek = noop_llseek,
> .poll = ne_enclave_poll,
> + .unlocked_ioctl = ne_enclave_ioctl,
> };
>
> /**
> @@ -286,8 +770,15 @@ static int __init ne_init(void)
>
> static void __exit ne_exit(void)
> {
> + struct pci_dev *pdev = pci_get_device(PCI_VENDOR_ID_AMAZON,
> + PCI_DEVICE_ID_NE, NULL);
> + if (!pdev)
> + return;
> +
> pci_unregister_driver(&ne_pci_driver);
>
> + ne_teardown_cpu_pool(pdev);
> +
> free_cpumask_var(ne_cpu_pool.avail);
> }
>
>
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879