[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240610133207.7d039dab.alex.williamson@redhat.com>
Date: Mon, 10 Jun 2024 13:32:07 -0600
From: Alex Williamson <alex.williamson@...hat.com>
To: Fred Griffoul <fgriffo@...zon.co.uk>
Cc: <griffoul@...il.com>, Catalin Marinas <catalin.marinas@....com>, Will
Deacon <will@...nel.org>, Waiman Long <longman@...hat.com>, Zefan Li
<lizefan.x@...edance.com>, Tejun Heo <tj@...nel.org>, Johannes Weiner
<hannes@...xchg.org>, Mark Rutland <mark.rutland@....com>, Marc Zyngier
<maz@...nel.org>, Oliver Upton <oliver.upton@...ux.dev>, Mark Brown
<broonie@...nel.org>, Ard Biesheuvel <ardb@...nel.org>, Joey Gouly
<joey.gouly@....com>, Ryan Roberts <ryan.roberts@....com>, Jeremy Linton
<jeremy.linton@....com>, Jason Gunthorpe <jgg@...pe.ca>, Yi Liu
<yi.l.liu@...el.com>, Kevin Tian <kevin.tian@...el.com>, Eric Auger
<eric.auger@...hat.com>, Stefan Hajnoczi <stefanha@...hat.com>, "Christian
Brauner" <brauner@...nel.org>, Ankit Agrawal <ankita@...dia.com>, "Reinette
Chatre" <reinette.chatre@...el.com>, Ye Bin <yebin10@...wei.com>,
<linux-arm-kernel@...ts.infradead.org>, <linux-kernel@...r.kernel.org>,
<kvm@...r.kernel.org>, <cgroups@...r.kernel.org>
Subject: Re: [PATCH v5 2/2] vfio/pci: add msi interrupt affinity support
On Mon, 10 Jun 2024 12:57:08 +0000
Fred Griffoul <fgriffo@...zon.co.uk> wrote:
> The usual way to configure a device interrupt from userland is to write
> the /proc/irq/<irq>/smp_affinity or smp_affinity_list files. When using
> vfio to implement a device driver or a virtual machine monitor, this may
> not be ideal: the process managing the vfio device interrupts may not be
> granted root privilege, for security reasons. Thus it cannot directly
> control the interrupt affinity and has to rely on an external command.
>
> This patch extends the VFIO_DEVICE_SET_IRQS ioctl() with a new data flag
> to specify the affinity of interrupts of a vfio pci device.
>
> The CPU affinity mask argument must be a subset of the process cpuset,
> otherwise an error -EPERM is returned.
>
> The vfio_irq_set argument shall be set-up in the following way:
>
> - the 'flags' field have the new flag VFIO_IRQ_SET_DATA_AFFINITY set
> as well as VFIO_IRQ_SET_ACTION_TRIGGER.
>
> - the variable-length 'data' field is a cpu_set_t structure, as
> for the sched_setaffinity() syscall, the size of which is derived
> from 'argsz'.
>
> Signed-off-by: Fred Griffoul <fgriffo@...zon.co.uk>
> ---
> drivers/vfio/pci/vfio_pci_core.c | 27 +++++++++++++++++----
> drivers/vfio/pci/vfio_pci_intrs.c | 39 +++++++++++++++++++++++++++++++
> drivers/vfio/vfio_main.c | 13 +++++++----
> include/uapi/linux/vfio.h | 10 +++++++-
> 4 files changed, 80 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 80cae87fff36..2e3419560480 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1192,6 +1192,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> {
> unsigned long minsz = offsetofend(struct vfio_irq_set, count);
> struct vfio_irq_set hdr;
> + cpumask_var_t mask;
> u8 *data = NULL;
> int max, ret = 0;
> size_t data_size = 0;
> @@ -1207,9 +1208,22 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> return ret;
>
> if (data_size) {
> - data = memdup_user(&arg->data, data_size);
> - if (IS_ERR(data))
> - return PTR_ERR(data);
> + if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY) {
> + if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + if (copy_from_user(mask, &arg->data, data_size)) {
> + ret = -EFAULT;
> + goto out;
> + }
> +
> + data = (u8 *)mask;
Seems like this could just use the memdup_user() path, why do we care
to copy it into a cpumask_var_t here? If we do care, wouldn't we
implement something like get_user_cpu_mask() used by
sched_setaffinity(2)?
> +
> + } else {
> + data = memdup_user(&arg->data, data_size);
> + if (IS_ERR(data))
> + return PTR_ERR(data);
> + }
> }
>
> mutex_lock(&vdev->igate);
> @@ -1218,7 +1232,12 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> hdr.count, data);
>
> mutex_unlock(&vdev->igate);
> - kfree(data);
> +
> +out:
> + if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY && data_size)
> + free_cpumask_var(mask);
> + else
> + kfree(data);
>
> return ret;
> }
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 8382c5834335..fe01303cf94e 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -19,6 +19,7 @@
> #include <linux/vfio.h>
> #include <linux/wait.h>
> #include <linux/slab.h>
> +#include <linux/cpuset.h>
>
> #include "vfio_pci_priv.h"
>
> @@ -675,6 +676,41 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
> return 0;
> }
>
> +static int vfio_pci_set_msi_affinity(struct vfio_pci_core_device *vdev,
> + unsigned int start, unsigned int count,
> + struct cpumask *irq_mask)
Aside from the name, what makes this unique to MSI vectors?
> +{
> + struct vfio_pci_irq_ctx *ctx;
> + cpumask_var_t allowed_mask;
> + unsigned int i;
> + int err = 0;
> +
> + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + cpuset_cpus_allowed(current, allowed_mask);
> + if (!cpumask_subset(irq_mask, allowed_mask)) {
> + err = -EPERM;
> + goto finish;
> + }
> +
> + for (i = start; i < start + count; i++) {
> + ctx = vfio_irq_ctx_get(vdev, i);
> + if (!ctx) {
> + err = -EINVAL;
> + break;
> + }
> +
> + err = irq_set_affinity(ctx->producer.irq, irq_mask);
> + if (err)
> + break;
> + }
Is this typical/useful behavior, to set a series of vectors to the same
cpu_set_t? It's unusual behavior for this ioctl to apply the same data
across multiple vectors. Should the DATA_AFFINITY case support an
array of cpu_set_t?
> +
> +finish:
> + free_cpumask_var(allowed_mask);
> + return err;
> +}
> +
> static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
> unsigned index, unsigned start,
> unsigned count, uint32_t flags, void *data)
> @@ -713,6 +749,9 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
> if (!irq_is(vdev, index))
> return -EINVAL;
>
> + if (flags & VFIO_IRQ_SET_DATA_AFFINITY)
> + return vfio_pci_set_msi_affinity(vdev, start, count, data);
> +
> for (i = start; i < start + count; i++) {
> ctx = vfio_irq_ctx_get(vdev, i);
> if (!ctx)
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index e97d796a54fb..e75c5d66681c 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1505,23 +1505,28 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
> size = 0;
> break;
> case VFIO_IRQ_SET_DATA_BOOL:
> - size = sizeof(uint8_t);
> + size = hdr->count * sizeof(uint8_t);
> break;
> case VFIO_IRQ_SET_DATA_EVENTFD:
> - size = sizeof(int32_t);
> + size = size_mul(hdr->count, sizeof(int32_t));
Why use size_mul() in one place and not the other?
> + break;
> + case VFIO_IRQ_SET_DATA_AFFINITY:
> + size = hdr->argsz - minsz;
> + if (size > cpumask_size())
> + size = cpumask_size();
Or just set size = (hdr->argsz - minsz) / count?
Generate an error if (hdr->argsz - minsz) % count?
It seems like all the cpumask items can be contained within the set affinity
function.
> break;
> default:
> return -EINVAL;
> }
>
> if (size) {
> - if (hdr->argsz - minsz < hdr->count * size)
> + if (hdr->argsz - minsz < size)
> return -EINVAL;
>
> if (!data_size)
> return -EINVAL;
>
> - *data_size = hdr->count * size;
> + *data_size = size;
> }
>
> return 0;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 2b68e6cdf190..5ba2ca223550 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -580,6 +580,12 @@ struct vfio_irq_info {
> *
> * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
> * ACTION_TRIGGER specifies kernel->user signaling.
> + *
> + * DATA_AFFINITY specifies the affinity for the range of interrupt vectors.
> + * It must be set with ACTION_TRIGGER in 'flags'. The variable-length 'data'
> + * array is a CPU affinity mask 'cpu_set_t' structure, as for the
> + * sched_setaffinity() syscall argument: the 'argsz' field is used to check
> + * the actual cpu_set_t size.
> */
DATA_CPUSET?
The IRQ_INFO ioctl should probably also report support for this
feature.
Is there any proposed userspace code that takes advantage of this
interface? Thanks,
Alex
> struct vfio_irq_set {
> __u32 argsz;
> @@ -587,6 +593,7 @@ struct vfio_irq_set {
> #define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */
> #define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */
> #define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */
> +#define VFIO_IRQ_SET_DATA_AFFINITY (1 << 6) /* Data is cpu_set_t */
> #define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */
> #define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */
> #define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */
> @@ -599,7 +606,8 @@ struct vfio_irq_set {
>
> #define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \
> VFIO_IRQ_SET_DATA_BOOL | \
> - VFIO_IRQ_SET_DATA_EVENTFD)
> + VFIO_IRQ_SET_DATA_EVENTFD | \
> + VFIO_IRQ_SET_DATA_AFFINITY)
> #define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \
> VFIO_IRQ_SET_ACTION_UNMASK | \
> VFIO_IRQ_SET_ACTION_TRIGGER)
Powered by blists - more mailing lists