[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <85eeab19-1f53-6c45-95a2-44c1cfd39184@redhat.com>
Date: Thu, 26 Dec 2019 16:40:18 +0800
From: Jason Wang <jasowang@...hat.com>
To: Zha Bin <zhabin@...ux.alibaba.com>, linux-kernel@...r.kernel.org
Cc: mst@...hat.com, slp@...hat.com, virtio-dev@...ts.oasis-open.org,
gerry@...ux.alibaba.com, jing2.liu@...el.com, chao.p.peng@...el.com
Subject: Re: [PATCH v1 2/2] virtio-mmio: add features for virtio-mmio
specification version 3
On 2019/12/25 上午10:50, Zha Bin wrote:
> From: Liu Jiang <gerry@...ux.alibaba.com>
>
> Userspace VMMs (e.g. Qemu microvm, Firecracker) take advantage of using
> virtio over mmio devices as a lightweight machine model for modern
> cloud. The standard virtio over MMIO transport layer only supports one
> legacy interrupt, which is much heavier than virtio over PCI transport
> layer using MSI. Legacy interrupt has long work path and causes specific
> VMExits in following cases, which would considerably slow down the
> performance:
>
> 1) read interrupt status register
> 2) update interrupt status register
> 3) write IOAPIC EOI register
>
> We proposed to update virtio over MMIO to version 3[1] to add the
> following new features and enhance the performance.
>
> 1) Support Message Signaled Interrupt(MSI), which increases the
> interrupt performance for virtio multi-queue devices
> 2) Support per-queue doorbell, so the guest kernel may directly write
> to the doorbells provided by virtio devices.
>
> The following is the network tcp_rr performance testing report, tested
> with virtio-pci device, vanilla virtio-mmio device and patched
> virtio-mmio device (run test 3 times for each case):
>
> netperf -t TCP_RR -H 192.168.1.36 -l 30 -- -r 32,1024
>
> Virtio-PCI Virtio-MMIO Virtio-MMIO(MSI)
> trans/s 9536 6939 9500
> trans/s 9734 7029 9749
> trans/s 9894 7095 9318
>
> [1] https://lkml.org/lkml/2019/12/20/113
>
> Signed-off-by: Liu Jiang <gerry@...ux.alibaba.com>
> Signed-off-by: Zha Bin <zhabin@...ux.alibaba.com>
> Signed-off-by: Chao Peng <chao.p.peng@...ux.intel.com>
> Signed-off-by: Jing Liu <jing2.liu@...ux.intel.com>
> ---
> drivers/virtio/virtio_mmio.c | 226 ++++++++++++++++++++++++++++++++++++---
> include/uapi/linux/virtio_mmio.h | 28 +++++
> 2 files changed, 240 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
> index e09edb5..065c083 100644
> --- a/drivers/virtio/virtio_mmio.c
> +++ b/drivers/virtio/virtio_mmio.c
> @@ -68,8 +68,8 @@
> #include <linux/virtio_config.h>
> #include <uapi/linux/virtio_mmio.h>
> #include <linux/virtio_ring.h>
> -
> -
> +#include <linux/msi.h>
> +#include <asm/irqdomain.h>
>
> /* The alignment to use between consumer and producer parts of vring.
> * Currently hardcoded to the page size. */
> @@ -90,6 +90,12 @@ struct virtio_mmio_device {
> /* a list of queues so we can dispatch IRQs */
> spinlock_t lock;
> struct list_head virtqueues;
> +
> + int doorbell_base;
> + int doorbell_scale;
It's better to use the terminology defined by spec, e.g
notify_base/notify_multiplexer etc.
And we usually use unsigned type for offset.
> + bool msi_enabled;
> + /* Name strings for interrupts. */
> + char (*vm_vq_names)[256];
> };
>
> struct virtio_mmio_vq_info {
> @@ -101,6 +107,8 @@ struct virtio_mmio_vq_info {
> };
>
>
> +static void vm_free_msi_irqs(struct virtio_device *vdev);
> +static int vm_request_msi_vectors(struct virtio_device *vdev, int nirqs);
>
> /* Configuration interface */
>
> @@ -273,12 +281,28 @@ static bool vm_notify(struct virtqueue *vq)
> {
> struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
>
> + if (vm_dev->version == 3) {
> + int offset = vm_dev->doorbell_base +
> + vm_dev->doorbell_scale * vq->index;
> + writel(vq->index, vm_dev->base + offset);
In virtio-pci we store the doorbell address in vq->priv to avoid doing
multiplication in fast path.
> + } else
> /* We write the queue's selector into the notification register to
> * signal the other end */
> - writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
> + writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
> +
> return true;
> }
>
> +/* Handle a configuration change */
> +static irqreturn_t vm_config_changed(int irq, void *opaque)
> +{
> + struct virtio_mmio_device *vm_dev = opaque;
> +
> + virtio_config_changed(&vm_dev->vdev);
> +
> + return IRQ_HANDLED;
> +}
> +
> /* Notify all virtqueues on an interrupt. */
> static irqreturn_t vm_interrupt(int irq, void *opaque)
> {
> @@ -307,7 +331,12 @@ static irqreturn_t vm_interrupt(int irq, void *opaque)
> return ret;
> }
>
> +static int vm_msi_irq_vector(struct device *dev, unsigned int nr)
> +{
> + struct msi_desc *entry = first_msi_entry(dev);
>
> + return entry->irq + nr;
> +}
>
> static void vm_del_vq(struct virtqueue *vq)
> {
> @@ -316,6 +345,12 @@ static void vm_del_vq(struct virtqueue *vq)
> unsigned long flags;
> unsigned int index = vq->index;
>
> + if (vm_dev->version == 3 && vm_dev->msi_enabled) {
> + int irq = vm_msi_irq_vector(&vq->vdev->dev, index + 1);
> +
> + free_irq(irq, vq);
> + }
> +
> spin_lock_irqsave(&vm_dev->lock, flags);
> list_del(&info->node);
> spin_unlock_irqrestore(&vm_dev->lock, flags);
> @@ -334,15 +369,41 @@ static void vm_del_vq(struct virtqueue *vq)
> kfree(info);
> }
>
> -static void vm_del_vqs(struct virtio_device *vdev)
> +static void vm_free_irqs(struct virtio_device *vdev)
> {
> struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
> +
> + if (vm_dev->version == 3)
> + vm_free_msi_irqs(vdev);
> + else
> + free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev);
> +}
> +
> +static void vm_del_vqs(struct virtio_device *vdev)
> +{
> struct virtqueue *vq, *n;
>
> list_for_each_entry_safe(vq, n, &vdev->vqs, list)
> vm_del_vq(vq);
>
> - free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev);
> + vm_free_irqs(vdev);
> +}
> +
> +static int vm_request_single_irq(struct virtio_device *vdev)
> +{
> + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
> + int irq = platform_get_irq(vm_dev->pdev, 0);
> + int err;
> +
> + if (irq < 0) {
> + dev_err(&vdev->dev, "Cannot get IRQ resource\n");
> + return irq;
> + }
> +
> + err = request_irq(irq, vm_interrupt, IRQF_SHARED,
> + dev_name(&vdev->dev), vm_dev);
> +
> + return err;
> }
>
> static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
> @@ -455,6 +516,72 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
> return ERR_PTR(err);
> }
>
> +static inline void mmio_msi_set_enable(struct virtio_mmio_device *vm_dev,
> + int enable)
> +{
> + u16 status;
> +
> + status = readw(vm_dev->base + VIRTIO_MMIO_MSI_STATUS);
> + if (enable)
> + status |= VIRTIO_MMIO_MSI_ENABLE_MASK;
> + else
> + status &= ~VIRTIO_MMIO_MSI_ENABLE_MASK;
> + writew(status, vm_dev->base + VIRTIO_MMIO_MSI_STATUS);
> +}
> +
> +static int vm_find_vqs_msi(struct virtio_device *vdev, unsigned int nvqs,
> + struct virtqueue *vqs[], vq_callback_t *callbacks[],
> + const char * const names[], const bool *ctx,
> + struct irq_affinity *desc)
> +{
> + int i, err, irq;
> + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
> +
> + /* Allocate nvqs irqs for queues and one irq for configuration */
> + err = vm_request_msi_vectors(vdev, nvqs + 1);
> + if (err != 0)
> + return err;
> +
> + for (i = 0; i < nvqs; i++) {
> + int index = i + 1;
> +
> + if (!names[i]) {
> + vqs[i] = NULL;
> + continue;
> + }
> + vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i],
> + ctx ? ctx[i] : false);
> + if (IS_ERR(vqs[i])) {
> + err = PTR_ERR(vqs[i]);
> + goto error_find;
> + }
> +
> + if (!callbacks[i])
> + continue;
> +
> + irq = vm_msi_irq_vector(&vqs[i]->vdev->dev, index);
> + /* allocate per-vq irq if available and necessary */
> + snprintf(vm_dev->vm_vq_names[index],
> + sizeof(*vm_dev->vm_vq_names),
> + "%s-%s",
> + dev_name(&vm_dev->vdev.dev), names[i]);
> + err = request_irq(irq, vring_interrupt, 0,
> + vm_dev->vm_vq_names[index], vqs[i]);
> +
> + if (err)
> + goto error_find;
> + }
> +
> + mmio_msi_set_enable(vm_dev, 1);
> + vm_dev->msi_enabled = true;
> +
> + return 0;
> +
> +error_find:
> + vm_del_vqs(vdev);
> + return err;
> +}
> +
> static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
> struct virtqueue *vqs[],
> vq_callback_t *callbacks[],
> @@ -463,16 +590,16 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
> struct irq_affinity *desc)
> {
> struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
> - int irq = platform_get_irq(vm_dev->pdev, 0);
> int i, err, queue_idx = 0;
>
> - if (irq < 0) {
> - dev_err(&vdev->dev, "Cannot get IRQ resource\n");
> - return irq;
> + if (vm_dev->version == 3) {
> + err = vm_find_vqs_msi(vdev, nvqs, vqs, callbacks,
> + names, ctx, desc);
> + if (!err)
> + return 0;
> }
>
> - err = request_irq(irq, vm_interrupt, IRQF_SHARED,
> - dev_name(&vdev->dev), vm_dev);
> + err = vm_request_single_irq(vdev);
> if (err)
> return err;
>
> @@ -493,6 +620,73 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
> return 0;
> }
>
> +static void mmio_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
> +{
> + struct device *dev = desc->dev;
> + struct virtio_device *vdev = dev_to_virtio(dev);
> + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
> + void __iomem *pos = vm_dev->base;
> + uint16_t cmd = VIRTIO_MMIO_MSI_CMD(VIRTIO_MMIO_MSI_CMD_UPDATE,
> + desc->platform.msi_index);
> +
> + writel(msg->address_lo, pos + VIRTIO_MMIO_MSI_ADDRESS_LOW);
> + writel(msg->address_hi, pos + VIRTIO_MMIO_MSI_ADDRESS_HIGH);
> + writel(msg->data, pos + VIRTIO_MMIO_MSI_DATA);
> + writew(cmd, pos + VIRTIO_MMIO_MSI_COMMAND);
> +}
> +
> +static int vm_request_msi_vectors(struct virtio_device *vdev, int nirqs)
> +{
> + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
> + int irq, err;
> + u16 csr = readw(vm_dev->base + VIRTIO_MMIO_MSI_STATUS);
> +
> + if (vm_dev->msi_enabled || (csr & VIRTIO_MMIO_MSI_ENABLE_MASK) == 0)
> + return -EINVAL;
> +
> + vm_dev->vm_vq_names = kmalloc_array(nirqs, sizeof(*vm_dev->vm_vq_names),
> + GFP_KERNEL);
> + if (!vm_dev->vm_vq_names)
> + return -ENOMEM;
> +
> + if (!vdev->dev.msi_domain)
> + vdev->dev.msi_domain = platform_msi_get_def_irq_domain();
> + err = platform_msi_domain_alloc_irqs(&vdev->dev, nirqs,
> + mmio_write_msi_msg);
Can platform_msi_domain_alloc_irqs check msi_domain vs NULL?
> + if (err)
> + goto error_free_mem;
> +
> + /* The first MSI vector is used for configuration change events. */
> + snprintf(vm_dev->vm_vq_names[0], sizeof(*vm_dev->vm_vq_names),
> + "%s-config", dev_name(&vdev->dev));
> + irq = vm_msi_irq_vector(&vdev->dev, 0);
> + err = request_irq(irq, vm_config_changed, 0, vm_dev->vm_vq_names[0],
> + vm_dev);
> + if (err)
> + goto error_free_mem;
> +
> + return 0;
> +
> +error_free_mem:
> + kfree(vm_dev->vm_vq_names);
> + vm_dev->vm_vq_names = NULL;
> + return err;
> +}
> +
> +static void vm_free_msi_irqs(struct virtio_device *vdev)
> +{
> + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
> +
> + if (vm_dev->msi_enabled) {
> + mmio_msi_set_enable(vm_dev, 0);
> + free_irq(vm_msi_irq_vector(&vdev->dev, 0), vm_dev);
> + platform_msi_domain_free_irqs(&vdev->dev);
> + kfree(vm_dev->vm_vq_names);
> + vm_dev->vm_vq_names = NULL;
> + vm_dev->msi_enabled = false;
> + }
> +}
> +
> static const char *vm_bus_name(struct virtio_device *vdev)
> {
> struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
> @@ -567,7 +761,7 @@ static int virtio_mmio_probe(struct platform_device *pdev)
>
> /* Check device version */
> vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION);
> - if (vm_dev->version < 1 || vm_dev->version > 2) {
> + if (vm_dev->version < 1 || vm_dev->version > 3) {
> dev_err(&pdev->dev, "Version %ld not supported!\n",
> vm_dev->version);
> return -ENXIO;
> @@ -603,6 +797,12 @@ static int virtio_mmio_probe(struct platform_device *pdev)
> dev_warn(&pdev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n");
>
> platform_set_drvdata(pdev, vm_dev);
> + if (vm_dev->version == 3) {
> + u32 db = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
> +
> + vm_dev->doorbell_base = db & 0xffff;
> + vm_dev->doorbell_scale = (db >> 16) & 0xffff;
> + }
>
> rc = register_virtio_device(&vm_dev->vdev);
> if (rc)
> @@ -619,8 +819,6 @@ static int virtio_mmio_remove(struct platform_device *pdev)
> return 0;
> }
>
> -
> -
Unnecessary changes.
> /* Devices list parameter */
>
> #if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES)
> diff --git a/include/uapi/linux/virtio_mmio.h b/include/uapi/linux/virtio_mmio.h
> index c4b0968..148895e 100644
> --- a/include/uapi/linux/virtio_mmio.h
> +++ b/include/uapi/linux/virtio_mmio.h
> @@ -122,6 +122,34 @@
> #define VIRTIO_MMIO_QUEUE_USED_LOW 0x0a0
> #define VIRTIO_MMIO_QUEUE_USED_HIGH 0x0a4
>
> +/* MSI related registers */
> +
> +/* MSI status register */
> +#define VIRTIO_MMIO_MSI_STATUS 0x0c0
> +/* MSI command register */
> +#define VIRTIO_MMIO_MSI_COMMAND 0x0c2
> +/* MSI low 32 bit address, 64 bits in two halves */
> +#define VIRTIO_MMIO_MSI_ADDRESS_LOW 0x0c4
> +/* MSI high 32 bit address, 64 bits in two halves */
> +#define VIRTIO_MMIO_MSI_ADDRESS_HIGH 0x0c8
> +/* MSI data */
> +#define VIRTIO_MMIO_MSI_DATA 0x0cc
I wonder what's the advantage of using registers instead of memory
mapped tables as PCI did. Is this because MMIO doesn't have capability
list which makes it hard to be extended if we have variable size of tables?
> +
> +/* RO: MSI feature enabled mask */
> +#define VIRTIO_MMIO_MSI_ENABLE_MASK 0x8000
> +/* RO: Maximum queue size available */
> +#define VIRTIO_MMIO_MSI_STATUS_QMASK 0x07ff
> +/* Reserved */
> +#define VIRTIO_MMIO_MSI_STATUS_RESERVED 0x7800
> +
> +#define VIRTIO_MMIO_MSI_CMD_UPDATE 0x1
I believe we need a command to read the number of vectors supported by
the device, or 2048 is assumed to be a fixed size here?
Thanks
> +#define VIRTIO_MMIO_MSI_CMD_OP_MASK 0xf000
> +#define VIRTIO_MMIO_MSI_CMD_ARG_MASK 0x0fff
> +#define VIRTIO_MMIO_MSI_CMD(op, arg) ((op) << 12 | (arg))
> +#define VIRITO_MMIO_MSI_CMD_ARG(cmd) ((cmd) & VIRTIO_MMIO_MSI_CMD_ARG_MASK)
> +#define VIRTIO_MMIO_MSI_CMD_OP(cmd) \
> + (((cmd) & VIRTIO_MMIO_MSI_CMD_OP_MASK) >> 12)
> +
> /* Configuration atomicity value */
> #define VIRTIO_MMIO_CONFIG_GENERATION 0x0fc
>
Powered by blists - more mailing lists