Date:   Wed, 25 Dec 2019 10:50:23 +0800
From:   Zha Bin <zhabin@...ux.alibaba.com>
To:     linux-kernel@...r.kernel.org
Cc:     mst@...hat.com, jasowang@...hat.com, slp@...hat.com,
        virtio-dev@...ts.oasis-open.org, gerry@...ux.alibaba.com,
        zhabin@...ux.alibaba.com, jing2.liu@...el.com,
        chao.p.peng@...el.com
Subject: [PATCH v1 2/2] virtio-mmio: add features for virtio-mmio specification version 3

From: Liu Jiang <gerry@...ux.alibaba.com>

Userspace VMMs (e.g. QEMU microvm, Firecracker) use virtio over MMIO
devices as a lightweight machine model for modern cloud workloads. The
standard virtio-over-MMIO transport layer supports only a single legacy
interrupt, which is much heavier than the virtio-over-PCI transport
layer using MSI. The legacy interrupt has a long processing path and
triggers extra VMExits in the following cases (sketched below), which
considerably slows down performance:

1) read interrupt status register
2) update interrupt status register
3) write IOAPIC EOI register
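
For reference, a minimal sketch of the legacy path (modelled on the
existing vm_interrupt() handler in drivers/virtio/virtio_mmio.c; the
helper name and signature here are illustrative only) shows where each
trapped access comes from:

	/* Legacy shared-interrupt path: each register access below is a
	 * trapped MMIO/IOAPIC access, i.e. a VMExit on typical setups. */
	static irqreturn_t legacy_isr_sketch(void __iomem *base)
	{
		/* 1) read interrupt status register */
		unsigned long status =
			readl(base + VIRTIO_MMIO_INTERRUPT_STATUS);

		/* 2) update (acknowledge) interrupt status register */
		writel(status, base + VIRTIO_MMIO_INTERRUPT_ACK);

		/* ... dispatch config-change and vring callbacks ... */

		/* 3) the EOI write to the emulated IOAPIC is performed
		 * later by the generic IRQ code and costs another exit. */
		return IRQ_HANDLED;
	}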

We propose updating virtio over MMIO to version 3 [1], adding the
following new features to improve performance:

1) Support Message Signaled Interrupts (MSI), which improves interrupt
   performance for virtio multi-queue devices
2) Support per-queue doorbells, so that the guest kernel can write
   notifications directly to the doorbells provided by virtio devices
   (see the sketch after this list)
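
With 2), the notify path for a version 3 device collapses to a single
MMIO write to the queue's own doorbell. A minimal sketch (mirroring the
vm_notify() and probe changes in this patch; the helper name is
illustrative only):

	/* The doorbell layout is advertised through the read-back value
	 * of VIRTIO_MMIO_QUEUE_NOTIFY on a version 3 device. */
	static void doorbell_kick_sketch(void __iomem *base, u32 index)
	{
		u32 db = readl(base + VIRTIO_MMIO_QUEUE_NOTIFY);
		u16 doorbell_base  = db & 0xffff; /* first doorbell offset */
		u16 doorbell_scale = db >> 16;    /* stride between doorbells */

		/* One write per notification, no status read-back needed. */
		writel(index, base + doorbell_base + doorbell_scale * index);
	}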

The following is the netperf TCP_RR performance report, comparing a
virtio-pci device, a vanilla virtio-mmio device and a patched
virtio-mmio device (each test was run 3 times):

	netperf -t TCP_RR -H 192.168.1.36 -l 30 -- -r 32,1024

		Virtio-PCI    Virtio-MMIO   Virtio-MMIO(MSI)
	trans/s	    9536	6939		9500
	trans/s	    9734	7029		9749
	trans/s	    9894	7095		9318
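
Averaged over the three runs that is roughly 9721 trans/s for
virtio-pci, 7021 trans/s for vanilla virtio-mmio and 9522 trans/s for
virtio-mmio with MSI, i.e. about a 35% improvement over the vanilla
MMIO transport and within roughly 2% of virtio-pci.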

[1] https://lkml.org/lkml/2019/12/20/113

Signed-off-by: Liu Jiang <gerry@...ux.alibaba.com>
Signed-off-by: Zha Bin <zhabin@...ux.alibaba.com>
Signed-off-by: Chao Peng <chao.p.peng@...ux.intel.com>
Signed-off-by: Jing Liu <jing2.liu@...ux.intel.com>
---
 drivers/virtio/virtio_mmio.c     | 226 ++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/virtio_mmio.h |  28 +++++
 2 files changed, 240 insertions(+), 14 deletions(-)

diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index e09edb5..065c083 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -68,8 +68,8 @@
 #include <linux/virtio_config.h>
 #include <uapi/linux/virtio_mmio.h>
 #include <linux/virtio_ring.h>
-
-
+#include <linux/msi.h>
+#include <asm/irqdomain.h>
 
 /* The alignment to use between consumer and producer parts of vring.
  * Currently hardcoded to the page size. */
@@ -90,6 +90,12 @@ struct virtio_mmio_device {
 	/* a list of queues so we can dispatch IRQs */
 	spinlock_t lock;
 	struct list_head virtqueues;
+
+	int doorbell_base;
+	int doorbell_scale;
+	bool msi_enabled;
+	/* Name strings for interrupts. */
+	char (*vm_vq_names)[256];
 };
 
 struct virtio_mmio_vq_info {
@@ -101,6 +107,8 @@ struct virtio_mmio_vq_info {
 };
 
 
+static void vm_free_msi_irqs(struct virtio_device *vdev);
+static int vm_request_msi_vectors(struct virtio_device *vdev, int nirqs);
 
 /* Configuration interface */
 
@@ -273,12 +281,28 @@ static bool vm_notify(struct virtqueue *vq)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
 
+	if (vm_dev->version == 3) {
+		int offset = vm_dev->doorbell_base +
+			     vm_dev->doorbell_scale * vq->index;
+		writel(vq->index, vm_dev->base + offset);
+	} else
 	/* We write the queue's selector into the notification register to
 	 * signal the other end */
-	writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
+		writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
+
 	return true;
 }
 
+/* Handle a configuration change */
+static irqreturn_t vm_config_changed(int irq, void *opaque)
+{
+	struct virtio_mmio_device *vm_dev = opaque;
+
+	virtio_config_changed(&vm_dev->vdev);
+
+	return IRQ_HANDLED;
+}
+
 /* Notify all virtqueues on an interrupt. */
 static irqreturn_t vm_interrupt(int irq, void *opaque)
 {
@@ -307,7 +331,12 @@ static irqreturn_t vm_interrupt(int irq, void *opaque)
 	return ret;
 }
 
+static int vm_msi_irq_vector(struct device *dev, unsigned int nr)
+{
+	struct msi_desc *entry = first_msi_entry(dev);
 
+	return entry->irq + nr;
+}
 
 static void vm_del_vq(struct virtqueue *vq)
 {
@@ -316,6 +345,12 @@ static void vm_del_vq(struct virtqueue *vq)
 	unsigned long flags;
 	unsigned int index = vq->index;
 
+	if (vm_dev->version == 3 && vm_dev->msi_enabled) {
+		int irq = vm_msi_irq_vector(&vq->vdev->dev, index + 1);
+
+		free_irq(irq, vq);
+	}
+
 	spin_lock_irqsave(&vm_dev->lock, flags);
 	list_del(&info->node);
 	spin_unlock_irqrestore(&vm_dev->lock, flags);
@@ -334,15 +369,41 @@ static void vm_del_vq(struct virtqueue *vq)
 	kfree(info);
 }
 
-static void vm_del_vqs(struct virtio_device *vdev)
+static void vm_free_irqs(struct virtio_device *vdev)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+
+	if (vm_dev->version == 3)
+		vm_free_msi_irqs(vdev);
+	else
+		free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev);
+}
+
+static void vm_del_vqs(struct virtio_device *vdev)
+{
 	struct virtqueue *vq, *n;
 
 	list_for_each_entry_safe(vq, n, &vdev->vqs, list)
 		vm_del_vq(vq);
 
-	free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev);
+	vm_free_irqs(vdev);
+}
+
+static int vm_request_single_irq(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	int irq = platform_get_irq(vm_dev->pdev, 0);
+	int err;
+
+	if (irq < 0) {
+		dev_err(&vdev->dev, "Cannot get IRQ resource\n");
+		return irq;
+	}
+
+	err = request_irq(irq, vm_interrupt, IRQF_SHARED,
+			dev_name(&vdev->dev), vm_dev);
+
+	return err;
 }
 
 static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
@@ -455,6 +516,72 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index,
 	return ERR_PTR(err);
 }
 
+static inline void mmio_msi_set_enable(struct virtio_mmio_device *vm_dev,
+		int enable)
+{
+	u16 status;
+
+	status = readw(vm_dev->base + VIRTIO_MMIO_MSI_STATUS);
+	if (enable)
+		status |= VIRTIO_MMIO_MSI_ENABLE_MASK;
+	else
+		status &= ~VIRTIO_MMIO_MSI_ENABLE_MASK;
+	writew(status, vm_dev->base + VIRTIO_MMIO_MSI_STATUS);
+}
+
+static int vm_find_vqs_msi(struct virtio_device *vdev, unsigned int nvqs,
+			struct virtqueue *vqs[], vq_callback_t *callbacks[],
+			const char * const names[], const bool *ctx,
+			struct irq_affinity *desc)
+{
+	int i, err, irq;
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+
+	/* Allocate nvqs irqs for queues and one irq for configuration */
+	err = vm_request_msi_vectors(vdev, nvqs + 1);
+	if (err != 0)
+		return err;
+
+	for (i = 0; i < nvqs; i++) {
+		int index = i + 1;
+
+		if (!names[i]) {
+			vqs[i] = NULL;
+			continue;
+		}
+		vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i],
+				ctx ? ctx[i] : false);
+		if (IS_ERR(vqs[i])) {
+			err = PTR_ERR(vqs[i]);
+			goto error_find;
+		}
+
+		if (!callbacks[i])
+			continue;
+
+		irq = vm_msi_irq_vector(&vqs[i]->vdev->dev, index);
+		/* allocate per-vq irq if available and necessary */
+		snprintf(vm_dev->vm_vq_names[index],
+			sizeof(*vm_dev->vm_vq_names),
+			"%s-%s",
+			dev_name(&vm_dev->vdev.dev), names[i]);
+		err = request_irq(irq, vring_interrupt, 0,
+				vm_dev->vm_vq_names[index], vqs[i]);
+
+		if (err)
+			goto error_find;
+	}
+
+	mmio_msi_set_enable(vm_dev, 1);
+	vm_dev->msi_enabled = true;
+
+	return 0;
+
+error_find:
+	vm_del_vqs(vdev);
+	return err;
+}
+
 static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		       struct virtqueue *vqs[],
 		       vq_callback_t *callbacks[],
@@ -463,16 +590,16 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		       struct irq_affinity *desc)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
-	int irq = platform_get_irq(vm_dev->pdev, 0);
 	int i, err, queue_idx = 0;
 
-	if (irq < 0) {
-		dev_err(&vdev->dev, "Cannot get IRQ resource\n");
-		return irq;
+	if (vm_dev->version == 3) {
+		err = vm_find_vqs_msi(vdev, nvqs, vqs, callbacks,
+				names, ctx, desc);
+		if (!err)
+			return 0;
 	}
 
-	err = request_irq(irq, vm_interrupt, IRQF_SHARED,
-			dev_name(&vdev->dev), vm_dev);
+	err = vm_request_single_irq(vdev);
 	if (err)
 		return err;
 
@@ -493,6 +620,73 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 	return 0;
 }
 
+static void mmio_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+	struct device *dev = desc->dev;
+	struct virtio_device *vdev = dev_to_virtio(dev);
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	void __iomem *pos = vm_dev->base;
+	uint16_t cmd = VIRTIO_MMIO_MSI_CMD(VIRTIO_MMIO_MSI_CMD_UPDATE,
+			desc->platform.msi_index);
+
+	writel(msg->address_lo, pos + VIRTIO_MMIO_MSI_ADDRESS_LOW);
+	writel(msg->address_hi, pos + VIRTIO_MMIO_MSI_ADDRESS_HIGH);
+	writel(msg->data, pos + VIRTIO_MMIO_MSI_DATA);
+	writew(cmd, pos + VIRTIO_MMIO_MSI_COMMAND);
+}
+
+static int vm_request_msi_vectors(struct virtio_device *vdev, int nirqs)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+	int irq, err;
+	u16 csr = readw(vm_dev->base + VIRTIO_MMIO_MSI_STATUS);
+
+	if (vm_dev->msi_enabled || (csr & VIRTIO_MMIO_MSI_ENABLE_MASK) == 0)
+		return -EINVAL;
+
+	vm_dev->vm_vq_names = kmalloc_array(nirqs, sizeof(*vm_dev->vm_vq_names),
+			GFP_KERNEL);
+	if (!vm_dev->vm_vq_names)
+		return -ENOMEM;
+
+	if (!vdev->dev.msi_domain)
+		vdev->dev.msi_domain = platform_msi_get_def_irq_domain();
+	err = platform_msi_domain_alloc_irqs(&vdev->dev, nirqs,
+			mmio_write_msi_msg);
+	if (err)
+		goto error_free_mem;
+
+	/* The first MSI vector is used for configuration change events. */
+	snprintf(vm_dev->vm_vq_names[0], sizeof(*vm_dev->vm_vq_names),
+			"%s-config", dev_name(&vdev->dev));
+	irq = vm_msi_irq_vector(&vdev->dev, 0);
+	err = request_irq(irq, vm_config_changed, 0, vm_dev->vm_vq_names[0],
+			vm_dev);
+	if (err)
+		goto error_free_mem;
+
+	return 0;
+
+error_free_mem:
+	kfree(vm_dev->vm_vq_names);
+	vm_dev->vm_vq_names = NULL;
+	return err;
+}
+
+static void vm_free_msi_irqs(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
+
+	if (vm_dev->msi_enabled) {
+		mmio_msi_set_enable(vm_dev, 0);
+		free_irq(vm_msi_irq_vector(&vdev->dev, 0), vm_dev);
+		platform_msi_domain_free_irqs(&vdev->dev);
+		kfree(vm_dev->vm_vq_names);
+		vm_dev->vm_vq_names = NULL;
+		vm_dev->msi_enabled = false;
+	}
+}
+
 static const char *vm_bus_name(struct virtio_device *vdev)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
@@ -567,7 +761,7 @@ static int virtio_mmio_probe(struct platform_device *pdev)
 
 	/* Check device version */
 	vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION);
-	if (vm_dev->version < 1 || vm_dev->version > 2) {
+	if (vm_dev->version < 1 || vm_dev->version > 3) {
 		dev_err(&pdev->dev, "Version %ld not supported!\n",
 				vm_dev->version);
 		return -ENXIO;
@@ -603,6 +797,12 @@ static int virtio_mmio_probe(struct platform_device *pdev)
 		dev_warn(&pdev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");
 
 	platform_set_drvdata(pdev, vm_dev);
+	if (vm_dev->version == 3) {
+		u32 db = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY);
+
+		vm_dev->doorbell_base = db & 0xffff;
+		vm_dev->doorbell_scale = (db >> 16) & 0xffff;
+	}
 
 	rc = register_virtio_device(&vm_dev->vdev);
 	if (rc)
@@ -619,8 +819,6 @@ static int virtio_mmio_remove(struct platform_device *pdev)
 	return 0;
 }
 
-
-
 /* Devices list parameter */
 
 #if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES)
diff --git a/include/uapi/linux/virtio_mmio.h b/include/uapi/linux/virtio_mmio.h
index c4b0968..148895e 100644
--- a/include/uapi/linux/virtio_mmio.h
+++ b/include/uapi/linux/virtio_mmio.h
@@ -122,6 +122,34 @@
 #define VIRTIO_MMIO_QUEUE_USED_LOW	0x0a0
 #define VIRTIO_MMIO_QUEUE_USED_HIGH	0x0a4
 
+/* MSI related registers */
+
+/* MSI status register */
+#define VIRTIO_MMIO_MSI_STATUS		0x0c0
+/* MSI command register */
+#define VIRTIO_MMIO_MSI_COMMAND		0x0c2
+/* MSI low 32 bit address, 64 bits in two halves */
+#define VIRTIO_MMIO_MSI_ADDRESS_LOW	0x0c4
+/* MSI high 32 bit address, 64 bits in two halves */
+#define VIRTIO_MMIO_MSI_ADDRESS_HIGH	0x0c8
+/* MSI data */
+#define VIRTIO_MMIO_MSI_DATA		0x0cc
+
+/* RO: MSI feature enabled mask */
+#define VIRTIO_MMIO_MSI_ENABLE_MASK	0x8000
+/* RO: Maximum queue size available */
+#define VIRTIO_MMIO_MSI_STATUS_QMASK	0x07ff
+/* Reserved */
+#define VIRTIO_MMIO_MSI_STATUS_RESERVED	0x7800
+
+#define VIRTIO_MMIO_MSI_CMD_UPDATE	0x1
+#define VIRTIO_MMIO_MSI_CMD_OP_MASK	0xf000
+#define VIRTIO_MMIO_MSI_CMD_ARG_MASK	0x0fff
+#define VIRTIO_MMIO_MSI_CMD(op, arg)	((op) << 12 | (arg))
+#define VIRTIO_MMIO_MSI_CMD_ARG(cmd)	((cmd) & VIRTIO_MMIO_MSI_CMD_ARG_MASK)
+#define VIRTIO_MMIO_MSI_CMD_OP(cmd)	\
+		(((cmd) & VIRTIO_MMIO_MSI_CMD_OP_MASK) >> 12)
+
 /* Configuration atomicity value */
 #define VIRTIO_MMIO_CONFIG_GENERATION	0x0fc
 
-- 
1.8.3.1
