linux-kernel - [PATCH 10/12] VMCI: guest side driver implementation.

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20121101173005.9733.90420.stgit@promb-2n-dhcp175.eng.vmware.com>
Date:	Thu, 01 Nov 2012 10:30:10 -0700
From:	George Zhang <georgezhang@...are.com>
To:	linux-kernel@...r.kernel.org, georgezhang@...are.com,
	virtualization@...ts.linux-foundation.org
Cc:	pv-drivers@...are.com, gregkh@...uxfoundation.org
Subject: [PATCH 10/12] VMCI: guest side driver implementation.

VMCI guest side driver code implementation.


Signed-off-by: George Zhang <georgezhang@...are.com>
---
 drivers/misc/vmw_vmci/vmci_guest.c |  762 ++++++++++++++++++++++++++++++++++++
 1 files changed, 762 insertions(+), 0 deletions(-)
 create mode 100644 drivers/misc/vmw_vmci/vmci_guest.c

diff --git a/drivers/misc/vmw_vmci/vmci_guest.c b/drivers/misc/vmw_vmci/vmci_guest.c
new file mode 100644
index 0000000..eedbe4d
--- /dev/null
+++ b/drivers/misc/vmw_vmci/vmci_guest.c
@@ -0,0 +1,762 @@
+/*
+ * VMware VMCI Driver
+ *
+ * Copyright (C) 2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <linux/vmw_vmci_defs.h>
+#include <linux/vmw_vmci_api.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+
+#include "vmci_common_int.h"
+#include "vmci_datagram.h"
+#include "vmci_doorbell.h"
+#include "vmci_context.h"
+#include "vmci_driver.h"
+#include "vmci_event.h"
+
+#define VMCI_UTIL_NUM_RESOURCES 1	/* number of resource IDs queried by vmci_check_host_caps() */
+
+static bool vmci_disable_msi;	/* when set, probe never calls pci_enable_msi() */
+module_param_named(disable_msi, vmci_disable_msi, bool, 0);
+MODULE_PARM_DESC(disable_msi, "Disable MSI use in driver - (default=0)");
+
+static bool vmci_disable_msix;	/* when set, probe never calls vmci_enable_msix() */
+module_param_named(disable_msix, vmci_disable_msix, bool, 0);
+MODULE_PARM_DESC(disable_msix, "Disable MSI-X use in driver - (default=0)");
+
+static u32 ctx_update_sub_id = VMCI_INVALID_ID;	/* subscription ID filled in by vmci_event_subscribe() in probe */
+static u32 vm_context_id = VMCI_INVALID_ID;	/* cached context ID of this VM; VMCI_INVALID_ID until known */
+
+struct vmci_guest_device {	/* per-device state of the VMCI guest PCI device */
+	struct device *dev;	/* PCI device we are attached to */
+	void __iomem *iobase;	/* mapping of PCI region 0; base for all register accesses */
+
+	unsigned int irq;	/* IRQ requested for vmci_interrupt() */
+	unsigned int intr_type;	/* VMCI_INTR_TYPE_INTX, _MSI or _MSIX, chosen in probe */
+	bool exclusive_vectors;	/* true when MSI-X was enabled with all VMCI_MAX_INTRS vectors */
+	struct msix_entry msix_entries[VMCI_MAX_INTRS];	/* table handed to pci_enable_msix() */
+
+	struct tasklet_struct datagram_tasklet;	/* runs vmci_dispatch_dgs() */
+	struct tasklet_struct bm_tasklet;	/* runs vmci_process_bitmap() */
+
+	void *data_buffer;	/* vmalloc'ed buffer of VMCI_MAX_DG_SIZE bytes for incoming datagrams */
+	void *notification_bitmap;	/* vmalloc'ed PAGE_SIZE bitmap, NULL when not allocated */
+};
+
+/* vmci_dev singleton device and supporting data */
+static struct vmci_guest_device *vmci_dev_g;	/* set by probe, reset to NULL by remove */
+static DEFINE_SPINLOCK(vmci_dev_spinlock);	/* held while vmci_dev_g is changed or used to send a datagram */
+
+static atomic_t vmci_num_guest_devices = ATOMIC_INIT(0);	/* incremented by probe, decremented by remove */
+
+/*
+ * Tells whether at least one VMCI guest device is currently registered,
+ * based on the counter that probe increments and remove decrements.
+ */
+bool vmci_guest_code_active(void)
+{
+	int num_devices = atomic_read(&vmci_num_guest_devices);
+
+	return num_devices != 0;
+}
+
+/*
+ * Returns the context ID of this VM.
+ *
+ * The ID is cached in vm_context_id.  While the cache still holds
+ * VMCI_INVALID_ID the hypervisor is asked for the ID with a
+ * VMCI_GET_CONTEXT_ID datagram.  When that query fails the cache is left
+ * untouched, so the caller sees the cached value (VMCI_INVALID_ID unless
+ * it was updated meanwhile) and a later call retries the query.
+ */
+u32 vmci_get_vm_context_id(void)
+{
+	if (vm_context_id == VMCI_INVALID_ID) {
+		/*
+		 * Must be signed: vmci_send_datagram() returns an int and
+		 * reports failures as negative VMCI_ERROR_* codes.  With an
+		 * unsigned variable the check below is always true and the
+		 * error code would be cached as the context ID.
+		 */
+		int result;
+		struct vmci_datagram get_cid_msg;
+
+		get_cid_msg.dst =
+		    vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID,
+				     VMCI_GET_CONTEXT_ID);
+		get_cid_msg.src = VMCI_ANON_SRC_HANDLE;
+		get_cid_msg.payload_size = 0;
+		result = vmci_send_datagram(&get_cid_msg);
+		if (result >= 0)
+			vm_context_id = result;
+	}
+	return vm_context_id;
+}
+
+/*
+ * VM to hypervisor call mechanism: writes the datagram to the device's
+ * data-out port and returns the value read back from the result register.
+ * The name follows the standard VMware convention because shared code
+ * calls this function as well.
+ *
+ * Returns VMCI_ERROR_INVALID_ARGS for a NULL datagram and
+ * VMCI_ERROR_UNAVAILABLE when no guest device is registered.
+ */
+int vmci_send_datagram(struct vmci_datagram *dg)
+{
+	unsigned long irq_flags;
+	int rc = VMCI_ERROR_UNAVAILABLE;
+
+	if (!dg)
+		return VMCI_ERROR_INVALID_ARGS;
+
+	/*
+	 * The datagram data may be spread over multiple pages and the
+	 * monitor may interleave device user rpc calls from multiple
+	 * VCPUs; holding the device spinlock rules that out.  Interrupts
+	 * are disabled so that an incoming datagram cannot arrive in the
+	 * middle of a "rep out" and end up in this function again.
+	 */
+	spin_lock_irqsave(&vmci_dev_spinlock, irq_flags);
+	if (vmci_dev_g) {
+		void __iomem *regs = vmci_dev_g->iobase;
+
+		iowrite8_rep(regs + VMCI_DATA_OUT_ADDR, dg, VMCI_DG_SIZE(dg));
+		rc = ioread32(regs + VMCI_RESULT_LOW_ADDR);
+	}
+	spin_unlock_irqrestore(&vmci_dev_spinlock, irq_flags);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(vmci_send_datagram);
+
+/*
+ * Event callback for VMCI_EVENT_CTX_ID_UPDATE (registered in probe).
+ * Stores the context ID carried in the event payload in vm_context_id.
+ *
+ * @sub_id:      subscription the event is delivered for; anything other
+ *               than ctx_update_sub_id is ignored.
+ * @event_data:  event whose payload holds the new context ID.
+ * @client_data: unused.
+ */
+static void vmci_guest_cid_update(u32 sub_id,
+				  const struct vmci_event_data *event_data,
+				  void *client_data)
+{
+	const struct vmci_event_payld_ctx *ev_payload;
+
+	if (sub_id != ctx_update_sub_id) {
+		pr_devel("Invalid subscriber (ID=0x%x).\n", sub_id);
+		return;
+	}
+
+	/*
+	 * Validate event_data before deriving the payload pointer from it,
+	 * so the payload helper is never handed a NULL event.
+	 */
+	if (!event_data) {
+		pr_devel("Invalid event data.\n");
+		return;
+	}
+
+	ev_payload = vmci_event_data_const_payload(event_data);
+	if (ev_payload->context_id == VMCI_INVALID_ID) {
+		pr_devel("Invalid event data.\n");
+		return;
+	}
+
+	pr_devel("Updating context from (ID=0x%x) to (ID=0x%x) on event (type=%d).\n",
+		 vm_context_id, ev_payload->context_id, event_data->event);
+
+	vm_context_id = ev_payload->context_id;
+}
+
+/*
+ * Verify that the host supports the hypercall we need by sending a
+ * VMCI_RESOURCES_QUERY datagram that asks about VMCI_GET_CONTEXT_ID.
+ * No fallback hypercalls are tried.
+ *
+ * Returns true if the host answered the query with 0x01, false if it
+ * answered anything else or the query datagram could not be allocated.
+ */
+static bool vmci_check_host_caps(struct pci_dev *pdev)
+{
+	bool result;
+	struct vmci_resource_query_msg *msg;
+	u32 msg_size = sizeof(struct vmci_resource_query_hdr) +
+				VMCI_UTIL_NUM_RESOURCES * sizeof(u32);
+	struct vmci_datagram *check_msg;
+
+	/*
+	 * Zero the allocation: the whole datagram is written to the device,
+	 * so any byte not explicitly set below (e.g. padding fields) must
+	 * not carry stale kernel heap contents.
+	 */
+	check_msg = kzalloc(msg_size, GFP_KERNEL);
+	if (!check_msg) {
+		dev_err(&pdev->dev, "%s: Insufficient memory.\n", __func__);
+		return false;
+	}
+
+	check_msg->dst = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID,
+					  VMCI_RESOURCES_QUERY);
+	check_msg->src = VMCI_ANON_SRC_HANDLE;
+	check_msg->payload_size = msg_size - VMCI_DG_HEADERSIZE;
+	msg = (struct vmci_resource_query_msg *)VMCI_DG_PAYLOAD(check_msg);
+
+	msg->num_resources = VMCI_UTIL_NUM_RESOURCES;
+	msg->resources[0] = VMCI_GET_CONTEXT_ID;
+
+	/* Checks that hyper calls are supported */
+	result = vmci_send_datagram(check_msg) == 0x01;
+	kfree(check_msg);
+
+	dev_dbg(&pdev->dev, "%s: Host capability check: %s.\n",
+		__func__, result ? "PASSED" : "FAILED");
+
+	/* The queried hypercall is required. There are no fallbacks. */
+	return result;
+}
+
+/*
+ * Reads datagrams from the data in port and dispatches them. We
+ * always start reading datagrams into only the first page of the
+ * datagram buffer. If the datagrams don't fit into one page, we
+ * use the maximum datagram buffer size for the remainder of the
+ * invocation. This is a simple heuristic for not penalizing
+ * small datagrams.
+ *
+ * Event datagrams from the hypervisor go to vmci_event_dispatch();
+ * everything else goes to vmci_datagram_invoke_guest_handler().
+ * Datagrams larger than VMCI_MAX_DG_SIZE are read and discarded.
+ * Processing stops when the current position holds a datagram whose
+ * destination resource is VMCI_INVALID_ID and no more than one page
+ * of the buffer is left to scan.
+ *
+ * @data is the struct vmci_guest_device pointer that probe passed
+ * to tasklet_init(), cast to unsigned long.
+ *
+ * This function assumes that it has exclusive access to the data
+ * in port for the duration of the call.
+ */
+static void vmci_dispatch_dgs(unsigned long data)
+{
+	struct vmci_guest_device *vmci_dev = (struct vmci_guest_device *)data;
+	u8 *dg_in_buffer = vmci_dev->data_buffer;
+	struct vmci_datagram *dg;
+	size_t dg_in_buffer_size = VMCI_MAX_DG_SIZE;	/* full capacity of the buffer */
+	size_t current_dg_in_buffer_size = PAGE_SIZE;	/* portion of the buffer currently in use */
+	size_t remaining_bytes;	/* bytes from dg to the end of the portion in use */
+
+	BUILD_BUG_ON(VMCI_MAX_DG_SIZE < PAGE_SIZE);
+
+	ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
+		    vmci_dev->data_buffer, current_dg_in_buffer_size);
+	dg = (struct vmci_datagram *)dg_in_buffer;
+	remaining_bytes = current_dg_in_buffer_size;
+
+	while (dg->dst.resource != VMCI_INVALID_ID ||
+	       remaining_bytes > PAGE_SIZE) {
+		unsigned dg_in_size;
+
+		/*
+		 * When the input buffer spans multiple pages, a datagram can
+		 * start on any page boundary in the buffer.  An invalid
+		 * destination therefore only means "nothing more on this
+		 * page": skip ahead to the next page boundary and look there.
+		 */
+		if (dg->dst.resource == VMCI_INVALID_ID) {
+			ASSERT(remaining_bytes > PAGE_SIZE);
+			dg = (struct vmci_datagram *)roundup((uintptr_t)
+							     dg + 1, PAGE_SIZE);
+			ASSERT((u8 *) dg <
+			       dg_in_buffer + current_dg_in_buffer_size);
+			remaining_bytes =
+			    (size_t) (dg_in_buffer + current_dg_in_buffer_size -
+				      (u8 *) dg);
+			continue;
+		}
+
+		dg_in_size = VMCI_DG_SIZE_ALIGNED(dg);
+
+		if (dg_in_size <= dg_in_buffer_size) {
+			int result;
+
+			/*
+			 * If the remaining bytes in the datagram
+			 * buffer don't contain the complete
+			 * datagram, we first make sure we have enough
+			 * room for it and then we read the remainder
+			 * of the datagram and possibly any following
+			 * datagrams.
+			 */
+			if (dg_in_size > remaining_bytes) {
+				if (remaining_bytes !=
+				    current_dg_in_buffer_size) {
+
+					/*
+					 * We move the partial
+					 * datagram to the front and
+					 * read the remainder of the
+					 * datagram and possibly
+					 * following calls into the
+					 * following bytes.
+					 */
+					memmove(dg_in_buffer, dg_in_buffer +
+						current_dg_in_buffer_size -
+						remaining_bytes,
+						remaining_bytes);
+					dg = (struct vmci_datagram *)
+					    dg_in_buffer;
+				}
+
+				if (current_dg_in_buffer_size !=
+				    dg_in_buffer_size)
+					current_dg_in_buffer_size =
+					    dg_in_buffer_size;
+
+				ioread8_rep(vmci_dev->iobase +
+						VMCI_DATA_IN_ADDR,
+					vmci_dev->data_buffer +
+						remaining_bytes,
+					current_dg_in_buffer_size -
+						remaining_bytes);
+			}
+
+			/*
+			 * We special case event datagrams from the
+			 * hypervisor.
+			 */
+			if (dg->src.context == VMCI_HYPERVISOR_CONTEXT_ID &&
+			    dg->dst.resource == VMCI_EVENT_HANDLER) {
+				result = vmci_event_dispatch(dg);
+			} else {
+				result = vmci_datagram_invoke_guest_handler(dg);
+			}
+			if (result < VMCI_SUCCESS)
+				dev_dbg(vmci_dev->dev,
+					"Datagram with resource (ID=0x%x) failed (err=%d).\n",
+					 dg->dst.resource, result);
+
+			/* On to the next datagram. */
+			dg = (struct vmci_datagram *)((u8 *) dg +
+						      dg_in_size);
+		} else {
+			size_t bytes_to_skip;
+
+			/*
+			 * Datagram doesn't fit in datagram buffer of maximal
+			 * size. We drop it: keep refilling the buffer from
+			 * the port until the part of the datagram that has
+			 * not been read yet has been consumed, then resume
+			 * right behind it.
+			 */
+			dev_dbg(vmci_dev->dev,
+				"Failed to receive datagram (size=%u bytes).\n",
+				 dg_in_size);
+
+			bytes_to_skip = dg_in_size - remaining_bytes;
+			if (current_dg_in_buffer_size != dg_in_buffer_size)
+				current_dg_in_buffer_size = dg_in_buffer_size;
+
+			for (;;) {
+				ioread8_rep(vmci_dev->iobase +
+						VMCI_DATA_IN_ADDR,
+					vmci_dev->data_buffer,
+					current_dg_in_buffer_size);
+				if (bytes_to_skip <= current_dg_in_buffer_size)
+					break;
+
+				bytes_to_skip -= current_dg_in_buffer_size;
+			}
+			dg = (struct vmci_datagram *)(dg_in_buffer +
+						      bytes_to_skip);
+		}
+
+		remaining_bytes =
+		    (size_t) (dg_in_buffer + current_dg_in_buffer_size -
+			      (u8 *) dg);
+
+		if (remaining_bytes < VMCI_DG_HEADERSIZE) {
+			/*
+			 * Not even a datagram header is left in the buffer.
+			 * Get the next batch of datagrams.
+			 */
+
+			ioread8_rep(vmci_dev->iobase + VMCI_DATA_IN_ADDR,
+					vmci_dev->data_buffer,
+					current_dg_in_buffer_size);
+			dg = (struct vmci_datagram *)dg_in_buffer;
+			remaining_bytes = current_dg_in_buffer_size;
+		}
+	}
+}
+
+/*
+ * Tasklet body for notifications: hands the device's notification
+ * bitmap to vmci_dbell_scan_notification_entries().  Only logs a debug
+ * message when the device has no bitmap.
+ */
+static void vmci_process_bitmap(unsigned long data)
+{
+	struct vmci_guest_device *vmci_dev = (struct vmci_guest_device *)data;
+	void *bitmap = vmci_dev->notification_bitmap;
+
+	if (bitmap) {
+		vmci_dbell_scan_notification_entries(bitmap);
+		return;
+	}
+
+	dev_dbg(vmci_dev->dev, "No bitmap present in %s.\n", __func__);
+}
+
+/*
+ * Enable MSI-X.  First ask for VMCI_MAX_INTRS vectors so that every
+ * interrupt source gets its own; if pci_enable_msix() answers with a
+ * positive value, retry with a single vector.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the last
+ * pci_enable_msix() call.  vmci_dev->exclusive_vectors is set only when
+ * the first request succeeded.
+ */
+static int vmci_enable_msix(struct pci_dev *pdev,
+			    struct vmci_guest_device *vmci_dev)
+{
+	struct msix_entry *entries = vmci_dev->msix_entries;
+	int idx;
+	int rc;
+
+	for (idx = 0; idx < VMCI_MAX_INTRS; ++idx) {
+		entries[idx].entry = idx;
+		entries[idx].vector = idx;
+	}
+
+	rc = pci_enable_msix(pdev, entries, VMCI_MAX_INTRS);
+	if (rc == 0) {
+		vmci_dev->exclusive_vectors = true;
+		return 0;
+	}
+
+	if (rc < 0)
+		return rc;
+
+	/* The full set was refused; settle for one vector. */
+	return pci_enable_msix(pdev, entries, 1);
+}
+
+/*
+ * Interrupt handler for legacy or MSI interrupt, or for first MSI-X
+ * interrupt (vector VMCI_INTR_DATAGRAM).
+ *
+ * @irq:  interrupt number (unused).
+ * @_dev: the struct vmci_guest_device passed to request_irq().
+ *
+ * Returns IRQ_NONE when the interrupt cause register reads 0 or ~0,
+ * IRQ_HANDLED otherwise.
+ */
+static irqreturn_t vmci_interrupt(int irq, void *_dev)
+{
+	struct vmci_guest_device *dev = _dev;
+
+	/*
+	 * If we are using MSI-X with exclusive vectors then we simply schedule
+	 * the datagram tasklet, since we know the interrupt was meant for us.
+	 * Otherwise we must read the ICR to determine what to do.
+	 */
+
+	if (dev->intr_type == VMCI_INTR_TYPE_MSIX && dev->exclusive_vectors) {
+		tasklet_schedule(&dev->datagram_tasklet);
+	} else {
+		unsigned int icr;
+
+		/*
+		 * This branch serves INTx, MSI and also MSI-X with a single
+		 * shared vector (vmci_enable_msix() falls back to that), so
+		 * the interrupt type must not be asserted to be INTx/MSI.
+		 */
+
+		/* Acknowledge interrupt and determine what needs doing. */
+		icr = ioread32(dev->iobase + VMCI_ICR_ADDR);
+		if (icr == 0 || icr == ~0)
+			return IRQ_NONE;
+
+		if (icr & VMCI_ICR_DATAGRAM) {
+			tasklet_schedule(&dev->datagram_tasklet);
+			icr &= ~VMCI_ICR_DATAGRAM;
+		}
+
+		if (icr & VMCI_ICR_NOTIFICATION) {
+			tasklet_schedule(&dev->bm_tasklet);
+			icr &= ~VMCI_ICR_NOTIFICATION;
+		}
+
+		/* icr is unsigned, so print it with %u. */
+		if (icr != 0)
+			dev_warn(dev->dev,
+				 "Ignoring unknown interrupt cause (%u).\n", icr);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Handler for the MSI-X vector VMCI_INTR_NOTIFICATION, i.e. the
+ * notification bitmap interrupt.  Probe requests it only when MSI-X is
+ * used with exclusive vectors.  It schedules the bitmap tasklet and
+ * always reports IRQ_HANDLED.
+ */
+static irqreturn_t vmci_interrupt_bm(int irq, void *_dev)
+{
+	struct vmci_guest_device *vmci_dev = _dev;
+
+	/* With a dedicated MSI-X vector the interrupt is known to be ours. */
+	ASSERT(vmci_dev->intr_type == VMCI_INTR_TYPE_MSIX && vmci_dev->exclusive_vectors);
+
+	tasklet_schedule(&vmci_dev->bm_tasklet);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Probe routine of the VMCI guest PCI driver; most of the initialization
+ * at module load time is done here.
+ *
+ * Enables the PCI device, maps region 0, allocates the datagram buffer
+ * and (if the device supports notifications) the notification bitmap,
+ * tells the device which capabilities are used, publishes the device in
+ * vmci_dev_g, subscribes to context ID updates and finally sets up and
+ * unmasks interrupts.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything
+ * acquired here that is not a managed (devm/pcim) resource is released.
+ */
+static int __devinit vmci_guest_probe_device(struct pci_dev *pdev,
+					     const struct pci_device_id *id)
+{
+	struct vmci_guest_device *vmci_dev;
+	void __iomem *iobase;
+	unsigned int capabilities;
+	unsigned long cmd;
+	int vmci_err;
+	int error;
+
+	dev_dbg(&pdev->dev, "Probing for vmci/PCI guest device.\n");
+
+	error = pcim_enable_device(pdev);
+	if (error) {
+		dev_err(&pdev->dev,
+			"Failed to enable VMCI device: %d\n", error);
+		return error;
+	}
+
+	error = pcim_iomap_regions(pdev, 1 << 0, MODULE_NAME);
+	if (error) {
+		dev_err(&pdev->dev, "Failed to reserve/map IO regions.\n");
+		return error;
+	}
+
+	iobase = pcim_iomap_table(pdev)[0];
+
+	dev_info(&pdev->dev, "Found VMCI PCI device at %#lx, irq %u.\n",
+		 (unsigned long)iobase, pdev->irq);
+
+	vmci_dev = devm_kzalloc(&pdev->dev, sizeof(*vmci_dev), GFP_KERNEL);
+	if (!vmci_dev) {
+		dev_err(&pdev->dev,
+			"Can't allocate memory for VMCI device.\n");
+		return -ENOMEM;
+	}
+
+	vmci_dev->dev = &pdev->dev;
+	vmci_dev->intr_type = VMCI_INTR_TYPE_INTX;
+	vmci_dev->exclusive_vectors = false;
+	vmci_dev->iobase = iobase;
+
+	tasklet_init(&vmci_dev->datagram_tasklet,
+		     vmci_dispatch_dgs, (unsigned long)vmci_dev);
+	tasklet_init(&vmci_dev->bm_tasklet,
+		     vmci_process_bitmap, (unsigned long)vmci_dev);
+
+	vmci_dev->data_buffer = vmalloc(VMCI_MAX_DG_SIZE);
+	if (!vmci_dev->data_buffer) {
+		dev_err(&pdev->dev,
+			"Can't allocate memory for datagram buffer.\n");
+		return -ENOMEM;
+	}
+
+	pci_set_master(pdev);	/* To enable queue_pair functionality. */
+
+	/*
+	 * Verify that the VMCI Device supports the capabilities that
+	 * we need. If the device is missing capabilities that we would
+	 * like to use, check for fallback capabilities and use those
+	 * instead (so we can run a new VM on old hosts). Fail the load if
+	 * a required capability is missing and there is no fallback.
+	 *
+	 * Right now, we need datagrams. There are no fallbacks.
+	 */
+	capabilities = ioread32(vmci_dev->iobase + VMCI_CAPS_ADDR);
+	if (!(capabilities & VMCI_CAPS_DATAGRAM)) {
+		dev_err(&pdev->dev, "Device does not support datagrams.\n");
+		error = -ENXIO;
+		goto err_free_data_buffer;
+	}
+
+	/*
+	 * If the hardware supports notifications, we will use that as
+	 * well.
+	 */
+	if (capabilities & VMCI_CAPS_NOTIFICATIONS) {
+		vmci_dev->notification_bitmap = vmalloc(PAGE_SIZE);
+		if (!vmci_dev->notification_bitmap) {
+			dev_warn(&pdev->dev,
+				 "Unable to allocate notification bitmap.\n");
+			/*
+			 * Carry on without notifications: drop the
+			 * capability so that it is neither announced to the
+			 * device nor used below with a NULL bitmap.
+			 */
+			capabilities &= ~VMCI_CAPS_NOTIFICATIONS;
+		} else {
+			memset(vmci_dev->notification_bitmap, 0, PAGE_SIZE);
+		}
+	}
+
+	dev_info(&pdev->dev, "Using capabilities 0x%x.\n", capabilities);
+
+	/* Let the host know which capabilities we intend to use. */
+	iowrite32(capabilities, vmci_dev->iobase + VMCI_CAPS_ADDR);
+
+	/* Set up global device so that we can start sending datagrams */
+	spin_lock_irq(&vmci_dev_spinlock);
+	vmci_dev_g = vmci_dev;
+	spin_unlock_irq(&vmci_dev_spinlock);
+
+	/*
+	 * Register notification bitmap with device if that capability is
+	 * used.
+	 */
+	if (capabilities & VMCI_CAPS_NOTIFICATIONS) {
+		struct page *page =
+			vmalloc_to_page(vmci_dev->notification_bitmap);
+		unsigned long bitmap_ppn = page_to_pfn(page);
+		if (!vmci_dbell_register_notification_bitmap(bitmap_ppn)) {
+			dev_warn(&pdev->dev,
+				 "VMCI device unable to register notification bitmap with PPN 0x%x.\n",
+				 (u32) bitmap_ppn);
+			/*
+			 * 'error' still holds 0 from the last successful
+			 * call; set a real error code so that the failed
+			 * probe is not reported as success.
+			 */
+			error = -ENXIO;
+			goto err_free_bitmap;
+		}
+	}
+
+	/* Check host capabilities. */
+	if (!vmci_check_host_caps(pdev)) {
+		error = -ENXIO;
+		goto err_remove_bitmap;
+	}
+
+	/* Enable device. */
+
+	/*
+	 * We subscribe to the VMCI_EVENT_CTX_ID_UPDATE here so we can
+	 * update the internal context id when needed.
+	 */
+	vmci_err = vmci_event_subscribe(VMCI_EVENT_CTX_ID_UPDATE,
+					vmci_guest_cid_update, NULL,
+					&ctx_update_sub_id);
+	if (vmci_err < VMCI_SUCCESS)
+		dev_warn(&pdev->dev,
+			 "Failed to subscribe to event (type=%d): %d\n",
+			 VMCI_EVENT_CTX_ID_UPDATE, vmci_err);
+
+	/*
+	 * Enable interrupts.  Try MSI-X first, then MSI, and then fallback on
+	 * legacy interrupts.
+	 */
+	if (!vmci_disable_msix && !vmci_enable_msix(pdev, vmci_dev)) {
+		vmci_dev->intr_type = VMCI_INTR_TYPE_MSIX;
+		vmci_dev->irq = vmci_dev->msix_entries[0].vector;
+	} else if (!vmci_disable_msi && !pci_enable_msi(pdev)) {
+		vmci_dev->intr_type = VMCI_INTR_TYPE_MSI;
+		vmci_dev->irq = pdev->irq;
+	} else {
+		vmci_dev->intr_type = VMCI_INTR_TYPE_INTX;
+		vmci_dev->irq = pdev->irq;
+	}
+
+	/*
+	 * Request IRQ for legacy or MSI interrupts, or for first
+	 * MSI-X vector.
+	 */
+	error = request_irq(vmci_dev->irq, vmci_interrupt, IRQF_SHARED,
+			    MODULE_NAME, vmci_dev);
+	if (error) {
+		dev_err(&pdev->dev, "Irq %u in use: %d\n", vmci_dev->irq, error);
+		goto err_disable_msi;
+	}
+
+	/*
+	 * For MSI-X with exclusive vectors we need to request an
+	 * interrupt for each vector so that we get a separate
+	 * interrupt handler routine.  This allows us to distinguish
+	 * between the vectors.
+	 */
+	if (vmci_dev->exclusive_vectors) {
+		ASSERT(vmci_dev->intr_type == VMCI_INTR_TYPE_MSIX);
+		error = request_irq(vmci_dev->msix_entries[1].vector,
+				    vmci_interrupt_bm, 0, MODULE_NAME,
+				    vmci_dev);
+		if (error) {
+			dev_err(&pdev->dev,
+				"Failed to allocate irq %u: %d\n",
+				vmci_dev->msix_entries[1].vector, error);
+			goto err_free_irq;
+		}
+	}
+
+	dev_dbg(&pdev->dev, "Registered device.\n");
+
+	atomic_inc(&vmci_num_guest_devices);
+
+	/* Enable specific interrupt bits. */
+	cmd = VMCI_IMR_DATAGRAM;
+	if (capabilities & VMCI_CAPS_NOTIFICATIONS)
+		cmd |= VMCI_IMR_NOTIFICATION;
+	iowrite32(cmd, vmci_dev->iobase + VMCI_IMR_ADDR);
+
+	/* Enable interrupts. */
+	iowrite32(VMCI_CONTROL_INT_ENABLE,
+		  vmci_dev->iobase + VMCI_CONTROL_ADDR);
+
+	pci_set_drvdata(pdev, vmci_dev);
+	return 0;
+
+err_free_irq:
+	/*
+	 * The dev_id must be the same cookie that was given to
+	 * request_irq(), i.e. vmci_dev itself and not the address of the
+	 * local pointer; otherwise the handler is never released.
+	 */
+	free_irq(vmci_dev->irq, vmci_dev);
+	tasklet_kill(&vmci_dev->datagram_tasklet);
+	tasklet_kill(&vmci_dev->bm_tasklet);
+
+err_disable_msi:
+	if (vmci_dev->intr_type == VMCI_INTR_TYPE_MSIX)
+		pci_disable_msix(pdev);
+	else if (vmci_dev->intr_type == VMCI_INTR_TYPE_MSI)
+		pci_disable_msi(pdev);
+
+	vmci_err = vmci_event_unsubscribe(ctx_update_sub_id);
+	if (vmci_err < VMCI_SUCCESS)
+		dev_warn(&pdev->dev,
+			 "Failed to unsubscribe from event (type=%d) with subscriber (ID=0x%x): %d\n",
+			 VMCI_EVENT_CTX_ID_UPDATE, ctx_update_sub_id, vmci_err);
+
+err_remove_bitmap:
+	/* Reset the device before the bitmap registered with it is freed. */
+	if (vmci_dev->notification_bitmap)
+		iowrite32(VMCI_CONTROL_RESET,
+				vmci_dev->iobase + VMCI_CONTROL_ADDR);
+
+err_free_bitmap:
+	/*
+	 * Also reached directly when registering the bitmap failed, so that
+	 * the allocation is not leaked.  vfree(NULL) is a no-op.
+	 */
+	vfree(vmci_dev->notification_bitmap);
+
+err_remove_vmci_dev_g:
+	spin_lock_irq(&vmci_dev_spinlock);
+	vmci_dev_g = NULL;
+	spin_unlock_irq(&vmci_dev_spinlock);
+
+err_free_data_buffer:
+	vfree(vmci_dev->data_buffer);
+
+	/* The rest are managed resources and will be freed by PCI core */
+	return error;
+}
+
+/*
+ * Remove routine of the VMCI guest PCI driver.  In order: drops the
+ * device count, shuts down the guest queue pair endpoints, unsubscribes
+ * from context ID updates, unpublishes vmci_dev_g, resets the device,
+ * releases the interrupts, kills the tasklets and frees the vmalloc'ed
+ * buffers.
+ */
+static void __devexit vmci_guest_remove_device(struct pci_dev *pdev)
+{
+	struct vmci_guest_device *vmci_dev = pci_get_drvdata(pdev);
+	int rc;
+
+	dev_dbg(&pdev->dev, "Removing device\n");
+
+	atomic_dec(&vmci_num_guest_devices);
+
+	vmci_qp_guest_endpoints_exit();
+
+	rc = vmci_event_unsubscribe(ctx_update_sub_id);
+	if (rc < VMCI_SUCCESS)
+		dev_warn(&pdev->dev,
+			 "Failed to unsubscribe from event (type=%d) with subscriber (ID=0x%x): %d\n",
+			 VMCI_EVENT_CTX_ID_UPDATE, ctx_update_sub_id, rc);
+
+	/* From here on vmci_send_datagram() reports the device as gone. */
+	spin_lock_irq(&vmci_dev_spinlock);
+	vmci_dev_g = NULL;
+	spin_unlock_irq(&vmci_dev_spinlock);
+
+	dev_dbg(&pdev->dev, "Resetting vmci device\n");
+	iowrite32(VMCI_CONTROL_RESET, vmci_dev->iobase + VMCI_CONTROL_ADDR);
+
+	/*
+	 * Release the primary IRQ first, then undo MSI or MSI-X.  With
+	 * exclusive MSI-X vectors the notification vector has an IRQ of
+	 * its own that has to be released as well.
+	 */
+	free_irq(vmci_dev->irq, vmci_dev);
+	if (vmci_dev->intr_type == VMCI_INTR_TYPE_MSI) {
+		pci_disable_msi(pdev);
+	} else if (vmci_dev->intr_type == VMCI_INTR_TYPE_MSIX) {
+		if (vmci_dev->exclusive_vectors)
+			free_irq(vmci_dev->msix_entries[1].vector, vmci_dev);
+		pci_disable_msix(pdev);
+	}
+
+	tasklet_kill(&vmci_dev->datagram_tasklet);
+	tasklet_kill(&vmci_dev->bm_tasklet);
+
+	/*
+	 * The device reset above cleared the bitmap state of the device,
+	 * so the bitmap can safely be freed here.  vfree() ignores NULL,
+	 * which covers a device without a bitmap.
+	 */
+	vfree(vmci_dev->notification_bitmap);
+	vfree(vmci_dev->data_buffer);
+
+	/* The rest are managed resources and will be freed by PCI core */
+}
+
+static DEFINE_PCI_DEVICE_TABLE(vmci_ids) = {	/* PCI IDs this driver binds to */
+	{ PCI_DEVICE(PCI_VENDOR_ID_VMWARE, PCI_DEVICE_ID_VMWARE_VMCI), },
+	{ 0 },	/* terminating entry */
+};
+MODULE_DEVICE_TABLE(pci, vmci_ids);
+
+static struct pci_driver vmci_guest_driver = {	/* registered by vmci_guest_init(), unregistered by vmci_guest_exit() */
+	.name		= MODULE_NAME,
+	.id_table	= vmci_ids,
+	.probe		= vmci_guest_probe_device,
+	.remove		= __devexit_p(vmci_guest_remove_device),
+};
+
+/*
+ * Registers the guest PCI driver and hands the result of
+ * pci_register_driver() back to the caller.
+ */
+int __init vmci_guest_init(void)
+{
+	int rc;
+
+	rc = pci_register_driver(&vmci_guest_driver);
+	return rc;
+}
+
+void __exit vmci_guest_exit(void)
+{
+	pci_unregister_driver(&vmci_guest_driver);	/* counterpart of the registration in vmci_guest_init() */
+}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/