Message-Id: <20221206055816.292304-4-lei.rao@intel.com>
Date:   Tue,  6 Dec 2022 13:58:14 +0800
From:   Lei Rao <lei.rao@...el.com>
To:     kbusch@...nel.org, axboe@...com, kch@...dia.com, hch@....de,
        sagi@...mberg.me, alex.williamson@...hat.com, cohuck@...hat.com,
        jgg@...pe.ca, yishaih@...dia.com,
        shameerali.kolothum.thodi@...wei.com, kevin.tian@...el.com,
        mjrosato@...ux.ibm.com, linux-kernel@...r.kernel.org,
        linux-nvme@...ts.infradead.org, kvm@...r.kernel.org
Cc:     eddie.dong@...el.com, yadong.li@...el.com, yi.l.liu@...el.com,
        Konrad.wilk@...cle.com, stephen@...eticom.com, hang.yuan@...el.com,
        Lei Rao <lei.rao@...el.com>
Subject: [RFC PATCH 3/5] nvme-vfio: enable VFIO live migration

Implement the device-specific VFIO live migration operations for NVMe
devices. The driver implements the RUNNING, STOP, STOP_COPY, and
RESUMING state arcs using vendor-specific NVMe admin commands that
suspend and resume a virtual function, query the size of its device
state, and save and load that state.

Signed-off-by: Lei Rao <lei.rao@...el.com>
Signed-off-by: Yadong Li <yadong.li@...el.com>
Signed-off-by: Chaitanya Kulkarni <kch@...dia.com>
Reviewed-by: Eddie Dong <eddie.dong@...el.com>
Reviewed-by: Hang Yuan <hang.yuan@...el.com>
---
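For context (reviewer note, not for the commit message): below is a
minimal userspace sketch of how a VMM would drive the state arcs in
this patch through the generic VFIO migration uAPI in <linux/vfio.h>.
The helper names, error handling, and buffer sizing are illustrative
assumptions, not part of this series.

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Move the device to @state; optionally return the migration data fd. */
static int vfio_mig_set_state(int device_fd, __u32 state, int *data_fd)
{
	union {
		struct vfio_device_feature feature;
		__u8 buf[sizeof(struct vfio_device_feature) +
			 sizeof(struct vfio_device_feature_mig_state)];
	} req;
	struct vfio_device_feature_mig_state *mig =
		(struct vfio_device_feature_mig_state *)req.feature.data;

	memset(&req, 0, sizeof(req));
	req.feature.argsz = sizeof(req.buf);
	req.feature.flags = VFIO_DEVICE_FEATURE_SET |
			    VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = state;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &req.feature))
		return -errno;
	if (data_fd)
		*data_fd = mig->data_fd;
	return 0;
}

/* Source side: RUNNING -> STOP -> STOP_COPY, then read the state. */
static ssize_t save_state(int device_fd, void *buf, size_t len)
{
	ssize_t n, total = 0;
	int data_fd;

	/* The STOP arc issues the suspend admin command */
	if (vfio_mig_set_state(device_fd, VFIO_DEVICE_STATE_STOP, NULL))
		return -1;
	/* STOP_COPY returns an fd backed by nvmevf_save_read() */
	if (vfio_mig_set_state(device_fd, VFIO_DEVICE_STATE_STOP_COPY,
			       &data_fd))
		return -1;
	/* Assumes @buf is large enough for the whole device state */
	while ((n = read(data_fd, (char *)buf + total, len - total)) > 0)
		total += n;
	close(data_fd);
	return n < 0 ? -1 : total;
}

/* Destination side: STOP -> RESUMING -> STOP -> RUNNING. */
static int load_state(int device_fd, const void *buf, size_t len)
{
	int data_fd;

	if (vfio_mig_set_state(device_fd, VFIO_DEVICE_STATE_STOP, NULL))
		return -1;
	/* RESUMING returns an fd backed by nvmevf_resume_write() */
	if (vfio_mig_set_state(device_fd, VFIO_DEVICE_STATE_RESUMING,
			       &data_fd))
		return -1;
	/* A real VMM would loop on short writes */
	if (write(data_fd, buf, len) != (ssize_t)len) {
		close(data_fd);
		return -1;
	}
	close(data_fd);
	/* Leaving RESUMING loads the state; RUNNING resumes the VF */
	if (vfio_mig_set_state(device_fd, VFIO_DEVICE_STATE_STOP, NULL))
		return -1;
	return vfio_mig_set_state(device_fd, VFIO_DEVICE_STATE_RUNNING, NULL);
}

Each arc above maps onto nvmevf_pci_step_device_state_locked(): entering
STOP suspends the VF, STOP_COPY exposes the saved state through the
returned fd, and the RESUMING -> STOP edge loads the written state back
into the device before RUNNING resumes it.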
 drivers/vfio/pci/nvme/Kconfig |   5 +-
 drivers/vfio/pci/nvme/nvme.c  | 543 ++++++++++++++++++++++++++++++++--
 drivers/vfio/pci/nvme/nvme.h  | 111 +++++++
 3 files changed, 637 insertions(+), 22 deletions(-)
 create mode 100644 drivers/vfio/pci/nvme/nvme.h

diff --git a/drivers/vfio/pci/nvme/Kconfig b/drivers/vfio/pci/nvme/Kconfig
index c281fe154007..12e0eaba0de1 100644
--- a/drivers/vfio/pci/nvme/Kconfig
+++ b/drivers/vfio/pci/nvme/Kconfig
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config NVME_VFIO_PCI
 	tristate "VFIO support for NVMe PCI devices"
+	depends on NVME_CORE
 	depends on VFIO_PCI_CORE
 	help
-	  This provides generic VFIO PCI support for NVMe device
-	  using the VFIO framework.
+	  This provides migration support for NVMe devices using the
+	  VFIO framework.
 
 	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/nvme/nvme.c b/drivers/vfio/pci/nvme/nvme.c
index f1386d8a9287..698e470a4e53 100644
--- a/drivers/vfio/pci/nvme/nvme.c
+++ b/drivers/vfio/pci/nvme/nvme.c
@@ -13,29 +13,503 @@
 #include <linux/types.h>
 #include <linux/vfio.h>
 #include <linux/anon_inodes.h>
-#include <linux/kernel.h>
-#include <linux/vfio_pci_core.h>
+
+#include "nvme.h"
+
+#define MAX_MIGRATION_SIZE (256 * 1024)
+
+static int nvmevf_cmd_suspend_device(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+	struct nvme_live_mig_command c = { };
+	int ret;
+
+	c.suspend.opcode = nvme_admin_live_mig_suspend;
+	c.suspend.vf_index = nvmevf_dev->vf_id;
+
+	ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, NULL, NULL, 0);
+	if (ret) {
+		dev_warn(&dev->dev, "Suspend virtual function failed (ret=0x%x)\n", ret);
+		return ret;
+	}
+	return 0;
+}
+
+static int nvmevf_cmd_resume_device(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+	struct nvme_live_mig_command c = { };
+	int ret;
+
+	c.resume.opcode = nvme_admin_live_mig_resume;
+	c.resume.vf_index = nvmevf_dev->vf_id;
+
+	ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, NULL, NULL, 0);
+	if (ret) {
+		dev_warn(&dev->dev, "Resume virtual function failed (ret=0x%x)\n", ret);
+		return ret;
+	}
+	return 0;
+}
+
+static int nvmevf_cmd_query_data_size(struct nvmevf_pci_core_device *nvmevf_dev,
+					  size_t *state_size)
+{
+	struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+	struct nvme_live_mig_command c = { };
+	size_t result;
+	int ret;
+
+	c.query.opcode = nvme_admin_live_mig_query_data_size;
+	c.query.vf_index = nvmevf_dev->vf_id;
+
+	ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, &result, NULL, 0);
+	if (ret) {
+		dev_warn(&dev->dev, "Query the state size failed (ret=0x%x)\n", ret);
+		*state_size = 0;
+		return ret;
+	}
+	*state_size = result;
+	return 0;
+}
+
+static int nvmevf_cmd_save_data(struct nvmevf_pci_core_device *nvmevf_dev,
+				    void *buffer, size_t buffer_len)
+{
+	struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+	struct nvme_live_mig_command c = { };
+	int ret;
+
+	c.save.opcode = nvme_admin_live_mig_save_data;
+	c.save.vf_index = nvmevf_dev->vf_id;
+
+	ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, NULL, buffer, buffer_len);
+	if (ret) {
+		dev_warn(&dev->dev, "Save the device states failed (ret=0x%x)\n", ret);
+		return ret;
+	}
+	return 0;
+}
+
+static int nvmevf_cmd_load_data(struct nvmevf_pci_core_device *nvmevf_dev,
+				    struct nvmevf_migration_file *migf)
+{
+	struct pci_dev *dev = nvmevf_dev->core_device.pdev;
+	struct nvme_live_mig_command c = { };
+	int ret;
+
+	c.load.opcode = nvme_admin_live_mig_load_data;
+	c.load.vf_index = nvmevf_dev->vf_id;
+	c.load.size = migf->total_length;
+
+	ret = nvme_submit_vf_cmd(dev, (struct nvme_command *)&c, NULL,
+			migf->vf_data, migf->total_length);
+	if (ret) {
+		dev_warn(&dev->dev, "Load the device states failed (ret=0x%x)\n", ret);
+		return ret;
+	}
+	return 0;
+}
+
+static struct nvmevf_pci_core_device *nvmevf_drvdata(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+	return container_of(core_device, struct nvmevf_pci_core_device, core_device);
+}
+
+static void nvmevf_disable_fd(struct nvmevf_migration_file *migf)
+{
+	mutex_lock(&migf->lock);
+
+	/* release the device states buffer */
+	kvfree(migf->vf_data);
+	migf->vf_data = NULL;
+	migf->disabled = true;
+	migf->total_length = 0;
+	migf->filp->f_pos = 0;
+	mutex_unlock(&migf->lock);
+}
+
+static int nvmevf_release_file(struct inode *inode, struct file *filp)
+{
+	struct nvmevf_migration_file *migf = filp->private_data;
+
+	nvmevf_disable_fd(migf);
+	mutex_destroy(&migf->lock);
+	kfree(migf);
+	return 0;
+}
+
+static ssize_t nvmevf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
+{
+	struct nvmevf_migration_file *migf = filp->private_data;
+	ssize_t done = 0;
+	int ret;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	mutex_lock(&migf->lock);
+	if (*pos > migf->total_length) {
+		done = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (migf->disabled) {
+		done = -EINVAL;
+		goto out_unlock;
+	}
+
+	len = min_t(size_t, migf->total_length - *pos, len);
+	if (len) {
+		ret = copy_to_user(buf, migf->vf_data + *pos, len);
+		if (ret) {
+			done = -EFAULT;
+			goto out_unlock;
+		}
+		*pos += len;
+		done = len;
+	}
+
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return done;
+}
+
+static const struct file_operations nvmevf_save_fops = {
+	.owner = THIS_MODULE,
+	.read = nvmevf_save_read,
+	.release = nvmevf_release_file,
+	.llseek = no_llseek,
+};
+
+static ssize_t nvmevf_resume_write(struct file *filp, const char __user *buf,
+				       size_t len, loff_t *pos)
+{
+	struct nvmevf_migration_file *migf = filp->private_data;
+	loff_t requested_length;
+	ssize_t done = 0;
+	int ret;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	if (*pos < 0 || check_add_overflow((loff_t)len, *pos, &requested_length))
+		return -EINVAL;
+
+	if (requested_length > MAX_MIGRATION_SIZE)
+		return -ENOMEM;
+	mutex_lock(&migf->lock);
+	if (migf->disabled) {
+		done = -ENODEV;
+		goto out_unlock;
+	}
+
+	ret = copy_from_user(migf->vf_data + *pos, buf, len);
+	if (ret) {
+		done = -EFAULT;
+		goto out_unlock;
+	}
+	*pos += len;
+	done = len;
+	migf->total_length += len;
+
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return done;
+}
+
+static const struct file_operations nvmevf_resume_fops = {
+	.owner = THIS_MODULE,
+	.write = nvmevf_resume_write,
+	.release = nvmevf_release_file,
+	.llseek = no_llseek,
+};
+
+static void nvmevf_disable_fds(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	if (nvmevf_dev->resuming_migf) {
+		nvmevf_disable_fd(nvmevf_dev->resuming_migf);
+		fput(nvmevf_dev->resuming_migf->filp);
+		nvmevf_dev->resuming_migf = NULL;
+	}
+
+	if (nvmevf_dev->saving_migf) {
+		nvmevf_disable_fd(nvmevf_dev->saving_migf);
+		fput(nvmevf_dev->saving_migf->filp);
+		nvmevf_dev->saving_migf = NULL;
+	}
+}
+
+static struct nvmevf_migration_file *
+nvmevf_pci_resume_device_data(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	struct nvmevf_migration_file *migf;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("nvmevf_mig", &nvmevf_resume_fops, migf,
+					O_WRONLY);
+	if (IS_ERR(migf->filp)) {
+		int err = PTR_ERR(migf->filp);
+
+		kfree(migf);
+		return ERR_PTR(err);
+	}
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+
+	/* Allocate the buffer to load the device states; the maximum size is 256KB */
+	migf->vf_data = kvzalloc(MAX_MIGRATION_SIZE, GFP_KERNEL);
+	if (!migf->vf_data) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	return migf;
+
+out_free:
+	fput(migf->filp);
+	return ERR_PTR(ret);
+}
+
+static struct nvmevf_migration_file *
+nvmevf_pci_save_device_data(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	struct nvmevf_migration_file *migf;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("nvmevf_mig", &nvmevf_save_fops, migf,
+					O_RDONLY);
+	if (IS_ERR(migf->filp)) {
+		int err = PTR_ERR(migf->filp);
+
+		kfree(migf);
+		return ERR_PTR(err);
+	}
+
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+
+	ret = nvmevf_cmd_query_data_size(nvmevf_dev, &migf->total_length);
+	if (ret)
+		goto out_free;
+	/* Allocate buffer and save the device states */
+	migf->vf_data = kvzalloc(migf->total_length, GFP_KERNEL);
+	if (!migf->vf_data) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	ret = nvmevf_cmd_save_data(nvmevf_dev, migf->vf_data, migf->total_length);
+	if (ret)
+		goto out_free;
+
+	return migf;
+out_free:
+	fput(migf->filp);
+	return ERR_PTR(ret);
+}
+
+static struct file *
+nvmevf_pci_step_device_state_locked(struct nvmevf_pci_core_device *nvmevf_dev, u32 new)
+{
+	u32 cur = nvmevf_dev->mig_state;
+	int ret;
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) {
+		ret = nvmevf_cmd_suspend_device(nvmevf_dev);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+		struct nvmevf_migration_file *migf;
+
+		migf = nvmevf_pci_save_device_data(nvmevf_dev);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		nvmevf_dev->saving_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
+		nvmevf_disable_fds(nvmevf_dev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+		struct nvmevf_migration_file *migf;
+
+		migf = nvmevf_pci_resume_device_data(nvmevf_dev);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		nvmevf_dev->resuming_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+		ret = nvmevf_cmd_load_data(nvmevf_dev, nvmevf_dev->resuming_migf);
+		if (ret)
+			return ERR_PTR(ret);
+		nvmevf_disable_fds(nvmevf_dev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) {
+		ret = nvmevf_cmd_resume_device(nvmevf_dev);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	/* vfio_mig_get_next_state() does not use arcs other than the above */
+	WARN_ON(true);
+	return ERR_PTR(-EINVAL);
+}
+
+static void nvmevf_state_mutex_unlock(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+again:
+	spin_lock(&nvmevf_dev->reset_lock);
+	if (nvmevf_dev->deferred_reset) {
+		nvmevf_dev->deferred_reset = false;
+		spin_unlock(&nvmevf_dev->reset_lock);
+		nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+		nvmevf_disable_fds(nvmevf_dev);
+		goto again;
+	}
+	mutex_unlock(&nvmevf_dev->state_mutex);
+	spin_unlock(&nvmevf_dev->reset_lock);
+}
+
+static struct file *
+nvmevf_pci_set_device_state(struct vfio_device *vdev, enum vfio_device_mig_state new_state)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev = container_of(vdev,
+			struct nvmevf_pci_core_device, core_device.vdev);
+	enum vfio_device_mig_state next_state;
+	struct file *res = NULL;
+	int ret;
+
+	mutex_lock(&nvmevf_dev->state_mutex);
+	while (new_state != nvmevf_dev->mig_state) {
+		ret = vfio_mig_get_next_state(vdev, nvmevf_dev->mig_state, new_state, &next_state);
+		if (ret) {
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+
+		res = nvmevf_pci_step_device_state_locked(nvmevf_dev, next_state);
+		if (IS_ERR(res))
+			break;
+		nvmevf_dev->mig_state = next_state;
+		if (WARN_ON(res && new_state != nvmevf_dev->mig_state)) {
+			fput(res);
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	nvmevf_state_mutex_unlock(nvmevf_dev);
+	return res;
+}
+
+static int nvmevf_pci_get_device_state(struct vfio_device *vdev,
+					   enum vfio_device_mig_state *curr_state)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev = container_of(
+			vdev, struct nvmevf_pci_core_device, core_device.vdev);
+
+	mutex_lock(&nvmevf_dev->state_mutex);
+	*curr_state = nvmevf_dev->mig_state;
+	nvmevf_state_mutex_unlock(nvmevf_dev);
+	return 0;
+}
 
 static int nvmevf_pci_open_device(struct vfio_device *core_vdev)
 {
-	struct vfio_pci_core_device *vdev =
-		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+	struct nvmevf_pci_core_device *nvmevf_dev = container_of(
+			core_vdev, struct nvmevf_pci_core_device, core_device.vdev);
+	struct vfio_pci_core_device *vdev = &nvmevf_dev->core_device;
 	int ret;
 
 	ret = vfio_pci_core_enable(vdev);
 	if (ret)
 		return ret;
 
+	if (nvmevf_dev->migrate_cap)
+		nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING;
 	vfio_pci_core_finish_enable(vdev);
 	return 0;
 }
 
+static void nvmevf_cmd_close_migratable(struct nvmevf_pci_core_device *nvmevf_dev)
+{
+	if (!nvmevf_dev->migrate_cap)
+		return;
+
+	mutex_lock(&nvmevf_dev->state_mutex);
+	nvmevf_disable_fds(nvmevf_dev);
+	nvmevf_state_mutex_unlock(nvmevf_dev);
+}
+
+static void nvmevf_pci_close_device(struct vfio_device *core_vdev)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev = container_of(
+			core_vdev, struct nvmevf_pci_core_device, core_device.vdev);
+
+	nvmevf_cmd_close_migratable(nvmevf_dev);
+	vfio_pci_core_close_device(core_vdev);
+}
+
+static const struct vfio_migration_ops nvmevf_pci_mig_ops = {
+	.migration_set_state = nvmevf_pci_set_device_state,
+	.migration_get_state = nvmevf_pci_get_device_state,
+};
+
+static int nvmevf_migration_init_dev(struct vfio_device *core_vdev)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev = container_of(core_vdev,
+					struct nvmevf_pci_core_device, core_device.vdev);
+	struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
+	int vf_id;
+
+	if (!pdev->is_virtfn)
+		return -EINVAL;
+
+	nvmevf_dev->migrate_cap = 1;
+
+	vf_id = pci_iov_vf_id(pdev);
+	if (vf_id < 0)
+		return vf_id;
+	nvmevf_dev->vf_id = vf_id + 1;
+	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY;
+
+	mutex_init(&nvmevf_dev->state_mutex);
+	spin_lock_init(&nvmevf_dev->reset_lock);
+	core_vdev->mig_ops = &nvmevf_pci_mig_ops;
+
+	return vfio_pci_core_init_dev(core_vdev);
+}
+
 static const struct vfio_device_ops nvmevf_pci_ops = {
 	.name = "nvme-vfio-pci",
-	.init = vfio_pci_core_init_dev,
+	.init = nvmevf_migration_init_dev,
 	.release = vfio_pci_core_release_dev,
 	.open_device = nvmevf_pci_open_device,
-	.close_device = vfio_pci_core_close_device,
+	.close_device = nvmevf_pci_close_device,
 	.ioctl = vfio_pci_core_ioctl,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read = vfio_pci_core_read,
@@ -47,32 +521,56 @@ static const struct vfio_device_ops nvmevf_pci_ops = {
 
 static int nvmevf_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
-	struct vfio_pci_core_device *vdev;
+	struct nvmevf_pci_core_device *nvmevf_dev;
 	int ret;
 
-	vdev = vfio_alloc_device(vfio_pci_core_device, vdev, &pdev->dev,
-				&nvmevf_pci_ops);
-	if (IS_ERR(vdev))
-		return PTR_ERR(vdev);
+	nvmevf_dev = vfio_alloc_device(nvmevf_pci_core_device, core_device.vdev,
+					&pdev->dev, &nvmevf_pci_ops);
+	if (IS_ERR(nvmevf_dev))
+		return PTR_ERR(nvmevf_dev);
 
-	dev_set_drvdata(&pdev->dev, vdev);
-	ret = vfio_pci_core_register_device(vdev);
+	dev_set_drvdata(&pdev->dev, &nvmevf_dev->core_device);
+	ret = vfio_pci_core_register_device(&nvmevf_dev->core_device);
 	if (ret)
 		goto out_put_dev;
 
 	return 0;
 
 out_put_dev:
-	vfio_put_device(&vdev->vdev);
+	vfio_put_device(&nvmevf_dev->core_device.vdev);
 	return ret;
 }
 
 static void nvmevf_pci_remove(struct pci_dev *pdev)
 {
-	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+	struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
+
+	vfio_pci_core_unregister_device(&nvmevf_dev->core_device);
+	vfio_put_device(&nvmevf_dev->core_device.vdev);
+}
+
+static void nvmevf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
+
+	if (!nvmevf_dev->migrate_cap)
+		return;
 
-	vfio_pci_core_unregister_device(vdev);
-	vfio_put_device(&vdev->vdev);
+	/*
+	 * As the higher VFIO layers are holding locks across reset and using
+	 * those same locks with the mm_lock, we need to prevent an ABBA
+	 * deadlock between the state_mutex and the mm_lock.
+	 * In case the state_mutex was taken already, we defer the cleanup work
+	 * to the unlock flow of the other running context.
+	 */
+	spin_lock(&nvmevf_dev->reset_lock);
+	nvmevf_dev->deferred_reset = true;
+	if (!mutex_trylock(&nvmevf_dev->state_mutex)) {
+		spin_unlock(&nvmevf_dev->reset_lock);
+		return;
+	}
+	spin_unlock(&nvmevf_dev->reset_lock);
+	nvmevf_state_mutex_unlock(nvmevf_dev);
 }
 
 static const struct pci_device_id nvmevf_pci_table[] = {
@@ -83,12 +581,17 @@ static const struct pci_device_id nvmevf_pci_table[] = {
 
 MODULE_DEVICE_TABLE(pci, nvmevf_pci_table);
 
+static const struct pci_error_handlers nvmevf_err_handlers = {
+	.reset_done = nvmevf_pci_aer_reset_done,
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+
 static struct pci_driver nvmevf_pci_driver = {
 	.name = KBUILD_MODNAME,
 	.id_table = nvmevf_pci_table,
 	.probe = nvmevf_pci_probe,
 	.remove = nvmevf_pci_remove,
-	.err_handler = &vfio_pci_core_err_handlers,
+	.err_handler = &nvmevf_err_handlers,
 	.driver_managed_dma = true,
 };
 
@@ -96,4 +599,4 @@ module_pci_driver(nvmevf_pci_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Lei Rao <lei.rao@...el.com>");
-MODULE_DESCRIPTION("NVMe VFIO PCI - Generic VFIO PCI driver for NVMe");
+MODULE_DESCRIPTION("NVMe VFIO PCI - VFIO PCI driver with live migration support for NVMe");
diff --git a/drivers/vfio/pci/nvme/nvme.h b/drivers/vfio/pci/nvme/nvme.h
new file mode 100644
index 000000000000..c8464554ef53
--- /dev/null
+++ b/drivers/vfio/pci/nvme/nvme.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
+ */
+
+#ifndef NVME_VFIO_PCI_H
+#define NVME_VFIO_PCI_H
+
+#include <linux/kernel.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/nvme.h>
+
+struct nvme_live_mig_query_size {
+	__u8	opcode;
+	__u8	flags;
+	__u16	command_id;
+	__u32	rsvd1[9];
+	__u16	vf_index;
+	__u16	rsvd2;
+	__u32	rsvd3[5];
+};
+
+struct nvme_live_mig_suspend {
+	__u8	opcode;
+	__u8	flags;
+	__u16	command_id;
+	__u32	rsvd1[9];
+	__u16	vf_index;
+	__u16	rsvd2;
+	__u32	rsvd3[5];
+};
+
+struct nvme_live_mig_resume {
+	__u8    opcode;
+	__u8    flags;
+	__u16   command_id;
+	__u32   rsvd1[9];
+	__u16   vf_index;
+	__u16   rsvd2;
+	__u32   rsvd3[5];
+};
+
+struct nvme_live_mig_save_data {
+	__u8	opcode;
+	__u8	flags;
+	__u16	command_id;
+	__u32	rsvd1[5];
+	__le64	prp1;
+	__le64	prp2;
+	__u16	vf_index;
+	__u16	rsvd2;
+	__u32	rsvd3[5];
+};
+
+struct nvme_live_mig_load_data {
+	__u8    opcode;
+	__u8    flags;
+	__u16   command_id;
+	__u32   rsvd1[5];
+	__le64  prp1;
+	__le64  prp2;
+	__u16   vf_index;
+	__u16	rsvd2;
+	__u32	size;
+	__u32   rsvd3[4];
+};
+
+enum nvme_live_mig_admin_opcode {
+	nvme_admin_live_mig_query_data_size	= 0xC4,
+	nvme_admin_live_mig_suspend		= 0xC8,
+	nvme_admin_live_mig_resume		= 0xCC,
+	nvme_admin_live_mig_save_data		= 0xD2,
+	nvme_admin_live_mig_load_data		= 0xD5,
+};
+
+struct nvme_live_mig_command {
+	union {
+		struct nvme_live_mig_query_size query;
+		struct nvme_live_mig_suspend	suspend;
+		struct nvme_live_mig_resume	resume;
+		struct nvme_live_mig_save_data	save;
+		struct nvme_live_mig_load_data	load;
+	};
+};
+
+struct nvmevf_migration_file {
+	struct file *filp;
+	struct mutex lock;
+	bool disabled;
+	u8 *vf_data;
+	size_t total_length;
+};
+
+struct nvmevf_pci_core_device {
+	struct vfio_pci_core_device core_device;
+	int vf_id;
+	u8 migrate_cap:1;
+	u8 deferred_reset:1;
+	/* protect migration state */
+	struct mutex state_mutex;
+	enum vfio_device_mig_state mig_state;
+	/* protect the reset_done flow */
+	spinlock_t reset_lock;
+	struct nvmevf_migration_file *resuming_migf;
+	struct nvmevf_migration_file *saving_migf;
+};
+
+extern int nvme_submit_vf_cmd(struct pci_dev *dev, struct nvme_command *cmd,
+			size_t *result, void *buffer, unsigned int bufflen);
+
+#endif /* NVME_VFIO_PCI_H */
-- 
2.34.1
