Message-ID: <5819EA34.1020607@intel.com>
Date: Wed, 02 Nov 2016 21:29:24 +0800
From: Jike Song <jike.song@...el.com>
To: Kirti Wankhede <kwankhede@...dia.com>
CC: alex.williamson@...hat.com, pbonzini@...hat.com, kraxel@...hat.com,
cjia@...dia.com, qemu-devel@...gnu.org, kvm@...r.kernel.org,
kevin.tian@...el.com, bjsdjshi@...ux.vnet.ibm.com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v10 09/19] vfio iommu type1: Add support for mediated
devices

On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
> VFIO IOMMU drivers are designed for devices which are IOMMU capable.
> A mediated device only uses the IOMMU APIs; the underlying hardware can be
> managed by an IOMMU domain.
>
> The aim of this change is:
> - To reuse most of the TYPE1 IOMMU driver code for mediated devices
> - To support directly assigned devices and mediated devices in a single module
>
> This change adds pin and unpin support for mediated devices to the TYPE1
> IOMMU backend module. More details:
> - When an iommu_group of mediated devices is attached, the task structure is
>   cached; it is used later for pinning pages and page accounting.
> - Pinned pages are tracked for the mediated domain. This data is used to
>   verify unpin requests and to unpin any remaining pages while detaching.
> - The existing mechanism is used for page accounting. If an IOMMU-capable
>   domain exists in the container, all pages are already pinned and accounted.
>   Accounting for an mdev device is only done if there is no IOMMU-capable
>   domain in the container.
> - Page accounting is updated on hot plug and unplug of mdev and pass-through
>   devices.
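
Side note on the accounting rule in the last two points, just to check my
understanding: it boils down to charging a page against locked_vm at most
once per container. A hypothetical helper (not in this patch, the name is
mine) restating that rule would be:

	static bool mdev_pin_needs_accounting(struct vfio_iommu *iommu)
	{
		/*
		 * With an IOMMU-capable domain in the container, every page
		 * was already pinned and accounted when the type1 DMA mapping
		 * was established, so pinning for an mdev device must not
		 * charge locked_vm again.
		 */
		return !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
	}
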
>
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug of a vGPU device while a GPU pass-through
> device exists
> - Linux VM hot plug and unplug of a GPU pass-through device while a vGPU
> device exists
>
> Signed-off-by: Kirti Wankhede <kwankhede@...dia.com>
> Signed-off-by: Neo Jia <cjia@...dia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
> drivers/vfio/vfio_iommu_type1.c | 646 +++++++++++++++++++++++++++++++++++-----
> 1 file changed, 571 insertions(+), 75 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 861ac2a1b0c3..5add11a147e1 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -36,6 +36,7 @@
> #include <linux/uaccess.h>
> #include <linux/vfio.h>
> #include <linux/workqueue.h>
> +#include <linux/mdev.h>
>
> #define DRIVER_VERSION "0.2"
> #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@...hat.com>"
> @@ -55,18 +56,26 @@ MODULE_PARM_DESC(disable_hugepages,
>
> struct vfio_iommu {
> struct list_head domain_list;
> + struct vfio_domain *external_domain; /* domain for external user */
> struct mutex lock;
> struct rb_root dma_list;
> bool v2;
> bool nesting;
> };
>
> +struct external_addr_space {
> + struct task_struct *task;
> + struct rb_root pfn_list; /* pinned Host pfn list */
> + struct mutex pfn_list_lock; /* mutex for pfn_list */
> +};
> +
> struct vfio_domain {
> - struct iommu_domain *domain;
> - struct list_head next;
> - struct list_head group_list;
> - int prot; /* IOMMU_CACHE */
> - bool fgsp; /* Fine-grained super pages */
> + struct iommu_domain *domain;
> + struct list_head next;
> + struct list_head group_list;
> + struct external_addr_space *external_addr_space;
> + int prot; /* IOMMU_CACHE */
> + bool fgsp; /* Fine-grained super pages */
> };
>
> struct vfio_dma {
> @@ -75,6 +84,7 @@ struct vfio_dma {
> unsigned long vaddr; /* Process virtual addr */
> size_t size; /* Map size (bytes) */
> int prot; /* IOMMU_READ/WRITE */
> + bool iommu_mapped;
> };
>
> struct vfio_group {
> @@ -83,6 +93,21 @@ struct vfio_group {
> };
>
> /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> + struct rb_node node;
> + unsigned long vaddr; /* virtual addr */
> + dma_addr_t iova; /* IOVA */
> + unsigned long pfn; /* Host pfn */
> + int prot;
> + atomic_t ref_count;
> +};
> +
> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
> + (!list_empty(&iommu->domain_list))
> +
> +/*
> * This code handles mapping and unmapping of user data buffers
> * into DMA'ble space using the IOMMU
> */
> @@ -130,6 +155,101 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
> rb_erase(&old->node, &iommu->dma_list);
> }
>
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> + unsigned long pfn)
> +{
> + struct rb_node *node;
> + struct vfio_pfn *vpfn;
> +
> + node = domain->external_addr_space->pfn_list.rb_node;
> +
> + while (node) {
> + vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> + if (pfn < vpfn->pfn)
> + node = node->rb_left;
> + else if (pfn > vpfn->pfn)
> + node = node->rb_right;
> + else
> + return vpfn;
> + }
> +
> + return NULL;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> + struct rb_node **link, *parent = NULL;
> + struct vfio_pfn *vpfn;
> +
> + link = &domain->external_addr_space->pfn_list.rb_node;
> + while (*link) {
> + parent = *link;
> + vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> + if (new->pfn < vpfn->pfn)
> + link = &(*link)->rb_left;
> + else
> + link = &(*link)->rb_right;
> + }
> +
> + rb_link_node(&new->node, parent, link);
> + rb_insert_color(&new->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> + rb_erase(&old->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> + dma_addr_t iova, unsigned long pfn, int prot)
> +{
> + struct vfio_pfn *vpfn;
> +
> + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> + if (!vpfn)
> + return -ENOMEM;
> +
> + vpfn->vaddr = vaddr;
> + vpfn->iova = iova;
> + vpfn->pfn = pfn;
> + vpfn->prot = prot;
> + atomic_set(&vpfn->ref_count, 1);
> + vfio_link_pfn(domain, vpfn);
> + return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> + struct vfio_pfn *vpfn)
> +{
> + vfio_unlink_pfn(domain, vpfn);
> + kfree(vpfn);
> +}
> +
> +static int vfio_pfn_account(struct vfio_iommu *iommu, unsigned long pfn)
> +{
> + struct vfio_pfn *p;
> + struct vfio_domain *domain = iommu->external_domain;
> + int ret = 1;
> +
> + if (!domain)
> + return 1;
> +
> + mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +
> + p = vfio_find_pfn(domain, pfn);
> + if (p)
> + ret = 0;
> +
> + mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> + return ret;
> +}
> +
> struct vwork {
> struct mm_struct *mm;
> long npage;
> @@ -270,12 +390,13 @@ static int vaddr_get_pfn(struct mm_struct *remote_mm, unsigned long vaddr,
> * the iommu can only map chunks of consecutive pfns anyway, so get the
> * first page and all consecutive pages with the same locking.
> */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> - int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(struct vfio_iommu *iommu,
> + unsigned long vaddr, long npage,
> + int prot, unsigned long *pfn_base)
> {
> unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> bool lock_cap = capable(CAP_IPC_LOCK);
> - long ret, i;
> + long ret, i, lock_acct = 0;
> bool rsvd;
>
> if (!current->mm)
> @@ -285,9 +406,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> if (ret)
> return ret;
>
> + lock_acct = vfio_pfn_account(iommu, *pfn_base);
> +
> rsvd = is_invalid_reserved_pfn(*pfn_base);
>
> - if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
> + if (!rsvd && !lock_cap && current->mm->locked_vm + lock_acct > limit) {
> put_pfn(*pfn_base, prot);
> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> limit << PAGE_SHIFT);
> @@ -314,8 +437,10 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> break;
> }
>
> + lock_acct += vfio_pfn_account(iommu, pfn);
> +
> if (!rsvd && !lock_cap &&
> - current->mm->locked_vm + i + 1 > limit) {
> + current->mm->locked_vm + lock_acct > limit) {
> put_pfn(pfn, prot);
> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> __func__, limit << PAGE_SHIFT);
> @@ -329,14 +454,19 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> return i;
> }
>
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> - int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
> + unsigned long pfn, long npage, int prot,
> + bool do_accounting)
> {
> - unsigned long unlocked = 0;
> + unsigned long unlocked = 0, unlock_acct = 0;
> long i;
>
> - for (i = 0; i < npage; i++)
> + for (i = 0; i < npage; i++) {
> + if (do_accounting)
> + unlock_acct += vfio_pfn_account(iommu, pfn);
> +
> unlocked += put_pfn(pfn++, prot);
> + }
>
> if (do_accounting)
> vfio_lock_acct(current, -unlocked);
> @@ -344,14 +474,208 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
> return unlocked;
> }
>
> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +static long __vfio_pin_page_external(struct vfio_domain *domain,
> + unsigned long vaddr, int prot,
> + unsigned long *pfn_base,
> + bool do_accounting)
> +{
> + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> + bool lock_cap = capable(CAP_IPC_LOCK);
> + long ret;
> + bool rsvd;
> + struct task_struct *task = domain->external_addr_space->task;
> +
> + if (!task->mm)
> + return -ENODEV;
> +
> + ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);

Hi Kirti,

In your implementation of vaddr_get_pfn(), as long as the first argument is
not NULL, you call get_user_pages_remote() rather than get_user_pages_fast().
But there is still the possibility that @task is actually @current, so that
task->mm == current->mm. That should not be a rare case; it is probably the
common one: the QEMU process emulates an MMIO read/write and calls
vfio_pin_pages() from its own process context.

If I read the code correctly, the current implementation always takes the
slow, locked GUP path, no matter which process context it runs in.
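
Would it make sense to special-case that? Something like the sketch below
(untested, just to illustrate the idea; the exact GUP signatures depend on
the kernel version this is based on):

	struct mm_struct *mm = remote_mm ? remote_mm : current->mm;

	if (mm == current->mm) {
		/* same address space: fast GUP, no mmap_sem needed */
		ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE),
					  page);
	} else {
		/* truly remote mm: take mmap_sem and use the slow path */
		down_read(&mm->mmap_sem);
		ret = get_user_pages_remote(NULL, mm, vaddr, 1,
					    (prot & IOMMU_WRITE) ? FOLL_WRITE : 0,
					    page, NULL);
		up_read(&mm->mmap_sem);
	}

That would keep the fast path for what is by far the most common case, while
still supporting a genuinely remote mm.
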
--
Thanks,
Jike