linux-kernel - Re: [PATCH v10 09/19] vfio iommu type1: Add support for mediated devices

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20161027170116.7f47e3b6@t450s.home>
Date:   Thu, 27 Oct 2016 17:01:16 -0600
From:   Alex Williamson <alex.williamson@...hat.com>
To:     Kirti Wankhede <kwankhede@...dia.com>
Cc:     <pbonzini@...hat.com>, <kraxel@...hat.com>, <cjia@...dia.com>,
        <qemu-devel@...gnu.org>, <kvm@...r.kernel.org>,
        <kevin.tian@...el.com>, <jike.song@...el.com>,
        <bjsdjshi@...ux.vnet.ibm.com>, <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v10 09/19] vfio iommu type1: Add support for mediated
 devices

On Thu, 27 Oct 2016 02:59:16 +0530
Kirti Wankhede <kwankhede@...dia.com> wrote:

> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - When iommu_group of mediated devices is attached, task structure is
>   cached which is used later to pin pages and page accounting.
> - It keeps track of pinned pages for mediated domain. This data is used to
>   verify unpinning request and to unpin remaining pages while detaching, if
>   there are any.
> - Used existing mechanism for page accounting. If iommu capable domain
>   exist in the container then all pages are already pinned and accounted.
>   Accouting for mdev device is only done if there is no iommu capable
>   domain in the container.
> - Page accouting is updated on hot plug and unplug mdev device and pass
>   through device.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>   exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>   exist
> 
> Signed-off-by: Kirti Wankhede <kwankhede@...dia.com>
> Signed-off-by: Neo Jia <cjia@...dia.com>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
>  drivers/vfio/vfio_iommu_type1.c | 646 +++++++++++++++++++++++++++++++++++-----
>  1 file changed, 571 insertions(+), 75 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 861ac2a1b0c3..5add11a147e1 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -36,6 +36,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
> +#include <linux/mdev.h>
>  
>  #define DRIVER_VERSION  "0.2"
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@...hat.com>"
> @@ -55,18 +56,26 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>  	struct list_head	domain_list;
> +	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
>  	bool			v2;
>  	bool			nesting;
>  };
>  
> +struct external_addr_space {
> +	struct task_struct	*task;
> +	struct rb_root		pfn_list;	/* pinned Host pfn list */
> +	struct mutex		pfn_list_lock;	/* mutex for pfn_list */
> +};
> +
>  struct vfio_domain {
> -	struct iommu_domain	*domain;
> -	struct list_head	next;
> -	struct list_head	group_list;
> -	int			prot;		/* IOMMU_CACHE */
> -	bool			fgsp;		/* Fine-grained super pages */
> +	struct iommu_domain		*domain;
> +	struct list_head		next;
> +	struct list_head		group_list;
> +	struct external_addr_space	*external_addr_space;
> +	int				prot;	/* IOMMU_CACHE */
> +	bool				fgsp;	/* Fine-grained super pages */
>  };
>  
>  struct vfio_dma {
> @@ -75,6 +84,7 @@ struct vfio_dma {
>  	unsigned long		vaddr;		/* Process virtual addr */
>  	size_t			size;		/* Map size (bytes) */
>  	int			prot;		/* IOMMU_READ/WRITE */
> +	bool			iommu_mapped;
>  };
>  
>  struct vfio_group {
> @@ -83,6 +93,21 @@ struct vfio_group {
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +	struct rb_node		node;
> +	unsigned long		vaddr;		/* virtual addr */
> +	dma_addr_t		iova;		/* IOVA */
> +	unsigned long		pfn;		/* Host pfn */
> +	int			prot;
> +	atomic_t		ref_count;
> +};
> +
> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> +					(!list_empty(&iommu->domain_list))
> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -130,6 +155,101 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> +				      unsigned long pfn)
> +{
> +	struct rb_node *node;
> +	struct vfio_pfn *vpfn;
> +
> +	node = domain->external_addr_space->pfn_list.rb_node;
> +
> +	while (node) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if (pfn < vpfn->pfn)
> +			node = node->rb_left;
> +		else if (pfn > vpfn->pfn)
> +			node = node->rb_right;
> +		else
> +			return vpfn;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> +	struct rb_node **link, *parent = NULL;
> +	struct vfio_pfn *vpfn;
> +
> +	link = &domain->external_addr_space->pfn_list.rb_node;
> +	while (*link) {
> +		parent = *link;
> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +		if (new->pfn < vpfn->pfn)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> +	rb_erase(&old->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> +				dma_addr_t iova, unsigned long pfn, int prot)
> +{
> +	struct vfio_pfn *vpfn;
> +
> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> +	if (!vpfn)
> +		return -ENOMEM;
> +
> +	vpfn->vaddr = vaddr;
> +	vpfn->iova = iova;
> +	vpfn->pfn = pfn;
> +	vpfn->prot = prot;
> +	atomic_set(&vpfn->ref_count, 1);
> +	vfio_link_pfn(domain, vpfn);
> +	return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> +				      struct vfio_pfn *vpfn)
> +{
> +	vfio_unlink_pfn(domain, vpfn);
> +	kfree(vpfn);
> +}
> +
> +static int vfio_pfn_account(struct vfio_iommu *iommu, unsigned long pfn)
> +{
> +	struct vfio_pfn *p;
> +	struct vfio_domain *domain = iommu->external_domain;
> +	int ret = 1;
> +
> +	if (!domain)
> +		return 1;
> +
> +	mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +
> +	p = vfio_find_pfn(domain, pfn);
> +	if (p)
> +		ret = 0;
> +
> +	mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> +	return ret;
> +}
> +
>  struct vwork {
>  	struct mm_struct	*mm;
>  	long			npage;
> @@ -270,12 +390,13 @@ static int vaddr_get_pfn(struct mm_struct *remote_mm, unsigned long vaddr,
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> -			   int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(struct vfio_iommu *iommu,
> +				    unsigned long vaddr, long npage,
> +				    int prot, unsigned long *pfn_base)
>  {
>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
> -	long ret, i;
> +	long ret, i, lock_acct = 0;


Initialization to zero is unnecessary here


>  	bool rsvd;
>  
>  	if (!current->mm)
> @@ -285,9 +406,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	if (ret)
>  		return ret;
>  
> +	lock_acct = vfio_pfn_account(iommu, *pfn_base);
> +
>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
>  
> -	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
> +	if (!rsvd && !lock_cap && current->mm->locked_vm + lock_acct > limit) {
>  		put_pfn(*pfn_base, prot);
>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>  			limit << PAGE_SHIFT);
> @@ -314,8 +437,10 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  			break;
>  		}
>  
> +		lock_acct += vfio_pfn_account(iommu, pfn);
> +
>  		if (!rsvd && !lock_cap &&
> -		    current->mm->locked_vm + i + 1 > limit) {
> +		    current->mm->locked_vm + lock_acct > limit) {
>  			put_pfn(pfn, prot);
>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  				__func__, limit << PAGE_SHIFT);
> @@ -329,14 +454,19 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	return i;
>  }
>  
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> -			     int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
> +				      unsigned long pfn, long npage, int prot,
> +				      bool do_accounting)
>  {
> -	unsigned long unlocked = 0;
> +	unsigned long unlocked = 0, unlock_acct = 0;
>  	long i;
>  
> -	for (i = 0; i < npage; i++)
> +	for (i = 0; i < npage; i++) {
> +		if (do_accounting)
> +			unlock_acct += vfio_pfn_account(iommu, pfn);
> +
>  		unlocked += put_pfn(pfn++, prot);
> +	}


Something is wrong here, we never do anything with unlock_acct

>  
>  	if (do_accounting)
>  		vfio_lock_acct(current, -unlocked);
> @@ -344,14 +474,208 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
>  	return unlocked;
>  }
>  
> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +static long __vfio_pin_page_external(struct vfio_domain *domain,
> +				     unsigned long vaddr, int prot,
> +				     unsigned long *pfn_base,
> +				     bool do_accounting)
> +{
> +	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	bool lock_cap = capable(CAP_IPC_LOCK);

capable() checks if the current user has this capability, but this path
is not user invoked.  Whose capabilities are we testing here?

> +	long ret;
> +	bool rsvd;
> +	struct task_struct *task = domain->external_addr_space->task;

@task is set from @current when the group is attached, did we get a
reference to that task, how do we know it's still valid?  Could a given
vfio_dma use a different task_struct?  I think these are the kinds of
issues Alexey was noting.

> +
> +	if (!task->mm)
> +		return -ENODEV;
> +
> +	ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);
> +	if (ret)
> +		return ret;
> +
> +	rsvd = is_invalid_reserved_pfn(*pfn_base);
> +
> +	if (!rsvd && !lock_cap && task->mm->locked_vm + 1 > limit) {
> +		put_pfn(*pfn_base, prot);
> +		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> +			limit << PAGE_SHIFT);

This happens when the vendor driver tries to pin pages, asynchronous to
any discernible user action, what are we supposed to do when we see
this in dmesg?  How can we actually figure out what this is associated
with?

> +		return -ENOMEM;
> +	}
> +
> +	if (!rsvd && do_accounting)
> +		vfio_lock_acct(task, 1);
> +
> +	return 1;
> +}
> +
> +static void __vfio_unpin_page_external(struct vfio_domain *domain,
> +				       unsigned long pfn, int prot,
> +				       bool do_accounting)
> +{
> +	put_pfn(pfn, prot);
> +
> +	if (do_accounting)
> +		vfio_lock_acct(domain->external_addr_space->task, -1);


What if this task no longer exists.


> +}
> +
> +static int vfio_unpin_pfn(struct vfio_domain *domain,
> +			  struct vfio_pfn *vpfn, bool do_accounting)
> +{
> +	__vfio_unpin_page_external(domain, vpfn->pfn, vpfn->prot,
> +				   do_accounting);
> +
> +	if (atomic_dec_and_test(&vpfn->ref_count))
> +		vfio_remove_from_pfn_list(domain, vpfn);
> +
> +	return 1;
> +}
> +
> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
> +				       unsigned long *user_pfn,
> +				       long npage, int prot,
> +				       unsigned long *phys_pfn)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain;
> +	struct external_addr_space *addr_space;
> +	int i, j, ret;
> +	long retpage;
> +	unsigned long remote_vaddr;
> +	unsigned long *pfn = phys_pfn;
> +	struct vfio_dma *dma;
> +	bool do_accounting;
> +
> +	if (!iommu || !user_pfn || !phys_pfn)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	if (!iommu->external_domain) {
> +		ret = -EINVAL;
> +		goto pin_done;
> +	}
> +
> +	domain = iommu->external_domain;
> +	addr_space = domain->external_addr_space;
> +	/*
> +	 * If iommu capable domain exist in the container then all pages are
> +	 * already pinned and accounted. Accouting should be done if there is no
> +	 * iommu capable domain in the container.
> +	 */
> +	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +		dma_addr_t iova;
> +
> +		iova = user_pfn[i] << PAGE_SHIFT;
> +
> +		dma = vfio_find_dma(iommu, iova, 0);
> +		if (!dma) {
> +			ret = -EINVAL;
> +			goto pin_unwind;
> +		}
> +
> +		remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +		retpage = __vfio_pin_page_external(domain, remote_vaddr, prot,
> +						   &pfn[i], do_accounting);
> +		if (retpage <= 0) {
> +			WARN_ON(!retpage);
> +			ret = (int)retpage;
> +			goto pin_unwind;
> +		}
> +
> +		mutex_lock(&addr_space->pfn_list_lock);
> +
> +		/* search if pfn exist */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p) {


if @do_accounting is true, aren't we now double accounting this pfn.  I
think we're trying to pin it twice, but not account for it twice.  Also
the problem with what if the user is running right at their locked
memory limit and the vendor driver pins a page (maybe it's already
pinned by another vendor driver), they're going to cross the limit when
really that page has already been accounted.


> +			atomic_inc(&p->ref_count);
> +			mutex_unlock(&addr_space->pfn_list_lock);
> +			continue;
> +		}
> +
> +		ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
> +					   pfn[i], prot);
> +		mutex_unlock(&addr_space->pfn_list_lock);
> +
> +		if (ret) {
> +			__vfio_unpin_page_external(domain, pfn[i], prot,
> +						   do_accounting);
> +			goto pin_unwind;
> +		}
> +	}
> +
> +	ret = i;
> +	goto pin_done;
> +
> +pin_unwind:
> +	pfn[i] = 0;
> +	mutex_lock(&addr_space->pfn_list_lock);
> +	for (j = 0; j < i; j++) {
> +		struct vfio_pfn *p;
> +
> +		p = vfio_find_pfn(domain, pfn[j]);
> +		if (p)
> +			vfio_unpin_pfn(domain, p, do_accounting);
> +
> +		pfn[j] = 0;
> +	}
> +	mutex_unlock(&addr_space->pfn_list_lock);
> +
> +pin_done:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
> +					 long npage)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +	struct vfio_domain *domain = NULL;
> +	bool do_accounting;
> +	long unlocked = 0;
> +	int i;
> +
> +	if (!iommu || !pfn)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	domain = iommu->external_domain;
> +
> +	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> +
> +	mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +
> +	for (i = 0; i < npage; i++) {
> +		struct vfio_pfn *p;
> +
> +		/* verify if pfn exist in pfn_list */
> +		p = vfio_find_pfn(domain, pfn[i]);
> +		if (p)
> +			unlocked += vfio_unpin_pfn(domain, p, do_accounting);
> +
> +	}
> +	mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> +
> +	mutex_unlock(&iommu->lock);
> +	return unlocked;
> +}
> +
> +static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +			     bool do_accounting)
>  {
>  	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
>  	struct vfio_domain *domain, *d;
>  	long unlocked = 0;
>  
>  	if (!dma->size)
> -		return;
> +		return 0;
> +
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		return 0;
> +
>  	/*
>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>  	 * need a much more complicated tracking system.  Unfortunately that
> @@ -393,20 +717,27 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  		if (WARN_ON(!unmapped))
>  			break;
>  
> -		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> -					     unmapped >> PAGE_SHIFT,
> -					     dma->prot, false);
> +		unlocked += __vfio_unpin_pages_remote(iommu, phys >> PAGE_SHIFT,
> +						      unmapped >> PAGE_SHIFT,
> +						      dma->prot, false);
>  		iova += unmapped;
>  
>  		cond_resched();
>  	}
>  
> -	vfio_lock_acct(current, -unlocked);
> +	dma->iommu_mapped = false;
> +
> +	if (do_accounting) {
> +		vfio_lock_acct(current, -unlocked);
> +		return 0;
> +	}
> +
> +	return unlocked;
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  {
> -	vfio_unmap_unpin(iommu, dma);
> +	vfio_unmap_unpin(iommu, dma, true);
>  	vfio_unlink_dma(iommu, dma);
>  	kfree(dma);
>  }
> @@ -569,17 +900,57 @@ unwind:
>  	return ret;
>  }
>  
> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +			    size_t map_size)
> +{
> +	dma_addr_t iova = dma->iova;
> +	unsigned long vaddr = dma->vaddr;
> +	size_t size = map_size;
> +	long npage;
> +	unsigned long pfn;
> +	int ret = 0;
> +
> +	while (size) {
> +		/* Pin a contiguous chunk of memory */
> +		npage = __vfio_pin_pages_remote(iommu, vaddr + dma->size,
> +						size >> PAGE_SHIFT, dma->prot,
> +						&pfn);
> +		if (npage <= 0) {
> +			WARN_ON(!npage);
> +			ret = (int)npage;
> +			break;
> +		}
> +
> +		/* Map it! */
> +		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
> +				     dma->prot);
> +		if (ret) {
> +			__vfio_unpin_pages_remote(iommu, pfn, npage, dma->prot,
> +						  true);
> +			break;
> +		}
> +
> +		size -= npage << PAGE_SHIFT;
> +		dma->size += npage << PAGE_SHIFT;
> +	}
> +
> +	dma->iommu_mapped = true;
> +
> +	if (ret)
> +		vfio_remove_dma(iommu, dma);
> +
> +	return ret;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  			   struct vfio_iommu_type1_dma_map *map)
>  {
>  	dma_addr_t iova = map->iova;
>  	unsigned long vaddr = map->vaddr;
>  	size_t size = map->size;
> -	long npage;
>  	int ret = 0, prot = 0;
>  	uint64_t mask;
>  	struct vfio_dma *dma;
> -	unsigned long pfn;
>  
>  	/* Verify that none of our __u64 fields overflow */
>  	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -622,29 +993,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  	/* Insert zero-sized and grow as we map chunks of it */
>  	vfio_link_dma(iommu, dma);
>  
> -	while (size) {
> -		/* Pin a contiguous chunk of memory */
> -		npage = vfio_pin_pages(vaddr + dma->size,
> -				       size >> PAGE_SHIFT, prot, &pfn);
> -		if (npage <= 0) {
> -			WARN_ON(!npage);
> -			ret = (int)npage;
> -			break;
> -		}
> -
> -		/* Map it! */
> -		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> -		if (ret) {
> -			vfio_unpin_pages(pfn, npage, prot, true);
> -			break;
> -		}
> -
> -		size -= npage << PAGE_SHIFT;
> -		dma->size += npage << PAGE_SHIFT;
> -	}
> -
> -	if (ret)
> -		vfio_remove_dma(iommu, dma);
> +	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		dma->size = size;
> +	else
> +		ret = vfio_pin_map_dma(iommu, dma, size);
>  
>  	mutex_unlock(&iommu->lock);
>  	return ret;
> @@ -673,10 +1026,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
>  	n = rb_first(&iommu->dma_list);
>  
> -	/* If there's not a domain, there better not be any mappings */
> -	if (WARN_ON(n && !d))
> -		return -EINVAL;
> -
>  	for (; n; n = rb_next(n)) {
>  		struct vfio_dma *dma;
>  		dma_addr_t iova;
> @@ -685,20 +1034,43 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  		iova = dma->iova;
>  
>  		while (iova < dma->iova + dma->size) {
> -			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
> +			phys_addr_t phys;
>  			size_t size;
>  
> -			if (WARN_ON(!phys)) {
> -				iova += PAGE_SIZE;
> -				continue;
> -			}
> +			if (dma->iommu_mapped) {
> +				phys = iommu_iova_to_phys(d->domain, iova);
>  
> -			size = PAGE_SIZE;
> +				if (WARN_ON(!phys)) {
> +					iova += PAGE_SIZE;
> +					continue;
> +				}
>  
> -			while (iova + size < dma->iova + dma->size &&
> -			       phys + size == iommu_iova_to_phys(d->domain,
> +				size = PAGE_SIZE;
> +
> +				while (iova + size < dma->iova + dma->size &&
> +				    phys + size == iommu_iova_to_phys(d->domain,
>  								 iova + size))
> -				size += PAGE_SIZE;
> +					size += PAGE_SIZE;
> +			} else {
> +				unsigned long pfn;
> +				unsigned long vaddr = dma->vaddr +
> +						     (iova - dma->iova);
> +				size_t n = dma->iova + dma->size - iova;
> +				long npage;
> +
> +				npage = __vfio_pin_pages_remote(iommu, vaddr,
> +								n >> PAGE_SHIFT,
> +								dma->prot,
> +								&pfn);
> +				if (npage <= 0) {
> +					WARN_ON(!npage);
> +					ret = (int)npage;
> +					return ret;
> +				}
> +
> +				phys = pfn << PAGE_SHIFT;
> +				size = npage << PAGE_SHIFT;
> +			}
>  
>  			ret = iommu_map(domain->domain, iova, phys,
>  					size, dma->prot | domain->prot);
> @@ -707,6 +1079,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
>  
>  			iova += size;
>  		}
> +
> +		dma->iommu_mapped = true;
>  	}
>  
>  	return 0;
> @@ -764,7 +1138,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_group *group;
>  	struct vfio_domain *domain, *d;
> -	struct bus_type *bus = NULL;
> +	struct bus_type *bus = NULL, *mdev_bus;
>  	int ret;
>  
>  	mutex_lock(&iommu->lock);
> @@ -776,6 +1150,13 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  		}
>  	}
>  
> +	if (iommu->external_domain) {
> +		if (find_iommu_group(iommu->external_domain, iommu_group)) {
> +			mutex_unlock(&iommu->lock);
> +			return -EINVAL;
> +		}
> +	}
> +
>  	group = kzalloc(sizeof(*group), GFP_KERNEL);
>  	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
>  	if (!group || !domain) {
> @@ -790,6 +1171,38 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>  	if (ret)
>  		goto out_free;
>  
> +	mdev_bus = symbol_get(mdev_bus_type);
> +
> +	if (mdev_bus) {
> +		if ((bus == mdev_bus) && !iommu_present(bus)) {
> +			symbol_put(mdev_bus_type);
> +			if (!iommu->external_domain) {
> +				struct external_addr_space *addr_space;
> +
> +				addr_space = kzalloc(sizeof(*addr_space),
> +						     GFP_KERNEL);
> +				if (!addr_space) {
> +					ret = -ENOMEM;
> +					goto out_free;
> +				}
> +
> +				INIT_LIST_HEAD(&domain->group_list);
> +				addr_space->task = current;


Regarding current here, could one process open the vfio container and
vfio group and perform this operation, then pass the file descriptors
for each to another task via SCM_RIGHTS?  I don't think we can prevent
it.  Wouldn't that mean the vendor driver is pinning pages in the
wrong task_struct?  Do we need to track (and reference) the task_struct
for each vfio_dma?  Maybe we need this anyway so that only the
task_struct that performed a DMA_MAP can do a DMA_UNMAP of that range.


> +				addr_space->pfn_list = RB_ROOT;
> +				mutex_init(&addr_space->pfn_list_lock);
> +				domain->external_addr_space = addr_space;
> +				iommu->external_domain = domain;
> +			} else
> +				kfree(domain);
> +
> +			list_add(&group->next,
> +				 &iommu->external_domain->group_list);
> +			mutex_unlock(&iommu->lock);
> +			return 0;
> +		}
> +		symbol_put(mdev_bus_type);
> +	}
> +
>  	domain->domain = iommu_domain_alloc(bus);
>  	if (!domain->domain) {
>  		ret = -EIO;
> @@ -880,6 +1293,42 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>  		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>  }
>  
> +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> +{
> +	struct vfio_domain *domain = iommu->external_domain;
> +	struct vfio_dma *dma, *tdma;
> +	struct rb_node *n;
> +	long locked = 0, unlocked = 0;
> +
> +	rbtree_postorder_for_each_entry_safe(dma, tdma, &iommu->dma_list,
> +					     node) {
> +		unlocked += vfio_unmap_unpin(iommu, dma, false);
> +	}
> +
> +	mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +
> +	n = rb_first(&domain->external_addr_space->pfn_list);
> +
> +	for (; n; n = rb_next(n))
> +		locked++;
> +
> +	vfio_lock_acct(domain->external_addr_space->task, locked - unlocked);
> +	mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> +}
> +
> +static void vfio_external_unpin_all(struct vfio_domain *domain,
> +				    bool do_accounting)
> +{
> +	struct rb_node *node;
> +
> +	mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +	while ((node = rb_first(&domain->external_addr_space->pfn_list)))
> +		vfio_unpin_pfn(domain, rb_entry(node, struct vfio_pfn, node),
> +			       do_accounting);
> +
> +	mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>  					  struct iommu_group *iommu_group)
>  {
> @@ -889,6 +1338,28 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> +	if (iommu->external_domain) {
> +		domain = iommu->external_domain;
> +		group = find_iommu_group(domain, iommu_group);
> +		if (group) {
> +			list_del(&group->next);
> +			kfree(group);
> +
> +			if (list_empty(&domain->group_list)) {
> +				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
> +					vfio_external_unpin_all(domain, true);
> +					vfio_iommu_unmap_unpin_all(iommu);
> +				} else
> +					vfio_external_unpin_all(domain, false);
> +				kfree(domain);
> +				iommu->external_domain = NULL;
> +			}
> +			goto detach_group_done;
> +		}
> +	}
> +
> +	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> +		goto detach_group_done;
>  
>  	list_for_each_entry(domain, &iommu->domain_list, next) {
>  		group = find_iommu_group(domain, iommu_group);
> @@ -899,21 +1370,27 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  		list_del(&group->next);
>  		kfree(group);
>  		/*
> -		 * Group ownership provides privilege, if the group
> -		 * list is empty, the domain goes away.  If it's the
> -		 * last domain, then all the mappings go away too.
> +		 * Group ownership provides privilege, if the group list is
> +		 * empty, the domain goes away. If it's the last domain with
> +		 * iommu and external domain doesn't exist, then all the
> +		 * mappings go away too. If it's the last domain with iommu and
> +		 * external domain exist, update accounting
>  		 */
>  		if (list_empty(&domain->group_list)) {
> -			if (list_is_singular(&iommu->domain_list))
> -				vfio_iommu_unmap_unpin_all(iommu);
> +			if (list_is_singular(&iommu->domain_list)) {
> +				if (!iommu->external_domain)
> +					vfio_iommu_unmap_unpin_all(iommu);
> +				else
> +					vfio_iommu_unmap_unpin_reaccount(iommu);
> +			}
>  			iommu_domain_free(domain->domain);
>  			list_del(&domain->next);
>  			kfree(domain);
>  		}
> -		goto done;
> +		break;
>  	}
>  
> -done:
> +detach_group_done:
>  	mutex_unlock(&iommu->lock);
>  }
>  
> @@ -945,23 +1422,40 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	return iommu;
>  }
>  
> +static void vfio_release_domain(struct vfio_domain *domain)
> +{
> +	struct vfio_group *group, *group_tmp;
> +
> +	list_for_each_entry_safe(group, group_tmp,
> +				 &domain->group_list, next) {
> +		if (!domain->external_addr_space)
> +			iommu_detach_group(domain->domain, group->iommu_group);
> +		list_del(&group->next);
> +		kfree(group);
> +	}
> +
> +	if (domain->external_addr_space)
> +		vfio_external_unpin_all(domain, false);
> +	else
> +		iommu_domain_free(domain->domain);
> +}
> +
>  static void vfio_iommu_type1_release(void *iommu_data)
>  {
>  	struct vfio_iommu *iommu = iommu_data;
>  	struct vfio_domain *domain, *domain_tmp;
> -	struct vfio_group *group, *group_tmp;
> +
> +	if (iommu->external_domain) {
> +		vfio_release_domain(iommu->external_domain);
> +		kfree(iommu->external_domain);
> +		iommu->external_domain = NULL;
> +	}
>  
>  	vfio_iommu_unmap_unpin_all(iommu);
>  
>  	list_for_each_entry_safe(domain, domain_tmp,
>  				 &iommu->domain_list, next) {
> -		list_for_each_entry_safe(group, group_tmp,
> -					 &domain->group_list, next) {
> -			iommu_detach_group(domain->domain, group->iommu_group);
> -			list_del(&group->next);
> -			kfree(group);
> -		}
> -		iommu_domain_free(domain->domain);
> +		vfio_release_domain(domain);
>  		list_del(&domain->next);
>  		kfree(domain);
>  	}
> @@ -1069,6 +1563,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
>  	.ioctl		= vfio_iommu_type1_ioctl,
>  	.attach_group	= vfio_iommu_type1_attach_group,
>  	.detach_group	= vfio_iommu_type1_detach_group,
> +	.pin_pages	= vfio_iommu_type1_pin_pages,
> +	.unpin_pages	= vfio_iommu_type1_unpin_pages,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)