linux-kernel - Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update iova list

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7b4fb7cb-cae6-478b-2e69-7d5049920c8d@redhat.com>
Date:   Tue, 23 Jan 2018 09:32:14 +0100
From:   Auger Eric <eric.auger@...hat.com>
To:     Alex Williamson <alex.williamson@...hat.com>,
        Shameer Kolothum <shameerali.kolothum.thodi@...wei.com>
Cc:     pmorel@...ux.vnet.ibm.com, kvm@...r.kernel.org,
        linux-kernel@...r.kernel.org, linuxarm@...wei.com,
        john.garry@...wei.com, xuwei5@...ilicon.com
Subject: Re: [RFC v2 2/5] vfio/type1: Check reserve region conflict and update
 iova list

Hi Shameer,

On 18/01/18 01:04, Alex Williamson wrote:
> On Fri, 12 Jan 2018 16:45:28 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@...wei.com> wrote:
> 
>> This retrieves the reserved regions associated with dev group and
>> checks for conflicts with any existing dma mappings. Also update
>> the iova list excluding the reserved regions.
>>
>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@...wei.com>
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 161 +++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 159 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 11cbd49..7609070 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -28,6 +28,7 @@
>>  #include <linux/device.h>
>>  #include <linux/fs.h>
>>  #include <linux/iommu.h>
>> +#include <linux/list_sort.h>
>>  #include <linux/module.h>
>>  #include <linux/mm.h>
>>  #include <linux/rbtree.h>
>> @@ -1199,6 +1200,20 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
>>  	return ret;
>>  }
>>  
> 
> /* list_sort helper */
> 
>> +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head *b)
>> +{
>> +	struct iommu_resv_region *ra, *rb;
>> +
>> +	ra = container_of(a, struct iommu_resv_region, list);
>> +	rb = container_of(b, struct iommu_resv_region, list);
>> +
>> +	if (ra->start < rb->start)
>> +		return -1;
>> +	if (ra->start > rb->start)
>> +		return 1;
>> +	return 0;
>> +}
>> +
>>  static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
>>  				struct list_head *head)
>>  {
>> @@ -1274,6 +1289,24 @@ static int vfio_iommu_valid_aperture(struct vfio_iommu *iommu,
>>  }
>>  
>>  /*
>> + * Check reserved region conflicts with existing dma mappings
>> + */
>> +static int vfio_iommu_resv_region_conflict(struct vfio_iommu *iommu,
>> +				struct list_head *resv_regions)
>> +{
>> +	struct iommu_resv_region *region;
>> +
>> +	/* Check for conflict with existing dma mappings */
>> +	list_for_each_entry(region, resv_regions, list) {
>> +		if (vfio_find_dma_overlap(iommu, region->start,
>> +				    region->start + region->length - 1))
>> +			return -EINVAL;
>> +	}
>> +
>> +	return 0;
>> +}
> 
> This basically does the same test as vfio_iommu_valid_aperture but
> properly names it a conflict test.  Please be consistent.  Should this
> also return bool, "conflict" is a yes/no answer.
> 
>> +
>> +/*
>>   * Adjust the iommu aperture window if new aperture is a valid one
>>   */
>>  static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>> @@ -1316,6 +1349,51 @@ static int vfio_iommu_iova_aper_adjust(struct vfio_iommu *iommu,
>>  	return 0;
>>  }
>>  
>> +/*
>> + * Check and update iova region list in case a reserved region
>> + * overlaps the iommu iova range
>> + */
>> +static int vfio_iommu_iova_resv_adjust(struct vfio_iommu *iommu,
>> +					struct list_head *resv_regions)
> 
> "resv_region" in previous function, just "resv" here, use consistent
> names.  Also, what are we adjusting.  Maybe "exclude" is a better term.
> 
>> +{
>> +	struct iommu_resv_region *resv;
>> +	struct list_head *iova = &iommu->iova_list;
>> +	struct vfio_iova *n, *next;
>> +
>> +	list_for_each_entry(resv, resv_regions, list) {
>> +		phys_addr_t start, end;
>> +
>> +		start = resv->start;
>> +		end = resv->start + resv->length - 1;
>> +
>> +		list_for_each_entry_safe(n, next, iova, list) {
>> +			phys_addr_t a, b;
>> +			int ret = 0;
>> +
>> +			a = n->start;
>> +			b = n->end;
> 
> 'a' and 'b' variables actually make this incredibly confusing.  Use
> better variable names or just drop them entirely, it's much easier to
> follow as n->start & n->end.
> 
>> +			/* No overlap */
>> +			if ((start > b) || (end < a))
>> +				continue;
>> +			/* Split the current node and create holes */
>> +			if (start > a)
>> +				ret = vfio_insert_iova(a, start - 1, &n->list);
>> +			if (!ret && end < b)
>> +				ret = vfio_insert_iova(end + 1, b, &n->list);
>> +			if (ret)
>> +				return ret;
>> +
>> +			list_del(&n->list);
> 
> This is trickier than it appears and deserves some explanation.  AIUI,
> we're actually inserting duplicate entries for the remainder at the
> start of the range and then at the end of the range (and the order is
> important here because we're inserting each before the current node),
> and then we delete the current node.  So the iova_list is kept sorted
> through this process, though temporarily includes some bogus, unordered
> sub-sets.
> 
>> +			kfree(n);
>> +		}
>> +	}
>> +
>> +	if (list_empty(iova))
>> +		return -EINVAL;
>> +
>> +	return 0;
>> +}
>> +
>>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  					 struct iommu_group *iommu_group)
>>  {
>> @@ -1327,6 +1405,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	bool resv_msi, msi_remap;
>>  	phys_addr_t resv_msi_base;
>>  	struct iommu_domain_geometry geo;
>> +	struct list_head group_resv_regions;
>> +	struct iommu_resv_region *resv, *resv_next;
>>  
>>  	mutex_lock(&iommu->lock);
>>  
>> @@ -1404,6 +1484,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	if (ret)
>>  		goto out_detach;
>>  
>> +	INIT_LIST_HEAD(&group_resv_regions);
>> +	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
>> +	list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
iommu_get_group_resv_regions returns a sorted list (see
iommu_insert_resv_regions kerneldoc comment). You can have overlapping
regions of different types though.

Thanks

Eric
>> +
>> +	ret = vfio_iommu_resv_region_conflict(iommu, &group_resv_regions);
>> +	if (ret)
>> +		goto out_detach;
>> +
>>  	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
>>  
>>  	INIT_LIST_HEAD(&domain->group_list);
>> @@ -1434,11 +1522,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  		    d->prot == domain->prot) {
>>  			iommu_detach_group(domain->domain, iommu_group);
>>  			if (!iommu_attach_group(d->domain, iommu_group)) {
>> +				ret = vfio_iommu_iova_resv_adjust(iommu,
>> +							&group_resv_regions);
>> +				if (!ret)
>> +					goto out_domain;
> 
> The above function is not without side effects if it fails, it's
> altered the iova_list.  It needs to be valid for the remaining domains
> if we're going to continue.
> 
>> +
>>  				list_add(&group->next, &d->group_list);
>>  				iommu_domain_free(domain->domain);
>>  				kfree(domain);
>> -				mutex_unlock(&iommu->lock);
>> -				return 0;
>> +				goto done;
>>  			}
>>  
>>  			ret = iommu_attach_group(domain->domain, iommu_group);
>> @@ -1465,8 +1557,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	if (ret)
>>  		goto out_detach;
>>  
>> +	ret = vfio_iommu_iova_resv_adjust(iommu, &group_resv_regions);
>> +	if (ret)
>> +		goto out_detach;
> 
> Can't we process the reserved regions once before we get here rather
> than have two separate call points that do the same thing?  In order to
> roll back from errors above, it seems like we need to copy iova_list
> and work on the copy, installing it and deleting the original only on
> success.
> 
>> +
>>  	list_add(&domain->next, &iommu->domain_list);
>>  
>> +done:
>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
>> +		kfree(resv);
>>  	mutex_unlock(&iommu->lock);
>>  
>>  	return 0;
>> @@ -1475,6 +1574,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>>  	iommu_detach_group(domain->domain, iommu_group);
>>  out_domain:
>>  	iommu_domain_free(domain->domain);
>> +	list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
>> +		kfree(resv);
>>  out_free:
>>  	kfree(domain);
>>  	kfree(group);
>> @@ -1559,6 +1660,60 @@ static void vfio_iommu_iova_aper_refresh(struct vfio_iommu *iommu)
>>  	node->end = end;
>>  }
>>  
>> +/*
>> + * Called when a group is detached. The reserved regions for that
>> + * group can be part of valid iova now. But since reserved regions
>> + * may be duplicated among groups, populate the iova valid regions
>> +   list again.
>> + */
>> +static void vfio_iommu_iova_resv_refresh(struct vfio_iommu *iommu)
>> +{
>> +	struct vfio_domain *d;
>> +	struct vfio_group *g;
>> +	struct vfio_iova *node, *tmp;
>> +	struct iommu_resv_region *resv, *resv_next;
>> +	struct list_head resv_regions;
>> +	phys_addr_t start, end;
>> +
>> +	INIT_LIST_HEAD(&resv_regions);
>> +
>> +	list_for_each_entry(d, &iommu->domain_list, next) {
>> +		list_for_each_entry(g, &d->group_list, next)
>> +			iommu_get_group_resv_regions(g->iommu_group,
>> +							 &resv_regions);
>> +	}
>> +
>> +	if (list_empty(&resv_regions))
>> +		return;
>> +
>> +	list_sort(NULL, &resv_regions, vfio_resv_cmp);
>> +
>> +	node = list_first_entry(&iommu->iova_list, struct vfio_iova, list);
>> +	start = node->start;
>> +	node = list_last_entry(&iommu->iova_list, struct vfio_iova, list);
>> +	end = node->end;
> 
> list_sort() only sorts based on ->start, we added reserved regions for
> all our groups to one list, we potentially have multiple entries with
> the same ->start.  How can we be sure that the last one in the list
> actually has the largest ->end value?
> 
>> +
>> +	/* purge the iova list and create new one */
>> +	list_for_each_entry_safe(node, tmp, &iommu->iova_list, list) {
>> +		list_del(&node->list);
>> +		kfree(node);
>> +	}
>> +
>> +	if (vfio_iommu_iova_aper_adjust(iommu, start, end)) {
>> +		pr_warn("%s: Failed to update iova aperture. VFIO DMA map request may fail\n",
>> +			__func__);
> 
> Map requests "will" fail.  Is this the right error strategy?  Detaching
> a group cannot fail.  Aren't we better off leaving the iova_list we had
> in place?  If we cannot expand the iova aperture when a group is
> removed, a user can continue unscathed.
> 
>> +		goto done;
>> +	}
>> +
>> +	/* adjust the iova with current reserved regions */
>> +	if (vfio_iommu_iova_resv_adjust(iommu, &resv_regions))
>> +		pr_warn("%s: Failed to update iova list with reserve regions. VFIO DMA map request may fail\n",
>> +			__func__);
> 
> Same.
> 
>> +done:
>> +	list_for_each_entry_safe(resv, resv_next, &resv_regions, list)
>> +		kfree(resv);
>> +}
>> +
>>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>>  					  struct iommu_group *iommu_group)
>>  {
>> @@ -1617,6 +1772,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>>  		break;
>>  	}
>>  
>> +	vfio_iommu_iova_resv_refresh(iommu);
>> +
>>  detach_group_done:
>>  	mutex_unlock(&iommu->lock);
>>  }
>