Message-ID: <a8e778da-7b41-a6ba-83c3-c366a426c3da@arm.com>
Date:   Thu, 20 Oct 2022 14:18:26 +0100
From:   Robin Murphy <robin.murphy@....com>
To:     Niklas Schnelle <schnelle@...ux.ibm.com>,
        Matthew Rosato <mjrosato@...ux.ibm.com>, iommu@...ts.linux.dev,
        Joerg Roedel <joro@...tes.org>, Will Deacon <will@...nel.org>,
        Jason Gunthorpe <jgg@...dia.com>
Cc:     Gerd Bayer <gbayer@...ux.ibm.com>,
        Pierre Morel <pmorel@...ux.ibm.com>,
        linux-s390@...r.kernel.org, borntraeger@...ux.ibm.com,
        hca@...ux.ibm.com, gor@...ux.ibm.com,
        gerald.schaefer@...ux.ibm.com, agordeev@...ux.ibm.com,
        svens@...ux.ibm.com, linux-kernel@...r.kernel.org,
        Wenjia Zhang <wenjia@...ux.ibm.com>,
        Julian Ruess <julianr@...ux.ibm.com>
Subject: Re: [RFC 5/6] iommu/dma: Add simple batching flush queue
 implementation

On 2022-10-19 15:44, Niklas Schnelle wrote:
> Having dma-iommu prepared for alternative flush queue implementations we
> add a simple per-domain flush queue that optimizes for scenarios where
> global IOTLB flushes are used but are quite expensive and poorly
> parallelized. This is for example the case when IOTLB flushes are used
> to trigger updates to an underlying hypervisor's shadow tables and
> approximates the global flushing scheme previously in use on s390.
> 
> This is achieved by having a per-domain global flush queue that allows
> queuing a much larger number of lazily freed IOVAs than the per-CPU flush
> queues. While using a single queue reduces parallelism this is not
> a problem when global IOTLB flushes are synchronized in the hypervisor
> anyway. While this flush queue allows queuing a large number of IOVAs, we
> do limit the time a freed IOVA remains accessible by hardware to
> 1 second using a timeout that is reset on flush.
> 
> Enable this new flush queue implementation by default on s390 systems
> which use IOTLB flushes to trigger shadowing, namely z/VM and KVM guests.

There seems to be way more duplication here than there probably needs to 
be. Is there any reason we can't keep using the same iova_fq and 
iova_fq_entry structures and just refactor how cookie->fq is 
allocated/freed/dereferenced to choose between percpu and direct? Making 
the queue size and timeout variable - and thus potentially tuneable - 
isn't necessarily a bad thing for percpu queues either (the size can be 
constrained to powers of two and stored as a mask to keep the wrapping 
arithmetic efficient).
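
Rough, untested sketch of the kind of thing I mean - the field and
helper names here (mod_mask, single_fq, cookie_fq()) are purely
illustrative, not anything that exists today:

struct iova_fq {
	spinlock_t lock;
	unsigned int head, tail;
	/* Power-of-two queue size, kept as a mask for cheap wrapping */
	unsigned int mod_mask;
	struct iova_fq_entry entries[];
};

#define fq_ring_for_each(i, fq) \
	for ((i) = (fq)->head; (i) != (fq)->tail; \
	     (i) = ((i) + 1) & (fq)->mod_mask)

static inline bool fq_full(struct iova_fq *fq)
{
	return (((fq->tail + 1) & fq->mod_mask) == fq->head);
}

/* One place decides whether the cookie went percpu or single-queue */
static inline struct iova_fq *cookie_fq(struct iommu_dma_cookie *cookie)
{
	if (cookie->fq_domain->type == IOMMU_DOMAIN_DMA_FQ)
		return raw_cpu_ptr(cookie->percpu_fq);
	return cookie->single_fq;
}

Then queue_iova(), the timeout handler and the teardown paths could
stay common, with the timeout taken from cookie->fq_timeout much as
your patch already does.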

Thanks,
Robin.

> Link: https://lore.kernel.org/linux-iommu/3e402947-61f9-b7e8-1414-fde006257b6f@arm.com/
> Signed-off-by: Niklas Schnelle <schnelle@...ux.ibm.com>
> ---
>   drivers/iommu/dma-iommu.c  | 157 +++++++++++++++++++++++++++++++++++--
>   drivers/iommu/iommu.c      |  19 +++--
>   drivers/iommu/s390-iommu.c |  11 +++
>   include/linux/iommu.h      |   6 ++
>   4 files changed, 180 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 77d969f5aad7..427fb84f50c3 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -49,8 +49,13 @@ struct iommu_dma_cookie {
>   		/* Full allocator for IOMMU_DMA_IOVA_COOKIE */
>   		struct {
>   			struct iova_domain		iovad;
> -			/* Flush queue */
> -			struct iova_percpu __percpu *percpu_fq;
> +			/* Flush queues */
> +			union {
> +				struct iova_percpu __percpu *percpu_fq;
> +				struct iova_simple *simple_fq;
> +			};
> +			/* Queue timeout in milliseconds */
> +			unsigned int		fq_timeout;
>   			/* Number of TLB flushes that have been started */
>   			atomic64_t		fq_flush_start_cnt;
>   			/* Number of TLB flushes that have been finished */
> @@ -104,6 +109,119 @@ struct iova_percpu {
>   	spinlock_t lock;
>   };
>   
> +/* Simplified batched flush queue for expensive IOTLB flushes */
> +#define IOVA_SIMPLE_SIZE	32768
> +/* Maximum time in milliseconds an IOVA can remain lazily freed */
> +#define IOVA_SIMPLE_TIMEOUT	1000
> +
> +struct iova_simple_entry {
> +	unsigned long iova_pfn;
> +	unsigned long pages;
> +};
> +
> +struct iova_simple {
> +	/* Unlike iova_percpu we use a single queue lock */
> +	spinlock_t lock;
> +	unsigned int tail;
> +	unsigned long total_pages;
> +	struct list_head freelist;
> +	struct iova_simple_entry entries[];
> +};
> +
> +static bool is_full_simple(struct iommu_dma_cookie *cookie)
> +{
> +	struct iommu_domain *fq_domain = cookie->fq_domain;
> +	struct iova_domain *iovad = &cookie->iovad;
> +	struct iova_simple *sq = cookie->simple_fq;
> +	unsigned long aperture_pages;
> +
> +	assert_spin_locked(&sq->lock);
> +
> +	/* If more than 7/8 the aperture is batched let's flush */
> +	aperture_pages = ((fq_domain->geometry.aperture_end +  1) -
> +		fq_domain->geometry.aperture_start) >> iova_shift(iovad);
> +	aperture_pages -= aperture_pages >> 3;
> +
> +	return (sq->tail >= IOVA_SIMPLE_SIZE ||
> +		sq->total_pages >= aperture_pages);
> +}
> +
> +static void flush_simple(struct iommu_dma_cookie *cookie)
> +{
> +	struct iova_simple *sq = cookie->simple_fq;
> +	unsigned int i;
> +
> +	assert_spin_locked(&sq->lock);
> +	/* We're flushing so postpone timeout */
> +	mod_timer(&cookie->fq_timer,
> +		  jiffies + msecs_to_jiffies(cookie->fq_timeout));
> +	cookie->fq_domain->ops->flush_iotlb_all(cookie->fq_domain);
> +
> +	put_pages_list(&sq->freelist);
> +	for (i = 0; i < sq->tail; i++) {
> +		free_iova_fast(&cookie->iovad,
> +			       sq->entries[i].iova_pfn,
> +			       sq->entries[i].pages);
> +	}
> +	sq->tail = 0;
> +	sq->total_pages = 0;
> +}
> +
> +static void flush_simple_lock(struct iommu_dma_cookie *cookie)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cookie->simple_fq->lock, flags);
> +	flush_simple(cookie);
> +	spin_unlock_irqrestore(&cookie->simple_fq->lock, flags);
> +}
> +
> +static void queue_iova_simple(struct iommu_dma_cookie *cookie,
> +			      unsigned long pfn, unsigned long pages,
> +			      struct list_head *freelist)
> +{
> +	struct iova_simple *sq = cookie->simple_fq;
> +	unsigned long flags;
> +	unsigned int idx;
> +
> +	spin_lock_irqsave(&sq->lock, flags);
> +	if (is_full_simple(cookie))
> +		flush_simple(cookie);
> +	idx = sq->tail++;
> +
> +	sq->entries[idx].iova_pfn = pfn;
> +	sq->entries[idx].pages    = pages;
> +	list_splice(freelist, &sq->freelist);
> +	sq->total_pages += pages;
> +	spin_unlock_irqrestore(&sq->lock, flags);
> +}
> +
> +static int iommu_dma_init_simple(struct iommu_dma_cookie *cookie)
> +{
> +	struct iova_simple *queue;
> +
> +	queue = vzalloc(sizeof(*queue) +
> +			IOVA_SIMPLE_SIZE * sizeof(struct iova_simple_entry));
> +	if (!queue)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&queue->freelist);
> +	cookie->fq_timeout = IOVA_SIMPLE_TIMEOUT;
> +	cookie->simple_fq = queue;
> +
> +	return 0;
> +}
> +
> +static void iommu_dma_free_simple(struct iommu_dma_cookie *cookie)
> +{
> +	if (!cookie->simple_fq)
> +		return;
> +
> +	put_pages_list(&cookie->simple_fq->freelist);
> +	vfree(cookie->simple_fq);
> +	cookie->simple_fq = NULL;
> +}
> +
>   #define ring_for_each_percpu(i, fq) \
>   	for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_PERCPU_SIZE)
>   
> @@ -169,12 +287,23 @@ static void flush_percpu(struct iommu_dma_cookie *cookie)
>   	}
>   }
>   
> +static void iommu_dma_flush_fq(struct iommu_dma_cookie *cookie)
> +{
> +	if (!cookie->fq_domain)
> +		return;
> +
> +	if (cookie->fq_domain->type == IOMMU_DOMAIN_DMA_FQ)
> +		flush_percpu(cookie);
> +	else
> +		flush_simple_lock(cookie);
> +}
> +
>   static void fq_flush_timeout(struct timer_list *t)
>   {
>   	struct iommu_dma_cookie *cookie = from_timer(cookie, t, fq_timer);
>   
>   	atomic_set(&cookie->fq_timer_on, 0);
> -	flush_percpu(cookie);
> +	iommu_dma_flush_fq(cookie);
>   }
>   
>   static void queue_iova_percpu(struct iommu_dma_cookie *cookie,
> @@ -223,13 +352,16 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
>   	 */
>   	smp_mb();
>   
> -	queue_iova_percpu(cookie, pfn, pages, freelist);
> +	if (cookie->fq_domain->type == IOMMU_DOMAIN_DMA_FQ)
> +		queue_iova_percpu(cookie, pfn, pages, freelist);
> +	else
> +		queue_iova_simple(cookie, pfn, pages, freelist);
>   
>   	/* Avoid false sharing as much as possible. */
>   	if (!atomic_read(&cookie->fq_timer_on) &&
>   	    !atomic_xchg(&cookie->fq_timer_on, 1))
>   		mod_timer(&cookie->fq_timer,
> -			  jiffies + msecs_to_jiffies(IOVA_PERCPU_TIMEOUT));
> +			  jiffies + msecs_to_jiffies(cookie->fq_timeout));
>   }
>   
>   static void iommu_dma_free_percpu(struct iommu_dma_cookie *cookie)
> @@ -253,7 +385,10 @@ static void iommu_dma_free_fq(struct iommu_dma_cookie *cookie)
>   {
>   	del_timer_sync(&cookie->fq_timer);
>   	/* The IOVAs will be torn down separately, so just free our queued pages */
> -	iommu_dma_free_percpu(cookie);
> +	if (cookie->fq_domain->type == IOMMU_DOMAIN_DMA_FQ)
> +		iommu_dma_free_percpu(cookie);
> +	else
> +		iommu_dma_free_simple(cookie);
>   }
>   
>   static int iommu_dma_init_percpu(struct iommu_dma_cookie *cookie)
> @@ -280,6 +415,7 @@ static int iommu_dma_init_percpu(struct iommu_dma_cookie *cookie)
>   			INIT_LIST_HEAD(&fq->entries[i].freelist);
>   	}
>   
> +	cookie->fq_timeout = IOVA_PERCPU_TIMEOUT;
>   	cookie->percpu_fq = queue;
>   
>   	return 0;
> @@ -294,7 +430,10 @@ int iommu_dma_init_fq(struct iommu_domain *domain)
>   	if (cookie->fq_domain)
>   		return 0;
>   
> -	rc = iommu_dma_init_percpu(cookie);
> +	if (domain->type == IOMMU_DOMAIN_DMA_FQ)
> +		rc = iommu_dma_init_percpu(cookie);
> +	else
> +		rc = iommu_dma_init_simple(cookie);
>   	if (rc) {
>   		pr_warn("iova flush queue initialization failed\n");
>   		return rc;
> @@ -613,7 +752,9 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
>   		goto done_unlock;
>   
>   	/* If the FQ fails we can simply fall back to strict mode */
> -	if (domain->type == IOMMU_DOMAIN_DMA_FQ && iommu_dma_init_fq(domain))
> +	if ((domain->type == IOMMU_DOMAIN_DMA_FQ ||
> +	     domain->type == IOMMU_DOMAIN_DMA_SQ) &&
> +	    iommu_dma_init_fq(domain))
>   		domain->type = IOMMU_DOMAIN_DMA;
>   
>   	ret = iova_reserve_iommu_regions(dev, domain);
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 4893c2429ca5..2b3a12799702 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -140,6 +140,7 @@ static const char *iommu_domain_type_str(unsigned int t)
>   		return "Unmanaged";
>   	case IOMMU_DOMAIN_DMA:
>   	case IOMMU_DOMAIN_DMA_FQ:
> +	case IOMMU_DOMAIN_DMA_SQ:
>   		return "Translated";
>   	default:
>   		return "Unknown";
> @@ -437,7 +438,8 @@ early_param("iommu.strict", iommu_dma_setup);
>   void iommu_set_dma_strict(void)
>   {
>   	iommu_dma_strict = true;
> -	if (iommu_def_domain_type == IOMMU_DOMAIN_DMA_FQ)
> +	if (iommu_def_domain_type == IOMMU_DOMAIN_DMA_FQ ||
> +	    iommu_def_domain_type == IOMMU_DOMAIN_DMA_SQ)
>   		iommu_def_domain_type = IOMMU_DOMAIN_DMA;
>   }
>   
> @@ -638,6 +640,9 @@ static ssize_t iommu_group_show_type(struct iommu_group *group,
>   		case IOMMU_DOMAIN_DMA_FQ:
>   			type = "DMA-FQ\n";
>   			break;
> +		case IOMMU_DOMAIN_DMA_SQ:
> +			type = "DMA-SQ\n";
> +			break;
>   		}
>   	}
>   	mutex_unlock(&group->mutex);
> @@ -2908,10 +2913,11 @@ static int iommu_change_dev_def_domain(struct iommu_group *group,
>   	}
>   
>   	/* We can bring up a flush queue without tearing down the domain */
> -	if (type == IOMMU_DOMAIN_DMA_FQ && prev_dom->type == IOMMU_DOMAIN_DMA) {
> +	if ((type == IOMMU_DOMAIN_DMA_FQ || type == IOMMU_DOMAIN_DMA_SQ) &&
> +	    prev_dom->type == IOMMU_DOMAIN_DMA) {
>   		ret = iommu_dma_init_fq(prev_dom);
>   		if (!ret)
> -			prev_dom->type = IOMMU_DOMAIN_DMA_FQ;
> +			prev_dom->type = type;
>   		goto out;
>   	}
>   
> @@ -2982,6 +2988,8 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
>   		req_type = IOMMU_DOMAIN_DMA;
>   	else if (sysfs_streq(buf, "DMA-FQ"))
>   		req_type = IOMMU_DOMAIN_DMA_FQ;
> +	else if (sysfs_streq(buf, "DMA-SQ"))
> +		req_type = IOMMU_DOMAIN_DMA_SQ;
>   	else if (sysfs_streq(buf, "auto"))
>   		req_type = 0;
>   	else
> @@ -3033,8 +3041,9 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
>   
>   	/* Check if the device in the group still has a driver bound to it */
>   	device_lock(dev);
> -	if (device_is_bound(dev) && !(req_type == IOMMU_DOMAIN_DMA_FQ &&
> -	    group->default_domain->type == IOMMU_DOMAIN_DMA)) {
> +	if (device_is_bound(dev) &&
> +	    !((req_type == IOMMU_DOMAIN_DMA_FQ || req_type == IOMMU_DOMAIN_DMA_SQ) &&
> +	      group->default_domain->type == IOMMU_DOMAIN_DMA)) {
>   		pr_err_ratelimited("Device is still bound to driver\n");
>   		ret = -EBUSY;
>   		goto out;
> diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
> index c2b8a7b96b8e..506f8b92931f 100644
> --- a/drivers/iommu/s390-iommu.c
> +++ b/drivers/iommu/s390-iommu.c
> @@ -324,6 +324,15 @@ static bool s390_iommu_capable(struct device *dev, enum iommu_cap cap)
>   	}
>   }
>   
> +static int s390_iommu_def_domain_type(struct device *dev)
> +{
> +	struct zpci_dev *zdev = to_zpci_dev(dev);
> +
> +	if (zdev->tlb_refresh)
> +		return IOMMU_DOMAIN_DMA_SQ;
> +	return IOMMU_DOMAIN_DMA_FQ;
> +}
> +
>   static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
>   {
>   	struct s390_domain *s390_domain;
> @@ -331,6 +340,7 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
>   	switch (domain_type) {
>   	case IOMMU_DOMAIN_DMA:
>   	case IOMMU_DOMAIN_DMA_FQ:
> +	case IOMMU_DOMAIN_DMA_SQ:
>   	case IOMMU_DOMAIN_UNMANAGED:
>   		break;
>   	default:
> @@ -774,6 +784,7 @@ subsys_initcall(s390_iommu_init);
>   
>   static const struct iommu_ops s390_iommu_ops = {
>   	.capable = s390_iommu_capable,
> +	.def_domain_type = s390_iommu_def_domain_type,
>   	.domain_alloc = s390_domain_alloc,
>   	.probe_device = s390_iommu_probe_device,
>   	.probe_finalize = s390_iommu_probe_finalize,
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index a325532aeab5..6c3fe62ec0df 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -63,6 +63,7 @@ struct iommu_domain_geometry {
>   					      implementation              */
>   #define __IOMMU_DOMAIN_PT	(1U << 2)  /* Domain is identity mapped   */
>   #define __IOMMU_DOMAIN_DMA_FQ	(1U << 3)  /* DMA-API uses flush queue    */
> +#define __IOMMU_DOMAIN_DMA_SQ	(1U << 4)  /* DMA-API uses max fill queue */
>   
>   /*
>    * This are the possible domain-types
> @@ -77,6 +78,8 @@ struct iommu_domain_geometry {
>    *				  certain optimizations for these domains
>    *	IOMMU_DOMAIN_DMA_FQ	- As above, but definitely using batched TLB
>    *				  invalidation.
> + *	IOMMU_DOMAIN_DMA_SQ	- As above, but batched invalidations are only
> + *				  flushed when running out of queue space.
>    */
>   #define IOMMU_DOMAIN_BLOCKED	(0U)
>   #define IOMMU_DOMAIN_IDENTITY	(__IOMMU_DOMAIN_PT)
> @@ -86,6 +89,9 @@ struct iommu_domain_geometry {
>   #define IOMMU_DOMAIN_DMA_FQ	(__IOMMU_DOMAIN_PAGING |	\
>   				 __IOMMU_DOMAIN_DMA_API |	\
>   				 __IOMMU_DOMAIN_DMA_FQ)
> +#define IOMMU_DOMAIN_DMA_SQ	(__IOMMU_DOMAIN_PAGING |	\
> +				 __IOMMU_DOMAIN_DMA_API |	\
> +				 __IOMMU_DOMAIN_DMA_SQ)
>   
>   struct iommu_domain {
>   	unsigned type;
