Message-ID: <5149c162-cf38-4aa4-9e96-27c6897cad36@arm.com>
Date: Mon, 24 Jun 2024 11:56:01 +0100
From: Robin Murphy <robin.murphy@....com>
To: Zhang Zekun <zhangzekun11@...wei.com>, joro@...tes.org, will@...nel.org,
john.g.garry@...cle.com
Cc: iommu@...ts.linux.dev, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] iommu/iova: Bettering utilizing cpu_rcaches in no-strict
mode
On 2024-06-24 9:39 am, Zhang Zekun wrote:
> Currently, when the iommu is working in non-strict mode, fq_flush_timeout()
> will always try to free iovas on one cpu. Freeing the iovas from all
> cpus on one cpu is not cache-friendly to the iova_rcache, because it
> first fills up that cpu's cpu_rcache and then pushes iovas to the depot;
> iovas in the depot finally go back to the underlying rbtree if the
> depot_size is greater than num_online_cpus().
That is the design intent - if the excess magazines sit in the depot
long enough to be reclaimed then other CPUs didn't want them either.
We're trying to minimise the amount of unused cached IOVAs sitting
around wasting memory, since IOVA memory consumption has proven to be
quite significant on large systems.
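For reference, the reclaim path in question is roughly the sketch below
(simplified from the delayed work in drivers/iommu/iova.c, with details
trimmed) - excess magazines only leave the depot one at a time, and only
while the depot holds more magazines than there are online CPUs:

static void iova_depot_work_func(struct work_struct *work)
{
	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
	struct iova_magazine *mag = NULL;
	unsigned long flags;

	/* Only reclaim if the depot holds more than one magazine per online CPU */
	spin_lock_irqsave(&rcache->lock, flags);
	if (rcache->depot_size > num_online_cpus())
		mag = iova_depot_pop(rcache);
	spin_unlock_irqrestore(&rcache->lock, flags);

	if (mag) {
		/* Give the IOVAs back to the rbtree, free the magazine, and
		 * come back after another IOVA_DEPOT_DELAY to check again.
		 */
		iova_magazine_free_pfns(mag, rcache->iovad);
		iova_magazine_free(mag);
		schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
	}
}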
As alluded to in the original cover letter, 100ms for IOVA_DEPOT_DELAY
was just my arbitrary value of "long enough" to keep the initial
implementation straightforward - I do expect that certain workloads
might benefit from tuning it differently, but without proof of what they
are and what they want, there's little justification for introducing
extra complexity and potential user ABI yet.
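For anyone who does want to experiment locally, a trivial hack along the
lines of the sketch below would do - purely illustrative, not something
I'm proposing for mainline - replacing the constant with a writable knob:

	/* Illustrative only: expose the depot reclaim delay as a module
	 * parameter instead of the fixed IOVA_DEPOT_DELAY.
	 */
	static unsigned int iova_depot_delay_ms = 100;
	module_param(iova_depot_delay_ms, uint, 0644);
	MODULE_PARM_DESC(iova_depot_delay_ms,
			 "Delay in ms before excess rcache depot magazines are reclaimed");

	/* ...then schedule the reclaim work with
	 * msecs_to_jiffies(iova_depot_delay_ms) wherever the fixed
	 * IOVA_DEPOT_DELAY is used today.
	 */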
> Letting fq_flush_timeout()
> free iovas on the cpus that called the dma_unmap* APIs can decrease the
> overall time spent in fq_flush_timeout() by better utilizing the
> iova_rcache, and minimizing the contention on the iova_rbtree_lock().
I would have assumed that a single CPU simply throwing magazines into
the depot list from its own percpu cache is quicker, or at least no
slower, than doing the same while causing additional contention/sharing
by interfering with other percpu caches as well. And where does the
rbtree come into that either way? If an observable performance issue
actually exists here, I'd like a more detailed breakdown to understand it.
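To spell out the contention concern, the functional change boils down to
the before/after sketch below (the insert path stripped to its locking,
per the __iova_rcache_insert() hunk in the patch):

	/* Today: the flush timer only ever touches the rcache of the CPU it
	 * happens to run on, so these per-CPU locks see essentially no
	 * cross-CPU sharing.
	 */
	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
	spin_lock_irqsave(&cpu_rcache->lock, flags);
	/* ...insert into this CPU's loaded/prev magazines... */
	spin_unlock_irqrestore(&cpu_rcache->lock, flags);

	/* With the patch: the timer CPU reaches into every other CPU's
	 * rcache in turn, bouncing locks and cachelines that those CPUs may
	 * be using for their own allocations and frees at the same time.
	 */
	cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
	spin_lock_irqsave(&cpu_rcache->lock, flags);
	/* ...insert into that other CPU's loaded/prev magazines... */
	spin_unlock_irqrestore(&cpu_rcache->lock, flags);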
Thanks,
Robin.
> Signed-off-by: Zhang Zekun <zhangzekun11@...wei.com>
> ---
> drivers/iommu/dma-iommu.c | 22 +++++++++++++---------
> drivers/iommu/iova.c | 21 ++++++++++++++-------
> include/linux/iova.h | 7 +++++++
> 3 files changed, 34 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 43520e7275cc..217b7c70d06c 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -145,7 +145,8 @@ static inline unsigned int fq_ring_add(struct iova_fq *fq)
> return idx;
> }
>
> -static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
> +static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
> + int cpu)
> {
> u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
> unsigned int idx;
> @@ -158,20 +159,21 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq
> break;
>
> iommu_put_pages_list(&fq->entries[idx].freelist);
> - free_iova_fast(&cookie->iovad,
> + free_iova_fast_cpu(&cookie->iovad,
> fq->entries[idx].iova_pfn,
> - fq->entries[idx].pages);
> + fq->entries[idx].pages, cpu);
>
> fq->head = (fq->head + 1) & fq->mod_mask;
> }
> }
>
> -static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
> +static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
> + int cpu)
> {
> unsigned long flags;
>
> spin_lock_irqsave(&fq->lock, flags);
> - fq_ring_free_locked(cookie, fq);
> + fq_ring_free_locked(cookie, fq, cpu);
> spin_unlock_irqrestore(&fq->lock, flags);
> }
>
> @@ -191,10 +193,11 @@ static void fq_flush_timeout(struct timer_list *t)
> fq_flush_iotlb(cookie);
>
> if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) {
> - fq_ring_free(cookie, cookie->single_fq);
> + cpu = smp_processor_id();
> + fq_ring_free(cookie, cookie->single_fq, cpu);
> } else {
> for_each_possible_cpu(cpu)
> - fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
> + fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu), cpu);
> }
> }
>
> @@ -205,6 +208,7 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
> struct iova_fq *fq;
> unsigned long flags;
> unsigned int idx;
> + int cpu = smp_processor_id();
>
> /*
> * Order against the IOMMU driver's pagetable update from unmapping
> @@ -227,11 +231,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
> * flushed out on another CPU. This makes the fq_full() check below less
> * likely to be true.
> */
> - fq_ring_free_locked(cookie, fq);
> + fq_ring_free_locked(cookie, fq, cpu);
>
> if (fq_full(fq)) {
> fq_flush_iotlb(cookie);
> - fq_ring_free_locked(cookie, fq);
> + fq_ring_free_locked(cookie, fq, cpu);
> }
>
> idx = fq_ring_add(fq);
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index d59d0ea2fd21..46a2188c263b 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -20,7 +20,7 @@
>
> static bool iova_rcache_insert(struct iova_domain *iovad,
> unsigned long pfn,
> - unsigned long size);
> + unsigned long size, int cpu);
> static unsigned long iova_rcache_get(struct iova_domain *iovad,
> unsigned long size,
> unsigned long limit_pfn);
> @@ -423,12 +423,19 @@ EXPORT_SYMBOL_GPL(alloc_iova_fast);
> void
> free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
> {
> - if (iova_rcache_insert(iovad, pfn, size))
> + free_iova_fast_cpu(iovad, pfn, size, smp_processor_id());
> +}
> +EXPORT_SYMBOL_GPL(free_iova_fast);
> +
> +void
> +free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
> + unsigned long size, int cpu)
> +{
> + if (iova_rcache_insert(iovad, pfn, size, cpu))
> return;
>
> free_iova(iovad, pfn);
> }
> -EXPORT_SYMBOL_GPL(free_iova_fast);
>
> static void iova_domain_free_rcaches(struct iova_domain *iovad)
> {
> @@ -762,13 +769,13 @@ EXPORT_SYMBOL_GPL(iova_domain_init_rcaches);
> */
> static bool __iova_rcache_insert(struct iova_domain *iovad,
> struct iova_rcache *rcache,
> - unsigned long iova_pfn)
> + unsigned long iova_pfn, int cpu)
> {
> struct iova_cpu_rcache *cpu_rcache;
> bool can_insert = false;
> unsigned long flags;
>
> - cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
> + cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
> spin_lock_irqsave(&cpu_rcache->lock, flags);
>
> if (!iova_magazine_full(cpu_rcache->loaded)) {
> @@ -799,14 +806,14 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
> }
>
> static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
> - unsigned long size)
> + unsigned long size, int cpu)
> {
> unsigned int log_size = order_base_2(size);
>
> if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
> return false;
>
> - return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
> + return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn, cpu);
> }
>
> /*
> diff --git a/include/linux/iova.h b/include/linux/iova.h
> index d2c4fd923efa..91e4e3d5bcb3 100644
> --- a/include/linux/iova.h
> +++ b/include/linux/iova.h
> @@ -93,6 +93,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
> bool size_aligned);
> void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
> unsigned long size);
> +void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
> + unsigned long size, int cpu);
> unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
> unsigned long limit_pfn, bool flush_rcache);
> struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
> @@ -134,6 +136,11 @@ static inline void free_iova_fast(struct iova_domain *iovad,
> {
> }
>
> +static inline void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
> + unsigned long size, int cpu)
> +{
> +}
> +
> static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
> unsigned long size,
> unsigned long limit_pfn,