linux-kernel - Re: [PATCH v2 -next] iommu/dma: avoid expensive indirect calls for sync operations

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <96be2403-d0b1-7dbc-8278-4347f1a9347d@linux.intel.com>
Date:   Wed, 16 Nov 2022 12:10:34 +0800
From:   Ethan Zhao <haifeng.zhao@...ux.intel.com>
To:     Eric Dumazet <edumazet@...gle.com>, Joerg Roedel <joro@...tes.org>,
        Robin Murphy <robin.murphy@....com>,
        Will Deacon <will@...nel.org>
Cc:     linux-kernel <linux-kernel@...r.kernel.org>,
        netdev@...r.kernel.org, Eric Dumazet <eric.dumazet@...il.com>,
        iommu@...ts.linux.dev
Subject: Re: [PATCH v2 -next] iommu/dma: avoid expensive indirect calls for
 sync operations

Hi,

On 2022/11/16 2:28, Eric Dumazet wrote:
> Quite often, NIC devices do not need dma_sync operations
> on x86_64 at least.
>
> Indeed, when dev_is_dma_coherent(dev) is true and
> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> and friends do nothing.
>
> However, indirectly calling them when CONFIG_RETPOLINE=y
> consumes about 10% of cycles on a cpu receiving packets
> from softirq at ~100Gbit rate, as shown in [1]
>
> Even if/when CONFIG_RETPOLINE is not set, there
> is a cost of about 3%.
>
> This patch adds dev->skip_dma_sync boolean that can be opted-in.
>
> For instance iommu_setup_dma_ops() can set this boolean to true
> if CONFIG_DMA_API_DEBUG is not set, and dev_is_dma_coherent(dev).
>
> Then later, if/when swiotlb is used for the first time, the flag
> is turned off, from swiotlb_tbl_map_single()
>
> We might in the future inline again these helpers, like:
>
> static void inline
> dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
> 			size_t size, enum dma_data_direction dir)
> {
> 	if (!dev_skip_dma_sync(dev))
> 		__dma_sync_single_for_cpu(dev, addr, size, dir);
> }
>
> perf profile before the patch:
>
>      18.53%  [kernel]       [k] gq_rx_skb
>      14.77%  [kernel]       [k] napi_reuse_skb
>       8.95%  [kernel]       [k] skb_release_data
>       5.42%  [kernel]       [k] dev_gro_receive
>       5.37%  [kernel]       [k] memcpy
> <*>  5.26%  [kernel]       [k] iommu_dma_sync_sg_for_cpu
>       4.78%  [kernel]       [k] tcp_gro_receive
> <*>  4.42%  [kernel]       [k] iommu_dma_sync_sg_for_device
>       4.12%  [kernel]       [k] ipv6_gro_receive
>       3.65%  [kernel]       [k] gq_pool_get
>       3.25%  [kernel]       [k] skb_gro_receive
>       2.07%  [kernel]       [k] napi_gro_frags
>       1.98%  [kernel]       [k] tcp6_gro_receive
>       1.27%  [kernel]       [k] gq_rx_prep_buffers
>       1.18%  [kernel]       [k] gq_rx_napi_handler
>       0.99%  [kernel]       [k] csum_partial
>       0.74%  [kernel]       [k] csum_ipv6_magic
>       0.72%  [kernel]       [k] free_pcp_prepare
>       0.60%  [kernel]       [k] __napi_poll
>       0.58%  [kernel]       [k] net_rx_action
>       0.56%  [kernel]       [k] read_tsc
> <*>  0.50%  [kernel]       [k] __x86_indirect_thunk_r11
>       0.45%  [kernel]       [k] memset
>
> After patch, lines with <*> no longer show up, and overall
> cpu usage looks much better (~60% instead of ~72%)
>
>      25.56%  [kernel]       [k] gq_rx_skb
>       9.90%  [kernel]       [k] napi_reuse_skb
>       7.39%  [kernel]       [k] dev_gro_receive
>       6.78%  [kernel]       [k] memcpy
>       6.53%  [kernel]       [k] skb_release_data
>       6.39%  [kernel]       [k] tcp_gro_receive
>       5.71%  [kernel]       [k] ipv6_gro_receive
>       4.35%  [kernel]       [k] napi_gro_frags
>       4.34%  [kernel]       [k] skb_gro_receive
>       3.50%  [kernel]       [k] gq_pool_get
>       3.08%  [kernel]       [k] gq_rx_napi_handler
>       2.35%  [kernel]       [k] tcp6_gro_receive
>       2.06%  [kernel]       [k] gq_rx_prep_buffers
>       1.32%  [kernel]       [k] csum_partial
>       0.93%  [kernel]       [k] csum_ipv6_magic
>       0.65%  [kernel]       [k] net_rx_action
>
> Many thanks to Robin Murphy for his feedback and ideas to make this patch
> much better !
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> Cc: Robin Murphy <robin.murphy@....com>
> Cc: Joerg Roedel <joro@...tes.org>
> Cc: Will Deacon <will@...nel.org>
> Cc: iommu@...ts.linux.dev
> ---
>   drivers/iommu/dma-iommu.c   |  2 ++
>   include/linux/device.h      |  1 +
>   include/linux/dma-map-ops.h |  5 +++++
>   kernel/dma/mapping.c        | 20 ++++++++++++++++----
>   kernel/dma/swiotlb.c        |  3 +++
>   5 files changed, 27 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 9297b741f5e80e2408e864fc3f779410d6b04d49..bd3f4d3d646cc57c7588f22d49ea32ac693e38ff 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -1587,6 +1587,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
>   		if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
>   			goto out_err;
>   		dev->dma_ops = &iommu_dma_ops;
> +		if (!IS_ENABLED(CONFIG_DMA_API_DEBUG) && dev_is_dma_coherent(dev))
> +			dev->skip_dma_sync = true;
>   	}
>   
>   	return;
> diff --git a/include/linux/device.h b/include/linux/device.h
> index 424b55df02727b5742070f72374fd65f5dd68151..2fbb2cc18e44e21eba5f43557ee16d0dc92ef2ef 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -647,6 +647,7 @@ struct device {
>       defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
>   	bool			dma_coherent:1;
>   #endif
> +	bool			skip_dma_sync:1;
>   #ifdef CONFIG_DMA_OPS_BYPASS
>   	bool			dma_ops_bypass : 1;
>   #endif
> diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> index d678afeb8a13a3a54380a959d14f79bca9c23d8e..4691081f71c51da5468cf6703570ebc7a64d40c5 100644
> --- a/include/linux/dma-map-ops.h
> +++ b/include/linux/dma-map-ops.h
> @@ -275,6 +275,11 @@ static inline bool dev_is_dma_coherent(struct device *dev)
>   }
>   #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */
>   
> +static inline bool dev_skip_dma_sync(struct device *dev)
> +{
> +	return dev->skip_dma_sync;
> +}
> +
>   void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
>   		gfp_t gfp, unsigned long attrs);
>   void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> index 33437d6206445812b6d4d5b33c77235d18074dec..5d5d286ffae7fa6b7ff1aef46bdc59e7e31a8038 100644
> --- a/kernel/dma/mapping.c
> +++ b/kernel/dma/mapping.c
> @@ -328,9 +328,12 @@ EXPORT_SYMBOL(dma_unmap_resource);
>   void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
>   		enum dma_data_direction dir)
>   {
> -	const struct dma_map_ops *ops = get_dma_ops(dev);
> +	const struct dma_map_ops *ops;
>   
>   	BUG_ON(!valid_dma_direction(dir));
> +	if (dev_skip_dma_sync(dev))
> +		return;

May I know why this funciton that is called by all coherent or 
non-coherent dev

got a burden to decide to bail out or not ? Is it really the better 
point to check

it ?


Thanks,

Ethan

> +	ops = get_dma_ops(dev);;
>   	if (dma_map_direct(dev, ops))
>   		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
>   	else if (ops->sync_single_for_cpu)
> @@ -342,9 +345,12 @@ EXPORT_SYMBOL(dma_sync_single_for_cpu);
>   void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
>   		size_t size, enum dma_data_direction dir)
>   {
> -	const struct dma_map_ops *ops = get_dma_ops(dev);
> +	const struct dma_map_ops *ops;
>   
>   	BUG_ON(!valid_dma_direction(dir));
> +	if (dev_skip_dma_sync(dev))
> +		return;
> +	ops = get_dma_ops(dev);;
>   	if (dma_map_direct(dev, ops))
>   		dma_direct_sync_single_for_device(dev, addr, size, dir);
>   	else if (ops->sync_single_for_device)
> @@ -356,9 +362,12 @@ EXPORT_SYMBOL(dma_sync_single_for_device);
>   void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
>   		    int nelems, enum dma_data_direction dir)
>   {
> -	const struct dma_map_ops *ops = get_dma_ops(dev);
> +	const struct dma_map_ops *ops;
>   
>   	BUG_ON(!valid_dma_direction(dir));
> +	if (dev_skip_dma_sync(dev))
> +		return;
> +	ops = get_dma_ops(dev);;
>   	if (dma_map_direct(dev, ops))
>   		dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
>   	else if (ops->sync_sg_for_cpu)
> @@ -370,9 +379,12 @@ EXPORT_SYMBOL(dma_sync_sg_for_cpu);
>   void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
>   		       int nelems, enum dma_data_direction dir)
>   {
> -	const struct dma_map_ops *ops = get_dma_ops(dev);
> +	const struct dma_map_ops *ops;
>   
>   	BUG_ON(!valid_dma_direction(dir));
> +	if (dev_skip_dma_sync(dev))
> +		return;
> +	ops = get_dma_ops(dev);;
>   	if (dma_map_direct(dev, ops))
>   		dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
>   	else if (ops->sync_sg_for_device)
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index 339a990554e7fed98dd337efe4fb759a98161cdb..03ebd9803db1a457600f1fac8a18fb3dde724a6f 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -734,6 +734,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
>   	int index;
>   	phys_addr_t tlb_addr;
>   
> +	if (unlikely(dev->skip_dma_sync))
> +		dev->skip_dma_sync = false;
> +
>   	if (!mem || !mem->nslabs) {
>   		dev_warn_ratelimited(dev,
>   			"Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");