Message-Id: <1597736591-20457-1-git-send-email-pullip.cho@samsung.com>
Date:   Tue, 18 Aug 2020 16:43:10 +0900
From:   Cho KyongHo <pullip.cho@...sung.com>
To:     joro@...tes.org, catalin.marinas@....com, will@...nel.org
Cc:     iommu@...ts.linux-foundation.org, linux-kernel@...r.kernel.org,
        linux-arm-kernel@...ts.infradead.org, m.szyprowski@...sung.com,
        robin.murphy@....com, janghyuck.kim@...sung.com,
        hyesoo.yu@...sung.com, Cho KyongHo <pullip.cho@...sung.com>
Subject: [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync

On most CPU architectures, cache maintenance operations must be
followed by a memory barrier so that DMA devices observe the affected
memory region correctly. The problem is that the memory barrier is
very expensive, and dma_[un]map_sg() and dma_sync_sg_for_{device|cpu}()
issue one barrier for every single sg entry. On some CPU
micro-architectures, a single memory barrier takes more time than a
cache clean of 4KiB, and the overhead grows with the number of CPU
cores.

This patch introduces arch_sync_dma_for_device_relaxed() and
arch_sync_dma_for_cpu_relaxed(), which omit the memory barrier.
Callers of these functions must therefore explicitly call
arch_sync_barrier_for_device() or arch_sync_barrier_for_cpu(),
respectively, to ensure that the view of memory is consistent between
the CPUs and the DMA devices.
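
For example, a user syncing a scatterlist for the device then needs
only a single barrier after all of the per-entry cache maintenance,
instead of one barrier per entry (a minimal sketch of the intended
calling convention, mirroring the dma-iommu change below; sgl, nelems
and dir are illustrative parameters):

	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nelems, i)
		arch_sync_dma_for_device_relaxed(sg_phys(sg), sg->length, dir);
	arch_sync_barrier_for_device(dir);	/* one barrier for the whole list */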

Signed-off-by: Cho KyongHo <pullip.cho@...sung.com>
---
 drivers/iommu/dma-iommu.c       |  6 +++--
 include/linux/dma-direct.h      | 29 +++++++++++++++++-----
 include/linux/dma-noncoherent.h | 54 +++++++++++++++++++++++++++++++++++++++++
 kernel/dma/Kconfig              |  8 ++++++
 kernel/dma/direct.c             | 25 +++++++++++++++----
 5 files changed, 109 insertions(+), 13 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 5141d49..4f9c9cb 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -705,7 +705,8 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
 		return;
 
 	for_each_sg(sgl, sg, nelems, i)
-		arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
+		arch_sync_dma_for_cpu_relaxed(sg_phys(sg), sg->length, dir);
+	arch_sync_barrier_for_cpu(dir);
 }
 
 static void iommu_dma_sync_sg_for_device(struct device *dev,
@@ -719,7 +720,8 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
 		return;
 
 	for_each_sg(sgl, sg, nelems, i)
-		arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
+		arch_sync_dma_for_device_relaxed(sg_phys(sg), sg->length, dir);
+	arch_sync_barrier_for_device(dir);
 }
 
 static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 6e87225..f5b1fee 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -152,7 +152,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
 		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
 }
 
-static inline dma_addr_t dma_direct_map_page(struct device *dev,
+static inline dma_addr_t __dma_direct_map_page(struct device *dev,
 		struct page *page, unsigned long offset, size_t size,
 		enum dma_data_direction dir, unsigned long attrs)
 {
@@ -172,20 +172,37 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 		return DMA_MAPPING_ERROR;
 	}
 
-	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		arch_sync_dma_for_device(phys, size, dir);
 	return dma_addr;
 }
 
-static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+static inline dma_addr_t dma_direct_map_page(struct device *dev,
+		struct page *page, unsigned long offset, size_t size,
+		enum dma_data_direction dir, unsigned long attrs)
+{
+	dma_addr_t dma_addr = __dma_direct_map_page(dev, page, offset, size, dir, attrs);
+
+	if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		arch_sync_dma_for_device(page_to_phys(page) + offset, size, dir);
+
+	return dma_addr;
+}
+
+static inline void __dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 	phys_addr_t phys = dma_to_phys(dev, addr);
 
+	if (unlikely(is_swiotlb_buffer(phys)))
+		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+}
+
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 
-	if (unlikely(is_swiotlb_buffer(phys)))
-		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+	__dma_direct_unmap_page(dev, addr, size, dir, attrs);
 }
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index ca09a4e..0a31e6c 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -73,23 +73,77 @@ static inline void arch_dma_cache_sync(struct device *dev, void *vaddr,
 #endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */
 
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED
+void arch_sync_dma_for_device_relaxed(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir);
+
+static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+	arch_sync_dma_for_device_relaxed(paddr, size, dir);
+	arch_sync_barrier_for_device(dir);
+}
+#else
+#define arch_sync_dma_for_device_relaxed arch_sync_dma_for_device
+
 void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir);
+
+static inline void arch_sync_barrier_for_device(enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED */
 #else
+static inline void arch_sync_dma_for_device_relaxed(phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
+{
+}
+
 static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir)
 {
 }
+
+static inline void arch_sync_barrier_for_device(enum dma_data_direction dir)
+{
+}
 #endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE */
 
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED
+void arch_sync_dma_for_cpu_relaxed(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir);
+
+static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+	arch_sync_dma_for_cpu_relaxed(paddr, size, dir);
+	arch_sync_barrier_for_cpu(dir);
+}
+#else
+#define arch_sync_dma_for_cpu_relaxed arch_sync_dma_for_cpu
+
 void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir);
+
+static inline void arch_sync_barrier_for_cpu(enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED */
 #else
+static inline void arch_sync_dma_for_cpu_relaxed(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+}
+
 static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir)
 {
 }
+
+static inline void arch_sync_barrier_for_cpu(enum dma_data_direction dir)
+{
+}
 #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
 
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 847a9d1..d6fe727f1 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -59,6 +59,14 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU
 	bool
 	select NEED_DMA_MAP_STATE
 
+config ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED
+	bool
+	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
+
+config ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED
+	bool
+	select ARCH_HAS_SYNC_DMA_FOR_CPU
+
 config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
 	bool
 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index db6ef07a..52e5fd1 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -321,9 +321,12 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 					dir, SYNC_FOR_DEVICE);
 
 		if (!dev_is_dma_coherent(dev))
-			arch_sync_dma_for_device(paddr, sg->length,
+			arch_sync_dma_for_device_relaxed(paddr, sg->length,
 					dir);
 	}
+
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_barrier_for_device(dir);
 }
 #endif
 
@@ -340,15 +343,17 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
 
 		if (!dev_is_dma_coherent(dev))
-			arch_sync_dma_for_cpu(paddr, sg->length, dir);
+			arch_sync_dma_for_cpu_relaxed(paddr, sg->length, dir);
 
 		if (unlikely(is_swiotlb_buffer(paddr)))
 			swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
 					SYNC_FOR_CPU);
 	}
 
-	if (!dev_is_dma_coherent(dev))
+	if (!dev_is_dma_coherent(dev)) {
+		arch_sync_barrier_for_cpu(dir);
 		arch_sync_dma_for_cpu_all();
+	}
 }
 
 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
@@ -357,8 +362,11 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 	struct scatterlist *sg;
 	int i;
 
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir);
+
 	for_each_sg(sgl, sg, nents, i)
-		dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
+		__dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
 			     attrs);
 }
 #endif
@@ -370,13 +378,20 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 	struct scatterlist *sg;
 
 	for_each_sg(sgl, sg, nents, i) {
-		sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
+		sg->dma_address = __dma_direct_map_page(dev, sg_page(sg),
 				sg->offset, sg->length, dir, attrs);
 		if (sg->dma_address == DMA_MAPPING_ERROR)
 			goto out_unmap;
 		sg_dma_len(sg) = sg->length;
 	}
 
+	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+		for_each_sg(sgl, sg, nents, i)
+			arch_sync_dma_for_device_relaxed(dma_to_phys(dev, sg_dma_address(sg)),
+							 sg->length, dir);
+		arch_sync_barrier_for_device(dir);
+	}
+
 	return nents;
 
 out_unmap:
-- 
2.7.4
