Message-Id: <20251117211238.50126-1-21cnbao@gmail.com>
Date: Tue, 18 Nov 2025 05:12:38 +0800
From: Barry Song <21cnbao@...il.com>
To: catalin.marinas@....com
Cc: 21cnbao@...il.com,
ada.coupriediaz@....com,
anshuman.khandual@....com,
ardb@...nel.org,
iommu@...ts.linux.dev,
linux-arm-kernel@...ts.infradead.org,
linux-kernel@...r.kernel.org,
m.szyprowski@...sung.com,
maz@...nel.org,
robin.murphy@....com,
ryan.roberts@....com,
surenb@...gle.com,
v-songbaohua@...o.com,
will@...nel.org,
zhengtangquan@...o.com
Subject: Re: [RFC PATCH 5/5] dma-mapping: Allow batched DMA sync operations if supported by the arch
On Fri, Nov 14, 2025 at 2:19 AM Catalin Marinas <catalin.marinas@....com> wrote:
>
> On Wed, Oct 29, 2025 at 10:31:15AM +0800, Barry Song wrote:
> > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> > index 1f9ee9759426..a0b45f84a91f 100644
> > --- a/kernel/dma/direct.c
> > +++ b/kernel/dma/direct.c
> > @@ -403,9 +403,16 @@ void dma_direct_sync_sg_for_device(struct device *dev,
> >  			swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
> >
> >  		if (!dev_is_dma_coherent(dev))
> > -			arch_sync_dma_for_device(paddr, sg->length,
> > -					dir);
> > +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
> > +			arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
> > +#else
> > +			arch_sync_dma_for_device(paddr, sg->length, dir);
> > +#endif
> >  	}
> > +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
> > +	if (!dev_is_dma_coherent(dev))
> > +		arch_sync_dma_batch_flush();
> > +#endif
> >  }
> >  #endif
>
> Just a high-level comment for now. I'm not opposed to the idea of
> batching the DSB barriers, we do this for ptes. However, the way it's
> implemented in the generic files, with lots of #ifdefs, makes the code
> pretty unreadable.

Thanks, Catalin. I agree we need the batching, as phones and embedded
systems often use many DMA buffers while some of their chips lack DMA
coherence.

> Can we have something like arch_sync_dma_begin/end() and let the arch
> code handle the barriers as they see fit?

I guess I can refactor it as below, which removes the #ifdef/#else/#endif
blocks from the generic code:

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 8fcd0a9c1f39..73bca4d7149d 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -373,6 +373,20 @@ void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
 void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir);
 void arch_sync_dma_batch_flush(void);
+#else
+static inline void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+	arch_sync_dma_for_device(paddr, size, dir);
+}
+static inline void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+	arch_sync_dma_for_cpu(paddr, size, dir);
+}
+static inline void arch_sync_dma_batch_flush(void)
+{
+}
 #endif

 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index a0b45f84a91f..69b14b0c0501 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,16 +403,10 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 			swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);

 		if (!dev_is_dma_coherent(dev))
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 			arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
-#else
-			arch_sync_dma_for_device(paddr, sg->length, dir);
-#endif
 	}
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_batch_flush();
-#endif
 }
 #endif

@@ -429,11 +423,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));

 		if (!dev_is_dma_coherent(dev))
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 			arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
-#else
-			arch_sync_dma_for_cpu(paddr, sg->length, dir);
-#endif

 		swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);

@@ -443,9 +433,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,

 	if (!dev_is_dma_coherent(dev)) {
 		arch_sync_dma_for_cpu_all();
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 		arch_sync_dma_batch_flush();
-#endif
 	}
 }

@@ -458,29 +446,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		int nents, enum dma_data_direction dir, unsigned long attrs)
 {
 	struct scatterlist *sg;
 	int i;
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	bool need_sync = false;
-#endif

 	for_each_sg(sgl, sg, nents, i) {
 		if (sg_dma_is_bus_address(sg)) {
 			sg_dma_unmark_bus_address(sg);
 		} else {
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 			need_sync = true;
 			dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
 					sg_dma_len(sg), dir, attrs);
-
-#else
-			dma_direct_unmap_phys(dev, sg->dma_address,
-					sg_dma_len(sg), dir, attrs);
-#endif
 		}
 	}
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	if (need_sync && !dev_is_dma_coherent(dev))
 		arch_sync_dma_batch_flush();
-#endif
 }
 #endif

@@ -490,9 +468,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 	struct pci_p2pdma_map_state p2pdma_state = {};
 	struct scatterlist *sg;
 	int i, ret;
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	bool need_sync = false;
-#endif

 	for_each_sg(sgl, sg, nents, i) {
 		switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -504,14 +480,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 			 */
 			break;
 		case PCI_P2PDMA_MAP_NONE:
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 			need_sync = true;
 			sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
 					sg->length, dir, attrs);
-#else
-			sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
-					sg->length, dir, attrs);
-#endif
 			if (sg->dma_address == DMA_MAPPING_ERROR) {
 				ret = -EIO;
 				goto out_unmap;
@@ -529,10 +500,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		sg_dma_len(sg) = sg->length;
 	}
-#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
 	if (need_sync && !dev_is_dma_coherent(dev))
 		arch_sync_dma_batch_flush();
-#endif

 	return nents;

 out_unmap:
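
The arm64 side could then look roughly like the sketch below. This is
illustration only, not necessarily what the earlier patches in this series
do; dcache_clean_poc_nodsb()/dcache_inval_poc_nodsb() are hypothetical
variants of the existing dcache_clean_poc()/dcache_inval_poc() helpers
that skip the trailing DSB:

	/*
	 * Sketch: do the cache maintenance per buffer without a barrier,
	 * then publish everything with a single DSB in the flush.
	 */
	void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
			enum dma_data_direction dir)
	{
		unsigned long start = (unsigned long)phys_to_virt(paddr);

		/* clean to PoC, but without the per-call DSB */
		dcache_clean_poc_nodsb(start, start + size);
	}

	void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
			enum dma_data_direction dir)
	{
		unsigned long start = (unsigned long)phys_to_virt(paddr);

		if (dir != DMA_TO_DEVICE)
			dcache_inval_poc_nodsb(start, start + size);
	}

	void arch_sync_dma_batch_flush(void)
	{
		/* one barrier covers all the maintenance batched above */
		dsb(sy);
	}

That would also line up with your arch_sync_dma_begin/end() idea: the
generic code only marks the batch boundaries, and the arch decides where
the barriers go.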
Thanks
Barry