linux-kernel - Re: [PATCH 14/14] dmaengine: dma350: Support ARM DMA-250

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b2987edf-8e49-4b9f-93a5-45cbe08b975b@arm.com>
Date: Fri, 29 Aug 2025 23:24:37 +0100
From: Robin Murphy <robin.murphy@....com>
To: Jisheng Zhang <jszhang@...nel.org>, Vinod Koul <vkoul@...nel.org>,
 Rob Herring <robh@...nel.org>, Krzysztof Kozlowski <krzk+dt@...nel.org>,
 Conor Dooley <conor+dt@...nel.org>
Cc: dmaengine@...r.kernel.org, devicetree@...r.kernel.org,
 linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 14/14] dmaengine: dma350: Support ARM DMA-250

On 2025-08-23 4:40 pm, Jisheng Zhang wrote:
> Compared with ARM DMA-350, DMA-250 is a simplified version. They share
> many common parts, but they do have difference. Add DMA-250 support
> by handling their difference by using different device_prep_slave_sg,
> device_prep_dma_cyclic and device_prep_dma_memcpy. DMA-250 doesn't
> support device_prep_dma_memset.
> 
> Signed-off-by: Jisheng Zhang <jszhang@...nel.org>
> ---
>   drivers/dma/arm-dma350.c | 444 +++++++++++++++++++++++++++++++++++++--
>   1 file changed, 424 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/dma/arm-dma350.c b/drivers/dma/arm-dma350.c
> index 5abb965c6687..0ee807424b7e 100644
> --- a/drivers/dma/arm-dma350.c
> +++ b/drivers/dma/arm-dma350.c
> @@ -1,7 +1,7 @@
>   // SPDX-License-Identifier: GPL-2.0
>   // Copyright (C) 2024-2025 Arm Limited
>   // Copyright (C) 2025 Synaptics Incorporated
> -// Arm DMA-350 driver
> +// Arm DMA-350/DMA-250 driver

Yeah, that's going to get old fast... By all means update the Kconfig 
help text if you think it's helpful to end users, but I don't think 
anyone expects comments like this to be exhaustive, so honestly I'd save 
the churn.
>   #include <linux/bitfield.h>
>   #include <linux/dmaengine.h>
> @@ -16,6 +16,10 @@
>   #include "dmaengine.h"
>   #include "virt-dma.h"
>   
> +#define DMANSECCTRL		0x0200
> +
> +#define NSEC_CNTXBASE		0x10
> +
>   #define DMAINFO			0x0f00
>   
>   #define DMA_BUILDCFG0		0xb0
> @@ -26,12 +30,16 @@
>   #define DMA_BUILDCFG1		0xb4
>   #define DMA_CFG_NUM_TRIGGER_IN	GENMASK(8, 0)
>   
> +#define DMA_BUILDCFG2		0xb8
> +#define DMA_CFG_HAS_TZ		BIT(8)

I don't think we need to care about that. Yes, the TRM describes the 
total context memory size required from the PoV of the hardware itself, 
but even if SEC_CNTXBASE does exist, Non-Secure Linux can't set it, so 
clearly Linux can't need to provide memory for it.

> +
>   #define IIDR			0xc8
>   #define IIDR_PRODUCTID		GENMASK(31, 20)
>   #define IIDR_VARIANT		GENMASK(19, 16)
>   #define IIDR_REVISION		GENMASK(15, 12)
>   #define IIDR_IMPLEMENTER	GENMASK(11, 0)
>   
> +#define PRODUCTID_DMA250	0x250
>   #define PRODUCTID_DMA350	0x3a0
>   #define IMPLEMENTER_ARM		0x43b
>   
> @@ -140,6 +148,7 @@
>   #define CH_CFG_HAS_TRIGSEL	BIT(7)
>   #define CH_CFG_HAS_TRIGIN	BIT(5)
>   #define CH_CFG_HAS_WRAP		BIT(1)
> +#define CH_CFG_HAS_XSIZEHI	BIT(0)
>   
>   
>   #define LINK_REGCLEAR		BIT(0)
> @@ -218,6 +227,7 @@ struct d350_chan {
>   	bool cyclic;
>   	bool has_trig;
>   	bool has_wrap;
> +	bool has_xsizehi;
>   	bool coherent;
>   };
>   
> @@ -225,6 +235,10 @@ struct d350 {
>   	struct dma_device dma;
>   	int nchan;
>   	int nreq;
> +	bool is_d250;

That won't scale, but it also shouldn't be needed anyway - other than 
the context memory which is easily handled within the scope of the probe 
routine that already has the IIDR to hand, everything else ought to be 
based on the relevant feature flags.

> +	dma_addr_t cntx_mem_paddr;
> +	void *cntx_mem;
> +	u32 cntx_mem_size;
>   	struct d350_chan channels[] __counted_by(nchan);
>   };
>   
> @@ -238,6 +252,11 @@ static inline struct d350_desc *to_d350_desc(struct virt_dma_desc *vd)
>   	return container_of(vd, struct d350_desc, vd);
>   }
>   
> +static inline struct d350 *to_d350(struct dma_device *dd)
> +{
> +	return container_of(dd, struct d350, dma);
> +}
> +
>   static void d350_desc_free(struct virt_dma_desc *vd)
>   {
>   	struct d350_chan *dch = to_d350_chan(vd->tx.chan);
> @@ -585,6 +604,337 @@ static int d350_slave_config(struct dma_chan *chan, struct dma_slave_config *con
>   	return 0;
>   }
>   
> +static struct dma_async_tx_descriptor *d250_prep_memcpy(struct dma_chan *chan,
> +		dma_addr_t dest, dma_addr_t src, size_t len, unsigned long flags)

Case in point: We don't need a mess of separate copy-pasted functions, 
we just need to evolve the existing ones to split the respective 
operations into either 32-bit or 16-bit chunks depending on has_xsizehi 
- even on DMA-350, >32-bit sizes aren't properly supported since I never 
got as far as command linking, but there's no reason they shouldn't be.

> +{
> +	struct d350_chan *dch = to_d350_chan(chan);
> +	struct d350_desc *desc;
> +	u32 *cmd, *la_cmd, tsz;
> +	int sglen, i;
> +	struct d350_sg *sg;
> +	size_t xfer_len, step_max;
> +	dma_addr_t phys;
> +
> +	tsz = __ffs(len | dest | src | (1 << dch->tsz));
> +	step_max = ((1UL << 16) - 1) << tsz;
> +	sglen = DIV_ROUND_UP(len, step_max);
> +
> +	desc = kzalloc(struct_size(desc, sg, sglen), GFP_NOWAIT);
> +	if (!desc)
> +		return NULL;
> +
> +	desc->sglen = sglen;
> +	sglen = 0;
> +	while (len) {
> +		sg = &desc->sg[sglen];
> +		xfer_len = (len > step_max) ? step_max : len;

If only we had a min() function...

> +		sg->tsz = __ffs(xfer_len | dest | src | (1 << dch->tsz));

Um, what? By this point we've already decided to split based on the 
initial tsz, what purpose does recalculating it serve?

> +		sg->xsize = lower_16_bits(xfer_len >> sg->tsz);
> +
> +		sg->command = dma_pool_zalloc(dch->cmd_pool, GFP_NOWAIT, &phys);
> +		if (unlikely(!sg->command))
> +			goto err_cmd_alloc;
> +		sg->phys = phys;
> +
> +		cmd = sg->command;
> +		if (!sglen) {
> +			cmd[0] = LINK_CTRL | LINK_SRCADDR | LINK_DESADDR |
> +				 LINK_XSIZE | LINK_SRCTRANSCFG |
> +				 LINK_DESTRANSCFG | LINK_XADDRINC | LINK_LINKADDR;
> +
> +			cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, sg->tsz) |
> +				 FIELD_PREP(CH_CTRL_XTYPE, CH_CTRL_XTYPE_CONTINUE);
> +
> +			cmd[2] = lower_32_bits(src);
> +			cmd[3] = lower_32_bits(dest);
> +			cmd[4] = FIELD_PREP(CH_XY_SRC, sg->xsize) |
> +				 FIELD_PREP(CH_XY_DES, sg->xsize);
> +			cmd[5] = dch->coherent ? TRANSCFG_WB : TRANSCFG_NC;
> +			cmd[6] = dch->coherent ? TRANSCFG_WB : TRANSCFG_NC;
> +			cmd[7] = FIELD_PREP(CH_XY_SRC, 1) | FIELD_PREP(CH_XY_DES, 1);
> +			la_cmd = &cmd[8];
> +		} else {
> +			*la_cmd = phys | CH_LINKADDR_EN;
> +			if (len <= step_max) {
> +				cmd[0] = LINK_CTRL | LINK_XSIZE | LINK_LINKADDR;
> +				cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, sg->tsz) |
> +					 FIELD_PREP(CH_CTRL_XTYPE, CH_CTRL_XTYPE_CONTINUE);
> +				cmd[2] = FIELD_PREP(CH_XY_SRC, sg->xsize) |
> +					 FIELD_PREP(CH_XY_DES, sg->xsize);
> +				la_cmd = &cmd[3];
> +			} else {
> +				cmd[0] = LINK_XSIZE | LINK_LINKADDR;
> +				cmd[1] = FIELD_PREP(CH_XY_SRC, sg->xsize) |
> +					 FIELD_PREP(CH_XY_DES, sg->xsize);
> +				la_cmd = &cmd[2];
> +			}

Ok, we really need to figure out a better abstraction for command 
construction, the hard-coded array indices were a bad enough idea to 
start with, but this is almost impossible to make sense of.

> +		}
> +
> +		len -= xfer_len;
> +		src += xfer_len;
> +		dest += xfer_len;
> +		sglen++;
> +	}
> +
> +	/* the last cmdlink */
> +	*la_cmd = 0;
> +	desc->sg[sglen - 1].command[1] |= FIELD_PREP(CH_CTRL_DONETYPE, CH_CTRL_DONETYPE_CMD);

As for that, I don't even...

Furthermore, all these loops and conditionals are crazy anyway, and 
thoroughly failing to do justice to the hardware actually being pretty 
cool, namely that *commands can loop themselves*! Any single buffer/sg 
segment should take at most two commands - one dividing as much of the 
length as possible between XSIZE{HI} and CMDRESTARTCOUNT using 
REGRELOADTYPE=1, and/or one to transfer whatever non-multiple tail 
portion remains.

Honestly I'm sad the project for which I originally started this driver 
got canned, as this is the part I was really looking forward to having 
some fun with...

[...]
>   static int d350_pause(struct dma_chan *chan)
>   {
>   	struct d350_chan *dch = to_d350_chan(chan);
> @@ -620,20 +970,31 @@ static u32 d350_get_residue(struct d350_chan *dch)
>   	u32 res, xsize, xsizehi, linkaddr, linkaddrhi, hi_new;
>   	int i, sgcur, retries = 3; /* 1st time unlucky, 2nd improbable, 3rd just broken */
>   	struct d350_desc *desc = dch->desc;
> +	struct d350 *dmac = to_d350(dch->vc.chan.device);
>   
> -	hi_new = readl_relaxed(dch->base + CH_XSIZEHI);
> -	do {
> -		xsizehi = hi_new;
> -		xsize = readl_relaxed(dch->base + CH_XSIZE);
> +	if (dch->has_xsizehi) {
>   		hi_new = readl_relaxed(dch->base + CH_XSIZEHI);
> -	} while (xsizehi != hi_new && --retries);
> +		do {
> +			xsizehi = hi_new;
> +			xsize = readl_relaxed(dch->base + CH_XSIZE);
> +			hi_new = readl_relaxed(dch->base + CH_XSIZEHI);
> +		} while (xsizehi != hi_new && --retries);
> +	} else {
> +		xsize = readl_relaxed(dch->base + CH_XSIZE);
> +		xsizehi = 0;
> +	}

This is unnecessary - if the CH_XSIZEHI location isn't the actual 
register then it's RAZ/WI, which means the existing logic can take full 
advantage of it reading as zero and still work just the same.

> -	hi_new = readl_relaxed(dch->base + CH_LINKADDRHI);
> -	do {
> -		linkaddrhi = hi_new;
> -		linkaddr = readl_relaxed(dch->base + CH_LINKADDR);
> +	if (!dmac->is_d250) {

And similarly here. The only thing we should perhaps do specially for 
LINKADDRHI is omit it from command generation when ADDR_WIDTH <= 32 in 
general. I admit I was lazy there, since it's currently harmless for 
d350_start_next() to write the register location unconditionally, but 
I'm not sure how a 32-bit DMA-350 would handle it in an actual command 
link header.

>   		hi_new = readl_relaxed(dch->base + CH_LINKADDRHI);
> -	} while (linkaddrhi != hi_new && --retries);
> +		do {
> +			linkaddrhi = hi_new;
> +			linkaddr = readl_relaxed(dch->base + CH_LINKADDR);
> +			hi_new = readl_relaxed(dch->base + CH_LINKADDRHI);
> +		} while (linkaddrhi != hi_new && --retries);
> +	} else {
> +		linkaddr = readl_relaxed(dch->base + CH_LINKADDR);
> +		linkaddrhi = 0;
> +	}
>   
>   	for (i = 0; i < desc->sglen; i++) {
>   		if (desc->sg[i].phys == (((u64)linkaddrhi << 32) | (linkaddr & ~CH_LINKADDR_EN)))
> @@ -876,6 +1237,14 @@ static void d350_free_chan_resources(struct dma_chan *chan)
>   	dch->cmd_pool = NULL;
>   }
>   
> +static void d250_cntx_mem_release(void *ptr)
> +{
> +	struct d350 *dmac = ptr;
> +	struct device *dev = dmac->dma.dev;
> +
> +	dma_free_coherent(dev, dmac->cntx_mem_size, dmac->cntx_mem, dmac->cntx_mem_paddr);
> +}
> +
>   static int d350_probe(struct platform_device *pdev)
>   {
>   	struct device *dev = &pdev->dev;
> @@ -893,8 +1262,9 @@ static int d350_probe(struct platform_device *pdev)
>   	r = FIELD_GET(IIDR_VARIANT, reg);
>   	p = FIELD_GET(IIDR_REVISION, reg);
>   	if (FIELD_GET(IIDR_IMPLEMENTER, reg) != IMPLEMENTER_ARM ||
> -	    FIELD_GET(IIDR_PRODUCTID, reg) != PRODUCTID_DMA350)
> -		return dev_err_probe(dev, -ENODEV, "Not a DMA-350!");
> +	    ((FIELD_GET(IIDR_PRODUCTID, reg) != PRODUCTID_DMA350) &&
> +	    FIELD_GET(IIDR_PRODUCTID, reg) != PRODUCTID_DMA250))
> +		return dev_err_probe(dev, -ENODEV, "Not a DMA-350/DMA-250!");
>   
>   	reg = readl_relaxed(base + DMAINFO + DMA_BUILDCFG0);
>   	nchan = FIELD_GET(DMA_CFG_NUM_CHANNELS, reg) + 1;
> @@ -917,13 +1287,38 @@ static int d350_probe(struct platform_device *pdev)
>   		return ret;
>   	}
>   
> +	if (device_is_compatible(dev, "arm,dma-250")) {

If only we had a completely reliable product ID from the hardware itself...

> +		u32 cfg2;
> +		int secext_present;
> +
> +		dmac->is_d250 = true;
> +
> +		cfg2 = readl_relaxed(base + DMAINFO + DMA_BUILDCFG2);
> +		secext_present = (cfg2 & DMA_CFG_HAS_TZ) ? 1 : 0;
> +		dmac->cntx_mem_size = nchan * 64 * (1 + secext_present);

As before I think that's wrong.

> +		dmac->cntx_mem = dma_alloc_coherent(dev, dmac->cntx_mem_size,
> +						    &dmac->cntx_mem_paddr,
> +						    GFP_KERNEL);

This is too early, it needs to wait until after we've set the DMA mask. 
Also since this is purely private memory for the device, it may as well 
use DMA_ATTR_NO_KERNEL_MAPPING.

> +		if (!dmac->cntx_mem)
> +			return dev_err_probe(dev, -ENOMEM, "Failed to alloc context memory\n");

Just return -ENOMEM - dev_err_probe() adds nothing.

> +		ret = devm_add_action_or_reset(dev, d250_cntx_mem_release, dmac);
> +		if (ret) {
> +			dma_free_coherent(dev, dmac->cntx_mem_size,
> +					  dmac->cntx_mem, dmac->cntx_mem_paddr);

a) Understand that the mildly non-obvious "or reset" means it already 
calls the cleanup action on error, so this would be a double-free.

b) Don't reinvent dmam_alloc_*() in the first place though.

> +			return ret;
> +		}
> +		writel_relaxed(dmac->cntx_mem_paddr, base + DMANSECCTRL + NSEC_CNTXBASE);

Perhaps we should check that this hasn't already been set up first? I 
mean, we can't necessarily even be sure the context memory interface can 
access the same address space as the DMA transfer interface at all; the 
design intent is at least partly to allow connecting a dedicated SRAM 
directly, see figure 1 here: 
https://developer.arm.com/documentation/108001/0000/DMAC-interfaces/AHB5-manager-interfaces/Separate-AHB5-ports-for-data-and-virtual-channel-context?lang=en

However I'm not sure how feasible that is to detect from software - the 
base register alone clearly isn't foolproof since 0 could be a valid 
address (especially in a private SRAM). At worst I suppose we might end 
up needing a DMA-250-specific DT property to say whether it does or 
doesn't need context memory from the OS...

> +	}
> +
>   	dma_set_mask_and_coherent(dev, DMA_BIT_MASK(aw));
>   	coherent = device_get_dma_attr(dev) == DEV_DMA_COHERENT;
>   
>   	reg = readl_relaxed(base + DMAINFO + DMA_BUILDCFG1);
>   	dmac->nreq = FIELD_GET(DMA_CFG_NUM_TRIGGER_IN, reg);
>   
> -	dev_dbg(dev, "DMA-350 r%dp%d with %d channels, %d requests\n", r, p, dmac->nchan, dmac->nreq);
> +	dev_info(dev, "%s r%dp%d with %d channels, %d requests\n",
> +		 dmac->is_d250 ? "DMA-250" : "DMA-350", r, p, dmac->nchan, dmac->nreq);

As Krzysztof said, this is a debug message and it's staying a debug 
message. And just replace "DMA-350" with "ProductID 0x%x" - it's only 
meant as a sanity-check that we're looking at the hardware we expect to 
be looking at.
>   	for (int i = min(dw, 16); i > 0; i /= 2) {
>   		dmac->dma.src_addr_widths |= BIT(i);
> @@ -935,7 +1330,10 @@ static int d350_probe(struct platform_device *pdev)
>   	dmac->dma.device_alloc_chan_resources = d350_alloc_chan_resources;
>   	dmac->dma.device_free_chan_resources = d350_free_chan_resources;
>   	dma_cap_set(DMA_MEMCPY, dmac->dma.cap_mask);
> -	dmac->dma.device_prep_dma_memcpy = d350_prep_memcpy;
> +	if (dmac->is_d250)
> +		dmac->dma.device_prep_dma_memcpy = d250_prep_memcpy;
> +	else
> +		dmac->dma.device_prep_dma_memcpy = d350_prep_memcpy;
>   	dmac->dma.device_pause = d350_pause;
>   	dmac->dma.device_resume = d350_resume;
>   	dmac->dma.device_terminate_all = d350_terminate_all;
> @@ -971,8 +1369,8 @@ static int d350_probe(struct platform_device *pdev)
>   			return dch->irq;
>   
>   		dch->has_wrap = FIELD_GET(CH_CFG_HAS_WRAP, reg);
> -		dch->has_trig = FIELD_GET(CH_CFG_HAS_TRIGIN, reg) &
> -				FIELD_GET(CH_CFG_HAS_TRIGSEL, reg);

Not only is this in the wrong patch, it's the wrong change to make 
anyway. If you're only adding support for fixed triggers, you need to 
explicitly *exclude* selectable triggers from that, because they work 
differently.

Thanks,
Robin.

> +		dch->has_xsizehi = FIELD_GET(CH_CFG_HAS_XSIZEHI, reg);
> +		dch->has_trig = FIELD_GET(CH_CFG_HAS_TRIGIN, reg);
>   
>   		/* Fill is a special case of Wrap */
>   		memset &= dch->has_wrap;
> @@ -994,8 +1392,13 @@ static int d350_probe(struct platform_device *pdev)
>   		dma_cap_set(DMA_SLAVE, dmac->dma.cap_mask);
>   		dma_cap_set(DMA_CYCLIC, dmac->dma.cap_mask);
>   		dmac->dma.device_config = d350_slave_config;
> -		dmac->dma.device_prep_slave_sg = d350_prep_slave_sg;
> -		dmac->dma.device_prep_dma_cyclic = d350_prep_cyclic;
> +		if (dmac->is_d250) {
> +			dmac->dma.device_prep_slave_sg = d250_prep_slave_sg;
> +			dmac->dma.device_prep_dma_cyclic = d250_prep_cyclic;
> +		} else {
> +			dmac->dma.device_prep_slave_sg = d350_prep_slave_sg;
> +			dmac->dma.device_prep_dma_cyclic = d350_prep_cyclic;
> +		}
>   	}
>   
>   	if (memset) {
> @@ -1019,6 +1422,7 @@ static void d350_remove(struct platform_device *pdev)
>   
>   static const struct of_device_id d350_of_match[] __maybe_unused = {
>   	{ .compatible = "arm,dma-350" },
> +	{ .compatible = "arm,dma-250" },
>   	{}
>   };
>   MODULE_DEVICE_TABLE(of, d350_of_match);
> @@ -1035,5 +1439,5 @@ module_platform_driver(d350_driver);
>   
>   MODULE_AUTHOR("Robin Murphy <robin.murphy@....com>");
>   MODULE_AUTHOR("Jisheng Zhang <jszhang@...nel.org>");
> -MODULE_DESCRIPTION("Arm DMA-350 driver");
> +MODULE_DESCRIPTION("Arm DMA-350/DMA-250 driver");
>   MODULE_LICENSE("GPL v2");