Message-ID:
<VI2PR04MB11147F814915ECCB6AE36F0F9E8DEA@VI2PR04MB11147.eurprd04.prod.outlook.com>
Date: Wed, 26 Nov 2025 07:42:36 +0000
From: Carlos Song <carlos.song@....com>
To: Marc Kleine-Budde <mkl@...gutronix.de>
CC: "broonie@...nel.org" <broonie@...nel.org>, Frank Li <frank.li@....com>,
"hawnguo@...nel.org" <hawnguo@...nel.org>, "s.hauer@...gutronix.de"
<s.hauer@...gutronix.de>, "kernel@...gutronix.de" <kernel@...gutronix.de>,
"festevam@...il.com" <festevam@...il.com>, "imx@...ts.linux.dev"
<imx@...ts.linux.dev>, "linux-kernel@...r.kernel.org"
<linux-kernel@...r.kernel.org>, "linux-arm-kernel@...ts.infradead.org"
<linux-arm-kernel@...ts.infradead.org>, "linux-spi@...r.kernel.org"
<linux-spi@...r.kernel.org>
Subject: Re: [PATCH 5/6] spi: imx: support dynamic burst length for ECSPI DMA
mode
> -----Original Message-----
> From: Marc Kleine-Budde <mkl@...gutronix.de>
> Sent: Tuesday, November 25, 2025 8:10 PM
> To: Carlos Song <carlos.song@....com>
> Cc: broonie@...nel.org; Frank Li <frank.li@....com>; hawnguo@...nel.org;
> s.hauer@...gutronix.de; kernel@...gutronix.de; festevam@...il.com;
> imx@...ts.linux.dev; linux-kernel@...r.kernel.org;
> linux-arm-kernel@...ts.infradead.org; linux-spi@...r.kernel.org
> Subject: [EXT] Re: [PATCH 5/6] spi: imx: support dynamic burst length for ECSPI
> DMA mode
>
> On 25.11.2025 18:06:17, Carlos Song wrote:
> > ECSPI transfers only one word per frame in DMA mode, causing SCLK
> > stalls between words due to BURST_LENGTH updates, which significantly
> > impacts performance.
> >
> > To improve throughput, configure BURST_LENGTH as large as possible (up
> > to
> > 512 bytes per frame) instead of word length. This avoids delays
> > between words. When transfer length is not 4-byte aligned, use bounce
> > buffers to align data for DMA. TX uses aligned words for TXFIFO, while
> > RX trims DMA buffer data after transfer completion.
> >
> > Introduce a new dma_package structure to store:
> > 1. BURST_LENGTH values for each DMA request
> > 2. Variables for DMA submission
> > 3. DMA transmission length and actual data length
> >
> > Handle three cases:
> > - len <= 512 bytes: one package, BURST_LENGTH = len * 8 - 1
> > - len > 512 and aligned: one package, BURST_LENGTH = max (512 bytes)
> > - len > 512 and unaligned: two packages, second for tail data
> >
> > Performance test (spidev_test @10MHz, 4KB):
> > Before: tx/rx ~6651.9 kbps
> > After: tx/rx ~9922.2 kbps (~50% improvement)
> >
> > For compatibility with slow SPI devices, add configurable word delay
> > in DMA mode. When word delay is set, dynamic burst is disabled and
> > BURST_LENGTH equals word length.
> >
> > Signed-off-by: Carlos Song <carlos.song@....com>
> > ---
> >  drivers/spi/spi-imx.c | 409 ++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 373 insertions(+), 36 deletions(-)
> >
> > diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
> > index 42f64d9535c9..f02a47fbba8a 100644
> > --- a/drivers/spi/spi-imx.c
> > +++ b/drivers/spi/spi-imx.c
> > @@ -60,6 +60,7 @@ MODULE_PARM_DESC(polling_limit_us,
> > #define MX51_ECSPI_CTRL_MAX_BURST 512
> > /* The maximum bytes that IMX53_ECSPI can transfer in target mode.*/
> > #define MX53_MAX_TRANSFER_BYTES 512
> > +#define BYTES_PER_32BITS_WORD 4
> >
> > enum spi_imx_devtype {
> > IMX1_CSPI,
> > @@ -95,6 +96,16 @@ struct spi_imx_devtype_data {
> > enum spi_imx_devtype devtype;
> > };
> >
> > +struct dma_data_package {
> > + u32 cmd_word;
> > + void *dma_rx_buf;
> > + void *dma_tx_buf;
> > + dma_addr_t dma_tx_addr;
> > + dma_addr_t dma_rx_addr;
>
Hi Marc,
Thank you very much for your ack!
> Better use uniform indention: here one space, not one tab.
>
Will do this in V2
> > + int dma_len;
> > + int data_len;
> > +};
> > +
> > struct spi_imx_data {
> > struct spi_controller *controller;
> > struct device *dev;
> > @@ -130,6 +141,9 @@ struct spi_imx_data {
> > u32 wml;
> > struct completion dma_rx_completion;
> > struct completion dma_tx_completion;
> > + struct dma_data_package *dma_data;
>
> please add __counted_by(dma_package_num)
>
Will do this in V2
> > + int dma_package_num;
> > + int rx_offset;
> >
> >  	const struct spi_imx_devtype_data *devtype_data;
> > };
> >
> > @@ -189,6 +203,9 @@ MXC_SPI_BUF_TX(u16)
> > MXC_SPI_BUF_RX(u32)
> > MXC_SPI_BUF_TX(u32)
> >
> > +/* Align to cache line to avoid swiotlb bounce */
> > +#define DMA_CACHE_ALIGNED_LEN(x)	ALIGN((x), dma_get_cache_alignment())
> > +
> > /* First entry is reserved, second entry is valid only if SDHC_SPIEN is set
> > * (which is currently not the case in this driver)
> > */
> > @@ -253,6 +270,14 @@ static bool spi_imx_can_dma(struct spi_controller
> *controller, struct spi_device
> > if (transfer->len < spi_imx->devtype_data->fifo_size)
> > return false;
> >
> > + /* DMA only can transmit data in bytes */
> > + if (spi_imx->bits_per_word != 8 && spi_imx->bits_per_word != 16 &&
> > + spi_imx->bits_per_word != 32)
> > + return false;
> > +
> > + if (transfer->len >= MAX_SDMA_BD_BYTES)
> > + return false;
> > +
> > spi_imx->dynamic_burst = 0;
> >
> > return true;
> > @@ -1398,8 +1423,6 @@ static int spi_imx_sdma_init(struct device *dev,
> > struct spi_imx_data *spi_imx,
> >
> > init_completion(&spi_imx->dma_rx_completion);
> > init_completion(&spi_imx->dma_tx_completion);
> > - controller->can_dma = spi_imx_can_dma;
> > - controller->max_dma_len = MAX_SDMA_BD_BYTES;
> > spi_imx->controller->flags = SPI_CONTROLLER_MUST_RX |
> > SPI_CONTROLLER_MUST_TX;
> >
> > @@ -1437,10 +1460,252 @@ static int spi_imx_calculate_timeout(struct
> spi_imx_data *spi_imx, int size)
> >  	return secs_to_jiffies(2 * timeout);
> > }
> >
> > +static void spi_imx_dma_unmap(struct spi_imx_data *spi_imx,
> > +			       struct dma_data_package *dma_data)
> > +{
> > + struct device *tx_dev = spi_imx->controller->dma_tx->device->dev;
> > + struct device *rx_dev = spi_imx->controller->dma_rx->device->dev;
> > +
> > + dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
> > + DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> > + DMA_TO_DEVICE);
> > + dma_unmap_single(rx_dev, dma_data->dma_rx_addr,
> > + DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> > + DMA_FROM_DEVICE);
> > +}
> > +
> > +static void spi_imx_dma_rx_data_handle(struct spi_imx_data *spi_imx,
> > + struct dma_data_package *dma_data, void
> *rx_buf,
> > + bool word_delay)
> > +{
> > +#ifdef __LITTLE_ENDIAN
> > + unsigned int bytes_per_word =
> spi_imx_bytes_per_word(spi_imx->bits_per_word);
> > + u32 *temp = dma_data->dma_rx_buf;
>
> can you move this into the scope of the if() below?
> > +#endif
> > + void *copy_ptr;
> > + int unaligned;
> > +
> > +#ifdef __LITTLE_ENDIAN
> > + /*
> > + * On little-endian CPUs, adjust byte order:
> > + * - Swap bytes when bpw = 8
> > + * - Swap half-words when bpw = 16
> > + * This ensures correct data ordering for DMA transfers.
> > + */
> > + if (!word_delay) {
> > +		for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {
>
> sizeof(*temp) instead of BYTES_PER_32BITS_WORD?
>
Looks good, I will try this in V2.
> > + if (bytes_per_word == 1)
> > + swab32s(temp + i);
> > + else if (bytes_per_word == 2)
> > + swahw32s(temp + i);
>
> Why do you do first a swap in place and then a memcpy()
>
When dynamic burst is enabled, DMA always copies data with a bus width of 4 bytes.
The CPU is little endian, so the byte order in dma_buf is also little endian.
But for bytes_per_word = 1, the bytes must keep the same order as memory, i.e. effectively big endian, so every byte within each 32-bit word is swapped.
For the same reason, with bytes_per_word = 2 the half-words are swapped.
bytes_per_word = 4 needs no handling at all.
(The SPI spec does not dictate a byte order, so the SPI byte order follows the CPU byte order, as it still does here.)
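To make that concrete, here is a small userspace sketch of the same byte-order
fixup; swab32s()/swahw32s() below are local stand-ins for the kernel helpers of
the same names, and the buffer contents are made up for illustration:

    #include <stdint.h>
    #include <stdio.h>

    /* byte swap in place: AABBCCDD -> DDCCBBAA (kernel: swab32s) */
    static void swab32s(uint32_t *p)
    {
            *p = ((*p & 0x000000ffu) << 24) | ((*p & 0x0000ff00u) << 8) |
                 ((*p & 0x00ff0000u) >> 8)  | ((*p & 0xff000000u) >> 24);
    }

    /* half-word swap in place: AABBCCDD -> CCDDAABB (kernel: swahw32s) */
    static void swahw32s(uint32_t *p)
    {
            *p = (*p << 16) | (*p >> 16);
    }

    int main(void)
    {
            /* 8 TX bytes 01..08 as a little-endian CPU stores them */
            uint32_t buf[2] = { 0x04030201u, 0x08070605u };
            unsigned int bytes_per_word = 1;        /* bpw = 8 */

            for (int i = 0; i < 2; i++) {
                    if (bytes_per_word == 1)
                            swab32s(&buf[i]);       /* bpw 8: swap bytes */
                    else if (bytes_per_word == 2)
                            swahw32s(&buf[i]);      /* bpw 16: swap half-words */
                    /* bytes_per_word == 4: nothing to do */
            }
            /* prints 01020304 05060708: bytes now shift out in memory order */
            printf("%08x %08x\n", buf[0], buf[1]);
            return 0;
    }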
> > + }
> > + }
> > +#endif
> > +
> > + if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
>
> I think this deserves a comment, why you do a re-alignment of the data here.
>
Yes, I can add a comment here.
In fact it is not re-alignment, it trims the data.
When dynamic burst is enabled, DMA always copies data with a bus width of 4 bytes,
so DMA always fetches 4 bytes at a time from the RXFIFO. But if the data length is not 4-byte aligned,
the data in the DMA bounce buffer contains extra garbage bytes, so it needs to be trimmed before the memcpy to the xfer buffer.
Why the first word?
That comes from the HW behavior: when dynamic burst is enabled, BURST_LENGTH is set to match the actual data length,
which keeps the bit count correct.
As the RM shows:
"
In master mode, it controls the number of bits per SPI burst. Since the shift register always loads 32-bit
data from transmit FIFO, only the n least-significant (n = BURST LENGTH + 1) will be shifted out. The
remaining bits will be ignored.
Number of Valid Bits in a SPI burst.
0x000 A SPI burst contains the 1 LSB in a word.
0x001 A SPI burst contains the 2 LSB in a word.
0x002 A SPI burst contains the 3 LSB in a word.
...
0x01F A SPI burst contains all 32 bits in a word.
0x020 A SPI burst contains the 1 LSB in first word and all 32 bits in second word.
0x021 A SPI burst contains the 2 LSB in first word and all 32 bits in second word.
"
When the data length is not 4-byte aligned, the first word always includes some garbage bytes (e.g. when transferring 7 bytes, the first word contains one garbage byte and 3 valid bytes, and the second word contains four valid bytes).
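A minimal userspace sketch of that trim for the 7-byte example (buffer contents
are invented; only the offset math mirrors the patch):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
            /* bounce buffer: two words from the RXFIFO, 'X' marks the garbage byte */
            uint8_t dma_rx_buf[8] = { 'X', 'a', 'b', 'c', 'd', 'e', 'f', 'g' };
            uint8_t rx_buf[7];
            int data_len = 7;
            int unaligned = data_len % 4;                   /* 3 valid bytes in word 0 */
            uint8_t *copy_ptr = dma_rx_buf + 4 - unaligned; /* skip 1 garbage byte */

            memcpy(rx_buf, copy_ptr, data_len);
            printf("%.7s\n", rx_buf);                       /* abcdefg */
            return 0;
    }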
> > + unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
> > + copy_ptr = (u8 *)dma_data->dma_rx_buf +
> BYTES_PER_32BITS_WORD - unaligned;
> > + } else {
> > + copy_ptr = dma_data->dma_rx_buf;
>
> Why do you use the bounce buffer if the data is aligned correct?
>
Even when the data is 4-byte aligned, on a little-endian CPU the byte swap still has to be done according to the bytes-per-word setting.
To summarize the whole dynamic-burst solution for DMA mode (a small sketch follows the list):
1. Always read/write the SPI FIFO with DMA bus width = 4, so the DMA bounce buffer is always 4-byte aligned:
swap bytes/half-words for bits_per_word = 8/16 when the CPU is little endian.
2. The BURST_LENGTH setting is important; it keeps the bit count correct (HW trimming: no extra bits are shifted out of the TXFIFO and none are shifted into the RXFIFO):
* TX: although DMA pushes 4-byte-aligned data into the FIFO, the valid data is placed in the LSBs of the first word of the bounce buffer, which ensures ECSPI only shifts out the valid data.
* RX: although DMA pulls 4-byte-aligned data from the RXFIFO into the bounce buffer, it is trimmed down to the valid LSBs according to the actual xfer->len, so rx_buf ends up with the right data.
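And a tiny sketch of the BURST_LENGTH values this scheme programs (the helper
name and sample lengths are mine; the encoding "n valid bits -> field value
n - 1" follows the RM quote above):

    #include <stdio.h>

    #define MAX_BURST_BYTES 512     /* MX51_ECSPI_CTRL_MAX_BURST */

    /* BURST_LENGTH field for a burst of len_bytes valid bytes */
    static unsigned int burst_len_field(unsigned int len_bytes)
    {
            return len_bytes * 8 - 1;
    }

    int main(void)
    {
            printf("7 bytes   -> BL = %u (0x%03x)\n",
                   burst_len_field(7), burst_len_field(7));     /* 55 */
            printf("512 bytes -> BL = %u (0x%03x)\n",
                   burst_len_field(MAX_BURST_BYTES),
                   burst_len_field(MAX_BURST_BYTES));           /* 4095 = 0xfff */
            return 0;
    }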
> > + }
> > +
> > +	memcpy(rx_buf, copy_ptr, dma_data->data_len);
> > +}
> > +
> > +static int spi_imx_dma_map(struct spi_imx_data *spi_imx,
> > +			   struct dma_data_package *dma_data)
> > +{
> > + struct spi_controller *controller = spi_imx->controller;
> > + struct device *tx_dev = controller->dma_tx->device->dev;
> > + struct device *rx_dev = controller->dma_rx->device->dev;
> > +
> > + dma_data->dma_tx_addr = dma_map_single(tx_dev,
> dma_data->dma_tx_buf,
> > +
> DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> > + DMA_TO_DEVICE);
> > + if (dma_mapping_error(tx_dev, dma_data->dma_tx_addr)) {
> > + dev_err(spi_imx->dev, "DMA TX map failed\n");
> > + return -EINVAL;
>
> please propagate the error value of dma_mapping_error()
>
Will do this in V2
> > + }
> > +
> > + dma_data->dma_rx_addr = dma_map_single(rx_dev,
> dma_data->dma_rx_buf,
> > +
> DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> > + DMA_FROM_DEVICE);
> > + if (dma_mapping_error(rx_dev, dma_data->dma_rx_addr)) {
> > + dev_err(spi_imx->dev, "DMA RX map failed\n");
> > + goto rx_map_err;
>
> there's only one user of the dma_unmap_single(), so no need for the goto.
>
This goto is there to unmap the previous TX mapping, not this RX one. TX has already been mapped when we start to map RX; if the RX mapping fails, do we really not need to
roll back the TX mapping?
> > + }
> > +
> > + return 0;
> > +
> > +rx_map_err:
> > + dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
> > + DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> > + DMA_TO_DEVICE);
> > + return -EINVAL;
> > +}
> > +
> > +static int spi_imx_dma_tx_data_handle(struct spi_imx_data *spi_imx,
> > + struct dma_data_package *dma_data,
> > + const void *tx_buf,
> > + bool word_delay)
> > +{
> > +#ifdef __LITTLE_ENDIAN
> > + unsigned int bytes_per_word =
> spi_imx_bytes_per_word(spi_imx->bits_per_word);
> > + u32 *temp;
> > +#endif
>
> move into scope of if()
>
Will do this in V2.
> > + void *copy_ptr;
> > + int unaligned;
> > +
> > + if (word_delay) {
> > + dma_data->dma_len = dma_data->data_len;
> > + } else {
> > + /*
> > + * As per the reference manual, when burst length = 32*n + m bits,
> ECSPI
> > + * sends m LSB bits in the first word, followed by n full 32-bit words.
> > + * Since actual data may not be 4-byte aligned, allocate DMA TX/RX
> buffers
> > + * to ensure alignment. For TX, DMA pushes 4-byte aligned words to
> TXFIFO,
> > + * while ECSPI uses BURST_LENGTH settings to maintain correct bit
> count.
> > + * For RX, DMA receives 32-bit words from RXFIFO; after transfer
> completes,
> > + * trim the DMA RX buffer and copy the actual data to rx_buf.
> > + */
>
> Ahh, please add the corresponding description for rx.
>
Will do this in V2.
> > + dma_data->dma_len = ALIGN(dma_data->data_len,
> BYTES_PER_32BITS_WORD);
> > + }
> > +
> > +	dma_data->dma_tx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL | __GFP_ZERO);
>
> kzalloc()?
>
Yes. Will do this in V2
> > + if (!dma_data->dma_tx_buf)
> > + return -ENOMEM;
> > +
> > + dma_data->dma_rx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL |
> __GFP_ZERO);
> > + if (!dma_data->dma_rx_buf) {
> > + kfree(dma_data->dma_tx_buf);
> > + return -ENOMEM;
> > + }
> > +
> > + if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
> > + unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
> > + copy_ptr = (u8 *)dma_data->dma_tx_buf +
> BYTES_PER_32BITS_WORD - unaligned;
> > + } else {
> > + copy_ptr = dma_data->dma_tx_buf;
> > + }
> > +
> > + memcpy(copy_ptr, tx_buf, dma_data->data_len);
> > +
> > + /*
> > + * When word_delay is enabled, DMA transfers an entire word in one
> minor loop.
> > + * In this case, no data requires additional handling.
> > + */
> > + if (word_delay)
> > + return 0;
> > +
> > +#ifdef __LITTLE_ENDIAN
> > + /*
> > + * On little-endian CPUs, adjust byte order:
> > + * - Swap bytes when bpw = 8
> > + * - Swap half-words when bpw = 16
> > + * This ensures correct data ordering for DMA transfers.
> > + */
> > + temp = dma_data->dma_tx_buf;
> > + for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len,
> BYTES_PER_32BITS_WORD); i++) {
> > + if (bytes_per_word == 1)
> > + swab32s(temp + i);
> > + else if (bytes_per_word == 2)
> > + swahw32s(temp + i);
> > + }
> > +#endif
> > +
> > + return 0;
> > +}
> > +
> > +static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
> > + struct spi_transfer *transfer,
> > + bool word_delay)
> > +{
> > + u32 pre_bl, tail_bl;
> > + u32 ctrl;
> > + int ret;
> > +
> > + /*
> > + * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds
> 512
> > + * and is not a multiple of 512, a tail transfer is required. In this case,
> > + * an extra DMA request is issued for the remaining data.
>
> Why do you need an extra transfer in this case?
>
BURST_LENGTH is what the SPI HW uses to maintain the correct bit count, so BURST_LENGTH must be
updated along with the data length. Once a DMA request has been submitted, SPI can no longer update
BURST_LENGTH, so when an update is needed we must split the transfer into two packages, update the
register, and then set up the second DMA transfer.
The ECSPI HW can reload BURST_LENGTH automatically, but it always reloads the previous value.
When len > 512 and len is not a multiple of 512, we need two packages, the second one for the tail data.
For example, with len = 512 * 3 + 511: the first transfer uses BURST_LENGTH = 512 bytes (auto-reloaded
3 times) with DMA transfer len = 512 * 3, and the second package uses BURST_LENGTH = 511 bytes with
DMA transfer len = 511. (If we used 512 bytes as BURST_LENGTH here, SPI would shift extra bits out/in,
which is not acceptable.)
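A quick sketch of the package split for that example (the struct and names are
mine for illustration; the arithmetic mirrors spi_imx_dma_data_prepare() in
this patch):

    #include <stdio.h>

    #define MAX_BURST 512   /* MX51_ECSPI_CTRL_MAX_BURST, in bytes */

    struct pkg { unsigned int data_len; unsigned int bl_field; };

    int main(void)
    {
            unsigned int len = 512 * 3 + 511;               /* 2047 bytes */
            struct pkg p[2];

            p[0].data_len = (len / MAX_BURST) * MAX_BURST;  /* 1536, BL auto-reloaded */
            p[0].bl_field = MAX_BURST * 8 - 1;              /* 4095 */
            p[1].data_len = len % MAX_BURST;                /* 511-byte tail */
            p[1].bl_field = p[1].data_len * 8 - 1;          /* 4087 */

            for (int i = 0; i < 2; i++)
                    printf("pkg%d: %u bytes, BL = %u\n",
                           i, p[i].data_len, p[i].bl_field);
            return 0;
    }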
Carlos Song
> > + */
> > + ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
> > + if (word_delay) {
> > + /*
> > + * When SPI IMX need to support word delay, according to "Sample
> Period Control
> > + * Register" shows, The Sample Period Control Register
> (ECSPI_PERIODREG)
> > + * provides software a way to insert delays (wait states) between
> consecutive
> > + * SPI transfers. As a result, ECSPI can only transfer one word per
> frame, and
> > + * the delay occurs between frames.
> > + */
> > + spi_imx->dma_package_num = 1;
> > + pre_bl = spi_imx->bits_per_word - 1;
> > + } else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
> > + spi_imx->dma_package_num = 1;
> > + pre_bl = transfer->len * BITS_PER_BYTE - 1;
> > + } else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
> > + spi_imx->dma_package_num = 1;
> > + pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> > + } else {
> > + spi_imx->dma_package_num = 2;
> > + pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> > + tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) *
> BITS_PER_BYTE - 1;
> > + }
> > +
> > + spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
> > + sizeof(struct dma_data_package),
> > + GFP_KERNEL | __GFP_ZERO);
> > + if (!spi_imx->dma_data) {
> > + dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
> > + return -ENOMEM;
> > + }
> > +
> > + if (spi_imx->dma_package_num == 1) {
> > + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> > + ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> > + spi_imx->dma_data[0].cmd_word = ctrl;
> > + spi_imx->dma_data[0].data_len = transfer->len;
> > + ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0],
> transfer->tx_buf,
> > + word_delay);
> > + if (ret) {
> > + kfree(spi_imx->dma_data);
> > + return ret;
> > + }
> > + } else {
> > + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> > + ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> > + spi_imx->dma_data[0].cmd_word = ctrl;
> > + spi_imx->dma_data[0].data_len =
> DIV_ROUND_DOWN_ULL(transfer->len,
> > + MX51_ECSPI_CTRL_MAX_BURST)
> > + * MX51_ECSPI_CTRL_MAX_BURST;
> > + ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0],
> transfer->tx_buf,
> > + false);
> > + if (ret) {
> > + kfree(spi_imx->dma_data);
> > + return ret;
> > + }
> > +
> > + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> > + ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> > + spi_imx->dma_data[1].cmd_word = ctrl;
> > + spi_imx->dma_data[1].data_len = transfer->len %
> MX51_ECSPI_CTRL_MAX_BURST;
> > + ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
> > + transfer->tx_buf +
> spi_imx->dma_data[0].data_len,
> > + false);
> > + if (ret) {
> > + kfree(spi_imx->dma_data[0].dma_tx_buf);
> > + kfree(spi_imx->dma_data[0].dma_rx_buf);
> > + kfree(spi_imx->dma_data);
> > + }
> > + }
> > +
> > + return 0;
> > +}
> > +
> > static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
> > + struct dma_data_package *dma_data,
> >  			      struct spi_transfer *transfer)
> > {
> > - struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
> > struct spi_controller *controller = spi_imx->controller;
> > struct dma_async_tx_descriptor *desc_tx, *desc_rx;
> > unsigned long transfer_timeout;
> > @@ -1451,9 +1716,9 @@ static int spi_imx_dma_submit(struct spi_imx_data
> *spi_imx,
> > * The TX DMA setup starts the transfer, so make sure RX is configured
> > * before TX.
> > */
> > - desc_rx = dmaengine_prep_slave_sg(controller->dma_rx,
> > - rx->sgl, rx->nents, DMA_DEV_TO_MEM,
> > - DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> > + desc_rx = dmaengine_prep_slave_single(controller->dma_rx,
> dma_data->dma_rx_addr,
> > + dma_data->dma_len, DMA_DEV_TO_MEM,
> > + DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> > if (!desc_rx) {
> > transfer->error |= SPI_TRANS_FAIL_NO_START;
> > return -EINVAL;
> > @@ -1471,9 +1736,9 @@ static int spi_imx_dma_submit(struct spi_imx_data
> *spi_imx,
> > reinit_completion(&spi_imx->dma_rx_completion);
> > dma_async_issue_pending(controller->dma_rx);
> >
> > - desc_tx = dmaengine_prep_slave_sg(controller->dma_tx,
> > - tx->sgl, tx->nents, DMA_MEM_TO_DEV,
> > - DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> > + desc_tx = dmaengine_prep_slave_single(controller->dma_tx,
> dma_data->dma_tx_addr,
> > + dma_data->dma_len, DMA_MEM_TO_DEV,
> > + DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> > if (!desc_tx)
> > goto dmaengine_terminate_rx;
> >
> > @@ -1521,16 +1786,16 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
> >  }
> >
> > static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
> > - struct spi_transfer *transfer)
> > + struct dma_data_package *dma_data,
> > + bool word_delay)
> > {
> > - struct sg_table *rx = &transfer->rx_sg;
> > - struct scatterlist *last_sg = sg_last(rx->sgl, rx->nents);
> > - unsigned int bytes_per_word, i;
> > + unsigned int bytes_per_word = word_delay ?
> > + spi_imx_bytes_per_word(spi_imx->bits_per_word) :
> > + BYTES_PER_32BITS_WORD;
> > + unsigned int i;
> >
> > - /* Get the right burst length from the last sg to ensure no tail data */
> > - bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word);
> > for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) {
> > - if (!(sg_dma_len(last_sg) % (i * bytes_per_word)))
> > + if (!dma_data->dma_len % (i * bytes_per_word))
> > break;
> > }
> >  	/* Use 1 as wml in case no available burst length got */
> > @@ -1540,25 +1805,29 @@ static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
> > spi_imx->wml = i;
> > }
> >
> > -static int spi_imx_dma_configure(struct spi_controller *controller)
> > +static int spi_imx_dma_configure(struct spi_controller *controller, bool word_delay)
> > {
> > int ret;
> > enum dma_slave_buswidth buswidth;
> > struct dma_slave_config rx = {}, tx = {};
> > struct spi_imx_data *spi_imx =
> > spi_controller_get_devdata(controller);
> >
> > - switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
> > - case 4:
> > + if (word_delay) {
> > + switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
> > + case 4:
> > + buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
> > + break;
> > + case 2:
> > + buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
> > + break;
> > + case 1:
> > + buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
> > + break;
> > + default:
> > + return -EINVAL;
> > + }
> > + } else {
> > buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
> > - break;
> > - case 2:
> > - buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
> > - break;
> > - case 1:
> > - buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
> > - break;
> > - default:
> > - return -EINVAL;
> > }
> >
> > tx.direction = DMA_MEM_TO_DEV;
> > @@ -1584,15 +1853,17 @@ static int spi_imx_dma_configure(struct
> spi_controller *controller)
> > return 0;
> > }
> >
> > -static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> > - struct spi_transfer *transfer)
> > +static int spi_imx_dma_package_transfer(struct spi_imx_data *spi_imx,
> > + struct dma_data_package *dma_data,
> > + struct spi_transfer *transfer,
> > + bool word_delay)
> > {
> > struct spi_controller *controller = spi_imx->controller;
> > int ret;
> >
> > - spi_imx_dma_max_wml_find(spi_imx, transfer);
> > + spi_imx_dma_max_wml_find(spi_imx, dma_data, word_delay);
> >
> > - ret = spi_imx_dma_configure(controller);
> > + ret = spi_imx_dma_configure(controller, word_delay);
> > if (ret)
> > goto dma_failure_no_start;
> >
> > @@ -1603,10 +1874,17 @@ static int spi_imx_dma_transfer(struct
> spi_imx_data *spi_imx,
> > }
> > spi_imx->devtype_data->setup_wml(spi_imx);
> >
> > - ret = spi_imx_dma_submit(spi_imx, transfer);
> > + ret = spi_imx_dma_submit(spi_imx, dma_data, transfer);
> > if (ret)
> > return ret;
> >
> > + /* Trim the DMA RX buffer and copy the actual data to rx_buf */
> > + dma_sync_single_for_cpu(controller->dma_rx->device->dev,
> dma_data->dma_rx_addr,
> > + dma_data->dma_len, DMA_FROM_DEVICE);
> > + spi_imx_dma_rx_data_handle(spi_imx, dma_data, transfer->rx_buf +
> spi_imx->rx_offset,
> > + word_delay);
> > + spi_imx->rx_offset += dma_data->data_len;
> > +
> > return 0;
> > /* fallback to pio */
> > dma_failure_no_start:
> > @@ -1614,6 +1892,60 @@ static int spi_imx_dma_transfer(struct
> spi_imx_data *spi_imx,
> > return ret;
> > }
> >
> > +static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> > + struct spi_transfer *transfer)
> > +{
> > + bool word_delay = transfer->word_delay.value != 0;
> > + int ret;
> > + int i;
> > +
> > + ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
> > + if (ret < 0) {
> > + transfer->error |= SPI_TRANS_FAIL_NO_START;
> > + dev_err(spi_imx->dev, "DMA data prepare fail\n");
> > + goto fallback_pio;
> > + }
> > +
> > + spi_imx->rx_offset = 0;
> > +
> > + /* Each dma_package performs a separate DMA transfer once */
> > + for (i = 0; i < spi_imx->dma_package_num; i++) {
> > + ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
> > + if (ret < 0) {
> > + transfer->error |= SPI_TRANS_FAIL_NO_START;
> > + dev_err(spi_imx->dev, "DMA map fail\n");
> > + break;
> > + }
> > +
> > + /* Update the CTRL register BL field */
> > +		writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
> > +
> > + ret = spi_imx_dma_package_transfer(spi_imx,
> &spi_imx->dma_data[i],
> > + transfer, word_delay);
> > +
> > + /* Whether the dma transmission is successful or not, dma unmap is
> necessary */
> > + spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
> > +
> > + if (ret < 0) {
> > + dev_dbg(spi_imx->dev, "DMA %d transfer not really finish\n", i);
> > + break;
> > + }
> > + }
> > +
> > + for (int j = 0; j < spi_imx->dma_package_num; j++) {
> > + kfree(spi_imx->dma_data[j].dma_tx_buf);
> > + kfree(spi_imx->dma_data[j].dma_rx_buf);
> > + }
> > + kfree(spi_imx->dma_data);
> > +
> > +fallback_pio:
> > +	/* If no DMA package data was transferred at all, fall back to PIO mode */
> > +	if ((transfer->error & SPI_TRANS_FAIL_NO_START) && i != 0)
> > +		transfer->error &= ~SPI_TRANS_FAIL_NO_START;
> > +
> > + return ret;
> > +}
> > +
> > static int spi_imx_pio_transfer(struct spi_device *spi,
> > struct spi_transfer *transfer)
> > {
> > @@ -1780,9 +2112,14 @@ static int spi_imx_transfer_one(struct
> spi_controller *controller,
> > * transfer, the SPI transfer has already been mapped, so we
> > * have to do the DMA transfer here.
> > */
> > - if (spi_imx->usedma)
> > - return spi_imx_dma_transfer(spi_imx, transfer);
> > -
> > + if (spi_imx->usedma) {
> > + ret = spi_imx_dma_transfer(spi_imx, transfer);
> > + if (transfer->error & SPI_TRANS_FAIL_NO_START) {
> > + spi_imx->usedma = false;
> > + return spi_imx_pio_transfer(spi, transfer);
> > + }
> > + return ret;
> > + }
> > /* run in polling mode for short transfers */
> > if (transfer->len == 1 || (polling_limit_us &&
> > spi_imx_transfer_estimate_time_us(transfer) <
> > polling_limit_us))
> > --
> > 2.34.1
> >
> >
> >
>
> regards,
> Marc
>
> --
> Pengutronix e.K. | Marc Kleine-Budde |
> Embedded Linux | https://www.pengutronix.de |
> Vertretung Nürnberg | Phone: +49-5121-206917-129 |
> Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-9 |