Message-ID: <010e0551-58b8-4b61-8ad7-2f03ecc6baa3@microchip.com>
Date: Tue, 26 Aug 2025 17:23:49 +0200
From: Nicolas Ferre <nicolas.ferre@...rochip.com>
To: Théo Lebrun <theo.lebrun@...tlin.com>, Andrew Lunn
<andrew+netdev@...n.ch>, "David S. Miller" <davem@...emloft.net>, Eric
Dumazet <edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>, Paolo Abeni
<pabeni@...hat.com>, Rob Herring <robh@...nel.org>, Krzysztof Kozlowski
<krzk+dt@...nel.org>, Conor Dooley <conor+dt@...nel.org>, Claudiu Beznea
<claudiu.beznea@...on.dev>, Geert Uytterhoeven <geert@...ux-m68k.org>, Harini
Katakam <harini.katakam@...inx.com>, Richard Cochran
<richardcochran@...il.com>, Russell King <linux@...linux.org.uk>
CC: <netdev@...r.kernel.org>, <devicetree@...r.kernel.org>,
<linux-kernel@...r.kernel.org>, Thomas Petazzoni
<thomas.petazzoni@...tlin.com>, Tawfik Bayouk <tawfik.bayouk@...ileye.com>,
Sean Anderson <sean.anderson@...ux.dev>
Subject: Re: [PATCH net v4 4/5] net: macb: single dma_alloc_coherent() for DMA
descriptors
On 20/08/2025 at 16:55, Théo Lebrun wrote:
> Move from 2*NUM_QUEUES dma_alloc_coherent() calls for the DMA descriptor
> rings down to 2 calls overall.
>
> The issue is that all queues share the same register for configuring the
> upper 32 bits of their Tx/Rx descriptor ring addresses. Taking Tx, notice
> how TBQPH does *not* depend on the queue index:
>
> #define GEM_TBQP(hw_q) (0x0440 + ((hw_q) << 2))
> #define GEM_TBQPH(hw_q) (0x04C8)
>
> queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
> #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
> if (bp->hw_dma_cap & HW_DMA_CAP_64B)
> queue_writel(queue, TBQPH, upper_32_bits(queue->tx_ring_dma));
> #endif
>
> To maximise our chances of getting valid DMA addresses, we do a single
> dma_alloc_coherent() across queues. This improves the odds because
> alloc_pages() guarantees natural alignment. Other codepaths (IOMMU or
> dev/arch dma_map_ops) don't give high enough guarantees
> (even page-aligned isn't enough).
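
Side note: natural alignment (not just page alignment) is the property being
relied upon here. A minimal standalone sketch of the reasoning, not kernel
code and with a made-up helper name:

    #include <stdbool.h>
    #include <stdint.h>

    /* A power-of-two-sized buffer aligned to its own size sits entirely
     * inside one such slot, so every address in it shares the same upper
     * 32 bits -- which is what the single TBQPH/RBQPH register needs.
     */
    static bool same_upper_32bits(uint64_t start, uint64_t size)
    {
            return (start >> 32) == ((start + size - 1) >> 32);
    }

    /* A merely page-aligned buffer can still straddle a 4 GiB boundary:
     * same_upper_32bits(0x100000000ULL - 4096, 8192) is false, while a
     * 32 KiB buffer aligned to 32 KiB always keeps the same upper bits.
     */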
>
> Two considerations:
>
> - dma_alloc_coherent() gives us page alignment. Here we lose this
> guarantee, meaning each queue's ring won't be page-aligned anymore.
However... we must guarantee alignment depending on the controller's
bus width (32 or 64 bits)... but since the allocation is page-aligned and
each descriptor size is a multiple of 64 bits anyway, every descriptor in
each ring stays properly aligned (might be worth stating this explicitly;
see the sketch below).
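
For illustration, an untested sketch of such an explicit check, reusing the
existing macb_dma_desc_get_size() and macb_*_ring_size_per_queue() helpers
(placement in macb_alloc_consistent() is only a suggestion):

    /* Each per-queue ring offset stays descriptor-aligned because the
     * per-queue ring size is a multiple of the descriptor size.
     */
    WARN_ON(!IS_ALIGNED(macb_tx_ring_size_per_queue(bp),
                        macb_dma_desc_get_size(bp)));
    WARN_ON(!IS_ALIGNED(macb_rx_ring_size_per_queue(bp),
                        macb_dma_desc_get_size(bp)));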
>
> - This can save a small amount of memory. Fewer allocations mean
> (1) less overhead (a constant cost per allocation) and (2) fewer wasted
> bytes due to alignment constraints.
>
> Example for (2): 4 queues, default ring size (512), 64-bit DMA
> descriptors, 16K pages:
> - Before: 8 allocs of 8K, each rounded to 16K => 64K wasted.
> - After: 2 allocs of 32K => 0K wasted.
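
For completeness, a tiny userspace sketch re-deriving the waste figures
above (the 16-byte descriptor size is inferred from "8 allocs of 8K" for
512 descriptors, not taken from the driver):

    #include <stdio.h>

    int main(void)
    {
            unsigned long page = 16 * 1024, queues = 4, ring = 512, desc = 16;
            unsigned long per_queue = ring * desc;                  /* 8 KiB */
            unsigned long rounded = (per_queue + page - 1) / page * page;
            unsigned long before = 2 * queues * (rounded - per_queue);
            unsigned long combined = queues * per_queue;            /* 32 KiB */
            unsigned long after = 2 * ((combined + page - 1) / page * page
                                       - combined);

            /* Prints: before: 64K wasted, after: 0K wasted */
            printf("before: %luK wasted, after: %luK wasted\n",
                   before / 1024, after / 1024);
            return 0;
    }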
>
> Fixes: 02c958dd3446 ("net/macb: add TX multiqueue support for gem")
> Reviewed-by: Sean Anderson <sean.anderson@...ux.dev>
> Signed-off-by: Théo Lebrun <theo.lebrun@...tlin.com>
Acked-by: Nicolas Ferre <nicolas.ferre@...rochip.com>
Tested-by: Nicolas Ferre <nicolas.ferre@...rochip.com> # on sam9x75
> ---
> drivers/net/ethernet/cadence/macb_main.c | 80 ++++++++++++++++----------------
> 1 file changed, 41 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
> index d413e8bd4977187fd73f7cc48268baf933aab051..7f31f264a6d342ea01e2f61944b12c9b9a3fe66e 100644
> --- a/drivers/net/ethernet/cadence/macb_main.c
> +++ b/drivers/net/ethernet/cadence/macb_main.c
> @@ -2474,32 +2474,30 @@ static unsigned int macb_rx_ring_size_per_queue(struct macb *bp)
>
> static void macb_free_consistent(struct macb *bp)
> {
> + struct device *dev = &bp->pdev->dev;
> struct macb_queue *queue;
> unsigned int q;
> + size_t size;
>
> if (bp->rx_ring_tieoff) {
> - dma_free_coherent(&bp->pdev->dev, macb_dma_desc_get_size(bp),
> + dma_free_coherent(dev, macb_dma_desc_get_size(bp),
> bp->rx_ring_tieoff, bp->rx_ring_tieoff_dma);
> bp->rx_ring_tieoff = NULL;
> }
>
> bp->macbgem_ops.mog_free_rx_buffers(bp);
>
> + size = bp->num_queues * macb_tx_ring_size_per_queue(bp);
> + dma_free_coherent(dev, size, bp->queues[0].tx_ring, bp->queues[0].tx_ring_dma);
> +
> + size = bp->num_queues * macb_rx_ring_size_per_queue(bp);
> + dma_free_coherent(dev, size, bp->queues[0].rx_ring, bp->queues[0].rx_ring_dma);
> +
> for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
> kfree(queue->tx_skb);
> queue->tx_skb = NULL;
> - if (queue->tx_ring) {
> - dma_free_coherent(&bp->pdev->dev,
> - macb_tx_ring_size_per_queue(bp),
> - queue->tx_ring, queue->tx_ring_dma);
> - queue->tx_ring = NULL;
> - }
> - if (queue->rx_ring) {
> - dma_free_coherent(&bp->pdev->dev,
> - macb_rx_ring_size_per_queue(bp),
> - queue->rx_ring, queue->rx_ring_dma);
> - queue->rx_ring = NULL;
> - }
> + queue->tx_ring = NULL;
> + queue->rx_ring = NULL;
> }
> }
>
> @@ -2541,41 +2539,45 @@ static int macb_alloc_rx_buffers(struct macb *bp)
>
> static int macb_alloc_consistent(struct macb *bp)
> {
> + struct device *dev = &bp->pdev->dev;
> + dma_addr_t tx_dma, rx_dma;
> struct macb_queue *queue;
> unsigned int q;
> - u32 upper;
> - int size;
> + void *tx, *rx;
> + size_t size;
> +
> + /*
> + * The upper 32 bits of each queue's Tx/Rx DMA descriptor ring address
> + * must match! We cannot enforce this guarantee; the best we can do is
> + * a single allocation and hope it lands in alloc_pages(), which
> + * guarantees natural alignment of physical addresses.
> + */
> +
> + size = bp->num_queues * macb_tx_ring_size_per_queue(bp);
> + tx = dma_alloc_coherent(dev, size, &tx_dma, GFP_KERNEL);
> + if (!tx || upper_32_bits(tx_dma) != upper_32_bits(tx_dma + size - 1))
> + goto out_err;
> + netdev_dbg(bp->dev, "Allocated %zu bytes for %u TX rings at %08lx (mapped %p)\n",
> + size, bp->num_queues, (unsigned long)tx_dma, tx);
> +
> + size = bp->num_queues * macb_rx_ring_size_per_queue(bp);
> + rx = dma_alloc_coherent(dev, size, &rx_dma, GFP_KERNEL);
> + if (!rx || upper_32_bits(rx_dma) != upper_32_bits(rx_dma + size - 1))
> + goto out_err;
> + netdev_dbg(bp->dev, "Allocated %zu bytes for %u RX rings at %08lx (mapped %p)\n",
> + size, bp->num_queues, (unsigned long)rx_dma, rx);
>
> for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
> - size = macb_tx_ring_size_per_queue(bp);
> - queue->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
> - &queue->tx_ring_dma,
> - GFP_KERNEL);
> - upper = upper_32_bits(queue->tx_ring_dma);
> - if (!queue->tx_ring ||
> - upper != upper_32_bits(bp->queues[0].tx_ring_dma))
> - goto out_err;
> - netdev_dbg(bp->dev,
> - "Allocated TX ring for queue %u of %d bytes at %08lx (mapped %p)\n",
> - q, size, (unsigned long)queue->tx_ring_dma,
> - queue->tx_ring);
> + queue->tx_ring = tx + macb_tx_ring_size_per_queue(bp) * q;
> + queue->tx_ring_dma = tx_dma + macb_tx_ring_size_per_queue(bp) * q;
> +
> + queue->rx_ring = rx + macb_rx_ring_size_per_queue(bp) * q;
> + queue->rx_ring_dma = rx_dma + macb_rx_ring_size_per_queue(bp) * q;
>
> size = bp->tx_ring_size * sizeof(struct macb_tx_skb);
> queue->tx_skb = kmalloc(size, GFP_KERNEL);
> if (!queue->tx_skb)
> goto out_err;
> -
> - size = macb_rx_ring_size_per_queue(bp);
> - queue->rx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
> - &queue->rx_ring_dma,
> - GFP_KERNEL);
> - upper = upper_32_bits(queue->rx_ring_dma);
> - if (!queue->rx_ring ||
> - upper != upper_32_bits(bp->queues[0].rx_ring_dma))
> - goto out_err;
> - netdev_dbg(bp->dev,
> - "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
> - size, (unsigned long)queue->rx_ring_dma, queue->rx_ring);
> }
> if (bp->macbgem_ops.mog_alloc_rx_buffers(bp))
> goto out_err;
>
> --
> 2.50.1
>