Message-Id: <DESRDBL2FBUZ.3LXAM56K3CQV2@bootlin.com>
Date: Mon, 08 Dec 2025 11:21:00 +0100
From: Théo Lebrun <theo.lebrun@...tlin.com>
To: "Paolo Valerio" <pvalerio@...hat.com>, Théo Lebrun
 <theo.lebrun@...tlin.com>, <netdev@...r.kernel.org>
Cc: "Nicolas Ferre" <nicolas.ferre@...rochip.com>, "Claudiu Beznea"
 <claudiu.beznea@...on.dev>, "Andrew Lunn" <andrew+netdev@...n.ch>, "David
 S. Miller" <davem@...emloft.net>, "Eric Dumazet" <edumazet@...gle.com>,
 "Jakub Kicinski" <kuba@...nel.org>, "Paolo Abeni" <pabeni@...hat.com>,
 "Lorenzo Bianconi" <lorenzo@...nel.org>, Grégory Clement
 <gregory.clement@...tlin.com>, Benoît Monin
 <benoit.monin@...tlin.com>, "Thomas Petazzoni"
 <thomas.petazzoni@...tlin.com>
Subject: Re: [PATCH RFC net-next 2/6] cadence: macb/gem: handle
 multi-descriptor frame reception

Hello Paolo, Théo, netdev,

On Tue Dec 2, 2025 at 6:32 PM CET, Paolo Valerio wrote:
> On 27 Nov 2025 at 02:38:45 PM, Théo Lebrun <theo.lebrun@...tlin.com> wrote:
>
>> On Wed Nov 19, 2025 at 2:53 PM CET, Paolo Valerio wrote:
>>> Add support for receiving network frames that span multiple DMA
>>> descriptors in the Cadence MACB/GEM Ethernet driver.
>>>
>>> The patch removes the restriction that limited frame reception to
>>> a single descriptor (RX_SOF && RX_EOF), which also avoids potential
>>> contiguous multi-page allocations for large frames. It also uses
>>> page pool fragment allocation instead of full pages to save
>>> memory.
>>>
>>> Signed-off-by: Paolo Valerio <pvalerio@...hat.com>
>>> ---
>>>  drivers/net/ethernet/cadence/macb.h      |   4 +-
>>>  drivers/net/ethernet/cadence/macb_main.c | 180 ++++++++++++++---------
>>>  2 files changed, 111 insertions(+), 73 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
>>> index dcf768bd1bc1..e2f397b7a27f 100644
>>> --- a/drivers/net/ethernet/cadence/macb.h
>>> +++ b/drivers/net/ethernet/cadence/macb.h
>>> @@ -960,8 +960,7 @@ struct macb_dma_desc_ptp {
>>>  #define PPM_FRACTION	16
>>>  
>>>  /* The buf includes headroom compatible with both skb and xdpf */
>>> -#define MACB_PP_HEADROOM		XDP_PACKET_HEADROOM
>>> -#define MACB_PP_MAX_BUF_SIZE(num)	(((num) * PAGE_SIZE) - MACB_PP_HEADROOM)
>>> +#define MACB_PP_HEADROOM	XDP_PACKET_HEADROOM
>>>  
>>>  /* struct macb_tx_skb - data about an skb which is being transmitted
>>>   * @skb: skb currently being transmitted, only set for the last buffer
>>> @@ -1273,6 +1272,7 @@ struct macb_queue {
>>>  	struct napi_struct	napi_rx;
>>>  	struct queue_stats stats;
>>>  	struct page_pool	*page_pool;
>>> +	struct sk_buff		*skb;
>>>  };
>>>  
>>>  struct ethtool_rx_fs_item {
>>> diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
>>> index 985c81913ba6..be0c8e101639 100644
>>> --- a/drivers/net/ethernet/cadence/macb_main.c
>>> +++ b/drivers/net/ethernet/cadence/macb_main.c
>>> @@ -1250,21 +1250,25 @@ static int macb_tx_complete(struct macb_queue *queue, int budget)
>>>  	return packets;
>>>  }
>>>  
>>> -static void *gem_page_pool_get_buff(struct page_pool *pool,
>>> +static void *gem_page_pool_get_buff(struct macb_queue *queue,
>>>  				    dma_addr_t *dma_addr, gfp_t gfp_mask)
>>>  {
>>> +	struct macb *bp = queue->bp;
>>>  	struct page *page;
>>> +	int offset;
>>>  
>>> -	if (!pool)
>>> +	if (!queue->page_pool)
>>>  		return NULL;
>>>  
>>> -	page = page_pool_alloc_pages(pool, gfp_mask | __GFP_NOWARN);
>>> +	page = page_pool_alloc_frag(queue->page_pool, &offset,
>>> +				    bp->rx_buffer_size,
>>> +				    gfp_mask | __GFP_NOWARN);
>>>  	if (!page)
>>>  		return NULL;
>>>  
>>> -	*dma_addr = page_pool_get_dma_addr(page) + MACB_PP_HEADROOM;
>>> +	*dma_addr = page_pool_get_dma_addr(page) + MACB_PP_HEADROOM + offset;
>>>  
>>> -	return page_address(page);
>>> +	return page_address(page) + offset;
>>>  }
>>>  
>>>  static void gem_rx_refill(struct macb_queue *queue, bool napi)
>>> @@ -1286,7 +1290,7 @@ static void gem_rx_refill(struct macb_queue *queue, bool napi)
>>>  
>>>  		if (!queue->rx_buff[entry]) {
>>>  			/* allocate sk_buff for this free entry in ring */
>>> -			data = gem_page_pool_get_buff(queue->page_pool, &paddr,
>>> +			data = gem_page_pool_get_buff(queue, &paddr,
>>>  						      napi ? GFP_ATOMIC : GFP_KERNEL);
>>>  			if (unlikely(!data)) {
>>>  				netdev_err(bp->dev,
>>> @@ -1344,20 +1348,17 @@ static int gem_rx(struct macb_queue *queue, struct napi_struct *napi,
>>>  		  int budget)
>>>  {
>>>  	struct macb *bp = queue->bp;
>>> -	int			buffer_size;
>>>  	unsigned int		len;
>>>  	unsigned int		entry;
>>>  	void			*data;
>>> -	struct sk_buff		*skb;
>>>  	struct macb_dma_desc	*desc;
>>> +	int			data_len;
>>>  	int			count = 0;
>>>  
>>> -	buffer_size = DIV_ROUND_UP(bp->rx_buffer_size, PAGE_SIZE) * PAGE_SIZE;
>>> -
>>>  	while (count < budget) {
>>>  		u32 ctrl;
>>>  		dma_addr_t addr;
>>> -		bool rxused;
>>> +		bool rxused, first_frame;
>>>  
>>>  		entry = macb_rx_ring_wrap(bp, queue->rx_tail);
>>>  		desc = macb_rx_desc(queue, entry);
>>> @@ -1368,6 +1369,9 @@ static int gem_rx(struct macb_queue *queue, struct napi_struct *napi,
>>>  		rxused = (desc->addr & MACB_BIT(RX_USED)) ? true : false;
>>>  		addr = macb_get_addr(bp, desc);
>>>  
>>> +		dma_sync_single_for_cpu(&bp->pdev->dev,
>>> +					addr, bp->rx_buffer_size - bp->rx_offset,
>>> +					page_pool_get_dma_dir(queue->page_pool));
>>
>> page_pool_get_dma_dir() will always return the same value; we could
>> hardcode it.
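>>
>> i.e. something like (untested, assuming pp_params.dma_dir stays
>> DMA_FROM_DEVICE):
>>
>> 	dma_sync_single_for_cpu(&bp->pdev->dev, addr,
>> 				bp->rx_buffer_size - bp->rx_offset,
>> 				DMA_FROM_DEVICE);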
>>
>>>  		if (!rxused)
>>>  			break;
>>>  
>>> @@ -1379,13 +1383,6 @@ static int gem_rx(struct macb_queue *queue, struct napi_struct *napi,
>>>  		queue->rx_tail++;
>>>  		count++;
>>>  
>>> -		if (!(ctrl & MACB_BIT(RX_SOF) && ctrl & MACB_BIT(RX_EOF))) {
>>> -			netdev_err(bp->dev,
>>> -				   "not whole frame pointed by descriptor\n");
>>> -			bp->dev->stats.rx_dropped++;
>>> -			queue->stats.rx_dropped++;
>>> -			break;
>>> -		}
>>>  		data = queue->rx_buff[entry];
>>>  		if (unlikely(!data)) {
>>>  			netdev_err(bp->dev,
>>> @@ -1395,64 +1392,102 @@ static int gem_rx(struct macb_queue *queue, struct napi_struct *napi,
>>>  			break;
>>>  		}
>>>  
>>> -		skb = napi_build_skb(data, buffer_size);
>>> -		if (unlikely(!skb)) {
>>> -			netdev_err(bp->dev,
>>> -				   "Unable to allocate sk_buff\n");
>>> -			page_pool_put_full_page(queue->page_pool,
>>> -						virt_to_head_page(data),
>>> -						false);
>>> -			break;
>>> +		first_frame = ctrl & MACB_BIT(RX_SOF);
>>> +		len = ctrl & bp->rx_frm_len_mask;
>>> +
>>> +		if (len) {
>>> +			data_len = len;
>>> +			if (!first_frame)
>>> +				data_len -= queue->skb->len;
>>> +		} else {
>>> +			data_len = SKB_WITH_OVERHEAD(bp->rx_buffer_size) - bp->rx_offset;
>>>  		}
>>>  
>>> -		/* Properly align Ethernet header.
>>> -		 *
>>> -		 * Hardware can add dummy bytes if asked using the RBOF
>>> -		 * field inside the NCFGR register. That feature isn't
>>> -		 * available if hardware is RSC capable.
>>> -		 *
>>> -		 * We cannot fallback to doing the 2-byte shift before
>>> -		 * DMA mapping because the address field does not allow
>>> -		 * setting the low 2/3 bits.
>>> -		 * It is 3 bits if HW_DMA_CAP_PTP, else 2 bits.
>>> -		 */
>>> -		skb_reserve(skb, bp->rx_offset);
>>> -		skb_mark_for_recycle(skb);
>>> +		if (first_frame) {
>>> +			queue->skb = napi_build_skb(data, bp->rx_buffer_size);
>>> +			if (unlikely(!queue->skb)) {
>>> +				netdev_err(bp->dev,
>>> +					   "Unable to allocate sk_buff\n");
>>> +				goto free_frags;
>>> +			}
>>> +
>>> +			/* Properly align Ethernet header.
>>> +			 *
>>> +			 * Hardware can add dummy bytes if asked using the RBOF
>>> +			 * field inside the NCFGR register. That feature isn't
>>> +			 * available if hardware is RSC capable.
>>> +			 *
>>> +			 * We cannot fallback to doing the 2-byte shift before
>>> +			 * DMA mapping because the address field does not allow
>>> +			 * setting the low 2/3 bits.
>>> +			 * It is 3 bits if HW_DMA_CAP_PTP, else 2 bits.
>>> +			 */
>>> +			skb_reserve(queue->skb, bp->rx_offset);
>>> +			skb_mark_for_recycle(queue->skb);
>>> +			skb_put(queue->skb, data_len);
>>> +			queue->skb->protocol = eth_type_trans(queue->skb, bp->dev);
>>> +
>>> +			skb_checksum_none_assert(queue->skb);
>>> +			if (bp->dev->features & NETIF_F_RXCSUM &&
>>> +			    !(bp->dev->flags & IFF_PROMISC) &&
>>> +			    GEM_BFEXT(RX_CSUM, ctrl) & GEM_RX_CSUM_CHECKED_MASK)
>>> +				queue->skb->ip_summed = CHECKSUM_UNNECESSARY;
>>> +		} else {
>>> +			if (!queue->skb) {
>>> +				netdev_err(bp->dev,
>>> +					   "Received non-starting frame while expecting it\n");
>>> +				goto free_frags;
>>> +			}
>>
>> Here as well we want `queue->rx_buff[entry] = NULL;` no?
>> Maybe put it in the free_frags codepath to ensure it isn't forgotten?
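>>
>> Something like this, so that every error path clears the ring slot
>> (untested sketch):
>>
>> free_frags:
>> 	queue->rx_buff[entry] = NULL;
>> 	if (queue->skb) {
>> 		dev_kfree_skb(queue->skb);
>> 		queue->skb = NULL;
>> 	} else {
>> 		page_pool_put_full_page(queue->page_pool,
>> 					virt_to_head_page(data),
>> 					false);
>> 	}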
>>
>
> That's correct, that slipped in this RFC.
>
>>> +			struct skb_shared_info *shinfo = skb_shinfo(queue->skb);
>>> +			struct page *page = virt_to_head_page(data);
>>> +			int nr_frags = shinfo->nr_frags;
>>
>> Variable definitions in the middle of a scope? I think it is acceptable
>> when using __attribute__((cleanup)) but otherwise not so much (yet).
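>>
>> i.e. declare at the top of the else block and only assign after the
>> queue->skb check, roughly (untested):
>>
>> 		} else {
>> 			struct skb_shared_info *shinfo;
>> 			struct page *page;
>> 			int nr_frags;
>>
>> 			if (!queue->skb) {
>> 				/* netdev_err() as before */
>> 				goto free_frags;
>> 			}
>>
>> 			shinfo = skb_shinfo(queue->skb);
>> 			page = virt_to_head_page(data);
>> 			nr_frags = shinfo->nr_frags;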
>>
>
> I guess I simply forgot to move them.
> Ack for this and for the previous ones.
>
>>> +			if (nr_frags >= ARRAY_SIZE(shinfo->frags))
>>> +				goto free_frags;
>>> +
>>> +			skb_add_rx_frag(queue->skb, nr_frags, page,
>>> +					data - page_address(page) + bp->rx_offset,
>>> +					data_len, bp->rx_buffer_size);
>>> +		}
>>>  
>>>  		/* now everything is ready for receiving packet */
>>>  		queue->rx_buff[entry] = NULL;
>>> -		len = ctrl & bp->rx_frm_len_mask;
>>> -
>>> -		netdev_vdbg(bp->dev, "gem_rx %u (len %u)\n", entry, len);
>>>  
>>> -		dma_sync_single_for_cpu(&bp->pdev->dev,
>>> -					addr, len,
>>> -					page_pool_get_dma_dir(queue->page_pool));
>>> -		skb_put(skb, len);
>>> -		skb->protocol = eth_type_trans(skb, bp->dev);
>>> -		skb_checksum_none_assert(skb);
>>> -		if (bp->dev->features & NETIF_F_RXCSUM &&
>>> -		    !(bp->dev->flags & IFF_PROMISC) &&
>>> -		    GEM_BFEXT(RX_CSUM, ctrl) & GEM_RX_CSUM_CHECKED_MASK)
>>> -			skb->ip_summed = CHECKSUM_UNNECESSARY;
>>> +		netdev_vdbg(bp->dev, "%s %u (len %u)\n", __func__, entry, data_len);
>>>  
>>> -		bp->dev->stats.rx_packets++;
>>> -		queue->stats.rx_packets++;
>>> -		bp->dev->stats.rx_bytes += skb->len;
>>> -		queue->stats.rx_bytes += skb->len;
>>> +		if (ctrl & MACB_BIT(RX_EOF)) {
>>> +			bp->dev->stats.rx_packets++;
>>> +			queue->stats.rx_packets++;
>>> +			bp->dev->stats.rx_bytes += queue->skb->len;
>>> +			queue->stats.rx_bytes += queue->skb->len;
>>>  
>>> -		gem_ptp_do_rxstamp(bp, skb, desc);
>>> +			gem_ptp_do_rxstamp(bp, queue->skb, desc);
>>>  
>>>  #if defined(DEBUG) && defined(VERBOSE_DEBUG)
>>> -		netdev_vdbg(bp->dev, "received skb of length %u, csum: %08x\n",
>>> -			    skb->len, skb->csum);
>>> -		print_hex_dump(KERN_DEBUG, " mac: ", DUMP_PREFIX_ADDRESS, 16, 1,
>>> -			       skb_mac_header(skb), 16, true);
>>> -		print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_ADDRESS, 16, 1,
>>> -			       skb->data, 32, true);
>>> +			netdev_vdbg(bp->dev, "received skb of length %u, csum: %08x\n",
>>> +				    queue->skb->len, queue->skb->csum);
>>> +			print_hex_dump(KERN_DEBUG, " mac: ", DUMP_PREFIX_ADDRESS, 16, 1,
>>> +				       skb_mac_header(queue->skb), 16, true);
>>> +			print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_ADDRESS, 16, 1,
>>> +				       queue->skb->data, 32, true);
>>>  #endif
>>>  
>>> -		napi_gro_receive(napi, skb);
>>> +			napi_gro_receive(napi, queue->skb);
>>> +			queue->skb = NULL;
>>> +		}
>>> +
>>> +		continue;
>>> +
>>> +free_frags:
>>> +		if (queue->skb) {
>>> +			dev_kfree_skb(queue->skb);
>>> +			queue->skb = NULL;
>>> +		} else {
>>> +			page_pool_put_full_page(queue->page_pool,
>>> +						virt_to_head_page(data),
>>> +						false);
>>> +		}
>>>  	}
>>>  
>>>  	gem_rx_refill(queue, true);
>>> @@ -2394,7 +2429,10 @@ static void macb_init_rx_buffer_size(struct macb *bp, size_t size)
>>>  	if (!macb_is_gem(bp)) {
>>>  		bp->rx_buffer_size = MACB_RX_BUFFER_SIZE;
>>>  	} else {
>>> -		bp->rx_buffer_size = size;
>>> +		bp->rx_buffer_size = size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
>>> +			+ MACB_PP_HEADROOM;
>>> +		if (bp->rx_buffer_size > PAGE_SIZE)
>>> +			bp->rx_buffer_size = PAGE_SIZE;
>>>  
>>>  		if (bp->rx_buffer_size % RX_BUFFER_MULTIPLE) {
>>>  			netdev_dbg(bp->dev,
>>> @@ -2589,18 +2627,15 @@ static int macb_alloc_consistent(struct macb *bp)
>>>  
>>>  static void gem_create_page_pool(struct macb_queue *queue)
>>>  {
>>> -	unsigned int num_pages = DIV_ROUND_UP(queue->bp->rx_buffer_size, PAGE_SIZE);
>>> -	struct macb *bp = queue->bp;
>>>  	struct page_pool_params pp_params = {
>>> -		.order = order_base_2(num_pages),
>>> +		.order = 0,
>>>  		.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
>>>  		.pool_size = queue->bp->rx_ring_size,
>>>  		.nid = NUMA_NO_NODE,
>>>  		.dma_dir = DMA_FROM_DEVICE,
>>>  		.dev = &queue->bp->pdev->dev,
>>>  		.napi = &queue->napi_rx,
>>> -		.offset = bp->rx_offset,
>>> -		.max_len = MACB_PP_MAX_BUF_SIZE(num_pages),
>>> +		.max_len = PAGE_SIZE,
>>>  	};
>>>  	struct page_pool *pool;
>>
>> I forgot about it in [PATCH 1/6], but the error handling if
>> gem_create_page_pool() fails is odd. We set queue->page_pool to NULL
>> and keep going. Then, once opened, we'll fail to allocate any buffer
>> but still be open. Shouldn't we fail the link up operation?
>>
>> If we want to support this codepath (page pool not allocated), then we
>> should unmask Rx interrupts only if alloc succeeded. I don't know if
>> we'd want that though.
>>
>> 	queue_writel(queue, IER, bp->rx_intr_mask | ...);
>>
>
> Makes sense to fail the link up.
> Doesn't this imply moving the page pool creation and refill into
> macb_open()?
>
> I didn't look into it; I'm not sure whether this could become a
> bigger change.

So I looked into it. Indeed, buffer allocation should be done at open;
doing it at link up (an operation that cannot fail) makes no sense. It
is probably historical: on MACB, mog_alloc_rx_buffers() does all the
allocation, while on GEM it only allocates the ring buffer, not each
individual slot buffer; those are filled by ->mog_init_rings() /
gem_rx_refill().
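
For illustration, macb_open() could then create the pools and fail
cleanly, roughly along these lines (hypothetical sketch using the
int-returning gem_create_page_pool() from the diff below; error
unwinding elided):

	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
		err = gem_create_page_pool(queue, q);
		if (err)
			goto cleanup_pools; /* destroy pools created so far */
	}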

I am linking a patch that applies before your series. The rebase
conflict resolution is then pretty simple, and the
gem_create_page_pool() function should be adapted to something like:

-- >8 ------------------------------------------------------------------

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 3dcba7d672bf..55237b840289 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2865,7 +2865,8 @@ static int macb_alloc_consistent(struct macb *bp)
 	return -ENOMEM;
 }
 
-static void gem_create_page_pool(struct macb_queue *queue, int qid)
+static int gem_create_page_pool(struct macb_queue *queue, int qid)
 {
+	int err;
 	struct page_pool_params pp_params = {
 		.order = 0,
@@ -2885,7 +2886,8 @@ static void gem_create_page_pool(struct macb_queue *queue, int qid)
 	pool = page_pool_create(&pp_params);
 	if (IS_ERR(pool)) {
 		netdev_err(queue->bp->dev, "cannot create rx page pool\n");
-		pool = NULL;
+		err = PTR_ERR(pool);
+		goto clear_pool_ptr;
 	}
 
 	queue->page_pool = pool;
@@ -2904,13 +2906,15 @@ static void gem_create_page_pool(struct macb_queue *queue, int qid)
 		goto unreg_info;
 	}
 
-	return;
+	return 0;
 
 unreg_info:
 	xdp_rxq_info_unreg(&queue->xdp_q);
 destroy_pool:
 	page_pool_destroy(pool);
+clear_pool_ptr:
 	queue->page_pool = NULL;
+	return err;
 }
 
 static void macb_init_tieoff(struct macb *bp)

-- >8 ------------------------------------------------------------------

Regards,

--
Théo Lebrun, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

