Message-ID: <81fe0e2e-5f05-4258-b722-7a09e6d99182@redhat.com>
Date: Thu, 8 Jan 2026 11:09:53 +0100
From: Paolo Abeni <pabeni@...hat.com>
To: Bhargava Marreddy <bhargava.marreddy@...adcom.com>, davem@...emloft.net,
edumazet@...gle.com, kuba@...nel.org, andrew+netdev@...n.ch, horms@...nel.org
Cc: netdev@...r.kernel.org, linux-kernel@...r.kernel.org,
michael.chan@...adcom.com, pavan.chebbi@...adcom.com,
vsrama-krishna.nemani@...adcom.com, vikas.gupta@...adcom.com,
Rajashekar Hudumula <rajashekar.hudumula@...adcom.com>
Subject: Re: [v4, net-next 4/7] bng_en: Add TX support
On 1/5/26 8:21 AM, Bhargava Marreddy wrote:
> +static void __bnge_tx_int(struct bnge_net *bn, struct bnge_tx_ring_info *txr,
> + int budget)
> +{
> + u16 hw_cons = txr->tx_hw_cons;
> + struct bnge_dev *bd = bn->bd;
> + unsigned int tx_bytes = 0;
> + unsigned int tx_pkts = 0;
> + struct netdev_queue *txq;
> + u16 cons = txr->tx_cons;
> + skb_frag_t *frag;
> +
> + txq = netdev_get_tx_queue(bn->netdev, txr->txq_index);
> +
> + while (RING_TX(bn, cons) != hw_cons) {
> + struct bnge_sw_tx_bd *tx_buf;
> + struct sk_buff *skb;
> + int j, last;
> +
> + tx_buf = &txr->tx_buf_ring[RING_TX(bn, cons)];
> + skb = tx_buf->skb;
> + if (unlikely(!skb)) {
> + bnge_sched_reset_txr(bn, txr, cons);
> + return;
> + }
> +
> + cons = NEXT_TX(cons);
> + tx_pkts++;
> + tx_bytes += skb->len;
> + tx_buf->skb = NULL;
> +
> + dma_unmap_single(bd->dev, dma_unmap_addr(tx_buf, mapping),
> + skb_headlen(skb), DMA_TO_DEVICE);
> + last = tx_buf->nr_frags;
> +
> + for (j = 0; j < last; j++) {
> + frag = &skb_shinfo(skb)->frags[j];
> + cons = NEXT_TX(cons);
> + tx_buf = &txr->tx_buf_ring[RING_TX(bn, cons)];
> + netmem_dma_unmap_page_attrs(bd->dev,
> + dma_unmap_addr(tx_buf,
> + mapping),
> + skb_frag_size(frag),
> + DMA_TO_DEVICE, 0);
> + }
There is a similar chunk in bnge_free_tx_skbs(); you could avoid the
duplication by factoring it out into a common helper.
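Something along these lines (completely untested, just a sketch; the helper
name is made up, and the caller would still own the trailing NEXT_TX() and
the skb free):

static void bnge_unmap_tx_skb(struct bnge_net *bn,
			      struct bnge_tx_ring_info *txr,
			      struct bnge_sw_tx_bd *tx_buf,
			      struct sk_buff *skb, u16 *cons)
{
	struct bnge_dev *bd = bn->bd;
	int j, last = tx_buf->nr_frags;

	/* unmap the linear part described by the first BD */
	dma_unmap_single(bd->dev, dma_unmap_addr(tx_buf, mapping),
			 skb_headlen(skb), DMA_TO_DEVICE);

	/* walk the fragment BDs, advancing the consumer index */
	for (j = 0; j < last; j++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[j];

		*cons = NEXT_TX(*cons);
		tx_buf = &txr->tx_buf_ring[RING_TX(bn, *cons)];
		netmem_dma_unmap_page_attrs(bd->dev,
					    dma_unmap_addr(tx_buf, mapping),
					    skb_frag_size(frag),
					    DMA_TO_DEVICE, 0);
	}
}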
> +
> + cons = NEXT_TX(cons);
> +
> + napi_consume_skb(skb, budget);
> + }
> +
> + WRITE_ONCE(txr->tx_cons, cons);
> +
> + __netif_txq_completed_wake(txq, tx_pkts, tx_bytes,
> + bnge_tx_avail(bn, txr), bn->tx_wake_thresh,
> + (READ_ONCE(txr->dev_state) ==
> + BNGE_DEV_STATE_CLOSING));
> +}
> +
> +static void bnge_tx_int(struct bnge_net *bn, struct bnge_napi *bnapi,
> + int budget)
> +{
> + struct bnge_tx_ring_info *txr;
> + int i;
> +
> + bnge_for_each_napi_tx(i, bnapi, txr) {
> + if (txr->tx_hw_cons != RING_TX(bn, txr->tx_cons))
> + __bnge_tx_int(bn, txr, budget);
The above looks strange to me: there are multiple tx rings, but they are
all served by the same IRQ?!?
> + }
> +
> + bnapi->events &= ~BNGE_TX_CMP_EVENT;
> +}
> +
> static void __bnge_poll_work_done(struct bnge_net *bn, struct bnge_napi *bnapi,
> int budget)
> {
> struct bnge_rx_ring_info *rxr = bnapi->rx_ring;
>
> + if ((bnapi->events & BNGE_TX_CMP_EVENT) && !bnapi->tx_fault)
> + bnge_tx_int(bn, bnapi, budget);
> +
> if ((bnapi->events & BNGE_RX_EVENT)) {
> bnge_db_write(bn->bd, &rxr->rx_db, rxr->rx_prod);
> bnapi->events &= ~BNGE_RX_EVENT;
> @@ -456,9 +548,26 @@ static int __bnge_poll_work(struct bnge_net *bn, struct bnge_cp_ring_info *cpr,
> cmp_type = TX_CMP_TYPE(txcmp);
> if (cmp_type == CMP_TYPE_TX_L2_CMP ||
> cmp_type == CMP_TYPE_TX_L2_COAL_CMP) {
> - /*
> - * Tx Compl Processng
> - */
> + u32 opaque = txcmp->tx_cmp_opaque;
> + struct bnge_tx_ring_info *txr;
> + u16 tx_freed;
> +
> + txr = bnapi->tx_ring[TX_OPAQUE_RING(opaque)];
> + event |= BNGE_TX_CMP_EVENT;
> + if (cmp_type == CMP_TYPE_TX_L2_COAL_CMP)
> + txr->tx_hw_cons = TX_CMP_SQ_CONS_IDX(txcmp);
> + else
> + txr->tx_hw_cons = TX_OPAQUE_PROD(bn, opaque);
> + tx_freed = ((txr->tx_hw_cons - txr->tx_cons) &
> + bn->tx_ring_mask);
> + /* return full budget so NAPI will complete. */
> + if (unlikely(tx_freed >= bn->tx_wake_thresh)) {
> + rx_pkts = budget;
> + raw_cons = NEXT_RAW_CMP(raw_cons);
> + if (budget)
> + cpr->has_more_work = 1;
> + break;
> + }
> } else if (cmp_type >= CMP_TYPE_RX_L2_CMP &&
> cmp_type <= CMP_TYPE_RX_L2_TPA_START_V3_CMP) {
> if (likely(budget))
> @@ -613,3 +722,277 @@ int bnge_napi_poll(struct napi_struct *napi, int budget)
> poll_done:
> return work_done;
> }
> +
> +static u16 bnge_xmit_get_cfa_action(struct sk_buff *skb)
> +{
> + struct metadata_dst *md_dst = skb_metadata_dst(skb);
> +
> + if (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)
> + return 0;
> +
> + return md_dst->u.port_info.port_id;
> +}
> +
> +static const u16 bnge_lhint_arr[] = {
> + TX_BD_FLAGS_LHINT_512_AND_SMALLER,
> + TX_BD_FLAGS_LHINT_512_TO_1023,
> + TX_BD_FLAGS_LHINT_1024_TO_2047,
> + TX_BD_FLAGS_LHINT_1024_TO_2047,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> +};
> +
> +static void bnge_txr_db_kick(struct bnge_net *bn, struct bnge_tx_ring_info *txr,
> + u16 prod)
> +{
> + /* Sync BD data before updating doorbell */
> + wmb();
> + bnge_db_write(bn->bd, &txr->tx_db, prod);
> + txr->kick_pending = 0;
> +}
> +
> +netdev_tx_t bnge_start_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> + u32 len, free_size, vlan_tag_flags, cfa_action, flags;
> + struct bnge_net *bn = netdev_priv(dev);
> + struct bnge_tx_ring_info *txr;
> + struct bnge_dev *bd = bn->bd;
> + unsigned int length, pad = 0;
> + struct bnge_sw_tx_bd *tx_buf;
> + struct tx_bd *txbd, *txbd0;
> + struct netdev_queue *txq;
> + struct tx_bd_ext *txbd1;
> + u16 prod, last_frag;
> + dma_addr_t mapping;
> + __le32 lflags = 0;
> + skb_frag_t *frag;
> + int i;
> +
> + i = skb_get_queue_mapping(skb);
> + if (unlikely(i >= bd->tx_nr_rings)) {
Under which conditions can the above statement be true?
> + dev_kfree_skb_any(skb);
> + dev_core_stats_tx_dropped_inc(dev);
> + return NETDEV_TX_OK;
> + }
> +
> + txq = netdev_get_tx_queue(dev, i);
> + txr = &bn->tx_ring[bn->tx_ring_map[i]];
> + prod = txr->tx_prod;
> +
> +#if (MAX_SKB_FRAGS > TX_MAX_FRAGS)
> + if (skb_shinfo(skb)->nr_frags > TX_MAX_FRAGS) {
You should probably implement ndo_features_check() and ensure the above
condition will never happen.
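Something like the following (just a sketch, untested) hooked up via
.ndo_features_check should be enough: clearing NETIF_F_SG (and GSO) makes
the core segment and/or linearize the skb in validate_xmit_skb() before it
ever reaches ndo_start_xmit().

static netdev_features_t bnge_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	/* Too many fragments for a single BD chain: let the stack
	 * linearize/segment the skb instead of doing it in the hot path.
	 */
	if (skb_shinfo(skb)->nr_frags > TX_MAX_FRAGS)
		features &= ~(NETIF_F_SG | NETIF_F_GSO_MASK);

	return features;
}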
> + netdev_warn_once(dev, "SKB has too many (%d) fragments, max supported is %d. SKB will be linearized.\n",
> + skb_shinfo(skb)->nr_frags, TX_MAX_FRAGS);
> + if (skb_linearize(skb)) {
> + dev_kfree_skb_any(skb);
> + dev_core_stats_tx_dropped_inc(dev);
> + return NETDEV_TX_OK;
> + }
> + }
> +#endif
> + free_size = bnge_tx_avail(bn, txr);
> + if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) {
> + /* We must have raced with NAPI cleanup */
> + if (net_ratelimit() && txr->kick_pending)
> + netif_warn(bn, tx_err, dev,
> + "bnge: ring busy w/ flush pending!\n");
> + if (!netif_txq_try_stop(txq, bnge_tx_avail(bn, txr),
> + bn->tx_wake_thresh))
> + return NETDEV_TX_BUSY;
> + }
> +
> + if (unlikely(ipv6_hopopt_jumbo_remove(skb)))
> + goto tx_free;
> +
> + length = skb->len;
> + len = skb_headlen(skb);
> + last_frag = skb_shinfo(skb)->nr_frags;
> +
> + txbd = &txr->tx_desc_ring[TX_RING(bn, prod)][TX_IDX(prod)];
> +
> + tx_buf = &txr->tx_buf_ring[RING_TX(bn, prod)];
The naming (TX_RING() vs RING_TX(), with quite different meanings) is IMHO
prone to errors.
> + tx_buf->skb = skb;
> + tx_buf->nr_frags = last_frag;
> +
> + vlan_tag_flags = 0;
> + cfa_action = bnge_xmit_get_cfa_action(skb);
> + if (skb_vlan_tag_present(skb)) {
> + vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN |
> + skb_vlan_tag_get(skb);
> + /* Currently supports 8021Q, 8021AD vlan offloads
> + * QINQ1, QINQ2, QINQ3 vlan headers are deprecated
> + */
> + if (skb->vlan_proto == htons(ETH_P_8021Q))
> + vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT;
> + }
> +
> + if (unlikely(skb->no_fcs))
> + lflags |= cpu_to_le32(TX_BD_FLAGS_NO_CRC);
> +
> + if (length < BNGE_MIN_PKT_SIZE) {
> + pad = BNGE_MIN_PKT_SIZE - length;
> + if (skb_pad(skb, pad))
> + /* SKB already freed. */
> + goto tx_kick_pending;
> + length = BNGE_MIN_PKT_SIZE;
> + }
> +
> + mapping = dma_map_single(bd->dev, skb->data, len, DMA_TO_DEVICE);
> +
> + if (unlikely(dma_mapping_error(bd->dev, mapping)))
> + goto tx_free;
> +
> + dma_unmap_addr_set(tx_buf, mapping, mapping);
> + flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD |
> + TX_BD_CNT(last_frag + 2);
> +
> + txbd->tx_bd_haddr = cpu_to_le64(mapping);
> + txbd->tx_bd_opaque = SET_TX_OPAQUE(bn, txr, prod, 2 + last_frag);
> +
> + prod = NEXT_TX(prod);
> + txbd1 = (struct tx_bd_ext *)
> + &txr->tx_desc_ring[TX_RING(bn, prod)][TX_IDX(prod)];
> +
> + txbd1->tx_bd_hsize_lflags = lflags;
> + if (skb_is_gso(skb)) {
> + bool udp_gso = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4);
> + u32 hdr_len;
> +
> + if (skb->encapsulation) {
> + if (udp_gso)
> + hdr_len = skb_inner_transport_offset(skb) +
> + sizeof(struct udphdr);
> + else
> + hdr_len = skb_inner_tcp_all_headers(skb);
> + } else if (udp_gso) {
> + hdr_len = skb_transport_offset(skb) +
> + sizeof(struct udphdr);
> + } else {
> + hdr_len = skb_tcp_all_headers(skb);
> + }
> +
> + txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_LSO |
> + TX_BD_FLAGS_T_IPID |
> + (hdr_len << (TX_BD_HSIZE_SHIFT - 1)));
> + length = skb_shinfo(skb)->gso_size;
> + txbd1->tx_bd_mss = cpu_to_le32(length);
> + length += hdr_len;
> + } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
> + txbd1->tx_bd_hsize_lflags |=
> + cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM);
> + txbd1->tx_bd_mss = 0;
> + }
> +
> + length >>= 9;
> + if (unlikely(length >= ARRAY_SIZE(bnge_lhint_arr))) {
A proper ndo_features_check() should avoid the need for the above check, too.
/P