Message-ID: <CANXQDta2Xn33j+Da_K=7LUHKj-SgMNdKT2wx5W1Z5nFw2uKDhw@mail.gmail.com>
Date: Tue, 13 Jan 2026 02:46:28 +0530
From: Bhargava Chenna Marreddy <bhargava.marreddy@...adcom.com>
To: Paolo Abeni <pabeni@...hat.com>
Cc: davem@...emloft.net, edumazet@...gle.com, kuba@...nel.org,
andrew+netdev@...n.ch, horms@...nel.org, netdev@...r.kernel.org,
linux-kernel@...r.kernel.org, michael.chan@...adcom.com,
pavan.chebbi@...adcom.com, vsrama-krishna.nemani@...adcom.com,
vikas.gupta@...adcom.com,
Rajashekar Hudumula <rajashekar.hudumula@...adcom.com>
Subject: Re: [v4, net-next 4/7] bng_en: Add TX support
On Thu, Jan 8, 2026 at 3:39 PM Paolo Abeni <pabeni@...hat.com> wrote:
>
> On 1/5/26 8:21 AM, Bhargava Marreddy wrote:
> > +static void __bnge_tx_int(struct bnge_net *bn, struct bnge_tx_ring_info *txr,
> > + int budget)
> > +{
> > + u16 hw_cons = txr->tx_hw_cons;
> > + struct bnge_dev *bd = bn->bd;
> > + unsigned int tx_bytes = 0;
> > + unsigned int tx_pkts = 0;
> > + struct netdev_queue *txq;
> > + u16 cons = txr->tx_cons;
> > + skb_frag_t *frag;
> > +
> > + txq = netdev_get_tx_queue(bn->netdev, txr->txq_index);
> > +
> > + while (RING_TX(bn, cons) != hw_cons) {
> > + struct bnge_sw_tx_bd *tx_buf;
> > + struct sk_buff *skb;
> > + int j, last;
> > +
> > + tx_buf = &txr->tx_buf_ring[RING_TX(bn, cons)];
> > + skb = tx_buf->skb;
> > + if (unlikely(!skb)) {
> > + bnge_sched_reset_txr(bn, txr, cons);
> > + return;
> > + }
> > +
> > + cons = NEXT_TX(cons);
> > + tx_pkts++;
> > + tx_bytes += skb->len;
> > + tx_buf->skb = NULL;
> > +
> > + dma_unmap_single(bd->dev, dma_unmap_addr(tx_buf, mapping),
> > + skb_headlen(skb), DMA_TO_DEVICE);
> > + last = tx_buf->nr_frags;
> > +
> > + for (j = 0; j < last; j++) {
> > + frag = &skb_shinfo(skb)->frags[j];
> > + cons = NEXT_TX(cons);
> > + tx_buf = &txr->tx_buf_ring[RING_TX(bn, cons)];
> > + netmem_dma_unmap_page_attrs(bd->dev,
> > + dma_unmap_addr(tx_buf,
> > + mapping),
> > + skb_frag_size(frag),
> > + DMA_TO_DEVICE, 0);
> > + }
>
> There is a similar chunk in bnge_free_tx_skbs(); you could avoid
> duplication by factoring that out into a common helper.
Agreed. I'll address this in the next revision.
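For the next revision I'm thinking of a common helper along these lines
(the name bnge_unmap_tx_skb() is just a placeholder), which both
__bnge_tx_int() and bnge_free_tx_skbs() could call:

static u16 bnge_unmap_tx_skb(struct bnge_net *bn,
			     struct bnge_tx_ring_info *txr,
			     struct bnge_sw_tx_bd *tx_buf,
			     struct sk_buff *skb, u16 cons)
{
	struct bnge_dev *bd = bn->bd;
	int j, last;

	/* Unmap the linear part described by the first BD. */
	dma_unmap_single(bd->dev, dma_unmap_addr(tx_buf, mapping),
			 skb_headlen(skb), DMA_TO_DEVICE);

	/* Walk the frag BDs and unmap each fragment. */
	last = tx_buf->nr_frags;
	for (j = 0; j < last; j++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[j];

		cons = NEXT_TX(cons);
		tx_buf = &txr->tx_buf_ring[RING_TX(bn, cons)];
		netmem_dma_unmap_page_attrs(bd->dev,
					    dma_unmap_addr(tx_buf, mapping),
					    skb_frag_size(frag),
					    DMA_TO_DEVICE, 0);
	}

	/* Consumer index of the last frag BD of this packet. */
	return cons;
}

__bnge_tx_int() would then call cons = bnge_unmap_tx_skb(bn, txr, tx_buf,
skb, cons); in place of the chunk above, and bnge_free_tx_skbs() would use
the same helper. The exact parameters may still shift once I line it up
with bnge_free_tx_skbs().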
>
> > +
> > + cons = NEXT_TX(cons);
> > +
> > + napi_consume_skb(skb, budget);
> > + }
> > +
> > + WRITE_ONCE(txr->tx_cons, cons);
> > +
> > + __netif_txq_completed_wake(txq, tx_pkts, tx_bytes,
> > + bnge_tx_avail(bn, txr), bn->tx_wake_thresh,
> > + (READ_ONCE(txr->dev_state) ==
> > + BNGE_DEV_STATE_CLOSING));
> > +}
> > +
> > +static void bnge_tx_int(struct bnge_net *bn, struct bnge_napi *bnapi,
> > + int budget)
> > +{
> > + struct bnge_tx_ring_info *txr;
> > + int i;
> > +
> > + bnge_for_each_napi_tx(i, bnapi, txr) {
> > + if (txr->tx_hw_cons != RING_TX(bn, txr->tx_cons))
> > + __bnge_tx_int(bn, txr, budget);
>
> The above looks strange to me: there are multiple TX rings, but they are
> all served by the same IRQ?!?
The design allows multiple TX rings per NAPI instance, so the loop is
required to service all rings associated with that IRQ vector.
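For reference, the dispatch back to the right ring happens via the opaque
field of the completion (see __bnge_poll_work() further down), roughly:

	u32 opaque = txcmp->tx_cmp_opaque;

	/* One completion ring (one IRQ/NAPI vector) carries completions
	 * for several TX rings; the opaque field identifies the TX ring.
	 */
	txr = bnapi->tx_ring[TX_OPAQUE_RING(opaque)];
	txr->tx_hw_cons = TX_OPAQUE_PROD(bn, opaque);

and bnge_tx_int() then walks each ring whose tx_hw_cons has advanced.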
>
> > + }
> > +
> > + bnapi->events &= ~BNGE_TX_CMP_EVENT;
> > +}
> > +
> > static void __bnge_poll_work_done(struct bnge_net *bn, struct bnge_napi *bnapi,
> > int budget)
> > {
> > struct bnge_rx_ring_info *rxr = bnapi->rx_ring;
> >
> > + if ((bnapi->events & BNGE_TX_CMP_EVENT) && !bnapi->tx_fault)
> > + bnge_tx_int(bn, bnapi, budget);
> > +
> > if ((bnapi->events & BNGE_RX_EVENT)) {
> > bnge_db_write(bn->bd, &rxr->rx_db, rxr->rx_prod);
> > bnapi->events &= ~BNGE_RX_EVENT;
> > @@ -456,9 +548,26 @@ static int __bnge_poll_work(struct bnge_net *bn, struct bnge_cp_ring_info *cpr,
> > cmp_type = TX_CMP_TYPE(txcmp);
> > if (cmp_type == CMP_TYPE_TX_L2_CMP ||
> > cmp_type == CMP_TYPE_TX_L2_COAL_CMP) {
> > - /*
> > - * Tx Compl Processng
> > - */
> > + u32 opaque = txcmp->tx_cmp_opaque;
> > + struct bnge_tx_ring_info *txr;
> > + u16 tx_freed;
> > +
> > + txr = bnapi->tx_ring[TX_OPAQUE_RING(opaque)];
> > + event |= BNGE_TX_CMP_EVENT;
> > + if (cmp_type == CMP_TYPE_TX_L2_COAL_CMP)
> > + txr->tx_hw_cons = TX_CMP_SQ_CONS_IDX(txcmp);
> > + else
> > + txr->tx_hw_cons = TX_OPAQUE_PROD(bn, opaque);
> > + tx_freed = ((txr->tx_hw_cons - txr->tx_cons) &
> > + bn->tx_ring_mask);
> > + /* return full budget so NAPI will complete. */
> > + if (unlikely(tx_freed >= bn->tx_wake_thresh)) {
> > + rx_pkts = budget;
> > + raw_cons = NEXT_RAW_CMP(raw_cons);
> > + if (budget)
> > + cpr->has_more_work = 1;
> > + break;
> > + }
> > } else if (cmp_type >= CMP_TYPE_RX_L2_CMP &&
> > cmp_type <= CMP_TYPE_RX_L2_TPA_START_V3_CMP) {
> > if (likely(budget))
> > @@ -613,3 +722,277 @@ int bnge_napi_poll(struct napi_struct *napi, int budget)
> > poll_done:
> > return work_done;
> > }
> > +
> > +static u16 bnge_xmit_get_cfa_action(struct sk_buff *skb)
> > +{
> > + struct metadata_dst *md_dst = skb_metadata_dst(skb);
> > +
> > + if (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)
> > + return 0;
> > +
> > + return md_dst->u.port_info.port_id;
> > +}
> > +
> > +static const u16 bnge_lhint_arr[] = {
> > + TX_BD_FLAGS_LHINT_512_AND_SMALLER,
> > + TX_BD_FLAGS_LHINT_512_TO_1023,
> > + TX_BD_FLAGS_LHINT_1024_TO_2047,
> > + TX_BD_FLAGS_LHINT_1024_TO_2047,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > +};
> > +
> > +static void bnge_txr_db_kick(struct bnge_net *bn, struct bnge_tx_ring_info *txr,
> > + u16 prod)
> > +{
> > + /* Sync BD data before updating doorbell */
> > + wmb();
> > + bnge_db_write(bn->bd, &txr->tx_db, prod);
> > + txr->kick_pending = 0;
> > +}
> > +
> > +netdev_tx_t bnge_start_xmit(struct sk_buff *skb, struct net_device *dev)
> > +{
> > + u32 len, free_size, vlan_tag_flags, cfa_action, flags;
> > + struct bnge_net *bn = netdev_priv(dev);
> > + struct bnge_tx_ring_info *txr;
> > + struct bnge_dev *bd = bn->bd;
> > + unsigned int length, pad = 0;
> > + struct bnge_sw_tx_bd *tx_buf;
> > + struct tx_bd *txbd, *txbd0;
> > + struct netdev_queue *txq;
> > + struct tx_bd_ext *txbd1;
> > + u16 prod, last_frag;
> > + dma_addr_t mapping;
> > + __le32 lflags = 0;
> > + skb_frag_t *frag;
> > + int i;
> > +
> > + i = skb_get_queue_mapping(skb);
> > + if (unlikely(i >= bd->tx_nr_rings)) {
>
> Under which conditions the above statement can be true?
This is a defensive check to prevent out-of-bounds access during rare races,
such as an ethtool -L reconfiguration where an in-flight skb might carry a
stale queue mapping. Does this approach seem reasonable, or should I remove
the check and rely on the stack's quiescence during reconfiguration?
>
> > + dev_kfree_skb_any(skb);
> > + dev_core_stats_tx_dropped_inc(dev);
> > + return NETDEV_TX_OK;
> > + }
> > +
> > + txq = netdev_get_tx_queue(dev, i);
> > + txr = &bn->tx_ring[bn->tx_ring_map[i]];
> > + prod = txr->tx_prod;
> > +
> > +#if (MAX_SKB_FRAGS > TX_MAX_FRAGS)
> > + if (skb_shinfo(skb)->nr_frags > TX_MAX_FRAGS) {
>
> You should probably implement ndo_features_check() and ensure the above
> condition will never happen.
Thanks, I'll address this in the next revision.
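Something along these lines, perhaps (bnge_features_check() is just a
working name; the idea is to clear SG/GSO for an skb with too many
fragments so the core linearizes or software-segments it before it ever
reaches bnge_start_xmit()):

static netdev_features_t bnge_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	/* More fragments than one BD chain can describe: drop SG/GSO
	 * for this skb and let the core linearize/segment it.
	 */
	if (skb_shinfo(skb)->nr_frags > TX_MAX_FRAGS)
		features &= ~(NETIF_F_SG | NETIF_F_GSO_MASK);

	return features;
}

wired up through .ndo_features_check in the driver's net_device_ops. That
should let me drop the skb_linearize() fallback here.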
>
> > + netdev_warn_once(dev, "SKB has too many (%d) fragments, max supported is %d. SKB will be linearized.\n",
> > + skb_shinfo(skb)->nr_frags, TX_MAX_FRAGS);
> > + if (skb_linearize(skb)) {
> > + dev_kfree_skb_any(skb);
> > + dev_core_stats_tx_dropped_inc(dev);
> > + return NETDEV_TX_OK;
> > + }
> > + }
> > +#endif
> > + free_size = bnge_tx_avail(bn, txr);
> > + if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) {
> > + /* We must have raced with NAPI cleanup */
> > + if (net_ratelimit() && txr->kick_pending)
> > + netif_warn(bn, tx_err, dev,
> > + "bnge: ring busy w/ flush pending!\n");
> > + if (!netif_txq_try_stop(txq, bnge_tx_avail(bn, txr),
> > + bn->tx_wake_thresh))
> > + return NETDEV_TX_BUSY;
> > + }
> > +
> > + if (unlikely(ipv6_hopopt_jumbo_remove(skb)))
> > + goto tx_free;
> > +
> > + length = skb->len;
> > + len = skb_headlen(skb);
> > + last_frag = skb_shinfo(skb)->nr_frags;
> > +
> > + txbd = &txr->tx_desc_ring[TX_RING(bn, prod)][TX_IDX(prod)];
> > +
> > + tx_buf = &txr->tx_buf_ring[RING_TX(bn, prod)];
>
> The naming (TX_RING() vs RING_TX() with quite different meaning) is IMHO
> prone to errors.
Thanks, I'll address this in the next revision.
>
> > + tx_buf->skb = skb;
> > + tx_buf->nr_frags = last_frag;
> > +
> > + vlan_tag_flags = 0;
> > + cfa_action = bnge_xmit_get_cfa_action(skb);
> > + if (skb_vlan_tag_present(skb)) {
> > + vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN |
> > + skb_vlan_tag_get(skb);
> > + /* Currently supports 8021Q, 8021AD vlan offloads
> > + * QINQ1, QINQ2, QINQ3 vlan headers are deprecated
> > + */
> > + if (skb->vlan_proto == htons(ETH_P_8021Q))
> > + vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT;
> > + }
> > +
> > + if (unlikely(skb->no_fcs))
> > + lflags |= cpu_to_le32(TX_BD_FLAGS_NO_CRC);
> > +
> > + if (length < BNGE_MIN_PKT_SIZE) {
> > + pad = BNGE_MIN_PKT_SIZE - length;
> > + if (skb_pad(skb, pad))
> > + /* SKB already freed. */
> > + goto tx_kick_pending;
> > + length = BNGE_MIN_PKT_SIZE;
> > + }
> > +
> > + mapping = dma_map_single(bd->dev, skb->data, len, DMA_TO_DEVICE);
> > +
> > + if (unlikely(dma_mapping_error(bd->dev, mapping)))
> > + goto tx_free;
> > +
> > + dma_unmap_addr_set(tx_buf, mapping, mapping);
> > + flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD |
> > + TX_BD_CNT(last_frag + 2);
> > +
> > + txbd->tx_bd_haddr = cpu_to_le64(mapping);
> > + txbd->tx_bd_opaque = SET_TX_OPAQUE(bn, txr, prod, 2 + last_frag);
> > +
> > + prod = NEXT_TX(prod);
> > + txbd1 = (struct tx_bd_ext *)
> > + &txr->tx_desc_ring[TX_RING(bn, prod)][TX_IDX(prod)];
> > +
> > + txbd1->tx_bd_hsize_lflags = lflags;
> > + if (skb_is_gso(skb)) {
> > + bool udp_gso = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4);
> > + u32 hdr_len;
> > +
> > + if (skb->encapsulation) {
> > + if (udp_gso)
> > + hdr_len = skb_inner_transport_offset(skb) +
> > + sizeof(struct udphdr);
> > + else
> > + hdr_len = skb_inner_tcp_all_headers(skb);
> > + } else if (udp_gso) {
> > + hdr_len = skb_transport_offset(skb) +
> > + sizeof(struct udphdr);
> > + } else {
> > + hdr_len = skb_tcp_all_headers(skb);
> > + }
> > +
> > + txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_LSO |
> > + TX_BD_FLAGS_T_IPID |
> > + (hdr_len << (TX_BD_HSIZE_SHIFT - 1)));
> > + length = skb_shinfo(skb)->gso_size;
> > + txbd1->tx_bd_mss = cpu_to_le32(length);
> > + length += hdr_len;
> > + } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
> > + txbd1->tx_bd_hsize_lflags |=
> > + cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM);
> > + txbd1->tx_bd_mss = 0;
> > + }
> > +
> > + length >>= 9;
> > + if (unlikely(length >= ARRAY_SIZE(bnge_lhint_arr))) {
>
> a proper ndo_features_check() should avoid the above check.
Thanks, I'll address this in the next revision.
Thanks,
Bhargava Marreddy
>
> /P
>