Message-ID: <CANXQDta2Xn33j+Da_K=7LUHKj-SgMNdKT2wx5W1Z5nFw2uKDhw@mail.gmail.com>
Date: Tue, 13 Jan 2026 02:46:28 +0530
From: Bhargava Chenna Marreddy <bhargava.marreddy@...adcom.com>
To: Paolo Abeni <pabeni@...hat.com>
Cc: davem@...emloft.net, edumazet@...gle.com, kuba@...nel.org,
andrew+netdev@...n.ch, horms@...nel.org, netdev@...r.kernel.org,
linux-kernel@...r.kernel.org, michael.chan@...adcom.com,
pavan.chebbi@...adcom.com, vsrama-krishna.nemani@...adcom.com,
vikas.gupta@...adcom.com,
Rajashekar Hudumula <rajashekar.hudumula@...adcom.com>
Subject: Re: [v4, net-next 4/7] bng_en: Add TX support
On Thu, Jan 8, 2026 at 3:39 PM Paolo Abeni <pabeni@...hat.com> wrote:
>
> On 1/5/26 8:21 AM, Bhargava Marreddy wrote:
> > +static void __bnge_tx_int(struct bnge_net *bn, struct bnge_tx_ring_info *txr,
> > + int budget)
> > +{
> > + u16 hw_cons = txr->tx_hw_cons;
> > + struct bnge_dev *bd = bn->bd;
> > + unsigned int tx_bytes = 0;
> > + unsigned int tx_pkts = 0;
> > + struct netdev_queue *txq;
> > + u16 cons = txr->tx_cons;
> > + skb_frag_t *frag;
> > +
> > + txq = netdev_get_tx_queue(bn->netdev, txr->txq_index);
> > +
> > + while (RING_TX(bn, cons) != hw_cons) {
> > + struct bnge_sw_tx_bd *tx_buf;
> > + struct sk_buff *skb;
> > + int j, last;
> > +
> > + tx_buf = &txr->tx_buf_ring[RING_TX(bn, cons)];
> > + skb = tx_buf->skb;
> > + if (unlikely(!skb)) {
> > + bnge_sched_reset_txr(bn, txr, cons);
> > + return;
> > + }
> > +
> > + cons = NEXT_TX(cons);
> > + tx_pkts++;
> > + tx_bytes += skb->len;
> > + tx_buf->skb = NULL;
> > +
> > + dma_unmap_single(bd->dev, dma_unmap_addr(tx_buf, mapping),
> > + skb_headlen(skb), DMA_TO_DEVICE);
> > + last = tx_buf->nr_frags;
> > +
> > + for (j = 0; j < last; j++) {
> > + frag = &skb_shinfo(skb)->frags[j];
> > + cons = NEXT_TX(cons);
> > + tx_buf = &txr->tx_buf_ring[RING_TX(bn, cons)];
> > + netmem_dma_unmap_page_attrs(bd->dev,
> > + dma_unmap_addr(tx_buf,
> > + mapping),
> > + skb_frag_size(frag),
> > + DMA_TO_DEVICE, 0);
> > + }
>
> There is a similar chunk in bnge_free_tx_skbs(); you could avoid
> duplication by factoring that out into a common helper.
Agreed. I'll address this in the next revision.
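For the next revision I'm thinking of a common helper along these lines
(the name bnge_unmap_tx_skb() is just a placeholder), which both
__bnge_tx_int() and bnge_free_tx_skbs() could call:

static u16 bnge_unmap_tx_skb(struct bnge_net *bn,
			     struct bnge_tx_ring_info *txr,
			     struct bnge_sw_tx_bd *tx_buf,
			     struct sk_buff *skb, u16 cons)
{
	struct bnge_dev *bd = bn->bd;
	int j, last;

	/* Unmap the linear part described by the first BD. */
	dma_unmap_single(bd->dev, dma_unmap_addr(tx_buf, mapping),
			 skb_headlen(skb), DMA_TO_DEVICE);

	/* Walk the frag BDs and unmap each fragment. */
	last = tx_buf->nr_frags;
	for (j = 0; j < last; j++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[j];

		cons = NEXT_TX(cons);
		tx_buf = &txr->tx_buf_ring[RING_TX(bn, cons)];
		netmem_dma_unmap_page_attrs(bd->dev,
					    dma_unmap_addr(tx_buf, mapping),
					    skb_frag_size(frag),
					    DMA_TO_DEVICE, 0);
	}

	/* Consumer index of the last frag BD of this packet. */
	return cons;
}

__bnge_tx_int() would then call cons = bnge_unmap_tx_skb(bn, txr, tx_buf,
skb, cons); in place of the chunk above, and bnge_free_tx_skbs() would use
the same helper. The exact parameters may still shift once I line it up
with bnge_free_tx_skbs().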
>
> > +
> > + cons = NEXT_TX(cons);
> > +
> > + napi_consume_skb(skb, budget);
> > + }
> > +
> > + WRITE_ONCE(txr->tx_cons, cons);
> > +
> > + __netif_txq_completed_wake(txq, tx_pkts, tx_bytes,
> > + bnge_tx_avail(bn, txr), bn->tx_wake_thresh,
> > + (READ_ONCE(txr->dev_state) ==
> > + BNGE_DEV_STATE_CLOSING));
> > +}
> > +
> > +static void bnge_tx_int(struct bnge_net *bn, struct bnge_napi *bnapi,
> > + int budget)
> > +{
> > + struct bnge_tx_ring_info *txr;
> > + int i;
> > +
> > + bnge_for_each_napi_tx(i, bnapi, txr) {
> > + if (txr->tx_hw_cons != RING_TX(bn, txr->tx_cons))
> > + __bnge_tx_int(bn, txr, budget);
>
> The above looks strange to me: there are multiple TX rings, but they are
> all served by the same IRQ?!?
The design allows multiple TX rings per NAPI instance, so the loop is
required to service all rings associated with that IRQ vector.
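For reference, the dispatch back to the right ring happens via the opaque
field of the completion (see __bnge_poll_work() further down), roughly:

	u32 opaque = txcmp->tx_cmp_opaque;

	/* One completion ring (one IRQ/NAPI vector) carries completions
	 * for several TX rings; the opaque field identifies the TX ring.
	 */
	txr = bnapi->tx_ring[TX_OPAQUE_RING(opaque)];
	txr->tx_hw_cons = TX_OPAQUE_PROD(bn, opaque);

and bnge_tx_int() then walks each ring whose tx_hw_cons has advanced.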
>
> > + }
> > +
> > + bnapi->events &= ~BNGE_TX_CMP_EVENT;
> > +}
> > +
> > static void __bnge_poll_work_done(struct bnge_net *bn, struct bnge_napi *bnapi,
> > int budget)
> > {
> > struct bnge_rx_ring_info *rxr = bnapi->rx_ring;
> >
> > + if ((bnapi->events & BNGE_TX_CMP_EVENT) && !bnapi->tx_fault)
> > + bnge_tx_int(bn, bnapi, budget);
> > +
> > if ((bnapi->events & BNGE_RX_EVENT)) {
> > bnge_db_write(bn->bd, &rxr->rx_db, rxr->rx_prod);
> > bnapi->events &= ~BNGE_RX_EVENT;
> > @@ -456,9 +548,26 @@ static int __bnge_poll_work(struct bnge_net *bn, struct bnge_cp_ring_info *cpr,
> > cmp_type = TX_CMP_TYPE(txcmp);
> > if (cmp_type == CMP_TYPE_TX_L2_CMP ||
> > cmp_type == CMP_TYPE_TX_L2_COAL_CMP) {
> > - /*
> > - * Tx Compl Processng
> > - */
> > + u32 opaque = txcmp->tx_cmp_opaque;
> > + struct bnge_tx_ring_info *txr;
> > + u16 tx_freed;
> > +
> > + txr = bnapi->tx_ring[TX_OPAQUE_RING(opaque)];
> > + event |= BNGE_TX_CMP_EVENT;
> > + if (cmp_type == CMP_TYPE_TX_L2_COAL_CMP)
> > + txr->tx_hw_cons = TX_CMP_SQ_CONS_IDX(txcmp);
> > + else
> > + txr->tx_hw_cons = TX_OPAQUE_PROD(bn, opaque);
> > + tx_freed = ((txr->tx_hw_cons - txr->tx_cons) &
> > + bn->tx_ring_mask);
> > + /* return full budget so NAPI will complete. */
> > + if (unlikely(tx_freed >= bn->tx_wake_thresh)) {
> > + rx_pkts = budget;
> > + raw_cons = NEXT_RAW_CMP(raw_cons);
> > + if (budget)
> > + cpr->has_more_work = 1;
> > + break;
> > + }
> > } else if (cmp_type >= CMP_TYPE_RX_L2_CMP &&
> > cmp_type <= CMP_TYPE_RX_L2_TPA_START_V3_CMP) {
> > if (likely(budget))
> > @@ -613,3 +722,277 @@ int bnge_napi_poll(struct napi_struct *napi, int budget)
> > poll_done:
> > return work_done;
> > }
> > +
> > +static u16 bnge_xmit_get_cfa_action(struct sk_buff *skb)
> > +{
> > + struct metadata_dst *md_dst = skb_metadata_dst(skb);
> > +
> > + if (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)
> > + return 0;
> > +
> > + return md_dst->u.port_info.port_id;
> > +}
> > +
> > +static const u16 bnge_lhint_arr[] = {
> > + TX_BD_FLAGS_LHINT_512_AND_SMALLER,
> > + TX_BD_FLAGS_LHINT_512_TO_1023,
> > + TX_BD_FLAGS_LHINT_1024_TO_2047,
> > + TX_BD_FLAGS_LHINT_1024_TO_2047,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > + TX_BD_FLAGS_LHINT_2048_AND_LARGER,
> > +};
> > +
> > +static void bnge_txr_db_kick(struct bnge_net *bn, struct bnge_tx_ring_info *txr,
> > + u16 prod)
> > +{
> > + /* Sync BD data before updating doorbell */
> > + wmb();
> > + bnge_db_write(bn->bd, &txr->tx_db, prod);
> > + txr->kick_pending = 0;
> > +}
> > +
> > +netdev_tx_t bnge_start_xmit(struct sk_buff *skb, struct net_device *dev)
> > +{
> > + u32 len, free_size, vlan_tag_flags, cfa_action, flags;
> > + struct bnge_net *bn = netdev_priv(dev);
> > + struct bnge_tx_ring_info *txr;
> > + struct bnge_dev *bd = bn->bd;
> > + unsigned int length, pad = 0;
> > + struct bnge_sw_tx_bd *tx_buf;
> > + struct tx_bd *txbd, *txbd0;
> > + struct netdev_queue *txq;
> > + struct tx_bd_ext *txbd1;
> > + u16 prod, last_frag;
> > + dma_addr_t mapping;
> > + __le32 lflags = 0;
> > + skb_frag_t *frag;
> > + int i;
> > +
> > + i = skb_get_queue_mapping(skb);
> > + if (unlikely(i >= bd->tx_nr_rings)) {
>
> Under which conditions the above statement can be true?
This is a defensive check to prevent out-of-bounds access during rare races,
such as an ethtool -L reconfiguration where an in-flight skb might carry a
stale queue mapping. Does this approach seem reasonable, or should I remove
the check and rely on the stack's quiescence during reconfiguration?
>
> > + dev_kfree_skb_any(skb);
> > + dev_core_stats_tx_dropped_inc(dev);
> > + return NETDEV_TX_OK;
> > + }
> > +
> > + txq = netdev_get_tx_queue(dev, i);
> > + txr = &bn->tx_ring[bn->tx_ring_map[i]];
> > + prod = txr->tx_prod;
> > +
> > +#if (MAX_SKB_FRAGS > TX_MAX_FRAGS)
> > + if (skb_shinfo(skb)->nr_frags > TX_MAX_FRAGS) {
>
> You should probably implement ndo_features_check() and ensure the above
> condition will never happen.
Thanks, I'll address this in the next revision.
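Something along these lines, perhaps (bnge_features_check() is just a
working name; the idea is to clear SG/GSO for an skb with too many
fragments so the core linearizes or software-segments it before it ever
reaches bnge_start_xmit()):

static netdev_features_t bnge_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	/* More fragments than one BD chain can describe: drop SG/GSO
	 * for this skb and let the core linearize/segment it.
	 */
	if (skb_shinfo(skb)->nr_frags > TX_MAX_FRAGS)
		features &= ~(NETIF_F_SG | NETIF_F_GSO_MASK);

	return features;
}

wired up through .ndo_features_check in the driver's net_device_ops. That
should let me drop the skb_linearize() fallback here.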
>
> > + netdev_warn_once(dev, "SKB has too many (%d) fragments, max supported is %d. SKB will be linearized.\n",
> > + skb_shinfo(skb)->nr_frags, TX_MAX_FRAGS);
> > + if (skb_linearize(skb)) {
> > + dev_kfree_skb_any(skb);
> > + dev_core_stats_tx_dropped_inc(dev);
> > + return NETDEV_TX_OK;
> > + }
> > + }
> > +#endif
> > + free_size = bnge_tx_avail(bn, txr);
> > + if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) {
> > + /* We must have raced with NAPI cleanup */
> > + if (net_ratelimit() && txr->kick_pending)
> > + netif_warn(bn, tx_err, dev,
> > + "bnge: ring busy w/ flush pending!\n");
> > + if (!netif_txq_try_stop(txq, bnge_tx_avail(bn, txr),
> > + bn->tx_wake_thresh))
> > + return NETDEV_TX_BUSY;
> > + }
> > +
> > + if (unlikely(ipv6_hopopt_jumbo_remove(skb)))
> > + goto tx_free;
> > +
> > + length = skb->len;
> > + len = skb_headlen(skb);
> > + last_frag = skb_shinfo(skb)->nr_frags;
> > +
> > + txbd = &txr->tx_desc_ring[TX_RING(bn, prod)][TX_IDX(prod)];
> > +
> > + tx_buf = &txr->tx_buf_ring[RING_TX(bn, prod)];
>
> The naming (TX_RING() vs RING_TX() with quite different meaning) is IMHO
> prone to errors.
Thanks, I'll address this in the next revision.
>
> > + tx_buf->skb = skb;
> > + tx_buf->nr_frags = last_frag;
> > +
> > + vlan_tag_flags = 0;
> > + cfa_action = bnge_xmit_get_cfa_action(skb);
> > + if (skb_vlan_tag_present(skb)) {
> > + vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN |
> > + skb_vlan_tag_get(skb);
> > + /* Currently supports 8021Q, 8021AD vlan offloads
> > + * QINQ1, QINQ2, QINQ3 vlan headers are deprecated
> > + */
> > + if (skb->vlan_proto == htons(ETH_P_8021Q))
> > + vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT;
> > + }
> > +
> > + if (unlikely(skb->no_fcs))
> > + lflags |= cpu_to_le32(TX_BD_FLAGS_NO_CRC);
> > +
> > + if (length < BNGE_MIN_PKT_SIZE) {
> > + pad = BNGE_MIN_PKT_SIZE - length;
> > + if (skb_pad(skb, pad))
> > + /* SKB already freed. */
> > + goto tx_kick_pending;
> > + length = BNGE_MIN_PKT_SIZE;
> > + }
> > +
> > + mapping = dma_map_single(bd->dev, skb->data, len, DMA_TO_DEVICE);
> > +
> > + if (unlikely(dma_mapping_error(bd->dev, mapping)))
> > + goto tx_free;
> > +
> > + dma_unmap_addr_set(tx_buf, mapping, mapping);
> > + flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD |
> > + TX_BD_CNT(last_frag + 2);
> > +
> > + txbd->tx_bd_haddr = cpu_to_le64(mapping);
> > + txbd->tx_bd_opaque = SET_TX_OPAQUE(bn, txr, prod, 2 + last_frag);
> > +
> > + prod = NEXT_TX(prod);
> > + txbd1 = (struct tx_bd_ext *)
> > + &txr->tx_desc_ring[TX_RING(bn, prod)][TX_IDX(prod)];
> > +
> > + txbd1->tx_bd_hsize_lflags = lflags;
> > + if (skb_is_gso(skb)) {
> > + bool udp_gso = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4);
> > + u32 hdr_len;
> > +
> > + if (skb->encapsulation) {
> > + if (udp_gso)
> > + hdr_len = skb_inner_transport_offset(skb) +
> > + sizeof(struct udphdr);
> > + else
> > + hdr_len = skb_inner_tcp_all_headers(skb);
> > + } else if (udp_gso) {
> > + hdr_len = skb_transport_offset(skb) +
> > + sizeof(struct udphdr);
> > + } else {
> > + hdr_len = skb_tcp_all_headers(skb);
> > + }
> > +
> > + txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_LSO |
> > + TX_BD_FLAGS_T_IPID |
> > + (hdr_len << (TX_BD_HSIZE_SHIFT - 1)));
> > + length = skb_shinfo(skb)->gso_size;
> > + txbd1->tx_bd_mss = cpu_to_le32(length);
> > + length += hdr_len;
> > + } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
> > + txbd1->tx_bd_hsize_lflags |=
> > + cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM);
> > + txbd1->tx_bd_mss = 0;
> > + }
> > +
> > + length >>= 9;
> > + if (unlikely(length >= ARRAY_SIZE(bnge_lhint_arr))) {
>
> a proper ndo_features_check() should avoid the above check.
Thanks, I'll address this in the next revision.
Thanks,
Bhargava Marreddy
>
> /P
>