[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CANn89iL-RJ84WB9W8SoZn6_UMko8sLBb_FEGjjGZTEO+9KOpAg@mail.gmail.com>
Date: Wed, 12 Nov 2025 05:55:07 -0800
From: Eric Dumazet <edumazet@...gle.com>
To: Aditya Garg <gargaditya@...ux.microsoft.com>
Cc: kys@...rosoft.com, haiyangz@...rosoft.com, wei.liu@...nel.org,
decui@...rosoft.com, andrew+netdev@...n.ch, davem@...emloft.net,
kuba@...nel.org, pabeni@...hat.com, longli@...rosoft.com,
kotaranov@...rosoft.com, horms@...nel.org, shradhagupta@...ux.microsoft.com,
ssengar@...ux.microsoft.com, ernis@...ux.microsoft.com,
dipayanroy@...ux.microsoft.com, shirazsaleem@...rosoft.com, leon@...nel.org,
mlevitsk@...hat.com, yury.norov@...il.com, sbhatta@...vell.com,
linux-hyperv@...r.kernel.org, netdev@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-rdma@...r.kernel.org,
gargaditya@...rosoft.com
Subject: Re: [PATCH net-next v4 1/2] net: mana: Handle SKB if TX SGEs exceed
hardware limit
On Wed, Nov 12, 2025 at 5:11 AM Aditya Garg
<gargaditya@...ux.microsoft.com> wrote:
>
> The MANA hardware supports a maximum of 30 scatter-gather entries (SGEs)
> per TX WQE. Exceeding this limit can cause TX failures.
> Add ndo_features_check() callback to validate SKB layout before
> transmission. For GSO SKBs that would exceed the hardware SGE limit, clear
> NETIF_F_GSO_MASK to enforce software segmentation in the stack.
> Add a fallback in mana_start_xmit() to linearize non-GSO SKBs that still
> exceed the SGE limit.
>
> Also, Add ethtool counter for SKBs linearized
>
> Co-developed-by: Dipayaan Roy <dipayanroy@...ux.microsoft.com>
> Signed-off-by: Dipayaan Roy <dipayanroy@...ux.microsoft.com>
> Signed-off-by: Aditya Garg <gargaditya@...ux.microsoft.com>
> ---
> drivers/net/ethernet/microsoft/mana/mana_en.c | 37 ++++++++++++++++++-
> .../ethernet/microsoft/mana/mana_ethtool.c | 2 +
> include/net/mana/gdma.h | 6 ++-
> include/net/mana/mana.h | 1 +
> 4 files changed, 43 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index cccd5b63cee6..67ae5421f9ee 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -11,6 +11,7 @@
> #include <linux/mm.h>
> #include <linux/pci.h>
> #include <linux/export.h>
> +#include <linux/skbuff.h>
>
> #include <net/checksum.h>
> #include <net/ip6_checksum.h>
> @@ -329,6 +330,20 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> cq = &apc->tx_qp[txq_idx].tx_cq;
> tx_stats = &txq->stats;
>
> + if (MAX_SKB_FRAGS + 2 > MAX_TX_WQE_SGL_ENTRIES &&
> + skb_shinfo(skb)->nr_frags + 2 > MAX_TX_WQE_SGL_ENTRIES) {
> + /* GSO skb with Hardware SGE limit exceeded is not expected here
> + * as they are handled in mana_features_check() callback
> + */
> + if (skb_linearize(skb)) {
> + netdev_warn_once(ndev, "Failed to linearize skb with nr_frags=%d and is_gso=%d\n",
> + skb_shinfo(skb)->nr_frags,
> + skb_is_gso(skb));
> + goto tx_drop_count;
> + }
> + apc->eth_stats.linear_pkt_tx_cnt++;
> + }
> +
> pkg.tx_oob.s_oob.vcq_num = cq->gdma_id;
> pkg.tx_oob.s_oob.vsq_frame = txq->vsq_frame;
>
> @@ -442,8 +457,6 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> }
> }
>
> - WARN_ON_ONCE(pkg.wqe_req.num_sge > MAX_TX_WQE_SGL_ENTRIES);
> -
> if (pkg.wqe_req.num_sge <= ARRAY_SIZE(pkg.sgl_array)) {
> pkg.wqe_req.sgl = pkg.sgl_array;
> } else {
> @@ -518,6 +531,25 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> return NETDEV_TX_OK;
> }
>
#if MAX_SKB_FRAGS + 2 > MAX_TX_WQE_SGL_ENTRIES
> +static netdev_features_t mana_features_check(struct sk_buff *skb,
> + struct net_device *ndev,
> + netdev_features_t features)
> +{
> + if (MAX_SKB_FRAGS + 2 > MAX_TX_WQE_SGL_ENTRIES &&
> + skb_shinfo(skb)->nr_frags + 2 > MAX_TX_WQE_SGL_ENTRIES) {
> + /* Exceeds HW SGE limit.
> + * GSO case:
> + * Disable GSO so the stack will software-segment the skb
> + * into smaller skbs that fit the SGE budget.
> + * Non-GSO case:
> + * The xmit path will attempt skb_linearize() as a fallback.
> + */
> + if (skb_is_gso(skb))
No need to test skb_is_gso(skb) here: you can simply clear the bits;
this will be a NOP if the packet is non-GSO anyway.
> + features &= ~NETIF_F_GSO_MASK;
> + }
> + return features;
> +}
#endif
> +
> static void mana_get_stats64(struct net_device *ndev,
> struct rtnl_link_stats64 *st)
> {
> @@ -878,6 +910,7 @@ static const struct net_device_ops mana_devops = {
> .ndo_open = mana_open,
> .ndo_stop = mana_close,
> .ndo_select_queue = mana_select_queue,
> + .ndo_features_check = mana_features_check,
Note that since your mana_features_check() is a NOP when MAX_SKB_FRAGS
is small enough, you could set a non-NULL .ndo_features_check only
under a preprocessor condition:
#if MAX_SKB_FRAGS + 2 > MAX_TX_WQE_SGL_ENTRIES
.ndo_features_check = ....
#endif
This would avoid an expensive indirect call when possible.
> .ndo_start_xmit = mana_start_xmit,
> .ndo_validate_addr = eth_validate_addr,
> .ndo_get_stats64 = mana_get_stats64,
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> index a1afa75a9463..fa5e1a2f06a9 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> @@ -71,6 +71,8 @@ static const struct mana_stats_desc mana_eth_stats[] = {
> {"tx_cq_err", offsetof(struct mana_ethtool_stats, tx_cqe_err)},
> {"tx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
> tx_cqe_unknown_type)},
> + {"linear_pkt_tx_cnt", offsetof(struct mana_ethtool_stats,
> + linear_pkt_tx_cnt)},
> {"rx_coalesced_err", offsetof(struct mana_ethtool_stats,
> rx_coalesced_err)},
> {"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
> diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
> index 637f42485dba..84614ebe0f4c 100644
> --- a/include/net/mana/gdma.h
> +++ b/include/net/mana/gdma.h
> @@ -592,6 +592,9 @@ enum {
> #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
> #define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
>
> +/* Driver supports linearizing the skb when num_sge exceeds hardware limit */
> +#define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20)
> +
> #define GDMA_DRV_CAP_FLAGS1 \
> (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
> GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
> @@ -601,7 +604,8 @@ enum {
> GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \
> GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
> GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \
> - GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE)
> + GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \
> + GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE)
>
> #define GDMA_DRV_CAP_FLAGS2 0
>
> diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
> index 8906901535f5..50a532fb30d6 100644
> --- a/include/net/mana/mana.h
> +++ b/include/net/mana/mana.h
> @@ -404,6 +404,7 @@ struct mana_ethtool_stats {
> u64 hc_tx_err_gdma;
> u64 tx_cqe_err;
> u64 tx_cqe_unknown_type;
> + u64 linear_pkt_tx_cnt;
> u64 rx_coalesced_err;
> u64 rx_cqe_unknown_type;
> };
> --
> 2.43.0
>
Powered by blists - more mailing lists