[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <BY3PR18MB4612468FB0CFB6DB031F888CAB799@BY3PR18MB4612.namprd18.prod.outlook.com>
Date: Tue, 30 Aug 2022 09:19:21 +0000
From: Manish Chopra <manishc@...vell.com>
To: Thinh Tran <thinhtr@...ux.vnet.ibm.com>,
"kuba@...nel.org" <kuba@...nel.org>
CC: "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
Ariel Elior <aelior@...vell.com>,
"davem@...emloft.net" <davem@...emloft.net>,
Sudarsana Reddy Kalluru <skalluru@...vell.com>,
Alok Prasad <palok@...vell.com>
Subject: RE: [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration
> -----Original Message-----
> From: Thinh Tran <thinhtr@...ux.vnet.ibm.com>
> Sent: Friday, August 26, 2022 1:30 AM
> To: kuba@...nel.org
> Cc: netdev@...r.kernel.org; Ariel Elior <aelior@...vell.com>;
> davem@...emloft.net; Manish Chopra <manishc@...vell.com>; Sudarsana
> Reddy Kalluru <skalluru@...vell.com>; Thinh Tran
> <thinhtr@...ux.vnet.ibm.com>
> Subject: [EXT] [PATCH] bnx2x: Fix error recovering in switch configuration
>
> External Email
>
> ----------------------------------------------------------------------
> As the BCM57810 and other I/O adapters are connected through a PCIe
> switch, the bnx2x driver causes unexpected system hang/crash while handling
> PCIe switch errors, if its error handler is called after other drivers' handlers.
>
> In this case, after numbers of bnx2x_tx_timout(), the
> bnx2x_nic_unload() is called, frees up resources and calls
> bnx2x_napi_disable(). Then when EEH calls its error handler, the
> bnx2x_io_error_detected() and
> bnx2x_io_slot_reset() also calling bnx2x_napi_disable() and freeing the
> resources.
>
> This patch will:
> - reduce the numbers of bnx2x_panic_dump() while in
> bnx2x_tx_timeout(), avoid filling up dmesg buffer.
> - use checking new napi_enable flags to prevent calling
> disable again which causing system hangs.
> - cheking if fp->page_pool already freed avoid system
> crash.
>
> Signed-off-by: Thinh Tran <thinhtr@...ux.vnet.ibm.com>
> ---
> drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 4 ++++
> .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 24 ++++++++++++++++++-
> .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 +++
> 3 files changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> index dd5945c4bfec..7fa23d47907a 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> @@ -1509,6 +1509,10 @@ struct bnx2x {
> bool cnic_loaded;
> struct cnic_eth_dev *(*cnic_probe)(struct net_device *);
>
> + bool napi_enable;
> + bool cnic_napi_enable;
> + int tx_timeout_cnt;
> +
> /* Flag that indicates that we can start looking for FCoE L2 queue
> * completions in the default status block.
> */
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> index 712b5595bc39..bb8d91f44642 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
> @@ -1860,37 +1860,49 @@ static int bnx2x_setup_irqs(struct bnx2x *bp)
> static void bnx2x_napi_enable_cnic(struct bnx2x *bp) {
> int i;
> + if (bp->cnic_napi_enable)
> + return;
>
> for_each_rx_queue_cnic(bp, i) {
> napi_enable(&bnx2x_fp(bp, i, napi));
> }
> + bp->cnic_napi_enable = true;
> }
>
> static void bnx2x_napi_enable(struct bnx2x *bp) {
> int i;
> + if (bp->napi_enable)
> + return;
>
> for_each_eth_queue(bp, i) {
> napi_enable(&bnx2x_fp(bp, i, napi));
> }
> + bp->napi_enable = true;
> }
>
> static void bnx2x_napi_disable_cnic(struct bnx2x *bp) {
> int i;
> + if (!bp->cnic_napi_enable)
> + return;
>
> for_each_rx_queue_cnic(bp, i) {
> napi_disable(&bnx2x_fp(bp, i, napi));
> }
> + bp->cnic_napi_enable = false;
> }
>
> static void bnx2x_napi_disable(struct bnx2x *bp) {
> int i;
> + if (!bp->napi_enable)
> + return;
>
> for_each_eth_queue(bp, i) {
> napi_disable(&bnx2x_fp(bp, i, napi));
> }
> + bp->napi_enable = false;
> }
>
> void bnx2x_netif_start(struct bnx2x *bp) @@ -2554,6 +2566,7 @@ int
> bnx2x_load_cnic(struct bnx2x *bp)
> }
>
> /* Add all CNIC NAPI objects */
> + bp->cnic_napi_enable = false;
> bnx2x_add_all_napi_cnic(bp);
> DP(NETIF_MSG_IFUP, "cnic napi added\n");
> bnx2x_napi_enable_cnic(bp);
> @@ -2701,7 +2714,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int
> load_mode)
> */
> bnx2x_setup_tc(bp->dev, bp->max_cos);
>
> + bp->tx_timeout_cnt = 0;
> /* Add all NAPI objects */
> + bp->napi_enable = false;
> bnx2x_add_all_napi(bp);
> DP(NETIF_MSG_IFUP, "napi added\n");
> bnx2x_napi_enable(bp);
> @@ -4982,7 +4997,14 @@ void bnx2x_tx_timeout(struct net_device *dev,
> unsigned int txqueue)
> */
> if (!bp->panic)
> #ifndef BNX2X_STOP_ON_ERROR
> - bnx2x_panic_dump(bp, false);
> + {
> + if (++bp->tx_timeout_cnt > 3) {
> + bnx2x_panic_dump(bp, false);
> + bp->tx_timeout_cnt = 0;
> + } else {
> + netdev_err(bp->dev, "TX timeout %d times\n", bp-
> >tx_timeout_cnt);
> + }
> +
Hello Trinh,
bnx2x_panic_dump() dumps quite of some useful debug (fastpath, hw related) info to look at in the logs,
I think they should be dumped at very first occurrence of tx timeout rather than waiting on next few tx timeout
event (which may not even occur in some cases). One another way to prevent it could be by turning off the
carrier (netif_carrier_off()) at very first place in bnx2x_tx_timeout() to prevent such repeated occurrences of tx timeout on the netdev.
Thanks,
Manish
Powered by blists - more mailing lists