[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <c47dc55f-9ef7-4a18-8419-fca4afb605b0@intel.com>
Date: Mon, 6 May 2024 11:35:52 +0200
From: Przemek Kitszel <przemyslaw.kitszel@...el.com>
To: Thinh Tran <thinhtr@...ux.ibm.com>, <netdev@...r.kernel.org>,
<kuba@...nel.org>, <anthony.l.nguyen@...el.com>,
<aleksandr.loktionov@...el.com>
CC: <jesse.brandeburg@...el.com>, <davem@...emloft.net>,
<edumazet@...gle.com>, <pabeni@...hat.com>,
<intel-wired-lan@...ts.osuosl.org>, Robert Thomas <rob.thomas@....com>
Subject: Re: [PATCH iwl-net] i40e: Fix repeated EEH reports in MSI domain
On 5/3/24 17:25, Thinh Tran wrote:
> The patch fixes an issue when repeated EEH reports with a single error
> on the bus of Intel X710 4-port 10G Base-T adapter, in the MSI domain
> causing the devices to be permanently disabled. It fully resets and
> restart the devices when handling the PCI EEH error.
>
> Two new functions, i40e_io_suspend() and i40e_io_resume(), have been
> introduced. These functions were refactor from the existing
> i40e_suspend() and i40e_resume() respectively. This refactoring was
> done due to concerns about the logic of the I40E_SUSPENSED state, which
> caused the device not able to recover. The functios are now used in the
s/functios/functions/
> EEH handling for device suspend/resume callbacks.
>
> - In the PCI error detected callback, replaced i40e_prep_for_reset()
> with i40e_io_suspend(). The chance is to fully suspend all I/O
> operations
it's only a chance? (perhaps a typo)
> - In the PCI error slot reset callback, replaced pci_enable_device_mem()
> with pci_enable_device(). This change enables both I/O and memory of
> the device.
>
> - In the PCI error resume callback, replace i40e_handle_reset_warning()
> with i40e_io_resume(). This change allows the system to resume I/O
> operations
>
> Fixes: a5f3d2c17b07 ("powerpc/pseries/pci: Add MSI domains")
>
> Signed-off-by: Thinh Tran <thinhtr@...ux.ibm.com>
> Tested-by: Robert Thomas <rob.thomas@....com>
In general do not add a blank line after Fixes tag.
Someone could complain that i40e driver had a bug prior to the cited
core commit, but for more practical purposes I find it good as is.
When you are a sender of the patch, it's good to place your Signed-off
as a last tag.
>
> ---
> drivers/net/ethernet/intel/i40e/i40e_main.c | 29 ++++++++++++++++-----
> 1 file changed, 22 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index 48b9ddb2b1b3..58418aa9231e 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -54,6 +54,9 @@ static int i40e_get_capabilities(struct i40e_pf *pf,
> enum i40e_admin_queue_opc list_type);
> static bool i40e_is_total_port_shutdown_enabled(struct i40e_pf *pf);
>
> +static int i40e_io_suspend(struct i40e_pf *pf);
> +static int i40e_io_resume(struct i40e_pf *pf);
I appreciate your effort to minimize changed lines to have a fix better
visible, however we avoid forward declarations in .c files.
I would split this into two commits - the first to just factor out/move
functions w/o functional changes, then second one with a logic fix.
> +
> /* i40e_pci_tbl - PCI Device ID Table
> *
> * Last entry must be all 0s
> @@ -11138,6 +11141,8 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit,
> ret = i40e_reset(pf);
> if (!ret)
> i40e_rebuild(pf, reinit, lock_acquired);
> + else
> + dev_err(&pf->pdev->dev, "%s: i40e_reset() FAILED", __func__);
> }
>
> /**
> @@ -16327,7 +16332,7 @@ static pci_ers_result_t i40e_pci_error_detected(struct pci_dev *pdev,
>
> /* shutdown all operations */
> if (!test_bit(__I40E_SUSPENDED, pf->state))
> - i40e_prep_for_reset(pf);
> + i40e_io_suspend(pf);
>
> /* Request a slot reset */
> return PCI_ERS_RESULT_NEED_RESET;
> @@ -16349,7 +16354,8 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev)
> u32 reg;
>
> dev_dbg(&pdev->dev, "%s\n", __func__);
> - if (pci_enable_device_mem(pdev)) {
> + /* enable I/O and memory of the device */
> + if (pci_enable_device(pdev)) {
> dev_info(&pdev->dev,
> "Cannot re-enable PCI device after reset.\n");
> result = PCI_ERS_RESULT_DISCONNECT;
> @@ -16411,8 +16417,7 @@ static void i40e_pci_error_resume(struct pci_dev *pdev)
> dev_dbg(&pdev->dev, "%s\n", __func__);
> if (test_bit(__I40E_SUSPENDED, pf->state))
> return;
> -
> - i40e_handle_reset_warning(pf, false);
> + i40e_io_resume(pf);
> }
>
> /**
> @@ -16521,11 +16526,16 @@ static void i40e_shutdown(struct pci_dev *pdev)
> static int __maybe_unused i40e_suspend(struct device *dev)
> {
> struct i40e_pf *pf = dev_get_drvdata(dev);
> - struct i40e_hw *hw = &pf->hw;
>
> /* If we're already suspended, then there is nothing to do */
> if (test_and_set_bit(__I40E_SUSPENDED, pf->state))
> return 0;
> + return i40e_io_suspend(pf);
> +}
> +
> +static int i40e_io_suspend(struct i40e_pf *pf)
> +{
> + struct i40e_hw *hw = &pf->hw;
>
> set_bit(__I40E_DOWN, pf->state);
>
> @@ -16572,11 +16582,16 @@ static int __maybe_unused i40e_suspend(struct device *dev)
> static int __maybe_unused i40e_resume(struct device *dev)
> {
> struct i40e_pf *pf = dev_get_drvdata(dev);
> - int err;
>
> /* If we're not suspended, then there is nothing to do */
> if (!test_bit(__I40E_SUSPENDED, pf->state))
> return 0;
> + return i40e_io_resume(pf);
> +}
> +
> +static int i40e_io_resume(struct i40e_pf *pf)
> +{
> + int err;
>
> /* We need to hold the RTNL lock prior to restoring interrupt schemes,
> * since we're going to be restoring queues
> @@ -16588,7 +16603,7 @@ static int __maybe_unused i40e_resume(struct device *dev)
> */
> err = i40e_restore_interrupt_scheme(pf);
> if (err) {
> - dev_err(dev, "Cannot restore interrupt scheme: %d\n",
> + dev_err(&pf->pdev->dev, "Cannot restore interrupt scheme: %d\n",
> err);
> }
>
Powered by blists - more mailing lists