linux-kernel - Re: [PATCH RESEND V2 net 1/7] net: hns3: fixed reset failure issues caused by the incorrect reset type

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Z2KPw9WYCI/SZIjg@mev-dev.igk.intel.com>
Date: Wed, 18 Dec 2024 10:02:59 +0100
From: Michal Swiatkowski <michal.swiatkowski@...ux.intel.com>
To: Jijie Shao <shaojijie@...wei.com>
Cc: davem@...emloft.net, edumazet@...gle.com, kuba@...nel.org,
	pabeni@...hat.com, andrew+netdev@...n.ch, horms@...nel.org,
	shenjian15@...wei.com, wangpeiyang1@...wei.com,
	liuyonglong@...wei.com, chenhao418@...wei.com,
	jonathan.cameron@...wei.com, shameerali.kolothum.thodi@...wei.com,
	salil.mehta@...wei.com, netdev@...r.kernel.org,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH RESEND V2 net 1/7] net: hns3: fixed reset failure issues
 caused by the incorrect reset type

On Tue, Dec 17, 2024 at 09:08:33AM +0800, Jijie Shao wrote:
> From: Hao Lan <lanhao@...wei.com>
> 
> When a reset type that is not supported by the driver is input, a reset
> pending flag bit of the HNAE3_NONE_RESET type is generated in
> reset_pending. The driver does not have a mechanism to clear this type
> of error. As a result, the driver considers that the reset is not
> complete. This patch provides a mechanism to clear the
> HNAE3_NONE_RESET flag and the parameter of
> hnae3_ae_ops.set_default_reset_request is verified.
> 
> The error message:
> hns3 0000:39:01.0: cmd failed -16
> hns3 0000:39:01.0: hclge device re-init failed, VF is disabled!
> hns3 0000:39:01.0: failed to reset VF stack
> hns3 0000:39:01.0: failed to reset VF(4)
> hns3 0000:39:01.0: prepare reset(2) wait done
> hns3 0000:39:01.0 eth4: already uninitialized
> 
> Use the crash tool to view struct hclgevf_dev:
> struct hclgevf_dev {
> ...
> 	default_reset_request = 0x20,
> 	reset_level = HNAE3_NONE_RESET,
> 	reset_pending = 0x100,
> 	reset_type = HNAE3_NONE_RESET,
> ...
> };
> 
> Fixes: 720bd5837e37 ("net: hns3: add set_default_reset_request in the hnae3_ae_ops")
> Signed-off-by: Hao Lan <lanhao@...wei.com>
> Signed-off-by: Jijie Shao <shaojijie@...wei.com>
> Signed-off-by: Paolo Abeni <pabeni@...hat.com>
> ---
>  .../hisilicon/hns3/hns3pf/hclge_main.c        | 33 ++++++++++++++--
>  .../hisilicon/hns3/hns3vf/hclgevf_main.c      | 38 ++++++++++++++++---
>  2 files changed, 61 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> index 05942fa78b11..7d44dc777dc5 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> @@ -3574,6 +3574,17 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf,
>  	return ret;
>  }
>  
> +static void hclge_set_reset_pending(struct hclge_dev *hdev,
> +				    enum hnae3_reset_type reset_type)
> +{
> +	/* When an incorrect reset type is executed, the get_reset_level
> +	 * function generates the HNAE3_NONE_RESET flag. As a result, this
> +	 * type do not need to pending.
> +	 */
> +	if (reset_type != HNAE3_NONE_RESET)
> +		set_bit(reset_type, &hdev->reset_pending);
> +}
> +
>  static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
>  {
>  	u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg;
> @@ -3594,7 +3605,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
>  	 */
>  	if (BIT(HCLGE_VECTOR0_IMPRESET_INT_B) & msix_src_reg) {
>  		dev_info(&hdev->pdev->dev, "IMP reset interrupt\n");
> -		set_bit(HNAE3_IMP_RESET, &hdev->reset_pending);
> +		hclge_set_reset_pending(hdev, HNAE3_IMP_RESET);
>  		set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state);
>  		*clearval = BIT(HCLGE_VECTOR0_IMPRESET_INT_B);
>  		hdev->rst_stats.imp_rst_cnt++;
> @@ -3604,7 +3615,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
>  	if (BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B) & msix_src_reg) {
>  		dev_info(&hdev->pdev->dev, "global reset interrupt\n");
>  		set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state);
> -		set_bit(HNAE3_GLOBAL_RESET, &hdev->reset_pending);
> +		hclge_set_reset_pending(hdev, HNAE3_GLOBAL_RESET);
>  		*clearval = BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B);
>  		hdev->rst_stats.global_rst_cnt++;
>  		return HCLGE_VECTOR0_EVENT_RST;
> @@ -4052,7 +4063,7 @@ static void hclge_do_reset(struct hclge_dev *hdev)
>  	case HNAE3_FUNC_RESET:
>  		dev_info(&pdev->dev, "PF reset requested\n");
>  		/* schedule again to check later */
> -		set_bit(HNAE3_FUNC_RESET, &hdev->reset_pending);
> +		hclge_set_reset_pending(hdev, HNAE3_FUNC_RESET);
>  		hclge_reset_task_schedule(hdev);
>  		break;
>  	default:
> @@ -4086,6 +4097,8 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev,
>  		clear_bit(HNAE3_FLR_RESET, addr);
>  	}
>  
> +	clear_bit(HNAE3_NONE_RESET, addr);
> +
>  	if (hdev->reset_type != HNAE3_NONE_RESET &&
>  	    rst_level < hdev->reset_type)
>  		return HNAE3_NONE_RESET;
> @@ -4227,7 +4240,7 @@ static bool hclge_reset_err_handle(struct hclge_dev *hdev)
>  		return false;
>  	} else if (hdev->rst_stats.reset_fail_cnt < MAX_RESET_FAIL_CNT) {
>  		hdev->rst_stats.reset_fail_cnt++;
> -		set_bit(hdev->reset_type, &hdev->reset_pending);
> +		hclge_set_reset_pending(hdev, hdev->reset_type);
Sth is unclear for me here. Doesn't HNAE3_NONE_RESET mean that there is
no reset? If yes, why in this case reset_fail_cnt++ is increasing?

Maybe the check for NONE_RESET should be done in this else if check to
prevent reset_fail_cnt from increasing (and also solve the problem with
pending bit set)
>  		dev_info(&hdev->pdev->dev,
>  			 "re-schedule reset task(%u)\n",
>  			 hdev->rst_stats.reset_fail_cnt);
> @@ -4470,8 +4483,20 @@ static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle)
>  static void hclge_set_def_reset_request(struct hnae3_ae_dev *ae_dev,
>  					enum hnae3_reset_type rst_type)
>  {
> +#define HCLGE_SUPPORT_RESET_TYPE \
> +	(BIT(HNAE3_FLR_RESET) | BIT(HNAE3_FUNC_RESET) | \
> +	BIT(HNAE3_GLOBAL_RESET) | BIT(HNAE3_IMP_RESET))
> +
>  	struct hclge_dev *hdev = ae_dev->priv;
>  
> +	if (!(BIT(rst_type) & HCLGE_SUPPORT_RESET_TYPE)) {
> +		/* To prevent reset triggered by hclge_reset_event */
> +		set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request);
> +		dev_warn(&hdev->pdev->dev, "unsupported reset type %d\n",
> +			 rst_type);
> +		return;
> +	}
Maybe (nit):
if (...) {
	rst_type = 
	dev_warn();
}

set_bit(rst_type, );
It is a little hard to follow with return in the if.

> +
>  	set_bit(rst_type, &hdev->default_reset_request);
>  }
>  
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
> index 2f6ffb88e700..fd0abe37fdd7 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
> @@ -1393,6 +1393,17 @@ static int hclgevf_notify_roce_client(struct hclgevf_dev *hdev,
>  	return ret;
>  }
>  
> +static void hclgevf_set_reset_pending(struct hclgevf_dev *hdev,
> +				      enum hnae3_reset_type reset_type)
> +{
> +	/* When an incorrect reset type is executed, the get_reset_level
> +	 * function generates the HNAE3_NONE_RESET flag. As a result, this
> +	 * type do not need to pending.
> +	 */
> +	if (reset_type != HNAE3_NONE_RESET)
> +		set_bit(reset_type, &hdev->reset_pending);
> +}
You already have a way to share the code between PF and VF, so please
move the same functions to common file in one direction up.

> +
>  static int hclgevf_reset_wait(struct hclgevf_dev *hdev)
>  {
>  #define HCLGEVF_RESET_WAIT_US	20000
> @@ -1542,7 +1553,7 @@ static void hclgevf_reset_err_handle(struct hclgevf_dev *hdev)
>  		hdev->rst_stats.rst_fail_cnt);
>  
>  	if (hdev->rst_stats.rst_fail_cnt < HCLGEVF_RESET_MAX_FAIL_CNT)
> -		set_bit(hdev->reset_type, &hdev->reset_pending);
> +		hclgevf_set_reset_pending(hdev, hdev->reset_type);
>  
>  	if (hclgevf_is_reset_pending(hdev)) {
>  		set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state);
> @@ -1662,6 +1673,8 @@ static enum hnae3_reset_type hclgevf_get_reset_level(unsigned long *addr)
>  		clear_bit(HNAE3_FLR_RESET, addr);
>  	}
>  
> +	clear_bit(HNAE3_NONE_RESET, addr);
> +
>  	return rst_level;
>  }
>  
> @@ -1671,14 +1684,15 @@ static void hclgevf_reset_event(struct pci_dev *pdev,
>  	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
>  	struct hclgevf_dev *hdev = ae_dev->priv;
>  
> -	dev_info(&hdev->pdev->dev, "received reset request from VF enet\n");
> -
>  	if (hdev->default_reset_request)
>  		hdev->reset_level =
>  			hclgevf_get_reset_level(&hdev->default_reset_request);
>  	else
>  		hdev->reset_level = HNAE3_VF_FUNC_RESET;
>  
> +	dev_info(&hdev->pdev->dev, "received reset request from VF enet, reset level is %d\n",
> +		 hdev->reset_level);
> +
>  	/* reset of this VF requested */
>  	set_bit(HCLGEVF_RESET_REQUESTED, &hdev->reset_state);
>  	hclgevf_reset_task_schedule(hdev);
> @@ -1689,8 +1703,20 @@ static void hclgevf_reset_event(struct pci_dev *pdev,
>  static void hclgevf_set_def_reset_request(struct hnae3_ae_dev *ae_dev,
>  					  enum hnae3_reset_type rst_type)
>  {
> +#define HCLGEVF_SUPPORT_RESET_TYPE \
> +	(BIT(HNAE3_VF_RESET) | BIT(HNAE3_VF_FUNC_RESET) | \
> +	BIT(HNAE3_VF_PF_FUNC_RESET) | BIT(HNAE3_VF_FULL_RESET) | \
> +	BIT(HNAE3_FLR_RESET) | BIT(HNAE3_VF_EXP_RESET))
> +
>  	struct hclgevf_dev *hdev = ae_dev->priv;
>  
> +	if (!(BIT(rst_type) & HCLGEVF_SUPPORT_RESET_TYPE)) {
> +		/* To prevent reset triggered by hclge_reset_event */
> +		set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request);
> +		dev_info(&hdev->pdev->dev, "unsupported reset type %d\n",
> +			 rst_type);
> +		return;
> +	}
>  	set_bit(rst_type, &hdev->default_reset_request);
>  }
>  
> @@ -1847,14 +1873,14 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev)
>  		 */
>  		if (hdev->reset_attempts > HCLGEVF_MAX_RESET_ATTEMPTS_CNT) {
>  			/* prepare for full reset of stack + pcie interface */
> -			set_bit(HNAE3_VF_FULL_RESET, &hdev->reset_pending);
> +			hclgevf_set_reset_pending(hdev, HNAE3_VF_FULL_RESET);
>  
>  			/* "defer" schedule the reset task again */
>  			set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state);
>  		} else {
>  			hdev->reset_attempts++;
>  
> -			set_bit(hdev->reset_level, &hdev->reset_pending);
> +			hclgevf_set_reset_pending(hdev, hdev->reset_level);
>  			set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state);
>  		}
>  		hclgevf_reset_task_schedule(hdev);
> @@ -1977,7 +2003,7 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev,
>  		rst_ing_reg = hclgevf_read_dev(&hdev->hw, HCLGEVF_RST_ING);
>  		dev_info(&hdev->pdev->dev,
>  			 "receive reset interrupt 0x%x!\n", rst_ing_reg);
> -		set_bit(HNAE3_VF_RESET, &hdev->reset_pending);
> +		hclgevf_set_reset_pending(hdev, HNAE3_VF_RESET);
>  		set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state);
>  		set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state);
>  		*clearval = ~(1U << HCLGEVF_VECTOR0_RST_INT_B);
> -- 
> 2.33.0