lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ZSQDQll8rwbucNpP@kernel.org>
Date: Mon, 9 Oct 2023 15:42:26 +0200
From: Simon Horman <horms@...nel.org>
To: Jijie Shao <shaojijie@...wei.com>
Cc: yisen.zhuang@...wei.com, salil.mehta@...wei.com, davem@...emloft.net,
	edumazet@...gle.com, kuba@...nel.org, pabeni@...hat.com,
	shenjian15@...wei.com, wangjie125@...wei.com,
	liuyonglong@...wei.com, netdev@...r.kernel.org,
	linux-kernel@...r.kernel.org, Leon Romanovsky <leon@...nel.org>
Subject: Re: [PATCH V2 net-next 2/2] net: hns3: add vf fault detect support

+ Leon

On Sat, Oct 07, 2023 at 11:12:15AM +0800, Jijie Shao wrote:
> From: Jie Wang <wangjie125@...wei.com>
> 
> Currently, the hns3 driver supports the VF fault detect feature. Several
> RAS errors caused by VF resources do not require a PF function reset for
> recovery; the driver only needs to reset the specified VF.
> 
> So this patch adds a new process to the RAS module. The new process gets
> detailed information about the RAS error and takes the most appropriate
> measures based on this accurate information.
> 
> Signed-off-by: Jie Wang <wangjie125@...wei.com>
> Signed-off-by: Jijie Shao <shaojijie@...wei.com>
> ---
> changeLog:
> v1 -> v2:
>   - fix the wrong use of vf recovery notify interface
>   - add BUILD_BUG_ON to guarantee the macros are consistent
>   - optimise hclge_handle_vf_queue_err_ras for unsupported firmware
>   v1: https://lore.kernel.org/netdev/20230113020829.48451-1-lanhao@huawei.com/ 

Hi Leon,

I believe you reviewed v1 of this back in January and February.
Could you find some time to look at v2?

> ---
>  drivers/net/ethernet/hisilicon/hns3/hnae3.h   |   1 +
>  .../hns3/hns3_common/hclge_comm_cmd.h         |   1 +
>  .../hisilicon/hns3/hns3pf/hclge_err.c         | 116 +++++++++++++++++-
>  .../hisilicon/hns3/hns3pf/hclge_err.h         |   2 +
>  .../hisilicon/hns3/hns3pf/hclge_main.c        |   3 +-
>  .../hisilicon/hns3/hns3pf/hclge_main.h        |   2 +
>  .../hisilicon/hns3/hns3pf/hclge_mbx.c         |   2 +-
>  7 files changed, 120 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
> index 46062106fc6a..d7e175a9cb49 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
> @@ -275,6 +275,7 @@ enum hnae3_reset_type {
>  	HNAE3_GLOBAL_RESET,
>  	HNAE3_IMP_RESET,
>  	HNAE3_NONE_RESET,
> +	HNAE3_VF_EXP_RESET,
>  	HNAE3_MAX_RESET,
>  };
>  
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h
> index 92e73d44f0e5..533c19d25e4f 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h
> @@ -93,6 +93,7 @@ enum hclge_opcode_type {
>  	HCLGE_OPC_DFX_SSU_REG_2		= 0x004F,
>  
>  	HCLGE_OPC_QUERY_DEV_SPECS	= 0x0050,
> +	HCLGE_OPC_GET_QUEUE_ERR_VF      = 0x0067,
>  
>  	/* MAC command */
>  	HCLGE_OPC_CONFIG_MAC_MODE	= 0x0301,
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
> index 3f35227ef1fa..d63e114f93d0 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
> @@ -1301,10 +1301,12 @@ static const struct hclge_hw_type_id hclge_hw_type_id_st[] = {
>  		.msg = "tqp_int_ecc_error"
>  	}, {
>  		.type_id = PF_ABNORMAL_INT_ERROR,
> -		.msg = "pf_abnormal_int_error"
> +		.msg = "pf_abnormal_int_error",
> +		.cause_by_vf = true
>  	}, {
>  		.type_id = MPF_ABNORMAL_INT_ERROR,
> -		.msg = "mpf_abnormal_int_error"
> +		.msg = "mpf_abnormal_int_error",
> +		.cause_by_vf = true
>  	}, {
>  		.type_id = COMMON_ERROR,
>  		.msg = "common_error"
> @@ -2759,7 +2761,7 @@ void hclge_handle_occurred_error(struct hclge_dev *hdev)
>  		hclge_handle_error_info_log(ae_dev);
>  }
>  
> -static void
> +static bool
>  hclge_handle_error_type_reg_log(struct device *dev,
>  				struct hclge_mod_err_info *mod_info,
>  				struct hclge_type_reg_err_info *type_reg_info)
> @@ -2770,6 +2772,7 @@ hclge_handle_error_type_reg_log(struct device *dev,
>  	u8 mod_id, total_module, type_id, total_type, i, is_ras;
>  	u8 index_module = MODULE_NONE;
>  	u8 index_type = NONE_ERROR;
> +	bool cause_by_vf = false;
>  
>  	mod_id = mod_info->mod_id;
>  	type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK;
> @@ -2788,6 +2791,7 @@ hclge_handle_error_type_reg_log(struct device *dev,
>  	for (i = 0; i < total_type; i++) {
>  		if (type_id == hclge_hw_type_id_st[i].type_id) {
>  			index_type = i;
> +			cause_by_vf = hclge_hw_type_id_st[i].cause_by_vf;
>  			break;
>  		}
>  	}
> @@ -2805,6 +2809,8 @@ hclge_handle_error_type_reg_log(struct device *dev,
>  	dev_err(dev, "reg_value:\n");
>  	for (i = 0; i < type_reg_info->reg_num; i++)
>  		dev_err(dev, "0x%08x\n", type_reg_info->hclge_reg[i]);
> +
> +	return cause_by_vf;
>  }
>  
>  static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
> @@ -2815,6 +2821,7 @@ static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
>  	struct device *dev = &hdev->pdev->dev;
>  	struct hclge_mod_err_info *mod_info;
>  	struct hclge_sum_err_info *sum_info;
> +	bool cause_by_vf = false;
>  	u8 mod_num, err_num, i;
>  	u32 offset = 0;
>  
> @@ -2843,12 +2850,16 @@ static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
>  
>  			type_reg_info = (struct hclge_type_reg_err_info *)
>  					    &buf[offset++];
> -			hclge_handle_error_type_reg_log(dev, mod_info,
> -							type_reg_info);
> +			if (hclge_handle_error_type_reg_log(dev, mod_info,
> +							    type_reg_info))
> +				cause_by_vf = true;
>  
>  			offset += type_reg_info->reg_num;
>  		}
>  	}
> +
> +	if (hnae3_ae_dev_vf_fault_supported(hdev->ae_dev) && cause_by_vf)
> +		set_bit(HNAE3_VF_EXP_RESET, &ae_dev->hw_err_reset_req);
>  }
>  
>  static int hclge_query_all_err_bd_num(struct hclge_dev *hdev, u32 *bd_num)
> @@ -2940,3 +2951,98 @@ int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev)
>  out:
>  	return ret;
>  }
> +
> +static bool hclge_reset_vf_in_bitmap(struct hclge_dev *hdev,
> +				     unsigned long *bitmap)
> +{
> +	struct hclge_vport *vport;
> +	bool exist_set = false;
> +	int func_id;
> +	int ret;
> +
> +	func_id = find_first_bit(bitmap, HCLGE_VPORT_NUM);
> +	if (func_id == PF_VPORT_ID)
> +		return false;
> +
> +	while (func_id != HCLGE_VPORT_NUM) {
> +		vport = hclge_get_vf_vport(hdev,
> +					   func_id - HCLGE_VF_VPORT_START_NUM);
> +		if (!vport) {
> +			dev_err(&hdev->pdev->dev, "invalid func id(%d)\n",
> +				func_id);
> +			return false;
> +		}
> +
> +		dev_info(&hdev->pdev->dev, "do function %d recovery.", func_id);
> +
> +		ret = hclge_reset_tqp(&vport->nic);
> +		if (ret) {
> +			dev_err(&hdev->pdev->dev,
> +				"failed to reset tqp, ret = %d.", ret);
> +			return false;
> +		}
> +
> +		ret = hclge_inform_vf_reset(vport, HNAE3_VF_FUNC_RESET);
> +		if (ret) {
> +			dev_err(&hdev->pdev->dev,
> +				"failed to reset func %d, ret = %d.",
> +				func_id, ret);
> +			return false;
> +		}
> +
> +		exist_set = true;
> +		clear_bit(func_id, bitmap);
> +		func_id = find_first_bit(bitmap, HCLGE_VPORT_NUM);
> +	}
> +
> +	return exist_set;
> +}
> +
> +static void hclge_get_vf_fault_bitmap(struct hclge_desc *desc,
> +				      unsigned long *bitmap)
> +{
> +#define HCLGE_FIR_FAULT_BYTES	24
> +#define HCLGE_SEC_FAULT_BYTES	8
> +
> +	u8 *buff;
> +
> +	BUILD_BUG_ON(HCLGE_FIR_FAULT_BYTES + HCLGE_SEC_FAULT_BYTES !=
> +		     BITS_TO_BYTES(HCLGE_VPORT_NUM));
> +
> +	memcpy(bitmap, desc[0].data, HCLGE_FIR_FAULT_BYTES);
> +	buff = (u8 *)bitmap + HCLGE_FIR_FAULT_BYTES;
> +	memcpy(buff, desc[1].data, HCLGE_SEC_FAULT_BYTES);
> +}
> +
> +int hclge_handle_vf_queue_err_ras(struct hclge_dev *hdev)
> +{
> +	unsigned long vf_fault_bitmap[BITS_TO_LONGS(HCLGE_VPORT_NUM)];
> +	struct hclge_desc desc[2];
> +	bool cause_by_vf = false;
> +	int ret;
> +
> +	if (!test_and_clear_bit(HNAE3_VF_EXP_RESET,
> +				&hdev->ae_dev->hw_err_reset_req) ||
> +	    !hnae3_ae_dev_vf_fault_supported(hdev->ae_dev))
> +		return 0;
> +
> +	hclge_comm_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_GET_QUEUE_ERR_VF,
> +					true);
> +	desc[0].flag |= cpu_to_le16(HCLGE_COMM_CMD_FLAG_NEXT);
> +	hclge_comm_cmd_setup_basic_desc(&desc[1], HCLGE_OPC_GET_QUEUE_ERR_VF,
> +					true);
> +
> +	ret = hclge_comm_cmd_send(&hdev->hw.hw, desc, 2);
> +	if (ret) {
> +		dev_err(&hdev->pdev->dev,
> +			"failed to get vf bitmap, ret = %d.\n", ret);
> +		return ret;
> +	}
> +	hclge_get_vf_fault_bitmap(desc, vf_fault_bitmap);
> +
> +	cause_by_vf = hclge_reset_vf_in_bitmap(hdev, vf_fault_bitmap);
> +	if (cause_by_vf)
> +		hdev->ae_dev->hw_err_reset_req = 0;
> +
> +	return 0;
> +}
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
> index 86be6fb32990..68b738affa66 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
> @@ -196,6 +196,7 @@ struct hclge_hw_module_id {
>  struct hclge_hw_type_id {
>  	enum hclge_err_type_list type_id;
>  	const char *msg;
> +	bool cause_by_vf; /* indicate the error may from vf exception */
>  };
>  
>  struct hclge_sum_err_info {
> @@ -228,4 +229,5 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
>  			       unsigned long *reset_requests);
>  int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev);
>  int hclge_handle_mac_tnl(struct hclge_dev *hdev);
> +int hclge_handle_vf_queue_err_ras(struct hclge_dev *hdev);
>  #endif
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> index c42574e29747..99c0576e6383 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> @@ -3424,7 +3424,7 @@ static int hclge_get_status(struct hnae3_handle *handle)
>  	return hdev->hw.mac.link;
>  }
>  
> -static struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf)
> +struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf)
>  {
>  	if (!pci_num_vf(hdev->pdev)) {
>  		dev_err(&hdev->pdev->dev,
> @@ -4468,6 +4468,7 @@ static void hclge_handle_err_recovery(struct hclge_dev *hdev)
>  	if (hclge_find_error_source(hdev)) {
>  		hclge_handle_error_info_log(ae_dev);
>  		hclge_handle_mac_tnl(hdev);
> +		hclge_handle_vf_queue_err_ras(hdev);
>  	}
>  
>  	hclge_handle_err_reset_request(hdev);
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
> index 7bc2049b723d..02c7aab3546e 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
> @@ -1146,4 +1146,6 @@ int hclge_dbg_dump_rst_info(struct hclge_dev *hdev, char *buf, int len);
>  int hclge_push_vf_link_status(struct hclge_vport *vport);
>  int hclge_enable_vport_vlan_filter(struct hclge_vport *vport, bool request_en);
>  int hclge_mac_update_stats(struct hclge_dev *hdev);
> +struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf);
> +int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type);
>  #endif
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
> index 04ff9bf12185..4b0d07ca2505 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
> @@ -124,7 +124,7 @@ static int hclge_send_mbx_msg(struct hclge_vport *vport, u8 *msg, u16 msg_len,
>  	return status;
>  }
>  
> -static int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type)
> +int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type)
>  {
>  	__le16 msg_data;
>  	u8 dest_vfid;
> -- 
> 2.30.0
> 
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ