netdev - Re: [PATCH net-next v2 6/6] ice: Add MDD logging via devlink health

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <k2stckvckusd7pdjkvxpbfqabnarrqc7igcirnhorj2gobidgj@iugsqakc45b6>
Date: Tue, 11 Feb 2025 15:30:23 +0100
From: Jiri Pirko <jiri@...nulli.us>
To: Tony Nguyen <anthony.l.nguyen@...el.com>
Cc: davem@...emloft.net, kuba@...nel.org, pabeni@...hat.com, 
	edumazet@...gle.com, andrew+netdev@...n.ch, netdev@...r.kernel.org, 
	Ben Shelton <benjamin.h.shelton@...el.com>, przemyslaw.kitszel@...el.com, mateusz.polchlopek@...el.com, 
	joe@...ches.com, horms@...nel.org, apw@...onical.com, lukas.bulwahn@...il.com, 
	dwaipayanray1@...il.com, Igor Bagnucki <igor.bagnucki@...el.com>, 
	Pucha Himasekhar Reddy <himasekharx.reddy.pucha@...el.com>
Subject: Re: [PATCH net-next v2 6/6] ice: Add MDD logging via devlink health

Tue, Dec 17, 2024 at 10:08:33PM +0100, anthony.l.nguyen@...el.com wrote:
>From: Ben Shelton <benjamin.h.shelton@...el.com>
>
>Add a devlink health reporter for MDD events. The 'dump' handler will
>return the information captured in each call to ice_handle_mdd_event().
>A device reset (CORER/PFR) will put the reporter back in healthy state.
>
>Signed-off-by: Ben Shelton <benjamin.h.shelton@...el.com>
>Reviewed-by: Igor Bagnucki <igor.bagnucki@...el.com>
>Reviewed-by: Wojciech Drewek <wojciech.drewek@...el.com>
>Reviewed-by: Simon Horman <horms@...nel.org>
>Signed-off-by: Mateusz Polchlopek <mateusz.polchlopek@...el.com>
>Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@...el.com> (A Contingent worker at Intel)
>Co-developed-by: Przemek Kitszel <przemyslaw.kitszel@...el.com>
>Signed-off-by: Przemek Kitszel <przemyslaw.kitszel@...el.com>
>Signed-off-by: Tony Nguyen <anthony.l.nguyen@...el.com>
>---
> .../net/ethernet/intel/ice/devlink/health.c   | 77 +++++++++++++++++++
> .../net/ethernet/intel/ice/devlink/health.h   | 11 +++
> drivers/net/ethernet/intel/ice/ice_main.c     |  6 ++
> 3 files changed, 94 insertions(+)
>
>diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c
>index 984d910fc41d..d23ae3aafaa7 100644
>--- a/drivers/net/ethernet/intel/ice/devlink/health.c
>+++ b/drivers/net/ethernet/intel/ice/devlink/health.c
>@@ -26,6 +26,79 @@ static void ice_devlink_health_report(struct devlink_health_reporter *reporter,
> 	devlink_health_report(reporter, msg, priv_ctx);
> }
> 
>+struct ice_mdd_event {
>+	enum ice_mdd_src src;
>+	u16 vf_num;
>+	u16 queue;
>+	u8 pf_num;
>+	u8 event;
>+};
>+
>+static const char *ice_mdd_src_to_str(enum ice_mdd_src src)
>+{
>+	switch (src) {
>+	case ICE_MDD_SRC_TX_PQM:
>+		return "tx_pqm";
>+	case ICE_MDD_SRC_TX_TCLAN:
>+		return "tx_tclan";
>+	case ICE_MDD_SRC_TX_TDPU:
>+		return "tx_tdpu";
>+	case ICE_MDD_SRC_RX:
>+		return "rx";
>+	default:
>+		return "invalid";
>+	}
>+}
>+
>+static int
>+ice_mdd_reporter_dump(struct devlink_health_reporter *reporter,
>+		      struct devlink_fmsg *fmsg, void *priv_ctx,
>+		      struct netlink_ext_ack *extack)
>+{
>+	struct ice_mdd_event *mdd_event = priv_ctx;
>+	const char *src;
>+
>+	if (!mdd_event)
>+		return 0;
>+
>+	src = ice_mdd_src_to_str(mdd_event->src);
>+
>+	devlink_fmsg_obj_nest_start(fmsg);
>+	devlink_fmsg_put(fmsg, "src", src);
>+	ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, pf_num);
>+	ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, vf_num);

Why don't you attach this reporter to the representor devlink port? I mean,
exposing the pf/vf num just because the reporter is not attached to the proper
object looks wrong to me. We have an object hierarchy in devlink; benefit
from it.


>+	ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, event);
>+	ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, mdd_event, queue);
>+	devlink_fmsg_obj_nest_end(fmsg);
>+
>+	return 0;
>+}
>+
>+/**
>+ * ice_report_mdd_event - Report an MDD event through devlink health
>+ * @pf: the PF device structure
>+ * @src: the HW block that was the source of this MDD event
>+ * @pf_num: the pf_num on which the MDD event occurred
>+ * @vf_num: the vf_num on which the MDD event occurred
>+ * @event: the event type of the MDD event
>+ * @queue: the queue on which the MDD event occurred
>+ *
>+ * Report an MDD event that has occurred on this PF.
>+ */
>+void ice_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src, u8 pf_num,
>+			  u16 vf_num, u8 event, u16 queue)
>+{
>+	struct ice_mdd_event ev = {
>+		.src = src,
>+		.pf_num = pf_num,
>+		.vf_num = vf_num,
>+		.event = event,
>+		.queue = queue,
>+	};
>+
>+	ice_devlink_health_report(pf->health_reporters.mdd, "MDD event", &ev);
>+}
>+
> /**
>  * ice_fmsg_put_ptr - put hex value of pointer into fmsg
>  *
>@@ -136,6 +209,7 @@ ice_init_devlink_rep(struct ice_pf *pf,
> 	.dump = ice_ ## _name ## _reporter_dump, \
> }
> 
>+ICE_DEFINE_HEALTH_REPORTER_OPS(mdd);
> ICE_DEFINE_HEALTH_REPORTER_OPS(tx_hang);
> 
> /**
>@@ -148,6 +222,7 @@ void ice_health_init(struct ice_pf *pf)
> {
> 	struct ice_health *reps = &pf->health_reporters;
> 
>+	reps->mdd = ice_init_devlink_rep(pf, &ice_mdd_reporter_ops);
> 	reps->tx_hang = ice_init_devlink_rep(pf, &ice_tx_hang_reporter_ops);
> }
> 
>@@ -169,6 +244,7 @@ static void ice_deinit_devl_reporter(struct devlink_health_reporter *reporter)
>  */
> void ice_health_deinit(struct ice_pf *pf)
> {
>+	ice_deinit_devl_reporter(pf->health_reporters.mdd);
> 	ice_deinit_devl_reporter(pf->health_reporters.tx_hang);
> }
> 
>@@ -188,5 +264,6 @@ void ice_health_assign_healthy_state(struct devlink_health_reporter *reporter)
>  */
> void ice_health_clear(struct ice_pf *pf)
> {
>+	ice_health_assign_healthy_state(pf->health_reporters.mdd);
> 	ice_health_assign_healthy_state(pf->health_reporters.tx_hang);
> }
>diff --git a/drivers/net/ethernet/intel/ice/devlink/health.h b/drivers/net/ethernet/intel/ice/devlink/health.h
>index 5ce601227acb..532277fc57d7 100644
>--- a/drivers/net/ethernet/intel/ice/devlink/health.h
>+++ b/drivers/net/ethernet/intel/ice/devlink/health.h
>@@ -16,9 +16,17 @@
> struct ice_pf;
> struct ice_tx_ring;
> 
>+enum ice_mdd_src {
>+	ICE_MDD_SRC_TX_PQM,
>+	ICE_MDD_SRC_TX_TCLAN,
>+	ICE_MDD_SRC_TX_TDPU,
>+	ICE_MDD_SRC_RX,
>+};
>+
> /**
>  * struct ice_health - stores ice devlink health reporters and accompanied data
>  * @tx_hang: devlink health reporter for tx_hang event
>+ * @mdd: devlink health reporter for MDD detection event
>  * @tx_hang_buf: pre-allocated place to put info for Tx hang reporter from
>  *               non-sleeping context
>  * @tx_ring: ring that the hang occurred on
>@@ -27,6 +35,7 @@ struct ice_tx_ring;
>  * @vsi_num: VSI owning the queue that the hang occurred on
>  */
> struct ice_health {
>+	struct devlink_health_reporter *mdd;
> 	struct devlink_health_reporter *tx_hang;
> 	struct_group_tagged(ice_health_tx_hang_buf, tx_hang_buf,
> 		struct ice_tx_ring *tx_ring;
>@@ -42,6 +51,8 @@ void ice_health_clear(struct ice_pf *pf);
> 
> void ice_prep_tx_hang_report(struct ice_pf *pf, struct ice_tx_ring *tx_ring,
> 			     u16 vsi_num, u32 head, u32 intr);
>+void ice_report_mdd_event(struct ice_pf *pf, enum ice_mdd_src src, u8 pf_num,
>+			  u16 vf_num, u8 event, u16 queue);
> void ice_report_tx_hang(struct ice_pf *pf);
> 
> #endif /* _HEALTH_H_ */
>diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
>index 316f5109bd3f..1701f7143f24 100644
>--- a/drivers/net/ethernet/intel/ice/ice_main.c
>+++ b/drivers/net/ethernet/intel/ice/ice_main.c
>@@ -1816,6 +1816,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
> 		if (netif_msg_tx_err(pf))
> 			dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
> 				 event, queue, pf_num, vf_num);
>+		ice_report_mdd_event(pf, ICE_MDD_SRC_TX_PQM, pf_num, vf_num,
>+				     event, queue);
> 		wr32(hw, GL_MDET_TX_PQM, 0xffffffff);
> 	}
> 
>@@ -1829,6 +1831,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
> 		if (netif_msg_tx_err(pf))
> 			dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n",
> 				 event, queue, pf_num, vf_num);
>+		ice_report_mdd_event(pf, ICE_MDD_SRC_TX_TCLAN, pf_num, vf_num,
>+				     event, queue);
> 		wr32(hw, GL_MDET_TX_TCLAN_BY_MAC(hw), U32_MAX);
> 	}
> 
>@@ -1842,6 +1846,8 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
> 		if (netif_msg_rx_err(pf))
> 			dev_info(dev, "Malicious Driver Detection event %d on RX queue %d PF# %d VF# %d\n",
> 				 event, queue, pf_num, vf_num);
>+		ice_report_mdd_event(pf, ICE_MDD_SRC_RX, pf_num, vf_num, event,
>+				     queue);
> 		wr32(hw, GL_MDET_RX, 0xffffffff);
> 	}
> 
>-- 
>2.47.1
>