lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <20250108023400.35081-1-tianruidong@linux.alibaba.com>
Date: Wed,  8 Jan 2025 10:34:00 +0800
From: Ruidong Tian <tianruidong@...ux.alibaba.com>
To: alexander.deucher@....com,
	christian.koenig@....com,
	Xinhui.Pan@....com,
	airlied@...il.com,
	simona@...ll.ch,
	xueshuai@...ux.alibaba.com
Cc: amd-gfx@...ts.freedesktop.org,
	dri-devel@...ts.freedesktop.org,
	linux-kernel@...r.kernel.org,
	tianruidong@...ux.alibaba.com
Subject: [RESEND PATCH] drm/amdgpu: add tracepoint when dumping MCA bank

RAS errors are typically exposed to user-space programs through tracepoints,
allowing tools like rasdaemon to decode and post-process them.
AMDGPU can follow the same approach, which offers the following benefits:
1. It proactively notifies users of RAS events, eliminating the need
   to monitor /dev/kmsg.
2. It allows further post-processing, similar to AMD SMCA[1].

[1]: https://github.com/mchehab/rasdaemon/commit/932118

Signed-off-by: Ruidong Tian <tianruidong@...ux.alibaba.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c   |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 31 +++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 3ca03b5e0f91..9daa95365457 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -23,6 +23,7 @@
 #include "amdgpu_ras.h"
 #include "amdgpu.h"
 #include "amdgpu_mca.h"
+#include "amdgpu_trace.h"
 
 #include "umc/umc_6_7_0_offset.h"
 #include "umc/umc_6_7_0_sh_mask.h"
@@ -287,6 +288,8 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st
 		      idx, entry->regs[MCA_REG_IDX_IPID]);
 	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
 		      idx, entry->regs[MCA_REG_IDX_SYND]);
+
+	trace_amdgpu_mca_bank_dumps(event_id, idx, entry);
 }
 
 static int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index 383fce40d4dd..a0ba79394099 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -554,6 +554,37 @@ TRACE_EVENT(amdgpu_reset_reg_dumps,
 		      __entry->value)
 );
 
+TRACE_EVENT(amdgpu_mca_bank_dumps, /* records the registers of one dumped MCA bank entry */
+	   TP_PROTO(uint64_t event_id, int idx, struct mca_bank_entry *e),
+	   TP_ARGS(event_id, idx, e),
+	   TP_STRUCT__entry(
+			    __field(uint64_t, event_id) /* unsigned, so printed with %llu below */
+			    __field(int, idx)
+			    __field(uint64_t, status)
+			    __field(uint64_t, addr)
+			    __field(uint64_t, misc0)
+			    __field(uint64_t, ipid)
+			    __field(uint64_t, synd)
+			    ),
+	   TP_fast_assign(
+			  __entry->event_id = event_id;
+			  __entry->idx = idx;
+			  __entry->status = e->regs[MCA_REG_IDX_STATUS]; /* registers are copied by value */
+			  __entry->addr = e->regs[MCA_REG_IDX_ADDR];
+			  __entry->misc0 = e->regs[MCA_REG_IDX_MISC0];
+			  __entry->ipid = e->regs[MCA_REG_IDX_IPID];
+			  __entry->synd = e->regs[MCA_REG_IDX_SYND];
+			  ),
+	   TP_printk("amdgpu mca bank dump: event_id: %llu, idx: %d, STATUS: %016llx, ADDR: %016llx, MISC0: %016llx, IPID: %016llx, SYND: %016llx",
+		     __entry->event_id,
+		     __entry->idx,
+		     __entry->status,
+		     __entry->addr,
+		     __entry->misc0,
+		     __entry->ipid,
+		     __entry->synd)
+);
+
 #undef AMDGPU_JOB_GET_TIMELINE_NAME
 #endif
 
-- 
2.33.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ