lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Thu, 13 Jul 2023 18:32:41 -0300
From:   André Almeida <andrealmeid@...lia.com>
To:     dri-devel@...ts.freedesktop.org, amd-gfx@...ts.freedesktop.org,
        linux-kernel@...r.kernel.org
Cc:     kernel-dev@...lia.com, alexander.deucher@....com,
        christian.koenig@....com, pierre-eric.pelloux-prayer@....com,
        'Marek Olšák' <maraeo@...il.com>,
        Samuel Pitoiset <samuel.pitoiset@...il.com>,
        Bas Nieuwenhuizen <bas@...nieuwenhuizen.nl>,
        Timur Kristóf <timur.kristof@...il.com>,
        michel.daenzer@...lbox.org,
        André Almeida <andrealmeid@...lia.com>
Subject: [PATCH v2 5/6] drm/amdgpu: Log IBs and ring name at coredump

Log the IB addresses used by the hung job along with the stuck ring
name. Note that due to nested IBs, the one that caused the reset itself
may be in not listed address.

Signed-off-by: André Almeida <andrealmeid@...lia.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 +++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e1cc83a89d46..cfeaf93934fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1086,6 +1086,9 @@ struct amdgpu_coredump_info {
 	struct amdgpu_task_info         reset_task_info;
 	struct timespec64               reset_time;
 	bool                            reset_vram_lost;
+	u64				*ibs;
+	u32				num_ibs;
+	char				ring_name[16];
 };
 #endif
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 07546781b8b8..431ccc3d7857 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5008,12 +5008,24 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
 				   coredump->adev->reset_dump_reg_value[i]);
 	}
 
+	if (coredump->num_ibs) {
+		drm_printf(&p, "IBs:\n");
+		for (i = 0; i < coredump->num_ibs; i++)
+			drm_printf(&p, "\t[%d] 0x%llx\n", i, coredump->ibs[i]);
+	}
+
+	if (coredump->ring_name[0] != '\0')
+		drm_printf(&p, "ring name: %s\n", coredump->ring_name);
+
 	return count - iter.remain;
 }
 
 static void amdgpu_devcoredump_free(void *data)
 {
-	kfree(data);
+	struct amdgpu_coredump_info *coredump = data;
+
+	kfree(coredump->ibs);
+	kfree(coredump);
 }
 
 static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
@@ -5021,6 +5033,8 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
 {
 	struct amdgpu_coredump_info *coredump;
 	struct drm_device *dev = adev_to_drm(adev);
+	struct amdgpu_job *job = reset_context->job;
+	int i;
 
 	coredump = kmalloc(sizeof(*coredump), GFP_NOWAIT);
 
@@ -5038,6 +5052,21 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
 
 	coredump->adev = adev;
 
+	if (job && job->num_ibs) {
+		struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
+		u32 num_ibs = job->num_ibs;
+
+		coredump->ibs = kmalloc_array(num_ibs, sizeof(coredump->ibs), GFP_NOWAIT);
+		if (coredump->ibs)
+			coredump->num_ibs = num_ibs;
+
+		for (i = 0; i < coredump->num_ibs; i++)
+			coredump->ibs[i] = job->ibs[i].gpu_addr;
+
+		if (ring)
+			strncpy(coredump->ring_name, ring->name, 16);
+	}
+
 	ktime_get_ts64(&coredump->reset_time);
 
 	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
-- 
2.41.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ