lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250720000146.1405060-2-olvaffe@gmail.com>
Date: Sat, 19 Jul 2025 17:01:38 -0700
From: Chia-I Wu <olvaffe@...il.com>
To: Boris Brezillon <boris.brezillon@...labora.com>,
	Steven Price <steven.price@....com>,
	Liviu Dudau <liviu.dudau@....com>,
	Maarten Lankhorst <maarten.lankhorst@...ux.intel.com>,
	Maxime Ripard <mripard@...nel.org>,
	Thomas Zimmermann <tzimmermann@...e.de>,
	David Airlie <airlied@...il.com>,
	Simona Vetter <simona@...ll.ch>,
	linux-kernel@...r.kernel.org,
	dri-devel@...ts.freedesktop.org
Subject: [PATCH 1/9] drm/panthor: add devcoredump support

Create a devcoredump on any fault or fatal event. The coredump data is
in YAML format for readability and flexibility.

Only panthor_group state is captured for now.

Signed-off-by: Chia-I Wu <olvaffe@...il.com>
---
 drivers/gpu/drm/panthor/Makefile           |   2 +
 drivers/gpu/drm/panthor/panthor_coredump.c | 225 +++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_coredump.h |  68 +++++++
 drivers/gpu/drm/panthor/panthor_device.h   |   6 +
 drivers/gpu/drm/panthor/panthor_sched.c    |  69 +++++++
 drivers/gpu/drm/panthor/panthor_sched.h    |   5 +
 6 files changed, 375 insertions(+)
 create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.c
 create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.h

diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
index 15294719b09c..9fd1e74af1df 100644
--- a/drivers/gpu/drm/panthor/Makefile
+++ b/drivers/gpu/drm/panthor/Makefile
@@ -11,4 +11,6 @@ panthor-y := \
 	panthor_mmu.o \
 	panthor_sched.o
 
+panthor-$(CONFIG_DEV_COREDUMP) += panthor_coredump.o
+
 obj-$(CONFIG_DRM_PANTHOR) += panthor.o
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
new file mode 100644
index 000000000000..767f3327e3e8
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2025 Google LLC */
+
+#include <drm/drm_drv.h>
+#include <drm/drm_print.h>
+#include <drm/drm_managed.h>
+#include <generated/utsrelease.h>
+#include <linux/devcoredump.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+
+#include "panthor_coredump.h"
+#include "panthor_device.h"
+#include "panthor_sched.h"
+
+/**
+ * enum panthor_coredump_mask - Bits of panthor_coredump.mask, one per section
+ */
+enum panthor_coredump_mask {
+	PANTHOR_COREDUMP_GROUP = BIT(0), /* group state was captured */
+};
+
+/**
+ * struct panthor_coredump_header - Trigger reason and real-time clock stamp
+ */
+struct panthor_coredump_header {
+	enum panthor_coredump_reason reason;
+	ktime_t timestamp; /* ktime_get_real() taken when the dump is allocated */
+};
+
+/**
+ * struct panthor_coredump - State of one coredump, from capture to release
+ */
+struct panthor_coredump {
+	/** @ptdev: Device. */
+	struct panthor_device *ptdev;
+
+	/** @work: Bottom half of panthor_coredump_capture. */
+	struct work_struct work;
+
+	/** @header: Header. */
+	struct panthor_coredump_header header;
+
+	/** @mask: Bitmask of captured states (enum panthor_coredump_mask). */
+	u32 mask;
+	/** @group: Group state, printed only if PANTHOR_COREDUMP_GROUP is set. */
+	struct panthor_coredump_group_state group;
+
+	/** @data: Serialized coredump data; stays NULL if allocation fails. */
+	void *data;
+
+	/** @size: Serialized coredump size in bytes. */
+	size_t size;
+};
+
+static const char *reason_str(enum panthor_coredump_reason reason)
+{
+	/* Indexed by enum panthor_coredump_reason; any gap would stay NULL. */
+	static const char *const names[] = {
+		[PANTHOR_COREDUMP_REASON_MMU_FAULT] = "MMU_FAULT",
+		[PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT] = "CSG_REQ_TIMEOUT",
+		[PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE] =
+			"CSG_UNKNOWN_STATE",
+		[PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT] =
+			"CSG_PROGRESS_TIMEOUT",
+		[PANTHOR_COREDUMP_REASON_CS_FATAL] = "CS_FATAL",
+		[PANTHOR_COREDUMP_REASON_CS_FAULT] = "CS_FAULT",
+		[PANTHOR_COREDUMP_REASON_CS_TILER_OOM] = "CS_TILER_OOM",
+		[PANTHOR_COREDUMP_REASON_JOB_TIMEOUT] = "JOB_TIMEOUT",
+	};
+	const char *name = NULL;
+
+	/* Anything outside the table is reported as "UNKNOWN". */
+	if ((unsigned int)reason < ARRAY_SIZE(names))
+		name = names[reason];
+
+	return name ? name : "UNKNOWN";
+}
+
+static void print_group(struct drm_printer *p,
+			const struct panthor_coredump_group_state *group)
+{
+	drm_puts(p, "group:\n"); /* section key; fields nest two spaces deep */
+	drm_printf(p, "  priority: %d\n", group->priority);
+	drm_printf(p, "  queue_count: %u\n", group->queue_count);
+	drm_printf(p, "  pid: %d\n", group->pid); /* not captured yet */
+	drm_printf(p, "  comm: %s\n", group->comm); /* not captured yet */
+	drm_printf(p, "  destroyed: %d\n", group->destroyed); /* bool as 0/1 */
+	drm_printf(p, "  csg_id: %d\n", group->csg_id);
+}
+
+static void print_header(struct drm_printer *p,
+			 const struct panthor_coredump_header *header,
+			 const struct drm_driver *drv)
+{
+	drm_puts(p, "header:\n");
+	drm_puts(p, "  kernel: " UTS_RELEASE "\n");
+	drm_puts(p, "  module: " KBUILD_MODNAME "\n");
+	drm_printf(p, "  driver_version: %d.%d\n", drv->major, drv->minor);
+	/* per-dump fields; the timestamp is printed in nanoseconds */
+	drm_printf(p, "  reason: %s\n", reason_str(header->reason));
+	drm_printf(p, "  timestamp: %lld\n", ktime_to_ns(header->timestamp));
+}
+
+static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
+{
+	const struct drm_driver *drv = cd->ptdev->base.driver;
+
+	drm_puts(p, "---\n"); /* YAML document start marker */
+	print_header(p, &cd->header, drv);
+	if (cd->mask & PANTHOR_COREDUMP_GROUP)
+		print_group(p, &cd->group);
+}
+
+static void process_cd(struct panthor_device *ptdev,
+		       struct panthor_coredump *cd)
+{
+	struct drm_print_iterator iter = {
+		.remain = SSIZE_MAX,
+	};
+	struct drm_printer p = drm_coredump_printer(&iter);
+
+	print_cd(&p, cd);
+	/* sizing pass (iter.data is NULL): size = budget used out of SSIZE_MAX */
+	iter.remain = SSIZE_MAX - iter.remain;
+	iter.data = kvmalloc(iter.remain, GFP_USER);
+	if (!iter.data)
+		return;
+	/* on allocation failure above, cd->data and cd->size are left untouched */
+	cd->data = iter.data;
+	cd->size = iter.remain;
+
+	drm_info(&ptdev->base, "generating coredump of size %zu\n", cd->size);
+	/* second pass: print the same content again, now into the buffer */
+	p = drm_coredump_printer(&iter);
+	print_cd(&p, cd);
+}
+
+static void capture_cd(struct panthor_device *ptdev,
+		       struct panthor_coredump *cd, struct panthor_group *group)
+{
+	drm_info(&ptdev->base, "capturing coredump states\n");
+	if (!group)
+		return; /* no group given: the GROUP bit stays clear */
+	/* snapshot first, then flag the section as present */
+	panthor_group_capture_coredump(group, &cd->group);
+	cd->mask |= PANTHOR_COREDUMP_GROUP;
+}
+
+static void panthor_coredump_free(void *data)
+{
+	struct panthor_coredump *dump = data;
+	atomic_t *pending = &dump->ptdev->coredump.pending;
+
+	kvfree(dump->data);
+	kfree(dump);
+	/* only now may panthor_coredump_alloc() accept another coredump */
+	atomic_set(pending, 0);
+}
+
+static ssize_t panthor_coredump_read(char *buffer, loff_t offset, size_t count,
+				     void *data, size_t datalen)
+{
+	const struct panthor_coredump *cd = data;
+	size_t avail;
+
+	/* nothing to return at or past the end; @datalen is not consulted */
+	if (offset >= cd->size)
+		return 0;
+	/* clamp the request to the bytes left after @offset */
+	avail = cd->size - offset;
+	count = count > avail ? avail : count;
+	memcpy(buffer, cd->data + offset, count);
+	return count;
+}
+
+static void panthor_coredump_process_work(struct work_struct *work)
+{
+	struct panthor_coredump *cd;
+
+	cd = container_of(work, struct panthor_coredump, work);
+	process_cd(cd->ptdev, cd);
+
+	/* cd is handed to devcoredump together with its read/free callbacks */
+	dev_coredumpm(cd->ptdev->base.dev, THIS_MODULE, cd, 0, GFP_KERNEL,
+		      panthor_coredump_read, panthor_coredump_free);
+}
+
+void panthor_coredump_capture(struct panthor_coredump *cd,
+			      struct panthor_group *group)
+{
+	/* top half: snapshot the state right away, in the caller's context */
+	capture_cd(cd->ptdev, cd, group);
+
+	/* bottom half: serialization and devcoredump registration */
+	queue_work(system_unbound_wq, &cd->work);
+}
+
+struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+		       enum panthor_coredump_reason reason, gfp_t gfp)
+{
+	atomic_t *pending = &ptdev->coredump.pending;
+	struct panthor_coredump *cd;
+
+	/* claim the single coredump slot; panthor_coredump_free() releases it */
+	if (atomic_cmpxchg(pending, 0, 1) != 0) {
+		drm_dbg(&ptdev->base, "skip subsequent coredump\n");
+		return NULL;
+	}
+
+	cd = kzalloc(sizeof(*cd), gfp);
+	if (!cd)
+		goto err_release;
+	cd->ptdev = ptdev;
+	cd->header.reason = reason;
+	cd->header.timestamp = ktime_get_real();
+	INIT_WORK(&cd->work, panthor_coredump_process_work);
+	return cd;
+
+err_release:
+	atomic_set(pending, 0);
+	return NULL;
+}
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
new file mode 100644
index 000000000000..dd1fe1c2e175
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2019 Collabora ltd. */
+
+#ifndef __PANTHOR_COREDUMP_H__
+#define __PANTHOR_COREDUMP_H__
+
+#include <drm/panthor_drm.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+struct panthor_coredump;
+struct panthor_device;
+struct panthor_group;
+
+/**
+ * enum panthor_coredump_reason - Event that triggered the coredump
+ */
+enum panthor_coredump_reason {
+	PANTHOR_COREDUMP_REASON_MMU_FAULT,
+	PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
+	PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
+	PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT,
+	PANTHOR_COREDUMP_REASON_CS_FATAL,
+	PANTHOR_COREDUMP_REASON_CS_FAULT,
+	PANTHOR_COREDUMP_REASON_CS_TILER_OOM,
+	PANTHOR_COREDUMP_REASON_JOB_TIMEOUT,
+};
+
+/**
+ * struct panthor_coredump_group_state - Coredump group state
+ *
+ * Copy of selected panthor_group fields; pid and comm are not captured yet.
+ */
+struct panthor_coredump_group_state {
+	enum drm_panthor_group_priority priority;
+	u32 queue_count;
+	pid_t pid;
+	char comm[TASK_COMM_LEN];
+	bool destroyed;
+	int csg_id;
+};
+
+#ifdef CONFIG_DEV_COREDUMP
+/* Implemented in panthor_coredump.c, built only with CONFIG_DEV_COREDUMP. */
+struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+		       enum panthor_coredump_reason reason, gfp_t gfp);
+
+void panthor_coredump_capture(struct panthor_coredump *cd,
+			      struct panthor_group *group);
+
+#else /* CONFIG_DEV_COREDUMP */
+/* Stubs: alloc always returns NULL and capture does nothing. */
+static inline struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+		       enum panthor_coredump_reason reason, gfp_t gfp)
+{
+	return NULL;
+}
+
+static inline void panthor_coredump_capture(struct panthor_coredump *cd,
+					    struct panthor_group *group)
+{
+}
+
+#endif /* CONFIG_DEV_COREDUMP */
+
+#endif /* __PANTHOR_COREDUMP_H__ */
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 4fc7cf2aeed5..766e53c25cfa 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -197,6 +197,12 @@ struct panthor_device {
 		atomic_t recovery_needed;
 	} pm;
 
+	/** @coredump: Coredump-related data. */
+	struct {
+		/** @pending: True if there is a pending coredump. */
+		atomic_t pending;
+	} coredump;
+
 	/** @profile_mask: User-set profiling flags for job accounting. */
 	u32 profile_mask;
 
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index a2248f692a03..eb45b5ad9774 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -23,6 +23,7 @@
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 
+#include "panthor_coredump.h"
 #include "panthor_devfreq.h"
 #include "panthor_device.h"
 #include "panthor_fw.h"
@@ -1031,6 +1032,10 @@ group_unbind_locked(struct panthor_group *group)
 	return 0;
 }
 
+static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
+					  enum panthor_coredump_reason reason,
+					  struct panthor_group *group);
+
 /**
  * cs_slot_prog_locked() - Program a queue slot
  * @ptdev: Device.
@@ -1249,6 +1254,10 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
 		drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
 			csg_id, csg_state);
 		new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
+
+		panthor_sched_coredump_locked(
+			ptdev, PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
+			group);
 		break;
 	}
 
@@ -1378,6 +1387,9 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
 		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
 		 (unsigned int)CS_EXCEPTION_DATA(fatal),
 		 info);
+
+	panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FATAL,
+				      group);
 }
 
 static void
@@ -1426,6 +1438,9 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
 		 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
 		 (unsigned int)CS_EXCEPTION_DATA(fault),
 		 info);
+
+	panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FAULT,
+				      group);
 }
 
 static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
@@ -1480,6 +1495,10 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
 		drm_warn(&ptdev->base, "Failed to extend the tiler heap\n");
 		group->fatal_queues |= BIT(cs_id);
 		sched_queue_delayed_work(sched, tick, 0);
+
+		panthor_sched_coredump_locked(
+			ptdev, PANTHOR_COREDUMP_REASON_CS_TILER_OOM, group);
+
 		goto out_put_heap_pool;
 	}
 
@@ -1639,6 +1658,9 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
 		group->timedout = true;
 
 	sched_queue_delayed_work(sched, tick, 0);
+
+	panthor_sched_coredump_locked(
+		ptdev, PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT, group);
 }
 
 static void sched_process_csg_irq_locked(struct panthor_device *ptdev, u32 csg_id)
@@ -1858,8 +1880,16 @@ static int csgs_upd_ctx_apply_locked(struct panthor_device *ptdev,
 
 		if (ret && acked != req_mask &&
 		    ((csg_iface->input->req ^ csg_iface->output->ack) & req_mask) != 0) {
+			struct panthor_csg_slot *csg_slot =
+				&sched->csg_slots[csg_id];
+			struct panthor_group *group = csg_slot->group;
+
 			drm_err(&ptdev->base, "CSG %d update request timedout", csg_id);
 			ctx->timedout_mask |= BIT(csg_id);
+
+			panthor_sched_coredump_locked(
+				ptdev, PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
+				group);
 		}
 	}
 
@@ -2027,6 +2057,10 @@ tick_ctx_init(struct panthor_scheduler *sched,
 		 * CSG IRQs, so we can flag the faulty queue.
 		 */
 		if (panthor_vm_has_unhandled_faults(group->vm)) {
+			panthor_sched_coredump_locked(
+				ptdev, PANTHOR_COREDUMP_REASON_MMU_FAULT,
+				group);
+
 			sched_process_csg_irq_locked(ptdev, i);
 
 			/* No fatal fault reported, flag all queues as faulty. */
@@ -3237,6 +3271,10 @@ queue_timedout_job(struct drm_sched_job *sched_job)
 
 		group_queue_work(group, term);
 	}
+
+	panthor_sched_coredump_locked(
+		ptdev, PANTHOR_COREDUMP_REASON_JOB_TIMEOUT, group);
+
 	mutex_unlock(&sched->lock);
 
 	queue_start(queue);
@@ -3627,6 +3665,37 @@ int panthor_group_get_state(struct panthor_file *pfile,
 	return 0;
 }
 
+static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
+					  enum panthor_coredump_reason reason,
+					  struct panthor_group *group)
+{
+	struct panthor_coredump *cd;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+	/*
+	 * GFP_NOWAIT because this may be called from fence signaling path.
+	 * A NULL result means no coredump can be taken now; just carry on.
+	 */
+	cd = panthor_coredump_alloc(ptdev, reason, GFP_NOWAIT);
+	if (cd)
+		panthor_coredump_capture(cd, group);
+}
+
+void panthor_group_capture_coredump(const struct panthor_group *group,
+				    struct panthor_coredump_group_state *state)
+{
+	/* this is called from panthor_coredump_capture */
+	lockdep_assert_held(&group->ptdev->scheduler->lock);
+
+	/* copy the scalar fields the dump reports */
+	state->csg_id = group->csg_id;
+	state->destroyed = group->destroyed;
+	state->queue_count = group->queue_count;
+	state->priority = group->priority;
+
+	/* TODO state->pid and state->comm */
+}
+
 int panthor_group_pool_create(struct panthor_file *pfile)
 {
 	struct panthor_group_pool *gpool;
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
index 742b0b4ff3a3..6c564153133e 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.h
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -14,8 +14,10 @@ struct drm_panthor_group_create;
 struct drm_panthor_queue_create;
 struct drm_panthor_group_get_state;
 struct drm_panthor_queue_submit;
+struct panthor_coredump_group_state;
 struct panthor_device;
 struct panthor_file;
+struct panthor_group;
 struct panthor_group_pool;
 struct panthor_job;
 
@@ -26,6 +28,9 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle);
 int panthor_group_get_state(struct panthor_file *pfile,
 			    struct drm_panthor_group_get_state *get_state);
 
+void panthor_group_capture_coredump(const struct panthor_group *group,
+				    struct panthor_coredump_group_state *state);
+
 struct drm_sched_job *
 panthor_job_create(struct panthor_file *pfile,
 		   u16 group_handle,
-- 
2.50.0.727.gbf7dc18ff4-goog


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ