[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250720000146.1405060-2-olvaffe@gmail.com>
Date: Sat, 19 Jul 2025 17:01:38 -0700
From: Chia-I Wu <olvaffe@...il.com>
To: Boris Brezillon <boris.brezillon@...labora.com>,
Steven Price <steven.price@....com>,
Liviu Dudau <liviu.dudau@....com>,
Maarten Lankhorst <maarten.lankhorst@...ux.intel.com>,
Maxime Ripard <mripard@...nel.org>,
Thomas Zimmermann <tzimmermann@...e.de>,
David Airlie <airlied@...il.com>,
Simona Vetter <simona@...ll.ch>,
linux-kernel@...r.kernel.org,
dri-devel@...ts.freedesktop.org
Subject: [PATCH 1/9] drm/panthor: add devcoredump support
Create a devcoredump on any fault or fatal event. The coredump data is
in YAML format for readability and flexibility.
Only panthor_group state is captured for now.
Signed-off-by: Chia-I Wu <olvaffe@...il.com>
---
drivers/gpu/drm/panthor/Makefile | 2 +
drivers/gpu/drm/panthor/panthor_coredump.c | 225 +++++++++++++++++++++
drivers/gpu/drm/panthor/panthor_coredump.h | 68 +++++++
drivers/gpu/drm/panthor/panthor_device.h | 6 +
drivers/gpu/drm/panthor/panthor_sched.c | 69 +++++++
drivers/gpu/drm/panthor/panthor_sched.h | 5 +
6 files changed, 375 insertions(+)
create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.c
create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.h
diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
index 15294719b09c..9fd1e74af1df 100644
--- a/drivers/gpu/drm/panthor/Makefile
+++ b/drivers/gpu/drm/panthor/Makefile
@@ -11,4 +11,6 @@ panthor-y := \
panthor_mmu.o \
panthor_sched.o
+panthor-$(CONFIG_DEV_COREDUMP) += panthor_coredump.o
+
obj-$(CONFIG_DRM_PANTHOR) += panthor.o
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
new file mode 100644
index 000000000000..767f3327e3e8
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2025 Google LLC */
+
+#include <drm/drm_drv.h>
+#include <drm/drm_print.h>
+#include <drm/drm_managed.h>
+#include <generated/utsrelease.h>
+#include <linux/devcoredump.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+
+#include "panthor_coredump.h"
+#include "panthor_device.h"
+#include "panthor_sched.h"
+
+/**
+ * enum panthor_coredump_mask - Coredump state
+ *
+ * Bits stored in &panthor_coredump.mask to record which optional states
+ * were captured and must therefore be printed.
+ */
+enum panthor_coredump_mask {
+ /** @PANTHOR_COREDUMP_GROUP: &panthor_coredump.group has been captured. */
+ PANTHOR_COREDUMP_GROUP = BIT(0),
+};
+
+/**
+ * struct panthor_coredump_header - Coredump header
+ *
+ * Filled in by panthor_coredump_alloc() and emitted by print_header().
+ */
+struct panthor_coredump_header {
+ /** @reason: Event that triggered the coredump. */
+ enum panthor_coredump_reason reason;
+ /** @timestamp: ktime_get_real() value sampled when the coredump was allocated. */
+ ktime_t timestamp;
+};
+
+/**
+ * struct panthor_coredump - Coredump
+ *
+ * Allocated by panthor_coredump_alloc() and released by
+ * panthor_coredump_free().
+ */
+struct panthor_coredump {
+ /** @ptdev: Device. */
+ struct panthor_device *ptdev;
+
+ /** @work: Bottom half of panthor_coredump_capture. */
+ struct work_struct work;
+
+ /** @header: Header. */
+ struct panthor_coredump_header header;
+
+ /** @mask: Bitmask of captured states (see enum panthor_coredump_mask). */
+ u32 mask;
+
+ /** @group: Group state, valid only if PANTHOR_COREDUMP_GROUP is set in @mask. */
+ struct panthor_coredump_group_state group;
+
+ /** @data: Serialized coredump data, allocated with kvmalloc(). */
+ void *data;
+
+ /** @size: Serialized coredump size, in bytes. */
+ size_t size;
+};
+
+/**
+ * reason_str() - Name of a coredump reason
+ * @reason: Coredump reason.
+ *
+ * Return: the string printed in the "reason" field of the coredump header,
+ * or "UNKNOWN" when @reason is not a known enumerator.
+ */
+static const char *reason_str(enum panthor_coredump_reason reason)
+{
+	static const char * const names[] = {
+		[PANTHOR_COREDUMP_REASON_MMU_FAULT] = "MMU_FAULT",
+		[PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT] = "CSG_REQ_TIMEOUT",
+		[PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE] = "CSG_UNKNOWN_STATE",
+		[PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT] = "CSG_PROGRESS_TIMEOUT",
+		[PANTHOR_COREDUMP_REASON_CS_FATAL] = "CS_FATAL",
+		[PANTHOR_COREDUMP_REASON_CS_FAULT] = "CS_FAULT",
+		[PANTHOR_COREDUMP_REASON_CS_TILER_OOM] = "CS_TILER_OOM",
+		[PANTHOR_COREDUMP_REASON_JOB_TIMEOUT] = "JOB_TIMEOUT",
+	};
+	/* Unsigned index so that a negative value is rejected by the bound check. */
+	unsigned int idx = reason;
+
+	if (idx >= ARRAY_SIZE(names) || !names[idx])
+		return "UNKNOWN";
+
+	return names[idx];
+}
+
+/**
+ * print_group() - Emit the "group" section of the coredump
+ * @p: Printer to emit to.
+ * @group: Captured group state.
+ */
+static void print_group(struct drm_printer *p,
+			const struct panthor_coredump_group_state *group)
+{
+	/* Snapshot each field with the type its format specifier expects. */
+	const int priority = group->priority;
+	const unsigned int queue_count = group->queue_count;
+	const int pid = group->pid;
+	const char *comm = group->comm;
+	const int destroyed = group->destroyed;
+	const int csg_id = group->csg_id;
+
+	drm_puts(p, "group:\n");
+	drm_printf(p, " priority: %d\n", priority);
+	drm_printf(p, " queue_count: %u\n", queue_count);
+	drm_printf(p, " pid: %d\n", pid);
+	drm_printf(p, " comm: %s\n", comm);
+	drm_printf(p, " destroyed: %d\n", destroyed);
+	drm_printf(p, " csg_id: %d\n", csg_id);
+}
+
+/**
+ * print_header() - Emit the "header" section of the coredump
+ * @p: Printer to emit to.
+ * @header: Coredump header (reason and timestamp).
+ * @drv: DRM driver, used for the driver version.
+ */
+static void print_header(struct drm_printer *p,
+			 const struct panthor_coredump_header *header,
+			 const struct drm_driver *drv)
+{
+	const char *reason = reason_str(header->reason);
+	const long long timestamp_ns = ktime_to_ns(header->timestamp);
+
+	/* Build-time identification. */
+	drm_puts(p, "header:\n");
+	drm_puts(p, " kernel: " UTS_RELEASE "\n");
+	drm_puts(p, " module: " KBUILD_MODNAME "\n");
+	drm_printf(p, " driver_version: %d.%d\n", drv->major, drv->minor);
+
+	/* Per-coredump information. */
+	drm_printf(p, " reason: %s\n", reason);
+	drm_printf(p, " timestamp: %lld\n", timestamp_ns);
+}
+
+/**
+ * print_cd() - Emit a whole coredump as a YAML document
+ * @p: Printer to emit to.
+ * @cd: Coredump to print.
+ *
+ * The header is always printed; optional sections are printed only if the
+ * matching bit is set in the coredump mask.
+ */
+static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
+{
+	const struct drm_driver *drv = cd->ptdev->base.driver;
+	const bool has_group = cd->mask & PANTHOR_COREDUMP_GROUP;
+
+	/* YAML document start marker */
+	drm_puts(p, "---\n");
+	print_header(p, &cd->header, drv);
+
+	if (!has_group)
+		return;
+
+	print_group(p, &cd->group);
+}
+
+/**
+ * process_cd() - Serialize the captured states
+ * @ptdev: Device.
+ * @cd: Coredump to serialize.
+ *
+ * On success, the serialized data and its size are stored in @cd. If the
+ * buffer cannot be allocated, @cd is left without serialized data.
+ */
+static void process_cd(struct panthor_device *ptdev,
+		       struct panthor_coredump *cd)
+{
+	struct drm_print_iterator iter = {
+		.remain = SSIZE_MAX,
+	};
+	struct drm_printer p = drm_coredump_printer(&iter);
+	ssize_t size;
+	void *buf;
+
+	/*
+	 * Sizing pass: no buffer is attached to the iterator yet, and the
+	 * amount consumed from iter.remain is used as the buffer size.
+	 */
+	print_cd(&p, cd);
+	size = SSIZE_MAX - iter.remain;
+
+	buf = kvmalloc(size, GFP_USER);
+	if (!buf)
+		return;
+
+	iter.data = buf;
+	iter.remain = size;
+
+	cd->data = buf;
+	cd->size = size;
+
+	drm_info(&ptdev->base, "generating coredump of size %zu\n", cd->size);
+
+	/*
+	 * Writing pass, with a newly created printer.
+	 * NOTE(review): presumably drm_coredump_printer() rewinds the
+	 * iterator so the output starts at the beginning of the buffer -
+	 * confirm against drm_print.h.
+	 */
+	p = drm_coredump_printer(&iter);
+	print_cd(&p, cd);
+}
+
+/**
+ * capture_cd() - Capture the states to include in a coredump
+ * @ptdev: Device.
+ * @cd: Coredump to fill.
+ * @group: Group to capture, or NULL if no group is involved.
+ */
+static void capture_cd(struct panthor_device *ptdev,
+		       struct panthor_coredump *cd, struct panthor_group *group)
+{
+	drm_info(&ptdev->base, "capturing coredump states\n");
+
+	/* Nothing else to capture when there is no group. */
+	if (!group)
+		return;
+
+	panthor_group_capture_coredump(group, &cd->group);
+	cd->mask |= PANTHOR_COREDUMP_GROUP;
+}
+
+/**
+ * panthor_coredump_free() - Release a coredump
+ * @data: The &struct panthor_coredump to release.
+ *
+ * Passed to dev_coredumpm() as the free callback. Once the coredump is
+ * released, the device accepts a new coredump again.
+ */
+static void panthor_coredump_free(void *data)
+{
+	struct panthor_coredump *cd = data;
+	/* Taken before cd is freed, as it is reached through cd. */
+	atomic_t *pending = &cd->ptdev->coredump.pending;
+
+	kvfree(cd->data);
+	kfree(cd);
+
+	atomic_set(pending, 0);
+}
+
+/**
+ * panthor_coredump_read() - Copy a chunk of the serialized coredump
+ * @buffer: Destination buffer.
+ * @offset: Offset into the serialized data.
+ * @count: Maximum number of bytes to copy.
+ * @data: The &struct panthor_coredump being read.
+ * @datalen: Unused.
+ *
+ * Passed to dev_coredumpm() as the read callback.
+ *
+ * Return: the number of bytes copied, or 0 if @offset is at or past the end
+ * of the serialized data.
+ */
+static ssize_t panthor_coredump_read(char *buffer, loff_t offset, size_t count,
+				     void *data, size_t datalen)
+{
+	const struct panthor_coredump *cd = data;
+	size_t avail;
+
+	if (offset >= cd->size)
+		return 0;
+
+	/* Clamp the request to what is left after offset. */
+	avail = cd->size - offset;
+	if (count > avail)
+		count = avail;
+
+	memcpy(buffer, cd->data + offset, count);
+
+	return count;
+}
+
+/**
+ * panthor_coredump_process_work() - Bottom half of panthor_coredump_capture()
+ * @work: Work item embedded in the coredump.
+ *
+ * Serializes the captured states and registers the result with the
+ * devcoredump core, passing panthor_coredump_read() and
+ * panthor_coredump_free() as the read and free callbacks.
+ *
+ * If serialization fails, the coredump is released right away instead of
+ * being registered.
+ */
+static void panthor_coredump_process_work(struct work_struct *work)
+{
+	struct panthor_coredump *cd =
+		container_of(work, struct panthor_coredump, work);
+	struct panthor_device *ptdev = cd->ptdev;
+
+	process_cd(ptdev, cd);
+
+	/*
+	 * process_cd() leaves cd->data NULL when the buffer allocation
+	 * fails. Registering such a coredump would expose an empty dump and
+	 * keep ptdev->coredump.pending set until it is released, so drop it
+	 * now to let the next event be captured.
+	 */
+	if (!cd->data) {
+		drm_warn(&ptdev->base, "failed to serialize coredump\n");
+		panthor_coredump_free(cd);
+		return;
+	}
+
+	dev_coredumpm(ptdev->base.dev, THIS_MODULE, cd, 0, GFP_KERNEL,
+		      panthor_coredump_read, panthor_coredump_free);
+}
+
+/**
+ * panthor_coredump_capture() - Capture states and schedule the coredump
+ * @cd: Coredump returned by panthor_coredump_alloc().
+ * @group: Group to capture, or NULL if no group is involved.
+ *
+ * The states are captured synchronously; serialization and registration
+ * are deferred to a work item queued on system_unbound_wq.
+ */
+void panthor_coredump_capture(struct panthor_coredump *cd,
+			      struct panthor_group *group)
+{
+	capture_cd(cd->ptdev, cd, group);
+
+	queue_work(system_unbound_wq, &cd->work);
+}
+
+/**
+ * panthor_coredump_alloc() - Allocate a coredump
+ * @ptdev: Device.
+ * @reason: Event that triggers the coredump.
+ * @gfp: Allocation flags.
+ *
+ * Only one coredump can be pending per device at a time.
+ *
+ * Return: the new coredump, or NULL if a coredump is already pending or if
+ * the allocation failed.
+ */
+struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+		       enum panthor_coredump_reason reason, gfp_t gfp)
+{
+	struct panthor_coredump *cd;
+
+	/* claim the pending slot; only the first coredump wins until it is handled */
+	if (atomic_cmpxchg(&ptdev->coredump.pending, 0, 1) != 0) {
+		drm_dbg(&ptdev->base, "skip subsequent coredump\n");
+		return NULL;
+	}
+
+	cd = kzalloc(sizeof(*cd), gfp);
+	if (!cd)
+		goto err_clear_pending;
+
+	cd->header.reason = reason;
+	cd->header.timestamp = ktime_get_real();
+
+	cd->ptdev = ptdev;
+	INIT_WORK(&cd->work, panthor_coredump_process_work);
+
+	return cd;
+
+err_clear_pending:
+	/* give the slot back so that a later event can be captured */
+	atomic_set(&ptdev->coredump.pending, 0);
+	return NULL;
+}
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
new file mode 100644
index 000000000000..dd1fe1c2e175
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2019 Collabora ltd. */
+
+#ifndef __PANTHOR_COREDUMP_H__
+#define __PANTHOR_COREDUMP_H__
+
+#include <drm/panthor_drm.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+struct panthor_coredump;
+struct panthor_device;
+struct panthor_group;
+
+/**
+ * enum panthor_coredump_reason - Coredump reason
+ *
+ * Event that triggered a coredump. It is printed in the coredump header.
+ */
+enum panthor_coredump_reason {
+ /** @PANTHOR_COREDUMP_REASON_MMU_FAULT: The VM of a group has unhandled faults. */
+ PANTHOR_COREDUMP_REASON_MMU_FAULT,
+ /** @PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT: A CSG update request timed out. */
+ PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
+ /** @PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE: A CSG slot reported an invalid state. */
+ PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
+ /** @PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT: A CSG progress timer event was received. */
+ PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT,
+ /** @PANTHOR_COREDUMP_REASON_CS_FATAL: A CS fatal event was received. */
+ PANTHOR_COREDUMP_REASON_CS_FATAL,
+ /** @PANTHOR_COREDUMP_REASON_CS_FAULT: A CS fault event was received. */
+ PANTHOR_COREDUMP_REASON_CS_FAULT,
+ /** @PANTHOR_COREDUMP_REASON_CS_TILER_OOM: The tiler heap could not be extended. */
+ PANTHOR_COREDUMP_REASON_CS_TILER_OOM,
+ /** @PANTHOR_COREDUMP_REASON_JOB_TIMEOUT: A job timed out. */
+ PANTHOR_COREDUMP_REASON_JOB_TIMEOUT,
+};
+
+/**
+ * struct panthor_coredump_group_state - Coredump group state
+ *
+ * Interesting panthor_group fields.
+ *
+ * Filled by panthor_group_capture_coredump().
+ */
+struct panthor_coredump_group_state {
+ /** @priority: Copy of the group priority. */
+ enum drm_panthor_group_priority priority;
+ /** @queue_count: Copy of the group queue count. */
+ u32 queue_count;
+ /** @pid: Not filled by panthor_group_capture_coredump() yet (TODO there). */
+ pid_t pid;
+ /** @comm: Not filled by panthor_group_capture_coredump() yet (TODO there). */
+ char comm[TASK_COMM_LEN];
+ /** @destroyed: Copy of the group destroyed flag. */
+ bool destroyed;
+ /** @csg_id: Copy of the group csg_id. */
+ int csg_id;
+};
+
+#ifdef CONFIG_DEV_COREDUMP
+
+/*
+ * Allocate a coredump for the given reason. Returns NULL if a coredump is
+ * already pending on the device or if the allocation fails.
+ */
+struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+ enum panthor_coredump_reason reason, gfp_t gfp);
+
+/*
+ * Capture the states of @group (may be NULL) into @cd and queue the work
+ * that serializes and registers the coredump.
+ */
+void panthor_coredump_capture(struct panthor_coredump *cd,
+ struct panthor_group *group);
+
+#else /* CONFIG_DEV_COREDUMP */
+
+/*
+ * Stubs for builds without devcoredump support: no coredump is ever
+ * allocated, so callers that check for NULL skip the capture.
+ */
+
+static inline struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+ enum panthor_coredump_reason reason, gfp_t gfp)
+{
+ return NULL;
+}
+
+static inline void panthor_coredump_capture(struct panthor_coredump *cd,
+ struct panthor_group *group)
+{
+}
+
+#endif /* CONFIG_DEV_COREDUMP */
+
+#endif /* __PANTHOR_COREDUMP_H__ */
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 4fc7cf2aeed5..766e53c25cfa 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -197,6 +197,12 @@ struct panthor_device {
atomic_t recovery_needed;
} pm;
+ /** @coredump: Coredump-related data. */
+ struct {
+ /** @pending: True if there is a pending coredump. */
+ atomic_t pending;
+ } coredump;
+
/** @profile_mask: User-set profiling flags for job accounting. */
u32 profile_mask;
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index a2248f692a03..eb45b5ad9774 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -23,6 +23,7 @@
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
+#include "panthor_coredump.h"
#include "panthor_devfreq.h"
#include "panthor_device.h"
#include "panthor_fw.h"
@@ -1031,6 +1032,10 @@ group_unbind_locked(struct panthor_group *group)
return 0;
}
+static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
+ enum panthor_coredump_reason reason,
+ struct panthor_group *group);
+
/**
* cs_slot_prog_locked() - Program a queue slot
* @ptdev: Device.
@@ -1249,6 +1254,10 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
csg_id, csg_state);
new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
+
+ panthor_sched_coredump_locked(
+ ptdev, PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
+ group);
break;
}
@@ -1378,6 +1387,9 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
(unsigned int)CS_EXCEPTION_DATA(fatal),
info);
+
+ panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FATAL,
+ group);
}
static void
@@ -1426,6 +1438,9 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
(unsigned int)CS_EXCEPTION_DATA(fault),
info);
+
+ panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FAULT,
+ group);
}
static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
@@ -1480,6 +1495,10 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
drm_warn(&ptdev->base, "Failed to extend the tiler heap\n");
group->fatal_queues |= BIT(cs_id);
sched_queue_delayed_work(sched, tick, 0);
+
+ panthor_sched_coredump_locked(
+ ptdev, PANTHOR_COREDUMP_REASON_CS_TILER_OOM, group);
+
goto out_put_heap_pool;
}
@@ -1639,6 +1658,9 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
group->timedout = true;
sched_queue_delayed_work(sched, tick, 0);
+
+ panthor_sched_coredump_locked(
+ ptdev, PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT, group);
}
static void sched_process_csg_irq_locked(struct panthor_device *ptdev, u32 csg_id)
@@ -1858,8 +1880,16 @@ static int csgs_upd_ctx_apply_locked(struct panthor_device *ptdev,
if (ret && acked != req_mask &&
((csg_iface->input->req ^ csg_iface->output->ack) & req_mask) != 0) {
+ struct panthor_csg_slot *csg_slot =
+ &sched->csg_slots[csg_id];
+ struct panthor_group *group = csg_slot->group;
+
drm_err(&ptdev->base, "CSG %d update request timedout", csg_id);
ctx->timedout_mask |= BIT(csg_id);
+
+ panthor_sched_coredump_locked(
+ ptdev, PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
+ group);
}
}
@@ -2027,6 +2057,10 @@ tick_ctx_init(struct panthor_scheduler *sched,
* CSG IRQs, so we can flag the faulty queue.
*/
if (panthor_vm_has_unhandled_faults(group->vm)) {
+ panthor_sched_coredump_locked(
+ ptdev, PANTHOR_COREDUMP_REASON_MMU_FAULT,
+ group);
+
sched_process_csg_irq_locked(ptdev, i);
/* No fatal fault reported, flag all queues as faulty. */
@@ -3237,6 +3271,10 @@ queue_timedout_job(struct drm_sched_job *sched_job)
group_queue_work(group, term);
}
+
+ panthor_sched_coredump_locked(
+ ptdev, PANTHOR_COREDUMP_REASON_JOB_TIMEOUT, group);
+
mutex_unlock(&sched->lock);
queue_start(queue);
@@ -3627,6 +3665,37 @@ int panthor_group_get_state(struct panthor_file *pfile,
return 0;
}
+/**
+ * panthor_sched_coredump_locked() - Create a coredump for a scheduler event
+ * @ptdev: Device.
+ * @reason: Event that triggers the coredump.
+ * @group: Group to capture, or NULL if no group is involved.
+ *
+ * Must be called with the scheduler lock held. Nothing happens if no
+ * coredump can be allocated.
+ */
+static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
+					  enum panthor_coredump_reason reason,
+					  struct panthor_group *group)
+{
+	struct panthor_coredump *cd;
+
+	lockdep_assert_held(&ptdev->scheduler->lock);
+
+	/* GFP_NOWAIT because this may be called from fence signaling path */
+	cd = panthor_coredump_alloc(ptdev, reason, GFP_NOWAIT);
+	if (cd)
+		panthor_coredump_capture(cd, group);
+}
+
+/**
+ * panthor_group_capture_coredump() - Capture group state for a coredump
+ * @group: Group to capture.
+ * @state: Group state to fill.
+ *
+ * Must be called with the scheduler lock held.
+ */
+void panthor_group_capture_coredump(const struct panthor_group *group,
+				    struct panthor_coredump_group_state *state)
+{
+	/* this is called from panthor_coredump_capture */
+	lockdep_assert_held(&group->ptdev->scheduler->lock);
+
+	/* TODO state->pid and state->comm */
+	state->csg_id = group->csg_id;
+	state->destroyed = group->destroyed;
+	state->queue_count = group->queue_count;
+	state->priority = group->priority;
+}
+
int panthor_group_pool_create(struct panthor_file *pfile)
{
struct panthor_group_pool *gpool;
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
index 742b0b4ff3a3..6c564153133e 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.h
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -14,8 +14,10 @@ struct drm_panthor_group_create;
struct drm_panthor_queue_create;
struct drm_panthor_group_get_state;
struct drm_panthor_queue_submit;
+struct panthor_coredump_group_state;
struct panthor_device;
struct panthor_file;
+struct panthor_group;
struct panthor_group_pool;
struct panthor_job;
@@ -26,6 +28,9 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle);
int panthor_group_get_state(struct panthor_file *pfile,
struct drm_panthor_group_get_state *get_state);
+void panthor_group_capture_coredump(const struct panthor_group *group,
+ struct panthor_coredump_group_state *state);
+
struct drm_sched_job *
panthor_job_create(struct panthor_file *pfile,
u16 group_handle,
--
2.50.0.727.gbf7dc18ff4-goog
Powered by blists - more mailing lists