linux-kernel - [PATCH v1 4/6] drm/lima: handle spurious timeouts due to high irq latency

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <20240117031212.1104034-5-nunes.erico@gmail.com>
Date: Wed, 17 Jan 2024 04:12:10 +0100
From: Erico Nunes <nunes.erico@...il.com>
To: Qiang Yu <yuq825@...il.com>,
	dri-devel@...ts.freedesktop.org,
	lima@...ts.freedesktop.org
Cc: anarsoul@...il.com,
	Maarten Lankhorst <maarten.lankhorst@...ux.intel.com>,
	Maxime Ripard <mripard@...nel.org>,
	Thomas Zimmermann <tzimmermann@...e.de>,
	David Airlie <airlied@...il.com>,
	Daniel Vetter <daniel@...ll.ch>,
	Sumit Semwal <sumit.semwal@...aro.org>,
	christian.koenig@....com,
	linux-kernel@...r.kernel.org,
	Erico Nunes <nunes.erico@...il.com>
Subject: [PATCH v1 4/6] drm/lima: handle spurious timeouts due to high irq latency

There are several unexplained and unreproduced cases of rendering
timeouts with lima, for which one theory is high IRQ latency coming from
somewhere else in the system.
This kind of occurrence may cause applications to trigger unnecessary
resets of the GPU or even applications to hang if it hits an issue in
the recovery path.
Panfrost already does some special handling to account for such
"spurious timeouts", it makes sense to have this in lima too to reduce
the chance that it hit users.

Signed-off-by: Erico Nunes <nunes.erico@...il.com>
---
 drivers/gpu/drm/lima/lima_sched.c | 32 ++++++++++++++++++++++++++-----
 drivers/gpu/drm/lima/lima_sched.h |  2 ++
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index 66317296d831..9449b81bcd5b 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR MIT
 /* Copyright 2017-2019 Qiang Yu <yuq825@...il.com> */
 
+#include <linux/hardirq.h>
 #include <linux/iosys-map.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
@@ -223,10 +224,7 @@ static struct dma_fence *lima_sched_run_job(struct drm_sched_job *job)
 
 	task->fence = &fence->base;
 
-	/* for caller usage of the fence, otherwise irq handler
-	 * may consume the fence before caller use it
-	 */
-	dma_fence_get(task->fence);
+	task->done_fence = dma_fence_get(task->fence);
 
 	pipe->current_task = task;
 
@@ -401,9 +399,33 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
 	struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
 	struct lima_sched_task *task = to_lima_task(job);
 	struct lima_device *ldev = pipe->ldev;
+	struct lima_ip *ip = pipe->processor[0];
+
+	/*
+	 * If the GPU managed to complete this jobs fence, the timeout is
+	 * spurious. Bail out.
+	 */
+	if (dma_fence_is_signaled(task->done_fence)) {
+		DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
+		return DRM_GPU_SCHED_STAT_NOMINAL;
+	}
+
+	/*
+	 * Lima IRQ handler may take a long time to process an interrupt
+	 * if there is another IRQ handler hogging the processing.
+	 * In order to catch such cases and not report spurious Lima job
+	 * timeouts, synchronize the IRQ handler and re-check the fence
+	 * status.
+	 */
+	synchronize_irq(ip->irq);
+
+	if (dma_fence_is_signaled(task->done_fence)) {
+		DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
+		return DRM_GPU_SCHED_STAT_NOMINAL;
+	}
 
 	if (!pipe->error)
-		DRM_ERROR("lima job timeout\n");
+		DRM_ERROR("%s lima job timeout\n", lima_ip_name(ip));
 
 	drm_sched_stop(&pipe->base, &task->base);
 
diff --git a/drivers/gpu/drm/lima/lima_sched.h b/drivers/gpu/drm/lima/lima_sched.h
index 6a11764d87b3..34050facb110 100644
--- a/drivers/gpu/drm/lima/lima_sched.h
+++ b/drivers/gpu/drm/lima/lima_sched.h
@@ -29,6 +29,8 @@ struct lima_sched_task {
 	bool recoverable;
 	struct lima_bo *heap;
 
+	struct dma_fence *done_fence;
+
 	/* pipe fence */
 	struct dma_fence *fence;
 };
-- 
2.43.0