Message-ID: <20251104083605.13677-12-pierre-eric.pelloux-prayer@amd.com>
Date: Tue, 4 Nov 2025 09:35:26 +0100
From: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@....com>
To: Alex Deucher <alexander.deucher@....com>,
	Christian König <christian.koenig@....com>, David Airlie
	<airlied@...il.com>, Simona Vetter <simona@...ll.ch>, Sumit Semwal
	<sumit.semwal@...aro.org>
CC: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@....com>,
	<amd-gfx@...ts.freedesktop.org>, <dri-devel@...ts.freedesktop.org>,
	<linux-kernel@...r.kernel.org>, <linux-media@...r.kernel.org>,
	<linaro-mm-sig@...ts.linaro.org>
Subject: [PATCH v1 11/20] drm/amdgpu: use multiple entities in amdgpu_fill_buffer

The benefit of using multiple entities is that multiple fill jobs
can run in parallel. Otherwise, even if the entity has access
to multiple engines, a burst of N independent jobs will all
run on the same engine, because an entity guarantees that the
execution order matches the submission order.

Callers can opt out of this behavior by passing the entity they
want to use (see amdgpu_move_blit).
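
For illustration, the entity selection reduces to a round-robin pick
over the clear-entity pool. A minimal userspace sketch of the same
idea, assuming a fixed pool size (the helper and constant below are
made up; the kernel code uses atomic_inc_return() on
adev->mman.next_clear_entity):

  #include <stdatomic.h>

  #define NUM_CLEAR_ENTITIES 8 /* hypothetical pool size */

  static atomic_uint next_clear_entity;

  /* Each burst of independent jobs is spread round-robin across the
   * pool, so up to NUM_CLEAR_ENTITIES engines can run fills at once
   * instead of serializing behind one entity.
   */
  static unsigned int pick_clear_entity(void)
  {
          return atomic_fetch_add(&next_clear_entity, 1) %
                 NUM_CLEAR_ENTITIES;
  }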

Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@....com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 84 ++++++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  1 +
 2 files changed, 64 insertions(+), 21 deletions(-)
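
A note on the resv bookkeeping below: the clear is split into 256MiB
chunks, but because each entity executes its jobs in order, only the
last fence per entity needs a reservation slot, so the slot count is
capped at the pool size, and the fences are only published to the resv
after the whole loop (otherwise later partial clears would wait on
earlier ones). A hedged sketch of the slot count, assuming a contiguous
resource (the real code walks amdgpu_res_cursor segments; the helper
name is made up):

  #include <stdint.h>

  /* Never fill more than 256MiB at once (the 256ULL << 20 cap below). */
  #define CLEAR_CHUNK_SIZE (256ULL << 20)

  static uint32_t clear_fence_slots(uint64_t bo_size, uint32_t num_entities)
  {
          /* One fence per 256MiB chunk of the clear... */
          uint64_t chunks = (bo_size + CLEAR_CHUNK_SIZE - 1) / CLEAR_CHUNK_SIZE;

          /* ...but at most one resv slot per clear entity: an entity
           * runs its jobs in order, so its last fence supersedes the
           * earlier ones.
           */
          return chunks < num_entities ? (uint32_t)chunks : num_entities;
  }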

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index c357a6d9763a..839ea8c7f6be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -2224,6 +2224,7 @@ u32 amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
 		adev->mman.clear_entities = kcalloc(num_clear_entities,
 						    sizeof(struct amdgpu_ttm_entity),
 						    GFP_KERNEL);
+		atomic_set(&adev->mman.next_clear_entity, 0);
 		if (!adev->mman.clear_entities)
 			goto error_free_entity;
 
@@ -2498,10 +2499,12 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity,
 {
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
+	struct dma_fence *fences[TTM_FENCES_MAX_SLOT_COUNT] = {};
 	struct dma_fence *fence = NULL;
 	struct dma_resv *resv = NULL;
 	struct amdgpu_res_cursor dst;
-	int r;
+	uint64_t cur_size, to;
+	int r = 0, e, n_fences;
 
 	/* The fences will be either added to the resv object or the last fence
 	 * will be returned to the caller. In the latter case, all fill jobs will
@@ -2515,53 +2518,92 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity,
 	}
 
 	if (!entity) {
-		entity = &adev->mman.clear_entities[0];
 		resv = &bo->tbo.base._resv;
-		r = dma_resv_reserve_fences(resv, 1);
+
+		/* Determine how many fences we're going to add to the
+		 * resv object.
+		 */
+		n_fences = 0;
+		amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst);
+		while (dst.remaining) {
+			cur_size = min(dst.size, 256ULL << 20);
+
+			n_fences += 1;
+			amdgpu_res_next(&dst, cur_size);
+		}
+		if (n_fences == 0)
+			return 0;
+
+		/* One slot per entity at most. */
+		n_fences = min_t(u32, n_fences, adev->mman.num_clear_entities);
+
+		r = dma_resv_reserve_fences(resv, n_fences);
 		if (r)
 			return r;
+	} else {
+		mutex_lock(&entity->gart_window_lock);
 	}
 
 	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst);
 
-	mutex_lock(&entity->gart_window_lock);
 	while (dst.remaining) {
-		struct dma_fence *next;
-		uint64_t cur_size, to;
-
 		/* Never fill more than 256MiB at once to avoid timeouts */
 		cur_size = min(dst.size, 256ULL << 20);
 
+		if (resv) {
+			/* Pick a new entity for each partial clear so they can
+			 * execute in parallel.
+			 */
+			e = atomic_inc_return(&adev->mman.next_clear_entity) %
+				adev->mman.num_clear_entities;
+			entity = &adev->mman.clear_entities[e];
+			mutex_lock(&entity->gart_window_lock);
+		}
+
 		r = amdgpu_ttm_map_buffer(&entity->base,
 					  &bo->tbo, bo->tbo.resource, &dst,
 					  entity->gart_window_id1, ring, false,
 					  &cur_size, &to,
 					  dependency,
 					  resv);
-		if (r)
+		if (r) {
+			mutex_unlock(&entity->gart_window_lock);
 			goto error;
+		}
 
 		r = amdgpu_ttm_fill_mem(ring, &entity->base,
 					src_data, to, cur_size, resv,
-					&next, true, k_job_id);
-		if (r)
+					&fence, true, k_job_id);
+		if (r) {
+			mutex_unlock(&entity->gart_window_lock);
 			goto error;
-
-		if (resv) {
-			dma_resv_add_fence(resv, next, DMA_RESV_USAGE_KERNEL);
-			dma_fence_put(next);
-		} else {
-			dma_fence_put(fence);
-			fence = next;
 		}
 
 		amdgpu_res_next(&dst, cur_size);
+
+		if (resv) {
+			/* Delay adding the fences to resv, otherwise the next
+			 * partial clears would depend on this one. */
+			dma_fence_put(fences[e]);
+			fences[e] = fence;
+			mutex_unlock(&entity->gart_window_lock);
+		} else {
+			dma_fence_put(*f);
+			*f = fence;
+		}
 	}
 error:
-	mutex_unlock(&entity->gart_window_lock);
-	if (f)
-		*f = dma_fence_get(fence);
-	dma_fence_put(fence);
+	if (resv) {
+		for (e = 0; e < adev->mman.num_clear_entities; e++) {
+			if (fences[e]) {
+				dma_resv_add_fence(resv, fences[e], DMA_RESV_USAGE_KERNEL);
+				dma_fence_put(fences[e]);
+			}
+		}
+	} else if (r == 0) {
+		mutex_unlock(&entity->gart_window_lock);
+	}
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 38df2b5b4bc7..3fc31c7c6bfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -73,6 +73,7 @@ struct amdgpu_mman {
 
 	struct amdgpu_ttm_entity default_entity; /* has no gart windows */
 	struct amdgpu_ttm_entity *clear_entities;
+	atomic_t next_clear_entity;
 	u32 num_clear_entities;
 	struct amdgpu_ttm_entity move_entities[TTM_FENCES_MAX_SLOT_COUNT];
 	u32 num_move_entities;
-- 
2.43.0

