Message-ID: <whrhelv4cvh56ig34kh5pqecbs3yhrqy3qepuzzxnmqifshnsa@75emb2et2rqg>
Date: Mon, 23 Sep 2024 21:48:35 +0100
From: Adrián Larumbe <adrian.larumbe@...labora.com>
To: Steven Price <steven.price@....com>
Cc: Boris Brezillon <boris.brezillon@...labora.com>, 
	Liviu Dudau <liviu.dudau@....com>, Maarten Lankhorst <maarten.lankhorst@...ux.intel.com>, 
	Maxime Ripard <mripard@...nel.org>, Thomas Zimmermann <tzimmermann@...e.de>, 
	David Airlie <airlied@...il.com>, Simona Vetter <simona@...ll.ch>, 
	Sumit Semwal <sumit.semwal@...aro.org>, Christian König <christian.koenig@....com>, 
	kernel@...labora.com, dri-devel@...ts.freedesktop.org, linux-kernel@...r.kernel.org, 
	linux-media@...r.kernel.org, linaro-mm-sig@...ts.linaro.org
Subject: Re: [PATCH v7 1/5] drm/panthor: introduce job cycle and timestamp
 accounting

On 23.09.2024 10:07, Steven Price wrote:
>On 21/09/2024 00:43, Adrián Larumbe wrote:
>> Enable calculation of job execution times in clock cycles and wall
>> time. This is done by expanding the boilerplate command stream that runs
>> a job to include instructions that sample those values right before and
>> after the user CS.
>> 
>> A separate kernel BO is created per queue to store those values. Jobs
>> access their sampled data through an index independent of the queue's
>> ringbuffer insert point. This saves memory in the profiling information
>> kernel BO: profiled jobs take more CSF instructions than regular ones,
>> so fewer of them can sit in the queue's ringbuffer at once, and fewer
>> profiling slots are needed.
>> 
>> This commit is done in preparation for enabling DRM fdinfo support in the
>> Panthor driver, which depends on the numbers calculated herein.
>> 
>> A profile mode mask has been added that a future commit will let UM use
>> to toggle performance metric sampling behaviour; sampling is disabled by
>> default to save power. When a ringbuffer CS is constructed, timestamp and
>> cycle sampling instructions are added depending on the flags enabled in
>> the profiling mask.
>> 
>> A helper is provided that calculates the number of instructions for a
>> given enablement mask, and this number is passed as the credit count
>> when initialising a DRM scheduler job.
>> 
>> Signed-off-by: Adrián Larumbe <adrian.larumbe@...labora.com>
>> Reviewed-by: Boris Brezillon <boris.brezillon@...labora.com>
>> Reviewed-by: Liviu Dudau <liviu.dudau@....com>
>
>I think just one bug remaining - see below...
>
>> ---
>>  drivers/gpu/drm/panthor/panthor_device.h |  22 ++
>>  drivers/gpu/drm/panthor/panthor_sched.c  | 328 +++++++++++++++++++----
>>  2 files changed, 301 insertions(+), 49 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
>> index e388c0472ba7..a48e30d0af30 100644
>> --- a/drivers/gpu/drm/panthor/panthor_device.h
>> +++ b/drivers/gpu/drm/panthor/panthor_device.h
>> @@ -66,6 +66,25 @@ struct panthor_irq {
>>  	atomic_t suspended;
>>  };
>>  
>> +/**
>> + * enum panthor_device_profiling_flags - Profiling state
>> + */
>> +enum panthor_device_profiling_flags {
>> +	/** @PANTHOR_DEVICE_PROFILING_DISABLED: Profiling is disabled. */
>> +	PANTHOR_DEVICE_PROFILING_DISABLED = 0,
>> +
>> +	/** @PANTHOR_DEVICE_PROFILING_CYCLES: Sampling job cycles. */
>> +	PANTHOR_DEVICE_PROFILING_CYCLES = BIT(0),
>> +
>> +	/** @PANTHOR_DEVICE_PROFILING_TIMESTAMP: Sampling job timestamp. */
>> +	PANTHOR_DEVICE_PROFILING_TIMESTAMP = BIT(1),
>> +
>> +	/** @PANTHOR_DEVICE_PROFILING_ALL: Sampling everything. */
>> +	PANTHOR_DEVICE_PROFILING_ALL =
>> +	PANTHOR_DEVICE_PROFILING_CYCLES |
>> +	PANTHOR_DEVICE_PROFILING_TIMESTAMP,
>> +};
>> +
>>  /**
>>   * struct panthor_device - Panthor device
>>   */
>> @@ -162,6 +181,9 @@ struct panthor_device {
>>  		 */
>>  		struct page *dummy_latest_flush;
>>  	} pm;
>> +
>> +	/** @profile_mask: User-set profiling flags for job accounting. */
>> +	u32 profile_mask;
>>  };
>>  
>>  /**
>> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
>> index 42afdf0ddb7e..6da5c3d0015e 100644
>> --- a/drivers/gpu/drm/panthor/panthor_sched.c
>> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
>> @@ -93,6 +93,9 @@
>>  #define MIN_CSGS				3
>>  #define MAX_CSG_PRIO				0xf
>>  
>> +#define NUM_INSTRS_PER_CACHE_LINE		(64 / sizeof(u64))
>> +#define MAX_INSTRS_PER_JOB			24
>> +
>>  struct panthor_group;
>>  
>>  /**
>> @@ -476,6 +479,18 @@ struct panthor_queue {
>>  		 */
>>  		struct list_head in_flight_jobs;
>>  	} fence_ctx;
>> +
>> +	/** @profiling: Job profiling data slots and access information. */
>> +	struct {
>> +		/** @slots: Kernel BO holding the slots. */
>> +		struct panthor_kernel_bo *slots;
>> +
>> +		/** @slot_count: Number of jobs the ringbuffer can hold at once. */
>> +		u32 slot_count;
>> +
>> +		/** @seqno: Index of the next available profiling information slot. */
>> +		u32 seqno;
>> +	} profiling;
>>  };
>>  
>>  /**
>> @@ -661,6 +676,18 @@ struct panthor_group {
>>  	struct list_head wait_node;
>>  };
>>  
>> +struct panthor_job_profiling_data {
>> +	struct {
>> +		u64 before;
>> +		u64 after;
>> +	} cycles;
>> +
>> +	struct {
>> +		u64 before;
>> +		u64 after;
>> +	} time;
>> +};
>> +
>>  /**
>>   * group_queue_work() - Queue a group work
>>   * @group: Group to queue the work for.
>> @@ -774,6 +801,15 @@ struct panthor_job {
>>  
>>  	/** @done_fence: Fence signaled when the job is finished or cancelled. */
>>  	struct dma_fence *done_fence;
>> +
>> +	/** @profiling: Job profiling information. */
>> +	struct {
>> +		/** @mask: Current device job profiling enablement bitmask. */
>> +		u32 mask;
>> +
>> +		/** @slot: Job index in the profiling slots BO. */
>> +		u32 slot;
>> +	} profiling;
>>  };
>>  
>>  static void
>> @@ -838,6 +874,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
>>  
>>  	panthor_kernel_bo_destroy(queue->ringbuf);
>>  	panthor_kernel_bo_destroy(queue->iface.mem);
>> +	panthor_kernel_bo_destroy(queue->profiling.slots);
>>  
>>  	/* Release the last_fence we were holding, if any. */
>>  	dma_fence_put(queue->fence_ctx.last_fence);
>> @@ -1982,8 +2019,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
>>  	}
>>  }
>>  
>> -#define NUM_INSTRS_PER_SLOT		16
>> -
>>  static void
>>  group_term_post_processing(struct panthor_group *group)
>>  {
>> @@ -2815,65 +2850,192 @@ static void group_sync_upd_work(struct work_struct *work)
>>  	group_put(group);
>>  }
>>  
>> -static struct dma_fence *
>> -queue_run_job(struct drm_sched_job *sched_job)
>> +struct panthor_job_ringbuf_instrs {
>> +	u64 buffer[MAX_INSTRS_PER_JOB];
>> +	u32 count;
>> +};
>> +
>> +struct panthor_job_instr {
>> +	u32 profile_mask;
>> +	u64 instr;
>> +};
>> +
>> +#define JOB_INSTR(__prof, __instr) \
>> +	{ \
>> +		.profile_mask = __prof, \
>> +		.instr = __instr, \
>> +	}
>> +
>> +static void
>> +copy_instrs_to_ringbuf(struct panthor_queue *queue,
>> +		       struct panthor_job *job,
>> +		       struct panthor_job_ringbuf_instrs *instrs)
>> +{
>> +	u64 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
>> +	u64 start = job->ringbuf.start & (ringbuf_size - 1);
>> +	u64 size, written;
>> +
>> +	/*
>> +	 * We need to write a whole slot, including any trailing zeroes
>> +	 * that may come at the end of it. Also, because instrs.buffer has
>> +	 * been zero-initialised, there's no need to pad it with 0's
>> +	 */
>> +	instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
>> +	size = instrs->count * sizeof(u64);
>> +	WARN_ON(size > ringbuf_size);
>> +	written = min(ringbuf_size - start, size);
>> +
>> +	memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
>> +
>> +	if (written < size)
>> +		memcpy(queue->ringbuf->kmap,
>> +		       &instrs->buffer[written/sizeof(u64)],
>> +		       size - written);
>> +}
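
As a worked example of the wrap-around path above (sizes purely illustrative):
with a 4 KiB ring buffer, a job slot starting at insert point 4032 and a
24-instruction (192-byte) sequence, the first memcpy writes 64 bytes at the
tail of the buffer and the second writes the remaining 128 bytes from offset 0.
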
>> +
>> +struct panthor_job_cs_params {
>> +	u32 profile_mask;
>> +	u64 addr_reg; u64 val_reg;
>> +	u64 cycle_reg; u64 time_reg;
>> +	u64 sync_addr; u64 times_addr;
>> +	u64 cs_start; u64 cs_size;
>> +	u32 last_flush; u32 waitall_mask;
>> +};
>> +
>> +static void
>> +get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
>>  {
>> -	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
>>  	struct panthor_group *group = job->group;
>>  	struct panthor_queue *queue = group->queues[job->queue_idx];
>>  	struct panthor_device *ptdev = group->ptdev;
>>  	struct panthor_scheduler *sched = ptdev->scheduler;
>> -	u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
>> -	u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
>> -	u64 addr_reg = ptdev->csif_info.cs_reg_count -
>> -		       ptdev->csif_info.unpreserved_cs_reg_count;
>> -	u64 val_reg = addr_reg + 2;
>> -	u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
>> -			job->queue_idx * sizeof(struct panthor_syncobj_64b);
>> -	u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
>> -	struct dma_fence *done_fence;
>> -	int ret;
>>  
>> -	u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
>> -		/* MOV32 rX+2, cs.latest_flush */
>> -		(2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
>> +	params->addr_reg = ptdev->csif_info.cs_reg_count -
>> +			   ptdev->csif_info.unpreserved_cs_reg_count;
>> +	params->val_reg = params->addr_reg + 2;
>> +	params->cycle_reg = params->addr_reg;
>> +	params->time_reg = params->val_reg;
>>  
>> -		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
>> -		(36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
>> +	params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
>> +			    job->queue_idx * sizeof(struct panthor_syncobj_64b);
>> +	params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) +
>> +			     (job->profiling.slot * sizeof(struct panthor_job_profiling_data));
>> +	params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
>>  
>> -		/* MOV48 rX:rX+1, cs.start */
>> -		(1ull << 56) | (addr_reg << 48) | job->call_info.start,
>> +	params->cs_start = job->call_info.start;
>> +	params->cs_size = job->call_info.size;
>> +	params->last_flush = job->call_info.latest_flush;
>>  
>> -		/* MOV32 rX+2, cs.size */
>> -		(2ull << 56) | (val_reg << 48) | job->call_info.size,
>> +	params->profile_mask = job->profiling.mask;
>> +}
>>  
>> -		/* WAIT(0) => waits for FLUSH_CACHE2 instruction */
>> -		(3ull << 56) | (1 << 16),
>> +#define JOB_INSTR_ALWAYS(instr) \
>> +	JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (instr))
>> +#define JOB_INSTR_TIMESTAMP(instr) \
>> +	JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, (instr))
>> +#define JOB_INSTR_CYCLES(instr) \
>> +	JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, (instr))
>>  
>> +static void
>> +prepare_job_instrs(const struct panthor_job_cs_params *params,
>> +		   struct panthor_job_ringbuf_instrs *instrs)
>> +{
>> +	const struct panthor_job_instr instr_seq[] = {
>> +		/* MOV32 rX+2, cs.latest_flush */
>> +		JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->last_flush),
>> +		/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
>> +		JOB_INSTR_ALWAYS((36ull << 56) | (0ull << 48) | (params->val_reg << 40) | (0 << 16) | 0x233),
>> +		/* MOV48 rX:rX+1, cycles_offset */
>> +		JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
>> +				 (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.before))),
>> +		/* STORE_STATE cycles */
>> +		JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
>> +		/* MOV48 rX:rX+1, time_offset */
>> +		JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) | (params->times_addr +
>> +			   offsetof(struct panthor_job_profiling_data, time.before))),
>> +		/* STORE_STATE timer */
>> +		JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
>> +		/* MOV48 rX:rX+1, cs.start */
>> +		JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->cs_start),
>> +		/* MOV32 rX+2, cs.size */
>> +		JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->cs_size),
>> +		/* WAIT(0) => waits for FLUSH_CACHE2 instruction */
>> +		JOB_INSTR_ALWAYS((3ull << 56) | (1 << 16)),
>>  		/* CALL rX:rX+1, rX+2 */
>> -		(32ull << 56) | (addr_reg << 40) | (val_reg << 32),
>> -
>> +		JOB_INSTR_ALWAYS((32ull << 56) | (params->addr_reg << 40) | (params->val_reg << 32)),
>> +		/* MOV48 rX:rX+1, cycles_offset */
>> +		JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
>> +				 (params->times_addr + offsetof(struct panthor_job_profiling_data, cycles.after))),
>> +		/* STORE_STATE cycles */
>> +		JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
>> +		/* MOV48 rX:rX+1, time_offset */
>> +		JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) |
>> +			  (params->times_addr + offsetof(struct panthor_job_profiling_data, time.after))),
>> +		/* STORE_STATE timer */
>> +		JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
>>  		/* MOV48 rX:rX+1, sync_addr */
>> -		(1ull << 56) | (addr_reg << 48) | sync_addr,
>> -
>> +		JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
>>  		/* MOV48 rX+2, #1 */
>> -		(1ull << 56) | (val_reg << 48) | 1,
>> -
>> +		JOB_INSTR_ALWAYS((1ull << 56) | (params->val_reg << 48) | 1),
>>  		/* WAIT(all) */
>> -		(3ull << 56) | (waitall_mask << 16),
>> -
>> +		JOB_INSTR_ALWAYS((3ull << 56) | (params->waitall_mask << 16)),
>>  		/* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
>> -		(51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
>> +		JOB_INSTR_ALWAYS((51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
>> +				 (params->val_reg << 32) | (0 << 16) | 1),
>> +		/* ERROR_BARRIER, so we can recover from faults at job boundaries. */
>> +		JOB_INSTR_ALWAYS((47ull << 56)),
>> +	};
>> +	u32 pad;
>>  
>> -		/* ERROR_BARRIER, so we can recover from faults at job
>> -		 * boundaries.
>> -		 */
>> -		(47ull << 56),
>> +	/* Needs to be cacheline aligned to please the prefetcher. */
>> +	static_assert(sizeof(instrs->buffer) % 64 == 0,
>> +		      "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
>> +
>> +	/* Make sure we have enough storage to store the whole sequence. */
>> +	static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) ==
>> +		      ARRAY_SIZE(instrs->buffer),
>> +		      "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
>> +
>> +	for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) {
>> +		/* If the profile mask of this instruction is not enabled, skip it. */
>> +		if (instr_seq[i].profile_mask &&
>> +		    !(instr_seq[i].profile_mask & params->profile_mask))
>> +			continue;
>> +
>> +		instrs->buffer[instrs->count++] = instr_seq[i].instr;
>> +	}
>> +
>> +	pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
>> +	memset(&instrs->buffer[instrs->count], 0,
>> +	       (pad - instrs->count) * sizeof(instrs->buffer[0]));
>> +	instrs->count = pad;
>> +}
>> +
>> +static u32 calc_job_credits(u32 profile_mask)
>> +{
>> +	struct panthor_job_ringbuf_instrs instrs = {
>> +		.count = 0,
>> +	};
>> +	struct panthor_job_cs_params params = {
>> +		.profile_mask = profile_mask,
>>  	};
>>  
>> -	/* Need to be cacheline aligned to please the prefetcher. */
>> -	static_assert(sizeof(call_instrs) % 64 == 0,
>> -		      "call_instrs is not aligned on a cacheline");
>> +	prepare_job_instrs(&params, &instrs);
>> +	return instrs.count;
>> +}
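
For reference, feeding each mask into calc_job_credits() with the sequence
above yields 11 instructions for PANTHOR_DEVICE_PROFILING_DISABLED, 15 for
CYCLES or TIMESTAMP alone, and 19 with both enabled; padded to
NUM_INSTRS_PER_CACHE_LINE (8 instructions), that's 16, 16 and 24
(MAX_INSTRS_PER_JOB) credits respectively.
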
>> +
>> +static struct dma_fence *
>> +queue_run_job(struct drm_sched_job *sched_job)
>> +{
>> +	struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
>> +	struct panthor_group *group = job->group;
>> +	struct panthor_queue *queue = group->queues[job->queue_idx];
>> +	struct panthor_device *ptdev = group->ptdev;
>> +	struct panthor_scheduler *sched = ptdev->scheduler;
>> +	struct panthor_job_ringbuf_instrs instrs;
>
>instrs isn't initialised...
>
>> +	struct panthor_job_cs_params cs_params;
>> +	struct dma_fence *done_fence;
>> +	int ret;
>>  
>>  	/* Stream size is zero, nothing to do except making sure all previously
>>  	 * submitted jobs are done before we signal the
>> @@ -2900,17 +3062,23 @@ queue_run_job(struct drm_sched_job *sched_job)
>>  		       queue->fence_ctx.id,
>>  		       atomic64_inc_return(&queue->fence_ctx.seqno));
>>  
>> -	memcpy(queue->ringbuf->kmap + ringbuf_insert,
>> -	       call_instrs, sizeof(call_instrs));
>> +	job->profiling.slot = queue->profiling.seqno++;
>> +	if (queue->profiling.seqno == queue->profiling.slot_count)
>> +		queue->profiling.seqno = 0;
>> +
>> +	job->ringbuf.start = queue->iface.input->insert;
>> +
>> +	get_job_cs_params(job, &cs_params);
>> +	prepare_job_instrs(&cs_params, &instrs);
>
>...but it's passed into prepare_job_instrs() which depends on
>instrs.count (same bug as was in calc_job_credits()) - sorry I didn't
>spot it last review.
>
>Initializing instrs makes everything work for me.
>
>I'm not sure quite what kernel configuration you are using but I wonder
>if you've got a 'hardening' option enabled which is causing the stack to
>be zero-initialised. It's worth turning it off for testing purposes ;)

Thanks for catching this, it went completely unnoticed by me. Delving into my kernel
config, I found that I had CONFIG_INIT_STACK_ALL_ZERO enabled. When I unset it, the
driver does trigger an invalid memory reference in the kernel, presumably because of
the uninitialised stack variable.

I don't know why I had this option enabled, but come to think of it, keeping it on
during development seems like a terrible idea, since it masks exactly this kind of
bug. Thanks for bringing this up.
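
For v8 I'll zero-initialise the struct at its point of declaration in
queue_run_job(), mirroring what calc_job_credits() already does; something
along these lines:

	/* Must start at zero: prepare_job_instrs() appends from instrs.count. */
	struct panthor_job_ringbuf_instrs instrs = {
		.count = 0,
	};

That way prepare_job_instrs() always starts writing at index 0, no matter
what happens to be on the stack.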

>Steve
>
>> +	copy_instrs_to_ringbuf(queue, job, &instrs);
>> +
>> +	job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
>>  
>>  	panthor_job_get(&job->base);
>>  	spin_lock(&queue->fence_ctx.lock);
>>  	list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
>>  	spin_unlock(&queue->fence_ctx.lock);
>>  
>> -	job->ringbuf.start = queue->iface.input->insert;
>> -	job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
>> -
>>  	/* Make sure the ring buffer is updated before the INSERT
>>  	 * register.
>>  	 */
>> @@ -3003,6 +3171,34 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
>>  	.free_job = queue_free_job,
>>  };
>>  
>> +static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev,
>> +				       u32 cs_ringbuf_size)
>> +{
>> +	u32 min_profiled_job_instrs = U32_MAX;
>> +	u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL);
>> +
>> +	/*
>> +	 * We want the minimum size of a profiled job's CS: because profiled
>> +	 * jobs need extra instructions to sample performance metrics, they
>> +	 * take up more room in the queue's ringbuffer than regular jobs, so
>> +	 * fewer of them fit at once. The number of slots to allocate is the
>> +	 * maximum number of profiled jobs we can place simultaneously in the
>> +	 * queue's ring buffer, which is given by the smallest profiled job
>> +	 * size. That size has to be calculated separately for every job
>> +	 * profiling flag, excluding the disabled case, since unprofiled jobs
>> +	 * don't record profiling information at all.
>> +	 */
>> +	for (u32 i = 0; i < last_flag; i++) {
>> +		if (BIT(i) & PANTHOR_DEVICE_PROFILING_ALL)
>> +			min_profiled_job_instrs =
>> +				min(min_profiled_job_instrs, calc_job_credits(BIT(i)));
>> +	}
>> +
>> +	return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64));
>> +}
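
As a worked example (the ring buffer size is illustrative): the smallest
profiled job in the sequence above is 16 instructions, i.e. 128 bytes, so a
16 KiB ringbuffer gets DIV_ROUND_UP(16384, 128) = 128 profiling slots.
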
>> +
>>  static struct panthor_queue *
>>  group_create_queue(struct panthor_group *group,
>>  		   const struct drm_panthor_queue_create *args)
>> @@ -3056,9 +3252,35 @@ group_create_queue(struct panthor_group *group,
>>  		goto err_free_queue;
>>  	}
>>  
>> +	queue->profiling.slot_count =
>> +		calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size);
>> +
>> +	queue->profiling.slots =
>> +		panthor_kernel_bo_create(group->ptdev, group->vm,
>> +					 queue->profiling.slot_count *
>> +					 sizeof(struct panthor_job_profiling_data),
>> +					 DRM_PANTHOR_BO_NO_MMAP,
>> +					 DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
>> +					 DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
>> +					 PANTHOR_VM_KERNEL_AUTO_VA);
>> +
>> +	if (IS_ERR(queue->profiling.slots)) {
>> +		ret = PTR_ERR(queue->profiling.slots);
>> +		goto err_free_queue;
>> +	}
>> +
>> +	ret = panthor_kernel_bo_vmap(queue->profiling.slots);
>> +	if (ret)
>> +		goto err_free_queue;
>> +
>> +	/*
>> +	 * The credit limit argument is the total number of instructions the
>> +	 * ringbuffer can hold across all CS slots, with some jobs requiring
>> +	 * more instructions than others, depending on their profiling status.
>> +	 */
>>  	ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
>>  			     group->ptdev->scheduler->wq, 1,
>> -			     args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
>> +			     args->ringbuf_size / sizeof(u64),
>>  			     0, msecs_to_jiffies(JOB_TIMEOUT_MS),
>>  			     group->ptdev->reset.wq,
>>  			     NULL, "panthor-queue", group->ptdev->base.dev);
>> @@ -3354,6 +3576,7 @@ panthor_job_create(struct panthor_file *pfile,
>>  {
>>  	struct panthor_group_pool *gpool = pfile->groups;
>>  	struct panthor_job *job;
>> +	u32 credits;
>>  	int ret;
>>  
>>  	if (qsubmit->pad)
>> @@ -3407,9 +3630,16 @@ panthor_job_create(struct panthor_file *pfile,
>>  		}
>>  	}
>>  
>> +	job->profiling.mask = pfile->ptdev->profile_mask;
>> +	credits = calc_job_credits(job->profiling.mask);
>> +	if (credits == 0) {
>> +		ret = -EINVAL;
>> +		goto err_put_job;
>> +	}
>> +
>>  	ret = drm_sched_job_init(&job->base,
>>  				 &job->group->queues[job->queue_idx]->entity,
>> -				 1, job->group);
>> +				 credits, job->group);
>>  	if (ret)
>>  		goto err_put_job;
>>  