[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <60d07232-7203-f03f-5e5a-b48f56f377bd@quicinc.com>
Date: Fri, 14 Jun 2024 10:33:33 -0600
From: Jeffrey Hugo <quic_jhugo@...cinc.com>
To: Tomeu Vizoso <tomeu@...euvizoso.net>, Joerg Roedel <joro@...tes.org>,
Will
Deacon <will@...nel.org>, Robin Murphy <robin.murphy@....com>,
Heiko Stuebner
<heiko@...ech.de>, Rob Herring <robh@...nel.org>,
Krzysztof Kozlowski
<krzk+dt@...nel.org>,
Conor Dooley <conor+dt@...nel.org>, Oded Gabbay
<ogabbay@...nel.org>,
Tomeu Vizoso <tomeu.vizoso@...euvizoso.net>,
David
Airlie <airlied@...il.com>, Daniel Vetter <daniel@...ll.ch>,
Maarten
Lankhorst <maarten.lankhorst@...ux.intel.com>,
Maxime Ripard
<mripard@...nel.org>,
Thomas Zimmermann <tzimmermann@...e.de>,
Philipp Zabel
<p.zabel@...gutronix.de>,
Sumit Semwal <sumit.semwal@...aro.org>,
Christian König <christian.koenig@....com>
CC: <iommu@...ts.linux.dev>, <linux-arm-kernel@...ts.infradead.org>,
<linux-rockchip@...ts.infradead.org>, <linux-kernel@...r.kernel.org>,
<devicetree@...r.kernel.org>, <dri-devel@...ts.freedesktop.org>,
<linux-media@...r.kernel.org>, <linaro-mm-sig@...ts.linaro.org>
Subject: Re: [PATCH 8/9] accel/rocket: Add job submission IOCTL
On 6/12/2024 7:53 AM, Tomeu Vizoso wrote:
> Using the DRM GPU scheduler infrastructure, with a scheduler for each
> core.
>
> Userspace can decide for a series of tasks to be executed sequentially
> in the same core, so SRAM locality can be taken advantage of.
>
> The job submission code was intially based on Panfrost.
intially -> initially
>
> Signed-off-by: Tomeu Vizoso <tomeu@...euvizoso.net>
> ---
> drivers/accel/rocket/Makefile | 3 +-
> drivers/accel/rocket/rocket_core.c | 6 +
> drivers/accel/rocket/rocket_core.h | 16 +
> drivers/accel/rocket/rocket_device.c | 2 +
> drivers/accel/rocket/rocket_device.h | 2 +
> drivers/accel/rocket/rocket_drv.c | 15 +
> drivers/accel/rocket/rocket_drv.h | 3 +
> drivers/accel/rocket/rocket_job.c | 708 +++++++++++++++++++++++++++++++++++
> drivers/accel/rocket/rocket_job.h | 49 +++
> include/uapi/drm/rocket_accel.h | 55 +++
> 10 files changed, 858 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/accel/rocket/Makefile b/drivers/accel/rocket/Makefile
> index 875cac2243d9..4d59036af8d9 100644
> --- a/drivers/accel/rocket/Makefile
> +++ b/drivers/accel/rocket/Makefile
> @@ -6,4 +6,5 @@ rocket-y := \
> rocket_core.o \
> rocket_device.o \
> rocket_drv.o \
> - rocket_gem.o
> + rocket_gem.o \
> + rocket_job.o
> diff --git a/drivers/accel/rocket/rocket_core.c b/drivers/accel/rocket/rocket_core.c
> index d6680b00fb2f..2b2d8be38f0a 100644
> --- a/drivers/accel/rocket/rocket_core.c
> +++ b/drivers/accel/rocket/rocket_core.c
> @@ -11,6 +11,7 @@
>
> #include "rocket_core.h"
> #include "rocket_device.h"
> +#include "rocket_job.h"
> #include "rocket_registers.h"
>
> static int rocket_clk_init(struct rocket_core *core)
> @@ -122,6 +123,10 @@ int rocket_core_init(struct rocket_core *core)
> goto out_pm_domain;
> }
>
> + err = rocket_job_init(core);
> + if (err)
> + goto out_pm_domain;
> +
> version = rocket_read(core, REG_PC_VERSION) + (rocket_read(core, REG_PC_VERSION_NUM) & 0xffff);
> dev_info(rdev->dev, "Rockchip NPU core %d version: %d\n", core->index, version);
>
> @@ -134,6 +139,7 @@ int rocket_core_init(struct rocket_core *core)
>
> void rocket_core_fini(struct rocket_core *core)
> {
> + rocket_job_fini(core);
> rocket_pmdomain_fini(core);
> }
>
> diff --git a/drivers/accel/rocket/rocket_core.h b/drivers/accel/rocket/rocket_core.h
> index e5d4c848c9f4..e6401960a9b2 100644
> --- a/drivers/accel/rocket/rocket_core.h
> +++ b/drivers/accel/rocket/rocket_core.h
> @@ -8,6 +8,8 @@
> #include <asm/io.h>
> #include <asm-generic/io.h>
>
> +#include <drm/gpu_scheduler.h>
What about includes for workqueue or atomic?
> +static struct dma_fence *rocket_job_run(struct drm_sched_job *sched_job)
> +{
> + struct rocket_job *job = to_rocket_job(sched_job);
> + struct rocket_device *rdev = job->rdev;
> + struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
> + struct dma_fence *fence = NULL;
> + int ret;
> +
> + if (unlikely(job->base.s_fence->finished.error))
> + return NULL;
> +
> + /* Nothing to execute: can happen if the job has finished while
> + * we were resetting the GPU.
> + */
Not the correct comment style
> + if (job->next_task_idx == job->task_count)
> + return NULL;
> +
> + fence = rocket_fence_create(core);
> + if (IS_ERR(fence))
> + return fence;
> +
> + if (job->done_fence)
> + dma_fence_put(job->done_fence);
> + job->done_fence = dma_fence_get(fence);
> +
> + ret = pm_runtime_get_sync(rdev->dev);
> + if (ret < 0)
> + return fence;
> +
> + spin_lock(&core->job_lock);
> +
> + core->in_flight_job = job;
> + rocket_job_hw_submit(core, job);
> +
> + spin_unlock(&core->job_lock);
> +
> + return fence;
> +}
> +
> +static void rocket_job_handle_done(struct rocket_core *core,
> + struct rocket_job *job)
> +{
> + if (job->next_task_idx < job->task_count) {
> + rocket_job_hw_submit(core, job);
> + return;
> + }
> +
> + core->in_flight_job = NULL;
> + dma_fence_signal_locked(job->done_fence);
> + pm_runtime_put_autosuspend(core->dev->dev);
> +}
> +
> +static void rocket_job_handle_irq(struct rocket_core *core)
> +{
> + uint32_t status, raw_status;
> +
> + pm_runtime_mark_last_busy(core->dev->dev);
> +
> + status = rocket_read(core, REG_PC_INTERRUPT_STATUS);
> + raw_status = rocket_read(core, REG_PC_INTERRUPT_RAW_STATUS);
> +
> + rocket_write(core, REG_PC_OPERATION_ENABLE, 0x0);
> + rocket_write(core, REG_PC_INTERRUPT_CLEAR, 0x1ffff);
> +
> + spin_lock(&core->job_lock);
> +
> + if (core->in_flight_job)
> + rocket_job_handle_done(core, core->in_flight_job);
> +
> + spin_unlock(&core->job_lock);
> +}
> +
> +static void
> +rocket_reset(struct rocket_core *core, struct drm_sched_job *bad)
> +{
> + struct rocket_device *rdev = core->dev;
> + bool cookie;
> +
> + if (!atomic_read(&core->reset.pending))
> + return;
> +
> + /* Stop the scheduler.
Not the correct comment style
> + *
> + * FIXME: We temporarily get out of the dma_fence_signalling section
> + * because the cleanup path generate lockdep splats when taking locks
> + * to release job resources. We should rework the code to follow this
> + * pattern:
> + *
> + * try_lock
> + * if (locked)
> + * release
> + * else
> + * schedule_work_to_release_later
> + */
> + drm_sched_stop(&core->sched, bad);
> +
> + cookie = dma_fence_begin_signalling();
> +
> + if (bad)
> + drm_sched_increase_karma(bad);
> +
> + /* Mask job interrupts and synchronize to make sure we won't be
> + * interrupted during our reset.
> + */
Not the correct comment style, again. This is the last time I'm going
to mention it.
> diff --git a/drivers/accel/rocket/rocket_job.h b/drivers/accel/rocket/rocket_job.h
> new file mode 100644
> index 000000000000..0c3c90e47d39
> --- /dev/null
> +++ b/drivers/accel/rocket/rocket_job.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/* Copyright 2024 Tomeu Vizoso <tomeu@...euvizoso.net> */
> +
> +#ifndef __ROCKET_JOB_H__
> +#define __ROCKET_JOB_H__
> +
> +#include <drm/gpu_scheduler.h>
> +#include <drm/drm_drv.h>
Alphabetical order
> diff --git a/include/uapi/drm/rocket_accel.h b/include/uapi/drm/rocket_accel.h
> index 8338726a83c3..888c9413e4cd 100644
> --- a/include/uapi/drm/rocket_accel.h
> +++ b/include/uapi/drm/rocket_accel.h
> @@ -12,8 +12,10 @@ extern "C" {
> #endif
>
> #define DRM_ROCKET_CREATE_BO 0x00
> +#define DRM_ROCKET_SUBMIT 0x01
>
> #define DRM_IOCTL_ROCKET_CREATE_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_ROCKET_CREATE_BO, struct drm_rocket_create_bo)
> +#define DRM_IOCTL_ROCKET_SUBMIT DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_SUBMIT, struct drm_rocket_submit)
>
> /**
> * struct drm_rocket_create_bo - ioctl argument for creating Rocket BOs.
> @@ -36,6 +38,59 @@ struct drm_rocket_create_bo {
> __u64 offset;
> };
>
> +/**
> + * struct drm_rocket_task - A task to be run on the NPU
> + *
> + * A task is the smallest unit of work that can be run on the NPU.
> + */
> +struct drm_rocket_task {
> + /** DMA address to NPU mapping of register command buffer */
> + __u64 regcmd;
> +
> + /** Number of commands in the register command buffer */
> + __u32 regcmd_count;
> +};
> +
> +/**
> + * struct drm_rocket_job - A job to be run on the NPU
> + *
> + * The kernel will schedule the execution of this job taking into account its
> + * dependencies with other jobs. All tasks in the same job will be executed
> + * sequentially on the same core, to benefit from memory residency in SRAM.
> + */
> +struct drm_rocket_job {
> + /** Pointer to an array of struct drm_rocket_task. */
> + __u64 tasks;
> +
> + /** Number of tasks passed in. */
> + __u32 task_count;
> +
> + /** Pointer to a u32 array of the BOs that are read by the job. */
> + __u64 in_bo_handles;
> +
> + /** Number of input BO handles passed in (size is that times 4). */
> + __u32 in_bo_handle_count;
> +
> + /** Pointer to a u32 array of the BOs that are written to by the job. */
> + __u64 out_bo_handles;
> +
> + /** Number of output BO handles passed in (size is that times 4). */
> + __u32 out_bo_handle_count;
> +};
I feel like the mixing of 32-bit and 64-bit fields violates the
guidelines on defining ioctls, due to implicit padding that might or
might not be present.
> +
> +/**
> + * struct drm_rocket_submit - ioctl argument for submitting commands to the NPU.
> + *
> + * The kernel will schedule the execution of these jobs in dependency order.
> + */
> +struct drm_rocket_submit {
> + /** Pointer to an array of struct drm_rocket_job. */
> + __u64 jobs;
> +
> + /** Number of jobs passed in. */
> + __u32 job_count;
> +};
> +
> #if defined(__cplusplus)
> }
> #endif
>
Powered by blists - more mailing lists