Message-ID: <aO/4cQ8+eLnwqFSh@lizhi-Precision-Tower-5810>
Date: Wed, 15 Oct 2025 15:39:29 -0400
From: Frank Li <Frank.li@....com>
To: "Rob Herring (Arm)" <robh@...nel.org>
Cc: Tomeu Vizoso <tomeu@...euvizoso.net>,
Krzysztof Kozlowski <krzk+dt@...nel.org>,
Conor Dooley <conor+dt@...nel.org>,
Oded Gabbay <ogabbay@...nel.org>,
Maarten Lankhorst <maarten.lankhorst@...ux.intel.com>,
Maxime Ripard <mripard@...nel.org>,
Thomas Zimmermann <tzimmermann@...e.de>,
David Airlie <airlied@...il.com>, Simona Vetter <simona@...ll.ch>,
Sumit Semwal <sumit.semwal@...aro.org>,
Christian König <christian.koenig@....com>,
Robin Murphy <robin.murphy@....com>,
Steven Price <steven.price@....com>,
Daniel Stone <daniel@...ishbar.org>,
Sui Jingfeng <sui.jingfeng@...ux.dev>, devicetree@...r.kernel.org,
linux-kernel@...r.kernel.org, dri-devel@...ts.freedesktop.org,
linux-media@...r.kernel.org, linaro-mm-sig@...ts.linaro.org
Subject: Re: [PATCH v4 2/2] accel: Add Arm Ethos-U NPU driver
On Wed, Oct 15, 2025 at 12:47:40PM -0500, Rob Herring (Arm) wrote:
> Add a driver for Arm Ethos-U65/U85 NPUs. The Ethos-U NPU has a
> relatively simple interface with single command stream to describe
> buffers, operation settings, and network operations. It supports up to 8
> memory regions (though no h/w bounds on a region). The Ethos NPUs
> are designed to use an SRAM for scratch memory. Region 2 is reserved
> for SRAM (like the downstream driver stack and compiler). Userspace
> doesn't need access to the SRAM.
>
> The h/w has no MMU nor external IOMMU and is a DMA engine which can
> read and write anywhere in memory without h/w bounds checks. The user
> submitted command streams must be validated against the bounds of the
> GEM BOs. This is similar to the VC4 design which validates shaders.
>
> The job submit is based on the rocket driver for the Rockchip NPU
> utilizing the GPU scheduler. It is simpler as there's only 1 core rather
> than 3.
>
> Tested on i.MX93 platform (U65) and FVP (U85) with WIP Mesa Teflon
> support.
>
> Acked-by: Thomas Zimmermann <tzimmermann@...e.de>
> Signed-off-by: Rob Herring (Arm) <robh@...nel.org>
> ---
How can this driver be tested?
> v4:
> - Use bulk clk API
> - Various whitespace fixes mostly due to ethos->ethosu rename
> - Drop error check on dma_set_mask_and_coherent()
> - Drop unnecessary pm_runtime_mark_last_busy() call
> - Move variable declarations out of switch (a riscv/clang build failure)
> - Use lowercase hex in all defines
> - Drop unused ethosu_device.coherent member
> - Add comments on all locks
>
...
> diff --git a/drivers/accel/ethosu/ethosu_device.h b/drivers/accel/ethosu/ethosu_device.h
> new file mode 100644
> index 000000000000..69d610c5c2d7
> --- /dev/null
> +++ b/drivers/accel/ethosu/ethosu_device.h
> @@ -0,0 +1,190 @@
> +/* SPDX-License-Identifier: GPL-2.0-only or MIT */
> +/* Copyright 2025 Arm, Ltd. */
> +
> +#ifndef __ETHOSU_DEVICE_H__
> +#define __ETHOSU_DEVICE_H__
> +
> +#include <linux/types.h>
> +
> +#include <drm/drm_device.h>
> +#include <drm/gpu_scheduler.h>
> +
> +#include <drm/ethosu_accel.h>
> +
> +struct clk;
> +struct gen_pool;
Shouldn't this include <linux/clk.h> instead of forward-declaring the struct?
...
> +
> +static int ethosu_open(struct drm_device *ddev, struct drm_file *file)
> +{
> + int ret = 0;
> + struct ethosu_file_priv *priv;
> +
> + if (!try_module_get(THIS_MODULE))
> + return -EINVAL;
> +
> + priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> + if (!priv) {
> + ret = -ENOMEM;
> + goto err_put_mod;
> + }
> + priv->edev = to_ethosu_device(ddev);
> +
> + ret = ethosu_job_open(priv);
> + if (ret)
> + goto err_free;
> +
> + file->driver_priv = priv;
This could be simplified with the scope-based cleanup helpers:

	struct ethosu_file_priv *priv __free(kfree) = NULL;
	...
	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	...
	file->driver_priv = no_free_ptr(priv);
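For illustration, the whole function would then look roughly like this (a
sketch; needs #include <linux/cleanup.h>, and the module_put() on the
error paths still has to stay):

	static int ethosu_open(struct drm_device *ddev, struct drm_file *file)
	{
		struct ethosu_file_priv *priv __free(kfree) = NULL;
		int ret;

		if (!try_module_get(THIS_MODULE))
			return -EINVAL;

		priv = kzalloc(sizeof(*priv), GFP_KERNEL);
		if (!priv) {
			module_put(THIS_MODULE);
			return -ENOMEM;
		}
		priv->edev = to_ethosu_device(ddev);

		ret = ethosu_job_open(priv);
		if (ret) {
			module_put(THIS_MODULE);
			return ret;	/* priv is freed automatically here */
		}

		/* Hand ownership over so the cleanup does not free it. */
		file->driver_priv = no_free_ptr(priv);
		return 0;
	}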
> + return 0;
> +
> +err_free:
> + kfree(priv);
> +err_put_mod:
> + module_put(THIS_MODULE);
> + return ret;
> +}
> +
...
> +
> +
> +static int ethosu_init(struct ethosu_device *ethosudev)
> +{
> + int ret;
> + u32 id, config;
> +
> + ret = devm_pm_runtime_enable(ethosudev->base.dev);
> + if (ret)
> + return ret;
> +
> + ret = pm_runtime_resume_and_get(ethosudev->base.dev);
> + if (ret)
> + return ret;
> +
> + pm_runtime_set_autosuspend_delay(ethosudev->base.dev, 50);
> + pm_runtime_use_autosuspend(ethosudev->base.dev);
> +
> + /* If PM is disabled, we need to call ethosu_device_resume() manually. */
> + if (!IS_ENABLED(CONFIG_PM)) {
> + ret = ethosu_device_resume(ethosudev->base.dev);
> + if (ret)
> + return ret;
> + }
I think ethosu_device_resume() should be called unconditionally, before
devm_pm_runtime_enable():

	ethosu_device_resume();
	pm_runtime_set_active();
	pm_runtime_set_autosuspend_delay(ethosudev->base.dev, 50);
	devm_pm_runtime_enable();
	...
	return 0;
	/* no pm_runtime_autosuspend() here; probe() already calls
	   pm_runtime_put_autosuspend() */

Without CONFIG_PM, ethosu_device_resume() then enables the clocks
unconditionally. With CONFIG_PM, pm_runtime_autosuspend() will turn the
clocks off later.
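A sketch of the reordered sequence (adding a pm_runtime_get_noresume() so
the pm_runtime_put_autosuspend() in probe() stays balanced; assumes
ethosu_device_resume() is safe to call before runtime PM is enabled):

	static int ethosu_init(struct ethosu_device *ethosudev)
	{
		struct device *dev = ethosudev->base.dev;
		int ret;

		/* Power on unconditionally; works with or without CONFIG_PM. */
		ret = ethosu_device_resume(dev);
		if (ret)
			return ret;

		/*
		 * Tell runtime PM the device is already active and hold a
		 * usage count until probe() drops it.
		 */
		pm_runtime_set_active(dev);
		pm_runtime_get_noresume(dev);
		pm_runtime_set_autosuspend_delay(dev, 50);
		pm_runtime_use_autosuspend(dev);

		ret = devm_pm_runtime_enable(dev);
		if (ret)
			return ret;

		/* ... register reads, ethosu_sram_init(), dev_info() as before ... */

		return 0;
	}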
> +
> + ethosudev->npu_info.id = id = readl_relaxed(ethosudev->regs + NPU_REG_ID);
> + ethosudev->npu_info.config = config = readl_relaxed(ethosudev->regs + NPU_REG_CONFIG);
> +
> + ethosu_sram_init(ethosudev);
> +
> + dev_info(ethosudev->base.dev,
> + "Ethos-U NPU, arch v%ld.%ld.%ld, rev r%ldp%ld, cmd stream ver%ld, %d MACs, %dKB SRAM\n",
> + FIELD_GET(ID_ARCH_MAJOR_MASK, id),
> + FIELD_GET(ID_ARCH_MINOR_MASK, id),
> + FIELD_GET(ID_ARCH_PATCH_MASK, id),
> + FIELD_GET(ID_VER_MAJOR_MASK, id),
> + FIELD_GET(ID_VER_MINOR_MASK, id),
> + FIELD_GET(CONFIG_CMD_STREAM_VER_MASK, config),
> + 1 << FIELD_GET(CONFIG_MACS_PER_CC_MASK, config),
> + ethosudev->npu_info.sram_size / 1024);
> +
> + return 0;
> +}
> +
> +static int ethosu_probe(struct platform_device *pdev)
> +{
> + int ret;
> + struct ethosu_device *ethosudev;
> +
> + ethosudev = devm_drm_dev_alloc(&pdev->dev, ðosu_drm_driver,
> + struct ethosu_device, base);
> + if (IS_ERR(ethosudev))
> + return -ENOMEM;
> + platform_set_drvdata(pdev, ethosudev);
> +
> + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40));
> +
> + ethosudev->regs = devm_platform_ioremap_resource(pdev, 0);
> +
> + ethosudev->num_clks = devm_clk_bulk_get_all(&pdev->dev, ðosudev->clks);
> + if (ethosudev->num_clks < 0)
> + return ethosudev->num_clks;
> +
> + ret = ethosu_job_init(ethosudev);
> + if (ret)
> + return ret;
> +
> + ret = ethosu_init(ethosudev);
> + if (ret)
> + return ret;
> +
> + ret = drm_dev_register(ðosudev->base, 0);
> + if (ret)
> + pm_runtime_dont_use_autosuspend(ethosudev->base.dev);
> +
> + pm_runtime_put_autosuspend(ethosudev->base.dev);
> + return ret;
> +}
> +
...
> +
> +static int ethosu_gem_cmdstream_copy_and_validate(struct drm_device *ddev,
> + u32 __user *ucmds,
> + struct ethosu_gem_object *bo,
> + u32 size)
> +{
> + struct ethosu_validated_cmdstream_info *info;
> + u32 *bocmds = bo->base.vaddr;
> + struct cmd_state st;
> + int i, ret;
> +
> + cmd_state_init(&st);
> +
> + info = kzalloc(sizeof(*info), GFP_KERNEL);
> + if (!info)
> + return -ENOMEM;
Using the cleanup helpers here would simplify the code a lot:

	struct ethosu_validated_cmdstream_info *info __free(kfree) =
		kzalloc(sizeof(*info), GFP_KERNEL);
> + info->cmd_size = size;
> +
> + for (i = 0; i < size / 4; i++) {
> + bool use_ifm, use_ifm2, use_scale;
> + u64 dstlen, srclen;
> + u16 cmd, param;
> + u32 cmds[2];
> + u64 addr;
> +
> + if (get_user(cmds[0], ucmds++)) {
> + ret = -EFAULT;
> + goto fault;
and then each of these "goto fault" paths can simply become "return ret;".
> + }
> + bocmds[i] = cmds[0];
> +
> + cmd = cmds[0];
> + param = cmds[0] >> 16;
> +
> + if (cmd & 0x4000) {
> + if (get_user(cmds[1], ucmds++)) {
> + ret = -EFAULT;
> + goto fault;
> + }
> + i++;
> + bocmds[i] = cmds[1];
> + addr = cmd_to_addr(cmds);
> + }
> +
> + switch (cmd) {
> + case NPU_OP_DMA_START:
> + srclen = dma_length(info, &st.dma, &st.dma.src);
> + dstlen = dma_length(info, &st.dma, &st.dma.dst);
> +
> + if (st.dma.dst.region >= 0)
> + info->output_region[st.dma.dst.region] = true;
> + dev_dbg(ddev->dev, "cmd: DMA SRC:%d:0x%llx+0x%llx DST:%d:0x%llx+0x%llx\n",
> + st.dma.src.region, st.dma.src.offset, srclen,
> + st.dma.dst.region, st.dma.dst.offset, dstlen);
> + break;
> + case NPU_OP_CONV:
> + case NPU_OP_DEPTHWISE:
> + use_ifm2 = param & 0x1; // weights_ifm2
> + use_scale = !(st.ofm.precision & 0x100);
> + ret = calc_sizes(ddev, info, cmd, &st, true, use_ifm2,
> + !use_ifm2, use_scale);
> + if (ret)
> + goto fault;
> + break;
> + case NPU_OP_POOL:
> + use_ifm = param != 0x4; // pooling mode
> + use_scale = !(st.ofm.precision & 0x100);
> + ret = calc_sizes(ddev, info, cmd, &st, use_ifm, false,
> + false, use_scale);
> + if (ret)
> + goto fault;
> + break;
> + case NPU_OP_ELEMENTWISE:
> + use_ifm2 = !((st.ifm2.broadcast == 8) || (param == 5) ||
> + (param == 6) || (param == 7) || (param == 0x24));
> + use_ifm = st.ifm.broadcast != 8;
> + ret = calc_sizes_elemwise(ddev, info, cmd, &st, use_ifm, use_ifm2);
> + if (ret)
> + goto fault;
> + break;
> + case NPU_OP_RESIZE: // U85 only
> + WARN_ON(1); // TODO
> + break;
> + case NPU_SET_KERNEL_WIDTH_M1:
> + st.ifm.width = param;
> + break;
> + case NPU_SET_KERNEL_HEIGHT_M1:
> + st.ifm.height[2] = param;
> + break;
> + case NPU_SET_KERNEL_STRIDE:
> + st.ifm.stride_kernel = param;
> + break;
> + case NPU_SET_IFM_PAD_TOP:
> + st.ifm.pad_top = param & 0x7f;
> + break;
> + case NPU_SET_IFM_PAD_LEFT:
> + st.ifm.pad_left = param & 0x7f;
> + break;
> + case NPU_SET_IFM_PAD_RIGHT:
> + st.ifm.pad_right = param & 0xff;
> + break;
> + case NPU_SET_IFM_PAD_BOTTOM:
> + st.ifm.pad_bottom = param & 0xff;
> + break;
> + case NPU_SET_IFM_DEPTH_M1:
> + st.ifm.depth = param;
> + break;
> + case NPU_SET_IFM_PRECISION:
> + st.ifm.precision = param;
> + break;
> + case NPU_SET_IFM_BROADCAST:
> + st.ifm.broadcast = param;
> + break;
> + case NPU_SET_IFM_REGION:
> + st.ifm.region = param & 0x7f;
> + break;
> + case NPU_SET_IFM_WIDTH0_M1:
> + st.ifm.width0 = param;
> + break;
> + case NPU_SET_IFM_HEIGHT0_M1:
> + st.ifm.height[0] = param;
> + break;
> + case NPU_SET_IFM_HEIGHT1_M1:
> + st.ifm.height[1] = param;
> + break;
> + case NPU_SET_IFM_BASE0:
> + case NPU_SET_IFM_BASE1:
> + case NPU_SET_IFM_BASE2:
> + case NPU_SET_IFM_BASE3:
> + st.ifm.base[cmd & 0x3] = addr;
> + break;
> + case NPU_SET_IFM_STRIDE_X:
> + st.ifm.stride_x = addr;
> + break;
> + case NPU_SET_IFM_STRIDE_Y:
> + st.ifm.stride_y = addr;
> + break;
> + case NPU_SET_IFM_STRIDE_C:
> + st.ifm.stride_c = addr;
> + break;
> +
> + case NPU_SET_OFM_WIDTH_M1:
> + st.ofm.width = param;
> + break;
> + case NPU_SET_OFM_HEIGHT_M1:
> + st.ofm.height[2] = param;
> + break;
> + case NPU_SET_OFM_DEPTH_M1:
> + st.ofm.depth = param;
> + break;
> + case NPU_SET_OFM_PRECISION:
> + st.ofm.precision = param;
> + break;
> + case NPU_SET_OFM_REGION:
> + st.ofm.region = param & 0x7;
> + break;
> + case NPU_SET_OFM_WIDTH0_M1:
> + st.ofm.width0 = param;
> + break;
> + case NPU_SET_OFM_HEIGHT0_M1:
> + st.ofm.height[0] = param;
> + break;
> + case NPU_SET_OFM_HEIGHT1_M1:
> + st.ofm.height[1] = param;
> + break;
> + case NPU_SET_OFM_BASE0:
> + case NPU_SET_OFM_BASE1:
> + case NPU_SET_OFM_BASE2:
> + case NPU_SET_OFM_BASE3:
> + st.ofm.base[cmd & 0x3] = addr;
> + break;
> + case NPU_SET_OFM_STRIDE_X:
> + st.ofm.stride_x = addr;
> + break;
> + case NPU_SET_OFM_STRIDE_Y:
> + st.ofm.stride_y = addr;
> + break;
> + case NPU_SET_OFM_STRIDE_C:
> + st.ofm.stride_c = addr;
> + break;
> +
> + case NPU_SET_IFM2_BROADCAST:
> + st.ifm2.broadcast = param;
> + break;
> + case NPU_SET_IFM2_PRECISION:
> + st.ifm2.precision = param;
> + break;
> + case NPU_SET_IFM2_REGION:
> + st.ifm2.region = param & 0x7;
> + break;
> + case NPU_SET_IFM2_WIDTH0_M1:
> + st.ifm2.width0 = param;
> + break;
> + case NPU_SET_IFM2_HEIGHT0_M1:
> + st.ifm2.height[0] = param;
> + break;
> + case NPU_SET_IFM2_HEIGHT1_M1:
> + st.ifm2.height[1] = param;
> + break;
> + case NPU_SET_IFM2_BASE0:
> + case NPU_SET_IFM2_BASE1:
> + case NPU_SET_IFM2_BASE2:
> + case NPU_SET_IFM2_BASE3:
> + st.ifm2.base[cmd & 0x3] = addr;
> + break;
> + case NPU_SET_IFM2_STRIDE_X:
> + st.ifm2.stride_x = addr;
> + break;
> + case NPU_SET_IFM2_STRIDE_Y:
> + st.ifm2.stride_y = addr;
> + break;
> + case NPU_SET_IFM2_STRIDE_C:
> + st.ifm2.stride_c = addr;
> + break;
> +
> + case NPU_SET_WEIGHT_REGION:
> + st.weight[0].region = param & 0x7;
> + break;
> + case NPU_SET_SCALE_REGION:
> + st.scale[0].region = param & 0x7;
> + break;
> + case NPU_SET_WEIGHT_BASE:
> + st.weight[0].base = addr;
> + break;
> + case NPU_SET_WEIGHT_LENGTH:
> + st.weight[0].length = cmds[1];
> + break;
> + case NPU_SET_SCALE_BASE:
> + st.scale[0].base = addr;
> + break;
> + case NPU_SET_SCALE_LENGTH:
> + st.scale[0].length = cmds[1];
> + break;
> + case NPU_SET_WEIGHT1_BASE:
> + st.weight[1].base = addr;
> + break;
> + case NPU_SET_WEIGHT1_LENGTH:
> + st.weight[1].length = cmds[1];
> + break;
> + case NPU_SET_SCALE1_BASE: // NPU_SET_WEIGHT2_BASE (U85)
> + if (0 /*U85*/)
> + st.weight[2].base = addr;
> + else
> + st.scale[1].base = addr;
> + break;
> + case NPU_SET_SCALE1_LENGTH: // NPU_SET_WEIGHT2_LENGTH (U85)
> + if (0 /*U85*/)
> + st.weight[1].length = cmds[1];
> + else
> + st.scale[1].length = cmds[1];
> + break;
> + case NPU_SET_WEIGHT3_BASE:
> + st.weight[3].base = addr;
> + break;
> + case NPU_SET_WEIGHT3_LENGTH:
> + st.weight[3].length = cmds[1];
> + break;
> +
> + case NPU_SET_DMA0_SRC_REGION:
> + if (param & 0x100)
> + st.dma.src.region = -1;
> + else
> + st.dma.src.region = param & 0x7;
> + st.dma.mode = (param >> 9) & 0x3;
> + break;
> + case NPU_SET_DMA0_DST_REGION:
> + if (param & 0x100)
> + st.dma.dst.region = -1;
> + else
> + st.dma.dst.region = param & 0x7;
> + break;
> + case NPU_SET_DMA0_SIZE0:
> + st.dma.size0 = param;
> + break;
> + case NPU_SET_DMA0_SIZE1:
> + st.dma.size1 = param;
> + break;
> + case NPU_SET_DMA0_SRC_STRIDE0:
> + st.dma.src.stride[0] = ((s64)addr << 24) >> 24;
> + break;
> + case NPU_SET_DMA0_SRC_STRIDE1:
> + st.dma.src.stride[1] = ((s64)addr << 24) >> 24;
> + break;
> + case NPU_SET_DMA0_DST_STRIDE0:
> + st.dma.dst.stride[0] = ((s64)addr << 24) >> 24;
> + break;
> + case NPU_SET_DMA0_DST_STRIDE1:
> + st.dma.dst.stride[1] = ((s64)addr << 24) >> 24;
> + break;
> + case NPU_SET_DMA0_SRC:
> + st.dma.src.offset = addr;
> + break;
> + case NPU_SET_DMA0_DST:
> + st.dma.dst.offset = addr;
> + break;
> + case NPU_SET_DMA0_LEN:
> + st.dma.src.len = st.dma.dst.len = addr;
> + break;
> + default:
> + break;
> + }
> + }
> +
> + for (i = 0; i < NPU_BASEP_REGION_MAX; i++) {
> + if (!info->region_size[i])
> + continue;
> + dev_dbg(ddev->dev, "region %d max size: 0x%llx\n",
> + i, info->region_size[i]);
> + }
> +
> + bo->info = info;
bo->info = no_free_ptr(info);
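Putting the three pieces together, a sketch of how the allocation, the
error paths, and the success path would end up:

	struct ethosu_validated_cmdstream_info *info __free(kfree) =
		kzalloc(sizeof(*info), GFP_KERNEL);

	if (!info)
		return -ENOMEM;
	info->cmd_size = size;

	for (i = 0; i < size / 4; i++) {
		if (get_user(cmds[0], ucmds++))
			return -EFAULT;	/* info is freed automatically */
		/* ... rest of the validation loop, goto fault -> return ... */
	}

	bo->info = no_free_ptr(info);	/* hand ownership to the BO */
	return 0;
	/* the fault: label and its kfree(info) go away entirely */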
> + return 0;
> +
> +fault:
> + kfree(info);
> + return ret;
> +}
...
> +
> +static const struct drm_sched_backend_ops ethosu_sched_ops = {
> + .run_job = ethosu_job_run,
> + .timedout_job = ethosu_job_timedout,
> + .free_job = ethosu_job_free
> +};
> +
> +int ethosu_job_init(struct ethosu_device *dev)
> +{
> + struct drm_sched_init_args args = {
> + .ops = ðosu_sched_ops,
> + .num_rqs = DRM_SCHED_PRIORITY_COUNT,
> + .credit_limit = 1,
> + .timeout = msecs_to_jiffies(JOB_TIMEOUT_MS),
> + .name = dev_name(dev->base.dev),
> + .dev = dev->base.dev,
> + };
> + int ret;
> +
> + spin_lock_init(&dev->fence_lock);
> + mutex_init(&dev->job_lock);
> + mutex_init(&dev->sched_lock);
Use devm_mutex_init() here; note it can fail, so check the return value.
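i.e. something like:

	ret = devm_mutex_init(dev->base.dev, &dev->job_lock);
	if (ret)
		return ret;

	ret = devm_mutex_init(dev->base.dev, &dev->sched_lock);
	if (ret)
		return ret;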
> +
> + dev->irq = platform_get_irq(to_platform_device(dev->base.dev), 0);
> + if (dev->irq < 0)
> + return dev->irq;
> +
> + ret = devm_request_threaded_irq(dev->base.dev, dev->irq,
> + ethosu_job_irq_handler,
> + ethosu_job_irq_handler_thread,
> + IRQF_SHARED, KBUILD_MODNAME,
> + dev);
> + if (ret) {
> + dev_err(dev->base.dev, "failed to request irq");
> + return ret;
> + }
> +
> + dev->fence_context = dma_fence_context_alloc(1);
> +
> + ret = drm_sched_init(&dev->sched, &args);
> + if (ret) {
> + dev_err(dev->base.dev, "Failed to create scheduler: %d.", ret);
> + goto err_sched;
> + }
> +
> + return 0;
> +
> +err_sched:
> + drm_sched_fini(&dev->sched);
> + return ret;
> +}
> +
> +void ethosu_job_fini(struct ethosu_device *dev)
> +{
> + drm_sched_fini(&dev->sched);
> +}
> +
> +int ethosu_job_open(struct ethosu_file_priv *ethosu_priv)
> +{
> + struct ethosu_device *dev = ethosu_priv->edev;
> + struct drm_gpu_scheduler *sched = &dev->sched;
> + int ret;
> +
> + ret = drm_sched_entity_init(ðosu_priv->sched_entity,
> + DRM_SCHED_PRIORITY_NORMAL,
> + &sched, 1, NULL);
> + return WARN_ON(ret);
> +}
> +
> +void ethosu_job_close(struct ethosu_file_priv *ethosu_priv)
> +{
> + struct drm_sched_entity *entity = ðosu_priv->sched_entity;
> +
> + drm_sched_entity_destroy(entity);
> +}
> +
> +int ethosu_job_is_idle(struct ethosu_device *dev)
> +{
> + /* If there are any jobs in this HW queue, we're not idle */
> + if (atomic_read(&dev->sched.credit_count))
> + return false;
> +
> + return true;
> +}
> +
> +static int ethosu_ioctl_submit_job(struct drm_device *dev, struct drm_file *file,
> + struct drm_ethosu_job *job)
> +{
> + struct ethosu_device *edev = to_ethosu_device(dev);
> + struct ethosu_file_priv *file_priv = file->driver_priv;
> + struct ethosu_job *ejob = NULL;
> + struct ethosu_validated_cmdstream_info *cmd_info;
> + int ret = 0;
> +
> + /* BO region 2 is reserved if SRAM is used */
> + if (job->region_bo_handles[ETHOSU_SRAM_REGION] && job->sram_size)
> + return -EINVAL;
> +
> + if (edev->npu_info.sram_size < job->sram_size)
> + return -EINVAL;
> +
> + ejob = kzalloc(sizeof(*ejob), GFP_KERNEL);
> + if (!ejob)
> + return -ENOMEM;
> +
> + kref_init(&ejob->refcount);
> +
> + ejob->dev = edev;
> + ejob->sram_size = job->sram_size;
> +
> + ret = drm_sched_job_init(&ejob->base,
> + &file_priv->sched_entity,
> + 1, NULL, file->client_id);
> + if (ret)
> + goto out_put_job;
> +
> + ejob->cmd_bo = drm_gem_object_lookup(file, job->cmd_bo);
> + if (!ejob->cmd_bo) {
> + ret = -ENOENT;
> + goto out_cleanup_job;
> + }
> + cmd_info = to_ethosu_bo(ejob->cmd_bo)->info;
> + if (!cmd_info) {
> + ret = -EINVAL;
> + goto out_cleanup_job;
> + }
> +
> + for (int i = 0; i < NPU_BASEP_REGION_MAX; i++) {
> + struct drm_gem_object *gem;
> +
> + /* Can only omit a BO handle if the region is not used or used for SRAM */
> + if (!job->region_bo_handles[i] &&
> + (!cmd_info->region_size[i] || (i == ETHOSU_SRAM_REGION && job->sram_size)))
> + continue;
> +
> + if (job->region_bo_handles[i] && !cmd_info->region_size[i]) {
> + dev_err(dev->dev,
> + "Cmdstream BO handle %d set for unused region %d\n",
> + job->region_bo_handles[i], i);
> + ret = -EINVAL;
> + goto out_cleanup_job;
> + }
> +
> + gem = drm_gem_object_lookup(file, job->region_bo_handles[i]);
> + if (!gem) {
> + dev_err(dev->dev,
> + "Invalid BO handle %d for region %d\n",
> + job->region_bo_handles[i], i);
> + ret = -ENOENT;
> + goto out_cleanup_job;
> + }
> +
> + ejob->region_bo[ejob->region_cnt] = gem;
> + ejob->region_bo_num[ejob->region_cnt] = i;
> + ejob->region_cnt++;
> +
> + if (to_ethosu_bo(gem)->info) {
> + dev_err(dev->dev,
> + "Cmdstream BO handle %d used for region %d\n",
> + job->region_bo_handles[i], i);
> + ret = -EINVAL;
> + goto out_cleanup_job;
> + }
> +
> + /* Verify the command stream doesn't have accesses outside the BO */
> + if (cmd_info->region_size[i] > gem->size) {
> + dev_err(dev->dev,
> + "cmd stream region %d size greater than BO size (%llu > %zu)\n",
> + i, cmd_info->region_size[i], gem->size);
> + ret = -EOVERFLOW;
> + goto out_cleanup_job;
> + }
> + }
> + ret = ethosu_job_push(ejob);
> +
> +out_cleanup_job:
> + if (ret)
> + drm_sched_job_cleanup(&ejob->base);
> +out_put_job:
> + ethosu_job_put(ejob);
> +
> + return ret;
> +}
> +
> +int ethosu_ioctl_submit(struct drm_device *dev, void *data, struct drm_file *file)
> +{
> + struct drm_ethosu_submit *args = data;
> + struct drm_ethosu_job *jobs;
> + int ret = 0;
> + unsigned int i = 0;
> +
> + if (args->pad) {
> + drm_dbg(dev, "Reserved field in drm_ethosu_submit struct should be 0.\n");
> + return -EINVAL;
> + }
> +
> + jobs = kvmalloc_array(args->job_count, sizeof(*jobs), GFP_KERNEL);
Using __free() here as well would remove all the gotos; see the sketch
below.
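Since this is kvmalloc_array(), the pairing would be __free(kvfree) (note
the existing code frees this allocation with kfree(); kvfree() is the
matching free). A sketch:

	struct drm_ethosu_job *jobs __free(kvfree) =
		kvmalloc_array(args->job_count, sizeof(*jobs), GFP_KERNEL);

	if (!jobs)
		return -ENOMEM;

	if (copy_from_user(jobs, (void __user *)(uintptr_t)args->jobs,
			   args->job_count * sizeof(*jobs))) {
		drm_dbg(dev, "Failed to copy incoming job array\n");
		return -EFAULT;	/* jobs is kvfree()d automatically */
	}

	for (i = 0; i < args->job_count; i++) {
		ret = ethosu_ioctl_submit_job(dev, file, &jobs[i]);
		if (ret)
			return ret;
	}

	return 0;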
Frank
> + if (!jobs)
> + return -ENOMEM;
> +
> + if (copy_from_user(jobs,
> + (void __user *)(uintptr_t)args->jobs,
> + args->job_count * sizeof(*jobs))) {
> + ret = -EFAULT;
> + drm_dbg(dev, "Failed to copy incoming job array\n");
> + goto exit;
> + }
> +
> + for (i = 0; i < args->job_count; i++) {
> + ret = ethosu_ioctl_submit_job(dev, file, &jobs[i]);
> + if (ret)
> + break;
> + }
> +
> +exit:
> + kfree(jobs);
> +
> + return ret;
> +}
> diff --git a/drivers/accel/ethosu/ethosu_job.h b/drivers/accel/ethosu/ethosu_job.h
> new file mode 100644
> index 000000000000..80358fcbdad7
> --- /dev/null
> +++ b/drivers/accel/ethosu/ethosu_job.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
> +/* Copyright 2024-2025 Tomeu Vizoso <tomeu@...euvizoso.net> */
> +/* Copyright 2025 Arm, Ltd. */
> +
> +#ifndef __ETHOSU_JOB_H__
> +#define __ETHOSU_JOB_H__
> +
> +#include <linux/kref.h>
> +#include <drm/gpu_scheduler.h>
> +
> +struct ethosu_device;
> +struct ethosu_file_priv;
> +
> +struct ethosu_job {
> + struct drm_sched_job base;
> + struct ethosu_device *dev;
> +
> + struct drm_gem_object *cmd_bo;
> + struct drm_gem_object *region_bo[NPU_BASEP_REGION_MAX];
> + u8 region_bo_num[NPU_BASEP_REGION_MAX];
> + u8 region_cnt;
> + u32 sram_size;
> +
> + /* Fence to be signaled by drm-sched once its done with the job */
> + struct dma_fence *inference_done_fence;
> +
> + /* Fence to be signaled by IRQ handler when the job is complete. */
> + struct dma_fence *done_fence;
> +
> + struct kref refcount;
> +};
> +
> +int ethosu_ioctl_submit(struct drm_device *dev, void *data, struct drm_file *file);
> +
> +int ethosu_job_init(struct ethosu_device *dev);
> +void ethosu_job_fini(struct ethosu_device *dev);
> +int ethosu_job_open(struct ethosu_file_priv *ethosu_priv);
> +void ethosu_job_close(struct ethosu_file_priv *ethosu_priv);
> +int ethosu_job_is_idle(struct ethosu_device *dev);
> +
> +#endif
> diff --git a/include/uapi/drm/ethosu_accel.h b/include/uapi/drm/ethosu_accel.h
> new file mode 100644
> index 000000000000..af78bb4686d7
> --- /dev/null
> +++ b/include/uapi/drm/ethosu_accel.h
> @@ -0,0 +1,261 @@
> +/* SPDX-License-Identifier: MIT */
> +/* Copyright (C) 2025 Arm, Ltd. */
> +#ifndef _ETHOSU_DRM_H_
> +#define _ETHOSU_DRM_H_
> +
> +#include "drm.h"
> +
> +#if defined(__cplusplus)
> +extern "C" {
> +#endif
> +
> +/**
> + * DOC: IOCTL IDs
> + *
> + * enum drm_ethosu_ioctl_id - IOCTL IDs
> + *
> + * Place new ioctls at the end, don't re-order, don't replace or remove entries.
> + *
> + * These IDs are not meant to be used directly. Use the DRM_IOCTL_ETHOSU_xxx
> + * definitions instead.
> + */
> +enum drm_ethosu_ioctl_id {
> + /** @DRM_ETHOSU_DEV_QUERY: Query device information. */
> + DRM_ETHOSU_DEV_QUERY = 0,
> +
> + /** @DRM_ETHOSU_BO_CREATE: Create a buffer object. */
> + DRM_ETHOSU_BO_CREATE,
> +
> + /** @DRM_ETHOSU_BO_WAIT: Wait on a buffer object's fence. */
> + DRM_ETHOSU_BO_WAIT,
> +
> + /**
> + * @DRM_ETHOSU_BO_MMAP_OFFSET: Get the file offset to pass to
> + * mmap to map a GEM object.
> + */
> + DRM_ETHOSU_BO_MMAP_OFFSET,
> +
> + /**
> + * @DRM_ETHOSU_CMDSTREAM_BO_CREATE: Create a command stream buffer
> + * object.
> + */
> + DRM_ETHOSU_CMDSTREAM_BO_CREATE,
> +
> + /** @DRM_ETHOSU_SUBMIT: Submit a job and BOs to run. */
> + DRM_ETHOSU_SUBMIT,
> +};
> +
> +/**
> + * DOC: IOCTL arguments
> + */
> +
> +/**
> + * enum drm_ethosu_dev_query_type - Query type
> + *
> + * Place new types at the end, don't re-order, don't remove or replace.
> + */
> +enum drm_ethosu_dev_query_type {
> + /** @DRM_ETHOSU_DEV_QUERY_NPU_INFO: Query NPU information. */
> + DRM_ETHOSU_DEV_QUERY_NPU_INFO = 0,
> +};
> +
> +/**
> + * struct drm_ethosu_gpu_info - NPU information
> + *
> + * Structure grouping all queryable information relating to the NPU.
> + */
> +struct drm_ethosu_npu_info {
> + /** @id : NPU ID. */
> + __u32 id;
> +#define DRM_ETHOSU_ARCH_MAJOR(x) ((x) >> 28)
> +#define DRM_ETHOSU_ARCH_MINOR(x) (((x) >> 20) & 0xff)
> +#define DRM_ETHOSU_ARCH_PATCH(x) (((x) >> 16) & 0xf)
> +#define DRM_ETHOSU_PRODUCT_MAJOR(x) (((x) >> 12) & 0xf)
> +#define DRM_ETHOSU_VERSION_MAJOR(x) (((x) >> 8) & 0xf)
> +#define DRM_ETHOSU_VERSION_MINOR(x) (((x) >> 4) & 0xff)
> +#define DRM_ETHOSU_VERSION_STATUS(x) ((x) & 0xf)
> +
> + /** @gpu_rev: GPU revision. */
> + __u32 config;
> +
> + __u32 sram_size;
> +};
> +
> +/**
> + * struct drm_ethosu_dev_query - Arguments passed to DRM_ETHOSU_IOCTL_DEV_QUERY
> + */
> +struct drm_ethosu_dev_query {
> + /** @type: the query type (see drm_ethosu_dev_query_type). */
> + __u32 type;
> +
> + /**
> + * @size: size of the type being queried.
> + *
> + * If pointer is NULL, size is updated by the driver to provide the
> + * output structure size. If pointer is not NULL, the driver will
> + * only copy min(size, actual_structure_size) bytes to the pointer,
> + * and update the size accordingly. This allows us to extend query
> + * types without breaking userspace.
> + */
> + __u32 size;
> +
> + /**
> + * @pointer: user pointer to a query type struct.
> + *
> + * Pointer can be NULL, in which case, nothing is copied, but the
> + * actual structure size is returned. If not NULL, it must point to
> + * a location that's large enough to hold size bytes.
> + */
> + __u64 pointer;
> +};
> +
> +/**
> + * enum drm_ethosu_bo_flags - Buffer object flags, passed at creation time.
> + */
> +enum drm_ethosu_bo_flags {
> + /**
> + * @DRM_ETHOSU_BO_NO_MMAP: The buffer object will never be CPU-mapped
> + * in userspace.
> + */
> + DRM_ETHOSU_BO_NO_MMAP = (1 << 0),
> +};
> +
> +/**
> + * struct drm_ethosu_bo_create - Arguments passed to DRM_IOCTL_ETHOSU_BO_CREATE.
> + */
> +struct drm_ethosu_bo_create {
> + /**
> + * @size: Requested size for the object
> + *
> + * The (page-aligned) allocated size for the object will be returned.
> + */
> + __u64 size;
> +
> + /**
> + * @flags: Flags. Must be a combination of drm_ethosu_bo_flags flags.
> + */
> + __u32 flags;
> +
> + /**
> + * @handle: Returned handle for the object.
> + *
> + * Object handles are nonzero.
> + */
> + __u32 handle;
> +};
> +
> +/**
> + * struct drm_ethosu_bo_mmap_offset - Arguments passed to DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET.
> + */
> +struct drm_ethosu_bo_mmap_offset {
> + /** @handle: Handle of the object we want an mmap offset for. */
> + __u32 handle;
> +
> + /** @pad: MBZ. */
> + __u32 pad;
> +
> + /** @offset: The fake offset to use for subsequent mmap calls. */
> + __u64 offset;
> +};
> +
> +/**
> + * struct drm_ethosu_wait_bo - ioctl argument for waiting for
> + * completion of the last DRM_ETHOSU_SUBMIT on a BO.
> + *
> + * This is useful for cases where multiple processes might be
> + * rendering to a BO and you want to wait for all rendering to be
> + * completed.
> + */
> +struct drm_ethosu_bo_wait {
> + __u32 handle;
> + __u32 pad;
> + __s64 timeout_ns; /* absolute */
> +};
> +
> +struct drm_ethosu_cmdstream_bo_create {
> + /* Size of the data argument. */
> + __u32 size;
> +
> + /* Flags, currently must be 0. */
> + __u32 flags;
> +
> + /* Pointer to the data. */
> + __u64 data;
> +
> + /** Returned GEM handle for the BO. */
> + __u32 handle;
> +
> + /* Pad, must be 0. */
> + __u32 pad;
> +};
> +
> +/**
> + * struct drm_ethosu_job - A job to be run on the NPU
> + *
> + * The kernel will schedule the execution of this job taking into account its
> + * dependencies with other jobs. All tasks in the same job will be executed
> + * sequentially on the same core, to benefit from memory residency in SRAM.
> + */
> +struct drm_ethosu_job {
> + /** Input: BO handle for cmdstream. */
> + __u32 cmd_bo;
> +
> + /** Input: Amount of SRAM to use. */
> + __u32 sram_size;
> +
> +#define ETHOSU_MAX_REGIONS 8
> + /** Input: Array of BO handles for each region. */
> + __u32 region_bo_handles[ETHOSU_MAX_REGIONS];
> +};
> +
> +/**
> + * struct drm_ethosu_submit - ioctl argument for submitting commands to the NPU.
> + *
> + * The kernel will schedule the execution of these jobs in dependency order.
> + */
> +struct drm_ethosu_submit {
> + /** Input: Pointer to an array of struct drm_ethosu_job. */
> + __u64 jobs;
> +
> + /** Input: Number of jobs passed in. */
> + __u32 job_count;
> +
> + /** Reserved, must be zero. */
> + __u32 pad;
> +};
> +
> +/**
> + * DRM_IOCTL_ETHOSU() - Build a ethosu IOCTL number
> + * @__access: Access type. Must be R, W or RW.
> + * @__id: One of the DRM_ETHOSU_xxx id.
> + * @__type: Suffix of the type being passed to the IOCTL.
> + *
> + * Don't use this macro directly, use the DRM_IOCTL_ETHOSU_xxx
> + * values instead.
> + *
> + * Return: An IOCTL number to be passed to ioctl() from userspace.
> + */
> +#define DRM_IOCTL_ETHOSU(__access, __id, __type) \
> + DRM_IO ## __access(DRM_COMMAND_BASE + DRM_ETHOSU_ ## __id, \
> + struct drm_ethosu_ ## __type)
> +
> +enum {
> + DRM_IOCTL_ETHOSU_DEV_QUERY =
> + DRM_IOCTL_ETHOSU(WR, DEV_QUERY, dev_query),
> + DRM_IOCTL_ETHOSU_BO_CREATE =
> + DRM_IOCTL_ETHOSU(WR, BO_CREATE, bo_create),
> + DRM_IOCTL_ETHOSU_BO_WAIT =
> + DRM_IOCTL_ETHOSU(WR, BO_WAIT, bo_wait),
> + DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET =
> + DRM_IOCTL_ETHOSU(WR, BO_MMAP_OFFSET, bo_mmap_offset),
> + DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE =
> + DRM_IOCTL_ETHOSU(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create),
> + DRM_IOCTL_ETHOSU_SUBMIT =
> + DRM_IOCTL_ETHOSU(WR, SUBMIT, submit),
> +};
> +
> +#if defined(__cplusplus)
> +}
> +#endif
> +
> +#endif /* _ETHOSU_DRM_H_ */
>
> --
> 2.51.0
>