Message-ID: <20250409173033.2261755-2-nipun.gupta@amd.com>
Date: Wed, 9 Apr 2025 23:00:32 +0530
From: Nipun Gupta <nipun.gupta@....com>
To: <dri-devel@...ts.freedesktop.org>, <devicetree@...r.kernel.org>,
	<linux-kernel@...r.kernel.org>, <krzk+dt@...nel.org>,
	<gregkh@...uxfoundation.org>, <robh@...nel.org>, <conor+dt@...nel.org>,
	<ogabbay@...nel.org>, <maarten.lankhorst@...ux.intel.com>,
	<mripard@...nel.org>, <tzimmermann@...e.de>, <airlied@...il.com>,
	<simona@...ll.ch>, <derek.kiernan@....com>, <dragan.cvetic@....com>,
	<arnd@...db.de>
CC: <praveen.jain@....com>, <harpreet.anand@....com>,
	<nikhil.agarwal@....com>, <srivatsa@...il.mit.edu>, <code@...icks.com>,
	<ptsm@...ux.microsoft.com>, Nipun Gupta <nipun.gupta@....com>
Subject: [PATCH v2 2/3] accel/amdpk: add driver for AMD PKI accelerator

The AMD PKI accelerator driver provides an accel interface for
interacting with the device to offload and accelerate asymmetric crypto
operations.

Signed-off-by: Nipun Gupta <nipun.gupta@....com>
---

Changes RFC->v2:
- moved from misc to accel
- added architecture and compile test dependency in Kconfig
- removed sysfs (and added debugfs in new patch 3/3)
- fixed platform compat
- removed redundant resource index 1 configuration (which was there in
  RFC patch)
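
Illustrative userspace flow for the uapi added by this patch (untested
sketch, not part of the patch itself): it assumes the exported headers
provide the DRM ioctl macros via <drm/drm.h>, that the device registers
as a standard accel node such as /dev/accel/accel0, and it leaves out
the hardware-specific descriptor format a real client would write into
the RQ pages.

#include <fcntl.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <drm/drm.h>    /* DRM_IOWR / DRM_COMMAND_BASE used by the amdpk uapi */
#include <drm/amdpk.h>  /* uapi header added by this patch */

int main(void)
{
	struct amdpk_conf conf = { .qdepth = 4 };
	struct amdpk_info info = { 0 };
	long pagesz = sysconf(_SC_PAGESIZE);
	void *regs, *mem;
	int fd, i;

	fd = open("/dev/accel/accel0", O_RDWR);	/* node name is an assumption */
	if (fd < 0)
		return 1;

	/* Request entries still available across all queues. */
	ioctl(fd, DRM_IOCTL_AMDPK_GET_INFO, &info);

	/* One eventfd per request slot; the driver signals it on completion. */
	for (i = 0; i < (int)conf.qdepth; i++)
		conf.eventfd[i] = eventfd(0, 0);

	if (ioctl(fd, DRM_IOCTL_AMDPK_SET_CONF, &conf) < 0)
		return 1;

	/* Per-queue control registers (offset selects AMDPK_MMAP_REGS). */
	regs = mmap(NULL, pagesz, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, AMDPK_MMAP_REGS * pagesz);

	/* Status page plus request-queue pages (offset selects AMDPK_MMAP_MEM). */
	mem = mmap(NULL, 2 * pagesz, PROT_READ | PROT_WRITE, MAP_SHARED,
		   fd, AMDPK_MMAP_MEM * pagesz);
	if (regs == MAP_FAILED || mem == MAP_FAILED)
		return 1;

	/*
	 * A real client would now write request descriptors into the RQ pages,
	 * kick the request through the mapped control registers (NEW_REQUEST),
	 * wait on the matching eventfd, and read that request's 32-bit status
	 * word (CQ_STATUS_VALID / CQ_COMPLETION_ERROR) from the status page.
	 */

	munmap(mem, 2 * pagesz);
	munmap(regs, pagesz);
	close(fd);
	return 0;
}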

 MAINTAINERS                     |   2 +
 drivers/accel/Kconfig           |   1 +
 drivers/accel/Makefile          |   1 +
 drivers/accel/amdpk/Kconfig     |  18 +
 drivers/accel/amdpk/Makefile    |   8 +
 drivers/accel/amdpk/amdpk_drv.c | 736 ++++++++++++++++++++++++++++++++
 drivers/accel/amdpk/amdpk_drv.h | 271 ++++++++++++
 include/uapi/drm/amdpk.h        |  49 +++
 8 files changed, 1086 insertions(+)
 create mode 100644 drivers/accel/amdpk/Kconfig
 create mode 100644 drivers/accel/amdpk/Makefile
 create mode 100644 drivers/accel/amdpk/amdpk_drv.c
 create mode 100644 drivers/accel/amdpk/amdpk_drv.h
 create mode 100644 include/uapi/drm/amdpk.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 11f8815daa77..cdc305a206aa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1161,6 +1161,8 @@ L:	dri-devel@...ts.freedesktop.org
 S:	Maintained
 T:	git https://gitlab.freedesktop.org/drm/misc/kernel.git
 F:	Documentation/devicetree/bindings/accel/amd,versal-net-pki.yaml
+F:	drivers/accel/amdpk/
+F:	include/uapi/drm/amdpk.h

 AMD PMC DRIVER
 M:	Shyam Sundar S K <Shyam-sundar.S-k@....com>
diff --git a/drivers/accel/Kconfig b/drivers/accel/Kconfig
index 5b9490367a39..5632c6c62c15 100644
--- a/drivers/accel/Kconfig
+++ b/drivers/accel/Kconfig
@@ -28,5 +28,6 @@ source "drivers/accel/amdxdna/Kconfig"
 source "drivers/accel/habanalabs/Kconfig"
 source "drivers/accel/ivpu/Kconfig"
 source "drivers/accel/qaic/Kconfig"
+source "drivers/accel/amdpk/Kconfig"

 endif
diff --git a/drivers/accel/Makefile b/drivers/accel/Makefile
index a301fb6089d4..caea6d636ac8 100644
--- a/drivers/accel/Makefile
+++ b/drivers/accel/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_DRM_ACCEL_AMDXDNA)		+= amdxdna/
 obj-$(CONFIG_DRM_ACCEL_HABANALABS)	+= habanalabs/
 obj-$(CONFIG_DRM_ACCEL_IVPU)		+= ivpu/
 obj-$(CONFIG_DRM_ACCEL_QAIC)		+= qaic/
+obj-$(CONFIG_DRM_ACCEL_AMDPK)		+= amdpk/
diff --git a/drivers/accel/amdpk/Kconfig b/drivers/accel/amdpk/Kconfig
new file mode 100644
index 000000000000..c0b459bb66a7
--- /dev/null
+++ b/drivers/accel/amdpk/Kconfig
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Kconfig for AMD PKI accelerator for versal-net
+#
+
+config DRM_ACCEL_AMDPK
+	tristate "AMD PKI accelerator for versal-net"
+	depends on DRM_ACCEL
+	depends on ARM64 || COMPILE_TEST
+	help
+	  Enables the platform driver for the AMD PKI accelerator, which is
+	  designed for high-performance public key asymmetric crypto operations
+	  on AMD versal-net.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called amdpk.
diff --git a/drivers/accel/amdpk/Makefile b/drivers/accel/amdpk/Makefile
new file mode 100644
index 000000000000..826f43ccebdf
--- /dev/null
+++ b/drivers/accel/amdpk/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for AMD PKI accelerator for versal-net
+#
+
+obj-$(CONFIG_DRM_ACCEL_AMDPK) := amdpk.o
+
+amdpk-y := amdpk_drv.o
diff --git a/drivers/accel/amdpk/amdpk_drv.c b/drivers/accel/amdpk/amdpk_drv.c
new file mode 100644
index 000000000000..17c328d03db8
--- /dev/null
+++ b/drivers/accel/amdpk/amdpk_drv.c
@@ -0,0 +1,736 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018-2021 Silex Insight sa
+ * Copyright (c) 2018-2021 Beerten Engineering scs
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ */
+
+/*
+ * Device Overview
+ * ===============
+ * The AMD PKI accelerator is a device on AMD versal-net that executes public
+ * key asymmetric crypto operations such as ECDSA, ECDH and RSA with high
+ * performance. The driver provides an accel interface through which
+ * applications configure the device and perform the required operations. The
+ * AMD PKI device comprises multiple Barco Silex ba414 PKI engines bundled
+ * together and provides a queue based interface for interacting with them on
+ * AMD versal-net.
+ *
+ * The following figure provides a brief overview of the device's interface
+ * with the software:
+ *
+ * +------------------+
+ * |    Software      |
+ * +------------------+
+ *     |          |
+ *     |          v
+ *     |     +-----------------------------------------------------------+
+ *     |     |                     RAM                                   |
+ *     |     |  +----------------------------+   +---------------------+ |
+ *     |     |  |           RQ pages         |   |       CQ pages      | |
+ *     |     |  | +------------------------+ |   | +-----------------+ | |
+ *     |     |  | |   START (cmd)          | |   | | req_id | status | | |
+ *     |     |  | |   TFRI (addr, sz)---+  | |   | | req_id | status | | |
+ *     |     |  | | +-TFRO (addr, sz)   |  | |   | | ...             | | |
+ *     |     |  | | | NTFY (req_id)     |  | |   | +-----------------+ | |
+ *     |     |  | +-|-------------------|--+ |   |                     | |
+ *     |     |  |   |                   v    |   +---------------------+ |
+ *     |     |  |   |         +-----------+  |                           |
+ *     |     |  |   |         | input     |  |                           |
+ *     |     |  |   |         | data      |  |                           |
+ *     |     |  |   v         +-----------+  |                           |
+ *     |     |  |  +----------------+        |                           |
+ *     |     |  |  |  output data   |        |                           |
+ *     |     |  |  +----------------+        |                           |
+ *     |     |  +----------------------------+                           |
+ *     |     |                                                           |
+ *     |     +-----------------------------------------------------------+
+ *     |
+ *     |
+ * +---|----------------------------------------------------+
+ * |   v                AMD PKI device                      |
+ * |  +-------------------+     +------------------------+  |
+ * |  | New request FIFO  | --> |       PK engines       |  |
+ * |  +-------------------+     +------------------------+  |
+ * +--------------------------------------------------------+
+ *
+ * To perform a crypto operation, the software writes a sequence of descriptors
+ * into the RQ memory, including the input data and a designated location for
+ * the output data. After preparing the request, the software writes the request
+ * offset (from the start of the RQ memory region) into the NEW_REQUEST register.
+ * The request is then stored in a common hardware FIFO shared among all RQs.
+ *
+ * When a PK engine becomes available, the device pops the request from the FIFO
+ * and fetches the descriptors. It DMAs the input data from RQ memory and
+ * executes the necessary computations. Once the computation is complete, the
+ * device writes the output data back to RAM via DMA. The device then writes a
+ * new entry into the CQ ring buffer in RAM, indicating completion of the
+ * request, and generates an interrupt to notify the software.
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/eventfd.h>
+#include <drm/drm_accel.h>
+#include <drm/drm_ioctl.h>
+
+#include "amdpk_drv.h"
+
+#define DRIVER_NAME "amdpk"
+
+static void amdpk_init_cq(struct amdpk_dev *pkdev, struct amdpk_cq *cq,
+			  int szcode, char *base)
+{
+	cq->pkdev = pkdev;
+	cq->generation = 1;
+	cq->szcode = szcode;
+	cq->base = (u32 *)base;
+	cq->tail = 0;
+}
+
+static int amdpk_pop_cq(struct amdpk_cq *cq, int *rid)
+{
+	u32 status = CQ_STATUS_VALID;
+	unsigned int sz;
+	u32 completion;
+
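+	/* Each CQ entry is two 32-bit words: a status word at cq->base[tail] and
+	 * a completion word at cq->base[tail + 1] carrying the generation bit and
+	 * the request id in its upper 16 bits.
+	 */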
+	completion = cq->base[cq->tail + 1];
+	if ((completion & CQ_GENERATION_BIT) != cq->generation)
+		return CQ_STATUS_INVALID;
+
+	*rid = (completion >> 16) & 0xffff;
+	/* Read memory barrier: to avoid a race condition, the status field must
+	 * not be read before the completion generation bit; otherwise we could
+	 * read stale status data.
+	 */
+	rmb();
+	status |= cq->base[cq->tail];
+	/* advance completion queue tail */
+	cq->tail += 2;
+	sz = 1 << (cq->szcode - 2);
+	if (cq->tail >= sz) {
+		cq->tail = 0;
+		cq->generation ^= 1; /* invert generation bit */
+	}
+
+	/* evaluate status from the completion queue */
+	if (completion & CQ_COMPLETION_BIT)
+		status |= CQ_COMPLETION_ERROR;
+
+	return status;
+}
+
+static int amdpk_trigpos(struct amdpk_cq *cq)
+{
+	int trigpos;
+
+	/* Set trigger position on next completed operation */
+	trigpos = cq->tail / 2 + (cq->generation << (cq->szcode - 3));
+	trigpos++;
+	trigpos &= (1 << (cq->szcode - 2)) - 1;
+
+	return trigpos;
+}
+
+static void amdpk_cq_workfn(struct kthread_work *work)
+{
+	struct amdpk_work *pkwork;
+	struct amdpk_dev *pkdev;
+	struct amdpk_user *user;
+	int qid, rid, trigpos;
+	u32 status;
+
+	pkwork = to_amdpk_work(work);
+	pkdev = pkwork->pkdev;
+	qid = pkwork->qid;
+
+	user = pkwork->user;
+	status = amdpk_pop_cq(&pkdev->work[qid]->pk_cq, &rid);
+	if (rid < user->rq_entries && status != CQ_STATUS_INVALID) {
+		u32 *status_mem;
+
+		status_mem = (u32 *)user->stmem;
+		status_mem[rid] = status;
+		eventfd_signal(user->evfd_ctx[rid]);
+	}
+
+	trigpos = amdpk_trigpos(&pkdev->work[qid]->pk_cq);
+	pk_wrreg(pkdev->regs, REG_CTL_CQ_NTFY(user->qid), trigpos);
+}
+
+static irqreturn_t amdpk_cq_irq(int irq, void *dev)
+{
+	struct amdpk_dev *pkdev = (struct amdpk_dev *)dev;
+	u64 active = 0;
+	int i;
+
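+	/* Ack every raised CQ interrupt, then kick the per-queue completion
+	 * worker for each active queue that still has a user attached.
+	 */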
+	active = pk_rdreg(pkdev->regs, REG_PK_IRQ_STATUS);
+	pk_wrreg(pkdev->regs, REG_PK_IRQ_RESET, active);
+
+	for (i = 0; i < pkdev->max_queues && active; i++, active >>= 1) {
+		if (!(active & 1))
+			continue;
+		if (!pkdev->users[i])
+			continue;
+		kthread_queue_work(pkdev->work[i]->cq_wq, &pkdev->work[i]->cq_work);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void amdpk_free_rqmem(struct amdpk_dev *pkdev, struct amdpk_user *user)
+{
+	int pages = user->rq_pages;
+	/* Must match the hardware page multiple used in amdpk_mmap_mem() */
+	int pagemult = (pages + 3) / 4;
+	int i;
+
+	if (!pages)
+		return;
+
+	for (i = 0; i < pages / pagemult; i++) {
+		if (!user->rqmem[i])
+			continue;
+		dma_free_coherent(pkdev->dev, PAGE_SIZE * pagemult,
+				  user->rqmem[i], user->physrq[i]);
+		user->rqmem[i] = NULL;
+	}
+}
+
+static int amdpk_accel_get_info(struct drm_device *dev, void *data, struct drm_file *fp)
+{
+	struct amdpk_user *user = fp->driver_priv;
+	struct amdpk_dev *pkdev = user->pkdev;
+	struct amdpk_info *info = data;
+
+	info->avail_qdepth = atomic_read(&pkdev->avail_qdepth);
+	return 0;
+}
+
+static int amdpk_accel_configure(struct amdpk_user *user, struct amdpk_conf *conf)
+{
+	struct amdpk_dev *pkdev = user->pkdev;
+	struct amdpk_work *pkwork = NULL;
+	int qid = user->qid;
+	int trigpos, ret, i;
+	char wq_name[32];
+
+	i = atomic_sub_return(conf->qdepth, &pkdev->avail_qdepth);
+	if (i < 0) {
+		/* If enough entries are not present, give back the reserved entries. */
+		dev_err(user->pkdev->dev, "Out of descriptors\n");
+		atomic_add(conf->qdepth, &pkdev->avail_qdepth);
+		return -ENOSPC;
+	}
+	user->rq_entries = conf->qdepth;
+
+	for (i = 0; i < user->rq_entries; i++) {
+		if (conf->eventfd[i] <= 0) {
+			dev_err(user->pkdev->dev, "Invalid eventfd: %d\n", conf->eventfd[i]);
+			ret = -EINVAL;
+			goto fail;
+		}
+
+		user->evfd_ctx[i] = eventfd_ctx_fdget(conf->eventfd[i]);
+		if (IS_ERR(user->evfd_ctx[i])) {
+			dev_err(user->pkdev->dev, "Invalid eventfd: %d\n", conf->eventfd[i]);
+			ret = PTR_ERR(user->evfd_ctx[i]);
+			goto fail;
+		}
+	}
+
+	user->cqmem = dma_alloc_coherent(pkdev->dev, PAGE_SIZE, &user->physcq, GFP_KERNEL);
+	if (!user->cqmem) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	/* Initialize completion queue handler */
+	pkwork = pkdev->work[qid];
+	amdpk_init_cq(pkdev, &pkwork->pk_cq, __builtin_ctz(PAGE_SIZE), user->cqmem);
+
+	snprintf(wq_name, sizeof(wq_name), "cq_worker_%d", qid);
+	pkwork->cq_wq = kthread_create_worker(0, wq_name);
+	if (IS_ERR(pkwork->cq_wq)) {
+		ret = PTR_ERR(pkwork->cq_wq);
+		pkwork->cq_wq = NULL;
+		goto fail;
+	}
+	kthread_init_work(&pkwork->cq_work, amdpk_cq_workfn);
+
+	pk_wrreg(pkdev->regs, REG_CQ_CFG_IRQ_NR(qid), qid);
+	pk_wrreg(pkdev->regs, REG_CQ_CFG_ADDR(qid), user->physcq);
+	pk_wrreg(pkdev->regs, REG_CQ_CFG_SIZE(qid), PAGE_SHIFT);
+	pk_wrreg(pkdev->regs, REG_RQ_CFG_CQID(qid), qid);
+	pk_wrreg(pkdev->regs, REG_RQ_CFG_DEPTH(qid), user->rq_entries);
+
+	/* set trigger position for notifications */
+	trigpos = amdpk_trigpos(&pkwork->pk_cq);
+	pk_wrreg(pkdev->regs, REG_CTL_CQ_NTFY(qid), trigpos);
+
+	return 0;
+fail:
+	if (pkwork->cq_wq) {
+		kthread_destroy_worker(pkwork->cq_wq);
+		pkwork->cq_wq = NULL;
+	}
+	if (user->cqmem) {
+		dma_free_coherent(pkdev->dev, PAGE_SIZE, user->cqmem, user->physcq);
+		user->cqmem = NULL;
+	}
+	atomic_add(user->rq_entries, &pkdev->avail_qdepth);
+	user->rq_entries = 0;
+
+	return ret;
+}
+
+static int amdpk_accel_set_conf(struct drm_device *dev, void *data, struct drm_file *fp)
+{
+	struct amdpk_user *user = fp->driver_priv;
+	struct amdpk_conf *conf = data;
+	int ret;
+
+	if (conf->qdepth == 0 || conf->qdepth > MAX_PK_REQS ||
+	    conf->qdepth > MAX_CQ_ENTRIES_ON_PAGE) {
+		dev_err(user->pkdev->dev, "Invalid qdepth: %d\n", conf->qdepth);
+		return -EINVAL;
+	}
+
+	if (user->configured) {
+		dev_err(user->pkdev->dev, "User already configured\n");
+		return -EEXIST;
+	}
+
+	ret = amdpk_accel_configure(user, conf);
+	if (ret)
+		return ret;
+
+	user->configured = true;
+	return 0;
+}
+
+static int amdpk_mmap_regs(struct vm_area_struct *vma)
+{
+	struct amdpk_user *user = vma->vm_private_data;
+	struct amdpk_dev *pkdev = user->pkdev;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	return io_remap_pfn_range(vma, vma->vm_start,
+				  (pkdev->regsphys + REG_CTL_BASE(user->qid)) >> PAGE_SHIFT,
+				  vma->vm_end - vma->vm_start, vma->vm_page_prot);
+}
+
+static int mmap_dmamem(struct vm_area_struct *vma, struct amdpk_dev *pkdev,
+		       void *addr, dma_addr_t phys, off_t offset, size_t sz)
+{
+	unsigned long vmstart = vma->vm_start;
+	unsigned long pgoff = vma->vm_pgoff;
+	int ret;
+
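+	/* Temporarily narrow the vma to the [offset, offset + sz) window so that
+	 * dma_mmap_coherent() places this buffer at the right spot within the
+	 * user's single mapping, then restore the original start and pgoff.
+	 */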
+	vma->vm_pgoff = 0;
+	vma->vm_start = vmstart + offset;
+	vma->vm_end = vma->vm_start + sz;
+	ret = dma_mmap_coherent(pkdev->dev, vma, addr, phys, sz);
+	vma->vm_pgoff = pgoff;
+	vma->vm_start = vmstart;
+
+	return ret;
+}
+
+static int amdpk_mmap_mem(struct vm_area_struct *vma)
+{
+	struct amdpk_user *user = vma->vm_private_data;
+	struct amdpk_dev *pkdev = user->pkdev;
+	int pagemult, pagemultshift;
+	int requested_pages;
+	int qid = user->qid;
+	int ret, i;
+
+	if (!user->configured) {
+		dev_err(pkdev->dev, "configuration not found!");
+		return -ENODEV;
+	}
+	/* Mapping already done */
+	if (user->stmem) {
+		dev_err(pkdev->dev, "memory already mapped\n");
+		return -EINVAL;
+	}
+
+	requested_pages = vma_pages(vma);
+	/* As the last page is reserved for the status and the starting ones are for
+	 * the rq, the mmap must be at least 2 pages big.
+	 */
+	if (requested_pages < 2) {
+		dev_err(pkdev->dev, "Invalid request pages: %d\n", requested_pages);
+		return -EINVAL;
+	}
+	/* Store number of rq pages. 1 page is reserved for status */
+	user->rq_pages = requested_pages - 1;
+	/* The request memory can span up to 4 hardware pages, all of the same size.
+	 * When more than 4 OS pages are requested, each hardware page covers the
+	 * same multiple (pagemult) of OS pages. Thus the requested size for the
+	 * request queue must be a multiple of pagemult.
+	 */
+	pagemult = (requested_pages - 1 + 3) / 4;
+	if ((requested_pages - 1) % pagemult != 0) {
+		dev_err(pkdev->dev, "requested pages: %d not multiple of page multiplier: %d\n",
+			requested_pages, pagemult);
+		return -EINVAL;
+	}
+	/* hardware page size must be a power of 2, and as a consequence pagemult too. */
+	if ((pagemult & (pagemult - 1)) != 0) {
+		dev_err(pkdev->dev, "page multiplier: %d is not power of 2\n", pagemult);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < (requested_pages - 1) / pagemult; i++) {
+		user->rqmem[i] = dma_alloc_coherent(pkdev->dev, PAGE_SIZE * pagemult,
+						    &user->physrq[i], GFP_KERNEL);
+		if (!user->rqmem[i]) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+		pk_wrreg(pkdev->regs, REG_RQ_CFG_PAGE(qid, i), user->physrq[i]);
+	}
+
+	user->stmem = dma_alloc_coherent(pkdev->dev, PAGE_SIZE, &user->physst, GFP_KERNEL);
+	if (!user->stmem) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	/* Point the unused rq pages at the start of the allocated shared memory. They
+	 * should never be accessed, but if a (malicious) user writes descriptors that
+	 * reference those pages, it will not break the rest of the system.
+	 */
+	for (i = (requested_pages - 1) / pagemult; i < MAX_RQMEM_PER_QUEUE; i++)
+		pk_wrreg(pkdev->regs, REG_RQ_CFG_PAGE(qid, i), user->physrq[0]);
+
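+	/* pagemult is a power of two, so log2(pagemult) equals
+	 * popcount(pagemult - 1); the two steps below compute that popcount for
+	 * a 4-bit value.
+	 */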
+	pagemultshift = pagemult - 1;
+	pagemultshift = (pagemultshift & 5) + ((pagemultshift & 0xa) >> 1);
+	pagemultshift = (pagemultshift & 3) + ((pagemultshift >> 2) & 3);
+	pk_wrreg(pkdev->regs, REG_RQ_CFG_PAGE_SIZE(qid), PAGE_SHIFT + pagemultshift);
+	pk_wrreg(pkdev->regs, REG_RQ_CFG_PAGES_WREN(qid),
+		 (1 << ((requested_pages - 1) / pagemult)));
+
+	ret = mmap_dmamem(vma, pkdev, user->stmem, user->physst, 0, PAGE_SIZE);
+	if (ret)
+		goto fail;
+	for (i = 0; i < (requested_pages - 1) / pagemult; i++) {
+		ret = mmap_dmamem(vma, pkdev, user->rqmem[i], user->physrq[i],
+				  (i * pagemult + 1) * PAGE_SIZE, PAGE_SIZE * pagemult);
+		if (ret)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	amdpk_free_rqmem(pkdev, user);
+	if (user->stmem) {
+		dma_free_coherent(pkdev->dev, PAGE_SIZE, user->stmem, user->physst);
+		user->stmem = NULL;
+	}
+	return ret;
+}
+
+static int amdpk_accel_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	struct drm_file *dfp = fp->private_data;
+	struct amdpk_user *user;
+	int ret = 0;
+
+	user = dfp->driver_priv;
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+
+	vma->vm_private_data = user;
+
+	switch (vma->vm_pgoff) {
+	case AMDPK_MMAP_REGS:
+		ret = amdpk_mmap_regs(vma);
+		break;
+	case AMDPK_MMAP_MEM:
+		ret = amdpk_mmap_mem(vma);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static int amdpk_open(struct drm_device *dev, struct drm_file *file)
+{
+	struct amdpk_work *pkwork = NULL;
+	struct amdpk_user *user = NULL;
+	struct amdpk_dev *pkdev;
+	int ret, qid;
+
+	pkdev = to_amdpk_dev(dev);
+	qid = ida_alloc_range(&pkdev->avail_queues, 0, pkdev->max_queues - 1, GFP_KERNEL);
+	if (qid < 0)
+		return -ENOSPC;
+
+	get_device(pkdev->dev);
+
+	user = kzalloc(sizeof(*user), GFP_KERNEL);
+	if (!user) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	user->pkdev = pkdev;
+	user->qid = qid;
+	user->rq_entries = 0;
+	file->driver_priv = user;
+	pkdev->users[qid] = user;
+
+	pkwork = kzalloc(sizeof(*pkwork), GFP_KERNEL);
+	if (!pkwork) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	pkwork->qid = qid;
+	pkwork->pkdev = pkdev;
+	pkwork->user = user;
+	pkdev->work[qid] = pkwork;
+
+	return 0;
+
+fail:
+	kfree(user);
+	kfree(pkwork);
+	ida_free(&pkdev->avail_queues, qid);
+	put_device(pkdev->dev);
+	return ret;
+}
+
+static void amdpk_postclose(struct drm_device *dev, struct drm_file *file)
+{
+	struct amdpk_user *user = file->driver_priv;
+	struct amdpk_dev *pkdev = user->pkdev;
+	char __iomem *regs = pkdev->regs;
+
+	/* Clear pkdev->users[qid] first so that the completion interrupt handler
+	 * knows this user no longer exists and does not schedule any further
+	 * completion work on the cq worker kthread.
+	 */
+	pkdev->users[user->qid] = NULL;
+
+	if (user->configured) {
+		unsigned int attempts = 0;
+
+		/* Disable RQCQ pages to help the hardware finish potential
+		 * pending requests sooner.
+		 */
+		pk_wrreg(regs, REG_RQ_CFG_PAGE_SIZE(user->qid), 0);
+		pk_wrreg(regs, REG_RQ_CFG_PAGES_WREN(user->qid), 0);
+		pk_wrreg(regs, REG_CQ_CFG_SIZE(user->qid), 0);
+
+		/* The hardware has no flush mechanism for requests pending in the RQ.
+		 * Instead, poll REG_CTL_PENDING_REQS periodically to see whether the
+		 * user still has requests in flight. If the hardware never completes
+		 * them, abort after MAX_FLUSH_WAIT_ATTEMPTS and don't free the resources.
+		 */
+		while (pk_rdreg(regs, REG_CTL_BASE(user->qid) + REG_CTL_PENDING_REQS)) {
+			attempts++;
+			if (attempts > MAX_FLUSH_WAIT_ATTEMPTS) {
+				dev_err(pkdev->dev,
+					"Time out waiting for hw completions. Resources leaked.\n");
+				goto abort_cleanup;
+			}
+			msleep(20);
+		}
+
+		if (pkdev->work[user->qid]->cq_wq) {
+			kthread_cancel_work_sync(&pkdev->work[user->qid]->cq_work);
+			kthread_destroy_worker(pkdev->work[user->qid]->cq_wq);
+		}
+
+		amdpk_free_rqmem(pkdev, user);
+		if (user->cqmem) {
+			dma_free_coherent(pkdev->dev, PAGE_SIZE, user->cqmem, user->physcq);
+			user->cqmem = NULL;
+		}
+		if (user->stmem) {
+			dma_free_coherent(pkdev->dev, PAGE_SIZE, user->stmem, user->physst);
+			user->stmem = NULL;
+		}
+
+		atomic_add(user->rq_entries, &pkdev->avail_qdepth);
+	}
+	ida_free(&pkdev->avail_queues, user->qid);
+
+abort_cleanup:
+	put_device(pkdev->dev);
+	kfree(pkdev->work[user->qid]);
+	pkdev->work[user->qid] = NULL;
+	kfree(user);
+}
+
+static const struct drm_ioctl_desc amdpk_accel_ioctls[] = {
+	DRM_IOCTL_DEF_DRV(AMDPK_GET_INFO, amdpk_accel_get_info, 0),
+	DRM_IOCTL_DEF_DRV(AMDPK_SET_CONF, amdpk_accel_set_conf, 0),
+};
+
+static const struct file_operations amdpk_accel_fops = {
+	.owner		= THIS_MODULE,
+	.open		= accel_open,
+	.release	= drm_release,
+	.unlocked_ioctl	= drm_ioctl,
+	.compat_ioctl	= drm_compat_ioctl,
+	.llseek		= noop_llseek,
+	.mmap		= amdpk_accel_mmap,
+};
+
+static const struct drm_driver amdpk_accel_driver = {
+	.driver_features	= DRIVER_COMPUTE_ACCEL,
+
+	.name			= "amdpk_accel_driver",
+	.desc			= "AMD PKI Accelerator for versal-net",
+
+	.fops			= &amdpk_accel_fops,
+	.open			= amdpk_open,
+	.postclose		= amdpk_postclose,
+
+	.ioctls			= amdpk_accel_ioctls,
+	.num_ioctls		= ARRAY_SIZE(amdpk_accel_ioctls),
+};
+
+static int amdpk_create_device(struct amdpk_dev *pkdev, struct device *dev, int irq)
+{
+	u64 qdepth, ver;
+	long magic;
+	int ret;
+
+	magic = pk_rdreg(pkdev->regs, REG_MAGIC);
+	if (magic != AMDPK_MAGIC) {
+		dev_err(dev, "Invalid magic constant %08lx !\n", magic);
+		return -ENODEV;
+	}
+	ver = pk_rdreg(pkdev->regs, REG_SEMVER);
+	if (AMDPK_SEMVER_MAJOR(ver) != 1 || AMDPK_SEMVER_MINOR(ver) < 1) {
+		dev_err(dev, "Hardware version (%d.%d) not supported.\n",
+			(int)AMDPK_SEMVER_MAJOR(ver), (int)AMDPK_SEMVER_MINOR(ver));
+		return -ENODEV;
+	}
+
+	/* Reset all accelerators and the hw scheduler */
+	pk_wrreg(pkdev->regs, REG_PK_GLOBAL_STATE, 0x1);
+	pk_wrreg(pkdev->regs, REG_PK_GLOBAL_STATE, 0x0);
+
+	pkdev->max_queues = (int)pk_rdreg(pkdev->regs, REG_CFG_REQ_QUEUES_CNT);
+	qdepth = pk_rdreg(pkdev->regs, REG_CFG_MAX_PENDING_REQ);
+	atomic_set(&pkdev->avail_qdepth, qdepth);
+
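+	/* Mask all interrupts, clear any stale ones, then enable one per queue. */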
+	pk_wrreg(pkdev->regs, REG_IRQ_ENABLE, 0);
+	pk_wrreg(pkdev->regs, REG_PK_IRQ_RESET, ~0);
+	pk_wrreg(pkdev->regs, REG_IRQ_ENABLE, (1 << pkdev->max_queues) - 1);
+
+	ret = devm_request_irq(dev, irq, amdpk_cq_irq, 0, "amdpk", pkdev);
+	if (ret)
+		return ret;
+
+	ida_init(&pkdev->avail_queues);
+
+	return 0;
+}
+
+static void amdpk_remove_device(struct amdpk_dev *pkdev)
+{
+	drm_dev_unplug(&pkdev->ddev);
+	pk_wrreg(pkdev->regs, REG_IRQ_ENABLE, 0);
+	ida_destroy(&pkdev->avail_queues);
+}
+
+static int amdpk_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct amdpk_dev *pkdev;
+	struct resource *memres;
+	int irq, ret;
+
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
+	if (ret < 0)
+		return ret;
+
+	pkdev = devm_drm_dev_alloc(dev, &amdpk_accel_driver, typeof(*pkdev), ddev);
+	if (IS_ERR(pkdev))
+		return PTR_ERR(pkdev);
+	pkdev->dev = dev;
+
+	memres = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	pkdev->regs = devm_ioremap_resource(dev, memres);
+	if (IS_ERR(pkdev->regs))
+		return PTR_ERR(pkdev->regs);
+	pkdev->regsphys = memres->start;
+	platform_set_drvdata(pdev, pkdev);
+
+	if (platform_irq_count(pdev) != 1)
+		return -ENODEV;
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0)
+		return -ENODEV;
+
+	ret = drm_dev_register(&pkdev->ddev, 0);
+	if (ret) {
+		dev_err(&pdev->dev, "DRM register failed, ret %d\n", ret);
+		return ret;
+	}
+
+	return amdpk_create_device(pkdev, dev, irq);
+}
+
+static void amdpk_remove(struct platform_device *pdev)
+{
+	struct amdpk_dev *pkdev = platform_get_drvdata(pdev);
+
+	amdpk_remove_device(pkdev);
+}
+
+static void amdpk_shutdown(struct platform_device *pdev)
+{
+	amdpk_remove(pdev);
+}
+
+static const struct of_device_id amdpk_match_table[] = {
+	{ .compatible = "amd,versal-net-pki" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, amdpk_match_table);
+
+static struct platform_driver amdpk_pdrv = {
+	.probe = amdpk_probe,
+	.remove = amdpk_remove,
+	.shutdown = amdpk_shutdown,
+	.driver = {
+		.name = DRIVER_NAME,
+		.of_match_table = amdpk_match_table,
+	},
+};
+
+static int __init amdpk_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&amdpk_pdrv);
+	if (ret) {
+		pr_err("can't register platform driver\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit amdpk_exit(void)
+{
+	platform_driver_unregister(&amdpk_pdrv);
+}
+
+module_init(amdpk_init);
+module_exit(amdpk_exit);
+
+MODULE_AUTHOR("AMD");
+MODULE_DESCRIPTION("AMD PKI accelerator for versal-net");
+MODULE_LICENSE("GPL");
diff --git a/drivers/accel/amdpk/amdpk_drv.h b/drivers/accel/amdpk/amdpk_drv.h
new file mode 100644
index 000000000000..c14c10db5d97
--- /dev/null
+++ b/drivers/accel/amdpk/amdpk_drv.h
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018-2021 Silex Insight sa
+ * Copyright (c) 2018-2021 Beerten Engineering scs
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ */
+
+#ifndef __AMDPK_DRV_H__
+#define __AMDPK_DRV_H__
+
+#include <linux/types.h>
+#include <linux/kthread.h>
+#include <linux/io.h>
+#include <drm/drm_drv.h>
+#include <uapi/drm/amdpk.h>
+
+/* Magic number in the AMD PKI device, required to validate hardware access. */
+#define AMDPK_MAGIC 0x5113C50C
+
+/* Contains the magic number 0x5113C50C.
+ * Used to validate access to the hardware registers.
+ */
+#define REG_MAGIC (0x00)
+
+/* Contains the version of the hardware interface as semver.
+ * The semantic version: major 8 bits, minor 8 bits, in little endian order.
+ */
+#define REG_SEMVER (0x08)
+
+/* The number of request queues available in the hardware. */
+#define REG_CFG_REQ_QUEUES_CNT 0x10
+
+/* The maximum number of pending requests from all request queues combined. */
+#define REG_CFG_MAX_PENDING_REQ 0x18
+
+/* The maximum number of pending requests in a single request queue. */
+#define REG_CFG_MAX_REQ_QUEUE_ENTRIES 0x0020
+
+/* The first 16 bits give the number of PK core instances with 64 multipliers.
+ * The next 16 bits give the number of PK core instances with 256 multipliers.
+ */
+#define REG_CFG_PK_INST 0x28
+
+/* Writing 0x1 puts all pkcore accelerators and scheduler in reset.
+ * Writing 0x0 makes all pkcore accelerators and scheduler leave reset
+ * and become operational.
+ */
+#define REG_PK_GLOBAL_STATE 0x38
+
+/* The semantic version: major 8 bits, minor 8 bits,
+ * scm id 16 bits, in little endian order.
+ */
+#define REG_HW_VERSION (0x40)
+
+/* Bitmask of which CQ interrupts are raised. */
+#define REG_PK_IRQ_STATUS 0x88
+
+/* Bitmask of which CQ may trigger interrupts. */
+#define REG_IRQ_ENABLE 0x90
+
+/* Bitmask of CQ interrupts to reset. */
+#define REG_PK_IRQ_RESET 0xA0
+
+/* Bus address of the page p for the given request queue.
+ * The address must be aligned on the page size.
+ */
+#define REG_RQ_CFG_PAGE(qid, pageidx) (0x00100 + (qid) * 0x80 + (pageidx) * 0x8)
+
+/* Size in bytes of the pages represented as a power of 2.
+ *
+ * Allowed values :
+ * ================ ==============
+ *  register value   size in bytes
+ * ================ ==============
+ *     7               128
+ *     8               256
+ *     9               512
+ *    10              1024
+ *    11              2048
+ *    12              4096
+ *    13              8192
+ *    14             16384
+ *    15             32768
+ *    16             65536
+ * ================ ==============
+ */
+#define REG_RQ_CFG_PAGE_SIZE(qid) (0x00120 + (qid) * 0x80)
+
+/* Index of the associated completion queue. */
+#define REG_RQ_CFG_CQID(qid) (0x00128 + (qid) * 0x80)
+
+/* Bit field of the pages that descriptors may write to.
+ * When a bit is 1, a descriptor can write to the corresponding page.
+ */
+#define REG_RQ_CFG_PAGES_WREN(qid) (0x00138 + (qid) * 0x80)
+
+/* Maximum number of entries which can be written into this request queue. */
+#define REG_RQ_CFG_DEPTH(qid) (0x00140 + (qid) * 0x80)
+
+/* Bus address of the ring base of completion queue n.
+ * The address must be aligned on 64 bits.
+ */
+#define REG_CQ_CFG_ADDR(qid) (0x1100 + (qid) * 0x80)
+
+/* CQ notification trigger position. */
+#define REG_CTL_CQ_NTFY(qid) (0x2028 + (qid) * 0x1000)
+
+/* Size in bytes of the completion ring represented as a power of 2.
+ *
+ * Allowed sizes :
+ * ================ ============== ==============
+ *  register value   size in bytes  max entries
+ * ================ ============== ==============
+ *      7             128             16
+ *      8             256             32
+ *      9             512             64
+ *     10            1024            128
+ *     11            2048            256
+ *     12            4096            512
+ *     13            8192           1024
+ *     14           16384           2048
+ * ================ ============== ==============
+ */
+#define REG_CQ_CFG_SIZE(qid) (0x1108 + (qid) * 0x80)
+
+/* Interrupt number for this completion queue. */
+#define REG_CQ_CFG_IRQ_NR(qid) (0x1110 + (qid) * 0x80)
+
+/* Control registers base address for the given request completion queue pair. */
+#define REG_CTL_BASE(qid) (0x2000 + (qid) * 0x1000)
+
+/* Count of how many requests are queued at a given time for this RQCQ.
+ * When this count reaches 0, the resources of the request and
+ * completion queues can be deleted.
+ */
+#define REG_CTL_PENDING_REQS  0x18
+
+/* Busy cycle count register address. */
+#define REG_PK_BUSY_CYCLES 0x2108
+/* Idle cycle count register address. */
+#define REG_PK_IDLE_CYCLES 0x2110
+
+/* Hardware interface versions. */
+#define AMDPK_SEMVER_MAJOR(v) (((v) >> 24) & 0xff)
+#define AMDPK_SEMVER_MINOR(v) (((v) >> 16) & 0xff)
+#define AMDPK_SEMVER_PATCH(v) ((v) & 0xffff)
+
+/* Hardware implementation versions. */
+#define AMDPK_HWVER_MAJOR(v)  (((v) >> 24) & 0xff)
+#define AMDPK_HWVER_MINOR(v)  (((v) >> 16) & 0xff)
+#define AMDPK_HWVER_SVN(v)    ((v) & 0xffff)
+
+/* Maximum number of queues supported by the driver. */
+#define MAX_QUEUES 4
+
+/* Number of RQ memory addresses for each queue. */
+#define MAX_RQMEM_PER_QUEUE 4
+
+/* Wait attempts for HW to flush all requests before close. */
+#define MAX_FLUSH_WAIT_ATTEMPTS 500
+
+/* Bit 0 (0x1) is the Generation bit. */
+#define CQ_GENERATION_BIT BIT(0)
+
+/* Bit 1 (0x2) is set when completion is valid. */
+#define CQ_COMPLETION_BIT BIT(1)
+
+/* The maximum value of rq_entries is 512: there is one CQ of 4K bytes and
+ * each completion status is 8 bytes, so only 4096 / 8 = 512 entries are
+ * possible at any time.
+ */
+#define MAX_CQ_ENTRIES_ON_PAGE (PAGE_SIZE / 8)
+
+/* Forward declaration */
+struct amdpk_dev;
+struct amdpk_user;
+
+/* structure to hold completion queue information */
+struct amdpk_cq {
+	/* PKI device */
+	struct amdpk_dev *pkdev;
+	/* Base address of the completion queue */
+	u32 *base;
+	/* tail representing last completion */
+	unsigned int tail;
+	/* generation bit, toggled each time the queue wraps around */
+	unsigned int generation;
+	/* size code as configured in REG_CQ_CFG_SIZE */
+	u16 szcode;
+};
+
+/* represents PKI work context */
+struct amdpk_work {
+	/* PKI device */
+	struct amdpk_dev *pkdev;
+	/* PKI user */
+	struct amdpk_user *user;
+	/* Completion queue */
+	struct amdpk_cq pk_cq;
+	/* Kthread work associated with the PKI work */
+	struct kthread_work cq_work;
+	/* Kthread worker to handle completions */
+	struct kthread_worker *cq_wq;
+	/* Associated queue ID */
+	u16 qid;
+};
+
+/* AMD PKI device */
+struct amdpk_dev {
+	/* DRM device associated with PKI device */
+	struct drm_device ddev;
+	/* Core device */
+	struct device *dev;
+	/* PKI register space address */
+	char __iomem *regs;
+	/* PKI register space physical address */
+	resource_size_t regsphys;
+	/* Maximum queues supported by device. */
+	u16 max_queues;
+	/* Available queues */
+	struct ida avail_queues;
+	/* Total queue depth (request entries) still available across all queues */
+	atomic_t avail_qdepth;
+	/* Users indexed by queue ID */
+	struct amdpk_user *users[MAX_QUEUES];
+	/* PKI work for each queue */
+	struct amdpk_work *work[MAX_QUEUES];
+};
+
+/* AMD PKI user */
+struct amdpk_user {
+	/* PKI device */
+	struct amdpk_dev *pkdev;
+	/* Indicates if user has been configured */
+	bool configured;
+	/* Queue ID allocated for the user */
+	u16 qid;
+	/* Number of pages allocated on request queue */
+	u16 rq_pages;
+	/* RQ entries reserved for this user */
+	size_t rq_entries;
+	/* DMA address for RQ pages */
+	dma_addr_t physrq[MAX_RQMEM_PER_QUEUE];
+	/* RQ pages addresses */
+	u8 *rqmem[MAX_RQMEM_PER_QUEUE];
+	/* DMA address for CQ page */
+	dma_addr_t physcq;
+	/* CQ page address */
+	u8 *cqmem;
+	/* DMA address for status page */
+	dma_addr_t physst;
+	/* Status page address */
+	u8 *stmem;
+	/* Eventfd context for each request */
+	struct eventfd_ctx *evfd_ctx[MAX_PK_REQS];
+};
+
+#define to_amdpk_dev(dev) container_of(dev, struct amdpk_dev, ddev)
+#define to_amdpk_work(work) container_of(work, struct amdpk_work, cq_work)
+
+static inline void pk_wrreg(char __iomem *regs, int addr, u64 val)
+{
+	iowrite64(val, regs + addr);
+}
+
+static inline u64 pk_rdreg(char __iomem *regs, int addr)
+{
+	return ioread64(regs + addr);
+}
+
+#endif /* __AMDPK_DRV_H__ */
diff --git a/include/uapi/drm/amdpk.h b/include/uapi/drm/amdpk.h
new file mode 100644
index 000000000000..e5e18fdbc2c4
--- /dev/null
+++ b/include/uapi/drm/amdpk.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ */
+
+#ifndef __AMDPK_H__
+#define __AMDPK_H__
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define MAX_PK_REQS		256
+
+struct amdpk_info {
+	/** maximum available queue depth */
+	unsigned int avail_qdepth;
+};
+
+struct amdpk_conf {
+	/** queue depth to configure */
+	unsigned int qdepth;
+	/** eventfd's associated with the descriptors */
+	int eventfd[MAX_PK_REQS];
+};
+
+/* IOCTL */
+#define DRM_AMDPK_GET_INFO	0x0
+#define DRM_AMDPK_SET_CONF	0x1
+
+#define DRM_IOCTL_AMDPK_GET_INFO	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDPK_GET_INFO, \
+						 struct amdpk_info)
+#define DRM_IOCTL_AMDPK_SET_CONF	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDPK_SET_CONF, \
+						 struct amdpk_conf)
+
+/* MMAP */
+#define AMDPK_MMAP_REGS		0
+#define AMDPK_MMAP_MEM		1
+
+/* Completion Status */
+#define CQ_STATUS_INVALID	0x0
+#define CQ_STATUS_VALID		0x80000000
+#define CQ_COMPLETION_ERROR	0x40000000
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __AMDPK_H__ */
--
2.34.1

