linux-kernel - Re: [PATCH 11/15] habanalabs: add command submission module

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20190127151103.GA21117@rapoport-lnx>
Date:   Sun, 27 Jan 2019 17:11:07 +0200
From:   Mike Rapoport <rppt@...ux.ibm.com>
To:     Oded Gabbay <oded.gabbay@...il.com>
Cc:     gregkh@...uxfoundation.org, linux-kernel@...r.kernel.org,
        ogabbay@...ana.ai
Subject: Re: [PATCH 11/15] habanalabs: add command submission module

On Wed, Jan 23, 2019 at 02:00:53AM +0200, Oded Gabbay wrote:
> This patch adds the main flow for the user to submit work to the device.
> 
> Each work is described by a command submission object (CS). The CS contains
> 3 arrays of command buffers: One for execution, and two for context-switch
> (store and restore).
> 
> For each CB, the user specifies on which queue to put that CB. In case of
> an internal queue, the entry doesn't contain a pointer to the CB but the
> address in the on-chip memory that the CB resides at.
> 
> The driver parses some of the CBs to enforce security restrictions.
> 
> The user receives a sequence number that represents the CS object. The user
> can then query the driver regarding the status of the CS, using that
> sequence number.
> 
> In case the CS doesn't finish before the timeout expires, the driver will
> perform a soft-reset of the device.
> 
> Signed-off-by: Oded Gabbay <oded.gabbay@...il.com>
> ---
>  drivers/misc/habanalabs/Makefile             |    3 +-
>  drivers/misc/habanalabs/command_submission.c |  787 +++++++++++++
>  drivers/misc/habanalabs/context.c            |   52 +-
>  drivers/misc/habanalabs/device.c             |   16 +
>  drivers/misc/habanalabs/goya/goya.c          | 1082 ++++++++++++++++++
>  drivers/misc/habanalabs/habanalabs.h         |  274 +++++
>  drivers/misc/habanalabs/habanalabs_drv.c     |   23 +
>  drivers/misc/habanalabs/habanalabs_ioctl.c   |    4 +-
>  drivers/misc/habanalabs/hw_queue.c           |  250 ++++
>  drivers/misc/habanalabs/memory.c             |  200 ++++
>  include/uapi/misc/habanalabs.h               |  158 ++-
>  11 files changed, 2842 insertions(+), 7 deletions(-)
>  create mode 100644 drivers/misc/habanalabs/command_submission.c
>  create mode 100644 drivers/misc/habanalabs/memory.c
> 
> diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
> index b5607233d216..d2fd0e18b1eb 100644
> --- a/drivers/misc/habanalabs/Makefile
> +++ b/drivers/misc/habanalabs/Makefile
> @@ -5,7 +5,8 @@
>  obj-m	:= habanalabs.o
>  
>  habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
> -		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o
> +		command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
> +		command_submission.o
>  
>  include $(src)/goya/Makefile
>  habanalabs-y += $(HL_GOYA_FILES)
> diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
> new file mode 100644
> index 000000000000..0116c2262f17
> --- /dev/null
> +++ b/drivers/misc/habanalabs/command_submission.c
> @@ -0,0 +1,787 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright 2016-2018 HabanaLabs, Ltd.
> + * All Rights Reserved.
> + */
> +
> +#include <uapi/misc/habanalabs.h>
> +#include "habanalabs.h"
> +
> +#include <linux/sched/mm.h>
> +#include <linux/sched/task.h>
> +#include <linux/sched/signal.h>
> +#include <linux/wait.h>
> +#include <linux/mm.h>
> +#include <linux/highmem.h>

[ ... ]

> +static void cs_do_release(struct kref *ref)
> +{
> +	struct hl_cs *cs = container_of(ref, struct hl_cs,
> +						refcount);
> +	struct hl_device *hdev = cs->ctx->hdev;
> +	struct hl_cs_job *job, *tmp;
> +
> +	cs->completed = true;
> +
> +	/*
> +	 * Although if we reached here it means that all external jobs have
> +	 * finished, because each one of them took refcnt to CS, we still
> +	 * need to go over the internal jobs and free them. Otherwise, we
> +	 * will have leaked memory and what's worse, the CS object (and
> +	 * potentially the CTX object) could be released, while the JOB
> +	 * still holds a pointer to them (but no reference).
> +	 */
> +	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
> +		free_job(hdev, job);
> +
> +	/* We also need to update CI for internal queues */
> +	if (cs->submitted) {
> +		hl_int_hw_queue_update_ci(cs);
> +
> +		spin_lock(&hdev->hw_queues_mirror_lock);
> +		/* remove CS from hw_queues mirror list */
> +		list_del_init(&cs->mirror_node);
> +		spin_unlock(&hdev->hw_queues_mirror_lock);
> +
> +		/*
> +		 * Don't cancel TDR in case this CS was timedout because we
> +		 * might be running from the TDR context
> +		 */
> +		if ((!cs->timedout) &&
> +			(hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT)) {
> +			struct hl_cs *next;
> +
> +			if (cs->tdr_active)
> +				cancel_delayed_work_sync(&cs->work_tdr);
> +
> +			spin_lock(&hdev->hw_queues_mirror_lock);
> +			/* queue TDR for next CS */
> +			next = list_first_entry_or_null(
> +					&hdev->hw_queues_mirror_list,
> +					struct hl_cs, mirror_node);
> +			if ((next) && (!next->tdr_active)) {
> +				next->tdr_active = true;
> +				schedule_delayed_work(&next->work_tdr,
> +							hdev->timeout_jiffies);
> +				spin_unlock(&hdev->hw_queues_mirror_lock);
> +			} else {
> +				spin_unlock(&hdev->hw_queues_mirror_lock);
> +			}

'else' can be dropped, just move spin_unlock() outside the 'if'

> +		}
> +	}
> +
> +	hl_ctx_put(cs->ctx);
> +
> +	if (cs->timedout)
> +		dma_fence_set_error(cs->fence, -ETIMEDOUT);
> +	else if (cs->aborted)
> +		dma_fence_set_error(cs->fence, -EIO);
> +
> +	dma_fence_signal(cs->fence);
> +	dma_fence_put(cs->fence);
> +
> +	kfree(cs);
> +}

[ ... ]

> +static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
> +			struct hl_cs **cs_new)
> +{
> +	struct hl_dma_fence *fence;
> +	struct dma_fence *other = NULL;
> +	struct hl_cs *cs;
> +	int rc;
> +
> +	cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
> +	if (!cs)
> +		return -ENOMEM;

Does this ever run from a context that cannot use GFP_KERNEL?
This applies to other allocations below.

> +
> +	cs->ctx = ctx;
> +	cs->submitted = false;
> +	cs->completed = false;
> +	INIT_LIST_HEAD(&cs->job_list);
> +	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
> +	kref_init(&cs->refcount);
> +	spin_lock_init(&cs->job_lock);
> +
> +	fence = kmalloc(sizeof(*fence), GFP_ATOMIC);

kzalloc?

> +	if (!fence) {
> +		rc = -ENOMEM;
> +		goto free_cs;
> +	}
> +
> +	fence->hdev = hdev;
> +	spin_lock_init(&fence->lock);
> +	cs->fence = &fence->base_fence;
> +
> +	spin_lock(&ctx->cs_lock);
> +
> +	fence->cs_seq = ctx->cs_sequence;
> +	other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)];
> +	if ((other) && (!dma_fence_is_signaled(other))) {
> +		spin_unlock(&ctx->cs_lock);
> +		rc = -EAGAIN;
> +		goto free_fence;
> +	}
> +
> +	dma_fence_init(&fence->base_fence, &hl_fence_ops, &fence->lock,
> +			ctx->asid, ctx->cs_sequence);
> +
> +	cs->sequence = fence->cs_seq;
> +
> +	ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)] =
> +							&fence->base_fence;
> +	ctx->cs_sequence++;
> +
> +	dma_fence_get(&fence->base_fence);
> +
> +	dma_fence_put(other);
> +
> +	spin_unlock(&ctx->cs_lock);
> +
> +	*cs_new = cs;
> +
> +	return 0;
> +
> +free_fence:
> +	kfree(fence);
> +free_cs:
> +	kfree(cs);
> +	return rc;
> +}
> +

[ ... ]

> +
> +static int goya_validate_cb(struct hl_device *hdev,
> +			struct hl_cs_parser *parser, bool is_mmu)
> +{
> +	u32 cb_parsed_length = 0;
> +	int rc = 0;
> +
> +	parser->patched_cb_size = 0;
> +
> +	/* cb_user_size is more than 0 so loop will always be executed */
> +	while ((cb_parsed_length < parser->user_cb_size) && (!rc)) {
> +		enum packet_id pkt_id;
> +		u16 pkt_size;
> +		void *user_pkt;
> +
> +		user_pkt = (void *) (parser->user_cb->kernel_address +
> +							cb_parsed_length);
> +
> +		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
> +				PACKET_HEADER_PACKET_ID_MASK) >>
> +					PACKET_HEADER_PACKET_ID_SHIFT);
> +
> +		pkt_size = goya_packet_sizes[pkt_id];
> +		cb_parsed_length += pkt_size;
> +		if (cb_parsed_length > parser->user_cb_size) {
> +			dev_err(hdev->dev,
> +				"packet 0x%x is out of CB boundary\n", pkt_id);
> +			rc = -EINVAL;
> +			continue;

For me !rc in the while statement was blind. Please consider break here and 

	if (!rc)
		break;

after the switch

> +		}
> +
> +		switch (pkt_id) {
> +		case PACKET_WREG_32:
> +			/*
> +			 * Although it is validated after copy in patch_cb(),
> +			 * need to validate here as well because patch_cb() is
> +			 * not called in MMU path while this function is called
> +			 */
> +			rc = goya_validate_wreg32(hdev, parser, user_pkt);
> +			break;
> +
> +		case PACKET_WREG_BULK:
> +			dev_err(hdev->dev,
> +				"User not allowed to use WREG_BULK\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_MSG_PROT:
> +			dev_err(hdev->dev,
> +				"User not allowed to use MSG_PROT\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_CP_DMA:
> +			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_STOP:
> +			dev_err(hdev->dev, "User not allowed to use STOP\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_LIN_DMA:
> +			if (is_mmu)
> +				rc = goya_validate_dma_pkt_mmu(hdev, parser,
> +						user_pkt);
> +			else
> +				rc = goya_validate_dma_pkt_no_mmu(hdev, parser,
> +						user_pkt);
> +			break;
> +
> +		case PACKET_MSG_LONG:
> +		case PACKET_MSG_SHORT:
> +		case PACKET_FENCE:
> +		case PACKET_NOP:
> +			parser->patched_cb_size += pkt_size;
> +			break;
> +
> +		default:
> +			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
> +				pkt_id);
> +			rc = -EINVAL;
> +			break;
> +		}
> +	}
> +
> +	/*
> +	 * The new CB should have space at the end for two MSG_PROT packets:
> +	 * 1. A packet that will act as a completion packet
> +	 * 2. A packet that will generate MSI-X interrupt
> +	 */
> +	parser->patched_cb_size += sizeof(struct packet_msg_prot) * 2;
> +
> +	return rc;
> +}

[ ... ]

> +static int goya_patch_cb(struct hl_device *hdev,
> +				struct hl_cs_parser *parser)
> +{
> +	u32 cb_parsed_length = 0;
> +	u32 cb_patched_cur_length = 0;
> +	int rc = 0;
> +
> +	/* cb_user_size is more than 0 so loop will always be executed */
> +	while ((cb_parsed_length < parser->user_cb_size) && (!rc)) {
> +		enum packet_id pkt_id;
> +		u16 pkt_size;
> +		u32 new_pkt_size = 0;
> +		void *user_pkt, *kernel_pkt;
> +
> +		user_pkt = (void *) (parser->user_cb->kernel_address +
> +							cb_parsed_length);
> +		kernel_pkt = (void *) (parser->patched_cb->kernel_address +
> +							cb_patched_cur_length);
> +
> +		pkt_id = (enum packet_id) (((*(u64 *) user_pkt) &
> +				PACKET_HEADER_PACKET_ID_MASK) >>
> +					PACKET_HEADER_PACKET_ID_SHIFT);
> +
> +		pkt_size = goya_packet_sizes[pkt_id];
> +		cb_parsed_length += pkt_size;
> +		if (cb_parsed_length > parser->user_cb_size) {
> +			dev_err(hdev->dev,
> +				"packet 0x%x is out of CB boundary\n", pkt_id);
> +			rc = -EINVAL;
> +			continue;

Ditto

> +		}
> +
> +		switch (pkt_id) {
> +		case PACKET_LIN_DMA:
> +			rc = goya_patch_dma_packet(hdev, parser, user_pkt,
> +						kernel_pkt, &new_pkt_size);
> +			cb_patched_cur_length += new_pkt_size;
> +			break;
> +
> +		case PACKET_WREG_32:
> +			memcpy(kernel_pkt, user_pkt, pkt_size);
> +			cb_patched_cur_length += pkt_size;
> +			rc = goya_validate_wreg32(hdev, parser, kernel_pkt);
> +			break;
> +
> +		case PACKET_WREG_BULK:
> +			dev_err(hdev->dev,
> +				"User not allowed to use WREG_BULK\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_MSG_PROT:
> +			dev_err(hdev->dev,
> +				"User not allowed to use MSG_PROT\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_CP_DMA:
> +			dev_err(hdev->dev, "User not allowed to use CP_DMA\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_STOP:
> +			dev_err(hdev->dev, "User not allowed to use STOP\n");
> +			rc = -EPERM;
> +			break;
> +
> +		case PACKET_MSG_LONG:
> +		case PACKET_MSG_SHORT:
> +		case PACKET_FENCE:
> +		case PACKET_NOP:
> +			memcpy(kernel_pkt, user_pkt, pkt_size);
> +			cb_patched_cur_length += pkt_size;
> +			break;
> +
> +		default:
> +			dev_err(hdev->dev, "Invalid packet header 0x%x\n",
> +				pkt_id);
> +			rc = -EINVAL;
> +			break;
> +		}
> +	}
> +
> +	return rc;
> +}

[ ... ]

>  static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id,
>  		u16 event_type, char *axi_name, int len)
>  {
> @@ -4645,6 +5677,48 @@ static void goya_disable_clock_gating(struct hl_device *hdev)
>  
>  }
>  
> +static bool goya_is_device_idle(struct hl_device *hdev)
> +{
> +	u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg;
> +	bool val = true;
> +	int i;
> +
> +	offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0;
> +
> +	for (i = 0 ; i < DMA_MAX_NUM ; i++) {
> +		dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset;
> +
> +		val = val && ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) ==
> +				DMA_QM_IDLE_MASK);
> +	}
> +
> +	offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0;
> +
> +	for (i = 0 ; i < TPC_MAX_NUM ; i++) {
> +		tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset;
> +		tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset;
> +		tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset;
> +
> +		val = val && ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) ==
> +				TPC_QM_IDLE_MASK);
> +		val = val && ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) ==
> +				TPC_CMDQ_IDLE_MASK);
> +		val = val && ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) ==
> +				TPC_CFG_IDLE_MASK);
> +	}
> +
> +	val = val && ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) ==
> +			MME_QM_IDLE_MASK);
> +	val = val && ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) ==
> +			MME_CMDQ_IDLE_MASK);
> +	val = val && ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) ==
> +			MME_ARCH_IDLE_MASK);
> +	val = val && ((RREG32(mmMME_SHADOW_0_STATUS) & MME_SHADOW_IDLE_MASK) ==
> +			0);

Huh, these are neat, but IMHO plain

	if ((RREG(reg) & mask) != mask)
		return false;

are more readable...

> +
> +	return val;
> +}
> +

[ ... ]

-- 
Sincerely yours,
Mike.