Message-ID: <8022345d676ec5869ef3c7f4a6796869f597ffaf.camel@collabora.com>
Date: Fri, 29 Aug 2025 14:01:44 -0400
From: Nicolas Dufresne <nicolas.dufresne@...labora.com>
To: "Jackson.lee" <jackson.lee@...psnmedia.com>, mchehab@...nel.org,
hverkuil-cisco@...all.nl, bob.beckett@...labora.com
Cc: linux-media@...r.kernel.org, linux-kernel@...r.kernel.org,
lafley.kim@...psnmedia.com, b-brnich@...com, hverkuil@...all.nl,
nas.chung@...psnmedia.com
Subject: Re: [PATCH v3 4/4] media: chips-media: wave5: Improve performance
of decoder
On Monday, 23 June 2025 at 09:21 +0900, Jackson.lee wrote:
> From: Jackson Lee <jackson.lee@...psnmedia.com>
>
> The current decoding method was to wait until each frame had been
> decoded after feeding a bitstream. As a result, performance was low
> and Wave5 could not achieve its maximum pixel processing rate.
>
> Update the driver to use an asynchronous approach for decoding and
> feeding the bitstream, in order to achieve the full capabilities of
> the device.
>
> WAVE5 supports command queueing to maximize performance by pipelining
> internal commands and by hiding the wait cycles taken to receive a
> command from the host processor.
>
> Instead of waiting for each command to be executed before sending the
> next one, the host processor simply places all the commands in the
> command queue and goes on doing other things while the commands in the
> queue are processed by the VPU.
>
> While the host processor handles its own tasks, it can receive a VPU
> interrupt request (IRQ).
> In this case, the host processor can simply exit the interrupt service
> routine (ISR) without accessing the host interface to read the result
> of the command reported by the VPU.
> After the host processor has completed its tasks, it can read the
> command result whenever it needs the report and do the response
> processing.
>
> To achieve this, device_run() calls v4l2_m2m_job_finish() so that the
> next command can be sent to the VPU continuously. Whenever a result is
> available, an IRQ is triggered, and the decoded frames are fetched and
> returned to the upper layer.
> These processes work independently of each other, without waiting for
> a decoded frame.
>
> Signed-off-by: Jackson Lee <jackson.lee@...psnmedia.com>
> Signed-off-by: Nas Chung <nas.chung@...psnmedia.com>
> ---
> .../platform/chips-media/wave5/wave5-hw.c | 2 +-
> .../chips-media/wave5/wave5-vpu-dec.c | 131 ++++++++++++------
> .../platform/chips-media/wave5/wave5-vpuapi.c | 22 ++-
> .../platform/chips-media/wave5/wave5-vpuapi.h | 5 +
> .../chips-media/wave5/wave5-vpuconfig.h | 1 +
> 5 files changed, 119 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/media/platform/chips-media/wave5/wave5-hw.c b/drivers/media/platform/chips-media/wave5/wave5-hw.c
> index d94cf84c3ee5..687ce6ccf3ae 100644
> --- a/drivers/media/platform/chips-media/wave5/wave5-hw.c
> +++ b/drivers/media/platform/chips-media/wave5/wave5-hw.c
> @@ -102,7 +102,7 @@ static void _wave5_print_reg_err(struct vpu_device *vpu_dev, u32 reg_fail_reason
> dev_dbg(dev, "%s: queueing failure: 0x%x\n", func, reg_val);
> break;
> case WAVE5_SYSERR_RESULT_NOT_READY:
> - dev_err(dev, "%s: result not ready: 0x%x\n", func, reg_fail_reason);
> + dev_dbg(dev, "%s: result not ready: 0x%x\n", func, reg_fail_reason);
> break;
> case WAVE5_SYSERR_ACCESS_VIOLATION_HW:
> dev_err(dev, "%s: access violation: 0x%x\n", func, reg_fail_reason);
> diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
> index 2df7668575f4..4554a24df8a1 100644
> --- a/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
> +++ b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
> @@ -268,6 +268,7 @@ static void send_eos_event(struct vpu_instance *inst)
>
> v4l2_event_queue_fh(&inst->v4l2_fh, &vpu_event_eos);
> inst->eos = false;
> + inst->sent_eos = true;
> }
>
> static int handle_dynamic_resolution_change(struct vpu_instance *inst)
> @@ -347,13 +348,12 @@ static void wave5_vpu_dec_finish_decode(struct vpu_instance *inst)
> struct vb2_v4l2_buffer *dec_buf = NULL;
> struct vb2_v4l2_buffer *disp_buf = NULL;
> struct vb2_queue *dst_vq = v4l2_m2m_get_dst_vq(m2m_ctx);
> - struct queue_status_info q_status;
>
> dev_dbg(inst->dev->dev, "%s: Fetch output info from firmware.", __func__);
>
> ret = wave5_vpu_dec_get_output_info(inst, &dec_info);
> if (ret) {
> - dev_warn(inst->dev->dev, "%s: could not get output info.", __func__);
> + dev_dbg(inst->dev->dev, "%s: could not get output info.", __func__);
> v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
> return;
> }
> @@ -442,18 +442,14 @@ static void wave5_vpu_dec_finish_decode(struct vpu_instance *inst)
> spin_unlock_irqrestore(&inst->state_spinlock, flags);
> }
>
> - /*
> - * During a resolution change and while draining, the firmware may flush
> - * the reorder queue regardless of having a matching decoding operation
> - * pending. Only terminate the job if there are no more IRQ coming.
> - */
> - wave5_vpu_dec_give_command(inst, DEC_GET_QUEUE_STATUS, &q_status);
> - if (q_status.report_queue_count == 0 &&
> - (q_status.instance_queue_count == 0 || dec_info.sequence_changed)) {
> - dev_dbg(inst->dev->dev, "%s: finishing job.\n", __func__);
> - pm_runtime_mark_last_busy(inst->dev->dev);
> - pm_runtime_put_autosuspend(inst->dev->dev);
> - v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
> + if (inst->sent_eos &&
> + v4l2_m2m_get_curr_priv(inst->v4l2_m2m_dev)) {
> + struct queue_status_info q_status;
> +
> + wave5_vpu_dec_give_command(inst, DEC_GET_QUEUE_STATUS, &q_status);
> + if (q_status.report_queue_count == 0 &&
> + q_status.instance_queue_count == 0)
> + v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
> }
> }
>
> @@ -1146,8 +1142,8 @@ static int write_to_ringbuffer(struct vpu_instance *inst, void *buffer, size_t b
> static int fill_ringbuffer(struct vpu_instance *inst)
> {
> struct v4l2_m2m_ctx *m2m_ctx = inst->v4l2_fh.m2m_ctx;
> - struct v4l2_m2m_buffer *buf, *n;
> - int ret;
> + struct vpu_src_buffer *vpu_buf;
> + int ret = 0;
>
> if (m2m_ctx->last_src_buf) {
> struct vpu_src_buffer *vpu_buf = wave5_to_vpu_src_buf(m2m_ctx->last_src_buf);
> @@ -1158,9 +1154,8 @@ static int fill_ringbuffer(struct vpu_instance *inst)
> }
> }
>
> - v4l2_m2m_for_each_src_buf_safe(m2m_ctx, buf, n) {
> - struct vb2_v4l2_buffer *vbuf = &buf->vb;
> - struct vpu_src_buffer *vpu_buf = wave5_to_vpu_src_buf(vbuf);
> + list_for_each_entry(vpu_buf, &inst->avail_src_bufs, list) {
> + struct vb2_v4l2_buffer *vbuf = &vpu_buf->v4l2_m2m_buf.vb;
> struct vpu_buf *ring_buffer = &inst->bitstream_vbuf;
> size_t src_size = vb2_get_plane_payload(&vbuf->vb2_buf, 0);
> void *src_buf = vb2_plane_vaddr(&vbuf->vb2_buf, 0);
> @@ -1220,9 +1215,13 @@ static int fill_ringbuffer(struct vpu_instance *inst)
> dev_dbg(inst->dev->dev, "last src buffer written to the ring buffer\n");
> break;
> }
> +
> + inst->queuing_num++;
> + list_del_init(&vpu_buf->list);
> + break;
> }
>
> - return 0;
> + return ret;
> }
>
> static void wave5_vpu_dec_buf_queue_src(struct vb2_buffer *vb)
> @@ -1236,6 +1235,11 @@ static void wave5_vpu_dec_buf_queue_src(struct vb2_buffer *vb)
> vbuf->sequence = inst->queued_src_buf_num++;
>
> v4l2_m2m_buf_queue(m2m_ctx, vbuf);
> +
> + INIT_LIST_HEAD(&vpu_buf->list);
> + mutex_lock(&inst->feed_lock);
> + list_add_tail(&vpu_buf->list, &inst->avail_src_bufs);
> + mutex_unlock(&inst->feed_lock);
This needs a comment. Also, why not do everything under the mutex?
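Something along these lines (untested, just to illustrate), so the list entry
is initialized and added under the same lock that fill_ringbuffer() takes while
walking the list:

	mutex_lock(&inst->feed_lock);
	INIT_LIST_HEAD(&vpu_buf->list);
	list_add_tail(&vpu_buf->list, &inst->avail_src_bufs);
	mutex_unlock(&inst->feed_lock);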
> }
>
> static void wave5_vpu_dec_buf_queue_dst(struct vb2_buffer *vb)
> @@ -1287,10 +1291,13 @@ static void wave5_vpu_dec_buf_queue(struct vb2_buffer *vb)
> __func__, vb->type, vb->index, vb2_plane_size(&vbuf->vb2_buf, 0),
> vb2_plane_size(&vbuf->vb2_buf, 1), vb2_plane_size(&vbuf->vb2_buf, 2));
>
> - if (vb->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE)
> + if (vb->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) {
> + if (inst->empty_queue)
> + inst->empty_queue = false;
> wave5_vpu_dec_buf_queue_src(vb);
> - else if (vb->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE)
> + } else if (vb->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) {
> wave5_vpu_dec_buf_queue_dst(vb);
> + }
> }
>
> static int wave5_vpu_dec_allocate_ring_buffer(struct vpu_instance *inst)
> @@ -1385,6 +1392,13 @@ static int streamoff_output(struct vb2_queue *q)
> dma_addr_t new_rd_ptr;
> struct dec_output_info dec_info;
> unsigned int i;
> + struct vpu_src_buffer *vpu_buf, *tmp;
> +
> + inst->retry = false;
> + inst->queuing_num = 0;
> +
> + list_for_each_entry_safe(vpu_buf, tmp, &inst->avail_src_bufs, list)
> + list_del_init(&vpu_buf->list);
>
> for (i = 0; i < v4l2_m2m_num_dst_bufs_ready(m2m_ctx); i++) {
> ret = wave5_vpu_dec_set_disp_flag(inst, i);
> @@ -1474,6 +1488,8 @@ static void wave5_vpu_dec_stop_streaming(struct vb2_queue *q)
>
> dev_dbg(inst->dev->dev, "%s: type: %u\n", __func__, q->type);
> pm_runtime_resume_and_get(inst->dev->dev);
> + inst->empty_queue = false;
> + inst->sent_eos = false;
>
> while (check_cmd) {
> struct queue_status_info q_status;
> @@ -1481,11 +1497,11 @@ static void wave5_vpu_dec_stop_streaming(struct vb2_queue *q)
>
> wave5_vpu_dec_give_command(inst, DEC_GET_QUEUE_STATUS, &q_status);
>
> - if (q_status.report_queue_count == 0)
> + if ((inst->state == VPU_INST_STATE_STOP || q_status.instance_queue_count == 0) &&
> + q_status.report_queue_count == 0)
> break;
>
> - if (wave5_vpu_wait_interrupt(inst, VPU_DEC_TIMEOUT) < 0)
> - break;
> + wave5_vpu_wait_interrupt(inst, VPU_DEC_STOP_TIMEOUT);
>
> if (wave5_vpu_dec_get_output_info(inst, &dec_output_info))
> dev_dbg(inst->dev->dev, "there is no output info\n");
> @@ -1577,13 +1593,24 @@ static void wave5_vpu_dec_device_run(void *priv)
> struct queue_status_info q_status;
> u32 fail_res = 0;
> int ret = 0;
> + unsigned long flags;
>
> dev_dbg(inst->dev->dev, "%s: Fill the ring buffer with new bitstream data", __func__);
> pm_runtime_resume_and_get(inst->dev->dev);
> - ret = fill_ringbuffer(inst);
> - if (ret) {
> - dev_warn(inst->dev->dev, "Filling ring buffer failed\n");
> - goto finish_job_and_return;
> + if (!inst->retry) {
> + mutex_lock(&inst->feed_lock);
> + ret = fill_ringbuffer(inst);
> + mutex_unlock(&inst->feed_lock);
> + if (ret < 0) {
> + dev_warn(inst->dev->dev, "Filling ring buffer failed\n");
> + goto finish_job_and_return;
> + } else if (!inst->eos &&
> + inst->queuing_num == 0 &&
> + inst->state == VPU_INST_STATE_PIC_RUN) {
> + dev_dbg(inst->dev->dev, "%s: no bitstream for feeding, so skip ", __func__);
> + inst->empty_queue = true;
> + goto finish_job_and_return;
> + }
> }
>
> switch (inst->state) {
> @@ -1608,7 +1635,9 @@ static void wave5_vpu_dec_device_run(void *priv)
> }
> spin_unlock_irqrestore(&inst->state_spinlock, flags);
> } else {
> + spin_lock_irqsave(&inst->state_spinlock, flags);
If you are only going to take the lock around each state switch, just do that
inside the helper function; that will reduce the footprint. Also, consider
using guard().
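Rough idea (untested), assuming switch_state() keeps its current signature; the
body here is just a placeholder for whatever checks the helper already does:

	static int switch_state(struct vpu_instance *inst, enum vpu_instance_state state)
	{
		guard(spinlock_irqsave)(&inst->state_spinlock);

		/* existing transition checks stay as they are */
		inst->state = state;
		return 0;	/* the lock is dropped automatically on return */
	}

With that, all the open-coded spin_lock_irqsave()/spin_unlock_irqrestore()
pairs around switch_state() in this patch go away.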
> switch_state(inst, VPU_INST_STATE_INIT_SEQ);
> + spin_unlock_irqrestore(&inst->state_spinlock, flags);
> }
>
> break;
> @@ -1619,8 +1648,9 @@ static void wave5_vpu_dec_device_run(void *priv)
> * we had a chance to switch, which leads to an invalid state
> * change.
> */
> + spin_lock_irqsave(&inst->state_spinlock, flags);
> switch_state(inst, VPU_INST_STATE_PIC_RUN);
> -
> + spin_unlock_irqrestore(&inst->state_spinlock, flags);
> /*
> * During DRC, the picture decoding remains pending, so just leave the job
> * active until this decode operation completes.
> @@ -1634,12 +1664,14 @@ static void wave5_vpu_dec_device_run(void *priv)
> ret = wave5_prepare_fb(inst);
> if (ret) {
> dev_warn(inst->dev->dev, "Framebuffer preparation, fail: %d\n", ret);
> + spin_lock_irqsave(&inst->state_spinlock, flags);
> switch_state(inst, VPU_INST_STATE_STOP);
> + spin_unlock_irqrestore(&inst->state_spinlock, flags);
> break;
> }
>
> if (q_status.instance_queue_count) {
> - dev_dbg(inst->dev->dev, "%s: leave with active job", __func__);
> + v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
> return;
> }
>
> @@ -1650,26 +1682,42 @@ static void wave5_vpu_dec_device_run(void *priv)
> dev_err(inst->dev->dev,
> "Frame decoding on m2m context (%p), fail: %d (result: %d)\n",
> m2m_ctx, ret, fail_res);
> - break;
> + goto finish_job_and_return;
> + }
> +
> + if (fail_res == WAVE5_SYSERR_QUEUEING_FAIL) {
> + inst->retry = true;
> + } else {
> + inst->retry = false;
> + if (!inst->eos)
> + inst->queuing_num--;
> }
> - /* Return so that we leave this job active */
> - dev_dbg(inst->dev->dev, "%s: leave with active job", __func__);
> - return;
> - default:
> - WARN(1, "Execution of a job in state %s illegal.\n", state_to_str(inst->state));
> break;
> + default:
> + dev_dbg(inst->dev->dev, "Execution of a job in state %s illegal.\n",
> + state_to_str(inst->state));
> }
>
> finish_job_and_return:
> dev_dbg(inst->dev->dev, "%s: leave and finish job", __func__);
> pm_runtime_mark_last_busy(inst->dev->dev);
> pm_runtime_put_autosuspend(inst->dev->dev);
> - v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
> +	/*
> +	 * After receiving CMD_STOP there is no more input, but device_run() still
> +	 * has to issue DEC_PIC commands until the display index becomes -1. To
> +	 * achieve this, job_finish used to be called on every device_run(), which
> +	 * was very wasteful in power and CPU time.
> +	 * Once EOS has been sent, device_run() no longer calls job_finish; it is
> +	 * only called when the hardware is idle, to reduce this overhead.
> +	 */
> + if (!inst->sent_eos)
> + v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
> }
>
> static void wave5_vpu_dec_job_abort(void *priv)
> {
> struct vpu_instance *inst = priv;
> + struct v4l2_m2m_ctx *m2m_ctx = inst->v4l2_fh.m2m_ctx;
> int ret;
>
> ret = switch_state(inst, VPU_INST_STATE_STOP);
> @@ -1680,6 +1728,8 @@ static void wave5_vpu_dec_job_abort(void *priv)
> if (ret)
> dev_warn(inst->dev->dev,
> "Setting EOS for the bitstream, fail: %d\n", ret);
> +
> + v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
> }
>
> static int wave5_vpu_dec_job_ready(void *priv)
> @@ -1715,7 +1765,8 @@ static int wave5_vpu_dec_job_ready(void *priv)
> "No capture buffer ready to decode!\n");
> break;
> } else if (!wave5_is_draining_or_eos(inst) &&
> - !v4l2_m2m_num_src_bufs_ready(m2m_ctx)) {
> + (!v4l2_m2m_num_src_bufs_ready(m2m_ctx) ||
> + inst->empty_queue)) {
> dev_dbg(inst->dev->dev,
> "No bitstream data to decode!\n");
> break;
> @@ -1755,6 +1806,8 @@ static int wave5_vpu_open_dec(struct file *filp)
> inst->ops = &wave5_vpu_dec_inst_ops;
>
> spin_lock_init(&inst->state_spinlock);
> + mutex_init(&inst->feed_lock);
> + INIT_LIST_HEAD(&inst->avail_src_bufs);
>
> inst->codec_info = kzalloc(sizeof(*inst->codec_info), GFP_KERNEL);
> if (!inst->codec_info)
> diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.c b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.c
> index 5b10f9f49b9f..edbe69540ef1 100644
> --- a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.c
> +++ b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.c
> @@ -207,6 +207,7 @@ int wave5_vpu_dec_close(struct vpu_instance *inst, u32 *fail_res)
> int retry = 0;
> struct vpu_device *vpu_dev = inst->dev;
> int i;
> + struct dec_output_info dec_info;
>
> *fail_res = 0;
> if (!inst->codec_info)
> @@ -227,11 +228,26 @@ int wave5_vpu_dec_close(struct vpu_instance *inst, u32 *fail_res)
> goto unlock_and_return;
> }
>
> - if (*fail_res == WAVE5_SYSERR_VPU_STILL_RUNNING &&
> - retry++ >= MAX_FIRMWARE_CALL_RETRY) {
> + if (ret == 0)
> + break;
> +
> + if (*fail_res != WAVE5_SYSERR_VPU_STILL_RUNNING) {
> + dev_warn(inst->dev->dev, "dec_finish_seq timed out\n");
> + goto unlock_and_return;
> + }
> +
> + if (retry++ >= MAX_FIRMWARE_CALL_RETRY) {
> ret = -ETIMEDOUT;
> goto unlock_and_return;
> }
> +
> + mutex_unlock(&vpu_dev->hw_lock);
> + wave5_vpu_dec_get_output_info(inst, &dec_info);
> + ret = mutex_lock_interruptible(&vpu_dev->hw_lock);
> + if (ret) {
> + pm_runtime_put_sync(inst->dev->dev);
> + return ret;
> + }
> } while (ret != 0);
>
> dev_dbg(inst->dev->dev, "%s: dec_finish_seq complete\n", __func__);
> @@ -248,6 +264,8 @@ int wave5_vpu_dec_close(struct vpu_instance *inst, u32 *fail_res)
>
> wave5_vdi_free_dma_memory(vpu_dev, &p_dec_info->vb_task);
>
> + mutex_destroy(&inst->feed_lock);
> +
> unlock_and_return:
> mutex_unlock(&vpu_dev->hw_lock);
> pm_runtime_put_sync(inst->dev->dev);
> diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
> index bc101397204d..adfbc104f939 100644
> --- a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
> +++ b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
> @@ -818,6 +818,11 @@ struct vpu_instance {
> bool cbcr_interleave;
> bool nv21;
> bool eos;
> +	bool sent_eos; /* set once the EOS event has been sent to the application */
> +	bool retry; /* retry feeding the bitstream if the failure reason is WAVE5_SYSERR_QUEUEING_FAIL */
> +	int queuing_num; /* number of bitstream buffers queued */
> +	struct mutex feed_lock; /* lock for feeding bitstream buffers */
> +	bool empty_queue;
So my overall impression is that most of this change makes sense if you really
don't want to re-architect the VPU library part. I left a minor remark that
needs addressing; the crash and the deadlock found during testing also need to
be fixed.
regards,
Nicolas
> struct vpu_buf bitstream_vbuf;
> dma_addr_t last_rd_ptr;
> size_t remaining_consumed_bytes;
> diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpuconfig.h b/drivers/media/platform/chips-media/wave5/wave5-vpuconfig.h
> index 1ea9f5f31499..4ebd48d5550e 100644
> --- a/drivers/media/platform/chips-media/wave5/wave5-vpuconfig.h
> +++ b/drivers/media/platform/chips-media/wave5/wave5-vpuconfig.h
> @@ -59,6 +59,7 @@
> // application specific configuration
> #define VPU_ENC_TIMEOUT 60000
> #define VPU_DEC_TIMEOUT 60000
> +#define VPU_DEC_STOP_TIMEOUT 10
>
> // for WAVE encoder
> #define USE_SRC_PRP_AXI 0