[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20250623002153.51-5-jackson.lee@chipsnmedia.com>
Date: Mon, 23 Jun 2025 09:21:53 +0900
From: "Jackson.lee" <jackson.lee@...psnmedia.com>
To: mchehab@...nel.org,
hverkuil-cisco@...all.nl,
nicolas.dufresne@...labora.com,
bob.beckett@...labora.com
Cc: linux-media@...r.kernel.org,
linux-kernel@...r.kernel.org,
jackson.lee@...psnmedia.com,
lafley.kim@...psnmedia.com,
b-brnich@...com,
hverkuil@...all.nl,
nas.chung@...psnmedia.com
Subject: [PATCH v3 4/4] media: chips-media: wave5: Improve performance of decoder
From: Jackson Lee <jackson.lee@...psnmedia.com>
The current decoding method waits until each frame is decoded after
feeding a bitstream. As a result, performance is low and Wave5 cannot
achieve its maximum pixel processing rate.
Update driver to use an asynchronous approach for decoding and feeding a
bitstream in order to achieve full capabilities of the device.
WAVE5 supports command-queueing to maximize performance by pipelining
internal commands and by hiding the wait cycles taken to receive a command
from the host processor.
Instead of waiting for each command to be executed before sending the
next command, the host processor just places all the commands in the
command-queue and goes on doing other things while the commands in the
queue are processed by the VPU.
While the host processor handles its own tasks, it can receive a VPU
interrupt request (IRQ).
In this case, the host processor can simply exit the interrupt service
routine (ISR) without accessing the host interface to read the result of
the command reported by the VPU.
After the host processor has completed its tasks, it can read the
command result whenever it needs the report and then do the response
processing.
To achieve this goal, device_run() calls v4l2_m2m_job_finish() so that
the next command can be sent to the VPU continuously. If there is any
result, an IRQ is triggered, and the decoded frames are fetched and
returned to the upper layer.
These processes work independently of each other without waiting for
a decoded frame.
Signed-off-by: Jackson Lee <jackson.lee@...psnmedia.com>
Signed-off-by: Nas Chung <nas.chung@...psnmedia.com>
---
.../platform/chips-media/wave5/wave5-hw.c | 2 +-
.../chips-media/wave5/wave5-vpu-dec.c | 131 ++++++++++++------
.../platform/chips-media/wave5/wave5-vpuapi.c | 22 ++-
.../platform/chips-media/wave5/wave5-vpuapi.h | 5 +
.../chips-media/wave5/wave5-vpuconfig.h | 1 +
5 files changed, 119 insertions(+), 42 deletions(-)
diff --git a/drivers/media/platform/chips-media/wave5/wave5-hw.c b/drivers/media/platform/chips-media/wave5/wave5-hw.c
index d94cf84c3ee5..687ce6ccf3ae 100644
--- a/drivers/media/platform/chips-media/wave5/wave5-hw.c
+++ b/drivers/media/platform/chips-media/wave5/wave5-hw.c
@@ -102,7 +102,7 @@ static void _wave5_print_reg_err(struct vpu_device *vpu_dev, u32 reg_fail_reason
dev_dbg(dev, "%s: queueing failure: 0x%x\n", func, reg_val);
break;
case WAVE5_SYSERR_RESULT_NOT_READY:
- dev_err(dev, "%s: result not ready: 0x%x\n", func, reg_fail_reason);
+ dev_dbg(dev, "%s: result not ready: 0x%x\n", func, reg_fail_reason);
break;
case WAVE5_SYSERR_ACCESS_VIOLATION_HW:
dev_err(dev, "%s: access violation: 0x%x\n", func, reg_fail_reason);
diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
index 2df7668575f4..4554a24df8a1 100644
--- a/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
+++ b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
@@ -268,6 +268,7 @@ static void send_eos_event(struct vpu_instance *inst)
v4l2_event_queue_fh(&inst->v4l2_fh, &vpu_event_eos);
inst->eos = false;
+ inst->sent_eos = true;
}
static int handle_dynamic_resolution_change(struct vpu_instance *inst)
@@ -347,13 +348,12 @@ static void wave5_vpu_dec_finish_decode(struct vpu_instance *inst)
struct vb2_v4l2_buffer *dec_buf = NULL;
struct vb2_v4l2_buffer *disp_buf = NULL;
struct vb2_queue *dst_vq = v4l2_m2m_get_dst_vq(m2m_ctx);
- struct queue_status_info q_status;
dev_dbg(inst->dev->dev, "%s: Fetch output info from firmware.", __func__);
ret = wave5_vpu_dec_get_output_info(inst, &dec_info);
if (ret) {
- dev_warn(inst->dev->dev, "%s: could not get output info.", __func__);
+ dev_dbg(inst->dev->dev, "%s: could not get output info.", __func__);
v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
return;
}
@@ -442,18 +442,14 @@ static void wave5_vpu_dec_finish_decode(struct vpu_instance *inst)
spin_unlock_irqrestore(&inst->state_spinlock, flags);
}
- /*
- * During a resolution change and while draining, the firmware may flush
- * the reorder queue regardless of having a matching decoding operation
- * pending. Only terminate the job if there are no more IRQ coming.
- */
- wave5_vpu_dec_give_command(inst, DEC_GET_QUEUE_STATUS, &q_status);
- if (q_status.report_queue_count == 0 &&
- (q_status.instance_queue_count == 0 || dec_info.sequence_changed)) {
- dev_dbg(inst->dev->dev, "%s: finishing job.\n", __func__);
- pm_runtime_mark_last_busy(inst->dev->dev);
- pm_runtime_put_autosuspend(inst->dev->dev);
- v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
+ if (inst->sent_eos &&
+ v4l2_m2m_get_curr_priv(inst->v4l2_m2m_dev)) {
+ struct queue_status_info q_status;
+
+ wave5_vpu_dec_give_command(inst, DEC_GET_QUEUE_STATUS, &q_status);
+ if (q_status.report_queue_count == 0 &&
+ q_status.instance_queue_count == 0)
+ v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
}
}
@@ -1146,8 +1142,8 @@ static int write_to_ringbuffer(struct vpu_instance *inst, void *buffer, size_t b
static int fill_ringbuffer(struct vpu_instance *inst)
{
struct v4l2_m2m_ctx *m2m_ctx = inst->v4l2_fh.m2m_ctx;
- struct v4l2_m2m_buffer *buf, *n;
- int ret;
+ struct vpu_src_buffer *vpu_buf;
+ int ret = 0;
if (m2m_ctx->last_src_buf) {
struct vpu_src_buffer *vpu_buf = wave5_to_vpu_src_buf(m2m_ctx->last_src_buf);
@@ -1158,9 +1154,8 @@ static int fill_ringbuffer(struct vpu_instance *inst)
}
}
- v4l2_m2m_for_each_src_buf_safe(m2m_ctx, buf, n) {
- struct vb2_v4l2_buffer *vbuf = &buf->vb;
- struct vpu_src_buffer *vpu_buf = wave5_to_vpu_src_buf(vbuf);
+ list_for_each_entry(vpu_buf, &inst->avail_src_bufs, list) {
+ struct vb2_v4l2_buffer *vbuf = &vpu_buf->v4l2_m2m_buf.vb;
struct vpu_buf *ring_buffer = &inst->bitstream_vbuf;
size_t src_size = vb2_get_plane_payload(&vbuf->vb2_buf, 0);
void *src_buf = vb2_plane_vaddr(&vbuf->vb2_buf, 0);
@@ -1220,9 +1215,13 @@ static int fill_ringbuffer(struct vpu_instance *inst)
dev_dbg(inst->dev->dev, "last src buffer written to the ring buffer\n");
break;
}
+
+ inst->queuing_num++;
+ list_del_init(&vpu_buf->list);
+ break;
}
- return 0;
+ return ret;
}
static void wave5_vpu_dec_buf_queue_src(struct vb2_buffer *vb)
@@ -1236,6 +1235,11 @@ static void wave5_vpu_dec_buf_queue_src(struct vb2_buffer *vb)
vbuf->sequence = inst->queued_src_buf_num++;
v4l2_m2m_buf_queue(m2m_ctx, vbuf);
+
+ INIT_LIST_HEAD(&vpu_buf->list);
+ mutex_lock(&inst->feed_lock);
+ list_add_tail(&vpu_buf->list, &inst->avail_src_bufs);
+ mutex_unlock(&inst->feed_lock);
}
static void wave5_vpu_dec_buf_queue_dst(struct vb2_buffer *vb)
@@ -1287,10 +1291,13 @@ static void wave5_vpu_dec_buf_queue(struct vb2_buffer *vb)
__func__, vb->type, vb->index, vb2_plane_size(&vbuf->vb2_buf, 0),
vb2_plane_size(&vbuf->vb2_buf, 1), vb2_plane_size(&vbuf->vb2_buf, 2));
- if (vb->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE)
+ if (vb->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) {
+ if (inst->empty_queue)
+ inst->empty_queue = false;
wave5_vpu_dec_buf_queue_src(vb);
- else if (vb->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE)
+ } else if (vb->type == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) {
wave5_vpu_dec_buf_queue_dst(vb);
+ }
}
static int wave5_vpu_dec_allocate_ring_buffer(struct vpu_instance *inst)
@@ -1385,6 +1392,13 @@ static int streamoff_output(struct vb2_queue *q)
dma_addr_t new_rd_ptr;
struct dec_output_info dec_info;
unsigned int i;
+ struct vpu_src_buffer *vpu_buf, *tmp;
+
+ inst->retry = false;
+ inst->queuing_num = 0;
+
+ list_for_each_entry_safe(vpu_buf, tmp, &inst->avail_src_bufs, list)
+ list_del_init(&vpu_buf->list);
for (i = 0; i < v4l2_m2m_num_dst_bufs_ready(m2m_ctx); i++) {
ret = wave5_vpu_dec_set_disp_flag(inst, i);
@@ -1474,6 +1488,8 @@ static void wave5_vpu_dec_stop_streaming(struct vb2_queue *q)
dev_dbg(inst->dev->dev, "%s: type: %u\n", __func__, q->type);
pm_runtime_resume_and_get(inst->dev->dev);
+ inst->empty_queue = false;
+ inst->sent_eos = false;
while (check_cmd) {
struct queue_status_info q_status;
@@ -1481,11 +1497,11 @@ static void wave5_vpu_dec_stop_streaming(struct vb2_queue *q)
wave5_vpu_dec_give_command(inst, DEC_GET_QUEUE_STATUS, &q_status);
- if (q_status.report_queue_count == 0)
+ if ((inst->state == VPU_INST_STATE_STOP || q_status.instance_queue_count == 0) &&
+ q_status.report_queue_count == 0)
break;
- if (wave5_vpu_wait_interrupt(inst, VPU_DEC_TIMEOUT) < 0)
- break;
+ wave5_vpu_wait_interrupt(inst, VPU_DEC_STOP_TIMEOUT);
if (wave5_vpu_dec_get_output_info(inst, &dec_output_info))
dev_dbg(inst->dev->dev, "there is no output info\n");
@@ -1577,13 +1593,24 @@ static void wave5_vpu_dec_device_run(void *priv)
struct queue_status_info q_status;
u32 fail_res = 0;
int ret = 0;
+ unsigned long flags;
dev_dbg(inst->dev->dev, "%s: Fill the ring buffer with new bitstream data", __func__);
pm_runtime_resume_and_get(inst->dev->dev);
- ret = fill_ringbuffer(inst);
- if (ret) {
- dev_warn(inst->dev->dev, "Filling ring buffer failed\n");
- goto finish_job_and_return;
+ if (!inst->retry) {
+ mutex_lock(&inst->feed_lock);
+ ret = fill_ringbuffer(inst);
+ mutex_unlock(&inst->feed_lock);
+ if (ret < 0) {
+ dev_warn(inst->dev->dev, "Filling ring buffer failed\n");
+ goto finish_job_and_return;
+ } else if (!inst->eos &&
+ inst->queuing_num == 0 &&
+ inst->state == VPU_INST_STATE_PIC_RUN) {
+ dev_dbg(inst->dev->dev, "%s: no bitstream for feeding, so skip ", __func__);
+ inst->empty_queue = true;
+ goto finish_job_and_return;
+ }
}
switch (inst->state) {
@@ -1608,7 +1635,9 @@ static void wave5_vpu_dec_device_run(void *priv)
}
spin_unlock_irqrestore(&inst->state_spinlock, flags);
} else {
+ spin_lock_irqsave(&inst->state_spinlock, flags);
switch_state(inst, VPU_INST_STATE_INIT_SEQ);
+ spin_unlock_irqrestore(&inst->state_spinlock, flags);
}
break;
@@ -1619,8 +1648,9 @@ static void wave5_vpu_dec_device_run(void *priv)
* we had a chance to switch, which leads to an invalid state
* change.
*/
+ spin_lock_irqsave(&inst->state_spinlock, flags);
switch_state(inst, VPU_INST_STATE_PIC_RUN);
-
+ spin_unlock_irqrestore(&inst->state_spinlock, flags);
/*
* During DRC, the picture decoding remains pending, so just leave the job
* active until this decode operation completes.
@@ -1634,12 +1664,14 @@ static void wave5_vpu_dec_device_run(void *priv)
ret = wave5_prepare_fb(inst);
if (ret) {
dev_warn(inst->dev->dev, "Framebuffer preparation, fail: %d\n", ret);
+ spin_lock_irqsave(&inst->state_spinlock, flags);
switch_state(inst, VPU_INST_STATE_STOP);
+ spin_unlock_irqrestore(&inst->state_spinlock, flags);
break;
}
if (q_status.instance_queue_count) {
- dev_dbg(inst->dev->dev, "%s: leave with active job", __func__);
+ v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
return;
}
@@ -1650,26 +1682,42 @@ static void wave5_vpu_dec_device_run(void *priv)
dev_err(inst->dev->dev,
"Frame decoding on m2m context (%p), fail: %d (result: %d)\n",
m2m_ctx, ret, fail_res);
- break;
+ goto finish_job_and_return;
+ }
+
+ if (fail_res == WAVE5_SYSERR_QUEUEING_FAIL) {
+ inst->retry = true;
+ } else {
+ inst->retry = false;
+ if (!inst->eos)
+ inst->queuing_num--;
}
- /* Return so that we leave this job active */
- dev_dbg(inst->dev->dev, "%s: leave with active job", __func__);
- return;
- default:
- WARN(1, "Execution of a job in state %s illegal.\n", state_to_str(inst->state));
break;
+ default:
+ dev_dbg(inst->dev->dev, "Execution of a job in state %s illegal.\n",
+ state_to_str(inst->state));
}
finish_job_and_return:
dev_dbg(inst->dev->dev, "%s: leave and finish job", __func__);
pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
- v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
+ /*
+ * After receiving CMD_STOP, there is no input, but we have to run device_run
+ * to send DEC_PIC command until display index == -1, so job_finish was always
+ * called in the device_run to achieve it; that logic was very wasteful
+ * in power and CPU time.
+ * Once EOS has been sent, device_run no longer calls job_finish; it is called
+ * only when the HW is idle, in order to reduce overhead.
+ */
+ if (!inst->sent_eos)
+ v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
}
static void wave5_vpu_dec_job_abort(void *priv)
{
struct vpu_instance *inst = priv;
+ struct v4l2_m2m_ctx *m2m_ctx = inst->v4l2_fh.m2m_ctx;
int ret;
ret = switch_state(inst, VPU_INST_STATE_STOP);
@@ -1680,6 +1728,8 @@ static void wave5_vpu_dec_job_abort(void *priv)
if (ret)
dev_warn(inst->dev->dev,
"Setting EOS for the bitstream, fail: %d\n", ret);
+
+ v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
}
static int wave5_vpu_dec_job_ready(void *priv)
@@ -1715,7 +1765,8 @@ static int wave5_vpu_dec_job_ready(void *priv)
"No capture buffer ready to decode!\n");
break;
} else if (!wave5_is_draining_or_eos(inst) &&
- !v4l2_m2m_num_src_bufs_ready(m2m_ctx)) {
+ (!v4l2_m2m_num_src_bufs_ready(m2m_ctx) ||
+ inst->empty_queue)) {
dev_dbg(inst->dev->dev,
"No bitstream data to decode!\n");
break;
@@ -1755,6 +1806,8 @@ static int wave5_vpu_open_dec(struct file *filp)
inst->ops = &wave5_vpu_dec_inst_ops;
spin_lock_init(&inst->state_spinlock);
+ mutex_init(&inst->feed_lock);
+ INIT_LIST_HEAD(&inst->avail_src_bufs);
inst->codec_info = kzalloc(sizeof(*inst->codec_info), GFP_KERNEL);
if (!inst->codec_info)
diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.c b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.c
index 5b10f9f49b9f..edbe69540ef1 100644
--- a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.c
+++ b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.c
@@ -207,6 +207,7 @@ int wave5_vpu_dec_close(struct vpu_instance *inst, u32 *fail_res)
int retry = 0;
struct vpu_device *vpu_dev = inst->dev;
int i;
+ struct dec_output_info dec_info;
*fail_res = 0;
if (!inst->codec_info)
@@ -227,11 +228,26 @@ int wave5_vpu_dec_close(struct vpu_instance *inst, u32 *fail_res)
goto unlock_and_return;
}
- if (*fail_res == WAVE5_SYSERR_VPU_STILL_RUNNING &&
- retry++ >= MAX_FIRMWARE_CALL_RETRY) {
+ if (ret == 0)
+ break;
+
+ if (*fail_res != WAVE5_SYSERR_VPU_STILL_RUNNING) {
+ dev_warn(inst->dev->dev, "dec_finish_seq timed out\n");
+ goto unlock_and_return;
+ }
+
+ if (retry++ >= MAX_FIRMWARE_CALL_RETRY) {
ret = -ETIMEDOUT;
goto unlock_and_return;
}
+
+ mutex_unlock(&vpu_dev->hw_lock);
+ wave5_vpu_dec_get_output_info(inst, &dec_info);
+ ret = mutex_lock_interruptible(&vpu_dev->hw_lock);
+ if (ret) {
+ pm_runtime_put_sync(inst->dev->dev);
+ return ret;
+ }
} while (ret != 0);
dev_dbg(inst->dev->dev, "%s: dec_finish_seq complete\n", __func__);
@@ -248,6 +264,8 @@ int wave5_vpu_dec_close(struct vpu_instance *inst, u32 *fail_res)
wave5_vdi_free_dma_memory(vpu_dev, &p_dec_info->vb_task);
+ mutex_destroy(&inst->feed_lock);
+
unlock_and_return:
mutex_unlock(&vpu_dev->hw_lock);
pm_runtime_put_sync(inst->dev->dev);
diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
index bc101397204d..adfbc104f939 100644
--- a/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
+++ b/drivers/media/platform/chips-media/wave5/wave5-vpuapi.h
@@ -818,6 +818,11 @@ struct vpu_instance {
bool cbcr_interleave;
bool nv21;
bool eos;
+ bool sent_eos; /* check if EOS is sent to application */
+ bool retry; /* retry to feed bitstream if failure reason is WAVE5_SYSERR_QUEUEING_FAIL*/
+ int queuing_num; /* count of bitstream queued */
+ struct mutex feed_lock; /* lock for feeding bitstream buffers */
+ bool empty_queue;
struct vpu_buf bitstream_vbuf;
dma_addr_t last_rd_ptr;
size_t remaining_consumed_bytes;
diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpuconfig.h b/drivers/media/platform/chips-media/wave5/wave5-vpuconfig.h
index 1ea9f5f31499..4ebd48d5550e 100644
--- a/drivers/media/platform/chips-media/wave5/wave5-vpuconfig.h
+++ b/drivers/media/platform/chips-media/wave5/wave5-vpuconfig.h
@@ -59,6 +59,7 @@
// application specific configuration
#define VPU_ENC_TIMEOUT 60000
#define VPU_DEC_TIMEOUT 60000
+#define VPU_DEC_STOP_TIMEOUT 10
// for WAVE encoder
#define USE_SRC_PRP_AXI 0
--
2.43.0
Powered by blists - more mailing lists