Message-ID: <ae4c7968-f593-6c8d-2c10-f7a2b31318e2@oss.qualcomm.com>
Date: Wed, 22 Oct 2025 11:14:22 +0530
From: Vikash Garodia <vikash.garodia@....qualcomm.com>
To: Bryan O'Donoghue <bod@...nel.org>,
Dikshita Agarwal <dikshita.agarwal@....qualcomm.com>,
Abhinav Kumar <abhinav.kumar@...ux.dev>,
Mauro Carvalho Chehab <mchehab@...nel.org>,
Rob Herring <robh@...nel.org>,
Krzysztof Kozlowski <krzk+dt@...nel.org>,
Conor Dooley <conor+dt@...nel.org>,
Philipp Zabel <p.zabel@...gutronix.de>,
Dmitry Baryshkov <dmitry.baryshkov@....qualcomm.com>,
Konrad Dybcio <konrad.dybcio@....qualcomm.com>
Cc: linux-arm-msm@...r.kernel.org, linux-media@...r.kernel.org,
devicetree@...r.kernel.org, linux-kernel@...r.kernel.org,
Vishnu Reddy <quic_bvisredd@...cinc.com>
Subject: Re: [PATCH v2 4/8] media: iris: Introduce buffer size calculations
for vpu4
On 10/22/2025 4:55 AM, Bryan O'Donoghue wrote:
> On 17/10/2025 15:16, Vikash Garodia wrote:
>> Introduce vpu4 buffer size calculation for both encoder and decoder.
>> Reuse the buffer size calculations which are common, while adding the
>> vpu4-specific ones separately.
>>
>> Co-developed-by: Vishnu Reddy <quic_bvisredd@...cinc.com>
>> Signed-off-by: Vishnu Reddy <quic_bvisredd@...cinc.com>
>> Signed-off-by: Vikash Garodia <vikash.garodia@....qualcomm.com>
>> ---
>> drivers/media/platform/qcom/iris/iris_vpu_buffer.c | 345 +++++++++++++++++++++
>> drivers/media/platform/qcom/iris/iris_vpu_buffer.h | 15 +
>> 2 files changed, 360 insertions(+)
>>
>> diff --git a/drivers/media/platform/qcom/iris/iris_vpu_buffer.c b/drivers/media/platform/qcom/iris/iris_vpu_buffer.c
>> index 4463be05ce165adef6b152eb0c155d2e6a7b3c36..8cc52d7aba3ffb968191519c1a1a10e326403205 100644
>> --- a/drivers/media/platform/qcom/iris/iris_vpu_buffer.c
>> +++ b/drivers/media/platform/qcom/iris/iris_vpu_buffer.c
>> @@ -1408,6 +1408,307 @@ static u32 iris_vpu_enc_vpss_size(struct iris_inst *inst)
>> return hfi_buffer_vpss_enc(width, height, ds_enable, 0, 0);
>> }
>>
>> +static inline u32 size_dpb_opb(u32 height, u32 lcu_size)
>> +{
>> + u32 max_tile_height = ((height + lcu_size - 1) / lcu_size) * lcu_size + 8;
>> + u32 dpb_opb = 3 * ((max_tile_height >> 3) * DMA_ALIGNMENT);
>> + u32 num_luma_chrome_plane = 2;
>> +
>> + return dpb_opb = num_luma_chrome_plane * ALIGN(dpb_opb, DMA_ALIGNMENT);
>
> return thing = someother-thing.
>
> You must mean
>
> return (num_luma_chrome_plane * ALIGN(dpb_opb, DMA_ALIGNMENT));
Ack
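Will fix this in the next revision, roughly along these lines (untested
sketch, same computation as before, only the return statement changed):

static inline u32 size_dpb_opb(u32 height, u32 lcu_size)
{
	u32 max_tile_height = ((height + lcu_size - 1) / lcu_size) * lcu_size + 8;
	u32 dpb_opb = 3 * ((max_tile_height >> 3) * DMA_ALIGNMENT);
	u32 num_luma_chrome_plane = 2;

	/* no assignment inside the return statement */
	return num_luma_chrome_plane * ALIGN(dpb_opb, DMA_ALIGNMENT);
}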
>
>> +}
>> +
>> +static u32 hfi_vpu4x_vp9d_lb_size(u32 frame_width, u32 frame_height, u32 num_vpp_pipes)
>> +{
>> + u32 vp9_top_lb, vp9_fe_left_lb, vp9_se_left_lb, dpb_opb, vp9d_qp, num_lcu_per_pipe;
>> + u32 lcu_size = 64, fe_top_ctrl_line_numbers = 3, fe_top_data_luma_line_numbers = 2,
>> + fe_top_data_chroma_line_numbers = 3, fe_lft_ctrl_line_numbers = 4,
>> + fe_lft_db_data_line_numbers = 2, fe_lft_lr_data_line_numbers = 4;
>
> You can reduce this very long variable list to macro constants.
>
> For example fe_lft_db_data_line_numbers doesn't vary so it shouldn't be a variable.
>
Ack
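Will convert the fixed counts to defines in the next revision, something
like this (macro names below are placeholders, sketch only):

#define VP9D_FE_TOP_CTRL_LINE_NUM		3
#define VP9D_FE_TOP_DATA_LUMA_LINE_NUM		2
#define VP9D_FE_TOP_DATA_CHROMA_LINE_NUM	3
#define VP9D_FE_LFT_CTRL_LINE_NUM		4
#define VP9D_FE_LFT_DB_DATA_LINE_NUM		2
#define VP9D_FE_LFT_LR_DATA_LINE_NUM		4

so the local declarations shrink to lcu_size plus the computed sizes, and
the expressions use the defines directly, e.g.

	vp9_top_lb += ALIGN((DMA_ALIGNMENT * DIV_ROUND_UP(frame_width, lcu_size)),
			    DMA_ALIGNMENT) * VP9D_FE_TOP_CTRL_LINE_NUM;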
>> +
>> + vp9_top_lb = ALIGN(size_vp9d_lb_vsp_top(frame_width, frame_height), DMA_ALIGNMENT);
>> + vp9_top_lb += ALIGN(size_vpxd_lb_se_top_ctrl(frame_width, frame_height), DMA_ALIGNMENT);
>> + vp9_top_lb += max3(DIV_ROUND_UP(frame_width, BUFFER_ALIGNMENT_16_BYTES) *
>> + MAX_PE_NBR_DATA_LCU16_LINE_BUFFER_SIZE,
>> + DIV_ROUND_UP(frame_width, BUFFER_ALIGNMENT_32_BYTES) *
>> + MAX_PE_NBR_DATA_LCU32_LINE_BUFFER_SIZE,
>> + DIV_ROUND_UP(frame_width, BUFFER_ALIGNMENT_64_BYTES) *
>> + MAX_PE_NBR_DATA_LCU64_LINE_BUFFER_SIZE);
>> + vp9_top_lb = ALIGN(vp9_top_lb, DMA_ALIGNMENT);
>> + vp9_top_lb += ALIGN((DMA_ALIGNMENT * DIV_ROUND_UP(frame_width, lcu_size)),
>> + DMA_ALIGNMENT) * fe_top_ctrl_line_numbers;
>> + vp9_top_lb += ALIGN(DMA_ALIGNMENT * 8 * DIV_ROUND_UP(frame_width, lcu_size),
>> + DMA_ALIGNMENT) * (fe_top_data_luma_line_numbers +
>> + fe_top_data_chroma_line_numbers);
>> +
>> + num_lcu_per_pipe = (DIV_ROUND_UP(frame_height, lcu_size) / num_vpp_pipes) +
>> + (DIV_ROUND_UP(frame_height, lcu_size) % num_vpp_pipes);
>> + vp9_fe_left_lb = ALIGN((DMA_ALIGNMENT * num_lcu_per_pipe), DMA_ALIGNMENT) *
>> + fe_lft_ctrl_line_numbers;
>> + vp9_fe_left_lb += ((ALIGN((DMA_ALIGNMENT * 8 * num_lcu_per_pipe), DMA_ALIGNMENT) *
>> + fe_lft_db_data_line_numbers) +
>> + ALIGN((DMA_ALIGNMENT * 3 * num_lcu_per_pipe), DMA_ALIGNMENT) +
>> + ALIGN((DMA_ALIGNMENT * 4 * num_lcu_per_pipe), DMA_ALIGNMENT) +
>> + (ALIGN((DMA_ALIGNMENT * 24 * num_lcu_per_pipe), DMA_ALIGNMENT) *
>> + fe_lft_lr_data_line_numbers));
>> + vp9_fe_left_lb = vp9_fe_left_lb * num_vpp_pipes;
>> +
>> + vp9_se_left_lb = ALIGN(size_vpxd_lb_se_left_ctrl(frame_width, frame_height),
>> + DMA_ALIGNMENT);
>> + dpb_opb = size_dpb_opb(frame_height, lcu_size);
>> + vp9d_qp = ALIGN(size_vp9d_qp(frame_width, frame_height), DMA_ALIGNMENT);
>> +
>> + return vp9_top_lb + vp9_fe_left_lb + (vp9_se_left_lb * num_vpp_pipes) +
>> + (dpb_opb * num_vpp_pipes) + vp9d_qp;
>> +}
>> +
>> +static u32 hfi_vpu4x_buffer_line_vp9d(u32 frame_width, u32 frame_height, u32 _yuv_bufcount_min,
>> + bool is_opb, u32 num_vpp_pipes)
>> +{
>> + u32 lb_size = hfi_vpu4x_vp9d_lb_size(frame_width, frame_height, num_vpp_pipes);
>> + u32 dpb_obp_size = 0, lcu_size = 64;
>> +
>> + if (is_opb)
>> + dpb_obp_size = size_dpb_opb(frame_height, lcu_size) * num_vpp_pipes;
>> +
>> + return lb_size + dpb_obp_size;
>> +}
>> +
>> +static u32 iris_vpu4x_dec_line_size(struct iris_inst *inst)
>> +{
>> + u32 num_vpp_pipes = inst->core->iris_platform_data->num_vpp_pipe;
>> + u32 out_min_count = inst->buffers[BUF_OUTPUT].min_count;
>> + struct v4l2_format *f = inst->fmt_src;
>> + u32 height = f->fmt.pix_mp.height;
>> + u32 width = f->fmt.pix_mp.width;
>> + bool is_opb = false;
>> +
>> + if (iris_split_mode_enabled(inst))
>> + is_opb = true;
>> +
>> + if (inst->codec == V4L2_PIX_FMT_H264)
>> + return hfi_buffer_line_h264d(width, height, is_opb, num_vpp_pipes);
>> + else if (inst->codec == V4L2_PIX_FMT_HEVC)
>> + return hfi_buffer_line_h265d(width, height, is_opb, num_vpp_pipes);
>> + else if (inst->codec == V4L2_PIX_FMT_VP9)
>> + return hfi_vpu4x_buffer_line_vp9d(width, height, out_min_count, is_opb,
>> + num_vpp_pipes);
>> +
>> + return 0;
>> +}
>> +
>> +static u32 hfi_vpu4x_buffer_persist_h265d(u32 rpu_enabled)
>> +{
>> + return ALIGN((SIZE_SLIST_BUF_H265 * NUM_SLIST_BUF_H265 + H265_NUM_FRM_INFO *
>> + H265_DISPLAY_BUF_SIZE + (H265_NUM_TILE * sizeof(u32)) + (NUM_HW_PIC_BUF *
>> + (SIZE_SEI_USERDATA + SIZE_H265D_ARP + SIZE_THREE_DIMENSION_USERDATA)) +
>> + rpu_enabled * NUM_HW_PIC_BUF * SIZE_DOLBY_RPU_METADATA), DMA_ALIGNMENT);
>> +}
>> +
>> +static u32 hfi_vpu4x_buffer_persist_vp9d(void)
>> +{
>> + return ALIGN(VP9_NUM_PROBABILITY_TABLE_BUF * VP9_PROB_TABLE_SIZE, DMA_ALIGNMENT) +
>> + (ALIGN(hfi_iris3_vp9d_comv_size(), DMA_ALIGNMENT) * 2) +
>> + ALIGN(MAX_SUPERFRAME_HEADER_LEN, DMA_ALIGNMENT) +
>> + ALIGN(VP9_UDC_HEADER_BUF_SIZE, DMA_ALIGNMENT) +
>> + ALIGN(VP9_NUM_FRAME_INFO_BUF * CCE_TILE_OFFSET_SIZE, DMA_ALIGNMENT) +
>> + ALIGN(VP9_NUM_FRAME_INFO_BUF * VP9_FRAME_INFO_BUF_SIZE_VPU4X, DMA_ALIGNMENT) +
>> + HDR10_HIST_EXTRADATA_SIZE;
>> +}
>> +
>> +static u32 iris_vpu4x_dec_persist_size(struct iris_inst *inst)
>> +{
>> + if (inst->codec == V4L2_PIX_FMT_H264)
>> + return hfi_buffer_persist_h264d();
>> + else if (inst->codec == V4L2_PIX_FMT_HEVC)
>> + return hfi_vpu4x_buffer_persist_h265d(0);
>> + else if (inst->codec == V4L2_PIX_FMT_VP9)
>> + return hfi_vpu4x_buffer_persist_vp9d();
>> +
>> + return 0;
>> +}
>> +
>> +static u32 size_se_lb(u32 standard, u32 num_vpp_pipes_enc,
>> + u32 frame_width_coded, u32 frame_height_coded)
>> +{
>> + u32 se_tlb_size = ALIGN(frame_width_coded, DMA_ALIGNMENT);
>> + u32 se_llb_size = (standard == HFI_CODEC_ENCODE_HEVC) ?
>> + ((frame_height_coded + BUFFER_ALIGNMENT_32_BYTES - 1) /
>> + BUFFER_ALIGNMENT_32_BYTES) * LOG2_16 * LLB_UNIT_SIZE :
>> + ((frame_height_coded + BUFFER_ALIGNMENT_16_BYTES - 1) /
>> + BUFFER_ALIGNMENT_16_BYTES) * LOG2_32 * LLB_UNIT_SIZE;
>> +
>> + se_llb_size = ALIGN(se_llb_size, BUFFER_ALIGNMENT_32_BYTES);
>> +
>> + if (num_vpp_pipes_enc > 1)
>> + se_llb_size = ALIGN(se_llb_size + BUFFER_ALIGNMENT_512_BYTES,
>> + DMA_ALIGNMENT) * num_vpp_pipes_enc;
>> +
>> + return ALIGN(se_tlb_size + se_llb_size, DMA_ALIGNMENT);
>> +}
>> +
>> +static u32 size_te_lb(bool is_ten_bit, u32 num_vpp_pipes_enc, u32 width_in_lcus,
>> + u32 frame_height_coded, u32 frame_width_coded)
>> +{
>> + u32 num_pixel_10_bit = 3, num_pixel_8_bit = 2, num_pixel_te_llb = 3;
>> + u32 te_llb_col_rc_size = ALIGN(32 * width_in_lcus / num_vpp_pipes_enc,
>> + DMA_ALIGNMENT) * num_vpp_pipes_enc;
>> + u32 te_tlb_recon_data_size = ALIGN((is_ten_bit ? num_pixel_10_bit : num_pixel_8_bit) *
>> + frame_width_coded, DMA_ALIGNMENT);
>> + u32 te_llb_recon_data_size = ((1 + is_ten_bit) * num_pixel_te_llb * frame_height_coded +
>> + num_vpp_pipes_enc - 1) / num_vpp_pipes_enc;
>> + te_llb_recon_data_size = ALIGN(te_llb_recon_data_size, DMA_ALIGNMENT) * num_vpp_pipes_enc;
>> +
>> + return ALIGN(te_llb_recon_data_size + te_llb_col_rc_size + te_tlb_recon_data_size,
>> + DMA_ALIGNMENT);
>> +}
>> +
>> +static inline u32 calc_fe_tlb_size(u32 size_per_lcu, bool is_ten_bit)
>> +{
>> + u32 num_pixels_fe_tlb_10_bit = 128, num_pixels_fe_tlb_8_bit = 64;
>> +
>> + return is_ten_bit ? (num_pixels_fe_tlb_10_bit * (size_per_lcu + 1)) :
>> + (size_per_lcu * num_pixels_fe_tlb_8_bit);
>> +}
>> +
>> +static u32 size_fe_lb(bool is_ten_bit, u32 standard, u32 num_vpp_pipes_enc,
>> + u32 frame_height_coded, u32 frame_width_coded)
>> +{
>> + u32 log2_lcu_size, num_cu_in_height_pipe, num_cu_in_width,
>> + fb_llb_db_ctrl_size, fb_llb_db_luma_size, fb_llb_db_chroma_size,
>> + fb_tlb_db_ctrl_size, fb_tlb_db_luma_size, fb_tlb_db_chroma_size,
>> + fb_llb_sao_ctrl_size, fb_llb_sao_luma_size, fb_llb_sao_chroma_size,
>> + fb_tlb_sao_ctrl_size, fb_tlb_sao_luma_size, fb_tlb_sao_chroma_size,
>> + fb_lb_top_sdc_size, fb_lb_se_ctrl_size, fe_tlb_size, size_per_lcu;
>> + u32 fe_sdc_data_per_block = 16, se_ctrl_data_per_block = 2020;
>
> Again you can reduce this - at least a little bit
>
> fe_sdc_data_per_block
> se_ctrl_data_per_block
>
> are const
Ack
>
>> +
>> + log2_lcu_size = (standard == HFI_CODEC_ENCODE_HEVC) ? 5 : 4;
>> + num_cu_in_height_pipe = ((frame_height_coded >> log2_lcu_size) + num_vpp_pipes_enc - 1) /
>> + num_vpp_pipes_enc;
>> + num_cu_in_width = frame_width_coded >> log2_lcu_size;
>> +
>> + size_per_lcu = 2;
>> + fe_tlb_size = calc_fe_tlb_size(size_per_lcu, 1);
>> + fb_llb_db_ctrl_size = ALIGN(fe_tlb_size, DMA_ALIGNMENT) * num_cu_in_height_pipe;
>> + fb_llb_db_ctrl_size = ALIGN(fb_llb_db_ctrl_size, DMA_ALIGNMENT) * num_vpp_pipes_enc;
>> +
>> + size_per_lcu = (1 << (log2_lcu_size - 3));
>> + fe_tlb_size = calc_fe_tlb_size(size_per_lcu, is_ten_bit);
>> + fb_llb_db_luma_size = ALIGN(fe_tlb_size, DMA_ALIGNMENT) * num_cu_in_height_pipe;
>> + fb_llb_db_luma_size = ALIGN(fb_llb_db_luma_size, DMA_ALIGNMENT) * num_vpp_pipes_enc;
>> +
>> + size_per_lcu = ((1 << (log2_lcu_size - 4)) * 2);
>> + fe_tlb_size = calc_fe_tlb_size(size_per_lcu, is_ten_bit);
>> + fb_llb_db_chroma_size = ALIGN(fe_tlb_size, DMA_ALIGNMENT) * num_cu_in_height_pipe;
>> + fb_llb_db_chroma_size = ALIGN(fb_llb_db_chroma_size, DMA_ALIGNMENT) * num_vpp_pipes_enc;
>> +
>> + size_per_lcu = 1;
>> + fe_tlb_size = calc_fe_tlb_size(size_per_lcu, 1);
>> + fb_tlb_db_ctrl_size = ALIGN(fe_tlb_size, DMA_ALIGNMENT) * num_cu_in_width;
>> + fb_llb_sao_ctrl_size = ALIGN(fe_tlb_size, DMA_ALIGNMENT) * num_cu_in_height_pipe;
>> + fb_llb_sao_ctrl_size = fb_llb_sao_ctrl_size * num_vpp_pipes_enc;
>> + fb_tlb_sao_ctrl_size = ALIGN(fe_tlb_size, DMA_ALIGNMENT) * num_cu_in_width;
>> +
>> + size_per_lcu = ((1 << (log2_lcu_size - 3)) + 1);
>> + fe_tlb_size = calc_fe_tlb_size(size_per_lcu, is_ten_bit);
>> + fb_tlb_db_luma_size = ALIGN(fe_tlb_size, DMA_ALIGNMENT) * num_cu_in_width;
>> +
>> + size_per_lcu = (2 * ((1 << (log2_lcu_size - 4)) + 1));
>> + fe_tlb_size = calc_fe_tlb_size(size_per_lcu, is_ten_bit);
>> + fb_tlb_db_chroma_size = ALIGN(fe_tlb_size, DMA_ALIGNMENT) * num_cu_in_width;
>> +
>> + fb_llb_sao_luma_size = BUFFER_ALIGNMENT_256_BYTES * num_vpp_pipes_enc;
>> + fb_llb_sao_chroma_size = BUFFER_ALIGNMENT_256_BYTES * num_vpp_pipes_enc;
>> + fb_tlb_sao_luma_size = BUFFER_ALIGNMENT_256_BYTES;
>> + fb_tlb_sao_chroma_size = BUFFER_ALIGNMENT_256_BYTES;
>> + fb_lb_top_sdc_size = ALIGN((fe_sdc_data_per_block * (frame_width_coded >> 5)),
>> + DMA_ALIGNMENT);
>> + fb_lb_se_ctrl_size = ALIGN((se_ctrl_data_per_block * (frame_width_coded >> 5)),
>> + DMA_ALIGNMENT);
>
> On the one hand lots of variables.
>
> On the other hand I think the code is more readable with assigned names instead
> of a big morass of return ALIGN(stuff) + ALIGN(other stuff).
Good to know it's better now in terms of readability.
>
> Anyway I think you can reduce this enormous variable list by at least two.
>
> u32 fe_sdc_data_per_block = 16, se_ctrl_data_per_block = 2020;
> ->
> #define FE_SDC_DATA_PER_BLOCK 16
> #define SE_CTRL_DATA_PER_BLOCK 2020
>
Ack
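Will pick these up as is. In size_fe_lb() the corresponding computations
would then read roughly (sketch only):

	fb_lb_top_sdc_size = ALIGN(FE_SDC_DATA_PER_BLOCK * (frame_width_coded >> 5),
				   DMA_ALIGNMENT);
	fb_lb_se_ctrl_size = ALIGN(SE_CTRL_DATA_PER_BLOCK * (frame_width_coded >> 5),
				   DMA_ALIGNMENT);

and the two locals drop out of the declaration block.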
Regards,
Vikash