[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4f359eef-5a5c-41ca-a318-8c4343709d54@linaro.org>
Date: Wed, 4 Dec 2024 16:35:44 +0100
From: Neil Armstrong <neil.armstrong@...aro.org>
To: Akhil P Oommen <quic_akhilpo@...cinc.com>, Rob Clark
<robdclark@...il.com>, Sean Paul <sean@...rly.run>,
Konrad Dybcio <konradybcio@...nel.org>,
Abhinav Kumar <quic_abhinavk@...cinc.com>,
Dmitry Baryshkov <dmitry.baryshkov@...aro.org>,
Marijn Suijten <marijn.suijten@...ainline.org>,
David Airlie <airlied@...il.com>, Simona Vetter <simona@...ll.ch>,
Bjorn Andersson <andersson@...nel.org>, Rob Herring <robh@...nel.org>,
Krzysztof Kozlowski <krzk+dt@...nel.org>, Conor Dooley <conor+dt@...nel.org>
Cc: linux-arm-msm@...r.kernel.org, dri-devel@...ts.freedesktop.org,
freedreno@...ts.freedesktop.org, linux-kernel@...r.kernel.org,
devicetree@...r.kernel.org
Subject: Re: [PATCH v3 2/7] drm/msm: adreno: add plumbing to generate
bandwidth vote table for GMU
On 02/12/2024 09:46, Neil Armstrong wrote:
> On 30/11/2024 22:49, Akhil P Oommen wrote:
>> On 11/28/2024 3:55 PM, Neil Armstrong wrote:
>>> The Adreno GPU Management Unit (GMU) can also scale DDR Bandwidth along
>>> the Frequency and Power Domain level, but by default we leave the
>>> OPP core scale the interconnect ddr path.
>>>
>>> While scaling via the interconnect path was sufficient, newer GPUs
>>> like the A750 requires specific vote paremeters and bandwidth to
>>> achieve full functionality.
>>>
>>> In order to calculate vote values used by the GPU Management
>>> Unit (GMU), we need to parse all the possible OPP Bandwidths and
>>> create a vote value to be sent to the appropriate Bus Control
>>> Modules (BCMs) declared in the GPU info struct.
>>>
>>> This vote value is called IB, while on the othe side the GMU also
>>> takes another vote called AB which is a 16bit quantized value
>>> of the bandwidth against the maximum supported bandwidth.
>>>
>>> The vote array will then be used to dynamically generate the GMU
>>> bw_table sent during the GMU power-up.
>>>
>>> Signed-off-by: Neil Armstrong <neil.armstrong@...aro.org>
>>> ---
>>> drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 174 ++++++++++++++++++++++++++++++++++
>>> drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 14 +++
>>> drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 1 +
>>> drivers/gpu/drm/msm/adreno/a6xx_hfi.h | 5 +
>>> 4 files changed, 194 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>>> index 14db7376c712d19446b38152e480bd5a1e0a5198..ee2010a01186721dd377f1655fcf05ddaff77131 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>>> @@ -9,6 +9,7 @@
>>> #include <linux/pm_domain.h>
>>> #include <linux/pm_opp.h>
>>> #include <soc/qcom/cmd-db.h>
>>> +#include <soc/qcom/tcs.h>
>>> #include <drm/drm_gem.h>
>>> #include "a6xx_gpu.h"
>>> @@ -1287,6 +1288,131 @@ static int a6xx_gmu_memory_probe(struct a6xx_gmu *gmu)
>>> return 0;
>>> }
>>> +/**
>>> + * struct bcm_db - Auxiliary data pertaining to each Bus Clock Manager (BCM)
>>> + * @unit: divisor used to convert bytes/sec bw value to an RPMh msg
>>> + * @width: multiplier used to convert bytes/sec bw value to an RPMh msg
>>> + * @vcd: virtual clock domain that this bcm belongs to
>>> + * @reserved: reserved field
>>> + */
>>> +struct bcm_db {
>>> + __le32 unit;
>>> + __le16 width;
>>> + u8 vcd;
>>> + u8 reserved;
>>> +};
>>> +
>>> +static u64 bcm_div(u64 num, u32 base)
>>> +{
>>> + /* Ensure that small votes aren't lost. */
>>> + if (num && num < base)
>>> + return 1;
>>> +
>>> + do_div(num, base);
>>> +
>>> + return num;
>>> +}
>>> +
>>> +static int a6xx_gmu_rpmh_bw_votes_init(const struct a6xx_info *info,
>>> + struct a6xx_gmu *gmu)
>>> +{
>>> + const struct bcm_db *bcm_data[GMU_MAX_BCMS] = { 0 };
>>> + unsigned int bcm_index, bw_index, bcm_count = 0;
>>> +
>>> + if (!info->bcms)
>>> + return 0;
>>> +
>>> + /* Retrieve BCM data from cmd-db */
>>> + for (bcm_index = 0; bcm_index < GMU_MAX_BCMS; bcm_index++) {
>>> + size_t count;
>>> +
>>> + /* Stop at first unconfigured bcm */
>>> + if (!info->bcms[bcm_index].name)
>>> + break;
>>> +
>>> + bcm_data[bcm_index] = cmd_db_read_aux_data(
>>> + info->bcms[bcm_index].name,
>>> + &count);
>>> + if (IS_ERR(bcm_data[bcm_index]))
>>> + return PTR_ERR(bcm_data[bcm_index]);
>>> +
>>> + if (!count)
>>> + return -EINVAL;
>>> +
>>> + ++bcm_count;
>>> + }
>>> +
>>> + /* Generate BCM votes values for each bandwidth & BCM */
>>> + for (bw_index = 0; bw_index < gmu->nr_gpu_bws; bw_index++) {
>>> + u32 *data = gmu->gpu_ib_votes[bw_index];
>>> + u32 bw = gmu->gpu_bw_table[bw_index];
>>> +
>>> + /* Calculations loosely copied from bcm_aggregate() & tcs_cmd_gen() */
>>> + for (bcm_index = 0; bcm_index < bcm_count; bcm_index++) {
>>> + bool commit = false;
>>> + u64 peak, vote;
>>> + u16 width;
>>> + u32 unit;
>>> +
>>> + /* Skip unconfigured BCM */
>>> + if (!bcm_data[bcm_index])
>>> + continue;
>>> +
>>> + if (bcm_index == bcm_count - 1 ||
>>> + (bcm_data[bcm_index + 1] &&
>>> + bcm_data[bcm_index]->vcd != bcm_data[bcm_index + 1]->vcd))
>>> + commit = true;
>>> +
>>> + if (!bw) {
>>> + data[bcm_index] = BCM_TCS_CMD(commit, false, 0, 0);
>>> + continue;
>>> + }
>>> +
>>> + if (info->bcms[bcm_index].fixed) {
>>> + u32 perfmode = 0;
>>> +
>>> + if (bw >= info->bcms[bcm_index].perfmode_bw)
>>> + perfmode = info->bcms[bcm_index].perfmode;
>>> +
>>> + data[bcm_index] = BCM_TCS_CMD(commit, true, 0, perfmode);
>>> + continue;
>>> + }
>>> +
>>> + /* Multiply the bandwidth by the width of the connection */
>>> + width = le16_to_cpu(bcm_data[bcm_index]->width);
>>> + peak = bcm_div((u64)bw * width, info->bcms[bcm_index].buswidth);
>>> +
>>> + /* Input bandwidth value is in KBps, scale the value to BCM unit */
>>> + unit = le32_to_cpu(bcm_data[bcm_index]->unit);
>>> + vote = bcm_div(peak * 1000ULL, unit);
>>> +
>>> + if (vote > BCM_TCS_CMD_VOTE_MASK)
>>> + vote = BCM_TCS_CMD_VOTE_MASK;
>>
>> use clamp()?
>
> Yep, I think I could replace bcm_div with clamp
>
>>
>>> +
>>> + data[bcm_index] = BCM_TCS_CMD(commit, true, vote, vote);
>>> + }
>>> + }
>>> +
>>> + /* Generate AB votes which are a quantitized bandwidth value */
>>> + for (bw_index = 0; bw_index < gmu->nr_gpu_bws; bw_index++) {
>>> + u64 tmp;
>>> +
>>> + /*
>>> + * The AB vote consists of a 16 bit wide quantized level
>>> + * against the maximum supported bandwidth.
>>> + * Quantization can be calculated as below:
>>> + * vote = (bandwidth * 2^16) / max bandwidth
>>> + */
>>> + tmp = gmu->gpu_bw_table[bw_index] * MAX_AB_VOTE;
>>> +
>>> + /* Divide by the maximum bandwidth to get a quantized value */
>>> + gmu->gpu_ab_votes[bw_index] =
>>> + bcm_div(tmp, gmu->gpu_bw_table[gmu->nr_gpu_bws - 1]);
>>> + }
>>
>> So I suppose you are trying to vote AB equal to IB. Aggregation logic
>> for both are different. So this will make DDR scale very aggressively. A
>> more reasonable approach would be to vote a % of IB vote (25%?). Ideally
>> we should measure GPU's bandwidth usage and vote that if you are really
>> concerned about stability issues. IB is just a floor vote, GPU can
>> generate way higher traffic.
>
> I think this should be optimized better in a different patchset, so I would
> like to make the simplest vote possible here to retain functionnality.
>
> So if I understand I should divide this vote value by 4 ? Downstram uses 25% by default
> when no AB was calculated, but what does that mean exactly ?
>
> Is there a counter somewhere to measure the GPU traffic ?
What if we also specified opp-avg-kBps for each OPP, define it to the downstream "qcom,bus-min"
and use this value for the AB vote ? and use 25% as fallback like downstream.
Neil
>
>>
>>> +
>>> + return 0;
>>> +}
>>> +
>>> /* Return the 'arc-level' for the given frequency */
>>> static unsigned int a6xx_gmu_get_arc_level(struct device *dev,
>>> unsigned long freq)
>>> @@ -1390,12 +1516,15 @@ static int a6xx_gmu_rpmh_arc_votes_init(struct device *dev, u32 *votes,
>>> * The GMU votes with the RPMh for itself and on behalf of the GPU but we need
>>> * to construct the list of votes on the CPU and send it over. Query the RPMh
>>> * voltage levels and build the votes
>>> + * The GMU can also vote for DDR interconnects, use the OPP bandwidth entries
>>> + * and BCM parameters to build the votes.
>>> */
>>> static int a6xx_gmu_rpmh_votes_init(struct a6xx_gmu *gmu)
>>> {
>>> struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>>> struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
>>> + const struct a6xx_info *info = adreno_gpu->info->a6xx;
>>> struct msm_gpu *gpu = &adreno_gpu->base;
>>> int ret;
>>> @@ -1407,6 +1536,10 @@ static int a6xx_gmu_rpmh_votes_init(struct a6xx_gmu *gmu)
>>> ret |= a6xx_gmu_rpmh_arc_votes_init(gmu->dev, gmu->cx_arc_votes,
>>> gmu->gmu_freqs, gmu->nr_gmu_freqs, "cx.lvl");
>>> + /* Build the interconnect votes */
>>> + if (info->bcms && gmu->nr_gpu_bws > 1)
>>> + ret |= a6xx_gmu_rpmh_bw_votes_init(info, gmu);
>>> +
>>> return ret;
>>> }
>>> @@ -1442,10 +1575,43 @@ static int a6xx_gmu_build_freq_table(struct device *dev, unsigned long *freqs,
>>> return index;
>>> }
>>> +static int a6xx_gmu_build_bw_table(struct device *dev, unsigned long *bandwidths,
>>> + u32 size)
>>> +{
>>> + int count = dev_pm_opp_get_opp_count(dev);
>>
>> I am less concerned about this now since you are not voting real AB BW.
>
> Sorry I don't understand
>
> Thanks,
> Neil
>
>>
>> -Akhil.
>>
>>> + struct dev_pm_opp *opp;
>>> + int i, index = 0;
>>> + unsigned int bandwidth = 1;
>>> +
>>> + /*
>>> + * The OPP table doesn't contain the "off" bandwidth level so we need to
>>> + * add 1 to the table size to account for it
>>> + */
>>> +
>>> + if (WARN(count + 1 > size,
>>> + "The GMU bandwidth table is being truncated\n"))
>>> + count = size - 1;
>>> +
>>> + /* Set the "off" bandwidth */
>>> + bandwidths[index++] = 0;
>>> +
>>> + for (i = 0; i < count; i++) {
>>> + opp = dev_pm_opp_find_bw_ceil(dev, &bandwidth, 0);
>>> + if (IS_ERR(opp))
>>> + break;
>>> +
>>> + dev_pm_opp_put(opp);
>>> + bandwidths[index++] = bandwidth++;
>>> + }
>>> +
>>> + return index;
>>> +}
>>> +
>>> static int a6xx_gmu_pwrlevels_probe(struct a6xx_gmu *gmu)
>>> {
>>> struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
>>> struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
>>> + const struct a6xx_info *info = adreno_gpu->info->a6xx;
>>> struct msm_gpu *gpu = &adreno_gpu->base;
>>> int ret = 0;
>>> @@ -1472,6 +1638,14 @@ static int a6xx_gmu_pwrlevels_probe(struct a6xx_gmu *gmu)
>>> gmu->current_perf_index = gmu->nr_gpu_freqs - 1;
>>> + /*
>>> + * The GMU also handles GPU Interconnect Votes so build a list
>>> + * of DDR bandwidths from the GPU OPP table
>>> + */
>>> + if (info->bcms)
>>> + gmu->nr_gpu_bws = a6xx_gmu_build_bw_table(&gpu->pdev->dev,
>>> + gmu->gpu_bw_table, ARRAY_SIZE(gmu->gpu_bw_table));
>>> +
>>> /* Build the list of RPMh votes that we'll send to the GMU */
>>> return a6xx_gmu_rpmh_votes_init(gmu);
>>> }
>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>> index 88f18ea6a38a08b5b171709e5020010947a5d347..bdfc106cb3a578c90d7cd84f7d4fe228d761a994 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
>>> @@ -21,6 +21,15 @@ struct a6xx_gmu_bo {
>>> #define GMU_MAX_GX_FREQS 16
>>> #define GMU_MAX_CX_FREQS 4
>>> +#define GMU_MAX_BCMS 3
>>> +
>>> +struct a6xx_bcm {
>>> + char *name;
>>> + unsigned int buswidth;
>>> + bool fixed;
>>> + unsigned int perfmode;
>>> + unsigned int perfmode_bw;
>>> +};
>>> /*
>>> * These define the different GMU wake up options - these define how both the
>>> @@ -85,6 +94,11 @@ struct a6xx_gmu {
>>> unsigned long gpu_freqs[GMU_MAX_GX_FREQS];
>>> u32 gx_arc_votes[GMU_MAX_GX_FREQS];
>>> + int nr_gpu_bws;
>>> + unsigned long gpu_bw_table[GMU_MAX_GX_FREQS];
>>> + u32 gpu_ib_votes[GMU_MAX_GX_FREQS][GMU_MAX_BCMS];
>>> + u16 gpu_ab_votes[GMU_MAX_GX_FREQS];
>>> +
>>> int nr_gmu_freqs;
>>> unsigned long gmu_freqs[GMU_MAX_CX_FREQS];
>>> u32 cx_arc_votes[GMU_MAX_CX_FREQS];
>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>> index 4aceffb6aae89c781facc2a6e4a82b20b341b6cb..9201a53dd341bf432923ffb44947e015208a3d02 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>> @@ -44,6 +44,7 @@ struct a6xx_info {
>>> u32 gmu_chipid;
>>> u32 gmu_cgc_mode;
>>> u32 prim_fifo_threshold;
>>> + const struct a6xx_bcm *bcms;
>>> };
>>> struct a6xx_gpu {
>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.h b/drivers/gpu/drm/msm/adreno/a6xx_hfi.h
>>> index 528110169398f69f16443a29a1594d19c36fb595..52ba4a07d7b9a709289acd244a751ace9bdaab5d 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.h
>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.h
>>> @@ -173,6 +173,11 @@ struct a6xx_hfi_gx_bw_perf_vote_cmd {
>>> u32 bw;
>>> };
>>> +#define AB_VOTE_MASK GENMASK(31, 16)
>>> +#define MAX_AB_VOTE (FIELD_MAX(AB_VOTE_MASK) - 1)
>>> +#define AB_VOTE(vote) FIELD_PREP(AB_VOTE_MASK, (vote))
>>> +#define AB_VOTE_ENABLE BIT(8)
>>> +
>>> #define HFI_H2F_MSG_PREPARE_SLUMBER 33
>>> struct a6xx_hfi_prep_slumber_cmd {
>>>
>>
>
Powered by blists - more mailing lists