lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <c3de920a-9630-c26a-e418-f42d3572509b@amd.com>
Date:   Mon, 21 Aug 2023 19:42:20 +0530
From:   "Yadav, Arvind" <arvyadav@....com>
To:     Shashank Sharma <shashank.sharma@....com>,
        Arvind Yadav <Arvind.Yadav@....com>, Christian.Koenig@....com,
        alexander.deucher@....com, Xinhui.Pan@....com, airlied@...il.com,
        daniel@...ll.ch, Felix.Kuehling@....com,
        amd-gfx@...ts.freedesktop.org
Cc:     dri-devel@...ts.freedesktop.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 1/7] drm/amdgpu: Added init/fini functions for workload


On 8/21/2023 7:24 PM, Shashank Sharma wrote:
>
> On 21/08/2023 15:35, Yadav, Arvind wrote:
>>
>> On 8/21/2023 6:36 PM, Shashank Sharma wrote:
>>> Hey Arvind,
>>>
>>> On 21/08/2023 08:47, Arvind Yadav wrote:
>>>> The'struct amdgpu_smu_workload' initialization/cleanup
>>>> functions is added by this patch.
>>>>
>>>> v2:
>>>> - Splitting big patch into separate patches.
>>>> - Added new fini function.
>>>>
>>>> Cc: Shashank Sharma <shashank.sharma@....com>
>>>> Cc: Christian Koenig <christian.koenig@....com>
>>>> Cc: Alex Deucher <alexander.deucher@....com>
>>>> Signed-off-by: Arvind Yadav <Arvind.Yadav@....com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/Makefile           |  2 +-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  3 ++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  4 ++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_workload.c  | 44 +++++++++++++++
>>>>   drivers/gpu/drm/amd/include/amdgpu_workload.h | 53 
>>>> +++++++++++++++++++
>>>>   5 files changed, 105 insertions(+), 1 deletion(-)
>>>>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_workload.c
>>>>   create mode 100644 drivers/gpu/drm/amd/include/amdgpu_workload.h
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
>>>> b/drivers/gpu/drm/amd/amdgpu/Makefile
>>>> index 415a7fa395c4..6a9e187d61e1 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
>>>> @@ -60,7 +60,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
>>>>       amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
>>>>       amdgpu_fw_attestation.o amdgpu_securedisplay.o \
>>>>       amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
>>>> -    amdgpu_ring_mux.o
>>>> +    amdgpu_ring_mux.o amdgpu_workload.o
>>>>     amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 02b827785e39..1939fa1af8a6 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -107,6 +107,7 @@
>>>>   #include "amdgpu_fdinfo.h"
>>>>   #include "amdgpu_mca.h"
>>>>   #include "amdgpu_ras.h"
>>>> +#include "amdgpu_workload.h"
>>>>     #define MAX_GPU_INSTANCE        16
>>>>   @@ -1050,6 +1051,8 @@ struct amdgpu_device {
>>>>         bool                            job_hang;
>>>>       bool                            dc_enabled;
>>>> +
>>>> +    struct amdgpu_smu_workload    smu_workload;
>>>>   };
>>>>     static inline struct amdgpu_device *drm_to_adev(struct 
>>>> drm_device *ddev)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 5c7d40873ee2..cd3bf641b630 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -2243,6 +2243,8 @@ static int amdgpu_device_ip_early_init(struct 
>>>> amdgpu_device *adev)
>>>>       adev->cg_flags &= amdgpu_cg_mask;
>>>>       adev->pg_flags &= amdgpu_pg_mask;
>>>>   +    amdgpu_workload_profile_init(adev);
>>>> +
>>>>       return 0;
>>>>   }
>>>>   @@ -2890,6 +2892,8 @@ static int amdgpu_device_ip_fini(struct 
>>>> amdgpu_device *adev)
>>>>   {
>>>>       int i, r;
>>>>   +    amdgpu_workload_profile_fini(adev);
>>>> +
>>>>       if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
>>>>           amdgpu_virt_release_ras_err_handler_data(adev);
>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_workload.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_workload.c
>>>> new file mode 100644
>>>> index 000000000000..32166f482f77
>>>> --- /dev/null
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_workload.c
>>>> @@ -0,0 +1,44 @@
>>>> +// SPDX-License-Identifier: MIT
>>>> +/*
>>>> + * Copyright 2023 Advanced Micro Devices, Inc.
>>>> + *
>>>> + * Permission is hereby granted, free of charge, to any person 
>>>> obtaining a
>>>> + * copy of this software and associated documentation files (the 
>>>> "Software"),
>>>> + * to deal in the Software without restriction, including without 
>>>> limitation
>>>> + * the rights to use, copy, modify, merge, publish, distribute, 
>>>> sublicense,
>>>> + * and/or sell copies of the Software, and to permit persons to 
>>>> whom the
>>>> + * Software is furnished to do so, subject to the following 
>>>> conditions:
>>>> + *
>>>> + * The above copyright notice and this permission notice shall be 
>>>> included in
>>>> + * all copies or substantial portions of the Software.
>>>> + *
>>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>>>> EXPRESS OR
>>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>>>> MERCHANTABILITY,
>>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO 
>>>> EVENT SHALL
>>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
>>>> DAMAGES OR
>>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
>>>> OTHERWISE,
>>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
>>>> USE OR
>>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>>> + *
>>>> + */
>>>> +
>>>> +#include "amdgpu.h"
>>>> +
>>>> +void amdgpu_workload_profile_init(struct amdgpu_device *adev)
>>>> +{
>>>> +    adev->smu_workload.adev = adev;
>>>> +    adev->smu_workload.submit_workload_status = 0;
>>>> +    adev->smu_workload.initialized = true;
>>> why do we need this variable ?
>>
>> Hi Shashank,
>>
>> If any error comes while the device is booting then amdgpu will start 
>> unloading everything.
>> So I am using 'initialized' for unloading a driver successfully. This 
>> variable is to identify that the driver is loaded or not.
>
> I am not sure if I am getting this right. This variable is only 
> getting used in this patch here, just being set and reset.
>
> How does this flag help us ? I guess if AMDGPU driver is getting 
> unloaded we already know that we can't set power profile.
>
We are setting "initialized = true" and checking in 
amdgpu_workload_profile_fini() that 'initialized' is set or not set because
amdgpu_workload_profile_fini() is destroying mutex and same for delayed 
work which I have implemented in patch 0003.

In the below error case where amdgpu_workload_profile_init() was not 
called because psp firmware failed to load but amdgpu driver is
calling all unload function as well amdgpu_workload_profile_fini().

ThankYou
~Arvind

> - Shashank
>
>>
>> This is the below error for which the amdgpu driver is unloading when 
>> it is not getting firmware.
>>
>> [   12.421609] amdgpu 0000:08:00.0: Direct firmware load for 
>> amdgpu/renoir_ta.bin failed with error -2
>> [   12.421618] [drm:amdgpu_device_init [amdgpu]] *ERROR* early_init 
>> of IP block <psp> failed -19
>> [   12.428207] [drm] VCN decode is enabled in VM mode
>> [   12.428212] [drm] VCN encode is enabled in VM mode
>> [   12.430925] [drm] JPEG decode is enabled in VM mode
>> [   12.430931] amdgpu 0000:08:00.0: amdgpu: Fatal error during GPU init
>> [   12.431184] amdgpu 0000:08:00.0: amdgpu: amdgpu: finishing device.
>> [   12.431296] ------------[ cut here ]------------
>> [   12.431297] WARNING: CPU: 3 PID: 438 at kernel/workqueue.c:3379 
>> __flush_work+0x22f/0x240
>> [   12.431305] Modules linked in: ledtrig_audio snd_hda_codec_hdmi 
>> snd_hda_intel snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec 
>> snd_hda_core amdgpu(OE+) snd_hwdep snd_pcm kvm snd_seq_midi 
>> snd_seq_midi_event drm_exec amdxcp snd_rawmidi iommu_v2 
>> crct10dif_pclmul drm_buddy gpu_sched ghash_clmulni_intel sha512_ssse3 
>> snd_seq drm_suballoc_helper aesni_intel drm_ttm_helper binfmt_misc 
>> crypto_simd snd_seq_device ttm cryptd snd_timer drm_display_helper 
>> input_leds rapl joydev cec wmi_bmof rc_core snd drm_kms_helper 
>> k10temp ccp soundcore mac_hid sch_fq_codel msr parport_pc ppdev lp 
>> parport ramoops reed_solomon drm pstore_blk pstore_zone efi_pstore 
>> ip_tables x_tables autofs4 hid_generic usbhid hid crc32_pclmul nvme 
>> igb ahci i2c_piix4 xhci_pci i2c_algo_bit nvme_core libahci 
>> xhci_pci_renesas dca video wmi
>> [   12.431360] CPU: 3 PID: 438 Comm: systemd-udevd Tainted: G        
>> W  OE      6.5.0-rc2-custom #1
>> [   12.431362] Hardware name: Gigabyte Technology Co., Ltd. X570 
>> AORUS ELITE/X570 AORUS ELITE, BIOS F34 06/10/2021
>> [   12.431364] RIP: 0010:__flush_work+0x22f/0x240
>> [   12.431367] Code: 8b 43 30 48 8b 53 40 89 c1 e9 f9 fe ff ff 4c 89 
>> f7 e8 45 0b db 00 e8 90 f5 08 00 45 31 ff e9 11 ff ff ff 0f 0b e9 0a 
>> ff ff ff <0f> 0b 45 31 ff e9 00 ff ff ff e8 02 a0 d9 00 66 90 90 90 
>> 90 90 90
>> [   12.431368] RSP: 0018:ffffb0668156f818 EFLAGS: 00010246
>> [   12.431370] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 
>> 0000000000000000
>> [   12.431371] RDX: 0000000000000001 RSI: 0000000000000001 RDI: 
>> ffff9cea492c7840
>> [   12.431372] RBP: ffffb0668156f890 R08: 0000000000000000 R09: 
>> ffffb0668156f7a0
>> [   12.431372] R10: 0000000000000001 R11: 0000000000000001 R12: 
>> ffff9cea492c7840
>> [   12.431373] R13: 0000000000000001 R14: ffff9cea43839940 R15: 
>> 0000000000000001
>> [   12.431374] FS:  00007fde83c18880(0000) GS:ffff9cf15e2c0000(0000) 
>> knlGS:0000000000000000
>> [   12.431375] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [   12.431376] CR2: 00007f2648000010 CR3: 00000001059e2000 CR4: 
>> 0000000000350ee0
>> [   12.431377] Call Trace:
>> [   12.431379]  <TASK>
>> [   12.431384]  ? show_regs+0x68/0x70
>> [   12.431388]  ? __flush_work+0x22f/0x240
>> [   12.431389]  ? __warn+0x8f/0x150
>> [   12.431392]  ? __flush_work+0x22f/0x240
>> [   12.431394]  ? report_bug+0x1f5/0x200
>> [   12.431399]  ? handle_bug+0x46/0x80
>> [   12.431402]  ? exc_invalid_op+0x19/0x70
>> [   12.431404]  ? asm_exc_invalid_op+0x1b/0x20
>> [   12.431408]  ? __flush_work+0x22f/0x240
>> [   12.431410]  ? irq_work_queue+0x10/0x60
>> [   12.431414]  ? __wake_up_klogd.part.0+0x5a/0x80
>> [   12.431419]  __cancel_work_timer+0x124/0x1b0
>> [   12.431421]  ? _printk+0x58/0x80
>> [   12.431423]  cancel_delayed_work_sync+0x13/0x20
>> [   12.431427]  amdgpu_workload_profile_fini+0x25/0x40 [amdgpu]
>> [   12.431854]  amdgpu_device_fini_sw+0x33/0x550 [amdgpu]
>> [   12.432035]  amdgpu_driver_release_kms+0x16/0x30 [amdgpu]
>> [   12.432213]  drm_dev_release+0x28/0x50 [drm]
>> [   12.432256]  devm_drm_dev_init_release+0x38/0x60 [drm]
>> [   12.432278]  devm_action_release+0x15/0x20
>> [   12.432283]  release_nodes+0x40/0xc0
>> [   12.432285]  devres_release_all+0x9e/0xe0
>> [   12.432286]  device_unbind_cleanup+0x12/0x80
>> [   12.432289]  really_probe+0x116/0x3e0
>> [   12.432291]  __driver_probe_device+0x7e/0x170
>> [   12.432293]  driver_probe_device+0x23/0xa0
>> [   12.432295]  __driver_attach+0xc5/0x190
>> [   12.432297]  ? __pfx___driver_attach+0x10/0x10
>> [   12.432299]  bus_for_each_dev+0x7c/0xd0
>> [   12.432302]  driver_attach+0x1e/0x30
>> [   12.432304]  bus_add_driver+0x11c/0x220
>> [   12.432306]  driver_register+0x64/0x130
>> [   12.432309]  ? __pfx_amdgpu_init+0x10/0x10 [amdgpu]
>> [   12.432491]  __pci_register_driver+0x68/0x70
>> [   12.432494]  amdgpu_init+0x63/0xff0 [amdgpu]
>> [   12.432667]  do_one_initcall+0x48/0x310
>> [   12.432671]  ? kmalloc_trace+0x2a/0xa0
>> [   12.432675]  do_init_module+0x6a/0x260
>> [   12.432677]  load_module+0x1db3/0x2050
>> [   12.432681]  init_module_from_file+0x9c/0xe0
>> [   12.432682]  ? init_module_from_file+0x9c/0xe0
>> [   12.432685]  idempotent_init_module+0x179/0x230
>> [   12.432687]  __x64_sys_finit_module+0x5d/0xb0
>> [   12.432689]  do_syscall_64+0x3b/0x90
>> [   12.432691]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>>
>>>> +
>>>> +    mutex_init(&adev->smu_workload.workload_lock);
>>>> +}
>>>> +
>>>> +void amdgpu_workload_profile_fini(struct amdgpu_device *adev)
>>>> +{
>>>> +    if (!adev->smu_workload.initialized)
>>>> +        return;
>>>> +
>>>> +    adev->smu_workload.submit_workload_status = 0;
>>>> +    adev->smu_workload.initialized = false;
>>>> + mutex_destroy(&adev->smu_workload.workload_lock);
>>>> +}
>>>> diff --git a/drivers/gpu/drm/amd/include/amdgpu_workload.h 
>>>> b/drivers/gpu/drm/amd/include/amdgpu_workload.h
>>>> new file mode 100644
>>>> index 000000000000..5d0f068422d4
>>>> --- /dev/null
>>>> +++ b/drivers/gpu/drm/amd/include/amdgpu_workload.h
>>>> @@ -0,0 +1,53 @@
>>>> +/* SPDX-License-Identifier: MIT */
>>>> +/*
>>>> + * Copyright 2023 Advanced Micro Devices, Inc.
>>>> + *
>>>> + * Permission is hereby granted, free of charge, to any person 
>>>> obtaining a
>>>> + * copy of this software and associated documentation files (the 
>>>> "Software"),
>>>> + * to deal in the Software without restriction, including without 
>>>> limitation
>>>> + * the rights to use, copy, modify, merge, publish, distribute, 
>>>> sublicense,
>>>> + * and/or sell copies of the Software, and to permit persons to 
>>>> whom the
>>>> + * Software is furnished to do so, subject to the following 
>>>> conditions:
>>>> + *
>>>> + * The above copyright notice and this permission notice shall be 
>>>> included in
>>>> + * all copies or substantial portions of the Software.
>>>> + *
>>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>>>> EXPRESS OR
>>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>>>> MERCHANTABILITY,
>>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO 
>>>> EVENT SHALL
>>>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
>>>> DAMAGES OR
>>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
>>>> OTHERWISE,
>>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
>>>> USE OR
>>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>>> + *
>>>> + */
>>>> +
>>>> +#ifndef _AMDGPU_WORKLOAD_H_
>>>> +#define _AMDGPU_WORKLOAD_H_
>>>> +
>>>> +struct amdgpu_smu_workload {
>>>> +    struct amdgpu_device    *adev;
>>>> +    struct mutex        workload_lock;
>>>> +    struct delayed_work    smu_delayed_work;
>>>
>>> call it power_profile_work instead ? Looks good otherwise.
>>>
>> Noted.
>>
>> Thank you
>>
>> ~Arvind
>>
>>> - Shashank
>>>
>>>> +    uint32_t submit_workload_status;
>>>> +    bool            initialized;
>>>> +    atomic_t power_profile_ref[PP_SMC_POWER_PROFILE_COUNT];
>>>> +};
>>>> +
>>>> +/* Workload mode names */
>>>> +static const char * const amdgpu_workload_mode_name[] = {
>>>> +    "Default",
>>>> +    "3D",
>>>> +    "Powersaving",
>>>> +    "Video",
>>>> +    "VR",
>>>> +    "Compute",
>>>> +    "Custom",
>>>> +    "Window3D"
>>>> +};
>>>> +
>>>> +void amdgpu_workload_profile_init(struct amdgpu_device *adev);
>>>> +
>>>> +void amdgpu_workload_profile_fini(struct amdgpu_device *adev);
>>>> +
>>>> +#endif

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ