[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <90d283d0-06f6-baa5-b41b-fcd2f4b3ba99@amd.com>
Date: Fri, 27 Aug 2021 16:10:06 -0500
From: "Limonciello, Mario" <mario.limonciello@....com>
To: Guenter Roeck <linux@...ck-us.net>
Cc: Clemens Ladisch <clemens@...isch.de>, linux-hwmon@...r.kernel.org,
Gabriel Craciunescu <nix.or.die@...glemail.com>,
Wei Huang <wei.huang2@....com>,
Jean Delvare <jdelvare@...e.com>,
open list <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v2 3/3] hwmon: (k10temp): Show errors failing to read
On 8/27/2021 16:06, Guenter Roeck wrote:
> On Fri, Aug 27, 2021 at 03:15:27PM -0500, Mario Limonciello wrote:
>> Enabling Yellow Carp was initially not working "properly"
>> because extra IDs were needed, but this wasn't obvious because fail values
>> from `amd_smn_read` were ignored.
>>
>> Don't discard errors from any functions providing them, instead pass up
>> to the caller.
>>
>> Signed-off-by: Mario Limonciello <mario.limonciello@....com>
>> ---
>> drivers/hwmon/k10temp.c | 87 ++++++++++++++++++++++++-----------------
>> 1 file changed, 52 insertions(+), 35 deletions(-)
>>
>> diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
>> index 38bc35ac8135..2edb49d39d22 100644
>> --- a/drivers/hwmon/k10temp.c
>> +++ b/drivers/hwmon/k10temp.c
>> @@ -98,8 +98,8 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
>>
>> struct k10temp_data {
>> struct pci_dev *pdev;
>> - void (*read_htcreg)(struct pci_dev *pdev, u32 *regval);
>> - void (*read_tempreg)(struct pci_dev *pdev, u32 *regval);
>> + int (*read_htcreg)(struct pci_dev *pdev, u32 *regval);
>> + int (*read_tempreg)(struct pci_dev *pdev, u32 *regval);
>> int temp_offset;
>> u32 temp_adjust_mask;
>> u32 show_temp;
>> @@ -129,55 +129,65 @@ static const struct tctl_offset tctl_offset_table[] = {
>> { 0x17, "AMD Ryzen Threadripper 29", 27000 }, /* 29{20,50,70,90}[W]X */
>> };
>>
>> -static void read_htcreg_pci(struct pci_dev *pdev, u32 *regval)
>> +static int read_htcreg_pci(struct pci_dev *pdev, u32 *regval)
>> {
>> - pci_read_config_dword(pdev, REG_HARDWARE_THERMAL_CONTROL, regval);
>> + return pci_read_config_dword(pdev, REG_HARDWARE_THERMAL_CONTROL, regval);
>> }
>>
>> -static void read_tempreg_pci(struct pci_dev *pdev, u32 *regval)
>> +static int read_tempreg_pci(struct pci_dev *pdev, u32 *regval)
>> {
>> - pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, regval);
>> + return pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, regval);
>> }
>>
>> -static void amd_nb_index_read(struct pci_dev *pdev, unsigned int devfn,
>> +static int amd_nb_index_read(struct pci_dev *pdev, unsigned int devfn,
>> unsigned int base, int offset, u32 *val)
>> {
>> + int ret;
>> +
>> mutex_lock(&nb_smu_ind_mutex);
>> - pci_bus_write_config_dword(pdev->bus, devfn,
>> - base, offset);
>> - pci_bus_read_config_dword(pdev->bus, devfn,
>> - base + 4, val);
>> + ret = pci_bus_write_config_dword(pdev->bus, devfn,
>> + base, offset);
>> + if (ret)
>> + goto out;
>> + ret = pci_bus_read_config_dword(pdev->bus, devfn,
>> + base + 4, val);
>> +out:
>> mutex_unlock(&nb_smu_ind_mutex);
>> + return ret;
>> }
>>
>> -static void read_htcreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>> +static int read_htcreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>> {
>> - amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
>> - F15H_M60H_HARDWARE_TEMP_CTRL_OFFSET, regval);
>> + return amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
>> + F15H_M60H_HARDWARE_TEMP_CTRL_OFFSET, regval);
>> }
>>
>> -static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>> +static int read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>> {
>> - amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
>> - F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, regval);
>> + return amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
>> + F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, regval);
>> }
>>
>> -static void read_tempreg_nb_zen(struct pci_dev *pdev, u32 *regval)
>> +static int read_tempreg_nb_zen(struct pci_dev *pdev, u32 *regval)
>> {
>> - amd_smn_read(amd_pci_dev_to_node_id(pdev),
>> - ZEN_REPORTED_TEMP_CTRL_BASE, regval);
>> + return amd_smn_read(amd_pci_dev_to_node_id(pdev),
>> + ZEN_REPORTED_TEMP_CTRL_BASE, regval);
>> }
>>
>> -static long get_raw_temp(struct k10temp_data *data)
>> +static int get_raw_temp(struct k10temp_data *data, long *val)
>> {
>> u32 regval;
>> - long temp;
>> + int ret;
>>
>> - data->read_tempreg(data->pdev, &regval);
>> - temp = (regval >> ZEN_CUR_TEMP_SHIFT) * 125;
>> + ret = data->read_tempreg(data->pdev, &regval);
>> + if (ret)
>> + return ret;
>> + *val = (regval >> ZEN_CUR_TEMP_SHIFT) * 125;
>> if (regval & data->temp_adjust_mask)
>> - temp -= 49000;
>> - return temp;
>> + *val -= 49000;
>> + if (*val < 0)
>> + return -EINVAL;
>
> Please don't do that. More on that see below.
>
>> + return 0;
>> }
>>
>> static const char *k10temp_temp_label[] = {
>> @@ -212,24 +222,27 @@ static int k10temp_read_temp(struct device *dev, u32 attr, int channel,
>> {
>> struct k10temp_data *data = dev_get_drvdata(dev);
>> u32 regval;
>> + int ret;
>>
>> switch (attr) {
>> case hwmon_temp_input:
>> switch (channel) {
>> case 0: /* Tctl */
>> - *val = get_raw_temp(data);
>> - if (*val < 0)
>> - *val = 0;
>
> We have to take the history into account here. A negative value
> is not an error per se, but it suggests that the chip returns wrong
> data. See commit aef17ca12719 ("hwmon: (k10temp) Only apply temperature
> offset if result is positive") for some of the background. I don't really
> want to change that into an error return just because we don't know
> what the chip is doing. Please retain the above code, either by fixing
> the values up here or in get_raw_temp().
Actually I thought what I was doing *was* making it a lot less ambiguous.
The caller getting -EINVAL from get_raw_temp will indicate that the
data shouldn't be trusted rather than a surely wrong "0".
>
> Thanks,
> Guenter
>
>> + ret = get_raw_temp(data, val);
>> + if (ret)
>> + return ret;
>> break;
>> case 1: /* Tdie */
>> - *val = get_raw_temp(data) - data->temp_offset;
>> - if (*val < 0)
>> - *val = 0;
>> + ret = get_raw_temp(data, val) - data->temp_offset;
>> + if (ret)
>> + return ret;
>> break;
>> case 2 ... 9: /* Tccd{1-8} */
>> - amd_smn_read(amd_pci_dev_to_node_id(data->pdev),
>> + ret = amd_smn_read(amd_pci_dev_to_node_id(data->pdev),
>> ZEN_CCD_TEMP(data->ccd_offset, channel - 2),
>> &regval);
>> + if (ret)
>> + return ret;
>> *val = (regval & ZEN_CCD_TEMP_MASK) * 125 - 49000;
>> break;
>> default:
>> @@ -240,11 +253,15 @@ static int k10temp_read_temp(struct device *dev, u32 attr, int channel,
>> *val = 70 * 1000;
>> break;
>> case hwmon_temp_crit:
>> - data->read_htcreg(data->pdev, &regval);
>> + ret = data->read_htcreg(data->pdev, &regval);
>> + if (ret)
>> + return ret;
>> *val = ((regval >> 16) & 0x7f) * 500 + 52000;
>> break;
>> case hwmon_temp_crit_hyst:
>> - data->read_htcreg(data->pdev, &regval);
>> + ret = data->read_htcreg(data->pdev, &regval);
>> + if (ret)
>> + return ret;
>> *val = (((regval >> 16) & 0x7f)
>> - ((regval >> 24) & 0xf)) * 500 + 52000;
>> break;
Powered by blists - more mailing lists