linux-kernel - Re: [PATCH v2 3/3] hwmon: (k10temp): Show errors failing to read

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <90d283d0-06f6-baa5-b41b-fcd2f4b3ba99@amd.com>
Date:   Fri, 27 Aug 2021 16:10:06 -0500
From:   "Limonciello, Mario" <mario.limonciello@....com>
To:     Guenter Roeck <linux@...ck-us.net>
Cc:     Clemens Ladisch <clemens@...isch.de>, linux-hwmon@...r.kernel.org,
        Gabriel Craciunescu <nix.or.die@...glemail.com>,
        Wei Huang <wei.huang2@....com>,
        Jean Delvare <jdelvare@...e.com>,
        open list <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v2 3/3] hwmon: (k10temp): Show errors failing to read

On 8/27/2021 16:06, Guenter Roeck wrote:
> On Fri, Aug 27, 2021 at 03:15:27PM -0500, Mario Limonciello wrote:
>> Enabling Yellow Carp was initially not working "properly"
>> because extra IDs were needed, but this wasn't obvious because the error
>> values returned by `amd_smn_read` were ignored.
>>
>> Don't discard errors from any functions providing them; instead pass them
>> up to the caller.
>>
>> Signed-off-by: Mario Limonciello <mario.limonciello@....com>
>> ---
>>   drivers/hwmon/k10temp.c | 87 ++++++++++++++++++++++++-----------------
>>   1 file changed, 52 insertions(+), 35 deletions(-)
>>
>> diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
>> index 38bc35ac8135..2edb49d39d22 100644
>> --- a/drivers/hwmon/k10temp.c
>> +++ b/drivers/hwmon/k10temp.c
>> @@ -98,8 +98,8 @@ static DEFINE_MUTEX(nb_smu_ind_mutex);
>>   
>>   struct k10temp_data {
>>   	struct pci_dev *pdev;
>> -	void (*read_htcreg)(struct pci_dev *pdev, u32 *regval);
>> -	void (*read_tempreg)(struct pci_dev *pdev, u32 *regval);
>> +	int (*read_htcreg)(struct pci_dev *pdev, u32 *regval);
>> +	int (*read_tempreg)(struct pci_dev *pdev, u32 *regval);
>>   	int temp_offset;
>>   	u32 temp_adjust_mask;
>>   	u32 show_temp;
>> @@ -129,55 +129,65 @@ static const struct tctl_offset tctl_offset_table[] = {
>>   	{ 0x17, "AMD Ryzen Threadripper 29", 27000 }, /* 29{20,50,70,90}[W]X */
>>   };
>>   
>> -static void read_htcreg_pci(struct pci_dev *pdev, u32 *regval)
>> +static int read_htcreg_pci(struct pci_dev *pdev, u32 *regval)
>>   {
>> -	pci_read_config_dword(pdev, REG_HARDWARE_THERMAL_CONTROL, regval);
>> +	return pci_read_config_dword(pdev, REG_HARDWARE_THERMAL_CONTROL, regval);
>>   }
>>   
>> -static void read_tempreg_pci(struct pci_dev *pdev, u32 *regval)
>> +static int read_tempreg_pci(struct pci_dev *pdev, u32 *regval)
>>   {
>> -	pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, regval);
>> +	return pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, regval);
>>   }
>>   
>> -static void amd_nb_index_read(struct pci_dev *pdev, unsigned int devfn,
>> +static int amd_nb_index_read(struct pci_dev *pdev, unsigned int devfn,
>>   			      unsigned int base, int offset, u32 *val)
>>   {
>> +	int ret;
>> +
>>   	mutex_lock(&nb_smu_ind_mutex);
>> -	pci_bus_write_config_dword(pdev->bus, devfn,
>> -				   base, offset);
>> -	pci_bus_read_config_dword(pdev->bus, devfn,
>> -				  base + 4, val);
>> +	ret = pci_bus_write_config_dword(pdev->bus, devfn,
>> +					 base, offset);
>> +	if (ret)
>> +		goto out;
>> +	ret = pci_bus_read_config_dword(pdev->bus, devfn,
>> +					base + 4, val);
>> +out:
>>   	mutex_unlock(&nb_smu_ind_mutex);
>> +	return ret;
>>   }
>>   
>> -static void read_htcreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>> +static int read_htcreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>>   {
>> -	amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
>> -			  F15H_M60H_HARDWARE_TEMP_CTRL_OFFSET, regval);
>> +	return amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
>> +				F15H_M60H_HARDWARE_TEMP_CTRL_OFFSET, regval);
>>   }
>>   
>> -static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>> +static int read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval)
>>   {
>> -	amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
>> -			  F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, regval);
>> +	return amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8,
>> +				F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, regval);
>>   }
>>   
>> -static void read_tempreg_nb_zen(struct pci_dev *pdev, u32 *regval)
>> +static int read_tempreg_nb_zen(struct pci_dev *pdev, u32 *regval)
>>   {
>> -	amd_smn_read(amd_pci_dev_to_node_id(pdev),
>> -		     ZEN_REPORTED_TEMP_CTRL_BASE, regval);
>> +	return amd_smn_read(amd_pci_dev_to_node_id(pdev),
>> +			    ZEN_REPORTED_TEMP_CTRL_BASE, regval);
>>   }
>>   
>> -static long get_raw_temp(struct k10temp_data *data)
>> +static int get_raw_temp(struct k10temp_data *data, long *val)
>>   {
>>   	u32 regval;
>> -	long temp;
>> +	int ret;
>>   
>> -	data->read_tempreg(data->pdev, &regval);
>> -	temp = (regval >> ZEN_CUR_TEMP_SHIFT) * 125;
>> +	ret = data->read_tempreg(data->pdev, &regval);
>> +	if (ret)
>> +		return ret;
>> +	*val = (regval >> ZEN_CUR_TEMP_SHIFT) * 125;
>>   	if (regval & data->temp_adjust_mask)
>> -		temp -= 49000;
>> -	return temp;
>> +		*val -= 49000;
>> +	if (*val < 0)
>> +		return -EINVAL;
> 
> Please don't do that. More on that see below.
> 
>> +	return 0;
>>   }
>>   
>>   static const char *k10temp_temp_label[] = {
>> @@ -212,24 +222,27 @@ static int k10temp_read_temp(struct device *dev, u32 attr, int channel,
>>   {
>>   	struct k10temp_data *data = dev_get_drvdata(dev);
>>   	u32 regval;
>> +	int ret;
>>   
>>   	switch (attr) {
>>   	case hwmon_temp_input:
>>   		switch (channel) {
>>   		case 0:		/* Tctl */
>> -			*val = get_raw_temp(data);
>> -			if (*val < 0)
>> -				*val = 0;
> 
> We have to take the history into account here. A negative value
> is not an error per se, but it suggests that the chip returns wrong
> data. See commit aef17ca12719 ("hwmon: (k10temp) Only apply temperature
> offset if result is positive") for some of the background. I don't really
> want to change that into an error return just because we don't know
> what the chip is doing. Please retain the above code, either by fixing
> the values up here or in get_raw_temp().

Actually I thought what I was doing *was* making it a lot less ambiguous.

The caller getting -EINVAL from get_raw_temp will indicate that the 
data shouldn't be trusted rather than a surely wrong "0".

> 
> Thanks,
> Guenter
> 
>> +			ret = get_raw_temp(data, val);
>> +			if (ret)
>> +				return ret;
>>   			break;
>>   		case 1:		/* Tdie */
>> -			*val = get_raw_temp(data) - data->temp_offset;
>> -			if (*val < 0)
>> -				*val = 0;
>> +			ret = get_raw_temp(data, val) - data->temp_offset;
>> +			if (ret)
>> +				return ret;
>>   			break;
>>   		case 2 ... 9:		/* Tccd{1-8} */
>> -			amd_smn_read(amd_pci_dev_to_node_id(data->pdev),
>> +			ret = amd_smn_read(amd_pci_dev_to_node_id(data->pdev),
>>   				     ZEN_CCD_TEMP(data->ccd_offset, channel - 2),
>>   						  &regval);
>> +			if (ret)
>> +				return ret;
>>   			*val = (regval & ZEN_CCD_TEMP_MASK) * 125 - 49000;
>>   			break;
>>   		default:
>> @@ -240,11 +253,15 @@ static int k10temp_read_temp(struct device *dev, u32 attr, int channel,
>>   		*val = 70 * 1000;
>>   		break;
>>   	case hwmon_temp_crit:
>> -		data->read_htcreg(data->pdev, &regval);
>> +		ret = data->read_htcreg(data->pdev, &regval);
>> +		if (ret)
>> +			return ret;
>>   		*val = ((regval >> 16) & 0x7f) * 500 + 52000;
>>   		break;
>>   	case hwmon_temp_crit_hyst:
>> -		data->read_htcreg(data->pdev, &regval);
>> +		ret = data->read_htcreg(data->pdev, &regval);
>> +		if (ret)
>> +			return ret;
>>   		*val = (((regval >> 16) & 0x7f)
>>   			- ((regval >> 24) & 0xf)) * 500 + 52000;
>>   		break;