[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <273b9080d42bcd2fb36fc4510416f0e111edee62.camel@intel.com>
Date: Wed, 12 Feb 2025 13:43:25 +0000
From: "Zhang, Rui" <rui.zhang@...el.com>
To: "Mehta, Sohil" <sohil.mehta@...el.com>, "Luck, Tony"
<tony.luck@...el.com>, "Hansen, Dave" <dave.hansen@...el.com>,
"x86@...nel.org" <x86@...nel.org>, "dave.hansen@...ux.intel.com"
<dave.hansen@...ux.intel.com>
CC: "linux-pm@...r.kernel.org" <linux-pm@...r.kernel.org>,
"viresh.kumar@...aro.org" <viresh.kumar@...aro.org>,
"andrew.cooper3@...rix.com" <andrew.cooper3@...rix.com>,
"alexander.shishkin@...ux.intel.com" <alexander.shishkin@...ux.intel.com>,
"luto@...nel.org" <luto@...nel.org>, "david.laight.linux@...il.com"
<david.laight.linux@...il.com>, "linux-hwmon@...r.kernel.org"
<linux-hwmon@...r.kernel.org>, "linux-perf-users@...r.kernel.org"
<linux-perf-users@...r.kernel.org>, "Hunter, Adrian"
<adrian.hunter@...el.com>, "jdelvare@...e.com" <jdelvare@...e.com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"mingo@...hat.com" <mingo@...hat.com>, "irogers@...gle.com"
<irogers@...gle.com>, "tglx@...utronix.de" <tglx@...utronix.de>,
"linux@...ck-us.net" <linux@...ck-us.net>, "lenb@...nel.org"
<lenb@...nel.org>, "kan.liang@...ux.intel.com" <kan.liang@...ux.intel.com>,
"hpa@...or.com" <hpa@...or.com>, "peterz@...radead.org"
<peterz@...radead.org>, "mark.rutland@....com" <mark.rutland@....com>,
"bp@...en8.de" <bp@...en8.de>, "acme@...nel.org" <acme@...nel.org>,
"rafael@...nel.org" <rafael@...nel.org>, "jolsa@...nel.org"
<jolsa@...nel.org>, "linux-acpi@...r.kernel.org"
<linux-acpi@...r.kernel.org>, "namhyung@...nel.org" <namhyung@...nel.org>
Subject: Re: [PATCH v2 07/17] hwmon: Fix Intel Family-model checks to include
extended Families
On Tue, 2025-02-11 at 13:38 -0800, Sohil Mehta wrote:
> On 2/11/2025 12:58 PM, Dave Hansen wrote:
> > On 2/11/25 11:43, Sohil Mehta wrote:
> > > + /*
> > > + * Return without adjustment if the Family isn't 6.
> > > + * The rest of the function assumes Family 6.
> > > + */
> > > + if (c->x86 != 6)
> > > + return tjmax;
> >
> > Shouldn't we be converting this over to the vfm matches?
> >
>
> For drivers/, I mainly focused on fixes instead of cleanups.
>
> Converting drivers over to VFM checks is significant work. There are
> a
> lot of such comparisons and switch cases (probably more than 50)
> across
> drivers/cpufreq/ and drivers/hwmon/.
>
> Some of the functions might need significant refactoring and
> rewrites. I
> think someone with expertise in that particular driver should
> probably
> do it. I did start with it initially but it is beyond my bandwidth at
> the moment.
>
I agree.
adjust_tjmax() contains a list of quirks based on
PCI-ID/x86_vendor_id/x86_model/x86_stepping. The common problem is that all
the quirks are for Fam6 processors but the family ID is not checked, so
the fix is sufficient. In fact, I think it is better to move the check
to the very beginning of adjust_tjmax().
In addition, I do think we can do more cleanups on top:
1. rename adjust_tjmax() to adjust_tjmax_for_fam6()
2. group all model-specific quirks together and avoid model checks in
the main functions.
3. for processors newer than Fam6, the driver should fail to probe
rather than using a hardcoded value when reading
MSR_IA32_TEMPERATURE_TARGET fails.
Maybe I can start with something like the patch below.
---
drivers/hwmon/coretemp.c | 98 +++++++++++++++++++++++-----------------
1 file changed, 57 insertions(+), 41 deletions(-)
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 1aa67a2b5f18..fc2cf607aa36 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -99,6 +99,7 @@ struct platform_data {
struct device_attribute name_attr;
};
+/* Beginning of Model specific quirks */
struct tjmax_pci {
unsigned int device;
int tjmax;
@@ -147,12 +148,11 @@ static const struct tjmax_model tjmax_model_table[] = {
*/
};
-static bool is_pkg_temp_data(struct temp_data *tdata)
-{
- return tdata->index < 0;
-}
-
-static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev)
+/*
+ * Adjust tjmax value for early Fam6 CPUs with unreadable MSR_IA32_TEMPERATURE_TARGET
+ * NOTE: the calculated value may not be correct.
+ */
+static int adjust_tjmax_for_fam6(struct cpuinfo_x86 *c, u32 id, struct device *dev)
{
/* The 100C is default for both mobile and non mobile CPUs */
@@ -163,8 +163,16 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev)
u32 eax, edx;
int i;
u16 devfn = PCI_DEVFN(0, 0);
- struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn);
+ struct pci_dev *host_bridge;
+
+ /*
+ * Return without adjustment if the Family isn't 6.
+ * The rest of the function assumes Family 6.
+ */
+ if (c->x86 != 6)
+ return tjmax;
+ host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn);
/*
* Explicit tjmax table entries override heuristics.
* First try PCI host bridge IDs, followed by model ID strings
@@ -185,12 +193,6 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev)
return tjmax_table[i].tjmax;
}
- /*
- * Return without adjustment if the Family isn't 6.
- * The rest of the function assumes Family 6.
- */
- if (c->x86 != 6)
- return tjmax;
for (i = 0; i < ARRAY_SIZE(tjmax_model_table); i++) {
const struct tjmax_model *tm = &tjmax_model_table[i];
@@ -280,6 +282,37 @@ static bool cpu_has_tjmax(struct cpuinfo_x86 *c)
model != 0x36);
}
+static bool cpu_has_ttarget(struct temp_data *tdata)
+{
+ struct cpuinfo_x86 *c = &cpu_data(tdata->cpu);
+
+ /*
+ * The target temperature is available on older CPUs but not in the
+ * MSR_IA32_TEMPERATURE_TARGET register. Atoms don't have the register
+ * at all.
+ */
+ if (c->x86 > 15 || (c->x86 == 6 && c->x86_model > 0xe && c->x86_model != 0x1c))
+ return true;
+ return false;
+}
+
+static bool cpu_has_broken_ucode(unsigned int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Check if we have problem with errata AE18 of Core processors:
+ * Readings might stop update when processor visited too deep sleep,
+ * fixed for stepping D0 (6EC).
+ */
+ if (c->x86 == 6 && c->x86_model == 0xe && c->x86_stepping < 0xc && c->microcode < 0x39) {
+ pr_err("Errata AE18 not fixed, update BIOS or microcode of the CPU!\n");
+ return true;
+ }
+ return false;
+}
+/* End of Model specific quirks */
+
static int get_tjmax(struct temp_data *tdata, struct device *dev)
{
struct cpuinfo_x86 *c = &cpu_data(tdata->cpu);
@@ -312,9 +345,8 @@ static int get_tjmax(struct temp_data *tdata, struct device *dev)
} else {
/*
* An assumption is made for early CPUs and unreadable MSR.
- * NOTE: the calculated value may not be correct.
*/
- tdata->tjmax = adjust_tjmax(c, tdata->cpu, dev);
+ tdata->tjmax = adjust_tjmax_for_fam6(c, tdata->cpu, dev);
}
return tdata->tjmax;
}
@@ -324,6 +356,8 @@ static int get_ttarget(struct temp_data *tdata, struct device *dev)
u32 eax, edx;
int tjmax, ttarget_offset, ret;
+ if (!cpu_has_ttarget(tdata))
+ return -ENODEV;
/*
* ttarget is valid only if tjmax can be retrieved from
* MSR_IA32_TEMPERATURE_TARGET
@@ -348,6 +382,11 @@ static int max_zones __read_mostly;
/* Array of zone pointers. Serialized by cpu hotplug lock */
static struct platform_device **zone_devices;
+static bool is_pkg_temp_data(struct temp_data *tdata)
+{
+ return tdata->index < 0;
+}
+
static ssize_t show_label(struct device *dev,
struct device_attribute *devattr, char *buf)
{
@@ -460,23 +499,6 @@ static int create_core_attrs(struct temp_data *tdata, struct device *dev)
return sysfs_create_group(&dev->kobj, &tdata->attr_group);
}
-
-static int chk_ucode_version(unsigned int cpu)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- /*
- * Check if we have problem with errata AE18 of Core processors:
- * Readings might stop update when processor visited too deep sleep,
- * fixed for stepping D0 (6EC).
- */
- if (c->x86 == 6 && c->x86_model == 0xe && c->x86_stepping < 0xc && c->microcode < 0x39) {
- pr_err("Errata AE18 not fixed, update BIOS or microcode of the CPU!\n");
- return -ENODEV;
- }
- return 0;
-}
-
static struct platform_device *coretemp_get_pdev(unsigned int cpu)
{
int id = topology_logical_die_id(cpu);
@@ -585,14 +607,8 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
/* Make sure tdata->tjmax is a valid indicator for dynamic/static tjmax */
get_tjmax(tdata, &pdev->dev);
- /*
- * The target temperature is available on older CPUs but not in the
- * MSR_IA32_TEMPERATURE_TARGET register. Atoms don't have the register
- * at all.
- */
- if (c->x86 > 15 || (c->x86 == 6 && c->x86_model > 0xe && c->x86_model != 0x1c))
- if (get_ttarget(tdata, &pdev->dev) >= 0)
- tdata->attr_size++;
+ if (get_ttarget(tdata, &pdev->dev) >= 0)
+ tdata->attr_size++;
/* Create sysfs interfaces */
err = create_core_attrs(tdata, pdata->hwmon_dev);
@@ -696,7 +712,7 @@ static int coretemp_cpu_online(unsigned int cpu)
struct device *hwmon;
/* Check the microcode version of the CPU */
- if (chk_ucode_version(cpu))
+ if (cpu_has_broken_ucode(cpu))
return -EINVAL;
/*
--
2.43.0
Powered by blists - more mailing lists