lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240307091011.877921-1-kane.chen@intel.com>
Date: Thu,  7 Mar 2024 17:10:11 +0800
From: Kane Chen <kane.chen@...el.com>
To: kane.chen@...el.com,
	linux-kernel@...r.kernel.org,
	platform-driver-x86@...r.kernel.org,
	ilpo.jarvinen@...ux.intel.com,
	david.e.box@...el.com,
	kane.chen@...el.corp-partner.google.com
Subject: [PATCH 1/1] platform/x86/intel/pmc: Improve PKGC residency counters debug

The current code only prints PKGC-10 residency when the PKGC-10
is not reached in previous 'freeze' attempt. To debug PKGC-10 issues, we
also need to know other PKGC residency counters to better triage issues.
Ex:
1. When system is stuck in PC2, it can be caused short LTR from device.
2. When system is stuck in PC8, it can be caused by display engine.

To better triage issues, all PKGC residency are needed when issues happens.

Example log:
 CPU did not enter Package C10!!! (Package C10 cnt=0x0)
 Prev Package C2 cnt = 0x2191a325de, Current Package C2 cnt = 0x21aba30724
 Prev Package C3 cnt = 0x0, Current Package C3 cnt = 0x0
 Prev Package C6 cnt = 0x0, Current Package C6 cnt = 0x0
 Prev Package C7 cnt = 0x0, Current Package C7 cnt = 0x0
 Prev Package C8 cnt = 0x0, Current Package C8 cnt = 0x0
 Prev Package C9 cnt = 0x0, Current Package C9 cnt = 0x0
 Prev Package C10 cnt = 0x0, Current Package C10 cnt = 0x0

With this log, we can know whether it's a stuck PC2 issue, and we can
check whether the short LTR from device causes the issue.

Signed-off-by: Kane Chen <kane.chen@...el.com>
---
 drivers/platform/x86/intel/pmc/core.c | 47 ++++++++++++++++++++-------
 drivers/platform/x86/intel/pmc/core.h |  7 ++--
 2 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c
index 8f9c036809c79..b8910b671667e 100644
--- a/drivers/platform/x86/intel/pmc/core.c
+++ b/drivers/platform/x86/intel/pmc/core.c
@@ -1389,6 +1389,15 @@ static int pmc_core_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	pmcdev->pmcs[PMC_IDX_MAIN] = primary_pmc;
 
+	/* The last element in msr_map is empty */
+	pmcdev->num_of_pkgc = ARRAY_SIZE(msr_map) - 1;
+	pmcdev->pkgc_res_cnt = devm_kcalloc(&pdev->dev,
+					    pmcdev->num_of_pkgc,
+					    sizeof(*pmcdev->pkgc_res_cnt),
+					    GFP_KERNEL);
+	if (!pmcdev->pkgc_res_cnt)
+		return -ENOMEM;
+
 	/*
 	 * Coffee Lake has CPU ID of Kaby Lake and Cannon Lake PCH. So here
 	 * Sunrisepoint PCH regmap can't be used. Use Cannon Lake PCH regmap
@@ -1432,6 +1441,7 @@ static __maybe_unused int pmc_core_suspend(struct device *dev)
 {
 	struct pmc_dev *pmcdev = dev_get_drvdata(dev);
 	struct pmc *pmc = pmcdev->pmcs[PMC_IDX_MAIN];
+	int i;
 
 	if (pmcdev->suspend)
 		pmcdev->suspend(pmcdev);
@@ -1440,9 +1450,11 @@ static __maybe_unused int pmc_core_suspend(struct device *dev)
 	if (pm_suspend_via_firmware())
 		return 0;
 
-	/* Save PC10 residency for checking later */
-	if (rdmsrl_safe(MSR_PKG_C10_RESIDENCY, &pmcdev->pc10_counter))
-		return -EIO;
+	/* Save PKGC residency for checking later */
+	for (i = 0; i < pmcdev->num_of_pkgc; i++) {
+		if (rdmsrl_safe(msr_map[i].bit_mask, &pmcdev->pkgc_res_cnt[i]))
+			return -EIO;
+	}
 
 	/* Save S0ix residency for checking later */
 	if (pmc_core_dev_state_get(pmc, &pmcdev->s0ix_counter))
@@ -1451,14 +1463,15 @@ static __maybe_unused int pmc_core_suspend(struct device *dev)
 	return 0;
 }
 
-static inline bool pmc_core_is_pc10_failed(struct pmc_dev *pmcdev)
+static inline bool pmc_core_is_deepest_pkgc_failed(struct pmc_dev *pmcdev)
 {
-	u64 pc10_counter;
+	u32 deepest_pkgc_msr = msr_map[pmcdev->num_of_pkgc - 1].bit_mask;
+	u64 deepest_pkgc_residency;
 
-	if (rdmsrl_safe(MSR_PKG_C10_RESIDENCY, &pc10_counter))
+	if (rdmsrl_safe(deepest_pkgc_msr, &deepest_pkgc_residency))
 		return false;
 
-	if (pc10_counter == pmcdev->pc10_counter)
+	if (deepest_pkgc_residency == pmcdev->pkgc_res_cnt[pmcdev->num_of_pkgc - 1])
 		return true;
 
 	return false;
@@ -1497,10 +1510,22 @@ int pmc_core_resume_common(struct pmc_dev *pmcdev)
 	if (!warn_on_s0ix_failures)
 		return 0;
 
-	if (pmc_core_is_pc10_failed(pmcdev)) {
-		/* S0ix failed because of PC10 entry failure */
-		dev_info(dev, "CPU did not enter PC10!!! (PC10 cnt=0x%llx)\n",
-			 pmcdev->pc10_counter);
+	if (pmc_core_is_deepest_pkgc_failed(pmcdev)) {
+		/* S0ix failed because of deepest PKGC entry failure */
+		dev_info(dev, "CPU did not enter %s!!! (%s cnt=0x%llx)\n",
+			 msr_map[pmcdev->num_of_pkgc - 1].name,
+			 msr_map[pmcdev->num_of_pkgc - 1].name,
+			 pmcdev->pkgc_res_cnt[pmcdev->num_of_pkgc - 1]);
+
+		for (i = 0; i < pmcdev->num_of_pkgc; i++) {
+			u64 pc_cnt;
+
+			if (!rdmsrl_safe(msr_map[i].bit_mask, &pc_cnt)) {
+				dev_info(dev, "Prev %s cnt = 0x%llx, Current %s cnt = 0x%llx\n",
+					 msr_map[i].name, pmcdev->pkgc_res_cnt[i],
+					 msr_map[i].name, pc_cnt);
+			}
+		}
 		return 0;
 	}
 
diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h
index 54137faaae2b2..fd7ae76984ac7 100644
--- a/drivers/platform/x86/intel/pmc/core.h
+++ b/drivers/platform/x86/intel/pmc/core.h
@@ -385,7 +385,8 @@ struct pmc {
  * @pmc_xram_read_bit:	flag to indicate whether PMC XRAM shadow registers
  *			used to read MPHY PG and PLL status are available
  * @mutex_lock:		mutex to complete one transcation
- * @pc10_counter:	PC10 residency counter
+ * @pkgc_res_cnt:	PKGC residency counter
+ * @num_of_pkgc:	Number of PKGC
  * @s0ix_counter:	S0ix residency (step adjusted)
  * @num_lpm_modes:	Count of enabled modes
  * @lpm_en_modes:	Array of enabled modes from lowest to highest priority
@@ -403,13 +404,15 @@ struct pmc_dev {
 	int pmc_xram_read_bit;
 	struct mutex lock; /* generic mutex lock for PMC Core */
 
-	u64 pc10_counter;
 	u64 s0ix_counter;
 	int num_lpm_modes;
 	int lpm_en_modes[LPM_MAX_NUM_MODES];
 	void (*suspend)(struct pmc_dev *pmcdev);
 	int (*resume)(struct pmc_dev *pmcdev);
 
+	u64 *pkgc_res_cnt;
+	u8 num_of_pkgc;
+
 	bool has_die_c6;
 	u32 die_c6_offset;
 	struct telem_endpoint *punit_ep;
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ