lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240214194427.GPZc0YG1e_6QWbGUQQ@fat_crate.local>
Date: Wed, 14 Feb 2024 20:44:27 +0100
From: Borislav Petkov <bp@...en8.de>
To: Yazen Ghannam <yazen.ghannam@....com>
Cc: tony.luck@...el.com, linux-edac@...r.kernel.org,
	linux-kernel@...r.kernel.org, avadhut.naik@....com,
	john.allen@....com, muralidhara.mk@....com,
	naveenkrishna.chatradhi@....com, sathyapriya.k@....com
Subject: Re: [PATCH 2/2] RAS: Introduce the FRU Memory Poison Manager

On Wed, Feb 14, 2024 at 10:26:41AM -0500, Yazen Ghannam wrote:
> > > +	/* This is the first entry, so just save it. */
> > > +	if (!has_valid_entries(fmp->validation_bits))
> > > +		goto save_fpd;
> > 
> > Not needed - if it is the first entry, it'll get saved there.
> > 
> 
> Get saved where?
> 
> For brand new records, the module will allocate them with the headers
> and no descriptor entries (empty list).

As discussed offlist, let's stick to checking validation_bits even if
->nr_entries should be 0 when rec doesn't have valid entries yet.

So lemme readd it.

diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c
index e50f11fb90a4..daab7f58505a 100644
--- a/drivers/ras/amd/fmpm.c
+++ b/drivers/ras/amd/fmpm.c
@@ -275,6 +275,10 @@ static void update_fru_record(struct fru_rec *rec, struct mce *m)
 	fpd.addr_type	= FPD_ADDR_TYPE_MCA_ADDR;
 	fpd.addr	= m->addr;
 
+	/* This is the first entry, so just save it. */
+	if (!rec_has_valid_entries(rec))
+		goto save_fpd;
+
 	/* Ignore already recorded errors. */
 	if (rec_has_fpd(rec, &fpd))
 		goto out_unlock;
@@ -287,6 +291,7 @@ static void update_fru_record(struct fru_rec *rec, struct mce *m)
 	entry	  = fmp->nr_entries;
 	fpd_dest  = &rec->entries[entry];
 
+save_fpd:
 	memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc));
 
 	fmp->nr_entries		 = entry + 1;


> > Yap, exactly, this should use atl_err and not struct mce.
> > 
> 
> Yes, tried to do *some* things generic.

Right.

> > > +	 * This should not happen on real errors. But it could happen from
> > 
> > What exactly is "This" here?
> > 
> 
> Ah right. The module should have created, or restored, a record for each FRU
> in the system during module init. So the runtime handler should always find
> a valid record for a FRU. The only exception I could think of, besides bugs,
> is if the user does software error injection and a valid FRU ID doesn't get
> set.

Changed to:

        /*
         * An invalid FRU ID should not happen on real errors. But it
         * could happen from software error injection, etc.
         */
        rec = get_fru_record(m->ppin);

> > Pass in that fmp thing into retire_dram_row() so that you can delay
> > that get_cpu_from_fru_id() call until the moment you actually need it.

IOW, this:

diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c
index a314598186a0..c2dc83a4e82a 100644
--- a/drivers/ras/amd/fmpm.c
+++ b/drivers/ras/amd/fmpm.c
@@ -47,6 +47,7 @@
 
 #include <linux/cper.h>
 #include <linux/ras.h>
+#include <linux/cpu.h>
 
 #include <acpi/apei.h>
 
@@ -347,21 +348,10 @@ static struct notifier_block fru_mem_poison_nb = {
 	.priority	= MCE_PRIO_LOWEST,
 };
 
-static u32 get_cpu_from_fru_id(u64 fru_id)
-{
-	unsigned int cpu = 0;
-
-	/* Should there be more robust error handling if none found? */
-	for_each_online_cpu(cpu) {
-		if (topology_ppin(cpu) == fru_id)
-			break;
-	}
-
-	return cpu;
-}
-
-static void retire_mem_fmp(struct fru_rec *rec, u32 nr_entries, u32 cpu)
+static void retire_mem_fmp(struct fru_rec *rec, u32 nr_entries)
 {
+	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+	unsigned int cpu, err_cpu = -1;
 	unsigned int i;
 
 	for (i = 0; i < nr_entries; i++) {
@@ -373,7 +363,16 @@ static void retire_mem_fmp(struct fru_rec *rec, u32 nr_entries, u32 cpu)
 		if (fpd->addr_type != FPD_ADDR_TYPE_MCA_ADDR)
 			continue;
 
-		retire_dram_row(fpd->addr, fpd->hw_id, cpu);
+		cpus_read_lock();
+		for_each_online_cpu(cpu) {
+			if (topology_ppin(cpu) == fmp->fru_id) {
+				err_cpu = cpu;
+				break;
+			}
+		}
+		cpus_read_unlock();
+
+		retire_dram_row(fpd->addr, fpd->hw_id, err_cpu);
 	}
 }
 
@@ -382,7 +381,6 @@ static void retire_mem_records(void)
 	struct cper_sec_fru_mem_poison *fmp;
 	struct fru_rec *rec;
 	unsigned int i;
-	u32 cpu;
 
 	for_each_fru(i, rec) {
 		fmp = &rec->fmp;
@@ -390,9 +388,7 @@ static void retire_mem_records(void)
 		if (!rec_has_valid_entries(rec))
 			continue;
 
-		cpu = get_cpu_from_fru_id(fmp->fru_id);
-
-		retire_mem_fmp(rec, fmp->nr_entries, cpu);
+		retire_mem_fmp(rec, fmp->nr_entries);
 	}
 }
 
-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ