[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <f6067e3e-a2bc-483d-b214-6e3fe6691279@efficios.com>
Date: Tue, 13 Feb 2024 15:56:53 -0500
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Dan Williams <dan.j.williams@...el.com>
Cc: linux-kernel <linux-kernel@...r.kernel.org>,
Dave Hansen <dave.hansen@...ux.intel.com>, linux-cxl@...r.kernel.org,
nvdimm@...ts.linux.dev, Vishal Verma <vishal.l.verma@...el.com>,
Dave Jiang <dave.jiang@...el.com>, rostedt <rostedt@...dmis.org>,
"Masami Hiramatsu (Google)" <mhiramat@...nel.org>
Subject: Re: CPU data cache across reboot/kexec for pmem/dax devices
On 2024-02-09 15:15, Dan Williams wrote:
> Mathieu Desnoyers wrote:
>> Hi Dan,
>>
>> In the context of extracting user-space trace data when the kernel crashes,
>> the LTTng user-space tracer recommends using nvdimm/pmem to reserve an area
>> of physical (volatile) RAM at boot (memmap=nn[KMG]!ss[KMG]), and use the
>> resulting device to create/mount a dax-enabled fs (e.g. ext4).
>>
>> We then use this filesystem to mmap() the shared memory files for the tracer.
>>
>> I want to make sure that the very last events from the userspace tracer written
>> to the memory mapped buffers (mmap()) by userspace are present after a
>> warm-reboot (or kexec/kdump).
>>
>> Note that the LTTng user-space tracer (LTTng-UST) does *not* issue any clflush
>> (or equivalent pmem_persist() from libpmem) for performance reasons: ring buffer
>> data is usually overwritten many times before the system actually crashes, and
>> the only thing we really need to make sure is that the cache lines are not
>> invalidated without write back.
>>
>> So I understand that the main use-case for pmem is nvdimm, and that in order to
>> guarantee persistence of the data on power off an explicit pmem_persist() is
>> needed after each "transaction", but for the needs of tracing, is there some
>> kind of architectural guarantee that the data present in the cpu data cache
>> is not invalidated prior to write back in each of those scenarios ?
>>
>> - reboot with bios explicitly not clearing memory,
>
> This one gives me pause, because a trip through the BIOS typically means
> lots of resets and other low level magic, so this would likely require
> pushing dirty data out of CPU caches prior to entering the BIOS code
> paths.
>
> So this either needs explicit cache flushing or mapping the memory with
> write-through semantics. That latter one is not supported in the stack
> today.
I would favor the explicit cache flushing approach over write-through semantics
for performance reasons: typically the ring buffers are overwritten, so always
storing the data to memory would be wasteful.
But I would rather do the cache flushing only on crash (in the die oops/panic notifiers)
than require the tracer to flush cache lines immediately after each event
is produced, again for performance reasons.
I have done a crude attempt at registering die oops/panic notifiers from the pmem
driver, and flushing all pmem areas to memory when the die oops/panic notifiers are
called (see the untested patch below). I wonder if this is something that would be
generally useful?
>
>> - kexec/kdump.
>
> This should maintain the state of CPU caches. As far as the CPU is
> concerned it is just long jumping into a new kernel in memory without
> resetting any CPU cache state.
Good to know that this scenario is expected to "Just Work".
Thanks,
Mathieu
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e9898457a7bd..b92f18607d39 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -26,12 +26,18 @@
#include <linux/dax.h>
#include <linux/nd.h>
#include <linux/mm.h>
+#include <linux/panic_notifier.h>
#include <asm/cacheflush.h>
#include "pmem.h"
#include "btt.h"
#include "pfn.h"
#include "nd.h"
+static int pmem_die_handler(struct notifier_block *self,
+ unsigned long ev, void *unused);
+static int pmem_panic_handler(struct notifier_block *self,
+ unsigned long ev, void *unused);
+
static struct device *to_dev(struct pmem_device *pmem)
{
/*
@@ -423,6 +429,9 @@ static void pmem_release_disk(void *__pmem)
{
struct pmem_device *pmem = __pmem;
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &pmem->panic_notifier);
+ unregister_die_notifier(&pmem->die_notifier);
dax_remove_host(pmem->disk);
kill_dax(pmem->dax_dev);
put_dax(pmem->dax_dev);
@@ -573,9 +582,20 @@ static int pmem_attach_disk(struct device *dev,
goto out_cleanup_dax;
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
}
- rc = device_add_disk(dev, disk, pmem_attribute_groups);
+ pmem->die_notifier.notifier_call = pmem_die_handler;
+ pmem->die_notifier.priority = -INT_MAX;
+ rc = register_die_notifier(&pmem->die_notifier);
if (rc)
goto out_remove_host;
+ pmem->panic_notifier.notifier_call = pmem_panic_handler;
+ pmem->panic_notifier.priority = -INT_MAX;
+ rc = atomic_notifier_chain_register(&panic_notifier_list,
+ &pmem->panic_notifier);
+ if (rc)
+ goto out_unregister_die;
+ rc = device_add_disk(dev, disk, pmem_attribute_groups);
+ if (rc)
+ goto out_unregister_panic;
if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
return -ENOMEM;
@@ -587,6 +607,11 @@ static int pmem_attach_disk(struct device *dev,
dev_warn(dev, "'badblocks' notification disabled\n");
return 0;
+out_unregister_panic:
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &pmem->panic_notifier);
+out_unregister_die:
+ unregister_die_notifier(&pmem->die_notifier);
out_remove_host:
dax_remove_host(pmem->disk);
out_cleanup_dax:
@@ -749,6 +774,35 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
}
}
+/*
+ * For volatile memory use-cases where explicit flushing of the data
+ * cache is not useful after stores, the pmem panic notifier is called
+ * on die/panic to make sure the content of the pmem memory area is
+ * flushed from data cache to memory, so it can be preserved across
+ * warm reboot. Use a low notifier priority so that stores performed
+ * by other die notifiers are also flushed to memory.
+ */
+static int pmem_die_handler(struct notifier_block *self,
+ unsigned long ev, void *unused)
+{
+ struct pmem_device *pmem = container_of(self, struct pmem_device, die_notifier);
+
+ /* Only trigger on DIE_OOPS for the die notifier. */
+ if (ev != DIE_OOPS)
+ return NOTIFY_DONE;
+ arch_wb_cache_pmem(pmem->virt_addr, pmem->size);
+ return NOTIFY_DONE;
+}
+
+static int pmem_panic_handler(struct notifier_block *self,
+ unsigned long ev, void *unused)
+{
+ struct pmem_device *pmem = container_of(self, struct pmem_device, panic_notifier);
+
+ arch_wb_cache_pmem(pmem->virt_addr, pmem->size);
+ return NOTIFY_DONE;
+}
+
MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 392b0b38acb9..6c5661a3d37b 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -4,6 +4,7 @@
#include <linux/page-flags.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
+#include <linux/notifier.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/fs.h>
@@ -27,6 +28,8 @@ struct pmem_device {
struct dax_device *dax_dev;
struct gendisk *disk;
struct dev_pagemap pgmap;
+ struct notifier_block die_notifier;
+ struct notifier_block panic_notifier;
};
long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Powered by blists - more mailing lists